1 | //===- bolt/Passes/TailDuplication.cpp ------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements the TailDuplication class. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "bolt/Passes/TailDuplication.h" |
14 | #include "llvm/ADT/DenseMap.h" |
15 | #include "llvm/MC/MCRegisterInfo.h" |
16 | |
17 | #include <numeric> |
18 | #include <queue> |
19 | |
20 | #define DEBUG_TYPE "taildup" |
21 | |
22 | using namespace llvm; |
23 | |
24 | namespace opts { |
25 | |
26 | extern cl::OptionCategory BoltOptCategory; |
27 | extern cl::opt<bool> NoThreads; |
28 | |
// Strategy used to select tail blocks for duplication; TD_NONE disables the
// pass entirely (see TailDuplication::runOnFunctions).
cl::opt<bolt::TailDuplication::DuplicationMode> TailDuplicationMode(
    "tail-duplication",
    cl::desc("duplicate unconditional branches that cross a cache line"),
    cl::init(bolt::TailDuplication::TD_NONE),
    cl::values(clEnumValN(bolt::TailDuplication::TD_NONE, "none",
                          "do not apply"),
               clEnumValN(bolt::TailDuplication::TD_AGGRESSIVE, "aggressive",
                          "aggressive strategy"),
               clEnumValN(bolt::TailDuplication::TD_MODERATE, "moderate",
                          "moderate strategy"),
               clEnumValN(bolt::TailDuplication::TD_CACHE, "cache",
                          "cache-aware duplication strategy")),
    cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

// Layout distance (in bytes) below which a block and its successor are
// considered close enough already; used by isInCacheLine().
static cl::opt<unsigned>
    TailDuplicationMinimumOffset("tail-duplication-minimum-offset",
                                 cl::desc("minimum offset needed between block "
                                          "and successor to allow duplication"),
                                 cl::ReallyHidden, cl::init(64),
                                 cl::cat(BoltOptCategory));

// Upper size bound (bytes) on what may be duplicated, applied to a single
// tail in the moderate/cache strategies and to the whole chain in the
// aggressive strategy.
static cl::opt<unsigned> TailDuplicationMaximumDuplication(
    "tail-duplication-maximum-duplication",
    cl::desc("tail blocks whose size (in bytes) exceeds the value are never "
             "duplicated"),
    cl::ZeroOrMore, cl::ReallyHidden, cl::init(24), cl::cat(BoltOptCategory));

// Tails at or below this size (bytes) are always duplicated by the cache
// strategy, since removing the jump pays for the copy.
static cl::opt<unsigned> TailDuplicationMinimumDuplication(
    "tail-duplication-minimum-duplication",
    cl::desc("tail blocks with size (in bytes) not exceeding the value are "
             "always duplicated"),
    cl::ReallyHidden, cl::init(2), cl::cat(BoltOptCategory));

// When set, constantAndCopyPropagate() runs after each duplication.
static cl::opt<bool> TailDuplicationConstCopyPropagation(
    "tail-duplication-const-copy-propagation",
    cl::desc("enable const and copy propagation after tail duplication"),
    cl::ReallyHidden, cl::init(false), cl::cat(BoltOptCategory));
66 | |
67 | static cl::opt<unsigned> TailDuplicationMaxCacheDistance( |
68 | "tail-duplication-max-cache-distance" , |
69 | cl::desc("The weight of backward jumps for ExtTSP value" ), cl::init(Val: 256), |
70 | cl::ReallyHidden, cl::cat(BoltOptCategory)); |
71 | |
72 | static cl::opt<double> TailDuplicationCacheBackwardWeight( |
73 | "tail-duplication-cache-backward-weight" , |
74 | cl::desc( |
75 | "The maximum distance (in bytes) of backward jumps for ExtTSP value" ), |
76 | cl::init(Val: 0.5), cl::ReallyHidden, cl::cat(BoltOptCategory)); |
77 | |
78 | } // namespace opts |
79 | |
80 | namespace llvm { |
81 | namespace bolt { |
82 | |
83 | void TailDuplication::getCallerSavedRegs(const MCInst &Inst, BitVector &Regs, |
84 | BinaryContext &BC) const { |
85 | if (!BC.MIB->isCall(Inst)) |
86 | return; |
87 | BitVector CallRegs = BitVector(BC.MRI->getNumRegs(), false); |
88 | BC.MIB->getCalleeSavedRegs(Regs&: CallRegs); |
89 | CallRegs.flip(); |
90 | Regs |= CallRegs; |
91 | } |
92 | |
93 | bool TailDuplication::regIsPossiblyOverwritten(const MCInst &Inst, unsigned Reg, |
94 | BinaryContext &BC) const { |
95 | BitVector WrittenRegs = BitVector(BC.MRI->getNumRegs(), false); |
96 | BC.MIB->getWrittenRegs(Inst, Regs&: WrittenRegs); |
97 | getCallerSavedRegs(Inst, Regs&: WrittenRegs, BC); |
98 | if (BC.MIB->isRep(Inst)) |
99 | BC.MIB->getRepRegs(Regs&: WrittenRegs); |
100 | WrittenRegs &= BC.MIB->getAliases(Reg, OnlySmaller: false); |
101 | return WrittenRegs.any(); |
102 | } |
103 | |
104 | bool TailDuplication::regIsDefinitelyOverwritten(const MCInst &Inst, |
105 | unsigned Reg, |
106 | BinaryContext &BC) const { |
107 | BitVector WrittenRegs = BitVector(BC.MRI->getNumRegs(), false); |
108 | BC.MIB->getWrittenRegs(Inst, Regs&: WrittenRegs); |
109 | getCallerSavedRegs(Inst, Regs&: WrittenRegs, BC); |
110 | if (BC.MIB->isRep(Inst)) |
111 | BC.MIB->getRepRegs(Regs&: WrittenRegs); |
112 | return (!regIsUsed(Inst, Reg, BC) && WrittenRegs.test(Idx: Reg) && |
113 | !BC.MIB->isConditionalMove(Inst)); |
114 | } |
115 | |
116 | bool TailDuplication::regIsUsed(const MCInst &Inst, unsigned Reg, |
117 | BinaryContext &BC) const { |
118 | BitVector SrcRegs = BitVector(BC.MRI->getNumRegs(), false); |
119 | BC.MIB->getSrcRegs(Inst, Regs&: SrcRegs); |
120 | SrcRegs &= BC.MIB->getAliases(Reg, OnlySmaller: true); |
121 | return SrcRegs.any(); |
122 | } |
123 | |
124 | bool TailDuplication::isOverwrittenBeforeUsed(BinaryBasicBlock &StartBB, |
125 | unsigned Reg) const { |
126 | BinaryFunction *BF = StartBB.getFunction(); |
127 | BinaryContext &BC = BF->getBinaryContext(); |
128 | std::queue<BinaryBasicBlock *> Q; |
129 | for (auto Itr = StartBB.succ_begin(); Itr != StartBB.succ_end(); ++Itr) { |
130 | BinaryBasicBlock *NextBB = *Itr; |
131 | Q.push(x: NextBB); |
132 | } |
133 | std::set<BinaryBasicBlock *> Visited; |
134 | // Breadth first search through successive blocks and see if Reg is ever used |
135 | // before its overwritten |
136 | while (Q.size() > 0) { |
137 | BinaryBasicBlock *CurrBB = Q.front(); |
138 | Q.pop(); |
139 | if (Visited.count(x: CurrBB)) |
140 | continue; |
141 | Visited.insert(x: CurrBB); |
142 | bool Overwritten = false; |
143 | for (auto Itr = CurrBB->begin(); Itr != CurrBB->end(); ++Itr) { |
144 | MCInst &Inst = *Itr; |
145 | if (regIsUsed(Inst, Reg, BC)) |
146 | return false; |
147 | if (regIsDefinitelyOverwritten(Inst, Reg, BC)) { |
148 | Overwritten = true; |
149 | break; |
150 | } |
151 | } |
152 | if (Overwritten) |
153 | continue; |
154 | for (auto Itr = CurrBB->succ_begin(); Itr != CurrBB->succ_end(); ++Itr) { |
155 | BinaryBasicBlock *NextBB = *Itr; |
156 | Q.push(x: NextBB); |
157 | } |
158 | } |
159 | return true; |
160 | } |
161 | |
// Find simple move instructions (reg := imm, or reg := reg) in OriginalBB and
// propagate the moved value into the instructions of BlocksToPropagate.
// When every use of the destination register was rewritten and the register
// is provably redefined before any later use, the original move is deleted
// and the deletion counters are updated.
void TailDuplication::constantAndCopyPropagate(
    BinaryBasicBlock &OriginalBB,
    std::vector<BinaryBasicBlock *> &BlocksToPropagate) {
  BinaryFunction *BF = OriginalBB.getFunction();
  BinaryContext &BC = BF->getBinaryContext();

  // The remainder of OriginalBB (after each candidate move) is scanned too.
  BlocksToPropagate.insert(BlocksToPropagate.begin(), &OriginalBB);
  // Iterate through the original instructions to find one to propagate
  for (auto Itr = OriginalBB.begin(); Itr != OriginalBB.end(); ++Itr) {
    MCInst &OriginalInst = *Itr;
    // It must be a non-conditional move
    if (BC.MIB->isConditionalMove(OriginalInst))
      continue;

    // Move immediate or move register
    if ((!BC.MII->get(OriginalInst.getOpcode()).isMoveImmediate() ||
         !OriginalInst.getOperand(1).isImm()) &&
        (!BC.MII->get(OriginalInst.getOpcode()).isMoveReg() ||
         !OriginalInst.getOperand(1).isReg()))
      continue;

    // True if this is constant propagation and not copy propagation
    bool ConstantProp = BC.MII->get(OriginalInst.getOpcode()).isMoveImmediate();
    // The register to be replaced (the move's destination, operand 0)
    unsigned Reg = OriginalInst.getOperand(0).getReg();
    // True if the register to replace was replaced everywhere it was used
    bool ReplacedEverywhere = true;
    // True if the register was definitely overwritten
    bool Overwritten = false;
    // True if the register to replace and the register to replace with (for
    // copy propagation) has not been overwritten and is still usable
    bool RegsActive = true;

    // Iterate through successor blocks and through their instructions
    for (BinaryBasicBlock *NextBB : BlocksToPropagate) {
      // In OriginalBB, start right after the move itself.
      for (auto PropagateItr =
               ((NextBB == &OriginalBB) ? Itr + 1 : NextBB->begin());
           PropagateItr < NextBB->end(); ++PropagateItr) {
        MCInst &PropagateInst = *PropagateItr;
        if (regIsUsed(PropagateInst, Reg, BC)) {
          bool Replaced = false;
          // If both registers are active for copy propagation or the register
          // to replace is active for constant propagation
          if (RegsActive) {
            // Set Replaced and so ReplacedEverywhere to false if it cannot be
            // replaced (no replacing that opcode, Register is src and dest)
            if (ConstantProp)
              Replaced = BC.MIB->replaceRegWithImm(
                  PropagateInst, Reg, OriginalInst.getOperand(1).getImm());
            else
              Replaced = BC.MIB->replaceRegWithReg(
                  PropagateInst, Reg, OriginalInst.getOperand(1).getReg());
          }
          ReplacedEverywhere = ReplacedEverywhere && Replaced;
        }
        // For copy propagation, make sure no propagation happens after the
        // register to replace with is overwritten
        if (!ConstantProp &&
            regIsPossiblyOverwritten(PropagateInst,
                                     OriginalInst.getOperand(1).getReg(), BC))
          RegsActive = false;

        // Make sure no propagation happens after the register to replace is
        // overwritten
        if (regIsPossiblyOverwritten(PropagateInst, Reg, BC))
          RegsActive = false;

        // Record if the register to replace is overwritten
        if (regIsDefinitelyOverwritten(PropagateInst, Reg, BC)) {
          Overwritten = true;
          break;
        }
      }
      if (Overwritten)
        break;
    }

    // If the register was replaced everywhere and it was overwritten in either
    // one of the iterated through blocks or one of the successor blocks, delete
    // the original move instruction
    if (ReplacedEverywhere &&
        (Overwritten ||
         isOverwrittenBeforeUsed(
             *BlocksToPropagate[BlocksToPropagate.size() - 1], Reg))) {
      StaticInstructionDeletionCount++;
      DynamicInstructionDeletionCount += OriginalBB.getExecutionCount();
      // Step back so the ++Itr of the loop lands on the next instruction.
      Itr = std::prev(OriginalBB.eraseInstruction(Itr));
    }
  }
}
254 | |
255 | bool TailDuplication::isInCacheLine(const BinaryBasicBlock &BB, |
256 | const BinaryBasicBlock &Succ) const { |
257 | if (&BB == &Succ) |
258 | return true; |
259 | |
260 | uint64_t Distance = 0; |
261 | int Direction = (Succ.getLayoutIndex() > BB.getLayoutIndex()) ? 1 : -1; |
262 | |
263 | for (unsigned I = BB.getLayoutIndex() + Direction; I != Succ.getLayoutIndex(); |
264 | I += Direction) { |
265 | Distance += BB.getFunction()->getLayout().getBlock(Index: I)->getOriginalSize(); |
266 | if (Distance > opts::TailDuplicationMinimumOffset) |
267 | return false; |
268 | } |
269 | return true; |
270 | } |
271 | |
272 | std::vector<BinaryBasicBlock *> |
273 | TailDuplication::moderateDuplicate(BinaryBasicBlock &BB, |
274 | BinaryBasicBlock &Tail) const { |
275 | std::vector<BinaryBasicBlock *> BlocksToDuplicate; |
276 | // The block must be hot |
277 | if (BB.getKnownExecutionCount() == 0) |
278 | return BlocksToDuplicate; |
279 | // and its successor is not already in the same cache line |
280 | if (isInCacheLine(BB, Succ: Tail)) |
281 | return BlocksToDuplicate; |
282 | // and its size do not exceed the maximum allowed size |
283 | if (Tail.getOriginalSize() > opts::TailDuplicationMaximumDuplication) |
284 | return BlocksToDuplicate; |
285 | // If duplicating would introduce a new branch, don't duplicate |
286 | for (auto Itr = Tail.succ_begin(); Itr != Tail.succ_end(); ++Itr) { |
287 | if ((*Itr)->getLayoutIndex() == Tail.getLayoutIndex() + 1) |
288 | return BlocksToDuplicate; |
289 | } |
290 | |
291 | BlocksToDuplicate.push_back(x: &Tail); |
292 | return BlocksToDuplicate; |
293 | } |
294 | |
// Aggressive strategy: starting from \p Tail, collect the maximal chain of
// blocks connected by layout fall-through edges that could be copied after
// \p BB. The whole candidate list is discarded (empty vector returned) when
// a jump table is encountered, when no simple stream of blocks can be
// identified, or when the combined size exceeds the duplication limit.
std::vector<BinaryBasicBlock *>
TailDuplication::aggressiveDuplicate(BinaryBasicBlock &BB,
                                     BinaryBasicBlock &Tail) const {
  std::vector<BinaryBasicBlock *> BlocksToDuplicate;
  // The block must be hot
  if (BB.getKnownExecutionCount() == 0)
    return BlocksToDuplicate;
  // and its successor is not already in the same cache line
  if (isInCacheLine(BB, Tail))
    return BlocksToDuplicate;

  BinaryBasicBlock *CurrBB = &Tail;
  while (CurrBB) {
    LLVM_DEBUG(dbgs() << "Aggressive tail duplication: adding "
                      << CurrBB->getName() << " to duplication list\n";);
    BlocksToDuplicate.push_back(CurrBB);

    // Jump tables cannot be duplicated; abandon the whole chain.
    if (CurrBB->hasJumpTable()) {
      LLVM_DEBUG(dbgs() << "Aggressive tail duplication: clearing duplication "
                           "list due to a JT in "
                        << CurrBB->getName() << '\n';);
      BlocksToDuplicate.clear();
      break;
    }

    // With no successors, we've reached the end and should duplicate all of
    // BlocksToDuplicate
    if (CurrBB->succ_size() == 0)
      break;

    // With two successors, if they're both a jump, we should duplicate all
    // blocks in BlocksToDuplicate. Otherwise, we cannot find a simple stream of
    // blocks to copy
    if (CurrBB->succ_size() >= 2) {
      // A successor that is the layout fall-through means the stream is not
      // simple: copying would change that fall-through into a branch.
      if (CurrBB->getConditionalSuccessor(false)->getLayoutIndex() ==
              CurrBB->getLayoutIndex() + 1 ||
          CurrBB->getConditionalSuccessor(true)->getLayoutIndex() ==
              CurrBB->getLayoutIndex() + 1) {
        LLVM_DEBUG(dbgs() << "Aggressive tail duplication: clearing "
                             "duplication list, can't find a simple stream at "
                          << CurrBB->getName() << '\n';);
        BlocksToDuplicate.clear();
      }
      break;
    }

    // With one successor, if its a jump, we should duplicate all blocks in
    // BlocksToDuplicate. Otherwise, we should keep going
    BinaryBasicBlock *SuccBB = CurrBB->getSuccessor();
    if (SuccBB->getLayoutIndex() != CurrBB->getLayoutIndex() + 1)
      break;
    CurrBB = SuccBB;
  }
  // Don't duplicate if its too much code
  unsigned DuplicationByteCount = std::accumulate(
      std::begin(BlocksToDuplicate), std::end(BlocksToDuplicate), 0,
      [](int value, BinaryBasicBlock *p) {
        return value + p->getOriginalSize();
      });
  if (DuplicationByteCount > opts::TailDuplicationMaximumDuplication) {
    LLVM_DEBUG(dbgs() << "Aggressive tail duplication: duplication byte count ("
                      << DuplicationByteCount << ") exceeds maximum "
                      << opts::TailDuplicationMaximumDuplication << '\n';);
    BlocksToDuplicate.clear();
  }
  LLVM_DEBUG(dbgs() << "Aggressive tail duplication: found "
                    << BlocksToDuplicate.size() << " blocks to duplicate\n";);
  return BlocksToDuplicate;
}
364 | |
365 | bool TailDuplication::shouldDuplicate(BinaryBasicBlock *Pred, |
366 | BinaryBasicBlock *Tail) const { |
367 | if (Pred == Tail) |
368 | return false; |
369 | // Cannot duplicate non-tail blocks |
370 | if (Tail->succ_size() != 0) |
371 | return false; |
372 | // The blocks are already in the order |
373 | if (Pred->getLayoutIndex() + 1 == Tail->getLayoutIndex()) |
374 | return false; |
375 | // No tail duplication for blocks with jump tables |
376 | if (Pred->hasJumpTable()) |
377 | return false; |
378 | if (Tail->hasJumpTable()) |
379 | return false; |
380 | |
381 | return true; |
382 | } |
383 | |
384 | double TailDuplication::cacheScore(uint64_t SrcAddr, uint64_t SrcSize, |
385 | uint64_t DstAddr, uint64_t DstSize, |
386 | uint64_t Count) const { |
387 | assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE); |
388 | |
389 | bool IsForwardJump = SrcAddr <= DstAddr; |
390 | uint64_t JumpDistance = 0; |
391 | // Computing the length of the jump so that it takes the sizes of the two |
392 | // blocks into consideration |
393 | if (IsForwardJump) { |
394 | JumpDistance = (DstAddr + DstSize) - (SrcAddr); |
395 | } else { |
396 | JumpDistance = (SrcAddr + SrcSize) - (DstAddr); |
397 | } |
398 | |
399 | if (JumpDistance >= opts::TailDuplicationMaxCacheDistance) |
400 | return 0; |
401 | double Prob = 1.0 - static_cast<double>(JumpDistance) / |
402 | opts::TailDuplicationMaxCacheDistance; |
403 | return (IsForwardJump ? 1.0 : opts::TailDuplicationCacheBackwardWeight) * |
404 | Prob * Count; |
405 | } |
406 | |
// Return true if duplicating \p Tail immediately after \p Pred would raise
// the function's aggregate ExtTSP cache score. Block sizes are estimated
// once; addresses are then derived for the current layout and for the
// hypothetical layout with a copy of Tail after Pred, and the summed
// cacheScore() over all inter-block branches is compared.
bool TailDuplication::cacheScoreImproved(const MCCodeEmitter *Emitter,
                                         BinaryFunction &BF,
                                         BinaryBasicBlock *Pred,
                                         BinaryBasicBlock *Tail) const {
  // Collect (estimated) basic block sizes
  DenseMap<const BinaryBasicBlock *, uint64_t> BBSize;
  for (const BinaryBasicBlock &BB : BF) {
    // Clamp to 1 byte so empty blocks still advance addresses.
    BBSize[&BB] = std::max<uint64_t>(BB.estimateSize(Emitter), 1);
  }

  // Build current addresses of basic blocks starting at the entry block
  DenseMap<BinaryBasicBlock *, uint64_t> CurAddr;
  uint64_t Addr = 0;
  for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
    CurAddr[SrcBB] = Addr;
    Addr += BBSize[SrcBB];
  }

  // Build new addresses (after duplication) starting at the entry block
  DenseMap<BinaryBasicBlock *, uint64_t> NewAddr;
  Addr = 0;
  for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
    NewAddr[SrcBB] = Addr;
    Addr += BBSize[SrcBB];
    // Account for the copy of Tail placed right after Pred.
    if (SrcBB == Pred)
      Addr += BBSize[Tail];
  }

  // Compute the cache score for the existing layout of basic blocks
  double CurScore = 0;
  for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
    auto BI = SrcBB->branch_info_begin();
    for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
      // Self-loops are ignored; BI stays in lock-step with successors().
      if (SrcBB != DstBB) {
        CurScore += cacheScore(CurAddr[SrcBB], BBSize[SrcBB], CurAddr[DstBB],
                               BBSize[DstBB], BI->Count);
      }
      ++BI;
    }
  }

  // Compute the cache score for the layout of blocks after tail duplication
  double NewScore = 0;
  for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) {
    auto BI = SrcBB->branch_info_begin();
    for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
      if (SrcBB != DstBB) {
        if (SrcBB == Pred && DstBB == Tail) {
          // The edge Pred->Tail now targets the copy placed directly after
          // Pred, i.e. at Pred's new address plus Pred's size.
          NewScore += cacheScore(NewAddr[SrcBB], BBSize[SrcBB],
                                 NewAddr[SrcBB] + BBSize[SrcBB], BBSize[DstBB],
                                 BI->Count);
        } else {
          NewScore += cacheScore(NewAddr[SrcBB], BBSize[SrcBB], NewAddr[DstBB],
                                 BBSize[DstBB], BI->Count);
        }
      }
      ++BI;
    }
  }

  return NewScore > CurScore;
}
469 | |
470 | std::vector<BinaryBasicBlock *> |
471 | TailDuplication::cacheDuplicate(const MCCodeEmitter *Emitter, |
472 | BinaryFunction &BF, BinaryBasicBlock *Pred, |
473 | BinaryBasicBlock *Tail) const { |
474 | std::vector<BinaryBasicBlock *> BlocksToDuplicate; |
475 | |
476 | // No need to duplicate cold basic blocks |
477 | if (Pred->isCold() || Tail->isCold()) { |
478 | return BlocksToDuplicate; |
479 | } |
480 | // Always duplicate "small" tail basic blocks, which might be beneficial for |
481 | // code size, since a jump instruction is eliminated |
482 | if (Tail->estimateSize(Emitter) <= opts::TailDuplicationMinimumDuplication) { |
483 | BlocksToDuplicate.push_back(x: Tail); |
484 | return BlocksToDuplicate; |
485 | } |
486 | // Never duplicate "large" tail basic blocks |
487 | if (Tail->estimateSize(Emitter) > opts::TailDuplicationMaximumDuplication) { |
488 | return BlocksToDuplicate; |
489 | } |
490 | // Do not append basic blocks after the last hot block in the current layout |
491 | auto NextBlock = BF.getLayout().getBasicBlockAfter(BB: Pred); |
492 | if (NextBlock == nullptr || (!Pred->isCold() && NextBlock->isCold())) { |
493 | return BlocksToDuplicate; |
494 | } |
495 | |
496 | // Duplicate the tail only if it improves the cache score |
497 | if (cacheScoreImproved(Emitter, BF, Pred, Tail)) { |
498 | BlocksToDuplicate.push_back(x: Tail); |
499 | } |
500 | |
501 | return BlocksToDuplicate; |
502 | } |
503 | |
// Create copies of BlocksToDuplicate, insert them after \p BB, and rewire
// BB's single outgoing edge to the first copy. Execution counts of the
// originals and the copies are scaled by the ratio of BB's count to its
// successor's count. Returns the newly created blocks in order.
std::vector<BinaryBasicBlock *> TailDuplication::duplicateBlocks(
    BinaryBasicBlock &BB,
    const std::vector<BinaryBasicBlock *> &BlocksToDuplicate) const {
  BinaryFunction *BF = BB.getFunction();
  BinaryContext &BC = BF->getBinaryContext();

  // Ratio of this new branches execution count to the total size of the
  // successor's execution count. Used to set this new branches execution count
  // and lower the old successor's execution count
  double ExecutionCountRatio =
      BB.getExecutionCount() >= BB.getSuccessor()->getExecutionCount()
          ? 1.0
          : (double)BB.getExecutionCount() /
                BB.getSuccessor()->getExecutionCount();

  // Use the last branch info when adding a successor to LastBB
  BinaryBasicBlock::BinaryBranchInfo &LastBI =
      BB.getBranchInfo(*(BB.getSuccessor()));

  BinaryBasicBlock *LastOriginalBB = &BB;
  BinaryBasicBlock *LastDuplicatedBB = &BB;
  assert(LastDuplicatedBB->succ_size() == 1 &&
         "tail duplication cannot act on a block with more than 1 successor");
  // Detach BB from its old successor; the duplicated chain replaces it.
  LastDuplicatedBB->removeSuccessor(LastDuplicatedBB->getSuccessor());

  std::vector<std::unique_ptr<BinaryBasicBlock>> DuplicatedBlocks;
  std::vector<BinaryBasicBlock *> DuplicatedBlocksToReturn;

  for (BinaryBasicBlock *CurBB : BlocksToDuplicate) {
    DuplicatedBlocks.emplace_back(
        BF->createBasicBlock((BC.Ctx)->createNamedTempSymbol("tail-dup")));
    BinaryBasicBlock *NewBB = DuplicatedBlocks.back().get();

    NewBB->addInstructions(CurBB->begin(), CurBB->end());
    // Set execution count as if it was just a copy of the original
    NewBB->setExecutionCount(CurBB->getExecutionCount());
    NewBB->setIsCold(CurBB->isCold());
    LastDuplicatedBB->addSuccessor(NewBB, LastBI);

    DuplicatedBlocksToReturn.push_back(NewBB);

    // As long as its not the first block, adjust both original and duplicated
    // to what they should be
    if (LastDuplicatedBB != &BB) {
      LastOriginalBB->adjustExecutionCount(1.0 - ExecutionCountRatio);
      LastDuplicatedBB->adjustExecutionCount(ExecutionCountRatio);
    }

    // NOTE(review): LastBI is a reference into BB's branch info, so this
    // assignment writes CurBB's branch info back through it — presumably
    // intended to carry the branch counts forward; verify.
    if (CurBB->succ_size() == 1)
      LastBI = CurBB->getBranchInfo(*(CurBB->getSuccessor()));

    LastOriginalBB = CurBB;
    LastDuplicatedBB = NewBB;
  }

  // Give the last copy the same outgoing edges as the last original block.
  LastDuplicatedBB->addSuccessors(
      LastOriginalBB->succ_begin(), LastOriginalBB->succ_end(),
      LastOriginalBB->branch_info_begin(), LastOriginalBB->branch_info_end());

  LastOriginalBB->adjustExecutionCount(1.0 - ExecutionCountRatio);
  LastDuplicatedBB->adjustExecutionCount(ExecutionCountRatio);

  BF->insertBasicBlocks(&BB, std::move(DuplicatedBlocks));

  return DuplicatedBlocksToReturn;
}
570 | |
// Apply the configured tail-duplication strategy to each block of
// \p Function that has exactly one successor, performing the duplication,
// optional const/copy propagation, and statistics bookkeeping.
void TailDuplication::runOnFunction(BinaryFunction &Function) {
  // Create a separate MCCodeEmitter to allow lock-free execution
  BinaryContext::IndependentCodeEmitter Emitter;
  if (!opts::NoThreads) {
    Emitter = Function.getBinaryContext().createIndependentMCCodeEmitter();
  }

  Function.getLayout().updateLayoutIndices();

  // New blocks will be added and layout will change,
  // so make a copy here to iterate over the original layout
  BinaryFunction::BasicBlockOrderType BlockLayout(
      Function.getLayout().block_begin(), Function.getLayout().block_end());
  bool ModifiedFunction = false;
  for (BinaryBasicBlock *BB : BlockLayout) {
    // Total dynamic block executions, denominator for the final report.
    AllDynamicCount += BB->getKnownExecutionCount();

    // The block must be with one successor
    if (BB->succ_size() != 1)
      continue;
    BinaryBasicBlock *Tail = BB->getSuccessor();
    // Verify that the tail should be duplicated
    if (!shouldDuplicate(BB, Tail))
      continue;

    // Pick candidate blocks according to the selected strategy.
    std::vector<BinaryBasicBlock *> BlocksToDuplicate;
    if (opts::TailDuplicationMode == TailDuplication::TD_AGGRESSIVE) {
      BlocksToDuplicate = aggressiveDuplicate(*BB, *Tail);
    } else if (opts::TailDuplicationMode == TailDuplication::TD_MODERATE) {
      BlocksToDuplicate = moderateDuplicate(*BB, *Tail);
    } else if (opts::TailDuplicationMode == TailDuplication::TD_CACHE) {
      BlocksToDuplicate = cacheDuplicate(Emitter.MCE.get(), Function, BB, Tail);
    } else {
      llvm_unreachable("unknown tail duplication mode");
    }

    if (BlocksToDuplicate.empty())
      continue;

    // Apply the duplication
    ModifiedFunction = true;
    DuplicationsDynamicCount += BB->getExecutionCount();
    auto DuplicatedBlocks = duplicateBlocks(*BB, BlocksToDuplicate);
    for (BinaryBasicBlock *BB : DuplicatedBlocks) {
      DuplicatedBlockCount++;
      DuplicatedByteCount += BB->estimateSize(Emitter.MCE.get());
    }

    // Optionally clean up the copies (and their unique predecessor).
    if (opts::TailDuplicationConstCopyPropagation) {
      constantAndCopyPropagate(*BB, DuplicatedBlocks);
      BinaryBasicBlock *FirstBB = BlocksToDuplicate[0];
      if (FirstBB->pred_size() == 1) {
        BinaryBasicBlock *PredBB = *FirstBB->pred_begin();
        if (PredBB->succ_size() == 1)
          constantAndCopyPropagate(*PredBB, BlocksToDuplicate);
      }
    }

    // Layout indices might be stale after duplication
    Function.getLayout().updateLayoutIndices();
  }
  if (ModifiedFunction)
    ModifiedFunctions++;
}
635 | |
// Pass entry point: run tail duplication over every function selected for
// optimization and print the aggregated statistics. Returns success always.
Error TailDuplication::runOnFunctions(BinaryContext &BC) {
  if (opts::TailDuplicationMode == TailDuplication::TD_NONE)
    return Error::success();

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    if (!shouldOptimize(Function))
      continue;
    runOnFunction(Function);
  }

  // NOTE(review): the percentages divide by the function count and by
  // AllDynamicCount; if either is zero the printed value is nan/inf.
  BC.outs()
      << "BOLT-INFO: tail duplication"
      << format(" modified %zu (%.2f%%) functions;", ModifiedFunctions,
                100.0 * ModifiedFunctions / BC.getBinaryFunctions().size())
      << format(" duplicated %zu blocks (%zu bytes) responsible for",
                DuplicatedBlockCount, DuplicatedByteCount)
      << format(" %zu dynamic executions (%.2f%% of all block executions)",
                DuplicationsDynamicCount,
                100.0 * DuplicationsDynamicCount / AllDynamicCount)
      << "\n";

  if (opts::TailDuplicationConstCopyPropagation) {
    BC.outs() << "BOLT-INFO: tail duplication "
              << format(
                     "applied %zu static and %zu dynamic propagation deletions",
                     StaticInstructionDeletionCount,
                     DynamicInstructionDeletionCount)
              << "\n";
  }
  return Error::success();
}
668 | |
669 | } // end namespace bolt |
670 | } // end namespace llvm |
671 | |