| 1 | //===- bolt/Passes/TailDuplication.cpp ------------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements the TailDuplication class. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #include "bolt/Passes/TailDuplication.h" |
| 14 | #include "llvm/ADT/DenseMap.h" |
| 15 | #include "llvm/MC/MCRegisterInfo.h" |
| 16 | |
| 17 | #include <numeric> |
| 18 | #include <queue> |
| 19 | |
| 20 | #define DEBUG_TYPE "taildup" |
| 21 | |
| 22 | using namespace llvm; |
| 23 | |
| 24 | namespace opts { |
| 25 | |
| 26 | extern cl::OptionCategory BoltOptCategory; |
| 27 | extern cl::opt<bool> NoThreads; |
| 28 | |
| 29 | static cl::opt<bolt::TailDuplication::DuplicationMode> TailDuplicationMode( |
| 30 | "tail-duplication" , |
| 31 | cl::desc("duplicate unconditional branches that cross a cache line" ), |
| 32 | cl::init(Val: bolt::TailDuplication::TD_NONE), |
| 33 | cl::values(clEnumValN(bolt::TailDuplication::TD_NONE, "none" , |
| 34 | "do not apply" ), |
| 35 | clEnumValN(bolt::TailDuplication::TD_AGGRESSIVE, "aggressive" , |
| 36 | "aggressive strategy" ), |
| 37 | clEnumValN(bolt::TailDuplication::TD_MODERATE, "moderate" , |
| 38 | "moderate strategy" ), |
| 39 | clEnumValN(bolt::TailDuplication::TD_CACHE, "cache" , |
| 40 | "cache-aware duplication strategy" )), |
| 41 | cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); |
| 42 | |
| 43 | static cl::opt<unsigned> |
| 44 | TailDuplicationMinimumOffset("tail-duplication-minimum-offset" , |
| 45 | cl::desc("minimum offset needed between block " |
| 46 | "and successor to allow duplication" ), |
| 47 | cl::ReallyHidden, cl::init(Val: 64), |
| 48 | cl::cat(BoltOptCategory)); |
| 49 | |
| 50 | static cl::opt<unsigned> TailDuplicationMaximumDuplication( |
| 51 | "tail-duplication-maximum-duplication" , |
| 52 | cl::desc("tail blocks whose size (in bytes) exceeds the value are never " |
| 53 | "duplicated" ), |
| 54 | cl::ZeroOrMore, cl::ReallyHidden, cl::init(Val: 24), cl::cat(BoltOptCategory)); |
| 55 | |
| 56 | static cl::opt<unsigned> TailDuplicationMinimumDuplication( |
| 57 | "tail-duplication-minimum-duplication" , |
| 58 | cl::desc("tail blocks with size (in bytes) not exceeding the value are " |
| 59 | "always duplicated" ), |
| 60 | cl::ReallyHidden, cl::init(Val: 2), cl::cat(BoltOptCategory)); |
| 61 | |
| 62 | static cl::opt<bool> TailDuplicationConstCopyPropagation( |
| 63 | "tail-duplication-const-copy-propagation" , |
| 64 | cl::desc("enable const and copy propagation after tail duplication" ), |
| 65 | cl::ReallyHidden, cl::init(Val: false), cl::cat(BoltOptCategory)); |
| 66 | |
| 67 | static cl::opt<unsigned> TailDuplicationMaxCacheDistance( |
| 68 | "tail-duplication-max-cache-distance" , |
| 69 | cl::desc("The weight of backward jumps for ExtTSP value" ), cl::init(Val: 256), |
| 70 | cl::ReallyHidden, cl::cat(BoltOptCategory)); |
| 71 | |
| 72 | static cl::opt<double> TailDuplicationCacheBackwardWeight( |
| 73 | "tail-duplication-cache-backward-weight" , |
| 74 | cl::desc( |
| 75 | "The maximum distance (in bytes) of backward jumps for ExtTSP value" ), |
| 76 | cl::init(Val: 0.5), cl::ReallyHidden, cl::cat(BoltOptCategory)); |
| 77 | |
| 78 | } // namespace opts |
| 79 | |
| 80 | namespace llvm { |
| 81 | namespace bolt { |
| 82 | |
| 83 | void TailDuplication::getCallerSavedRegs(const MCInst &Inst, BitVector &Regs, |
| 84 | BinaryContext &BC) const { |
| 85 | if (!BC.MIB->isCall(Inst)) |
| 86 | return; |
| 87 | BitVector CallRegs = BitVector(BC.MRI->getNumRegs(), false); |
| 88 | BC.MIB->getCalleeSavedRegs(Regs&: CallRegs); |
| 89 | CallRegs.flip(); |
| 90 | Regs |= CallRegs; |
| 91 | } |
| 92 | |
| 93 | bool TailDuplication::regIsPossiblyOverwritten(const MCInst &Inst, unsigned Reg, |
| 94 | BinaryContext &BC) const { |
| 95 | BitVector WrittenRegs = BitVector(BC.MRI->getNumRegs(), false); |
| 96 | BC.MIB->getWrittenRegs(Inst, Regs&: WrittenRegs); |
| 97 | getCallerSavedRegs(Inst, Regs&: WrittenRegs, BC); |
| 98 | if (BC.MIB->isRep(Inst)) |
| 99 | BC.MIB->getRepRegs(Regs&: WrittenRegs); |
| 100 | WrittenRegs &= BC.MIB->getAliases(Reg, OnlySmaller: false); |
| 101 | return WrittenRegs.any(); |
| 102 | } |
| 103 | |
| 104 | bool TailDuplication::regIsDefinitelyOverwritten(const MCInst &Inst, |
| 105 | unsigned Reg, |
| 106 | BinaryContext &BC) const { |
| 107 | BitVector WrittenRegs = BitVector(BC.MRI->getNumRegs(), false); |
| 108 | BC.MIB->getWrittenRegs(Inst, Regs&: WrittenRegs); |
| 109 | getCallerSavedRegs(Inst, Regs&: WrittenRegs, BC); |
| 110 | if (BC.MIB->isRep(Inst)) |
| 111 | BC.MIB->getRepRegs(Regs&: WrittenRegs); |
| 112 | return (!regIsUsed(Inst, Reg, BC) && WrittenRegs.test(Idx: Reg) && |
| 113 | !BC.MIB->isConditionalMove(Inst)); |
| 114 | } |
| 115 | |
| 116 | bool TailDuplication::regIsUsed(const MCInst &Inst, unsigned Reg, |
| 117 | BinaryContext &BC) const { |
| 118 | BitVector SrcRegs = BitVector(BC.MRI->getNumRegs(), false); |
| 119 | BC.MIB->getSrcRegs(Inst, Regs&: SrcRegs); |
| 120 | SrcRegs &= BC.MIB->getAliases(Reg, OnlySmaller: true); |
| 121 | return SrcRegs.any(); |
| 122 | } |
| 123 | |
| 124 | bool TailDuplication::isOverwrittenBeforeUsed(BinaryBasicBlock &StartBB, |
| 125 | unsigned Reg) const { |
| 126 | BinaryFunction *BF = StartBB.getFunction(); |
| 127 | BinaryContext &BC = BF->getBinaryContext(); |
| 128 | std::queue<BinaryBasicBlock *> Q; |
| 129 | for (auto Itr = StartBB.succ_begin(); Itr != StartBB.succ_end(); ++Itr) { |
| 130 | BinaryBasicBlock *NextBB = *Itr; |
| 131 | Q.push(x: NextBB); |
| 132 | } |
| 133 | std::set<BinaryBasicBlock *> Visited; |
| 134 | // Breadth first search through successive blocks and see if Reg is ever used |
| 135 | // before its overwritten |
| 136 | while (Q.size() > 0) { |
| 137 | BinaryBasicBlock *CurrBB = Q.front(); |
| 138 | Q.pop(); |
| 139 | if (Visited.count(x: CurrBB)) |
| 140 | continue; |
| 141 | Visited.insert(x: CurrBB); |
| 142 | bool Overwritten = false; |
| 143 | for (auto Itr = CurrBB->begin(); Itr != CurrBB->end(); ++Itr) { |
| 144 | MCInst &Inst = *Itr; |
| 145 | if (regIsUsed(Inst, Reg, BC)) |
| 146 | return false; |
| 147 | if (regIsDefinitelyOverwritten(Inst, Reg, BC)) { |
| 148 | Overwritten = true; |
| 149 | break; |
| 150 | } |
| 151 | } |
| 152 | if (Overwritten) |
| 153 | continue; |
| 154 | for (auto Itr = CurrBB->succ_begin(); Itr != CurrBB->succ_end(); ++Itr) { |
| 155 | BinaryBasicBlock *NextBB = *Itr; |
| 156 | Q.push(x: NextBB); |
| 157 | } |
| 158 | } |
| 159 | return true; |
| 160 | } |
| 161 | |
| 162 | void TailDuplication::constantAndCopyPropagate( |
| 163 | BinaryBasicBlock &OriginalBB, |
| 164 | std::vector<BinaryBasicBlock *> &BlocksToPropagate) { |
| 165 | BinaryFunction *BF = OriginalBB.getFunction(); |
| 166 | BinaryContext &BC = BF->getBinaryContext(); |
| 167 | |
| 168 | BlocksToPropagate.insert(position: BlocksToPropagate.begin(), x: &OriginalBB); |
| 169 | // Iterate through the original instructions to find one to propagate |
| 170 | for (auto Itr = OriginalBB.begin(); Itr != OriginalBB.end(); ++Itr) { |
| 171 | MCInst &OriginalInst = *Itr; |
| 172 | // It must be a non conditional |
| 173 | if (BC.MIB->isConditionalMove(Inst: OriginalInst)) |
| 174 | continue; |
| 175 | |
| 176 | // Move immediate or move register |
| 177 | if ((!BC.MII->get(Opcode: OriginalInst.getOpcode()).isMoveImmediate() || |
| 178 | !OriginalInst.getOperand(i: 1).isImm()) && |
| 179 | (!BC.MII->get(Opcode: OriginalInst.getOpcode()).isMoveReg() || |
| 180 | !OriginalInst.getOperand(i: 1).isReg())) |
| 181 | continue; |
| 182 | |
| 183 | // True if this is constant propagation and not copy propagation |
| 184 | bool ConstantProp = BC.MII->get(Opcode: OriginalInst.getOpcode()).isMoveImmediate(); |
| 185 | // The Register to replaced |
| 186 | unsigned Reg = OriginalInst.getOperand(i: 0).getReg(); |
| 187 | // True if the register to replace was replaced everywhere it was used |
| 188 | bool ReplacedEverywhere = true; |
| 189 | // True if the register was definitely overwritten |
| 190 | bool Overwritten = false; |
| 191 | // True if the register to replace and the register to replace with (for |
| 192 | // copy propagation) has not been overwritten and is still usable |
| 193 | bool RegsActive = true; |
| 194 | |
| 195 | // Iterate through successor blocks and through their instructions |
| 196 | for (BinaryBasicBlock *NextBB : BlocksToPropagate) { |
| 197 | for (auto PropagateItr = |
| 198 | ((NextBB == &OriginalBB) ? Itr + 1 : NextBB->begin()); |
| 199 | PropagateItr < NextBB->end(); ++PropagateItr) { |
| 200 | MCInst &PropagateInst = *PropagateItr; |
| 201 | if (regIsUsed(Inst: PropagateInst, Reg, BC)) { |
| 202 | bool Replaced = false; |
| 203 | // If both registers are active for copy propagation or the register |
| 204 | // to replace is active for constant propagation |
| 205 | if (RegsActive) { |
| 206 | // Set Replaced and so ReplacedEverwhere to false if it cannot be |
| 207 | // replaced (no replacing that opcode, Register is src and dest) |
| 208 | if (ConstantProp) |
| 209 | Replaced = BC.MIB->replaceRegWithImm( |
| 210 | Inst&: PropagateInst, Register: Reg, Imm: OriginalInst.getOperand(i: 1).getImm()); |
| 211 | else |
| 212 | Replaced = BC.MIB->replaceRegWithReg( |
| 213 | Inst&: PropagateInst, ToReplace: Reg, ReplaceWith: OriginalInst.getOperand(i: 1).getReg()); |
| 214 | } |
| 215 | ReplacedEverywhere = ReplacedEverywhere && Replaced; |
| 216 | } |
| 217 | // For copy propagation, make sure no propagation happens after the |
| 218 | // register to replace with is overwritten |
| 219 | if (!ConstantProp && |
| 220 | regIsPossiblyOverwritten(Inst: PropagateInst, |
| 221 | Reg: OriginalInst.getOperand(i: 1).getReg(), BC)) |
| 222 | RegsActive = false; |
| 223 | |
| 224 | // Make sure no propagation happens after the register to replace is |
| 225 | // overwritten |
| 226 | if (regIsPossiblyOverwritten(Inst: PropagateInst, Reg, BC)) |
| 227 | RegsActive = false; |
| 228 | |
| 229 | // Record if the register to replace is overwritten |
| 230 | if (regIsDefinitelyOverwritten(Inst: PropagateInst, Reg, BC)) { |
| 231 | Overwritten = true; |
| 232 | break; |
| 233 | } |
| 234 | } |
| 235 | if (Overwritten) |
| 236 | break; |
| 237 | } |
| 238 | |
| 239 | // If the register was replaced everywhere and it was overwritten in either |
| 240 | // one of the iterated through blocks or one of the successor blocks, delete |
| 241 | // the original move instruction |
| 242 | if (ReplacedEverywhere && |
| 243 | (Overwritten || |
| 244 | isOverwrittenBeforeUsed( |
| 245 | StartBB&: *BlocksToPropagate[BlocksToPropagate.size() - 1], Reg))) { |
| 246 | // If both registers are active for copy propagation or the register |
| 247 | // to replace is active for constant propagation |
| 248 | StaticInstructionDeletionCount++; |
| 249 | DynamicInstructionDeletionCount += OriginalBB.getExecutionCount(); |
| 250 | Itr = std::prev(x: OriginalBB.eraseInstruction(II: Itr)); |
| 251 | } |
| 252 | } |
| 253 | } |
| 254 | |
| 255 | bool TailDuplication::isInCacheLine(const BinaryBasicBlock &BB, |
| 256 | const BinaryBasicBlock &Succ) const { |
| 257 | if (&BB == &Succ) |
| 258 | return true; |
| 259 | |
| 260 | uint64_t Distance = 0; |
| 261 | int Direction = (Succ.getLayoutIndex() > BB.getLayoutIndex()) ? 1 : -1; |
| 262 | |
| 263 | for (unsigned I = BB.getLayoutIndex() + Direction; I != Succ.getLayoutIndex(); |
| 264 | I += Direction) { |
| 265 | Distance += BB.getFunction()->getLayout().getBlock(Index: I)->getOriginalSize(); |
| 266 | if (Distance > opts::TailDuplicationMinimumOffset) |
| 267 | return false; |
| 268 | } |
| 269 | return true; |
| 270 | } |
| 271 | |
| 272 | std::vector<BinaryBasicBlock *> |
| 273 | TailDuplication::moderateDuplicate(BinaryBasicBlock &BB, |
| 274 | BinaryBasicBlock &Tail) const { |
| 275 | std::vector<BinaryBasicBlock *> BlocksToDuplicate; |
| 276 | // The block must be hot |
| 277 | if (BB.getKnownExecutionCount() == 0) |
| 278 | return BlocksToDuplicate; |
| 279 | // and its successor is not already in the same cache line |
| 280 | if (isInCacheLine(BB, Succ: Tail)) |
| 281 | return BlocksToDuplicate; |
| 282 | // and its size do not exceed the maximum allowed size |
| 283 | if (Tail.getOriginalSize() > opts::TailDuplicationMaximumDuplication) |
| 284 | return BlocksToDuplicate; |
| 285 | // If duplicating would introduce a new branch, don't duplicate |
| 286 | for (auto Itr = Tail.succ_begin(); Itr != Tail.succ_end(); ++Itr) { |
| 287 | if ((*Itr)->getLayoutIndex() == Tail.getLayoutIndex() + 1) |
| 288 | return BlocksToDuplicate; |
| 289 | } |
| 290 | |
| 291 | BlocksToDuplicate.push_back(x: &Tail); |
| 292 | return BlocksToDuplicate; |
| 293 | } |
| 294 | |
| 295 | std::vector<BinaryBasicBlock *> |
| 296 | TailDuplication::aggressiveDuplicate(BinaryBasicBlock &BB, |
| 297 | BinaryBasicBlock &Tail) const { |
| 298 | std::vector<BinaryBasicBlock *> BlocksToDuplicate; |
| 299 | // The block must be hot |
| 300 | if (BB.getKnownExecutionCount() == 0) |
| 301 | return BlocksToDuplicate; |
| 302 | // and its successor is not already in the same cache line |
| 303 | if (isInCacheLine(BB, Succ: Tail)) |
| 304 | return BlocksToDuplicate; |
| 305 | |
| 306 | BinaryBasicBlock *CurrBB = &Tail; |
| 307 | while (CurrBB) { |
| 308 | LLVM_DEBUG(dbgs() << "Aggressive tail duplication: adding " |
| 309 | << CurrBB->getName() << " to duplication list\n" ;); |
| 310 | BlocksToDuplicate.push_back(x: CurrBB); |
| 311 | |
| 312 | if (CurrBB->hasJumpTable()) { |
| 313 | LLVM_DEBUG(dbgs() << "Aggressive tail duplication: clearing duplication " |
| 314 | "list due to a JT in " |
| 315 | << CurrBB->getName() << '\n';); |
| 316 | BlocksToDuplicate.clear(); |
| 317 | break; |
| 318 | } |
| 319 | |
| 320 | // With no successors, we've reached the end and should duplicate all of |
| 321 | // BlocksToDuplicate |
| 322 | if (CurrBB->succ_size() == 0) |
| 323 | break; |
| 324 | |
| 325 | // With two successors, if they're both a jump, we should duplicate all |
| 326 | // blocks in BlocksToDuplicate. Otherwise, we cannot find a simple stream of |
| 327 | // blocks to copy |
| 328 | if (CurrBB->succ_size() >= 2) { |
| 329 | if (CurrBB->getConditionalSuccessor(Condition: false)->getLayoutIndex() == |
| 330 | CurrBB->getLayoutIndex() + 1 || |
| 331 | CurrBB->getConditionalSuccessor(Condition: true)->getLayoutIndex() == |
| 332 | CurrBB->getLayoutIndex() + 1) { |
| 333 | LLVM_DEBUG(dbgs() << "Aggressive tail duplication: clearing " |
| 334 | "duplication list, can't find a simple stream at " |
| 335 | << CurrBB->getName() << '\n';); |
| 336 | BlocksToDuplicate.clear(); |
| 337 | } |
| 338 | break; |
| 339 | } |
| 340 | |
| 341 | // With one successor, if its a jump, we should duplicate all blocks in |
| 342 | // BlocksToDuplicate. Otherwise, we should keep going |
| 343 | BinaryBasicBlock *SuccBB = CurrBB->getSuccessor(); |
| 344 | if (SuccBB->getLayoutIndex() != CurrBB->getLayoutIndex() + 1) |
| 345 | break; |
| 346 | CurrBB = SuccBB; |
| 347 | } |
| 348 | // Don't duplicate if its too much code |
| 349 | unsigned DuplicationByteCount = std::accumulate( |
| 350 | first: std::begin(cont&: BlocksToDuplicate), last: std::end(cont&: BlocksToDuplicate), init: 0, |
| 351 | binary_op: [](int value, BinaryBasicBlock *p) { |
| 352 | return value + p->getOriginalSize(); |
| 353 | }); |
| 354 | if (DuplicationByteCount > opts::TailDuplicationMaximumDuplication) { |
| 355 | LLVM_DEBUG(dbgs() << "Aggressive tail duplication: duplication byte count (" |
| 356 | << DuplicationByteCount << ") exceeds maximum " |
| 357 | << opts::TailDuplicationMaximumDuplication << '\n';); |
| 358 | BlocksToDuplicate.clear(); |
| 359 | } |
| 360 | LLVM_DEBUG(dbgs() << "Aggressive tail duplication: found " |
| 361 | << BlocksToDuplicate.size() << " blocks to duplicate\n" ;); |
| 362 | return BlocksToDuplicate; |
| 363 | } |
| 364 | |
| 365 | bool TailDuplication::shouldDuplicate(BinaryBasicBlock *Pred, |
| 366 | BinaryBasicBlock *Tail) const { |
| 367 | if (Pred == Tail) |
| 368 | return false; |
| 369 | // Cannot duplicate non-tail blocks |
| 370 | if (Tail->succ_size() != 0) |
| 371 | return false; |
| 372 | // The blocks are already in the order |
| 373 | if (Pred->getLayoutIndex() + 1 == Tail->getLayoutIndex()) |
| 374 | return false; |
| 375 | // No tail duplication for blocks with jump tables |
| 376 | if (Pred->hasJumpTable()) |
| 377 | return false; |
| 378 | if (Tail->hasJumpTable()) |
| 379 | return false; |
| 380 | |
| 381 | return true; |
| 382 | } |
| 383 | |
| 384 | double TailDuplication::cacheScore(uint64_t SrcAddr, uint64_t SrcSize, |
| 385 | uint64_t DstAddr, uint64_t DstSize, |
| 386 | uint64_t Count) const { |
| 387 | assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE); |
| 388 | |
| 389 | bool IsForwardJump = SrcAddr <= DstAddr; |
| 390 | uint64_t JumpDistance = 0; |
| 391 | // Computing the length of the jump so that it takes the sizes of the two |
| 392 | // blocks into consideration |
| 393 | if (IsForwardJump) { |
| 394 | JumpDistance = (DstAddr + DstSize) - (SrcAddr); |
| 395 | } else { |
| 396 | JumpDistance = (SrcAddr + SrcSize) - (DstAddr); |
| 397 | } |
| 398 | |
| 399 | if (JumpDistance >= opts::TailDuplicationMaxCacheDistance) |
| 400 | return 0; |
| 401 | double Prob = 1.0 - static_cast<double>(JumpDistance) / |
| 402 | opts::TailDuplicationMaxCacheDistance; |
| 403 | return (IsForwardJump ? 1.0 : opts::TailDuplicationCacheBackwardWeight) * |
| 404 | Prob * Count; |
| 405 | } |
| 406 | |
| 407 | bool TailDuplication::cacheScoreImproved(const MCCodeEmitter *Emitter, |
| 408 | BinaryFunction &BF, |
| 409 | BinaryBasicBlock *Pred, |
| 410 | BinaryBasicBlock *Tail) const { |
| 411 | // Collect (estimated) basic block sizes |
| 412 | DenseMap<const BinaryBasicBlock *, uint64_t> BBSize; |
| 413 | for (const BinaryBasicBlock &BB : BF) { |
| 414 | BBSize[&BB] = std::max<uint64_t>(a: BB.estimateSize(Emitter), b: 1); |
| 415 | } |
| 416 | |
| 417 | // Build current addresses of basic blocks starting at the entry block |
| 418 | DenseMap<BinaryBasicBlock *, uint64_t> CurAddr; |
| 419 | uint64_t Addr = 0; |
| 420 | for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) { |
| 421 | CurAddr[SrcBB] = Addr; |
| 422 | Addr += BBSize[SrcBB]; |
| 423 | } |
| 424 | |
| 425 | // Build new addresses (after duplication) starting at the entry block |
| 426 | DenseMap<BinaryBasicBlock *, uint64_t> NewAddr; |
| 427 | Addr = 0; |
| 428 | for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) { |
| 429 | NewAddr[SrcBB] = Addr; |
| 430 | Addr += BBSize[SrcBB]; |
| 431 | if (SrcBB == Pred) |
| 432 | Addr += BBSize[Tail]; |
| 433 | } |
| 434 | |
| 435 | // Compute the cache score for the existing layout of basic blocks |
| 436 | double CurScore = 0; |
| 437 | for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) { |
| 438 | auto BI = SrcBB->branch_info_begin(); |
| 439 | for (BinaryBasicBlock *DstBB : SrcBB->successors()) { |
| 440 | if (SrcBB != DstBB) { |
| 441 | CurScore += cacheScore(SrcAddr: CurAddr[SrcBB], SrcSize: BBSize[SrcBB], DstAddr: CurAddr[DstBB], |
| 442 | DstSize: BBSize[DstBB], Count: BI->Count); |
| 443 | } |
| 444 | ++BI; |
| 445 | } |
| 446 | } |
| 447 | |
| 448 | // Compute the cache score for the layout of blocks after tail duplication |
| 449 | double NewScore = 0; |
| 450 | for (BinaryBasicBlock *SrcBB : BF.getLayout().blocks()) { |
| 451 | auto BI = SrcBB->branch_info_begin(); |
| 452 | for (BinaryBasicBlock *DstBB : SrcBB->successors()) { |
| 453 | if (SrcBB != DstBB) { |
| 454 | if (SrcBB == Pred && DstBB == Tail) { |
| 455 | NewScore += cacheScore(SrcAddr: NewAddr[SrcBB], SrcSize: BBSize[SrcBB], |
| 456 | DstAddr: NewAddr[SrcBB] + BBSize[SrcBB], DstSize: BBSize[DstBB], |
| 457 | Count: BI->Count); |
| 458 | } else { |
| 459 | NewScore += cacheScore(SrcAddr: NewAddr[SrcBB], SrcSize: BBSize[SrcBB], DstAddr: NewAddr[DstBB], |
| 460 | DstSize: BBSize[DstBB], Count: BI->Count); |
| 461 | } |
| 462 | } |
| 463 | ++BI; |
| 464 | } |
| 465 | } |
| 466 | |
| 467 | return NewScore > CurScore; |
| 468 | } |
| 469 | |
| 470 | std::vector<BinaryBasicBlock *> |
| 471 | TailDuplication::cacheDuplicate(const MCCodeEmitter *Emitter, |
| 472 | BinaryFunction &BF, BinaryBasicBlock *Pred, |
| 473 | BinaryBasicBlock *Tail) const { |
| 474 | std::vector<BinaryBasicBlock *> BlocksToDuplicate; |
| 475 | |
| 476 | // No need to duplicate cold basic blocks |
| 477 | if (Pred->isCold() || Tail->isCold()) { |
| 478 | return BlocksToDuplicate; |
| 479 | } |
| 480 | // Always duplicate "small" tail basic blocks, which might be beneficial for |
| 481 | // code size, since a jump instruction is eliminated |
| 482 | if (Tail->estimateSize(Emitter) <= opts::TailDuplicationMinimumDuplication) { |
| 483 | BlocksToDuplicate.push_back(x: Tail); |
| 484 | return BlocksToDuplicate; |
| 485 | } |
| 486 | // Never duplicate "large" tail basic blocks |
| 487 | if (Tail->estimateSize(Emitter) > opts::TailDuplicationMaximumDuplication) { |
| 488 | return BlocksToDuplicate; |
| 489 | } |
| 490 | // Do not append basic blocks after the last hot block in the current layout |
| 491 | auto NextBlock = BF.getLayout().getBasicBlockAfter(BB: Pred); |
| 492 | if (NextBlock == nullptr || (!Pred->isCold() && NextBlock->isCold())) { |
| 493 | return BlocksToDuplicate; |
| 494 | } |
| 495 | |
| 496 | // Duplicate the tail only if it improves the cache score |
| 497 | if (cacheScoreImproved(Emitter, BF, Pred, Tail)) { |
| 498 | BlocksToDuplicate.push_back(x: Tail); |
| 499 | } |
| 500 | |
| 501 | return BlocksToDuplicate; |
| 502 | } |
| 503 | |
| 504 | std::vector<BinaryBasicBlock *> TailDuplication::duplicateBlocks( |
| 505 | BinaryBasicBlock &BB, |
| 506 | const std::vector<BinaryBasicBlock *> &BlocksToDuplicate) const { |
| 507 | BinaryFunction *BF = BB.getFunction(); |
| 508 | BinaryContext &BC = BF->getBinaryContext(); |
| 509 | |
| 510 | // Ratio of this new branches execution count to the total size of the |
| 511 | // successor's execution count. Used to set this new branches execution count |
| 512 | // and lower the old successor's execution count |
| 513 | double ExecutionCountRatio = |
| 514 | BB.getExecutionCount() >= BB.getSuccessor()->getExecutionCount() |
| 515 | ? 1.0 |
| 516 | : (double)BB.getExecutionCount() / |
| 517 | BB.getSuccessor()->getExecutionCount(); |
| 518 | |
| 519 | // Use the last branch info when adding a successor to LastBB |
| 520 | BinaryBasicBlock::BinaryBranchInfo &LastBI = |
| 521 | BB.getBranchInfo(Succ: *(BB.getSuccessor())); |
| 522 | |
| 523 | BinaryBasicBlock *LastOriginalBB = &BB; |
| 524 | BinaryBasicBlock *LastDuplicatedBB = &BB; |
| 525 | assert(LastDuplicatedBB->succ_size() == 1 && |
| 526 | "tail duplication cannot act on a block with more than 1 successor" ); |
| 527 | LastDuplicatedBB->removeSuccessor(Succ: LastDuplicatedBB->getSuccessor()); |
| 528 | |
| 529 | std::vector<std::unique_ptr<BinaryBasicBlock>> DuplicatedBlocks; |
| 530 | std::vector<BinaryBasicBlock *> DuplicatedBlocksToReturn; |
| 531 | |
| 532 | for (BinaryBasicBlock *CurBB : BlocksToDuplicate) { |
| 533 | DuplicatedBlocks.emplace_back( |
| 534 | args: BF->createBasicBlock(Label: (BC.Ctx)->createNamedTempSymbol(Name: "tail-dup" ))); |
| 535 | BinaryBasicBlock *NewBB = DuplicatedBlocks.back().get(); |
| 536 | |
| 537 | NewBB->addInstructions(Begin: CurBB->begin(), End: CurBB->end()); |
| 538 | // Set execution count as if it was just a copy of the original |
| 539 | NewBB->setExecutionCount(CurBB->getExecutionCount()); |
| 540 | NewBB->setIsCold(CurBB->isCold()); |
| 541 | LastDuplicatedBB->addSuccessor(Succ: NewBB, BI: LastBI); |
| 542 | |
| 543 | DuplicatedBlocksToReturn.push_back(x: NewBB); |
| 544 | |
| 545 | // As long as its not the first block, adjust both original and duplicated |
| 546 | // to what they should be |
| 547 | if (LastDuplicatedBB != &BB) { |
| 548 | LastOriginalBB->adjustExecutionCount(Ratio: 1.0 - ExecutionCountRatio); |
| 549 | LastDuplicatedBB->adjustExecutionCount(Ratio: ExecutionCountRatio); |
| 550 | } |
| 551 | |
| 552 | if (CurBB->succ_size() == 1) |
| 553 | LastBI = CurBB->getBranchInfo(Succ: *(CurBB->getSuccessor())); |
| 554 | |
| 555 | LastOriginalBB = CurBB; |
| 556 | LastDuplicatedBB = NewBB; |
| 557 | } |
| 558 | |
| 559 | LastDuplicatedBB->addSuccessors( |
| 560 | Begin: LastOriginalBB->succ_begin(), End: LastOriginalBB->succ_end(), |
| 561 | BrBegin: LastOriginalBB->branch_info_begin(), BrEnd: LastOriginalBB->branch_info_end()); |
| 562 | |
| 563 | LastOriginalBB->adjustExecutionCount(Ratio: 1.0 - ExecutionCountRatio); |
| 564 | LastDuplicatedBB->adjustExecutionCount(Ratio: ExecutionCountRatio); |
| 565 | |
| 566 | BF->insertBasicBlocks(Start: &BB, NewBBs: std::move(DuplicatedBlocks)); |
| 567 | |
| 568 | return DuplicatedBlocksToReturn; |
| 569 | } |
| 570 | |
| 571 | void TailDuplication::runOnFunction(BinaryFunction &Function) { |
| 572 | // Create a separate MCCodeEmitter to allow lock-free execution |
| 573 | BinaryContext::IndependentCodeEmitter Emitter; |
| 574 | if (!opts::NoThreads) { |
| 575 | Emitter = Function.getBinaryContext().createIndependentMCCodeEmitter(); |
| 576 | } |
| 577 | |
| 578 | Function.getLayout().updateLayoutIndices(); |
| 579 | |
| 580 | // New blocks will be added and layout will change, |
| 581 | // so make a copy here to iterate over the original layout |
| 582 | BinaryFunction::BasicBlockOrderType BlockLayout( |
| 583 | Function.getLayout().block_begin(), Function.getLayout().block_end()); |
| 584 | bool ModifiedFunction = false; |
| 585 | for (BinaryBasicBlock *BB : BlockLayout) { |
| 586 | AllDynamicCount += BB->getKnownExecutionCount(); |
| 587 | |
| 588 | // The block must be with one successor |
| 589 | if (BB->succ_size() != 1) |
| 590 | continue; |
| 591 | BinaryBasicBlock *Tail = BB->getSuccessor(); |
| 592 | // Verify that the tail should be duplicated |
| 593 | if (!shouldDuplicate(Pred: BB, Tail)) |
| 594 | continue; |
| 595 | |
| 596 | std::vector<BinaryBasicBlock *> BlocksToDuplicate; |
| 597 | if (opts::TailDuplicationMode == TailDuplication::TD_AGGRESSIVE) { |
| 598 | BlocksToDuplicate = aggressiveDuplicate(BB&: *BB, Tail&: *Tail); |
| 599 | } else if (opts::TailDuplicationMode == TailDuplication::TD_MODERATE) { |
| 600 | BlocksToDuplicate = moderateDuplicate(BB&: *BB, Tail&: *Tail); |
| 601 | } else if (opts::TailDuplicationMode == TailDuplication::TD_CACHE) { |
| 602 | BlocksToDuplicate = cacheDuplicate(Emitter: Emitter.MCE.get(), BF&: Function, Pred: BB, Tail); |
| 603 | } else { |
| 604 | llvm_unreachable("unknown tail duplication mode" ); |
| 605 | } |
| 606 | |
| 607 | if (BlocksToDuplicate.empty()) |
| 608 | continue; |
| 609 | |
| 610 | // Apply the duplication |
| 611 | ModifiedFunction = true; |
| 612 | DuplicationsDynamicCount += BB->getExecutionCount(); |
| 613 | auto DuplicatedBlocks = duplicateBlocks(BB&: *BB, BlocksToDuplicate); |
| 614 | for (BinaryBasicBlock *BB : DuplicatedBlocks) { |
| 615 | DuplicatedBlockCount++; |
| 616 | DuplicatedByteCount += BB->estimateSize(Emitter: Emitter.MCE.get()); |
| 617 | } |
| 618 | |
| 619 | if (opts::TailDuplicationConstCopyPropagation) { |
| 620 | constantAndCopyPropagate(OriginalBB&: *BB, BlocksToPropagate&: DuplicatedBlocks); |
| 621 | BinaryBasicBlock *FirstBB = BlocksToDuplicate[0]; |
| 622 | if (FirstBB->pred_size() == 1) { |
| 623 | BinaryBasicBlock *PredBB = *FirstBB->pred_begin(); |
| 624 | if (PredBB->succ_size() == 1) |
| 625 | constantAndCopyPropagate(OriginalBB&: *PredBB, BlocksToPropagate&: BlocksToDuplicate); |
| 626 | } |
| 627 | } |
| 628 | |
| 629 | // Layout indices might be stale after duplication |
| 630 | Function.getLayout().updateLayoutIndices(); |
| 631 | } |
| 632 | if (ModifiedFunction) |
| 633 | ModifiedFunctions++; |
| 634 | } |
| 635 | |
| 636 | Error TailDuplication::runOnFunctions(BinaryContext &BC) { |
| 637 | if (opts::TailDuplicationMode == TailDuplication::TD_NONE) |
| 638 | return Error::success(); |
| 639 | |
| 640 | for (auto &It : BC.getBinaryFunctions()) { |
| 641 | BinaryFunction &Function = It.second; |
| 642 | if (!shouldOptimize(BF: Function)) |
| 643 | continue; |
| 644 | runOnFunction(Function); |
| 645 | } |
| 646 | |
| 647 | BC.outs() |
| 648 | << "BOLT-INFO: tail duplication" |
| 649 | << format(Fmt: " modified %zu (%.2f%%) functions;" , Vals: ModifiedFunctions, |
| 650 | Vals: 100.0 * ModifiedFunctions / BC.getBinaryFunctions().size()) |
| 651 | << format(Fmt: " duplicated %zu blocks (%zu bytes) responsible for" , |
| 652 | Vals: DuplicatedBlockCount, Vals: DuplicatedByteCount) |
| 653 | << format(Fmt: " %zu dynamic executions (%.2f%% of all block executions)" , |
| 654 | Vals: DuplicationsDynamicCount, |
| 655 | Vals: 100.0 * DuplicationsDynamicCount / AllDynamicCount) |
| 656 | << "\n" ; |
| 657 | |
| 658 | if (opts::TailDuplicationConstCopyPropagation) { |
| 659 | BC.outs() << "BOLT-INFO: tail duplication " |
| 660 | << format( |
| 661 | Fmt: "applied %zu static and %zu dynamic propagation deletions" , |
| 662 | Vals: StaticInstructionDeletionCount, |
| 663 | Vals: DynamicInstructionDeletionCount) |
| 664 | << "\n" ; |
| 665 | } |
| 666 | return Error::success(); |
| 667 | } |
| 668 | |
| 669 | } // end namespace bolt |
| 670 | } // end namespace llvm |
| 671 | |