1 | ///===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" |
10 | #include "llvm/ADT/DenseMap.h" |
11 | #include "llvm/ADT/STLExtras.h" |
12 | #include "llvm/ADT/Sequence.h" |
13 | #include "llvm/ADT/SetVector.h" |
14 | #include "llvm/ADT/SmallPtrSet.h" |
15 | #include "llvm/ADT/SmallVector.h" |
16 | #include "llvm/ADT/Statistic.h" |
17 | #include "llvm/ADT/Twine.h" |
18 | #include "llvm/Analysis/AssumptionCache.h" |
19 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
20 | #include "llvm/Analysis/CFG.h" |
21 | #include "llvm/Analysis/CodeMetrics.h" |
22 | #include "llvm/Analysis/DomTreeUpdater.h" |
23 | #include "llvm/Analysis/GuardUtils.h" |
24 | #include "llvm/Analysis/LoopAnalysisManager.h" |
25 | #include "llvm/Analysis/LoopInfo.h" |
26 | #include "llvm/Analysis/LoopIterator.h" |
27 | #include "llvm/Analysis/MemorySSA.h" |
28 | #include "llvm/Analysis/MemorySSAUpdater.h" |
29 | #include "llvm/Analysis/MustExecute.h" |
30 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
31 | #include "llvm/Analysis/ScalarEvolution.h" |
32 | #include "llvm/Analysis/TargetTransformInfo.h" |
33 | #include "llvm/Analysis/ValueTracking.h" |
34 | #include "llvm/IR/BasicBlock.h" |
35 | #include "llvm/IR/Constant.h" |
36 | #include "llvm/IR/Constants.h" |
37 | #include "llvm/IR/Dominators.h" |
38 | #include "llvm/IR/Function.h" |
39 | #include "llvm/IR/IRBuilder.h" |
40 | #include "llvm/IR/InstrTypes.h" |
41 | #include "llvm/IR/Instruction.h" |
42 | #include "llvm/IR/Instructions.h" |
43 | #include "llvm/IR/IntrinsicInst.h" |
44 | #include "llvm/IR/PatternMatch.h" |
45 | #include "llvm/IR/ProfDataUtils.h" |
46 | #include "llvm/IR/Use.h" |
47 | #include "llvm/IR/Value.h" |
48 | #include "llvm/Support/Casting.h" |
49 | #include "llvm/Support/CommandLine.h" |
50 | #include "llvm/Support/Debug.h" |
51 | #include "llvm/Support/ErrorHandling.h" |
52 | #include "llvm/Support/GenericDomTree.h" |
53 | #include "llvm/Support/InstructionCost.h" |
54 | #include "llvm/Support/raw_ostream.h" |
55 | #include "llvm/Transforms/Scalar/LoopPassManager.h" |
56 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
57 | #include "llvm/Transforms/Utils/Cloning.h" |
58 | #include "llvm/Transforms/Utils/Local.h" |
59 | #include "llvm/Transforms/Utils/LoopUtils.h" |
60 | #include "llvm/Transforms/Utils/ValueMapper.h" |
61 | #include <algorithm> |
62 | #include <cassert> |
63 | #include <iterator> |
64 | #include <numeric> |
65 | #include <optional> |
66 | #include <utility> |
67 | |
68 | #define DEBUG_TYPE "simple-loop-unswitch" |
69 | |
70 | using namespace llvm; |
71 | using namespace llvm::PatternMatch; |
72 | |
73 | STATISTIC(NumBranches, "Number of branches unswitched" ); |
74 | STATISTIC(NumSwitches, "Number of switches unswitched" ); |
75 | STATISTIC(NumSelects, "Number of selects turned into branches for unswitching" ); |
76 | STATISTIC(NumGuards, "Number of guards turned into branches for unswitching" ); |
77 | STATISTIC(NumTrivial, "Number of unswitches that are trivial" ); |
78 | STATISTIC( |
79 | NumCostMultiplierSkipped, |
80 | "Number of unswitch candidates that had their cost multiplier skipped" ); |
81 | STATISTIC(NumInvariantConditionsInjected, |
82 | "Number of invariant conditions injected and unswitched" ); |
83 | |
84 | static cl::opt<bool> EnableNonTrivialUnswitch( |
85 | "enable-nontrivial-unswitch" , cl::init(Val: false), cl::Hidden, |
86 | cl::desc("Forcibly enables non-trivial loop unswitching rather than " |
87 | "following the configuration passed into the pass." )); |
88 | |
89 | static cl::opt<int> |
90 | UnswitchThreshold("unswitch-threshold" , cl::init(Val: 50), cl::Hidden, |
91 | cl::desc("The cost threshold for unswitching a loop." )); |
92 | |
93 | static cl::opt<bool> EnableUnswitchCostMultiplier( |
94 | "enable-unswitch-cost-multiplier" , cl::init(Val: true), cl::Hidden, |
95 | cl::desc("Enable unswitch cost multiplier that prohibits exponential " |
96 | "explosion in nontrivial unswitch." )); |
97 | static cl::opt<int> UnswitchSiblingsToplevelDiv( |
98 | "unswitch-siblings-toplevel-div" , cl::init(Val: 2), cl::Hidden, |
99 | cl::desc("Toplevel siblings divisor for cost multiplier." )); |
100 | static cl::opt<int> UnswitchNumInitialUnscaledCandidates( |
101 | "unswitch-num-initial-unscaled-candidates" , cl::init(Val: 8), cl::Hidden, |
102 | cl::desc("Number of unswitch candidates that are ignored when calculating " |
103 | "cost multiplier." )); |
104 | static cl::opt<bool> UnswitchGuards( |
105 | "simple-loop-unswitch-guards" , cl::init(Val: true), cl::Hidden, |
106 | cl::desc("If enabled, simple loop unswitching will also consider " |
107 | "llvm.experimental.guard intrinsics as unswitch candidates." )); |
108 | static cl::opt<bool> DropNonTrivialImplicitNullChecks( |
109 | "simple-loop-unswitch-drop-non-trivial-implicit-null-checks" , |
110 | cl::init(Val: false), cl::Hidden, |
111 | cl::desc("If enabled, drop make.implicit metadata in unswitched implicit " |
112 | "null checks to save time analyzing if we can keep it." )); |
113 | static cl::opt<unsigned> |
114 | MSSAThreshold("simple-loop-unswitch-memoryssa-threshold" , |
115 | cl::desc("Max number of memory uses to explore during " |
116 | "partial unswitching analysis" ), |
117 | cl::init(Val: 100), cl::Hidden); |
118 | static cl::opt<bool> FreezeLoopUnswitchCond( |
119 | "freeze-loop-unswitch-cond" , cl::init(Val: true), cl::Hidden, |
120 | cl::desc("If enabled, the freeze instruction will be added to condition " |
121 | "of loop unswitch to prevent miscompilation." )); |
122 | |
123 | static cl::opt<bool> InjectInvariantConditions( |
124 | "simple-loop-unswitch-inject-invariant-conditions" , cl::Hidden, |
125 | cl::desc("Whether we should inject new invariants and unswitch them to " |
126 | "eliminate some existing (non-invariant) conditions." ), |
127 | cl::init(Val: true)); |
128 | |
129 | static cl::opt<unsigned> InjectInvariantConditionHotnesThreshold( |
130 | "simple-loop-unswitch-inject-invariant-condition-hotness-threshold" , |
131 | cl::Hidden, cl::desc("Only try to inject loop invariant conditions and " |
132 | "unswitch on them to eliminate branches that are " |
133 | "not-taken 1/<this option> times or less." ), |
134 | cl::init(Val: 16)); |
135 | |
136 | AnalysisKey ShouldRunExtraSimpleLoopUnswitch::; |
137 | namespace { |
138 | struct CompareDesc { |
139 | BranchInst *Term; |
140 | Value *Invariant; |
141 | BasicBlock *InLoopSucc; |
142 | |
143 | CompareDesc(BranchInst *Term, Value *Invariant, BasicBlock *InLoopSucc) |
144 | : Term(Term), Invariant(Invariant), InLoopSucc(InLoopSucc) {} |
145 | }; |
146 | |
147 | struct InjectedInvariant { |
148 | ICmpInst::Predicate Pred; |
149 | Value *LHS; |
150 | Value *RHS; |
151 | BasicBlock *InLoopSucc; |
152 | |
153 | InjectedInvariant(ICmpInst::Predicate Pred, Value *LHS, Value *RHS, |
154 | BasicBlock *InLoopSucc) |
155 | : Pred(Pred), LHS(LHS), RHS(RHS), InLoopSucc(InLoopSucc) {} |
156 | }; |
157 | |
158 | struct NonTrivialUnswitchCandidate { |
159 | Instruction *TI = nullptr; |
160 | TinyPtrVector<Value *> Invariants; |
161 | std::optional<InstructionCost> Cost; |
162 | std::optional<InjectedInvariant> PendingInjection; |
163 | NonTrivialUnswitchCandidate( |
164 | Instruction *TI, ArrayRef<Value *> Invariants, |
165 | std::optional<InstructionCost> Cost = std::nullopt, |
166 | std::optional<InjectedInvariant> PendingInjection = std::nullopt) |
167 | : TI(TI), Invariants(Invariants), Cost(Cost), |
168 | PendingInjection(PendingInjection) {}; |
169 | |
170 | bool hasPendingInjection() const { return PendingInjection.has_value(); } |
171 | }; |
172 | } // end anonymous namespace. |
173 | |
174 | // Helper to skip (select x, true, false), which matches both a logical AND and |
175 | // OR and can confuse code that tries to determine if \p Cond is either a |
176 | // logical AND or OR but not both. |
177 | static Value *skipTrivialSelect(Value *Cond) { |
178 | Value *CondNext; |
179 | while (match(V: Cond, P: m_Select(C: m_Value(V&: CondNext), L: m_One(), R: m_Zero()))) |
180 | Cond = CondNext; |
181 | return Cond; |
182 | } |
183 | |
184 | /// Collect all of the loop invariant input values transitively used by the |
185 | /// homogeneous instruction graph from a given root. |
186 | /// |
187 | /// This essentially walks from a root recursively through loop variant operands |
188 | /// which have perform the same logical operation (AND or OR) and finds all |
189 | /// inputs which are loop invariant. For some operations these can be |
190 | /// re-associated and unswitched out of the loop entirely. |
191 | static TinyPtrVector<Value *> |
192 | collectHomogenousInstGraphLoopInvariants(const Loop &L, Instruction &Root, |
193 | const LoopInfo &LI) { |
194 | assert(!L.isLoopInvariant(&Root) && |
195 | "Only need to walk the graph if root itself is not invariant." ); |
196 | TinyPtrVector<Value *> Invariants; |
197 | |
198 | bool IsRootAnd = match(V: &Root, P: m_LogicalAnd()); |
199 | bool IsRootOr = match(V: &Root, P: m_LogicalOr()); |
200 | |
201 | // Build a worklist and recurse through operators collecting invariants. |
202 | SmallVector<Instruction *, 4> Worklist; |
203 | SmallPtrSet<Instruction *, 8> Visited; |
204 | Worklist.push_back(Elt: &Root); |
205 | Visited.insert(Ptr: &Root); |
206 | do { |
207 | Instruction &I = *Worklist.pop_back_val(); |
208 | for (Value *OpV : I.operand_values()) { |
209 | // Skip constants as unswitching isn't interesting for them. |
210 | if (isa<Constant>(Val: OpV)) |
211 | continue; |
212 | |
213 | // Add it to our result if loop invariant. |
214 | if (L.isLoopInvariant(V: OpV)) { |
215 | Invariants.push_back(NewVal: OpV); |
216 | continue; |
217 | } |
218 | |
219 | // If not an instruction with the same opcode, nothing we can do. |
220 | Instruction *OpI = dyn_cast<Instruction>(Val: skipTrivialSelect(Cond: OpV)); |
221 | |
222 | if (OpI && ((IsRootAnd && match(V: OpI, P: m_LogicalAnd())) || |
223 | (IsRootOr && match(V: OpI, P: m_LogicalOr())))) { |
224 | // Visit this operand. |
225 | if (Visited.insert(Ptr: OpI).second) |
226 | Worklist.push_back(Elt: OpI); |
227 | } |
228 | } |
229 | } while (!Worklist.empty()); |
230 | |
231 | return Invariants; |
232 | } |
233 | |
234 | static void replaceLoopInvariantUses(const Loop &L, Value *Invariant, |
235 | Constant &Replacement) { |
236 | assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?" ); |
237 | |
238 | // Replace uses of LIC in the loop with the given constant. |
239 | // We use make_early_inc_range as set invalidates the iterator. |
240 | for (Use &U : llvm::make_early_inc_range(Range: Invariant->uses())) { |
241 | Instruction *UserI = dyn_cast<Instruction>(Val: U.getUser()); |
242 | |
243 | // Replace this use within the loop body. |
244 | if (UserI && L.contains(Inst: UserI)) |
245 | U.set(&Replacement); |
246 | } |
247 | } |
248 | |
249 | /// Check that all the LCSSA PHI nodes in the loop exit block have trivial |
250 | /// incoming values along this edge. |
251 | static bool areLoopExitPHIsLoopInvariant(const Loop &L, |
252 | const BasicBlock &ExitingBB, |
253 | const BasicBlock &ExitBB) { |
254 | for (const Instruction &I : ExitBB) { |
255 | auto *PN = dyn_cast<PHINode>(Val: &I); |
256 | if (!PN) |
257 | // No more PHIs to check. |
258 | return true; |
259 | |
260 | // If the incoming value for this edge isn't loop invariant the unswitch |
261 | // won't be trivial. |
262 | if (!L.isLoopInvariant(V: PN->getIncomingValueForBlock(BB: &ExitingBB))) |
263 | return false; |
264 | } |
265 | llvm_unreachable("Basic blocks should never be empty!" ); |
266 | } |
267 | |
268 | /// Copy a set of loop invariant values \p ToDuplicate and insert them at the |
269 | /// end of \p BB and conditionally branch on the copied condition. We only |
270 | /// branch on a single value. |
271 | static void buildPartialUnswitchConditionalBranch( |
272 | BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction, |
273 | BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, |
274 | const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) { |
275 | IRBuilder<> IRB(&BB); |
276 | |
277 | SmallVector<Value *> FrozenInvariants; |
278 | for (Value *Inv : Invariants) { |
279 | if (InsertFreeze && !isGuaranteedNotToBeUndefOrPoison(V: Inv, AC, CtxI: I, DT: &DT)) |
280 | Inv = IRB.CreateFreeze(V: Inv, Name: Inv->getName() + ".fr" ); |
281 | FrozenInvariants.push_back(Elt: Inv); |
282 | } |
283 | |
284 | Value *Cond = Direction ? IRB.CreateOr(Ops: FrozenInvariants) |
285 | : IRB.CreateAnd(Ops: FrozenInvariants); |
286 | IRB.CreateCondBr(Cond, True: Direction ? &UnswitchedSucc : &NormalSucc, |
287 | False: Direction ? &NormalSucc : &UnswitchedSucc); |
288 | } |
289 | |
290 | /// Copy a set of loop invariant values, and conditionally branch on them. |
291 | static void buildPartialInvariantUnswitchConditionalBranch( |
292 | BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction, |
293 | BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L, |
294 | MemorySSAUpdater *MSSAU) { |
295 | ValueToValueMapTy VMap; |
296 | for (auto *Val : reverse(C&: ToDuplicate)) { |
297 | Instruction *Inst = cast<Instruction>(Val); |
298 | Instruction *NewInst = Inst->clone(); |
299 | NewInst->insertInto(ParentBB: &BB, It: BB.end()); |
300 | RemapInstruction(I: NewInst, VM&: VMap, |
301 | Flags: RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); |
302 | VMap[Val] = NewInst; |
303 | |
304 | if (!MSSAU) |
305 | continue; |
306 | |
307 | MemorySSA *MSSA = MSSAU->getMemorySSA(); |
308 | if (auto *MemUse = |
309 | dyn_cast_or_null<MemoryUse>(Val: MSSA->getMemoryAccess(I: Inst))) { |
310 | auto *DefiningAccess = MemUse->getDefiningAccess(); |
311 | // Get the first defining access before the loop. |
312 | while (L.contains(BB: DefiningAccess->getBlock())) { |
313 | // If the defining access is a MemoryPhi, get the incoming |
314 | // value for the pre-header as defining access. |
315 | if (auto *MemPhi = dyn_cast<MemoryPhi>(Val: DefiningAccess)) |
316 | DefiningAccess = |
317 | MemPhi->getIncomingValueForBlock(BB: L.getLoopPreheader()); |
318 | else |
319 | DefiningAccess = cast<MemoryDef>(Val: DefiningAccess)->getDefiningAccess(); |
320 | } |
321 | MSSAU->createMemoryAccessInBB(I: NewInst, Definition: DefiningAccess, |
322 | BB: NewInst->getParent(), |
323 | Point: MemorySSA::BeforeTerminator); |
324 | } |
325 | } |
326 | |
327 | IRBuilder<> IRB(&BB); |
328 | Value *Cond = VMap[ToDuplicate[0]]; |
329 | IRB.CreateCondBr(Cond, True: Direction ? &UnswitchedSucc : &NormalSucc, |
330 | False: Direction ? &NormalSucc : &UnswitchedSucc); |
331 | } |
332 | |
333 | /// Rewrite the PHI nodes in an unswitched loop exit basic block. |
334 | /// |
335 | /// Requires that the loop exit and unswitched basic block are the same, and |
336 | /// that the exiting block was a unique predecessor of that block. Rewrites the |
337 | /// PHI nodes in that block such that what were LCSSA PHI nodes become trivial |
338 | /// PHI nodes from the old preheader that now contains the unswitched |
339 | /// terminator. |
340 | static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB, |
341 | BasicBlock &OldExitingBB, |
342 | BasicBlock &OldPH) { |
343 | for (PHINode &PN : UnswitchedBB.phis()) { |
344 | // When the loop exit is directly unswitched we just need to update the |
345 | // incoming basic block. We loop to handle weird cases with repeated |
346 | // incoming blocks, but expect to typically only have one operand here. |
347 | for (auto i : seq<int>(Begin: 0, End: PN.getNumOperands())) { |
348 | assert(PN.getIncomingBlock(i) == &OldExitingBB && |
349 | "Found incoming block different from unique predecessor!" ); |
350 | PN.setIncomingBlock(i, BB: &OldPH); |
351 | } |
352 | } |
353 | } |
354 | |
355 | /// Rewrite the PHI nodes in the loop exit basic block and the split off |
356 | /// unswitched block. |
357 | /// |
358 | /// Because the exit block remains an exit from the loop, this rewrites the |
359 | /// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI |
360 | /// nodes into the unswitched basic block to select between the value in the |
361 | /// old preheader and the loop exit. |
362 | static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, |
363 | BasicBlock &UnswitchedBB, |
364 | BasicBlock &OldExitingBB, |
365 | BasicBlock &OldPH, |
366 | bool FullUnswitch) { |
367 | assert(&ExitBB != &UnswitchedBB && |
368 | "Must have different loop exit and unswitched blocks!" ); |
369 | BasicBlock::iterator InsertPt = UnswitchedBB.begin(); |
370 | for (PHINode &PN : ExitBB.phis()) { |
371 | auto *NewPN = PHINode::Create(Ty: PN.getType(), /*NumReservedValues*/ 2, |
372 | NameStr: PN.getName() + ".split" ); |
373 | NewPN->insertBefore(InsertPos: InsertPt); |
374 | |
375 | // Walk backwards over the old PHI node's inputs to minimize the cost of |
376 | // removing each one. We have to do this weird loop manually so that we |
377 | // create the same number of new incoming edges in the new PHI as we expect |
378 | // each case-based edge to be included in the unswitched switch in some |
379 | // cases. |
380 | // FIXME: This is really, really gross. It would be much cleaner if LLVM |
381 | // allowed us to create a single entry for a predecessor block without |
382 | // having separate entries for each "edge" even though these edges are |
383 | // required to produce identical results. |
384 | for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) { |
385 | if (PN.getIncomingBlock(i) != &OldExitingBB) |
386 | continue; |
387 | |
388 | Value *Incoming = PN.getIncomingValue(i); |
389 | if (FullUnswitch) |
390 | // No more edge from the old exiting block to the exit block. |
391 | PN.removeIncomingValue(Idx: i); |
392 | |
393 | NewPN->addIncoming(V: Incoming, BB: &OldPH); |
394 | } |
395 | |
396 | // Now replace the old PHI with the new one and wire the old one in as an |
397 | // input to the new one. |
398 | PN.replaceAllUsesWith(V: NewPN); |
399 | NewPN->addIncoming(V: &PN, BB: &ExitBB); |
400 | } |
401 | } |
402 | |
403 | /// Hoist the current loop up to the innermost loop containing a remaining exit. |
404 | /// |
405 | /// Because we've removed an exit from the loop, we may have changed the set of |
406 | /// loops reachable and need to move the current loop up the loop nest or even |
407 | /// to an entirely separate nest. |
408 | static void hoistLoopToNewParent(Loop &L, BasicBlock &, |
409 | DominatorTree &DT, LoopInfo &LI, |
410 | MemorySSAUpdater *MSSAU, ScalarEvolution *SE) { |
411 | // If the loop is already at the top level, we can't hoist it anywhere. |
412 | Loop *OldParentL = L.getParentLoop(); |
413 | if (!OldParentL) |
414 | return; |
415 | |
416 | SmallVector<BasicBlock *, 4> Exits; |
417 | L.getExitBlocks(ExitBlocks&: Exits); |
418 | Loop *NewParentL = nullptr; |
419 | for (auto *ExitBB : Exits) |
420 | if (Loop *ExitL = LI.getLoopFor(BB: ExitBB)) |
421 | if (!NewParentL || NewParentL->contains(L: ExitL)) |
422 | NewParentL = ExitL; |
423 | |
424 | if (NewParentL == OldParentL) |
425 | return; |
426 | |
427 | // The new parent loop (if different) should always contain the old one. |
428 | if (NewParentL) |
429 | assert(NewParentL->contains(OldParentL) && |
430 | "Can only hoist this loop up the nest!" ); |
431 | |
432 | // The preheader will need to move with the body of this loop. However, |
433 | // because it isn't in this loop we also need to update the primary loop map. |
434 | assert(OldParentL == LI.getLoopFor(&Preheader) && |
435 | "Parent loop of this loop should contain this loop's preheader!" ); |
436 | LI.changeLoopFor(BB: &Preheader, L: NewParentL); |
437 | |
438 | // Remove this loop from its old parent. |
439 | OldParentL->removeChildLoop(Child: &L); |
440 | |
441 | // Add the loop either to the new parent or as a top-level loop. |
442 | if (NewParentL) |
443 | NewParentL->addChildLoop(NewChild: &L); |
444 | else |
445 | LI.addTopLevelLoop(New: &L); |
446 | |
447 | // Remove this loops blocks from the old parent and every other loop up the |
448 | // nest until reaching the new parent. Also update all of these |
449 | // no-longer-containing loops to reflect the nesting change. |
450 | for (Loop *OldContainingL = OldParentL; OldContainingL != NewParentL; |
451 | OldContainingL = OldContainingL->getParentLoop()) { |
452 | llvm::erase_if(C&: OldContainingL->getBlocksVector(), |
453 | P: [&](const BasicBlock *BB) { |
454 | return BB == &Preheader || L.contains(BB); |
455 | }); |
456 | |
457 | OldContainingL->getBlocksSet().erase(Ptr: &Preheader); |
458 | for (BasicBlock *BB : L.blocks()) |
459 | OldContainingL->getBlocksSet().erase(Ptr: BB); |
460 | |
461 | // Because we just hoisted a loop out of this one, we have essentially |
462 | // created new exit paths from it. That means we need to form LCSSA PHI |
463 | // nodes for values used in the no-longer-nested loop. |
464 | formLCSSA(L&: *OldContainingL, DT, LI: &LI, SE); |
465 | |
466 | // We shouldn't need to form dedicated exits because the exit introduced |
467 | // here is the (just split by unswitching) preheader. However, after trivial |
468 | // unswitching it is possible to get new non-dedicated exits out of parent |
469 | // loop so let's conservatively form dedicated exit blocks and figure out |
470 | // if we can optimize later. |
471 | formDedicatedExitBlocks(L: OldContainingL, DT: &DT, LI: &LI, MSSAU, |
472 | /*PreserveLCSSA*/ true); |
473 | } |
474 | } |
475 | |
476 | // Return the top-most loop containing ExitBB and having ExitBB as exiting block |
477 | // or the loop containing ExitBB, if there is no parent loop containing ExitBB |
478 | // as exiting block. |
479 | static Loop *getTopMostExitingLoop(const BasicBlock *ExitBB, |
480 | const LoopInfo &LI) { |
481 | Loop *TopMost = LI.getLoopFor(BB: ExitBB); |
482 | Loop *Current = TopMost; |
483 | while (Current) { |
484 | if (Current->isLoopExiting(BB: ExitBB)) |
485 | TopMost = Current; |
486 | Current = Current->getParentLoop(); |
487 | } |
488 | return TopMost; |
489 | } |
490 | |
491 | /// Unswitch a trivial branch if the condition is loop invariant. |
492 | /// |
493 | /// This routine should only be called when loop code leading to the branch has |
494 | /// been validated as trivial (no side effects). This routine checks if the |
495 | /// condition is invariant and one of the successors is a loop exit. This |
496 | /// allows us to unswitch without duplicating the loop, making it trivial. |
497 | /// |
498 | /// If this routine fails to unswitch the branch it returns false. |
499 | /// |
500 | /// If the branch can be unswitched, this routine splits the preheader and |
501 | /// hoists the branch above that split. Preserves loop simplified form |
502 | /// (splitting the exit block as necessary). It simplifies the branch within |
503 | /// the loop to an unconditional branch but doesn't remove it entirely. Further |
504 | /// cleanup can be done with some simplifycfg like pass. |
505 | /// |
506 | /// If `SE` is not null, it will be updated based on the potential loop SCEVs |
507 | /// invalidated by this. |
508 | static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, |
509 | LoopInfo &LI, ScalarEvolution *SE, |
510 | MemorySSAUpdater *MSSAU) { |
511 | assert(BI.isConditional() && "Can only unswitch a conditional branch!" ); |
512 | LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n" ); |
513 | |
514 | // The loop invariant values that we want to unswitch. |
515 | TinyPtrVector<Value *> Invariants; |
516 | |
517 | // When true, we're fully unswitching the branch rather than just unswitching |
518 | // some input conditions to the branch. |
519 | bool FullUnswitch = false; |
520 | |
521 | Value *Cond = skipTrivialSelect(Cond: BI.getCondition()); |
522 | if (L.isLoopInvariant(V: Cond)) { |
523 | Invariants.push_back(NewVal: Cond); |
524 | FullUnswitch = true; |
525 | } else { |
526 | if (auto *CondInst = dyn_cast<Instruction>(Val: Cond)) |
527 | Invariants = collectHomogenousInstGraphLoopInvariants(L, Root&: *CondInst, LI); |
528 | if (Invariants.empty()) { |
529 | LLVM_DEBUG(dbgs() << " Couldn't find invariant inputs!\n" ); |
530 | return false; |
531 | } |
532 | } |
533 | |
534 | // Check that one of the branch's successors exits, and which one. |
535 | bool ExitDirection = true; |
536 | int LoopExitSuccIdx = 0; |
537 | auto *LoopExitBB = BI.getSuccessor(i: 0); |
538 | if (L.contains(BB: LoopExitBB)) { |
539 | ExitDirection = false; |
540 | LoopExitSuccIdx = 1; |
541 | LoopExitBB = BI.getSuccessor(i: 1); |
542 | if (L.contains(BB: LoopExitBB)) { |
543 | LLVM_DEBUG(dbgs() << " Branch doesn't exit the loop!\n" ); |
544 | return false; |
545 | } |
546 | } |
547 | auto *ContinueBB = BI.getSuccessor(i: 1 - LoopExitSuccIdx); |
548 | auto *ParentBB = BI.getParent(); |
549 | if (!areLoopExitPHIsLoopInvariant(L, ExitingBB: *ParentBB, ExitBB: *LoopExitBB)) { |
550 | LLVM_DEBUG(dbgs() << " Loop exit PHI's aren't loop-invariant!\n" ); |
551 | return false; |
552 | } |
553 | |
554 | // When unswitching only part of the branch's condition, we need the exit |
555 | // block to be reached directly from the partially unswitched input. This can |
556 | // be done when the exit block is along the true edge and the branch condition |
557 | // is a graph of `or` operations, or the exit block is along the false edge |
558 | // and the condition is a graph of `and` operations. |
559 | if (!FullUnswitch) { |
560 | if (ExitDirection ? !match(V: Cond, P: m_LogicalOr()) |
561 | : !match(V: Cond, P: m_LogicalAnd())) { |
562 | LLVM_DEBUG(dbgs() << " Branch condition is in improper form for " |
563 | "non-full unswitch!\n" ); |
564 | return false; |
565 | } |
566 | } |
567 | |
568 | LLVM_DEBUG({ |
569 | dbgs() << " unswitching trivial invariant conditions for: " << BI |
570 | << "\n" ; |
571 | for (Value *Invariant : Invariants) { |
572 | dbgs() << " " << *Invariant << " == true" ; |
573 | if (Invariant != Invariants.back()) |
574 | dbgs() << " ||" ; |
575 | dbgs() << "\n" ; |
576 | } |
577 | }); |
578 | |
579 | // If we have scalar evolutions, we need to invalidate them including this |
580 | // loop, the loop containing the exit block and the topmost parent loop |
581 | // exiting via LoopExitBB. |
582 | if (SE) { |
583 | if (const Loop *ExitL = getTopMostExitingLoop(ExitBB: LoopExitBB, LI)) |
584 | SE->forgetLoop(L: ExitL); |
585 | else |
586 | // Forget the entire nest as this exits the entire nest. |
587 | SE->forgetTopmostLoop(L: &L); |
588 | SE->forgetBlockAndLoopDispositions(); |
589 | } |
590 | |
591 | if (MSSAU && VerifyMemorySSA) |
592 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
593 | |
594 | // Split the preheader, so that we know that there is a safe place to insert |
595 | // the conditional branch. We will change the preheader to have a conditional |
596 | // branch on LoopCond. |
597 | BasicBlock *OldPH = L.getLoopPreheader(); |
598 | BasicBlock *NewPH = SplitEdge(From: OldPH, To: L.getHeader(), DT: &DT, LI: &LI, MSSAU); |
599 | |
600 | // Now that we have a place to insert the conditional branch, create a place |
601 | // to branch to: this is the exit block out of the loop that we are |
602 | // unswitching. We need to split this if there are other loop predecessors. |
603 | // Because the loop is in simplified form, *any* other predecessor is enough. |
604 | BasicBlock *UnswitchedBB; |
605 | if (FullUnswitch && LoopExitBB->getUniquePredecessor()) { |
606 | assert(LoopExitBB->getUniquePredecessor() == BI.getParent() && |
607 | "A branch's parent isn't a predecessor!" ); |
608 | UnswitchedBB = LoopExitBB; |
609 | } else { |
610 | UnswitchedBB = |
611 | SplitBlock(Old: LoopExitBB, SplitPt: LoopExitBB->begin(), DT: &DT, LI: &LI, MSSAU, BBName: "" , Before: false); |
612 | } |
613 | |
614 | if (MSSAU && VerifyMemorySSA) |
615 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
616 | |
617 | // Actually move the invariant uses into the unswitched position. If possible, |
618 | // we do this by moving the instructions, but when doing partial unswitching |
619 | // we do it by building a new merge of the values in the unswitched position. |
620 | OldPH->getTerminator()->eraseFromParent(); |
621 | if (FullUnswitch) { |
622 | // If fully unswitching, we can use the existing branch instruction. |
623 | // Splice it into the old PH to gate reaching the new preheader and re-point |
624 | // its successors. |
625 | BI.moveBefore(BB&: *OldPH, I: OldPH->end()); |
626 | BI.setCondition(Cond); |
627 | if (MSSAU) { |
628 | // Temporarily clone the terminator, to make MSSA update cheaper by |
629 | // separating "insert edge" updates from "remove edge" ones. |
630 | BI.clone()->insertInto(ParentBB, It: ParentBB->end()); |
631 | } else { |
632 | // Create a new unconditional branch that will continue the loop as a new |
633 | // terminator. |
634 | BranchInst::Create(IfTrue: ContinueBB, InsertAtEnd: ParentBB); |
635 | } |
636 | BI.setSuccessor(idx: LoopExitSuccIdx, NewSucc: UnswitchedBB); |
637 | BI.setSuccessor(idx: 1 - LoopExitSuccIdx, NewSucc: NewPH); |
638 | } else { |
639 | // Only unswitching a subset of inputs to the condition, so we will need to |
640 | // build a new branch that merges the invariant inputs. |
641 | if (ExitDirection) |
642 | assert(match(skipTrivialSelect(BI.getCondition()), m_LogicalOr()) && |
643 | "Must have an `or` of `i1`s or `select i1 X, true, Y`s for the " |
644 | "condition!" ); |
645 | else |
646 | assert(match(skipTrivialSelect(BI.getCondition()), m_LogicalAnd()) && |
647 | "Must have an `and` of `i1`s or `select i1 X, Y, false`s for the" |
648 | " condition!" ); |
649 | buildPartialUnswitchConditionalBranch( |
650 | BB&: *OldPH, Invariants, Direction: ExitDirection, UnswitchedSucc&: *UnswitchedBB, NormalSucc&: *NewPH, |
651 | InsertFreeze: FreezeLoopUnswitchCond, I: OldPH->getTerminator(), AC: nullptr, DT); |
652 | } |
653 | |
654 | // Update the dominator tree with the added edge. |
655 | DT.insertEdge(From: OldPH, To: UnswitchedBB); |
656 | |
657 | // After the dominator tree was updated with the added edge, update MemorySSA |
658 | // if available. |
659 | if (MSSAU) { |
660 | SmallVector<CFGUpdate, 1> Updates; |
661 | Updates.push_back(Elt: {cfg::UpdateKind::Insert, OldPH, UnswitchedBB}); |
662 | MSSAU->applyInsertUpdates(Updates, DT); |
663 | } |
664 | |
665 | // Finish updating dominator tree and memory ssa for full unswitch. |
666 | if (FullUnswitch) { |
667 | if (MSSAU) { |
668 | // Remove the cloned branch instruction. |
669 | ParentBB->getTerminator()->eraseFromParent(); |
670 | // Create unconditional branch now. |
671 | BranchInst::Create(IfTrue: ContinueBB, InsertAtEnd: ParentBB); |
672 | MSSAU->removeEdge(From: ParentBB, To: LoopExitBB); |
673 | } |
674 | DT.deleteEdge(From: ParentBB, To: LoopExitBB); |
675 | } |
676 | |
677 | if (MSSAU && VerifyMemorySSA) |
678 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
679 | |
680 | // Rewrite the relevant PHI nodes. |
681 | if (UnswitchedBB == LoopExitBB) |
682 | rewritePHINodesForUnswitchedExitBlock(UnswitchedBB&: *UnswitchedBB, OldExitingBB&: *ParentBB, OldPH&: *OldPH); |
683 | else |
684 | rewritePHINodesForExitAndUnswitchedBlocks(ExitBB&: *LoopExitBB, UnswitchedBB&: *UnswitchedBB, |
685 | OldExitingBB&: *ParentBB, OldPH&: *OldPH, FullUnswitch); |
686 | |
687 | // The constant we can replace all of our invariants with inside the loop |
688 | // body. If any of the invariants have a value other than this the loop won't |
689 | // be entered. |
690 | ConstantInt *Replacement = ExitDirection |
691 | ? ConstantInt::getFalse(Context&: BI.getContext()) |
692 | : ConstantInt::getTrue(Context&: BI.getContext()); |
693 | |
694 | // Since this is an i1 condition we can also trivially replace uses of it |
695 | // within the loop with a constant. |
696 | for (Value *Invariant : Invariants) |
697 | replaceLoopInvariantUses(L, Invariant, Replacement&: *Replacement); |
698 | |
699 | // If this was full unswitching, we may have changed the nesting relationship |
700 | // for this loop so hoist it to its correct parent if needed. |
701 | if (FullUnswitch) |
702 | hoistLoopToNewParent(L, Preheader&: *NewPH, DT, LI, MSSAU, SE); |
703 | |
704 | if (MSSAU && VerifyMemorySSA) |
705 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
706 | |
707 | LLVM_DEBUG(dbgs() << " done: unswitching trivial branch...\n" ); |
708 | ++NumTrivial; |
709 | ++NumBranches; |
710 | return true; |
711 | } |
712 | |
713 | /// Unswitch a trivial switch if the condition is loop invariant. |
714 | /// |
715 | /// This routine should only be called when loop code leading to the switch has |
716 | /// been validated as trivial (no side effects). This routine checks if the |
717 | /// condition is invariant and that at least one of the successors is a loop |
718 | /// exit. This allows us to unswitch without duplicating the loop, making it |
719 | /// trivial. |
720 | /// |
721 | /// If this routine fails to unswitch the switch it returns false. |
722 | /// |
723 | /// If the switch can be unswitched, this routine splits the preheader and |
724 | /// copies the switch above that split. If the default case is one of the |
725 | /// exiting cases, it copies the non-exiting cases and points them at the new |
726 | /// preheader. If the default case is not exiting, it copies the exiting cases |
727 | /// and points the default at the preheader. It preserves loop simplified form |
728 | /// (splitting the exit blocks as necessary). It simplifies the switch within |
729 | /// the loop by removing now-dead cases. If the default case is one of those |
730 | /// unswitched, it replaces its destination with a new basic block containing |
731 | /// only unreachable. Such basic blocks, while technically loop exits, are not |
732 | /// considered for unswitching so this is a stable transform and the same |
733 | /// switch will not be revisited. If after unswitching there is only a single |
734 | /// in-loop successor, the switch is further simplified to an unconditional |
735 | /// branch. Still more cleanup can be done with some simplifycfg like pass. |
736 | /// |
737 | /// If `SE` is not null, it will be updated based on the potential loop SCEVs |
738 | /// invalidated by this. |
739 | static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, |
740 | LoopInfo &LI, ScalarEvolution *SE, |
741 | MemorySSAUpdater *MSSAU) { |
742 | LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n" ); |
743 | Value *LoopCond = SI.getCondition(); |
744 | |
745 | // If this isn't switching on an invariant condition, we can't unswitch it. |
746 | if (!L.isLoopInvariant(V: LoopCond)) |
747 | return false; |
748 | |
749 | auto *ParentBB = SI.getParent(); |
750 | |
751 | // The same check must be used both for the default and the exit cases. We |
752 | // should never leave edges from the switch instruction to a basic block that |
753 | // we are unswitching, hence the condition used to determine the default case |
754 | // needs to also be used to populate ExitCaseIndices, which is then used to |
755 | // remove cases from the switch. |
756 | auto IsTriviallyUnswitchableExitBlock = [&](BasicBlock &BBToCheck) { |
757 | // BBToCheck is not an exit block if it is inside loop L. |
758 | if (L.contains(BB: &BBToCheck)) |
759 | return false; |
760 | // BBToCheck is not trivial to unswitch if its phis aren't loop invariant. |
761 | if (!areLoopExitPHIsLoopInvariant(L, ExitingBB: *ParentBB, ExitBB: BBToCheck)) |
762 | return false; |
763 | // We do not unswitch a block that only has an unreachable statement, as |
764 | // it's possible this is a previously unswitched block. Only unswitch if |
765 | // either the terminator is not unreachable, or, if it is, it's not the only |
766 | // instruction in the block. |
767 | auto *TI = BBToCheck.getTerminator(); |
768 | bool isUnreachable = isa<UnreachableInst>(Val: TI); |
769 | return !isUnreachable || |
770 | (isUnreachable && (BBToCheck.getFirstNonPHIOrDbg() != TI)); |
771 | }; |
772 | |
773 | SmallVector<int, 4> ExitCaseIndices; |
774 | for (auto Case : SI.cases()) |
775 | if (IsTriviallyUnswitchableExitBlock(*Case.getCaseSuccessor())) |
776 | ExitCaseIndices.push_back(Elt: Case.getCaseIndex()); |
777 | BasicBlock *DefaultExitBB = nullptr; |
778 | SwitchInstProfUpdateWrapper::CaseWeightOpt DefaultCaseWeight = |
779 | SwitchInstProfUpdateWrapper::getSuccessorWeight(SI, idx: 0); |
780 | if (IsTriviallyUnswitchableExitBlock(*SI.getDefaultDest())) { |
781 | DefaultExitBB = SI.getDefaultDest(); |
782 | } else if (ExitCaseIndices.empty()) |
783 | return false; |
784 | |
785 | LLVM_DEBUG(dbgs() << " unswitching trivial switch...\n" ); |
786 | |
787 | if (MSSAU && VerifyMemorySSA) |
788 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
789 | |
790 | // We may need to invalidate SCEVs for the outermost loop reached by any of |
791 | // the exits. |
792 | Loop *OuterL = &L; |
793 | |
794 | if (DefaultExitBB) { |
795 | // Check the loop containing this exit. |
796 | Loop *ExitL = getTopMostExitingLoop(ExitBB: DefaultExitBB, LI); |
797 | if (!ExitL || ExitL->contains(L: OuterL)) |
798 | OuterL = ExitL; |
799 | } |
800 | for (unsigned Index : ExitCaseIndices) { |
801 | auto CaseI = SI.case_begin() + Index; |
802 | // Compute the outer loop from this exit. |
803 | Loop *ExitL = getTopMostExitingLoop(ExitBB: CaseI->getCaseSuccessor(), LI); |
804 | if (!ExitL || ExitL->contains(L: OuterL)) |
805 | OuterL = ExitL; |
806 | } |
807 | |
808 | if (SE) { |
809 | if (OuterL) |
810 | SE->forgetLoop(L: OuterL); |
811 | else |
812 | SE->forgetTopmostLoop(L: &L); |
813 | } |
814 | |
815 | if (DefaultExitBB) { |
816 | // Clear out the default destination temporarily to allow accurate |
817 | // predecessor lists to be examined below. |
818 | SI.setDefaultDest(nullptr); |
819 | } |
820 | |
821 | // Store the exit cases into a separate data structure and remove them from |
822 | // the switch. |
823 | SmallVector<std::tuple<ConstantInt *, BasicBlock *, |
824 | SwitchInstProfUpdateWrapper::CaseWeightOpt>, |
825 | 4> ExitCases; |
826 | ExitCases.reserve(N: ExitCaseIndices.size()); |
827 | SwitchInstProfUpdateWrapper SIW(SI); |
828 | // We walk the case indices backwards so that we remove the last case first |
829 | // and don't disrupt the earlier indices. |
830 | for (unsigned Index : reverse(C&: ExitCaseIndices)) { |
831 | auto CaseI = SI.case_begin() + Index; |
832 | // Save the value of this case. |
833 | auto W = SIW.getSuccessorWeight(idx: CaseI->getSuccessorIndex()); |
834 | ExitCases.emplace_back(Args: CaseI->getCaseValue(), Args: CaseI->getCaseSuccessor(), Args&: W); |
835 | // Delete the unswitched cases. |
836 | SIW.removeCase(I: CaseI); |
837 | } |
838 | |
839 | // Check if after this all of the remaining cases point at the same |
840 | // successor. |
841 | BasicBlock *CommonSuccBB = nullptr; |
842 | if (SI.getNumCases() > 0 && |
843 | all_of(Range: drop_begin(RangeOrContainer: SI.cases()), P: [&SI](const SwitchInst::CaseHandle &Case) { |
844 | return Case.getCaseSuccessor() == SI.case_begin()->getCaseSuccessor(); |
845 | })) |
846 | CommonSuccBB = SI.case_begin()->getCaseSuccessor(); |
847 | if (!DefaultExitBB) { |
848 | // If we're not unswitching the default, we need it to match any cases to |
849 | // have a common successor or if we have no cases it is the common |
850 | // successor. |
851 | if (SI.getNumCases() == 0) |
852 | CommonSuccBB = SI.getDefaultDest(); |
853 | else if (SI.getDefaultDest() != CommonSuccBB) |
854 | CommonSuccBB = nullptr; |
855 | } |
856 | |
857 | // Split the preheader, so that we know that there is a safe place to insert |
858 | // the switch. |
859 | BasicBlock *OldPH = L.getLoopPreheader(); |
860 | BasicBlock *NewPH = SplitEdge(From: OldPH, To: L.getHeader(), DT: &DT, LI: &LI, MSSAU); |
861 | OldPH->getTerminator()->eraseFromParent(); |
862 | |
863 | // Now add the unswitched switch. |
864 | auto *NewSI = SwitchInst::Create(Value: LoopCond, Default: NewPH, NumCases: ExitCases.size(), InsertAtEnd: OldPH); |
865 | SwitchInstProfUpdateWrapper NewSIW(*NewSI); |
866 | |
867 | // Rewrite the IR for the unswitched basic blocks. This requires two steps. |
868 | // First, we split any exit blocks with remaining in-loop predecessors. Then |
869 | // we update the PHIs in one of two ways depending on if there was a split. |
870 | // We walk in reverse so that we split in the same order as the cases |
871 | // appeared. This is purely for convenience of reading the resulting IR, but |
872 | // it doesn't cost anything really. |
873 | SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs; |
874 | SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap; |
875 | // Handle the default exit if necessary. |
876 | // FIXME: It'd be great if we could merge this with the loop below but LLVM's |
877 | // ranges aren't quite powerful enough yet. |
878 | if (DefaultExitBB) { |
879 | if (pred_empty(BB: DefaultExitBB)) { |
880 | UnswitchedExitBBs.insert(Ptr: DefaultExitBB); |
881 | rewritePHINodesForUnswitchedExitBlock(UnswitchedBB&: *DefaultExitBB, OldExitingBB&: *ParentBB, OldPH&: *OldPH); |
882 | } else { |
883 | auto *SplitBB = |
884 | SplitBlock(Old: DefaultExitBB, SplitPt: DefaultExitBB->begin(), DT: &DT, LI: &LI, MSSAU); |
885 | rewritePHINodesForExitAndUnswitchedBlocks(ExitBB&: *DefaultExitBB, UnswitchedBB&: *SplitBB, |
886 | OldExitingBB&: *ParentBB, OldPH&: *OldPH, |
887 | /*FullUnswitch*/ true); |
888 | DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; |
889 | } |
890 | } |
891 | // Note that we must use a reference in the for loop so that we update the |
892 | // container. |
893 | for (auto &ExitCase : reverse(C&: ExitCases)) { |
894 | // Grab a reference to the exit block in the pair so that we can update it. |
895 | BasicBlock *ExitBB = std::get<1>(t&: ExitCase); |
896 | |
897 | // If this case is the last edge into the exit block, we can simply reuse it |
898 | // as it will no longer be a loop exit. No mapping necessary. |
899 | if (pred_empty(BB: ExitBB)) { |
900 | // Only rewrite once. |
901 | if (UnswitchedExitBBs.insert(Ptr: ExitBB).second) |
902 | rewritePHINodesForUnswitchedExitBlock(UnswitchedBB&: *ExitBB, OldExitingBB&: *ParentBB, OldPH&: *OldPH); |
903 | continue; |
904 | } |
905 | |
906 | // Otherwise we need to split the exit block so that we retain an exit |
907 | // block from the loop and a target for the unswitched condition. |
908 | BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB]; |
909 | if (!SplitExitBB) { |
910 | // If this is the first time we see this, do the split and remember it. |
911 | SplitExitBB = SplitBlock(Old: ExitBB, SplitPt: ExitBB->begin(), DT: &DT, LI: &LI, MSSAU); |
912 | rewritePHINodesForExitAndUnswitchedBlocks(ExitBB&: *ExitBB, UnswitchedBB&: *SplitExitBB, |
913 | OldExitingBB&: *ParentBB, OldPH&: *OldPH, |
914 | /*FullUnswitch*/ true); |
915 | } |
916 | // Update the case pair to point to the split block. |
917 | std::get<1>(t&: ExitCase) = SplitExitBB; |
918 | } |
919 | |
920 | // Now add the unswitched cases. We do this in reverse order as we built them |
921 | // in reverse order. |
922 | for (auto &ExitCase : reverse(C&: ExitCases)) { |
923 | ConstantInt *CaseVal = std::get<0>(t&: ExitCase); |
924 | BasicBlock *UnswitchedBB = std::get<1>(t&: ExitCase); |
925 | |
926 | NewSIW.addCase(OnVal: CaseVal, Dest: UnswitchedBB, W: std::get<2>(t&: ExitCase)); |
927 | } |
928 | |
929 | // If the default was unswitched, re-point it and add explicit cases for |
930 | // entering the loop. |
931 | if (DefaultExitBB) { |
932 | NewSIW->setDefaultDest(DefaultExitBB); |
933 | NewSIW.setSuccessorWeight(idx: 0, W: DefaultCaseWeight); |
934 | |
935 | // We removed all the exit cases, so we just copy the cases to the |
936 | // unswitched switch. |
937 | for (const auto &Case : SI.cases()) |
938 | NewSIW.addCase(OnVal: Case.getCaseValue(), Dest: NewPH, |
939 | W: SIW.getSuccessorWeight(idx: Case.getSuccessorIndex())); |
940 | } else if (DefaultCaseWeight) { |
941 | // We have to set branch weight of the default case. |
942 | uint64_t SW = *DefaultCaseWeight; |
943 | for (const auto &Case : SI.cases()) { |
944 | auto W = SIW.getSuccessorWeight(idx: Case.getSuccessorIndex()); |
945 | assert(W && |
946 | "case weight must be defined as default case weight is defined" ); |
947 | SW += *W; |
948 | } |
949 | NewSIW.setSuccessorWeight(idx: 0, W: SW); |
950 | } |
951 | |
952 | // If we ended up with a common successor for every path through the switch |
953 | // after unswitching, rewrite it to an unconditional branch to make it easy |
954 | // to recognize. Otherwise we potentially have to recognize the default case |
955 | // pointing at unreachable and other complexity. |
956 | if (CommonSuccBB) { |
957 | BasicBlock *BB = SI.getParent(); |
958 | // We may have had multiple edges to this common successor block, so remove |
959 | // them as predecessors. We skip the first one, either the default or the |
960 | // actual first case. |
961 | bool SkippedFirst = DefaultExitBB == nullptr; |
962 | for (auto Case : SI.cases()) { |
963 | assert(Case.getCaseSuccessor() == CommonSuccBB && |
964 | "Non-common successor!" ); |
965 | (void)Case; |
966 | if (!SkippedFirst) { |
967 | SkippedFirst = true; |
968 | continue; |
969 | } |
970 | CommonSuccBB->removePredecessor(Pred: BB, |
971 | /*KeepOneInputPHIs*/ true); |
972 | } |
973 | // Now nuke the switch and replace it with a direct branch. |
974 | SIW.eraseFromParent(); |
975 | BranchInst::Create(IfTrue: CommonSuccBB, InsertAtEnd: BB); |
976 | } else if (DefaultExitBB) { |
977 | assert(SI.getNumCases() > 0 && |
978 | "If we had no cases we'd have a common successor!" ); |
979 | // Move the last case to the default successor. This is valid as if the |
980 | // default got unswitched it cannot be reached. This has the advantage of |
981 | // being simple and keeping the number of edges from this switch to |
982 | // successors the same, and avoiding any PHI update complexity. |
983 | auto LastCaseI = std::prev(x: SI.case_end()); |
984 | |
985 | SI.setDefaultDest(LastCaseI->getCaseSuccessor()); |
986 | SIW.setSuccessorWeight( |
987 | idx: 0, W: SIW.getSuccessorWeight(idx: LastCaseI->getSuccessorIndex())); |
988 | SIW.removeCase(I: LastCaseI); |
989 | } |
990 | |
991 | // Walk the unswitched exit blocks and the unswitched split blocks and update |
992 | // the dominator tree based on the CFG edits. While we are walking unordered |
993 | // containers here, the API for applyUpdates takes an unordered list of |
994 | // updates and requires them to not contain duplicates. |
995 | SmallVector<DominatorTree::UpdateType, 4> DTUpdates; |
996 | for (auto *UnswitchedExitBB : UnswitchedExitBBs) { |
997 | DTUpdates.push_back(Elt: {DT.Delete, ParentBB, UnswitchedExitBB}); |
998 | DTUpdates.push_back(Elt: {DT.Insert, OldPH, UnswitchedExitBB}); |
999 | } |
1000 | for (auto SplitUnswitchedPair : SplitExitBBMap) { |
1001 | DTUpdates.push_back(Elt: {DT.Delete, ParentBB, SplitUnswitchedPair.first}); |
1002 | DTUpdates.push_back(Elt: {DT.Insert, OldPH, SplitUnswitchedPair.second}); |
1003 | } |
1004 | |
1005 | if (MSSAU) { |
1006 | MSSAU->applyUpdates(Updates: DTUpdates, DT, /*UpdateDT=*/UpdateDTFirst: true); |
1007 | if (VerifyMemorySSA) |
1008 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
1009 | } else { |
1010 | DT.applyUpdates(Updates: DTUpdates); |
1011 | } |
1012 | |
1013 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
1014 | |
1015 | // We may have changed the nesting relationship for this loop so hoist it to |
1016 | // its correct parent if needed. |
1017 | hoistLoopToNewParent(L, Preheader&: *NewPH, DT, LI, MSSAU, SE); |
1018 | |
1019 | if (MSSAU && VerifyMemorySSA) |
1020 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
1021 | |
1022 | ++NumTrivial; |
1023 | ++NumSwitches; |
1024 | LLVM_DEBUG(dbgs() << " done: unswitching trivial switch...\n" ); |
1025 | return true; |
1026 | } |
1027 | |
1028 | /// This routine scans the loop to find a branch or switch which occurs before |
1029 | /// any side effects occur. These can potentially be unswitched without |
1030 | /// duplicating the loop. If a branch or switch is successfully unswitched the |
1031 | /// scanning continues to see if subsequent branches or switches have become |
1032 | /// trivial. Once all trivial candidates have been unswitched, this routine |
1033 | /// returns. |
1034 | /// |
1035 | /// The return value indicates whether anything was unswitched (and therefore |
1036 | /// changed). |
1037 | /// |
1038 | /// If `SE` is not null, it will be updated based on the potential loop SCEVs |
1039 | /// invalidated by this. |
1040 | static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, |
1041 | LoopInfo &LI, ScalarEvolution *SE, |
1042 | MemorySSAUpdater *MSSAU) { |
1043 | bool Changed = false; |
1044 | |
1045 | // If loop header has only one reachable successor we should keep looking for |
1046 | // trivial condition candidates in the successor as well. An alternative is |
1047 | // to constant fold conditions and merge successors into loop header (then we |
1048 | // only need to check header's terminator). The reason for not doing this in |
1049 | // LoopUnswitch pass is that it could potentially break LoopPassManager's |
1050 | // invariants. Folding dead branches could either eliminate the current loop |
1051 | // or make other loops unreachable. LCSSA form might also not be preserved |
1052 | // after deleting branches. The following code keeps traversing loop header's |
1053 | // successors until it finds the trivial condition candidate (condition that |
1054 | // is not a constant). Since unswitching generates branches with constant |
1055 | // conditions, this scenario could be very common in practice. |
1056 | BasicBlock *CurrentBB = L.getHeader(); |
1057 | SmallPtrSet<BasicBlock *, 8> Visited; |
1058 | Visited.insert(Ptr: CurrentBB); |
1059 | do { |
1060 | // Check if there are any side-effecting instructions (e.g. stores, calls, |
1061 | // volatile loads) in the part of the loop that the code *would* execute |
1062 | // without unswitching. |
1063 | if (MSSAU) // Possible early exit with MSSA |
1064 | if (auto *Defs = MSSAU->getMemorySSA()->getBlockDefs(BB: CurrentBB)) |
1065 | if (!isa<MemoryPhi>(Val: *Defs->begin()) || (++Defs->begin() != Defs->end())) |
1066 | return Changed; |
1067 | if (llvm::any_of(Range&: *CurrentBB, |
1068 | P: [](Instruction &I) { return I.mayHaveSideEffects(); })) |
1069 | return Changed; |
1070 | |
1071 | Instruction *CurrentTerm = CurrentBB->getTerminator(); |
1072 | |
1073 | if (auto *SI = dyn_cast<SwitchInst>(Val: CurrentTerm)) { |
1074 | // Don't bother trying to unswitch past a switch with a constant |
1075 | // condition. This should be removed prior to running this pass by |
1076 | // simplifycfg. |
1077 | if (isa<Constant>(Val: SI->getCondition())) |
1078 | return Changed; |
1079 | |
1080 | if (!unswitchTrivialSwitch(L, SI&: *SI, DT, LI, SE, MSSAU)) |
1081 | // Couldn't unswitch this one so we're done. |
1082 | return Changed; |
1083 | |
1084 | // Mark that we managed to unswitch something. |
1085 | Changed = true; |
1086 | |
1087 | // If unswitching turned the terminator into an unconditional branch then |
1088 | // we can continue. The unswitching logic specifically works to fold any |
1089 | // cases it can into an unconditional branch to make it easier to |
1090 | // recognize here. |
1091 | auto *BI = dyn_cast<BranchInst>(Val: CurrentBB->getTerminator()); |
1092 | if (!BI || BI->isConditional()) |
1093 | return Changed; |
1094 | |
1095 | CurrentBB = BI->getSuccessor(i: 0); |
1096 | continue; |
1097 | } |
1098 | |
1099 | auto *BI = dyn_cast<BranchInst>(Val: CurrentTerm); |
1100 | if (!BI) |
1101 | // We do not understand other terminator instructions. |
1102 | return Changed; |
1103 | |
1104 | // Don't bother trying to unswitch past an unconditional branch or a branch |
1105 | // with a constant value. These should be removed by simplifycfg prior to |
1106 | // running this pass. |
1107 | if (!BI->isConditional() || |
1108 | isa<Constant>(Val: skipTrivialSelect(Cond: BI->getCondition()))) |
1109 | return Changed; |
1110 | |
1111 | // Found a trivial condition candidate: non-foldable conditional branch. If |
1112 | // we fail to unswitch this, we can't do anything else that is trivial. |
1113 | if (!unswitchTrivialBranch(L, BI&: *BI, DT, LI, SE, MSSAU)) |
1114 | return Changed; |
1115 | |
1116 | // Mark that we managed to unswitch something. |
1117 | Changed = true; |
1118 | |
1119 | // If we only unswitched some of the conditions feeding the branch, we won't |
1120 | // have collapsed it to a single successor. |
1121 | BI = cast<BranchInst>(Val: CurrentBB->getTerminator()); |
1122 | if (BI->isConditional()) |
1123 | return Changed; |
1124 | |
1125 | // Follow the newly unconditional branch into its successor. |
1126 | CurrentBB = BI->getSuccessor(i: 0); |
1127 | |
1128 | // When continuing, if we exit the loop or reach a previous visited block, |
1129 | // then we can not reach any trivial condition candidates (unfoldable |
1130 | // branch instructions or switch instructions) and no unswitch can happen. |
1131 | } while (L.contains(BB: CurrentBB) && Visited.insert(Ptr: CurrentBB).second); |
1132 | |
1133 | return Changed; |
1134 | } |
1135 | |
1136 | /// Build the cloned blocks for an unswitched copy of the given loop. |
1137 | /// |
1138 | /// The cloned blocks are inserted before the loop preheader (`LoopPH`) and |
1139 | /// after the split block (`SplitBB`) that will be used to select between the |
1140 | /// cloned and original loop. |
1141 | /// |
1142 | /// This routine handles cloning all of the necessary loop blocks and exit |
1143 | /// blocks including rewriting their instructions and the relevant PHI nodes. |
1144 | /// Any loop blocks or exit blocks which are dominated by a different successor |
1145 | /// than the one for this clone of the loop blocks can be trivially skipped. We |
1146 | /// use the `DominatingSucc` map to determine whether a block satisfies that |
1147 | /// property with a simple map lookup. |
1148 | /// |
1149 | /// It also correctly creates the unconditional branch in the cloned |
1150 | /// unswitched parent block to only point at the unswitched successor. |
1151 | /// |
1152 | /// This does not handle most of the necessary updates to `LoopInfo`. Only exit |
1153 | /// block splitting is correctly reflected in `LoopInfo`, essentially all of |
1154 | /// the cloned blocks (and their loops) are left without full `LoopInfo` |
1155 | /// updates. This also doesn't fully update `DominatorTree`. It adds the cloned |
1156 | /// blocks to them but doesn't create the cloned `DominatorTree` structure and |
1157 | /// instead the caller must recompute an accurate DT. It *does* correctly |
1158 | /// update the `AssumptionCache` provided in `AC`. |
1159 | static BasicBlock *buildClonedLoopBlocks( |
1160 | Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB, |
1161 | ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB, |
1162 | BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB, |
1163 | const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc, |
1164 | ValueToValueMapTy &VMap, |
1165 | SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC, |
1166 | DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, |
1167 | ScalarEvolution *SE) { |
1168 | SmallVector<BasicBlock *, 4> NewBlocks; |
1169 | NewBlocks.reserve(N: L.getNumBlocks() + ExitBlocks.size()); |
1170 | |
1171 | // We will need to clone a bunch of blocks, wrap up the clone operation in |
1172 | // a helper. |
1173 | auto CloneBlock = [&](BasicBlock *OldBB) { |
1174 | // Clone the basic block and insert it before the new preheader. |
1175 | BasicBlock *NewBB = CloneBasicBlock(BB: OldBB, VMap, NameSuffix: ".us" , F: OldBB->getParent()); |
1176 | NewBB->moveBefore(MovePos: LoopPH); |
1177 | |
1178 | // Record this block and the mapping. |
1179 | NewBlocks.push_back(Elt: NewBB); |
1180 | VMap[OldBB] = NewBB; |
1181 | |
1182 | return NewBB; |
1183 | }; |
1184 | |
1185 | // We skip cloning blocks when they have a dominating succ that is not the |
1186 | // succ we are cloning for. |
1187 | auto SkipBlock = [&](BasicBlock *BB) { |
1188 | auto It = DominatingSucc.find(Val: BB); |
1189 | return It != DominatingSucc.end() && It->second != UnswitchedSuccBB; |
1190 | }; |
1191 | |
1192 | // First, clone the preheader. |
1193 | auto *ClonedPH = CloneBlock(LoopPH); |
1194 | |
1195 | // Then clone all the loop blocks, skipping the ones that aren't necessary. |
1196 | for (auto *LoopBB : L.blocks()) |
1197 | if (!SkipBlock(LoopBB)) |
1198 | CloneBlock(LoopBB); |
1199 | |
1200 | // Split all the loop exit edges so that when we clone the exit blocks, if |
1201 | // any of the exit blocks are *also* a preheader for some other loop, we |
1202 | // don't create multiple predecessors entering the loop header. |
1203 | for (auto *ExitBB : ExitBlocks) { |
1204 | if (SkipBlock(ExitBB)) |
1205 | continue; |
1206 | |
1207 | // When we are going to clone an exit, we don't need to clone all the |
1208 | // instructions in the exit block and we want to ensure we have an easy |
1209 | // place to merge the CFG, so split the exit first. This is always safe to |
1210 | // do because there cannot be any non-loop predecessors of a loop exit in |
1211 | // loop simplified form. |
1212 | auto *MergeBB = SplitBlock(Old: ExitBB, SplitPt: ExitBB->begin(), DT: &DT, LI: &LI, MSSAU); |
1213 | |
1214 | // Rearrange the names to make it easier to write test cases by having the |
1215 | // exit block carry the suffix rather than the merge block carrying the |
1216 | // suffix. |
1217 | MergeBB->takeName(V: ExitBB); |
1218 | ExitBB->setName(Twine(MergeBB->getName()) + ".split" ); |
1219 | |
1220 | // Now clone the original exit block. |
1221 | auto *ClonedExitBB = CloneBlock(ExitBB); |
1222 | assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 && |
1223 | "Exit block should have been split to have one successor!" ); |
1224 | assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB && |
1225 | "Cloned exit block has the wrong successor!" ); |
1226 | |
1227 | // Remap any cloned instructions and create a merge phi node for them. |
1228 | for (auto ZippedInsts : llvm::zip_first( |
1229 | t: llvm::make_range(x: ExitBB->begin(), y: std::prev(x: ExitBB->end())), |
1230 | u: llvm::make_range(x: ClonedExitBB->begin(), |
1231 | y: std::prev(x: ClonedExitBB->end())))) { |
1232 | Instruction &I = std::get<0>(t&: ZippedInsts); |
1233 | Instruction &ClonedI = std::get<1>(t&: ZippedInsts); |
1234 | |
1235 | // The only instructions in the exit block should be PHI nodes and |
1236 | // potentially a landing pad. |
1237 | assert( |
1238 | (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) && |
1239 | "Bad instruction in exit block!" ); |
1240 | // We should have a value map between the instruction and its clone. |
1241 | assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!" ); |
1242 | |
1243 | // Forget SCEVs based on exit phis in case SCEV looked through the phi. |
1244 | if (SE && isa<PHINode>(Val: I)) |
1245 | SE->forgetValue(V: &I); |
1246 | |
1247 | auto *MergePN = |
1248 | PHINode::Create(Ty: I.getType(), /*NumReservedValues*/ 2, NameStr: ".us-phi" ); |
1249 | MergePN->insertBefore(InsertPos: MergeBB->getFirstInsertionPt()); |
1250 | I.replaceAllUsesWith(V: MergePN); |
1251 | MergePN->addIncoming(V: &I, BB: ExitBB); |
1252 | MergePN->addIncoming(V: &ClonedI, BB: ClonedExitBB); |
1253 | } |
1254 | } |
1255 | |
1256 | // Rewrite the instructions in the cloned blocks to refer to the instructions |
1257 | // in the cloned blocks. We have to do this as a second pass so that we have |
1258 | // everything available. Also, we have inserted new instructions which may |
1259 | // include assume intrinsics, so we update the assumption cache while |
1260 | // processing this. |
1261 | Module *M = ClonedPH->getParent()->getParent(); |
1262 | for (auto *ClonedBB : NewBlocks) |
1263 | for (Instruction &I : *ClonedBB) { |
1264 | RemapDbgVariableRecordRange(M, Range: I.getDbgRecordRange(), VM&: VMap, |
1265 | Flags: RF_NoModuleLevelChanges | |
1266 | RF_IgnoreMissingLocals); |
1267 | RemapInstruction(I: &I, VM&: VMap, |
1268 | Flags: RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); |
1269 | if (auto *II = dyn_cast<AssumeInst>(Val: &I)) |
1270 | AC.registerAssumption(CI: II); |
1271 | } |
1272 | |
1273 | // Update any PHI nodes in the cloned successors of the skipped blocks to not |
1274 | // have spurious incoming values. |
1275 | for (auto *LoopBB : L.blocks()) |
1276 | if (SkipBlock(LoopBB)) |
1277 | for (auto *SuccBB : successors(BB: LoopBB)) |
1278 | if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: SuccBB))) |
1279 | for (PHINode &PN : ClonedSuccBB->phis()) |
1280 | PN.removeIncomingValue(BB: LoopBB, /*DeletePHIIfEmpty*/ false); |
1281 | |
1282 | // Remove the cloned parent as a predecessor of any successor we ended up |
1283 | // cloning other than the unswitched one. |
1284 | auto *ClonedParentBB = cast<BasicBlock>(Val: VMap.lookup(Val: ParentBB)); |
1285 | for (auto *SuccBB : successors(BB: ParentBB)) { |
1286 | if (SuccBB == UnswitchedSuccBB) |
1287 | continue; |
1288 | |
1289 | auto *ClonedSuccBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: SuccBB)); |
1290 | if (!ClonedSuccBB) |
1291 | continue; |
1292 | |
1293 | ClonedSuccBB->removePredecessor(Pred: ClonedParentBB, |
1294 | /*KeepOneInputPHIs*/ true); |
1295 | } |
1296 | |
1297 | // Replace the cloned branch with an unconditional branch to the cloned |
1298 | // unswitched successor. |
1299 | auto *ClonedSuccBB = cast<BasicBlock>(Val: VMap.lookup(Val: UnswitchedSuccBB)); |
1300 | Instruction *ClonedTerminator = ClonedParentBB->getTerminator(); |
1301 | // Trivial Simplification. If Terminator is a conditional branch and |
1302 | // condition becomes dead - erase it. |
1303 | Value *ClonedConditionToErase = nullptr; |
1304 | if (auto *BI = dyn_cast<BranchInst>(Val: ClonedTerminator)) |
1305 | ClonedConditionToErase = BI->getCondition(); |
1306 | else if (auto *SI = dyn_cast<SwitchInst>(Val: ClonedTerminator)) |
1307 | ClonedConditionToErase = SI->getCondition(); |
1308 | |
1309 | ClonedTerminator->eraseFromParent(); |
1310 | BranchInst::Create(IfTrue: ClonedSuccBB, InsertAtEnd: ClonedParentBB); |
1311 | |
1312 | if (ClonedConditionToErase) |
1313 | RecursivelyDeleteTriviallyDeadInstructions(V: ClonedConditionToErase, TLI: nullptr, |
1314 | MSSAU); |
1315 | |
1316 | // If there are duplicate entries in the PHI nodes because of multiple edges |
1317 | // to the unswitched successor, we need to nuke all but one as we replaced it |
1318 | // with a direct branch. |
1319 | for (PHINode &PN : ClonedSuccBB->phis()) { |
1320 | bool Found = false; |
1321 | // Loop over the incoming operands backwards so we can easily delete as we |
1322 | // go without invalidating the index. |
1323 | for (int i = PN.getNumOperands() - 1; i >= 0; --i) { |
1324 | if (PN.getIncomingBlock(i) != ClonedParentBB) |
1325 | continue; |
1326 | if (!Found) { |
1327 | Found = true; |
1328 | continue; |
1329 | } |
1330 | PN.removeIncomingValue(Idx: i, /*DeletePHIIfEmpty*/ false); |
1331 | } |
1332 | } |
1333 | |
1334 | // Record the domtree updates for the new blocks. |
1335 | SmallPtrSet<BasicBlock *, 4> SuccSet; |
1336 | for (auto *ClonedBB : NewBlocks) { |
1337 | for (auto *SuccBB : successors(BB: ClonedBB)) |
1338 | if (SuccSet.insert(Ptr: SuccBB).second) |
1339 | DTUpdates.push_back(Elt: {DominatorTree::Insert, ClonedBB, SuccBB}); |
1340 | SuccSet.clear(); |
1341 | } |
1342 | |
1343 | return ClonedPH; |
1344 | } |
1345 | |
1346 | /// Recursively clone the specified loop and all of its children. |
1347 | /// |
1348 | /// The target parent loop for the clone should be provided, or can be null if |
1349 | /// the clone is a top-level loop. While cloning, all the blocks are mapped |
1350 | /// with the provided value map. The entire original loop must be present in |
1351 | /// the value map. The cloned loop is returned. |
1352 | static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL, |
1353 | const ValueToValueMapTy &VMap, LoopInfo &LI) { |
1354 | auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) { |
1355 | assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!" ); |
1356 | ClonedL.reserveBlocks(size: OrigL.getNumBlocks()); |
1357 | for (auto *BB : OrigL.blocks()) { |
1358 | auto *ClonedBB = cast<BasicBlock>(Val: VMap.lookup(Val: BB)); |
1359 | ClonedL.addBlockEntry(BB: ClonedBB); |
1360 | if (LI.getLoopFor(BB) == &OrigL) |
1361 | LI.changeLoopFor(BB: ClonedBB, L: &ClonedL); |
1362 | } |
1363 | }; |
1364 | |
1365 | // We specially handle the first loop because it may get cloned into |
1366 | // a different parent and because we most commonly are cloning leaf loops. |
1367 | Loop *ClonedRootL = LI.AllocateLoop(); |
1368 | if (RootParentL) |
1369 | RootParentL->addChildLoop(NewChild: ClonedRootL); |
1370 | else |
1371 | LI.addTopLevelLoop(New: ClonedRootL); |
1372 | AddClonedBlocksToLoop(OrigRootL, *ClonedRootL); |
1373 | |
1374 | if (OrigRootL.isInnermost()) |
1375 | return ClonedRootL; |
1376 | |
1377 | // If we have a nest, we can quickly clone the entire loop nest using an |
1378 | // iterative approach because it is a tree. We keep the cloned parent in the |
1379 | // data structure to avoid repeatedly querying through a map to find it. |
1380 | SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone; |
1381 | // Build up the loops to clone in reverse order as we'll clone them from the |
1382 | // back. |
1383 | for (Loop *ChildL : llvm::reverse(C&: OrigRootL)) |
1384 | LoopsToClone.push_back(Elt: {ClonedRootL, ChildL}); |
1385 | do { |
1386 | Loop *ClonedParentL, *L; |
1387 | std::tie(args&: ClonedParentL, args&: L) = LoopsToClone.pop_back_val(); |
1388 | Loop *ClonedL = LI.AllocateLoop(); |
1389 | ClonedParentL->addChildLoop(NewChild: ClonedL); |
1390 | AddClonedBlocksToLoop(*L, *ClonedL); |
1391 | for (Loop *ChildL : llvm::reverse(C&: *L)) |
1392 | LoopsToClone.push_back(Elt: {ClonedL, ChildL}); |
1393 | } while (!LoopsToClone.empty()); |
1394 | |
1395 | return ClonedRootL; |
1396 | } |
1397 | |
1398 | /// Build the cloned loops of an original loop from unswitching. |
1399 | /// |
1400 | /// Because unswitching simplifies the CFG of the loop, this isn't a trivial |
1401 | /// operation. We need to re-verify that there even is a loop (as the backedge |
1402 | /// may not have been cloned), and even if there are remaining backedges the |
1403 | /// backedge set may be different. However, we know that each child loop is |
1404 | /// undisturbed, we only need to find where to place each child loop within |
1405 | /// either any parent loop or within a cloned version of the original loop. |
1406 | /// |
1407 | /// Because child loops may end up cloned outside of any cloned version of the |
1408 | /// original loop, multiple cloned sibling loops may be created. All of them |
1409 | /// are returned so that the newly introduced loop nest roots can be |
1410 | /// identified. |
1411 | static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, |
1412 | const ValueToValueMapTy &VMap, LoopInfo &LI, |
1413 | SmallVectorImpl<Loop *> &NonChildClonedLoops) { |
1414 | Loop *ClonedL = nullptr; |
1415 | |
1416 | auto *OrigPH = OrigL.getLoopPreheader(); |
1417 | auto * = OrigL.getHeader(); |
1418 | |
1419 | auto *ClonedPH = cast<BasicBlock>(Val: VMap.lookup(Val: OrigPH)); |
1420 | auto * = cast<BasicBlock>(Val: VMap.lookup(Val: OrigHeader)); |
1421 | |
1422 | // We need to know the loops of the cloned exit blocks to even compute the |
1423 | // accurate parent loop. If we only clone exits to some parent of the |
1424 | // original parent, we want to clone into that outer loop. We also keep track |
1425 | // of the loops that our cloned exit blocks participate in. |
1426 | Loop *ParentL = nullptr; |
1427 | SmallVector<BasicBlock *, 4> ClonedExitsInLoops; |
1428 | SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap; |
1429 | ClonedExitsInLoops.reserve(N: ExitBlocks.size()); |
1430 | for (auto *ExitBB : ExitBlocks) |
1431 | if (auto *ClonedExitBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: ExitBB))) |
1432 | if (Loop *ExitL = LI.getLoopFor(BB: ExitBB)) { |
1433 | ExitLoopMap[ClonedExitBB] = ExitL; |
1434 | ClonedExitsInLoops.push_back(Elt: ClonedExitBB); |
1435 | if (!ParentL || (ParentL != ExitL && ParentL->contains(L: ExitL))) |
1436 | ParentL = ExitL; |
1437 | } |
1438 | assert((!ParentL || ParentL == OrigL.getParentLoop() || |
1439 | ParentL->contains(OrigL.getParentLoop())) && |
1440 | "The computed parent loop should always contain (or be) the parent of " |
1441 | "the original loop." ); |
1442 | |
1443 | // We build the set of blocks dominated by the cloned header from the set of |
1444 | // cloned blocks out of the original loop. While not all of these will |
1445 | // necessarily be in the cloned loop, it is enough to establish that they |
1446 | // aren't in unreachable cycles, etc. |
1447 | SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks; |
1448 | for (auto *BB : OrigL.blocks()) |
1449 | if (auto *ClonedBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: BB))) |
1450 | ClonedLoopBlocks.insert(X: ClonedBB); |
1451 | |
1452 | // Rebuild the set of blocks that will end up in the cloned loop. We may have |
1453 | // skipped cloning some region of this loop which can in turn skip some of |
1454 | // the backedges so we have to rebuild the blocks in the loop based on the |
1455 | // backedges that remain after cloning. |
1456 | SmallVector<BasicBlock *, 16> Worklist; |
1457 | SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop; |
1458 | for (auto *Pred : predecessors(BB: ClonedHeader)) { |
1459 | // The only possible non-loop header predecessor is the preheader because |
1460 | // we know we cloned the loop in simplified form. |
1461 | if (Pred == ClonedPH) |
1462 | continue; |
1463 | |
1464 | // Because the loop was in simplified form, the only non-loop predecessor |
1465 | // should be the preheader. |
1466 | assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop " |
1467 | "header other than the preheader " |
1468 | "that is not part of the loop!" ); |
1469 | |
1470 | // Insert this block into the loop set and on the first visit (and if it |
1471 | // isn't the header we're currently walking) put it into the worklist to |
1472 | // recurse through. |
1473 | if (BlocksInClonedLoop.insert(Ptr: Pred).second && Pred != ClonedHeader) |
1474 | Worklist.push_back(Elt: Pred); |
1475 | } |
1476 | |
1477 | // If we had any backedges then there *is* a cloned loop. Put the header into |
1478 | // the loop set and then walk the worklist backwards to find all the blocks |
1479 | // that remain within the loop after cloning. |
1480 | if (!BlocksInClonedLoop.empty()) { |
1481 | BlocksInClonedLoop.insert(Ptr: ClonedHeader); |
1482 | |
1483 | while (!Worklist.empty()) { |
1484 | BasicBlock *BB = Worklist.pop_back_val(); |
1485 | assert(BlocksInClonedLoop.count(BB) && |
1486 | "Didn't put block into the loop set!" ); |
1487 | |
1488 | // Insert any predecessors that are in the possible set into the cloned |
1489 | // set, and if the insert is successful, add them to the worklist. Note |
1490 | // that we filter on the blocks that are definitely reachable via the |
1491 | // backedge to the loop header so we may prune out dead code within the |
1492 | // cloned loop. |
1493 | for (auto *Pred : predecessors(BB)) |
1494 | if (ClonedLoopBlocks.count(key: Pred) && |
1495 | BlocksInClonedLoop.insert(Ptr: Pred).second) |
1496 | Worklist.push_back(Elt: Pred); |
1497 | } |
1498 | |
1499 | ClonedL = LI.AllocateLoop(); |
1500 | if (ParentL) { |
1501 | ParentL->addBasicBlockToLoop(NewBB: ClonedPH, LI); |
1502 | ParentL->addChildLoop(NewChild: ClonedL); |
1503 | } else { |
1504 | LI.addTopLevelLoop(New: ClonedL); |
1505 | } |
1506 | NonChildClonedLoops.push_back(Elt: ClonedL); |
1507 | |
1508 | ClonedL->reserveBlocks(size: BlocksInClonedLoop.size()); |
1509 | // We don't want to just add the cloned loop blocks based on how we |
1510 | // discovered them. The original order of blocks was carefully built in |
1511 | // a way that doesn't rely on predecessor ordering. Rather than re-invent |
1512 | // that logic, we just re-walk the original blocks (and those of the child |
1513 | // loops) and filter them as we add them into the cloned loop. |
1514 | for (auto *BB : OrigL.blocks()) { |
1515 | auto *ClonedBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: BB)); |
1516 | if (!ClonedBB || !BlocksInClonedLoop.count(Ptr: ClonedBB)) |
1517 | continue; |
1518 | |
1519 | // Directly add the blocks that are only in this loop. |
1520 | if (LI.getLoopFor(BB) == &OrigL) { |
1521 | ClonedL->addBasicBlockToLoop(NewBB: ClonedBB, LI); |
1522 | continue; |
1523 | } |
1524 | |
1525 | // We want to manually add it to this loop and parents. |
1526 | // Registering it with LoopInfo will happen when we clone the top |
1527 | // loop for this block. |
1528 | for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop()) |
1529 | PL->addBlockEntry(BB: ClonedBB); |
1530 | } |
1531 | |
1532 | // Now add each child loop whose header remains within the cloned loop. All |
1533 | // of the blocks within the loop must satisfy the same constraints as the |
1534 | // header so once we pass the header checks we can just clone the entire |
1535 | // child loop nest. |
1536 | for (Loop *ChildL : OrigL) { |
1537 | auto * = |
1538 | cast_or_null<BasicBlock>(Val: VMap.lookup(Val: ChildL->getHeader())); |
1539 | if (!ClonedChildHeader || !BlocksInClonedLoop.count(Ptr: ClonedChildHeader)) |
1540 | continue; |
1541 | |
1542 | #ifndef NDEBUG |
1543 | // We should never have a cloned child loop header but fail to have |
1544 | // all of the blocks for that child loop. |
1545 | for (auto *ChildLoopBB : ChildL->blocks()) |
1546 | assert(BlocksInClonedLoop.count( |
1547 | cast<BasicBlock>(VMap.lookup(ChildLoopBB))) && |
1548 | "Child cloned loop has a header within the cloned outer " |
1549 | "loop but not all of its blocks!" ); |
1550 | #endif |
1551 | |
1552 | cloneLoopNest(OrigRootL&: *ChildL, RootParentL: ClonedL, VMap, LI); |
1553 | } |
1554 | } |
1555 | |
1556 | // Now that we've handled all the components of the original loop that were |
1557 | // cloned into a new loop, we still need to handle anything from the original |
1558 | // loop that wasn't in a cloned loop. |
1559 | |
1560 | // Figure out what blocks are left to place within any loop nest containing |
1561 | // the unswitched loop. If we never formed a loop, the cloned PH is one of |
1562 | // them. |
1563 | SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet; |
1564 | if (BlocksInClonedLoop.empty()) |
1565 | UnloopedBlockSet.insert(Ptr: ClonedPH); |
1566 | for (auto *ClonedBB : ClonedLoopBlocks) |
1567 | if (!BlocksInClonedLoop.count(Ptr: ClonedBB)) |
1568 | UnloopedBlockSet.insert(Ptr: ClonedBB); |
1569 | |
1570 | // Copy the cloned exits and sort them in ascending loop depth, we'll work |
1571 | // backwards across these to process them inside out. The order shouldn't |
1572 | // matter as we're just trying to build up the map from inside-out; we use |
1573 | // the map in a more stably ordered way below. |
1574 | auto OrderedClonedExitsInLoops = ClonedExitsInLoops; |
1575 | llvm::sort(C&: OrderedClonedExitsInLoops, Comp: [&](BasicBlock *LHS, BasicBlock *RHS) { |
1576 | return ExitLoopMap.lookup(Val: LHS)->getLoopDepth() < |
1577 | ExitLoopMap.lookup(Val: RHS)->getLoopDepth(); |
1578 | }); |
1579 | |
1580 | // Populate the existing ExitLoopMap with everything reachable from each |
1581 | // exit, starting from the inner most exit. |
1582 | while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) { |
1583 | assert(Worklist.empty() && "Didn't clear worklist!" ); |
1584 | |
1585 | BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val(); |
1586 | Loop *ExitL = ExitLoopMap.lookup(Val: ExitBB); |
1587 | |
1588 | // Walk the CFG back until we hit the cloned PH adding everything reachable |
1589 | // and in the unlooped set to this exit block's loop. |
1590 | Worklist.push_back(Elt: ExitBB); |
1591 | do { |
1592 | BasicBlock *BB = Worklist.pop_back_val(); |
1593 | // We can stop recursing at the cloned preheader (if we get there). |
1594 | if (BB == ClonedPH) |
1595 | continue; |
1596 | |
1597 | for (BasicBlock *PredBB : predecessors(BB)) { |
1598 | // If this pred has already been moved to our set or is part of some |
1599 | // (inner) loop, no update needed. |
1600 | if (!UnloopedBlockSet.erase(Ptr: PredBB)) { |
1601 | assert( |
1602 | (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) && |
1603 | "Predecessor not mapped to a loop!" ); |
1604 | continue; |
1605 | } |
1606 | |
1607 | // We just insert into the loop set here. We'll add these blocks to the |
1608 | // exit loop after we build up the set in an order that doesn't rely on |
1609 | // predecessor order (which in turn relies on use list order). |
1610 | bool Inserted = ExitLoopMap.insert(KV: {PredBB, ExitL}).second; |
1611 | (void)Inserted; |
1612 | assert(Inserted && "Should only visit an unlooped block once!" ); |
1613 | |
1614 | // And recurse through to its predecessors. |
1615 | Worklist.push_back(Elt: PredBB); |
1616 | } |
1617 | } while (!Worklist.empty()); |
1618 | } |
1619 | |
1620 | // Now that the ExitLoopMap gives as mapping for all the non-looping cloned |
1621 | // blocks to their outer loops, walk the cloned blocks and the cloned exits |
1622 | // in their original order adding them to the correct loop. |
1623 | |
1624 | // We need a stable insertion order. We use the order of the original loop |
1625 | // order and map into the correct parent loop. |
1626 | for (auto *BB : llvm::concat<BasicBlock *const>( |
1627 | Ranges: ArrayRef(ClonedPH), Ranges&: ClonedLoopBlocks, Ranges&: ClonedExitsInLoops)) |
1628 | if (Loop *OuterL = ExitLoopMap.lookup(Val: BB)) |
1629 | OuterL->addBasicBlockToLoop(NewBB: BB, LI); |
1630 | |
1631 | #ifndef NDEBUG |
1632 | for (auto &BBAndL : ExitLoopMap) { |
1633 | auto *BB = BBAndL.first; |
1634 | auto *OuterL = BBAndL.second; |
1635 | assert(LI.getLoopFor(BB) == OuterL && |
1636 | "Failed to put all blocks into outer loops!" ); |
1637 | } |
1638 | #endif |
1639 | |
1640 | // Now that all the blocks are placed into the correct containing loop in the |
1641 | // absence of child loops, find all the potentially cloned child loops and |
1642 | // clone them into whatever outer loop we placed their header into. |
1643 | for (Loop *ChildL : OrigL) { |
1644 | auto * = |
1645 | cast_or_null<BasicBlock>(Val: VMap.lookup(Val: ChildL->getHeader())); |
1646 | if (!ClonedChildHeader || BlocksInClonedLoop.count(Ptr: ClonedChildHeader)) |
1647 | continue; |
1648 | |
1649 | #ifndef NDEBUG |
1650 | for (auto *ChildLoopBB : ChildL->blocks()) |
1651 | assert(VMap.count(ChildLoopBB) && |
1652 | "Cloned a child loop header but not all of that loops blocks!" ); |
1653 | #endif |
1654 | |
1655 | NonChildClonedLoops.push_back(Elt: cloneLoopNest( |
1656 | OrigRootL&: *ChildL, RootParentL: ExitLoopMap.lookup(Val: ClonedChildHeader), VMap, LI)); |
1657 | } |
1658 | } |
1659 | |
1660 | static void |
1661 | deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, |
1662 | ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps, |
1663 | DominatorTree &DT, MemorySSAUpdater *MSSAU) { |
1664 | // Find all the dead clones, and remove them from their successors. |
1665 | SmallVector<BasicBlock *, 16> DeadBlocks; |
1666 | for (BasicBlock *BB : llvm::concat<BasicBlock *const>(Ranges: L.blocks(), Ranges&: ExitBlocks)) |
1667 | for (const auto &VMap : VMaps) |
1668 | if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(Val: VMap->lookup(Val: BB))) |
1669 | if (!DT.isReachableFromEntry(A: ClonedBB)) { |
1670 | for (BasicBlock *SuccBB : successors(BB: ClonedBB)) |
1671 | SuccBB->removePredecessor(Pred: ClonedBB); |
1672 | DeadBlocks.push_back(Elt: ClonedBB); |
1673 | } |
1674 | |
1675 | // Remove all MemorySSA in the dead blocks |
1676 | if (MSSAU) { |
1677 | SmallSetVector<BasicBlock *, 8> DeadBlockSet(DeadBlocks.begin(), |
1678 | DeadBlocks.end()); |
1679 | MSSAU->removeBlocks(DeadBlocks: DeadBlockSet); |
1680 | } |
1681 | |
1682 | // Drop any remaining references to break cycles. |
1683 | for (BasicBlock *BB : DeadBlocks) |
1684 | BB->dropAllReferences(); |
1685 | // Erase them from the IR. |
1686 | for (BasicBlock *BB : DeadBlocks) |
1687 | BB->eraseFromParent(); |
1688 | } |
1689 | |
1690 | static void deleteDeadBlocksFromLoop(Loop &L, |
1691 | SmallVectorImpl<BasicBlock *> &ExitBlocks, |
1692 | DominatorTree &DT, LoopInfo &LI, |
1693 | MemorySSAUpdater *MSSAU, |
1694 | ScalarEvolution *SE, |
1695 | LPMUpdater &LoopUpdater) { |
1696 | // Find all the dead blocks tied to this loop, and remove them from their |
1697 | // successors. |
1698 | SmallSetVector<BasicBlock *, 8> DeadBlockSet; |
1699 | |
1700 | // Start with loop/exit blocks and get a transitive closure of reachable dead |
1701 | // blocks. |
1702 | SmallVector<BasicBlock *, 16> DeathCandidates(ExitBlocks.begin(), |
1703 | ExitBlocks.end()); |
1704 | DeathCandidates.append(in_start: L.blocks().begin(), in_end: L.blocks().end()); |
1705 | while (!DeathCandidates.empty()) { |
1706 | auto *BB = DeathCandidates.pop_back_val(); |
1707 | if (!DeadBlockSet.count(key: BB) && !DT.isReachableFromEntry(A: BB)) { |
1708 | for (BasicBlock *SuccBB : successors(BB)) { |
1709 | SuccBB->removePredecessor(Pred: BB); |
1710 | DeathCandidates.push_back(Elt: SuccBB); |
1711 | } |
1712 | DeadBlockSet.insert(X: BB); |
1713 | } |
1714 | } |
1715 | |
1716 | // Remove all MemorySSA in the dead blocks |
1717 | if (MSSAU) |
1718 | MSSAU->removeBlocks(DeadBlocks: DeadBlockSet); |
1719 | |
1720 | // Filter out the dead blocks from the exit blocks list so that it can be |
1721 | // used in the caller. |
1722 | llvm::erase_if(C&: ExitBlocks, |
1723 | P: [&](BasicBlock *BB) { return DeadBlockSet.count(key: BB); }); |
1724 | |
1725 | // Walk from this loop up through its parents removing all of the dead blocks. |
1726 | for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) { |
1727 | for (auto *BB : DeadBlockSet) |
1728 | ParentL->getBlocksSet().erase(Ptr: BB); |
1729 | llvm::erase_if(C&: ParentL->getBlocksVector(), |
1730 | P: [&](BasicBlock *BB) { return DeadBlockSet.count(key: BB); }); |
1731 | } |
1732 | |
1733 | // Now delete the dead child loops. This raw delete will clear them |
1734 | // recursively. |
1735 | llvm::erase_if(C&: L.getSubLoopsVector(), P: [&](Loop *ChildL) { |
1736 | if (!DeadBlockSet.count(key: ChildL->getHeader())) |
1737 | return false; |
1738 | |
1739 | assert(llvm::all_of(ChildL->blocks(), |
1740 | [&](BasicBlock *ChildBB) { |
1741 | return DeadBlockSet.count(ChildBB); |
1742 | }) && |
1743 | "If the child loop header is dead all blocks in the child loop must " |
1744 | "be dead as well!" ); |
1745 | LoopUpdater.markLoopAsDeleted(L&: *ChildL, Name: ChildL->getName()); |
1746 | if (SE) |
1747 | SE->forgetBlockAndLoopDispositions(); |
1748 | LI.destroy(L: ChildL); |
1749 | return true; |
1750 | }); |
1751 | |
1752 | // Remove the loop mappings for the dead blocks and drop all the references |
1753 | // from these blocks to others to handle cyclic references as we start |
1754 | // deleting the blocks themselves. |
1755 | for (auto *BB : DeadBlockSet) { |
1756 | // Check that the dominator tree has already been updated. |
1757 | assert(!DT.getNode(BB) && "Should already have cleared domtree!" ); |
1758 | LI.changeLoopFor(BB, L: nullptr); |
1759 | // Drop all uses of the instructions to make sure we won't have dangling |
1760 | // uses in other blocks. |
1761 | for (auto &I : *BB) |
1762 | if (!I.use_empty()) |
1763 | I.replaceAllUsesWith(V: PoisonValue::get(T: I.getType())); |
1764 | BB->dropAllReferences(); |
1765 | } |
1766 | |
1767 | // Actually delete the blocks now that they've been fully unhooked from the |
1768 | // IR. |
1769 | for (auto *BB : DeadBlockSet) |
1770 | BB->eraseFromParent(); |
1771 | } |
1772 | |
1773 | /// Recompute the set of blocks in a loop after unswitching. |
1774 | /// |
1775 | /// This walks from the original headers predecessors to rebuild the loop. We |
1776 | /// take advantage of the fact that new blocks can't have been added, and so we |
1777 | /// filter by the original loop's blocks. This also handles potentially |
1778 | /// unreachable code that we don't want to explore but might be found examining |
1779 | /// the predecessors of the header. |
1780 | /// |
1781 | /// If the original loop is no longer a loop, this will return an empty set. If |
1782 | /// it remains a loop, all the blocks within it will be added to the set |
1783 | /// (including those blocks in inner loops). |
1784 | static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L, |
1785 | LoopInfo &LI) { |
1786 | SmallPtrSet<const BasicBlock *, 16> LoopBlockSet; |
1787 | |
1788 | auto *PH = L.getLoopPreheader(); |
1789 | auto * = L.getHeader(); |
1790 | |
1791 | // A worklist to use while walking backwards from the header. |
1792 | SmallVector<BasicBlock *, 16> Worklist; |
1793 | |
1794 | // First walk the predecessors of the header to find the backedges. This will |
1795 | // form the basis of our walk. |
1796 | for (auto *Pred : predecessors(BB: Header)) { |
1797 | // Skip the preheader. |
1798 | if (Pred == PH) |
1799 | continue; |
1800 | |
1801 | // Because the loop was in simplified form, the only non-loop predecessor |
1802 | // is the preheader. |
1803 | assert(L.contains(Pred) && "Found a predecessor of the loop header other " |
1804 | "than the preheader that is not part of the " |
1805 | "loop!" ); |
1806 | |
1807 | // Insert this block into the loop set and on the first visit and, if it |
1808 | // isn't the header we're currently walking, put it into the worklist to |
1809 | // recurse through. |
1810 | if (LoopBlockSet.insert(Ptr: Pred).second && Pred != Header) |
1811 | Worklist.push_back(Elt: Pred); |
1812 | } |
1813 | |
1814 | // If no backedges were found, we're done. |
1815 | if (LoopBlockSet.empty()) |
1816 | return LoopBlockSet; |
1817 | |
1818 | // We found backedges, recurse through them to identify the loop blocks. |
1819 | while (!Worklist.empty()) { |
1820 | BasicBlock *BB = Worklist.pop_back_val(); |
1821 | assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!" ); |
1822 | |
1823 | // No need to walk past the header. |
1824 | if (BB == Header) |
1825 | continue; |
1826 | |
1827 | // Because we know the inner loop structure remains valid we can use the |
1828 | // loop structure to jump immediately across the entire nested loop. |
1829 | // Further, because it is in loop simplified form, we can directly jump |
1830 | // to its preheader afterward. |
1831 | if (Loop *InnerL = LI.getLoopFor(BB)) |
1832 | if (InnerL != &L) { |
1833 | assert(L.contains(InnerL) && |
1834 | "Should not reach a loop *outside* this loop!" ); |
1835 | // The preheader is the only possible predecessor of the loop so |
1836 | // insert it into the set and check whether it was already handled. |
1837 | auto *InnerPH = InnerL->getLoopPreheader(); |
1838 | assert(L.contains(InnerPH) && "Cannot contain an inner loop block " |
1839 | "but not contain the inner loop " |
1840 | "preheader!" ); |
1841 | if (!LoopBlockSet.insert(Ptr: InnerPH).second) |
1842 | // The only way to reach the preheader is through the loop body |
1843 | // itself so if it has been visited the loop is already handled. |
1844 | continue; |
1845 | |
1846 | // Insert all of the blocks (other than those already present) into |
1847 | // the loop set. We expect at least the block that led us to find the |
1848 | // inner loop to be in the block set, but we may also have other loop |
1849 | // blocks if they were already enqueued as predecessors of some other |
1850 | // outer loop block. |
1851 | for (auto *InnerBB : InnerL->blocks()) { |
1852 | if (InnerBB == BB) { |
1853 | assert(LoopBlockSet.count(InnerBB) && |
1854 | "Block should already be in the set!" ); |
1855 | continue; |
1856 | } |
1857 | |
1858 | LoopBlockSet.insert(Ptr: InnerBB); |
1859 | } |
1860 | |
1861 | // Add the preheader to the worklist so we will continue past the |
1862 | // loop body. |
1863 | Worklist.push_back(Elt: InnerPH); |
1864 | continue; |
1865 | } |
1866 | |
1867 | // Insert any predecessors that were in the original loop into the new |
1868 | // set, and if the insert is successful, add them to the worklist. |
1869 | for (auto *Pred : predecessors(BB)) |
1870 | if (L.contains(BB: Pred) && LoopBlockSet.insert(Ptr: Pred).second) |
1871 | Worklist.push_back(Elt: Pred); |
1872 | } |
1873 | |
1874 | assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!" ); |
1875 | |
1876 | // We've found all the blocks participating in the loop, return our completed |
1877 | // set. |
1878 | return LoopBlockSet; |
1879 | } |
1880 | |
1881 | /// Rebuild a loop after unswitching removes some subset of blocks and edges. |
1882 | /// |
1883 | /// The removal may have removed some child loops entirely but cannot have |
1884 | /// disturbed any remaining child loops. However, they may need to be hoisted |
1885 | /// to the parent loop (or to be top-level loops). The original loop may be |
1886 | /// completely removed. |
1887 | /// |
1888 | /// The sibling loops resulting from this update are returned. If the original |
1889 | /// loop remains a valid loop, it will be the first entry in this list with all |
1890 | /// of the newly sibling loops following it. |
1891 | /// |
1892 | /// Returns true if the loop remains a loop after unswitching, and false if it |
1893 | /// is no longer a loop after unswitching (and should not continue to be |
1894 | /// referenced). |
1895 | static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, |
1896 | LoopInfo &LI, |
1897 | SmallVectorImpl<Loop *> &HoistedLoops, |
1898 | ScalarEvolution *SE) { |
1899 | auto *PH = L.getLoopPreheader(); |
1900 | |
1901 | // Compute the actual parent loop from the exit blocks. Because we may have |
1902 | // pruned some exits the loop may be different from the original parent. |
1903 | Loop *ParentL = nullptr; |
1904 | SmallVector<Loop *, 4> ExitLoops; |
1905 | SmallVector<BasicBlock *, 4> ExitsInLoops; |
1906 | ExitsInLoops.reserve(N: ExitBlocks.size()); |
1907 | for (auto *ExitBB : ExitBlocks) |
1908 | if (Loop *ExitL = LI.getLoopFor(BB: ExitBB)) { |
1909 | ExitLoops.push_back(Elt: ExitL); |
1910 | ExitsInLoops.push_back(Elt: ExitBB); |
1911 | if (!ParentL || (ParentL != ExitL && ParentL->contains(L: ExitL))) |
1912 | ParentL = ExitL; |
1913 | } |
1914 | |
1915 | // Recompute the blocks participating in this loop. This may be empty if it |
1916 | // is no longer a loop. |
1917 | auto LoopBlockSet = recomputeLoopBlockSet(L, LI); |
1918 | |
1919 | // If we still have a loop, we need to re-set the loop's parent as the exit |
1920 | // block set changing may have moved it within the loop nest. Note that this |
1921 | // can only happen when this loop has a parent as it can only hoist the loop |
1922 | // *up* the nest. |
1923 | if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) { |
1924 | // Remove this loop's (original) blocks from all of the intervening loops. |
1925 | for (Loop *IL = L.getParentLoop(); IL != ParentL; |
1926 | IL = IL->getParentLoop()) { |
1927 | IL->getBlocksSet().erase(Ptr: PH); |
1928 | for (auto *BB : L.blocks()) |
1929 | IL->getBlocksSet().erase(Ptr: BB); |
1930 | llvm::erase_if(C&: IL->getBlocksVector(), P: [&](BasicBlock *BB) { |
1931 | return BB == PH || L.contains(BB); |
1932 | }); |
1933 | } |
1934 | |
1935 | LI.changeLoopFor(BB: PH, L: ParentL); |
1936 | L.getParentLoop()->removeChildLoop(Child: &L); |
1937 | if (ParentL) |
1938 | ParentL->addChildLoop(NewChild: &L); |
1939 | else |
1940 | LI.addTopLevelLoop(New: &L); |
1941 | } |
1942 | |
1943 | // Now we update all the blocks which are no longer within the loop. |
1944 | auto &Blocks = L.getBlocksVector(); |
1945 | auto BlocksSplitI = |
1946 | LoopBlockSet.empty() |
1947 | ? Blocks.begin() |
1948 | : std::stable_partition( |
1949 | first: Blocks.begin(), last: Blocks.end(), |
1950 | pred: [&](BasicBlock *BB) { return LoopBlockSet.count(Ptr: BB); }); |
1951 | |
1952 | // Before we erase the list of unlooped blocks, build a set of them. |
1953 | SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end()); |
1954 | if (LoopBlockSet.empty()) |
1955 | UnloopedBlocks.insert(Ptr: PH); |
1956 | |
1957 | // Now erase these blocks from the loop. |
1958 | for (auto *BB : make_range(x: BlocksSplitI, y: Blocks.end())) |
1959 | L.getBlocksSet().erase(Ptr: BB); |
1960 | Blocks.erase(first: BlocksSplitI, last: Blocks.end()); |
1961 | |
1962 | // Sort the exits in ascending loop depth, we'll work backwards across these |
1963 | // to process them inside out. |
1964 | llvm::stable_sort(Range&: ExitsInLoops, C: [&](BasicBlock *LHS, BasicBlock *RHS) { |
1965 | return LI.getLoopDepth(BB: LHS) < LI.getLoopDepth(BB: RHS); |
1966 | }); |
1967 | |
1968 | // We'll build up a set for each exit loop. |
1969 | SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks; |
1970 | Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop. |
1971 | |
1972 | auto RemoveUnloopedBlocksFromLoop = |
1973 | [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) { |
1974 | for (auto *BB : UnloopedBlocks) |
1975 | L.getBlocksSet().erase(Ptr: BB); |
1976 | llvm::erase_if(C&: L.getBlocksVector(), P: [&](BasicBlock *BB) { |
1977 | return UnloopedBlocks.count(Ptr: BB); |
1978 | }); |
1979 | }; |
1980 | |
1981 | SmallVector<BasicBlock *, 16> Worklist; |
1982 | while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) { |
1983 | assert(Worklist.empty() && "Didn't clear worklist!" ); |
1984 | assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!" ); |
1985 | |
1986 | // Grab the next exit block, in decreasing loop depth order. |
1987 | BasicBlock *ExitBB = ExitsInLoops.pop_back_val(); |
1988 | Loop &ExitL = *LI.getLoopFor(BB: ExitBB); |
1989 | assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!" ); |
1990 | |
1991 | // Erase all of the unlooped blocks from the loops between the previous |
1992 | // exit loop and this exit loop. This works because the ExitInLoops list is |
1993 | // sorted in increasing order of loop depth and thus we visit loops in |
1994 | // decreasing order of loop depth. |
1995 | for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop()) |
1996 | RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks); |
1997 | |
1998 | // Walk the CFG back until we hit the cloned PH adding everything reachable |
1999 | // and in the unlooped set to this exit block's loop. |
2000 | Worklist.push_back(Elt: ExitBB); |
2001 | do { |
2002 | BasicBlock *BB = Worklist.pop_back_val(); |
2003 | // We can stop recursing at the cloned preheader (if we get there). |
2004 | if (BB == PH) |
2005 | continue; |
2006 | |
2007 | for (BasicBlock *PredBB : predecessors(BB)) { |
2008 | // If this pred has already been moved to our set or is part of some |
2009 | // (inner) loop, no update needed. |
2010 | if (!UnloopedBlocks.erase(Ptr: PredBB)) { |
2011 | assert((NewExitLoopBlocks.count(PredBB) || |
2012 | ExitL.contains(LI.getLoopFor(PredBB))) && |
2013 | "Predecessor not in a nested loop (or already visited)!" ); |
2014 | continue; |
2015 | } |
2016 | |
2017 | // We just insert into the loop set here. We'll add these blocks to the |
2018 | // exit loop after we build up the set in a deterministic order rather |
2019 | // than the predecessor-influenced visit order. |
2020 | bool Inserted = NewExitLoopBlocks.insert(Ptr: PredBB).second; |
2021 | (void)Inserted; |
2022 | assert(Inserted && "Should only visit an unlooped block once!" ); |
2023 | |
2024 | // And recurse through to its predecessors. |
2025 | Worklist.push_back(Elt: PredBB); |
2026 | } |
2027 | } while (!Worklist.empty()); |
2028 | |
2029 | // If blocks in this exit loop were directly part of the original loop (as |
2030 | // opposed to a child loop) update the map to point to this exit loop. This |
2031 | // just updates a map and so the fact that the order is unstable is fine. |
2032 | for (auto *BB : NewExitLoopBlocks) |
2033 | if (Loop *BBL = LI.getLoopFor(BB)) |
2034 | if (BBL == &L || !L.contains(L: BBL)) |
2035 | LI.changeLoopFor(BB, L: &ExitL); |
2036 | |
2037 | // We will remove the remaining unlooped blocks from this loop in the next |
2038 | // iteration or below. |
2039 | NewExitLoopBlocks.clear(); |
2040 | } |
2041 | |
2042 | // Any remaining unlooped blocks are no longer part of any loop unless they |
2043 | // are part of some child loop. |
2044 | for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop()) |
2045 | RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks); |
2046 | for (auto *BB : UnloopedBlocks) |
2047 | if (Loop *BBL = LI.getLoopFor(BB)) |
2048 | if (BBL == &L || !L.contains(L: BBL)) |
2049 | LI.changeLoopFor(BB, L: nullptr); |
2050 | |
2051 | // Sink all the child loops whose headers are no longer in the loop set to |
2052 | // the parent (or to be top level loops). We reach into the loop and directly |
2053 | // update its subloop vector to make this batch update efficient. |
2054 | auto &SubLoops = L.getSubLoopsVector(); |
2055 | auto SubLoopsSplitI = |
2056 | LoopBlockSet.empty() |
2057 | ? SubLoops.begin() |
2058 | : std::stable_partition( |
2059 | first: SubLoops.begin(), last: SubLoops.end(), pred: [&](Loop *SubL) { |
2060 | return LoopBlockSet.count(Ptr: SubL->getHeader()); |
2061 | }); |
2062 | for (auto *HoistedL : make_range(x: SubLoopsSplitI, y: SubLoops.end())) { |
2063 | HoistedLoops.push_back(Elt: HoistedL); |
2064 | HoistedL->setParentLoop(nullptr); |
2065 | |
2066 | // To compute the new parent of this hoisted loop we look at where we |
2067 | // placed the preheader above. We can't lookup the header itself because we |
2068 | // retained the mapping from the header to the hoisted loop. But the |
2069 | // preheader and header should have the exact same new parent computed |
2070 | // based on the set of exit blocks from the original loop as the preheader |
2071 | // is a predecessor of the header and so reached in the reverse walk. And |
2072 | // because the loops were all in simplified form the preheader of the |
2073 | // hoisted loop can't be part of some *other* loop. |
2074 | if (auto *NewParentL = LI.getLoopFor(BB: HoistedL->getLoopPreheader())) |
2075 | NewParentL->addChildLoop(NewChild: HoistedL); |
2076 | else |
2077 | LI.addTopLevelLoop(New: HoistedL); |
2078 | } |
2079 | SubLoops.erase(first: SubLoopsSplitI, last: SubLoops.end()); |
2080 | |
2081 | // Actually delete the loop if nothing remained within it. |
2082 | if (Blocks.empty()) { |
2083 | assert(SubLoops.empty() && |
2084 | "Failed to remove all subloops from the original loop!" ); |
2085 | if (Loop *ParentL = L.getParentLoop()) |
2086 | ParentL->removeChildLoop(I: llvm::find(Range&: *ParentL, Val: &L)); |
2087 | else |
2088 | LI.removeLoop(I: llvm::find(Range&: LI, Val: &L)); |
2089 | // markLoopAsDeleted for L should be triggered by the caller (it is |
2090 | // typically done within postUnswitch). |
2091 | if (SE) |
2092 | SE->forgetBlockAndLoopDispositions(); |
2093 | LI.destroy(L: &L); |
2094 | return false; |
2095 | } |
2096 | |
2097 | return true; |
2098 | } |
2099 | |
2100 | /// Helper to visit a dominator subtree, invoking a callable on each node. |
2101 | /// |
2102 | /// Returning false at any point will stop walking past that node of the tree. |
2103 | template <typename CallableT> |
2104 | void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) { |
2105 | SmallVector<DomTreeNode *, 4> DomWorklist; |
2106 | DomWorklist.push_back(Elt: DT[BB]); |
2107 | #ifndef NDEBUG |
2108 | SmallPtrSet<DomTreeNode *, 4> Visited; |
2109 | Visited.insert(Ptr: DT[BB]); |
2110 | #endif |
2111 | do { |
2112 | DomTreeNode *N = DomWorklist.pop_back_val(); |
2113 | |
2114 | // Visit this node. |
2115 | if (!Callable(N->getBlock())) |
2116 | continue; |
2117 | |
2118 | // Accumulate the child nodes. |
2119 | for (DomTreeNode *ChildN : *N) { |
2120 | assert(Visited.insert(ChildN).second && |
2121 | "Cannot visit a node twice when walking a tree!" ); |
2122 | DomWorklist.push_back(Elt: ChildN); |
2123 | } |
2124 | } while (!DomWorklist.empty()); |
2125 | } |
2126 | |
2127 | void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName, |
2128 | bool CurrentLoopValid, bool PartiallyInvariant, |
2129 | bool InjectedCondition, ArrayRef<Loop *> NewLoops) { |
2130 | // If we did a non-trivial unswitch, we have added new (cloned) loops. |
2131 | if (!NewLoops.empty()) |
2132 | U.addSiblingLoops(NewSibLoops: NewLoops); |
2133 | |
2134 | // If the current loop remains valid, we should revisit it to catch any |
2135 | // other unswitch opportunities. Otherwise, we need to mark it as deleted. |
2136 | if (CurrentLoopValid) { |
2137 | if (PartiallyInvariant) { |
2138 | // Mark the new loop as partially unswitched, to avoid unswitching on |
2139 | // the same condition again. |
2140 | auto &Context = L.getHeader()->getContext(); |
2141 | MDNode *DisableUnswitchMD = MDNode::get( |
2142 | Context, |
2143 | MDs: MDString::get(Context, Str: "llvm.loop.unswitch.partial.disable" )); |
2144 | MDNode *NewLoopID = makePostTransformationMetadata( |
2145 | Context, OrigLoopID: L.getLoopID(), RemovePrefixes: {"llvm.loop.unswitch.partial" }, |
2146 | AddAttrs: {DisableUnswitchMD}); |
2147 | L.setLoopID(NewLoopID); |
2148 | } else if (InjectedCondition) { |
2149 | // Do the same for injection of invariant conditions. |
2150 | auto &Context = L.getHeader()->getContext(); |
2151 | MDNode *DisableUnswitchMD = MDNode::get( |
2152 | Context, |
2153 | MDs: MDString::get(Context, Str: "llvm.loop.unswitch.injection.disable" )); |
2154 | MDNode *NewLoopID = makePostTransformationMetadata( |
2155 | Context, OrigLoopID: L.getLoopID(), RemovePrefixes: {"llvm.loop.unswitch.injection" }, |
2156 | AddAttrs: {DisableUnswitchMD}); |
2157 | L.setLoopID(NewLoopID); |
2158 | } else |
2159 | U.revisitCurrentLoop(); |
2160 | } else |
2161 | U.markLoopAsDeleted(L, Name: LoopName); |
2162 | } |
2163 | |
2164 | static void unswitchNontrivialInvariants( |
2165 | Loop &L, Instruction &TI, ArrayRef<Value *> Invariants, |
2166 | IVConditionInfo &PartialIVInfo, DominatorTree &DT, LoopInfo &LI, |
2167 | AssumptionCache &AC, ScalarEvolution *SE, MemorySSAUpdater *MSSAU, |
2168 | LPMUpdater &LoopUpdater, bool InsertFreeze, bool InjectedCondition) { |
2169 | auto *ParentBB = TI.getParent(); |
2170 | BranchInst *BI = dyn_cast<BranchInst>(Val: &TI); |
2171 | SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(Val: &TI); |
2172 | |
2173 | // Save the current loop name in a variable so that we can report it even |
2174 | // after it has been deleted. |
2175 | std::string LoopName(L.getName()); |
2176 | |
2177 | // We can only unswitch switches, conditional branches with an invariant |
2178 | // condition, or combining invariant conditions with an instruction or |
2179 | // partially invariant instructions. |
2180 | assert((SI || (BI && BI->isConditional())) && |
2181 | "Can only unswitch switches and conditional branch!" ); |
2182 | bool PartiallyInvariant = !PartialIVInfo.InstToDuplicate.empty(); |
2183 | bool FullUnswitch = |
2184 | SI || (skipTrivialSelect(Cond: BI->getCondition()) == Invariants[0] && |
2185 | !PartiallyInvariant); |
2186 | if (FullUnswitch) |
2187 | assert(Invariants.size() == 1 && |
2188 | "Cannot have other invariants with full unswitching!" ); |
2189 | else |
2190 | assert(isa<Instruction>(skipTrivialSelect(BI->getCondition())) && |
2191 | "Partial unswitching requires an instruction as the condition!" ); |
2192 | |
2193 | if (MSSAU && VerifyMemorySSA) |
2194 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2195 | |
2196 | // Constant and BBs tracking the cloned and continuing successor. When we are |
2197 | // unswitching the entire condition, this can just be trivially chosen to |
2198 | // unswitch towards `true`. However, when we are unswitching a set of |
2199 | // invariants combined with `and` or `or` or partially invariant instructions, |
2200 | // the combining operation determines the best direction to unswitch: we want |
2201 | // to unswitch the direction that will collapse the branch. |
2202 | bool Direction = true; |
2203 | int ClonedSucc = 0; |
2204 | if (!FullUnswitch) { |
2205 | Value *Cond = skipTrivialSelect(Cond: BI->getCondition()); |
2206 | (void)Cond; |
2207 | assert(((match(Cond, m_LogicalAnd()) ^ match(Cond, m_LogicalOr())) || |
2208 | PartiallyInvariant) && |
2209 | "Only `or`, `and`, an `select`, partially invariant instructions " |
2210 | "can combine invariants being unswitched." ); |
2211 | if (!match(V: Cond, P: m_LogicalOr())) { |
2212 | if (match(V: Cond, P: m_LogicalAnd()) || |
2213 | (PartiallyInvariant && !PartialIVInfo.KnownValue->isOneValue())) { |
2214 | Direction = false; |
2215 | ClonedSucc = 1; |
2216 | } |
2217 | } |
2218 | } |
2219 | |
2220 | BasicBlock *RetainedSuccBB = |
2221 | BI ? BI->getSuccessor(i: 1 - ClonedSucc) : SI->getDefaultDest(); |
2222 | SmallSetVector<BasicBlock *, 4> UnswitchedSuccBBs; |
2223 | if (BI) |
2224 | UnswitchedSuccBBs.insert(X: BI->getSuccessor(i: ClonedSucc)); |
2225 | else |
2226 | for (auto Case : SI->cases()) |
2227 | if (Case.getCaseSuccessor() != RetainedSuccBB) |
2228 | UnswitchedSuccBBs.insert(X: Case.getCaseSuccessor()); |
2229 | |
2230 | assert(!UnswitchedSuccBBs.count(RetainedSuccBB) && |
2231 | "Should not unswitch the same successor we are retaining!" ); |
2232 | |
2233 | // The branch should be in this exact loop. Any inner loop's invariant branch |
2234 | // should be handled by unswitching that inner loop. The caller of this |
2235 | // routine should filter out any candidates that remain (but were skipped for |
2236 | // whatever reason). |
2237 | assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!" ); |
2238 | |
2239 | // Compute the parent loop now before we start hacking on things. |
2240 | Loop *ParentL = L.getParentLoop(); |
2241 | // Get blocks in RPO order for MSSA update, before changing the CFG. |
2242 | LoopBlocksRPO LBRPO(&L); |
2243 | if (MSSAU) |
2244 | LBRPO.perform(LI: &LI); |
2245 | |
2246 | // Compute the outer-most loop containing one of our exit blocks. This is the |
2247 | // furthest up our loopnest which can be mutated, which we will use below to |
2248 | // update things. |
2249 | Loop *OuterExitL = &L; |
2250 | SmallVector<BasicBlock *, 4> ExitBlocks; |
2251 | L.getUniqueExitBlocks(ExitBlocks); |
2252 | for (auto *ExitBB : ExitBlocks) { |
2253 | // ExitBB can be an exit block for several levels in the loop nest. Make |
2254 | // sure we find the top most. |
2255 | Loop *NewOuterExitL = getTopMostExitingLoop(ExitBB, LI); |
2256 | if (!NewOuterExitL) { |
2257 | // We exited the entire nest with this block, so we're done. |
2258 | OuterExitL = nullptr; |
2259 | break; |
2260 | } |
2261 | if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(L: OuterExitL)) |
2262 | OuterExitL = NewOuterExitL; |
2263 | } |
2264 | |
2265 | // At this point, we're definitely going to unswitch something so invalidate |
2266 | // any cached information in ScalarEvolution for the outer most loop |
2267 | // containing an exit block and all nested loops. |
2268 | if (SE) { |
2269 | if (OuterExitL) |
2270 | SE->forgetLoop(L: OuterExitL); |
2271 | else |
2272 | SE->forgetTopmostLoop(L: &L); |
2273 | SE->forgetBlockAndLoopDispositions(); |
2274 | } |
2275 | |
2276 | // If the edge from this terminator to a successor dominates that successor, |
2277 | // store a map from each block in its dominator subtree to it. This lets us |
2278 | // tell when cloning for a particular successor if a block is dominated by |
2279 | // some *other* successor with a single data structure. We use this to |
2280 | // significantly reduce cloning. |
2281 | SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc; |
2282 | for (auto *SuccBB : llvm::concat<BasicBlock *const>(Ranges: ArrayRef(RetainedSuccBB), |
2283 | Ranges&: UnswitchedSuccBBs)) |
2284 | if (SuccBB->getUniquePredecessor() || |
2285 | llvm::all_of(Range: predecessors(BB: SuccBB), P: [&](BasicBlock *PredBB) { |
2286 | return PredBB == ParentBB || DT.dominates(A: SuccBB, B: PredBB); |
2287 | })) |
2288 | visitDomSubTree(DT, BB: SuccBB, Callable: [&](BasicBlock *BB) { |
2289 | DominatingSucc[BB] = SuccBB; |
2290 | return true; |
2291 | }); |
2292 | |
2293 | // Split the preheader, so that we know that there is a safe place to insert |
2294 | // the conditional branch. We will change the preheader to have a conditional |
2295 | // branch on LoopCond. The original preheader will become the split point |
2296 | // between the unswitched versions, and we will have a new preheader for the |
2297 | // original loop. |
2298 | BasicBlock *SplitBB = L.getLoopPreheader(); |
2299 | BasicBlock *LoopPH = SplitEdge(From: SplitBB, To: L.getHeader(), DT: &DT, LI: &LI, MSSAU); |
2300 | |
2301 | // Keep track of the dominator tree updates needed. |
2302 | SmallVector<DominatorTree::UpdateType, 4> DTUpdates; |
2303 | |
2304 | // Clone the loop for each unswitched successor. |
2305 | SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; |
2306 | VMaps.reserve(N: UnswitchedSuccBBs.size()); |
2307 | SmallDenseMap<BasicBlock *, BasicBlock *, 4> ClonedPHs; |
2308 | for (auto *SuccBB : UnswitchedSuccBBs) { |
2309 | VMaps.emplace_back(Args: new ValueToValueMapTy()); |
2310 | ClonedPHs[SuccBB] = buildClonedLoopBlocks( |
2311 | L, LoopPH, SplitBB, ExitBlocks, ParentBB, UnswitchedSuccBB: SuccBB, ContinueSuccBB: RetainedSuccBB, |
2312 | DominatingSucc, VMap&: *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU, SE); |
2313 | } |
2314 | |
2315 | // Drop metadata if we may break its semantics by moving this instr into the |
2316 | // split block. |
2317 | if (TI.getMetadata(KindID: LLVMContext::MD_make_implicit)) { |
2318 | if (DropNonTrivialImplicitNullChecks) |
2319 | // Do not spend time trying to understand if we can keep it, just drop it |
2320 | // to save compile time. |
2321 | TI.setMetadata(KindID: LLVMContext::MD_make_implicit, Node: nullptr); |
2322 | else { |
2323 | // It is only legal to preserve make.implicit metadata if we are |
2324 | // guaranteed no reach implicit null check after following this branch. |
2325 | ICFLoopSafetyInfo SafetyInfo; |
2326 | SafetyInfo.computeLoopSafetyInfo(CurLoop: &L); |
2327 | if (!SafetyInfo.isGuaranteedToExecute(Inst: TI, DT: &DT, CurLoop: &L)) |
2328 | TI.setMetadata(KindID: LLVMContext::MD_make_implicit, Node: nullptr); |
2329 | } |
2330 | } |
2331 | |
2332 | // The stitching of the branched code back together depends on whether we're |
2333 | // doing full unswitching or not with the exception that we always want to |
2334 | // nuke the initial terminator placed in the split block. |
2335 | SplitBB->getTerminator()->eraseFromParent(); |
2336 | if (FullUnswitch) { |
2337 | // Splice the terminator from the original loop and rewrite its |
2338 | // successors. |
2339 | TI.moveBefore(BB&: *SplitBB, I: SplitBB->end()); |
2340 | |
2341 | // Keep a clone of the terminator for MSSA updates. |
2342 | Instruction *NewTI = TI.clone(); |
2343 | NewTI->insertInto(ParentBB, It: ParentBB->end()); |
2344 | |
2345 | // First wire up the moved terminator to the preheaders. |
2346 | if (BI) { |
2347 | BasicBlock *ClonedPH = ClonedPHs.begin()->second; |
2348 | BI->setSuccessor(idx: ClonedSucc, NewSucc: ClonedPH); |
2349 | BI->setSuccessor(idx: 1 - ClonedSucc, NewSucc: LoopPH); |
2350 | Value *Cond = skipTrivialSelect(Cond: BI->getCondition()); |
2351 | if (InsertFreeze) |
2352 | Cond = new FreezeInst(Cond, Cond->getName() + ".fr" , BI->getIterator()); |
2353 | BI->setCondition(Cond); |
2354 | DTUpdates.push_back(Elt: {DominatorTree::Insert, SplitBB, ClonedPH}); |
2355 | } else { |
2356 | assert(SI && "Must either be a branch or switch!" ); |
2357 | |
2358 | // Walk the cases and directly update their successors. |
2359 | assert(SI->getDefaultDest() == RetainedSuccBB && |
2360 | "Not retaining default successor!" ); |
2361 | SI->setDefaultDest(LoopPH); |
2362 | for (const auto &Case : SI->cases()) |
2363 | if (Case.getCaseSuccessor() == RetainedSuccBB) |
2364 | Case.setSuccessor(LoopPH); |
2365 | else |
2366 | Case.setSuccessor(ClonedPHs.find(Val: Case.getCaseSuccessor())->second); |
2367 | |
2368 | if (InsertFreeze) |
2369 | SI->setCondition(new FreezeInst(SI->getCondition(), |
2370 | SI->getCondition()->getName() + ".fr" , |
2371 | SI->getIterator())); |
2372 | |
2373 | // We need to use the set to populate domtree updates as even when there |
2374 | // are multiple cases pointing at the same successor we only want to |
2375 | // remove and insert one edge in the domtree. |
2376 | for (BasicBlock *SuccBB : UnswitchedSuccBBs) |
2377 | DTUpdates.push_back( |
2378 | Elt: {DominatorTree::Insert, SplitBB, ClonedPHs.find(Val: SuccBB)->second}); |
2379 | } |
2380 | |
2381 | if (MSSAU) { |
2382 | DT.applyUpdates(Updates: DTUpdates); |
2383 | DTUpdates.clear(); |
2384 | |
2385 | // Remove all but one edge to the retained block and all unswitched |
2386 | // blocks. This is to avoid having duplicate entries in the cloned Phis, |
2387 | // when we know we only keep a single edge for each case. |
2388 | MSSAU->removeDuplicatePhiEdgesBetween(From: ParentBB, To: RetainedSuccBB); |
2389 | for (BasicBlock *SuccBB : UnswitchedSuccBBs) |
2390 | MSSAU->removeDuplicatePhiEdgesBetween(From: ParentBB, To: SuccBB); |
2391 | |
2392 | for (auto &VMap : VMaps) |
2393 | MSSAU->updateForClonedLoop(LoopBlocks: LBRPO, ExitBlocks, VM: *VMap, |
2394 | /*IgnoreIncomingWithNoClones=*/true); |
2395 | MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT); |
2396 | |
2397 | // Remove all edges to unswitched blocks. |
2398 | for (BasicBlock *SuccBB : UnswitchedSuccBBs) |
2399 | MSSAU->removeEdge(From: ParentBB, To: SuccBB); |
2400 | } |
2401 | |
2402 | // Now unhook the successor relationship as we'll be replacing |
2403 | // the terminator with a direct branch. This is much simpler for branches |
2404 | // than switches so we handle those first. |
2405 | if (BI) { |
2406 | // Remove the parent as a predecessor of the unswitched successor. |
2407 | assert(UnswitchedSuccBBs.size() == 1 && |
2408 | "Only one possible unswitched block for a branch!" ); |
2409 | BasicBlock *UnswitchedSuccBB = *UnswitchedSuccBBs.begin(); |
2410 | UnswitchedSuccBB->removePredecessor(Pred: ParentBB, |
2411 | /*KeepOneInputPHIs*/ true); |
2412 | DTUpdates.push_back(Elt: {DominatorTree::Delete, ParentBB, UnswitchedSuccBB}); |
2413 | } else { |
2414 | // Note that we actually want to remove the parent block as a predecessor |
2415 | // of *every* case successor. The case successor is either unswitched, |
2416 | // completely eliminating an edge from the parent to that successor, or it |
2417 | // is a duplicate edge to the retained successor as the retained successor |
2418 | // is always the default successor and as we'll replace this with a direct |
2419 | // branch we no longer need the duplicate entries in the PHI nodes. |
2420 | SwitchInst *NewSI = cast<SwitchInst>(Val: NewTI); |
2421 | assert(NewSI->getDefaultDest() == RetainedSuccBB && |
2422 | "Not retaining default successor!" ); |
2423 | for (const auto &Case : NewSI->cases()) |
2424 | Case.getCaseSuccessor()->removePredecessor( |
2425 | Pred: ParentBB, |
2426 | /*KeepOneInputPHIs*/ true); |
2427 | |
2428 | // We need to use the set to populate domtree updates as even when there |
2429 | // are multiple cases pointing at the same successor we only want to |
2430 | // remove and insert one edge in the domtree. |
2431 | for (BasicBlock *SuccBB : UnswitchedSuccBBs) |
2432 | DTUpdates.push_back(Elt: {DominatorTree::Delete, ParentBB, SuccBB}); |
2433 | } |
2434 | |
2435 | // After MSSAU update, remove the cloned terminator instruction NewTI. |
2436 | ParentBB->getTerminator()->eraseFromParent(); |
2437 | |
2438 | // Create a new unconditional branch to the continuing block (as opposed to |
2439 | // the one cloned). |
2440 | BranchInst::Create(IfTrue: RetainedSuccBB, InsertAtEnd: ParentBB); |
2441 | } else { |
2442 | assert(BI && "Only branches have partial unswitching." ); |
2443 | assert(UnswitchedSuccBBs.size() == 1 && |
2444 | "Only one possible unswitched block for a branch!" ); |
2445 | BasicBlock *ClonedPH = ClonedPHs.begin()->second; |
2446 | // When doing a partial unswitch, we have to do a bit more work to build up |
2447 | // the branch in the split block. |
2448 | if (PartiallyInvariant) |
2449 | buildPartialInvariantUnswitchConditionalBranch( |
2450 | BB&: *SplitBB, ToDuplicate: Invariants, Direction, UnswitchedSucc&: *ClonedPH, NormalSucc&: *LoopPH, L, MSSAU); |
2451 | else { |
2452 | buildPartialUnswitchConditionalBranch( |
2453 | BB&: *SplitBB, Invariants, Direction, UnswitchedSucc&: *ClonedPH, NormalSucc&: *LoopPH, |
2454 | InsertFreeze: FreezeLoopUnswitchCond, I: BI, AC: &AC, DT); |
2455 | } |
2456 | DTUpdates.push_back(Elt: {DominatorTree::Insert, SplitBB, ClonedPH}); |
2457 | |
2458 | if (MSSAU) { |
2459 | DT.applyUpdates(Updates: DTUpdates); |
2460 | DTUpdates.clear(); |
2461 | |
2462 | // Perform MSSA cloning updates. |
2463 | for (auto &VMap : VMaps) |
2464 | MSSAU->updateForClonedLoop(LoopBlocks: LBRPO, ExitBlocks, VM: *VMap, |
2465 | /*IgnoreIncomingWithNoClones=*/true); |
2466 | MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT); |
2467 | } |
2468 | } |
2469 | |
2470 | // Apply the updates accumulated above to get an up-to-date dominator tree. |
2471 | DT.applyUpdates(Updates: DTUpdates); |
2472 | |
2473 | // Now that we have an accurate dominator tree, first delete the dead cloned |
2474 | // blocks so that we can accurately build any cloned loops. It is important to |
2475 | // not delete the blocks from the original loop yet because we still want to |
2476 | // reference the original loop to understand the cloned loop's structure. |
2477 | deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT, MSSAU); |
2478 | |
2479 | // Build the cloned loop structure itself. This may be substantially |
2480 | // different from the original structure due to the simplified CFG. This also |
2481 | // handles inserting all the cloned blocks into the correct loops. |
2482 | SmallVector<Loop *, 4> NonChildClonedLoops; |
2483 | for (std::unique_ptr<ValueToValueMapTy> &VMap : VMaps) |
2484 | buildClonedLoops(OrigL&: L, ExitBlocks, VMap: *VMap, LI, NonChildClonedLoops); |
2485 | |
2486 | // Now that our cloned loops have been built, we can update the original loop. |
2487 | // First we delete the dead blocks from it and then we rebuild the loop |
2488 | // structure taking these deletions into account. |
2489 | deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU, SE, LoopUpdater); |
2490 | |
2491 | if (MSSAU && VerifyMemorySSA) |
2492 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2493 | |
2494 | SmallVector<Loop *, 4> HoistedLoops; |
2495 | bool IsStillLoop = |
2496 | rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops, SE); |
2497 | |
2498 | if (MSSAU && VerifyMemorySSA) |
2499 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2500 | |
2501 | // This transformation has a high risk of corrupting the dominator tree, and |
2502 | // the below steps to rebuild loop structures will result in hard to debug |
2503 | // errors in that case so verify that the dominator tree is sane first. |
2504 | // FIXME: Remove this when the bugs stop showing up and rely on existing |
2505 | // verification steps. |
2506 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
2507 | |
2508 | if (BI && !PartiallyInvariant) { |
2509 | // If we unswitched a branch which collapses the condition to a known |
2510 | // constant we want to replace all the uses of the invariants within both |
2511 | // the original and cloned blocks. We do this here so that we can use the |
2512 | // now updated dominator tree to identify which side the users are on. |
2513 | assert(UnswitchedSuccBBs.size() == 1 && |
2514 | "Only one possible unswitched block for a branch!" ); |
2515 | BasicBlock *ClonedPH = ClonedPHs.begin()->second; |
2516 | |
2517 | // When considering multiple partially-unswitched invariants |
2518 | // we cant just go replace them with constants in both branches. |
2519 | // |
2520 | // For 'AND' we infer that true branch ("continue") means true |
2521 | // for each invariant operand. |
2522 | // For 'OR' we can infer that false branch ("continue") means false |
2523 | // for each invariant operand. |
2524 | // So it happens that for multiple-partial case we dont replace |
2525 | // in the unswitched branch. |
2526 | bool ReplaceUnswitched = |
2527 | FullUnswitch || (Invariants.size() == 1) || PartiallyInvariant; |
2528 | |
2529 | ConstantInt *UnswitchedReplacement = |
2530 | Direction ? ConstantInt::getTrue(Context&: BI->getContext()) |
2531 | : ConstantInt::getFalse(Context&: BI->getContext()); |
2532 | ConstantInt *ContinueReplacement = |
2533 | Direction ? ConstantInt::getFalse(Context&: BI->getContext()) |
2534 | : ConstantInt::getTrue(Context&: BI->getContext()); |
2535 | for (Value *Invariant : Invariants) { |
2536 | assert(!isa<Constant>(Invariant) && |
2537 | "Should not be replacing constant values!" ); |
2538 | // Use make_early_inc_range here as set invalidates the iterator. |
2539 | for (Use &U : llvm::make_early_inc_range(Range: Invariant->uses())) { |
2540 | Instruction *UserI = dyn_cast<Instruction>(Val: U.getUser()); |
2541 | if (!UserI) |
2542 | continue; |
2543 | |
2544 | // Replace it with the 'continue' side if in the main loop body, and the |
2545 | // unswitched if in the cloned blocks. |
2546 | if (DT.dominates(A: LoopPH, B: UserI->getParent())) |
2547 | U.set(ContinueReplacement); |
2548 | else if (ReplaceUnswitched && |
2549 | DT.dominates(A: ClonedPH, B: UserI->getParent())) |
2550 | U.set(UnswitchedReplacement); |
2551 | } |
2552 | } |
2553 | } |
2554 | |
2555 | // We can change which blocks are exit blocks of all the cloned sibling |
2556 | // loops, the current loop, and any parent loops which shared exit blocks |
2557 | // with the current loop. As a consequence, we need to re-form LCSSA for |
2558 | // them. But we shouldn't need to re-form LCSSA for any child loops. |
2559 | // FIXME: This could be made more efficient by tracking which exit blocks are |
2560 | // new, and focusing on them, but that isn't likely to be necessary. |
2561 | // |
2562 | // In order to reasonably rebuild LCSSA we need to walk inside-out across the |
2563 | // loop nest and update every loop that could have had its exits changed. We |
2564 | // also need to cover any intervening loops. We add all of these loops to |
2565 | // a list and sort them by loop depth to achieve this without updating |
2566 | // unnecessary loops. |
2567 | auto UpdateLoop = [&](Loop &UpdateL) { |
2568 | #ifndef NDEBUG |
2569 | UpdateL.verifyLoop(); |
2570 | for (Loop *ChildL : UpdateL) { |
2571 | ChildL->verifyLoop(); |
2572 | assert(ChildL->isRecursivelyLCSSAForm(DT, LI) && |
2573 | "Perturbed a child loop's LCSSA form!" ); |
2574 | } |
2575 | #endif |
2576 | // First build LCSSA for this loop so that we can preserve it when |
2577 | // forming dedicated exits. We don't want to perturb some other loop's |
2578 | // LCSSA while doing that CFG edit. |
2579 | formLCSSA(L&: UpdateL, DT, LI: &LI, SE); |
2580 | |
2581 | // For loops reached by this loop's original exit blocks we may |
2582 | // introduced new, non-dedicated exits. At least try to re-form dedicated |
2583 | // exits for these loops. This may fail if they couldn't have dedicated |
2584 | // exits to start with. |
2585 | formDedicatedExitBlocks(L: &UpdateL, DT: &DT, LI: &LI, MSSAU, /*PreserveLCSSA*/ true); |
2586 | }; |
2587 | |
2588 | // For non-child cloned loops and hoisted loops, we just need to update LCSSA |
2589 | // and we can do it in any order as they don't nest relative to each other. |
2590 | // |
2591 | // Also check if any of the loops we have updated have become top-level loops |
2592 | // as that will necessitate widening the outer loop scope. |
2593 | for (Loop *UpdatedL : |
2594 | llvm::concat<Loop *>(Ranges&: NonChildClonedLoops, Ranges&: HoistedLoops)) { |
2595 | UpdateLoop(*UpdatedL); |
2596 | if (UpdatedL->isOutermost()) |
2597 | OuterExitL = nullptr; |
2598 | } |
2599 | if (IsStillLoop) { |
2600 | UpdateLoop(L); |
2601 | if (L.isOutermost()) |
2602 | OuterExitL = nullptr; |
2603 | } |
2604 | |
2605 | // If the original loop had exit blocks, walk up through the outer most loop |
2606 | // of those exit blocks to update LCSSA and form updated dedicated exits. |
2607 | if (OuterExitL != &L) |
2608 | for (Loop *OuterL = ParentL; OuterL != OuterExitL; |
2609 | OuterL = OuterL->getParentLoop()) |
2610 | UpdateLoop(*OuterL); |
2611 | |
2612 | #ifndef NDEBUG |
2613 | // Verify the entire loop structure to catch any incorrect updates before we |
2614 | // progress in the pass pipeline. |
2615 | LI.verify(DomTree: DT); |
2616 | #endif |
2617 | |
2618 | // Now that we've unswitched something, make callbacks to report the changes. |
2619 | // For that we need to merge together the updated loops and the cloned loops |
2620 | // and check whether the original loop survived. |
2621 | SmallVector<Loop *, 4> SibLoops; |
2622 | for (Loop *UpdatedL : llvm::concat<Loop *>(Ranges&: NonChildClonedLoops, Ranges&: HoistedLoops)) |
2623 | if (UpdatedL->getParentLoop() == ParentL) |
2624 | SibLoops.push_back(Elt: UpdatedL); |
2625 | postUnswitch(L, U&: LoopUpdater, LoopName, CurrentLoopValid: IsStillLoop, PartiallyInvariant, |
2626 | InjectedCondition, NewLoops: SibLoops); |
2627 | |
2628 | if (MSSAU && VerifyMemorySSA) |
2629 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2630 | |
2631 | if (BI) |
2632 | ++NumBranches; |
2633 | else |
2634 | ++NumSwitches; |
2635 | } |
2636 | |
2637 | /// Recursively compute the cost of a dominator subtree based on the per-block |
2638 | /// cost map provided. |
2639 | /// |
2640 | /// The recursive computation is memozied into the provided DT-indexed cost map |
2641 | /// to allow querying it for most nodes in the domtree without it becoming |
2642 | /// quadratic. |
2643 | static InstructionCost computeDomSubtreeCost( |
2644 | DomTreeNode &N, |
2645 | const SmallDenseMap<BasicBlock *, InstructionCost, 4> &BBCostMap, |
2646 | SmallDenseMap<DomTreeNode *, InstructionCost, 4> &DTCostMap) { |
2647 | // Don't accumulate cost (or recurse through) blocks not in our block cost |
2648 | // map and thus not part of the duplication cost being considered. |
2649 | auto BBCostIt = BBCostMap.find(Val: N.getBlock()); |
2650 | if (BBCostIt == BBCostMap.end()) |
2651 | return 0; |
2652 | |
2653 | // Lookup this node to see if we already computed its cost. |
2654 | auto DTCostIt = DTCostMap.find(Val: &N); |
2655 | if (DTCostIt != DTCostMap.end()) |
2656 | return DTCostIt->second; |
2657 | |
2658 | // If not, we have to compute it. We can't use insert above and update |
2659 | // because computing the cost may insert more things into the map. |
2660 | InstructionCost Cost = std::accumulate( |
2661 | first: N.begin(), last: N.end(), init: BBCostIt->second, |
2662 | binary_op: [&](InstructionCost Sum, DomTreeNode *ChildN) -> InstructionCost { |
2663 | return Sum + computeDomSubtreeCost(N&: *ChildN, BBCostMap, DTCostMap); |
2664 | }); |
2665 | bool Inserted = DTCostMap.insert(KV: {&N, Cost}).second; |
2666 | (void)Inserted; |
2667 | assert(Inserted && "Should not insert a node while visiting children!" ); |
2668 | return Cost; |
2669 | } |
2670 | |
2671 | /// Turns a select instruction into implicit control flow branch, |
2672 | /// making the following replacement: |
2673 | /// |
2674 | /// head: |
2675 | /// --code before select-- |
2676 | /// select %cond, %trueval, %falseval |
2677 | /// --code after select-- |
2678 | /// |
2679 | /// into |
2680 | /// |
2681 | /// head: |
2682 | /// --code before select-- |
2683 | /// br i1 %cond, label %then, label %tail |
2684 | /// |
2685 | /// then: |
2686 | /// br %tail |
2687 | /// |
2688 | /// tail: |
2689 | /// phi [ %trueval, %then ], [ %falseval, %head] |
2690 | /// unreachable |
2691 | /// |
2692 | /// It also makes all relevant DT and LI updates, so that all structures are in |
2693 | /// valid state after this transform. |
2694 | static BranchInst *turnSelectIntoBranch(SelectInst *SI, DominatorTree &DT, |
2695 | LoopInfo &LI, MemorySSAUpdater *MSSAU, |
2696 | AssumptionCache *AC) { |
2697 | LLVM_DEBUG(dbgs() << "Turning " << *SI << " into a branch.\n" ); |
2698 | BasicBlock *HeadBB = SI->getParent(); |
2699 | |
2700 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); |
2701 | SplitBlockAndInsertIfThen(Cond: SI->getCondition(), SplitBefore: SI, Unreachable: false, |
2702 | BranchWeights: SI->getMetadata(KindID: LLVMContext::MD_prof), DTU: &DTU, LI: &LI); |
2703 | auto *CondBr = cast<BranchInst>(Val: HeadBB->getTerminator()); |
2704 | BasicBlock *ThenBB = CondBr->getSuccessor(i: 0), |
2705 | *TailBB = CondBr->getSuccessor(i: 1); |
2706 | if (MSSAU) |
2707 | MSSAU->moveAllAfterSpliceBlocks(From: HeadBB, To: TailBB, Start: SI); |
2708 | |
2709 | PHINode *Phi = |
2710 | PHINode::Create(Ty: SI->getType(), NumReservedValues: 2, NameStr: "unswitched.select" , InsertBefore: SI->getIterator()); |
2711 | Phi->addIncoming(V: SI->getTrueValue(), BB: ThenBB); |
2712 | Phi->addIncoming(V: SI->getFalseValue(), BB: HeadBB); |
2713 | SI->replaceAllUsesWith(V: Phi); |
2714 | SI->eraseFromParent(); |
2715 | |
2716 | if (MSSAU && VerifyMemorySSA) |
2717 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2718 | |
2719 | ++NumSelects; |
2720 | return CondBr; |
2721 | } |
2722 | |
2723 | /// Turns a llvm.experimental.guard intrinsic into implicit control flow branch, |
2724 | /// making the following replacement: |
2725 | /// |
2726 | /// --code before guard-- |
2727 | /// call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] |
2728 | /// --code after guard-- |
2729 | /// |
2730 | /// into |
2731 | /// |
2732 | /// --code before guard-- |
2733 | /// br i1 %cond, label %guarded, label %deopt |
2734 | /// |
2735 | /// guarded: |
2736 | /// --code after guard-- |
2737 | /// |
2738 | /// deopt: |
2739 | /// call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] |
2740 | /// unreachable |
2741 | /// |
2742 | /// It also makes all relevant DT and LI updates, so that all structures are in |
2743 | /// valid state after this transform. |
2744 | static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, |
2745 | DominatorTree &DT, LoopInfo &LI, |
2746 | MemorySSAUpdater *MSSAU) { |
2747 | SmallVector<DominatorTree::UpdateType, 4> DTUpdates; |
2748 | LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n" ); |
2749 | BasicBlock *CheckBB = GI->getParent(); |
2750 | |
2751 | if (MSSAU && VerifyMemorySSA) |
2752 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2753 | |
2754 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); |
2755 | Instruction *DeoptBlockTerm = |
2756 | SplitBlockAndInsertIfThen(Cond: GI->getArgOperand(i: 0), SplitBefore: GI, Unreachable: true, |
2757 | BranchWeights: GI->getMetadata(KindID: LLVMContext::MD_prof), DTU: &DTU, LI: &LI); |
2758 | BranchInst *CheckBI = cast<BranchInst>(Val: CheckBB->getTerminator()); |
2759 | // SplitBlockAndInsertIfThen inserts control flow that branches to |
2760 | // DeoptBlockTerm if the condition is true. We want the opposite. |
2761 | CheckBI->swapSuccessors(); |
2762 | |
2763 | BasicBlock *GuardedBlock = CheckBI->getSuccessor(i: 0); |
2764 | GuardedBlock->setName("guarded" ); |
2765 | CheckBI->getSuccessor(i: 1)->setName("deopt" ); |
2766 | BasicBlock *DeoptBlock = CheckBI->getSuccessor(i: 1); |
2767 | |
2768 | if (MSSAU) |
2769 | MSSAU->moveAllAfterSpliceBlocks(From: CheckBB, To: GuardedBlock, Start: GI); |
2770 | |
2771 | GI->moveBefore(MovePos: DeoptBlockTerm); |
2772 | GI->setArgOperand(i: 0, v: ConstantInt::getFalse(Context&: GI->getContext())); |
2773 | |
2774 | if (MSSAU) { |
2775 | MemoryDef *MD = cast<MemoryDef>(Val: MSSAU->getMemorySSA()->getMemoryAccess(I: GI)); |
2776 | MSSAU->moveToPlace(What: MD, BB: DeoptBlock, Where: MemorySSA::BeforeTerminator); |
2777 | if (VerifyMemorySSA) |
2778 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2779 | } |
2780 | |
2781 | if (VerifyLoopInfo) |
2782 | LI.verify(DomTree: DT); |
2783 | ++NumGuards; |
2784 | return CheckBI; |
2785 | } |
2786 | |
2787 | /// Cost multiplier is a way to limit potentially exponential behavior |
2788 | /// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch |
2789 | /// candidates available. Also accounting for the number of "sibling" loops with |
2790 | /// the idea to account for previous unswitches that already happened on this |
2791 | /// cluster of loops. There was an attempt to keep this formula simple, |
2792 | /// just enough to limit the worst case behavior. Even if it is not that simple |
2793 | /// now it is still not an attempt to provide a detailed heuristic size |
2794 | /// prediction. |
2795 | /// |
2796 | /// TODO: Make a proper accounting of "explosion" effect for all kinds of |
2797 | /// unswitch candidates, making adequate predictions instead of wild guesses. |
2798 | /// That requires knowing not just the number of "remaining" candidates but |
2799 | /// also costs of unswitching for each of these candidates. |
2800 | static int CalculateUnswitchCostMultiplier( |
2801 | const Instruction &TI, const Loop &L, const LoopInfo &LI, |
2802 | const DominatorTree &DT, |
2803 | ArrayRef<NonTrivialUnswitchCandidate> UnswitchCandidates) { |
2804 | |
2805 | // Guards and other exiting conditions do not contribute to exponential |
2806 | // explosion as soon as they dominate the latch (otherwise there might be |
2807 | // another path to the latch remaining that does not allow to eliminate the |
2808 | // loop copy on unswitch). |
2809 | const BasicBlock *Latch = L.getLoopLatch(); |
2810 | const BasicBlock *CondBlock = TI.getParent(); |
2811 | if (DT.dominates(A: CondBlock, B: Latch) && |
2812 | (isGuard(U: &TI) || |
2813 | (TI.isTerminator() && |
2814 | llvm::count_if(Range: successors(I: &TI), P: [&L](const BasicBlock *SuccBB) { |
2815 | return L.contains(BB: SuccBB); |
2816 | }) <= 1))) { |
2817 | NumCostMultiplierSkipped++; |
2818 | return 1; |
2819 | } |
2820 | |
2821 | auto *ParentL = L.getParentLoop(); |
2822 | int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size() |
2823 | : std::distance(first: LI.begin(), last: LI.end())); |
2824 | // Count amount of clones that all the candidates might cause during |
2825 | // unswitching. Branch/guard/select counts as 1, switch counts as log2 of its |
2826 | // cases. |
2827 | int UnswitchedClones = 0; |
2828 | for (const auto &Candidate : UnswitchCandidates) { |
2829 | const Instruction *CI = Candidate.TI; |
2830 | const BasicBlock *CondBlock = CI->getParent(); |
2831 | bool SkipExitingSuccessors = DT.dominates(A: CondBlock, B: Latch); |
2832 | if (isa<SelectInst>(Val: CI)) { |
2833 | UnswitchedClones++; |
2834 | continue; |
2835 | } |
2836 | if (isGuard(U: CI)) { |
2837 | if (!SkipExitingSuccessors) |
2838 | UnswitchedClones++; |
2839 | continue; |
2840 | } |
2841 | int NonExitingSuccessors = |
2842 | llvm::count_if(Range: successors(BB: CondBlock), |
2843 | P: [SkipExitingSuccessors, &L](const BasicBlock *SuccBB) { |
2844 | return !SkipExitingSuccessors || L.contains(BB: SuccBB); |
2845 | }); |
2846 | UnswitchedClones += Log2_32(Value: NonExitingSuccessors); |
2847 | } |
2848 | |
2849 | // Ignore up to the "unscaled candidates" number of unswitch candidates |
2850 | // when calculating the power-of-two scaling of the cost. The main idea |
2851 | // with this control is to allow a small number of unswitches to happen |
2852 | // and rely more on siblings multiplier (see below) when the number |
2853 | // of candidates is small. |
2854 | unsigned ClonesPower = |
2855 | std::max(a: UnswitchedClones - (int)UnswitchNumInitialUnscaledCandidates, b: 0); |
2856 | |
2857 | // Allowing top-level loops to spread a bit more than nested ones. |
2858 | int SiblingsMultiplier = |
2859 | std::max(a: (ParentL ? SiblingsCount |
2860 | : SiblingsCount / (int)UnswitchSiblingsToplevelDiv), |
2861 | b: 1); |
2862 | // Compute the cost multiplier in a way that won't overflow by saturating |
2863 | // at an upper bound. |
2864 | int CostMultiplier; |
2865 | if (ClonesPower > Log2_32(Value: UnswitchThreshold) || |
2866 | SiblingsMultiplier > UnswitchThreshold) |
2867 | CostMultiplier = UnswitchThreshold; |
2868 | else |
2869 | CostMultiplier = std::min(a: SiblingsMultiplier * (1 << ClonesPower), |
2870 | b: (int)UnswitchThreshold); |
2871 | |
2872 | LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier |
2873 | << " (siblings " << SiblingsMultiplier << " * clones " |
2874 | << (1 << ClonesPower) << ")" |
2875 | << " for unswitch candidate: " << TI << "\n" ); |
2876 | return CostMultiplier; |
2877 | } |
2878 | |
2879 | static bool collectUnswitchCandidates( |
2880 | SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, |
2881 | IVConditionInfo &PartialIVInfo, Instruction *&PartialIVCondBranch, |
2882 | const Loop &L, const LoopInfo &LI, AAResults &AA, |
2883 | const MemorySSAUpdater *MSSAU) { |
2884 | assert(UnswitchCandidates.empty() && "Should be!" ); |
2885 | |
2886 | auto AddUnswitchCandidatesForInst = [&](Instruction *I, Value *Cond) { |
2887 | Cond = skipTrivialSelect(Cond); |
2888 | if (isa<Constant>(Val: Cond)) |
2889 | return; |
2890 | if (L.isLoopInvariant(V: Cond)) { |
2891 | UnswitchCandidates.push_back(Elt: {I, {Cond}}); |
2892 | return; |
2893 | } |
2894 | if (match(V: Cond, P: m_CombineOr(L: m_LogicalAnd(), R: m_LogicalOr()))) { |
2895 | TinyPtrVector<Value *> Invariants = |
2896 | collectHomogenousInstGraphLoopInvariants( |
2897 | L, Root&: *static_cast<Instruction *>(Cond), LI); |
2898 | if (!Invariants.empty()) |
2899 | UnswitchCandidates.push_back(Elt: {I, std::move(Invariants)}); |
2900 | } |
2901 | }; |
2902 | |
2903 | // Whether or not we should also collect guards in the loop. |
2904 | bool CollectGuards = false; |
2905 | if (UnswitchGuards) { |
2906 | auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction( |
2907 | Intrinsic::getName(Intrinsic::experimental_guard)); |
2908 | if (GuardDecl && !GuardDecl->use_empty()) |
2909 | CollectGuards = true; |
2910 | } |
2911 | |
2912 | for (auto *BB : L.blocks()) { |
2913 | if (LI.getLoopFor(BB) != &L) |
2914 | continue; |
2915 | |
2916 | for (auto &I : *BB) { |
2917 | if (auto *SI = dyn_cast<SelectInst>(Val: &I)) { |
2918 | auto *Cond = SI->getCondition(); |
2919 | // Do not unswitch vector selects and logical and/or selects |
2920 | if (Cond->getType()->isIntegerTy(Bitwidth: 1) && !SI->getType()->isIntegerTy(Bitwidth: 1)) |
2921 | AddUnswitchCandidatesForInst(SI, Cond); |
2922 | } else if (CollectGuards && isGuard(U: &I)) { |
2923 | auto *Cond = |
2924 | skipTrivialSelect(Cond: cast<IntrinsicInst>(Val: &I)->getArgOperand(i: 0)); |
2925 | // TODO: Support AND, OR conditions and partial unswitching. |
2926 | if (!isa<Constant>(Val: Cond) && L.isLoopInvariant(V: Cond)) |
2927 | UnswitchCandidates.push_back(Elt: {&I, {Cond}}); |
2928 | } |
2929 | } |
2930 | |
2931 | if (auto *SI = dyn_cast<SwitchInst>(Val: BB->getTerminator())) { |
2932 | // We can only consider fully loop-invariant switch conditions as we need |
2933 | // to completely eliminate the switch after unswitching. |
2934 | if (!isa<Constant>(Val: SI->getCondition()) && |
2935 | L.isLoopInvariant(V: SI->getCondition()) && !BB->getUniqueSuccessor()) |
2936 | UnswitchCandidates.push_back(Elt: {SI, {SI->getCondition()}}); |
2937 | continue; |
2938 | } |
2939 | |
2940 | auto *BI = dyn_cast<BranchInst>(Val: BB->getTerminator()); |
2941 | if (!BI || !BI->isConditional() || |
2942 | BI->getSuccessor(i: 0) == BI->getSuccessor(i: 1)) |
2943 | continue; |
2944 | |
2945 | AddUnswitchCandidatesForInst(BI, BI->getCondition()); |
2946 | } |
2947 | |
2948 | if (MSSAU && !findOptionMDForLoop(TheLoop: &L, Name: "llvm.loop.unswitch.partial.disable" ) && |
2949 | !any_of(Range&: UnswitchCandidates, P: [&L](auto &TerminatorAndInvariants) { |
2950 | return TerminatorAndInvariants.TI == L.getHeader()->getTerminator(); |
2951 | })) { |
2952 | MemorySSA *MSSA = MSSAU->getMemorySSA(); |
2953 | if (auto Info = hasPartialIVCondition(L, MSSAThreshold, MSSA: *MSSA, AA)) { |
2954 | LLVM_DEBUG( |
2955 | dbgs() << "simple-loop-unswitch: Found partially invariant condition " |
2956 | << *Info->InstToDuplicate[0] << "\n" ); |
2957 | PartialIVInfo = *Info; |
2958 | PartialIVCondBranch = L.getHeader()->getTerminator(); |
2959 | TinyPtrVector<Value *> ValsToDuplicate; |
2960 | llvm::append_range(C&: ValsToDuplicate, R&: Info->InstToDuplicate); |
2961 | UnswitchCandidates.push_back( |
2962 | Elt: {L.getHeader()->getTerminator(), std::move(ValsToDuplicate)}); |
2963 | } |
2964 | } |
2965 | return !UnswitchCandidates.empty(); |
2966 | } |
2967 | |
2968 | /// Tries to canonicalize condition described by: |
2969 | /// |
2970 | /// br (LHS pred RHS), label IfTrue, label IfFalse |
2971 | /// |
2972 | /// into its equivalent where `Pred` is something that we support for injected |
2973 | /// invariants (so far it is limited to ult), LHS in canonicalized form is |
2974 | /// non-invariant and RHS is an invariant. |
2975 | static void canonicalizeForInvariantConditionInjection( |
2976 | ICmpInst::Predicate &Pred, Value *&LHS, Value *&RHS, BasicBlock *&IfTrue, |
2977 | BasicBlock *&IfFalse, const Loop &L) { |
2978 | if (!L.contains(BB: IfTrue)) { |
2979 | Pred = ICmpInst::getInversePredicate(pred: Pred); |
2980 | std::swap(a&: IfTrue, b&: IfFalse); |
2981 | } |
2982 | |
2983 | // Move loop-invariant argument to RHS position. |
2984 | if (L.isLoopInvariant(V: LHS)) { |
2985 | Pred = ICmpInst::getSwappedPredicate(pred: Pred); |
2986 | std::swap(a&: LHS, b&: RHS); |
2987 | } |
2988 | |
2989 | if (Pred == ICmpInst::ICMP_SGE && match(V: RHS, P: m_Zero())) { |
2990 | // Turn "x >=s 0" into "x <u UMIN_INT" |
2991 | Pred = ICmpInst::ICMP_ULT; |
2992 | RHS = ConstantInt::get( |
2993 | Context&: RHS->getContext(), |
2994 | V: APInt::getSignedMinValue(numBits: RHS->getType()->getIntegerBitWidth())); |
2995 | } |
2996 | } |
2997 | |
2998 | /// Returns true, if predicate described by ( \p Pred, \p LHS, \p RHS ) |
2999 | /// succeeding into blocks ( \p IfTrue, \p IfFalse) can be optimized by |
3000 | /// injecting a loop-invariant condition. |
3001 | static bool shouldTryInjectInvariantCondition( |
3002 | const ICmpInst::Predicate Pred, const Value *LHS, const Value *RHS, |
3003 | const BasicBlock *IfTrue, const BasicBlock *IfFalse, const Loop &L) { |
3004 | if (L.isLoopInvariant(V: LHS) || !L.isLoopInvariant(V: RHS)) |
3005 | return false; |
3006 | // TODO: Support other predicates. |
3007 | if (Pred != ICmpInst::ICMP_ULT) |
3008 | return false; |
3009 | // TODO: Support non-loop-exiting branches? |
3010 | if (!L.contains(BB: IfTrue) || L.contains(BB: IfFalse)) |
3011 | return false; |
3012 | // FIXME: For some reason this causes problems with MSSA updates, need to |
3013 | // investigate why. So far, just don't unswitch latch. |
3014 | if (L.getHeader() == IfTrue) |
3015 | return false; |
3016 | return true; |
3017 | } |
3018 | |
3019 | /// Returns true, if metadata on \p BI allows us to optimize branching into \p |
3020 | /// TakenSucc via injection of invariant conditions. The branch should be not |
3021 | /// enough and not previously unswitched, the information about this comes from |
3022 | /// the metadata. |
3023 | bool shouldTryInjectBasingOnMetadata(const BranchInst *BI, |
3024 | const BasicBlock *TakenSucc) { |
3025 | SmallVector<uint32_t> Weights; |
3026 | if (!extractBranchWeights(I: *BI, Weights)) |
3027 | return false; |
3028 | unsigned T = InjectInvariantConditionHotnesThreshold; |
3029 | BranchProbability LikelyTaken(T - 1, T); |
3030 | |
3031 | assert(Weights.size() == 2 && "Unexpected profile data!" ); |
3032 | size_t Idx = BI->getSuccessor(i: 0) == TakenSucc ? 0 : 1; |
3033 | auto Num = Weights[Idx]; |
3034 | auto Denom = Weights[0] + Weights[1]; |
3035 | // Degenerate or overflowed metadata. |
3036 | if (Denom == 0 || Num > Denom) |
3037 | return false; |
3038 | BranchProbability ActualTaken(Num, Denom); |
3039 | if (LikelyTaken > ActualTaken) |
3040 | return false; |
3041 | return true; |
3042 | } |
3043 | |
3044 | /// Materialize pending invariant condition of the given candidate into IR. The |
3045 | /// injected loop-invariant condition implies the original loop-variant branch |
3046 | /// condition, so the materialization turns |
3047 | /// |
3048 | /// loop_block: |
3049 | /// ... |
3050 | /// br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc |
3051 | /// |
3052 | /// into |
3053 | /// |
3054 | /// preheader: |
3055 | /// %invariant_cond = LHS pred RHS |
3056 | /// ... |
3057 | /// loop_block: |
3058 | /// br i1 %invariant_cond, label InLoopSucc, label OriginalCheck |
3059 | /// OriginalCheck: |
3060 | /// br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc |
3061 | /// ... |
3062 | static NonTrivialUnswitchCandidate |
3063 | injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, |
3064 | DominatorTree &DT, LoopInfo &LI, |
3065 | AssumptionCache &AC, MemorySSAUpdater *MSSAU) { |
3066 | assert(Candidate.hasPendingInjection() && "Nothing to inject!" ); |
3067 | BasicBlock * = L.getLoopPreheader(); |
3068 | assert(Preheader && "Loop is not in simplified form?" ); |
3069 | assert(LI.getLoopFor(Candidate.TI->getParent()) == &L && |
3070 | "Unswitching branch of inner loop!" ); |
3071 | |
3072 | auto Pred = Candidate.PendingInjection->Pred; |
3073 | auto *LHS = Candidate.PendingInjection->LHS; |
3074 | auto *RHS = Candidate.PendingInjection->RHS; |
3075 | auto *InLoopSucc = Candidate.PendingInjection->InLoopSucc; |
3076 | auto *TI = cast<BranchInst>(Val: Candidate.TI); |
3077 | auto *BB = Candidate.TI->getParent(); |
3078 | auto *OutOfLoopSucc = InLoopSucc == TI->getSuccessor(i: 0) ? TI->getSuccessor(i: 1) |
3079 | : TI->getSuccessor(i: 0); |
3080 | // FIXME: Remove this once limitation on successors is lifted. |
3081 | assert(L.contains(InLoopSucc) && "Not supported yet!" ); |
3082 | assert(!L.contains(OutOfLoopSucc) && "Not supported yet!" ); |
3083 | auto &Ctx = BB->getContext(); |
3084 | |
3085 | IRBuilder<> Builder(Preheader->getTerminator()); |
3086 | assert(ICmpInst::isUnsigned(Pred) && "Not supported yet!" ); |
3087 | if (LHS->getType() != RHS->getType()) { |
3088 | if (LHS->getType()->getIntegerBitWidth() < |
3089 | RHS->getType()->getIntegerBitWidth()) |
3090 | LHS = Builder.CreateZExt(V: LHS, DestTy: RHS->getType(), Name: LHS->getName() + ".wide" ); |
3091 | else |
3092 | RHS = Builder.CreateZExt(V: RHS, DestTy: LHS->getType(), Name: RHS->getName() + ".wide" ); |
3093 | } |
3094 | // Do not use builder here: CreateICmp may simplify this into a constant and |
3095 | // unswitching will break. Better optimize it away later. |
3096 | auto *InjectedCond = |
3097 | ICmpInst::Create(Op: Instruction::ICmp, Pred, S1: LHS, S2: RHS, Name: "injected.cond" , |
3098 | InsertBefore: Preheader->getTerminator()->getIterator()); |
3099 | |
3100 | BasicBlock *CheckBlock = BasicBlock::Create(Context&: Ctx, Name: BB->getName() + ".check" , |
3101 | Parent: BB->getParent(), InsertBefore: InLoopSucc); |
3102 | Builder.SetInsertPoint(TI); |
3103 | auto *InvariantBr = |
3104 | Builder.CreateCondBr(Cond: InjectedCond, True: InLoopSucc, False: CheckBlock); |
3105 | |
3106 | Builder.SetInsertPoint(CheckBlock); |
3107 | Builder.CreateCondBr(Cond: TI->getCondition(), True: TI->getSuccessor(i: 0), |
3108 | False: TI->getSuccessor(i: 1)); |
3109 | TI->eraseFromParent(); |
3110 | |
3111 | // Fixup phis. |
3112 | for (auto &I : *InLoopSucc) { |
3113 | auto *PN = dyn_cast<PHINode>(Val: &I); |
3114 | if (!PN) |
3115 | break; |
3116 | auto *Inc = PN->getIncomingValueForBlock(BB); |
3117 | PN->addIncoming(V: Inc, BB: CheckBlock); |
3118 | } |
3119 | OutOfLoopSucc->replacePhiUsesWith(Old: BB, New: CheckBlock); |
3120 | |
3121 | SmallVector<DominatorTree::UpdateType, 4> DTUpdates = { |
3122 | { DominatorTree::Insert, BB, CheckBlock }, |
3123 | { DominatorTree::Insert, CheckBlock, InLoopSucc }, |
3124 | { DominatorTree::Insert, CheckBlock, OutOfLoopSucc }, |
3125 | { DominatorTree::Delete, BB, OutOfLoopSucc } |
3126 | }; |
3127 | |
3128 | DT.applyUpdates(Updates: DTUpdates); |
3129 | if (MSSAU) |
3130 | MSSAU->applyUpdates(Updates: DTUpdates, DT); |
3131 | L.addBasicBlockToLoop(NewBB: CheckBlock, LI); |
3132 | |
3133 | #ifndef NDEBUG |
3134 | DT.verify(); |
3135 | LI.verify(DomTree: DT); |
3136 | if (MSSAU && VerifyMemorySSA) |
3137 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
3138 | #endif |
3139 | |
3140 | // TODO: In fact, cost of unswitching a new invariant candidate is *slightly* |
3141 | // higher because we have just inserted a new block. Need to think how to |
3142 | // adjust the cost of injected candidates when it was first computed. |
3143 | LLVM_DEBUG(dbgs() << "Injected a new loop-invariant branch " << *InvariantBr |
3144 | << " and considering it for unswitching." ); |
3145 | ++NumInvariantConditionsInjected; |
3146 | return NonTrivialUnswitchCandidate(InvariantBr, { InjectedCond }, |
3147 | Candidate.Cost); |
3148 | } |
3149 | |
3150 | /// Given chain of loop branch conditions looking like: |
3151 | /// br (Variant < Invariant1) |
3152 | /// br (Variant < Invariant2) |
3153 | /// br (Variant < Invariant3) |
3154 | /// ... |
3155 | /// collect set of invariant conditions on which we want to unswitch, which |
3156 | /// look like: |
3157 | /// Invariant1 <= Invariant2 |
3158 | /// Invariant2 <= Invariant3 |
3159 | /// ... |
3160 | /// Though they might not immediately exist in the IR, we can still inject them. |
3161 | static bool insertCandidatesWithPendingInjections( |
3162 | SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, Loop &L, |
3163 | ICmpInst::Predicate Pred, ArrayRef<CompareDesc> Compares, |
3164 | const DominatorTree &DT) { |
3165 | |
3166 | assert(ICmpInst::isRelational(Pred)); |
3167 | assert(ICmpInst::isStrictPredicate(Pred)); |
3168 | if (Compares.size() < 2) |
3169 | return false; |
3170 | ICmpInst::Predicate NonStrictPred = ICmpInst::getNonStrictPredicate(pred: Pred); |
3171 | for (auto Prev = Compares.begin(), Next = Compares.begin() + 1; |
3172 | Next != Compares.end(); ++Prev, ++Next) { |
3173 | Value *LHS = Next->Invariant; |
3174 | Value *RHS = Prev->Invariant; |
3175 | BasicBlock *InLoopSucc = Prev->InLoopSucc; |
3176 | InjectedInvariant ToInject(NonStrictPred, LHS, RHS, InLoopSucc); |
3177 | NonTrivialUnswitchCandidate Candidate(Prev->Term, { LHS, RHS }, |
3178 | std::nullopt, std::move(ToInject)); |
3179 | UnswitchCandidates.push_back(Elt: std::move(Candidate)); |
3180 | } |
3181 | return true; |
3182 | } |
3183 | |
3184 | /// Collect unswitch candidates by invariant conditions that are not immediately |
3185 | /// present in the loop. However, they can be injected into the code if we |
3186 | /// decide it's profitable. |
3187 | /// An example of such conditions is following: |
3188 | /// |
3189 | /// for (...) { |
3190 | /// x = load ... |
3191 | /// if (! x <u C1) break; |
3192 | /// if (! x <u C2) break; |
3193 | /// <do something> |
3194 | /// } |
3195 | /// |
3196 | /// We can unswitch by condition "C1 <=u C2". If that is true, then "x <u C1 <= |
3197 | /// C2" automatically implies "x <u C2", so we can get rid of one of |
3198 | /// loop-variant checks in unswitched loop version. |
3199 | static bool collectUnswitchCandidatesWithInjections( |
3200 | SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, |
3201 | IVConditionInfo &PartialIVInfo, Instruction *&PartialIVCondBranch, Loop &L, |
3202 | const DominatorTree &DT, const LoopInfo &LI, AAResults &AA, |
3203 | const MemorySSAUpdater *MSSAU) { |
3204 | if (!InjectInvariantConditions) |
3205 | return false; |
3206 | |
3207 | if (!DT.isReachableFromEntry(A: L.getHeader())) |
3208 | return false; |
3209 | auto *Latch = L.getLoopLatch(); |
3210 | // Need to have a single latch and a preheader. |
3211 | if (!Latch) |
3212 | return false; |
3213 | assert(L.getLoopPreheader() && "Must have a preheader!" ); |
3214 | |
3215 | DenseMap<Value *, SmallVector<CompareDesc, 4> > CandidatesULT; |
3216 | // Traverse the conditions that dominate latch (and therefore dominate each |
3217 | // other). |
3218 | for (auto *DTN = DT.getNode(BB: Latch); L.contains(BB: DTN->getBlock()); |
3219 | DTN = DTN->getIDom()) { |
3220 | ICmpInst::Predicate Pred; |
3221 | Value *LHS = nullptr, *RHS = nullptr; |
3222 | BasicBlock *IfTrue = nullptr, *IfFalse = nullptr; |
3223 | auto *BB = DTN->getBlock(); |
3224 | // Ignore inner loops. |
3225 | if (LI.getLoopFor(BB) != &L) |
3226 | continue; |
3227 | auto *Term = BB->getTerminator(); |
3228 | if (!match(V: Term, P: m_Br(C: m_ICmp(Pred, L: m_Value(V&: LHS), R: m_Value(V&: RHS)), |
3229 | T: m_BasicBlock(V&: IfTrue), F: m_BasicBlock(V&: IfFalse)))) |
3230 | continue; |
3231 | if (!LHS->getType()->isIntegerTy()) |
3232 | continue; |
3233 | canonicalizeForInvariantConditionInjection(Pred, LHS, RHS, IfTrue, IfFalse, |
3234 | L); |
3235 | if (!shouldTryInjectInvariantCondition(Pred, LHS, RHS, IfTrue, IfFalse, L)) |
3236 | continue; |
3237 | if (!shouldTryInjectBasingOnMetadata(BI: cast<BranchInst>(Val: Term), TakenSucc: IfTrue)) |
3238 | continue; |
3239 | // Strip ZEXT for unsigned predicate. |
3240 | // TODO: once signed predicates are supported, also strip SEXT. |
3241 | CompareDesc Desc(cast<BranchInst>(Val: Term), RHS, IfTrue); |
3242 | while (auto *Zext = dyn_cast<ZExtInst>(Val: LHS)) |
3243 | LHS = Zext->getOperand(i_nocapture: 0); |
3244 | CandidatesULT[LHS].push_back(Elt: Desc); |
3245 | } |
3246 | |
3247 | bool Found = false; |
3248 | for (auto &It : CandidatesULT) |
3249 | Found |= insertCandidatesWithPendingInjections( |
3250 | UnswitchCandidates, L, Pred: ICmpInst::ICMP_ULT, Compares: It.second, DT); |
3251 | return Found; |
3252 | } |
3253 | |
3254 | static bool isSafeForNoNTrivialUnswitching(Loop &L, LoopInfo &LI) { |
3255 | if (!L.isSafeToClone()) |
3256 | return false; |
3257 | for (auto *BB : L.blocks()) |
3258 | for (auto &I : *BB) { |
3259 | if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) |
3260 | return false; |
3261 | if (auto *CB = dyn_cast<CallBase>(Val: &I)) { |
3262 | assert(!CB->cannotDuplicate() && "Checked by L.isSafeToClone()." ); |
3263 | if (CB->isConvergent()) |
3264 | return false; |
3265 | } |
3266 | } |
3267 | |
3268 | // Check if there are irreducible CFG cycles in this loop. If so, we cannot |
3269 | // easily unswitch non-trivial edges out of the loop. Doing so might turn the |
3270 | // irreducible control flow into reducible control flow and introduce new |
3271 | // loops "out of thin air". If we ever discover important use cases for doing |
3272 | // this, we can add support to loop unswitch, but it is a lot of complexity |
3273 | // for what seems little or no real world benefit. |
3274 | LoopBlocksRPO RPOT(&L); |
3275 | RPOT.perform(LI: &LI); |
3276 | if (containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI)) |
3277 | return false; |
3278 | |
3279 | SmallVector<BasicBlock *, 4> ExitBlocks; |
3280 | L.getUniqueExitBlocks(ExitBlocks); |
3281 | // We cannot unswitch if exit blocks contain a cleanuppad/catchswitch |
3282 | // instruction as we don't know how to split those exit blocks. |
3283 | // FIXME: We should teach SplitBlock to handle this and remove this |
3284 | // restriction. |
3285 | for (auto *ExitBB : ExitBlocks) { |
3286 | auto *I = ExitBB->getFirstNonPHI(); |
3287 | if (isa<CleanupPadInst>(Val: I) || isa<CatchSwitchInst>(Val: I)) { |
3288 | LLVM_DEBUG(dbgs() << "Cannot unswitch because of cleanuppad/catchswitch " |
3289 | "in exit block\n" ); |
3290 | return false; |
3291 | } |
3292 | } |
3293 | |
3294 | return true; |
3295 | } |
3296 | |
3297 | static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate( |
3298 | ArrayRef<NonTrivialUnswitchCandidate> UnswitchCandidates, const Loop &L, |
3299 | const DominatorTree &DT, const LoopInfo &LI, AssumptionCache &AC, |
3300 | const TargetTransformInfo &TTI, const IVConditionInfo &PartialIVInfo) { |
3301 | // Given that unswitching these terminators will require duplicating parts of |
3302 | // the loop, so we need to be able to model that cost. Compute the ephemeral |
3303 | // values and set up a data structure to hold per-BB costs. We cache each |
3304 | // block's cost so that we don't recompute this when considering different |
3305 | // subsets of the loop for duplication during unswitching. |
3306 | SmallPtrSet<const Value *, 4> EphValues; |
3307 | CodeMetrics::collectEphemeralValues(L: &L, AC: &AC, EphValues); |
3308 | SmallDenseMap<BasicBlock *, InstructionCost, 4> BBCostMap; |
3309 | |
3310 | // Compute the cost of each block, as well as the total loop cost. Also, bail |
3311 | // out if we see instructions which are incompatible with loop unswitching |
3312 | // (convergent, noduplicate, or cross-basic-block tokens). |
3313 | // FIXME: We might be able to safely handle some of these in non-duplicated |
3314 | // regions. |
3315 | TargetTransformInfo::TargetCostKind CostKind = |
3316 | L.getHeader()->getParent()->hasMinSize() |
3317 | ? TargetTransformInfo::TCK_CodeSize |
3318 | : TargetTransformInfo::TCK_SizeAndLatency; |
3319 | InstructionCost LoopCost = 0; |
3320 | for (auto *BB : L.blocks()) { |
3321 | InstructionCost Cost = 0; |
3322 | for (auto &I : *BB) { |
3323 | if (EphValues.count(Ptr: &I)) |
3324 | continue; |
3325 | Cost += TTI.getInstructionCost(U: &I, CostKind); |
3326 | } |
3327 | assert(Cost >= 0 && "Must not have negative costs!" ); |
3328 | LoopCost += Cost; |
3329 | assert(LoopCost >= 0 && "Must not have negative loop costs!" ); |
3330 | BBCostMap[BB] = Cost; |
3331 | } |
3332 | LLVM_DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n" ); |
3333 | |
3334 | // Now we find the best candidate by searching for the one with the following |
3335 | // properties in order: |
3336 | // |
3337 | // 1) An unswitching cost below the threshold |
3338 | // 2) The smallest number of duplicated unswitch candidates (to avoid |
3339 | // creating redundant subsequent unswitching) |
3340 | // 3) The smallest cost after unswitching. |
3341 | // |
3342 | // We prioritize reducing fanout of unswitch candidates provided the cost |
3343 | // remains below the threshold because this has a multiplicative effect. |
3344 | // |
3345 | // This requires memoizing each dominator subtree to avoid redundant work. |
3346 | // |
3347 | // FIXME: Need to actually do the number of candidates part above. |
3348 | SmallDenseMap<DomTreeNode *, InstructionCost, 4> DTCostMap; |
3349 | // Given a terminator which might be unswitched, computes the non-duplicated |
3350 | // cost for that terminator. |
3351 | auto ComputeUnswitchedCost = [&](Instruction &TI, |
3352 | bool FullUnswitch) -> InstructionCost { |
3353 | // Unswitching selects unswitches the entire loop. |
3354 | if (isa<SelectInst>(Val: TI)) |
3355 | return LoopCost; |
3356 | |
3357 | BasicBlock &BB = *TI.getParent(); |
3358 | SmallPtrSet<BasicBlock *, 4> Visited; |
3359 | |
3360 | InstructionCost Cost = 0; |
3361 | for (BasicBlock *SuccBB : successors(BB: &BB)) { |
3362 | // Don't count successors more than once. |
3363 | if (!Visited.insert(Ptr: SuccBB).second) |
3364 | continue; |
3365 | |
3366 | // If this is a partial unswitch candidate, then it must be a conditional |
3367 | // branch with a condition of either `or`, `and`, their corresponding |
3368 | // select forms or partially invariant instructions. In that case, one of |
3369 | // the successors is necessarily duplicated, so don't even try to remove |
3370 | // its cost. |
3371 | if (!FullUnswitch) { |
3372 | auto &BI = cast<BranchInst>(Val&: TI); |
3373 | Value *Cond = skipTrivialSelect(Cond: BI.getCondition()); |
3374 | if (match(V: Cond, P: m_LogicalAnd())) { |
3375 | if (SuccBB == BI.getSuccessor(i: 1)) |
3376 | continue; |
3377 | } else if (match(V: Cond, P: m_LogicalOr())) { |
3378 | if (SuccBB == BI.getSuccessor(i: 0)) |
3379 | continue; |
3380 | } else if ((PartialIVInfo.KnownValue->isOneValue() && |
3381 | SuccBB == BI.getSuccessor(i: 0)) || |
3382 | (!PartialIVInfo.KnownValue->isOneValue() && |
3383 | SuccBB == BI.getSuccessor(i: 1))) |
3384 | continue; |
3385 | } |
3386 | |
3387 | // This successor's domtree will not need to be duplicated after |
3388 | // unswitching if the edge to the successor dominates it (and thus the |
3389 | // entire tree). This essentially means there is no other path into this |
3390 | // subtree and so it will end up live in only one clone of the loop. |
3391 | if (SuccBB->getUniquePredecessor() || |
3392 | llvm::all_of(Range: predecessors(BB: SuccBB), P: [&](BasicBlock *PredBB) { |
3393 | return PredBB == &BB || DT.dominates(A: SuccBB, B: PredBB); |
3394 | })) { |
3395 | Cost += computeDomSubtreeCost(N&: *DT[SuccBB], BBCostMap, DTCostMap); |
3396 | assert(Cost <= LoopCost && |
3397 | "Non-duplicated cost should never exceed total loop cost!" ); |
3398 | } |
3399 | } |
3400 | |
3401 | // Now scale the cost by the number of unique successors minus one. We |
3402 | // subtract one because there is already at least one copy of the entire |
3403 | // loop. This is computing the new cost of unswitching a condition. |
3404 | // Note that guards always have 2 unique successors that are implicit and |
3405 | // will be materialized if we decide to unswitch it. |
3406 | int SuccessorsCount = isGuard(U: &TI) ? 2 : Visited.size(); |
3407 | assert(SuccessorsCount > 1 && |
3408 | "Cannot unswitch a condition without multiple distinct successors!" ); |
3409 | return (LoopCost - Cost) * (SuccessorsCount - 1); |
3410 | }; |
3411 | |
3412 | std::optional<NonTrivialUnswitchCandidate> Best; |
3413 | for (auto &Candidate : UnswitchCandidates) { |
3414 | Instruction &TI = *Candidate.TI; |
3415 | ArrayRef<Value *> Invariants = Candidate.Invariants; |
3416 | BranchInst *BI = dyn_cast<BranchInst>(Val: &TI); |
3417 | bool FullUnswitch = |
3418 | !BI || Candidate.hasPendingInjection() || |
3419 | (Invariants.size() == 1 && |
3420 | Invariants[0] == skipTrivialSelect(Cond: BI->getCondition())); |
3421 | InstructionCost CandidateCost = ComputeUnswitchedCost(TI, FullUnswitch); |
3422 | // Calculate cost multiplier which is a tool to limit potentially |
3423 | // exponential behavior of loop-unswitch. |
3424 | if (EnableUnswitchCostMultiplier) { |
3425 | int CostMultiplier = |
3426 | CalculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates); |
3427 | assert( |
3428 | (CostMultiplier > 0 && CostMultiplier <= UnswitchThreshold) && |
3429 | "cost multiplier needs to be in the range of 1..UnswitchThreshold" ); |
3430 | CandidateCost *= CostMultiplier; |
3431 | LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost |
3432 | << " (multiplier: " << CostMultiplier << ")" |
3433 | << " for unswitch candidate: " << TI << "\n" ); |
3434 | } else { |
3435 | LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost |
3436 | << " for unswitch candidate: " << TI << "\n" ); |
3437 | } |
3438 | |
3439 | if (!Best || CandidateCost < Best->Cost) { |
3440 | Best = Candidate; |
3441 | Best->Cost = CandidateCost; |
3442 | } |
3443 | } |
3444 | assert(Best && "Must be!" ); |
3445 | return *Best; |
3446 | } |
3447 | |
3448 | // Insert a freeze on an unswitched branch if all is true: |
3449 | // 1. freeze-loop-unswitch-cond option is true |
3450 | // 2. The branch may not execute in the loop pre-transformation. If a branch may |
3451 | // not execute and could cause UB, it would always cause UB if it is hoisted outside |
3452 | // of the loop. Insert a freeze to prevent this case. |
3453 | // 3. The branch condition may be poison or undef |
3454 | static bool shouldInsertFreeze(Loop &L, Instruction &TI, DominatorTree &DT, |
3455 | AssumptionCache &AC) { |
3456 | assert(isa<BranchInst>(TI) || isa<SwitchInst>(TI)); |
3457 | if (!FreezeLoopUnswitchCond) |
3458 | return false; |
3459 | |
3460 | ICFLoopSafetyInfo SafetyInfo; |
3461 | SafetyInfo.computeLoopSafetyInfo(CurLoop: &L); |
3462 | if (SafetyInfo.isGuaranteedToExecute(Inst: TI, DT: &DT, CurLoop: &L)) |
3463 | return false; |
3464 | |
3465 | Value *Cond; |
3466 | if (BranchInst *BI = dyn_cast<BranchInst>(Val: &TI)) |
3467 | Cond = skipTrivialSelect(Cond: BI->getCondition()); |
3468 | else |
3469 | Cond = skipTrivialSelect(Cond: cast<SwitchInst>(Val: &TI)->getCondition()); |
3470 | return !isGuaranteedNotToBeUndefOrPoison( |
3471 | V: Cond, AC: &AC, CtxI: L.getLoopPreheader()->getTerminator(), DT: &DT); |
3472 | } |
3473 | |
3474 | static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, |
3475 | AssumptionCache &AC, AAResults &AA, |
3476 | TargetTransformInfo &TTI, ScalarEvolution *SE, |
3477 | MemorySSAUpdater *MSSAU, |
3478 | LPMUpdater &LoopUpdater) { |
3479 | // Collect all invariant conditions within this loop (as opposed to an inner |
3480 | // loop which would be handled when visiting that inner loop). |
3481 | SmallVector<NonTrivialUnswitchCandidate, 4> UnswitchCandidates; |
3482 | IVConditionInfo PartialIVInfo; |
3483 | Instruction *PartialIVCondBranch = nullptr; |
3484 | collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, |
3485 | PartialIVCondBranch, L, LI, AA, MSSAU); |
3486 | if (!findOptionMDForLoop(TheLoop: &L, Name: "llvm.loop.unswitch.injection.disable" )) |
3487 | collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo, |
3488 | PartialIVCondBranch, L, DT, LI, AA, |
3489 | MSSAU); |
3490 | // If we didn't find any candidates, we're done. |
3491 | if (UnswitchCandidates.empty()) |
3492 | return false; |
3493 | |
3494 | LLVM_DEBUG( |
3495 | dbgs() << "Considering " << UnswitchCandidates.size() |
3496 | << " non-trivial loop invariant conditions for unswitching.\n" ); |
3497 | |
3498 | NonTrivialUnswitchCandidate Best = findBestNonTrivialUnswitchCandidate( |
3499 | UnswitchCandidates, L, DT, LI, AC, TTI, PartialIVInfo); |
3500 | |
3501 | assert(Best.TI && "Failed to find loop unswitch candidate" ); |
3502 | assert(Best.Cost && "Failed to compute cost" ); |
3503 | |
3504 | if (*Best.Cost >= UnswitchThreshold) { |
3505 | LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << *Best.Cost |
3506 | << "\n" ); |
3507 | return false; |
3508 | } |
3509 | |
3510 | bool InjectedCondition = false; |
3511 | if (Best.hasPendingInjection()) { |
3512 | Best = injectPendingInvariantConditions(Candidate: Best, L, DT, LI, AC, MSSAU); |
3513 | InjectedCondition = true; |
3514 | } |
3515 | assert(!Best.hasPendingInjection() && |
3516 | "All injections should have been done by now!" ); |
3517 | |
3518 | if (Best.TI != PartialIVCondBranch) |
3519 | PartialIVInfo.InstToDuplicate.clear(); |
3520 | |
3521 | bool InsertFreeze; |
3522 | if (auto *SI = dyn_cast<SelectInst>(Val: Best.TI)) { |
3523 | // If the best candidate is a select, turn it into a branch. Select |
3524 | // instructions with a poison conditional do not propagate poison, but |
3525 | // branching on poison causes UB. Insert a freeze on the select |
3526 | // conditional to prevent UB after turning the select into a branch. |
3527 | InsertFreeze = !isGuaranteedNotToBeUndefOrPoison( |
3528 | V: SI->getCondition(), AC: &AC, CtxI: L.getLoopPreheader()->getTerminator(), DT: &DT); |
3529 | Best.TI = turnSelectIntoBranch(SI, DT, LI, MSSAU, AC: &AC); |
3530 | } else { |
3531 | // If the best candidate is a guard, turn it into a branch. |
3532 | if (isGuard(U: Best.TI)) |
3533 | Best.TI = |
3534 | turnGuardIntoBranch(GI: cast<IntrinsicInst>(Val: Best.TI), L, DT, LI, MSSAU); |
3535 | InsertFreeze = shouldInsertFreeze(L, TI&: *Best.TI, DT, AC); |
3536 | } |
3537 | |
3538 | LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = " << Best.Cost |
3539 | << ") terminator: " << *Best.TI << "\n" ); |
3540 | unswitchNontrivialInvariants(L, TI&: *Best.TI, Invariants: Best.Invariants, PartialIVInfo, DT, |
3541 | LI, AC, SE, MSSAU, LoopUpdater, InsertFreeze, |
3542 | InjectedCondition); |
3543 | return true; |
3544 | } |
3545 | |
3546 | /// Unswitch control flow predicated on loop invariant conditions. |
3547 | /// |
3548 | /// This first hoists all branches or switches which are trivial (IE, do not |
3549 | /// require duplicating any part of the loop) out of the loop body. It then |
3550 | /// looks at other loop invariant control flows and tries to unswitch those as |
3551 | /// well by cloning the loop if the result is small enough. |
3552 | /// |
3553 | /// The `DT`, `LI`, `AC`, `AA`, `TTI` parameters are required analyses that are |
3554 | /// also updated based on the unswitch. The `MSSA` analysis is also updated if |
3555 | /// valid (i.e. its use is enabled). |
3556 | /// |
3557 | /// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is |
3558 | /// true, we will attempt to do non-trivial unswitching as well as trivial |
3559 | /// unswitching. |
3560 | /// |
3561 | /// The `postUnswitch` function will be run after unswitching is complete |
3562 | /// with information on whether or not the provided loop remains a loop and |
3563 | /// a list of new sibling loops created. |
3564 | /// |
3565 | /// If `SE` is non-null, we will update that analysis based on the unswitching |
3566 | /// done. |
3567 | static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, |
3568 | AssumptionCache &AC, AAResults &AA, |
3569 | TargetTransformInfo &TTI, bool Trivial, |
3570 | bool NonTrivial, ScalarEvolution *SE, |
3571 | MemorySSAUpdater *MSSAU, ProfileSummaryInfo *PSI, |
3572 | BlockFrequencyInfo *BFI, LPMUpdater &LoopUpdater) { |
3573 | assert(L.isRecursivelyLCSSAForm(DT, LI) && |
3574 | "Loops must be in LCSSA form before unswitching." ); |
3575 | |
3576 | // Must be in loop simplified form: we need a preheader and dedicated exits. |
3577 | if (!L.isLoopSimplifyForm()) |
3578 | return false; |
3579 | |
3580 | // Try trivial unswitch first before loop over other basic blocks in the loop. |
3581 | if (Trivial && unswitchAllTrivialConditions(L, DT, LI, SE, MSSAU)) { |
3582 | // If we unswitched successfully we will want to clean up the loop before |
3583 | // processing it further so just mark it as unswitched and return. |
3584 | postUnswitch(L, U&: LoopUpdater, LoopName: L.getName(), |
3585 | /*CurrentLoopValid*/ true, /*PartiallyInvariant*/ false, |
3586 | /*InjectedCondition*/ false, NewLoops: {}); |
3587 | return true; |
3588 | } |
3589 | |
3590 | const Function *F = L.getHeader()->getParent(); |
3591 | |
3592 | // Check whether we should continue with non-trivial conditions. |
3593 | // EnableNonTrivialUnswitch: Global variable that forces non-trivial |
3594 | // unswitching for testing and debugging. |
3595 | // NonTrivial: Parameter that enables non-trivial unswitching for this |
3596 | // invocation of the transform. But this should be allowed only |
3597 | // for targets without branch divergence. |
3598 | // |
3599 | // FIXME: If divergence analysis becomes available to a loop |
3600 | // transform, we should allow unswitching for non-trivial uniform |
3601 | // branches even on targets that have divergence. |
3602 | // https://bugs.llvm.org/show_bug.cgi?id=48819 |
3603 | bool ContinueWithNonTrivial = |
3604 | EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence(F)); |
3605 | if (!ContinueWithNonTrivial) |
3606 | return false; |
3607 | |
3608 | // Skip non-trivial unswitching for optsize functions. |
3609 | if (F->hasOptSize()) |
3610 | return false; |
3611 | |
3612 | // Returns true if Loop L's loop nest is cold, i.e. if the headers of L, |
3613 | // of the loops L is nested in, and of the loops nested in L are all cold. |
3614 | auto IsLoopNestCold = [&](const Loop *L) { |
3615 | // Check L and all of its parent loops. |
3616 | auto *Parent = L; |
3617 | while (Parent) { |
3618 | if (!PSI->isColdBlock(BB: Parent->getHeader(), BFI)) |
3619 | return false; |
3620 | Parent = Parent->getParentLoop(); |
3621 | } |
3622 | // Next check all loops nested within L. |
3623 | SmallVector<const Loop *, 4> Worklist; |
3624 | Worklist.insert(I: Worklist.end(), From: L->getSubLoops().begin(), |
3625 | To: L->getSubLoops().end()); |
3626 | while (!Worklist.empty()) { |
3627 | auto *CurLoop = Worklist.pop_back_val(); |
3628 | if (!PSI->isColdBlock(BB: CurLoop->getHeader(), BFI)) |
3629 | return false; |
3630 | Worklist.insert(I: Worklist.end(), From: CurLoop->getSubLoops().begin(), |
3631 | To: CurLoop->getSubLoops().end()); |
3632 | } |
3633 | return true; |
3634 | }; |
3635 | |
3636 | // Skip cold loops in cold loop nests, as unswitching them brings little |
3637 | // benefit but increases the code size |
3638 | if (PSI && PSI->hasProfileSummary() && BFI && IsLoopNestCold(&L)) { |
3639 | LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n" ); |
3640 | return false; |
3641 | } |
3642 | |
3643 | // Perform legality checks. |
3644 | if (!isSafeForNoNTrivialUnswitching(L, LI)) |
3645 | return false; |
3646 | |
3647 | // For non-trivial unswitching, because it often creates new loops, we rely on |
3648 | // the pass manager to iterate on the loops rather than trying to immediately |
3649 | // reach a fixed point. There is no substantial advantage to iterating |
3650 | // internally, and if any of the new loops are simplified enough to contain |
3651 | // trivial unswitching we want to prefer those. |
3652 | |
3653 | // Try to unswitch the best invariant condition. We prefer this full unswitch to |
3654 | // a partial unswitch when possible below the threshold. |
3655 | if (unswitchBestCondition(L, DT, LI, AC, AA, TTI, SE, MSSAU, LoopUpdater)) |
3656 | return true; |
3657 | |
3658 | // No other opportunities to unswitch. |
3659 | return false; |
3660 | } |
3661 | |
3662 | PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, |
3663 | LoopStandardAnalysisResults &AR, |
3664 | LPMUpdater &U) { |
3665 | Function &F = *L.getHeader()->getParent(); |
3666 | (void)F; |
3667 | ProfileSummaryInfo *PSI = nullptr; |
3668 | if (auto OuterProxy = |
3669 | AM.getResult<FunctionAnalysisManagerLoopProxy>(IR&: L, ExtraArgs&: AR) |
3670 | .getCachedResult<ModuleAnalysisManagerFunctionProxy>(IR&: F)) |
3671 | PSI = OuterProxy->getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent()); |
3672 | LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L |
3673 | << "\n" ); |
3674 | |
3675 | std::optional<MemorySSAUpdater> MSSAU; |
3676 | if (AR.MSSA) { |
3677 | MSSAU = MemorySSAUpdater(AR.MSSA); |
3678 | if (VerifyMemorySSA) |
3679 | AR.MSSA->verifyMemorySSA(); |
3680 | } |
3681 | if (!unswitchLoop(L, DT&: AR.DT, LI&: AR.LI, AC&: AR.AC, AA&: AR.AA, TTI&: AR.TTI, Trivial, NonTrivial, |
3682 | SE: &AR.SE, MSSAU: MSSAU ? &*MSSAU : nullptr, PSI, BFI: AR.BFI, LoopUpdater&: U)) |
3683 | return PreservedAnalyses::all(); |
3684 | |
3685 | if (AR.MSSA && VerifyMemorySSA) |
3686 | AR.MSSA->verifyMemorySSA(); |
3687 | |
3688 | // Historically this pass has had issues with the dominator tree so verify it |
3689 | // in asserts builds. |
3690 | assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast)); |
3691 | |
3692 | auto PA = getLoopPassPreservedAnalyses(); |
3693 | if (AR.MSSA) |
3694 | PA.preserve<MemorySSAAnalysis>(); |
3695 | return PA; |
3696 | } |
3697 | |
3698 | void SimpleLoopUnswitchPass::printPipeline( |
3699 | raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { |
3700 | static_cast<PassInfoMixin<SimpleLoopUnswitchPass> *>(this)->printPipeline( |
3701 | OS, MapClassName2PassName); |
3702 | |
3703 | OS << '<'; |
3704 | OS << (NonTrivial ? "" : "no-" ) << "nontrivial;" ; |
3705 | OS << (Trivial ? "" : "no-" ) << "trivial" ; |
3706 | OS << '>'; |
3707 | } |
3708 | |