1 | //===- AArch64LoopIdiomTransform.cpp - Loop idiom recognition -------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass implements a pass that recognizes certain loop idioms and |
10 | // transforms them into more optimized versions of the same loop. In cases |
11 | // where this happens, it can be a significant performance win. |
12 | // |
13 | // We currently only recognize one loop that finds the first mismatched byte |
14 | // in an array and returns the index, i.e. something like: |
15 | // |
16 | // while (++i != n) { |
17 | // if (a[i] != b[i]) |
18 | // break; |
19 | // } |
20 | // |
21 | // In this example we can actually vectorize the loop despite the early exit, |
22 | // although the loop vectorizer does not support it. It requires some extra |
23 | // checks to deal with the possibility of faulting loads when crossing page |
24 | // boundaries. However, even with these checks it is still profitable to do the |
25 | // transformation. |
26 | // |
27 | //===----------------------------------------------------------------------===// |
28 | // |
29 | // TODO List: |
30 | // |
31 | // * Add support for the inverse case where we scan for a matching element. |
32 | // * Permit 64-bit induction variable types. |
33 | // * Recognize loops that increment the IV *after* comparing bytes. |
34 | // * Allow 32-bit sign-extends of the IV used by the GEP. |
35 | // |
36 | //===----------------------------------------------------------------------===// |
37 | |
38 | #include "AArch64LoopIdiomTransform.h" |
39 | #include "llvm/Analysis/DomTreeUpdater.h" |
40 | #include "llvm/Analysis/LoopPass.h" |
41 | #include "llvm/Analysis/TargetTransformInfo.h" |
42 | #include "llvm/IR/Dominators.h" |
43 | #include "llvm/IR/IRBuilder.h" |
44 | #include "llvm/IR/Intrinsics.h" |
45 | #include "llvm/IR/MDBuilder.h" |
46 | #include "llvm/IR/PatternMatch.h" |
47 | #include "llvm/InitializePasses.h" |
48 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
49 | |
50 | using namespace llvm; |
51 | using namespace PatternMatch; |
52 | |
53 | #define DEBUG_TYPE "aarch64-loop-idiom-transform" |
54 | |
55 | static cl::opt<bool> |
56 | DisableAll("disable-aarch64-lit-all" , cl::Hidden, cl::init(Val: false), |
57 | cl::desc("Disable AArch64 Loop Idiom Transform Pass." )); |
58 | |
59 | static cl::opt<bool> DisableByteCmp( |
60 | "disable-aarch64-lit-bytecmp" , cl::Hidden, cl::init(Val: false), |
61 | cl::desc("Proceed with AArch64 Loop Idiom Transform Pass, but do " |
62 | "not convert byte-compare loop(s)." )); |
63 | |
64 | static cl::opt<bool> VerifyLoops( |
65 | "aarch64-lit-verify" , cl::Hidden, cl::init(Val: false), |
66 | cl::desc("Verify loops generated AArch64 Loop Idiom Transform Pass." )); |
67 | |
namespace llvm {

// Forward declarations for the legacy pass-manager registration hook and the
// factory function; both are defined later in this file.
void initializeAArch64LoopIdiomTransformLegacyPassPass(PassRegistry &);
Pass *createAArch64LoopIdiomTransformPass();

} // end namespace llvm
74 | |
75 | namespace { |
76 | |
/// Recognizes supported loop idioms (currently the byte-compare/mismatch loop
/// described in the file header) and rewrites them into vectorized forms.
/// Shared implementation behind both the legacy and new pass-manager entry
/// points.
class AArch64LoopIdiomTransform {
  Loop *CurLoop = nullptr; // Loop currently being analyzed.
  DominatorTree *DT;
  LoopInfo *LI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;

public:
  explicit AArch64LoopIdiomTransform(DominatorTree *DT, LoopInfo *LI,
                                     const TargetTransformInfo *TTI,
                                     const DataLayout *DL)
      : DT(DT), LI(LI), TTI(TTI), DL(DL) {}

  /// Analyze loop \p L and transform it if a supported idiom is found.
  /// \returns true if the IR was modified.
  bool run(Loop *L);

private:
  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  /// Try to match the byte-compare loop idiom and, on success, rewrite the
  /// loop. \returns true if the IR changed.
  bool recognizeByteCompare();
  /// Emit runtime page checks, an SVE vector loop and a scalar fallback loop
  /// that compute the index of the first mismatching byte between the arrays
  /// addressed by \p GEPA and \p GEPB. \returns the mismatch-index value.
  Value *expandFindMismatch(IRBuilder<> &Builder, DomTreeUpdater &DTU,
                            GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
                            Instruction *Index, Value *Start, Value *MaxLen);
  /// Rewrite the matched byte-compare loop using the code produced by
  /// expandFindMismatch, fixing up the PHIs in \p FoundBB and \p EndBB.
  void transformByteCompare(GetElementPtrInst *GEPA, GetElementPtrInst *GEPB,
                            PHINode *IndPhi, Value *MaxLen, Instruction *Index,
                            Value *Start, bool IncIdx, BasicBlock *FoundBB,
                            BasicBlock *EndBB);
  /// @}
};
110 | |
/// Legacy pass-manager wrapper around AArch64LoopIdiomTransform.
class AArch64LoopIdiomTransformLegacyPass : public LoopPass {
public:
  static char ID; // Pass identification, replacement for typeid.

  explicit AArch64LoopIdiomTransformLegacyPass() : LoopPass(ID) {
    initializeAArch64LoopIdiomTransformLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override {
    return "Transform AArch64-specific loop idioms" ;
  }

  /// Declare the analyses this pass requires before it can run.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
  }

  bool runOnLoop(Loop *L, LPPassManager &LPM) override;
};
132 | |
133 | bool AArch64LoopIdiomTransformLegacyPass::runOnLoop(Loop *L, |
134 | LPPassManager &LPM) { |
135 | |
136 | if (skipLoop(L)) |
137 | return false; |
138 | |
139 | auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
140 | auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
141 | auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI( |
142 | F: *L->getHeader()->getParent()); |
143 | return AArch64LoopIdiomTransform( |
144 | DT, LI, &TTI, &L->getHeader()->getModule()->getDataLayout()) |
145 | .run(L); |
146 | } |
147 | |
148 | } // end anonymous namespace |
149 | |
char AArch64LoopIdiomTransformLegacyPass::ID = 0;

// Register the legacy pass and its analysis dependencies with the pass
// registry under the "aarch64-lit" command-line name.
INITIALIZE_PASS_BEGIN(
    AArch64LoopIdiomTransformLegacyPass, "aarch64-lit" ,
    "Transform specific loop idioms into optimized vector forms" , false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(
    AArch64LoopIdiomTransformLegacyPass, "aarch64-lit" ,
    "Transform specific loop idioms into optimized vector forms" , false, false)

// Factory used by the AArch64 target to create the legacy pass instance.
Pass *llvm::createAArch64LoopIdiomTransformPass() {
  return new AArch64LoopIdiomTransformLegacyPass();
}
167 | |
168 | PreservedAnalyses |
169 | AArch64LoopIdiomTransformPass::run(Loop &L, LoopAnalysisManager &AM, |
170 | LoopStandardAnalysisResults &AR, |
171 | LPMUpdater &) { |
172 | if (DisableAll) |
173 | return PreservedAnalyses::all(); |
174 | |
175 | const auto *DL = &L.getHeader()->getModule()->getDataLayout(); |
176 | |
177 | AArch64LoopIdiomTransform LIT(&AR.DT, &AR.LI, &AR.TTI, DL); |
178 | if (!LIT.run(L: &L)) |
179 | return PreservedAnalyses::all(); |
180 | |
181 | return PreservedAnalyses::none(); |
182 | } |
183 | |
184 | //===----------------------------------------------------------------------===// |
185 | // |
186 | // Implementation of AArch64LoopIdiomTransform |
187 | // |
188 | //===----------------------------------------------------------------------===// |
189 | |
190 | bool AArch64LoopIdiomTransform::run(Loop *L) { |
191 | CurLoop = L; |
192 | |
193 | Function &F = *L->getHeader()->getParent(); |
194 | if (DisableAll || F.hasOptSize()) |
195 | return false; |
196 | |
197 | if (F.hasFnAttribute(Attribute::NoImplicitFloat)) { |
198 | LLVM_DEBUG(dbgs() << DEBUG_TYPE << " is disabled on " << F.getName() |
199 | << " due to its NoImplicitFloat attribute" ); |
200 | return false; |
201 | } |
202 | |
203 | // If the loop could not be converted to canonical form, it must have an |
204 | // indirectbr in it, just give up. |
205 | if (!L->getLoopPreheader()) |
206 | return false; |
207 | |
208 | LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %" |
209 | << CurLoop->getHeader()->getName() << "\n" ); |
210 | |
211 | return recognizeByteCompare(); |
212 | } |
213 | |
214 | bool AArch64LoopIdiomTransform::recognizeByteCompare() { |
215 | // Currently the transformation only works on scalable vector types, although |
216 | // there is no fundamental reason why it cannot be made to work for fixed |
217 | // width too. |
218 | |
219 | // We also need to know the minimum page size for the target in order to |
220 | // generate runtime memory checks to ensure the vector version won't fault. |
221 | if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() || |
222 | DisableByteCmp) |
223 | return false; |
224 | |
225 | BasicBlock * = CurLoop->getHeader(); |
226 | |
227 | // In AArch64LoopIdiomTransform::run we have already checked that the loop |
228 | // has a preheader so we can assume it's in a canonical form. |
229 | if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 2) |
230 | return false; |
231 | |
232 | PHINode *PN = dyn_cast<PHINode>(Val: &Header->front()); |
233 | if (!PN || PN->getNumIncomingValues() != 2) |
234 | return false; |
235 | |
236 | auto LoopBlocks = CurLoop->getBlocks(); |
237 | // The first block in the loop should contain only 4 instructions, e.g. |
238 | // |
239 | // while.cond: |
240 | // %res.phi = phi i32 [ %start, %ph ], [ %inc, %while.body ] |
241 | // %inc = add i32 %res.phi, 1 |
242 | // %cmp.not = icmp eq i32 %inc, %n |
243 | // br i1 %cmp.not, label %while.end, label %while.body |
244 | // |
245 | auto CondBBInsts = LoopBlocks[0]->instructionsWithoutDebug(); |
246 | if (std::distance(first: CondBBInsts.begin(), last: CondBBInsts.end()) > 4) |
247 | return false; |
248 | |
249 | // The second block should contain 7 instructions, e.g. |
250 | // |
251 | // while.body: |
252 | // %idx = zext i32 %inc to i64 |
253 | // %idx.a = getelementptr inbounds i8, ptr %a, i64 %idx |
254 | // %load.a = load i8, ptr %idx.a |
255 | // %idx.b = getelementptr inbounds i8, ptr %b, i64 %idx |
256 | // %load.b = load i8, ptr %idx.b |
257 | // %cmp.not.ld = icmp eq i8 %load.a, %load.b |
258 | // br i1 %cmp.not.ld, label %while.cond, label %while.end |
259 | // |
260 | auto LoopBBInsts = LoopBlocks[1]->instructionsWithoutDebug(); |
261 | if (std::distance(first: LoopBBInsts.begin(), last: LoopBBInsts.end()) > 7) |
262 | return false; |
263 | |
264 | // The incoming value to the PHI node from the loop should be an add of 1. |
265 | Value *StartIdx = nullptr; |
266 | Instruction *Index = nullptr; |
267 | if (!CurLoop->contains(BB: PN->getIncomingBlock(i: 0))) { |
268 | StartIdx = PN->getIncomingValue(i: 0); |
269 | Index = dyn_cast<Instruction>(Val: PN->getIncomingValue(i: 1)); |
270 | } else { |
271 | StartIdx = PN->getIncomingValue(i: 1); |
272 | Index = dyn_cast<Instruction>(Val: PN->getIncomingValue(i: 0)); |
273 | } |
274 | |
275 | // Limit to 32-bit types for now |
276 | if (!Index || !Index->getType()->isIntegerTy(Bitwidth: 32) || |
277 | !match(V: Index, P: m_c_Add(L: m_Specific(V: PN), R: m_One()))) |
278 | return false; |
279 | |
280 | // If we match the pattern, PN and Index will be replaced with the result of |
281 | // the cttz.elts intrinsic. If any other instructions are used outside of |
282 | // the loop, we cannot replace it. |
283 | for (BasicBlock *BB : LoopBlocks) |
284 | for (Instruction &I : *BB) |
285 | if (&I != PN && &I != Index) |
286 | for (User *U : I.users()) |
287 | if (!CurLoop->contains(Inst: cast<Instruction>(Val: U))) |
288 | return false; |
289 | |
290 | // Match the branch instruction for the header |
291 | ICmpInst::Predicate Pred; |
292 | Value *MaxLen; |
293 | BasicBlock *EndBB, *WhileBB; |
294 | if (!match(V: Header->getTerminator(), |
295 | P: m_Br(C: m_ICmp(Pred, L: m_Specific(V: Index), R: m_Value(V&: MaxLen)), |
296 | T: m_BasicBlock(V&: EndBB), F: m_BasicBlock(V&: WhileBB))) || |
297 | Pred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(BB: WhileBB)) |
298 | return false; |
299 | |
300 | // WhileBB should contain the pattern of load & compare instructions. Match |
301 | // the pattern and find the GEP instructions used by the loads. |
302 | ICmpInst::Predicate WhilePred; |
303 | BasicBlock *FoundBB; |
304 | BasicBlock *TrueBB; |
305 | Value *LoadA, *LoadB; |
306 | if (!match(V: WhileBB->getTerminator(), |
307 | P: m_Br(C: m_ICmp(Pred&: WhilePred, L: m_Value(V&: LoadA), R: m_Value(V&: LoadB)), |
308 | T: m_BasicBlock(V&: TrueBB), F: m_BasicBlock(V&: FoundBB))) || |
309 | WhilePred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(BB: TrueBB)) |
310 | return false; |
311 | |
312 | Value *A, *B; |
313 | if (!match(V: LoadA, P: m_Load(Op: m_Value(V&: A))) || !match(V: LoadB, P: m_Load(Op: m_Value(V&: B)))) |
314 | return false; |
315 | |
316 | LoadInst *LoadAI = cast<LoadInst>(Val: LoadA); |
317 | LoadInst *LoadBI = cast<LoadInst>(Val: LoadB); |
318 | if (!LoadAI->isSimple() || !LoadBI->isSimple()) |
319 | return false; |
320 | |
321 | GetElementPtrInst *GEPA = dyn_cast<GetElementPtrInst>(Val: A); |
322 | GetElementPtrInst *GEPB = dyn_cast<GetElementPtrInst>(Val: B); |
323 | |
324 | if (!GEPA || !GEPB) |
325 | return false; |
326 | |
327 | Value *PtrA = GEPA->getPointerOperand(); |
328 | Value *PtrB = GEPB->getPointerOperand(); |
329 | |
330 | // Check we are loading i8 values from two loop invariant pointers |
331 | if (!CurLoop->isLoopInvariant(V: PtrA) || !CurLoop->isLoopInvariant(V: PtrB) || |
332 | !GEPA->getResultElementType()->isIntegerTy(Bitwidth: 8) || |
333 | !GEPB->getResultElementType()->isIntegerTy(Bitwidth: 8) || |
334 | !LoadAI->getType()->isIntegerTy(Bitwidth: 8) || |
335 | !LoadBI->getType()->isIntegerTy(Bitwidth: 8) || PtrA == PtrB) |
336 | return false; |
337 | |
338 | // Check that the index to the GEPs is the index we found earlier |
339 | if (GEPA->getNumIndices() > 1 || GEPB->getNumIndices() > 1) |
340 | return false; |
341 | |
342 | Value *IdxA = GEPA->getOperand(i_nocapture: GEPA->getNumIndices()); |
343 | Value *IdxB = GEPB->getOperand(i_nocapture: GEPB->getNumIndices()); |
344 | if (IdxA != IdxB || !match(V: IdxA, P: m_ZExt(Op: m_Specific(V: Index)))) |
345 | return false; |
346 | |
347 | // We only ever expect the pre-incremented index value to be used inside the |
348 | // loop. |
349 | if (!PN->hasOneUse()) |
350 | return false; |
351 | |
352 | // Ensure that when the Found and End blocks are identical the PHIs have the |
353 | // supported format. We don't currently allow cases like this: |
354 | // while.cond: |
355 | // ... |
356 | // br i1 %cmp.not, label %while.end, label %while.body |
357 | // |
358 | // while.body: |
359 | // ... |
360 | // br i1 %cmp.not2, label %while.cond, label %while.end |
361 | // |
362 | // while.end: |
363 | // %final_ptr = phi ptr [ %c, %while.body ], [ %d, %while.cond ] |
364 | // |
365 | // Where the incoming values for %final_ptr are unique and from each of the |
366 | // loop blocks, but not actually defined in the loop. This requires extra |
367 | // work setting up the byte.compare block, i.e. by introducing a select to |
368 | // choose the correct value. |
369 | // TODO: We could add support for this in future. |
370 | if (FoundBB == EndBB) { |
371 | for (PHINode &EndPN : EndBB->phis()) { |
372 | Value *WhileCondVal = EndPN.getIncomingValueForBlock(BB: Header); |
373 | Value *WhileBodyVal = EndPN.getIncomingValueForBlock(BB: WhileBB); |
374 | |
375 | // The value of the index when leaving the while.cond block is always the |
376 | // same as the end value (MaxLen) so we permit either. The value when |
377 | // leaving the while.body block should only be the index. Otherwise for |
378 | // any other values we only allow ones that are same for both blocks. |
379 | if (WhileCondVal != WhileBodyVal && |
380 | ((WhileCondVal != Index && WhileCondVal != MaxLen) || |
381 | (WhileBodyVal != Index))) |
382 | return false; |
383 | } |
384 | } |
385 | |
386 | LLVM_DEBUG(dbgs() << "FOUND IDIOM IN LOOP: \n" |
387 | << *(EndBB->getParent()) << "\n\n" ); |
388 | |
389 | // The index is incremented before the GEP/Load pair so we need to |
390 | // add 1 to the start value. |
391 | transformByteCompare(GEPA, GEPB, IndPhi: PN, MaxLen, Index, Start: StartIdx, /*IncIdx=*/true, |
392 | FoundBB, EndBB); |
393 | return true; |
394 | } |
395 | |
396 | Value *AArch64LoopIdiomTransform::expandFindMismatch( |
397 | IRBuilder<> &Builder, DomTreeUpdater &DTU, GetElementPtrInst *GEPA, |
398 | GetElementPtrInst *GEPB, Instruction *Index, Value *Start, Value *MaxLen) { |
399 | Value *PtrA = GEPA->getPointerOperand(); |
400 | Value *PtrB = GEPB->getPointerOperand(); |
401 | |
402 | // Get the arguments and types for the intrinsic. |
403 | BasicBlock * = CurLoop->getLoopPreheader(); |
404 | BranchInst *PHBranch = cast<BranchInst>(Val: Preheader->getTerminator()); |
405 | LLVMContext &Ctx = PHBranch->getContext(); |
406 | Type *LoadType = Type::getInt8Ty(C&: Ctx); |
407 | Type *ResType = Builder.getInt32Ty(); |
408 | |
409 | // Split block in the original loop preheader. |
410 | BasicBlock *EndBlock = |
411 | SplitBlock(Old: Preheader, SplitPt: PHBranch, DT, LI, MSSAU: nullptr, BBName: "mismatch_end" ); |
412 | |
413 | // Create the blocks that we're going to need: |
414 | // 1. A block for checking the zero-extended length exceeds 0 |
415 | // 2. A block to check that the start and end addresses of a given array |
416 | // lie on the same page. |
417 | // 3. The SVE loop preheader. |
418 | // 4. The first SVE loop block. |
419 | // 5. The SVE loop increment block. |
420 | // 6. A block we can jump to from the SVE loop when a mismatch is found. |
421 | // 7. The first block of the scalar loop itself, containing PHIs , loads |
422 | // and cmp. |
423 | // 8. A scalar loop increment block to increment the PHIs and go back |
424 | // around the loop. |
425 | |
426 | BasicBlock *MinItCheckBlock = BasicBlock::Create( |
427 | Context&: Ctx, Name: "mismatch_min_it_check" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
428 | |
429 | // Update the terminator added by SplitBlock to branch to the first block |
430 | Preheader->getTerminator()->setSuccessor(Idx: 0, BB: MinItCheckBlock); |
431 | |
432 | BasicBlock *MemCheckBlock = BasicBlock::Create( |
433 | Context&: Ctx, Name: "mismatch_mem_check" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
434 | |
435 | BasicBlock * = BasicBlock::Create( |
436 | Context&: Ctx, Name: "mismatch_sve_loop_preheader" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
437 | |
438 | BasicBlock *SVELoopStartBlock = BasicBlock::Create( |
439 | Context&: Ctx, Name: "mismatch_sve_loop" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
440 | |
441 | BasicBlock *SVELoopIncBlock = BasicBlock::Create( |
442 | Context&: Ctx, Name: "mismatch_sve_loop_inc" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
443 | |
444 | BasicBlock *SVELoopMismatchBlock = BasicBlock::Create( |
445 | Context&: Ctx, Name: "mismatch_sve_loop_found" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
446 | |
447 | BasicBlock * = BasicBlock::Create( |
448 | Context&: Ctx, Name: "mismatch_loop_pre" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
449 | |
450 | BasicBlock *LoopStartBlock = |
451 | BasicBlock::Create(Context&: Ctx, Name: "mismatch_loop" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
452 | |
453 | BasicBlock *LoopIncBlock = BasicBlock::Create( |
454 | Context&: Ctx, Name: "mismatch_loop_inc" , Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
455 | |
456 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, Preheader, MinItCheckBlock}, |
457 | {DominatorTree::Delete, Preheader, EndBlock}}); |
458 | |
459 | // Update LoopInfo with the new SVE & scalar loops. |
460 | auto SVELoop = LI->AllocateLoop(); |
461 | auto ScalarLoop = LI->AllocateLoop(); |
462 | |
463 | if (CurLoop->getParentLoop()) { |
464 | CurLoop->getParentLoop()->addBasicBlockToLoop(NewBB: MinItCheckBlock, LI&: *LI); |
465 | CurLoop->getParentLoop()->addBasicBlockToLoop(NewBB: MemCheckBlock, LI&: *LI); |
466 | CurLoop->getParentLoop()->addBasicBlockToLoop(NewBB: SVELoopPreheaderBlock, LI&: *LI); |
467 | CurLoop->getParentLoop()->addChildLoop(NewChild: SVELoop); |
468 | CurLoop->getParentLoop()->addBasicBlockToLoop(NewBB: SVELoopMismatchBlock, LI&: *LI); |
469 | CurLoop->getParentLoop()->addBasicBlockToLoop(NewBB: LoopPreHeaderBlock, LI&: *LI); |
470 | CurLoop->getParentLoop()->addChildLoop(NewChild: ScalarLoop); |
471 | } else { |
472 | LI->addTopLevelLoop(New: SVELoop); |
473 | LI->addTopLevelLoop(New: ScalarLoop); |
474 | } |
475 | |
476 | // Add the new basic blocks to their associated loops. |
477 | SVELoop->addBasicBlockToLoop(NewBB: SVELoopStartBlock, LI&: *LI); |
478 | SVELoop->addBasicBlockToLoop(NewBB: SVELoopIncBlock, LI&: *LI); |
479 | |
480 | ScalarLoop->addBasicBlockToLoop(NewBB: LoopStartBlock, LI&: *LI); |
481 | ScalarLoop->addBasicBlockToLoop(NewBB: LoopIncBlock, LI&: *LI); |
482 | |
483 | // Set up some types and constants that we intend to reuse. |
484 | Type *I64Type = Builder.getInt64Ty(); |
485 | |
486 | // Check the zero-extended iteration count > 0 |
487 | Builder.SetInsertPoint(MinItCheckBlock); |
488 | Value *ExtStart = Builder.CreateZExt(V: Start, DestTy: I64Type); |
489 | Value *ExtEnd = Builder.CreateZExt(V: MaxLen, DestTy: I64Type); |
490 | // This check doesn't really cost us very much. |
491 | |
492 | Value *LimitCheck = Builder.CreateICmpULE(LHS: Start, RHS: MaxLen); |
493 | BranchInst *MinItCheckBr = |
494 | BranchInst::Create(IfTrue: MemCheckBlock, IfFalse: LoopPreHeaderBlock, Cond: LimitCheck); |
495 | MinItCheckBr->setMetadata( |
496 | KindID: LLVMContext::MD_prof, |
497 | Node: MDBuilder(MinItCheckBr->getContext()).createBranchWeights(TrueWeight: 99, FalseWeight: 1)); |
498 | Builder.Insert(I: MinItCheckBr); |
499 | |
500 | DTU.applyUpdates( |
501 | Updates: {{DominatorTree::Insert, MinItCheckBlock, MemCheckBlock}, |
502 | {DominatorTree::Insert, MinItCheckBlock, LoopPreHeaderBlock}}); |
503 | |
504 | // For each of the arrays, check the start/end addresses are on the same |
505 | // page. |
506 | Builder.SetInsertPoint(MemCheckBlock); |
507 | |
508 | // The early exit in the original loop means that when performing vector |
509 | // loads we are potentially reading ahead of the early exit. So we could |
510 | // fault if crossing a page boundary. Therefore, we create runtime memory |
511 | // checks based on the minimum page size as follows: |
512 | // 1. Calculate the addresses of the first memory accesses in the loop, |
513 | // i.e. LhsStart and RhsStart. |
514 | // 2. Get the last accessed addresses in the loop, i.e. LhsEnd and RhsEnd. |
515 | // 3. Determine which pages correspond to all the memory accesses, i.e |
516 | // LhsStartPage, LhsEndPage, RhsStartPage, RhsEndPage. |
517 | // 4. If LhsStartPage == LhsEndPage and RhsStartPage == RhsEndPage, then |
518 | // we know we won't cross any page boundaries in the loop so we can |
519 | // enter the vector loop! Otherwise we fall back on the scalar loop. |
520 | Value *LhsStartGEP = Builder.CreateGEP(Ty: LoadType, Ptr: PtrA, IdxList: ExtStart); |
521 | Value *RhsStartGEP = Builder.CreateGEP(Ty: LoadType, Ptr: PtrB, IdxList: ExtStart); |
522 | Value *RhsStart = Builder.CreatePtrToInt(V: RhsStartGEP, DestTy: I64Type); |
523 | Value *LhsStart = Builder.CreatePtrToInt(V: LhsStartGEP, DestTy: I64Type); |
524 | Value *LhsEndGEP = Builder.CreateGEP(Ty: LoadType, Ptr: PtrA, IdxList: ExtEnd); |
525 | Value *RhsEndGEP = Builder.CreateGEP(Ty: LoadType, Ptr: PtrB, IdxList: ExtEnd); |
526 | Value *LhsEnd = Builder.CreatePtrToInt(V: LhsEndGEP, DestTy: I64Type); |
527 | Value *RhsEnd = Builder.CreatePtrToInt(V: RhsEndGEP, DestTy: I64Type); |
528 | |
529 | const uint64_t MinPageSize = TTI->getMinPageSize().value(); |
530 | const uint64_t AddrShiftAmt = llvm::Log2_64(Value: MinPageSize); |
531 | Value *LhsStartPage = Builder.CreateLShr(LHS: LhsStart, RHS: AddrShiftAmt); |
532 | Value *LhsEndPage = Builder.CreateLShr(LHS: LhsEnd, RHS: AddrShiftAmt); |
533 | Value *RhsStartPage = Builder.CreateLShr(LHS: RhsStart, RHS: AddrShiftAmt); |
534 | Value *RhsEndPage = Builder.CreateLShr(LHS: RhsEnd, RHS: AddrShiftAmt); |
535 | Value *LhsPageCmp = Builder.CreateICmpNE(LHS: LhsStartPage, RHS: LhsEndPage); |
536 | Value *RhsPageCmp = Builder.CreateICmpNE(LHS: RhsStartPage, RHS: RhsEndPage); |
537 | |
538 | Value *CombinedPageCmp = Builder.CreateOr(LHS: LhsPageCmp, RHS: RhsPageCmp); |
539 | BranchInst *CombinedPageCmpCmpBr = BranchInst::Create( |
540 | IfTrue: LoopPreHeaderBlock, IfFalse: SVELoopPreheaderBlock, Cond: CombinedPageCmp); |
541 | CombinedPageCmpCmpBr->setMetadata( |
542 | KindID: LLVMContext::MD_prof, Node: MDBuilder(CombinedPageCmpCmpBr->getContext()) |
543 | .createBranchWeights(TrueWeight: 10, FalseWeight: 90)); |
544 | Builder.Insert(I: CombinedPageCmpCmpBr); |
545 | |
546 | DTU.applyUpdates( |
547 | Updates: {{DominatorTree::Insert, MemCheckBlock, LoopPreHeaderBlock}, |
548 | {DominatorTree::Insert, MemCheckBlock, SVELoopPreheaderBlock}}); |
549 | |
550 | // Set up the SVE loop preheader, i.e. calculate initial loop predicate, |
551 | // zero-extend MaxLen to 64-bits, determine the number of vector elements |
552 | // processed in each iteration, etc. |
553 | Builder.SetInsertPoint(SVELoopPreheaderBlock); |
554 | |
555 | // At this point we know two things must be true: |
556 | // 1. Start <= End |
557 | // 2. ExtMaxLen <= MinPageSize due to the page checks. |
558 | // Therefore, we know that we can use a 64-bit induction variable that |
559 | // starts from 0 -> ExtMaxLen and it will not overflow. |
560 | ScalableVectorType *PredVTy = |
561 | ScalableVectorType::get(ElementType: Builder.getInt1Ty(), MinNumElts: 16); |
562 | |
563 | Value *InitialPred = Builder.CreateIntrinsic( |
564 | Intrinsic::get_active_lane_mask, {PredVTy, I64Type}, {ExtStart, ExtEnd}); |
565 | |
566 | Value *VecLen = Builder.CreateIntrinsic(Intrinsic::vscale, {I64Type}, {}); |
567 | VecLen = Builder.CreateMul(LHS: VecLen, RHS: ConstantInt::get(Ty: I64Type, V: 16), Name: "" , |
568 | /*HasNUW=*/true, /*HasNSW=*/true); |
569 | |
570 | Value *PFalse = Builder.CreateVectorSplat(EC: PredVTy->getElementCount(), |
571 | V: Builder.getInt1(V: false)); |
572 | |
573 | BranchInst *JumpToSVELoop = BranchInst::Create(IfTrue: SVELoopStartBlock); |
574 | Builder.Insert(I: JumpToSVELoop); |
575 | |
576 | DTU.applyUpdates( |
577 | Updates: {{DominatorTree::Insert, SVELoopPreheaderBlock, SVELoopStartBlock}}); |
578 | |
579 | // Set up the first SVE loop block by creating the PHIs, doing the vector |
580 | // loads and comparing the vectors. |
581 | Builder.SetInsertPoint(SVELoopStartBlock); |
582 | PHINode *LoopPred = Builder.CreatePHI(Ty: PredVTy, NumReservedValues: 2, Name: "mismatch_sve_loop_pred" ); |
583 | LoopPred->addIncoming(V: InitialPred, BB: SVELoopPreheaderBlock); |
584 | PHINode *SVEIndexPhi = Builder.CreatePHI(Ty: I64Type, NumReservedValues: 2, Name: "mismatch_sve_index" ); |
585 | SVEIndexPhi->addIncoming(V: ExtStart, BB: SVELoopPreheaderBlock); |
586 | Type *SVELoadType = ScalableVectorType::get(ElementType: Builder.getInt8Ty(), MinNumElts: 16); |
587 | Value *Passthru = ConstantInt::getNullValue(Ty: SVELoadType); |
588 | |
589 | Value *SVELhsGep = Builder.CreateGEP(Ty: LoadType, Ptr: PtrA, IdxList: SVEIndexPhi); |
590 | if (GEPA->isInBounds()) |
591 | cast<GetElementPtrInst>(Val: SVELhsGep)->setIsInBounds(true); |
592 | Value *SVELhsLoad = Builder.CreateMaskedLoad(Ty: SVELoadType, Ptr: SVELhsGep, Alignment: Align(1), |
593 | Mask: LoopPred, PassThru: Passthru); |
594 | |
595 | Value *SVERhsGep = Builder.CreateGEP(Ty: LoadType, Ptr: PtrB, IdxList: SVEIndexPhi); |
596 | if (GEPB->isInBounds()) |
597 | cast<GetElementPtrInst>(Val: SVERhsGep)->setIsInBounds(true); |
598 | Value *SVERhsLoad = Builder.CreateMaskedLoad(Ty: SVELoadType, Ptr: SVERhsGep, Alignment: Align(1), |
599 | Mask: LoopPred, PassThru: Passthru); |
600 | |
601 | Value *SVEMatchCmp = Builder.CreateICmpNE(LHS: SVELhsLoad, RHS: SVERhsLoad); |
602 | SVEMatchCmp = Builder.CreateSelect(C: LoopPred, True: SVEMatchCmp, False: PFalse); |
603 | Value *SVEMatchHasActiveLanes = Builder.CreateOrReduce(Src: SVEMatchCmp); |
604 | BranchInst *SVEEarlyExit = BranchInst::Create( |
605 | IfTrue: SVELoopMismatchBlock, IfFalse: SVELoopIncBlock, Cond: SVEMatchHasActiveLanes); |
606 | Builder.Insert(I: SVEEarlyExit); |
607 | |
608 | DTU.applyUpdates( |
609 | Updates: {{DominatorTree::Insert, SVELoopStartBlock, SVELoopMismatchBlock}, |
610 | {DominatorTree::Insert, SVELoopStartBlock, SVELoopIncBlock}}); |
611 | |
612 | // Increment the index counter and calculate the predicate for the next |
613 | // iteration of the loop. We branch back to the start of the loop if there |
614 | // is at least one active lane. |
615 | Builder.SetInsertPoint(SVELoopIncBlock); |
616 | Value *NewSVEIndexPhi = Builder.CreateAdd(LHS: SVEIndexPhi, RHS: VecLen, Name: "" , |
617 | /*HasNUW=*/true, /*HasNSW=*/true); |
618 | SVEIndexPhi->addIncoming(V: NewSVEIndexPhi, BB: SVELoopIncBlock); |
619 | Value *NewPred = |
620 | Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, |
621 | {PredVTy, I64Type}, {NewSVEIndexPhi, ExtEnd}); |
622 | LoopPred->addIncoming(V: NewPred, BB: SVELoopIncBlock); |
623 | |
624 | Value *PredHasActiveLanes = |
625 | Builder.CreateExtractElement(Vec: NewPred, Idx: uint64_t(0)); |
626 | BranchInst *SVELoopBranchBack = |
627 | BranchInst::Create(IfTrue: SVELoopStartBlock, IfFalse: EndBlock, Cond: PredHasActiveLanes); |
628 | Builder.Insert(I: SVELoopBranchBack); |
629 | |
630 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, SVELoopIncBlock, SVELoopStartBlock}, |
631 | {DominatorTree::Insert, SVELoopIncBlock, EndBlock}}); |
632 | |
633 | // If we found a mismatch then we need to calculate which lane in the vector |
634 | // had a mismatch and add that on to the current loop index. |
635 | Builder.SetInsertPoint(SVELoopMismatchBlock); |
636 | PHINode *FoundPred = Builder.CreatePHI(Ty: PredVTy, NumReservedValues: 1, Name: "mismatch_sve_found_pred" ); |
637 | FoundPred->addIncoming(V: SVEMatchCmp, BB: SVELoopStartBlock); |
638 | PHINode *LastLoopPred = |
639 | Builder.CreatePHI(Ty: PredVTy, NumReservedValues: 1, Name: "mismatch_sve_last_loop_pred" ); |
640 | LastLoopPred->addIncoming(V: LoopPred, BB: SVELoopStartBlock); |
641 | PHINode *SVEFoundIndex = |
642 | Builder.CreatePHI(Ty: I64Type, NumReservedValues: 1, Name: "mismatch_sve_found_index" ); |
643 | SVEFoundIndex->addIncoming(V: SVEIndexPhi, BB: SVELoopStartBlock); |
644 | |
645 | Value *PredMatchCmp = Builder.CreateAnd(LHS: LastLoopPred, RHS: FoundPred); |
646 | Value *Ctz = Builder.CreateIntrinsic( |
647 | Intrinsic::experimental_cttz_elts, {ResType, PredMatchCmp->getType()}, |
648 | {PredMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(V: true)}); |
649 | Ctz = Builder.CreateZExt(V: Ctz, DestTy: I64Type); |
650 | Value *SVELoopRes64 = Builder.CreateAdd(LHS: SVEFoundIndex, RHS: Ctz, Name: "" , |
651 | /*HasNUW=*/true, /*HasNSW=*/true); |
652 | Value *SVELoopRes = Builder.CreateTrunc(V: SVELoopRes64, DestTy: ResType); |
653 | |
654 | Builder.Insert(I: BranchInst::Create(IfTrue: EndBlock)); |
655 | |
656 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, SVELoopMismatchBlock, EndBlock}}); |
657 | |
658 | // Generate code for scalar loop. |
659 | Builder.SetInsertPoint(LoopPreHeaderBlock); |
660 | Builder.Insert(I: BranchInst::Create(IfTrue: LoopStartBlock)); |
661 | |
662 | DTU.applyUpdates( |
663 | Updates: {{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}}); |
664 | |
665 | Builder.SetInsertPoint(LoopStartBlock); |
666 | PHINode *IndexPhi = Builder.CreatePHI(Ty: ResType, NumReservedValues: 2, Name: "mismatch_index" ); |
667 | IndexPhi->addIncoming(V: Start, BB: LoopPreHeaderBlock); |
668 | |
669 | // Otherwise compare the values |
670 | // Load bytes from each array and compare them. |
671 | Value *GepOffset = Builder.CreateZExt(V: IndexPhi, DestTy: I64Type); |
672 | |
673 | Value *LhsGep = Builder.CreateGEP(Ty: LoadType, Ptr: PtrA, IdxList: GepOffset); |
674 | if (GEPA->isInBounds()) |
675 | cast<GetElementPtrInst>(Val: LhsGep)->setIsInBounds(true); |
676 | Value *LhsLoad = Builder.CreateLoad(Ty: LoadType, Ptr: LhsGep); |
677 | |
678 | Value *RhsGep = Builder.CreateGEP(Ty: LoadType, Ptr: PtrB, IdxList: GepOffset); |
679 | if (GEPB->isInBounds()) |
680 | cast<GetElementPtrInst>(Val: RhsGep)->setIsInBounds(true); |
681 | Value *RhsLoad = Builder.CreateLoad(Ty: LoadType, Ptr: RhsGep); |
682 | |
683 | Value *MatchCmp = Builder.CreateICmpEQ(LHS: LhsLoad, RHS: RhsLoad); |
684 | // If we have a mismatch then exit the loop ... |
685 | BranchInst *MatchCmpBr = BranchInst::Create(IfTrue: LoopIncBlock, IfFalse: EndBlock, Cond: MatchCmp); |
686 | Builder.Insert(I: MatchCmpBr); |
687 | |
688 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, LoopStartBlock, LoopIncBlock}, |
689 | {DominatorTree::Insert, LoopStartBlock, EndBlock}}); |
690 | |
691 | // Have we reached the maximum permitted length for the loop? |
692 | Builder.SetInsertPoint(LoopIncBlock); |
693 | Value *PhiInc = Builder.CreateAdd(LHS: IndexPhi, RHS: ConstantInt::get(Ty: ResType, V: 1), Name: "" , |
694 | /*HasNUW=*/Index->hasNoUnsignedWrap(), |
695 | /*HasNSW=*/Index->hasNoSignedWrap()); |
696 | IndexPhi->addIncoming(V: PhiInc, BB: LoopIncBlock); |
697 | Value *IVCmp = Builder.CreateICmpEQ(LHS: PhiInc, RHS: MaxLen); |
698 | BranchInst *IVCmpBr = BranchInst::Create(IfTrue: EndBlock, IfFalse: LoopStartBlock, Cond: IVCmp); |
699 | Builder.Insert(I: IVCmpBr); |
700 | |
701 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, LoopIncBlock, EndBlock}, |
702 | {DominatorTree::Insert, LoopIncBlock, LoopStartBlock}}); |
703 | |
// In the end block we need to insert a PHI node to deal with four cases:
//  1. We didn't find a mismatch in the scalar loop, so we return MaxLen.
//  2. We exited the scalar loop early due to a mismatch and need to return
//  the index that we found.
//  3. We didn't find a mismatch in the SVE loop, so we return MaxLen.
//  4. We exited the SVE loop early due to a mismatch and need to return
//  the index that we found.
711 | Builder.SetInsertPoint(TheBB: EndBlock, IP: EndBlock->getFirstInsertionPt()); |
712 | PHINode *ResPhi = Builder.CreatePHI(Ty: ResType, NumReservedValues: 4, Name: "mismatch_result" ); |
713 | ResPhi->addIncoming(V: MaxLen, BB: LoopIncBlock); |
714 | ResPhi->addIncoming(V: IndexPhi, BB: LoopStartBlock); |
715 | ResPhi->addIncoming(V: MaxLen, BB: SVELoopIncBlock); |
716 | ResPhi->addIncoming(V: SVELoopRes, BB: SVELoopMismatchBlock); |
717 | |
718 | Value *FinalRes = Builder.CreateTrunc(V: ResPhi, DestTy: ResType); |
719 | |
720 | if (VerifyLoops) { |
721 | ScalarLoop->verifyLoop(); |
722 | SVELoop->verifyLoop(); |
723 | if (!SVELoop->isRecursivelyLCSSAForm(DT: *DT, LI: *LI)) |
724 | report_fatal_error(reason: "Loops must remain in LCSSA form!" ); |
725 | if (!ScalarLoop->isRecursivelyLCSSAForm(DT: *DT, LI: *LI)) |
726 | report_fatal_error(reason: "Loops must remain in LCSSA form!" ); |
727 | } |
728 | |
729 | return FinalRes; |
730 | } |
731 | |
732 | void AArch64LoopIdiomTransform::transformByteCompare( |
733 | GetElementPtrInst *GEPA, GetElementPtrInst *GEPB, PHINode *IndPhi, |
734 | Value *MaxLen, Instruction *Index, Value *Start, bool IncIdx, |
735 | BasicBlock *FoundBB, BasicBlock *EndBB) { |
736 | |
737 | // Insert the byte compare code at the end of the preheader block |
738 | BasicBlock * = CurLoop->getLoopPreheader(); |
739 | BasicBlock * = CurLoop->getHeader(); |
740 | BranchInst *PHBranch = cast<BranchInst>(Val: Preheader->getTerminator()); |
741 | IRBuilder<> Builder(PHBranch); |
742 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); |
743 | Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); |
744 | |
745 | // Increment the pointer if this was done before the loads in the loop. |
746 | if (IncIdx) |
747 | Start = Builder.CreateAdd(LHS: Start, RHS: ConstantInt::get(Ty: Start->getType(), V: 1)); |
748 | |
749 | Value *ByteCmpRes = |
750 | expandFindMismatch(Builder, DTU, GEPA, GEPB, Index, Start, MaxLen); |
751 | |
752 | // Replaces uses of index & induction Phi with intrinsic (we already |
753 | // checked that the the first instruction of Header is the Phi above). |
754 | assert(IndPhi->hasOneUse() && "Index phi node has more than one use!" ); |
755 | Index->replaceAllUsesWith(V: ByteCmpRes); |
756 | |
757 | assert(PHBranch->isUnconditional() && |
758 | "Expected preheader to terminate with an unconditional branch." ); |
759 | |
760 | // If no mismatch was found, we can jump to the end block. Create a |
761 | // new basic block for the compare instruction. |
762 | auto *CmpBB = BasicBlock::Create(Context&: Preheader->getContext(), Name: "byte.compare" , |
763 | Parent: Preheader->getParent()); |
764 | CmpBB->moveBefore(MovePos: EndBB); |
765 | |
766 | // Replace the branch in the preheader with an always-true conditional branch. |
767 | // This ensures there is still a reference to the original loop. |
768 | Builder.CreateCondBr(Cond: Builder.getTrue(), True: CmpBB, False: Header); |
769 | PHBranch->eraseFromParent(); |
770 | |
771 | BasicBlock *MismatchEnd = cast<Instruction>(Val: ByteCmpRes)->getParent(); |
772 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, MismatchEnd, CmpBB}}); |
773 | |
774 | // Create the branch to either the end or found block depending on the value |
775 | // returned by the intrinsic. |
776 | Builder.SetInsertPoint(CmpBB); |
777 | if (FoundBB != EndBB) { |
778 | Value *FoundCmp = Builder.CreateICmpEQ(LHS: ByteCmpRes, RHS: MaxLen); |
779 | Builder.CreateCondBr(Cond: FoundCmp, True: EndBB, False: FoundBB); |
780 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, CmpBB, FoundBB}, |
781 | {DominatorTree::Insert, CmpBB, EndBB}}); |
782 | |
783 | } else { |
784 | Builder.CreateBr(Dest: FoundBB); |
785 | DTU.applyUpdates(Updates: {{DominatorTree::Insert, CmpBB, FoundBB}}); |
786 | } |
787 | |
788 | auto fixSuccessorPhis = [&](BasicBlock *SuccBB) { |
789 | for (PHINode &PN : SuccBB->phis()) { |
790 | // At this point we've already replaced all uses of the result from the |
791 | // loop with ByteCmp. Look through the incoming values to find ByteCmp, |
792 | // meaning this is a Phi collecting the results of the byte compare. |
793 | bool ResPhi = false; |
794 | for (Value *Op : PN.incoming_values()) |
795 | if (Op == ByteCmpRes) { |
796 | ResPhi = true; |
797 | break; |
798 | } |
799 | |
800 | // Any PHI that depended upon the result of the byte compare needs a new |
801 | // incoming value from CmpBB. This is because the original loop will get |
802 | // deleted. |
803 | if (ResPhi) |
804 | PN.addIncoming(V: ByteCmpRes, BB: CmpBB); |
805 | else { |
806 | // There should be no other outside uses of other values in the |
807 | // original loop. Any incoming values should either: |
808 | // 1. Be for blocks outside the loop, which aren't interesting. Or .. |
809 | // 2. These are from blocks in the loop with values defined outside |
810 | // the loop. We should a similar incoming value from CmpBB. |
811 | for (BasicBlock *BB : PN.blocks()) |
812 | if (CurLoop->contains(BB)) { |
813 | PN.addIncoming(V: PN.getIncomingValueForBlock(BB), BB: CmpBB); |
814 | break; |
815 | } |
816 | } |
817 | } |
818 | }; |
819 | |
820 | // Ensure all Phis in the successors of CmpBB have an incoming value from it. |
821 | fixSuccessorPhis(EndBB); |
822 | if (EndBB != FoundBB) |
823 | fixSuccessorPhis(FoundBB); |
824 | |
825 | // The new CmpBB block isn't part of the loop, but will need to be added to |
826 | // the outer loop if there is one. |
827 | if (!CurLoop->isOutermost()) |
828 | CurLoop->getParentLoop()->addBasicBlockToLoop(NewBB: CmpBB, LI&: *LI); |
829 | |
830 | if (VerifyLoops && CurLoop->getParentLoop()) { |
831 | CurLoop->getParentLoop()->verifyLoop(); |
832 | if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(DT: *DT, LI: *LI)) |
833 | report_fatal_error(reason: "Loops must remain in LCSSA form!" ); |
834 | } |
835 | } |
836 | |