1//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file contains implementations for different VPlan recipes.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlan.h"
15#include "VPlanAnalysis.h"
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallVector.h"
18#include "llvm/ADT/Twine.h"
19#include "llvm/Analysis/IVDescriptors.h"
20#include "llvm/IR/BasicBlock.h"
21#include "llvm/IR/IRBuilder.h"
22#include "llvm/IR/Instruction.h"
23#include "llvm/IR/Instructions.h"
24#include "llvm/IR/Type.h"
25#include "llvm/IR/Value.h"
26#include "llvm/Support/Casting.h"
27#include "llvm/Support/CommandLine.h"
28#include "llvm/Support/Debug.h"
29#include "llvm/Support/raw_ostream.h"
30#include "llvm/Transforms/Utils/BasicBlockUtils.h"
31#include "llvm/Transforms/Utils/LoopUtils.h"
32#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
33#include <cassert>
34
35using namespace llvm;
36
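// A collection of per-unroll-part generated values for a single VPValue,
// indexed by unroll part.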
37using VectorParts = SmallVector<Value *, 2>;
38
39namespace llvm {
40extern cl::opt<bool> EnableVPlanNativePath;
41}
42
43#define LV_NAME "loop-vectorize"
44#define DEBUG_TYPE LV_NAME
45
46bool VPRecipeBase::mayWriteToMemory() const {
47 switch (getVPDefID()) {
48 case VPInterleaveSC:
49 return cast<VPInterleaveRecipe>(Val: this)->getNumStoreOperands() > 0;
50 case VPWidenStoreEVLSC:
51 case VPWidenStoreSC:
52 return true;
53 case VPReplicateSC:
54 case VPWidenCallSC:
55 return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
56 ->mayWriteToMemory();
57 case VPBranchOnMaskSC:
58 case VPScalarIVStepsSC:
59 case VPPredInstPHISC:
60 return false;
61 case VPBlendSC:
62 case VPReductionSC:
63 case VPWidenCanonicalIVSC:
64 case VPWidenCastSC:
65 case VPWidenGEPSC:
66 case VPWidenIntOrFpInductionSC:
67 case VPWidenLoadEVLSC:
68 case VPWidenLoadSC:
69 case VPWidenPHISC:
70 case VPWidenSC:
71 case VPWidenSelectSC: {
72 const Instruction *I =
73 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
74 (void)I;
75 assert((!I || !I->mayWriteToMemory()) &&
76 "underlying instruction may write to memory");
77 return false;
78 }
79 default:
80 return true;
81 }
82}
83
84bool VPRecipeBase::mayReadFromMemory() const {
85 switch (getVPDefID()) {
86 case VPWidenLoadEVLSC:
87 case VPWidenLoadSC:
88 return true;
89 case VPReplicateSC:
90 case VPWidenCallSC:
91 return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
92 ->mayReadFromMemory();
93 case VPBranchOnMaskSC:
94 case VPPredInstPHISC:
95 case VPScalarIVStepsSC:
96 case VPWidenStoreEVLSC:
97 case VPWidenStoreSC:
98 return false;
99 case VPBlendSC:
100 case VPReductionSC:
101 case VPWidenCanonicalIVSC:
102 case VPWidenCastSC:
103 case VPWidenGEPSC:
104 case VPWidenIntOrFpInductionSC:
105 case VPWidenPHISC:
106 case VPWidenSC:
107 case VPWidenSelectSC: {
108 const Instruction *I =
109 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
110 (void)I;
111 assert((!I || !I->mayReadFromMemory()) &&
112 "underlying instruction may read from memory");
113 return false;
114 }
115 default:
116 return true;
117 }
118}
119
120bool VPRecipeBase::mayHaveSideEffects() const {
121 switch (getVPDefID()) {
122 case VPDerivedIVSC:
123 case VPPredInstPHISC:
124 case VPScalarCastSC:
125 return false;
126 case VPInstructionSC:
127 switch (cast<VPInstruction>(Val: this)->getOpcode()) {
128 case Instruction::Or:
129 case Instruction::ICmp:
130 case Instruction::Select:
131 case VPInstruction::Not:
132 case VPInstruction::CalculateTripCountMinusVF:
133 case VPInstruction::CanonicalIVIncrementForPart:
134 case VPInstruction::PtrAdd:
135 return false;
136 default:
137 return true;
138 }
139 case VPWidenCallSC:
140 return cast<Instruction>(Val: getVPSingleValue()->getUnderlyingValue())
141 ->mayHaveSideEffects();
142 case VPBlendSC:
143 case VPReductionSC:
144 case VPScalarIVStepsSC:
145 case VPWidenCanonicalIVSC:
146 case VPWidenCastSC:
147 case VPWidenGEPSC:
148 case VPWidenIntOrFpInductionSC:
149 case VPWidenPHISC:
150 case VPWidenPointerInductionSC:
151 case VPWidenSC:
152 case VPWidenSelectSC: {
153 const Instruction *I =
154 dyn_cast_or_null<Instruction>(Val: getVPSingleValue()->getUnderlyingValue());
155 (void)I;
156 assert((!I || !I->mayHaveSideEffects()) &&
157 "underlying instruction has side-effects");
158 return false;
159 }
160 case VPInterleaveSC:
161 return mayWriteToMemory();
162 case VPWidenLoadEVLSC:
163 case VPWidenLoadSC:
164 case VPWidenStoreEVLSC:
165 case VPWidenStoreSC:
166 assert(
167 cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
168 mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
170 "implementation");
171 return mayWriteToMemory();
172 case VPReplicateSC: {
173 auto *R = cast<VPReplicateRecipe>(Val: this);
174 return R->getUnderlyingInstr()->mayHaveSideEffects();
175 }
176 default:
177 return true;
178 }
179}
180
181void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
182 auto Lane = VPLane::getLastLaneForVF(VF: State.VF);
183 VPValue *ExitValue = getOperand(N: 0);
184 if (vputils::isUniformAfterVectorization(VPV: ExitValue))
185 Lane = VPLane::getFirstLane();
186 VPBasicBlock *MiddleVPBB =
187 cast<VPBasicBlock>(Val: Plan.getVectorLoopRegion()->getSingleSuccessor());
188 assert(MiddleVPBB->getNumSuccessors() == 0 &&
189 "the middle block must not have any successors");
190 BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
191 Phi->addIncoming(V: State.get(Def: ExitValue, Instance: VPIteration(State.UF - 1, Lane)),
192 BB: MiddleBB);
193}
194
195#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
196void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
197 O << "Live-out ";
198 getPhi()->printAsOperand(O);
199 O << " = ";
200 getOperand(N: 0)->printAsOperand(OS&: O, Tracker&: SlotTracker);
201 O << "\n";
202}
203#endif
204
205void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
206 assert(!Parent && "Recipe already in some VPBasicBlock");
207 assert(InsertPos->getParent() &&
208 "Insertion position not in any VPBasicBlock");
209 InsertPos->getParent()->insert(Recipe: this, InsertPt: InsertPos->getIterator());
210}
211
212void VPRecipeBase::insertBefore(VPBasicBlock &BB,
213 iplist<VPRecipeBase>::iterator I) {
214 assert(!Parent && "Recipe already in some VPBasicBlock");
215 assert(I == BB.end() || I->getParent() == &BB);
216 BB.insert(Recipe: this, InsertPt: I);
217}
218
219void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
220 assert(!Parent && "Recipe already in some VPBasicBlock");
221 assert(InsertPos->getParent() &&
222 "Insertion position not in any VPBasicBlock");
223 InsertPos->getParent()->insert(Recipe: this, InsertPt: std::next(x: InsertPos->getIterator()));
224}
225
226void VPRecipeBase::removeFromParent() {
227 assert(getParent() && "Recipe not in any VPBasicBlock");
228 getParent()->getRecipeList().remove(IT: getIterator());
229 Parent = nullptr;
230}
231
232iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
233 assert(getParent() && "Recipe not in any VPBasicBlock");
234 return getParent()->getRecipeList().erase(where: getIterator());
235}
236
237void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
238 removeFromParent();
239 insertAfter(InsertPos);
240}
241
242void VPRecipeBase::moveBefore(VPBasicBlock &BB,
243 iplist<VPRecipeBase>::iterator I) {
244 removeFromParent();
245 insertBefore(BB, I);
246}
247
248FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
249 assert(OpType == OperationType::FPMathOp &&
250 "recipe doesn't have fast math flags");
251 FastMathFlags Res;
252 Res.setAllowReassoc(FMFs.AllowReassoc);
253 Res.setNoNaNs(FMFs.NoNaNs);
254 Res.setNoInfs(FMFs.NoInfs);
255 Res.setNoSignedZeros(FMFs.NoSignedZeros);
256 Res.setAllowReciprocal(FMFs.AllowReciprocal);
257 Res.setAllowContract(FMFs.AllowContract);
258 Res.setApproxFunc(FMFs.ApproxFunc);
259 return Res;
260}
261
262VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
263 VPValue *A, VPValue *B, DebugLoc DL,
264 const Twine &Name)
265 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
266 Pred, DL),
267 Opcode(Opcode), Name(Name.str()) {
268 assert(Opcode == Instruction::ICmp &&
269 "only ICmp predicates supported at the moment");
270}
271
272VPInstruction::VPInstruction(unsigned Opcode,
273 std::initializer_list<VPValue *> Operands,
274 FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
275 : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
276 Opcode(Opcode), Name(Name.str()) {
277 // Make sure the VPInstruction is a floating-point operation.
278 assert(isFPMathOp() && "this op can't take fast-math flags");
279}
280
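// Returns true if this VPInstruction generates scalar values for all lanes.
// Currently only PtrAdds whose users require more than the first lane do so.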
281bool VPInstruction::doesGeneratePerAllLanes() const {
282 return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(Def: this);
283}
284
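// Returns true if a scalar value for the first lane can be generated for the
// opcode, instead of a full vector value.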
285bool VPInstruction::canGenerateScalarForFirstLane() const {
286 if (Instruction::isBinaryOp(Opcode: getOpcode()))
287 return true;
288
289 switch (Opcode) {
290 case VPInstruction::BranchOnCond:
291 case VPInstruction::BranchOnCount:
292 case VPInstruction::CalculateTripCountMinusVF:
293 case VPInstruction::CanonicalIVIncrementForPart:
294 case VPInstruction::ComputeReductionResult:
295 case VPInstruction::PtrAdd:
296 case VPInstruction::ExplicitVectorLength:
297 return true;
298 default:
299 return false;
300 }
301}
302
303Value *VPInstruction::generatePerLane(VPTransformState &State,
304 const VPIteration &Lane) {
305 IRBuilderBase &Builder = State.Builder;
306
307 assert(getOpcode() == VPInstruction::PtrAdd &&
308 "only PtrAdd opcodes are supported for now");
309 return Builder.CreatePtrAdd(Ptr: State.get(Def: getOperand(N: 0), Instance: Lane),
310 Offset: State.get(Def: getOperand(N: 1), Instance: Lane), Name);
311}
312
313Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
314 IRBuilderBase &Builder = State.Builder;
315
316 if (Instruction::isBinaryOp(Opcode: getOpcode())) {
317 bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(Def: this);
318 if (Part != 0 && vputils::onlyFirstPartUsed(Def: this))
319 return State.get(Def: this, Part: 0, IsScalar: OnlyFirstLaneUsed);
320
321 Value *A = State.get(Def: getOperand(N: 0), Part, IsScalar: OnlyFirstLaneUsed);
322 Value *B = State.get(Def: getOperand(N: 1), Part, IsScalar: OnlyFirstLaneUsed);
323 auto *Res =
324 Builder.CreateBinOp(Opc: (Instruction::BinaryOps)getOpcode(), LHS: A, RHS: B, Name);
325 if (auto *I = dyn_cast<Instruction>(Val: Res))
326 setFlags(I);
327 return Res;
328 }
329
330 switch (getOpcode()) {
331 case VPInstruction::Not: {
332 Value *A = State.get(Def: getOperand(N: 0), Part);
333 return Builder.CreateNot(V: A, Name);
334 }
335 case Instruction::ICmp: {
336 Value *A = State.get(Def: getOperand(N: 0), Part);
337 Value *B = State.get(Def: getOperand(N: 1), Part);
338 return Builder.CreateCmp(Pred: getPredicate(), LHS: A, RHS: B, Name);
339 }
340 case Instruction::Select: {
341 Value *Cond = State.get(Def: getOperand(N: 0), Part);
342 Value *Op1 = State.get(Def: getOperand(N: 1), Part);
343 Value *Op2 = State.get(Def: getOperand(N: 2), Part);
344 return Builder.CreateSelect(C: Cond, True: Op1, False: Op2, Name);
345 }
346 case VPInstruction::ActiveLaneMask: {
347 // Get first lane of vector induction variable.
348 Value *VIVElem0 = State.get(Def: getOperand(N: 0), Instance: VPIteration(Part, 0));
349 // Get the original loop tripcount.
350 Value *ScalarTC = State.get(Def: getOperand(N: 1), Instance: VPIteration(Part, 0));
351
352 // If this part of the active lane mask is scalar, generate the CMP directly
353 // to avoid unnecessary extracts.
354 if (State.VF.isScalar())
355 return Builder.CreateCmp(Pred: CmpInst::Predicate::ICMP_ULT, LHS: VIVElem0, RHS: ScalarTC,
356 Name);
357
358 auto *Int1Ty = Type::getInt1Ty(C&: Builder.getContext());
359 auto *PredTy = VectorType::get(ElementType: Int1Ty, EC: State.VF);
360 return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
361 {PredTy, ScalarTC->getType()},
362 {VIVElem0, ScalarTC}, nullptr, Name);
363 }
364 case VPInstruction::FirstOrderRecurrenceSplice: {
365 // Generate code to combine the previous and current values in vector v3.
366 //
367 // vector.ph:
368 // v_init = vector(..., ..., ..., a[-1])
369 // br vector.body
370 //
371 // vector.body
372 // i = phi [0, vector.ph], [i+4, vector.body]
373 // v1 = phi [v_init, vector.ph], [v2, vector.body]
374 // v2 = a[i, i+1, i+2, i+3];
375 // v3 = vector(v1(3), v2(0, 1, 2))
376
377 // For the first part, use the recurrence phi (v1), otherwise v2.
378 auto *V1 = State.get(Def: getOperand(N: 0), Part: 0);
379 Value *PartMinus1 = Part == 0 ? V1 : State.get(Def: getOperand(N: 1), Part: Part - 1);
380 if (!PartMinus1->getType()->isVectorTy())
381 return PartMinus1;
382 Value *V2 = State.get(Def: getOperand(N: 1), Part);
383 return Builder.CreateVectorSplice(V1: PartMinus1, V2, Imm: -1, Name);
384 }
385 case VPInstruction::CalculateTripCountMinusVF: {
386 if (Part != 0)
387 return State.get(Def: this, Part: 0, /*IsScalar*/ true);
388
389 Value *ScalarTC = State.get(Def: getOperand(N: 0), Instance: {0, 0});
390 Value *Step =
391 createStepForVF(B&: Builder, Ty: ScalarTC->getType(), VF: State.VF, Step: State.UF);
392 Value *Sub = Builder.CreateSub(LHS: ScalarTC, RHS: Step);
393 Value *Cmp = Builder.CreateICmp(P: CmpInst::Predicate::ICMP_UGT, LHS: ScalarTC, RHS: Step);
394 Value *Zero = ConstantInt::get(Ty: ScalarTC->getType(), V: 0);
395 return Builder.CreateSelect(C: Cmp, True: Sub, False: Zero);
396 }
397 case VPInstruction::ExplicitVectorLength: {
    // Helper to compute the effective vector length (EVL) for a given AVL.
399 auto GetEVL = [=](VPTransformState &State, Value *AVL) {
400 assert(AVL->getType()->isIntegerTy() &&
401 "Requested vector length should be an integer.");
402
403 // TODO: Add support for MaxSafeDist for correct loop emission.
404 assert(State.VF.isScalable() && "Expected scalable vector factor.");
405 Value *VFArg = State.Builder.getInt32(C: State.VF.getKnownMinValue());
406
407 Value *EVL = State.Builder.CreateIntrinsic(
408 State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
409 {AVL, VFArg, State.Builder.getTrue()});
410 return EVL;
411 };
    // TODO: Restructure this code with an explicit remainder loop so that the
    // vsetvli can be hoisted out of the main loop.
414 assert(Part == 0 && "No unrolling expected for predicated vectorization.");
415 // Compute VTC - IV as the AVL (requested vector length).
416 Value *Index = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0));
417 Value *TripCount = State.get(Def: getOperand(N: 1), Instance: VPIteration(0, 0));
418 Value *AVL = State.Builder.CreateSub(LHS: TripCount, RHS: Index);
419 Value *EVL = GetEVL(State, AVL);
420 return EVL;
421 }
422 case VPInstruction::CanonicalIVIncrementForPart: {
423 auto *IV = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0));
424 if (Part == 0)
425 return IV;
426
    // The canonical IV is incremented by the vectorization factor (number of
    // SIMD elements) times the unroll part.
429 Value *Step = createStepForVF(B&: Builder, Ty: IV->getType(), VF: State.VF, Step: Part);
430 return Builder.CreateAdd(LHS: IV, RHS: Step, Name, HasNUW: hasNoUnsignedWrap(),
431 HasNSW: hasNoSignedWrap());
432 }
433 case VPInstruction::BranchOnCond: {
434 if (Part != 0)
435 return nullptr;
436
437 Value *Cond = State.get(Def: getOperand(N: 0), Instance: VPIteration(Part, 0));
438 VPRegionBlock *ParentRegion = getParent()->getParent();
439 VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
440
441 // Replace the temporary unreachable terminator with a new conditional
442 // branch, hooking it up to backward destination for exiting blocks now and
443 // to forward destination(s) later when they are created.
444 BranchInst *CondBr =
445 Builder.CreateCondBr(Cond, True: Builder.GetInsertBlock(), False: nullptr);
446
447 if (getParent()->isExiting())
448 CondBr->setSuccessor(idx: 1, NewSucc: State.CFG.VPBB2IRBB[Header]);
449
450 CondBr->setSuccessor(idx: 0, NewSucc: nullptr);
451 Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
452 return CondBr;
453 }
454 case VPInstruction::BranchOnCount: {
455 if (Part != 0)
456 return nullptr;
457 // First create the compare.
458 Value *IV = State.get(Def: getOperand(N: 0), Part, /*IsScalar*/ true);
459 Value *TC = State.get(Def: getOperand(N: 1), Part, /*IsScalar*/ true);
460 Value *Cond = Builder.CreateICmpEQ(LHS: IV, RHS: TC);
461
462 // Now create the branch.
463 auto *Plan = getParent()->getPlan();
464 VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
465 VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
466
467 // Replace the temporary unreachable terminator with a new conditional
468 // branch, hooking it up to backward destination (the header) now and to the
469 // forward destination (the exit/middle block) later when it is created.
    // Note that CreateCondBr expects a valid BB as its first argument, so we
    // temporarily pass the current block and reset the successor to nullptr
    // afterwards.
472 BranchInst *CondBr = Builder.CreateCondBr(Cond, True: Builder.GetInsertBlock(),
473 False: State.CFG.VPBB2IRBB[Header]);
474 CondBr->setSuccessor(idx: 0, NewSucc: nullptr);
475 Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
476 return CondBr;
477 }
478 case VPInstruction::ComputeReductionResult: {
479 if (Part != 0)
480 return State.get(Def: this, Part: 0, /*IsScalar*/ true);
481
482 // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
483 // and will be removed by breaking up the recipe further.
484 auto *PhiR = cast<VPReductionPHIRecipe>(Val: getOperand(N: 0));
485 auto *OrigPhi = cast<PHINode>(Val: PhiR->getUnderlyingValue());
486 // Get its reduction variable descriptor.
487 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
488
489 RecurKind RK = RdxDesc.getRecurrenceKind();
490
491 VPValue *LoopExitingDef = getOperand(N: 1);
492 Type *PhiTy = OrigPhi->getType();
493 VectorParts RdxParts(State.UF);
494 for (unsigned Part = 0; Part < State.UF; ++Part)
495 RdxParts[Part] = State.get(Def: LoopExitingDef, Part, IsScalar: PhiR->isInLoop());
496
497 // If the vector reduction can be performed in a smaller type, we truncate
498 // then extend the loop exit value to enable InstCombine to evaluate the
499 // entire expression in the smaller type.
500 // TODO: Handle this in truncateToMinBW.
501 if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
502 Type *RdxVecTy = VectorType::get(ElementType: RdxDesc.getRecurrenceType(), EC: State.VF);
503 for (unsigned Part = 0; Part < State.UF; ++Part)
504 RdxParts[Part] = Builder.CreateTrunc(V: RdxParts[Part], DestTy: RdxVecTy);
505 }
506 // Reduce all of the unrolled parts into a single vector.
507 Value *ReducedPartRdx = RdxParts[0];
508 unsigned Op = RecurrenceDescriptor::getOpcode(Kind: RK);
509
510 if (PhiR->isOrdered()) {
511 ReducedPartRdx = RdxParts[State.UF - 1];
512 } else {
513 // Floating-point operations should have some FMF to enable the reduction.
514 IRBuilderBase::FastMathFlagGuard FMFG(Builder);
515 Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
516 for (unsigned Part = 1; Part < State.UF; ++Part) {
517 Value *RdxPart = RdxParts[Part];
518 if (Op != Instruction::ICmp && Op != Instruction::FCmp)
519 ReducedPartRdx = Builder.CreateBinOp(
520 Opc: (Instruction::BinaryOps)Op, LHS: RdxPart, RHS: ReducedPartRdx, Name: "bin.rdx");
521 else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK)) {
522 TrackingVH<Value> ReductionStartValue =
523 RdxDesc.getRecurrenceStartValue();
524 ReducedPartRdx = createAnyOfOp(Builder, StartVal: ReductionStartValue, RK,
525 Left: ReducedPartRdx, Right: RdxPart);
526 } else
527 ReducedPartRdx = createMinMaxOp(Builder, RK, Left: ReducedPartRdx, Right: RdxPart);
528 }
529 }
530
    // Create the reduction after the loop. Note that in-loop reductions create
    // the target reduction inside the loop using a Reduction recipe.
533 if (State.VF.isVector() && !PhiR->isInLoop()) {
534 ReducedPartRdx =
535 createTargetReduction(B&: Builder, Desc: RdxDesc, Src: ReducedPartRdx, OrigPhi);
536 // If the reduction can be performed in a smaller type, we need to extend
537 // the reduction to the wider type before we branch to the original loop.
538 if (PhiTy != RdxDesc.getRecurrenceType())
539 ReducedPartRdx = RdxDesc.isSigned()
540 ? Builder.CreateSExt(V: ReducedPartRdx, DestTy: PhiTy)
541 : Builder.CreateZExt(V: ReducedPartRdx, DestTy: PhiTy);
542 }
543
544 // If there were stores of the reduction value to a uniform memory address
545 // inside the loop, create the final store here.
546 if (StoreInst *SI = RdxDesc.IntermediateStore) {
547 auto *NewSI = Builder.CreateAlignedStore(
548 Val: ReducedPartRdx, Ptr: SI->getPointerOperand(), Align: SI->getAlign());
549 propagateMetadata(I: NewSI, VL: SI);
550 }
551
552 return ReducedPartRdx;
553 }
554 case VPInstruction::PtrAdd: {
555 assert(vputils::onlyFirstLaneUsed(this) &&
556 "can only generate first lane for PtrAdd");
557 Value *Ptr = State.get(Def: getOperand(N: 0), Part, /* IsScalar */ true);
558 Value *Addend = State.get(Def: getOperand(N: 1), Part, /* IsScalar */ true);
559 return Builder.CreatePtrAdd(Ptr, Offset: Addend, Name);
560 }
561 default:
562 llvm_unreachable("Unsupported opcode for instruction");
563 }
564}
565
566#if !defined(NDEBUG)
567bool VPInstruction::isFPMathOp() const {
568 // Inspired by FPMathOperator::classof. Notable differences are that we don't
569 // support Call, PHI and Select opcodes here yet.
570 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
571 Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
572 Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
573 Opcode == Instruction::FCmp || Opcode == Instruction::Select;
574}
575#endif
576
577void VPInstruction::execute(VPTransformState &State) {
578 assert(!State.Instance && "VPInstruction executing an Instance");
579 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
580 assert((hasFastMathFlags() == isFPMathOp() ||
581 getOpcode() == Instruction::Select) &&
582 "Recipe not a FPMathOp but has fast-math flags?");
583 if (hasFastMathFlags())
584 State.Builder.setFastMathFlags(getFastMathFlags());
585 State.setDebugLocFrom(getDebugLoc());
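  // Only generate a scalar value for the first lane when the opcode supports
  // it and either all users solely use the first lane or the result is a
  // reduction result, which is scalar by construction.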
586 bool GeneratesPerFirstLaneOnly =
587 canGenerateScalarForFirstLane() &&
588 (vputils::onlyFirstLaneUsed(Def: this) ||
589 getOpcode() == VPInstruction::ComputeReductionResult);
590 bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
591 for (unsigned Part = 0; Part < State.UF; ++Part) {
592 if (GeneratesPerAllLanes) {
593 for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
594 Lane != NumLanes; ++Lane) {
595 Value *GeneratedValue = generatePerLane(State, Lane: VPIteration(Part, Lane));
596 assert(GeneratedValue && "generatePerLane must produce a value");
597 State.set(Def: this, V: GeneratedValue, Instance: VPIteration(Part, Lane));
598 }
599 continue;
600 }
601
602 Value *GeneratedValue = generatePerPart(State, Part);
603 if (!hasResult())
604 continue;
605 assert(GeneratedValue && "generatePerPart must produce a value");
606 assert((GeneratedValue->getType()->isVectorTy() ==
607 !GeneratesPerFirstLaneOnly ||
608 State.VF.isScalar()) &&
609 "scalar value but not only first lane defined");
610 State.set(Def: this, V: GeneratedValue, Part,
611 /*IsScalar*/ GeneratesPerFirstLaneOnly);
612 }
613}
614
615bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
616 assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
617 if (Instruction::isBinaryOp(Opcode: getOpcode()))
618 return vputils::onlyFirstLaneUsed(Def: this);
619
620 switch (getOpcode()) {
621 default:
622 return false;
623 case Instruction::ICmp:
624 case VPInstruction::PtrAdd:
625 // TODO: Cover additional opcodes.
626 return vputils::onlyFirstLaneUsed(Def: this);
627 case VPInstruction::ActiveLaneMask:
628 case VPInstruction::ExplicitVectorLength:
629 case VPInstruction::CalculateTripCountMinusVF:
630 case VPInstruction::CanonicalIVIncrementForPart:
631 case VPInstruction::BranchOnCount:
632 return true;
633 };
634 llvm_unreachable("switch should return");
635}
636
637#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
638void VPInstruction::dump() const {
639 VPSlotTracker SlotTracker(getParent()->getPlan());
640 print(O&: dbgs(), Indent: "", SlotTracker);
641}
642
643void VPInstruction::print(raw_ostream &O, const Twine &Indent,
644 VPSlotTracker &SlotTracker) const {
645 O << Indent << "EMIT ";
646
647 if (hasResult()) {
648 printAsOperand(OS&: O, Tracker&: SlotTracker);
649 O << " = ";
650 }
651
652 switch (getOpcode()) {
653 case VPInstruction::Not:
654 O << "not";
655 break;
656 case VPInstruction::SLPLoad:
657 O << "combined load";
658 break;
659 case VPInstruction::SLPStore:
660 O << "combined store";
661 break;
662 case VPInstruction::ActiveLaneMask:
663 O << "active lane mask";
664 break;
665 case VPInstruction::ExplicitVectorLength:
666 O << "EXPLICIT-VECTOR-LENGTH";
667 break;
668 case VPInstruction::FirstOrderRecurrenceSplice:
669 O << "first-order splice";
670 break;
671 case VPInstruction::BranchOnCond:
672 O << "branch-on-cond";
673 break;
674 case VPInstruction::CalculateTripCountMinusVF:
675 O << "TC > VF ? TC - VF : 0";
676 break;
677 case VPInstruction::CanonicalIVIncrementForPart:
678 O << "VF * Part +";
679 break;
680 case VPInstruction::BranchOnCount:
681 O << "branch-on-count";
682 break;
683 case VPInstruction::ComputeReductionResult:
684 O << "compute-reduction-result";
685 break;
686 case VPInstruction::PtrAdd:
687 O << "ptradd";
688 break;
689 default:
690 O << Instruction::getOpcodeName(Opcode: getOpcode());
691 }
692
693 printFlags(O);
694 printOperands(O, SlotTracker);
695
696 if (auto DL = getDebugLoc()) {
697 O << ", !dbg ";
698 DL.print(OS&: O);
699 }
700}
701#endif
702
703void VPWidenCallRecipe::execute(VPTransformState &State) {
704 assert(State.VF.isVector() && "not widening");
705 auto &CI = *cast<CallInst>(Val: getUnderlyingInstr());
706 assert(!isa<DbgInfoIntrinsic>(CI) &&
707 "DbgInfoIntrinsic should have been dropped during VPlan construction");
708 State.setDebugLocFrom(getDebugLoc());
709
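  // Widen the call by either calling a vector intrinsic or a pre-selected
  // vector library function variant.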
710 bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
711 FunctionType *VFTy = nullptr;
712 if (Variant)
713 VFTy = Variant->getFunctionType();
714 for (unsigned Part = 0; Part < State.UF; ++Part) {
715 SmallVector<Type *, 2> TysForDecl;
716 // Add return type if intrinsic is overloaded on it.
717 if (UseIntrinsic &&
718 isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: -1))
719 TysForDecl.push_back(
720 Elt: VectorType::get(ElementType: CI.getType()->getScalarType(), EC: State.VF));
721 SmallVector<Value *, 4> Args;
722 for (const auto &I : enumerate(First: operands())) {
723 // Some intrinsics have a scalar argument - don't replace it with a
724 // vector.
725 Value *Arg;
726 if (UseIntrinsic &&
727 isVectorIntrinsicWithScalarOpAtArg(ID: VectorIntrinsicID, ScalarOpdIdx: I.index()))
728 Arg = State.get(Def: I.value(), Instance: VPIteration(0, 0));
729 // Some vectorized function variants may also take a scalar argument,
730 // e.g. linear parameters for pointers. This needs to be the scalar value
731 // from the start of the respective part when interleaving.
732 else if (VFTy && !VFTy->getParamType(i: I.index())->isVectorTy())
733 Arg = State.get(Def: I.value(), Instance: VPIteration(Part, 0));
734 else
735 Arg = State.get(Def: I.value(), Part);
736 if (UseIntrinsic &&
737 isVectorIntrinsicWithOverloadTypeAtArg(ID: VectorIntrinsicID, OpdIdx: I.index()))
738 TysForDecl.push_back(Elt: Arg->getType());
739 Args.push_back(Elt: Arg);
740 }
741
742 Function *VectorF;
743 if (UseIntrinsic) {
744 // Use vector version of the intrinsic.
745 Module *M = State.Builder.GetInsertBlock()->getModule();
746 VectorF = Intrinsic::getDeclaration(M, id: VectorIntrinsicID, Tys: TysForDecl);
747 assert(VectorF && "Can't retrieve vector intrinsic.");
748 } else {
      assert(Variant != nullptr && "Can't create vector function.");
752 VectorF = Variant;
753 }
754
755 SmallVector<OperandBundleDef, 1> OpBundles;
756 CI.getOperandBundlesAsDefs(Defs&: OpBundles);
757 CallInst *V = State.Builder.CreateCall(Callee: VectorF, Args, OpBundles);
758
759 if (isa<FPMathOperator>(Val: V))
760 V->copyFastMathFlags(I: &CI);
761
762 if (!V->getType()->isVoidTy())
763 State.set(Def: this, V, Part);
764 State.addMetadata(To: V, From: &CI);
765 }
766}
767
768#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
769void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
770 VPSlotTracker &SlotTracker) const {
771 O << Indent << "WIDEN-CALL ";
772
773 auto *CI = cast<CallInst>(Val: getUnderlyingInstr());
774 if (CI->getType()->isVoidTy())
775 O << "void ";
776 else {
777 printAsOperand(OS&: O, Tracker&: SlotTracker);
778 O << " = ";
779 }
780
781 O << "call @" << CI->getCalledFunction()->getName() << "(";
782 printOperands(O, SlotTracker);
783 O << ")";
784
785 if (VectorIntrinsicID)
786 O << " (using vector intrinsic)";
787 else {
788 O << " (using library function";
789 if (Variant->hasName())
790 O << ": " << Variant->getName();
791 O << ")";
792 }
793}
794
795void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
796 VPSlotTracker &SlotTracker) const {
797 O << Indent << "WIDEN-SELECT ";
798 printAsOperand(OS&: O, Tracker&: SlotTracker);
799 O << " = select ";
800 getOperand(N: 0)->printAsOperand(OS&: O, Tracker&: SlotTracker);
801 O << ", ";
802 getOperand(N: 1)->printAsOperand(OS&: O, Tracker&: SlotTracker);
803 O << ", ";
804 getOperand(N: 2)->printAsOperand(OS&: O, Tracker&: SlotTracker);
805 O << (isInvariantCond() ? " (condition is loop invariant)" : "");
806}
807#endif
808
809void VPWidenSelectRecipe::execute(VPTransformState &State) {
810 State.setDebugLocFrom(getDebugLoc());
811
812 // The condition can be loop invariant but still defined inside the
813 // loop. This means that we can't just use the original 'cond' value.
814 // We have to take the 'vectorized' value and pick the first lane.
815 // Instcombine will make this a no-op.
816 auto *InvarCond =
817 isInvariantCond() ? State.get(Def: getCond(), Instance: VPIteration(0, 0)) : nullptr;
818
819 for (unsigned Part = 0; Part < State.UF; ++Part) {
820 Value *Cond = InvarCond ? InvarCond : State.get(Def: getCond(), Part);
821 Value *Op0 = State.get(Def: getOperand(N: 1), Part);
822 Value *Op1 = State.get(Def: getOperand(N: 2), Part);
823 Value *Sel = State.Builder.CreateSelect(C: Cond, True: Op0, False: Op1);
824 State.set(Def: this, V: Sel, Part);
825 State.addMetadata(To: Sel, From: dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()));
826 }
827}
828
829VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
830 const FastMathFlags &FMF) {
831 AllowReassoc = FMF.allowReassoc();
832 NoNaNs = FMF.noNaNs();
833 NoInfs = FMF.noInfs();
834 NoSignedZeros = FMF.noSignedZeros();
835 AllowReciprocal = FMF.allowReciprocal();
836 AllowContract = FMF.allowContract();
837 ApproxFunc = FMF.approxFunc();
838}
839
840#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
841void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
842 switch (OpType) {
843 case OperationType::Cmp:
844 O << " " << CmpInst::getPredicateName(P: getPredicate());
845 break;
846 case OperationType::DisjointOp:
847 if (DisjointFlags.IsDisjoint)
848 O << " disjoint";
849 break;
850 case OperationType::PossiblyExactOp:
851 if (ExactFlags.IsExact)
852 O << " exact";
853 break;
854 case OperationType::OverflowingBinOp:
855 if (WrapFlags.HasNUW)
856 O << " nuw";
857 if (WrapFlags.HasNSW)
858 O << " nsw";
859 break;
860 case OperationType::FPMathOp:
861 getFastMathFlags().print(O);
862 break;
863 case OperationType::GEPOp:
864 if (GEPFlags.IsInBounds)
865 O << " inbounds";
866 break;
867 case OperationType::NonNegOp:
868 if (NonNegFlags.NonNeg)
869 O << " nneg";
870 break;
871 case OperationType::Other:
872 break;
873 }
874 if (getNumOperands() > 0)
875 O << " ";
876}
877#endif
878
879void VPWidenRecipe::execute(VPTransformState &State) {
880 State.setDebugLocFrom(getDebugLoc());
881 auto &Builder = State.Builder;
882 switch (Opcode) {
883 case Instruction::Call:
884 case Instruction::Br:
885 case Instruction::PHI:
886 case Instruction::GetElementPtr:
887 case Instruction::Select:
888 llvm_unreachable("This instruction is handled by a different recipe.");
889 case Instruction::UDiv:
890 case Instruction::SDiv:
891 case Instruction::SRem:
892 case Instruction::URem:
893 case Instruction::Add:
894 case Instruction::FAdd:
895 case Instruction::Sub:
896 case Instruction::FSub:
897 case Instruction::FNeg:
898 case Instruction::Mul:
899 case Instruction::FMul:
900 case Instruction::FDiv:
901 case Instruction::FRem:
902 case Instruction::Shl:
903 case Instruction::LShr:
904 case Instruction::AShr:
905 case Instruction::And:
906 case Instruction::Or:
907 case Instruction::Xor: {
908 // Just widen unops and binops.
909 for (unsigned Part = 0; Part < State.UF; ++Part) {
910 SmallVector<Value *, 2> Ops;
911 for (VPValue *VPOp : operands())
912 Ops.push_back(Elt: State.get(Def: VPOp, Part));
913
914 Value *V = Builder.CreateNAryOp(Opc: Opcode, Ops);
915
916 if (auto *VecOp = dyn_cast<Instruction>(Val: V))
917 setFlags(VecOp);
918
919 // Use this vector value for all users of the original instruction.
920 State.set(Def: this, V, Part);
921 State.addMetadata(To: V, From: dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()));
922 }
923
924 break;
925 }
926 case Instruction::Freeze: {
927 for (unsigned Part = 0; Part < State.UF; ++Part) {
928 Value *Op = State.get(Def: getOperand(N: 0), Part);
929
930 Value *Freeze = Builder.CreateFreeze(V: Op);
931 State.set(Def: this, V: Freeze, Part);
932 }
933 break;
934 }
935 case Instruction::ICmp:
936 case Instruction::FCmp: {
937 // Widen compares. Generate vector compares.
938 bool FCmp = Opcode == Instruction::FCmp;
939 for (unsigned Part = 0; Part < State.UF; ++Part) {
940 Value *A = State.get(Def: getOperand(N: 0), Part);
941 Value *B = State.get(Def: getOperand(N: 1), Part);
942 Value *C = nullptr;
943 if (FCmp) {
944 // Propagate fast math flags.
945 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
946 if (auto *I = dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()))
947 Builder.setFastMathFlags(I->getFastMathFlags());
948 C = Builder.CreateFCmp(P: getPredicate(), LHS: A, RHS: B);
949 } else {
950 C = Builder.CreateICmp(P: getPredicate(), LHS: A, RHS: B);
951 }
952 State.set(Def: this, V: C, Part);
953 State.addMetadata(To: C, From: dyn_cast_or_null<Instruction>(Val: getUnderlyingValue()));
954 }
955
956 break;
957 }
958 default:
959 // This instruction is not vectorized by simple widening.
960 LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
961 << Instruction::getOpcodeName(Opcode));
962 llvm_unreachable("Unhandled instruction!");
963 } // end of switch.
964
965#if !defined(NDEBUG)
966 // Verify that VPlan type inference results agree with the type of the
967 // generated values.
968 for (unsigned Part = 0; Part < State.UF; ++Part) {
969 assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
970 State.VF) == State.get(this, Part)->getType() &&
971 "inferred type and type from generated instructions do not match");
972 }
973#endif
974}
975
976#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
977void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
978 VPSlotTracker &SlotTracker) const {
979 O << Indent << "WIDEN ";
980 printAsOperand(OS&: O, Tracker&: SlotTracker);
981 O << " = " << Instruction::getOpcodeName(Opcode);
982 printFlags(O);
983 printOperands(O, SlotTracker);
984}
985#endif
986
987void VPWidenCastRecipe::execute(VPTransformState &State) {
988 State.setDebugLocFrom(getDebugLoc());
989 auto &Builder = State.Builder;
  // Vectorize casts.
991 assert(State.VF.isVector() && "Not vectorizing?");
992 Type *DestTy = VectorType::get(ElementType: getResultType(), EC: State.VF);
993 VPValue *Op = getOperand(N: 0);
994 for (unsigned Part = 0; Part < State.UF; ++Part) {
995 if (Part > 0 && Op->isLiveIn()) {
996 // FIXME: Remove once explicit unrolling is implemented using VPlan.
997 State.set(Def: this, V: State.get(Def: this, Part: 0), Part);
998 continue;
999 }
1000 Value *A = State.get(Def: Op, Part);
1001 Value *Cast = Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: A, DestTy);
1002 State.set(Def: this, V: Cast, Part);
1003 State.addMetadata(To: Cast, From: cast_or_null<Instruction>(Val: getUnderlyingValue()));
1004 }
1005}
1006
1007#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1008void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
1009 VPSlotTracker &SlotTracker) const {
1010 O << Indent << "WIDEN-CAST ";
1011 printAsOperand(OS&: O, Tracker&: SlotTracker);
1012 O << " = " << Instruction::getOpcodeName(Opcode) << " ";
1013 printFlags(O);
1014 printOperands(O, SlotTracker);
1015 O << " to " << *getResultType();
1016}
1017#endif
1018
/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is only relevant for floating-point induction variables.
1023static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
1024 Instruction::BinaryOps BinOp, ElementCount VF,
1025 IRBuilderBase &Builder) {
1026 assert(VF.isVector() && "only vector VFs are supported");
1027
1028 // Create and check the types.
1029 auto *ValVTy = cast<VectorType>(Val: Val->getType());
1030 ElementCount VLen = ValVTy->getElementCount();
1031
1032 Type *STy = Val->getType()->getScalarType();
1033 assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1034 "Induction Step must be an integer or FP");
1035 assert(Step->getType() == STy && "Step has wrong type");
1036
1037 SmallVector<Constant *, 8> Indices;
1038
1039 // Create a vector of consecutive numbers from zero to VF.
1040 VectorType *InitVecValVTy = ValVTy;
1041 if (STy->isFloatingPointTy()) {
1042 Type *InitVecValSTy =
1043 IntegerType::get(C&: STy->getContext(), NumBits: STy->getScalarSizeInBits());
1044 InitVecValVTy = VectorType::get(ElementType: InitVecValSTy, EC: VLen);
1045 }
1046 Value *InitVec = Builder.CreateStepVector(DstType: InitVecValVTy);
1047
1048 // Splat the StartIdx
1049 Value *StartIdxSplat = Builder.CreateVectorSplat(EC: VLen, V: StartIdx);
1050
1051 if (STy->isIntegerTy()) {
1052 InitVec = Builder.CreateAdd(LHS: InitVec, RHS: StartIdxSplat);
1053 Step = Builder.CreateVectorSplat(EC: VLen, V: Step);
1054 assert(Step->getType() == Val->getType() && "Invalid step vec");
1055 // FIXME: The newly created binary instructions should contain nsw/nuw
1056 // flags, which can be found from the original scalar operations.
1057 Step = Builder.CreateMul(LHS: InitVec, RHS: Step);
1058 return Builder.CreateAdd(LHS: Val, RHS: Step, Name: "induction");
1059 }
1060
1061 // Floating point induction.
1062 assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1063 "Binary Opcode should be specified for FP induction");
1064 InitVec = Builder.CreateUIToFP(V: InitVec, DestTy: ValVTy);
1065 InitVec = Builder.CreateFAdd(L: InitVec, R: StartIdxSplat);
1066
1067 Step = Builder.CreateVectorSplat(EC: VLen, V: Step);
1068 Value *MulOp = Builder.CreateFMul(L: InitVec, R: Step);
1069 return Builder.CreateBinOp(Opc: BinOp, LHS: Val, RHS: MulOp, Name: "induction");
1070}
1071
1072/// A helper function that returns an integer or floating-point constant with
1073/// value C.
1074static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
1075 return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, V: C)
1076 : ConstantFP::get(Ty, V: C);
1077}
1078
1079static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1080 ElementCount VF) {
1081 assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1082 Type *IntTy = IntegerType::get(C&: FTy->getContext(), NumBits: FTy->getScalarSizeInBits());
1083 Value *RuntimeVF = getRuntimeVF(B, Ty: IntTy, VF);
1084 return B.CreateUIToFP(V: RuntimeVF, DestTy: FTy);
1085}
1086
1087void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
1088 assert(!State.Instance && "Int or FP induction being replicated.");
1089
1090 Value *Start = getStartValue()->getLiveInIRValue();
1091 const InductionDescriptor &ID = getInductionDescriptor();
1092 TruncInst *Trunc = getTruncInst();
1093 IRBuilderBase &Builder = State.Builder;
1094 assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1095 assert(State.VF.isVector() && "must have vector VF");
1096
1097 // The value from the original loop to which we are mapping the new induction
1098 // variable.
1099 Instruction *EntryVal = Trunc ? cast<Instruction>(Val: Trunc) : IV;
1100
1101 // Fast-math-flags propagate from the original induction instruction.
1102 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
1103 if (ID.getInductionBinOp() && isa<FPMathOperator>(Val: ID.getInductionBinOp()))
1104 Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
1105
1106 // Now do the actual transformations, and start with fetching the step value.
1107 Value *Step = State.get(Def: getStepValue(), Instance: VPIteration(0, 0));
1108
1109 assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1110 "Expected either an induction phi-node or a truncate of it!");
1111
1112 // Construct the initial value of the vector IV in the vector loop preheader
1113 auto CurrIP = Builder.saveIP();
1114 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this);
1115 Builder.SetInsertPoint(VectorPH->getTerminator());
1116 if (isa<TruncInst>(Val: EntryVal)) {
1117 assert(Start->getType()->isIntegerTy() &&
1118 "Truncation requires an integer type");
1119 auto *TruncType = cast<IntegerType>(Val: EntryVal->getType());
1120 Step = Builder.CreateTrunc(V: Step, DestTy: TruncType);
1121 Start = Builder.CreateCast(Op: Instruction::Trunc, V: Start, DestTy: TruncType);
1122 }
1123
1124 Value *Zero = getSignedIntOrFpConstant(Ty: Start->getType(), C: 0);
1125 Value *SplatStart = Builder.CreateVectorSplat(EC: State.VF, V: Start);
1126 Value *SteppedStart = getStepVector(
1127 Val: SplatStart, StartIdx: Zero, Step, BinOp: ID.getInductionOpcode(), VF: State.VF, Builder&: State.Builder);
1128
1129 // We create vector phi nodes for both integer and floating-point induction
1130 // variables. Here, we determine the kind of arithmetic we will perform.
1131 Instruction::BinaryOps AddOp;
1132 Instruction::BinaryOps MulOp;
1133 if (Step->getType()->isIntegerTy()) {
1134 AddOp = Instruction::Add;
1135 MulOp = Instruction::Mul;
1136 } else {
1137 AddOp = ID.getInductionOpcode();
1138 MulOp = Instruction::FMul;
1139 }
1140
1141 // Multiply the vectorization factor by the step using integer or
1142 // floating-point arithmetic as appropriate.
1143 Type *StepType = Step->getType();
1144 Value *RuntimeVF;
1145 if (Step->getType()->isFloatingPointTy())
1146 RuntimeVF = getRuntimeVFAsFloat(B&: Builder, FTy: StepType, VF: State.VF);
1147 else
1148 RuntimeVF = getRuntimeVF(B&: Builder, Ty: StepType, VF: State.VF);
1149 Value *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: Step, RHS: RuntimeVF);
1150
1151 // Create a vector splat to use in the induction update.
1152 //
1153 // FIXME: If the step is non-constant, we create the vector splat with
1154 // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1155 // handle a constant vector splat.
1156 Value *SplatVF = isa<Constant>(Val: Mul)
1157 ? ConstantVector::getSplat(EC: State.VF, Elt: cast<Constant>(Val: Mul))
1158 : Builder.CreateVectorSplat(EC: State.VF, V: Mul);
1159 Builder.restoreIP(IP: CurrIP);
1160
1161 // We may need to add the step a number of times, depending on the unroll
1162 // factor. The last of those goes into the PHI.
1163 PHINode *VecInd = PHINode::Create(Ty: SteppedStart->getType(), NumReservedValues: 2, NameStr: "vec.ind");
1164 VecInd->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt());
1165 VecInd->setDebugLoc(EntryVal->getDebugLoc());
1166 Instruction *LastInduction = VecInd;
1167 for (unsigned Part = 0; Part < State.UF; ++Part) {
1168 State.set(Def: this, V: LastInduction, Part);
1169
1170 if (isa<TruncInst>(Val: EntryVal))
1171 State.addMetadata(To: LastInduction, From: EntryVal);
1172
1173 LastInduction = cast<Instruction>(
1174 Val: Builder.CreateBinOp(Opc: AddOp, LHS: LastInduction, RHS: SplatVF, Name: "step.add"));
1175 LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1176 }
1177
1178 LastInduction->setName("vec.ind.next");
1179 VecInd->addIncoming(V: SteppedStart, BB: VectorPH);
1180 // Add induction update using an incorrect block temporarily. The phi node
1181 // will be fixed after VPlan execution. Note that at this point the latch
1182 // block cannot be used, as it does not exist yet.
1183 // TODO: Model increment value in VPlan, by turning the recipe into a
1184 // multi-def and a subclass of VPHeaderPHIRecipe.
1185 VecInd->addIncoming(V: LastInduction, BB: VectorPH);
1186}
1187
1188#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1189void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1190 VPSlotTracker &SlotTracker) const {
1191 O << Indent << "WIDEN-INDUCTION";
1192 if (getTruncInst()) {
1193 O << "\\l\"";
1194 O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
1195 O << " +\n" << Indent << "\" ";
1196 getVPValue(I: 0)->printAsOperand(OS&: O, Tracker&: SlotTracker);
1197 } else
1198 O << " " << VPlanIngredient(IV);
1199
1200 O << ", ";
1201 getStepValue()->printAsOperand(OS&: O, Tracker&: SlotTracker);
1202}
1203#endif
1204
1205bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1206 // The step may be defined by a recipe in the preheader (e.g. if it requires
1207 // SCEV expansion), but for the canonical induction the step is required to be
1208 // 1, which is represented as live-in.
1209 if (getStepValue()->getDefiningRecipe())
1210 return false;
1211 auto *StepC = dyn_cast<ConstantInt>(Val: getStepValue()->getLiveInIRValue());
1212 auto *StartC = dyn_cast<ConstantInt>(Val: getStartValue()->getLiveInIRValue());
1213 return StartC && StartC->isZero() && StepC && StepC->isOne();
1214}
1215
1216#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1217void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1218 VPSlotTracker &SlotTracker) const {
1219 O << Indent;
1220 printAsOperand(OS&: O, Tracker&: SlotTracker);
1221 O << Indent << "= DERIVED-IV ";
1222 getStartValue()->printAsOperand(OS&: O, Tracker&: SlotTracker);
1223 O << " + ";
1224 getOperand(N: 1)->printAsOperand(OS&: O, Tracker&: SlotTracker);
1225 O << " * ";
1226 getStepValue()->printAsOperand(OS&: O, Tracker&: SlotTracker);
1227}
1228#endif
1229
1230void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1231 // Fast-math-flags propagate from the original induction instruction.
1232 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1233 if (hasFastMathFlags())
1234 State.Builder.setFastMathFlags(getFastMathFlags());
1235
  // Compute scalar induction steps. BaseIV is the scalar induction variable on
  // which to base the steps, and Step is the size of each step.
1238
1239 Value *BaseIV = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0));
1240 Value *Step = State.get(Def: getStepValue(), Instance: VPIteration(0, 0));
1241 IRBuilderBase &Builder = State.Builder;
1242
1243 // Ensure step has the same type as that of scalar IV.
1244 Type *BaseIVTy = BaseIV->getType()->getScalarType();
1245 assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");
1246
1247 // We build scalar steps for both integer and floating-point induction
1248 // variables. Here, we determine the kind of arithmetic we will perform.
1249 Instruction::BinaryOps AddOp;
1250 Instruction::BinaryOps MulOp;
1251 if (BaseIVTy->isIntegerTy()) {
1252 AddOp = Instruction::Add;
1253 MulOp = Instruction::Mul;
1254 } else {
1255 AddOp = InductionOpcode;
1256 MulOp = Instruction::FMul;
1257 }
1258
  // Determine the number of scalar lanes we need to generate for each unroll
  // iteration: if only the first lane of this recipe is used, one is enough.
1261 bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def: this);
1262 // Compute the scalar steps and save the results in State.
1263 Type *IntStepTy =
1264 IntegerType::get(C&: BaseIVTy->getContext(), NumBits: BaseIVTy->getScalarSizeInBits());
1265 Type *VecIVTy = nullptr;
1266 Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1267 if (!FirstLaneOnly && State.VF.isScalable()) {
1268 VecIVTy = VectorType::get(ElementType: BaseIVTy, EC: State.VF);
1269 UnitStepVec =
1270 Builder.CreateStepVector(DstType: VectorType::get(ElementType: IntStepTy, EC: State.VF));
1271 SplatStep = Builder.CreateVectorSplat(EC: State.VF, V: Step);
1272 SplatIV = Builder.CreateVectorSplat(EC: State.VF, V: BaseIV);
1273 }
1274
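  // By default generate scalar steps for all parts and lanes; when executing a
  // single replicate instance, restrict generation to the requested part and
  // lane.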
1275 unsigned StartPart = 0;
1276 unsigned EndPart = State.UF;
1277 unsigned StartLane = 0;
1278 unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1279 if (State.Instance) {
1280 StartPart = State.Instance->Part;
1281 EndPart = StartPart + 1;
1282 StartLane = State.Instance->Lane.getKnownLane();
1283 EndLane = StartLane + 1;
1284 }
1285 for (unsigned Part = StartPart; Part < EndPart; ++Part) {
1286 Value *StartIdx0 = createStepForVF(B&: Builder, Ty: IntStepTy, VF: State.VF, Step: Part);
1287
1288 if (!FirstLaneOnly && State.VF.isScalable()) {
1289 auto *SplatStartIdx = Builder.CreateVectorSplat(EC: State.VF, V: StartIdx0);
1290 auto *InitVec = Builder.CreateAdd(LHS: SplatStartIdx, RHS: UnitStepVec);
1291 if (BaseIVTy->isFloatingPointTy())
1292 InitVec = Builder.CreateSIToFP(V: InitVec, DestTy: VecIVTy);
1293 auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: InitVec, RHS: SplatStep);
1294 auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: SplatIV, RHS: Mul);
1295 State.set(Def: this, V: Add, Part);
      // It is also useful to record the per-lane values for the known minimum
      // number of elements, so fall through and do that below. This improves
      // code quality when, for example, only the first element is extracted
      // later.
1299 }
1300
1301 if (BaseIVTy->isFloatingPointTy())
1302 StartIdx0 = Builder.CreateSIToFP(V: StartIdx0, DestTy: BaseIVTy);
1303
1304 for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1305 Value *StartIdx = Builder.CreateBinOp(
1306 Opc: AddOp, LHS: StartIdx0, RHS: getSignedIntOrFpConstant(Ty: BaseIVTy, C: Lane));
1307 // The step returned by `createStepForVF` is a runtime-evaluated value
1308 // when VF is scalable. Otherwise, it should be folded into a Constant.
1309 assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1310 "Expected StartIdx to be folded to a constant when VF is not "
1311 "scalable");
1312 auto *Mul = Builder.CreateBinOp(Opc: MulOp, LHS: StartIdx, RHS: Step);
1313 auto *Add = Builder.CreateBinOp(Opc: AddOp, LHS: BaseIV, RHS: Mul);
1314 State.set(Def: this, V: Add, Instance: VPIteration(Part, Lane));
1315 }
1316 }
1317}
1318
1319#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1320void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1321 VPSlotTracker &SlotTracker) const {
1322 O << Indent;
1323 printAsOperand(OS&: O, Tracker&: SlotTracker);
1324 O << " = SCALAR-STEPS ";
1325 printOperands(O, SlotTracker);
1326}
1327#endif
1328
1329void VPWidenGEPRecipe::execute(VPTransformState &State) {
1330 assert(State.VF.isVector() && "not widening");
1331 auto *GEP = cast<GetElementPtrInst>(Val: getUnderlyingInstr());
1332 // Construct a vector GEP by widening the operands of the scalar GEP as
1333 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1334 // results in a vector of pointers when at least one operand of the GEP
1335 // is vector-typed. Thus, to keep the representation compact, we only use
1336 // vector-typed operands for loop-varying values.
1337
1338 if (areAllOperandsInvariant()) {
1339 // If we are vectorizing, but the GEP has only loop-invariant operands,
1340 // the GEP we build (by only using vector-typed operands for
1341 // loop-varying values) would be a scalar pointer. Thus, to ensure we
1342 // produce a vector of pointers, we need to either arbitrarily pick an
1343 // operand to broadcast, or broadcast a clone of the original GEP.
1344 // Here, we broadcast a clone of the original.
1345 //
1346 // TODO: If at some point we decide to scalarize instructions having
1347 // loop-invariant operands, this special case will no longer be
1348 // required. We would add the scalarization decision to
1349 // collectLoopScalars() and teach getVectorValue() to broadcast
1350 // the lane-zero scalar value.
1351 SmallVector<Value *> Ops;
1352 for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1353 Ops.push_back(Elt: State.get(Def: getOperand(N: I), Instance: VPIteration(0, 0)));
1354
1355 auto *NewGEP =
1356 State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0],
1357 IdxList: ArrayRef(Ops).drop_front(), Name: "", IsInBounds: isInBounds());
1358 for (unsigned Part = 0; Part < State.UF; ++Part) {
1359 Value *EntryPart = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP);
1360 State.set(Def: this, V: EntryPart, Part);
1361 State.addMetadata(To: EntryPart, From: GEP);
1362 }
1363 } else {
1364 // If the GEP has at least one loop-varying operand, we are sure to
1365 // produce a vector of pointers. But if we are only unrolling, we want
1366 // to produce a scalar GEP for each unroll part. Thus, the GEP we
1367 // produce with the code below will be scalar (if VF == 1) or vector
1368 // (otherwise). Note that for the unroll-only case, we still maintain
1369 // values in the vector mapping with initVector, as we do for other
1370 // instructions.
1371 for (unsigned Part = 0; Part < State.UF; ++Part) {
1372 // The pointer operand of the new GEP. If it's loop-invariant, we
1373 // won't broadcast it.
1374 auto *Ptr = isPointerLoopInvariant()
1375 ? State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0))
1376 : State.get(Def: getOperand(N: 0), Part);
1377
1378 // Collect all the indices for the new GEP. If any index is
1379 // loop-invariant, we won't broadcast it.
1380 SmallVector<Value *, 4> Indices;
1381 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1382 VPValue *Operand = getOperand(N: I);
1383 if (isIndexLoopInvariant(I: I - 1))
1384 Indices.push_back(Elt: State.get(Def: Operand, Instance: VPIteration(0, 0)));
1385 else
1386 Indices.push_back(Elt: State.get(Def: Operand, Part));
1387 }
1388
1389 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1390 // but it should be a vector, otherwise.
1391 auto *NewGEP = State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr,
1392 IdxList: Indices, Name: "", IsInBounds: isInBounds());
1393 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1394 "NewGEP is not a pointer vector");
1395 State.set(Def: this, V: NewGEP, Part);
1396 State.addMetadata(To: NewGEP, From: GEP);
1397 }
1398 }
1399}
1400
1401#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1402void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1403 VPSlotTracker &SlotTracker) const {
1404 O << Indent << "WIDEN-GEP ";
1405 O << (isPointerLoopInvariant() ? "Inv" : "Var");
1406 for (size_t I = 0; I < getNumOperands() - 1; ++I)
1407 O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1408
1409 O << " ";
1410 printAsOperand(OS&: O, Tracker&: SlotTracker);
1411 O << " = getelementptr";
1412 printFlags(O);
1413 printOperands(O, SlotTracker);
1414}
1415#endif
1416
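// Compute the per-part start pointer of a consecutive wide memory access by
// stepping the scalar base pointer by Part * VF elements (or backwards from
// the last element for reverse accesses).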
void VPVectorPointerRecipe::execute(VPTransformState &State) {
1418 auto &Builder = State.Builder;
1419 State.setDebugLocFrom(getDebugLoc());
1420 for (unsigned Part = 0; Part < State.UF; ++Part) {
1421 // Calculate the pointer for the specific unroll-part.
1422 Value *PartPtr = nullptr;
1423 // Use i32 for the gep index type when the value is constant,
1424 // or query DataLayout for a more suitable index type otherwise.
1425 const DataLayout &DL =
1426 Builder.GetInsertBlock()->getModule()->getDataLayout();
1427 Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
1428 ? DL.getIndexType(PtrTy: IndexedTy->getPointerTo())
1429 : Builder.getInt32Ty();
1430 Value *Ptr = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0));
1431 bool InBounds = isInBounds();
1432 if (IsReverse) {
      // If the address is consecutive but reversed, then the wide load or
      // store needs to start at the last vector element.
1435 // RunTimeVF = VScale * VF.getKnownMinValue()
1436 // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
1437 Value *RunTimeVF = getRuntimeVF(B&: Builder, Ty: IndexTy, VF: State.VF);
1438 // NumElt = -Part * RunTimeVF
1439 Value *NumElt = Builder.CreateMul(
1440 LHS: ConstantInt::get(Ty: IndexTy, V: -(int64_t)Part), RHS: RunTimeVF);
1441 // LastLane = 1 - RunTimeVF
1442 Value *LastLane =
1443 Builder.CreateSub(LHS: ConstantInt::get(Ty: IndexTy, V: 1), RHS: RunTimeVF);
1444 PartPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: NumElt, Name: "", IsInBounds: InBounds);
1445 PartPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr: PartPtr, IdxList: LastLane, Name: "", IsInBounds: InBounds);
1446 } else {
1447 Value *Increment = createStepForVF(B&: Builder, Ty: IndexTy, VF: State.VF, Step: Part);
1448 PartPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: Increment, Name: "", IsInBounds: InBounds);
1449 }
1450
1451 State.set(Def: this, V: PartPtr, Part, /*IsScalar*/ true);
1452 }
1453}
1454
1455#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1456void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1457 VPSlotTracker &SlotTracker) const {
1458 O << Indent;
1459 printAsOperand(OS&: O, Tracker&: SlotTracker);
1460 O << " = vector-pointer ";
1461 if (IsReverse)
1462 O << "(reverse) ";
1463
1464 printOperands(O, SlotTracker);
1465}
1466#endif

void VPBlendRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, are taken from In0.
  VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "BLEND ";
  printAsOperand(O, SlotTracker);
  O << " =";
  if (getNumIncomingValues() == 1) {
    // Not a User of any mask: not really blending, this is a
    // single-predecessor phi.
    O << " ";
    getIncomingValue(0)->printAsOperand(O, SlotTracker);
  } else {
    for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
      O << " ";
      getIncomingValue(I)->printAsOperand(O, SlotTracker);
      if (I == 0)
        continue;
      O << "/";
      getMask(I)->printAsOperand(O, SlotTracker);
    }
  }
}
#endif

void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0, /*IsScalar*/ true);
  RecurKind Kind = RdxDesc.getRecurrenceKind();
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
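    // For a conditional reduction, blend the reduction's identity value into
    // the masked-off lanes of the vector operand so they do not affect the
    // reduced result.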
    if (VPValue *Cond = getCondOp()) {
      Value *NewCond = State.get(Cond, Part, State.VF.isScalar());
      VectorType *VecTy = dyn_cast<VectorType>(NewVecOp->getType());
      Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType();
      Value *Iden = RdxDesc.getRecurrenceIdentity(Kind, ElementTy,
                                                  RdxDesc.getFastMathFlags());
      if (State.VF.isVector()) {
        Iden = State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      }

      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, Iden);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
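    // Ordered reductions fold the vector operand into the chain value
    // sequentially to preserve floating-point semantics; otherwise the vector
    // operand is reduced on its own and combined with the chain afterwards.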
    if (IsOrdered) {
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      PrevInChain = State.get(getChainOp(), Part, /*IsScalar*/ true);
      NewRed = createTargetReduction(State.Builder, RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain = createMinMaxOp(State.Builder, RdxDesc.getRecurrenceKind(),
                                   NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), NewRed, PrevInChain);
    State.set(this, NextInChain, Part, /*IsScalar*/ true);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "REDUCE ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  getChainOp()->printAsOperand(O, SlotTracker);
  O << " +";
  if (isa<FPMathOperator>(getUnderlyingInstr()))
    O << getUnderlyingInstr()->getFastMathFlags();
  O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
  getVecOp()->printAsOperand(O, SlotTracker);
  if (getCondOp()) {
    O << ", ";
    getCondOp()->printAsOperand(O, SlotTracker);
  }
  O << ")";
  if (RdxDesc.IntermediateStore)
    O << " (with final reduction value stored in invariant address sank "
         "outside of loop)";
}
#endif

bool VPReplicateRecipe::shouldPack() const {
  // Find if the recipe is used by a widened recipe via an intervening
  // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
  return any_of(users(), [](const VPUser *U) {
    if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
      return any_of(PredR->users(), [PredR](const VPUser *U) {
        return !U->usesScalars(PredR);
      });
    return false;
  });
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");

  if (!getUnderlyingInstr()->getType()->isVoidTy()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }
  if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
    O << "call";
    printFlags(O);
    O << "@" << CB->getCalledFunction()->getName() << "(";
    interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
                    O, [&O, &SlotTracker](VPValue *Op) {
                      Op->printAsOperand(O, SlotTracker);
                    });
    O << ")";
  } else {
    O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
    printFlags(O);
    printOperands(O, SlotTracker);
  }

  if (shouldPack())
    O << " (S->V)";
}
#endif

/// Checks if \p C is uniform across all VFs and UFs. It is considered as such
/// if it is either defined outside the vector region or its operand is known
/// to be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI).
/// TODO: Uniformity should be associated with a VPValue and there should be a
/// generic way to check.
static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) {
  return C->isDefinedOutsideVectorRegions() ||
         isa<VPDerivedIVRecipe>(C->getOperand(0)) ||
         isa<VPCanonicalIVPHIRecipe>(C->getOperand(0));
}

Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
  assert(vputils::onlyFirstLaneUsed(this) &&
         "Codegen only implemented for first lane.");
  switch (Opcode) {
  case Instruction::SExt:
  case Instruction::ZExt:
  case Instruction::Trunc: {
    // Note: SExt/ZExt not used yet.
    Value *Op = State.get(getOperand(0), VPIteration(Part, 0));
    return State.Builder.CreateCast(Instruction::CastOps(Opcode), Op, ResultTy);
  }
  default:
    llvm_unreachable("opcode not implemented yet");
  }
}

void VPScalarCastRecipe::execute(VPTransformState &State) {
  bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(this);
  for (unsigned Part = 0; Part != State.UF; ++Part) {
    Value *Res;
    // Only generate a single instance, if the recipe is uniform across UFs and
    // VFs.
    if (Part > 0 && IsUniformAcrossVFsAndUFs)
      Res = State.get(this, VPIteration(0, 0));
    else
      Res = generate(State, Part);
    State.set(this, Res, VPIteration(Part, 0));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "SCALAR-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printOperands(O, SlotTracker);
  O << " to " << *ResultTy;
}
#endif

void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

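  // Determine the scalar branch condition for this lane: extract it from the
  // (possibly vector) block mask, or use 'true' if the block is unmasked.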
  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional
  // branch, whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}

void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node:
  // if a vector value for the predicated instruction exists at this point it
  // means the instruction has vector users only, and a phi for the vector
  // value is needed. In this case the recipe of the predicated instruction is
  // marked to also do that packing, thereby "hoisting" the insert-element
  // sequence. Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "PHI-PREDICATED-INSTRUCTION ";
  printAsOperand(O, SlotTracker);
  O << " = ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = load ";
  printOperands(O, SlotTracker);
}

void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = vp.load ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN store ";
  printOperands(O, SlotTracker);
}

void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN vp.store ";
  printOperands(O, SlotTracker);
}
#endif

void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
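  // Create the scalar phi for the canonical IV in the vector loop header. All
  // unroll parts share this single scalar phi, which starts at the live-in
  // start value coming from the preheader.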
  Value *Start = getStartValue()->getLiveInIRValue();
  PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, EntryPart, Part, /*IsScalar*/ true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                   VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

bool VPCanonicalIVPHIRecipe::isCanonical(
    InductionDescriptor::InductionKind Kind, VPValue *Start,
    VPValue *Step) const {
  // Must be an integer induction.
  if (Kind != InductionDescriptor::IK_IntInduction)
    return false;
  // Start must match the start value of this canonical induction.
  if (Start != getStartValue())
    return false;

  // If the step is defined by a recipe, it is not a ConstantInt.
  if (Step->getDefiningRecipe())
    return false;

  ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
  return StepC && StepC->isOne();
}

bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) {
  return IsScalarAfterVectorization &&
         (!IsScalable || vputils::onlyFirstLaneUsed(this));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-POINTER-INDUCTION ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << ", " << *IndDesc.getStep();
}
#endif

void VPExpandSCEVRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "cannot be used in per-lane");
  const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
  SCEVExpander Exp(SE, DL, "induction");

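  // Expand the SCEV expression into IR at the current insert point; the
  // expanded value is recorded once and reused for every unroll part.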
  Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
                                 &*State.Builder.GetInsertPoint());
  assert(!State.ExpandedSCEVs.contains(Expr) &&
         "Same SCEV expanded multiple times");
  State.ExpandedSCEVs[Expr] = Res;
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
    State.set(this, Res, {Part, 0});
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
                               VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  getVPSingleValue()->printAsOperand(O, SlotTracker);
  O << " = EXPAND SCEV " << *Expr;
}
#endif

void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
  Value *CanonicalIV = State.get(getOperand(0), 0, /*IsScalar*/ true);
  Type *STy = CanonicalIV->getType();
  IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
  ElementCount VF = State.VF;
  Value *VStart = VF.isScalar()
                      ? CanonicalIV
                      : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
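  // For each unroll part, the widened IV is the broadcast canonical IV plus a
  // per-part step; for vector VFs a step vector <0, 1, ...> is added as well,
  // yielding lane values CanonicalIV + Part * VF + Lane.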
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *VStep = createStepForVF(Builder, STy, VF, Part);
    if (VF.isVector()) {
      VStep = Builder.CreateVectorSplat(VF, VStep);
      VStep =
          Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
    }
    Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
    State.set(this, CanonicalVectorIV, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
                                     VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";
  printAsOperand(O, SlotTracker);
  O << " = WIDEN-CANONICAL-INDUCTION ";
  printOperands(O, SlotTracker);
}
#endif

void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;
  // Create a vector from the initial value.
  auto *VectorInit = getStartValue()->getLiveInIRValue();

  Type *VecTy = State.VF.isScalar()
                    ? VectorInit->getType()
                    : VectorType::get(VectorInit->getType(), State.VF);

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
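  // For vector VFs, build the initial vector value in the preheader by
  // inserting the scalar start value into the last lane of a poison vector.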
  if (State.VF.isVector()) {
    auto *IdxTy = Builder.getInt32Ty();
    auto *One = ConstantInt::get(IdxTy, 1);
    IRBuilder<>::InsertPointGuard Guard(Builder);
    Builder.SetInsertPoint(VectorPH->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    VectorInit = Builder.CreateInsertElement(
        PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
  }

  // Create a phi node for the new recurrence.
  PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
  EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  EntryPart->addIncoming(VectorInit, VectorPH);
  State.set(this, EntryPart, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
  O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPReductionPHIRecipe::execute(VPTransformState &State) {
  auto &Builder = State.Builder;

  // Reductions do not have to start at zero. They can start with
  // any loop invariant values.
  VPValue *StartVPV = getStartValue();
  Value *StartV = StartVPV->getLiveInIRValue();

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.
  bool ScalarPHI = State.VF.isScalar() || IsInLoop;
  Type *VecTy = ScalarPHI ? StartV->getType()
                          : VectorType::get(StartV->getType(), State.VF);

  BasicBlock *HeaderBB = State.CFG.PrevBB;
  assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
         "recipe must be in the vector loop header");
  unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
    EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
    State.set(this, EntryPart, Part, IsInLoop);
  }

  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);

  Value *Iden = nullptr;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
      RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
    // MinMax and AnyOf reductions have the start value as their identity.
    if (ScalarPHI) {
      Iden = StartV;
    } else {
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      StartV = Iden =
          Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
    }
  } else {
    Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
                                         RdxDesc.getFastMathFlags());

    if (!ScalarPHI) {
      Iden = Builder.CreateVectorSplat(State.VF, Iden);
      IRBuilderBase::InsertPointGuard IPBuilder(Builder);
      Builder.SetInsertPoint(VectorPH->getTerminator());
      Constant *Zero = Builder.getInt32(0);
      StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
    }
  }

  for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
    Value *EntryPart = State.get(this, Part, IsInLoop);
    // Make sure to add the reduction start value only to the
    // first unroll part.
    Value *StartVal = (Part == 0) ? StartV : Iden;
    cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-REDUCTION-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPWidenPHIRecipe::execute(VPTransformState &State) {
  assert(EnableVPlanNativePath &&
         "Non-native vplans are not expected to have VPWidenPHIRecipes.");

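  // Create an empty phi with the type of the first operand's value for part 0;
  // incoming values are added separately once all predecessor blocks have been
  // generated.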
  Value *Op0 = State.get(getOperand(0), 0);
  Type *VecTy = Op0->getType();
  Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
  State.set(this, VecPhi, 0);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                             VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-PHI ";

  auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
  // Unless all incoming values are modeled in VPlan print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant
  // incoming values as VPValues.
  if (getNumOperands() != OriginalPhi->getNumOperands()) {
    O << VPlanIngredient(OriginalPhi);
    return;
  }

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
// remove VPActiveLaneMaskPHIRecipe.
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
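  // Create one phi per unroll part, seeded from the preheader with the
  // corresponding part of the entry active-lane mask.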
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
    Value *StartMask = State.get(getOperand(0), Part);
    PHINode *EntryPart =
        State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
    EntryPart->addIncoming(StartMask, VectorPH);
    EntryPart->setDebugLoc(getDebugLoc());
    State.set(this, EntryPart, Part);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                      VPSlotTracker &SlotTracker) const {
  O << Indent << "ACTIVE-LANE-MASK-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif

void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) {
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization.");
  Value *Start = State.get(getOperand(0), VPIteration(0, 0));
  PHINode *EntryPart =
      State.Builder.CreatePHI(Start->getType(), 2, "evl.based.iv");
  EntryPart->addIncoming(Start, VectorPH);
  EntryPart->setDebugLoc(getDebugLoc());
  State.set(this, EntryPart, 0, /*IsScalar=*/true);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI ";

  printAsOperand(O, SlotTracker);
  O << " = phi ";
  printOperands(O, SlotTracker);
}
#endif
