//===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains implementations for different VPlan recipes.
///
//===----------------------------------------------------------------------===//

#include "VPlan.h"
#include "VPlanAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>

using namespace llvm;

using VectorParts = SmallVector<Value *, 2>;

namespace llvm {
extern cl::opt<bool> EnableVPlanNativePath;
}

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

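// Note: the memory-effect queries below intentionally fall back to the
// conservative answer (true) for any recipe kind without an explicit case.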
bool VPRecipeBase::mayWriteToMemory() const {
  switch (getVPDefID()) {
  case VPInterleaveSC:
    return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return true;
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayWriteToMemory();
  case VPBranchOnMaskSC:
  case VPScalarIVStepsSC:
  case VPPredInstPHISC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayWriteToMemory()) &&
           "underlying instruction may write to memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayReadFromMemory() const {
  switch (getVPDefID()) {
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
    return true;
  case VPReplicateSC:
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
  case VPBranchOnMaskSC:
  case VPPredInstPHISC:
  case VPScalarIVStepsSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    return false;
  case VPBlendSC:
  case VPReductionSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayReadFromMemory()) &&
           "underlying instruction may read from memory");
    return false;
  }
  default:
    return true;
  }
}

bool VPRecipeBase::mayHaveSideEffects() const {
  switch (getVPDefID()) {
  case VPDerivedIVSC:
  case VPPredInstPHISC:
  case VPScalarCastSC:
    return false;
  case VPInstructionSC:
    switch (cast<VPInstruction>(this)->getOpcode()) {
    case Instruction::Or:
    case Instruction::ICmp:
    case Instruction::Select:
    case VPInstruction::Not:
    case VPInstruction::CalculateTripCountMinusVF:
    case VPInstruction::CanonicalIVIncrementForPart:
    case VPInstruction::PtrAdd:
      return false;
    default:
      return true;
    }
  case VPWidenCallSC:
    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayHaveSideEffects();
  case VPBlendSC:
  case VPReductionSC:
  case VPScalarIVStepsSC:
  case VPWidenCanonicalIVSC:
  case VPWidenCastSC:
  case VPWidenGEPSC:
  case VPWidenIntOrFpInductionSC:
  case VPWidenPHISC:
  case VPWidenPointerInductionSC:
  case VPWidenSC:
  case VPWidenSelectSC: {
    const Instruction *I =
        dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
    (void)I;
    assert((!I || !I->mayHaveSideEffects()) &&
           "underlying instruction has side-effects");
    return false;
  }
  case VPInterleaveSC:
    return mayWriteToMemory();
  case VPWidenLoadEVLSC:
  case VPWidenLoadSC:
  case VPWidenStoreEVLSC:
  case VPWidenStoreSC:
    assert(
        cast<VPWidenMemoryRecipe>(this)->getIngredient().mayHaveSideEffects() ==
            mayWriteToMemory() &&
        "mayHaveSideEffects result for ingredient differs from this "
        "implementation");
    return mayWriteToMemory();
  case VPReplicateSC: {
    auto *R = cast<VPReplicateRecipe>(this);
    return R->getUnderlyingInstr()->mayHaveSideEffects();
  }
  default:
    return true;
  }
}

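// Add the incoming value from the middle block to the live-out phi. Uniform
// exit values only need lane 0; all other values use the last lane of the
// final unrolled part.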
void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
  auto Lane = VPLane::getLastLaneForVF(State.VF);
  VPValue *ExitValue = getOperand(0);
  if (vputils::isUniformAfterVectorization(ExitValue))
    Lane = VPLane::getFirstLane();
  VPBasicBlock *MiddleVPBB =
      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
  assert(MiddleVPBB->getNumSuccessors() == 0 &&
         "the middle block must not have any successors");
  BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
  Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
                   MiddleBB);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  O << "Live-out ";
  getPhi()->printAsOperand(O);
  O << " = ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << "\n";
}
#endif

void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, InsertPos->getIterator());
}

void VPRecipeBase::insertBefore(VPBasicBlock &BB,
                                iplist<VPRecipeBase>::iterator I) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(I == BB.end() || I->getParent() == &BB);
  BB.insert(this, I);
}

void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
  assert(!Parent && "Recipe already in some VPBasicBlock");
  assert(InsertPos->getParent() &&
         "Insertion position not in any VPBasicBlock");
  InsertPos->getParent()->insert(this, std::next(InsertPos->getIterator()));
}

void VPRecipeBase::removeFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  getParent()->getRecipeList().remove(getIterator());
  Parent = nullptr;
}

iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
  assert(getParent() && "Recipe not in any VPBasicBlock");
  return getParent()->getRecipeList().erase(getIterator());
}

void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
  removeFromParent();
  insertAfter(InsertPos);
}

void VPRecipeBase::moveBefore(VPBasicBlock &BB,
                              iplist<VPRecipeBase>::iterator I) {
  removeFromParent();
  insertBefore(BB, I);
}

FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
  assert(OpType == OperationType::FPMathOp &&
         "recipe doesn't have fast math flags");
  FastMathFlags Res;
  Res.setAllowReassoc(FMFs.AllowReassoc);
  Res.setNoNaNs(FMFs.NoNaNs);
  Res.setNoInfs(FMFs.NoInfs);
  Res.setNoSignedZeros(FMFs.NoSignedZeros);
  Res.setAllowReciprocal(FMFs.AllowReciprocal);
  Res.setAllowContract(FMFs.AllowContract);
  Res.setApproxFunc(FMFs.ApproxFunc);
  return Res;
}

VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
                             VPValue *A, VPValue *B, DebugLoc DL,
                             const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
                          Pred, DL),
      Opcode(Opcode), Name(Name.str()) {
  assert(Opcode == Instruction::ICmp &&
         "only ICmp predicates supported at the moment");
}

VPInstruction::VPInstruction(unsigned Opcode,
                             std::initializer_list<VPValue *> Operands,
                             FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
    : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
      Opcode(Opcode), Name(Name.str()) {
  // Make sure the VPInstruction is a floating-point operation.
  assert(isFPMathOp() && "this op can't take fast-math flags");
}

bool VPInstruction::doesGeneratePerAllLanes() const {
  return Opcode == VPInstruction::PtrAdd && !vputils::onlyFirstLaneUsed(this);
}

bool VPInstruction::canGenerateScalarForFirstLane() const {
  if (Instruction::isBinaryOp(getOpcode()))
    return true;

  switch (Opcode) {
  case VPInstruction::BranchOnCond:
  case VPInstruction::BranchOnCount:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::ComputeReductionResult:
  case VPInstruction::PtrAdd:
  case VPInstruction::ExplicitVectorLength:
    return true;
  default:
    return false;
  }
}

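// Generate a scalar value for a single (Part, Lane) pair. Only used for
// PtrAdd at the moment, when lanes other than the first are demanded.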
Value *VPInstruction::generatePerLane(VPTransformState &State,
                                      const VPIteration &Lane) {
  IRBuilderBase &Builder = State.Builder;

  assert(getOpcode() == VPInstruction::PtrAdd &&
         "only PtrAdd opcodes are supported for now");
  return Builder.CreatePtrAdd(State.get(getOperand(0), Lane),
                              State.get(getOperand(1), Lane), Name);
}

Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
  IRBuilderBase &Builder = State.Builder;

  if (Instruction::isBinaryOp(getOpcode())) {
    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
    if (Part != 0 && vputils::onlyFirstPartUsed(this))
      return State.get(this, 0, OnlyFirstLaneUsed);

    Value *A = State.get(getOperand(0), Part, OnlyFirstLaneUsed);
    Value *B = State.get(getOperand(1), Part, OnlyFirstLaneUsed);
    auto *Res =
        Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
    if (auto *I = dyn_cast<Instruction>(Res))
      setFlags(I);
    return Res;
  }

  switch (getOpcode()) {
  case VPInstruction::Not: {
    Value *A = State.get(getOperand(0), Part);
    return Builder.CreateNot(A, Name);
  }
  case Instruction::ICmp: {
    Value *A = State.get(getOperand(0), Part);
    Value *B = State.get(getOperand(1), Part);
    return Builder.CreateCmp(getPredicate(), A, B, Name);
  }
  case Instruction::Select: {
    Value *Cond = State.get(getOperand(0), Part);
    Value *Op1 = State.get(getOperand(1), Part);
    Value *Op2 = State.get(getOperand(2), Part);
    return Builder.CreateSelect(Cond, Op1, Op2, Name);
  }
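  // Conceptually, lane i of part P of the mask is ((VIV[P] + i) < TC); the
  // llvm.get.active.lane.mask intrinsic computes this for all lanes at once.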
  case VPInstruction::ActiveLaneMask: {
    // Get first lane of vector induction variable.
    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
    // Get the original loop tripcount.
    Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));

    // If this part of the active lane mask is scalar, generate the CMP
    // directly to avoid unnecessary extracts.
    if (State.VF.isScalar())
      return Builder.CreateCmp(CmpInst::Predicate::ICMP_ULT, VIVElem0,
                               ScalarTC, Name);

    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
    auto *PredTy = VectorType::get(Int1Ty, State.VF);
    return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                   {PredTy, ScalarTC->getType()},
                                   {VIVElem0, ScalarTC}, nullptr, Name);
  }
  case VPInstruction::FirstOrderRecurrenceSplice: {
    // Generate code to combine the previous and current values in vector v3.
    //
    // vector.ph:
    //   v_init = vector(..., ..., ..., a[-1])
    //   br vector.body
    //
    // vector.body
    //   i = phi [0, vector.ph], [i+4, vector.body]
    //   v1 = phi [v_init, vector.ph], [v2, vector.body]
    //   v2 = a[i, i+1, i+2, i+3];
    //   v3 = vector(v1(3), v2(0, 1, 2))

    // For the first part, use the recurrence phi (v1), otherwise v2.
    auto *V1 = State.get(getOperand(0), 0);
    Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
    if (!PartMinus1->getType()->isVectorTy())
      return PartMinus1;
    Value *V2 = State.get(getOperand(1), Part);
    return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
  }
  case VPInstruction::CalculateTripCountMinusVF: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    Value *ScalarTC = State.get(getOperand(0), {0, 0});
    Value *Step =
        createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
    Value *Sub = Builder.CreateSub(ScalarTC, Step);
    Value *Cmp =
        Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
    Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
    return Builder.CreateSelect(Cmp, Sub, Zero);
  }
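  // The llvm.experimental.get.vector.length intrinsic below returns a value
  // that is at most AVL and at most VF * vscale, so the loop can step by the
  // returned EVL each iteration without over-running the trip count.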
  case VPInstruction::ExplicitVectorLength: {
    // Compute EVL
    auto GetEVL = [=](VPTransformState &State, Value *AVL) {
      assert(AVL->getType()->isIntegerTy() &&
             "Requested vector length should be an integer.");

      // TODO: Add support for MaxSafeDist for correct loop emission.
      assert(State.VF.isScalable() && "Expected scalable vector factor.");
      Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());

      Value *EVL = State.Builder.CreateIntrinsic(
          State.Builder.getInt32Ty(),
          Intrinsic::experimental_get_vector_length,
          {AVL, VFArg, State.Builder.getTrue()});
      return EVL;
    };
    // TODO: Restructure this code with an explicit remainder loop, vsetvli can
    // be outside of the main loop.
    assert(Part == 0 && "No unrolling expected for predicated vectorization.");
    // Compute VTC - IV as the AVL (requested vector length).
    Value *Index = State.get(getOperand(0), VPIteration(0, 0));
    Value *TripCount = State.get(getOperand(1), VPIteration(0, 0));
    Value *AVL = State.Builder.CreateSub(TripCount, Index);
    Value *EVL = GetEVL(State, AVL);
    return EVL;
  }
  case VPInstruction::CanonicalIVIncrementForPart: {
    auto *IV = State.get(getOperand(0), VPIteration(0, 0));
    if (Part == 0)
      return IV;

    // The canonical IV is incremented by the vectorization factor (num of SIMD
    // elements) times the unroll part.
    Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
    return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
                             hasNoSignedWrap());
  }
  case VPInstruction::BranchOnCond: {
    if (Part != 0)
      return nullptr;

    Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
    VPRegionBlock *ParentRegion = getParent()->getParent();
    VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination for exiting blocks now and
    // to forward destination(s) later when they are created.
    BranchInst *CondBr =
        Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);

    if (getParent()->isExiting())
      CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);

    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
  case VPInstruction::BranchOnCount: {
    if (Part != 0)
      return nullptr;
    // First create the compare.
    Value *IV = State.get(getOperand(0), Part, /*IsScalar*/ true);
    Value *TC = State.get(getOperand(1), Part, /*IsScalar*/ true);
    Value *Cond = Builder.CreateICmpEQ(IV, TC);

    // Now create the branch.
    auto *Plan = getParent()->getPlan();
    VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
    VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();

    // Replace the temporary unreachable terminator with a new conditional
    // branch, hooking it up to backward destination (the header) now and to
    // the forward destination (the exit/middle block) later when it is
    // created. Note that CreateCondBr expects a valid BB as first argument,
    // so we need to set it to nullptr later.
    BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
                                              State.CFG.VPBB2IRBB[Header]);
    CondBr->setSuccessor(0, nullptr);
    Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
    return CondBr;
  }
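  // ComputeReductionResult combines the per-part reduction values after the
  // vector loop: truncate to the narrower reduction type if needed, fold the
  // UF parts into one value, then emit the final target reduction and any
  // required extension or intermediate store.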
  case VPInstruction::ComputeReductionResult: {
    if (Part != 0)
      return State.get(this, 0, /*IsScalar*/ true);

    // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
    // and will be removed by breaking up the recipe further.
    auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
    auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
    // Get its reduction variable descriptor.
    const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

    RecurKind RK = RdxDesc.getRecurrenceKind();

    VPValue *LoopExitingDef = getOperand(1);
    Type *PhiTy = OrigPhi->getType();
    VectorParts RdxParts(State.UF);
    for (unsigned Part = 0; Part < State.UF; ++Part)
      RdxParts[Part] = State.get(LoopExitingDef, Part, PhiR->isInLoop());

    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    // TODO: Handle this in truncateToMinBW.
    if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
      for (unsigned Part = 0; Part < State.UF; ++Part)
        RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
    }
    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    unsigned Op = RecurrenceDescriptor::getOpcode(RK);

    if (PhiR->isOrdered()) {
      ReducedPartRdx = RdxParts[State.UF - 1];
    } else {
      // Floating-point operations should have some FMF to enable the
      // reduction.
      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
      for (unsigned Part = 1; Part < State.UF; ++Part) {
        Value *RdxPart = RdxParts[Part];
        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
          ReducedPartRdx = Builder.CreateBinOp(
              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
        else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
          TrackingVH<Value> ReductionStartValue =
              RdxDesc.getRecurrenceStartValue();
          ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
                                         ReducedPartRdx, RdxPart);
        } else
          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
      }
    }

    // Create the reduction after the loop. Note that inloop reductions create
    // the target reduction in the loop using a Reduction recipe.
    if (State.VF.isVector() && !PhiR->isInLoop()) {
      ReducedPartRdx =
          createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
      // If the reduction can be performed in a smaller type, we need to extend
      // the reduction to the wider type before we branch to the original loop.
      if (PhiTy != RdxDesc.getRecurrenceType())
        ReducedPartRdx = RdxDesc.isSigned()
                             ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                             : Builder.CreateZExt(ReducedPartRdx, PhiTy);
    }

    // If there were stores of the reduction value to a uniform memory address
    // inside the loop, create the final store here.
    if (StoreInst *SI = RdxDesc.IntermediateStore) {
      auto *NewSI = Builder.CreateAlignedStore(
          ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
      propagateMetadata(NewSI, SI);
    }

    return ReducedPartRdx;
  }
  case VPInstruction::PtrAdd: {
    assert(vputils::onlyFirstLaneUsed(this) &&
           "can only generate first lane for PtrAdd");
    Value *Ptr = State.get(getOperand(0), Part, /* IsScalar */ true);
    Value *Addend = State.get(getOperand(1), Part, /* IsScalar */ true);
    return Builder.CreatePtrAdd(Ptr, Addend, Name);
  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
  // Inspired by FPMathOperator::classof. Notable differences are that we don't
  // support Call, PHI and Select opcodes here yet.
  return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
         Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
         Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
         Opcode == Instruction::FCmp || Opcode == Instruction::Select;
}
#endif

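// Generate the VPInstruction: depending on the opcode and demanded lanes this
// emits one value per (part, lane), one scalar per part (first lane only), or
// one vector per part.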
void VPInstruction::execute(VPTransformState &State) {
  assert(!State.Instance && "VPInstruction executing an Instance");
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  assert((hasFastMathFlags() == isFPMathOp() ||
          getOpcode() == Instruction::Select) &&
         "Recipe not a FPMathOp but has fast-math flags?");
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());
  State.setDebugLocFrom(getDebugLoc());
  bool GeneratesPerFirstLaneOnly =
      canGenerateScalarForFirstLane() &&
      (vputils::onlyFirstLaneUsed(this) ||
       getOpcode() == VPInstruction::ComputeReductionResult);
  bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (GeneratesPerAllLanes) {
      for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
           Lane != NumLanes; ++Lane) {
        Value *GeneratedValue = generatePerLane(State, VPIteration(Part, Lane));
        assert(GeneratedValue && "generatePerLane must produce a value");
        State.set(this, GeneratedValue, VPIteration(Part, Lane));
      }
      continue;
    }

    Value *GeneratedValue = generatePerPart(State, Part);
    if (!hasResult())
      continue;
    assert(GeneratedValue && "generatePerPart must produce a value");
    assert((GeneratedValue->getType()->isVectorTy() ==
                !GeneratesPerFirstLaneOnly ||
            State.VF.isScalar()) &&
           "scalar value but not only first lane defined");
    State.set(this, GeneratedValue, Part,
              /*IsScalar*/ GeneratesPerFirstLaneOnly);
  }
}

bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
  if (Instruction::isBinaryOp(getOpcode()))
    return vputils::onlyFirstLaneUsed(this);

  switch (getOpcode()) {
  default:
    return false;
  case Instruction::ICmp:
  case VPInstruction::PtrAdd:
    // TODO: Cover additional opcodes.
    return vputils::onlyFirstLaneUsed(this);
  case VPInstruction::ActiveLaneMask:
  case VPInstruction::ExplicitVectorLength:
  case VPInstruction::CalculateTripCountMinusVF:
  case VPInstruction::CanonicalIVIncrementForPart:
  case VPInstruction::BranchOnCount:
    return true;
  };
  llvm_unreachable("switch should return");
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPInstruction::dump() const {
  VPSlotTracker SlotTracker(getParent()->getPlan());
  print(dbgs(), "", SlotTracker);
}

void VPInstruction::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "EMIT ";

  if (hasResult()) {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  switch (getOpcode()) {
  case VPInstruction::Not:
    O << "not";
    break;
  case VPInstruction::SLPLoad:
    O << "combined load";
    break;
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
  case VPInstruction::ActiveLaneMask:
    O << "active lane mask";
    break;
  case VPInstruction::ExplicitVectorLength:
    O << "EXPLICIT-VECTOR-LENGTH";
    break;
  case VPInstruction::FirstOrderRecurrenceSplice:
    O << "first-order splice";
    break;
  case VPInstruction::BranchOnCond:
    O << "branch-on-cond";
    break;
  case VPInstruction::CalculateTripCountMinusVF:
    O << "TC > VF ? TC - VF : 0";
    break;
  case VPInstruction::CanonicalIVIncrementForPart:
    O << "VF * Part +";
    break;
  case VPInstruction::BranchOnCount:
    O << "branch-on-count";
    break;
  case VPInstruction::ComputeReductionResult:
    O << "compute-reduction-result";
    break;
  case VPInstruction::PtrAdd:
    O << "ptradd";
    break;
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }

  printFlags(O);
  printOperands(O, SlotTracker);

  if (auto DL = getDebugLoc()) {
    O << ", !dbg ";
    DL.print(O);
  }
}
#endif

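// Widen a call by emitting either a vector intrinsic or a vectorized library
// function variant; operands that must remain scalar (intrinsic scalar
// arguments, linear pointer parameters) are passed through unwidened.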
void VPWidenCallRecipe::execute(VPTransformState &State) {
  assert(State.VF.isVector() && "not widening");
  auto &CI = *cast<CallInst>(getUnderlyingInstr());
  assert(!isa<DbgInfoIntrinsic>(CI) &&
         "DbgInfoIntrinsic should have been dropped during VPlan construction");
  State.setDebugLocFrom(getDebugLoc());

  bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
  FunctionType *VFTy = nullptr;
  if (Variant)
    VFTy = Variant->getFunctionType();
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    SmallVector<Type *, 2> TysForDecl;
    // Add return type if intrinsic is overloaded on it.
    if (UseIntrinsic &&
        isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
      TysForDecl.push_back(
          VectorType::get(CI.getType()->getScalarType(), State.VF));
    SmallVector<Value *, 4> Args;
    for (const auto &I : enumerate(operands())) {
      // Some intrinsics have a scalar argument - don't replace it with a
      // vector.
      Value *Arg;
      if (UseIntrinsic &&
          isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
        Arg = State.get(I.value(), VPIteration(0, 0));
      // Some vectorized function variants may also take a scalar argument,
      // e.g. linear parameters for pointers. This needs to be the scalar value
      // from the start of the respective part when interleaving.
      else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
        Arg = State.get(I.value(), VPIteration(Part, 0));
      else
        Arg = State.get(I.value(), Part);
      if (UseIntrinsic &&
          isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
        TysForDecl.push_back(Arg->getType());
      Args.push_back(Arg);
    }

    Function *VectorF;
    if (UseIntrinsic) {
      // Use vector version of the intrinsic.
      Module *M = State.Builder.GetInsertBlock()->getModule();
      VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
      assert(VectorF && "Can't retrieve vector intrinsic.");
    } else {
#ifndef NDEBUG
      assert(Variant != nullptr && "Can't create vector function.");
#endif
      VectorF = Variant;
    }

    SmallVector<OperandBundleDef, 1> OpBundles;
    CI.getOperandBundlesAsDefs(OpBundles);
    CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(&CI);

    if (!V->getType()->isVoidTy())
      State.set(this, V, Part);
    State.addMetadata(V, &CI);
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CALL ";

  auto *CI = cast<CallInst>(getUnderlyingInstr());
  if (CI->getType()->isVoidTy())
    O << "void ";
  else {
    printAsOperand(O, SlotTracker);
    O << " = ";
  }

  O << "call @" << CI->getCalledFunction()->getName() << "(";
  printOperands(O, SlotTracker);
  O << ")";

  if (VectorIntrinsicID)
    O << " (using vector intrinsic)";
  else {
    O << " (using library function";
    if (Variant->hasName())
      O << ": " << Variant->getName();
    O << ")";
  }
}

void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
                                VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-SELECT ";
  printAsOperand(O, SlotTracker);
  O << " = select ";
  getOperand(0)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << ", ";
  getOperand(2)->printAsOperand(O, SlotTracker);
  O << (isInvariantCond() ? " (condition is loop invariant)" : "");
}
#endif

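// Widen select: a loop-invariant condition is used as a scalar first-lane
// value, while a varying condition is widened per part.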
void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());

  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
  auto *InvarCond =
      isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
    Value *Op0 = State.get(getOperand(1), Part);
    Value *Op1 = State.get(getOperand(2), Part);
    Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
    State.set(this, Sel, Part);
    State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
    const FastMathFlags &FMF) {
  AllowReassoc = FMF.allowReassoc();
  NoNaNs = FMF.noNaNs();
  NoInfs = FMF.noInfs();
  NoSignedZeros = FMF.noSignedZeros();
  AllowReciprocal = FMF.allowReciprocal();
  AllowContract = FMF.allowContract();
  ApproxFunc = FMF.approxFunc();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
  switch (OpType) {
  case OperationType::Cmp:
    O << " " << CmpInst::getPredicateName(getPredicate());
    break;
  case OperationType::DisjointOp:
    if (DisjointFlags.IsDisjoint)
      O << " disjoint";
    break;
  case OperationType::PossiblyExactOp:
    if (ExactFlags.IsExact)
      O << " exact";
    break;
  case OperationType::OverflowingBinOp:
    if (WrapFlags.HasNUW)
      O << " nuw";
    if (WrapFlags.HasNSW)
      O << " nsw";
    break;
  case OperationType::FPMathOp:
    getFastMathFlags().print(O);
    break;
  case OperationType::GEPOp:
    if (GEPFlags.IsInBounds)
      O << " inbounds";
    break;
  case OperationType::NonNegOp:
    if (NonNegFlags.NonNeg)
      O << " nneg";
    break;
  case OperationType::Other:
    break;
  }
  if (getNumOperands() > 0)
    O << " ";
}
#endif

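// Widen simple unary/binary ops, freeze and compares by re-emitting them with
// vector operands, one instruction per unrolled part, propagating IR flags
// and metadata from the original instruction.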
void VPWidenRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  switch (Opcode) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(Opcode, Ops);

      if (auto *VecOp = dyn_cast<Instruction>(V))
        setFlags(VecOp);

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  case Instruction::Freeze: {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *Op = State.get(getOperand(0), Part);

      Value *Freeze = Builder.CreateFreeze(Op);
      State.set(this, Freeze, Part);
    }
    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = Opcode == Instruction::FCmp;
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
          Builder.setFastMathFlags(I->getFastMathFlags());
        C = Builder.CreateFCmp(getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
    }

    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
                      << Instruction::getOpcodeName(Opcode));
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.

#if !defined(NDEBUG)
  // Verify that VPlan type inference results agree with the type of the
  // generated values.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
                           State.VF) == State.get(this, Part)->getType() &&
           "inferred type and type from generated instructions do not match");
  }
#endif
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode);
  printFlags(O);
  printOperands(O, SlotTracker);
}
#endif

void VPWidenCastRecipe::execute(VPTransformState &State) {
  State.setDebugLocFrom(getDebugLoc());
  auto &Builder = State.Builder;
  /// Vectorize casts.
  assert(State.VF.isVector() && "Not vectorizing?");
  Type *DestTy = VectorType::get(getResultType(), State.VF);
  VPValue *Op = getOperand(0);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    if (Part > 0 && Op->isLiveIn()) {
      // FIXME: Remove once explicit unrolling is implemented using VPlan.
      State.set(this, State.get(this, 0), Part);
      continue;
    }
    Value *A = State.get(Op, Part);
    Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
    State.set(this, Cast, Part);
    State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-CAST ";
  printAsOperand(O, SlotTracker);
  O << " = " << Instruction::getOpcodeName(Opcode) << " ";
  printFlags(O);
  printOperands(O, SlotTracker);
  O << " to " << *getResultType();
}
#endif

/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is relevant for FP induction variables.
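/// For example, for an integer induction with VF = 4, Val = <a, a, a, a>,
/// StartIdx = 0 and Step = s, the result is <a, a + s, a + 2*s, a + 3*s>.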
static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
                            Instruction::BinaryOps BinOp, ElementCount VF,
                            IRBuilderBase &Builder) {
  assert(VF.isVector() && "only vector VFs are supported");

  // Create and check the types.
  auto *ValVTy = cast<VectorType>(Val->getType());
  ElementCount VLen = ValVTy->getElementCount();

  Type *STy = Val->getType()->getScalarType();
  assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
         "Induction Step must be an integer or FP");
  assert(Step->getType() == STy && "Step has wrong type");

  SmallVector<Constant *, 8> Indices;

  // Create a vector of consecutive numbers from zero to VF.
  VectorType *InitVecValVTy = ValVTy;
  if (STy->isFloatingPointTy()) {
    Type *InitVecValSTy =
        IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
    InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
  }
  Value *InitVec = Builder.CreateStepVector(InitVecValVTy);

  // Splat the StartIdx.
  Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);

  if (STy->isIntegerTy()) {
    InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
    Step = Builder.CreateVectorSplat(VLen, Step);
    assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
    Step = Builder.CreateMul(InitVec, Step);
    return Builder.CreateAdd(Val, Step, "induction");
  }

  // Floating point induction.
  assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
         "Binary Opcode should be specified for FP induction");
  InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
  InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);

  Step = Builder.CreateVectorSplat(VLen, Step);
  Value *MulOp = Builder.CreateFMul(InitVec, Step);
  return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
}

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy =
      IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

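// Create the widened induction: a vector phi seeded in the preheader with
// <Start, Start + Step, ..., Start + (VF-1) * Step>; each unrolled part is
// the previous part advanced by a splat of VF * Step.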
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with fetching the step value.
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader.
  auto CurrIP = Builder.saveIP();
  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
  Builder.SetInsertPoint(VectorPH->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  // handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
  VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.addMetadata(LastInduction, EntryVal);

    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, VectorPH);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, VectorPH);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                          VPSlotTracker &SlotTracker) const {
  O << Indent << "WIDEN-INDUCTION";
  if (getTruncInst()) {
    O << "\\l\"";
    O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
    O << " +\n" << Indent << "\" ";
    getVPValue(0)->printAsOperand(O, SlotTracker);
  } else
    O << " " << VPlanIngredient(IV);

  O << ", ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
  // The step may be defined by a recipe in the preheader (e.g. if it requires
  // SCEV expansion), but for the canonical induction the step is required to
  // be 1, which is represented as a live-in.
  if (getStepValue()->getDefiningRecipe())
    return false;
  auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
  auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
  return StartC && StartC->isZero() && StepC && StepC->isOne();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
                              VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = DERIVED-IV ";
  getStartValue()->printAsOperand(O, SlotTracker);
  O << " + ";
  getOperand(1)->printAsOperand(O, SlotTracker);
  O << " * ";
  getStepValue()->printAsOperand(O, SlotTracker);
}
#endif

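// Materialize the scalar steps BaseIV + (Part * VF + Lane) * Step for all
// demanded lanes; for scalable VFs a stepvector-based vector form is emitted
// as well.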
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (hasFastMathFlags())
    State.Builder.setFastMathFlags(getFastMathFlags());

  /// Compute scalar induction steps. \p ScalarIV is the scalar induction
  /// variable on which to base the steps, \p Step is the size of the step.

  Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  IRBuilderBase &Builder = State.Builder;

  // Ensure step has the same type as that of scalar IV.
  Type *BaseIVTy = BaseIV->getType()->getScalarType();
  assert(BaseIVTy == Step->getType() && "Types of BaseIV and Step must match!");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (BaseIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = InductionOpcode;
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy =
      IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(BaseIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
  }

  unsigned StartPart = 0;
  unsigned EndPart = State.UF;
  unsigned StartLane = 0;
  unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  if (State.Instance) {
    StartPart = State.Instance->Part;
    EndPart = StartPart + 1;
    StartLane = State.Instance->Lane.getKnownLane();
    EndLane = StartLane + 1;
  }
  for (unsigned Part = StartPart; Part < EndPart; ++Part) {
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (BaseIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(this, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (BaseIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);

    for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
      State.set(this, Add, VPIteration(Part, Lane));
    }
  }
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
                                  VPSlotTracker &SlotTracker) const {
  O << Indent;
  printAsOperand(O, SlotTracker);
  O << " = SCALAR-STEPS ";
  printOperands(O, SlotTracker);
}
#endif

1329 | void VPWidenGEPRecipe::execute(VPTransformState &State) { |
1330 | assert(State.VF.isVector() && "not widening" ); |
1331 | auto *GEP = cast<GetElementPtrInst>(Val: getUnderlyingInstr()); |
1332 | // Construct a vector GEP by widening the operands of the scalar GEP as |
1333 | // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP |
1334 | // results in a vector of pointers when at least one operand of the GEP |
1335 | // is vector-typed. Thus, to keep the representation compact, we only use |
1336 | // vector-typed operands for loop-varying values. |
1337 | |
1338 | if (areAllOperandsInvariant()) { |
1339 | // If we are vectorizing, but the GEP has only loop-invariant operands, |
1340 | // the GEP we build (by only using vector-typed operands for |
1341 | // loop-varying values) would be a scalar pointer. Thus, to ensure we |
1342 | // produce a vector of pointers, we need to either arbitrarily pick an |
1343 | // operand to broadcast, or broadcast a clone of the original GEP. |
1344 | // Here, we broadcast a clone of the original. |
1345 | // |
1346 | // TODO: If at some point we decide to scalarize instructions having |
1347 | // loop-invariant operands, this special case will no longer be |
1348 | // required. We would add the scalarization decision to |
1349 | // collectLoopScalars() and teach getVectorValue() to broadcast |
1350 | // the lane-zero scalar value. |
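    // Sketch of the IR this is expected to produce for VF = 4 (illustrative
    // names, not actual output):
    //   %gep    = getelementptr inbounds i32, ptr %base, i64 %inv.idx
    //   %insert = insertelement <4 x ptr> poison, ptr %gep, i64 0
    //   %splat  = shufflevector <4 x ptr> %insert, <4 x ptr> poison,
    //             <4 x i32> zeroinitializer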
1351 | SmallVector<Value *> Ops; |
1352 | for (unsigned I = 0, E = getNumOperands(); I != E; I++) |
1353 | Ops.push_back(Elt: State.get(Def: getOperand(N: I), Instance: VPIteration(0, 0))); |
1354 | |
1355 | auto *NewGEP = |
1356 | State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr: Ops[0], |
1357 | IdxList: ArrayRef(Ops).drop_front(), Name: "" , IsInBounds: isInBounds()); |
1358 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1359 | Value *EntryPart = State.Builder.CreateVectorSplat(EC: State.VF, V: NewGEP); |
1360 | State.set(Def: this, V: EntryPart, Part); |
1361 | State.addMetadata(To: EntryPart, From: GEP); |
1362 | } |
1363 | } else { |
1364 | // If the GEP has at least one loop-varying operand, we are sure to |
1365 | // produce a vector of pointers. But if we are only unrolling, we want |
1366 | // to produce a scalar GEP for each unroll part. Thus, the GEP we |
1367 | // produce with the code below will be scalar (if VF == 1) or vector |
1368 | // (otherwise). Note that for the unroll-only case, we still maintain |
1369 | // values in the vector mapping with initVector, as we do for other |
1370 | // instructions. |
1371 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1372 | // The pointer operand of the new GEP. If it's loop-invariant, we |
1373 | // won't broadcast it. |
1374 | auto *Ptr = isPointerLoopInvariant() |
1375 | ? State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0)) |
1376 | : State.get(Def: getOperand(N: 0), Part); |
1377 | |
1378 | // Collect all the indices for the new GEP. If any index is |
1379 | // loop-invariant, we won't broadcast it. |
1380 | SmallVector<Value *, 4> Indices; |
1381 | for (unsigned I = 1, E = getNumOperands(); I < E; I++) { |
1382 | VPValue *Operand = getOperand(N: I); |
1383 | if (isIndexLoopInvariant(I: I - 1)) |
1384 | Indices.push_back(Elt: State.get(Def: Operand, Instance: VPIteration(0, 0))); |
1385 | else |
1386 | Indices.push_back(Elt: State.get(Def: Operand, Part)); |
1387 | } |
1388 | |
      // Create the new GEP. Note that this GEP is a scalar if VF == 1, and a
      // vector otherwise.
1391 | auto *NewGEP = State.Builder.CreateGEP(Ty: GEP->getSourceElementType(), Ptr, |
1392 | IdxList: Indices, Name: "" , IsInBounds: isInBounds()); |
1393 | assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) && |
1394 | "NewGEP is not a pointer vector" ); |
1395 | State.set(Def: this, V: NewGEP, Part); |
1396 | State.addMetadata(To: NewGEP, From: GEP); |
1397 | } |
1398 | } |
1399 | } |
1400 | |
1401 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1402 | void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent, |
1403 | VPSlotTracker &SlotTracker) const { |
1404 | O << Indent << "WIDEN-GEP " ; |
1405 | O << (isPointerLoopInvariant() ? "Inv" : "Var" ); |
1406 | for (size_t I = 0; I < getNumOperands() - 1; ++I) |
1407 | O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var" ) << "]" ; |
1408 | |
1409 | O << " " ; |
1410 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
1411 | O << " = getelementptr" ; |
1412 | printFlags(O); |
1413 | printOperands(O, SlotTracker); |
1414 | } |
1415 | #endif |
1416 | |
void VPVectorPointerRecipe::execute(VPTransformState &State) {
1418 | auto &Builder = State.Builder; |
1419 | State.setDebugLocFrom(getDebugLoc()); |
1420 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1421 | // Calculate the pointer for the specific unroll-part. |
1422 | Value *PartPtr = nullptr; |
1423 | // Use i32 for the gep index type when the value is constant, |
1424 | // or query DataLayout for a more suitable index type otherwise. |
1425 | const DataLayout &DL = |
1426 | Builder.GetInsertBlock()->getModule()->getDataLayout(); |
1427 | Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0) |
1428 | ? DL.getIndexType(PtrTy: IndexedTy->getPointerTo()) |
1429 | : Builder.getInt32Ty(); |
1430 | Value *Ptr = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0)); |
1431 | bool InBounds = isInBounds(); |
1432 | if (IsReverse) { |
1433 | // If the address is consecutive but reversed, then the |
1434 | // wide store needs to start at the last vector element. |
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue().
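      // Worked example (a fixed-width sketch): with VF = 4, RunTimeVF = 4, so
      // Part 1 gets NumElt = -4 and LastLane = -3; the two GEPs below then
      // address Ptr - 7, the lowest element covered by the reversed part.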
1437 | Value *RunTimeVF = getRuntimeVF(B&: Builder, Ty: IndexTy, VF: State.VF); |
1438 | // NumElt = -Part * RunTimeVF |
1439 | Value *NumElt = Builder.CreateMul( |
1440 | LHS: ConstantInt::get(Ty: IndexTy, V: -(int64_t)Part), RHS: RunTimeVF); |
1441 | // LastLane = 1 - RunTimeVF |
1442 | Value *LastLane = |
1443 | Builder.CreateSub(LHS: ConstantInt::get(Ty: IndexTy, V: 1), RHS: RunTimeVF); |
1444 | PartPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: NumElt, Name: "" , IsInBounds: InBounds); |
1445 | PartPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr: PartPtr, IdxList: LastLane, Name: "" , IsInBounds: InBounds); |
1446 | } else { |
1447 | Value *Increment = createStepForVF(B&: Builder, Ty: IndexTy, VF: State.VF, Step: Part); |
1448 | PartPtr = Builder.CreateGEP(Ty: IndexedTy, Ptr, IdxList: Increment, Name: "" , IsInBounds: InBounds); |
1449 | } |
1450 | |
1451 | State.set(Def: this, V: PartPtr, Part, /*IsScalar*/ true); |
1452 | } |
1453 | } |
1454 | |
1455 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1456 | void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent, |
1457 | VPSlotTracker &SlotTracker) const { |
1458 | O << Indent; |
1459 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
1460 | O << " = vector-pointer " ; |
1461 | if (IsReverse) |
1462 | O << "(reverse) " ; |
1463 | |
1464 | printOperands(O, SlotTracker); |
1465 | } |
1466 | #endif |
1467 | |
1468 | void VPBlendRecipe::execute(VPTransformState &State) { |
1469 | State.setDebugLocFrom(getDebugLoc()); |
1470 | // We know that all PHIs in non-header blocks are converted into |
1471 | // selects, so we don't have to worry about the insertion order and we |
1472 | // can just use the builder. |
1473 | // At this point we generate the predication tree. There may be |
1474 | // duplications since this is a simple recursive scan, but future |
1475 | // optimizations will clean it up. |
1476 | |
1477 | unsigned NumIncoming = getNumIncomingValues(); |
1478 | |
1479 | // Generate a sequence of selects of the form: |
1480 | // SELECT(Mask3, In3, |
1481 | // SELECT(Mask2, In2, |
1482 | // SELECT(Mask1, In1, |
1483 | // In0))) |
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // (and which are essentially undef) take their value from In0.
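  // For instance (a sketch with hypothetical names), with two incoming values
  // and VF = 4 the loop below degenerates to a single select per part:
  //   %predphi = select <4 x i1> %mask1, <4 x i32> %in1, <4 x i32> %in0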
1486 | VectorParts Entry(State.UF); |
1487 | for (unsigned In = 0; In < NumIncoming; ++In) { |
1488 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
      // We might have single-edge PHIs (blocks); use an identity 'select' for
      // the first PHI operand.
1491 | Value *In0 = State.get(Def: getIncomingValue(Idx: In), Part); |
1492 | if (In == 0) |
1493 | Entry[Part] = In0; // Initialize with the first incoming value. |
1494 | else { |
1495 | // Select between the current value and the previous incoming edge |
1496 | // based on the incoming mask. |
1497 | Value *Cond = State.get(Def: getMask(Idx: In), Part); |
1498 | Entry[Part] = |
1499 | State.Builder.CreateSelect(C: Cond, True: In0, False: Entry[Part], Name: "predphi" ); |
1500 | } |
1501 | } |
1502 | } |
1503 | for (unsigned Part = 0; Part < State.UF; ++Part) |
1504 | State.set(Def: this, V: Entry[Part], Part); |
1505 | } |
1506 | |
1507 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1508 | void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent, |
1509 | VPSlotTracker &SlotTracker) const { |
1510 | O << Indent << "BLEND " ; |
1511 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
1512 | O << " =" ; |
1513 | if (getNumIncomingValues() == 1) { |
1514 | // Not a User of any mask: not really blending, this is a |
1515 | // single-predecessor phi. |
1516 | O << " " ; |
1517 | getIncomingValue(Idx: 0)->printAsOperand(OS&: O, Tracker&: SlotTracker); |
1518 | } else { |
1519 | for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) { |
1520 | O << " " ; |
1521 | getIncomingValue(Idx: I)->printAsOperand(OS&: O, Tracker&: SlotTracker); |
1522 | if (I == 0) |
1523 | continue; |
1524 | O << "/" ; |
1525 | getMask(Idx: I)->printAsOperand(OS&: O, Tracker&: SlotTracker); |
1526 | } |
1527 | } |
1528 | } |
1529 | #endif |
1530 | |
1531 | void VPReductionRecipe::execute(VPTransformState &State) { |
1532 | assert(!State.Instance && "Reduction being replicated." ); |
1533 | Value *PrevInChain = State.get(Def: getChainOp(), Part: 0, /*IsScalar*/ true); |
1534 | RecurKind Kind = RdxDesc.getRecurrenceKind(); |
1535 | // Propagate the fast-math flags carried by the underlying instruction. |
1536 | IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder); |
1537 | State.Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); |
1538 | for (unsigned Part = 0; Part < State.UF; ++Part) { |
1539 | Value *NewVecOp = State.get(Def: getVecOp(), Part); |
1540 | if (VPValue *Cond = getCondOp()) { |
1541 | Value *NewCond = State.get(Def: Cond, Part, IsScalar: State.VF.isScalar()); |
1542 | VectorType *VecTy = dyn_cast<VectorType>(Val: NewVecOp->getType()); |
1543 | Type *ElementTy = VecTy ? VecTy->getElementType() : NewVecOp->getType(); |
1544 | Value *Iden = RdxDesc.getRecurrenceIdentity(K: Kind, Tp: ElementTy, |
1545 | FMF: RdxDesc.getFastMathFlags()); |
1546 | if (State.VF.isVector()) { |
1547 | Iden = State.Builder.CreateVectorSplat(EC: VecTy->getElementCount(), V: Iden); |
1548 | } |
1549 | |
1550 | Value *Select = State.Builder.CreateSelect(C: NewCond, True: NewVecOp, False: Iden); |
1551 | NewVecOp = Select; |
1552 | } |
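    // E.g., for a masked integer add reduction with VF = 4 (a sketch), Iden is
    // zero and the select is
    //   %sel = select <4 x i1> %cond, <4 x i32> %vec, <4 x i32> zeroinitializer
    // so masked-out lanes contribute the identity and leave the result
    // unchanged.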
1553 | Value *NewRed; |
1554 | Value *NextInChain; |
1555 | if (IsOrdered) { |
1556 | if (State.VF.isVector()) |
1557 | NewRed = createOrderedReduction(B&: State.Builder, Desc: RdxDesc, Src: NewVecOp, |
1558 | Start: PrevInChain); |
1559 | else |
1560 | NewRed = State.Builder.CreateBinOp( |
1561 | Opc: (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), LHS: PrevInChain, |
1562 | RHS: NewVecOp); |
1563 | PrevInChain = NewRed; |
1564 | } else { |
1565 | PrevInChain = State.get(Def: getChainOp(), Part, /*IsScalar*/ true); |
1566 | NewRed = createTargetReduction(B&: State.Builder, Desc: RdxDesc, Src: NewVecOp); |
1567 | } |
1568 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) { |
1569 | NextInChain = createMinMaxOp(Builder&: State.Builder, RK: RdxDesc.getRecurrenceKind(), |
1570 | Left: NewRed, Right: PrevInChain); |
1571 | } else if (IsOrdered) |
1572 | NextInChain = NewRed; |
1573 | else |
1574 | NextInChain = State.Builder.CreateBinOp( |
1575 | Opc: (Instruction::BinaryOps)RdxDesc.getOpcode(Kind), LHS: NewRed, RHS: PrevInChain); |
1576 | State.set(Def: this, V: NextInChain, Part, /*IsScalar*/ true); |
1577 | } |
1578 | } |
1579 | |
1580 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1581 | void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent, |
1582 | VPSlotTracker &SlotTracker) const { |
1583 | O << Indent << "REDUCE " ; |
1584 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
1585 | O << " = " ; |
1586 | getChainOp()->printAsOperand(OS&: O, Tracker&: SlotTracker); |
1587 | O << " +" ; |
1588 | if (isa<FPMathOperator>(Val: getUnderlyingInstr())) |
1589 | O << getUnderlyingInstr()->getFastMathFlags(); |
1590 | O << " reduce." << Instruction::getOpcodeName(Opcode: RdxDesc.getOpcode()) << " (" ; |
1591 | getVecOp()->printAsOperand(OS&: O, Tracker&: SlotTracker); |
1592 | if (getCondOp()) { |
1593 | O << ", " ; |
1594 | getCondOp()->printAsOperand(OS&: O, Tracker&: SlotTracker); |
1595 | } |
1596 | O << ")" ; |
1597 | if (RdxDesc.IntermediateStore) |
1598 | O << " (with final reduction value stored in invariant address sank " |
1599 | "outside of loop)" ; |
1600 | } |
1601 | #endif |
1602 | |
1603 | bool VPReplicateRecipe::shouldPack() const { |
1604 | // Find if the recipe is used by a widened recipe via an intervening |
1605 | // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector. |
1606 | return any_of(Range: users(), P: [](const VPUser *U) { |
1607 | if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(Val: U)) |
1608 | return any_of(Range: PredR->users(), P: [PredR](const VPUser *U) { |
1609 | return !U->usesScalars(Op: PredR); |
1610 | }); |
1611 | return false; |
1612 | }); |
1613 | } |
1614 | |
1615 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1616 | void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent, |
1617 | VPSlotTracker &SlotTracker) const { |
1618 | O << Indent << (IsUniform ? "CLONE " : "REPLICATE " ); |
1619 | |
1620 | if (!getUnderlyingInstr()->getType()->isVoidTy()) { |
1621 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
1622 | O << " = " ; |
1623 | } |
1624 | if (auto *CB = dyn_cast<CallBase>(Val: getUnderlyingInstr())) { |
1625 | O << "call" ; |
1626 | printFlags(O); |
1627 | O << "@" << CB->getCalledFunction()->getName() << "(" ; |
1628 | interleaveComma(c: make_range(x: op_begin(), y: op_begin() + (getNumOperands() - 1)), |
1629 | os&: O, each_fn: [&O, &SlotTracker](VPValue *Op) { |
1630 | Op->printAsOperand(OS&: O, Tracker&: SlotTracker); |
1631 | }); |
1632 | O << ")" ; |
1633 | } else { |
1634 | O << Instruction::getOpcodeName(Opcode: getUnderlyingInstr()->getOpcode()); |
1635 | printFlags(O); |
1636 | printOperands(O, SlotTracker); |
1637 | } |
1638 | |
1639 | if (shouldPack()) |
1640 | O << " (S->V)" ; |
1641 | } |
1642 | #endif |
1643 | |
1644 | /// Checks if \p C is uniform across all VFs and UFs. It is considered as such |
1645 | /// if it is either defined outside the vector region or its operand is known to |
1646 | /// be uniform across all VFs and UFs (e.g. VPDerivedIV or VPCanonicalIVPHI). |
1647 | /// TODO: Uniformity should be associated with a VPValue and there should be a |
1648 | /// generic way to check. |
1649 | static bool isUniformAcrossVFsAndUFs(VPScalarCastRecipe *C) { |
1650 | return C->isDefinedOutsideVectorRegions() || |
1651 | isa<VPDerivedIVRecipe>(Val: C->getOperand(N: 0)) || |
1652 | isa<VPCanonicalIVPHIRecipe>(Val: C->getOperand(N: 0)); |
1653 | } |
1654 | |
Value *VPScalarCastRecipe::generate(VPTransformState &State, unsigned Part) {
1656 | assert(vputils::onlyFirstLaneUsed(this) && |
1657 | "Codegen only implemented for first lane." ); |
1658 | switch (Opcode) { |
1659 | case Instruction::SExt: |
1660 | case Instruction::ZExt: |
1661 | case Instruction::Trunc: { |
1662 | // Note: SExt/ZExt not used yet. |
1663 | Value *Op = State.get(Def: getOperand(N: 0), Instance: VPIteration(Part, 0)); |
1664 | return State.Builder.CreateCast(Op: Instruction::CastOps(Opcode), V: Op, DestTy: ResultTy); |
1665 | } |
1666 | default: |
1667 | llvm_unreachable("opcode not implemented yet" ); |
1668 | } |
1669 | } |
1670 | |
void VPScalarCastRecipe::execute(VPTransformState &State) {
1672 | bool IsUniformAcrossVFsAndUFs = isUniformAcrossVFsAndUFs(C: this); |
1673 | for (unsigned Part = 0; Part != State.UF; ++Part) { |
1674 | Value *Res; |
    // Only generate a single instance if the recipe is uniform across UFs and
    // VFs.
1677 | if (Part > 0 && IsUniformAcrossVFsAndUFs) |
1678 | Res = State.get(Def: this, Instance: VPIteration(0, 0)); |
1679 | else |
1680 | Res = generate(State, Part); |
1681 | State.set(Def: this, V: Res, Instance: VPIteration(Part, 0)); |
1682 | } |
1683 | } |
1684 | |
1685 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
void VPScalarCastRecipe::print(raw_ostream &O, const Twine &Indent,
1687 | VPSlotTracker &SlotTracker) const { |
1688 | O << Indent << "SCALAR-CAST " ; |
1689 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
1690 | O << " = " << Instruction::getOpcodeName(Opcode) << " " ; |
1691 | printOperands(O, SlotTracker); |
1692 | O << " to " << *ResultTy; |
1693 | } |
1694 | #endif |
1695 | |
1696 | void VPBranchOnMaskRecipe::execute(VPTransformState &State) { |
1697 | assert(State.Instance && "Branch on Mask works only on single instance." ); |
1698 | |
1699 | unsigned Part = State.Instance->Part; |
1700 | unsigned Lane = State.Instance->Lane.getKnownLane(); |
1701 | |
1702 | Value *ConditionBit = nullptr; |
1703 | VPValue *BlockInMask = getMask(); |
1704 | if (BlockInMask) { |
1705 | ConditionBit = State.get(Def: BlockInMask, Part); |
1706 | if (ConditionBit->getType()->isVectorTy()) |
1707 | ConditionBit = State.Builder.CreateExtractElement( |
1708 | Vec: ConditionBit, Idx: State.Builder.getInt32(C: Lane)); |
  } else // Block in mask is all-ones.
1710 | ConditionBit = State.Builder.getTrue(); |
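  // Taken together, the extract above (assuming a vector mask) and the branch
  // created below amount to the following sketch (hypothetical names) for
  // lane 2:
  //   %cond = extractelement <4 x i1> %mask, i32 2
  //   br i1 %cond, label %pred.if, label %pred.continue
  // where both destinations are filled in later.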
1711 | |
1712 | // Replace the temporary unreachable terminator with a new conditional branch, |
1713 | // whose two destinations will be set later when they are created. |
1714 | auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); |
1715 | assert(isa<UnreachableInst>(CurrentTerminator) && |
1716 | "Expected to replace unreachable terminator with conditional branch." ); |
1717 | auto *CondBr = BranchInst::Create(IfTrue: State.CFG.PrevBB, IfFalse: nullptr, Cond: ConditionBit); |
1718 | CondBr->setSuccessor(idx: 0, NewSucc: nullptr); |
1719 | ReplaceInstWithInst(From: CurrentTerminator, To: CondBr); |
1720 | } |
1721 | |
1722 | void VPPredInstPHIRecipe::execute(VPTransformState &State) { |
1723 | assert(State.Instance && "Predicated instruction PHI works per instance." ); |
1724 | Instruction *ScalarPredInst = |
1725 | cast<Instruction>(Val: State.get(Def: getOperand(N: 0), Instance: *State.Instance)); |
1726 | BasicBlock *PredicatedBB = ScalarPredInst->getParent(); |
1727 | BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor(); |
1728 | assert(PredicatingBB && "Predicated block has no single predecessor." ); |
1729 | assert(isa<VPReplicateRecipe>(getOperand(0)) && |
1730 | "operand must be VPReplicateRecipe" ); |
1731 | |
1732 | // By current pack/unpack logic we need to generate only a single phi node: if |
1733 | // a vector value for the predicated instruction exists at this point it means |
1734 | // the instruction has vector users only, and a phi for the vector value is |
1735 | // needed. In this case the recipe of the predicated instruction is marked to |
1736 | // also do that packing, thereby "hoisting" the insert-element sequence. |
1737 | // Otherwise, a phi node for the scalar value is needed. |
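  // Sketch of the vector case (illustrative names and types): if %iei is the
  // insert-element emitted in the predicated block, the phi created below is
  //   %vphi = phi <4 x i32> [ %vec.prev, %predicating ], [ %iei, %predicated ]
  // so subsequent predicated lanes keep inserting into the correct vector.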
1738 | unsigned Part = State.Instance->Part; |
1739 | if (State.hasVectorValue(Def: getOperand(N: 0), Part)) { |
1740 | Value *VectorValue = State.get(Def: getOperand(N: 0), Part); |
1741 | InsertElementInst *IEI = cast<InsertElementInst>(Val: VectorValue); |
1742 | PHINode *VPhi = State.Builder.CreatePHI(Ty: IEI->getType(), NumReservedValues: 2); |
1743 | VPhi->addIncoming(V: IEI->getOperand(i_nocapture: 0), BB: PredicatingBB); // Unmodified vector. |
1744 | VPhi->addIncoming(V: IEI, BB: PredicatedBB); // New vector with inserted element. |
1745 | if (State.hasVectorValue(Def: this, Part)) |
1746 | State.reset(Def: this, V: VPhi, Part); |
1747 | else |
1748 | State.set(Def: this, V: VPhi, Part); |
1749 | // NOTE: Currently we need to update the value of the operand, so the next |
1750 | // predicated iteration inserts its generated value in the correct vector. |
1751 | State.reset(Def: getOperand(N: 0), V: VPhi, Part); |
1752 | } else { |
1753 | Type *PredInstType = getOperand(N: 0)->getUnderlyingValue()->getType(); |
1754 | PHINode *Phi = State.Builder.CreatePHI(Ty: PredInstType, NumReservedValues: 2); |
1755 | Phi->addIncoming(V: PoisonValue::get(T: ScalarPredInst->getType()), |
1756 | BB: PredicatingBB); |
1757 | Phi->addIncoming(V: ScalarPredInst, BB: PredicatedBB); |
1758 | if (State.hasScalarValue(Def: this, Instance: *State.Instance)) |
1759 | State.reset(Def: this, V: Phi, Instance: *State.Instance); |
1760 | else |
1761 | State.set(Def: this, V: Phi, Instance: *State.Instance); |
1762 | // NOTE: Currently we need to update the value of the operand, so the next |
1763 | // predicated iteration inserts its generated value in the correct vector. |
1764 | State.reset(Def: getOperand(N: 0), V: Phi, Instance: *State.Instance); |
1765 | } |
1766 | } |
1767 | |
1768 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1769 | void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
1770 | VPSlotTracker &SlotTracker) const { |
1771 | O << Indent << "PHI-PREDICATED-INSTRUCTION " ; |
1772 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
1773 | O << " = " ; |
1774 | printOperands(O, SlotTracker); |
1775 | } |
1776 | |
1777 | void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, |
1778 | VPSlotTracker &SlotTracker) const { |
1779 | O << Indent << "WIDEN " ; |
1780 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
1781 | O << " = load " ; |
1782 | printOperands(O, SlotTracker); |
1783 | } |
1784 | |
1785 | void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
1786 | VPSlotTracker &SlotTracker) const { |
1787 | O << Indent << "WIDEN " ; |
1788 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
1789 | O << " = vp.load " ; |
1790 | printOperands(O, SlotTracker); |
1791 | } |
1792 | |
1793 | void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, |
1794 | VPSlotTracker &SlotTracker) const { |
1795 | O << Indent << "WIDEN store " ; |
1796 | printOperands(O, SlotTracker); |
1797 | } |
1798 | |
1799 | void VPWidenStoreEVLRecipe::print(raw_ostream &O, const Twine &Indent, |
1800 | VPSlotTracker &SlotTracker) const { |
1801 | O << Indent << "WIDEN vp.store " ; |
1802 | printOperands(O, SlotTracker); |
1803 | } |
1804 | #endif |
1805 | |
1806 | void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { |
1807 | Value *Start = getStartValue()->getLiveInIRValue(); |
1808 | PHINode *EntryPart = PHINode::Create(Ty: Start->getType(), NumReservedValues: 2, NameStr: "index" ); |
1809 | EntryPart->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt()); |
1810 | |
1811 | BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this); |
1812 | EntryPart->addIncoming(V: Start, BB: VectorPH); |
1813 | EntryPart->setDebugLoc(getDebugLoc()); |
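  // Sketch of the phi built above (illustrative; the backedge incoming is
  // added later, outside this recipe):
  //   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]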
1814 | for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) |
1815 | State.set(Def: this, V: EntryPart, Part, /*IsScalar*/ true); |
1816 | } |
1817 | |
1818 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1819 | void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
1820 | VPSlotTracker &SlotTracker) const { |
1821 | O << Indent << "EMIT " ; |
1822 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
1823 | O << " = CANONICAL-INDUCTION " ; |
1824 | printOperands(O, SlotTracker); |
1825 | } |
1826 | #endif |
1827 | |
1828 | bool VPCanonicalIVPHIRecipe::isCanonical( |
1829 | InductionDescriptor::InductionKind Kind, VPValue *Start, |
1830 | VPValue *Step) const { |
1831 | // Must be an integer induction. |
1832 | if (Kind != InductionDescriptor::IK_IntInduction) |
1833 | return false; |
1834 | // Start must match the start value of this canonical induction. |
1835 | if (Start != getStartValue()) |
1836 | return false; |
1837 | |
1838 | // If the step is defined by a recipe, it is not a ConstantInt. |
1839 | if (Step->getDefiningRecipe()) |
1840 | return false; |
1841 | |
1842 | ConstantInt *StepC = dyn_cast<ConstantInt>(Val: Step->getLiveInIRValue()); |
1843 | return StepC && StepC->isOne(); |
1844 | } |
1845 | |
1846 | bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool IsScalable) { |
1847 | return IsScalarAfterVectorization && |
1848 | (!IsScalable || vputils::onlyFirstLaneUsed(Def: this)); |
1849 | } |
1850 | |
1851 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1852 | void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, |
1853 | VPSlotTracker &SlotTracker) const { |
1854 | O << Indent << "EMIT " ; |
1855 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
1856 | O << " = WIDEN-POINTER-INDUCTION " ; |
1857 | getStartValue()->printAsOperand(OS&: O, Tracker&: SlotTracker); |
1858 | O << ", " << *IndDesc.getStep(); |
1859 | } |
1860 | #endif |
1861 | |
1862 | void VPExpandSCEVRecipe::execute(VPTransformState &State) { |
1863 | assert(!State.Instance && "cannot be used in per-lane" ); |
1864 | const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout(); |
1865 | SCEVExpander Exp(SE, DL, "induction" ); |
1866 | |
1867 | Value *Res = Exp.expandCodeFor(SH: Expr, Ty: Expr->getType(), |
1868 | I: &*State.Builder.GetInsertPoint()); |
1869 | assert(!State.ExpandedSCEVs.contains(Expr) && |
1870 | "Same SCEV expanded multiple times" ); |
1871 | State.ExpandedSCEVs[Expr] = Res; |
1872 | for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) |
1873 | State.set(Def: this, V: Res, Instance: {Part, 0}); |
1874 | } |
1875 | |
1876 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1877 | void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, |
1878 | VPSlotTracker &SlotTracker) const { |
1879 | O << Indent << "EMIT " ; |
1880 | getVPSingleValue()->printAsOperand(OS&: O, Tracker&: SlotTracker); |
1881 | O << " = EXPAND SCEV " << *Expr; |
1882 | } |
1883 | #endif |
1884 | |
1885 | void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { |
1886 | Value *CanonicalIV = State.get(Def: getOperand(N: 0), Part: 0, /*IsScalar*/ true); |
1887 | Type *STy = CanonicalIV->getType(); |
1888 | IRBuilder<> Builder(State.CFG.PrevBB->getTerminator()); |
1889 | ElementCount VF = State.VF; |
1890 | Value *VStart = VF.isScalar() |
1891 | ? CanonicalIV |
1892 | : Builder.CreateVectorSplat(EC: VF, V: CanonicalIV, Name: "broadcast" ); |
1893 | for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { |
1894 | Value *VStep = createStepForVF(B&: Builder, Ty: STy, VF, Step: Part); |
1895 | if (VF.isVector()) { |
1896 | VStep = Builder.CreateVectorSplat(EC: VF, V: VStep); |
1897 | VStep = |
1898 | Builder.CreateAdd(LHS: VStep, RHS: Builder.CreateStepVector(DstType: VStep->getType())); |
1899 | } |
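    // E.g., for fixed-width VF = 4 (a sketch): part 0 below becomes
    // <%iv, %iv+1, %iv+2, %iv+3> and part 1 adds a splat of 4 on top, giving
    // <%iv+4, %iv+5, %iv+6, %iv+7>.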
1900 | Value *CanonicalVectorIV = Builder.CreateAdd(LHS: VStart, RHS: VStep, Name: "vec.iv" ); |
1901 | State.set(Def: this, V: CanonicalVectorIV, Part); |
1902 | } |
1903 | } |
1904 | |
1905 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1906 | void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, |
1907 | VPSlotTracker &SlotTracker) const { |
1908 | O << Indent << "EMIT " ; |
1909 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
1910 | O << " = WIDEN-CANONICAL-INDUCTION " ; |
1911 | printOperands(O, SlotTracker); |
1912 | } |
1913 | #endif |
1914 | |
1915 | void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) { |
1916 | auto &Builder = State.Builder; |
1917 | // Create a vector from the initial value. |
1918 | auto *VectorInit = getStartValue()->getLiveInIRValue(); |
1919 | |
1920 | Type *VecTy = State.VF.isScalar() |
1921 | ? VectorInit->getType() |
1922 | : VectorType::get(ElementType: VectorInit->getType(), EC: State.VF); |
1923 | |
1924 | BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this); |
1925 | if (State.VF.isVector()) { |
1926 | auto *IdxTy = Builder.getInt32Ty(); |
1927 | auto *One = ConstantInt::get(Ty: IdxTy, V: 1); |
1928 | IRBuilder<>::InsertPointGuard Guard(Builder); |
1929 | Builder.SetInsertPoint(VectorPH->getTerminator()); |
1930 | auto *RuntimeVF = getRuntimeVF(B&: Builder, Ty: IdxTy, VF: State.VF); |
1931 | auto *LastIdx = Builder.CreateSub(LHS: RuntimeVF, RHS: One); |
1932 | VectorInit = Builder.CreateInsertElement( |
1933 | Vec: PoisonValue::get(T: VecTy), NewElt: VectorInit, Idx: LastIdx, Name: "vector.recur.init" ); |
1934 | } |
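  // Sketch for VF = 4 (assuming an i32 recurrence): LastIdx is 3 and the
  // initial vector is
  //   %vector.recur.init = insertelement <4 x i32> poison, i32 %init, i32 3
  // i.e. the scalar start value occupies only the last lane.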
1935 | |
1936 | // Create a phi node for the new recurrence. |
1937 | PHINode *EntryPart = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vector.recur" ); |
1938 | EntryPart->insertBefore(InsertPos: State.CFG.PrevBB->getFirstInsertionPt()); |
1939 | EntryPart->addIncoming(V: VectorInit, BB: VectorPH); |
1940 | State.set(Def: this, V: EntryPart, Part: 0); |
1941 | } |
1942 | |
1943 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1944 | void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent, |
1945 | VPSlotTracker &SlotTracker) const { |
1946 | O << Indent << "FIRST-ORDER-RECURRENCE-PHI " ; |
1947 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
1948 | O << " = phi " ; |
1949 | printOperands(O, SlotTracker); |
1950 | } |
1951 | #endif |
1952 | |
1953 | void VPReductionPHIRecipe::execute(VPTransformState &State) { |
1954 | auto &Builder = State.Builder; |
1955 | |
  // Reductions do not have to start at zero. They can start with any
  // loop-invariant value.
1958 | VPValue *StartVPV = getStartValue(); |
1959 | Value *StartV = StartVPV->getLiveInIRValue(); |
1960 | |
1961 | // In order to support recurrences we need to be able to vectorize Phi nodes. |
1962 | // Phi nodes have cycles, so we need to vectorize them in two stages. This is |
1963 | // stage #1: We create a new vector PHI node with no incoming edges. We'll use |
1964 | // this value when we vectorize all of the instructions that use the PHI. |
1965 | bool ScalarPHI = State.VF.isScalar() || IsInLoop; |
1966 | Type *VecTy = ScalarPHI ? StartV->getType() |
1967 | : VectorType::get(ElementType: StartV->getType(), EC: State.VF); |
1968 | |
  BasicBlock *HeaderBB = State.CFG.PrevBB;
1970 | assert(State.CurrentVectorLoop->getHeader() == HeaderBB && |
1971 | "recipe must be in the vector loop header" ); |
1972 | unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF; |
1973 | for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { |
1974 | Instruction *EntryPart = PHINode::Create(Ty: VecTy, NumReservedValues: 2, NameStr: "vec.phi" ); |
1975 | EntryPart->insertBefore(InsertPos: HeaderBB->getFirstInsertionPt()); |
1976 | State.set(Def: this, V: EntryPart, Part, IsScalar: IsInLoop); |
1977 | } |
1978 | |
1979 | BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this); |
1980 | |
1981 | Value *Iden = nullptr; |
1982 | RecurKind RK = RdxDesc.getRecurrenceKind(); |
1983 | if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK) || |
1984 | RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK)) { |
1985 | // MinMax and AnyOf reductions have the start value as their identity. |
1986 | if (ScalarPHI) { |
1987 | Iden = StartV; |
1988 | } else { |
1989 | IRBuilderBase::InsertPointGuard IPBuilder(Builder); |
1990 | Builder.SetInsertPoint(VectorPH->getTerminator()); |
1991 | StartV = Iden = |
1992 | Builder.CreateVectorSplat(EC: State.VF, V: StartV, Name: "minmax.ident" ); |
1993 | } |
1994 | } else { |
1995 | Iden = RdxDesc.getRecurrenceIdentity(K: RK, Tp: VecTy->getScalarType(), |
1996 | FMF: RdxDesc.getFastMathFlags()); |
1997 | |
1998 | if (!ScalarPHI) { |
1999 | Iden = Builder.CreateVectorSplat(EC: State.VF, V: Iden); |
2000 | IRBuilderBase::InsertPointGuard IPBuilder(Builder); |
2001 | Builder.SetInsertPoint(VectorPH->getTerminator()); |
2002 | Constant *Zero = Builder.getInt32(C: 0); |
2003 | StartV = Builder.CreateInsertElement(Vec: Iden, NewElt: StartV, Idx: Zero); |
2004 | } |
2005 | } |
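  // Sketch for a plain integer add reduction with VF = 4 and scalar start %s:
  // Iden is <4 x i32> zeroinitializer and StartV becomes <%s, 0, 0, 0>, so
  // only lane 0 of the first part carries the start value.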
2006 | |
2007 | for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) { |
2008 | Value *EntryPart = State.get(Def: this, Part, IsScalar: IsInLoop); |
2009 | // Make sure to add the reduction start value only to the |
2010 | // first unroll part. |
2011 | Value *StartVal = (Part == 0) ? StartV : Iden; |
2012 | cast<PHINode>(Val: EntryPart)->addIncoming(V: StartVal, BB: VectorPH); |
2013 | } |
2014 | } |
2015 | |
2016 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2017 | void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
2018 | VPSlotTracker &SlotTracker) const { |
2019 | O << Indent << "WIDEN-REDUCTION-PHI " ; |
2020 | |
2021 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
2022 | O << " = phi " ; |
2023 | printOperands(O, SlotTracker); |
2024 | } |
2025 | #endif |
2026 | |
2027 | void VPWidenPHIRecipe::execute(VPTransformState &State) { |
2028 | assert(EnableVPlanNativePath && |
2029 | "Non-native vplans are not expected to have VPWidenPHIRecipes." ); |
2030 | |
2031 | Value *Op0 = State.get(Def: getOperand(N: 0), Part: 0); |
2032 | Type *VecTy = Op0->getType(); |
2033 | Value *VecPhi = State.Builder.CreatePHI(Ty: VecTy, NumReservedValues: 2, Name: "vec.phi" ); |
2034 | State.set(Def: this, V: VecPhi, Part: 0); |
2035 | } |
2036 | |
2037 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2038 | void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
2039 | VPSlotTracker &SlotTracker) const { |
2040 | O << Indent << "WIDEN-PHI " ; |
2041 | |
2042 | auto *OriginalPhi = cast<PHINode>(Val: getUnderlyingValue()); |
  // Unless all incoming values are modeled in VPlan, print the original PHI
  // directly.
  // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant
  // incoming values as VPValues.
2047 | if (getNumOperands() != OriginalPhi->getNumOperands()) { |
2048 | O << VPlanIngredient(OriginalPhi); |
2049 | return; |
2050 | } |
2051 | |
2052 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
2053 | O << " = phi " ; |
2054 | printOperands(O, SlotTracker); |
2055 | } |
2056 | #endif |
2057 | |
2058 | // TODO: It would be good to use the existing VPWidenPHIRecipe instead and |
2059 | // remove VPActiveLaneMaskPHIRecipe. |
2060 | void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { |
2061 | BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this); |
2062 | for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { |
2063 | Value *StartMask = State.get(Def: getOperand(N: 0), Part); |
2064 | PHINode *EntryPart = |
2065 | State.Builder.CreatePHI(Ty: StartMask->getType(), NumReservedValues: 2, Name: "active.lane.mask" ); |
2066 | EntryPart->addIncoming(V: StartMask, BB: VectorPH); |
2067 | EntryPart->setDebugLoc(getDebugLoc()); |
2068 | State.set(Def: this, V: EntryPart, Part); |
2069 | } |
2070 | } |
2071 | |
2072 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2073 | void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
2074 | VPSlotTracker &SlotTracker) const { |
2075 | O << Indent << "ACTIVE-LANE-MASK-PHI " ; |
2076 | |
2077 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
2078 | O << " = phi " ; |
2079 | printOperands(O, SlotTracker); |
2080 | } |
2081 | #endif |
2082 | |
2083 | void VPEVLBasedIVPHIRecipe::execute(VPTransformState &State) { |
2084 | BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this); |
2085 | assert(State.UF == 1 && "Expected unroll factor 1 for VP vectorization." ); |
2086 | Value *Start = State.get(Def: getOperand(N: 0), Instance: VPIteration(0, 0)); |
2087 | PHINode *EntryPart = |
2088 | State.Builder.CreatePHI(Ty: Start->getType(), NumReservedValues: 2, Name: "evl.based.iv" ); |
2089 | EntryPart->addIncoming(V: Start, BB: VectorPH); |
2090 | EntryPart->setDebugLoc(getDebugLoc()); |
2091 | State.set(Def: this, V: EntryPart, Part: 0, /*IsScalar=*/true); |
2092 | } |
2093 | |
2094 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
2095 | void VPEVLBasedIVPHIRecipe::print(raw_ostream &O, const Twine &Indent, |
2096 | VPSlotTracker &SlotTracker) const { |
2097 | O << Indent << "EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI " ; |
2098 | |
2099 | printAsOperand(OS&: O, Tracker&: SlotTracker); |
2100 | O << " = phi " ; |
2101 | printOperands(O, SlotTracker); |
2102 | } |
2103 | #endif |
2104 | |