1 | //===------- VectorCombine.cpp - Optimize partial vector operations -------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass optimizes scalar/vector interactions using target cost models. The |
10 | // transforms implemented here may not fit in traditional loop-based or SLP |
11 | // vectorization passes. |
12 | // |
13 | //===----------------------------------------------------------------------===// |
14 | |
15 | #include "llvm/Transforms/Vectorize/VectorCombine.h" |
16 | #include "llvm/ADT/DenseMap.h" |
17 | #include "llvm/ADT/ScopeExit.h" |
18 | #include "llvm/ADT/Statistic.h" |
19 | #include "llvm/Analysis/AssumptionCache.h" |
20 | #include "llvm/Analysis/BasicAliasAnalysis.h" |
21 | #include "llvm/Analysis/GlobalsModRef.h" |
22 | #include "llvm/Analysis/Loads.h" |
23 | #include "llvm/Analysis/TargetTransformInfo.h" |
24 | #include "llvm/Analysis/ValueTracking.h" |
25 | #include "llvm/Analysis/VectorUtils.h" |
26 | #include "llvm/IR/Dominators.h" |
27 | #include "llvm/IR/Function.h" |
28 | #include "llvm/IR/IRBuilder.h" |
29 | #include "llvm/IR/PatternMatch.h" |
30 | #include "llvm/Support/CommandLine.h" |
31 | #include "llvm/Transforms/Utils/Local.h" |
32 | #include <numeric> |
33 | #include <queue> |
34 | |
35 | #define DEBUG_TYPE "vector-combine" |
36 | #include "llvm/Transforms/Utils/InstructionWorklist.h" |
37 | |
38 | using namespace llvm; |
39 | using namespace llvm::PatternMatch; |
40 | |
41 | STATISTIC(NumVecLoad, "Number of vector loads formed" ); |
42 | STATISTIC(NumVecCmp, "Number of vector compares formed" ); |
43 | STATISTIC(NumVecBO, "Number of vector binops formed" ); |
44 | STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed" ); |
45 | STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast" ); |
46 | STATISTIC(NumScalarBO, "Number of scalar binops formed" ); |
47 | STATISTIC(NumScalarCmp, "Number of scalar compares formed" ); |
48 | |
49 | static cl::opt<bool> DisableVectorCombine( |
50 | "disable-vector-combine" , cl::init(Val: false), cl::Hidden, |
51 | cl::desc("Disable all vector combine transforms" )); |
52 | |
53 | static cl::opt<bool> DisableBinopExtractShuffle( |
54 | "disable-binop-extract-shuffle" , cl::init(Val: false), cl::Hidden, |
55 | cl::desc("Disable binop extract to shuffle transforms" )); |
56 | |
57 | static cl::opt<unsigned> MaxInstrsToScan( |
58 | "vector-combine-max-scan-instrs" , cl::init(Val: 30), cl::Hidden, |
59 | cl::desc("Max number of instructions to scan for vector combining." )); |
60 | |
61 | static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max(); |
62 | |
63 | namespace { |
64 | class VectorCombine { |
65 | public: |
66 | VectorCombine(Function &F, const TargetTransformInfo &TTI, |
67 | const DominatorTree &DT, AAResults &AA, AssumptionCache &AC, |
68 | bool TryEarlyFoldsOnly) |
69 | : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC), |
70 | TryEarlyFoldsOnly(TryEarlyFoldsOnly) {} |
71 | |
72 | bool run(); |
73 | |
74 | private: |
75 | Function &F; |
76 | IRBuilder<> Builder; |
77 | const TargetTransformInfo &TTI; |
78 | const DominatorTree &DT; |
79 | AAResults &AA; |
80 | AssumptionCache &AC; |
81 | |
82 | /// If true, only perform beneficial early IR transforms. Do not introduce new |
83 | /// vector operations. |
84 | bool TryEarlyFoldsOnly; |
85 | |
86 | InstructionWorklist Worklist; |
87 | |
88 | // TODO: Direct calls from the top-level "run" loop use a plain "Instruction" |
89 | // parameter. That should be updated to specific sub-classes because the |
90 | // run loop was changed to dispatch on opcode. |
91 | bool vectorizeLoadInsert(Instruction &I); |
92 | bool widenSubvectorLoad(Instruction &I); |
93 | ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0, |
94 | ExtractElementInst *Ext1, |
95 |                                         unsigned PreferredExtractIndex) const; |
96 | bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1, |
97 | const Instruction &I, |
98 | ExtractElementInst *&ConvertToShuffle, |
99 |                              unsigned PreferredExtractIndex); |
100 | void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1, |
101 | Instruction &I); |
102 | void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1, |
103 | Instruction &I); |
104 | bool foldExtractExtract(Instruction &I); |
105 | bool foldInsExtFNeg(Instruction &I); |
106 | bool foldBitcastShuffle(Instruction &I); |
107 | bool scalarizeBinopOrCmp(Instruction &I); |
108 | bool scalarizeVPIntrinsic(Instruction &I); |
109 | bool foldExtractedCmps(Instruction &I); |
110 | bool foldSingleElementStore(Instruction &I); |
111 | bool scalarizeLoadExtract(Instruction &I); |
112 | bool foldShuffleOfBinops(Instruction &I); |
113 | bool foldShuffleFromReductions(Instruction &I); |
114 | bool foldSelectShuffle(Instruction &I, bool FromReduction = false); |
115 | |
116 | void replaceValue(Value &Old, Value &New) { |
117 | Old.replaceAllUsesWith(V: &New); |
118 | if (auto *NewI = dyn_cast<Instruction>(Val: &New)) { |
119 | New.takeName(V: &Old); |
120 | Worklist.pushUsersToWorkList(I&: *NewI); |
121 | Worklist.pushValue(V: NewI); |
122 | } |
123 | Worklist.pushValue(V: &Old); |
124 | } |
125 | |
126 | void eraseInstruction(Instruction &I) { |
127 | for (Value *Op : I.operands()) |
128 | Worklist.pushValue(V: Op); |
129 | Worklist.remove(I: &I); |
130 | I.eraseFromParent(); |
131 | } |
132 | }; |
133 | } // namespace |
134 | |
135 | static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) { |
136 | // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan. |
137 | // The widened load may load data from dirty regions or create data races |
138 | // non-existent in the source. |
139 | if (!Load || !Load->isSimple() || !Load->hasOneUse() || |
140 | Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) || |
141 | mustSuppressSpeculation(LI: *Load)) |
142 | return false; |
143 | |
144 | // We are potentially transforming byte-sized (8-bit) memory accesses, so make |
145 | // sure we have all of our type-based constraints in place for this target. |
146 | Type *ScalarTy = Load->getType()->getScalarType(); |
147 | uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); |
148 | unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); |
149 | if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 || |
150 | ScalarSize % 8 != 0) |
151 | return false; |
152 | |
153 | return true; |
154 | } |
155 | |
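// An illustrative example of the transform below (assuming a target whose
// minimum vector register width is 128 bits, so a <4 x float> load is formed):
//   %s = load float, ptr %p
//   %r = insertelement <4 x float> poison, float %s, i64 0
// -->
//   %v = load <4 x float>, ptr %p
//   %r = shufflevector <4 x float> %v, <4 x float> poison,
//        <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
// The wider load is only formed when the extra bytes are known to be
// dereferenceable and the cost model does not report a regression.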
156 | bool VectorCombine::vectorizeLoadInsert(Instruction &I) { |
157 | // Match insert into fixed vector of scalar value. |
158 | // TODO: Handle non-zero insert index. |
159 | Value *Scalar; |
160 | if (!match(V: &I, P: m_InsertElt(Val: m_Undef(), Elt: m_Value(V&: Scalar), Idx: m_ZeroInt())) || |
161 | !Scalar->hasOneUse()) |
162 | return false; |
163 | |
164 | // Optionally match an extract from another vector. |
165 | Value *X; |
166 |   bool HasExtract = match(V: Scalar, P: m_ExtractElt(Val: m_Value(V&: X), Idx: m_ZeroInt())); |
167 | if (!HasExtract) |
168 | X = Scalar; |
169 | |
170 | auto *Load = dyn_cast<LoadInst>(Val: X); |
171 | if (!canWidenLoad(Load, TTI)) |
172 | return false; |
173 | |
174 | Type *ScalarTy = Scalar->getType(); |
175 | uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); |
176 | unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); |
177 | |
178 | // Check safety of replacing the scalar load with a larger vector load. |
179 | // We use minimal alignment (maximum flexibility) because we only care about |
180 | // the dereferenceable region. When calculating cost and creating a new op, |
181 | // we may use a larger value based on alignment attributes. |
182 | const DataLayout &DL = I.getModule()->getDataLayout(); |
183 | Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); |
184 | assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type" ); |
185 | |
186 | unsigned MinVecNumElts = MinVectorSize / ScalarSize; |
187 | auto *MinVecTy = VectorType::get(ElementType: ScalarTy, NumElements: MinVecNumElts, Scalable: false); |
188 | unsigned OffsetEltIndex = 0; |
189 | Align Alignment = Load->getAlign(); |
190 | if (!isSafeToLoadUnconditionally(V: SrcPtr, Ty: MinVecTy, Alignment: Align(1), DL, ScanFrom: Load, AC: &AC, |
191 | DT: &DT)) { |
192 | // It is not safe to load directly from the pointer, but we can still peek |
193 |     // through gep offsets and check if it is safe to load from a base address with |
194 | // updated alignment. If it is, we can shuffle the element(s) into place |
195 | // after loading. |
196 | unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(Ty: SrcPtr->getType()); |
197 | APInt Offset(OffsetBitWidth, 0); |
198 | SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); |
199 | |
200 | // We want to shuffle the result down from a high element of a vector, so |
201 | // the offset must be positive. |
202 | if (Offset.isNegative()) |
203 | return false; |
204 | |
205 | // The offset must be a multiple of the scalar element to shuffle cleanly |
206 | // in the element's size. |
207 | uint64_t ScalarSizeInBytes = ScalarSize / 8; |
208 | if (Offset.urem(RHS: ScalarSizeInBytes) != 0) |
209 | return false; |
210 | |
211 | // If we load MinVecNumElts, will our target element still be loaded? |
212 | OffsetEltIndex = Offset.udiv(RHS: ScalarSizeInBytes).getZExtValue(); |
213 | if (OffsetEltIndex >= MinVecNumElts) |
214 | return false; |
215 | |
216 | if (!isSafeToLoadUnconditionally(V: SrcPtr, Ty: MinVecTy, Alignment: Align(1), DL, ScanFrom: Load, AC: &AC, |
217 | DT: &DT)) |
218 | return false; |
219 | |
220 | // Update alignment with offset value. Note that the offset could be negated |
221 | // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but |
222 | // negation does not change the result of the alignment calculation. |
223 | Alignment = commonAlignment(A: Alignment, Offset: Offset.getZExtValue()); |
224 | } |
225 | |
226 | // Original pattern: insertelt undef, load [free casts of] PtrOp, 0 |
227 | // Use the greater of the alignment on the load or its source pointer. |
228 | Alignment = std::max(a: SrcPtr->getPointerAlignment(DL), b: Alignment); |
229 | Type *LoadTy = Load->getType(); |
230 | unsigned AS = Load->getPointerAddressSpace(); |
231 | InstructionCost OldCost = |
232 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadTy, Alignment, AddressSpace: AS); |
233 | APInt DemandedElts = APInt::getOneBitSet(numBits: MinVecNumElts, BitNo: 0); |
234 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
235 | OldCost += |
236 | TTI.getScalarizationOverhead(Ty: MinVecTy, DemandedElts, |
237 | /* Insert */ true, Extract: HasExtract, CostKind); |
238 | |
239 | // New pattern: load VecPtr |
240 | InstructionCost NewCost = |
241 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: MinVecTy, Alignment, AddressSpace: AS); |
242 | // Optionally, we are shuffling the loaded vector element(s) into place. |
243 | // For the mask set everything but element 0 to undef to prevent poison from |
244 | // propagating from the extra loaded memory. This will also optionally |
245 | // shrink/grow the vector from the loaded size to the output size. |
246 | // We assume this operation has no cost in codegen if there was no offset. |
247 | // Note that we could use freeze to avoid poison problems, but then we might |
248 | // still need a shuffle to change the vector size. |
249 | auto *Ty = cast<FixedVectorType>(Val: I.getType()); |
250 | unsigned OutputNumElts = Ty->getNumElements(); |
251 | SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem); |
252 | assert(OffsetEltIndex < MinVecNumElts && "Address offset too big" ); |
253 | Mask[0] = OffsetEltIndex; |
254 | if (OffsetEltIndex) |
255 | NewCost += TTI.getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: MinVecTy, Mask); |
256 | |
257 | // We can aggressively convert to the vector form because the backend can |
258 | // invert this transform if it does not result in a performance win. |
259 | if (OldCost < NewCost || !NewCost.isValid()) |
260 | return false; |
261 | |
262 | // It is safe and potentially profitable to load a vector directly: |
263 | // inselt undef, load Scalar, 0 --> load VecPtr |
264 | IRBuilder<> Builder(Load); |
265 | Value *CastedPtr = |
266 | Builder.CreatePointerBitCastOrAddrSpaceCast(V: SrcPtr, DestTy: Builder.getPtrTy(AddrSpace: AS)); |
267 | Value *VecLd = Builder.CreateAlignedLoad(Ty: MinVecTy, Ptr: CastedPtr, Align: Alignment); |
268 | VecLd = Builder.CreateShuffleVector(V: VecLd, Mask); |
269 | |
270 | replaceValue(Old&: I, New&: *VecLd); |
271 | ++NumVecLoad; |
272 | return true; |
273 | } |
274 | |
275 | /// If we are loading a vector and then inserting it into a larger vector with |
276 | /// undefined elements, try to load the larger vector and eliminate the insert. |
277 | /// This removes a shuffle in IR and may allow combining of other loaded values. |
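/// An illustrative example (types depend on the actual IR):
///   %v = load <2 x float>, ptr %p
///   %r = shufflevector <2 x float> %v, <2 x float> poison,
///        <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
/// -->
///   %r = load <4 x float>, ptr %p
/// This only fires when the wider region is known dereferenceable and the
/// wider load is not costed higher than the original one.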
278 | bool VectorCombine::widenSubvectorLoad(Instruction &I) { |
279 | // Match subvector insert of fixed vector. |
280 | auto *Shuf = cast<ShuffleVectorInst>(Val: &I); |
281 | if (!Shuf->isIdentityWithPadding()) |
282 | return false; |
283 | |
284 | // Allow a non-canonical shuffle mask that is choosing elements from op1. |
285 | unsigned NumOpElts = |
286 | cast<FixedVectorType>(Val: Shuf->getOperand(i_nocapture: 0)->getType())->getNumElements(); |
287 | unsigned OpIndex = any_of(Range: Shuf->getShuffleMask(), P: [&NumOpElts](int M) { |
288 | return M >= (int)(NumOpElts); |
289 | }); |
290 | |
291 | auto *Load = dyn_cast<LoadInst>(Val: Shuf->getOperand(i_nocapture: OpIndex)); |
292 | if (!canWidenLoad(Load, TTI)) |
293 | return false; |
294 | |
295 | // We use minimal alignment (maximum flexibility) because we only care about |
296 | // the dereferenceable region. When calculating cost and creating a new op, |
297 | // we may use a larger value based on alignment attributes. |
298 | auto *Ty = cast<FixedVectorType>(Val: I.getType()); |
299 | const DataLayout &DL = I.getModule()->getDataLayout(); |
300 | Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); |
301 | assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type" ); |
302 | Align Alignment = Load->getAlign(); |
303 | if (!isSafeToLoadUnconditionally(V: SrcPtr, Ty, Alignment: Align(1), DL, ScanFrom: Load, AC: &AC, DT: &DT)) |
304 | return false; |
305 | |
306 | Alignment = std::max(a: SrcPtr->getPointerAlignment(DL), b: Alignment); |
307 | Type *LoadTy = Load->getType(); |
308 | unsigned AS = Load->getPointerAddressSpace(); |
309 | |
310 | // Original pattern: insert_subvector (load PtrOp) |
311 | // This conservatively assumes that the cost of a subvector insert into an |
312 | // undef value is 0. We could add that cost if the cost model accurately |
313 | // reflects the real cost of that operation. |
314 | InstructionCost OldCost = |
315 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadTy, Alignment, AddressSpace: AS); |
316 | |
317 | // New pattern: load PtrOp |
318 | InstructionCost NewCost = |
319 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: Ty, Alignment, AddressSpace: AS); |
320 | |
321 | // We can aggressively convert to the vector form because the backend can |
322 | // invert this transform if it does not result in a performance win. |
323 | if (OldCost < NewCost || !NewCost.isValid()) |
324 | return false; |
325 | |
326 | IRBuilder<> Builder(Load); |
327 | Value *CastedPtr = |
328 | Builder.CreatePointerBitCastOrAddrSpaceCast(V: SrcPtr, DestTy: Builder.getPtrTy(AddrSpace: AS)); |
329 | Value *VecLd = Builder.CreateAlignedLoad(Ty, Ptr: CastedPtr, Align: Alignment); |
330 | replaceValue(Old&: I, New&: *VecLd); |
331 | ++NumVecLoad; |
332 | return true; |
333 | } |
334 | |
335 | /// Determine which, if any, of the inputs should be replaced by a shuffle |
336 | /// followed by extract from a different index. |
337 | ExtractElementInst *VectorCombine::getShuffleExtract( |
338 | ExtractElementInst *Ext0, ExtractElementInst *Ext1, |
339 |     unsigned PreferredExtractIndex = InvalidIndex) const { |
340 | auto *Index0C = dyn_cast<ConstantInt>(Val: Ext0->getIndexOperand()); |
341 | auto *Index1C = dyn_cast<ConstantInt>(Val: Ext1->getIndexOperand()); |
342 | assert(Index0C && Index1C && "Expected constant extract indexes" ); |
343 | |
344 | unsigned Index0 = Index0C->getZExtValue(); |
345 | unsigned Index1 = Index1C->getZExtValue(); |
346 | |
347 | // If the extract indexes are identical, no shuffle is needed. |
348 | if (Index0 == Index1) |
349 | return nullptr; |
350 | |
351 | Type *VecTy = Ext0->getVectorOperand()->getType(); |
352 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
353 | assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types" ); |
354 | InstructionCost Cost0 = |
355 | TTI.getVectorInstrCost(I: *Ext0, Val: VecTy, CostKind, Index: Index0); |
356 | InstructionCost Cost1 = |
357 | TTI.getVectorInstrCost(I: *Ext1, Val: VecTy, CostKind, Index: Index1); |
358 | |
359 |   // If both costs are invalid, no shuffle is needed. |
360 | if (!Cost0.isValid() && !Cost1.isValid()) |
361 | return nullptr; |
362 | |
363 | // We are extracting from 2 different indexes, so one operand must be shuffled |
364 | // before performing a vector operation and/or extract. The more expensive |
365 | // extract will be replaced by a shuffle. |
366 | if (Cost0 > Cost1) |
367 | return Ext0; |
368 | if (Cost1 > Cost0) |
369 | return Ext1; |
370 | |
371 | // If the costs are equal and there is a preferred extract index, shuffle the |
372 | // opposite operand. |
373 | if (PreferredExtractIndex == Index0) |
374 | return Ext1; |
375 | if (PreferredExtractIndex == Index1) |
376 | return Ext0; |
377 | |
378 | // Otherwise, replace the extract with the higher index. |
379 | return Index0 > Index1 ? Ext0 : Ext1; |
380 | } |
381 | |
382 | /// Compare the relative costs of 2 extracts followed by scalar operation vs. |
383 | /// vector operation(s) followed by extract. Return true if the existing |
384 | /// instructions are cheaper than a vector alternative. Otherwise, return false |
385 | /// and if one of the extracts should be transformed to a shufflevector, set |
386 | /// \p ConvertToShuffle to that extract instruction. |
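/// Roughly (an illustrative summary of the code below, not the exact model):
///   OldCost = Extract0Cost + Extract1Cost + ScalarOpCost
///   NewCost = VectorOpCost + CheapExtractCost (+ a shuffle cost when the
///             extract indexes differ)
/// with extra-use adjustments for extracts that cannot be erased.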
387 | bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, |
388 | ExtractElementInst *Ext1, |
389 | const Instruction &I, |
390 | ExtractElementInst *&ConvertToShuffle, |
391 |                                           unsigned PreferredExtractIndex) { |
392 | auto *Ext0IndexC = dyn_cast<ConstantInt>(Val: Ext0->getOperand(i_nocapture: 1)); |
393 | auto *Ext1IndexC = dyn_cast<ConstantInt>(Val: Ext1->getOperand(i_nocapture: 1)); |
394 | assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes" ); |
395 | |
396 | unsigned Opcode = I.getOpcode(); |
397 | Type *ScalarTy = Ext0->getType(); |
398 | auto *VecTy = cast<VectorType>(Val: Ext0->getOperand(i_nocapture: 0)->getType()); |
399 | InstructionCost ScalarOpCost, VectorOpCost; |
400 | |
401 | // Get cost estimates for scalar and vector versions of the operation. |
402 | bool IsBinOp = Instruction::isBinaryOp(Opcode); |
403 | if (IsBinOp) { |
404 | ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: ScalarTy); |
405 | VectorOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: VecTy); |
406 | } else { |
407 | assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && |
408 | "Expected a compare" ); |
409 | CmpInst::Predicate Pred = cast<CmpInst>(Val: I).getPredicate(); |
410 | ScalarOpCost = TTI.getCmpSelInstrCost( |
411 | Opcode, ValTy: ScalarTy, CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy), VecPred: Pred); |
412 | VectorOpCost = TTI.getCmpSelInstrCost( |
413 | Opcode, ValTy: VecTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VecTy), VecPred: Pred); |
414 | } |
415 | |
416 | // Get cost estimates for the extract elements. These costs will factor into |
417 | // both sequences. |
418 | unsigned Ext0Index = Ext0IndexC->getZExtValue(); |
419 | unsigned Ext1Index = Ext1IndexC->getZExtValue(); |
420 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
421 | |
422 |   InstructionCost Extract0Cost = |
423 | TTI.getVectorInstrCost(I: *Ext0, Val: VecTy, CostKind, Index: Ext0Index); |
424 |   InstructionCost Extract1Cost = |
425 | TTI.getVectorInstrCost(I: *Ext1, Val: VecTy, CostKind, Index: Ext1Index); |
426 | |
427 | // A more expensive extract will always be replaced by a splat shuffle. |
428 | // For example, if Ext0 is more expensive: |
429 | // opcode (extelt V0, Ext0), (ext V1, Ext1) --> |
430 | // extelt (opcode (splat V0, Ext0), V1), Ext1 |
431 | // TODO: Evaluate whether that always results in lowest cost. Alternatively, |
432 | // check the cost of creating a broadcast shuffle and shuffling both |
433 | // operands to element 0. |
434 |   InstructionCost CheapExtractCost = std::min(a: Extract0Cost, b: Extract1Cost); |
435 | |
436 | // Extra uses of the extracts mean that we include those costs in the |
437 | // vector total because those instructions will not be eliminated. |
438 | InstructionCost OldCost, NewCost; |
439 | if (Ext0->getOperand(i_nocapture: 0) == Ext1->getOperand(i_nocapture: 0) && Ext0Index == Ext1Index) { |
440 | // Handle a special case. If the 2 extracts are identical, adjust the |
441 | // formulas to account for that. The extra use charge allows for either the |
442 | // CSE'd pattern or an unoptimized form with identical values: |
443 | // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C |
444 | bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(N: 2) |
445 | : !Ext0->hasOneUse() || !Ext1->hasOneUse(); |
446 | OldCost = CheapExtractCost + ScalarOpCost; |
447 | NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost; |
448 | } else { |
449 | // Handle the general case. Each extract is actually a different value: |
450 | // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C |
451 | OldCost = Extract0Cost + Extract1Cost + ScalarOpCost; |
452 | NewCost = VectorOpCost + CheapExtractCost + |
453 | !Ext0->hasOneUse() * Extract0Cost + |
454 | !Ext1->hasOneUse() * Extract1Cost; |
455 | } |
456 | |
457 | ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex); |
458 | if (ConvertToShuffle) { |
459 | if (IsBinOp && DisableBinopExtractShuffle) |
460 | return true; |
461 | |
462 | // If we are extracting from 2 different indexes, then one operand must be |
463 | // shuffled before performing the vector operation. The shuffle mask is |
464 | // poison except for 1 lane that is being translated to the remaining |
465 | // extraction lane. Therefore, it is a splat shuffle. Ex: |
466 | // ShufMask = { poison, poison, 0, poison } |
467 | // TODO: The cost model has an option for a "broadcast" shuffle |
468 | // (splat-from-element-0), but no option for a more general splat. |
469 | NewCost += |
470 | TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: VecTy); |
471 | } |
472 | |
473 | // Aggressively form a vector op if the cost is equal because the transform |
474 | // may enable further optimization. |
475 | // Codegen can reverse this transform (scalarize) if it was not profitable. |
476 | return OldCost < NewCost; |
477 | } |
478 | |
479 | /// Create a shuffle that translates (shifts) 1 element from the input vector |
480 | /// to a new element location. |
481 | static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, |
482 | unsigned NewIndex, IRBuilder<> &Builder) { |
483 | // The shuffle mask is poison except for 1 lane that is being translated |
484 | // to the new element index. Example for OldIndex == 2 and NewIndex == 0: |
485 | // ShufMask = { 2, poison, poison, poison } |
486 | auto *VecTy = cast<FixedVectorType>(Val: Vec->getType()); |
487 | SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem); |
488 | ShufMask[NewIndex] = OldIndex; |
489 | return Builder.CreateShuffleVector(V: Vec, Mask: ShufMask, Name: "shift" ); |
490 | } |
491 | |
492 | /// Given an extract element instruction with constant index operand, shuffle |
493 | /// the source vector (shift the scalar element) to a NewIndex for extraction. |
494 | /// Return null if the input can be constant folded, so that we are not creating |
495 | /// unnecessary instructions. |
496 | static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, |
497 | unsigned NewIndex, |
498 | IRBuilder<> &Builder) { |
499 | // Shufflevectors can only be created for fixed-width vectors. |
500 | if (!isa<FixedVectorType>(Val: ExtElt->getOperand(i_nocapture: 0)->getType())) |
501 | return nullptr; |
502 | |
503 | // If the extract can be constant-folded, this code is unsimplified. Defer |
504 | // to other passes to handle that. |
505 | Value *X = ExtElt->getVectorOperand(); |
506 | Value *C = ExtElt->getIndexOperand(); |
507 | assert(isa<ConstantInt>(C) && "Expected a constant index operand" ); |
508 | if (isa<Constant>(Val: X)) |
509 | return nullptr; |
510 | |
511 | Value *Shuf = createShiftShuffle(Vec: X, OldIndex: cast<ConstantInt>(Val: C)->getZExtValue(), |
512 | NewIndex, Builder); |
513 | return cast<ExtractElementInst>(Val: Builder.CreateExtractElement(Vec: Shuf, Idx: NewIndex)); |
514 | } |
515 | |
516 | /// Try to reduce extract element costs by converting scalar compares to vector |
517 | /// compares followed by extract. |
518 | /// cmp (ext0 V0, C), (ext1 V1, C) |
519 | void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0, |
520 | ExtractElementInst *Ext1, Instruction &I) { |
521 | assert(isa<CmpInst>(&I) && "Expected a compare" ); |
522 | assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == |
523 | cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && |
524 | "Expected matching constant extract indexes" ); |
525 | |
526 | // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C |
527 | ++NumVecCmp; |
528 | CmpInst::Predicate Pred = cast<CmpInst>(Val: &I)->getPredicate(); |
529 | Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); |
530 | Value *VecCmp = Builder.CreateCmp(Pred, LHS: V0, RHS: V1); |
531 | Value *NewExt = Builder.CreateExtractElement(Vec: VecCmp, Idx: Ext0->getIndexOperand()); |
532 | replaceValue(Old&: I, New&: *NewExt); |
533 | } |
534 | |
535 | /// Try to reduce extract element costs by converting scalar binops to vector |
536 | /// binops followed by extract. |
537 | /// bo (ext0 V0, C), (ext1 V1, C) |
538 | void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0, |
539 | ExtractElementInst *Ext1, Instruction &I) { |
540 | assert(isa<BinaryOperator>(&I) && "Expected a binary operator" ); |
541 | assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == |
542 | cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && |
543 | "Expected matching constant extract indexes" ); |
544 | |
545 | // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C |
546 | ++NumVecBO; |
547 | Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); |
548 | Value *VecBO = |
549 | Builder.CreateBinOp(Opc: cast<BinaryOperator>(Val: &I)->getOpcode(), LHS: V0, RHS: V1); |
550 | |
551 | // All IR flags are safe to back-propagate because any potential poison |
552 | // created in unused vector elements is discarded by the extract. |
553 | if (auto *VecBOInst = dyn_cast<Instruction>(Val: VecBO)) |
554 | VecBOInst->copyIRFlags(V: &I); |
555 | |
556 | Value *NewExt = Builder.CreateExtractElement(Vec: VecBO, Idx: Ext0->getIndexOperand()); |
557 | replaceValue(Old&: I, New&: *NewExt); |
558 | } |
559 | |
560 | /// Match an instruction with extracted vector operands. |
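/// For example (illustrative):
///   %e0 = extractelement <4 x i32> %x, i32 0
///   %e1 = extractelement <4 x i32> %y, i32 0
///   %r  = add i32 %e0, %e1
/// may become:
///   %v = add <4 x i32> %x, %y
///   %r = extractelement <4 x i32> %v, i32 0
/// when the cost model does not consider the vector form more expensive.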
561 | bool VectorCombine::foldExtractExtract(Instruction &I) { |
562 | // It is not safe to transform things like div, urem, etc. because we may |
563 | // create undefined behavior when executing those on unknown vector elements. |
564 | if (!isSafeToSpeculativelyExecute(I: &I)) |
565 | return false; |
566 | |
567 | Instruction *I0, *I1; |
568 | CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
569 | if (!match(V: &I, P: m_Cmp(Pred, L: m_Instruction(I&: I0), R: m_Instruction(I&: I1))) && |
570 | !match(V: &I, P: m_BinOp(L: m_Instruction(I&: I0), R: m_Instruction(I&: I1)))) |
571 | return false; |
572 | |
573 | Value *V0, *V1; |
574 | uint64_t C0, C1; |
575 | if (!match(V: I0, P: m_ExtractElt(Val: m_Value(V&: V0), Idx: m_ConstantInt(V&: C0))) || |
576 | !match(V: I1, P: m_ExtractElt(Val: m_Value(V&: V1), Idx: m_ConstantInt(V&: C1))) || |
577 | V0->getType() != V1->getType()) |
578 | return false; |
579 | |
580 | // If the scalar value 'I' is going to be re-inserted into a vector, then try |
581 | // to create an extract to that same element. The extract/insert can be |
582 | // reduced to a "select shuffle". |
583 | // TODO: If we add a larger pattern match that starts from an insert, this |
584 | // probably becomes unnecessary. |
585 | auto *Ext0 = cast<ExtractElementInst>(Val: I0); |
586 | auto *Ext1 = cast<ExtractElementInst>(Val: I1); |
587 | uint64_t InsertIndex = InvalidIndex; |
588 | if (I.hasOneUse()) |
589 | match(V: I.user_back(), |
590 | P: m_InsertElt(Val: m_Value(), Elt: m_Value(), Idx: m_ConstantInt(V&: InsertIndex))); |
591 | |
592 |   ExtractElementInst *ExtractToChange; |
593 | if (isExtractExtractCheap(Ext0, Ext1, I, ConvertToShuffle&: ExtractToChange, PreferredExtractIndex: InsertIndex)) |
594 | return false; |
595 | |
596 | if (ExtractToChange) { |
597 |     unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0; |
598 |     ExtractElementInst *NewExtract = |
599 | translateExtract(ExtElt: ExtractToChange, NewIndex: CheapExtractIdx, Builder); |
600 | if (!NewExtract) |
601 | return false; |
602 | if (ExtractToChange == Ext0) |
603 | Ext0 = NewExtract; |
604 | else |
605 | Ext1 = NewExtract; |
606 | } |
607 | |
608 | if (Pred != CmpInst::BAD_ICMP_PREDICATE) |
609 | foldExtExtCmp(Ext0, Ext1, I); |
610 | else |
611 | foldExtExtBinop(Ext0, Ext1, I); |
612 | |
613 | Worklist.push(I: Ext0); |
614 | Worklist.push(I: Ext1); |
615 | return true; |
616 | } |
617 | |
618 | /// Try to replace an extract + scalar fneg + insert with a vector fneg + |
619 | /// shuffle. |
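/// For example (illustrative):
///   %e = extractelement <4 x float> %src, i64 1
///   %n = fneg float %e
///   %r = insertelement <4 x float> %dst, float %n, i64 1
/// may become:
///   %v = fneg <4 x float> %src
///   %r = shufflevector <4 x float> %dst, <4 x float> %v,
///        <4 x i32> <i32 0, i32 5, i32 2, i32 3>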
620 | bool VectorCombine::foldInsExtFNeg(Instruction &I) { |
621 | // Match an insert (op (extract)) pattern. |
622 | Value *DestVec; |
623 | uint64_t Index; |
624 | Instruction *FNeg; |
625 | if (!match(V: &I, P: m_InsertElt(Val: m_Value(V&: DestVec), Elt: m_OneUse(SubPattern: m_Instruction(I&: FNeg)), |
626 | Idx: m_ConstantInt(V&: Index)))) |
627 | return false; |
628 | |
629 | // Note: This handles the canonical fneg instruction and "fsub -0.0, X". |
630 | Value *SrcVec; |
631 |   Instruction *Extract; |
632 | if (!match(V: FNeg, P: m_FNeg(X: m_CombineAnd( |
633 | L: m_Instruction(I&: Extract), |
634 | R: m_ExtractElt(Val: m_Value(V&: SrcVec), Idx: m_SpecificInt(V: Index)))))) |
635 | return false; |
636 | |
637 | // TODO: We could handle this with a length-changing shuffle. |
638 | auto *VecTy = cast<FixedVectorType>(Val: I.getType()); |
639 | if (SrcVec->getType() != VecTy) |
640 | return false; |
641 | |
642 | // Ignore bogus insert/extract index. |
643 | unsigned NumElts = VecTy->getNumElements(); |
644 | if (Index >= NumElts) |
645 | return false; |
646 | |
647 | // We are inserting the negated element into the same lane that we extracted |
648 | // from. This is equivalent to a select-shuffle that chooses all but the |
649 | // negated element from the destination vector. |
650 | SmallVector<int> Mask(NumElts); |
651 | std::iota(first: Mask.begin(), last: Mask.end(), value: 0); |
652 | Mask[Index] = Index + NumElts; |
653 | |
654 | Type *ScalarTy = VecTy->getScalarType(); |
655 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
656 | InstructionCost OldCost = |
657 | TTI.getArithmeticInstrCost(Opcode: Instruction::FNeg, Ty: ScalarTy) + |
658 | TTI.getVectorInstrCost(I, Val: VecTy, CostKind, Index); |
659 | |
660 | // If the extract has one use, it will be eliminated, so count it in the |
661 | // original cost. If it has more than one use, ignore the cost because it will |
662 | // be the same before/after. |
663 | if (Extract->hasOneUse()) |
664 | OldCost += TTI.getVectorInstrCost(I: *Extract, Val: VecTy, CostKind, Index); |
665 | |
666 | InstructionCost NewCost = |
667 | TTI.getArithmeticInstrCost(Opcode: Instruction::FNeg, Ty: VecTy) + |
668 | TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Select, Tp: VecTy, Mask); |
669 | |
670 | if (NewCost > OldCost) |
671 | return false; |
672 | |
673 | // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index --> |
674 | // shuffle DestVec, (fneg SrcVec), Mask |
675 | Value *VecFNeg = Builder.CreateFNegFMF(V: SrcVec, FMFSource: FNeg); |
676 | Value *Shuf = Builder.CreateShuffleVector(V1: DestVec, V2: VecFNeg, Mask); |
677 | replaceValue(Old&: I, New&: *Shuf); |
678 | return true; |
679 | } |
680 | |
681 | /// If this is a bitcast of a shuffle, try to bitcast the source vector to the |
682 | /// destination type followed by shuffle. This can enable further transforms by |
683 | /// moving bitcasts or shuffles together. |
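/// For example (illustrative, bitcasting narrow i16 elements to wider i32):
///   %s = shufflevector <4 x i16> %v, <4 x i16> poison,
///        <4 x i32> <i32 2, i32 3, i32 0, i32 1>
///   %r = bitcast <4 x i16> %s to <2 x i32>
/// may become:
///   %b = bitcast <4 x i16> %v to <2 x i32>
///   %r = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> <i32 1, i32 0>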
684 | bool VectorCombine::foldBitcastShuffle(Instruction &I) { |
685 | Value *V; |
686 | ArrayRef<int> Mask; |
687 | if (!match(V: &I, P: m_BitCast( |
688 | Op: m_OneUse(SubPattern: m_Shuffle(v1: m_Value(V), v2: m_Undef(), mask: m_Mask(Mask)))))) |
689 | return false; |
690 | |
691 | // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for |
692 | // scalable type is unknown; Second, we cannot reason if the narrowed shuffle |
693 | // mask for scalable type is a splat or not. |
694 | // 2) Disallow non-vector casts. |
695 | // TODO: We could allow any shuffle. |
696 | auto *DestTy = dyn_cast<FixedVectorType>(Val: I.getType()); |
697 | auto *SrcTy = dyn_cast<FixedVectorType>(Val: V->getType()); |
698 | if (!DestTy || !SrcTy) |
699 | return false; |
700 | |
701 | unsigned DestEltSize = DestTy->getScalarSizeInBits(); |
702 | unsigned SrcEltSize = SrcTy->getScalarSizeInBits(); |
703 | if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0) |
704 | return false; |
705 | |
706 | SmallVector<int, 16> NewMask; |
707 | if (DestEltSize <= SrcEltSize) { |
708 | // The bitcast is from wide to narrow/equal elements. The shuffle mask can |
709 | // always be expanded to the equivalent form choosing narrower elements. |
710 | assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask" ); |
711 | unsigned ScaleFactor = SrcEltSize / DestEltSize; |
712 | narrowShuffleMaskElts(Scale: ScaleFactor, Mask, ScaledMask&: NewMask); |
713 | } else { |
714 | // The bitcast is from narrow elements to wide elements. The shuffle mask |
715 | // must choose consecutive elements to allow casting first. |
716 | assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask" ); |
717 | unsigned ScaleFactor = DestEltSize / SrcEltSize; |
718 | if (!widenShuffleMaskElts(Scale: ScaleFactor, Mask, ScaledMask&: NewMask)) |
719 | return false; |
720 | } |
721 | |
722 |   // Bitcast the shuffle src - keep its original width but use the destination |
723 | // scalar type. |
724 | unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize; |
725 | auto *ShuffleTy = FixedVectorType::get(ElementType: DestTy->getScalarType(), NumElts: NumSrcElts); |
726 | |
727 | // The new shuffle must not cost more than the old shuffle. The bitcast is |
728 | // moved ahead of the shuffle, so assume that it has the same cost as before. |
729 | InstructionCost DestCost = TTI.getShuffleCost( |
730 | Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: ShuffleTy, Mask: NewMask); |
731 | InstructionCost SrcCost = |
732 | TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: SrcTy, Mask); |
733 | if (DestCost > SrcCost || !DestCost.isValid()) |
734 | return false; |
735 | |
736 | // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' |
737 | ++NumShufOfBitcast; |
738 | Value *CastV = Builder.CreateBitCast(V, DestTy: ShuffleTy); |
739 | Value *Shuf = Builder.CreateShuffleVector(V: CastV, Mask: NewMask); |
740 | replaceValue(Old&: I, New&: *Shuf); |
741 | return true; |
742 | } |
743 | |
744 | /// VP Intrinsics whose vector operands are both splat values may be simplified |
745 | /// into the scalar version of the operation and the result splatted. This |
746 | /// can lead to scalarization down the line. |
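/// For example (illustrative): a vp.add whose two vector operands are splats
/// of %x and %y and whose mask is all-true may become
///   %s = add i32 %x, %y
/// followed by a splat of %s, subject to the cost model and an EVL/speculation
/// safety check.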
747 | bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { |
748 | if (!isa<VPIntrinsic>(Val: I)) |
749 | return false; |
750 | VPIntrinsic &VPI = cast<VPIntrinsic>(Val&: I); |
751 | Value *Op0 = VPI.getArgOperand(i: 0); |
752 | Value *Op1 = VPI.getArgOperand(i: 1); |
753 | |
754 | if (!isSplatValue(V: Op0) || !isSplatValue(V: Op1)) |
755 | return false; |
756 | |
757 | // Check getSplatValue early in this function, to avoid doing unnecessary |
758 | // work. |
759 | Value *ScalarOp0 = getSplatValue(V: Op0); |
760 | Value *ScalarOp1 = getSplatValue(V: Op1); |
761 | if (!ScalarOp0 || !ScalarOp1) |
762 | return false; |
763 | |
764 | // For the binary VP intrinsics supported here, the result on disabled lanes |
765 | // is a poison value. For now, only do this simplification if all lanes |
766 | // are active. |
767 | // TODO: Relax the condition that all lanes are active by using insertelement |
768 | // on inactive lanes. |
769 | auto IsAllTrueMask = [](Value *MaskVal) { |
770 | if (Value *SplattedVal = getSplatValue(V: MaskVal)) |
771 | if (auto *ConstValue = dyn_cast<Constant>(Val: SplattedVal)) |
772 | return ConstValue->isAllOnesValue(); |
773 | return false; |
774 | }; |
775 | if (!IsAllTrueMask(VPI.getArgOperand(i: 2))) |
776 | return false; |
777 | |
778 | // Check to make sure we support scalarization of the intrinsic |
779 | Intrinsic::ID IntrID = VPI.getIntrinsicID(); |
780 | if (!VPBinOpIntrinsic::isVPBinOp(ID: IntrID)) |
781 | return false; |
782 | |
783 | // Calculate cost of splatting both operands into vectors and the vector |
784 | // intrinsic |
785 | VectorType *VecTy = cast<VectorType>(Val: VPI.getType()); |
786 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
787 | InstructionCost SplatCost = |
788 | TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: 0) + |
789 | TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, Tp: VecTy); |
790 | |
791 | // Calculate the cost of the VP Intrinsic |
792 | SmallVector<Type *, 4> Args; |
793 | for (Value *V : VPI.args()) |
794 | Args.push_back(Elt: V->getType()); |
795 | IntrinsicCostAttributes Attrs(IntrID, VecTy, Args); |
796 | InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind); |
797 | InstructionCost OldCost = 2 * SplatCost + VectorOpCost; |
798 | |
799 | // Determine scalar opcode |
800 | std::optional<unsigned> FunctionalOpcode = |
801 | VPI.getFunctionalOpcode(); |
802 | std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt; |
803 | if (!FunctionalOpcode) { |
804 | ScalarIntrID = VPI.getFunctionalIntrinsicID(); |
805 | if (!ScalarIntrID) |
806 | return false; |
807 | } |
808 | |
809 | // Calculate cost of scalarizing |
810 | InstructionCost ScalarOpCost = 0; |
811 | if (ScalarIntrID) { |
812 | IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args); |
813 | ScalarOpCost = TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind); |
814 | } else { |
815 | ScalarOpCost = |
816 | TTI.getArithmeticInstrCost(Opcode: *FunctionalOpcode, Ty: VecTy->getScalarType()); |
817 | } |
818 | |
819 | // The existing splats may be kept around if other instructions use them. |
820 | InstructionCost CostToKeepSplats = |
821 | (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse()); |
822 | InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats; |
823 | |
824 | LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI |
825 | << "\n" ); |
826 | LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost |
827 | << ", Cost of scalarizing:" << NewCost << "\n" ); |
828 | |
829 | // We want to scalarize unless the vector variant actually has lower cost. |
830 | if (OldCost < NewCost || !NewCost.isValid()) |
831 | return false; |
832 | |
833 | // Scalarize the intrinsic |
834 | ElementCount EC = cast<VectorType>(Val: Op0->getType())->getElementCount(); |
835 | Value *EVL = VPI.getArgOperand(i: 3); |
836 | const DataLayout &DL = VPI.getModule()->getDataLayout(); |
837 | |
838 | // If the VP op might introduce UB or poison, we can scalarize it provided |
839 | // that we know the EVL > 0: If the EVL is zero, then the original VP op |
840 | // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by |
841 | // scalarizing it. |
842 | bool SafeToSpeculate; |
843 | if (ScalarIntrID) |
844 | SafeToSpeculate = Intrinsic::getAttributes(C&: I.getContext(), id: *ScalarIntrID) |
845 | .hasFnAttr(Attribute::AttrKind::Speculatable); |
846 | else |
847 | SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode( |
848 | Opcode: *FunctionalOpcode, Inst: &VPI, CtxI: nullptr, AC: &AC, DT: &DT); |
849 | if (!SafeToSpeculate && !isKnownNonZero(V: EVL, DL, Depth: 0, AC: &AC, CxtI: &VPI, DT: &DT)) |
850 | return false; |
851 | |
852 | Value *ScalarVal = |
853 | ScalarIntrID |
854 | ? Builder.CreateIntrinsic(RetTy: VecTy->getScalarType(), ID: *ScalarIntrID, |
855 | Args: {ScalarOp0, ScalarOp1}) |
856 | : Builder.CreateBinOp(Opc: (Instruction::BinaryOps)(*FunctionalOpcode), |
857 | LHS: ScalarOp0, RHS: ScalarOp1); |
858 | |
859 | replaceValue(Old&: VPI, New&: *Builder.CreateVectorSplat(EC, V: ScalarVal)); |
860 | return true; |
861 | } |
862 | |
863 | /// Match a vector binop or compare instruction with at least one inserted |
864 | /// scalar operand and convert to scalar binop/cmp followed by insertelement. |
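/// For example (illustrative):
///   %i = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 %x, i64 0
///   %r = add <4 x i32> %i, <i32 10, i32 20, i32 30, i32 40>
/// may become:
///   %s = add i32 %x, 10
///   %r = insertelement <4 x i32> <i32 11, i32 22, i32 33, i32 44>, i32 %s, i64 0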
865 | bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { |
866 | CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
867 | Value *Ins0, *Ins1; |
868 | if (!match(V: &I, P: m_BinOp(L: m_Value(V&: Ins0), R: m_Value(V&: Ins1))) && |
869 | !match(V: &I, P: m_Cmp(Pred, L: m_Value(V&: Ins0), R: m_Value(V&: Ins1)))) |
870 | return false; |
871 | |
872 | // Do not convert the vector condition of a vector select into a scalar |
873 | // condition. That may cause problems for codegen because of differences in |
874 | // boolean formats and register-file transfers. |
875 | // TODO: Can we account for that in the cost model? |
876 | bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE; |
877 | if (IsCmp) |
878 | for (User *U : I.users()) |
879 | if (match(V: U, P: m_Select(C: m_Specific(V: &I), L: m_Value(), R: m_Value()))) |
880 | return false; |
881 | |
882 | // Match against one or both scalar values being inserted into constant |
883 | // vectors: |
884 | // vec_op VecC0, (inselt VecC1, V1, Index) |
885 | // vec_op (inselt VecC0, V0, Index), VecC1 |
886 | // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) |
887 | // TODO: Deal with mismatched index constants and variable indexes? |
888 | Constant *VecC0 = nullptr, *VecC1 = nullptr; |
889 | Value *V0 = nullptr, *V1 = nullptr; |
890 | uint64_t Index0 = 0, Index1 = 0; |
891 | if (!match(V: Ins0, P: m_InsertElt(Val: m_Constant(C&: VecC0), Elt: m_Value(V&: V0), |
892 | Idx: m_ConstantInt(V&: Index0))) && |
893 | !match(V: Ins0, P: m_Constant(C&: VecC0))) |
894 | return false; |
895 | if (!match(V: Ins1, P: m_InsertElt(Val: m_Constant(C&: VecC1), Elt: m_Value(V&: V1), |
896 | Idx: m_ConstantInt(V&: Index1))) && |
897 | !match(V: Ins1, P: m_Constant(C&: VecC1))) |
898 | return false; |
899 | |
900 | bool IsConst0 = !V0; |
901 | bool IsConst1 = !V1; |
902 | if (IsConst0 && IsConst1) |
903 | return false; |
904 | if (!IsConst0 && !IsConst1 && Index0 != Index1) |
905 | return false; |
906 | |
907 | // Bail for single insertion if it is a load. |
908 | // TODO: Handle this once getVectorInstrCost can cost for load/stores. |
909 | auto *I0 = dyn_cast_or_null<Instruction>(Val: V0); |
910 | auto *I1 = dyn_cast_or_null<Instruction>(Val: V1); |
911 | if ((IsConst0 && I1 && I1->mayReadFromMemory()) || |
912 | (IsConst1 && I0 && I0->mayReadFromMemory())) |
913 | return false; |
914 | |
915 | uint64_t Index = IsConst0 ? Index1 : Index0; |
916 | Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType(); |
917 | Type *VecTy = I.getType(); |
918 | assert(VecTy->isVectorTy() && |
919 | (IsConst0 || IsConst1 || V0->getType() == V1->getType()) && |
920 | (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() || |
921 | ScalarTy->isPointerTy()) && |
922 | "Unexpected types for insert element into binop or cmp" ); |
923 | |
924 | unsigned Opcode = I.getOpcode(); |
925 | InstructionCost ScalarOpCost, VectorOpCost; |
926 | if (IsCmp) { |
927 | CmpInst::Predicate Pred = cast<CmpInst>(Val&: I).getPredicate(); |
928 | ScalarOpCost = TTI.getCmpSelInstrCost( |
929 | Opcode, ValTy: ScalarTy, CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy), VecPred: Pred); |
930 | VectorOpCost = TTI.getCmpSelInstrCost( |
931 | Opcode, ValTy: VecTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VecTy), VecPred: Pred); |
932 | } else { |
933 | ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: ScalarTy); |
934 | VectorOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: VecTy); |
935 | } |
936 | |
937 | // Get cost estimate for the insert element. This cost will factor into |
938 | // both sequences. |
939 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
940 | InstructionCost InsertCost = TTI.getVectorInstrCost( |
941 | Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index); |
942 | InstructionCost OldCost = |
943 | (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost; |
944 | InstructionCost NewCost = ScalarOpCost + InsertCost + |
945 | (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) + |
946 | (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost); |
947 | |
948 | // We want to scalarize unless the vector variant actually has lower cost. |
949 | if (OldCost < NewCost || !NewCost.isValid()) |
950 | return false; |
951 | |
952 | // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) --> |
953 | // inselt NewVecC, (scalar_op V0, V1), Index |
954 | if (IsCmp) |
955 | ++NumScalarCmp; |
956 | else |
957 | ++NumScalarBO; |
958 | |
959 | // For constant cases, extract the scalar element, this should constant fold. |
960 | if (IsConst0) |
961 | V0 = ConstantExpr::getExtractElement(Vec: VecC0, Idx: Builder.getInt64(C: Index)); |
962 | if (IsConst1) |
963 | V1 = ConstantExpr::getExtractElement(Vec: VecC1, Idx: Builder.getInt64(C: Index)); |
964 | |
965 | Value *Scalar = |
966 | IsCmp ? Builder.CreateCmp(Pred, LHS: V0, RHS: V1) |
967 | : Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Opcode, LHS: V0, RHS: V1); |
968 | |
969 | Scalar->setName(I.getName() + ".scalar" ); |
970 | |
971 | // All IR flags are safe to back-propagate. There is no potential for extra |
972 | // poison to be created by the scalar instruction. |
973 | if (auto *ScalarInst = dyn_cast<Instruction>(Val: Scalar)) |
974 | ScalarInst->copyIRFlags(V: &I); |
975 | |
976 | // Fold the vector constants in the original vectors into a new base vector. |
977 | Value *NewVecC = |
978 | IsCmp ? Builder.CreateCmp(Pred, LHS: VecC0, RHS: VecC1) |
979 | : Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Opcode, LHS: VecC0, RHS: VecC1); |
980 | Value *Insert = Builder.CreateInsertElement(Vec: NewVecC, NewElt: Scalar, Idx: Index); |
981 | replaceValue(Old&: I, New&: *Insert); |
982 | return true; |
983 | } |
984 | |
985 | /// Try to combine a scalar binop + 2 scalar compares of extracted elements of |
986 | /// a vector into vector operations followed by extract. Note: The SLP pass |
987 | /// may miss this pattern because of implementation problems. |
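/// For example (illustrative):
///   %e0 = extractelement <4 x i32> %x, i32 0
///   %e1 = extractelement <4 x i32> %x, i32 3
///   %c0 = icmp sgt i32 %e0, 42
///   %c1 = icmp sgt i32 %e1, 7
///   %r  = and i1 %c0, %c1
/// may become one vector icmp of %x against a constant vector holding 42 and 7
/// in lanes 0 and 3, a single-lane shuffle, a vector 'and', and one extract.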
988 | bool VectorCombine::foldExtractedCmps(Instruction &I) { |
989 | // We are looking for a scalar binop of booleans. |
990 | // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1) |
991 | if (!I.isBinaryOp() || !I.getType()->isIntegerTy(Bitwidth: 1)) |
992 | return false; |
993 | |
994 | // The compare predicates should match, and each compare should have a |
995 | // constant operand. |
996 | // TODO: Relax the one-use constraints. |
997 | Value *B0 = I.getOperand(i: 0), *B1 = I.getOperand(i: 1); |
998 | Instruction *I0, *I1; |
999 | Constant *C0, *C1; |
1000 | CmpInst::Predicate P0, P1; |
1001 | if (!match(V: B0, P: m_OneUse(SubPattern: m_Cmp(Pred&: P0, L: m_Instruction(I&: I0), R: m_Constant(C&: C0)))) || |
1002 | !match(V: B1, P: m_OneUse(SubPattern: m_Cmp(Pred&: P1, L: m_Instruction(I&: I1), R: m_Constant(C&: C1)))) || |
1003 | P0 != P1) |
1004 | return false; |
1005 | |
1006 | // The compare operands must be extracts of the same vector with constant |
1007 | // extract indexes. |
1008 | // TODO: Relax the one-use constraints. |
1009 | Value *X; |
1010 | uint64_t Index0, Index1; |
1011 | if (!match(V: I0, P: m_OneUse(SubPattern: m_ExtractElt(Val: m_Value(V&: X), Idx: m_ConstantInt(V&: Index0)))) || |
1012 | !match(V: I1, P: m_OneUse(SubPattern: m_ExtractElt(Val: m_Specific(V: X), Idx: m_ConstantInt(V&: Index1))))) |
1013 | return false; |
1014 | |
1015 | auto *Ext0 = cast<ExtractElementInst>(Val: I0); |
1016 | auto *Ext1 = cast<ExtractElementInst>(Val: I1); |
1017 | ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1); |
1018 | if (!ConvertToShuf) |
1019 | return false; |
1020 | |
1021 | // The original scalar pattern is: |
1022 | // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1) |
1023 | CmpInst::Predicate Pred = P0; |
1024 | unsigned CmpOpcode = CmpInst::isFPPredicate(P: Pred) ? Instruction::FCmp |
1025 | : Instruction::ICmp; |
1026 | auto *VecTy = dyn_cast<FixedVectorType>(Val: X->getType()); |
1027 | if (!VecTy) |
1028 | return false; |
1029 | |
1030 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
1031 | InstructionCost OldCost = |
1032 | TTI.getVectorInstrCost(I: *Ext0, Val: VecTy, CostKind, Index: Index0); |
1033 | OldCost += TTI.getVectorInstrCost(I: *Ext1, Val: VecTy, CostKind, Index: Index1); |
1034 | OldCost += |
1035 | TTI.getCmpSelInstrCost(Opcode: CmpOpcode, ValTy: I0->getType(), |
1036 | CondTy: CmpInst::makeCmpResultType(opnd_type: I0->getType()), VecPred: Pred) * |
1037 | 2; |
1038 | OldCost += TTI.getArithmeticInstrCost(Opcode: I.getOpcode(), Ty: I.getType()); |
1039 | |
1040 | // The proposed vector pattern is: |
1041 | // vcmp = cmp Pred X, VecC |
1042 | // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0 |
1043 | int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0; |
1044 | int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1; |
1045 | auto *CmpTy = cast<FixedVectorType>(Val: CmpInst::makeCmpResultType(opnd_type: X->getType())); |
1046 | InstructionCost NewCost = TTI.getCmpSelInstrCost( |
1047 | Opcode: CmpOpcode, ValTy: X->getType(), CondTy: CmpInst::makeCmpResultType(opnd_type: X->getType()), VecPred: Pred); |
1048 | SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem); |
1049 | ShufMask[CheapIndex] = ExpensiveIndex; |
1050 | NewCost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: CmpTy, |
1051 | Mask: ShufMask); |
1052 | NewCost += TTI.getArithmeticInstrCost(Opcode: I.getOpcode(), Ty: CmpTy); |
1053 | NewCost += TTI.getVectorInstrCost(I: *Ext0, Val: CmpTy, CostKind, Index: CheapIndex); |
1054 | |
1055 | // Aggressively form vector ops if the cost is equal because the transform |
1056 | // may enable further optimization. |
1057 | // Codegen can reverse this transform (scalarize) if it was not profitable. |
1058 | if (OldCost < NewCost || !NewCost.isValid()) |
1059 | return false; |
1060 | |
1061 | // Create a vector constant from the 2 scalar constants. |
1062 | SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(), |
1063 | PoisonValue::get(T: VecTy->getElementType())); |
1064 | CmpC[Index0] = C0; |
1065 | CmpC[Index1] = C1; |
1066 | Value *VCmp = Builder.CreateCmp(Pred, LHS: X, RHS: ConstantVector::get(V: CmpC)); |
1067 | |
1068 | Value *Shuf = createShiftShuffle(Vec: VCmp, OldIndex: ExpensiveIndex, NewIndex: CheapIndex, Builder); |
1069 | Value *VecLogic = Builder.CreateBinOp(Opc: cast<BinaryOperator>(Val&: I).getOpcode(), |
1070 | LHS: VCmp, RHS: Shuf); |
1071 | Value *NewExt = Builder.CreateExtractElement(Vec: VecLogic, Idx: CheapIndex); |
1072 | replaceValue(Old&: I, New&: *NewExt); |
1073 | ++NumVecCmpBO; |
1074 | return true; |
1075 | } |
1076 | |
1077 | // Check if memory loc modified between two instrs in the same BB |
1078 | static bool isMemModifiedBetween(BasicBlock::iterator Begin, |
1079 | BasicBlock::iterator End, |
1080 | const MemoryLocation &Loc, AAResults &AA) { |
1081 | unsigned NumScanned = 0; |
1082 | return std::any_of(first: Begin, last: End, pred: [&](const Instruction &Instr) { |
1083 | return isModSet(MRI: AA.getModRefInfo(I: &Instr, OptLoc: Loc)) || |
1084 | ++NumScanned > MaxInstrsToScan; |
1085 | }); |
1086 | } |
1087 | |
1088 | namespace { |
1089 | /// Helper class to indicate whether a vector index can be safely scalarized and |
1090 | /// if a freeze needs to be inserted. |
1091 | class ScalarizationResult { |
1092 | enum class StatusTy { Unsafe, Safe, SafeWithFreeze }; |
1093 | |
1094 | StatusTy Status; |
1095 | Value *ToFreeze; |
1096 | |
1097 | ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr) |
1098 | : Status(Status), ToFreeze(ToFreeze) {} |
1099 | |
1100 | public: |
1101 | ScalarizationResult(const ScalarizationResult &Other) = default; |
1102 | ~ScalarizationResult() { |
1103 | assert(!ToFreeze && "freeze() not called with ToFreeze being set" ); |
1104 | } |
1105 | |
1106 | static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; } |
1107 | static ScalarizationResult safe() { return {StatusTy::Safe}; } |
1108 | static ScalarizationResult safeWithFreeze(Value *ToFreeze) { |
1109 | return {StatusTy::SafeWithFreeze, ToFreeze}; |
1110 | } |
1111 | |
1112 |   /// Returns true if the index can be scalarized without requiring a freeze. |
1113 | bool isSafe() const { return Status == StatusTy::Safe; } |
1114 | /// Returns true if the index cannot be scalarized. |
1115 | bool isUnsafe() const { return Status == StatusTy::Unsafe; } |
1116 |   /// Returns true if the index can be scalarized, but requires inserting a |
1117 | /// freeze. |
1118 | bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; } |
1119 | |
1120 |   /// Reset the state to Unsafe and clear ToFreeze if set. |
1121 | void discard() { |
1122 | ToFreeze = nullptr; |
1123 | Status = StatusTy::Unsafe; |
1124 | } |
1125 | |
1126 |   /// Freeze the ToFreeze and update the use in \p UserI to use it. |
1127 | void freeze(IRBuilder<> &Builder, Instruction &UserI) { |
1128 | assert(isSafeWithFreeze() && |
1129 | "should only be used when freezing is required" ); |
1130 | assert(is_contained(ToFreeze->users(), &UserI) && |
1131 | "UserI must be a user of ToFreeze" ); |
1132 | IRBuilder<>::InsertPointGuard Guard(Builder); |
1133 | Builder.SetInsertPoint(cast<Instruction>(Val: &UserI)); |
1134 | Value *Frozen = |
1135 | Builder.CreateFreeze(V: ToFreeze, Name: ToFreeze->getName() + ".frozen" ); |
1136 | for (Use &U : make_early_inc_range(Range: (UserI.operands()))) |
1137 | if (U.get() == ToFreeze) |
1138 | U.set(Frozen); |
1139 | |
1140 | ToFreeze = nullptr; |
1141 | } |
1142 | }; |
1143 | } // namespace |
1144 | |
1145 | /// Check if it is legal to scalarize a memory access to \p VecTy at index \p |
1146 | /// Idx. \p Idx must access a valid vector element. |
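/// For example (illustrative): for a <4 x i32> access with Idx = (and %i, 3),
/// every possible index value is in bounds, so the access can be scalarized
/// once %i is frozen to keep a poison index from being introduced.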
1147 | static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, |
1148 | Instruction *CtxI, |
1149 | AssumptionCache &AC, |
1150 | const DominatorTree &DT) { |
1151 | // We do checks for both fixed vector types and scalable vector types. |
1152 | // This is the number of elements of fixed vector types, |
1153 | // or the minimum number of elements of scalable vector types. |
1154 | uint64_t NumElements = VecTy->getElementCount().getKnownMinValue(); |
1155 | |
1156 | if (auto *C = dyn_cast<ConstantInt>(Val: Idx)) { |
1157 | if (C->getValue().ult(RHS: NumElements)) |
1158 | return ScalarizationResult::safe(); |
1159 | return ScalarizationResult::unsafe(); |
1160 | } |
1161 | |
1162 | unsigned IntWidth = Idx->getType()->getScalarSizeInBits(); |
1163 | APInt Zero(IntWidth, 0); |
1164 | APInt MaxElts(IntWidth, NumElements); |
1165 | ConstantRange ValidIndices(Zero, MaxElts); |
1166 | ConstantRange IdxRange(IntWidth, true); |
1167 | |
1168 | if (isGuaranteedNotToBePoison(V: Idx, AC: &AC)) { |
1169 | if (ValidIndices.contains(CR: computeConstantRange(V: Idx, /* ForSigned */ false, |
1170 | UseInstrInfo: true, AC: &AC, CtxI, DT: &DT))) |
1171 | return ScalarizationResult::safe(); |
1172 | return ScalarizationResult::unsafe(); |
1173 | } |
1174 | |
1175 | // If the index may be poison, check if we can insert a freeze before the |
1176 | // range of the index is restricted. |
1177 | Value *IdxBase; |
1178 | ConstantInt *CI; |
1179 | if (match(V: Idx, P: m_And(L: m_Value(V&: IdxBase), R: m_ConstantInt(CI)))) { |
1180 | IdxRange = IdxRange.binaryAnd(Other: CI->getValue()); |
1181 | } else if (match(V: Idx, P: m_URem(L: m_Value(V&: IdxBase), R: m_ConstantInt(CI)))) { |
1182 | IdxRange = IdxRange.urem(Other: CI->getValue()); |
1183 | } |
1184 | |
1185 | if (ValidIndices.contains(CR: IdxRange)) |
1186 | return ScalarizationResult::safeWithFreeze(ToFreeze: IdxBase); |
1187 | return ScalarizationResult::unsafe(); |
1188 | } |
1189 | |
1190 | /// The memory operation on a vector of \p ScalarType had alignment of |
1191 | /// \p VectorAlignment. Compute the maximal, but conservatively correct, |
1192 | /// alignment that will be valid for the memory operation on a single scalar |
1193 | /// element of the same type with index \p Idx. |
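/// For example (illustrative): scalarizing a <4 x i32> access with alignment
/// 16 at constant index 1 touches an i32 at byte offset 4, so the scalar
/// access can use commonAlignment(16, 4) = 4. For a variable index, only the
/// element store size can be used as the offset.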
1194 | static Align computeAlignmentAfterScalarization(Align VectorAlignment, |
1195 | Type *ScalarType, Value *Idx, |
1196 | const DataLayout &DL) { |
1197 | if (auto *C = dyn_cast<ConstantInt>(Val: Idx)) |
1198 | return commonAlignment(A: VectorAlignment, |
1199 | Offset: C->getZExtValue() * DL.getTypeStoreSize(Ty: ScalarType)); |
1200 | return commonAlignment(A: VectorAlignment, Offset: DL.getTypeStoreSize(Ty: ScalarType)); |
1201 | } |
1202 | |
// Combine patterns like:
//   %0 = load <4 x i32>, ptr %a
//   %1 = insertelement <4 x i32> %0, i32 %b, i32 1
//   store <4 x i32> %1, ptr %a
// to:
//   %0 = getelementptr inbounds <4 x i32>, ptr %a, i32 0, i32 1
//   store i32 %b, ptr %0
1211 | bool VectorCombine::foldSingleElementStore(Instruction &I) { |
1212 | auto *SI = cast<StoreInst>(Val: &I); |
1213 | if (!SI->isSimple() || !isa<VectorType>(Val: SI->getValueOperand()->getType())) |
1214 | return false; |
1215 | |
1216 | // TODO: Combine more complicated patterns (multiple insert) by referencing |
1217 | // TargetTransformInfo. |
1218 | Instruction *Source; |
1219 | Value *NewElement; |
1220 | Value *Idx; |
1221 | if (!match(V: SI->getValueOperand(), |
1222 | P: m_InsertElt(Val: m_Instruction(I&: Source), Elt: m_Value(V&: NewElement), |
1223 | Idx: m_Value(V&: Idx)))) |
1224 | return false; |
1225 | |
1226 | if (auto *Load = dyn_cast<LoadInst>(Val: Source)) { |
1227 | auto VecTy = cast<VectorType>(Val: SI->getValueOperand()->getType()); |
1228 | const DataLayout &DL = I.getModule()->getDataLayout(); |
1229 | Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts(); |
// Don't optimize for atomic/volatile loads or stores. Ensure memory is not
// modified between the load and the store, the scalar type's size matches
// its store size, and the index is in bounds.
1232 | if (!Load->isSimple() || Load->getParent() != SI->getParent() || |
1233 | !DL.typeSizeEqualsStoreSize(Ty: Load->getType()->getScalarType()) || |
1234 | SrcAddr != SI->getPointerOperand()->stripPointerCasts()) |
1235 | return false; |
1236 | |
1237 | auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, CtxI: Load, AC, DT); |
1238 | if (ScalarizableIdx.isUnsafe() || |
1239 | isMemModifiedBetween(Begin: Load->getIterator(), End: SI->getIterator(), |
1240 | Loc: MemoryLocation::get(SI), AA)) |
1241 | return false; |
1242 | |
1243 | if (ScalarizableIdx.isSafeWithFreeze()) |
1244 | ScalarizableIdx.freeze(Builder, UserI&: *cast<Instruction>(Val: Idx)); |
1245 | Value *GEP = Builder.CreateInBoundsGEP( |
1246 | Ty: SI->getValueOperand()->getType(), Ptr: SI->getPointerOperand(), |
1247 | IdxList: {ConstantInt::get(Ty: Idx->getType(), V: 0), Idx}); |
1248 | StoreInst *NSI = Builder.CreateStore(Val: NewElement, Ptr: GEP); |
1249 | NSI->copyMetadata(SrcInst: *SI); |
1250 | Align ScalarOpAlignment = computeAlignmentAfterScalarization( |
1251 | VectorAlignment: std::max(a: SI->getAlign(), b: Load->getAlign()), ScalarType: NewElement->getType(), Idx, |
1252 | DL); |
1253 | NSI->setAlignment(ScalarOpAlignment); |
1254 | replaceValue(Old&: I, New&: *NSI); |
1255 | eraseInstruction(I); |
1256 | return true; |
1257 | } |
1258 | |
1259 | return false; |
1260 | } |
1261 | |
1262 | /// Try to scalarize vector loads feeding extractelement instructions. |
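/// For example (illustrative IR; the transform only fires when the target's
/// cost model says the scalarized form is cheaper):
///   %lv = load <4 x i32>, ptr %p
///   %e0 = extractelement <4 x i32> %lv, i32 0
///   %e1 = extractelement <4 x i32> %lv, i32 1
/// may become:
///   %gep0 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i32 0
///   %e0.scalar = load i32, ptr %gep0
///   %gep1 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i32 1
///   %e1.scalar = load i32, ptr %gep1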
bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
1264 | Value *Ptr; |
1265 | if (!match(V: &I, P: m_Load(Op: m_Value(V&: Ptr)))) |
1266 | return false; |
1267 | |
1268 | auto *VecTy = cast<VectorType>(Val: I.getType()); |
1269 | auto *LI = cast<LoadInst>(Val: &I); |
1270 | const DataLayout &DL = I.getModule()->getDataLayout(); |
1271 | if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(Ty: VecTy->getScalarType())) |
1272 | return false; |
1273 | |
1274 | InstructionCost OriginalCost = |
1275 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: LI->getAlign(), |
1276 | AddressSpace: LI->getPointerAddressSpace()); |
1277 | InstructionCost ScalarizedCost = 0; |
1278 | |
1279 | Instruction *LastCheckedInst = LI; |
1280 | unsigned NumInstChecked = 0; |
1281 | DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze; |
1282 | auto FailureGuard = make_scope_exit(F: [&]() { |
1283 | // If the transform is aborted, discard the ScalarizationResults. |
1284 | for (auto &Pair : NeedFreeze) |
1285 | Pair.second.discard(); |
1286 | }); |
1287 | |
1288 | // Check if all users of the load are extracts with no memory modifications |
1289 | // between the load and the extract. Compute the cost of both the original |
1290 | // code and the scalarized version. |
1291 | for (User *U : LI->users()) { |
1292 | auto *UI = dyn_cast<ExtractElementInst>(Val: U); |
1293 | if (!UI || UI->getParent() != LI->getParent()) |
1294 | return false; |
1295 | |
1296 | // Check if any instruction between the load and the extract may modify |
1297 | // memory. |
1298 | if (LastCheckedInst->comesBefore(Other: UI)) { |
1299 | for (Instruction &I : |
1300 | make_range(x: std::next(x: LI->getIterator()), y: UI->getIterator())) { |
1301 | // Bail out if we reached the check limit or the instruction may write |
1302 | // to memory. |
1303 | if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory()) |
1304 | return false; |
1305 | NumInstChecked++; |
1306 | } |
1307 | LastCheckedInst = UI; |
1308 | } |
1309 | |
1310 | auto ScalarIdx = canScalarizeAccess(VecTy, Idx: UI->getOperand(i_nocapture: 1), CtxI: &I, AC, DT); |
1311 | if (ScalarIdx.isUnsafe()) |
1312 | return false; |
1313 | if (ScalarIdx.isSafeWithFreeze()) { |
1314 | NeedFreeze.try_emplace(Key: UI, Args&: ScalarIdx); |
1315 | ScalarIdx.discard(); |
1316 | } |
1317 | |
1318 | auto *Index = dyn_cast<ConstantInt>(Val: UI->getOperand(i_nocapture: 1)); |
1319 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
1320 | OriginalCost += |
1321 | TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, CostKind, |
1322 | Index: Index ? Index->getZExtValue() : -1); |
1323 | ScalarizedCost += |
1324 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy->getElementType(), |
1325 | Alignment: Align(1), AddressSpace: LI->getPointerAddressSpace()); |
1326 | ScalarizedCost += TTI.getAddressComputationCost(Ty: VecTy->getElementType()); |
1327 | } |
1328 | |
1329 | if (ScalarizedCost >= OriginalCost) |
1330 | return false; |
1331 | |
1332 | // Replace extracts with narrow scalar loads. |
1333 | for (User *U : LI->users()) { |
1334 | auto *EI = cast<ExtractElementInst>(Val: U); |
1335 | Value *Idx = EI->getOperand(i_nocapture: 1); |
1336 | |
1337 | // Insert 'freeze' for poison indexes. |
1338 | auto It = NeedFreeze.find(Val: EI); |
1339 | if (It != NeedFreeze.end()) |
1340 | It->second.freeze(Builder, UserI&: *cast<Instruction>(Val: Idx)); |
1341 | |
1342 | Builder.SetInsertPoint(EI); |
1343 | Value *GEP = |
1344 | Builder.CreateInBoundsGEP(Ty: VecTy, Ptr, IdxList: {Builder.getInt32(C: 0), Idx}); |
1345 | auto *NewLoad = cast<LoadInst>(Val: Builder.CreateLoad( |
1346 | Ty: VecTy->getElementType(), Ptr: GEP, Name: EI->getName() + ".scalar" )); |
1347 | |
1348 | Align ScalarOpAlignment = computeAlignmentAfterScalarization( |
1349 | VectorAlignment: LI->getAlign(), ScalarType: VecTy->getElementType(), Idx, DL); |
1350 | NewLoad->setAlignment(ScalarOpAlignment); |
1351 | |
1352 | replaceValue(Old&: *EI, New&: *NewLoad); |
1353 | } |
1354 | |
1355 | FailureGuard.release(); |
1356 | return true; |
1357 | } |
1358 | |
1359 | /// Try to convert "shuffle (binop), (binop)" with a shared binop operand into |
1360 | /// "binop (shuffle), (shuffle)". |
1361 | bool VectorCombine::foldShuffleOfBinops(Instruction &I) { |
1362 | auto *VecTy = cast<FixedVectorType>(Val: I.getType()); |
1363 | BinaryOperator *B0, *B1; |
1364 | ArrayRef<int> Mask; |
1365 | if (!match(V: &I, P: m_Shuffle(v1: m_OneUse(SubPattern: m_BinOp(I&: B0)), v2: m_OneUse(SubPattern: m_BinOp(I&: B1)), |
1366 | mask: m_Mask(Mask))) || |
1367 | B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy) |
1368 | return false; |
1369 | |
1370 | // Try to replace a binop with a shuffle if the shuffle is not costly. |
1371 | // The new shuffle will choose from a single, common operand, so it may be |
1372 | // cheaper than the existing two-operand shuffle. |
1373 | SmallVector<int> UnaryMask = createUnaryMask(Mask, NumElts: Mask.size()); |
1374 | Instruction::BinaryOps Opcode = B0->getOpcode(); |
1375 | InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, Ty: VecTy); |
1376 | InstructionCost ShufCost = TTI.getShuffleCost( |
1377 | Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: VecTy, Mask: UnaryMask); |
1378 | if (ShufCost > BinopCost) |
1379 | return false; |
1380 | |
1381 | // If we have something like "add X, Y" and "add Z, X", swap ops to match. |
1382 | Value *X = B0->getOperand(i_nocapture: 0), *Y = B0->getOperand(i_nocapture: 1); |
1383 | Value *Z = B1->getOperand(i_nocapture: 0), *W = B1->getOperand(i_nocapture: 1); |
1384 | if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W) |
1385 | std::swap(a&: X, b&: Y); |
1386 | |
1387 | Value *Shuf0, *Shuf1; |
1388 | if (X == Z) { |
1389 | // shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W) |
1390 | Shuf0 = Builder.CreateShuffleVector(V: X, Mask: UnaryMask); |
1391 | Shuf1 = Builder.CreateShuffleVector(V1: Y, V2: W, Mask); |
1392 | } else if (Y == W) { |
1393 | // shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y) |
1394 | Shuf0 = Builder.CreateShuffleVector(V1: X, V2: Z, Mask); |
1395 | Shuf1 = Builder.CreateShuffleVector(V: Y, Mask: UnaryMask); |
1396 | } else { |
1397 | return false; |
1398 | } |
1399 | |
1400 | Value *NewBO = Builder.CreateBinOp(Opc: Opcode, LHS: Shuf0, RHS: Shuf1); |
1401 | // Intersect flags from the old binops. |
1402 | if (auto *NewInst = dyn_cast<Instruction>(Val: NewBO)) { |
1403 | NewInst->copyIRFlags(V: B0); |
1404 | NewInst->andIRFlags(V: B1); |
1405 | } |
1406 | replaceValue(Old&: I, New&: *NewBO); |
1407 | return true; |
1408 | } |
1409 | |
1410 | /// Given a commutative reduction, the order of the input lanes does not alter |
1411 | /// the results. We can use this to remove certain shuffles feeding the |
1412 | /// reduction, removing the need to shuffle at all. |
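/// For example (illustrative IR):
///   %s = shufflevector <4 x i32> %a, <4 x i32> poison,
///                      <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// computes the same value as reducing %a directly, so the reversing shuffle
/// can be replaced by a cheaper (here, identity) shuffle when the cost model
/// agrees.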
1413 | bool VectorCombine::foldShuffleFromReductions(Instruction &I) { |
1414 | auto *II = dyn_cast<IntrinsicInst>(Val: &I); |
1415 | if (!II) |
1416 | return false; |
1417 | switch (II->getIntrinsicID()) { |
1418 | case Intrinsic::vector_reduce_add: |
1419 | case Intrinsic::vector_reduce_mul: |
1420 | case Intrinsic::vector_reduce_and: |
1421 | case Intrinsic::vector_reduce_or: |
1422 | case Intrinsic::vector_reduce_xor: |
1423 | case Intrinsic::vector_reduce_smin: |
1424 | case Intrinsic::vector_reduce_smax: |
1425 | case Intrinsic::vector_reduce_umin: |
1426 | case Intrinsic::vector_reduce_umax: |
1427 | break; |
1428 | default: |
1429 | return false; |
1430 | } |
1431 | |
1432 | // Find all the inputs when looking through operations that do not alter the |
1433 | // lane order (binops, for example). Currently we look for a single shuffle, |
1434 | // and can ignore splat values. |
1435 | std::queue<Value *> Worklist; |
1436 | SmallPtrSet<Value *, 4> Visited; |
1437 | ShuffleVectorInst *Shuffle = nullptr; |
1438 | if (auto *Op = dyn_cast<Instruction>(Val: I.getOperand(i: 0))) |
1439 | Worklist.push(x: Op); |
1440 | |
1441 | while (!Worklist.empty()) { |
1442 | Value *CV = Worklist.front(); |
1443 | Worklist.pop(); |
1444 | if (Visited.contains(Ptr: CV)) |
1445 | continue; |
1446 | |
1447 | // Splats don't change the order, so can be safely ignored. |
1448 | if (isSplatValue(V: CV)) |
1449 | continue; |
1450 | |
1451 | Visited.insert(Ptr: CV); |
1452 | |
1453 | if (auto *CI = dyn_cast<Instruction>(Val: CV)) { |
1454 | if (CI->isBinaryOp()) { |
1455 | for (auto *Op : CI->operand_values()) |
1456 | Worklist.push(x: Op); |
1457 | continue; |
1458 | } else if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: CI)) { |
1459 | if (Shuffle && Shuffle != SV) |
1460 | return false; |
1461 | Shuffle = SV; |
1462 | continue; |
1463 | } |
1464 | } |
1465 | |
1466 | // Anything else is currently an unknown node. |
1467 | return false; |
1468 | } |
1469 | |
1470 | if (!Shuffle) |
1471 | return false; |
1472 | |
// Check that all uses of the binary ops and shuffles are also included in the
// lane-invariant operations (Visited should be the list of lanewise
// instructions, including the shuffle that we found).
1476 | for (auto *V : Visited) |
1477 | for (auto *U : V->users()) |
1478 | if (!Visited.contains(Ptr: U) && U != &I) |
1479 | return false; |
1480 | |
1481 | FixedVectorType *VecType = |
1482 | dyn_cast<FixedVectorType>(Val: II->getOperand(i_nocapture: 0)->getType()); |
1483 | if (!VecType) |
1484 | return false; |
1485 | FixedVectorType *ShuffleInputType = |
1486 | dyn_cast<FixedVectorType>(Val: Shuffle->getOperand(i_nocapture: 0)->getType()); |
1487 | if (!ShuffleInputType) |
1488 | return false; |
1489 | unsigned NumInputElts = ShuffleInputType->getNumElements(); |
1490 | |
// Find the mask from sorting the lanes into order. This is most likely to
// become an identity or concat mask. Undef elements are pushed to the end.
1493 | SmallVector<int> ConcatMask; |
1494 | Shuffle->getShuffleMask(Result&: ConcatMask); |
1495 | sort(C&: ConcatMask, Comp: [](int X, int Y) { return (unsigned)X < (unsigned)Y; }); |
1496 | // In the case of a truncating shuffle it's possible for the mask |
1497 | // to have an index greater than the size of the resulting vector. |
1498 | // This requires special handling. |
1499 | bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts; |
1500 | bool UsesSecondVec = |
1501 | any_of(Range&: ConcatMask, P: [&](int M) { return M >= (int)NumInputElts; }); |
1502 | |
1503 | FixedVectorType *VecTyForCost = |
1504 | (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType; |
1505 | InstructionCost OldCost = TTI.getShuffleCost( |
1506 | Kind: UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, |
1507 | Tp: VecTyForCost, Mask: Shuffle->getShuffleMask()); |
1508 | InstructionCost NewCost = TTI.getShuffleCost( |
1509 | Kind: UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, |
1510 | Tp: VecTyForCost, Mask: ConcatMask); |
1511 | |
1512 | LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle |
1513 | << "\n" ); |
1514 | LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost |
1515 | << "\n" ); |
1516 | if (NewCost < OldCost) { |
1517 | Builder.SetInsertPoint(Shuffle); |
1518 | Value *NewShuffle = Builder.CreateShuffleVector( |
1519 | V1: Shuffle->getOperand(i_nocapture: 0), V2: Shuffle->getOperand(i_nocapture: 1), Mask: ConcatMask); |
1520 | LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n" ); |
1521 | replaceValue(Old&: *Shuffle, New&: *NewShuffle); |
1522 | } |
1523 | |
1524 | // See if we can re-use foldSelectShuffle, getting it to reduce the size of |
1525 | // the shuffle into a nicer order, as it can ignore the order of the shuffles. |
1526 | return foldSelectShuffle(I&: *Shuffle, FromReduction: true); |
1527 | } |
1528 | |
1529 | /// This method looks for groups of shuffles acting on binops, of the form: |
1530 | /// %x = shuffle ... |
1531 | /// %y = shuffle ... |
1532 | /// %a = binop %x, %y |
1533 | /// %b = binop %x, %y |
1534 | /// shuffle %a, %b, selectmask |
1535 | /// We may, especially if the shuffle is wider than legal, be able to convert |
1536 | /// the shuffle to a form where only parts of a and b need to be computed. On |
1537 | /// architectures with no obvious "select" shuffle, this can reduce the total |
1538 | /// number of operations if the target reports them as cheaper. |
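/// For example (illustrative), with a select mask such as <0, 5, 2, 7> on
/// <4 x i32>, only lanes 0 and 2 of %a and lanes 1 and 3 of %b are used, so
/// each binop may be narrowed to operate on half as many lanes once the
/// input shuffles are re-sorted.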
1539 | bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { |
1540 | auto *SVI = cast<ShuffleVectorInst>(Val: &I); |
1541 | auto *VT = cast<FixedVectorType>(Val: I.getType()); |
1542 | auto *Op0 = dyn_cast<Instruction>(Val: SVI->getOperand(i_nocapture: 0)); |
1543 | auto *Op1 = dyn_cast<Instruction>(Val: SVI->getOperand(i_nocapture: 1)); |
1544 | if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() || |
1545 | VT != Op0->getType()) |
1546 | return false; |
1547 | |
1548 | auto *SVI0A = dyn_cast<Instruction>(Val: Op0->getOperand(i: 0)); |
1549 | auto *SVI0B = dyn_cast<Instruction>(Val: Op0->getOperand(i: 1)); |
1550 | auto *SVI1A = dyn_cast<Instruction>(Val: Op1->getOperand(i: 0)); |
1551 | auto *SVI1B = dyn_cast<Instruction>(Val: Op1->getOperand(i: 1)); |
1552 | SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B}); |
1553 | auto checkSVNonOpUses = [&](Instruction *I) { |
1554 | if (!I || I->getOperand(i: 0)->getType() != VT) |
1555 | return true; |
1556 | return any_of(Range: I->users(), P: [&](User *U) { |
1557 | return U != Op0 && U != Op1 && |
1558 | !(isa<ShuffleVectorInst>(Val: U) && |
1559 | (InputShuffles.contains(Ptr: cast<Instruction>(Val: U)) || |
1560 | isInstructionTriviallyDead(I: cast<Instruction>(Val: U)))); |
1561 | }); |
1562 | }; |
1563 | if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) || |
1564 | checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B)) |
1565 | return false; |
1566 | |
1567 | // Collect all the uses that are shuffles that we can transform together. We |
1568 | // may not have a single shuffle, but a group that can all be transformed |
1569 | // together profitably. |
1570 | SmallVector<ShuffleVectorInst *> Shuffles; |
1571 | auto collectShuffles = [&](Instruction *I) { |
1572 | for (auto *U : I->users()) { |
1573 | auto *SV = dyn_cast<ShuffleVectorInst>(Val: U); |
1574 | if (!SV || SV->getType() != VT) |
1575 | return false; |
1576 | if ((SV->getOperand(i_nocapture: 0) != Op0 && SV->getOperand(i_nocapture: 0) != Op1) || |
1577 | (SV->getOperand(i_nocapture: 1) != Op0 && SV->getOperand(i_nocapture: 1) != Op1)) |
1578 | return false; |
1579 | if (!llvm::is_contained(Range&: Shuffles, Element: SV)) |
1580 | Shuffles.push_back(Elt: SV); |
1581 | } |
1582 | return true; |
1583 | }; |
1584 | if (!collectShuffles(Op0) || !collectShuffles(Op1)) |
1585 | return false; |
1586 | // From a reduction, we need to be processing a single shuffle, otherwise the |
1587 | // other uses will not be lane-invariant. |
1588 | if (FromReduction && Shuffles.size() > 1) |
1589 | return false; |
1590 | |
1591 | // Add any shuffle uses for the shuffles we have found, to include them in our |
1592 | // cost calculations. |
1593 | if (!FromReduction) { |
1594 | for (ShuffleVectorInst *SV : Shuffles) { |
1595 | for (auto *U : SV->users()) { |
1596 | ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(Val: U); |
1597 | if (SSV && isa<UndefValue>(Val: SSV->getOperand(i_nocapture: 1)) && SSV->getType() == VT) |
1598 | Shuffles.push_back(Elt: SSV); |
1599 | } |
1600 | } |
1601 | } |
1602 | |
// For each of the output shuffles, we try to sort all the first vector
// elements to the beginning, followed by the second vector elements at the
// end. If the binops are legalized to smaller vectors, this may reduce the
// total number of binops. We compute the ReconstructMask needed to convert
// back to the original lane order.
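// For example (illustrative): with 4 lanes, a ReconstructMask of <0, 4, 1, 5>
// means output lanes 0 and 2 come from the packed first vector (elements 0
// and 1) and lanes 1 and 3 come from the packed second vector (elements 0
// and 1).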
1608 | SmallVector<std::pair<int, int>> V1, V2; |
1609 | SmallVector<SmallVector<int>> OrigReconstructMasks; |
1610 | int MaxV1Elt = 0, MaxV2Elt = 0; |
1611 | unsigned NumElts = VT->getNumElements(); |
1612 | for (ShuffleVectorInst *SVN : Shuffles) { |
1613 | SmallVector<int> Mask; |
1614 | SVN->getShuffleMask(Result&: Mask); |
1615 | |
1616 | // Check the operands are the same as the original, or reversed (in which |
1617 | // case we need to commute the mask). |
1618 | Value *SVOp0 = SVN->getOperand(i_nocapture: 0); |
1619 | Value *SVOp1 = SVN->getOperand(i_nocapture: 1); |
1620 | if (isa<UndefValue>(Val: SVOp1)) { |
1621 | auto *SSV = cast<ShuffleVectorInst>(Val: SVOp0); |
1622 | SVOp0 = SSV->getOperand(i_nocapture: 0); |
1623 | SVOp1 = SSV->getOperand(i_nocapture: 1); |
1624 | for (unsigned I = 0, E = Mask.size(); I != E; I++) { |
1625 | if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size())) |
1626 | return false; |
1627 | Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Elt: Mask[I]); |
1628 | } |
1629 | } |
1630 | if (SVOp0 == Op1 && SVOp1 == Op0) { |
1631 | std::swap(a&: SVOp0, b&: SVOp1); |
1632 | ShuffleVectorInst::commuteShuffleMask(Mask, InVecNumElts: NumElts); |
1633 | } |
1634 | if (SVOp0 != Op0 || SVOp1 != Op1) |
1635 | return false; |
1636 | |
// Calculate the reconstruction mask for this shuffle, as the mask needed to
// take the packed values from Op0/Op1 and reconstruct the original lane
// order.
1640 | SmallVector<int> ReconstructMask; |
1641 | for (unsigned I = 0; I < Mask.size(); I++) { |
1642 | if (Mask[I] < 0) { |
1643 | ReconstructMask.push_back(Elt: -1); |
1644 | } else if (Mask[I] < static_cast<int>(NumElts)) { |
1645 | MaxV1Elt = std::max(a: MaxV1Elt, b: Mask[I]); |
1646 | auto It = find_if(Range&: V1, P: [&](const std::pair<int, int> &A) { |
1647 | return Mask[I] == A.first; |
1648 | }); |
1649 | if (It != V1.end()) |
1650 | ReconstructMask.push_back(Elt: It - V1.begin()); |
1651 | else { |
1652 | ReconstructMask.push_back(Elt: V1.size()); |
1653 | V1.emplace_back(Args&: Mask[I], Args: V1.size()); |
1654 | } |
1655 | } else { |
1656 | MaxV2Elt = std::max<int>(a: MaxV2Elt, b: Mask[I] - NumElts); |
1657 | auto It = find_if(Range&: V2, P: [&](const std::pair<int, int> &A) { |
1658 | return Mask[I] - static_cast<int>(NumElts) == A.first; |
1659 | }); |
1660 | if (It != V2.end()) |
1661 | ReconstructMask.push_back(Elt: NumElts + It - V2.begin()); |
1662 | else { |
1663 | ReconstructMask.push_back(Elt: NumElts + V2.size()); |
1664 | V2.emplace_back(Args: Mask[I] - NumElts, Args: NumElts + V2.size()); |
1665 | } |
1666 | } |
1667 | } |
1668 | |
// For reductions, we know that the output lane ordering doesn't alter the
// result. Sorting the mask in-order can help simplify the shuffle away.
1671 | if (FromReduction) |
1672 | sort(C&: ReconstructMask); |
1673 | OrigReconstructMasks.push_back(Elt: std::move(ReconstructMask)); |
1674 | } |
1675 | |
// If the maximum elements used from V1 and V2 are not larger than the new
// vectors, the vectors are already packed and performing the optimization
// again will likely not help any further. This also prevents us from getting
// stuck in a cycle in case the costs do not also rule it out.
1680 | if (V1.empty() || V2.empty() || |
1681 | (MaxV1Elt == static_cast<int>(V1.size()) - 1 && |
1682 | MaxV2Elt == static_cast<int>(V2.size()) - 1)) |
1683 | return false; |
1684 | |
// GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
// shuffle of another shuffle, or not a shuffle (which is treated like an
// identity shuffle).
1688 | auto GetBaseMaskValue = [&](Instruction *I, int M) { |
1689 | auto *SV = dyn_cast<ShuffleVectorInst>(Val: I); |
1690 | if (!SV) |
1691 | return M; |
1692 | if (isa<UndefValue>(Val: SV->getOperand(i_nocapture: 1))) |
1693 | if (auto *SSV = dyn_cast<ShuffleVectorInst>(Val: SV->getOperand(i_nocapture: 0))) |
1694 | if (InputShuffles.contains(Ptr: SSV)) |
1695 | return SSV->getMaskValue(Elt: SV->getMaskValue(Elt: M)); |
1696 | return SV->getMaskValue(Elt: M); |
1697 | }; |
1698 | |
// Attempt to sort the inputs by ascending mask values to make simpler input
// shuffles and push complex shuffles down to the uses. We sort on the first
// of the two input shuffle orders, to try and get at least one input into a
// nice order.
1703 | auto SortBase = [&](Instruction *A, std::pair<int, int> X, |
1704 | std::pair<int, int> Y) { |
1705 | int MXA = GetBaseMaskValue(A, X.first); |
1706 | int MYA = GetBaseMaskValue(A, Y.first); |
1707 | return MXA < MYA; |
1708 | }; |
1709 | stable_sort(Range&: V1, C: [&](std::pair<int, int> A, std::pair<int, int> B) { |
1710 | return SortBase(SVI0A, A, B); |
1711 | }); |
1712 | stable_sort(Range&: V2, C: [&](std::pair<int, int> A, std::pair<int, int> B) { |
1713 | return SortBase(SVI1A, A, B); |
1714 | }); |
1715 | // Calculate our ReconstructMasks from the OrigReconstructMasks and the |
1716 | // modified order of the input shuffles. |
1717 | SmallVector<SmallVector<int>> ReconstructMasks; |
1718 | for (const auto &Mask : OrigReconstructMasks) { |
1719 | SmallVector<int> ReconstructMask; |
1720 | for (int M : Mask) { |
1721 | auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) { |
1722 | auto It = find_if(Range: V, P: [M](auto A) { return A.second == M; }); |
1723 | assert(It != V.end() && "Expected all entries in Mask" ); |
1724 | return std::distance(first: V.begin(), last: It); |
1725 | }; |
1726 | if (M < 0) |
1727 | ReconstructMask.push_back(Elt: -1); |
1728 | else if (M < static_cast<int>(NumElts)) { |
1729 | ReconstructMask.push_back(Elt: FindIndex(V1, M)); |
1730 | } else { |
1731 | ReconstructMask.push_back(Elt: NumElts + FindIndex(V2, M)); |
1732 | } |
1733 | } |
1734 | ReconstructMasks.push_back(Elt: std::move(ReconstructMask)); |
1735 | } |
1736 | |
// Calculate the masks needed for the new input shuffles, which get padded
// with poison elements.
1739 | SmallVector<int> V1A, V1B, V2A, V2B; |
1740 | for (unsigned I = 0; I < V1.size(); I++) { |
1741 | V1A.push_back(Elt: GetBaseMaskValue(SVI0A, V1[I].first)); |
1742 | V1B.push_back(Elt: GetBaseMaskValue(SVI0B, V1[I].first)); |
1743 | } |
1744 | for (unsigned I = 0; I < V2.size(); I++) { |
1745 | V2A.push_back(Elt: GetBaseMaskValue(SVI1A, V2[I].first)); |
1746 | V2B.push_back(Elt: GetBaseMaskValue(SVI1B, V2[I].first)); |
1747 | } |
1748 | while (V1A.size() < NumElts) { |
1749 | V1A.push_back(Elt: PoisonMaskElem); |
1750 | V1B.push_back(Elt: PoisonMaskElem); |
1751 | } |
1752 | while (V2A.size() < NumElts) { |
1753 | V2A.push_back(Elt: PoisonMaskElem); |
1754 | V2B.push_back(Elt: PoisonMaskElem); |
1755 | } |
1756 | |
1757 | auto AddShuffleCost = [&](InstructionCost C, Instruction *I) { |
1758 | auto *SV = dyn_cast<ShuffleVectorInst>(Val: I); |
1759 | if (!SV) |
1760 | return C; |
1761 | return C + TTI.getShuffleCost(Kind: isa<UndefValue>(Val: SV->getOperand(i_nocapture: 1)) |
1762 | ? TTI::SK_PermuteSingleSrc |
1763 | : TTI::SK_PermuteTwoSrc, |
1764 | Tp: VT, Mask: SV->getShuffleMask()); |
1765 | }; |
1766 | auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) { |
1767 | return C + TTI.getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, Tp: VT, Mask); |
1768 | }; |
1769 | |
1770 | // Get the costs of the shuffles + binops before and after with the new |
1771 | // shuffle masks. |
1772 | InstructionCost CostBefore = |
1773 | TTI.getArithmeticInstrCost(Opcode: Op0->getOpcode(), Ty: VT) + |
1774 | TTI.getArithmeticInstrCost(Opcode: Op1->getOpcode(), Ty: VT); |
1775 | CostBefore += std::accumulate(first: Shuffles.begin(), last: Shuffles.end(), |
1776 | init: InstructionCost(0), binary_op: AddShuffleCost); |
1777 | CostBefore += std::accumulate(first: InputShuffles.begin(), last: InputShuffles.end(), |
1778 | init: InstructionCost(0), binary_op: AddShuffleCost); |
1779 | |
1780 | // The new binops will be unused for lanes past the used shuffle lengths. |
1781 | // These types attempt to get the correct cost for that from the target. |
1782 | FixedVectorType *Op0SmallVT = |
1783 | FixedVectorType::get(ElementType: VT->getScalarType(), NumElts: V1.size()); |
1784 | FixedVectorType *Op1SmallVT = |
1785 | FixedVectorType::get(ElementType: VT->getScalarType(), NumElts: V2.size()); |
1786 | InstructionCost CostAfter = |
1787 | TTI.getArithmeticInstrCost(Opcode: Op0->getOpcode(), Ty: Op0SmallVT) + |
1788 | TTI.getArithmeticInstrCost(Opcode: Op1->getOpcode(), Ty: Op1SmallVT); |
1789 | CostAfter += std::accumulate(first: ReconstructMasks.begin(), last: ReconstructMasks.end(), |
1790 | init: InstructionCost(0), binary_op: AddShuffleMaskCost); |
1791 | std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B}); |
1792 | CostAfter += |
1793 | std::accumulate(first: OutputShuffleMasks.begin(), last: OutputShuffleMasks.end(), |
1794 | init: InstructionCost(0), binary_op: AddShuffleMaskCost); |
1795 | |
1796 | LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n" ); |
1797 | LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore |
1798 | << " vs CostAfter: " << CostAfter << "\n" ); |
1799 | if (CostBefore <= CostAfter) |
1800 | return false; |
1801 | |
1802 | // The cost model has passed, create the new instructions. |
1803 | auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * { |
1804 | auto *SV = dyn_cast<ShuffleVectorInst>(Val: I); |
1805 | if (!SV) |
1806 | return I; |
1807 | if (isa<UndefValue>(Val: SV->getOperand(i_nocapture: 1))) |
1808 | if (auto *SSV = dyn_cast<ShuffleVectorInst>(Val: SV->getOperand(i_nocapture: 0))) |
1809 | if (InputShuffles.contains(Ptr: SSV)) |
1810 | return SSV->getOperand(i_nocapture: Op); |
1811 | return SV->getOperand(i_nocapture: Op); |
1812 | }; |
1813 | Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef()); |
1814 | Value *NSV0A = Builder.CreateShuffleVector(V1: GetShuffleOperand(SVI0A, 0), |
1815 | V2: GetShuffleOperand(SVI0A, 1), Mask: V1A); |
1816 | Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef()); |
1817 | Value *NSV0B = Builder.CreateShuffleVector(V1: GetShuffleOperand(SVI0B, 0), |
1818 | V2: GetShuffleOperand(SVI0B, 1), Mask: V1B); |
1819 | Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef()); |
1820 | Value *NSV1A = Builder.CreateShuffleVector(V1: GetShuffleOperand(SVI1A, 0), |
1821 | V2: GetShuffleOperand(SVI1A, 1), Mask: V2A); |
1822 | Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef()); |
1823 | Value *NSV1B = Builder.CreateShuffleVector(V1: GetShuffleOperand(SVI1B, 0), |
1824 | V2: GetShuffleOperand(SVI1B, 1), Mask: V2B); |
1825 | Builder.SetInsertPoint(Op0); |
1826 | Value *NOp0 = Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Op0->getOpcode(), |
1827 | LHS: NSV0A, RHS: NSV0B); |
1828 | if (auto *I = dyn_cast<Instruction>(Val: NOp0)) |
1829 | I->copyIRFlags(V: Op0, IncludeWrapFlags: true); |
1830 | Builder.SetInsertPoint(Op1); |
1831 | Value *NOp1 = Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Op1->getOpcode(), |
1832 | LHS: NSV1A, RHS: NSV1B); |
1833 | if (auto *I = dyn_cast<Instruction>(Val: NOp1)) |
1834 | I->copyIRFlags(V: Op1, IncludeWrapFlags: true); |
1835 | |
1836 | for (int S = 0, E = ReconstructMasks.size(); S != E; S++) { |
1837 | Builder.SetInsertPoint(Shuffles[S]); |
1838 | Value *NSV = Builder.CreateShuffleVector(V1: NOp0, V2: NOp1, Mask: ReconstructMasks[S]); |
1839 | replaceValue(Old&: *Shuffles[S], New&: *NSV); |
1840 | } |
1841 | |
1842 | Worklist.pushValue(V: NSV0A); |
1843 | Worklist.pushValue(V: NSV0B); |
1844 | Worklist.pushValue(V: NSV1A); |
1845 | Worklist.pushValue(V: NSV1B); |
1846 | for (auto *S : Shuffles) |
1847 | Worklist.add(I: S); |
1848 | return true; |
1849 | } |
1850 | |
1851 | /// This is the entry point for all transforms. Pass manager differences are |
1852 | /// handled in the callers of this function. |
1853 | bool VectorCombine::run() { |
1854 | if (DisableVectorCombine) |
1855 | return false; |
1856 | |
1857 | // Don't attempt vectorization if the target does not support vectors. |
1858 | if (!TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(/*Vector*/ true))) |
1859 | return false; |
1860 | |
1861 | bool MadeChange = false; |
1862 | auto FoldInst = [this, &MadeChange](Instruction &I) { |
1863 | Builder.SetInsertPoint(&I); |
1864 | bool IsFixedVectorType = isa<FixedVectorType>(Val: I.getType()); |
1865 | auto Opcode = I.getOpcode(); |
1866 | |
1867 | // These folds should be beneficial regardless of when this pass is run |
1868 | // in the optimization pipeline. |
1869 | // The type checking is for run-time efficiency. We can avoid wasting time |
1870 | // dispatching to folding functions if there's no chance of matching. |
1871 | if (IsFixedVectorType) { |
1872 | switch (Opcode) { |
1873 | case Instruction::InsertElement: |
1874 | MadeChange |= vectorizeLoadInsert(I); |
1875 | break; |
1876 | case Instruction::ShuffleVector: |
1877 | MadeChange |= widenSubvectorLoad(I); |
1878 | break; |
1879 | default: |
1880 | break; |
1881 | } |
1882 | } |
1883 | |
1884 | // This transform works with scalable and fixed vectors |
1885 | // TODO: Identify and allow other scalable transforms |
1886 | if (isa<VectorType>(Val: I.getType())) { |
1887 | MadeChange |= scalarizeBinopOrCmp(I); |
1888 | MadeChange |= scalarizeLoadExtract(I); |
1889 | MadeChange |= scalarizeVPIntrinsic(I); |
1890 | } |
1891 | |
1892 | if (Opcode == Instruction::Store) |
1893 | MadeChange |= foldSingleElementStore(I); |
1894 | |
1895 | // If this is an early pipeline invocation of this pass, we are done. |
1896 | if (TryEarlyFoldsOnly) |
1897 | return; |
1898 | |
1899 | // Otherwise, try folds that improve codegen but may interfere with |
1900 | // early IR canonicalizations. |
1901 | // The type checking is for run-time efficiency. We can avoid wasting time |
1902 | // dispatching to folding functions if there's no chance of matching. |
1903 | if (IsFixedVectorType) { |
1904 | switch (Opcode) { |
1905 | case Instruction::InsertElement: |
1906 | MadeChange |= foldInsExtFNeg(I); |
1907 | break; |
1908 | case Instruction::ShuffleVector: |
1909 | MadeChange |= foldShuffleOfBinops(I); |
1910 | MadeChange |= foldSelectShuffle(I); |
1911 | break; |
1912 | case Instruction::BitCast: |
1913 | MadeChange |= foldBitcastShuffle(I); |
1914 | break; |
1915 | } |
1916 | } else { |
1917 | switch (Opcode) { |
1918 | case Instruction::Call: |
1919 | MadeChange |= foldShuffleFromReductions(I); |
1920 | break; |
1921 | case Instruction::ICmp: |
1922 | case Instruction::FCmp: |
1923 | MadeChange |= foldExtractExtract(I); |
1924 | break; |
1925 | default: |
1926 | if (Instruction::isBinaryOp(Opcode)) { |
1927 | MadeChange |= foldExtractExtract(I); |
1928 | MadeChange |= foldExtractedCmps(I); |
1929 | } |
1930 | break; |
1931 | } |
1932 | } |
1933 | }; |
1934 | |
1935 | for (BasicBlock &BB : F) { |
1936 | // Ignore unreachable basic blocks. |
1937 | if (!DT.isReachableFromEntry(A: &BB)) |
1938 | continue; |
1939 | // Use early increment range so that we can erase instructions in loop. |
1940 | for (Instruction &I : make_early_inc_range(Range&: BB)) { |
1941 | if (I.isDebugOrPseudoInst()) |
1942 | continue; |
1943 | FoldInst(I); |
1944 | } |
1945 | } |
1946 | |
1947 | while (!Worklist.isEmpty()) { |
1948 | Instruction *I = Worklist.removeOne(); |
1949 | if (!I) |
1950 | continue; |
1951 | |
1952 | if (isInstructionTriviallyDead(I)) { |
1953 | eraseInstruction(I&: *I); |
1954 | continue; |
1955 | } |
1956 | |
1957 | FoldInst(*I); |
1958 | } |
1959 | |
1960 | return MadeChange; |
1961 | } |
1962 | |
1963 | PreservedAnalyses VectorCombinePass::run(Function &F, |
1964 | FunctionAnalysisManager &FAM) { |
1965 | auto &AC = FAM.getResult<AssumptionAnalysis>(IR&: F); |
1966 | TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F); |
1967 | DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F); |
1968 | AAResults &AA = FAM.getResult<AAManager>(IR&: F); |
1969 | VectorCombine Combiner(F, TTI, DT, AA, AC, TryEarlyFoldsOnly); |
1970 | if (!Combiner.run()) |
1971 | return PreservedAnalyses::all(); |
1972 | PreservedAnalyses PA; |
1973 | PA.preserveSet<CFGAnalyses>(); |
1974 | return PA; |
1975 | } |
1976 | |