1 | //===------- VectorCombine.cpp - Optimize partial vector operations -------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass optimizes scalar/vector interactions using target cost models. The |
10 | // transforms implemented here may not fit in traditional loop-based or SLP |
11 | // vectorization passes. |
12 | // |
13 | //===----------------------------------------------------------------------===// |
14 | |
15 | #include "llvm/Transforms/Vectorize/VectorCombine.h" |
16 | #include "llvm/ADT/DenseMap.h" |
17 | #include "llvm/ADT/ScopeExit.h" |
18 | #include "llvm/ADT/Statistic.h" |
19 | #include "llvm/Analysis/AssumptionCache.h" |
20 | #include "llvm/Analysis/BasicAliasAnalysis.h" |
21 | #include "llvm/Analysis/GlobalsModRef.h" |
22 | #include "llvm/Analysis/Loads.h" |
23 | #include "llvm/Analysis/TargetTransformInfo.h" |
24 | #include "llvm/Analysis/ValueTracking.h" |
25 | #include "llvm/Analysis/VectorUtils.h" |
26 | #include "llvm/IR/Dominators.h" |
27 | #include "llvm/IR/Function.h" |
28 | #include "llvm/IR/IRBuilder.h" |
29 | #include "llvm/IR/PatternMatch.h" |
30 | #include "llvm/Support/CommandLine.h" |
31 | #include "llvm/Transforms/Utils/Local.h" |
32 | #include <numeric> |
33 | #include <queue> |
34 | |
35 | #define DEBUG_TYPE "vector-combine" |
36 | #include "llvm/Transforms/Utils/InstructionWorklist.h" |
37 | |
38 | using namespace llvm; |
39 | using namespace llvm::PatternMatch; |
40 | |
41 | STATISTIC(NumVecLoad, "Number of vector loads formed" ); |
42 | STATISTIC(NumVecCmp, "Number of vector compares formed" ); |
43 | STATISTIC(NumVecBO, "Number of vector binops formed" ); |
44 | STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed" ); |
45 | STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast" ); |
46 | STATISTIC(NumScalarBO, "Number of scalar binops formed" ); |
47 | STATISTIC(NumScalarCmp, "Number of scalar compares formed" ); |
48 | |
49 | static cl::opt<bool> DisableVectorCombine( |
50 | "disable-vector-combine" , cl::init(Val: false), cl::Hidden, |
51 | cl::desc("Disable all vector combine transforms" )); |
52 | |
53 | static cl::opt<bool> DisableBinopExtractShuffle( |
54 | "disable-binop-extract-shuffle" , cl::init(Val: false), cl::Hidden, |
55 | cl::desc("Disable binop extract to shuffle transforms" )); |
56 | |
57 | static cl::opt<unsigned> MaxInstrsToScan( |
58 | "vector-combine-max-scan-instrs" , cl::init(Val: 30), cl::Hidden, |
59 | cl::desc("Max number of instructions to scan for vector combining." )); |
60 | |
61 | static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max(); |
62 | |
63 | namespace { |
64 | class VectorCombine { |
65 | public: |
66 | VectorCombine(Function &F, const TargetTransformInfo &TTI, |
67 | const DominatorTree &DT, AAResults &AA, AssumptionCache &AC, |
68 | bool TryEarlyFoldsOnly) |
69 | : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC), |
70 | TryEarlyFoldsOnly(TryEarlyFoldsOnly) {} |
71 | |
72 | bool run(); |
73 | |
74 | private: |
75 | Function &F; |
76 | IRBuilder<> Builder; |
77 | const TargetTransformInfo &TTI; |
78 | const DominatorTree &DT; |
79 | AAResults &AA; |
80 | AssumptionCache &AC; |
81 | |
82 | /// If true, only perform beneficial early IR transforms. Do not introduce new |
83 | /// vector operations. |
84 | bool TryEarlyFoldsOnly; |
85 | |
86 | InstructionWorklist Worklist; |
87 | |
88 | // TODO: Direct calls from the top-level "run" loop use a plain "Instruction" |
89 | // parameter. That should be updated to specific sub-classes because the |
90 | // run loop was changed to dispatch on opcode. |
91 | bool vectorizeLoadInsert(Instruction &I); |
92 | bool widenSubvectorLoad(Instruction &I); |
93 | ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0, |
94 | ExtractElementInst *Ext1, |
95 |                                         unsigned PreferredExtractIndex) const; |
96 | bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1, |
97 | const Instruction &I, |
98 | ExtractElementInst *&ConvertToShuffle, |
99 |                              unsigned PreferredExtractIndex); |
100 | void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1, |
101 | Instruction &I); |
102 | void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1, |
103 | Instruction &I); |
104 | bool foldExtractExtract(Instruction &I); |
105 | bool foldInsExtFNeg(Instruction &I); |
106 | bool foldBitcastShuffle(Instruction &I); |
107 | bool scalarizeBinopOrCmp(Instruction &I); |
108 | bool scalarizeVPIntrinsic(Instruction &I); |
109 | bool foldExtractedCmps(Instruction &I); |
110 | bool foldSingleElementStore(Instruction &I); |
111 | bool scalarizeLoadExtract(Instruction &I); |
112 | bool foldShuffleOfBinops(Instruction &I); |
113 | bool foldShuffleFromReductions(Instruction &I); |
114 | bool foldSelectShuffle(Instruction &I, bool FromReduction = false); |
115 | |
116 | void replaceValue(Value &Old, Value &New) { |
117 | Old.replaceAllUsesWith(V: &New); |
118 | if (auto *NewI = dyn_cast<Instruction>(Val: &New)) { |
119 | New.takeName(V: &Old); |
120 | Worklist.pushUsersToWorkList(I&: *NewI); |
121 | Worklist.pushValue(V: NewI); |
122 | } |
123 | Worklist.pushValue(V: &Old); |
124 | } |
125 | |
126 | void eraseInstruction(Instruction &I) { |
127 | for (Value *Op : I.operands()) |
128 | Worklist.pushValue(V: Op); |
129 | Worklist.remove(I: &I); |
130 | I.eraseFromParent(); |
131 | } |
132 | }; |
133 | } // namespace |
134 | |
135 | static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) { |
136 | // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan. |
137 | // The widened load may load data from dirty regions or create data races |
138 | // non-existent in the source. |
139 | if (!Load || !Load->isSimple() || !Load->hasOneUse() || |
140 | Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) || |
141 | mustSuppressSpeculation(LI: *Load)) |
142 | return false; |
143 | |
144 | // We are potentially transforming byte-sized (8-bit) memory accesses, so make |
145 | // sure we have all of our type-based constraints in place for this target. |
146 | Type *ScalarTy = Load->getType()->getScalarType(); |
147 | uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); |
148 | unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); |
149 | if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 || |
150 | ScalarSize % 8 != 0) |
151 | return false; |
152 | |
153 | return true; |
154 | } |
155 | |
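// An illustrative example of the transform below (assuming a target whose
// minimum vector register width is 128 bits, so a <4 x float> load is formed):
//   %s = load float, ptr %p
//   %r = insertelement <4 x float> poison, float %s, i64 0
// -->
//   %v = load <4 x float>, ptr %p
//   %r = shufflevector <4 x float> %v, <4 x float> poison,
//        <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
// The wider load is only formed when the extra bytes are known to be
// dereferenceable and the cost model does not report a regression.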
156 | bool VectorCombine::vectorizeLoadInsert(Instruction &I) { |
157 | // Match insert into fixed vector of scalar value. |
158 | // TODO: Handle non-zero insert index. |
159 | Value *Scalar; |
160 | if (!match(V: &I, P: m_InsertElt(Val: m_Undef(), Elt: m_Value(V&: Scalar), Idx: m_ZeroInt())) || |
161 | !Scalar->hasOneUse()) |
162 | return false; |
163 | |
164 | // Optionally match an extract from another vector. |
165 | Value *X; |
166 |   bool HasExtract = match(V: Scalar, P: m_ExtractElt(Val: m_Value(V&: X), Idx: m_ZeroInt())); |
167 | if (!HasExtract) |
168 | X = Scalar; |
169 | |
170 | auto *Load = dyn_cast<LoadInst>(Val: X); |
171 | if (!canWidenLoad(Load, TTI)) |
172 | return false; |
173 | |
174 | Type *ScalarTy = Scalar->getType(); |
175 | uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits(); |
176 | unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth(); |
177 | |
178 | // Check safety of replacing the scalar load with a larger vector load. |
179 | // We use minimal alignment (maximum flexibility) because we only care about |
180 | // the dereferenceable region. When calculating cost and creating a new op, |
181 | // we may use a larger value based on alignment attributes. |
182 | const DataLayout &DL = I.getModule()->getDataLayout(); |
183 | Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); |
184 | assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type" ); |
185 | |
186 | unsigned MinVecNumElts = MinVectorSize / ScalarSize; |
187 | auto *MinVecTy = VectorType::get(ElementType: ScalarTy, NumElements: MinVecNumElts, Scalable: false); |
188 | unsigned OffsetEltIndex = 0; |
189 | Align Alignment = Load->getAlign(); |
190 | if (!isSafeToLoadUnconditionally(V: SrcPtr, Ty: MinVecTy, Alignment: Align(1), DL, ScanFrom: Load, AC: &AC, |
191 | DT: &DT)) { |
192 | // It is not safe to load directly from the pointer, but we can still peek |
193 |     // through gep offsets and check if it is safe to load from a base address with |
194 | // updated alignment. If it is, we can shuffle the element(s) into place |
195 | // after loading. |
196 | unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(Ty: SrcPtr->getType()); |
197 | APInt Offset(OffsetBitWidth, 0); |
198 | SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); |
199 | |
200 | // We want to shuffle the result down from a high element of a vector, so |
201 | // the offset must be positive. |
202 | if (Offset.isNegative()) |
203 | return false; |
204 | |
205 | // The offset must be a multiple of the scalar element to shuffle cleanly |
206 | // in the element's size. |
207 | uint64_t ScalarSizeInBytes = ScalarSize / 8; |
208 | if (Offset.urem(RHS: ScalarSizeInBytes) != 0) |
209 | return false; |
210 | |
211 | // If we load MinVecNumElts, will our target element still be loaded? |
212 | OffsetEltIndex = Offset.udiv(RHS: ScalarSizeInBytes).getZExtValue(); |
213 | if (OffsetEltIndex >= MinVecNumElts) |
214 | return false; |
215 | |
216 | if (!isSafeToLoadUnconditionally(V: SrcPtr, Ty: MinVecTy, Alignment: Align(1), DL, ScanFrom: Load, AC: &AC, |
217 | DT: &DT)) |
218 | return false; |
219 | |
220 | // Update alignment with offset value. Note that the offset could be negated |
221 | // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but |
222 | // negation does not change the result of the alignment calculation. |
223 | Alignment = commonAlignment(A: Alignment, Offset: Offset.getZExtValue()); |
224 | } |
225 | |
226 | // Original pattern: insertelt undef, load [free casts of] PtrOp, 0 |
227 | // Use the greater of the alignment on the load or its source pointer. |
228 | Alignment = std::max(a: SrcPtr->getPointerAlignment(DL), b: Alignment); |
229 | Type *LoadTy = Load->getType(); |
230 | unsigned AS = Load->getPointerAddressSpace(); |
231 | InstructionCost OldCost = |
232 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadTy, Alignment, AddressSpace: AS); |
233 | APInt DemandedElts = APInt::getOneBitSet(numBits: MinVecNumElts, BitNo: 0); |
234 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
235 | OldCost += |
236 | TTI.getScalarizationOverhead(Ty: MinVecTy, DemandedElts, |
237 | /* Insert */ true, Extract: HasExtract, CostKind); |
238 | |
239 | // New pattern: load VecPtr |
240 | InstructionCost NewCost = |
241 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: MinVecTy, Alignment, AddressSpace: AS); |
242 | // Optionally, we are shuffling the loaded vector element(s) into place. |
243 | // For the mask set everything but element 0 to undef to prevent poison from |
244 | // propagating from the extra loaded memory. This will also optionally |
245 | // shrink/grow the vector from the loaded size to the output size. |
246 | // We assume this operation has no cost in codegen if there was no offset. |
247 | // Note that we could use freeze to avoid poison problems, but then we might |
248 | // still need a shuffle to change the vector size. |
249 | auto *Ty = cast<FixedVectorType>(Val: I.getType()); |
250 | unsigned OutputNumElts = Ty->getNumElements(); |
251 | SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem); |
252 | assert(OffsetEltIndex < MinVecNumElts && "Address offset too big" ); |
253 | Mask[0] = OffsetEltIndex; |
254 | if (OffsetEltIndex) |
255 | NewCost += TTI.getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: MinVecTy, Mask); |
256 | |
257 | // We can aggressively convert to the vector form because the backend can |
258 | // invert this transform if it does not result in a performance win. |
259 | if (OldCost < NewCost || !NewCost.isValid()) |
260 | return false; |
261 | |
262 | // It is safe and potentially profitable to load a vector directly: |
263 | // inselt undef, load Scalar, 0 --> load VecPtr |
264 | IRBuilder<> Builder(Load); |
265 | Value *CastedPtr = |
266 | Builder.CreatePointerBitCastOrAddrSpaceCast(V: SrcPtr, DestTy: Builder.getPtrTy(AddrSpace: AS)); |
267 | Value *VecLd = Builder.CreateAlignedLoad(Ty: MinVecTy, Ptr: CastedPtr, Align: Alignment); |
268 | VecLd = Builder.CreateShuffleVector(V: VecLd, Mask); |
269 | |
270 | replaceValue(Old&: I, New&: *VecLd); |
271 | ++NumVecLoad; |
272 | return true; |
273 | } |
274 | |
275 | /// If we are loading a vector and then inserting it into a larger vector with |
276 | /// undefined elements, try to load the larger vector and eliminate the insert. |
277 | /// This removes a shuffle in IR and may allow combining of other loaded values. |
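/// An illustrative example (types depend on the actual IR):
///   %v = load <2 x float>, ptr %p
///   %r = shufflevector <2 x float> %v, <2 x float> poison,
///        <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
/// -->
///   %r = load <4 x float>, ptr %p
/// This only fires when the wider region is known dereferenceable and the
/// wider load is not costed higher than the original one.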
278 | bool VectorCombine::widenSubvectorLoad(Instruction &I) { |
279 | // Match subvector insert of fixed vector. |
280 | auto *Shuf = cast<ShuffleVectorInst>(Val: &I); |
281 | if (!Shuf->isIdentityWithPadding()) |
282 | return false; |
283 | |
284 | // Allow a non-canonical shuffle mask that is choosing elements from op1. |
285 | unsigned NumOpElts = |
286 | cast<FixedVectorType>(Val: Shuf->getOperand(i_nocapture: 0)->getType())->getNumElements(); |
287 | unsigned OpIndex = any_of(Range: Shuf->getShuffleMask(), P: [&NumOpElts](int M) { |
288 | return M >= (int)(NumOpElts); |
289 | }); |
290 | |
291 | auto *Load = dyn_cast<LoadInst>(Val: Shuf->getOperand(i_nocapture: OpIndex)); |
292 | if (!canWidenLoad(Load, TTI)) |
293 | return false; |
294 | |
295 | // We use minimal alignment (maximum flexibility) because we only care about |
296 | // the dereferenceable region. When calculating cost and creating a new op, |
297 | // we may use a larger value based on alignment attributes. |
298 | auto *Ty = cast<FixedVectorType>(Val: I.getType()); |
299 | const DataLayout &DL = I.getModule()->getDataLayout(); |
300 | Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts(); |
301 | assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type" ); |
302 | Align Alignment = Load->getAlign(); |
303 | if (!isSafeToLoadUnconditionally(V: SrcPtr, Ty, Alignment: Align(1), DL, ScanFrom: Load, AC: &AC, DT: &DT)) |
304 | return false; |
305 | |
306 | Alignment = std::max(a: SrcPtr->getPointerAlignment(DL), b: Alignment); |
307 | Type *LoadTy = Load->getType(); |
308 | unsigned AS = Load->getPointerAddressSpace(); |
309 | |
310 | // Original pattern: insert_subvector (load PtrOp) |
311 | // This conservatively assumes that the cost of a subvector insert into an |
312 | // undef value is 0. We could add that cost if the cost model accurately |
313 | // reflects the real cost of that operation. |
314 | InstructionCost OldCost = |
315 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadTy, Alignment, AddressSpace: AS); |
316 | |
317 | // New pattern: load PtrOp |
318 | InstructionCost NewCost = |
319 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: Ty, Alignment, AddressSpace: AS); |
320 | |
321 | // We can aggressively convert to the vector form because the backend can |
322 | // invert this transform if it does not result in a performance win. |
323 | if (OldCost < NewCost || !NewCost.isValid()) |
324 | return false; |
325 | |
326 | IRBuilder<> Builder(Load); |
327 | Value *CastedPtr = |
328 | Builder.CreatePointerBitCastOrAddrSpaceCast(V: SrcPtr, DestTy: Builder.getPtrTy(AddrSpace: AS)); |
329 | Value *VecLd = Builder.CreateAlignedLoad(Ty, Ptr: CastedPtr, Align: Alignment); |
330 | replaceValue(Old&: I, New&: *VecLd); |
331 | ++NumVecLoad; |
332 | return true; |
333 | } |
334 | |
335 | /// Determine which, if any, of the inputs should be replaced by a shuffle |
336 | /// followed by extract from a different index. |
337 | ExtractElementInst *VectorCombine::getShuffleExtract( |
338 | ExtractElementInst *Ext0, ExtractElementInst *Ext1, |
339 |     unsigned PreferredExtractIndex = InvalidIndex) const { |
340 | auto *Index0C = dyn_cast<ConstantInt>(Val: Ext0->getIndexOperand()); |
341 | auto *Index1C = dyn_cast<ConstantInt>(Val: Ext1->getIndexOperand()); |
342 | assert(Index0C && Index1C && "Expected constant extract indexes" ); |
343 | |
344 | unsigned Index0 = Index0C->getZExtValue(); |
345 | unsigned Index1 = Index1C->getZExtValue(); |
346 | |
347 | // If the extract indexes are identical, no shuffle is needed. |
348 | if (Index0 == Index1) |
349 | return nullptr; |
350 | |
351 | Type *VecTy = Ext0->getVectorOperand()->getType(); |
352 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
353 | assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types" ); |
354 | InstructionCost Cost0 = |
355 | TTI.getVectorInstrCost(I: *Ext0, Val: VecTy, CostKind, Index: Index0); |
356 | InstructionCost Cost1 = |
357 | TTI.getVectorInstrCost(I: *Ext1, Val: VecTy, CostKind, Index: Index1); |
358 | |
359 |   // If both costs are invalid, no shuffle is needed. |
360 | if (!Cost0.isValid() && !Cost1.isValid()) |
361 | return nullptr; |
362 | |
363 | // We are extracting from 2 different indexes, so one operand must be shuffled |
364 | // before performing a vector operation and/or extract. The more expensive |
365 | // extract will be replaced by a shuffle. |
366 | if (Cost0 > Cost1) |
367 | return Ext0; |
368 | if (Cost1 > Cost0) |
369 | return Ext1; |
370 | |
371 | // If the costs are equal and there is a preferred extract index, shuffle the |
372 | // opposite operand. |
373 | if (PreferredExtractIndex == Index0) |
374 | return Ext1; |
375 | if (PreferredExtractIndex == Index1) |
376 | return Ext0; |
377 | |
378 | // Otherwise, replace the extract with the higher index. |
379 | return Index0 > Index1 ? Ext0 : Ext1; |
380 | } |
381 | |
382 | /// Compare the relative costs of 2 extracts followed by scalar operation vs. |
383 | /// vector operation(s) followed by extract. Return true if the existing |
384 | /// instructions are cheaper than a vector alternative. Otherwise, return false |
385 | /// and if one of the extracts should be transformed to a shufflevector, set |
386 | /// \p ConvertToShuffle to that extract instruction. |
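/// Roughly (an illustrative summary of the code below, not the exact model):
///   OldCost = Extract0Cost + Extract1Cost + ScalarOpCost
///   NewCost = VectorOpCost + CheapExtractCost (+ a shuffle cost when the
///             extract indexes differ)
/// with extra-use adjustments for extracts that cannot be erased.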
387 | bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, |
388 | ExtractElementInst *Ext1, |
389 | const Instruction &I, |
390 | ExtractElementInst *&ConvertToShuffle, |
391 |                                           unsigned PreferredExtractIndex) { |
392 | auto *Ext0IndexC = dyn_cast<ConstantInt>(Val: Ext0->getOperand(i_nocapture: 1)); |
393 | auto *Ext1IndexC = dyn_cast<ConstantInt>(Val: Ext1->getOperand(i_nocapture: 1)); |
394 | assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes" ); |
395 | |
396 | unsigned Opcode = I.getOpcode(); |
397 | Type *ScalarTy = Ext0->getType(); |
398 | auto *VecTy = cast<VectorType>(Val: Ext0->getOperand(i_nocapture: 0)->getType()); |
399 | InstructionCost ScalarOpCost, VectorOpCost; |
400 | |
401 | // Get cost estimates for scalar and vector versions of the operation. |
402 | bool IsBinOp = Instruction::isBinaryOp(Opcode); |
403 | if (IsBinOp) { |
404 | ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: ScalarTy); |
405 | VectorOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: VecTy); |
406 | } else { |
407 | assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && |
408 | "Expected a compare" ); |
409 | CmpInst::Predicate Pred = cast<CmpInst>(Val: I).getPredicate(); |
410 | ScalarOpCost = TTI.getCmpSelInstrCost( |
411 | Opcode, ValTy: ScalarTy, CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy), VecPred: Pred); |
412 | VectorOpCost = TTI.getCmpSelInstrCost( |
413 | Opcode, ValTy: VecTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VecTy), VecPred: Pred); |
414 | } |
415 | |
416 | // Get cost estimates for the extract elements. These costs will factor into |
417 | // both sequences. |
418 | unsigned Ext0Index = Ext0IndexC->getZExtValue(); |
419 | unsigned Ext1Index = Ext1IndexC->getZExtValue(); |
420 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
421 | |
422 |   InstructionCost Extract0Cost = |
423 | TTI.getVectorInstrCost(I: *Ext0, Val: VecTy, CostKind, Index: Ext0Index); |
424 |   InstructionCost Extract1Cost = |
425 | TTI.getVectorInstrCost(I: *Ext1, Val: VecTy, CostKind, Index: Ext1Index); |
426 | |
427 | // A more expensive extract will always be replaced by a splat shuffle. |
428 | // For example, if Ext0 is more expensive: |
429 | // opcode (extelt V0, Ext0), (ext V1, Ext1) --> |
430 | // extelt (opcode (splat V0, Ext0), V1), Ext1 |
431 | // TODO: Evaluate whether that always results in lowest cost. Alternatively, |
432 | // check the cost of creating a broadcast shuffle and shuffling both |
433 | // operands to element 0. |
434 |   InstructionCost CheapExtractCost = std::min(a: Extract0Cost, b: Extract1Cost); |
435 | |
436 | // Extra uses of the extracts mean that we include those costs in the |
437 | // vector total because those instructions will not be eliminated. |
438 | InstructionCost OldCost, NewCost; |
439 | if (Ext0->getOperand(i_nocapture: 0) == Ext1->getOperand(i_nocapture: 0) && Ext0Index == Ext1Index) { |
440 | // Handle a special case. If the 2 extracts are identical, adjust the |
441 | // formulas to account for that. The extra use charge allows for either the |
442 | // CSE'd pattern or an unoptimized form with identical values: |
443 | // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C |
444 | bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(N: 2) |
445 | : !Ext0->hasOneUse() || !Ext1->hasOneUse(); |
446 | OldCost = CheapExtractCost + ScalarOpCost; |
447 | NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost; |
448 | } else { |
449 | // Handle the general case. Each extract is actually a different value: |
450 | // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C |
451 | OldCost = Extract0Cost + Extract1Cost + ScalarOpCost; |
452 | NewCost = VectorOpCost + CheapExtractCost + |
453 | !Ext0->hasOneUse() * Extract0Cost + |
454 | !Ext1->hasOneUse() * Extract1Cost; |
455 | } |
456 | |
457 | ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex); |
458 | if (ConvertToShuffle) { |
459 | if (IsBinOp && DisableBinopExtractShuffle) |
460 | return true; |
461 | |
462 | // If we are extracting from 2 different indexes, then one operand must be |
463 | // shuffled before performing the vector operation. The shuffle mask is |
464 | // poison except for 1 lane that is being translated to the remaining |
465 | // extraction lane. Therefore, it is a splat shuffle. Ex: |
466 | // ShufMask = { poison, poison, 0, poison } |
467 | // TODO: The cost model has an option for a "broadcast" shuffle |
468 | // (splat-from-element-0), but no option for a more general splat. |
469 | NewCost += |
470 | TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: VecTy); |
471 | } |
472 | |
473 | // Aggressively form a vector op if the cost is equal because the transform |
474 | // may enable further optimization. |
475 | // Codegen can reverse this transform (scalarize) if it was not profitable. |
476 | return OldCost < NewCost; |
477 | } |
478 | |
479 | /// Create a shuffle that translates (shifts) 1 element from the input vector |
480 | /// to a new element location. |
481 | static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, |
482 | unsigned NewIndex, IRBuilder<> &Builder) { |
483 | // The shuffle mask is poison except for 1 lane that is being translated |
484 | // to the new element index. Example for OldIndex == 2 and NewIndex == 0: |
485 | // ShufMask = { 2, poison, poison, poison } |
486 | auto *VecTy = cast<FixedVectorType>(Val: Vec->getType()); |
487 | SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem); |
488 | ShufMask[NewIndex] = OldIndex; |
489 | return Builder.CreateShuffleVector(V: Vec, Mask: ShufMask, Name: "shift" ); |
490 | } |
491 | |
492 | /// Given an extract element instruction with constant index operand, shuffle |
493 | /// the source vector (shift the scalar element) to a NewIndex for extraction. |
494 | /// Return null if the input can be constant folded, so that we are not creating |
495 | /// unnecessary instructions. |
496 | static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, |
497 | unsigned NewIndex, |
498 | IRBuilder<> &Builder) { |
499 | // Shufflevectors can only be created for fixed-width vectors. |
500 | if (!isa<FixedVectorType>(Val: ExtElt->getOperand(i_nocapture: 0)->getType())) |
501 | return nullptr; |
502 | |
503 | // If the extract can be constant-folded, this code is unsimplified. Defer |
504 | // to other passes to handle that. |
505 | Value *X = ExtElt->getVectorOperand(); |
506 | Value *C = ExtElt->getIndexOperand(); |
507 | assert(isa<ConstantInt>(C) && "Expected a constant index operand" ); |
508 | if (isa<Constant>(Val: X)) |
509 | return nullptr; |
510 | |
511 | Value *Shuf = createShiftShuffle(Vec: X, OldIndex: cast<ConstantInt>(Val: C)->getZExtValue(), |
512 | NewIndex, Builder); |
513 | return cast<ExtractElementInst>(Val: Builder.CreateExtractElement(Vec: Shuf, Idx: NewIndex)); |
514 | } |
515 | |
516 | /// Try to reduce extract element costs by converting scalar compares to vector |
517 | /// compares followed by extract. |
518 | /// cmp (ext0 V0, C), (ext1 V1, C) |
519 | void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0, |
520 | ExtractElementInst *Ext1, Instruction &I) { |
521 | assert(isa<CmpInst>(&I) && "Expected a compare" ); |
522 | assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == |
523 | cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && |
524 | "Expected matching constant extract indexes" ); |
525 | |
526 | // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C |
527 | ++NumVecCmp; |
528 | CmpInst::Predicate Pred = cast<CmpInst>(Val: &I)->getPredicate(); |
529 | Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); |
530 | Value *VecCmp = Builder.CreateCmp(Pred, LHS: V0, RHS: V1); |
531 | Value *NewExt = Builder.CreateExtractElement(Vec: VecCmp, Idx: Ext0->getIndexOperand()); |
532 | replaceValue(Old&: I, New&: *NewExt); |
533 | } |
534 | |
535 | /// Try to reduce extract element costs by converting scalar binops to vector |
536 | /// binops followed by extract. |
537 | /// bo (ext0 V0, C), (ext1 V1, C) |
538 | void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0, |
539 | ExtractElementInst *Ext1, Instruction &I) { |
540 | assert(isa<BinaryOperator>(&I) && "Expected a binary operator" ); |
541 | assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() == |
542 | cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() && |
543 | "Expected matching constant extract indexes" ); |
544 | |
545 | // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C |
546 | ++NumVecBO; |
547 | Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); |
548 | Value *VecBO = |
549 | Builder.CreateBinOp(Opc: cast<BinaryOperator>(Val: &I)->getOpcode(), LHS: V0, RHS: V1); |
550 | |
551 | // All IR flags are safe to back-propagate because any potential poison |
552 | // created in unused vector elements is discarded by the extract. |
553 | if (auto *VecBOInst = dyn_cast<Instruction>(Val: VecBO)) |
554 | VecBOInst->copyIRFlags(V: &I); |
555 | |
556 | Value *NewExt = Builder.CreateExtractElement(Vec: VecBO, Idx: Ext0->getIndexOperand()); |
557 | replaceValue(Old&: I, New&: *NewExt); |
558 | } |
559 | |
560 | /// Match an instruction with extracted vector operands. |
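/// For example (illustrative):
///   %e0 = extractelement <4 x i32> %x, i32 0
///   %e1 = extractelement <4 x i32> %y, i32 0
///   %r  = add i32 %e0, %e1
/// may become:
///   %v = add <4 x i32> %x, %y
///   %r = extractelement <4 x i32> %v, i32 0
/// when the cost model does not consider the vector form more expensive.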
561 | bool VectorCombine::foldExtractExtract(Instruction &I) { |
562 | // It is not safe to transform things like div, urem, etc. because we may |
563 | // create undefined behavior when executing those on unknown vector elements. |
564 | if (!isSafeToSpeculativelyExecute(I: &I)) |
565 | return false; |
566 | |
567 | Instruction *I0, *I1; |
568 | CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
569 | if (!match(V: &I, P: m_Cmp(Pred, L: m_Instruction(I&: I0), R: m_Instruction(I&: I1))) && |
570 | !match(V: &I, P: m_BinOp(L: m_Instruction(I&: I0), R: m_Instruction(I&: I1)))) |
571 | return false; |
572 | |
573 | Value *V0, *V1; |
574 | uint64_t C0, C1; |
575 | if (!match(V: I0, P: m_ExtractElt(Val: m_Value(V&: V0), Idx: m_ConstantInt(V&: C0))) || |
576 | !match(V: I1, P: m_ExtractElt(Val: m_Value(V&: V1), Idx: m_ConstantInt(V&: C1))) || |
577 | V0->getType() != V1->getType()) |
578 | return false; |
579 | |
580 | // If the scalar value 'I' is going to be re-inserted into a vector, then try |
581 | // to create an extract to that same element. The extract/insert can be |
582 | // reduced to a "select shuffle". |
583 | // TODO: If we add a larger pattern match that starts from an insert, this |
584 | // probably becomes unnecessary. |
585 | auto *Ext0 = cast<ExtractElementInst>(Val: I0); |
586 | auto *Ext1 = cast<ExtractElementInst>(Val: I1); |
587 | uint64_t InsertIndex = InvalidIndex; |
588 | if (I.hasOneUse()) |
589 | match(V: I.user_back(), |
590 | P: m_InsertElt(Val: m_Value(), Elt: m_Value(), Idx: m_ConstantInt(V&: InsertIndex))); |
591 | |
592 |   ExtractElementInst *ExtractToChange; |
593 | if (isExtractExtractCheap(Ext0, Ext1, I, ConvertToShuffle&: ExtractToChange, PreferredExtractIndex: InsertIndex)) |
594 | return false; |
595 | |
596 | if (ExtractToChange) { |
597 |     unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0; |
598 |     ExtractElementInst *NewExtract = |
599 | translateExtract(ExtElt: ExtractToChange, NewIndex: CheapExtractIdx, Builder); |
600 | if (!NewExtract) |
601 | return false; |
602 | if (ExtractToChange == Ext0) |
603 | Ext0 = NewExtract; |
604 | else |
605 | Ext1 = NewExtract; |
606 | } |
607 | |
608 | if (Pred != CmpInst::BAD_ICMP_PREDICATE) |
609 | foldExtExtCmp(Ext0, Ext1, I); |
610 | else |
611 | foldExtExtBinop(Ext0, Ext1, I); |
612 | |
613 | Worklist.push(I: Ext0); |
614 | Worklist.push(I: Ext1); |
615 | return true; |
616 | } |
617 | |
618 | /// Try to replace an extract + scalar fneg + insert with a vector fneg + |
619 | /// shuffle. |
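/// For example (illustrative):
///   %e = extractelement <4 x float> %src, i64 1
///   %n = fneg float %e
///   %r = insertelement <4 x float> %dst, float %n, i64 1
/// may become:
///   %v = fneg <4 x float> %src
///   %r = shufflevector <4 x float> %dst, <4 x float> %v,
///        <4 x i32> <i32 0, i32 5, i32 2, i32 3>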
620 | bool VectorCombine::foldInsExtFNeg(Instruction &I) { |
621 | // Match an insert (op (extract)) pattern. |
622 | Value *DestVec; |
623 | uint64_t Index; |
624 | Instruction *FNeg; |
625 | if (!match(V: &I, P: m_InsertElt(Val: m_Value(V&: DestVec), Elt: m_OneUse(SubPattern: m_Instruction(I&: FNeg)), |
626 | Idx: m_ConstantInt(V&: Index)))) |
627 | return false; |
628 | |
629 | // Note: This handles the canonical fneg instruction and "fsub -0.0, X". |
630 | Value *SrcVec; |
631 |   Instruction *Extract; |
632 | if (!match(V: FNeg, P: m_FNeg(X: m_CombineAnd( |
633 | L: m_Instruction(I&: Extract), |
634 | R: m_ExtractElt(Val: m_Value(V&: SrcVec), Idx: m_SpecificInt(V: Index)))))) |
635 | return false; |
636 | |
637 | // TODO: We could handle this with a length-changing shuffle. |
638 | auto *VecTy = cast<FixedVectorType>(Val: I.getType()); |
639 | if (SrcVec->getType() != VecTy) |
640 | return false; |
641 | |
642 | // Ignore bogus insert/extract index. |
643 | unsigned NumElts = VecTy->getNumElements(); |
644 | if (Index >= NumElts) |
645 | return false; |
646 | |
647 | // We are inserting the negated element into the same lane that we extracted |
648 | // from. This is equivalent to a select-shuffle that chooses all but the |
649 | // negated element from the destination vector. |
650 | SmallVector<int> Mask(NumElts); |
651 | std::iota(first: Mask.begin(), last: Mask.end(), value: 0); |
652 | Mask[Index] = Index + NumElts; |
653 | |
654 | Type *ScalarTy = VecTy->getScalarType(); |
655 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
656 | InstructionCost OldCost = |
657 | TTI.getArithmeticInstrCost(Opcode: Instruction::FNeg, Ty: ScalarTy) + |
658 | TTI.getVectorInstrCost(I, Val: VecTy, CostKind, Index); |
659 | |
660 | // If the extract has one use, it will be eliminated, so count it in the |
661 | // original cost. If it has more than one use, ignore the cost because it will |
662 | // be the same before/after. |
663 | if (Extract->hasOneUse()) |
664 | OldCost += TTI.getVectorInstrCost(I: *Extract, Val: VecTy, CostKind, Index); |
665 | |
666 | InstructionCost NewCost = |
667 | TTI.getArithmeticInstrCost(Opcode: Instruction::FNeg, Ty: VecTy) + |
668 | TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Select, Tp: VecTy, Mask); |
669 | |
670 | if (NewCost > OldCost) |
671 | return false; |
672 | |
673 | // insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index --> |
674 | // shuffle DestVec, (fneg SrcVec), Mask |
675 | Value *VecFNeg = Builder.CreateFNegFMF(V: SrcVec, FMFSource: FNeg); |
676 | Value *Shuf = Builder.CreateShuffleVector(V1: DestVec, V2: VecFNeg, Mask); |
677 | replaceValue(Old&: I, New&: *Shuf); |
678 | return true; |
679 | } |
680 | |
681 | /// If this is a bitcast of a shuffle, try to bitcast the source vector to the |
682 | /// destination type followed by shuffle. This can enable further transforms by |
683 | /// moving bitcasts or shuffles together. |
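/// For example (illustrative, bitcasting narrow i16 elements to wider i32):
///   %s = shufflevector <4 x i16> %v, <4 x i16> poison,
///        <4 x i32> <i32 2, i32 3, i32 0, i32 1>
///   %r = bitcast <4 x i16> %s to <2 x i32>
/// may become:
///   %b = bitcast <4 x i16> %v to <2 x i32>
///   %r = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> <i32 1, i32 0>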
684 | bool VectorCombine::foldBitcastShuffle(Instruction &I) { |
685 | Value *V; |
686 | ArrayRef<int> Mask; |
687 | if (!match(V: &I, P: m_BitCast( |
688 | Op: m_OneUse(SubPattern: m_Shuffle(v1: m_Value(V), v2: m_Undef(), mask: m_Mask(Mask)))))) |
689 | return false; |
690 | |
691 | // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for |
692 | // scalable type is unknown; Second, we cannot reason if the narrowed shuffle |
693 | // mask for scalable type is a splat or not. |
694 | // 2) Disallow non-vector casts. |
695 | // TODO: We could allow any shuffle. |
696 | auto *DestTy = dyn_cast<FixedVectorType>(Val: I.getType()); |
697 | auto *SrcTy = dyn_cast<FixedVectorType>(Val: V->getType()); |
698 | if (!DestTy || !SrcTy) |
699 | return false; |
700 | |
701 | unsigned DestEltSize = DestTy->getScalarSizeInBits(); |
702 | unsigned SrcEltSize = SrcTy->getScalarSizeInBits(); |
703 | if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0) |
704 | return false; |
705 | |
706 | SmallVector<int, 16> NewMask; |
707 | if (DestEltSize <= SrcEltSize) { |
708 | // The bitcast is from wide to narrow/equal elements. The shuffle mask can |
709 | // always be expanded to the equivalent form choosing narrower elements. |
710 | assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask" ); |
711 | unsigned ScaleFactor = SrcEltSize / DestEltSize; |
712 | narrowShuffleMaskElts(Scale: ScaleFactor, Mask, ScaledMask&: NewMask); |
713 | } else { |
714 | // The bitcast is from narrow elements to wide elements. The shuffle mask |
715 | // must choose consecutive elements to allow casting first. |
716 | assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask" ); |
717 | unsigned ScaleFactor = DestEltSize / SrcEltSize; |
718 | if (!widenShuffleMaskElts(Scale: ScaleFactor, Mask, ScaledMask&: NewMask)) |
719 | return false; |
720 | } |
721 | |
722 |   // Bitcast the shuffle src - keep its original width but use the destination |
723 | // scalar type. |
724 | unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize; |
725 | auto *ShuffleTy = FixedVectorType::get(ElementType: DestTy->getScalarType(), NumElts: NumSrcElts); |
726 | |
727 | // The new shuffle must not cost more than the old shuffle. The bitcast is |
728 | // moved ahead of the shuffle, so assume that it has the same cost as before. |
729 | InstructionCost DestCost = TTI.getShuffleCost( |
730 | Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: ShuffleTy, Mask: NewMask); |
731 | InstructionCost SrcCost = |
732 | TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: SrcTy, Mask); |
733 | if (DestCost > SrcCost || !DestCost.isValid()) |
734 | return false; |
735 | |
736 | // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC' |
737 | ++NumShufOfBitcast; |
738 | Value *CastV = Builder.CreateBitCast(V, DestTy: ShuffleTy); |
739 | Value *Shuf = Builder.CreateShuffleVector(V: CastV, Mask: NewMask); |
740 | replaceValue(Old&: I, New&: *Shuf); |
741 | return true; |
742 | } |
743 | |
744 | /// VP Intrinsics whose vector operands are both splat values may be simplified |
745 | /// into the scalar version of the operation and the result splatted. This |
746 | /// can lead to scalarization down the line. |
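/// For example (illustrative): a vp.add whose two vector operands are splats
/// of %x and %y and whose mask is all-true may become
///   %s = add i32 %x, %y
/// followed by a splat of %s, subject to the cost model and an EVL/speculation
/// safety check.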
747 | bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { |
748 | if (!isa<VPIntrinsic>(Val: I)) |
749 | return false; |
750 | VPIntrinsic &VPI = cast<VPIntrinsic>(Val&: I); |
751 | Value *Op0 = VPI.getArgOperand(i: 0); |
752 | Value *Op1 = VPI.getArgOperand(i: 1); |
753 | |
754 | if (!isSplatValue(V: Op0) || !isSplatValue(V: Op1)) |
755 | return false; |
756 | |
757 | // Check getSplatValue early in this function, to avoid doing unnecessary |
758 | // work. |
759 | Value *ScalarOp0 = getSplatValue(V: Op0); |
760 | Value *ScalarOp1 = getSplatValue(V: Op1); |
761 | if (!ScalarOp0 || !ScalarOp1) |
762 | return false; |
763 | |
764 | // For the binary VP intrinsics supported here, the result on disabled lanes |
765 | // is a poison value. For now, only do this simplification if all lanes |
766 | // are active. |
767 | // TODO: Relax the condition that all lanes are active by using insertelement |
768 | // on inactive lanes. |
769 | auto IsAllTrueMask = [](Value *MaskVal) { |
770 | if (Value *SplattedVal = getSplatValue(V: MaskVal)) |
771 | if (auto *ConstValue = dyn_cast<Constant>(Val: SplattedVal)) |
772 | return ConstValue->isAllOnesValue(); |
773 | return false; |
774 | }; |
775 | if (!IsAllTrueMask(VPI.getArgOperand(i: 2))) |
776 | return false; |
777 | |
778 | // Check to make sure we support scalarization of the intrinsic |
779 | Intrinsic::ID IntrID = VPI.getIntrinsicID(); |
780 | if (!VPBinOpIntrinsic::isVPBinOp(ID: IntrID)) |
781 | return false; |
782 | |
783 | // Calculate cost of splatting both operands into vectors and the vector |
784 | // intrinsic |
785 | VectorType *VecTy = cast<VectorType>(Val: VPI.getType()); |
786 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
787 | InstructionCost SplatCost = |
788 | TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: 0) + |
789 | TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, Tp: VecTy); |
790 | |
791 | // Calculate the cost of the VP Intrinsic |
792 | SmallVector<Type *, 4> Args; |
793 | for (Value *V : VPI.args()) |
794 | Args.push_back(Elt: V->getType()); |
795 | IntrinsicCostAttributes Attrs(IntrID, VecTy, Args); |
796 | InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind); |
797 | InstructionCost OldCost = 2 * SplatCost + VectorOpCost; |
798 | |
799 | // Determine scalar opcode |
800 | std::optional<unsigned> FunctionalOpcode = |
801 | VPI.getFunctionalOpcode(); |
802 | std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt; |
803 | if (!FunctionalOpcode) { |
804 | ScalarIntrID = VPI.getFunctionalIntrinsicID(); |
805 | if (!ScalarIntrID) |
806 | return false; |
807 | } |
808 | |
809 | // Calculate cost of scalarizing |
810 | InstructionCost ScalarOpCost = 0; |
811 | if (ScalarIntrID) { |
812 | IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args); |
813 | ScalarOpCost = TTI.getIntrinsicInstrCost(ICA: Attrs, CostKind); |
814 | } else { |
815 | ScalarOpCost = |
816 | TTI.getArithmeticInstrCost(Opcode: *FunctionalOpcode, Ty: VecTy->getScalarType()); |
817 | } |
818 | |
819 | // The existing splats may be kept around if other instructions use them. |
820 | InstructionCost CostToKeepSplats = |
821 | (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse()); |
822 | InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats; |
823 | |
824 | LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI |
825 | << "\n" ); |
826 | LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost |
827 | << ", Cost of scalarizing:" << NewCost << "\n" ); |
828 | |
829 | // We want to scalarize unless the vector variant actually has lower cost. |
830 | if (OldCost < NewCost || !NewCost.isValid()) |
831 | return false; |
832 | |
833 | // Scalarize the intrinsic |
834 | ElementCount EC = cast<VectorType>(Val: Op0->getType())->getElementCount(); |
835 | Value *EVL = VPI.getArgOperand(i: 3); |
836 | const DataLayout &DL = VPI.getModule()->getDataLayout(); |
837 | |
838 | // If the VP op might introduce UB or poison, we can scalarize it provided |
839 | // that we know the EVL > 0: If the EVL is zero, then the original VP op |
840 | // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by |
841 | // scalarizing it. |
842 | bool SafeToSpeculate; |
843 | if (ScalarIntrID) |
844 | SafeToSpeculate = Intrinsic::getAttributes(C&: I.getContext(), id: *ScalarIntrID) |
845 | .hasFnAttr(Attribute::AttrKind::Speculatable); |
846 | else |
847 | SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode( |
848 | Opcode: *FunctionalOpcode, Inst: &VPI, CtxI: nullptr, AC: &AC, DT: &DT); |
849 | if (!SafeToSpeculate && !isKnownNonZero(V: EVL, DL, Depth: 0, AC: &AC, CxtI: &VPI, DT: &DT)) |
850 | return false; |
851 | |
852 | Value *ScalarVal = |
853 | ScalarIntrID |
854 | ? Builder.CreateIntrinsic(RetTy: VecTy->getScalarType(), ID: *ScalarIntrID, |
855 | Args: {ScalarOp0, ScalarOp1}) |
856 | : Builder.CreateBinOp(Opc: (Instruction::BinaryOps)(*FunctionalOpcode), |
857 | LHS: ScalarOp0, RHS: ScalarOp1); |
858 | |
859 | replaceValue(Old&: VPI, New&: *Builder.CreateVectorSplat(EC, V: ScalarVal)); |
860 | return true; |
861 | } |
862 | |
863 | /// Match a vector binop or compare instruction with at least one inserted |
864 | /// scalar operand and convert to scalar binop/cmp followed by insertelement. |
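/// For example (illustrative):
///   %i = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 %x, i64 0
///   %r = add <4 x i32> %i, <i32 10, i32 20, i32 30, i32 40>
/// may become:
///   %s = add i32 %x, 10
///   %r = insertelement <4 x i32> <i32 11, i32 22, i32 33, i32 44>, i32 %s, i64 0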
865 | bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { |
866 | CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; |
867 | Value *Ins0, *Ins1; |
868 | if (!match(V: &I, P: m_BinOp(L: m_Value(V&: Ins0), R: m_Value(V&: Ins1))) && |
869 | !match(V: &I, P: m_Cmp(Pred, L: m_Value(V&: Ins0), R: m_Value(V&: Ins1)))) |
870 | return false; |
871 | |
872 | // Do not convert the vector condition of a vector select into a scalar |
873 | // condition. That may cause problems for codegen because of differences in |
874 | // boolean formats and register-file transfers. |
875 | // TODO: Can we account for that in the cost model? |
876 | bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE; |
877 | if (IsCmp) |
878 | for (User *U : I.users()) |
879 | if (match(V: U, P: m_Select(C: m_Specific(V: &I), L: m_Value(), R: m_Value()))) |
880 | return false; |
881 | |
882 | // Match against one or both scalar values being inserted into constant |
883 | // vectors: |
884 | // vec_op VecC0, (inselt VecC1, V1, Index) |
885 | // vec_op (inselt VecC0, V0, Index), VecC1 |
886 | // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) |
887 | // TODO: Deal with mismatched index constants and variable indexes? |
888 | Constant *VecC0 = nullptr, *VecC1 = nullptr; |
889 | Value *V0 = nullptr, *V1 = nullptr; |
890 | uint64_t Index0 = 0, Index1 = 0; |
891 | if (!match(V: Ins0, P: m_InsertElt(Val: m_Constant(C&: VecC0), Elt: m_Value(V&: V0), |
892 | Idx: m_ConstantInt(V&: Index0))) && |
893 | !match(V: Ins0, P: m_Constant(C&: VecC0))) |
894 | return false; |
895 | if (!match(V: Ins1, P: m_InsertElt(Val: m_Constant(C&: VecC1), Elt: m_Value(V&: V1), |
896 | Idx: m_ConstantInt(V&: Index1))) && |
897 | !match(V: Ins1, P: m_Constant(C&: VecC1))) |
898 | return false; |
899 | |
900 | bool IsConst0 = !V0; |
901 | bool IsConst1 = !V1; |
902 | if (IsConst0 && IsConst1) |
903 | return false; |
904 | if (!IsConst0 && !IsConst1 && Index0 != Index1) |
905 | return false; |
906 | |
907 | // Bail for single insertion if it is a load. |
908 | // TODO: Handle this once getVectorInstrCost can cost for load/stores. |
909 | auto *I0 = dyn_cast_or_null<Instruction>(Val: V0); |
910 | auto *I1 = dyn_cast_or_null<Instruction>(Val: V1); |
911 | if ((IsConst0 && I1 && I1->mayReadFromMemory()) || |
912 | (IsConst1 && I0 && I0->mayReadFromMemory())) |
913 | return false; |
914 | |
915 | uint64_t Index = IsConst0 ? Index1 : Index0; |
916 | Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType(); |
917 | Type *VecTy = I.getType(); |
918 | assert(VecTy->isVectorTy() && |
919 | (IsConst0 || IsConst1 || V0->getType() == V1->getType()) && |
920 | (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() || |
921 | ScalarTy->isPointerTy()) && |
922 | "Unexpected types for insert element into binop or cmp" ); |
923 | |
924 | unsigned Opcode = I.getOpcode(); |
925 | InstructionCost ScalarOpCost, VectorOpCost; |
926 | if (IsCmp) { |
927 | CmpInst::Predicate Pred = cast<CmpInst>(Val&: I).getPredicate(); |
928 | ScalarOpCost = TTI.getCmpSelInstrCost( |
929 | Opcode, ValTy: ScalarTy, CondTy: CmpInst::makeCmpResultType(opnd_type: ScalarTy), VecPred: Pred); |
930 | VectorOpCost = TTI.getCmpSelInstrCost( |
931 | Opcode, ValTy: VecTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VecTy), VecPred: Pred); |
932 | } else { |
933 | ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: ScalarTy); |
934 | VectorOpCost = TTI.getArithmeticInstrCost(Opcode, Ty: VecTy); |
935 | } |
936 | |
937 | // Get cost estimate for the insert element. This cost will factor into |
938 | // both sequences. |
939 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
940 | InstructionCost InsertCost = TTI.getVectorInstrCost( |
941 | Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index); |
942 | InstructionCost OldCost = |
943 | (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost; |
944 | InstructionCost NewCost = ScalarOpCost + InsertCost + |
945 | (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) + |
946 | (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost); |
947 | |
948 | // We want to scalarize unless the vector variant actually has lower cost. |
949 | if (OldCost < NewCost || !NewCost.isValid()) |
950 | return false; |
951 | |
952 | // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) --> |
953 | // inselt NewVecC, (scalar_op V0, V1), Index |
954 | if (IsCmp) |
955 | ++NumScalarCmp; |
956 | else |
957 | ++NumScalarBO; |
958 | |
959 | // For constant cases, extract the scalar element, this should constant fold. |
960 | if (IsConst0) |
961 | V0 = ConstantExpr::getExtractElement(Vec: VecC0, Idx: Builder.getInt64(C: Index)); |
962 | if (IsConst1) |
963 | V1 = ConstantExpr::getExtractElement(Vec: VecC1, Idx: Builder.getInt64(C: Index)); |
964 | |
965 | Value *Scalar = |
966 | IsCmp ? Builder.CreateCmp(Pred, LHS: V0, RHS: V1) |
967 | : Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Opcode, LHS: V0, RHS: V1); |
968 | |
969 | Scalar->setName(I.getName() + ".scalar" ); |
970 | |
971 | // All IR flags are safe to back-propagate. There is no potential for extra |
972 | // poison to be created by the scalar instruction. |
973 | if (auto *ScalarInst = dyn_cast<Instruction>(Val: Scalar)) |
974 | ScalarInst->copyIRFlags(V: &I); |
975 | |
976 | // Fold the vector constants in the original vectors into a new base vector. |
977 | Value *NewVecC = |
978 | IsCmp ? Builder.CreateCmp(Pred, LHS: VecC0, RHS: VecC1) |
979 | : Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Opcode, LHS: VecC0, RHS: VecC1); |
980 | Value *Insert = Builder.CreateInsertElement(Vec: NewVecC, NewElt: Scalar, Idx: Index); |
981 | replaceValue(Old&: I, New&: *Insert); |
982 | return true; |
983 | } |
984 | |
985 | /// Try to combine a scalar binop + 2 scalar compares of extracted elements of |
986 | /// a vector into vector operations followed by extract. Note: The SLP pass |
987 | /// may miss this pattern because of implementation problems. |
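/// For example (illustrative):
///   %e0 = extractelement <4 x i32> %x, i32 0
///   %e1 = extractelement <4 x i32> %x, i32 3
///   %c0 = icmp sgt i32 %e0, 42
///   %c1 = icmp sgt i32 %e1, 7
///   %r  = and i1 %c0, %c1
/// may become one vector icmp of %x against a constant vector holding 42 and 7
/// in lanes 0 and 3, a single-lane shuffle, a vector 'and', and one extract.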
988 | bool VectorCombine::foldExtractedCmps(Instruction &I) { |
989 | // We are looking for a scalar binop of booleans. |
990 | // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1) |
991 | if (!I.isBinaryOp() || !I.getType()->isIntegerTy(Bitwidth: 1)) |
992 | return false; |
993 | |
994 | // The compare predicates should match, and each compare should have a |
995 | // constant operand. |
996 | // TODO: Relax the one-use constraints. |
997 | Value *B0 = I.getOperand(i: 0), *B1 = I.getOperand(i: 1); |
998 | Instruction *I0, *I1; |
999 | Constant *C0, *C1; |
1000 | CmpInst::Predicate P0, P1; |
1001 | if (!match(V: B0, P: m_OneUse(SubPattern: m_Cmp(Pred&: P0, L: m_Instruction(I&: I0), R: m_Constant(C&: C0)))) || |
1002 | !match(V: B1, P: m_OneUse(SubPattern: m_Cmp(Pred&: P1, L: m_Instruction(I&: I1), R: m_Constant(C&: C1)))) || |
1003 | P0 != P1) |
1004 | return false; |
1005 | |
1006 | // The compare operands must be extracts of the same vector with constant |
1007 | // extract indexes. |
1008 | // TODO: Relax the one-use constraints. |
1009 | Value *X; |
1010 | uint64_t Index0, Index1; |
1011 | if (!match(V: I0, P: m_OneUse(SubPattern: m_ExtractElt(Val: m_Value(V&: X), Idx: m_ConstantInt(V&: Index0)))) || |
1012 | !match(V: I1, P: m_OneUse(SubPattern: m_ExtractElt(Val: m_Specific(V: X), Idx: m_ConstantInt(V&: Index1))))) |
1013 | return false; |
1014 | |
1015 | auto *Ext0 = cast<ExtractElementInst>(Val: I0); |
1016 | auto *Ext1 = cast<ExtractElementInst>(Val: I1); |
1017 | ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1); |
1018 | if (!ConvertToShuf) |
1019 | return false; |
1020 | |
1021 | // The original scalar pattern is: |
1022 | // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1) |
1023 | CmpInst::Predicate Pred = P0; |
1024 | unsigned CmpOpcode = CmpInst::isFPPredicate(P: Pred) ? Instruction::FCmp |
1025 | : Instruction::ICmp; |
1026 | auto *VecTy = dyn_cast<FixedVectorType>(Val: X->getType()); |
1027 | if (!VecTy) |
1028 | return false; |
1029 | |
1030 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
1031 | InstructionCost OldCost = |
1032 | TTI.getVectorInstrCost(I: *Ext0, Val: VecTy, CostKind, Index: Index0); |
1033 | OldCost += TTI.getVectorInstrCost(I: *Ext1, Val: VecTy, CostKind, Index: Index1); |
1034 | OldCost += |
1035 | TTI.getCmpSelInstrCost(Opcode: CmpOpcode, ValTy: I0->getType(), |
1036 | CondTy: CmpInst::makeCmpResultType(opnd_type: I0->getType()), VecPred: Pred) * |
1037 | 2; |
1038 | OldCost += TTI.getArithmeticInstrCost(Opcode: I.getOpcode(), Ty: I.getType()); |
1039 | |
1040 | // The proposed vector pattern is: |
1041 | // vcmp = cmp Pred X, VecC |
1042 | // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0 |
1043 | int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0; |
1044 | int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1; |
1045 | auto *CmpTy = cast<FixedVectorType>(Val: CmpInst::makeCmpResultType(opnd_type: X->getType())); |
1046 | InstructionCost NewCost = TTI.getCmpSelInstrCost( |
1047 | Opcode: CmpOpcode, ValTy: X->getType(), CondTy: CmpInst::makeCmpResultType(opnd_type: X->getType()), VecPred: Pred); |
1048 | SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem); |
1049 | ShufMask[CheapIndex] = ExpensiveIndex; |
1050 | NewCost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: CmpTy, |
1051 | Mask: ShufMask); |
1052 | NewCost += TTI.getArithmeticInstrCost(Opcode: I.getOpcode(), Ty: CmpTy); |
1053 | NewCost += TTI.getVectorInstrCost(I: *Ext0, Val: CmpTy, CostKind, Index: CheapIndex); |
1054 | |
1055 | // Aggressively form vector ops if the cost is equal because the transform |
1056 | // may enable further optimization. |
1057 | // Codegen can reverse this transform (scalarize) if it was not profitable. |
1058 | if (OldCost < NewCost || !NewCost.isValid()) |
1059 | return false; |
1060 | |
1061 | // Create a vector constant from the 2 scalar constants. |
1062 | SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(), |
1063 | PoisonValue::get(T: VecTy->getElementType())); |
1064 | CmpC[Index0] = C0; |
1065 | CmpC[Index1] = C1; |
1066 | Value *VCmp = Builder.CreateCmp(Pred, LHS: X, RHS: ConstantVector::get(V: CmpC)); |
1067 | |
1068 | Value *Shuf = createShiftShuffle(Vec: VCmp, OldIndex: ExpensiveIndex, NewIndex: CheapIndex, Builder); |
1069 | Value *VecLogic = Builder.CreateBinOp(Opc: cast<BinaryOperator>(Val&: I).getOpcode(), |
1070 | LHS: VCmp, RHS: Shuf); |
1071 | Value *NewExt = Builder.CreateExtractElement(Vec: VecLogic, Idx: CheapIndex); |
1072 | replaceValue(Old&: I, New&: *NewExt); |
1073 | ++NumVecCmpBO; |
1074 | return true; |
1075 | } |
1076 | |
1077 | // Check if memory loc modified between two instrs in the same BB |
1078 | static bool isMemModifiedBetween(BasicBlock::iterator Begin, |
1079 | BasicBlock::iterator End, |
1080 | const MemoryLocation &Loc, AAResults &AA) { |
1081 | unsigned NumScanned = 0; |
1082 | return std::any_of(first: Begin, last: End, pred: [&](const Instruction &Instr) { |
1083 | return isModSet(MRI: AA.getModRefInfo(I: &Instr, OptLoc: Loc)) || |
1084 | ++NumScanned > MaxInstrsToScan; |
1085 | }); |
1086 | } |
1087 | |
1088 | namespace { |
1089 | /// Helper class to indicate whether a vector index can be safely scalarized and |
1090 | /// if a freeze needs to be inserted. |
1091 | class ScalarizationResult { |
1092 | enum class StatusTy { Unsafe, Safe, SafeWithFreeze }; |
1093 | |
1094 | StatusTy Status; |
1095 | Value *ToFreeze; |
1096 | |
1097 | ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr) |
1098 | : Status(Status), ToFreeze(ToFreeze) {} |
1099 | |
1100 | public: |
1101 | ScalarizationResult(const ScalarizationResult &Other) = default; |
1102 | ~ScalarizationResult() { |
1103 | assert(!ToFreeze && "freeze() not called with ToFreeze being set" ); |
1104 | } |
1105 | |
1106 | static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; } |
1107 | static ScalarizationResult safe() { return {StatusTy::Safe}; } |
1108 | static ScalarizationResult safeWithFreeze(Value *ToFreeze) { |
1109 | return {StatusTy::SafeWithFreeze, ToFreeze}; |
1110 | } |
1111 | |
1112 |   /// Returns true if the index can be scalarized without requiring a freeze. |
1113 | bool isSafe() const { return Status == StatusTy::Safe; } |
1114 | /// Returns true if the index cannot be scalarized. |
1115 | bool isUnsafe() const { return Status == StatusTy::Unsafe; } |
1116 |   /// Returns true if the index can be scalarized, but requires inserting a |
1117 | /// freeze. |
1118 | bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; } |
1119 | |
1120 |   /// Reset the state to Unsafe and clear ToFreeze if set. |
1121 | void discard() { |
1122 | ToFreeze = nullptr; |
1123 | Status = StatusTy::Unsafe; |
1124 | } |
1125 | |
1126 |   /// Freeze the ToFreeze and update the use in \p UserI to use it. |
1127 | void freeze(IRBuilder<> &Builder, Instruction &UserI) { |
1128 | assert(isSafeWithFreeze() && |
1129 | "should only be used when freezing is required" ); |
1130 | assert(is_contained(ToFreeze->users(), &UserI) && |
1131 | "UserI must be a user of ToFreeze" ); |
1132 | IRBuilder<>::InsertPointGuard Guard(Builder); |
1133 | Builder.SetInsertPoint(cast<Instruction>(Val: &UserI)); |
1134 | Value *Frozen = |
1135 | Builder.CreateFreeze(V: ToFreeze, Name: ToFreeze->getName() + ".frozen" ); |
1136 | for (Use &U : make_early_inc_range(Range: (UserI.operands()))) |
1137 | if (U.get() == ToFreeze) |
1138 | U.set(Frozen); |
1139 | |
1140 | ToFreeze = nullptr; |
1141 | } |
1142 | }; |
1143 | } // namespace |
1144 | |
1145 | /// Check if it is legal to scalarize a memory access to \p VecTy at index \p |
1146 | /// Idx. \p Idx must access a valid vector element. |
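/// For example (illustrative): for a <4 x i32> access with Idx = (and %i, 3),
/// every possible index value is in bounds, so the access can be scalarized
/// once %i is frozen to keep a poison index from being introduced.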
1147 | static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, |
1148 | Instruction *CtxI, |
1149 | AssumptionCache &AC, |
1150 | const DominatorTree &DT) { |
1151 | // We do checks for both fixed vector types and scalable vector types. |
1152 | // This is the number of elements of fixed vector types, |
1153 | // or the minimum number of elements of scalable vector types. |
1154 | uint64_t NumElements = VecTy->getElementCount().getKnownMinValue(); |
1155 | |
1156 | if (auto *C = dyn_cast<ConstantInt>(Val: Idx)) { |
1157 | if (C->getValue().ult(RHS: NumElements)) |
1158 | return ScalarizationResult::safe(); |
1159 | return ScalarizationResult::unsafe(); |
1160 | } |
1161 | |
1162 | unsigned IntWidth = Idx->getType()->getScalarSizeInBits(); |
1163 | APInt Zero(IntWidth, 0); |
1164 | APInt MaxElts(IntWidth, NumElements); |
1165 | ConstantRange ValidIndices(Zero, MaxElts); |
1166 | ConstantRange IdxRange(IntWidth, true); |
1167 | |
1168 | if (isGuaranteedNotToBePoison(V: Idx, AC: &AC)) { |
1169 | if (ValidIndices.contains(CR: computeConstantRange(V: Idx, /* ForSigned */ false, |
1170 | UseInstrInfo: true, AC: &AC, CtxI, DT: &DT))) |
1171 | return ScalarizationResult::safe(); |
1172 | return ScalarizationResult::unsafe(); |
1173 | } |
1174 | |
1175 | // If the index may be poison, check if we can insert a freeze before the |
1176 | // range of the index is restricted. |
1177 | Value *IdxBase; |
1178 | ConstantInt *CI; |
1179 | if (match(V: Idx, P: m_And(L: m_Value(V&: IdxBase), R: m_ConstantInt(CI)))) { |
1180 | IdxRange = IdxRange.binaryAnd(Other: CI->getValue()); |
1181 | } else if (match(V: Idx, P: m_URem(L: m_Value(V&: IdxBase), R: m_ConstantInt(CI)))) { |
1182 | IdxRange = IdxRange.urem(Other: CI->getValue()); |
1183 | } |
1184 | |
1185 | if (ValidIndices.contains(CR: IdxRange)) |
1186 | return ScalarizationResult::safeWithFreeze(ToFreeze: IdxBase); |
1187 | return ScalarizationResult::unsafe(); |
1188 | } |
1189 | |
1190 | /// The memory operation on a vector of \p ScalarType had alignment of |
1191 | /// \p VectorAlignment. Compute the maximal, but conservatively correct, |
1192 | /// alignment that will be valid for the memory operation on a single scalar |
1193 | /// element of the same type with index \p Idx. |
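/// For example (illustrative): scalarizing a <4 x i32> access with alignment
/// 16 at constant index 1 touches an i32 at byte offset 4, so the scalar
/// access can use commonAlignment(16, 4) = 4. For a variable index, only the
/// element store size can be used as the offset.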
1194 | static Align computeAlignmentAfterScalarization(Align VectorAlignment, |
1195 | Type *ScalarType, Value *Idx, |
1196 | const DataLayout &DL) { |
1197 | if (auto *C = dyn_cast<ConstantInt>(Val: Idx)) |
1198 | return commonAlignment(A: VectorAlignment, |
1199 | Offset: C->getZExtValue() * DL.getTypeStoreSize(Ty: ScalarType)); |
1200 | return commonAlignment(A: VectorAlignment, Offset: DL.getTypeStoreSize(Ty: ScalarType)); |
1201 | } |
1202 | |
// Combine patterns like:
//   %0 = load <4 x i32>, ptr %a
//   %1 = insertelement <4 x i32> %0, i32 %b, i32 1
//   store <4 x i32> %1, ptr %a
// to:
//   %0 = getelementptr inbounds <4 x i32>, ptr %a, i32 0, i32 1
//   store i32 %b, ptr %0
1211 | bool VectorCombine::foldSingleElementStore(Instruction &I) { |
1212 | auto *SI = cast<StoreInst>(Val: &I); |
1213 | if (!SI->isSimple() || !isa<VectorType>(Val: SI->getValueOperand()->getType())) |
1214 | return false; |
1215 | |
1216 | // TODO: Combine more complicated patterns (multiple insert) by referencing |
1217 | // TargetTransformInfo. |
1218 | Instruction *Source; |
1219 | Value *NewElement; |
1220 | Value *Idx; |
1221 | if (!match(V: SI->getValueOperand(), |
1222 | P: m_InsertElt(Val: m_Instruction(I&: Source), Elt: m_Value(V&: NewElement), |
1223 | Idx: m_Value(V&: Idx)))) |
1224 | return false; |
1225 | |
1226 | if (auto *Load = dyn_cast<LoadInst>(Val: Source)) { |
1227 | auto VecTy = cast<VectorType>(Val: SI->getValueOperand()->getType()); |
1228 | const DataLayout &DL = I.getModule()->getDataLayout(); |
1229 | Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts(); |
// Don't optimize for atomic/volatile loads or stores. Ensure memory is not
// modified between the load and the store, the scalar type's size matches
// its store size, and the index is in bounds.
1232 | if (!Load->isSimple() || Load->getParent() != SI->getParent() || |
1233 | !DL.typeSizeEqualsStoreSize(Ty: Load->getType()->getScalarType()) || |
1234 | SrcAddr != SI->getPointerOperand()->stripPointerCasts()) |
1235 | return false; |
1236 | |
1237 | auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, CtxI: Load, AC, DT); |
1238 | if (ScalarizableIdx.isUnsafe() || |
1239 | isMemModifiedBetween(Begin: Load->getIterator(), End: SI->getIterator(), |
1240 | Loc: MemoryLocation::get(SI), AA)) |
1241 | return false; |
1242 | |
1243 | if (ScalarizableIdx.isSafeWithFreeze()) |
1244 | ScalarizableIdx.freeze(Builder, UserI&: *cast<Instruction>(Val: Idx)); |
1245 | Value *GEP = Builder.CreateInBoundsGEP( |
1246 | Ty: SI->getValueOperand()->getType(), Ptr: SI->getPointerOperand(), |
1247 | IdxList: {ConstantInt::get(Ty: Idx->getType(), V: 0), Idx}); |
1248 | StoreInst *NSI = Builder.CreateStore(Val: NewElement, Ptr: GEP); |
1249 | NSI->copyMetadata(SrcInst: *SI); |
1250 | Align ScalarOpAlignment = computeAlignmentAfterScalarization( |
1251 | VectorAlignment: std::max(a: SI->getAlign(), b: Load->getAlign()), ScalarType: NewElement->getType(), Idx, |
1252 | DL); |
1253 | NSI->setAlignment(ScalarOpAlignment); |
1254 | replaceValue(Old&: I, New&: *NSI); |
1255 | eraseInstruction(I); |
1256 | return true; |
1257 | } |
1258 | |
1259 | return false; |
1260 | } |
1261 | |
1262 | /// Try to scalarize vector loads feeding extractelement instructions. |
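/// For example (illustrative IR; the transform only fires when the target's
/// cost model says the scalarized form is cheaper):
///   %lv = load <4 x i32>, ptr %p
///   %e0 = extractelement <4 x i32> %lv, i32 0
///   %e1 = extractelement <4 x i32> %lv, i32 1
/// may become:
///   %gep0 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i32 0
///   %e0.scalar = load i32, ptr %gep0
///   %gep1 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i32 1
///   %e1.scalar = load i32, ptr %gep1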
bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
1264 | Value *Ptr; |
1265 | if (!match(V: &I, P: m_Load(Op: m_Value(V&: Ptr)))) |
1266 | return false; |
1267 | |
1268 | auto *VecTy = cast<VectorType>(Val: I.getType()); |
1269 | auto *LI = cast<LoadInst>(Val: &I); |
1270 | const DataLayout &DL = I.getModule()->getDataLayout(); |
1271 | if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(Ty: VecTy->getScalarType())) |
1272 | return false; |
1273 | |
1274 | InstructionCost OriginalCost = |
1275 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: LI->getAlign(), |
1276 | AddressSpace: LI->getPointerAddressSpace()); |
1277 | InstructionCost ScalarizedCost = 0; |
1278 | |
1279 | Instruction *LastCheckedInst = LI; |
1280 | unsigned NumInstChecked = 0; |
1281 | DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze; |
1282 | auto FailureGuard = make_scope_exit(F: [&]() { |
1283 | // If the transform is aborted, discard the ScalarizationResults. |
1284 | for (auto &Pair : NeedFreeze) |
1285 | Pair.second.discard(); |
1286 | }); |
1287 | |
1288 | // Check if all users of the load are extracts with no memory modifications |
1289 | // between the load and the extract. Compute the cost of both the original |
1290 | // code and the scalarized version. |
1291 | for (User *U : LI->users()) { |
1292 | auto *UI = dyn_cast<ExtractElementInst>(Val: U); |
1293 | if (!UI || UI->getParent() != LI->getParent()) |
1294 | return false; |
1295 | |
1296 | // Check if any instruction between the load and the extract may modify |
1297 | // memory. |
1298 | if (LastCheckedInst->comesBefore(Other: UI)) { |
1299 | for (Instruction &I : |
1300 | make_range(x: std::next(x: LI->getIterator()), y: UI->getIterator())) { |
1301 | // Bail out if we reached the check limit or the instruction may write |
1302 | // to memory. |
1303 | if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory()) |
1304 | return false; |
1305 | NumInstChecked++; |
1306 | } |
1307 | LastCheckedInst = UI; |
1308 | } |
1309 | |
1310 | auto ScalarIdx = canScalarizeAccess(VecTy, Idx: UI->getOperand(i_nocapture: 1), CtxI: &I, AC, DT); |
1311 | if (ScalarIdx.isUnsafe()) |
1312 | return false; |
1313 | if (ScalarIdx.isSafeWithFreeze()) { |
1314 | NeedFreeze.try_emplace(Key: UI, Args&: ScalarIdx); |
1315 | ScalarIdx.discard(); |
1316 | } |
1317 | |
1318 | auto *Index = dyn_cast<ConstantInt>(Val: UI->getOperand(i_nocapture: 1)); |
1319 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
1320 | OriginalCost += |
1321 | TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, CostKind, |
1322 | Index: Index ? Index->getZExtValue() : -1); |
1323 | ScalarizedCost += |
1324 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy->getElementType(), |
1325 | Alignment: Align(1), AddressSpace: LI->getPointerAddressSpace()); |
1326 | ScalarizedCost += TTI.getAddressComputationCost(Ty: VecTy->getElementType()); |
1327 | } |
1328 | |
1329 | if (ScalarizedCost >= OriginalCost) |
1330 | return false; |
1331 | |
1332 | // Replace extracts with narrow scalar loads. |
1333 | for (User *U : LI->users()) { |
1334 | auto *EI = cast<ExtractElementInst>(Val: U); |
1335 | Value *Idx = EI->getOperand(i_nocapture: 1); |
1336 | |
1337 | // Insert 'freeze' for poison indexes. |
1338 | auto It = NeedFreeze.find(Val: EI); |
1339 | if (It != NeedFreeze.end()) |
1340 | It->second.freeze(Builder, UserI&: *cast<Instruction>(Val: Idx)); |
1341 | |
1342 | Builder.SetInsertPoint(EI); |
1343 | Value *GEP = |
1344 | Builder.CreateInBoundsGEP(Ty: VecTy, Ptr, IdxList: {Builder.getInt32(C: 0), Idx}); |
1345 | auto *NewLoad = cast<LoadInst>(Val: Builder.CreateLoad( |
1346 | Ty: VecTy->getElementType(), Ptr: GEP, Name: EI->getName() + ".scalar" )); |
1347 | |
1348 | Align ScalarOpAlignment = computeAlignmentAfterScalarization( |
1349 | VectorAlignment: LI->getAlign(), ScalarType: VecTy->getElementType(), Idx, DL); |
1350 | NewLoad->setAlignment(ScalarOpAlignment); |
1351 | |
1352 | replaceValue(Old&: *EI, New&: *NewLoad); |
1353 | } |
1354 | |
1355 | FailureGuard.release(); |
1356 | return true; |
1357 | } |
1358 | |
1359 | /// Try to convert "shuffle (binop), (binop)" with a shared binop operand into |
1360 | /// "binop (shuffle), (shuffle)". |
1361 | bool VectorCombine::foldShuffleOfBinops(Instruction &I) { |
1362 | auto *VecTy = cast<FixedVectorType>(Val: I.getType()); |
1363 | BinaryOperator *B0, *B1; |
1364 | ArrayRef<int> Mask; |
1365 | if (!match(V: &I, P: m_Shuffle(v1: m_OneUse(SubPattern: m_BinOp(I&: B0)), v2: m_OneUse(SubPattern: m_BinOp(I&: B1)), |
1366 | mask: m_Mask(Mask))) || |
1367 | B0->getOpcode() != B1->getOpcode() || B0->getType() != VecTy) |
1368 | return false; |
1369 | |
1370 | // Try to replace a binop with a shuffle if the shuffle is not costly. |
1371 | // The new shuffle will choose from a single, common operand, so it may be |
1372 | // cheaper than the existing two-operand shuffle. |
1373 | SmallVector<int> UnaryMask = createUnaryMask(Mask, NumElts: Mask.size()); |
1374 | Instruction::BinaryOps Opcode = B0->getOpcode(); |
1375 | InstructionCost BinopCost = TTI.getArithmeticInstrCost(Opcode, Ty: VecTy); |
1376 | InstructionCost ShufCost = TTI.getShuffleCost( |
1377 | Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: VecTy, Mask: UnaryMask); |
1378 | if (ShufCost > BinopCost) |
1379 | return false; |
1380 | |
1381 | // If we have something like "add X, Y" and "add Z, X", swap ops to match. |
1382 | Value *X = B0->getOperand(i_nocapture: 0), *Y = B0->getOperand(i_nocapture: 1); |
1383 | Value *Z = B1->getOperand(i_nocapture: 0), *W = B1->getOperand(i_nocapture: 1); |
1384 | if (BinaryOperator::isCommutative(Opcode) && X != Z && Y != W) |
1385 | std::swap(a&: X, b&: Y); |
1386 | |
1387 | Value *Shuf0, *Shuf1; |
1388 | if (X == Z) { |
1389 | // shuf (bo X, Y), (bo X, W) --> bo (shuf X), (shuf Y, W) |
1390 | Shuf0 = Builder.CreateShuffleVector(V: X, Mask: UnaryMask); |
1391 | Shuf1 = Builder.CreateShuffleVector(V1: Y, V2: W, Mask); |
1392 | } else if (Y == W) { |
1393 | // shuf (bo X, Y), (bo Z, Y) --> bo (shuf X, Z), (shuf Y) |
1394 | Shuf0 = Builder.CreateShuffleVector(V1: X, V2: Z, Mask); |
1395 | Shuf1 = Builder.CreateShuffleVector(V: Y, Mask: UnaryMask); |
1396 | } else { |
1397 | return false; |
1398 | } |
1399 | |
1400 | Value *NewBO = Builder.CreateBinOp(Opc: Opcode, LHS: Shuf0, RHS: Shuf1); |
1401 | // Intersect flags from the old binops. |
1402 | if (auto *NewInst = dyn_cast<Instruction>(Val: NewBO)) { |
1403 | NewInst->copyIRFlags(V: B0); |
1404 | NewInst->andIRFlags(V: B1); |
1405 | } |
1406 | replaceValue(Old&: I, New&: *NewBO); |
1407 | return true; |
1408 | } |
1409 | |
1410 | /// Given a commutative reduction, the order of the input lanes does not alter |
1411 | /// the results. We can use this to remove certain shuffles feeding the |
1412 | /// reduction, removing the need to shuffle at all. |
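/// For example (illustrative IR):
///   %s = shufflevector <4 x i32> %a, <4 x i32> poison,
///                      <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// computes the same value as reducing %a directly, so the reversing shuffle
/// can be replaced by a cheaper (here, identity) shuffle when the cost model
/// agrees.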
1413 | bool VectorCombine::foldShuffleFromReductions(Instruction &I) { |
1414 | auto *II = dyn_cast<IntrinsicInst>(Val: &I); |
1415 | if (!II) |
1416 | return false; |
1417 | switch (II->getIntrinsicID()) { |
1418 | case Intrinsic::vector_reduce_add: |
1419 | case Intrinsic::vector_reduce_mul: |
1420 | case Intrinsic::vector_reduce_and: |
1421 | case Intrinsic::vector_reduce_or: |
1422 | case Intrinsic::vector_reduce_xor: |
1423 | case Intrinsic::vector_reduce_smin: |
1424 | case Intrinsic::vector_reduce_smax: |
1425 | case Intrinsic::vector_reduce_umin: |
1426 | case Intrinsic::vector_reduce_umax: |
1427 | break; |
1428 | default: |
1429 | return false; |
1430 | } |
1431 | |
1432 | // Find all the inputs when looking through operations that do not alter the |
1433 | // lane order (binops, for example). Currently we look for a single shuffle, |
1434 | // and can ignore splat values. |
1435 | std::queue<Value *> Worklist; |
1436 | SmallPtrSet<Value *, 4> Visited; |
1437 | ShuffleVectorInst *Shuffle = nullptr; |
1438 | if (auto *Op = dyn_cast<Instruction>(Val: I.getOperand(i: 0))) |
1439 | Worklist.push(x: Op); |
1440 | |
1441 | while (!Worklist.empty()) { |
1442 | Value *CV = Worklist.front(); |
1443 | Worklist.pop(); |
1444 | if (Visited.contains(Ptr: CV)) |
1445 | continue; |
1446 | |
1447 | // Splats don't change the order, so can be safely ignored. |
1448 | if (isSplatValue(V: CV)) |
1449 | continue; |
1450 | |
1451 | Visited.insert(Ptr: CV); |
1452 | |
1453 | if (auto *CI = dyn_cast<Instruction>(Val: CV)) { |
1454 | if (CI->isBinaryOp()) { |
1455 | for (auto *Op : CI->operand_values()) |
1456 | Worklist.push(x: Op); |
1457 | continue; |
1458 | } else if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: CI)) { |
1459 | if (Shuffle && Shuffle != SV) |
1460 | return false; |
1461 | Shuffle = SV; |
1462 | continue; |
1463 | } |
1464 | } |
1465 | |
1466 | // Anything else is currently an unknown node. |
1467 | return false; |
1468 | } |
1469 | |
1470 | if (!Shuffle) |
1471 | return false; |
1472 | |
// Check that all uses of the binary ops and shuffles are also included in the
// lane-invariant operations (Visited should be the list of lanewise
// instructions, including the shuffle that we found).
1476 | for (auto *V : Visited) |
1477 | for (auto *U : V->users()) |
1478 | if (!Visited.contains(Ptr: U) && U != &I) |
1479 | return false; |
1480 | |
1481 | FixedVectorType *VecType = |
1482 | dyn_cast<FixedVectorType>(Val: II->getOperand(i_nocapture: 0)->getType()); |
1483 | if (!VecType) |
1484 | return false; |
1485 | FixedVectorType *ShuffleInputType = |
1486 | dyn_cast<FixedVectorType>(Val: Shuffle->getOperand(i_nocapture: 0)->getType()); |
1487 | if (!ShuffleInputType) |
1488 | return false; |
1489 | unsigned NumInputElts = ShuffleInputType->getNumElements(); |
1490 | |
// Find the mask from sorting the lanes into order. This is most likely to
// become an identity or concat mask. Undef elements are pushed to the end.
1493 | SmallVector<int> ConcatMask; |
1494 | Shuffle->getShuffleMask(Result&: ConcatMask); |
1495 | sort(C&: ConcatMask, Comp: [](int X, int Y) { return (unsigned)X < (unsigned)Y; }); |
1496 | // In the case of a truncating shuffle it's possible for the mask |
1497 | // to have an index greater than the size of the resulting vector. |
1498 | // This requires special handling. |
1499 | bool IsTruncatingShuffle = VecType->getNumElements() < NumInputElts; |
1500 | bool UsesSecondVec = |
1501 | any_of(Range&: ConcatMask, P: [&](int M) { return M >= (int)NumInputElts; }); |
1502 | |
1503 | FixedVectorType *VecTyForCost = |
1504 | (UsesSecondVec && !IsTruncatingShuffle) ? VecType : ShuffleInputType; |
1505 | InstructionCost OldCost = TTI.getShuffleCost( |
1506 | Kind: UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, |
1507 | Tp: VecTyForCost, Mask: Shuffle->getShuffleMask()); |
1508 | InstructionCost NewCost = TTI.getShuffleCost( |
1509 | Kind: UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, |
1510 | Tp: VecTyForCost, Mask: ConcatMask); |
1511 | |
1512 | LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle |
1513 | << "\n" ); |
1514 | LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost |
1515 | << "\n" ); |
1516 | if (NewCost < OldCost) { |
1517 | Builder.SetInsertPoint(Shuffle); |
1518 | Value *NewShuffle = Builder.CreateShuffleVector( |
1519 | V1: Shuffle->getOperand(i_nocapture: 0), V2: Shuffle->getOperand(i_nocapture: 1), Mask: ConcatMask); |
1520 | LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n" ); |
1521 | replaceValue(Old&: *Shuffle, New&: *NewShuffle); |
1522 | } |
1523 | |
1524 | // See if we can re-use foldSelectShuffle, getting it to reduce the size of |
1525 | // the shuffle into a nicer order, as it can ignore the order of the shuffles. |
1526 | return foldSelectShuffle(I&: *Shuffle, FromReduction: true); |
1527 | } |
1528 | |
1529 | /// This method looks for groups of shuffles acting on binops, of the form: |
1530 | /// %x = shuffle ... |
1531 | /// %y = shuffle ... |
1532 | /// %a = binop %x, %y |
1533 | /// %b = binop %x, %y |
1534 | /// shuffle %a, %b, selectmask |
1535 | /// We may, especially if the shuffle is wider than legal, be able to convert |
1536 | /// the shuffle to a form where only parts of a and b need to be computed. On |
1537 | /// architectures with no obvious "select" shuffle, this can reduce the total |
1538 | /// number of operations if the target reports them as cheaper. |
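/// For example (illustrative), with a select mask such as <0, 5, 2, 7> on
/// <4 x i32>, only lanes 0 and 2 of %a and lanes 1 and 3 of %b are used, so
/// each binop may be narrowed to operate on half as many lanes once the
/// input shuffles are re-sorted.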
1539 | bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { |
1540 | auto *SVI = cast<ShuffleVectorInst>(Val: &I); |
1541 | auto *VT = cast<FixedVectorType>(Val: I.getType()); |
1542 | auto *Op0 = dyn_cast<Instruction>(Val: SVI->getOperand(i_nocapture: 0)); |
1543 | auto *Op1 = dyn_cast<Instruction>(Val: SVI->getOperand(i_nocapture: 1)); |
1544 | if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() || |
1545 | VT != Op0->getType()) |
1546 | return false; |
1547 | |
1548 | auto *SVI0A = dyn_cast<Instruction>(Val: Op0->getOperand(i: 0)); |
1549 | auto *SVI0B = dyn_cast<Instruction>(Val: Op0->getOperand(i: 1)); |
1550 | auto *SVI1A = dyn_cast<Instruction>(Val: Op1->getOperand(i: 0)); |
1551 | auto *SVI1B = dyn_cast<Instruction>(Val: Op1->getOperand(i: 1)); |
1552 | SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B}); |
1553 | auto checkSVNonOpUses = [&](Instruction *I) { |
1554 | if (!I || I->getOperand(i: 0)->getType() != VT) |
1555 | return true; |
1556 | return any_of(Range: I->users(), P: [&](User *U) { |
1557 | return U != Op0 && U != Op1 && |
1558 | !(isa<ShuffleVectorInst>(Val: U) && |
1559 | (InputShuffles.contains(Ptr: cast<Instruction>(Val: U)) || |
1560 | isInstructionTriviallyDead(I: cast<Instruction>(Val: U)))); |
1561 | }); |
1562 | }; |
1563 | if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) || |
1564 | checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B)) |
1565 | return false; |
1566 | |
1567 | // Collect all the uses that are shuffles that we can transform together. We |
1568 | // may not have a single shuffle, but a group that can all be transformed |
1569 | // together profitably. |
1570 | SmallVector<ShuffleVectorInst *> Shuffles; |
1571 | auto collectShuffles = [&](Instruction *I) { |
1572 | for (auto *U : I->users()) { |
1573 | auto *SV = dyn_cast<ShuffleVectorInst>(Val: U); |
1574 | if (!SV || SV->getType() != VT) |
1575 | return false; |
1576 | if ((SV->getOperand(i_nocapture: 0) != Op0 && SV->getOperand(i_nocapture: 0) != Op1) || |
1577 | (SV->getOperand(i_nocapture: 1) != Op0 && SV->getOperand(i_nocapture: 1) != Op1)) |
1578 | return false; |
1579 | if (!llvm::is_contained(Range&: Shuffles, Element: SV)) |
1580 | Shuffles.push_back(Elt: SV); |
1581 | } |
1582 | return true; |
1583 | }; |
1584 | if (!collectShuffles(Op0) || !collectShuffles(Op1)) |
1585 | return false; |
1586 | // From a reduction, we need to be processing a single shuffle, otherwise the |
1587 | // other uses will not be lane-invariant. |
1588 | if (FromReduction && Shuffles.size() > 1) |
1589 | return false; |
1590 | |
1591 | // Add any shuffle uses for the shuffles we have found, to include them in our |
1592 | // cost calculations. |
1593 | if (!FromReduction) { |
1594 | for (ShuffleVectorInst *SV : Shuffles) { |
1595 | for (auto *U : SV->users()) { |
1596 | ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(Val: U); |
1597 | if (SSV && isa<UndefValue>(Val: SSV->getOperand(i_nocapture: 1)) && SSV->getType() == VT) |
1598 | Shuffles.push_back(Elt: SSV); |
1599 | } |
1600 | } |
1601 | } |
1602 | |
// For each of the output shuffles, we try to sort all the first vector
// elements to the beginning, followed by the second vector elements at the
// end. If the binops are legalized to smaller vectors, this may reduce the
// total number of binops. We compute the ReconstructMask needed to convert
// back to the original lane order.
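// For example (illustrative): with 4 lanes, a ReconstructMask of <0, 4, 1, 5>
// means output lanes 0 and 2 come from the packed first vector (elements 0
// and 1) and lanes 1 and 3 come from the packed second vector (elements 0
// and 1).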
1608 | SmallVector<std::pair<int, int>> V1, V2; |
1609 | SmallVector<SmallVector<int>> OrigReconstructMasks; |
1610 | int MaxV1Elt = 0, MaxV2Elt = 0; |
1611 | unsigned NumElts = VT->getNumElements(); |
1612 | for (ShuffleVectorInst *SVN : Shuffles) { |
1613 | SmallVector<int> Mask; |
1614 | SVN->getShuffleMask(Result&: Mask); |
1615 | |
1616 | // Check the operands are the same as the original, or reversed (in which |
1617 | // case we need to commute the mask). |
1618 | Value *SVOp0 = SVN->getOperand(i_nocapture: 0); |
1619 | Value *SVOp1 = SVN->getOperand(i_nocapture: 1); |
1620 | if (isa<UndefValue>(Val: SVOp1)) { |
1621 | auto *SSV = cast<ShuffleVectorInst>(Val: SVOp0); |
1622 | SVOp0 = SSV->getOperand(i_nocapture: 0); |
1623 | SVOp1 = SSV->getOperand(i_nocapture: 1); |
1624 | for (unsigned I = 0, E = Mask.size(); I != E; I++) { |
1625 | if (Mask[I] >= static_cast<int>(SSV->getShuffleMask().size())) |
1626 | return false; |
1627 | Mask[I] = Mask[I] < 0 ? Mask[I] : SSV->getMaskValue(Elt: Mask[I]); |
1628 | } |
1629 | } |
1630 | if (SVOp0 == Op1 && SVOp1 == Op0) { |
1631 | std::swap(a&: SVOp0, b&: SVOp1); |
1632 | ShuffleVectorInst::commuteShuffleMask(Mask, InVecNumElts: NumElts); |
1633 | } |
1634 | if (SVOp0 != Op0 || SVOp1 != Op1) |
1635 | return false; |
1636 | |
// Calculate the reconstruction mask for this shuffle, as the mask needed to
// take the packed values from Op0/Op1 and reconstruct the original lane
// order.
1640 | SmallVector<int> ReconstructMask; |
1641 | for (unsigned I = 0; I < Mask.size(); I++) { |
1642 | if (Mask[I] < 0) { |
1643 | ReconstructMask.push_back(Elt: -1); |
1644 | } else if (Mask[I] < static_cast<int>(NumElts)) { |
1645 | MaxV1Elt = std::max(a: MaxV1Elt, b: Mask[I]); |
1646 | auto It = find_if(Range&: V1, P: [&](const std::pair<int, int> &A) { |
1647 | return Mask[I] == A.first; |
1648 | }); |
1649 | if (It != V1.end()) |
1650 | ReconstructMask.push_back(Elt: It - V1.begin()); |
1651 | else { |
1652 | ReconstructMask.push_back(Elt: V1.size()); |
1653 | V1.emplace_back(Args&: Mask[I], Args: V1.size()); |
1654 | } |
1655 | } else { |
1656 | MaxV2Elt = std::max<int>(a: MaxV2Elt, b: Mask[I] - NumElts); |
1657 | auto It = find_if(Range&: V2, P: [&](const std::pair<int, int> &A) { |
1658 | return Mask[I] - static_cast<int>(NumElts) == A.first; |
1659 | }); |
1660 | if (It != V2.end()) |
1661 | ReconstructMask.push_back(Elt: NumElts + It - V2.begin()); |
1662 | else { |
1663 | ReconstructMask.push_back(Elt: NumElts + V2.size()); |
1664 | V2.emplace_back(Args: Mask[I] - NumElts, Args: NumElts + V2.size()); |
1665 | } |
1666 | } |
1667 | } |
1668 | |
// For reductions, we know that the output lane ordering doesn't alter the
// result. Sorting the mask in-order can help simplify the shuffle away.
1671 | if (FromReduction) |
1672 | sort(C&: ReconstructMask); |
1673 | OrigReconstructMasks.push_back(Elt: std::move(ReconstructMask)); |
1674 | } |
1675 | |
// If the maximum elements used from V1 and V2 are not larger than the new
// vectors, the vectors are already packed and performing the optimization
// again will likely not help any further. This also prevents us from getting
// stuck in a cycle in case the costs do not also rule it out.
1680 | if (V1.empty() || V2.empty() || |
1681 | (MaxV1Elt == static_cast<int>(V1.size()) - 1 && |
1682 | MaxV2Elt == static_cast<int>(V2.size()) - 1)) |
1683 | return false; |
1684 | |
// GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
// shuffle of another shuffle, or not a shuffle (which is treated like an
// identity shuffle).
1688 | auto GetBaseMaskValue = [&](Instruction *I, int M) { |
1689 | auto *SV = dyn_cast<ShuffleVectorInst>(Val: I); |
1690 | if (!SV) |
1691 | return M; |
1692 | if (isa<UndefValue>(Val: SV->getOperand(i_nocapture: 1))) |
1693 | if (auto *SSV = dyn_cast<ShuffleVectorInst>(Val: SV->getOperand(i_nocapture: 0))) |
1694 | if (InputShuffles.contains(Ptr: SSV)) |
1695 | return SSV->getMaskValue(Elt: SV->getMaskValue(Elt: M)); |
1696 | return SV->getMaskValue(Elt: M); |
1697 | }; |
1698 | |
// Attempt to sort the inputs by ascending mask values to make simpler input
// shuffles and push complex shuffles down to the uses. We sort on the first
// of the two input shuffle orders, to try and get at least one input into a
// nice order.
1703 | auto SortBase = [&](Instruction *A, std::pair<int, int> X, |
1704 | std::pair<int, int> Y) { |
1705 | int MXA = GetBaseMaskValue(A, X.first); |
1706 | int MYA = GetBaseMaskValue(A, Y.first); |
1707 | return MXA < MYA; |
1708 | }; |
1709 | stable_sort(Range&: V1, C: [&](std::pair<int, int> A, std::pair<int, int> B) { |
1710 | return SortBase(SVI0A, A, B); |
1711 | }); |
1712 | stable_sort(Range&: V2, C: [&](std::pair<int, int> A, std::pair<int, int> B) { |
1713 | return SortBase(SVI1A, A, B); |
1714 | }); |
1715 | // Calculate our ReconstructMasks from the OrigReconstructMasks and the |
1716 | // modified order of the input shuffles. |
1717 | SmallVector<SmallVector<int>> ReconstructMasks; |
1718 | for (const auto &Mask : OrigReconstructMasks) { |
1719 | SmallVector<int> ReconstructMask; |
1720 | for (int M : Mask) { |
1721 | auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) { |
1722 | auto It = find_if(Range: V, P: [M](auto A) { return A.second == M; }); |
1723 | assert(It != V.end() && "Expected all entries in Mask" ); |
1724 | return std::distance(first: V.begin(), last: It); |
1725 | }; |
1726 | if (M < 0) |
1727 | ReconstructMask.push_back(Elt: -1); |
1728 | else if (M < static_cast<int>(NumElts)) { |
1729 | ReconstructMask.push_back(Elt: FindIndex(V1, M)); |
1730 | } else { |
1731 | ReconstructMask.push_back(Elt: NumElts + FindIndex(V2, M)); |
1732 | } |
1733 | } |
1734 | ReconstructMasks.push_back(Elt: std::move(ReconstructMask)); |
1735 | } |
1736 | |
// Calculate the masks needed for the new input shuffles, which get padded
// with poison elements.
1739 | SmallVector<int> V1A, V1B, V2A, V2B; |
1740 | for (unsigned I = 0; I < V1.size(); I++) { |
1741 | V1A.push_back(Elt: GetBaseMaskValue(SVI0A, V1[I].first)); |
1742 | V1B.push_back(Elt: GetBaseMaskValue(SVI0B, V1[I].first)); |
1743 | } |
1744 | for (unsigned I = 0; I < V2.size(); I++) { |
1745 | V2A.push_back(Elt: GetBaseMaskValue(SVI1A, V2[I].first)); |
1746 | V2B.push_back(Elt: GetBaseMaskValue(SVI1B, V2[I].first)); |
1747 | } |
1748 | while (V1A.size() < NumElts) { |
1749 | V1A.push_back(Elt: PoisonMaskElem); |
1750 | V1B.push_back(Elt: PoisonMaskElem); |
1751 | } |
1752 | while (V2A.size() < NumElts) { |
1753 | V2A.push_back(Elt: PoisonMaskElem); |
1754 | V2B.push_back(Elt: PoisonMaskElem); |
1755 | } |
1756 | |
1757 | auto AddShuffleCost = [&](InstructionCost C, Instruction *I) { |
1758 | auto *SV = dyn_cast<ShuffleVectorInst>(Val: I); |
1759 | if (!SV) |
1760 | return C; |
1761 | return C + TTI.getShuffleCost(Kind: isa<UndefValue>(Val: SV->getOperand(i_nocapture: 1)) |
1762 | ? TTI::SK_PermuteSingleSrc |
1763 | : TTI::SK_PermuteTwoSrc, |
1764 | Tp: VT, Mask: SV->getShuffleMask()); |
1765 | }; |
1766 | auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) { |
1767 | return C + TTI.getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, Tp: VT, Mask); |
1768 | }; |
1769 | |
1770 | // Get the costs of the shuffles + binops before and after with the new |
1771 | // shuffle masks. |
1772 | InstructionCost CostBefore = |
1773 | TTI.getArithmeticInstrCost(Opcode: Op0->getOpcode(), Ty: VT) + |
1774 | TTI.getArithmeticInstrCost(Opcode: Op1->getOpcode(), Ty: VT); |
1775 | CostBefore += std::accumulate(first: Shuffles.begin(), last: Shuffles.end(), |
1776 | init: InstructionCost(0), binary_op: AddShuffleCost); |
1777 | CostBefore += std::accumulate(first: InputShuffles.begin(), last: InputShuffles.end(), |
1778 | init: InstructionCost(0), binary_op: AddShuffleCost); |
1779 | |
1780 | // The new binops will be unused for lanes past the used shuffle lengths. |
1781 | // These types attempt to get the correct cost for that from the target. |
1782 | FixedVectorType *Op0SmallVT = |
1783 | FixedVectorType::get(ElementType: VT->getScalarType(), NumElts: V1.size()); |
1784 | FixedVectorType *Op1SmallVT = |
1785 | FixedVectorType::get(ElementType: VT->getScalarType(), NumElts: V2.size()); |
1786 | InstructionCost CostAfter = |
1787 | TTI.getArithmeticInstrCost(Opcode: Op0->getOpcode(), Ty: Op0SmallVT) + |
1788 | TTI.getArithmeticInstrCost(Opcode: Op1->getOpcode(), Ty: Op1SmallVT); |
1789 | CostAfter += std::accumulate(first: ReconstructMasks.begin(), last: ReconstructMasks.end(), |
1790 | init: InstructionCost(0), binary_op: AddShuffleMaskCost); |
1791 | std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B}); |
1792 | CostAfter += |
1793 | std::accumulate(first: OutputShuffleMasks.begin(), last: OutputShuffleMasks.end(), |
1794 | init: InstructionCost(0), binary_op: AddShuffleMaskCost); |
1795 | |
1796 | LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n" ); |
1797 | LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore |
1798 | << " vs CostAfter: " << CostAfter << "\n" ); |
1799 | if (CostBefore <= CostAfter) |
1800 | return false; |
1801 | |
1802 | // The cost model has passed, create the new instructions. |
1803 | auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * { |
1804 | auto *SV = dyn_cast<ShuffleVectorInst>(Val: I); |
1805 | if (!SV) |
1806 | return I; |
1807 | if (isa<UndefValue>(Val: SV->getOperand(i_nocapture: 1))) |
1808 | if (auto *SSV = dyn_cast<ShuffleVectorInst>(Val: SV->getOperand(i_nocapture: 0))) |
1809 | if (InputShuffles.contains(Ptr: SSV)) |
1810 | return SSV->getOperand(i_nocapture: Op); |
1811 | return SV->getOperand(i_nocapture: Op); |
1812 | }; |
1813 | Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef()); |
1814 | Value *NSV0A = Builder.CreateShuffleVector(V1: GetShuffleOperand(SVI0A, 0), |
1815 | V2: GetShuffleOperand(SVI0A, 1), Mask: V1A); |
1816 | Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef()); |
1817 | Value *NSV0B = Builder.CreateShuffleVector(V1: GetShuffleOperand(SVI0B, 0), |
1818 | V2: GetShuffleOperand(SVI0B, 1), Mask: V1B); |
1819 | Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef()); |
1820 | Value *NSV1A = Builder.CreateShuffleVector(V1: GetShuffleOperand(SVI1A, 0), |
1821 | V2: GetShuffleOperand(SVI1A, 1), Mask: V2A); |
1822 | Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef()); |
1823 | Value *NSV1B = Builder.CreateShuffleVector(V1: GetShuffleOperand(SVI1B, 0), |
1824 | V2: GetShuffleOperand(SVI1B, 1), Mask: V2B); |
1825 | Builder.SetInsertPoint(Op0); |
1826 | Value *NOp0 = Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Op0->getOpcode(), |
1827 | LHS: NSV0A, RHS: NSV0B); |
1828 | if (auto *I = dyn_cast<Instruction>(Val: NOp0)) |
1829 | I->copyIRFlags(V: Op0, IncludeWrapFlags: true); |
1830 | Builder.SetInsertPoint(Op1); |
1831 | Value *NOp1 = Builder.CreateBinOp(Opc: (Instruction::BinaryOps)Op1->getOpcode(), |
1832 | LHS: NSV1A, RHS: NSV1B); |
1833 | if (auto *I = dyn_cast<Instruction>(Val: NOp1)) |
1834 | I->copyIRFlags(V: Op1, IncludeWrapFlags: true); |
1835 | |
1836 | for (int S = 0, E = ReconstructMasks.size(); S != E; S++) { |
1837 | Builder.SetInsertPoint(Shuffles[S]); |
1838 | Value *NSV = Builder.CreateShuffleVector(V1: NOp0, V2: NOp1, Mask: ReconstructMasks[S]); |
1839 | replaceValue(Old&: *Shuffles[S], New&: *NSV); |
1840 | } |
1841 | |
1842 | Worklist.pushValue(V: NSV0A); |
1843 | Worklist.pushValue(V: NSV0B); |
1844 | Worklist.pushValue(V: NSV1A); |
1845 | Worklist.pushValue(V: NSV1B); |
1846 | for (auto *S : Shuffles) |
1847 | Worklist.add(I: S); |
1848 | return true; |
1849 | } |
1850 | |
1851 | /// This is the entry point for all transforms. Pass manager differences are |
1852 | /// handled in the callers of this function. |
1853 | bool VectorCombine::run() { |
1854 | if (DisableVectorCombine) |
1855 | return false; |
1856 | |
1857 | // Don't attempt vectorization if the target does not support vectors. |
1858 | if (!TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(/*Vector*/ true))) |
1859 | return false; |
1860 | |
1861 | bool MadeChange = false; |
1862 | auto FoldInst = [this, &MadeChange](Instruction &I) { |
1863 | Builder.SetInsertPoint(&I); |
1864 | bool IsFixedVectorType = isa<FixedVectorType>(Val: I.getType()); |
1865 | auto Opcode = I.getOpcode(); |
1866 | |
1867 | // These folds should be beneficial regardless of when this pass is run |
1868 | // in the optimization pipeline. |
1869 | // The type checking is for run-time efficiency. We can avoid wasting time |
1870 | // dispatching to folding functions if there's no chance of matching. |
1871 | if (IsFixedVectorType) { |
1872 | switch (Opcode) { |
1873 | case Instruction::InsertElement: |
1874 | MadeChange |= vectorizeLoadInsert(I); |
1875 | break; |
1876 | case Instruction::ShuffleVector: |
1877 | MadeChange |= widenSubvectorLoad(I); |
1878 | break; |
1879 | default: |
1880 | break; |
1881 | } |
1882 | } |
1883 | |
1884 | // This transform works with scalable and fixed vectors |
1885 | // TODO: Identify and allow other scalable transforms |
1886 | if (isa<VectorType>(Val: I.getType())) { |
1887 | MadeChange |= scalarizeBinopOrCmp(I); |
1888 | MadeChange |= scalarizeLoadExtract(I); |
1889 | MadeChange |= scalarizeVPIntrinsic(I); |
1890 | } |
1891 | |
1892 | if (Opcode == Instruction::Store) |
1893 | MadeChange |= foldSingleElementStore(I); |
1894 | |
1895 | // If this is an early pipeline invocation of this pass, we are done. |
1896 | if (TryEarlyFoldsOnly) |
1897 | return; |
1898 | |
1899 | // Otherwise, try folds that improve codegen but may interfere with |
1900 | // early IR canonicalizations. |
1901 | // The type checking is for run-time efficiency. We can avoid wasting time |
1902 | // dispatching to folding functions if there's no chance of matching. |
1903 | if (IsFixedVectorType) { |
1904 | switch (Opcode) { |
1905 | case Instruction::InsertElement: |
1906 | MadeChange |= foldInsExtFNeg(I); |
1907 | break; |
1908 | case Instruction::ShuffleVector: |
1909 | MadeChange |= foldShuffleOfBinops(I); |
1910 | MadeChange |= foldSelectShuffle(I); |
1911 | break; |
1912 | case Instruction::BitCast: |
1913 | MadeChange |= foldBitcastShuffle(I); |
1914 | break; |
1915 | } |
1916 | } else { |
1917 | switch (Opcode) { |
1918 | case Instruction::Call: |
1919 | MadeChange |= foldShuffleFromReductions(I); |
1920 | break; |
1921 | case Instruction::ICmp: |
1922 | case Instruction::FCmp: |
1923 | MadeChange |= foldExtractExtract(I); |
1924 | break; |
1925 | default: |
1926 | if (Instruction::isBinaryOp(Opcode)) { |
1927 | MadeChange |= foldExtractExtract(I); |
1928 | MadeChange |= foldExtractedCmps(I); |
1929 | } |
1930 | break; |
1931 | } |
1932 | } |
1933 | }; |
1934 | |
1935 | for (BasicBlock &BB : F) { |
1936 | // Ignore unreachable basic blocks. |
1937 | if (!DT.isReachableFromEntry(A: &BB)) |
1938 | continue; |
1939 | // Use early increment range so that we can erase instructions in loop. |
1940 | for (Instruction &I : make_early_inc_range(Range&: BB)) { |
1941 | if (I.isDebugOrPseudoInst()) |
1942 | continue; |
1943 | FoldInst(I); |
1944 | } |
1945 | } |
1946 | |
1947 | while (!Worklist.isEmpty()) { |
1948 | Instruction *I = Worklist.removeOne(); |
1949 | if (!I) |
1950 | continue; |
1951 | |
1952 | if (isInstructionTriviallyDead(I)) { |
1953 | eraseInstruction(I&: *I); |
1954 | continue; |
1955 | } |
1956 | |
1957 | FoldInst(*I); |
1958 | } |
1959 | |
1960 | return MadeChange; |
1961 | } |
1962 | |
1963 | PreservedAnalyses VectorCombinePass::run(Function &F, |
1964 | FunctionAnalysisManager &FAM) { |
1965 | auto &AC = FAM.getResult<AssumptionAnalysis>(IR&: F); |
1966 | TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(IR&: F); |
1967 | DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F); |
1968 | AAResults &AA = FAM.getResult<AAManager>(IR&: F); |
1969 | VectorCombine Combiner(F, TTI, DT, AA, AC, TryEarlyFoldsOnly); |
1970 | if (!Combiner.run()) |
1971 | return PreservedAnalyses::all(); |
1972 | PreservedAnalyses PA; |
1973 | PA.preserveSet<CFGAnalyses>(); |
1974 | return PA; |
1975 | } |
1976 | |