//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number"));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
    "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
// even if we match a reduction but do not vectorize in the end.
static cl::opt<bool> AllowHorRdxIdenityOptimization(
    "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
    cl::desc("Allow optimization of original scalar identity operations on "
             "matched horizontal reductions."));

static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when it is probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand ordering this is used less frequently, hence
// the impact of a higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is a runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.
static constexpr int UsesLimit = 8;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important things to filter here are types which are invalid in
/// LLVM vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of the vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug output.
static std::string shortBundleName(ArrayRef<Value *> VL) {
  std::string Result;
  raw_string_ostream OS(Result);
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  OS.flush();
  return Result;
}
#endif

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  if (all_of(VL, isVectorLikeInstWithConstOps))
    return true;

  BasicBlock *BB = I0->getParent();
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
    if (!II)
      return false;

    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal
  // integer/FP constants.
  return all_of(VL, isConstant);
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
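/// For example, {%a, undef, %a} is a splat, while {%a, %b, %a} is not, and
/// neither is an all-undef list (there is no non-undef value to splat).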
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}

/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
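/// For example, a `sub i32 %a, %b` whose only users are `icmp eq/ne` against
/// zero (or suitable calls to llvm.abs) is treated as commutative, because
/// swapping the subtraction operands does not change the result of such users.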
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  ICmpInst::Predicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}

/// \returns inserting index of InsertElement or InsertValue instruction,
/// using Offset as base offset for index.
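/// For example, `insertelement <4 x i32> %v, i32 %x, i32 3` (with Offset 0)
/// yields index 3, and `insertvalue [2 x [2 x i32]] %agg, i32 %x, 1, 0`
/// yields index 1 * 2 + 0 = 2.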
static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
                                              unsigned Offset = 0) {
  int Index = Offset;
  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }

  const auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}

namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg,    ///< The mask is expected to be for permutation of 1-2 vectors,
               ///< check for the mask elements for the first argument (mask
               ///< indices are in range [0:VF)).
  SecondArg,   ///< The mask is expected to be for permutation of 2 vectors, check
               ///< for the mask elements for the second argument (mask indices
               ///< are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as used. Non-undef
               ///< elements are considered unused since they're already marked
               ///< as used in the mask.
};
} // namespace

/// Prepares a use bitset for the given mask either for the first argument or
/// for the second.
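/// For example, with VF = 4 and Mask = <0, 5, 1, poison>, UseMask::FirstArg
/// clears bits 0 and 1 (those lanes of the first vector are used), while
/// UseMask::SecondArg clears only bit 1 (element 5 maps to lane 1 of the
/// second vector).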
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}

/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getInsertIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}

/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
///                                                         i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  auto *EI0 = cast<ExtractElementInst>(*It);
  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
    return std::nullopt;
  unsigned Size =
      cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
      return std::nullopt;
    if (isa<UndefValue>(EI->getIndexOperand()))
      continue;
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    if (!Idx)
      return std::nullopt;
    // Undefined behavior if Idx is negative or >= Size.
    if (Idx->getValue().uge(Size))
      continue;
    unsigned IntIdx = Idx->getValue().getZExtValue();
    Mask[I] = IntIdx;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (IntIdx != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector; otherwise
  // we have a permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}

/// \returns the constant index if the Extract{Value,Element} instruction \p E
/// extracts a single known element, std::nullopt otherwise.
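/// For example, `extractelement <4 x i8> %v, i32 2` yields 2 and
/// `extractvalue {i32, i64} %agg, 1` yields 1; a non-constant index or an
/// extractvalue with multiple indices yields std::nullopt.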
static std::optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}

namespace {

/// Main data required for vectorization of instructions.
struct InstructionsState {
  /// The very first instruction in the list with the main opcode.
  Value *OpValue = nullptr;

  /// The main/alternate instruction.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const {
    return MainOp ? MainOp->getOpcode() : 0;
  }

  unsigned getAltOpcode() const {
    return AltOp ? AltOp->getOpcode() : 0;
  }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return AltOp != MainOp; }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};

} // end anonymous namespace

/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
/// OpValue.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
    return Op;
  return S.OpValue;
}

/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// An example of an unsupported opcode is SDIV, which can potentially cause UB
/// if the "shuffled out" lane would result in division by zero.
static bool isValidForAlternation(unsigned Opcode) {
  if (Instruction::isIntDivRem(Opcode))
    return false;

  return true;
}

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex = 0);

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
         getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
}

/// \returns true if a compare instruction \p CI has similar "look" and
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
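/// For example, `icmp sgt i32 %b, %a` is considered the same as a base
/// `icmp slt i32 %a, %b`, since both the predicate and the operands are
/// swapped.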
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}

/// \returns an analysis of the instructions in \p VL described in
/// InstructionsState, i.e. the opcode with which we suppose the whole list
/// could be vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex) {
  // Make sure these are all Instructions.
  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
  CmpInst::Predicate BasePred =
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
              : CmpInst::BAD_ICMP_PREDICATE;
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  bool SwappedPredsCompatible = [&]() {
    if (!IsCmpOp)
      return false;
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // If the total number of predicates is > 2, but only 2 remain when swapped
    // predicates are treated as compatible, consider the swappable predicates
    // as compatible opcodes, not alternate.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();
  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        // Check for compatible operands. If the corresponding operands are not
        // compatible, we need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((E == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltIndex = Cnt;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
                        CallBase->op_begin() +
                            CallBase->getBundleOperandsStartIndex()))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        }
      }
      continue;
    }
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
}

/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL.front()->getType();
  return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
}

/// \returns True if in-tree use also needs extract. This refers to a possible
/// scalar operand in a vectorized instruction.
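/// For example, if \p Scalar is the pointer operand of a store \p UserInst,
/// the store still needs the scalar pointer after vectorization, so an
/// extract is required.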
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                        TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
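/// A rough illustration: with Mask = {3, 2, 1, 0} and SubMask = {1, 3, -1, -1}
/// (poison as -1), the resulting mask is {2, 0, -1, -1}, i.e. SubMask selects
/// positions within the already-shuffled Mask.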
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() &&
        std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
                    [](int Idx) { return Idx == PoisonMaskElem; }))) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}

/// Order may have elements assigned a special value (size) which is out of
/// bounds. Such indices only appear in places which correspond to undef values
/// (see canReuseExtract for details) and are used to keep undef values from
/// affecting the operand ordering.
/// The first loop below simply finds all unused indices and then the next loop
/// nest assigns these indices for undef values positions.
/// As an example below Order has two undef positions and they have assigned
/// values 3 and 7 respectively:
/// before: 6 9 5 4 9 2 1 0
/// after:  6 3 5 4 7 2 1 0
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}

namespace llvm {

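/// Builds \p Mask as the inverse of the permutation given by \p Indices, i.e.
/// Mask[Indices[I]] = I for every position I. For example, Indices = {2, 0, 1}
/// produces Mask = {1, 2, 0}.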
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}

/// Reorders the list of scalars in accordance with the given \p Mask.
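/// For example, Scalars = {%a, %b, %c, %d} with Mask = {3, 0, 1, 2} becomes
/// {%b, %c, %d, %a} (each element moves to the lane given by its mask entry).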
965static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
966 ArrayRef<int> Mask) {
967 assert(!Mask.empty() && "Expected non-empty mask.");
968 SmallVector<Value *> Prev(Scalars.size(),
969 UndefValue::get(T: Scalars.front()->getType()));
970 Prev.swap(RHS&: Scalars);
971 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
972 if (Mask[I] != PoisonMaskElem)
973 Scalars[Mask[I]] = Prev[I];
974}
975
976/// Checks if the provided value does not require scheduling. It does not
977/// require scheduling if this is not an instruction or it is an instruction
978/// that does not read/write memory and all operands are either not instructions
979/// or phi nodes or instructions from different blocks.
980static bool areAllOperandsNonInsts(Value *V) {
981 auto *I = dyn_cast<Instruction>(Val: V);
982 if (!I)
983 return true;
984 return !mayHaveNonDefUseDependency(I: *I) &&
985 all_of(Range: I->operands(), P: [I](Value *V) {
986 auto *IO = dyn_cast<Instruction>(Val: V);
987 if (!IO)
988 return true;
989 return isa<PHINode>(Val: IO) || IO->getParent() != I->getParent();
990 });
991}
992
993/// Checks if the provided value does not require scheduling. It does not
994/// require scheduling if this is not an instruction or it is an instruction
995/// that does not read/write memory and all users are phi nodes or instructions
996/// from the different blocks.
997static bool isUsedOutsideBlock(Value *V) {
998 auto *I = dyn_cast<Instruction>(Val: V);
999 if (!I)
1000 return true;
1001 // Limits the number of uses to save compile time.
1002 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(N: UsesLimit) &&
1003 all_of(Range: I->users(), P: [I](User *U) {
1004 auto *IU = dyn_cast<Instruction>(Val: U);
1005 if (!IU)
1006 return true;
1007 return IU->getParent() != I->getParent() || isa<PHINode>(Val: IU);
1008 });
1009}
1010
1011/// Checks if the specified value does not require scheduling. It does not
1012/// require scheduling if all operands and all users do not need to be scheduled
1013/// in the current basic block.
1014static bool doesNotNeedToBeScheduled(Value *V) {
1015 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1016}
1017
1018/// Checks if the specified array of instructions does not require scheduling.
1019/// It is so if all either instructions have operands that do not require
1020/// scheduling or their users do not require scheduling since they are phis or
1021/// in other basic blocks.
1022static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1023 return !VL.empty() &&
1024 (all_of(Range&: VL, P: isUsedOutsideBlock) || all_of(Range&: VL, P: areAllOperandsNonInsts));
1025}
1026
1027namespace slpvectorizer {
1028
1029/// Bottom Up SLP Vectorizer.
1030class BoUpSLP {
1031 struct TreeEntry;
1032 struct ScheduleData;
1033 class ShuffleCostEstimator;
1034 class ShuffleInstructionBuilder;
1035
1036public:
1037 /// Tracks the state we can represent the loads in the given sequence.
1038 enum class LoadsState {
1039 Gather,
1040 Vectorize,
1041 ScatterVectorize,
1042 StridedVectorize
1043 };
1044
1045 using ValueList = SmallVector<Value *, 8>;
1046 using InstrList = SmallVector<Instruction *, 16>;
1047 using ValueSet = SmallPtrSet<Value *, 16>;
1048 using StoreList = SmallVector<StoreInst *, 8>;
1049 using ExtraValueToDebugLocsMap =
1050 MapVector<Value *, SmallVector<Instruction *, 2>>;
1051 using OrdersType = SmallVector<unsigned, 4>;
1052
1053 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1054 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1055 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1056 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1057 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1058 AC(AC), DB(DB), DL(DL), ORE(ORE),
1059 Builder(Se->getContext(), TargetFolder(*DL)) {
1060 CodeMetrics::collectEphemeralValues(L: F, AC, EphValues);
1061 // Use the vector register size specified by the target unless overridden
1062 // by a command-line option.
1063 // TODO: It would be better to limit the vectorization factor based on
1064 // data type rather than just register size. For example, x86 AVX has
1065 // 256-bit registers, but it does not support integer operations
1066 // at that width (that requires AVX2).
1067 if (MaxVectorRegSizeOption.getNumOccurrences())
1068 MaxVecRegSize = MaxVectorRegSizeOption;
1069 else
1070 MaxVecRegSize =
1071 TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
1072 .getFixedValue();
1073
1074 if (MinVectorRegSizeOption.getNumOccurrences())
1075 MinVecRegSize = MinVectorRegSizeOption;
1076 else
1077 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1078 }
1079
1080 /// Vectorize the tree that starts with the elements in \p VL.
1081 /// Returns the vectorized root.
1082 Value *vectorizeTree();
1083
1084 /// Vectorize the tree but with the list of externally used values \p
1085 /// ExternallyUsedValues. Values in this MapVector can be replaced but the
1086 /// generated extractvalue instructions.
1087 /// \param ReplacedExternals containd list of replaced external values
1088 /// {scalar, replace} after emitting extractelement for external uses.
1089 Value *
1090 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1091 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1092 Instruction *ReductionRoot = nullptr);
1093
1094 /// \returns the cost incurred by unwanted spills and fills, caused by
1095 /// holding live values over call sites.
1096 InstructionCost getSpillCost() const;
1097
1098 /// \returns the vectorization cost of the subtree that starts at \p VL.
1099 /// A negative number means that this is profitable.
1100 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1101
1102 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1103 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1104 void buildTree(ArrayRef<Value *> Roots,
1105 const SmallDenseSet<Value *> &UserIgnoreLst);
1106
1107 /// Construct a vectorizable tree that starts at \p Roots.
1108 void buildTree(ArrayRef<Value *> Roots);
1109
1110 /// Returns whether the root node has in-tree uses.
1111 bool doesRootHaveInTreeUses() const {
1112 return !VectorizableTree.empty() &&
1113 !VectorizableTree.front()->UserTreeIndices.empty();
1114 }
1115
1116 /// Return the scalars of the root node.
1117 ArrayRef<Value *> getRootNodeScalars() const {
1118 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1119 return VectorizableTree.front()->Scalars;
1120 }
1121
1122 /// Builds external uses of the vectorized scalars, i.e. the list of
1123 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1124 /// ExternallyUsedValues contains additional list of external uses to handle
1125 /// vectorization of reductions.
1126 void
1127 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1128
1129 /// Transforms graph nodes to target specific representations, if profitable.
1130 void transformNodes();
1131
1132 /// Clear the internal data structures that are created by 'buildTree'.
1133 void deleteTree() {
1134 VectorizableTree.clear();
1135 ScalarToTreeEntry.clear();
1136 MultiNodeScalars.clear();
1137 MustGather.clear();
1138 EntryToLastInstruction.clear();
1139 ExternalUses.clear();
1140 ExternalUsesAsGEPs.clear();
1141 for (auto &Iter : BlocksSchedules) {
1142 BlockScheduling *BS = Iter.second.get();
1143 BS->clear();
1144 }
1145 MinBWs.clear();
1146 ReductionBitWidth = 0;
1147 CastMaxMinBWSizes.reset();
1148 ExtraBitWidthNodes.clear();
1149 InstrElementSize.clear();
1150 UserIgnoreList = nullptr;
1151 PostponedGathers.clear();
1152 ValueToGatherNodes.clear();
1153 }
1154
1155 unsigned getTreeSize() const { return VectorizableTree.size(); }
1156
1157 /// Perform LICM and CSE on the newly generated gather sequences.
1158 void optimizeGatherSequence();
1159
1160 /// Checks if the specified gather tree entry \p TE can be represented as a
1161 /// shuffled vector entry + (possibly) permutation with other gathers. It
1162 /// implements the checks only for possibly ordered scalars (Loads,
1163 /// ExtractElement, ExtractValue), which can be part of the graph.
1164 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1165
1166 /// Sort loads into increasing pointers offsets to allow greater clustering.
1167 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1168
1169 /// Gets reordering data for the given tree entry. If the entry is vectorized
1170 /// - just return ReorderIndices, otherwise check if the scalars can be
1171 /// reordered and return the most optimal order.
1172 /// \return std::nullopt if ordering is not important, empty order, if
1173 /// identity order is important, or the actual order.
1174 /// \param TopToBottom If true, include the order of vectorized stores and
1175 /// insertelement nodes, otherwise skip them.
1176 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1177 bool TopToBottom);
1178
1179 /// Reorders the current graph to the most profitable order starting from the
1180 /// root node to the leaf nodes. The best order is chosen only from the nodes
1181 /// of the same size (vectorization factor). Smaller nodes are considered
1182 /// parts of subgraph with smaller VF and they are reordered independently. We
1183 /// can make it because we still need to extend smaller nodes to the wider VF
1184 /// and we can merge reordering shuffles with the widening shuffles.
1185 void reorderTopToBottom();
1186
1187 /// Reorders the current graph to the most profitable order starting from
1188 /// leaves to the root. It allows to rotate small subgraphs and reduce the
1189 /// number of reshuffles if the leaf nodes use the same order. In this case we
1190 /// can merge the orders and just shuffle user node instead of shuffling its
1191 /// operands. Plus, even the leaf nodes have different orders, it allows to
1192 /// sink reordering in the graph closer to the root node and merge it later
1193 /// during analysis.
1194 void reorderBottomToTop(bool IgnoreReorder = false);
1195
1196 /// \return The vector element size in bits to use when vectorizing the
1197 /// expression tree ending at \p V. If V is a store, the size is the width of
1198 /// the stored value. Otherwise, the size is the width of the largest loaded
1199 /// value reaching V. This method is used by the vectorizer to calculate
1200 /// vectorization factors.
1201 unsigned getVectorElementSize(Value *V);
1202
1203 /// Compute the minimum type sizes required to represent the entries in a
1204 /// vectorizable tree.
1205 void computeMinimumValueSizes();
1206
1207 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1208 unsigned getMaxVecRegSize() const {
1209 return MaxVecRegSize;
1210 }
1211
1212 // \returns minimum vector register size as set by cl::opt.
1213 unsigned getMinVecRegSize() const {
1214 return MinVecRegSize;
1215 }
1216
1217 unsigned getMinVF(unsigned Sz) const {
1218 return std::max(a: 2U, b: getMinVecRegSize() / Sz);
1219 }
1220
1221 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1222 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1223 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1224 return MaxVF ? MaxVF : UINT_MAX;
1225 }
1226
1227 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1228 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1229 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1230 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1231 ///
1232 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1233 unsigned canMapToVector(Type *T) const;
1234
1235 /// \returns True if the VectorizableTree is both tiny and not fully
1236 /// vectorizable. We do not vectorize such trees.
1237 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1238
1239 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1240 /// can be load combined in the backend. Load combining may not be allowed in
1241 /// the IR optimizer, so we do not want to alter the pattern. For example,
1242 /// partially transforming a scalar bswap() pattern into vector code is
1243 /// effectively impossible for the backend to undo.
1244 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1245 /// may not be necessary.
1246 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1247
1248 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1249 /// can be load combined in the backend. Load combining may not be allowed in
1250 /// the IR optimizer, so we do not want to alter the pattern. For example,
1251 /// partially transforming a scalar bswap() pattern into vector code is
1252 /// effectively impossible for the backend to undo.
1253 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1254 /// may not be necessary.
1255 bool isLoadCombineCandidate() const;
1256
1257 /// Checks if the given array of loads can be represented as a vectorized,
1258 /// scatter or just simple gather.
1259 /// \param VL list of loads.
1260 /// \param VL0 main load value.
1261 /// \param Order returned order of load instructions.
1262 /// \param PointerOps returned list of pointer operands.
1263 /// \param TryRecursiveCheck used to check if long masked gather can be
1264 /// represented as a serie of loads/insert subvector, if profitable.
1265 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1266 SmallVectorImpl<unsigned> &Order,
1267 SmallVectorImpl<Value *> &PointerOps,
1268 bool TryRecursiveCheck = true) const;
1269
1270 OptimizationRemarkEmitter *getORE() { return ORE; }
1271
1272 /// This structure holds any data we need about the edges being traversed
1273 /// during buildTree_rec(). We keep track of:
1274 /// (i) the user TreeEntry index, and
1275 /// (ii) the index of the edge.
1276 struct EdgeInfo {
1277 EdgeInfo() = default;
1278 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1279 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1280 /// The user TreeEntry.
1281 TreeEntry *UserTE = nullptr;
1282 /// The operand index of the use.
1283 unsigned EdgeIdx = UINT_MAX;
1284#ifndef NDEBUG
1285 friend inline raw_ostream &operator<<(raw_ostream &OS,
1286 const BoUpSLP::EdgeInfo &EI) {
1287 EI.dump(OS);
1288 return OS;
1289 }
1290 /// Debug print.
1291 void dump(raw_ostream &OS) const {
1292 OS << "{User:" << (UserTE ? std::to_string(val: UserTE->Idx) : "null")
1293 << " EdgeIdx:" << EdgeIdx << "}";
1294 }
1295 LLVM_DUMP_METHOD void dump() const { dump(OS&: dbgs()); }
1296#endif
1297 bool operator == (const EdgeInfo &Other) const {
1298 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1299 }
1300 };
1301
1302 /// A helper class used for scoring candidates for two consecutive lanes.
1303 class LookAheadHeuristics {
1304 const TargetLibraryInfo &TLI;
1305 const DataLayout &DL;
1306 ScalarEvolution &SE;
1307 const BoUpSLP &R;
1308 int NumLanes; // Total number of lanes (aka vectorization factor).
1309 int MaxLevel; // The maximum recursion depth for accumulating score.
1310
1311 public:
1312 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1313 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1314 int MaxLevel)
1315 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1316 MaxLevel(MaxLevel) {}
1317
1318 // The hard-coded scores listed here are not very important, though it shall
1319 // be higher for better matches to improve the resulting cost. When
1320 // computing the scores of matching one sub-tree with another, we are
1321 // basically counting the number of values that are matching. So even if all
1322 // scores are set to 1, we would still get a decent matching result.
1323 // However, sometimes we have to break ties. For example we may have to
1324 // choose between matching loads vs matching opcodes. This is what these
1325 // scores are helping us with: they provide the order of preference. Also,
1326 // this is important if the scalar is externally used or used in another
1327 // tree entry node in the different lane.
1328
1329 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1330 static const int ScoreConsecutiveLoads = 4;
1331 /// The same load multiple times. This should have a better score than
1332 /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it
1333 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1334 /// a vector load and 1.0 for a broadcast.
1335 static const int ScoreSplatLoads = 3;
1336 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1337 static const int ScoreReversedLoads = 3;
1338 /// A load candidate for masked gather.
1339 static const int ScoreMaskedGatherCandidate = 1;
1340 /// ExtractElementInst from same vector and consecutive indexes.
1341 static const int ScoreConsecutiveExtracts = 4;
1342 /// ExtractElementInst from same vector and reversed indices.
1343 static const int ScoreReversedExtracts = 3;
1344 /// Constants.
1345 static const int ScoreConstants = 2;
1346 /// Instructions with the same opcode.
1347 static const int ScoreSameOpcode = 2;
1348 /// Instructions with alt opcodes (e.g, add + sub).
1349 static const int ScoreAltOpcodes = 1;
1350 /// Identical instructions (a.k.a. splat or broadcast).
1351 static const int ScoreSplat = 1;
1352 /// Matching with an undef is preferable to failing.
1353 static const int ScoreUndef = 1;
1354 /// Score for failing to find a decent match.
1355 static const int ScoreFail = 0;
1356 /// Score if all users are vectorized.
1357 static const int ScoreAllUserVectorized = 1;
1358
1359 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1360 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1361 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1362 /// MainAltOps.
1363 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1364 ArrayRef<Value *> MainAltOps) const {
1365 if (!isValidElementType(Ty: V1->getType()) ||
1366 !isValidElementType(Ty: V2->getType()))
1367 return LookAheadHeuristics::ScoreFail;
1368
1369 if (V1 == V2) {
1370 if (isa<LoadInst>(Val: V1)) {
1371 // Retruns true if the users of V1 and V2 won't need to be extracted.
1372 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1373 // Bail out if we have too many uses to save compilation time.
1374 if (V1->hasNUsesOrMore(N: UsesLimit) || V2->hasNUsesOrMore(N: UsesLimit))
1375 return false;
1376
1377 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1378 return llvm::all_of(Range: V->users(), P: [U1, U2, this](Value *U) {
1379 return U == U1 || U == U2 || R.getTreeEntry(V: U) != nullptr;
1380 });
1381 };
1382 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1383 };
1384 // A broadcast of a load can be cheaper on some targets.
1385 if (R.TTI->isLegalBroadcastLoad(ElementTy: V1->getType(),
1386 NumElements: ElementCount::getFixed(MinVal: NumLanes)) &&
1387 ((int)V1->getNumUses() == NumLanes ||
1388 AllUsersAreInternal(V1, V2)))
1389 return LookAheadHeuristics::ScoreSplatLoads;
1390 }
1391 return LookAheadHeuristics::ScoreSplat;
1392 }
1393
1394 auto *LI1 = dyn_cast<LoadInst>(Val: V1);
1395 auto *LI2 = dyn_cast<LoadInst>(Val: V2);
1396 if (LI1 && LI2) {
1397 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1398 !LI2->isSimple())
1399 return LookAheadHeuristics::ScoreFail;
1400
1401 std::optional<int> Dist = getPointersDiff(
1402 ElemTyA: LI1->getType(), PtrA: LI1->getPointerOperand(), ElemTyB: LI2->getType(),
1403 PtrB: LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1404 if (!Dist || *Dist == 0) {
1405 if (getUnderlyingObject(V: LI1->getPointerOperand()) ==
1406 getUnderlyingObject(V: LI2->getPointerOperand()) &&
1407 R.TTI->isLegalMaskedGather(
1408 DataType: FixedVectorType::get(ElementType: LI1->getType(), NumElts: NumLanes),
1409 Alignment: LI1->getAlign()))
1410 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1411 return LookAheadHeuristics::ScoreFail;
1412 }
1413 // The distance is too large - still may be profitable to use masked
1414 // loads/gathers.
1415 if (std::abs(x: *Dist) > NumLanes / 2)
1416 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
      // This will still detect consecutive loads, but we might have "holes"
      // in some cases. It is ok for non-power-of-2 vectorization and may
      // produce better results. It should not affect current vectorization.
1420 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1421 : LookAheadHeuristics::ScoreReversedLoads;
1422 }
1423
1424 auto *C1 = dyn_cast<Constant>(Val: V1);
1425 auto *C2 = dyn_cast<Constant>(Val: V2);
1426 if (C1 && C2)
1427 return LookAheadHeuristics::ScoreConstants;
1428
    // Extracts from consecutive indices of the same vector get a better score
    // because the extracts could be optimized away.
1431 Value *EV1;
1432 ConstantInt *Ex1Idx;
1433 if (match(V: V1, P: m_ExtractElt(Val: m_Value(V&: EV1), Idx: m_ConstantInt(CI&: Ex1Idx)))) {
1434 // Undefs are always profitable for extractelements.
1435 // Compiler can easily combine poison and extractelement <non-poison> or
1436 // undef and extractelement <poison>. But combining undef +
1437 // extractelement <non-poison-but-may-produce-poison> requires some
1438 // extra operations.
1439 if (isa<UndefValue>(Val: V2))
1440 return (isa<PoisonValue>(Val: V2) || isUndefVector(V: EV1).all())
1441 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1442 : LookAheadHeuristics::ScoreSameOpcode;
1443 Value *EV2 = nullptr;
1444 ConstantInt *Ex2Idx = nullptr;
1445 if (match(V: V2,
1446 P: m_ExtractElt(Val: m_Value(V&: EV2), Idx: m_CombineOr(L: m_ConstantInt(CI&: Ex2Idx),
1447 R: m_Undef())))) {
1448 // Undefs are always profitable for extractelements.
1449 if (!Ex2Idx)
1450 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1451 if (isUndefVector(V: EV2).all() && EV2->getType() == EV1->getType())
1452 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1453 if (EV2 == EV1) {
1454 int Idx1 = Ex1Idx->getZExtValue();
1455 int Idx2 = Ex2Idx->getZExtValue();
1456 int Dist = Idx2 - Idx1;
1457 // The distance is too large - still may be profitable to use
1458 // shuffles.
1459 if (std::abs(x: Dist) == 0)
1460 return LookAheadHeuristics::ScoreSplat;
1461 if (std::abs(x: Dist) > NumLanes / 2)
1462 return LookAheadHeuristics::ScoreSameOpcode;
1463 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1464 : LookAheadHeuristics::ScoreReversedExtracts;
1465 }
1466 return LookAheadHeuristics::ScoreAltOpcodes;
1467 }
1468 return LookAheadHeuristics::ScoreFail;
1469 }
1470
1471 auto *I1 = dyn_cast<Instruction>(Val: V1);
1472 auto *I2 = dyn_cast<Instruction>(Val: V2);
1473 if (I1 && I2) {
1474 if (I1->getParent() != I2->getParent())
1475 return LookAheadHeuristics::ScoreFail;
1476 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1477 Ops.push_back(Elt: I1);
1478 Ops.push_back(Elt: I2);
1479 InstructionsState S = getSameOpcode(VL: Ops, TLI);
1480 // Note: Only consider instructions with <= 2 operands to avoid
1481 // complexity explosion.
1482 if (S.getOpcode() &&
1483 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1484 !S.isAltShuffle()) &&
1485 all_of(Range&: Ops, P: [&S](Value *V) {
1486 return cast<Instruction>(Val: V)->getNumOperands() ==
1487 S.MainOp->getNumOperands();
1488 }))
1489 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1490 : LookAheadHeuristics::ScoreSameOpcode;
1491 }
1492
1493 if (isa<UndefValue>(Val: V2))
1494 return LookAheadHeuristics::ScoreUndef;
1495
1496 return LookAheadHeuristics::ScoreFail;
1497 }
1498
1499 /// Go through the operands of \p LHS and \p RHS recursively until
  /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1501 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1502 /// of \p U1 and \p U2), except at the beginning of the recursion where
1503 /// these are set to nullptr.
1504 ///
1505 /// For example:
1506 /// \verbatim
1507 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1508 /// \ / \ / \ / \ /
1509 /// + + + +
1510 /// G1 G2 G3 G4
1511 /// \endverbatim
1512 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1513 /// each level recursively, accumulating the score. It starts from matching
1514 /// the additions at level 0, then moves on to the loads (level 1). The
1515 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1516 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1517 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1518 /// Please note that the order of the operands does not matter, as we
1519 /// evaluate the score of all profitable combinations of operands. In
1520 /// other words the score of G1 and G4 is the same as G1 and G2. This
1521 /// heuristic is based on ideas described in:
1522 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1523 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1524 /// Luís F. W. Góes
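  /// As a rough illustration (assuming MaxLevel >= 2 and simple, consecutive
  /// loads as drawn above): matching G1 with G2 scores ScoreSameOpcode (2) for
  /// the two additions, plus ScoreConsecutiveLoads (4) for {A[0],A[1]} and
  /// another 4 for {B[0],B[1]}, roughly 10 in total, while matching G1 with G3
  /// only scores the 2 for the additions because none of the load pairs match.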
1525 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1526 Instruction *U2, int CurrLevel,
1527 ArrayRef<Value *> MainAltOps) const {
1528
    // Get the shallow score of LHS and RHS.
1530 int ShallowScoreAtThisLevel =
1531 getShallowScore(V1: LHS, V2: RHS, U1, U2, MainAltOps);
1532
    // Return the current (shallow) score early if we have reached MaxLevel,
    // if LHS and RHS are not instructions, if they are the same instruction
    // (splat), if they failed to match, or if they are matching loads,
    // extractelements or instructions with more than two operands, whose
    // shallow score is already good enough.
1539 auto *I1 = dyn_cast<Instruction>(Val: LHS);
1540 auto *I2 = dyn_cast<Instruction>(Val: RHS);
1541 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1542 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1543 (((isa<LoadInst>(Val: I1) && isa<LoadInst>(Val: I2)) ||
1544 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1545 (isa<ExtractElementInst>(Val: I1) && isa<ExtractElementInst>(Val: I2))) &&
1546 ShallowScoreAtThisLevel))
1547 return ShallowScoreAtThisLevel;
1548 assert(I1 && I2 && "Should have early exited.");
1549
1550 // Contains the I2 operand indexes that got matched with I1 operands.
1551 SmallSet<unsigned, 4> Op2Used;
1552
1553 // Recursion towards the operands of I1 and I2. We are trying all possible
1554 // operand pairs, and keeping track of the best score.
1555 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1556 OpIdx1 != NumOperands1; ++OpIdx1) {
      // Try to pair the operand OpIdx1 of I1 with the best operand of I2.
1558 int MaxTmpScore = 0;
1559 unsigned MaxOpIdx2 = 0;
1560 bool FoundBest = false;
1561 // If I2 is commutative try all combinations.
1562 unsigned FromIdx = isCommutative(I: I2) ? 0 : OpIdx1;
1563 unsigned ToIdx = isCommutative(I: I2)
1564 ? I2->getNumOperands()
1565 : std::min(a: I2->getNumOperands(), b: OpIdx1 + 1);
1566 assert(FromIdx <= ToIdx && "Bad index");
1567 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1568 // Skip operands already paired with OpIdx1.
1569 if (Op2Used.count(V: OpIdx2))
1570 continue;
1571 // Recursively calculate the cost at each level
1572 int TmpScore =
1573 getScoreAtLevelRec(LHS: I1->getOperand(i: OpIdx1), RHS: I2->getOperand(i: OpIdx2),
1574 U1: I1, U2: I2, CurrLevel: CurrLevel + 1, MainAltOps: std::nullopt);
1575 // Look for the best score.
1576 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1577 TmpScore > MaxTmpScore) {
1578 MaxTmpScore = TmpScore;
1579 MaxOpIdx2 = OpIdx2;
1580 FoundBest = true;
1581 }
1582 }
1583 if (FoundBest) {
1584 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1585 Op2Used.insert(V: MaxOpIdx2);
1586 ShallowScoreAtThisLevel += MaxTmpScore;
1587 }
1588 }
1589 return ShallowScoreAtThisLevel;
1590 }
1591 };
1592 /// A helper data structure to hold the operands of a vector of instructions.
1593 /// This supports a fixed vector length for all operand vectors.
1594 class VLOperands {
1595 /// For each operand we need (i) the value, and (ii) the opcode that it
1596 /// would be attached to if the expression was in a left-linearized form.
1597 /// This is required to avoid illegal operand reordering.
1598 /// For example:
1599 /// \verbatim
1600 /// 0 Op1
1601 /// |/
1602 /// Op1 Op2 Linearized + Op2
1603 /// \ / ----------> |/
1604 /// - -
1605 ///
1606 /// Op1 - Op2 (0 + Op1) - Op2
1607 /// \endverbatim
1608 ///
1609 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1610 ///
1611 /// Another way to think of this is to track all the operations across the
1612 /// path from the operand all the way to the root of the tree and to
1613 /// calculate the operation that corresponds to this path. For example, the
1614 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1615 /// corresponding operation is a '-' (which matches the one in the
1616 /// linearized tree, as shown above).
1617 ///
1618 /// For lack of a better term, we refer to this operation as Accumulated
1619 /// Path Operation (APO).
1620 struct OperandData {
1621 OperandData() = default;
1622 OperandData(Value *V, bool APO, bool IsUsed)
1623 : V(V), APO(APO), IsUsed(IsUsed) {}
1624 /// The operand value.
1625 Value *V = nullptr;
1626 /// TreeEntries only allow a single opcode, or an alternate sequence of
1627 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
1628 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1629 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1630 /// (e.g., Add/Mul)
1631 bool APO = false;
1632 /// Helper data for the reordering function.
1633 bool IsUsed = false;
1634 };
1635
1636 /// During operand reordering, we are trying to select the operand at lane
1637 /// that matches best with the operand at the neighboring lane. Our
1638 /// selection is based on the type of value we are looking for. For example,
1639 /// if the neighboring lane has a load, we need to look for a load that is
1640 /// accessing a consecutive address. These strategies are summarized in the
1641 /// 'ReorderingMode' enumerator.
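    /// For illustration only: if the operand already selected in the
    /// neighboring lane is load(A[i]), the mode is Load and getBestOperand()
    /// will prefer a candidate that loads A[i+1] (or A[i-1]) over one that
    /// merely shares the opcode.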
1642 enum class ReorderingMode {
1643 Load, ///< Matching loads to consecutive memory addresses
1644 Opcode, ///< Matching instructions based on opcode (same or alternate)
1645 Constant, ///< Matching constants
1646 Splat, ///< Matching the same instruction multiple times (broadcast)
1647 Failed, ///< We failed to create a vectorizable group
1648 };
1649
1650 using OperandDataVec = SmallVector<OperandData, 2>;
1651
1652 /// A vector of operand vectors.
1653 SmallVector<OperandDataVec, 4> OpsVec;
1654
1655 const TargetLibraryInfo &TLI;
1656 const DataLayout &DL;
1657 ScalarEvolution &SE;
1658 const BoUpSLP &R;
1659
1660 /// \returns the operand data at \p OpIdx and \p Lane.
1661 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1662 return OpsVec[OpIdx][Lane];
1663 }
1664
1665 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1666 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1667 return OpsVec[OpIdx][Lane];
1668 }
1669
1670 /// Clears the used flag for all entries.
1671 void clearUsed() {
1672 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1673 OpIdx != NumOperands; ++OpIdx)
1674 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1675 ++Lane)
1676 OpsVec[OpIdx][Lane].IsUsed = false;
1677 }
1678
1679 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1680 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1681 std::swap(a&: OpsVec[OpIdx1][Lane], b&: OpsVec[OpIdx2][Lane]);
1682 }
1683
    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane lane for which we are looking
    /// for the best candidate.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score due to possible broadcasting of the
    /// elements in the lane. It is more profitable to have a power-of-2
    /// number of unique elements in the lane, since such a lane will be
    /// vectorized with higher probability after removing duplicates.
    /// Currently the SLP vectorizer supports only vectorization of a
    /// power-of-2 number of unique scalars.
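    /// A minimal worked example (illustrative only): with 4 lanes, if the
    /// other lanes already contain 2 unique instructions, a candidate that is
    /// one of them keeps the count at 2 (a power of two), while keeping the
    /// current operand would raise it to 3; the score is then
    /// (PowerOf2Ceil(3) - 3) - (PowerOf2Ceil(2) - 2) = 1 - 0 = 1 in favor of
    /// the candidate.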
1693 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1694 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
1695 if (!isa<Instruction>(Val: IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1696 return 0;
1697 SmallPtrSet<Value *, 4> Uniques;
1698 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1699 if (Ln == Lane)
1700 continue;
1701 Value *OpIdxLnV = getData(OpIdx, Lane: Ln).V;
1702 if (!isa<Instruction>(Val: OpIdxLnV))
1703 return 0;
1704 Uniques.insert(Ptr: OpIdxLnV);
1705 }
1706 int UniquesCount = Uniques.size();
1707 int UniquesCntWithIdxLaneV =
1708 Uniques.contains(Ptr: IdxLaneV) ? UniquesCount : UniquesCount + 1;
1709 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1710 int UniquesCntWithOpIdxLaneV =
1711 Uniques.contains(Ptr: OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1712 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1713 return 0;
1714 return (PowerOf2Ceil(A: UniquesCntWithOpIdxLaneV) -
1715 UniquesCntWithOpIdxLaneV) -
1716 (PowerOf2Ceil(A: UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1717 }
1718
    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane lane for which we are looking
    /// for the best candidate.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score for the scalar whose users are all
    /// vectorized.
1725 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1726 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
1727 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1728 // Do not care about number of uses for vector-like instructions
1729 // (extractelement/extractvalue with constant indices), they are extracts
1730 // themselves and already externally used. Vectorization of such
1731 // instructions does not add extra extractelement instruction, just may
1732 // remove it.
1733 if (isVectorLikeInstWithConstOps(V: IdxLaneV) &&
1734 isVectorLikeInstWithConstOps(V: OpIdxLaneV))
1735 return LookAheadHeuristics::ScoreAllUserVectorized;
1736 auto *IdxLaneI = dyn_cast<Instruction>(Val: IdxLaneV);
1737 if (!IdxLaneI || !isa<Instruction>(Val: OpIdxLaneV))
1738 return 0;
1739 return R.areAllUsersVectorized(I: IdxLaneI)
1740 ? LookAheadHeuristics::ScoreAllUserVectorized
1741 : 0;
1742 }
1743
    /// Score scaling factor for fully compatible instructions but with a
    /// different number of external uses. Allows better selection of the
    /// instructions with fewer external uses.
1747 static const int ScoreScaleFactor = 10;
1748
    /// \Returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match; the more they match, the higher the
    /// score. This helps break ties in an informed way when we cannot decide
    /// on the order of the operands by just considering the immediate
    /// predecessors.
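    /// For illustration (not a guarantee of the exact totals): a look-ahead
    /// score of 4 (consecutive loads), a splat score of 1 and all users
    /// vectorized combine to (4 + 1) * ScoreScaleFactor + ScoreAllUserVectorized
    /// = 5 * 10 + 1 = 51, so the external-use bonus only breaks ties between
    /// otherwise equally scored candidates.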
1754 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1755 int Lane, unsigned OpIdx, unsigned Idx,
1756 bool &IsUsed) {
1757 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1758 LookAheadMaxDepth);
1759 // Keep track of the instruction stack as we recurse into the operands
1760 // during the look-ahead score exploration.
1761 int Score =
1762 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1763 /*CurrLevel=*/1, MainAltOps);
1764 if (Score) {
1765 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1766 if (Score <= -SplatScore) {
1767 // Set the minimum score for splat-like sequence to avoid setting
1768 // failed state.
1769 Score = 1;
1770 } else {
1771 Score += SplatScore;
1772 // Scale score to see the difference between different operands
1773 // and similar operands but all vectorized/not all vectorized
1774 // uses. It does not affect actual selection of the best
1775 // compatible operand in general, just allows to select the
1776 // operand with all vectorized uses.
1777 Score *= ScoreScaleFactor;
1778 Score += getExternalUseScore(Lane, OpIdx, Idx);
1779 IsUsed = true;
1780 }
1781 }
1782 return Score;
1783 }
1784
1785 /// Best defined scores per lanes between the passes. Used to choose the
1786 /// best operand (with the highest score) between the passes.
1787 /// The key - {Operand Index, Lane}.
1788 /// The value - the best score between the passes for the lane and the
1789 /// operand.
1790 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1791 BestScoresPerLanes;
1792
    // Search all operands in Ops[*][Lane] for the one that best matches
    // Ops[OpIdx][LastLane] and return its operand index.
    // If no good match can be found, return std::nullopt.
1796 std::optional<unsigned>
1797 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1798 ArrayRef<ReorderingMode> ReorderingModes,
1799 ArrayRef<Value *> MainAltOps) {
1800 unsigned NumOperands = getNumOperands();
1801
1802 // The operand of the previous lane at OpIdx.
1803 Value *OpLastLane = getData(OpIdx, Lane: LastLane).V;
1804
1805 // Our strategy mode for OpIdx.
1806 ReorderingMode RMode = ReorderingModes[OpIdx];
1807 if (RMode == ReorderingMode::Failed)
1808 return std::nullopt;
1809
1810 // The linearized opcode of the operand at OpIdx, Lane.
1811 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1812
1813 // The best operand index and its score.
1814 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1815 // are using the score to differentiate between the two.
1816 struct BestOpData {
1817 std::optional<unsigned> Idx;
1818 unsigned Score = 0;
1819 } BestOp;
1820 BestOp.Score =
1821 BestScoresPerLanes.try_emplace(Key: std::make_pair(x&: OpIdx, y&: Lane), Args: 0)
1822 .first->second;
1823
      // Track if the operand must be marked as used. If the operand is set to
      // Score 1 explicitly (because of non-power-of-2 unique scalars), we may
      // want to re-estimate the operands again on the following iterations.
1827 bool IsUsed =
1828 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1829 // Iterate through all unused operands and look for the best.
1830 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1831 // Get the operand at Idx and Lane.
1832 OperandData &OpData = getData(OpIdx: Idx, Lane);
1833 Value *Op = OpData.V;
1834 bool OpAPO = OpData.APO;
1835
1836 // Skip already selected operands.
1837 if (OpData.IsUsed)
1838 continue;
1839
1840 // Skip if we are trying to move the operand to a position with a
1841 // different opcode in the linearized tree form. This would break the
1842 // semantics.
1843 if (OpAPO != OpIdxAPO)
1844 continue;
1845
1846 // Look for an operand that matches the current mode.
1847 switch (RMode) {
1848 case ReorderingMode::Load:
1849 case ReorderingMode::Constant:
1850 case ReorderingMode::Opcode: {
1851 bool LeftToRight = Lane > LastLane;
1852 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1853 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1854 int Score = getLookAheadScore(LHS: OpLeft, RHS: OpRight, MainAltOps, Lane,
1855 OpIdx, Idx, IsUsed);
1856 if (Score > static_cast<int>(BestOp.Score)) {
1857 BestOp.Idx = Idx;
1858 BestOp.Score = Score;
1859 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] = Score;
1860 }
1861 break;
1862 }
1863 case ReorderingMode::Splat:
1864 if (Op == OpLastLane)
1865 BestOp.Idx = Idx;
1866 break;
1867 case ReorderingMode::Failed:
1868 llvm_unreachable("Not expected Failed reordering mode.");
1869 }
1870 }
1871
1872 if (BestOp.Idx) {
1873 getData(OpIdx: *BestOp.Idx, Lane).IsUsed = IsUsed;
1874 return BestOp.Idx;
1875 }
1876 // If we could not find a good match return std::nullopt.
1877 return std::nullopt;
1878 }
1879
1880 /// Helper for reorderOperandVecs.
    /// \returns the lane that we should start reordering from. This is the one
    /// which has the least number of operands that can freely move about, or
    /// which is the least profitable to reorder because it already has the
    /// most optimal set of operands.
1884 unsigned getBestLaneToStartReordering() const {
1885 unsigned Min = UINT_MAX;
1886 unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> is used to implement a simple voting
      // algorithm to choose the lane with the least number of operands that
      // can freely move about, or that is the least profitable to reorder
      // because it already has the most optimal set of operands. The first
      // unsigned is a counter for voting, the second unsigned is the counter
      // of lanes with instructions with same/alternate opcodes and the same
      // parent basic block.
1893 MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
1894 // Try to be closer to the original results, if we have multiple lanes
1895 // with same cost. If 2 lanes have the same cost, use the one with the
1896 // lowest index.
1897 for (int I = getNumLanes(); I > 0; --I) {
1898 unsigned Lane = I - 1;
1899 OperandsOrderData NumFreeOpsHash =
1900 getMaxNumOperandsThatCanBeReordered(Lane);
1901 // Compare the number of operands that can move and choose the one with
1902 // the least number.
1903 if (NumFreeOpsHash.NumOfAPOs < Min) {
1904 Min = NumFreeOpsHash.NumOfAPOs;
1905 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1906 HashMap.clear();
1907 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
1908 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1909 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1910 // Select the most optimal lane in terms of number of operands that
1911 // should be moved around.
1912 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1913 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
1914 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1915 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1916 auto *It = HashMap.find(Key: NumFreeOpsHash.Hash);
1917 if (It == HashMap.end())
1918 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
1919 else
1920 ++It->second.first;
1921 }
1922 }
1923 // Select the lane with the minimum counter.
1924 unsigned BestLane = 0;
1925 unsigned CntMin = UINT_MAX;
1926 for (const auto &Data : reverse(C&: HashMap)) {
1927 if (Data.second.first < CntMin) {
1928 CntMin = Data.second.first;
1929 BestLane = Data.second.second;
1930 }
1931 }
1932 return BestLane;
1933 }
1934
1935 /// Data structure that helps to reorder operands.
1936 struct OperandsOrderData {
1937 /// The best number of operands with the same APOs, which can be
1938 /// reordered.
1939 unsigned NumOfAPOs = UINT_MAX;
1940 /// Number of operands with the same/alternate instruction opcode and
1941 /// parent.
1942 unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash for the actual operand ordering.
      /// Used to count operands, actually their position id and opcode
      /// value. It is used in the voting mechanism to find the lane with the
      /// least number of operands that can freely move about, or that is the
      /// least profitable to reorder because it already has the most optimal
      /// set of operands. Could be replaced with a SmallVector<unsigned>, but
      /// a hash code is faster and requires less memory.
1950 unsigned Hash = 0;
1951 };
    /// \returns the maximum number of operands that are allowed to be
    /// reordered for \p Lane and the number of compatible instructions (with
    /// the same parent/opcode). This is used as a heuristic for selecting the
    /// first lane to start operand reordering.
1956 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1957 unsigned CntTrue = 0;
1958 unsigned NumOperands = getNumOperands();
1959 // Operands with the same APO can be reordered. We therefore need to count
1960 // how many of them we have for each APO, like this: Cnt[APO] = x.
1961 // Since we only have two APOs, namely true and false, we can avoid using
1962 // a map. Instead we can simply count the number of operands that
1963 // correspond to one of them (in this case the 'true' APO), and calculate
1964 // the other by subtracting it from the total number of operands.
1965 // Operands with the same instruction opcode and parent are more
1966 // profitable since we don't need to move them in many cases, with a high
1967 // probability such lane already can be vectorized effectively.
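      // For example (illustrative): a lane holding a 'sub' has one operand
      // with APO == true and one with APO == false, so NumOfAPOs ends up as
      // max(1, 1) == 1, whereas a lane holding an 'add' has both APOs false
      // and NumOfAPOs == max(0, 2) == 2; the 'sub' lane is therefore the more
      // constrained starting point.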
1968 bool AllUndefs = true;
1969 unsigned NumOpsWithSameOpcodeParent = 0;
1970 Instruction *OpcodeI = nullptr;
1971 BasicBlock *Parent = nullptr;
1972 unsigned Hash = 0;
1973 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1974 const OperandData &OpData = getData(OpIdx, Lane);
1975 if (OpData.APO)
1976 ++CntTrue;
1977 // Use Boyer-Moore majority voting for finding the majority opcode and
1978 // the number of times it occurs.
1979 if (auto *I = dyn_cast<Instruction>(Val: OpData.V)) {
1980 if (!OpcodeI || !getSameOpcode(VL: {OpcodeI, I}, TLI).getOpcode() ||
1981 I->getParent() != Parent) {
1982 if (NumOpsWithSameOpcodeParent == 0) {
1983 NumOpsWithSameOpcodeParent = 1;
1984 OpcodeI = I;
1985 Parent = I->getParent();
1986 } else {
1987 --NumOpsWithSameOpcodeParent;
1988 }
1989 } else {
1990 ++NumOpsWithSameOpcodeParent;
1991 }
1992 }
1993 Hash = hash_combine(
1994 args: Hash, args: hash_value(value: (OpIdx + 1) * (OpData.V->getValueID() + 1)));
1995 AllUndefs = AllUndefs && isa<UndefValue>(Val: OpData.V);
1996 }
1997 if (AllUndefs)
1998 return {};
1999 OperandsOrderData Data;
2000 Data.NumOfAPOs = std::max(a: CntTrue, b: NumOperands - CntTrue);
2001 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2002 Data.Hash = Hash;
2003 return Data;
2004 }
2005
2006 /// Go through the instructions in VL and append their operands.
2007 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2008 assert(!VL.empty() && "Bad VL");
2009 assert((empty() || VL.size() == getNumLanes()) &&
2010 "Expected same number of lanes");
2011 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2012 unsigned NumOperands = cast<Instruction>(Val: VL[0])->getNumOperands();
2013 constexpr unsigned IntrinsicNumOperands = 2;
2014 if (isa<IntrinsicInst>(Val: VL[0]))
2015 NumOperands = IntrinsicNumOperands;
2016 OpsVec.resize(N: NumOperands);
2017 unsigned NumLanes = VL.size();
2018 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2019 OpsVec[OpIdx].resize(N: NumLanes);
2020 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2021 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2022 // Our tree has just 3 nodes: the root and two operands.
2023 // It is therefore trivial to get the APO. We only need to check the
2024 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2025 // RHS operand. The LHS operand of both add and sub is never attached
        // to an inverse operation in the linearized form, therefore its APO
2027 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2028
2029 // Since operand reordering is performed on groups of commutative
2030 // operations or alternating sequences (e.g., +, -), we can safely
2031 // tell the inverse operations by checking commutativity.
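          // For example (illustrative): for a bundle of 'sub' instructions
          // the operand at OpIdx 0 gets APO == false and the operand at
          // OpIdx 1 gets APO == true, while for a bundle of 'add' instructions
          // both operands get APO == false.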
2032 bool IsInverseOperation = !isCommutative(I: cast<Instruction>(Val: VL[Lane]));
2033 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2034 OpsVec[OpIdx][Lane] = {cast<Instruction>(Val: VL[Lane])->getOperand(i: OpIdx),
2035 APO, false};
2036 }
2037 }
2038 }
2039
2040 /// \returns the number of operands.
2041 unsigned getNumOperands() const { return OpsVec.size(); }
2042
2043 /// \returns the number of lanes.
2044 unsigned getNumLanes() const { return OpsVec[0].size(); }
2045
2046 /// \returns the operand value at \p OpIdx and \p Lane.
2047 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2048 return getData(OpIdx, Lane).V;
2049 }
2050
2051 /// \returns true if the data structure is empty.
2052 bool empty() const { return OpsVec.empty(); }
2053
2054 /// Clears the data.
2055 void clear() { OpsVec.clear(); }
2056
2057 /// \Returns true if there are enough operands identical to \p Op to fill
2058 /// the whole vector.
    /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
2060 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2061 bool OpAPO = getData(OpIdx, Lane).APO;
2062 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2063 if (Ln == Lane)
2064 continue;
2065 // This is set to true if we found a candidate for broadcast at Lane.
2066 bool FoundCandidate = false;
2067 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2068 OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
2069 if (Data.APO != OpAPO || Data.IsUsed)
2070 continue;
2071 if (Data.V == Op) {
2072 FoundCandidate = true;
2073 Data.IsUsed = true;
2074 break;
2075 }
2076 }
2077 if (!FoundCandidate)
2078 return false;
2079 }
2080 return true;
2081 }
2082
2083 public:
2084 /// Initialize with all the operands of the instruction vector \p RootVL.
2085 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2086 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) {
2087 // Append all the operands of RootVL.
2088 appendOperandsOfVL(VL: RootVL);
2089 }
2090
    /// \Returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
2093 ValueList getVL(unsigned OpIdx) const {
2094 ValueList OpVL(OpsVec[OpIdx].size());
2095 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2096 "Expected same num of lanes across all operands");
2097 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2098 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2099 return OpVL;
2100 }
2101
    // Performs operand reordering for 2 or more operands.
    // The original operands are in OpsVec[OpIdx][Lane] and the reordering is
    // performed in place, so OpsVec also holds the reordered operands.
2105 void reorder() {
2106 unsigned NumOperands = getNumOperands();
2107 unsigned NumLanes = getNumLanes();
2108 // Each operand has its own mode. We are using this mode to help us select
2109 // the instructions for each lane, so that they match best with the ones
2110 // we have selected so far.
2111 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2112
2113 // This is a greedy single-pass algorithm. We are going over each lane
2114 // once and deciding on the best order right away with no back-tracking.
2115 // However, in order to increase its effectiveness, we start with the lane
2116 // that has operands that can move the least. For example, given the
2117 // following lanes:
2118 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2119 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2120 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2121 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2122 // we will start at Lane 1, since the operands of the subtraction cannot
2123 // be reordered. Then we will visit the rest of the lanes in a circular
2124 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2125
2126 // Find the first lane that we will start our search from.
2127 unsigned FirstLane = getBestLaneToStartReordering();
2128
2129 // Initialize the modes.
2130 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2131 Value *OpLane0 = getValue(OpIdx, Lane: FirstLane);
2132 // Keep track if we have instructions with all the same opcode on one
2133 // side.
2134 if (isa<LoadInst>(Val: OpLane0))
2135 ReorderingModes[OpIdx] = ReorderingMode::Load;
2136 else if (isa<Instruction>(Val: OpLane0)) {
2137 // Check if OpLane0 should be broadcast.
2138 if (shouldBroadcast(Op: OpLane0, OpIdx, Lane: FirstLane))
2139 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2140 else
2141 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(Val: OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
2145 else if (isa<Argument>(Val: OpLane0))
2146 // Our best hope is a Splat. It may save some cost in some cases.
2147 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2148 else
2149 // NOTE: This should be unreachable.
2150 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2151 }
2152
      // Check that we don't have the same operands. There is no need to
      // reorder if the operands are just a perfect-diamond or a
      // shuffled-diamond match. Do not skip the reordering for possible
      // broadcasts or for a non-power-of-2 number of scalars (just for now).
2157 auto &&SkipReordering = [this]() {
2158 SmallPtrSet<Value *, 4> UniqueValues;
2159 ArrayRef<OperandData> Op0 = OpsVec.front();
2160 for (const OperandData &Data : Op0)
2161 UniqueValues.insert(Ptr: Data.V);
2162 for (ArrayRef<OperandData> Op : drop_begin(RangeOrContainer&: OpsVec, N: 1)) {
2163 if (any_of(Range&: Op, P: [&UniqueValues](const OperandData &Data) {
2164 return !UniqueValues.contains(Ptr: Data.V);
2165 }))
2166 return false;
2167 }
2168 // TODO: Check if we can remove a check for non-power-2 number of
2169 // scalars after full support of non-power-2 vectorization.
2170 return UniqueValues.size() != 2 && isPowerOf2_32(Value: UniqueValues.size());
2171 };
2172
2173 // If the initial strategy fails for any of the operand indexes, then we
2174 // perform reordering again in a second pass. This helps avoid assigning
2175 // high priority to the failed strategy, and should improve reordering for
2176 // the non-failed operand indexes.
2177 for (int Pass = 0; Pass != 2; ++Pass) {
        // Check if there is no need to reorder the operands because they form
        // a perfect or shuffled diamond match.
        // This is needed to avoid counting extra external-use cost for
        // shuffled matches, which may cause regressions.
2182 if (SkipReordering())
2183 break;
2184 // Skip the second pass if the first pass did not fail.
2185 bool StrategyFailed = false;
2186 // Mark all operand data as free to use.
2187 clearUsed();
2188 // We keep the original operand order for the FirstLane, so reorder the
2189 // rest of the lanes. We are visiting the nodes in a circular fashion,
2190 // using FirstLane as the center point and increasing the radius
2191 // distance.
2192 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2193 for (unsigned I = 0; I < NumOperands; ++I)
2194 MainAltOps[I].push_back(Elt: getData(OpIdx: I, Lane: FirstLane).V);
2195
2196 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2197 // Visit the lane on the right and then the lane on the left.
2198 for (int Direction : {+1, -1}) {
2199 int Lane = FirstLane + Direction * Distance;
2200 if (Lane < 0 || Lane >= (int)NumLanes)
2201 continue;
2202 int LastLane = Lane - Direction;
2203 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2204 "Out of bounds");
2205 // Look for a good match for each operand.
2206 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
              // Search for the operand that best matches the one at OpIdx in
              // LastLane.
2208 std::optional<unsigned> BestIdx = getBestOperand(
2209 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps: MainAltOps[OpIdx]);
2210 // By not selecting a value, we allow the operands that follow to
2211 // select a better matching value. We will get a non-null value in
2212 // the next run of getBestOperand().
2213 if (BestIdx) {
2214 // Swap the current operand with the one returned by
2215 // getBestOperand().
2216 swap(OpIdx1: OpIdx, OpIdx2: *BestIdx, Lane);
2217 } else {
2218 // We failed to find a best operand, set mode to 'Failed'.
2219 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2220 // Enable the second pass.
2221 StrategyFailed = true;
2222 }
2223 // Try to get the alternate opcode and follow it during analysis.
2224 if (MainAltOps[OpIdx].size() != 2) {
2225 OperandData &AltOp = getData(OpIdx, Lane);
2226 InstructionsState OpS =
2227 getSameOpcode(VL: {MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2228 if (OpS.getOpcode() && OpS.isAltShuffle())
2229 MainAltOps[OpIdx].push_back(Elt: AltOp.V);
2230 }
2231 }
2232 }
2233 }
2234 // Skip second pass if the strategy did not fail.
2235 if (!StrategyFailed)
2236 break;
2237 }
2238 }
2239
2240#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2241 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2242 switch (RMode) {
2243 case ReorderingMode::Load:
2244 return "Load";
2245 case ReorderingMode::Opcode:
2246 return "Opcode";
2247 case ReorderingMode::Constant:
2248 return "Constant";
2249 case ReorderingMode::Splat:
2250 return "Splat";
2251 case ReorderingMode::Failed:
2252 return "Failed";
2253 }
2254 llvm_unreachable("Unimplemented Reordering Type");
2255 }
2256
2257 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2258 raw_ostream &OS) {
2259 return OS << getModeStr(RMode);
2260 }
2261
2262 /// Debug print.
2263 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2264 printMode(RMode, OS&: dbgs());
2265 }
2266
2267 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2268 return printMode(RMode, OS);
2269 }
2270
2271 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2272 const unsigned Indent = 2;
2273 unsigned Cnt = 0;
2274 for (const OperandDataVec &OpDataVec : OpsVec) {
2275 OS << "Operand " << Cnt++ << "\n";
2276 for (const OperandData &OpData : OpDataVec) {
2277 OS.indent(NumSpaces: Indent) << "{";
2278 if (Value *V = OpData.V)
2279 OS << *V;
2280 else
2281 OS << "null";
2282 OS << ", APO:" << OpData.APO << "}\n";
2283 }
2284 OS << "\n";
2285 }
2286 return OS;
2287 }
2288
2289 /// Debug print.
2290 LLVM_DUMP_METHOD void dump() const { print(OS&: dbgs()); }
2291#endif
2292 };
2293
  /// Evaluate each pair in \p Candidates and return the index into
  /// \p Candidates of the pair with the highest score, i.e. the one deemed to
  /// have the best chance to form the root of a profitable tree to vectorize.
  /// Return std::nullopt if no candidate scored above
  /// LookAheadHeuristics::ScoreFail.
  /// \param Limit The lower limit of the score that is considered good enough.
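  /// For illustration (assuming the candidates are simple loads): given
  /// Candidates = {{load A[0], load A[1]}, {load A[0], load B[0]}}, index 0 is
  /// returned, since the consecutive loads score higher than the unrelated
  /// pair, provided that score exceeds \p Limit.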
2299 std::optional<int>
2300 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2301 int Limit = LookAheadHeuristics::ScoreFail) const {
2302 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2303 RootLookAheadMaxDepth);
2304 int BestScore = Limit;
2305 std::optional<int> Index;
2306 for (int I : seq<int>(Begin: 0, End: Candidates.size())) {
2307 int Score = LookAhead.getScoreAtLevelRec(LHS: Candidates[I].first,
2308 RHS: Candidates[I].second,
2309 /*U1=*/nullptr, /*U2=*/nullptr,
2310 /*Level=*/CurrLevel: 1, MainAltOps: std::nullopt);
2311 if (Score > BestScore) {
2312 BestScore = Score;
2313 Index = I;
2314 }
2315 }
2316 return Index;
2317 }
2318
2319 /// Checks if the instruction is marked for deletion.
2320 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(V: I); }
2321
2322 /// Removes an instruction from its block and eventually deletes it.
2323 /// It's like Instruction::eraseFromParent() except that the actual deletion
2324 /// is delayed until BoUpSLP is destructed.
2325 void eraseInstruction(Instruction *I) {
2326 DeletedInstructions.insert(V: I);
2327 }
2328
  /// Checks if the instruction was already analyzed for being a possible
  /// reduction root.
2331 bool isAnalyzedReductionRoot(Instruction *I) const {
2332 return AnalyzedReductionsRoots.count(Ptr: I);
2333 }
  /// Register the given instruction as already analyzed for being a possible
  /// reduction root.
2336 void analyzedReductionRoot(Instruction *I) {
2337 AnalyzedReductionsRoots.insert(Ptr: I);
2338 }
2339 /// Checks if the provided list of reduced values was checked already for
2340 /// vectorization.
2341 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2342 return AnalyzedReductionVals.contains(V: hash_value(S: VL));
2343 }
  /// Adds the list of reduced values to the list of values already checked
  /// for vectorization.
2346 void analyzedReductionVals(ArrayRef<Value *> VL) {
2347 AnalyzedReductionVals.insert(V: hash_value(S: VL));
2348 }
2349 /// Clear the list of the analyzed reduction root instructions.
2350 void clearReductionData() {
2351 AnalyzedReductionsRoots.clear();
2352 AnalyzedReductionVals.clear();
2353 AnalyzedMinBWVals.clear();
2354 }
2355 /// Checks if the given value is gathered in one of the nodes.
2356 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2357 return any_of(Range: MustGather, P: [&](Value *V) { return Vals.contains(V); });
2358 }
2359
2360 /// Check if the value is vectorized in the tree.
2361 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2362
2363 ~BoUpSLP();
2364
2365private:
  /// Determine if a node \p E can be demoted to a smaller type with a
2367 /// truncation. We collect the entries that will be demoted in ToDemote.
2368 /// \param E Node for analysis
2369 /// \param ToDemote indices of the nodes to be demoted.
2370 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2371 unsigned &BitWidth,
2372 SmallVectorImpl<unsigned> &ToDemote,
2373 DenseSet<const TreeEntry *> &Visited,
2374 unsigned &MaxDepthLevel,
2375 bool &IsProfitableToDemote,
2376 bool IsTruncRoot) const;
2377
  /// Check if the operands on the edges \p Edges of the \p UserTE allow
  /// reordering (i.e. the operands can be reordered because they have only one
  /// user and are reorderable).
2381 /// \param ReorderableGathers List of all gather nodes that require reordering
  /// (e.g., gather of extractelements or partially vectorizable loads).
2383 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2384 /// reordering, subset of \p NonVectorized.
2385 bool
2386 canReorderOperands(TreeEntry *UserTE,
2387 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2388 ArrayRef<TreeEntry *> ReorderableGathers,
2389 SmallVectorImpl<TreeEntry *> &GatherOps);
2390
2391 /// Checks if the given \p TE is a gather node with clustered reused scalars
2392 /// and reorders it per given \p Mask.
2393 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2394
2395 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2396 /// if any. If it is not vectorized (gather node), returns nullptr.
2397 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2398 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2399 TreeEntry *TE = nullptr;
2400 const auto *It = find_if(Range&: VL, P: [&](Value *V) {
2401 TE = getTreeEntry(V);
2402 if (TE && is_contained(Range&: TE->UserTreeIndices, Element: EdgeInfo(UserTE, OpIdx)))
2403 return true;
2404 auto It = MultiNodeScalars.find(Val: V);
2405 if (It != MultiNodeScalars.end()) {
2406 for (TreeEntry *E : It->second) {
2407 if (is_contained(Range&: E->UserTreeIndices, Element: EdgeInfo(UserTE, OpIdx))) {
2408 TE = E;
2409 return true;
2410 }
2411 }
2412 }
2413 return false;
2414 });
2415 if (It != VL.end()) {
2416 assert(TE->isSame(VL) && "Expected same scalars.");
2417 return TE;
2418 }
2419 return nullptr;
2420 }
2421
2422 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2423 /// if any. If it is not vectorized (gather node), returns nullptr.
2424 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2425 unsigned OpIdx) const {
2426 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2427 UserTE: const_cast<TreeEntry *>(UserTE), OpIdx);
2428 }
2429
2430 /// Checks if all users of \p I are the part of the vectorization tree.
2431 bool areAllUsersVectorized(
2432 Instruction *I,
2433 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2434
2435 /// Return information about the vector formed for the specified index
2436 /// of a vector of (the same) instruction.
2437 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2438
  /// \returns the graph entry for the \p Idx operand of the \p E entry.
2440 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2441
2442 /// \returns Cast context for the given graph node.
2443 TargetTransformInfo::CastContextHint
2444 getCastContextHint(const TreeEntry &TE) const;
2445
2446 /// \returns the cost of the vectorizable entry.
2447 InstructionCost getEntryCost(const TreeEntry *E,
2448 ArrayRef<Value *> VectorizedVals,
2449 SmallPtrSetImpl<Value *> &CheckedExtracts);
2450
2451 /// This is the recursive part of buildTree.
2452 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2453 const EdgeInfo &EI);
2454
2455 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2456 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2457 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2458 /// returns false, setting \p CurrentOrder to either an empty vector or a
  /// non-identity permutation that allows reusing the extract instructions.
2460 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2461 /// extract order.
2462 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2463 SmallVectorImpl<unsigned> &CurrentOrder,
2464 bool ResizeAllowed = false) const;
2465
2466 /// Vectorize a single entry in the tree.
  /// \param PostponedPHIs true if the emission of phi nodes needs to be
  /// postponed to avoid issues with def-use order.
2469 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2470
2471 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2472 /// \p E.
  /// \param PostponedPHIs true if the emission of phi nodes needs to be
  /// postponed to avoid issues with def-use order.
2475 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2476
2477 /// Create a new vector from a list of scalar values. Produces a sequence
2478 /// which exploits values reused across lanes, and arranges the inserts
2479 /// for ease of later optimization.
2480 template <typename BVTy, typename ResTy, typename... Args>
2481 ResTy processBuildVector(const TreeEntry *E, Args &...Params);
2482
2483 /// Create a new vector from a list of scalar values. Produces a sequence
2484 /// which exploits values reused across lanes, and arranges the inserts
2485 /// for ease of later optimization.
2486 Value *createBuildVector(const TreeEntry *E);
2487
2488 /// Returns the instruction in the bundle, which can be used as a base point
2489 /// for scheduling. Usually it is the last instruction in the bundle, except
2490 /// for the case when all operands are external (in this case, it is the first
2491 /// instruction in the list).
2492 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2493
2494 /// Tries to find extractelement instructions with constant indices from fixed
2495 /// vector type and gather such instructions into a bunch, which highly likely
2496 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2497 /// was successful, the matched scalars are replaced by poison values in \p VL
2498 /// for future analysis.
2499 std::optional<TargetTransformInfo::ShuffleKind>
2500 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2501 SmallVectorImpl<int> &Mask) const;
2502
2503 /// Tries to find extractelement instructions with constant indices from fixed
2504 /// vector type and gather such instructions into a bunch, which highly likely
2505 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2506 /// was successful, the matched scalars are replaced by poison values in \p VL
2507 /// for future analysis.
2508 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2509 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2510 SmallVectorImpl<int> &Mask,
2511 unsigned NumParts) const;
2512
2513 /// Checks if the gathered \p VL can be represented as a single register
2514 /// shuffle(s) of previous tree entries.
2515 /// \param TE Tree entry checked for permutation.
  /// \param VL List of scalars (a subset of the TE scalars), checked for
  /// permutations. Must form a single-register vector.
2518 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2519 /// commands to build the mask using the original vector value, without
2520 /// relying on the potential reordering.
2521 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2522 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2523 std::optional<TargetTransformInfo::ShuffleKind>
2524 isGatherShuffledSingleRegisterEntry(
2525 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2526 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2527 bool ForOrder);
2528
2529 /// Checks if the gathered \p VL can be represented as multi-register
2530 /// shuffle(s) of previous tree entries.
2531 /// \param TE Tree entry checked for permutation.
  /// \param VL List of scalars (a subset of the TE scalars), checked for
  /// permutations.
2534 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2535 /// commands to build the mask using the original vector value, without
2536 /// relying on the potential reordering.
2537 /// \returns per-register series of ShuffleKind, if gathered values can be
2538 /// represented as shuffles of previous tree entries. \p Mask is filled with
2539 /// the shuffle mask (also on per-register base).
2540 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2541 isGatherShuffledEntry(
2542 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2543 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2544 unsigned NumParts, bool ForOrder = false);
2545
2546 /// \returns the scalarization cost for this list of values. Assuming that
2547 /// this subtree gets vectorized, we may need to extract the values from the
2548 /// roots. This method calculates the cost of extracting the values.
2549 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2550 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
2551
2552 /// Set the Builder insert point to one after the last instruction in
2553 /// the bundle
2554 void setInsertPointAfterBundle(const TreeEntry *E);
2555
  /// \returns a vector from a collection of scalars in \p VL. If \p Root is
  /// not specified, the starting vector value is poison.
2558 Value *gather(ArrayRef<Value *> VL, Value *Root);
2559
  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even if the tree height is tiny.
2562 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2563
2564 /// Reorder commutative or alt operands to get better probability of
2565 /// generating vectorized code.
2566 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2567 SmallVectorImpl<Value *> &Left,
2568 SmallVectorImpl<Value *> &Right,
2569 const BoUpSLP &R);
2570
2571 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2572 /// users of \p TE and collects the stores. It returns the map from the store
2573 /// pointers to the collected stores.
2574 DenseMap<Value *, SmallVector<StoreInst *>>
2575 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2576
2577 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2578 /// stores in \p StoresVec can form a vector instruction. If so it returns
2579 /// true and populates \p ReorderIndices with the shuffle indices of the
2580 /// stores when compared to the sorted vector.
2581 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2582 OrdersType &ReorderIndices) const;
2583
2584 /// Iterates through the users of \p TE, looking for scalar stores that can be
2585 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2586 /// their order and builds an order index vector for each store bundle. It
2587 /// returns all these order vectors found.
2588 /// We run this after the tree has formed, otherwise we may come across user
2589 /// instructions that are not yet in the tree.
2590 SmallVector<OrdersType, 1>
2591 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2592
2593 struct TreeEntry {
2594 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2595 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2596
2597 /// \returns Common mask for reorder indices and reused scalars.
2598 SmallVector<int> getCommonMask() const {
2599 SmallVector<int> Mask;
2600 inversePermutation(Indices: ReorderIndices, Mask);
2601 ::addMask(Mask, SubMask: ReuseShuffleIndices);
2602 return Mask;
2603 }
2604
2605 /// \returns true if the scalars in VL are equal to this entry.
2606 bool isSame(ArrayRef<Value *> VL) const {
2607 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2608 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2609 return std::equal(first1: VL.begin(), last1: VL.end(), first2: Scalars.begin());
2610 return VL.size() == Mask.size() &&
2611 std::equal(first1: VL.begin(), last1: VL.end(), first2: Mask.begin(),
2612 binary_pred: [Scalars](Value *V, int Idx) {
2613 return (isa<UndefValue>(Val: V) &&
2614 Idx == PoisonMaskElem) ||
2615 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2616 });
2617 };
2618 if (!ReorderIndices.empty()) {
2619 // TODO: implement matching if the nodes are just reordered, still can
2620 // treat the vector as the same if the list of scalars matches VL
2621 // directly, without reordering.
2622 SmallVector<int> Mask;
2623 inversePermutation(Indices: ReorderIndices, Mask);
2624 if (VL.size() == Scalars.size())
2625 return IsSame(Scalars, Mask);
2626 if (VL.size() == ReuseShuffleIndices.size()) {
2627 ::addMask(Mask, SubMask: ReuseShuffleIndices);
2628 return IsSame(Scalars, Mask);
2629 }
2630 return false;
2631 }
2632 return IsSame(Scalars, ReuseShuffleIndices);
2633 }
2634
2635 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2636 return State == TreeEntry::NeedToGather &&
2637 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2638 UserTreeIndices.front().UserTE == UserEI.UserTE;
2639 }
2640
2641 /// \returns true if current entry has same operands as \p TE.
2642 bool hasEqualOperands(const TreeEntry &TE) const {
2643 if (TE.getNumOperands() != getNumOperands())
2644 return false;
2645 SmallBitVector Used(getNumOperands());
2646 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2647 unsigned PrevCount = Used.count();
2648 for (unsigned K = 0; K < E; ++K) {
2649 if (Used.test(Idx: K))
2650 continue;
2651 if (getOperand(OpIdx: K) == TE.getOperand(OpIdx: I)) {
2652 Used.set(K);
2653 break;
2654 }
2655 }
2656 // Check if we actually found the matching operand.
2657 if (PrevCount == Used.count())
2658 return false;
2659 }
2660 return true;
2661 }
2662
    /// \return Final vectorization factor for the node. Defined by the total
    /// number of vectorized scalars, including those used several times in the
    /// entry and counted in the \a ReuseShuffleIndices, if any.
2666 unsigned getVectorFactor() const {
2667 if (!ReuseShuffleIndices.empty())
2668 return ReuseShuffleIndices.size();
2669 return Scalars.size();
2670 };
2671
2672 /// A vector of scalars.
2673 ValueList Scalars;
2674
2675 /// The Scalars are vectorized into this value. It is initialized to Null.
2676 WeakTrackingVH VectorizedValue = nullptr;
2677
2678 /// New vector phi instructions emitted for the vectorized phi nodes.
2679 PHINode *PHI = nullptr;
2680
2681 /// Do we need to gather this sequence or vectorize it
2682 /// (either with vector instruction or with scatter/gather
2683 /// intrinsics for store/load)?
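    /// Roughly: Vectorize uses regular wide instructions, ScatterVectorize uses
    /// masked gather/scatter intrinsics, StridedVectorize uses strided memory
    /// accesses, and NeedToGather builds the vector from the scalars.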
2684 enum EntryState {
2685 Vectorize,
2686 ScatterVectorize,
2687 StridedVectorize,
2688 NeedToGather
2689 };
2690 EntryState State;
2691
2692 /// Does this sequence require some shuffling?
2693 SmallVector<int, 4> ReuseShuffleIndices;
2694
2695 /// Does this entry require reordering?
2696 SmallVector<unsigned, 4> ReorderIndices;
2697
2698 /// Points back to the VectorizableTree.
2699 ///
2700 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2701 /// to be a pointer and needs to be able to initialize the child iterator.
2702 /// Thus we need a reference back to the container to translate the indices
2703 /// to entries.
2704 VecTreeTy &Container;
2705
2706 /// The TreeEntry index containing the user of this entry. We can actually
2707 /// have multiple users so the data structure is not truly a tree.
2708 SmallVector<EdgeInfo, 1> UserTreeIndices;
2709
2710 /// The index of this treeEntry in VectorizableTree.
2711 int Idx = -1;
2712
2713 private:
2714    /// The operands of each instruction in each lane: Operands[op_index][lane].
2715 /// Note: This helps avoid the replication of the code that performs the
2716 /// reordering of operands during buildTree_rec() and vectorizeTree().
2717 SmallVector<ValueList, 2> Operands;
2718
2719 /// The main/alternate instruction.
2720 Instruction *MainOp = nullptr;
2721 Instruction *AltOp = nullptr;
2722
2723 public:
2724 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2725 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2726 if (Operands.size() < OpIdx + 1)
2727 Operands.resize(N: OpIdx + 1);
2728 assert(Operands[OpIdx].empty() && "Already resized?");
2729 assert(OpVL.size() <= Scalars.size() &&
2730 "Number of operands is greater than the number of scalars.");
2731 Operands[OpIdx].resize(N: OpVL.size());
2732 copy(Range&: OpVL, Out: Operands[OpIdx].begin());
2733 }
2734
2735 /// Set the operands of this bundle in their original order.
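    /// For example, for a bundle of two adds {a0 + b0, a1 + b1} this produces
    /// Operands[0] = {a0, a1} and Operands[1] = {b0, b1}.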
2736 void setOperandsInOrder() {
2737 assert(Operands.empty() && "Already initialized?");
2738 auto *I0 = cast<Instruction>(Val: Scalars[0]);
2739 Operands.resize(N: I0->getNumOperands());
2740 unsigned NumLanes = Scalars.size();
2741 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2742 OpIdx != NumOperands; ++OpIdx) {
2743 Operands[OpIdx].resize(N: NumLanes);
2744 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2745 auto *I = cast<Instruction>(Val: Scalars[Lane]);
2746 assert(I->getNumOperands() == NumOperands &&
2747 "Expected same number of operands");
2748 Operands[OpIdx][Lane] = I->getOperand(i: OpIdx);
2749 }
2750 }
2751 }
2752
2753 /// Reorders operands of the node to the given mask \p Mask.
2754 void reorderOperands(ArrayRef<int> Mask) {
2755 for (ValueList &Operand : Operands)
2756 reorderScalars(Scalars&: Operand, Mask);
2757 }
2758
2759 /// \returns the \p OpIdx operand of this TreeEntry.
2760 ValueList &getOperand(unsigned OpIdx) {
2761 assert(OpIdx < Operands.size() && "Off bounds");
2762 return Operands[OpIdx];
2763 }
2764
2765 /// \returns the \p OpIdx operand of this TreeEntry.
2766 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2767 assert(OpIdx < Operands.size() && "Off bounds");
2768 return Operands[OpIdx];
2769 }
2770
2771 /// \returns the number of operands.
2772 unsigned getNumOperands() const { return Operands.size(); }
2773
2774 /// \return the single \p OpIdx operand.
2775 Value *getSingleOperand(unsigned OpIdx) const {
2776 assert(OpIdx < Operands.size() && "Off bounds");
2777 assert(!Operands[OpIdx].empty() && "No operand available");
2778 return Operands[OpIdx][0];
2779 }
2780
2781 /// Some of the instructions in the list have alternate opcodes.
2782 bool isAltShuffle() const { return MainOp != AltOp; }
2783
2784 bool isOpcodeOrAlt(Instruction *I) const {
2785 unsigned CheckedOpcode = I->getOpcode();
2786 return (getOpcode() == CheckedOpcode ||
2787 getAltOpcode() == CheckedOpcode);
2788 }
2789
2790 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2791 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
2792 /// \p OpValue.
2793 Value *isOneOf(Value *Op) const {
2794 auto *I = dyn_cast<Instruction>(Val: Op);
2795 if (I && isOpcodeOrAlt(I))
2796 return Op;
2797 return MainOp;
2798 }
2799
2800 void setOperations(const InstructionsState &S) {
2801 MainOp = S.MainOp;
2802 AltOp = S.AltOp;
2803 }
2804
2805 Instruction *getMainOp() const {
2806 return MainOp;
2807 }
2808
2809 Instruction *getAltOp() const {
2810 return AltOp;
2811 }
2812
2813 /// The main/alternate opcodes for the list of instructions.
2814 unsigned getOpcode() const {
2815 return MainOp ? MainOp->getOpcode() : 0;
2816 }
2817
2818 unsigned getAltOpcode() const {
2819 return AltOp ? AltOp->getOpcode() : 0;
2820 }
2821
2822    /// When ReorderIndices and ReuseShuffleIndices are empty, just returns the
2823    /// position of \p V within Scalars. Otherwise remaps it through those indices.
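    /// For example, with Scalars = {a, b}, empty reorder indices and
    /// ReuseShuffleIndices = {1, 0, 1, 0}, a lookup of b finds lane 1 in Scalars
    /// and returns 0, the first position of that lane in the reuse mask.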
2824 int findLaneForValue(Value *V) const {
2825 unsigned FoundLane = std::distance(first: Scalars.begin(), last: find(Range: Scalars, Val: V));
2826 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2827 if (!ReorderIndices.empty())
2828 FoundLane = ReorderIndices[FoundLane];
2829 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2830 if (!ReuseShuffleIndices.empty()) {
2831 FoundLane = std::distance(first: ReuseShuffleIndices.begin(),
2832 last: find(Range: ReuseShuffleIndices, Val: FoundLane));
2833 }
2834 return FoundLane;
2835 }
2836
2837 /// Build a shuffle mask for graph entry which represents a merge of main
2838 /// and alternate operations.
2839 void
2840 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2841 SmallVectorImpl<int> &Mask,
2842 SmallVectorImpl<Value *> *OpScalars = nullptr,
2843 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2844
2845 /// Return true if this is a non-power-of-2 node.
2846 bool isNonPowOf2Vec() const {
2847 bool IsNonPowerOf2 = !isPowerOf2_32(Value: Scalars.size());
2848 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
2849 "Reshuffling not supported with non-power-of-2 vectors yet.");
2850 return IsNonPowerOf2;
2851 }
2852
2853#ifndef NDEBUG
2854 /// Debug printer.
2855 LLVM_DUMP_METHOD void dump() const {
2856 dbgs() << Idx << ".\n";
2857 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2858 dbgs() << "Operand " << OpI << ":\n";
2859 for (const Value *V : Operands[OpI])
2860 dbgs().indent(NumSpaces: 2) << *V << "\n";
2861 }
2862 dbgs() << "Scalars: \n";
2863 for (Value *V : Scalars)
2864 dbgs().indent(NumSpaces: 2) << *V << "\n";
2865 dbgs() << "State: ";
2866 switch (State) {
2867 case Vectorize:
2868 dbgs() << "Vectorize\n";
2869 break;
2870 case ScatterVectorize:
2871 dbgs() << "ScatterVectorize\n";
2872 break;
2873 case StridedVectorize:
2874 dbgs() << "StridedVectorize\n";
2875 break;
2876 case NeedToGather:
2877 dbgs() << "NeedToGather\n";
2878 break;
2879 }
2880 dbgs() << "MainOp: ";
2881 if (MainOp)
2882 dbgs() << *MainOp << "\n";
2883 else
2884 dbgs() << "NULL\n";
2885 dbgs() << "AltOp: ";
2886 if (AltOp)
2887 dbgs() << *AltOp << "\n";
2888 else
2889 dbgs() << "NULL\n";
2890 dbgs() << "VectorizedValue: ";
2891 if (VectorizedValue)
2892 dbgs() << *VectorizedValue << "\n";
2893 else
2894 dbgs() << "NULL\n";
2895 dbgs() << "ReuseShuffleIndices: ";
2896 if (ReuseShuffleIndices.empty())
2897 dbgs() << "Empty";
2898 else
2899 for (int ReuseIdx : ReuseShuffleIndices)
2900 dbgs() << ReuseIdx << ", ";
2901 dbgs() << "\n";
2902 dbgs() << "ReorderIndices: ";
2903 for (unsigned ReorderIdx : ReorderIndices)
2904 dbgs() << ReorderIdx << ", ";
2905 dbgs() << "\n";
2906 dbgs() << "UserTreeIndices: ";
2907 for (const auto &EInfo : UserTreeIndices)
2908 dbgs() << EInfo << ", ";
2909 dbgs() << "\n";
2910 }
2911#endif
2912 };
2913
2914#ifndef NDEBUG
2915 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2916 InstructionCost VecCost, InstructionCost ScalarCost,
2917 StringRef Banner) const {
2918 dbgs() << "SLP: " << Banner << ":\n";
2919 E->dump();
2920 dbgs() << "SLP: Costs:\n";
2921 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2922 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2923 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2924 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2925 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2926 }
2927#endif
2928
2929 /// Create a new VectorizableTree entry.
2930 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2931 std::optional<ScheduleData *> Bundle,
2932 const InstructionsState &S,
2933 const EdgeInfo &UserTreeIdx,
2934 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2935 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2936 TreeEntry::EntryState EntryState =
2937 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2938 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2939 ReuseShuffleIndices, ReorderIndices);
2940 }
2941
2942 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2943 TreeEntry::EntryState EntryState,
2944 std::optional<ScheduleData *> Bundle,
2945 const InstructionsState &S,
2946 const EdgeInfo &UserTreeIdx,
2947 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2948 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2949 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2950 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2951 "Need to vectorize gather entry?");
2952 VectorizableTree.push_back(Elt: std::make_unique<TreeEntry>(args&: VectorizableTree));
2953 TreeEntry *Last = VectorizableTree.back().get();
2954 Last->Idx = VectorizableTree.size() - 1;
2955 Last->State = EntryState;
2956 Last->ReuseShuffleIndices.append(in_start: ReuseShuffleIndices.begin(),
2957 in_end: ReuseShuffleIndices.end());
2958 if (ReorderIndices.empty()) {
2959 Last->Scalars.assign(in_start: VL.begin(), in_end: VL.end());
2960 Last->setOperations(S);
2961 } else {
2962 // Reorder scalars and build final mask.
2963 Last->Scalars.assign(NumElts: VL.size(), Elt: nullptr);
2964 transform(Range&: ReorderIndices, d_first: Last->Scalars.begin(),
2965 F: [VL](unsigned Idx) -> Value * {
2966 if (Idx >= VL.size())
2967 return UndefValue::get(T: VL.front()->getType());
2968 return VL[Idx];
2969 });
2970 InstructionsState S = getSameOpcode(VL: Last->Scalars, TLI: *TLI);
2971 Last->setOperations(S);
2972 Last->ReorderIndices.append(in_start: ReorderIndices.begin(), in_end: ReorderIndices.end());
2973 }
2974 if (Last->State != TreeEntry::NeedToGather) {
2975 for (Value *V : VL) {
2976 const TreeEntry *TE = getTreeEntry(V);
2977 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
2978 "Scalar already in tree!");
2979 if (TE) {
2980 if (TE != Last)
2981 MultiNodeScalars.try_emplace(Key: V).first->getSecond().push_back(Elt: Last);
2982 continue;
2983 }
2984 ScalarToTreeEntry[V] = Last;
2985 }
2986 // Update the scheduler bundle to point to this TreeEntry.
2987 ScheduleData *BundleMember = *Bundle;
2988 assert((BundleMember || isa<PHINode>(S.MainOp) ||
2989 isVectorLikeInstWithConstOps(S.MainOp) ||
2990 doesNotNeedToSchedule(VL)) &&
2991 "Bundle and VL out of sync");
2992 if (BundleMember) {
2993 for (Value *V : VL) {
2994 if (doesNotNeedToBeScheduled(V))
2995 continue;
2996 if (!BundleMember)
2997 continue;
2998 BundleMember->TE = Last;
2999 BundleMember = BundleMember->NextInBundle;
3000 }
3001 }
3002 assert(!BundleMember && "Bundle and VL out of sync");
3003 } else {
3004 // Build a map for gathered scalars to the nodes where they are used.
3005 bool AllConstsOrCasts = true;
3006 for (Value *V : VL)
3007 if (!isConstant(V)) {
3008 auto *I = dyn_cast<CastInst>(Val: V);
3009 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3010 ValueToGatherNodes.try_emplace(Key: V).first->getSecond().insert(Ptr: Last);
3011 }
3012 if (AllConstsOrCasts)
3013 CastMaxMinBWSizes =
3014 std::make_pair(x: std::numeric_limits<unsigned>::max(), y: 1);
3015 MustGather.insert(I: VL.begin(), E: VL.end());
3016 }
3017
3018 if (UserTreeIdx.UserTE) {
3019 Last->UserTreeIndices.push_back(Elt: UserTreeIdx);
3020 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3021 "Reordering isn't implemented for non-power-of-2 nodes yet");
3022 }
3023 return Last;
3024 }
3025
3026 /// -- Vectorization State --
3027 /// Holds all of the tree entries.
3028 TreeEntry::VecTreeTy VectorizableTree;
3029
3030#ifndef NDEBUG
3031 /// Debug printer.
3032 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3033 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3034 VectorizableTree[Id]->dump();
3035 dbgs() << "\n";
3036 }
3037 }
3038#endif
3039
3040 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(Val: V); }
3041
3042 const TreeEntry *getTreeEntry(Value *V) const {
3043 return ScalarToTreeEntry.lookup(Val: V);
3044 }
3045
3046  /// Check that the operand node of an alternate node does not generate a
3047  /// buildvector sequence. If it does, it is probably not worth building an
3048  /// alternate shuffle when the number of buildvector operands plus the
3049  /// alternate instruction exceeds the number of buildvector instructions.
3050 /// \param S the instructions state of the analyzed values.
3051 /// \param VL list of the instructions with alternate opcodes.
3052 bool areAltOperandsProfitable(const InstructionsState &S,
3053 ArrayRef<Value *> VL) const;
3054
3055 /// Checks if the specified list of the instructions/values can be vectorized
3056 /// and fills required data before actual scheduling of the instructions.
3057 TreeEntry::EntryState getScalarsVectorizationState(
3058 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3059 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3060
3061 /// Maps a specific scalar to its tree entry.
3062 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3063
3064  /// Maps scalars that are used in several vectorized nodes to the list of
3065  /// those nodes.
3066 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3067
3068 /// Maps a value to the proposed vectorizable size.
3069 SmallDenseMap<Value *, unsigned> InstrElementSize;
3070
3071 /// A list of scalars that we found that we need to keep as scalars.
3072 ValueSet MustGather;
3073
3074  /// A map between the vectorized entries and the last instructions in the
3075  /// bundles. The bundles are built in use order, not in the def order of the
3076  /// instructions, so we cannot rely on the last instruction in a bundle also
3077  /// being the last instruction in program order during the vectorization
3078  /// process (the basic blocks are modified); these instructions need to be
3079  /// gathered beforehand.
3080 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3081
3082  /// List of gather nodes that depend on other gather/vector nodes and should
3083  /// be emitted after the vector instruction emission process to correctly
3084  /// handle the order of the vector instructions and shuffles.
3085 SetVector<const TreeEntry *> PostponedGathers;
3086
3087 using ValueToGatherNodesMap =
3088 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3089 ValueToGatherNodesMap ValueToGatherNodes;
3090
3091 /// This POD struct describes one external user in the vectorized tree.
3092 struct ExternalUser {
3093 ExternalUser(Value *S, llvm::User *U, int L)
3094 : Scalar(S), User(U), Lane(L) {}
3095
3096 // Which scalar in our function.
3097 Value *Scalar;
3098
3099 // Which user that uses the scalar.
3100 llvm::User *User;
3101
3102 // Which lane does the scalar belong to.
3103 int Lane;
3104 };
3105 using UserList = SmallVector<ExternalUser, 16>;
3106
3107 /// Checks if two instructions may access the same memory.
3108 ///
3109 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3110 /// is invariant in the calling loop.
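  /// The result is cached for both (Inst1, Inst2) and (Inst2, Inst1), so the
  /// query order does not matter for subsequent lookups.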
3111 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3112 Instruction *Inst2) {
3113 if (!Loc1.Ptr || !isSimple(I: Inst1) || !isSimple(I: Inst2))
3114 return true;
3115 // First check if the result is already in the cache.
3116 AliasCacheKey Key = std::make_pair(x&: Inst1, y&: Inst2);
3117 auto It = AliasCache.find(Val: Key);
3118 if (It != AliasCache.end())
3119 return It->second;
3120 bool Aliased = isModOrRefSet(MRI: BatchAA.getModRefInfo(I: Inst2, OptLoc: Loc1));
3121 // Store the result in the cache.
3122 AliasCache.try_emplace(Key, Args&: Aliased);
3123 AliasCache.try_emplace(Key: std::make_pair(x&: Inst2, y&: Inst1), Args&: Aliased);
3124 return Aliased;
3125 }
3126
3127 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3128
3129 /// Cache for alias results.
3130 /// TODO: consider moving this to the AliasAnalysis itself.
3131 DenseMap<AliasCacheKey, bool> AliasCache;
3132
3133 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3134 // globally through SLP because we don't perform any action which
3135 // invalidates capture results.
3136 BatchAAResults BatchAA;
3137
3138 /// Temporary store for deleted instructions. Instructions will be deleted
3139 /// eventually when the BoUpSLP is destructed. The deferral is required to
3140 /// ensure that there are no incorrect collisions in the AliasCache, which
3141 /// can happen if a new instruction is allocated at the same address as a
3142 /// previously deleted instruction.
3143 DenseSet<Instruction *> DeletedInstructions;
3144
3145  /// Set of instructions that have already been analyzed as reduction roots.
3146 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3147
3148 /// Set of hashes for the list of reduction values already being analyzed.
3149 DenseSet<size_t> AnalyzedReductionVals;
3150
3151  /// Values that have already been analyzed for minimal bitwidth and found to
3152  /// be non-profitable.
3153 DenseSet<Value *> AnalyzedMinBWVals;
3154
3155  /// A list of values that need to be extracted out of the tree.
3156 /// This list holds pairs of (Internal Scalar : External User). External User
3157 /// can be nullptr, it means that this Internal Scalar will be used later,
3158 /// after vectorization.
3159 UserList ExternalUses;
3160
3161  /// A list of GEPs which can be replaced by scalar GEPs instead of
3162 /// extractelement instructions.
3163 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3164
3165 /// Values used only by @llvm.assume calls.
3166 SmallPtrSet<const Value *, 32> EphValues;
3167
3168 /// Holds all of the instructions that we gathered, shuffle instructions and
3169 /// extractelements.
3170 SetVector<Instruction *> GatherShuffleExtractSeq;
3171
3172 /// A list of blocks that we are going to CSE.
3173 DenseSet<BasicBlock *> CSEBlocks;
3174
3175 /// Contains all scheduling relevant data for an instruction.
3176 /// A ScheduleData either represents a single instruction or a member of an
3177 /// instruction bundle (= a group of instructions which is combined into a
3178 /// vector instruction).
3179 struct ScheduleData {
3180 // The initial value for the dependency counters. It means that the
3181 // dependencies are not calculated yet.
3182 enum { InvalidDeps = -1 };
3183
3184 ScheduleData() = default;
3185
3186 void init(int BlockSchedulingRegionID, Value *OpVal) {
3187 FirstInBundle = this;
3188 NextInBundle = nullptr;
3189 NextLoadStore = nullptr;
3190 IsScheduled = false;
3191 SchedulingRegionID = BlockSchedulingRegionID;
3192 clearDependencies();
3193 OpValue = OpVal;
3194 TE = nullptr;
3195 }
3196
3197 /// Verify basic self consistency properties
3198 void verify() {
3199 if (hasValidDependencies()) {
3200 assert(UnscheduledDeps <= Dependencies && "invariant");
3201 } else {
3202 assert(UnscheduledDeps == Dependencies && "invariant");
3203 }
3204
3205 if (IsScheduled) {
3206 assert(isSchedulingEntity() &&
3207 "unexpected scheduled state");
3208 for (const ScheduleData *BundleMember = this; BundleMember;
3209 BundleMember = BundleMember->NextInBundle) {
3210 assert(BundleMember->hasValidDependencies() &&
3211 BundleMember->UnscheduledDeps == 0 &&
3212 "unexpected scheduled state");
3213 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3214 "only bundle is marked scheduled");
3215 }
3216 }
3217
3218 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3219 "all bundle members must be in same basic block");
3220 }
3221
3222 /// Returns true if the dependency information has been calculated.
3223    /// Note that dependency validity can vary between instructions within
3224 /// a single bundle.
3225 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3226
3227 /// Returns true for single instructions and for bundle representatives
3228 /// (= the head of a bundle).
3229 bool isSchedulingEntity() const { return FirstInBundle == this; }
3230
3231 /// Returns true if it represents an instruction bundle and not only a
3232 /// single instruction.
3233 bool isPartOfBundle() const {
3234 return NextInBundle != nullptr || FirstInBundle != this || TE;
3235 }
3236
3237 /// Returns true if it is ready for scheduling, i.e. it has no more
3238 /// unscheduled depending instructions/bundles.
3239 bool isReady() const {
3240 assert(isSchedulingEntity() &&
3241 "can't consider non-scheduling entity for ready list");
3242 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3243 }
3244
3245 /// Modifies the number of unscheduled dependencies for this instruction,
3246 /// and returns the number of remaining dependencies for the containing
3247 /// bundle.
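    /// Typically called with Incr == -1 from schedule() when one of the
    /// dependent instructions counted in \a Dependencies has been scheduled.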
3248 int incrementUnscheduledDeps(int Incr) {
3249 assert(hasValidDependencies() &&
3250 "increment of unscheduled deps would be meaningless");
3251 UnscheduledDeps += Incr;
3252 return FirstInBundle->unscheduledDepsInBundle();
3253 }
3254
3255 /// Sets the number of unscheduled dependencies to the number of
3256 /// dependencies.
3257 void resetUnscheduledDeps() {
3258 UnscheduledDeps = Dependencies;
3259 }
3260
3261 /// Clears all dependency information.
3262 void clearDependencies() {
3263 Dependencies = InvalidDeps;
3264 resetUnscheduledDeps();
3265 MemoryDependencies.clear();
3266 ControlDependencies.clear();
3267 }
3268
3269 int unscheduledDepsInBundle() const {
3270 assert(isSchedulingEntity() && "only meaningful on the bundle");
3271 int Sum = 0;
3272 for (const ScheduleData *BundleMember = this; BundleMember;
3273 BundleMember = BundleMember->NextInBundle) {
3274 if (BundleMember->UnscheduledDeps == InvalidDeps)
3275 return InvalidDeps;
3276 Sum += BundleMember->UnscheduledDeps;
3277 }
3278 return Sum;
3279 }
3280
3281 void dump(raw_ostream &os) const {
3282 if (!isSchedulingEntity()) {
3283 os << "/ " << *Inst;
3284 } else if (NextInBundle) {
3285 os << '[' << *Inst;
3286 ScheduleData *SD = NextInBundle;
3287 while (SD) {
3288 os << ';' << *SD->Inst;
3289 SD = SD->NextInBundle;
3290 }
3291 os << ']';
3292 } else {
3293 os << *Inst;
3294 }
3295 }
3296
3297 Instruction *Inst = nullptr;
3298
3299 /// Opcode of the current instruction in the schedule data.
3300 Value *OpValue = nullptr;
3301
3302 /// The TreeEntry that this instruction corresponds to.
3303 TreeEntry *TE = nullptr;
3304
3305 /// Points to the head in an instruction bundle (and always to this for
3306 /// single instructions).
3307 ScheduleData *FirstInBundle = nullptr;
3308
3309    /// Singly linked list of all instructions in a bundle. Null if it is a
3310 /// single instruction.
3311 ScheduleData *NextInBundle = nullptr;
3312
3313    /// Singly linked list of all memory instructions (e.g. load, store, call)
3314 /// in the block - until the end of the scheduling region.
3315 ScheduleData *NextLoadStore = nullptr;
3316
3317 /// The dependent memory instructions.
3318 /// This list is derived on demand in calculateDependencies().
3319 SmallVector<ScheduleData *, 4> MemoryDependencies;
3320
3321 /// List of instructions which this instruction could be control dependent
3322 /// on. Allowing such nodes to be scheduled below this one could introduce
3323 /// a runtime fault which didn't exist in the original program.
3324 /// ex: this is a load or udiv following a readonly call which inf loops
3325 SmallVector<ScheduleData *, 4> ControlDependencies;
3326
3327 /// This ScheduleData is in the current scheduling region if this matches
3328 /// the current SchedulingRegionID of BlockScheduling.
3329 int SchedulingRegionID = 0;
3330
3331 /// Used for getting a "good" final ordering of instructions.
3332 int SchedulingPriority = 0;
3333
3334    /// The number of dependencies. It consists of the number of users of the
3335    /// instruction plus the number of dependent memory instructions (if any).
3336 /// This value is calculated on demand.
3337 /// If InvalidDeps, the number of dependencies is not calculated yet.
3338 int Dependencies = InvalidDeps;
3339
3340 /// The number of dependencies minus the number of dependencies of scheduled
3341 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3342 /// for scheduling.
3343 /// Note that this is negative as long as Dependencies is not calculated.
3344 int UnscheduledDeps = InvalidDeps;
3345
3346 /// True if this instruction is scheduled (or considered as scheduled in the
3347 /// dry-run).
3348 bool IsScheduled = false;
3349 };
3350
3351#ifndef NDEBUG
3352 friend inline raw_ostream &operator<<(raw_ostream &os,
3353 const BoUpSLP::ScheduleData &SD) {
3354 SD.dump(os);
3355 return os;
3356 }
3357#endif
3358
3359 friend struct GraphTraits<BoUpSLP *>;
3360 friend struct DOTGraphTraits<BoUpSLP *>;
3361
3362 /// Contains all scheduling data for a basic block.
3363  /// It does not schedule instructions which are not memory read/write
3364  /// instructions and whose operands are either constants, arguments, phis,
3365  /// or instructions from other blocks, or whose users are phis or belong to
3366  /// other blocks. The resulting vector instructions can be placed at the
3367  /// beginning of the basic block without scheduling (if the operands do not
3368  /// need to be scheduled) or at the end of the block (if the users are
3369  /// outside of the block). This saves some compile time and memory used by
3370  /// the compiler.
3371  /// ScheduleData is assigned to each instruction between the boundaries of
3372  /// the tree entry, even to those which are not part of the graph. This is
3373  /// required to correctly follow the dependencies between the instructions
3374  /// and to schedule them correctly. ScheduleData is not allocated for
3375  /// instructions which do not require scheduling, such as phis, nodes with
3376  /// only extractelements/insertelements, or nodes whose instructions have
3377  /// uses/operands outside of the block.
3378 struct BlockScheduling {
3379 BlockScheduling(BasicBlock *BB)
3380 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3381
3382 void clear() {
3383 ReadyInsts.clear();
3384 ScheduleStart = nullptr;
3385 ScheduleEnd = nullptr;
3386 FirstLoadStoreInRegion = nullptr;
3387 LastLoadStoreInRegion = nullptr;
3388 RegionHasStackSave = false;
3389
3390 // Reduce the maximum schedule region size by the size of the
3391 // previous scheduling run.
3392 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3393 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3394 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3395 ScheduleRegionSize = 0;
3396
3397 // Make a new scheduling region, i.e. all existing ScheduleData is not
3398 // in the new region yet.
3399 ++SchedulingRegionID;
3400 }
3401
3402 ScheduleData *getScheduleData(Instruction *I) {
3403 if (BB != I->getParent())
3404        // Avoid lookup if it can't possibly be in the map.
3405 return nullptr;
3406 ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
3407 if (SD && isInSchedulingRegion(SD))
3408 return SD;
3409 return nullptr;
3410 }
3411
3412 ScheduleData *getScheduleData(Value *V) {
3413 if (auto *I = dyn_cast<Instruction>(Val: V))
3414 return getScheduleData(I);
3415 return nullptr;
3416 }
3417
3418 ScheduleData *getScheduleData(Value *V, Value *Key) {
3419 if (V == Key)
3420 return getScheduleData(V);
3421 auto I = ExtraScheduleDataMap.find(Val: V);
3422 if (I != ExtraScheduleDataMap.end()) {
3423 ScheduleData *SD = I->second.lookup(Val: Key);
3424 if (SD && isInSchedulingRegion(SD))
3425 return SD;
3426 }
3427 return nullptr;
3428 }
3429
3430 bool isInSchedulingRegion(ScheduleData *SD) const {
3431 return SD->SchedulingRegionID == SchedulingRegionID;
3432 }
3433
3434 /// Marks an instruction as scheduled and puts all dependent ready
3435 /// instructions into the ready-list.
3436 template <typename ReadyListType>
3437 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3438 SD->IsScheduled = true;
3439 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3440
3441 for (ScheduleData *BundleMember = SD; BundleMember;
3442 BundleMember = BundleMember->NextInBundle) {
3443 if (BundleMember->Inst != BundleMember->OpValue)
3444 continue;
3445
3446 // Handle the def-use chain dependencies.
3447
3448 // Decrement the unscheduled counter and insert to ready list if ready.
3449 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3450 doForAllOpcodes(V: I, Action: [&ReadyList](ScheduleData *OpDef) {
3451 if (OpDef && OpDef->hasValidDependencies() &&
3452 OpDef->incrementUnscheduledDeps(Incr: -1) == 0) {
3453 // There are no more unscheduled dependencies after
3454 // decrementing, so we can put the dependent instruction
3455 // into the ready list.
3456 ScheduleData *DepBundle = OpDef->FirstInBundle;
3457 assert(!DepBundle->IsScheduled &&
3458 "already scheduled bundle gets ready");
3459 ReadyList.insert(DepBundle);
3460 LLVM_DEBUG(dbgs()
3461 << "SLP: gets ready (def): " << *DepBundle << "\n");
3462 }
3463 });
3464 };
3465
3466 // If BundleMember is a vector bundle, its operands may have been
3467 // reordered during buildTree(). We therefore need to get its operands
3468 // through the TreeEntry.
3469 if (TreeEntry *TE = BundleMember->TE) {
3470 // Need to search for the lane since the tree entry can be reordered.
3471 int Lane = std::distance(first: TE->Scalars.begin(),
3472 last: find(Range&: TE->Scalars, Val: BundleMember->Inst));
3473 assert(Lane >= 0 && "Lane not set");
3474
3475 // Since vectorization tree is being built recursively this assertion
3476 // ensures that the tree entry has all operands set before reaching
3477 // this code. Couple of exceptions known at the moment are extracts
3478 // where their second (immediate) operand is not added. Since
3479 // immediates do not affect scheduler behavior this is considered
3480 // okay.
3481 auto *In = BundleMember->Inst;
3482 assert(
3483 In &&
3484 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3485 In->getNumOperands() == TE->getNumOperands()) &&
3486 "Missed TreeEntry operands?");
3487 (void)In; // fake use to avoid build failure when assertions disabled
3488
3489 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3490 OpIdx != NumOperands; ++OpIdx)
3491 if (auto *I = dyn_cast<Instruction>(Val: TE->getOperand(OpIdx)[Lane]))
3492 DecrUnsched(I);
3493 } else {
3494 // If BundleMember is a stand-alone instruction, no operand reordering
3495 // has taken place, so we directly access its operands.
3496 for (Use &U : BundleMember->Inst->operands())
3497 if (auto *I = dyn_cast<Instruction>(Val: U.get()))
3498 DecrUnsched(I);
3499 }
3500 // Handle the memory dependencies.
3501 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3502 if (MemoryDepSD->hasValidDependencies() &&
3503 MemoryDepSD->incrementUnscheduledDeps(Incr: -1) == 0) {
3504 // There are no more unscheduled dependencies after decrementing,
3505 // so we can put the dependent instruction into the ready list.
3506 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3507 assert(!DepBundle->IsScheduled &&
3508 "already scheduled bundle gets ready");
3509 ReadyList.insert(DepBundle);
3510 LLVM_DEBUG(dbgs()
3511 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3512 }
3513 }
3514 // Handle the control dependencies.
3515 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3516 if (DepSD->incrementUnscheduledDeps(Incr: -1) == 0) {
3517 // There are no more unscheduled dependencies after decrementing,
3518 // so we can put the dependent instruction into the ready list.
3519 ScheduleData *DepBundle = DepSD->FirstInBundle;
3520 assert(!DepBundle->IsScheduled &&
3521 "already scheduled bundle gets ready");
3522 ReadyList.insert(DepBundle);
3523 LLVM_DEBUG(dbgs()
3524 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3525 }
3526 }
3527 }
3528 }
3529
3530 /// Verify basic self consistency properties of the data structure.
3531 void verify() {
3532 if (!ScheduleStart)
3533 return;
3534
3535 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3536 ScheduleStart->comesBefore(ScheduleEnd) &&
3537 "Not a valid scheduling region?");
3538
3539 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3540 auto *SD = getScheduleData(I);
3541 if (!SD)
3542 continue;
3543 assert(isInSchedulingRegion(SD) &&
3544 "primary schedule data not in window?");
3545 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3546 "entire bundle in window!");
3547 (void)SD;
3548 doForAllOpcodes(V: I, Action: [](ScheduleData *SD) { SD->verify(); });
3549 }
3550
3551 for (auto *SD : ReadyInsts) {
3552 assert(SD->isSchedulingEntity() && SD->isReady() &&
3553 "item in ready list not ready?");
3554 (void)SD;
3555 }
3556 }
3557
3558 void doForAllOpcodes(Value *V,
3559 function_ref<void(ScheduleData *SD)> Action) {
3560 if (ScheduleData *SD = getScheduleData(V))
3561 Action(SD);
3562 auto I = ExtraScheduleDataMap.find(Val: V);
3563 if (I != ExtraScheduleDataMap.end())
3564 for (auto &P : I->second)
3565 if (isInSchedulingRegion(SD: P.second))
3566 Action(P.second);
3567 }
3568
3569 /// Put all instructions into the ReadyList which are ready for scheduling.
3570 template <typename ReadyListType>
3571 void initialFillReadyList(ReadyListType &ReadyList) {
3572 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3573 doForAllOpcodes(V: I, Action: [&](ScheduleData *SD) {
3574 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3575 SD->isReady()) {
3576 ReadyList.insert(SD);
3577 LLVM_DEBUG(dbgs()
3578 << "SLP: initially in ready list: " << *SD << "\n");
3579 }
3580 });
3581 }
3582 }
3583
3584 /// Build a bundle from the ScheduleData nodes corresponding to the
3585 /// scalar instruction for each lane.
3586 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3587
3588 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3589 /// cyclic dependencies. This is only a dry-run, no instructions are
3590 /// actually moved at this stage.
3591 /// \returns the scheduling bundle. The returned Optional value is not
3592 /// std::nullopt if \p VL is allowed to be scheduled.
3593 std::optional<ScheduleData *>
3594 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3595 const InstructionsState &S);
3596
3597 /// Un-bundles a group of instructions.
3598 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3599
3600 /// Allocates schedule data chunk.
3601 ScheduleData *allocateScheduleDataChunks();
3602
3603 /// Extends the scheduling region so that V is inside the region.
3604 /// \returns true if the region size is within the limit.
3605 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3606
3607 /// Initialize the ScheduleData structures for new instructions in the
3608 /// scheduling region.
3609 void initScheduleData(Instruction *FromI, Instruction *ToI,
3610 ScheduleData *PrevLoadStore,
3611 ScheduleData *NextLoadStore);
3612
3613 /// Updates the dependency information of a bundle and of all instructions/
3614 /// bundles which depend on the original bundle.
3615 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3616 BoUpSLP *SLP);
3617
3618    /// Sets all instructions in the scheduling region to un-scheduled.
3619 void resetSchedule();
3620
3621 BasicBlock *BB;
3622
3623 /// Simple memory allocation for ScheduleData.
3624 SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3625
3626 /// The size of a ScheduleData array in ScheduleDataChunks.
3627 int ChunkSize;
3628
3629 /// The allocator position in the current chunk, which is the last entry
3630 /// of ScheduleDataChunks.
3631 int ChunkPos;
3632
3633 /// Attaches ScheduleData to Instruction.
3634 /// Note that the mapping survives during all vectorization iterations, i.e.
3635 /// ScheduleData structures are recycled.
3636 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3637
3638 /// Attaches ScheduleData to Instruction with the leading key.
3639 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3640 ExtraScheduleDataMap;
3641
3642 /// The ready-list for scheduling (only used for the dry-run).
3643 SetVector<ScheduleData *> ReadyInsts;
3644
3645 /// The first instruction of the scheduling region.
3646 Instruction *ScheduleStart = nullptr;
3647
3648 /// The first instruction _after_ the scheduling region.
3649 Instruction *ScheduleEnd = nullptr;
3650
3651 /// The first memory accessing instruction in the scheduling region
3652 /// (can be null).
3653 ScheduleData *FirstLoadStoreInRegion = nullptr;
3654
3655 /// The last memory accessing instruction in the scheduling region
3656 /// (can be null).
3657 ScheduleData *LastLoadStoreInRegion = nullptr;
3658
3659 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3660 /// region? Used to optimize the dependence calculation for the
3661 /// common case where there isn't.
3662 bool RegionHasStackSave = false;
3663
3664 /// The current size of the scheduling region.
3665 int ScheduleRegionSize = 0;
3666
3667 /// The maximum size allowed for the scheduling region.
3668 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3669
3670 /// The ID of the scheduling region. For a new vectorization iteration this
3671 /// is incremented which "removes" all ScheduleData from the region.
3672 /// Make sure that the initial SchedulingRegionID is greater than the
3673 /// initial SchedulingRegionID in ScheduleData (which is 0).
3674 int SchedulingRegionID = 1;
3675 };
3676
3677 /// Attaches the BlockScheduling structures to basic blocks.
3678 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3679
3680 /// Performs the "real" scheduling. Done before vectorization is actually
3681 /// performed in a basic block.
3682 void scheduleBlock(BlockScheduling *BS);
3683
3684 /// List of users to ignore during scheduling and that don't need extracting.
3685 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3686
3687 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3688 /// sorted SmallVectors of unsigned.
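  /// The empty and tombstone keys use the sentinel indices ~1U and ~2U, which
  /// are not expected to occur in real order vectors.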
3689 struct OrdersTypeDenseMapInfo {
3690 static OrdersType getEmptyKey() {
3691 OrdersType V;
3692 V.push_back(Elt: ~1U);
3693 return V;
3694 }
3695
3696 static OrdersType getTombstoneKey() {
3697 OrdersType V;
3698 V.push_back(Elt: ~2U);
3699 return V;
3700 }
3701
3702 static unsigned getHashValue(const OrdersType &V) {
3703 return static_cast<unsigned>(hash_combine_range(first: V.begin(), last: V.end()));
3704 }
3705
3706 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3707 return LHS == RHS;
3708 }
3709 };
3710
3711 // Analysis and block reference.
3712 Function *F;
3713 ScalarEvolution *SE;
3714 TargetTransformInfo *TTI;
3715 TargetLibraryInfo *TLI;
3716 LoopInfo *LI;
3717 DominatorTree *DT;
3718 AssumptionCache *AC;
3719 DemandedBits *DB;
3720 const DataLayout *DL;
3721 OptimizationRemarkEmitter *ORE;
3722
3723 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3724 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3725
3726 /// Instruction builder to construct the vectorized tree.
3727 IRBuilder<TargetFolder> Builder;
3728
3729 /// A map of scalar integer values to the smallest bit width with which they
3730 /// can legally be represented. The values map to (width, signed) pairs,
3731 /// where "width" indicates the minimum bit width and "signed" is True if the
3732 /// value must be signed-extended, rather than zero-extended, back to its
3733 /// original width.
3734 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
3735
3736 /// Final size of the reduced vector, if the current graph represents the
3737 /// input for the reduction and it was possible to narrow the size of the
3738 /// reduction.
3739 unsigned ReductionBitWidth = 0;
3740
3741 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
3742 /// type sizes, used in the tree.
3743 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
3744
3745  /// Indices of the vectorized nodes, which are supposed to be the roots of a
3746  /// new bitwidth analysis attempt, like trunc, IToFP or ICmp.
3747 DenseSet<unsigned> ExtraBitWidthNodes;
3748};
3749
3750} // end namespace slpvectorizer
3751
3752template <> struct GraphTraits<BoUpSLP *> {
3753 using TreeEntry = BoUpSLP::TreeEntry;
3754
3755 /// NodeRef has to be a pointer per the GraphWriter.
3756 using NodeRef = TreeEntry *;
3757
3758 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3759
3760 /// Add the VectorizableTree to the index iterator to be able to return
3761 /// TreeEntry pointers.
3762 struct ChildIteratorType
3763 : public iterator_adaptor_base<
3764 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3765 ContainerTy &VectorizableTree;
3766
3767 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
3768 ContainerTy &VT)
3769 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3770
3771 NodeRef operator*() { return I->UserTE; }
3772 };
3773
3774 static NodeRef getEntryNode(BoUpSLP &R) {
3775 return R.VectorizableTree[0].get();
3776 }
3777
3778 static ChildIteratorType child_begin(NodeRef N) {
3779 return {N->UserTreeIndices.begin(), N->Container};
3780 }
3781
3782 static ChildIteratorType child_end(NodeRef N) {
3783 return {N->UserTreeIndices.end(), N->Container};
3784 }
3785
3786 /// For the node iterator we just need to turn the TreeEntry iterator into a
3787 /// TreeEntry* iterator so that it dereferences to NodeRef.
3788 class nodes_iterator {
3789 using ItTy = ContainerTy::iterator;
3790 ItTy It;
3791
3792 public:
3793 nodes_iterator(const ItTy &It2) : It(It2) {}
3794 NodeRef operator*() { return It->get(); }
3795 nodes_iterator operator++() {
3796 ++It;
3797 return *this;
3798 }
3799 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3800 };
3801
3802 static nodes_iterator nodes_begin(BoUpSLP *R) {
3803 return nodes_iterator(R->VectorizableTree.begin());
3804 }
3805
3806 static nodes_iterator nodes_end(BoUpSLP *R) {
3807 return nodes_iterator(R->VectorizableTree.end());
3808 }
3809
3810 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3811};
3812
3813template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3814 using TreeEntry = BoUpSLP::TreeEntry;
3815
3816 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3817
3818 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3819 std::string Str;
3820 raw_string_ostream OS(Str);
3821 OS << Entry->Idx << ".\n";
3822 if (isSplat(VL: Entry->Scalars))
3823 OS << "<splat> ";
3824 for (auto *V : Entry->Scalars) {
3825 OS << *V;
3826 if (llvm::any_of(Range: R->ExternalUses, P: [&](const BoUpSLP::ExternalUser &EU) {
3827 return EU.Scalar == V;
3828 }))
3829 OS << " <extract>";
3830 OS << "\n";
3831 }
3832 return Str;
3833 }
3834
3835 static std::string getNodeAttributes(const TreeEntry *Entry,
3836 const BoUpSLP *) {
3837 if (Entry->State == TreeEntry::NeedToGather)
3838 return "color=red";
3839 if (Entry->State == TreeEntry::ScatterVectorize ||
3840 Entry->State == TreeEntry::StridedVectorize)
3841 return "color=blue";
3842 return "";
3843 }
3844};
3845
3846} // end namespace llvm
3847
3848BoUpSLP::~BoUpSLP() {
3849 SmallVector<WeakTrackingVH> DeadInsts;
3850 for (auto *I : DeletedInstructions) {
3851 for (Use &U : I->operands()) {
3852 auto *Op = dyn_cast<Instruction>(Val: U.get());
3853 if (Op && !DeletedInstructions.count(V: Op) && Op->hasOneUser() &&
3854 wouldInstructionBeTriviallyDead(I: Op, TLI))
3855 DeadInsts.emplace_back(Args&: Op);
3856 }
3857 I->dropAllReferences();
3858 }
3859 for (auto *I : DeletedInstructions) {
3860 assert(I->use_empty() &&
3861 "trying to erase instruction with users.");
3862 I->eraseFromParent();
3863 }
3864
3865 // Cleanup any dead scalar code feeding the vectorized instructions
3866 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
3867
3868#ifdef EXPENSIVE_CHECKS
3869 // If we could guarantee that this call is not extremely slow, we could
3870 // remove the ifdef limitation (see PR47712).
3871 assert(!verifyFunction(*F, &dbgs()));
3872#endif
3873}
3874
3875/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3876/// contains the original mask for the scalars reused in the node. The procedure
3877/// transforms this mask in accordance with the given \p Mask.
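/// In effect, each original element Reuses[I] is moved to position Mask[I];
/// positions not written through the mask keep their previous value.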
3878static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3879 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3880 "Expected non-empty mask.");
3881 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3882 Prev.swap(RHS&: Reuses);
3883 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3884 if (Mask[I] != PoisonMaskElem)
3885 Reuses[Mask[I]] = Prev[I];
3886}
3887
3888/// Reorders the given \p Order according to the given \p Mask. \p Order is
3889/// the original order of the scalars. The procedure transforms the provided
3890/// order in accordance with the given \p Mask. If the resulting \p Order is
3891/// just an identity order, \p Order is cleared.
3892static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
3893 bool BottomOrder = false) {
3894 assert(!Mask.empty() && "Expected non-empty mask.");
3895 unsigned Sz = Mask.size();
3896 if (BottomOrder) {
3897 SmallVector<unsigned> PrevOrder;
3898 if (Order.empty()) {
3899 PrevOrder.resize(N: Sz);
3900 std::iota(first: PrevOrder.begin(), last: PrevOrder.end(), value: 0);
3901 } else {
3902 PrevOrder.swap(RHS&: Order);
3903 }
3904 Order.assign(NumElts: Sz, Elt: Sz);
3905 for (unsigned I = 0; I < Sz; ++I)
3906 if (Mask[I] != PoisonMaskElem)
3907 Order[I] = PrevOrder[Mask[I]];
3908 if (all_of(Range: enumerate(First&: Order), P: [&](const auto &Data) {
3909 return Data.value() == Sz || Data.index() == Data.value();
3910 })) {
3911 Order.clear();
3912 return;
3913 }
3914 fixupOrderingIndices(Order);
3915 return;
3916 }
3917 SmallVector<int> MaskOrder;
3918 if (Order.empty()) {
3919 MaskOrder.resize(N: Sz);
3920 std::iota(first: MaskOrder.begin(), last: MaskOrder.end(), value: 0);
3921 } else {
3922 inversePermutation(Indices: Order, Mask&: MaskOrder);
3923 }
3924 reorderReuses(Reuses&: MaskOrder, Mask);
3925 if (ShuffleVectorInst::isIdentityMask(Mask: MaskOrder, NumSrcElts: Sz)) {
3926 Order.clear();
3927 return;
3928 }
3929 Order.assign(NumElts: Sz, Elt: Sz);
3930 for (unsigned I = 0; I < Sz; ++I)
3931 if (MaskOrder[I] != PoisonMaskElem)
3932 Order[MaskOrder[I]] = I;
3933 fixupOrderingIndices(Order);
3934}
3935
3936std::optional<BoUpSLP::OrdersType>
3937BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3938 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3939 // Try to find subvector extract/insert patterns and reorder only such
3940 // patterns.
3941 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
3942 Type *ScalarTy = GatheredScalars.front()->getType();
3943 int NumScalars = GatheredScalars.size();
3944 if (!isValidElementType(Ty: ScalarTy))
3945 return std::nullopt;
3946 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: NumScalars);
3947 int NumParts = TTI->getNumberOfParts(Tp: VecTy);
3948 if (NumParts == 0 || NumParts >= NumScalars)
3949 NumParts = 1;
3950 SmallVector<int> ExtractMask;
3951 SmallVector<int> Mask;
3952 SmallVector<SmallVector<const TreeEntry *>> Entries;
3953 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
3954 tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
3955 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
3956 isGatherShuffledEntry(TE: &TE, VL: GatheredScalars, Mask, Entries, NumParts,
3957 /*ForOrder=*/true);
3958 // No shuffled operands - ignore.
3959 if (GatherShuffles.empty() && ExtractShuffles.empty())
3960 return std::nullopt;
3961 OrdersType CurrentOrder(NumScalars, NumScalars);
3962 if (GatherShuffles.size() == 1 &&
3963 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
3964 Entries.front().front()->isSame(VL: TE.Scalars)) {
3965 // Perfect match in the graph, will reuse the previously vectorized
3966 // node. Cost is 0.
3967 std::iota(first: CurrentOrder.begin(), last: CurrentOrder.end(), value: 0);
3968 return CurrentOrder;
3969 }
3970 auto IsSplatMask = [](ArrayRef<int> Mask) {
3971 int SingleElt = PoisonMaskElem;
3972 return all_of(Range&: Mask, P: [&](int I) {
3973 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
3974 SingleElt = I;
3975 return I == PoisonMaskElem || I == SingleElt;
3976 });
3977 };
3978 // Exclusive broadcast mask - ignore.
3979 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
3980 (Entries.size() != 1 ||
3981 Entries.front().front()->ReorderIndices.empty())) ||
3982 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
3983 return std::nullopt;
3984 SmallBitVector ShuffledSubMasks(NumParts);
3985 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
3986 ArrayRef<int> Mask, int PartSz, int NumParts,
3987 function_ref<unsigned(unsigned)> GetVF) {
3988 for (int I : seq<int>(Begin: 0, End: NumParts)) {
3989 if (ShuffledSubMasks.test(Idx: I))
3990 continue;
3991 const int VF = GetVF(I);
3992 if (VF == 0)
3993 continue;
3994 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(N: I * PartSz, M: PartSz);
3995 // Shuffle of at least 2 vectors - ignore.
3996 if (any_of(Range&: Slice, P: [&](int I) { return I != NumScalars; })) {
3997 std::fill(first: Slice.begin(), last: Slice.end(), value: NumScalars);
3998 ShuffledSubMasks.set(I);
3999 continue;
4000 }
4001 // Try to include as much elements from the mask as possible.
4002 int FirstMin = INT_MAX;
4003      bool SecondVecFound = false;
4004 for (int K : seq<int>(Begin: 0, End: PartSz)) {
4005 int Idx = Mask[I * PartSz + K];
4006 if (Idx == PoisonMaskElem) {
4007 Value *V = GatheredScalars[I * PartSz + K];
4008 if (isConstant(V) && !isa<PoisonValue>(Val: V)) {
4009 SecondVecFound = true;
4010 break;
4011 }
4012 continue;
4013 }
4014 if (Idx < VF) {
4015 if (FirstMin > Idx)
4016 FirstMin = Idx;
4017 } else {
4018 SecondVecFound = true;
4019 break;
4020 }
4021 }
4022 FirstMin = (FirstMin / PartSz) * PartSz;
4023 // Shuffle of at least 2 vectors - ignore.
4024 if (SecondVecFound) {
4025 std::fill(first: Slice.begin(), last: Slice.end(), value: NumScalars);
4026 ShuffledSubMasks.set(I);
4027 continue;
4028 }
4029 for (int K : seq<int>(Begin: 0, End: PartSz)) {
4030 int Idx = Mask[I * PartSz + K];
4031 if (Idx == PoisonMaskElem)
4032 continue;
4033 Idx -= FirstMin;
4034 if (Idx >= PartSz) {
4035 SecondVecFound = true;
4036 break;
4037 }
4038 if (CurrentOrder[I * PartSz + Idx] >
4039 static_cast<unsigned>(I * PartSz + K) &&
4040 CurrentOrder[I * PartSz + Idx] !=
4041 static_cast<unsigned>(I * PartSz + Idx))
4042 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4043 }
4044 // Shuffle of at least 2 vectors - ignore.
4045 if (SecondVecFound) {
4046 std::fill(first: Slice.begin(), last: Slice.end(), value: NumScalars);
4047 ShuffledSubMasks.set(I);
4048 continue;
4049 }
4050 }
4051 };
4052 int PartSz = NumScalars / NumParts;
4053 if (!ExtractShuffles.empty())
4054 TransformMaskToOrder(
4055 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4056 if (!ExtractShuffles[I])
4057 return 0U;
4058 unsigned VF = 0;
4059 for (unsigned Idx : seq<unsigned>(Begin: 0, End: PartSz)) {
4060 int K = I * PartSz + Idx;
4061 if (ExtractMask[K] == PoisonMaskElem)
4062 continue;
4063 if (!TE.ReuseShuffleIndices.empty())
4064 K = TE.ReuseShuffleIndices[K];
4065 if (!TE.ReorderIndices.empty())
4066 K = std::distance(first: TE.ReorderIndices.begin(),
4067 last: find(Range: TE.ReorderIndices, Val: K));
4068 auto *EI = dyn_cast<ExtractElementInst>(Val: TE.Scalars[K]);
4069 if (!EI)
4070 continue;
4071 VF = std::max(a: VF, b: cast<VectorType>(Val: EI->getVectorOperandType())
4072 ->getElementCount()
4073 .getKnownMinValue());
4074 }
4075 return VF;
4076 });
4077 // Check special corner case - single shuffle of the same entry.
4078 if (GatherShuffles.size() == 1 && NumParts != 1) {
4079 if (ShuffledSubMasks.any())
4080 return std::nullopt;
4081 PartSz = NumScalars;
4082 NumParts = 1;
4083 }
4084 if (!Entries.empty())
4085 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4086 if (!GatherShuffles[I])
4087 return 0U;
4088 return std::max(a: Entries[I].front()->getVectorFactor(),
4089 b: Entries[I].back()->getVectorFactor());
4090 });
4091 int NumUndefs =
4092 count_if(Range&: CurrentOrder, P: [&](int Idx) { return Idx == NumScalars; });
4093 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4094 return std::nullopt;
4095 return std::move(CurrentOrder);
4096}
4097
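/// Returns true if \p Ptr1 and \p Ptr2 are based on the same underlying object
/// and both are simple two-operand GEPs; when \p CompareOpcodes is set, their
/// index operands must additionally either both be constants or form a
/// compatible (same or alternate) opcode pair, e.g. p[i] and p[j] indexing the
/// same base pointer p.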
4098static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4099 const TargetLibraryInfo &TLI,
4100 bool CompareOpcodes = true) {
4101 if (getUnderlyingObject(V: Ptr1) != getUnderlyingObject(V: Ptr2))
4102 return false;
4103 auto *GEP1 = dyn_cast<GetElementPtrInst>(Val: Ptr1);
4104 if (!GEP1)
4105 return false;
4106 auto *GEP2 = dyn_cast<GetElementPtrInst>(Val: Ptr2);
4107 if (!GEP2)
4108 return false;
4109 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4110 ((isConstant(V: GEP1->getOperand(i_nocapture: 1)) &&
4111 isConstant(V: GEP2->getOperand(i_nocapture: 1))) ||
4112 !CompareOpcodes ||
4113 getSameOpcode(VL: {GEP1->getOperand(i_nocapture: 1), GEP2->getOperand(i_nocapture: 1)}, TLI)
4114 .getOpcode());
4115}
4116
4117/// Calculates the minimal alignment common to all values in \p VL.
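/// For example, values aligned to 16, 8 and 4 bytes yield a common alignment
/// of 4 bytes.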
4118template <typename T>
4119static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4120 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4121 for (Value *V : VL.drop_front())
4122 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4123 return CommonAlignment;
4124}
4125
4126/// Check if \p Order represents reverse order.
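/// For example, {3, 2, 1, 0} is a reverse order for 4 elements; entries equal
/// to the order size are treated as unspecified slots and accepted as well.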
4127static bool isReverseOrder(ArrayRef<unsigned> Order) {
4128 unsigned Sz = Order.size();
4129 return !Order.empty() && all_of(Range: enumerate(First&: Order), P: [&](const auto &Pair) {
4130 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4131 });
4132}
4133
4134/// Checks if the provided list of pointers \p PointerOps represents strided
4135/// pointers for type \p ElemTy. If they do not, std::nullopt is returned.
4136/// Otherwise, if \p Inst is not specified, an engaged optional holding nullptr
4137/// is returned to show that the pointers represent strided pointers. If \p Inst
4138/// is specified, the runtime stride is materialized before the given \p Inst.
4139/// \returns std::nullopt if the pointers do not have a runtime stride;
4140/// nullptr or the actual stride value otherwise.
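/// For example (illustrative), pointers to elements {p[0], p[n], p[2*n], p[3*n]}
/// of an i32 array, where n is not a compile-time constant, are recognized as
/// strided with a runtime stride of n elements.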
4141static std::optional<Value *>
4142calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4143 const DataLayout &DL, ScalarEvolution &SE,
4144 SmallVectorImpl<unsigned> &SortedIndices,
4145 Instruction *Inst = nullptr) {
4146 SmallVector<const SCEV *> SCEVs;
4147 const SCEV *PtrSCEVLowest = nullptr;
4148 const SCEV *PtrSCEVHighest = nullptr;
4149 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4150 // addresses).
4151 for (Value *Ptr : PointerOps) {
4152 const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
4153 if (!PtrSCEV)
4154 return std::nullopt;
4155 SCEVs.push_back(Elt: PtrSCEV);
4156 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4157 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4158 continue;
4159 }
4160 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
4161 if (isa<SCEVCouldNotCompute>(Val: Diff))
4162 return std::nullopt;
4163 if (Diff->isNonConstantNegative()) {
4164 PtrSCEVLowest = PtrSCEV;
4165 continue;
4166 }
4167 const SCEV *Diff1 = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEV);
4168 if (isa<SCEVCouldNotCompute>(Val: Diff1))
4169 return std::nullopt;
4170 if (Diff1->isNonConstantNegative()) {
4171 PtrSCEVHighest = PtrSCEV;
4172 continue;
4173 }
4174 }
4175 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4176 const SCEV *Dist = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEVLowest);
4177 if (isa<SCEVCouldNotCompute>(Val: Dist))
4178 return std::nullopt;
4179 int Size = DL.getTypeStoreSize(Ty: ElemTy);
4180 auto TryGetStride = [&](const SCEV *Dist,
4181 const SCEV *Multiplier) -> const SCEV * {
4182 if (const auto *M = dyn_cast<SCEVMulExpr>(Val: Dist)) {
4183 if (M->getOperand(i: 0) == Multiplier)
4184 return M->getOperand(i: 1);
4185 if (M->getOperand(i: 1) == Multiplier)
4186 return M->getOperand(i: 0);
4187 return nullptr;
4188 }
4189 if (Multiplier == Dist)
4190 return SE.getConstant(Ty: Dist->getType(), V: 1);
4191 return SE.getUDivExactExpr(LHS: Dist, RHS: Multiplier);
4192 };
  // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4194 const SCEV *Stride = nullptr;
4195 if (Size != 1 || SCEVs.size() > 2) {
4196 const SCEV *Sz = SE.getConstant(Ty: Dist->getType(), V: Size * (SCEVs.size() - 1));
4197 Stride = TryGetStride(Dist, Sz);
4198 if (!Stride)
4199 return std::nullopt;
4200 }
4201 if (!Stride || isa<SCEVConstant>(Val: Stride))
4202 return std::nullopt;
  // Iterate through all pointers and check if all distances are
  // unique multiples of Stride.
4205 using DistOrdPair = std::pair<int64_t, int>;
4206 auto Compare = llvm::less_first();
4207 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4208 int Cnt = 0;
4209 bool IsConsecutive = true;
4210 for (const SCEV *PtrSCEV : SCEVs) {
4211 unsigned Dist = 0;
4212 if (PtrSCEV != PtrSCEVLowest) {
4213 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
4214 const SCEV *Coeff = TryGetStride(Diff, Stride);
4215 if (!Coeff)
4216 return std::nullopt;
4217 const auto *SC = dyn_cast<SCEVConstant>(Val: Coeff);
4218 if (!SC || isa<SCEVCouldNotCompute>(Val: SC))
4219 return std::nullopt;
4220 if (!SE.getMinusSCEV(LHS: PtrSCEV, RHS: SE.getAddExpr(LHS: PtrSCEVLowest,
4221 RHS: SE.getMulExpr(LHS: Stride, RHS: SC)))
4222 ->isZero())
4223 return std::nullopt;
4224 Dist = SC->getAPInt().getZExtValue();
4225 }
4226 // If the strides are not the same or repeated, we can't vectorize.
4227 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4228 return std::nullopt;
4229 auto Res = Offsets.emplace(args&: Dist, args&: Cnt);
4230 if (!Res.second)
4231 return std::nullopt;
4232 // Consecutive order if the inserted element is the last one.
4233 IsConsecutive = IsConsecutive && std::next(x: Res.first) == Offsets.end();
4234 ++Cnt;
4235 }
4236 if (Offsets.size() != SCEVs.size())
4237 return std::nullopt;
4238 SortedIndices.clear();
4239 if (!IsConsecutive) {
4240 // Fill SortedIndices array only if it is non-consecutive.
4241 SortedIndices.resize(N: PointerOps.size());
4242 Cnt = 0;
4243 for (const std::pair<int64_t, int> &Pair : Offsets) {
4244 SortedIndices[Cnt] = Pair.second;
4245 ++Cnt;
4246 }
4247 }
4248 if (!Inst)
4249 return nullptr;
4250 SCEVExpander Expander(SE, DL, "strided-load-vec");
4251 return Expander.expandCodeFor(SH: Stride, Ty: Stride->getType(), I: Inst);
4252}
4253
4254BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4255 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4256 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
  // Check that a vectorized load would load the same memory as a scalar
  // load. For example, we don't want to vectorize loads that are smaller
  // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>},
  // LLVM treats loading/storing it as an i8 struct. If we vectorize
  // loads/stores from such a struct, we read/write packed bits, disagreeing
  // with the unvectorized version.
4263 Type *ScalarTy = VL0->getType();
4264
4265 if (DL->getTypeSizeInBits(Ty: ScalarTy) != DL->getTypeAllocSizeInBits(Ty: ScalarTy))
4266 return LoadsState::Gather;
4267
4268 // Make sure all loads in the bundle are simple - we can't vectorize
4269 // atomic or volatile loads.
4270 PointerOps.clear();
4271 const unsigned Sz = VL.size();
4272 PointerOps.resize(N: Sz);
4273 auto *POIter = PointerOps.begin();
4274 for (Value *V : VL) {
4275 auto *L = cast<LoadInst>(Val: V);
4276 if (!L->isSimple())
4277 return LoadsState::Gather;
4278 *POIter = L->getPointerOperand();
4279 ++POIter;
4280 }
4281
4282 Order.clear();
4283 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: Sz);
4284 // Check the order of pointer operands or that all pointers are the same.
4285 bool IsSorted = sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order);
4286 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4287 if (!Order.empty() && !isPowerOf2_32(Value: VL.size())) {
4288 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4289 "supported with VectorizeNonPowerOf2");
4290 return LoadsState::Gather;
4291 }
4292
4293 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4294 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(Ty: VecTy) &&
4295 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment) &&
4296 calculateRtStride(PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
4297 return LoadsState::StridedVectorize;
4298 if (IsSorted || all_of(Range&: PointerOps, P: [&](Value *P) {
4299 return arePointersCompatible(Ptr1: P, Ptr2: PointerOps.front(), TLI: *TLI);
4300 })) {
4301 if (IsSorted) {
4302 Value *Ptr0;
4303 Value *PtrN;
4304 if (Order.empty()) {
4305 Ptr0 = PointerOps.front();
4306 PtrN = PointerOps.back();
4307 } else {
4308 Ptr0 = PointerOps[Order.front()];
4309 PtrN = PointerOps[Order.back()];
4310 }
4311 std::optional<int> Diff =
4312 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
4313 // Check that the sorted loads are consecutive.
4314 if (static_cast<unsigned>(*Diff) == Sz - 1)
4315 return LoadsState::Vectorize;
      // Quick check whether this may be a strided access: the total distance
      // must be divisible by the number of gaps (Sz - 1).
4317 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
      // Try to generate a strided load node if:
      // 1. A target with strided load support is detected.
      // 2. The number of loads is greater than MinProfitableStridedLoads,
      // or the potential stride <= MaxProfitableLoadStride and the
      // potential stride is a power-of-2 (to avoid perf regressions for the
      // very small number of loads) and the max distance > the number of
      // loads, or the potential stride is -1.
      // 3. The loads are ordered, or the number of unordered loads <=
      // MaxProfitableUnorderedLoads, or the loads are in reversed order.
      // (This check is to avoid extra costs for very expensive shuffles.)
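      // Illustrative example (assuming the thresholds above are satisfied):
      // 4 x i32 loads at byte offsets {0, 8, 16, 24} give *Diff == 6 elements
      // and Stride == 2, so they may be emitted as a single strided load.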
4328 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4329 (static_cast<unsigned>(std::abs(x: *Diff)) <=
4330 MaxProfitableLoadStride * Sz &&
4331 isPowerOf2_32(Value: std::abs(x: *Diff)))) &&
4332 static_cast<unsigned>(std::abs(x: *Diff)) > Sz) ||
4333 *Diff == -(static_cast<int>(Sz) - 1))) {
4334 int Stride = *Diff / static_cast<int>(Sz - 1);
4335 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4336 Align Alignment =
4337 cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()])
4338 ->getAlign();
4339 if (TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment)) {
            // Iterate through all pointers and check if all distances are
            // unique multiples of Stride.
4342 SmallSet<int, 4> Dists;
4343 for (Value *Ptr : PointerOps) {
4344 int Dist = 0;
4345 if (Ptr == PtrN)
4346 Dist = *Diff;
4347 else if (Ptr != Ptr0)
4348 Dist =
4349 *getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: Ptr, DL: *DL, SE&: *SE);
4350 // If the strides are not the same or repeated, we can't
4351 // vectorize.
4352 if (((Dist / Stride) * Stride) != Dist ||
4353 !Dists.insert(V: Dist).second)
4354 break;
4355 }
4356 if (Dists.size() == Sz)
4357 return LoadsState::StridedVectorize;
4358 }
4359 }
4360 }
4361 }
4362 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4363 unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy);
4364 unsigned MinVF = getMinVF(Sz);
4365 unsigned MaxVF = std::max<unsigned>(a: bit_floor(Value: VL.size() / 2), b: MinVF);
4366 MaxVF = std::min(a: getMaximumVF(ElemWidth: Sz, Opcode: Instruction::Load), b: MaxVF);
4367 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4368 unsigned VectorizedCnt = 0;
4369 SmallVector<LoadsState> States;
4370 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4371 Cnt += VF, ++VectorizedCnt) {
4372 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
4373 SmallVector<unsigned> Order;
4374 SmallVector<Value *> PointerOps;
4375 LoadsState LS =
4376 canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, PointerOps,
4377 /*TryRecursiveCheck=*/false);
4378 // Check that the sorted loads are consecutive.
4379 if (LS == LoadsState::Gather)
4380 break;
        // If reordering is needed, treat it as a high-cost masked gather for
        // now.
4382 if ((LS == LoadsState::Vectorize ||
4383 LS == LoadsState::StridedVectorize) &&
4384 !Order.empty() && !isReverseOrder(Order))
4385 LS = LoadsState::ScatterVectorize;
4386 States.push_back(Elt: LS);
4387 }
      // Can be vectorized later as a series of loads/insertelements.
4389 if (VectorizedCnt == VL.size() / VF) {
        // Compare masked gather cost and loads + insertsubvector costs.
4391 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4392 InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost(
4393 Opcode: Instruction::Load, DataTy: VecTy,
4394 Ptr: cast<LoadInst>(Val: VL0)->getPointerOperand(),
4395 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
4396 InstructionCost VecLdCost = 0;
4397 auto *SubVecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VF);
4398 for (auto [I, LS] : enumerate(First&: States)) {
4399 auto *LI0 = cast<LoadInst>(Val: VL[I * VF]);
4400 switch (LS) {
4401 case LoadsState::Vectorize:
4402 VecLdCost += TTI.getMemoryOpCost(
4403 Opcode: Instruction::Load, Src: SubVecTy, Alignment: LI0->getAlign(),
4404 AddressSpace: LI0->getPointerAddressSpace(), CostKind,
4405 OpdInfo: TTI::OperandValueInfo());
4406 break;
4407 case LoadsState::StridedVectorize:
4408 VecLdCost += TTI.getStridedMemoryOpCost(
4409 Opcode: Instruction::Load, DataTy: SubVecTy, Ptr: LI0->getPointerOperand(),
4410 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
4411 break;
4412 case LoadsState::ScatterVectorize:
4413 VecLdCost += TTI.getGatherScatterOpCost(
4414 Opcode: Instruction::Load, DataTy: SubVecTy, Ptr: LI0->getPointerOperand(),
4415 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
4416 break;
4417 case LoadsState::Gather:
4418 llvm_unreachable(
4419 "Expected only consecutive, strided or masked gather loads.");
4420 }
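          // E.g., with VL.size() == 8, VF == 4 and I == 1, the mask is
          // <0, 1, 2, 3, 8, 9, 10, 11>, i.e. insert the second sub-vector
          // into the upper half of the result.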
4421 SmallVector<int> ShuffleMask(VL.size());
4422 for (int Idx : seq<int>(Begin: 0, End: VL.size()))
4423 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4424 VecLdCost +=
4425 TTI.getShuffleCost(Kind: TTI ::SK_InsertSubvector, Tp: VecTy,
4426 Mask: ShuffleMask, CostKind, Index: I * VF, SubTp: SubVecTy);
4427 }
        // If the masked gather cost is higher, it is better to vectorize as
        // loads + shuffles, so consider it a gather node. It will be better
        // estimated later.
4431 if (MaskedGatherCost > VecLdCost)
4432 return true;
4433 }
4434 }
4435 return false;
4436 };
    // TODO: need to improve analysis of the pointers; if not all of them are
    // GEPs, or they have more than 2 operands, we end up with a gather node,
    // which just increases the cost.
4440 Loop *L = LI->getLoopFor(BB: cast<LoadInst>(Val: VL0)->getParent());
4441 bool ProfitableGatherPointers =
4442 L && Sz > 2 &&
4443 static_cast<unsigned>(count_if(Range&: PointerOps, P: [L](Value *V) {
4444 return L->isLoopInvariant(V);
4445 })) <= Sz / 2;
4446 if (ProfitableGatherPointers || all_of(Range&: PointerOps, P: [IsSorted](Value *P) {
4447 auto *GEP = dyn_cast<GetElementPtrInst>(Val: P);
4448 return (IsSorted && !GEP && doesNotNeedToBeScheduled(V: P)) ||
4449 (GEP && GEP->getNumOperands() == 2 &&
4450 isa<Constant, Instruction>(Val: GEP->getOperand(i_nocapture: 1)));
4451 })) {
4452 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4453 if (TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) &&
4454 !TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment)) {
4455 // Check if potential masked gather can be represented as series
4456 // of loads + insertsubvectors.
4457 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
          // If the masked gather cost is higher, it is better to vectorize as
          // loads + shuffles, so consider it a gather node. It will be better
          // estimated later.
4461 return LoadsState::Gather;
4462 }
4463 return LoadsState::ScatterVectorize;
4464 }
4465 }
4466 }
4467
4468 return LoadsState::Gather;
4469}
4470
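/// Groups the pointers in \p VL by their underlying base object and sorts each
/// group by distance from that base. On success, \p SortedIndices is filled
/// with a permutation that places pointers with the same base next to each
/// other; returns false if the pointers cannot be usefully clustered or no
/// base ends up with a consecutive run of offsets.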
4471static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4472 const DataLayout &DL, ScalarEvolution &SE,
4473 SmallVectorImpl<unsigned> &SortedIndices) {
4474 assert(llvm::all_of(
4475 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4476 "Expected list of pointer operands.");
  // Map from bases to a vector of (Ptr, Offset, OrigIdx). Each Ptr is inserted
  // into the vector of its base; the vectors are then sorted by offset, and
  // the returned indices keep pointers with the same base next to one another.
4480 MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4481 Bases[VL[0]].push_back(Elt: std::make_tuple(args: VL[0], args: 0U, args: 0U));
4482
4483 unsigned Cnt = 1;
4484 for (Value *Ptr : VL.drop_front()) {
4485 bool Found = any_of(Range&: Bases, P: [&](auto &Base) {
4486 std::optional<int> Diff =
4487 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4488 /*StrictCheck=*/true);
4489 if (!Diff)
4490 return false;
4491
4492 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4493 return true;
4494 });
4495
4496 if (!Found) {
4497 // If we haven't found enough to usefully cluster, return early.
4498 if (Bases.size() > VL.size() / 2 - 1)
4499 return false;
4500
4501 // Not found already - add a new Base
4502 Bases[Ptr].emplace_back(Args&: Ptr, Args: 0, Args: Cnt++);
4503 }
4504 }
4505
  // For each of the bases sort the pointers by Offset and check if the
  // pointers of any base become consecutive.
4508 bool AnyConsecutive = false;
4509 for (auto &Base : Bases) {
4510 auto &Vec = Base.second;
4511 if (Vec.size() > 1) {
4512 llvm::stable_sort(Range&: Vec, C: [](const std::tuple<Value *, int, unsigned> &X,
4513 const std::tuple<Value *, int, unsigned> &Y) {
4514 return std::get<1>(t: X) < std::get<1>(t: Y);
4515 });
4516 int InitialOffset = std::get<1>(t&: Vec[0]);
4517 AnyConsecutive |= all_of(Range: enumerate(First&: Vec), P: [InitialOffset](const auto &P) {
4518 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4519 });
4520 }
4521 }
4522
  // Fill the SortedIndices array only if it looks worthwhile to sort the
  // pointers.
4524 SortedIndices.clear();
4525 if (!AnyConsecutive)
4526 return false;
4527
4528 for (auto &Base : Bases) {
4529 for (auto &T : Base.second)
4530 SortedIndices.push_back(Elt: std::get<2>(t&: T));
4531 }
4532
4533 assert(SortedIndices.size() == VL.size() &&
4534 "Expected SortedIndices to be the size of VL");
4535 return true;
4536}
4537
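// Tries to find a (partial) order for a gather node of simple loads by
// clustering their pointer operands (see clusterSortPtrAccesses above);
// returns std::nullopt if any scalar is not a simple load or no useful order
// is found.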
4538std::optional<BoUpSLP::OrdersType>
4539BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4540 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4541 Type *ScalarTy = TE.Scalars[0]->getType();
4542
4543 SmallVector<Value *> Ptrs;
4544 Ptrs.reserve(N: TE.Scalars.size());
4545 for (Value *V : TE.Scalars) {
4546 auto *L = dyn_cast<LoadInst>(Val: V);
4547 if (!L || !L->isSimple())
4548 return std::nullopt;
4549 Ptrs.push_back(Elt: L->getPointerOperand());
4550 }
4551
4552 BoUpSLP::OrdersType Order;
4553 if (clusterSortPtrAccesses(VL: Ptrs, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
4554 return std::move(Order);
4555 return std::nullopt;
4556}
4557
4558/// Check if two insertelement instructions are from the same buildvector.
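/// E.g., %i0 = insertelement <4 x i32> poison, i32 %a, i32 0 followed by
/// %i1 = insertelement <4 x i32> %i0, i32 %b, i32 1 forms one buildvector
/// sequence, so %i0 and %i1 are treated as coming from the same buildvector
/// (assuming %i0 has no other uses).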
4559static bool areTwoInsertFromSameBuildVector(
4560 InsertElementInst *VU, InsertElementInst *V,
4561 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Instructions must be from the same basic block.
4563 if (VU->getParent() != V->getParent())
4564 return false;
  // The insertelements must produce the same vector type.
4566 if (VU->getType() != V->getType())
4567 return false;
  // Inserts with multiple uses are separate nodes.
4569 if (!VU->hasOneUse() && !V->hasOneUse())
4570 return false;
4571 auto *IE1 = VU;
4572 auto *IE2 = V;
4573 std::optional<unsigned> Idx1 = getInsertIndex(InsertInst: IE1);
4574 std::optional<unsigned> Idx2 = getInsertIndex(InsertInst: IE2);
4575 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4576 return false;
4577 // Go through the vector operand of insertelement instructions trying to find
4578 // either VU as the original vector for IE2 or V as the original vector for
4579 // IE1.
4580 SmallBitVector ReusedIdx(
4581 cast<VectorType>(Val: VU->getType())->getElementCount().getKnownMinValue());
4582 bool IsReusedIdx = false;
4583 do {
4584 if (IE2 == VU && !IE1)
4585 return VU->hasOneUse();
4586 if (IE1 == V && !IE2)
4587 return V->hasOneUse();
4588 if (IE1 && IE1 != V) {
4589 unsigned Idx1 = getInsertIndex(InsertInst: IE1).value_or(u&: *Idx2);
4590 IsReusedIdx |= ReusedIdx.test(Idx: Idx1);
4591 ReusedIdx.set(Idx1);
4592 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4593 IE1 = nullptr;
4594 else
4595 IE1 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE1));
4596 }
4597 if (IE2 && IE2 != VU) {
4598 unsigned Idx2 = getInsertIndex(InsertInst: IE2).value_or(u&: *Idx1);
4599 IsReusedIdx |= ReusedIdx.test(Idx: Idx2);
4600 ReusedIdx.set(Idx2);
4601 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4602 IE2 = nullptr;
4603 else
4604 IE2 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE2));
4605 }
4606 } while (!IsReusedIdx && (IE1 || IE2));
4607 return false;
4608}
4609
4610std::optional<BoUpSLP::OrdersType>
4611BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4612 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4613 if (TE.isNonPowOf2Vec())
4614 return std::nullopt;
4615
  // No need to reorder if we have to shuffle the reuses anyway; the node
  // still needs to be shuffled.
4618 if (!TE.ReuseShuffleIndices.empty()) {
4619 if (isSplat(VL: TE.Scalars))
4620 return std::nullopt;
4621 // Check if reuse shuffle indices can be improved by reordering.
    // For this, check that the reuse mask is "clustered", i.e. each scalar
    // value is used once in each submask of size <number_of_scalars>.
4624 // Example: 4 scalar values.
4625 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4626 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4627 // element 3 is used twice in the second submask.
4628 unsigned Sz = TE.Scalars.size();
4629 if (TE.State == TreeEntry::NeedToGather) {
4630 if (std::optional<OrdersType> CurrentOrder =
4631 findReusedOrderedScalars(TE)) {
4632 SmallVector<int> Mask;
4633 fixupOrderingIndices(Order: *CurrentOrder);
4634 inversePermutation(Indices: *CurrentOrder, Mask);
4635 ::addMask(Mask, SubMask: TE.ReuseShuffleIndices);
4636 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4637 unsigned Sz = TE.Scalars.size();
4638 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4639 for (auto [I, Idx] : enumerate(First: ArrayRef(Mask).slice(N: K * Sz, M: Sz)))
4640 if (Idx != PoisonMaskElem)
4641 Res[Idx + K * Sz] = I + K * Sz;
4642 }
4643 return std::move(Res);
4644 }
4645 }
4646 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4647 TTI->getNumberOfParts(Tp: FixedVectorType::get(
4648 ElementType: TE.Scalars.front()->getType(), NumElts: 2 * TE.getVectorFactor())) == 1)
4649 return std::nullopt;
4650 if (!ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
4651 VF: Sz)) {
4652 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4653 if (TE.ReorderIndices.empty())
4654 std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
4655 else
4656 inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
4657 ::addMask(Mask&: ReorderMask, SubMask: TE.ReuseShuffleIndices);
4658 unsigned VF = ReorderMask.size();
4659 OrdersType ResOrder(VF, VF);
4660 unsigned NumParts = VF / Sz;
4661 SmallBitVector UsedVals(NumParts);
4662 for (unsigned I = 0; I < VF; I += Sz) {
4663 int Val = PoisonMaskElem;
4664 unsigned UndefCnt = 0;
4665 if (any_of(Range: ArrayRef(ReorderMask).slice(N: I, M: Sz),
4666 P: [&](int Idx) {
4667 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4668 Val = Idx;
4669 if (Idx == PoisonMaskElem)
4670 ++UndefCnt;
4671 return Idx != PoisonMaskElem && Idx != Val;
4672 }) ||
4673 Val >= static_cast<int>(NumParts) || UsedVals.test(Idx: Val) ||
4674 UndefCnt > Sz / 2)
4675 return std::nullopt;
4676 UsedVals.set(Val);
4677 for (unsigned K = 0; K < NumParts; ++K)
4678 ResOrder[Val + Sz * K] = I + K;
4679 }
4680 return std::move(ResOrder);
4681 }
4682 unsigned VF = TE.getVectorFactor();
4683 // Try build correct order for extractelement instructions.
4684 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4685 TE.ReuseShuffleIndices.end());
4686 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4687 all_of(Range: TE.Scalars, P: [Sz](Value *V) {
4688 std::optional<unsigned> Idx = getExtractIndex(E: cast<Instruction>(Val: V));
4689 return Idx && *Idx < Sz;
4690 })) {
4691 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4692 if (TE.ReorderIndices.empty())
4693 std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
4694 else
4695 inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
4696 for (unsigned I = 0; I < VF; ++I) {
4697 int &Idx = ReusedMask[I];
4698 if (Idx == PoisonMaskElem)
4699 continue;
4700 Value *V = TE.Scalars[ReorderMask[Idx]];
4701 std::optional<unsigned> EI = getExtractIndex(E: cast<Instruction>(Val: V));
4702 Idx = std::distance(first: ReorderMask.begin(), last: find(Range&: ReorderMask, Val: *EI));
4703 }
4704 }
    // Build the order of VF size; the reuses shuffles need to be reordered,
    // and they are always of VF size.
4707 OrdersType ResOrder(VF);
4708 std::iota(first: ResOrder.begin(), last: ResOrder.end(), value: 0);
4709 auto *It = ResOrder.begin();
4710 for (unsigned K = 0; K < VF; K += Sz) {
4711 OrdersType CurrentOrder(TE.ReorderIndices);
4712 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(N: K, M: Sz)};
4713 if (SubMask.front() == PoisonMaskElem)
4714 std::iota(first: SubMask.begin(), last: SubMask.end(), value: 0);
4715 reorderOrder(Order&: CurrentOrder, Mask: SubMask);
4716 transform(Range&: CurrentOrder, d_first: It, F: [K](unsigned Pos) { return Pos + K; });
4717 std::advance(i&: It, n: Sz);
4718 }
4719 if (TE.State == TreeEntry::NeedToGather &&
4720 all_of(Range: enumerate(First&: ResOrder),
4721 P: [](const auto &Data) { return Data.index() == Data.value(); }))
4722 return std::nullopt; // No need to reorder.
4723 return std::move(ResOrder);
4724 }
4725 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4726 any_of(Range: TE.UserTreeIndices,
4727 P: [](const EdgeInfo &EI) {
4728 return !Instruction::isBinaryOp(Opcode: EI.UserTE->getOpcode());
4729 }) &&
4730 (TE.ReorderIndices.empty() || isReverseOrder(Order: TE.ReorderIndices)))
4731 return std::nullopt;
4732 if ((TE.State == TreeEntry::Vectorize ||
4733 TE.State == TreeEntry::StridedVectorize) &&
4734 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: TE.getMainOp()) ||
4735 (TopToBottom && isa<StoreInst, InsertElementInst>(Val: TE.getMainOp()))) &&
4736 !TE.isAltShuffle())
4737 return TE.ReorderIndices;
4738 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4739 auto PHICompare = [&](unsigned I1, unsigned I2) {
4740 Value *V1 = TE.Scalars[I1];
4741 Value *V2 = TE.Scalars[I2];
4742 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
4743 return false;
4744 if (V1->getNumUses() < V2->getNumUses())
4745 return true;
4746 if (V1->getNumUses() > V2->getNumUses())
4747 return false;
4748 auto *FirstUserOfPhi1 = cast<Instruction>(Val: *V1->user_begin());
4749 auto *FirstUserOfPhi2 = cast<Instruction>(Val: *V2->user_begin());
4750 if (auto *IE1 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi1))
4751 if (auto *IE2 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi2)) {
4752 if (!areTwoInsertFromSameBuildVector(
4753 VU: IE1, V: IE2,
4754 GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); }))
4755 return I1 < I2;
4756 return getInsertIndex(InsertInst: IE1) < getInsertIndex(InsertInst: IE2);
4757 }
4758 if (auto *EE1 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi1))
4759 if (auto *EE2 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi2)) {
4760 if (EE1->getOperand(i_nocapture: 0) != EE2->getOperand(i_nocapture: 0))
4761 return I1 < I2;
4762 return getInsertIndex(InsertInst: EE1) < getInsertIndex(InsertInst: EE2);
4763 }
4764 return I1 < I2;
4765 };
4766 auto IsIdentityOrder = [](const OrdersType &Order) {
4767 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Order.size()))
4768 if (Idx != Order[Idx])
4769 return false;
4770 return true;
4771 };
4772 if (!TE.ReorderIndices.empty())
4773 return TE.ReorderIndices;
4774 DenseMap<unsigned, unsigned> PhiToId;
4775 SmallVector<unsigned> Phis(TE.Scalars.size());
4776 std::iota(first: Phis.begin(), last: Phis.end(), value: 0);
4777 OrdersType ResOrder(TE.Scalars.size());
4778 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4779 PhiToId[Id] = Id;
4780 stable_sort(Range&: Phis, C: PHICompare);
4781 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4782 ResOrder[Id] = PhiToId[Phis[Id]];
4783 if (IsIdentityOrder(ResOrder))
4784 return std::nullopt; // No need to reorder.
4785 return std::move(ResOrder);
4786 }
4787 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4788 allSameType(VL: TE.Scalars)) {
4789 // TODO: add analysis of other gather nodes with extractelement
4790 // instructions and other values/instructions, not only undefs.
4791 if ((TE.getOpcode() == Instruction::ExtractElement ||
4792 (all_of(Range: TE.Scalars, P: IsaPred<UndefValue, ExtractElementInst>) &&
4793 any_of(Range: TE.Scalars, P: IsaPred<ExtractElementInst>))) &&
4794 all_of(Range: TE.Scalars, P: [](Value *V) {
4795 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
4796 return !EE || isa<FixedVectorType>(Val: EE->getVectorOperandType());
4797 })) {
4798 // Check that gather of extractelements can be represented as
4799 // just a shuffle of a single vector.
4800 OrdersType CurrentOrder;
4801 bool Reuse = canReuseExtract(VL: TE.Scalars, OpValue: TE.getMainOp(), CurrentOrder,
4802 /*ResizeAllowed=*/true);
4803 if (Reuse || !CurrentOrder.empty())
4804 return std::move(CurrentOrder);
4805 }
4806 // If the gather node is <undef, v, .., poison> and
4807 // insertelement poison, v, 0 [+ permute]
4808 // is cheaper than
4809 // insertelement poison, v, n - try to reorder.
    // If rotating the whole graph, exclude the permute cost, since the whole
    // graph might be transformed.
4812 int Sz = TE.Scalars.size();
4813 if (isSplat(VL: TE.Scalars) && !allConstant(VL: TE.Scalars) &&
4814 count_if(Range: TE.Scalars, P: IsaPred<UndefValue>) == Sz - 1) {
4815 const auto *It =
4816 find_if(Range: TE.Scalars, P: [](Value *V) { return !isConstant(V); });
4817 if (It == TE.Scalars.begin())
4818 return OrdersType();
4819 auto *Ty = FixedVectorType::get(ElementType: TE.Scalars.front()->getType(), NumElts: Sz);
4820 if (It != TE.Scalars.end()) {
4821 OrdersType Order(Sz, Sz);
4822 unsigned Idx = std::distance(first: TE.Scalars.begin(), last: It);
4823 Order[Idx] = 0;
4824 fixupOrderingIndices(Order);
4825 SmallVector<int> Mask;
4826 inversePermutation(Indices: Order, Mask);
4827 InstructionCost PermuteCost =
4828 TopToBottom
4829 ? 0
4830 : TTI->getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: Ty, Mask);
4831 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4832 Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: 0,
4833 Op0: PoisonValue::get(T: Ty), Op1: *It);
4834 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4835 Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: Idx,
4836 Op0: PoisonValue::get(T: Ty), Op1: *It);
4837 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4838 OrdersType Order(Sz, Sz);
4839 Order[Idx] = 0;
4840 return std::move(Order);
4841 }
4842 }
4843 }
4844 if (isSplat(VL: TE.Scalars))
4845 return std::nullopt;
4846 if (TE.Scalars.size() >= 4)
4847 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4848 return Order;
4849 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4850 return CurrentOrder;
4851 }
4852 return std::nullopt;
4853}
4854
4855/// Checks if the given mask is a "clustered" mask with the same clusters of
4856/// size \p Sz, which are not identity submasks.
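/// E.g., for \p Sz == 4 the mask <1, 0, 3, 2, 1, 0, 3, 2> is a repeated
/// non-identity cluster, while <0, 1, 2, 3, 0, 1, 2, 3> is not, because its
/// cluster is an identity submask.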
4857static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
4858 unsigned Sz) {
4859 ArrayRef<int> FirstCluster = Mask.slice(N: 0, M: Sz);
4860 if (ShuffleVectorInst::isIdentityMask(Mask: FirstCluster, NumSrcElts: Sz))
4861 return false;
4862 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4863 ArrayRef<int> Cluster = Mask.slice(N: I, M: Sz);
4864 if (Cluster != FirstCluster)
4865 return false;
4866 }
4867 return true;
4868}
4869
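// Applies \p Mask to the reuses mask of \p TE and, for gathered nodes whose
// single-source reuse mask is a repeated non-identity cluster, folds the
// reordering into the scalars themselves so that the reuse mask becomes a
// repeated identity submask.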
4870void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4871 // Reorder reuses mask.
4872 reorderReuses(Reuses&: TE.ReuseShuffleIndices, Mask);
4873 const unsigned Sz = TE.Scalars.size();
  // For vectorized nodes and non-clustered reuses, no need to do anything
  // else.
4875 if (TE.State != TreeEntry::NeedToGather ||
4876 !ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
4877 VF: Sz) ||
4878 !isRepeatedNonIdentityClusteredMask(Mask: TE.ReuseShuffleIndices, Sz))
4879 return;
4880 SmallVector<int> NewMask;
4881 inversePermutation(Indices: TE.ReorderIndices, Mask&: NewMask);
4882 addMask(Mask&: NewMask, SubMask: TE.ReuseShuffleIndices);
4883 // Clear reorder since it is going to be applied to the new mask.
4884 TE.ReorderIndices.clear();
4885 // Try to improve gathered nodes with clustered reuses, if possible.
4886 ArrayRef<int> Slice = ArrayRef(NewMask).slice(N: 0, M: Sz);
4887 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4888 inversePermutation(Indices: NewOrder, Mask&: NewMask);
4889 reorderScalars(Scalars&: TE.Scalars, Mask: NewMask);
4890 // Fill the reuses mask with the identity submasks.
4891 for (auto *It = TE.ReuseShuffleIndices.begin(),
4892 *End = TE.ReuseShuffleIndices.end();
4893 It != End; std::advance(i&: It, n: Sz))
4894 std::iota(first: It, last: std::next(x: It, n: Sz), value: 0);
4895}
4896
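/// Fills the unset entries (equal to the order size) of \p Order: from the
/// corresponding entries of \p SecondaryOrder if provided, otherwise with the
/// identity index, skipping any target index that is already used elsewhere
/// in \p Order.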
4897static void combineOrders(MutableArrayRef<unsigned> Order,
4898 ArrayRef<unsigned> SecondaryOrder) {
4899 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
4900 "Expected same size of orders");
4901 unsigned Sz = Order.size();
4902 SmallBitVector UsedIndices(Sz);
4903 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz)) {
4904 if (Order[Idx] != Sz)
4905 UsedIndices.set(Order[Idx]);
4906 }
4907 if (SecondaryOrder.empty()) {
4908 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
4909 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
4910 Order[Idx] = Idx;
4911 } else {
4912 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
4913 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
4914 !UsedIndices.test(Idx: SecondaryOrder[Idx]))
4915 Order[Idx] = SecondaryOrder[Idx];
4916 }
4917}
4918
4919void BoUpSLP::reorderTopToBottom() {
4920 // Maps VF to the graph nodes.
4921 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
4922 // ExtractElement gather nodes which can be vectorized and need to handle
4923 // their ordering.
4924 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4925
  // Phi nodes can have a preferred ordering based on their result users.
4927 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
4928
4929 // AltShuffles can also have a preferred ordering that leads to fewer
4930 // instructions, e.g., the addsub instruction in x86.
4931 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
4932
4933 // Maps a TreeEntry to the reorder indices of external users.
4934 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
4935 ExternalUserReorderMap;
4936 // Find all reorderable nodes with the given VF.
  // Currently these are vectorized stores, loads, extracts + some gathering
  // of extracts.
4939 for_each(Range&: VectorizableTree, F: [&, &TTIRef = *TTI](
4940 const std::unique_ptr<TreeEntry> &TE) {
4941 // Look for external users that will probably be vectorized.
4942 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
4943 findExternalStoreUsersReorderIndices(TE: TE.get());
4944 if (!ExternalUserReorderIndices.empty()) {
4945 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
4946 ExternalUserReorderMap.try_emplace(Key: TE.get(),
4947 Args: std::move(ExternalUserReorderIndices));
4948 }
4949
4950 // Patterns like [fadd,fsub] can be combined into a single instruction in
4951 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
4952 // to take into account their order when looking for the most used order.
4953 if (TE->isAltShuffle()) {
4954 VectorType *VecTy =
4955 FixedVectorType::get(ElementType: TE->Scalars[0]->getType(), NumElts: TE->Scalars.size());
4956 unsigned Opcode0 = TE->getOpcode();
4957 unsigned Opcode1 = TE->getAltOpcode();
4958 // The opcode mask selects between the two opcodes.
4959 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
4960 for (unsigned Lane : seq<unsigned>(Begin: 0, End: TE->Scalars.size()))
4961 if (cast<Instruction>(Val: TE->Scalars[Lane])->getOpcode() == Opcode1)
4962 OpcodeMask.set(Lane);
4963 // If this pattern is supported by the target then we consider the order.
4964 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4965 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
4966 AltShufflesToOrders.try_emplace(Key: TE.get(), Args: OrdersType());
4967 }
4968 // TODO: Check the reverse order too.
4969 }
4970
4971 if (std::optional<OrdersType> CurrentOrder =
4972 getReorderingData(TE: *TE, /*TopToBottom=*/true)) {
      // Do not include ordering for nodes used in the alt opcode
      // vectorization; it is better to reorder them during the bottom-to-top
      // stage. If we follow the order here, it causes reordering of the whole
      // graph, though actually it is profitable just to reorder the subgraph
      // that starts from the alternate opcode vectorization node. Such nodes
      // already end up with a shuffle instruction and it is enough to change
      // this shuffle rather than rotate the scalars for the whole graph.
4980 unsigned Cnt = 0;
4981 const TreeEntry *UserTE = TE.get();
4982 while (UserTE && Cnt < RecursionMaxDepth) {
4983 if (UserTE->UserTreeIndices.size() != 1)
4984 break;
4985 if (all_of(Range: UserTE->UserTreeIndices, P: [](const EdgeInfo &EI) {
4986 return EI.UserTE->State == TreeEntry::Vectorize &&
4987 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4988 }))
4989 return;
4990 UserTE = UserTE->UserTreeIndices.back().UserTE;
4991 ++Cnt;
4992 }
4993 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
4994 if (!(TE->State == TreeEntry::Vectorize ||
4995 TE->State == TreeEntry::StridedVectorize) ||
4996 !TE->ReuseShuffleIndices.empty())
4997 GathersToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
4998 if (TE->State == TreeEntry::Vectorize &&
4999 TE->getOpcode() == Instruction::PHI)
5000 PhisToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
5001 }
5002 });
5003
5004 // Reorder the graph nodes according to their vectorization factor.
5005 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5006 VF /= 2) {
5007 auto It = VFToOrderedEntries.find(Val: VF);
5008 if (It == VFToOrderedEntries.end())
5009 continue;
    // Try to find the most profitable order. We are just looking for the most
    // used order and reorder the scalar elements in the nodes according to
    // this most used order.
5013 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5014 // All operands are reordered and used only in this node - propagate the
5015 // most used order to the user node.
5016 MapVector<OrdersType, unsigned,
5017 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5018 OrdersUses;
5019 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5020 for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes; they still need to be extended and
      // shuffled, just merge the reordering shuffle and the reuse shuffle.
5023 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(Val: OpTE))
5024 continue;
5025 // Count number of orders uses.
5026 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5027 &PhisToOrders]() -> const OrdersType & {
5028 if (OpTE->State == TreeEntry::NeedToGather ||
5029 !OpTE->ReuseShuffleIndices.empty()) {
5030 auto It = GathersToOrders.find(Val: OpTE);
5031 if (It != GathersToOrders.end())
5032 return It->second;
5033 }
5034 if (OpTE->isAltShuffle()) {
5035 auto It = AltShufflesToOrders.find(Val: OpTE);
5036 if (It != AltShufflesToOrders.end())
5037 return It->second;
5038 }
5039 if (OpTE->State == TreeEntry::Vectorize &&
5040 OpTE->getOpcode() == Instruction::PHI) {
5041 auto It = PhisToOrders.find(Val: OpTE);
5042 if (It != PhisToOrders.end())
5043 return It->second;
5044 }
5045 return OpTE->ReorderIndices;
5046 }();
5047 // First consider the order of the external scalar users.
5048 auto It = ExternalUserReorderMap.find(Val: OpTE);
5049 if (It != ExternalUserReorderMap.end()) {
5050 const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars, use the natural
        // order; it is an attempt to reorder a node with reused scalars but
        // with external uses.
5054 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5055 OrdersUses.insert(KV: std::make_pair(x: OrdersType(), y: 0)).first->second +=
5056 ExternalUserReorderIndices.size();
5057 } else {
5058 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5059 ++OrdersUses.insert(KV: std::make_pair(x: ExtOrder, y: 0)).first->second;
5060 }
5061 // No other useful reorder data in this entry.
5062 if (Order.empty())
5063 continue;
5064 }
5065 // Stores actually store the mask, not the order, need to invert.
5066 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5067 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5068 SmallVector<int> Mask;
5069 inversePermutation(Indices: Order, Mask);
5070 unsigned E = Order.size();
5071 OrdersType CurrentOrder(E, E);
5072 transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
5073 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5074 });
5075 fixupOrderingIndices(Order: CurrentOrder);
5076 ++OrdersUses.insert(KV: std::make_pair(x&: CurrentOrder, y: 0)).first->second;
5077 } else {
5078 ++OrdersUses.insert(KV: std::make_pair(x: Order, y: 0)).first->second;
5079 }
5080 }
5081 if (OrdersUses.empty())
5082 continue;
5083 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5084 const unsigned Sz = Order.size();
5085 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
5086 if (Idx != Order[Idx] && Order[Idx] != Sz)
5087 return false;
5088 return true;
5089 };
5090 // Choose the most used order.
5091 unsigned IdentityCnt = 0;
5092 unsigned FilledIdentityCnt = 0;
5093 OrdersType IdentityOrder(VF, VF);
5094 for (auto &Pair : OrdersUses) {
5095 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5096 if (!Pair.first.empty())
5097 FilledIdentityCnt += Pair.second;
5098 IdentityCnt += Pair.second;
5099 combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
5100 }
5101 }
5102 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5103 unsigned Cnt = IdentityCnt;
5104 for (auto &Pair : OrdersUses) {
      // Prefer the identity order. But if a filled identity (non-empty order)
      // was found with the same number of uses as the new candidate order,
      // we can choose this candidate order.
5108 if (Cnt < Pair.second ||
5109 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5110 Cnt == Pair.second && !BestOrder.empty() &&
5111 IsIdentityOrder(BestOrder))) {
5112 combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
5113 BestOrder = Pair.first;
5114 Cnt = Pair.second;
5115 } else {
5116 combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
5117 }
5118 }
5119 // Set order of the user node.
5120 if (IsIdentityOrder(BestOrder))
5121 continue;
5122 fixupOrderingIndices(Order: BestOrder);
5123 SmallVector<int> Mask;
5124 inversePermutation(Indices: BestOrder, Mask);
5125 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5126 unsigned E = BestOrder.size();
5127 transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
5128 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5129 });
5130 // Do an actual reordering, if profitable.
5131 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5132 // Just do the reordering for the nodes with the given VF.
5133 if (TE->Scalars.size() != VF) {
5134 if (TE->ReuseShuffleIndices.size() == VF) {
5135 // Need to reorder the reuses masks of the operands with smaller VF to
5136 // be able to find the match between the graph nodes and scalar
5137 // operands of the given node during vectorization/cost estimation.
5138 assert(all_of(TE->UserTreeIndices,
5139 [VF, &TE](const EdgeInfo &EI) {
5140 return EI.UserTE->Scalars.size() == VF ||
5141 EI.UserTE->Scalars.size() ==
5142 TE->Scalars.size();
5143 }) &&
5144 "All users must be of VF size.");
5145 // Update ordering of the operands with the smaller VF than the given
5146 // one.
5147 reorderNodeWithReuses(TE&: *TE, Mask);
5148 }
5149 continue;
5150 }
5151 if ((TE->State == TreeEntry::Vectorize ||
5152 TE->State == TreeEntry::StridedVectorize) &&
5153 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
5154 InsertElementInst>(Val: TE->getMainOp()) &&
5155 !TE->isAltShuffle()) {
5156 // Build correct orders for extract{element,value}, loads and
5157 // stores.
5158 reorderOrder(Order&: TE->ReorderIndices, Mask);
5159 if (isa<InsertElementInst, StoreInst>(Val: TE->getMainOp()))
5160 TE->reorderOperands(Mask);
5161 } else {
5162 // Reorder the node and its operands.
5163 TE->reorderOperands(Mask);
5164 assert(TE->ReorderIndices.empty() &&
5165 "Expected empty reorder sequence.");
5166 reorderScalars(Scalars&: TE->Scalars, Mask);
5167 }
5168 if (!TE->ReuseShuffleIndices.empty()) {
5169 // Apply reversed order to keep the original ordering of the reused
5170 // elements to avoid extra reorder indices shuffling.
5171 OrdersType CurrentOrder;
5172 reorderOrder(Order&: CurrentOrder, Mask: MaskOrder);
5173 SmallVector<int> NewReuses;
5174 inversePermutation(Indices: CurrentOrder, Mask&: NewReuses);
5175 addMask(Mask&: NewReuses, SubMask: TE->ReuseShuffleIndices);
5176 TE->ReuseShuffleIndices.swap(RHS&: NewReuses);
5177 }
5178 }
5179 }
5180}
5181
5182bool BoUpSLP::canReorderOperands(
5183 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5184 ArrayRef<TreeEntry *> ReorderableGathers,
5185 SmallVectorImpl<TreeEntry *> &GatherOps) {
5186 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5187 if (UserTE->isNonPowOf2Vec())
5188 return false;
5189
5190 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5191 if (any_of(Range&: Edges, P: [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5192 return OpData.first == I &&
5193 (OpData.second->State == TreeEntry::Vectorize ||
5194 OpData.second->State == TreeEntry::StridedVectorize);
5195 }))
5196 continue;
5197 if (TreeEntry *TE = getVectorizedOperand(UserTE, OpIdx: I)) {
5198 // Do not reorder if operand node is used by many user nodes.
5199 if (any_of(Range&: TE->UserTreeIndices,
5200 P: [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5201 return false;
5202 // Add the node to the list of the ordered nodes with the identity
5203 // order.
5204 Edges.emplace_back(Args&: I, Args&: TE);
5205 // Add ScatterVectorize nodes to the list of operands, where just
5206 // reordering of the scalars is required. Similar to the gathers, so
5207 // simply add to the list of gathered ops.
5208 // If there are reused scalars, process this node as a regular vectorize
5209 // node, just reorder reuses mask.
5210 if (TE->State != TreeEntry::Vectorize &&
5211 TE->State != TreeEntry::StridedVectorize &&
5212 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5213 GatherOps.push_back(Elt: TE);
5214 continue;
5215 }
5216 TreeEntry *Gather = nullptr;
5217 if (count_if(Range&: ReorderableGathers,
5218 P: [&Gather, UserTE, I](TreeEntry *TE) {
5219 assert(TE->State != TreeEntry::Vectorize &&
5220 TE->State != TreeEntry::StridedVectorize &&
5221 "Only non-vectorized nodes are expected.");
5222 if (any_of(Range&: TE->UserTreeIndices,
5223 P: [UserTE, I](const EdgeInfo &EI) {
5224 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5225 })) {
5226 assert(TE->isSame(UserTE->getOperand(I)) &&
5227 "Operand entry does not match operands.");
5228 Gather = TE;
5229 return true;
5230 }
5231 return false;
5232 }) > 1 &&
5233 !allConstant(VL: UserTE->getOperand(OpIdx: I)))
5234 return false;
5235 if (Gather)
5236 GatherOps.push_back(Elt: Gather);
5237 }
5238 return true;
5239}
5240
5241void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5242 SetVector<TreeEntry *> OrderedEntries;
5243 DenseSet<const TreeEntry *> GathersToOrders;
5244 // Find all reorderable leaf nodes with the given VF.
  // Currently these are vectorized loads, extracts without alternate operands
  // + some gathering of extracts.
5247 SmallVector<TreeEntry *> NonVectorized;
5248 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5249 if (TE->State != TreeEntry::Vectorize &&
5250 TE->State != TreeEntry::StridedVectorize)
5251 NonVectorized.push_back(Elt: TE.get());
5252 if (std::optional<OrdersType> CurrentOrder =
5253 getReorderingData(TE: *TE, /*TopToBottom=*/false)) {
5254 OrderedEntries.insert(X: TE.get());
5255 if (!(TE->State == TreeEntry::Vectorize ||
5256 TE->State == TreeEntry::StridedVectorize) ||
5257 !TE->ReuseShuffleIndices.empty())
5258 GathersToOrders.insert(V: TE.get());
5259 }
5260 }
5261
  // 1. Propagate the order to the graph nodes which use only reordered nodes.
  // I.e., if a node has operands that are reordered, try to keep at least one
  // operand in the natural order and reorder the others + reorder the user
  // node itself.
5266 SmallPtrSet<const TreeEntry *, 4> Visited;
5267 while (!OrderedEntries.empty()) {
5268 // 1. Filter out only reordered nodes.
5269 // 2. If the entry has multiple uses - skip it and jump to the next node.
5270 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
5271 SmallVector<TreeEntry *> Filtered;
5272 for (TreeEntry *TE : OrderedEntries) {
5273 if (!(TE->State == TreeEntry::Vectorize ||
5274 TE->State == TreeEntry::StridedVectorize ||
5275 (TE->State == TreeEntry::NeedToGather &&
5276 GathersToOrders.contains(V: TE))) ||
5277 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5278 !all_of(Range: drop_begin(RangeOrContainer&: TE->UserTreeIndices),
5279 P: [TE](const EdgeInfo &EI) {
5280 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5281 }) ||
5282 !Visited.insert(Ptr: TE).second) {
5283 Filtered.push_back(Elt: TE);
5284 continue;
5285 }
5286 // Build a map between user nodes and their operands order to speedup
5287 // search. The graph currently does not provide this dependency directly.
5288 for (EdgeInfo &EI : TE->UserTreeIndices) {
5289 TreeEntry *UserTE = EI.UserTE;
5290 auto It = Users.find(Val: UserTE);
5291 if (It == Users.end())
5292 It = Users.insert(KV: {UserTE, {}}).first;
5293 It->second.emplace_back(Args&: EI.EdgeIdx, Args&: TE);
5294 }
5295 }
5296 // Erase filtered entries.
5297 for (TreeEntry *TE : Filtered)
5298 OrderedEntries.remove(X: TE);
5299 SmallVector<
5300 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5301 UsersVec(Users.begin(), Users.end());
5302 sort(C&: UsersVec, Comp: [](const auto &Data1, const auto &Data2) {
5303 return Data1.first->Idx > Data2.first->Idx;
5304 });
5305 for (auto &Data : UsersVec) {
5306 // Check that operands are used only in the User node.
5307 SmallVector<TreeEntry *> GatherOps;
5308 if (!canReorderOperands(UserTE: Data.first, Edges&: Data.second, ReorderableGathers: NonVectorized,
5309 GatherOps)) {
5310 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5311 OrderedEntries.remove(X: Op.second);
5312 continue;
5313 }
5314 // All operands are reordered and used only in this node - propagate the
5315 // most used order to the user node.
5316 MapVector<OrdersType, unsigned,
5317 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5318 OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, even though it might
      // not be profitable.
5322 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5323 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
5324 for (const auto &Op : Data.second) {
5325 TreeEntry *OpTE = Op.second;
5326 if (!VisitedOps.insert(Ptr: OpTE).second)
5327 continue;
5328 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(V: OpTE))
5329 continue;
5330 const auto Order = [&]() -> const OrdersType {
5331 if (OpTE->State == TreeEntry::NeedToGather ||
5332 !OpTE->ReuseShuffleIndices.empty())
5333 return getReorderingData(TE: *OpTE, /*TopToBottom=*/false)
5334 .value_or(u: OrdersType(1));
5335 return OpTE->ReorderIndices;
5336 }();
        // The order is only a size-1 placeholder (no useful reordering data);
        // skip it in favor of fully non-ordered orders.
5339 if (Order.size() == 1)
5340 continue;
5341 unsigned NumOps = count_if(
5342 Range&: Data.second, P: [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5343 return P.second == OpTE;
5344 });
5345 // Stores actually store the mask, not the order, need to invert.
5346 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5347 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5348 SmallVector<int> Mask;
5349 inversePermutation(Indices: Order, Mask);
5350 unsigned E = Order.size();
5351 OrdersType CurrentOrder(E, E);
5352 transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
5353 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5354 });
5355 fixupOrderingIndices(Order: CurrentOrder);
5356 OrdersUses.insert(KV: std::make_pair(x&: CurrentOrder, y: 0)).first->second +=
5357 NumOps;
5358 } else {
5359 OrdersUses.insert(KV: std::make_pair(x: Order, y: 0)).first->second += NumOps;
5360 }
5361 auto Res = OrdersUses.insert(KV: std::make_pair(x: OrdersType(), y: 0));
5362 const auto AllowsReordering = [&](const TreeEntry *TE) {
5363 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5364 if (TE->isNonPowOf2Vec())
5365 return false;
5366 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5367 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5368 (IgnoreReorder && TE->Idx == 0))
5369 return true;
5370 if (TE->State == TreeEntry::NeedToGather) {
5371 if (GathersToOrders.contains(V: TE))
5372 return !getReorderingData(TE: *TE, /*TopToBottom=*/false)
5373 .value_or(u: OrdersType(1))
5374 .empty();
5375 return true;
5376 }
5377 return false;
5378 };
5379 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5380 TreeEntry *UserTE = EI.UserTE;
5381 if (!VisitedUsers.insert(Ptr: UserTE).second)
5382 continue;
5383 // May reorder user node if it requires reordering, has reused
5384 // scalars, is an alternate op vectorize node or its op nodes require
5385 // reordering.
5386 if (AllowsReordering(UserTE))
5387 continue;
5388 // Check if users allow reordering.
5389 // Currently look up just 1 level of operands to avoid increase of
5390 // the compile time.
5391 // Profitable to reorder if definitely more operands allow
5392 // reordering rather than those with natural order.
5393 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
5394 if (static_cast<unsigned>(count_if(
5395 Range&: Ops, P: [UserTE, &AllowsReordering](
5396 const std::pair<unsigned, TreeEntry *> &Op) {
5397 return AllowsReordering(Op.second) &&
5398 all_of(Range&: Op.second->UserTreeIndices,
5399 P: [UserTE](const EdgeInfo &EI) {
5400 return EI.UserTE == UserTE;
5401 });
5402 })) <= Ops.size() / 2)
5403 ++Res.first->second;
5404 }
5405 }
5406 if (OrdersUses.empty()) {
5407 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5408 OrderedEntries.remove(X: Op.second);
5409 continue;
5410 }
5411 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5412 const unsigned Sz = Order.size();
5413 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
5414 if (Idx != Order[Idx] && Order[Idx] != Sz)
5415 return false;
5416 return true;
5417 };
5418 // Choose the most used order.
5419 unsigned IdentityCnt = 0;
5420 unsigned VF = Data.second.front().second->getVectorFactor();
5421 OrdersType IdentityOrder(VF, VF);
5422 for (auto &Pair : OrdersUses) {
5423 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5424 IdentityCnt += Pair.second;
5425 combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
5426 }
5427 }
5428 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5429 unsigned Cnt = IdentityCnt;
5430 for (auto &Pair : OrdersUses) {
        // Prefer the identity order. But if a filled identity (non-empty
        // order) was found with the same number of uses as the new candidate
        // order, we can choose this candidate order.
5434 if (Cnt < Pair.second) {
5435 combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
5436 BestOrder = Pair.first;
5437 Cnt = Pair.second;
5438 } else {
5439 combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
5440 }
5441 }
5442 // Set order of the user node.
5443 if (IsIdentityOrder(BestOrder)) {
5444 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5445 OrderedEntries.remove(X: Op.second);
5446 continue;
5447 }
5448 fixupOrderingIndices(Order: BestOrder);
5449 // Erase operands from OrderedEntries list and adjust their orders.
5450 VisitedOps.clear();
5451 SmallVector<int> Mask;
5452 inversePermutation(Indices: BestOrder, Mask);
5453 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5454 unsigned E = BestOrder.size();
5455 transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
5456 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5457 });
5458 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5459 TreeEntry *TE = Op.second;
5460 OrderedEntries.remove(X: TE);
5461 if (!VisitedOps.insert(Ptr: TE).second)
5462 continue;
5463 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5464 reorderNodeWithReuses(TE&: *TE, Mask);
5465 continue;
5466 }
5467 // Gathers are processed separately.
5468 if (TE->State != TreeEntry::Vectorize &&
5469 TE->State != TreeEntry::StridedVectorize &&
5470 (TE->State != TreeEntry::ScatterVectorize ||
5471 TE->ReorderIndices.empty()))
5472 continue;
5473 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5474 TE->ReorderIndices.empty()) &&
5475 "Non-matching sizes of user/operand entries.");
5476 reorderOrder(Order&: TE->ReorderIndices, Mask);
5477 if (IgnoreReorder && TE == VectorizableTree.front().get())
5478 IgnoreReorder = false;
5479 }
      // For gathers, just reorder their scalars.
5481 for (TreeEntry *Gather : GatherOps) {
5482 assert(Gather->ReorderIndices.empty() &&
5483 "Unexpected reordering of gathers.");
5484 if (!Gather->ReuseShuffleIndices.empty()) {
5485 // Just reorder reuses indices.
5486 reorderReuses(Reuses&: Gather->ReuseShuffleIndices, Mask);
5487 continue;
5488 }
5489 reorderScalars(Scalars&: Gather->Scalars, Mask);
5490 OrderedEntries.remove(X: Gather);
5491 }
5492 // Reorder operands of the user node and set the ordering for the user
5493 // node itself.
5494 if (Data.first->State != TreeEntry::Vectorize ||
5495 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5496 Val: Data.first->getMainOp()) ||
5497 Data.first->isAltShuffle())
5498 Data.first->reorderOperands(Mask);
5499 if (!isa<InsertElementInst, StoreInst>(Val: Data.first->getMainOp()) ||
5500 Data.first->isAltShuffle() ||
5501 Data.first->State == TreeEntry::StridedVectorize) {
5502 reorderScalars(Scalars&: Data.first->Scalars, Mask);
5503 reorderOrder(Order&: Data.first->ReorderIndices, Mask: MaskOrder,
5504 /*BottomOrder=*/true);
5505 if (Data.first->ReuseShuffleIndices.empty() &&
5506 !Data.first->ReorderIndices.empty() &&
5507 !Data.first->isAltShuffle()) {
5508 // Insert user node to the list to try to sink reordering deeper in
5509 // the graph.
5510 OrderedEntries.insert(X: Data.first);
5511 }
5512 } else {
5513 reorderOrder(Order&: Data.first->ReorderIndices, Mask);
5514 }
5515 }
5516 }
5517 // If the reordering is unnecessary, just remove the reorder.
5518 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5519 VectorizableTree.front()->ReuseShuffleIndices.empty())
5520 VectorizableTree.front()->ReorderIndices.clear();
5521}
5522
5523void BoUpSLP::buildExternalUses(
5524 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5525 DenseMap<Value *, unsigned> ScalarToExtUses;
5526 // Collect the values that we need to extract from the tree.
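// For example (illustration only): if a vectorized add also feeds a call that
// is not part of the tree, an entry (Scalar, User = the call, FoundLane) is
// recorded in ExternalUses so that an extractelement for that lane can be
// emitted later; extra args coming from ExternallyUsedValues are recorded
// with a null user.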
5527 for (auto &TEPtr : VectorizableTree) {
5528 TreeEntry *Entry = TEPtr.get();
5529
5530 // No need to handle users of gathered values.
5531 if (Entry->State == TreeEntry::NeedToGather)
5532 continue;
5533
5534 // For each lane:
5535 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5536 Value *Scalar = Entry->Scalars[Lane];
5537 if (!isa<Instruction>(Val: Scalar))
5538 continue;
5539 // If all uses were already marked for replacement, no need to do it again.
5540 auto It = ScalarToExtUses.find(Val: Scalar);
5541 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5542 continue;
5543
5544 // Check if the scalar is externally used as an extra arg.
5545 const auto *ExtI = ExternallyUsedValues.find(Key: Scalar);
5546 if (ExtI != ExternallyUsedValues.end()) {
5547 int FoundLane = Entry->findLaneForValue(V: Scalar);
5548 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5549 << FoundLane << " from " << *Scalar << ".\n");
5550 ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size());
5551 ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: FoundLane);
5552 continue;
5553 }
5554 for (User *U : Scalar->users()) {
5555 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5556
5557 Instruction *UserInst = dyn_cast<Instruction>(Val: U);
5558 if (!UserInst || isDeleted(I: UserInst))
5559 continue;
5560
5561 // Ignore users in the user ignore list.
5562 if (UserIgnoreList && UserIgnoreList->contains(V: UserInst))
5563 continue;
5564
5565 // Skip in-tree scalars that become vectors
5566 if (TreeEntry *UseEntry = getTreeEntry(V: U)) {
5567 // Some in-tree scalars will remain as scalar in vectorized
5568 // instructions. If that is the case, the one in FoundLane will
5569 // be used.
5570 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5571 !doesInTreeUserNeedToExtract(
5572 Scalar, UserInst: cast<Instruction>(Val: UseEntry->Scalars.front()), TLI)) {
5573 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5574 << ".\n");
5575 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
5576 continue;
5577 }
5578 U = nullptr;
5579 if (It != ScalarToExtUses.end()) {
5580 ExternalUses[It->second].User = nullptr;
5581 break;
5582 }
5583 }
5584
5585 int FoundLane = Entry->findLaneForValue(V: Scalar);
5586 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5587 << " from lane " << FoundLane << " from " << *Scalar
5588 << ".\n");
5589 It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first;
5590 ExternalUses.emplace_back(Args&: Scalar, Args&: U, Args&: FoundLane);
5591 if (!U)
5592 break;
5593 }
5594 }
5595 }
5596}
5597
5598DenseMap<Value *, SmallVector<StoreInst *>>
5599BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5600 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
5601 for (unsigned Lane : seq<unsigned>(Begin: 0, End: TE->Scalars.size())) {
5602 Value *V = TE->Scalars[Lane];
5603 // To save compilation time we don't visit if we have too many users.
5604 if (V->hasNUsesOrMore(N: UsesLimit))
5605 break;
5606
5607 // Collect stores per pointer object.
5608 for (User *U : V->users()) {
5609 auto *SI = dyn_cast<StoreInst>(Val: U);
5610 if (SI == nullptr || !SI->isSimple() ||
5611 !isValidElementType(Ty: SI->getValueOperand()->getType()))
5612 continue;
5613 // Skip stores that are already vectorized (already have a tree entry).
5614 if (getTreeEntry(V: U))
5615 continue;
5616
5617 Value *Ptr = getUnderlyingObject(V: SI->getPointerOperand());
5618 auto &StoresVec = PtrToStoresMap[Ptr];
5619 // For now just keep one store per pointer object per lane.
5620 // TODO: Extend this to support multiple stores per pointer per lane
5621 if (StoresVec.size() > Lane)
5622 continue;
5623 // Skip if in different BBs.
5624 if (!StoresVec.empty() &&
5625 SI->getParent() != StoresVec.back()->getParent())
5626 continue;
5627 // Make sure that the stores are of the same type.
5628 if (!StoresVec.empty() &&
5629 SI->getValueOperand()->getType() !=
5630 StoresVec.back()->getValueOperand()->getType())
5631 continue;
5632 StoresVec.push_back(Elt: SI);
5633 }
5634 }
5635 return PtrToStoresMap;
5636}
5637
5638bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5639 OrdersType &ReorderIndices) const {
5640 // We check whether the stores in StoresVec can form a vector by sorting them
5641 // and checking whether they are consecutive.
5642
5643 // To avoid calling getPointersDiff() while sorting we create a vector of
5644 // pairs {store, offset from first} and sort this instead.
5645 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5646 StoreInst *S0 = StoresVec[0];
5647 StoreOffsetVec[0] = {S0, 0};
5648 Type *S0Ty = S0->getValueOperand()->getType();
5649 Value *S0Ptr = S0->getPointerOperand();
5650 for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoresVec.size())) {
5651 StoreInst *SI = StoresVec[Idx];
5652 std::optional<int> Diff =
5653 getPointersDiff(ElemTyA: S0Ty, PtrA: S0Ptr, ElemTyB: SI->getValueOperand()->getType(),
5654 PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE,
5655 /*StrictCheck=*/true);
5656 // We failed to compare the pointers so just abandon this StoresVec.
5657 if (!Diff)
5658 return false;
5659 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5660 }
5661
5662 // Sort the vector based on the pointers. We create a copy because we may
5663 // need the original later for calculating the reorder (shuffle) indices.
5664 stable_sort(Range&: StoreOffsetVec, C: [](const std::pair<StoreInst *, int> &Pair1,
5665 const std::pair<StoreInst *, int> &Pair2) {
5666 int Offset1 = Pair1.second;
5667 int Offset2 = Pair2.second;
5668 return Offset1 < Offset2;
5669 });
5670
5671 // Check if the stores are consecutive by checking if their difference is 1.
5672 for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoreOffsetVec.size()))
5673 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5674 return false;
5675
5676 // Calculate the shuffle indices: for each store, record its position in the
5677 // sorted StoreOffsetVec.
5678 ReorderIndices.reserve(N: StoresVec.size());
5679 for (StoreInst *SI : StoresVec) {
5680 unsigned Idx = find_if(Range&: StoreOffsetVec,
5681 P: [SI](const std::pair<StoreInst *, int> &Pair) {
5682 return Pair.first == SI;
5683 }) -
5684 StoreOffsetVec.begin();
5685 ReorderIndices.push_back(Elt: Idx);
5686 }
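// Worked example (illustrative): for StoresVec = {S0, S1, S2} with offsets
// {0, +2, +1} relative to S0, the sorted StoreOffsetVec is
// {(S0,0), (S2,1), (S1,2)}; the offsets are consecutive, and the resulting
// ReorderIndices are {0, 2, 1}, the position of each original store in the
// sorted vector.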
5687 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
5688 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
5689 // same convention here.
5690 auto IsIdentityOrder = [](const OrdersType &Order) {
5691 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Order.size()))
5692 if (Idx != Order[Idx])
5693 return false;
5694 return true;
5695 };
5696 if (IsIdentityOrder(ReorderIndices))
5697 ReorderIndices.clear();
5698
5699 return true;
5700}
5701
5702#ifndef NDEBUG
5703LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
5704 for (unsigned Idx : Order)
5705 dbgs() << Idx << ", ";
5706 dbgs() << "\n";
5707}
5708#endif
5709
5710SmallVector<BoUpSLP::OrdersType, 1>
5711BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
5712 unsigned NumLanes = TE->Scalars.size();
5713
5714 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
5715 collectUserStores(TE);
5716
5717 // Holds the reorder indices for each candidate store vector that is a user of
5718 // the current TreeEntry.
5719 SmallVector<OrdersType, 1> ExternalReorderIndices;
5720
5721 // Now inspect the stores collected per pointer and look for vectorization
5722 // candidates. For each candidate calculate the reorder index vector and push
5723 // it into `ExternalReorderIndices`
5724 for (const auto &Pair : PtrToStoresMap) {
5725 auto &StoresVec = Pair.second;
5726 // If we have fewer than NumLanes stores, then we can't form a vector.
5727 if (StoresVec.size() != NumLanes)
5728 continue;
5729
5730 // If the stores are not consecutive then abandon this StoresVec.
5731 OrdersType ReorderIndices;
5732 if (!canFormVector(StoresVec, ReorderIndices))
5733 continue;
5734
5735 // We now know that the scalars in StoresVec can form a vector instruction,
5736 // so set the reorder indices.
5737 ExternalReorderIndices.push_back(Elt: ReorderIndices);
5738 }
5739 return ExternalReorderIndices;
5740}
5741
5742void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
5743 const SmallDenseSet<Value *> &UserIgnoreLst) {
5744 deleteTree();
5745 UserIgnoreList = &UserIgnoreLst;
5746 if (!allSameType(VL: Roots))
5747 return;
5748 buildTree_rec(Roots, Depth: 0, EI: EdgeInfo());
5749}
5750
5751void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
5752 deleteTree();
5753 if (!allSameType(VL: Roots))
5754 return;
5755 buildTree_rec(Roots, Depth: 0, EI: EdgeInfo());
5756}
5757
5758/// \return true if the specified list of values has only one instruction that
5759/// requires scheduling, false otherwise.
5760#ifndef NDEBUG
5761static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
5762 Value *NeedsScheduling = nullptr;
5763 for (Value *V : VL) {
5764 if (doesNotNeedToBeScheduled(V))
5765 continue;
5766 if (!NeedsScheduling) {
5767 NeedsScheduling = V;
5768 continue;
5769 }
5770 return false;
5771 }
5772 return NeedsScheduling;
5773}
5774#endif
5775
5776 /// Generates a key/subkey pair for the given value to provide effective
5777 /// sorting of the values and better detection of vectorizable value
5778 /// sequences. The keys are used for sorting the values themselves, and the
5779 /// subkeys for sorting within the resulting value subgroups.
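/// For example (illustrative): two simple loads of the same type hash to the
/// same key (derived from the Load opcode and the type), and their subkeys
/// come from the LoadsSubkeyGenerator callback (typically based on the
/// distance between the pointers), so loads that can form one vector tend to
/// share a subkey. A volatile or atomic load instead gets a unique key/subkey
/// based on the instruction itself.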
5780static std::pair<size_t, size_t> generateKeySubkey(
5781 Value *V, const TargetLibraryInfo *TLI,
5782 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
5783 bool AllowAlternate) {
5784 hash_code Key = hash_value(value: V->getValueID() + 2);
5785 hash_code SubKey = hash_value(value: 0);
5786 // Sort the loads by the distance between the pointers.
5787 if (auto *LI = dyn_cast<LoadInst>(Val: V)) {
5788 Key = hash_combine(args: LI->getType(), args: hash_value(value: Instruction::Load), args: Key);
5789 if (LI->isSimple())
5790 SubKey = hash_value(code: LoadsSubkeyGenerator(Key, LI));
5791 else
5792 Key = SubKey = hash_value(ptr: LI);
5793 } else if (isVectorLikeInstWithConstOps(V)) {
5794 // Sort extracts by the vector operands.
5795 if (isa<ExtractElementInst, UndefValue>(Val: V))
5796 Key = hash_value(value: Value::UndefValueVal + 1);
5797 if (auto *EI = dyn_cast<ExtractElementInst>(Val: V)) {
5798 if (!isUndefVector(V: EI->getVectorOperand()).all() &&
5799 !isa<UndefValue>(Val: EI->getIndexOperand()))
5800 SubKey = hash_value(ptr: EI->getVectorOperand());
5801 }
5802 } else if (auto *I = dyn_cast<Instruction>(Val: V)) {
5803 // Sort other instructions just by the opcodes except for CMPInst.
5804 // For CMP also sort by the predicate kind.
5805 if ((isa<BinaryOperator, CastInst>(Val: I)) &&
5806 isValidForAlternation(Opcode: I->getOpcode())) {
5807 if (AllowAlternate)
5808 Key = hash_value(value: isa<BinaryOperator>(Val: I) ? 1 : 0);
5809 else
5810 Key = hash_combine(args: hash_value(value: I->getOpcode()), args: Key);
5811 SubKey = hash_combine(
5812 args: hash_value(value: I->getOpcode()), args: hash_value(ptr: I->getType()),
5813 args: hash_value(ptr: isa<BinaryOperator>(Val: I)
5814 ? I->getType()
5815 : cast<CastInst>(Val: I)->getOperand(i_nocapture: 0)->getType()));
5816 // For casts, look through the only operand to improve compile time.
5817 if (isa<CastInst>(Val: I)) {
5818 std::pair<size_t, size_t> OpVals =
5819 generateKeySubkey(V: I->getOperand(i: 0), TLI, LoadsSubkeyGenerator,
5820 /*AllowAlternate=*/true);
5821 Key = hash_combine(args: OpVals.first, args: Key);
5822 SubKey = hash_combine(args: OpVals.first, args: SubKey);
5823 }
5824 } else if (auto *CI = dyn_cast<CmpInst>(Val: I)) {
5825 CmpInst::Predicate Pred = CI->getPredicate();
5826 if (CI->isCommutative())
5827 Pred = std::min(a: Pred, b: CmpInst::getInversePredicate(pred: Pred));
5828 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(pred: Pred);
5829 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: Pred),
5830 args: hash_value(value: SwapPred),
5831 args: hash_value(ptr: CI->getOperand(i_nocapture: 0)->getType()));
5832 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
5833 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI);
5834 if (isTriviallyVectorizable(ID)) {
5835 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: ID));
5836 } else if (!VFDatabase(*Call).getMappings(CI: *Call).empty()) {
5837 SubKey = hash_combine(args: hash_value(value: I->getOpcode()),
5838 args: hash_value(ptr: Call->getCalledFunction()));
5839 } else {
5840 Key = hash_combine(args: hash_value(ptr: Call), args: Key);
5841 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(ptr: Call));
5842 }
5843 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
5844 SubKey = hash_combine(args: hash_value(value: Op.Begin), args: hash_value(value: Op.End),
5845 args: hash_value(ptr: Op.Tag), args: SubKey);
5846 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
5847 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Val: Gep->getOperand(i_nocapture: 1)))
5848 SubKey = hash_value(ptr: Gep->getPointerOperand());
5849 else
5850 SubKey = hash_value(ptr: Gep);
5851 } else if (BinaryOperator::isIntDivRem(Opcode: I->getOpcode()) &&
5852 !isa<ConstantInt>(Val: I->getOperand(i: 1))) {
5853 // Do not try to vectorize instructions with potentially high cost.
5854 SubKey = hash_value(ptr: I);
5855 } else {
5856 SubKey = hash_value(value: I->getOpcode());
5857 }
5858 Key = hash_combine(args: hash_value(ptr: I->getParent()), args: Key);
5859 }
5860 return std::make_pair(x&: Key, y&: SubKey);
5861}
5862
5863/// Checks if the specified instruction \p I is an alternate operation for
5864/// the given \p MainOp and \p AltOp instructions.
5865static bool isAlternateInstruction(const Instruction *I,
5866 const Instruction *MainOp,
5867 const Instruction *AltOp,
5868 const TargetLibraryInfo &TLI);
5869
5870bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
5871 ArrayRef<Value *> VL) const {
5872 unsigned Opcode0 = S.getOpcode();
5873 unsigned Opcode1 = S.getAltOpcode();
5874 // The opcode mask selects between the two opcodes.
5875 SmallBitVector OpcodeMask(VL.size(), false);
5876 for (unsigned Lane : seq<unsigned>(Begin: 0, End: VL.size()))
5877 if (cast<Instruction>(Val: VL[Lane])->getOpcode() == Opcode1)
5878 OpcodeMask.set(Lane);
5879 // If this pattern is supported by the target then consider it profitable.
5880 if (TTI->isLegalAltInstr(VecTy: FixedVectorType::get(ElementType: S.MainOp->getType(), NumElts: VL.size()),
5881 Opcode0, Opcode1, OpcodeMask))
5882 return true;
5883 SmallVector<ValueList> Operands;
5884 for (unsigned I : seq<unsigned>(Begin: 0, End: S.MainOp->getNumOperands())) {
5885 Operands.emplace_back();
5886 // Prepare the operand vector.
5887 for (Value *V : VL)
5888 Operands.back().push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
5889 }
5890 if (Operands.size() == 2) {
5891 // Try to find the best operand candidates.
5892 for (unsigned I : seq<unsigned>(Begin: 0, End: VL.size() - 1)) {
5893 SmallVector<std::pair<Value *, Value *>> Candidates(3);
5894 Candidates[0] = std::make_pair(x&: Operands[0][I], y&: Operands[0][I + 1]);
5895 Candidates[1] = std::make_pair(x&: Operands[0][I], y&: Operands[1][I + 1]);
5896 Candidates[2] = std::make_pair(x&: Operands[1][I], y&: Operands[0][I + 1]);
5897 std::optional<int> Res = findBestRootPair(Candidates);
5898 switch (Res.value_or(u: 0)) {
5899 case 0:
5900 break;
5901 case 1:
5902 std::swap(a&: Operands[0][I + 1], b&: Operands[1][I + 1]);
5903 break;
5904 case 2:
5905 std::swap(a&: Operands[0][I], b&: Operands[1][I]);
5906 break;
5907 default:
5908 llvm_unreachable("Unexpected index.");
5909 }
5910 }
5911 }
5912 DenseSet<unsigned> UniqueOpcodes;
5913 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
5914 unsigned NonInstCnt = 0;
5915 // Estimate the number of instructions required for the vectorized node and
5916 // for the buildvector node.
5917 unsigned UndefCnt = 0;
5918 // Count the number of extra shuffles required for the vector nodes.
5919 unsigned ExtraShuffleInsts = 0;
5920 // Check whether the operands contain the same values; if so, they form either
5921 // a perfect diamond match or a shuffled match.
5922 if (Operands.size() == 2) {
5923 // Do not count same operands twice.
5924 if (Operands.front() == Operands.back()) {
5925 Operands.erase(CI: Operands.begin());
5926 } else if (!allConstant(VL: Operands.front()) &&
5927 all_of(Range&: Operands.front(), P: [&](Value *V) {
5928 return is_contained(Range&: Operands.back(), Element: V);
5929 })) {
5930 Operands.erase(CI: Operands.begin());
5931 ++ExtraShuffleInsts;
5932 }
5933 }
5934 const Loop *L = LI->getLoopFor(BB: S.MainOp->getParent());
5935 // Vectorize the node if:
5936 // 1. At least one operand is constant or a splat.
5937 // 2. The operands have many loop invariants (while the instructions
5938 // themselves are not loop invariant).
5939 // 3. At least one unique operand is expected to be vectorized.
5940 return none_of(Range&: Operands,
5941 P: [&](ArrayRef<Value *> Op) {
5942 if (allConstant(VL: Op) ||
5943 (!isSplat(VL: Op) && allSameBlock(VL: Op) && allSameType(VL: Op) &&
5944 getSameOpcode(VL: Op, TLI: *TLI).MainOp))
5945 return false;
5946 DenseMap<Value *, unsigned> Uniques;
5947 for (Value *V : Op) {
5948 if (isa<Constant, ExtractElementInst>(Val: V) ||
5949 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
5950 if (isa<UndefValue>(Val: V))
5951 ++UndefCnt;
5952 continue;
5953 }
5954 auto Res = Uniques.try_emplace(Key: V, Args: 0);
5955 // Found first duplicate - need to add shuffle.
5956 if (!Res.second && Res.first->second == 1)
5957 ++ExtraShuffleInsts;
5958 ++Res.first->getSecond();
5959 if (auto *I = dyn_cast<Instruction>(Val: V))
5960 UniqueOpcodes.insert(V: I->getOpcode());
5961 else if (Res.second)
5962 ++NonInstCnt;
5963 }
5964 return none_of(Range&: Uniques, P: [&](const auto &P) {
5965 return P.first->hasNUsesOrMore(P.second + 1) &&
5966 none_of(P.first->users(), [&](User *U) {
5967 return getTreeEntry(V: U) || Uniques.contains(Val: U);
5968 });
5969 });
5970 }) ||
5971 // Do not vectorize the node if the estimated number of vector instructions
5972 // exceeds the estimated number of buildvector instructions. The vector cost
5973 // is the number of vector instructions plus the number of vector
5974 // instructions needed for the operands (buildvectors). The buildvector cost
5975 // is simply number_of_operands * number_of_scalars.
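// Illustrative numbers (added for exposition): for a 4-wide alternate node
// whose main opcode has 2 operands, the buildvector estimate is 2 * 4 = 8,
// so this part of the check only allows vectorization when
// UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts + NumAltInsts (3)
// stays below 8, and the operand values are not dominated by undefs.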
5976 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
5977 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
5978 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
5979}
5980
5981BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
5982 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
5983 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
5984 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
5985
5986 unsigned ShuffleOrOp =
5987 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
5988 auto *VL0 = cast<Instruction>(Val: S.OpValue);
5989 switch (ShuffleOrOp) {
5990 case Instruction::PHI: {
5991 // Check for terminator values (e.g. invoke).
5992 for (Value *V : VL)
5993 for (Value *Incoming : cast<PHINode>(Val: V)->incoming_values()) {
5994 Instruction *Term = dyn_cast<Instruction>(Val: Incoming);
5995 if (Term && Term->isTerminator()) {
5996 LLVM_DEBUG(dbgs()
5997 << "SLP: Need to swizzle PHINodes (terminator use).\n");
5998 return TreeEntry::NeedToGather;
5999 }
6000 }
6001
6002 return TreeEntry::Vectorize;
6003 }
6004 case Instruction::ExtractValue:
6005 case Instruction::ExtractElement: {
6006 bool Reuse = canReuseExtract(VL, OpValue: VL0, CurrentOrder);
6007 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6008 if (!isPowerOf2_32(Value: VL.size()))
6009 return TreeEntry::NeedToGather;
6010 if (Reuse || !CurrentOrder.empty())
6011 return TreeEntry::Vectorize;
6012 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6013 return TreeEntry::NeedToGather;
6014 }
6015 case Instruction::InsertElement: {
6016 // Check that we have a buildvector and not a shuffle of 2 or more
6017 // different vectors.
6018 ValueSet SourceVectors;
6019 for (Value *V : VL) {
6020 SourceVectors.insert(Ptr: cast<Instruction>(Val: V)->getOperand(i: 0));
6021 assert(getInsertIndex(V) != std::nullopt &&
6022 "Non-constant or undef index?");
6023 }
6024
6025 if (count_if(Range&: VL, P: [&SourceVectors](Value *V) {
6026 return !SourceVectors.contains(Ptr: V);
6027 }) >= 2) {
6028 // Found 2nd source vector - cancel.
6029 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6030 "different source vectors.\n");
6031 return TreeEntry::NeedToGather;
6032 }
6033
6034 return TreeEntry::Vectorize;
6035 }
6036 case Instruction::Load: {
6037 // Check that a vectorized load would load the same memory as a scalar
6038 // load. For example, we don't want to vectorize loads that are smaller
6039 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6040 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6041 // from such a struct, we read/write packed bits disagreeing with the
6042 // unvectorized version.
6043 switch (canVectorizeLoads(VL, VL0, Order&: CurrentOrder, PointerOps)) {
6044 case LoadsState::Vectorize:
6045 return TreeEntry::Vectorize;
6046 case LoadsState::ScatterVectorize:
6047 return TreeEntry::ScatterVectorize;
6048 case LoadsState::StridedVectorize:
6049 return TreeEntry::StridedVectorize;
6050 case LoadsState::Gather:
6051#ifndef NDEBUG
6052 Type *ScalarTy = VL0->getType();
6053 if (DL->getTypeSizeInBits(Ty: ScalarTy) !=
6054 DL->getTypeAllocSizeInBits(Ty: ScalarTy))
6055 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6056 else if (any_of(Range&: VL,
6057 P: [](Value *V) { return !cast<LoadInst>(Val: V)->isSimple(); }))
6058 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6059 else
6060 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6061#endif // NDEBUG
6062 return TreeEntry::NeedToGather;
6063 }
6064 llvm_unreachable("Unexpected state of loads");
6065 }
6066 case Instruction::ZExt:
6067 case Instruction::SExt:
6068 case Instruction::FPToUI:
6069 case Instruction::FPToSI:
6070 case Instruction::FPExt:
6071 case Instruction::PtrToInt:
6072 case Instruction::IntToPtr:
6073 case Instruction::SIToFP:
6074 case Instruction::UIToFP:
6075 case Instruction::Trunc:
6076 case Instruction::FPTrunc:
6077 case Instruction::BitCast: {
6078 Type *SrcTy = VL0->getOperand(i: 0)->getType();
6079 for (Value *V : VL) {
6080 Type *Ty = cast<Instruction>(Val: V)->getOperand(i: 0)->getType();
6081 if (Ty != SrcTy || !isValidElementType(Ty)) {
6082 LLVM_DEBUG(
6083 dbgs() << "SLP: Gathering casts with different src types.\n");
6084 return TreeEntry::NeedToGather;
6085 }
6086 }
6087 return TreeEntry::Vectorize;
6088 }
6089 case Instruction::ICmp:
6090 case Instruction::FCmp: {
6091 // Check that all of the compares have the same predicate.
6092 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
6093 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(pred: P0);
6094 Type *ComparedTy = VL0->getOperand(i: 0)->getType();
6095 for (Value *V : VL) {
6096 CmpInst *Cmp = cast<CmpInst>(Val: V);
6097 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6098 Cmp->getOperand(i_nocapture: 0)->getType() != ComparedTy) {
6099 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6100 return TreeEntry::NeedToGather;
6101 }
6102 }
6103 return TreeEntry::Vectorize;
6104 }
6105 case Instruction::Select:
6106 case Instruction::FNeg:
6107 case Instruction::Add:
6108 case Instruction::FAdd:
6109 case Instruction::Sub:
6110 case Instruction::FSub:
6111 case Instruction::Mul:
6112 case Instruction::FMul:
6113 case Instruction::UDiv:
6114 case Instruction::SDiv:
6115 case Instruction::FDiv:
6116 case Instruction::URem:
6117 case Instruction::SRem:
6118 case Instruction::FRem:
6119 case Instruction::Shl:
6120 case Instruction::LShr:
6121 case Instruction::AShr:
6122 case Instruction::And:
6123 case Instruction::Or:
6124 case Instruction::Xor:
6125 return TreeEntry::Vectorize;
6126 case Instruction::GetElementPtr: {
6127 // We don't combine GEPs with complicated (nested) indexing.
6128 for (Value *V : VL) {
6129 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
6130 if (!I)
6131 continue;
6132 if (I->getNumOperands() != 2) {
6133 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6134 return TreeEntry::NeedToGather;
6135 }
6136 }
6137
6138 // We can't combine several GEPs into one vector if they operate on
6139 // different types.
6140 Type *Ty0 = cast<GEPOperator>(Val: VL0)->getSourceElementType();
6141 for (Value *V : VL) {
6142 auto *GEP = dyn_cast<GEPOperator>(Val: V);
6143 if (!GEP)
6144 continue;
6145 Type *CurTy = GEP->getSourceElementType();
6146 if (Ty0 != CurTy) {
6147 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6148 return TreeEntry::NeedToGather;
6149 }
6150 }
6151
6152 // We don't combine GEPs with non-constant indexes.
6153 Type *Ty1 = VL0->getOperand(i: 1)->getType();
6154 for (Value *V : VL) {
6155 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
6156 if (!I)
6157 continue;
6158 auto *Op = I->getOperand(i_nocapture: 1);
6159 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
6160 (Op->getType() != Ty1 &&
6161 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
6162 Op->getType()->getScalarSizeInBits() >
6163 DL->getIndexSizeInBits(
6164 AS: V->getType()->getPointerAddressSpace())))) {
6165 LLVM_DEBUG(
6166 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6167 return TreeEntry::NeedToGather;
6168 }
6169 }
6170
6171 return TreeEntry::Vectorize;
6172 }
6173 case Instruction::Store: {
6174 // Check if the stores are consecutive or if we need to swizzle them.
6175 llvm::Type *ScalarTy = cast<StoreInst>(Val: VL0)->getValueOperand()->getType();
6176 // Avoid types that are padded when being allocated as scalars, while
6177 // being packed together in a vector (such as i1).
6178 if (DL->getTypeSizeInBits(Ty: ScalarTy) !=
6179 DL->getTypeAllocSizeInBits(Ty: ScalarTy)) {
6180 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6181 return TreeEntry::NeedToGather;
6182 }
6183 // Make sure all stores in the bundle are simple - we can't vectorize
6184 // atomic or volatile stores.
6185 for (Value *V : VL) {
6186 auto *SI = cast<StoreInst>(Val: V);
6187 if (!SI->isSimple()) {
6188 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6189 return TreeEntry::NeedToGather;
6190 }
6191 PointerOps.push_back(Elt: SI->getPointerOperand());
6192 }
6193
6194 // Check the order of pointer operands.
6195 if (llvm::sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: CurrentOrder)) {
6196 Value *Ptr0;
6197 Value *PtrN;
6198 if (CurrentOrder.empty()) {
6199 Ptr0 = PointerOps.front();
6200 PtrN = PointerOps.back();
6201 } else {
6202 Ptr0 = PointerOps[CurrentOrder.front()];
6203 PtrN = PointerOps[CurrentOrder.back()];
6204 }
6205 std::optional<int> Dist =
6206 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
6207 // Check that the sorted pointer operands are consecutive.
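// E.g. (illustrative), four i32 stores at byte offsets {0, 4, 8, 12} give an
// element distance of 3 between the first and last sorted pointers, which
// equals VL.size() - 1, so the bundle is treated as consecutive.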
6208 if (Dist && static_cast<unsigned>(*Dist) == VL.size() - 1)
6209 return TreeEntry::Vectorize;
6210 }
6211
6212 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6213 return TreeEntry::NeedToGather;
6214 }
6215 case Instruction::Call: {
6216 // Check if the calls are all to the same vectorizable intrinsic or
6217 // library function.
6218 CallInst *CI = cast<CallInst>(Val: VL0);
6219 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6220
6221 VFShape Shape = VFShape::get(
6222 FTy: CI->getFunctionType(),
6223 EC: ElementCount::getFixed(MinVal: static_cast<unsigned int>(VL.size())),
6224 HasGlobalPred: false /*HasGlobalPred*/);
6225 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6226
6227 if (!VecFunc && !isTriviallyVectorizable(ID)) {
6228 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6229 return TreeEntry::NeedToGather;
6230 }
6231 Function *F = CI->getCalledFunction();
6232 unsigned NumArgs = CI->arg_size();
6233 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6234 for (unsigned J = 0; J != NumArgs; ++J)
6235 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J))
6236 ScalarArgs[J] = CI->getArgOperand(i: J);
6237 for (Value *V : VL) {
6238 CallInst *CI2 = dyn_cast<CallInst>(Val: V);
6239 if (!CI2 || CI2->getCalledFunction() != F ||
6240 getVectorIntrinsicIDForCall(CI: CI2, TLI) != ID ||
6241 (VecFunc &&
6242 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6243 !CI->hasIdenticalOperandBundleSchema(Other: *CI2)) {
6244 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6245 << "\n");
6246 return TreeEntry::NeedToGather;
6247 }
6248 // Some intrinsics have scalar arguments, and those must be identical across
6249 // the bundle for the calls to be vectorized.
6250 for (unsigned J = 0; J != NumArgs; ++J) {
6251 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J)) {
6252 Value *A1J = CI2->getArgOperand(i: J);
6253 if (ScalarArgs[J] != A1J) {
6254 LLVM_DEBUG(dbgs()
6255 << "SLP: mismatched arguments in call:" << *CI
6256 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6257 return TreeEntry::NeedToGather;
6258 }
6259 }
6260 }
6261 // Verify that the bundle operands are identical between the two calls.
6262 if (CI->hasOperandBundles() &&
6263 !std::equal(first1: CI->op_begin() + CI->getBundleOperandsStartIndex(),
6264 last1: CI->op_begin() + CI->getBundleOperandsEndIndex(),
6265 first2: CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6266 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6267 << "!=" << *V << '\n');
6268 return TreeEntry::NeedToGather;
6269 }
6270 }
6271
6272 return TreeEntry::Vectorize;
6273 }
6274 case Instruction::ShuffleVector: {
6275 // If this is not an alternate sequence of opcode like add-sub
6276 // then do not vectorize this instruction.
6277 if (!S.isAltShuffle()) {
6278 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6279 return TreeEntry::NeedToGather;
6280 }
6281 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6282 LLVM_DEBUG(
6283 dbgs()
6284 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6285 "the whole alt sequence is not profitable.\n");
6286 return TreeEntry::NeedToGather;
6287 }
6288
6289 return TreeEntry::Vectorize;
6290 }
6291 default:
6292 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6293 return TreeEntry::NeedToGather;
6294 }
6295}
6296
6297void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6298 const EdgeInfo &UserTreeIdx) {
6299 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6300
6301 SmallVector<int> ReuseShuffleIndicies;
6302 SmallVector<Value *> UniqueValues;
6303 SmallVector<Value *> NonUniqueValueVL;
6304 auto TryToFindDuplicates = [&](const InstructionsState &S,
6305 bool DoNotFail = false) {
6306 // Check that every instruction appears once in this bundle.
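// For example (illustration only): VL = {A, B, A, C} yields
// UniqueValues = {A, B, C} and ReuseShuffleIndicies = {0, 1, 0, 2}.
// Constant lanes are always appended as new unique values, and undef lanes
// are encoded as PoisonMaskElem.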
6307 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6308 for (Value *V : VL) {
6309 if (isConstant(V)) {
6310 ReuseShuffleIndicies.emplace_back(
6311 Args: isa<UndefValue>(Val: V) ? PoisonMaskElem : UniqueValues.size());
6312 UniqueValues.emplace_back(Args&: V);
6313 continue;
6314 }
6315 auto Res = UniquePositions.try_emplace(Key: V, Args: UniqueValues.size());
6316 ReuseShuffleIndicies.emplace_back(Args&: Res.first->second);
6317 if (Res.second)
6318 UniqueValues.emplace_back(Args&: V);
6319 }
6320 size_t NumUniqueScalarValues = UniqueValues.size();
6321 if (NumUniqueScalarValues == VL.size()) {
6322 ReuseShuffleIndicies.clear();
6323 } else {
6324 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6325 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6326 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6327 "for nodes with padding.\n");
6328 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6329 return false;
6330 }
6331 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6332 if (NumUniqueScalarValues <= 1 ||
6333 (UniquePositions.size() == 1 && all_of(Range&: UniqueValues,
6334 P: [](Value *V) {
6335 return isa<UndefValue>(Val: V) ||
6336 !isConstant(V);
6337 })) ||
6338 !llvm::has_single_bit<uint32_t>(Value: NumUniqueScalarValues)) {
6339 if (DoNotFail && UniquePositions.size() > 1 &&
6340 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6341 all_of(Range&: UniqueValues, P: [=](Value *V) {
6342 return isa<ExtractElementInst>(Val: V) ||
6343 areAllUsersVectorized(I: cast<Instruction>(Val: V),
6344 VectorizedVals: UserIgnoreList);
6345 })) {
6346 unsigned PWSz = PowerOf2Ceil(A: UniqueValues.size());
6347 if (PWSz == VL.size()) {
6348 ReuseShuffleIndicies.clear();
6349 } else {
6350 NonUniqueValueVL.assign(in_start: UniqueValues.begin(), in_end: UniqueValues.end());
6351 NonUniqueValueVL.append(NumInputs: PWSz - UniqueValues.size(),
6352 Elt: UniqueValues.back());
6353 VL = NonUniqueValueVL;
6354 }
6355 return true;
6356 }
6357 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6358 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6359 return false;
6360 }
6361 VL = UniqueValues;
6362 }
6363 return true;
6364 };
6365
6366 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
6367
6368 // Don't vectorize ephemeral values.
6369 if (!EphValues.empty()) {
6370 for (Value *V : VL) {
6371 if (EphValues.count(Ptr: V)) {
6372 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6373 << ") is ephemeral.\n");
6374 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6375 return;
6376 }
6377 }
6378 }
6379
6380 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6381 // a load), in which case peek through to include it in the tree, without
6382 // ballooning over-budget.
6383 if (Depth >= RecursionMaxDepth &&
6384 !(S.MainOp && isa<Instruction>(Val: S.MainOp) && S.MainOp == S.AltOp &&
6385 VL.size() >= 4 &&
6386 (match(V: S.MainOp, P: m_Load(Op: m_Value())) || all_of(Range&: VL, P: [&S](const Value *I) {
6387 return match(V: I,
6388 P: m_OneUse(SubPattern: m_ZExtOrSExt(Op: m_OneUse(SubPattern: m_Load(Op: m_Value()))))) &&
6389 cast<Instruction>(Val: I)->getOpcode() ==
6390 cast<Instruction>(Val: S.MainOp)->getOpcode();
6391 })))) {
6392 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6393 if (TryToFindDuplicates(S))
6394 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6395 ReuseShuffleIndices: ReuseShuffleIndicies);
6396 return;
6397 }
6398
6399 // Don't handle scalable vectors
6400 if (S.getOpcode() == Instruction::ExtractElement &&
6401 isa<ScalableVectorType>(
6402 Val: cast<ExtractElementInst>(Val: S.OpValue)->getVectorOperandType())) {
6403 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6404 if (TryToFindDuplicates(S))
6405 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6406 ReuseShuffleIndices: ReuseShuffleIndicies);
6407 return;
6408 }
6409
6410 // Don't handle vectors.
6411 if (S.OpValue->getType()->isVectorTy() &&
6412 !isa<InsertElementInst>(Val: S.OpValue)) {
6413 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6414 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6415 return;
6416 }
6417
6418 if (StoreInst *SI = dyn_cast<StoreInst>(Val: S.OpValue))
6419 if (SI->getValueOperand()->getType()->isVectorTy()) {
6420 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6421 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6422 return;
6423 }
6424
6425 // If all of the operands are identical or constant we have a simple solution.
6426 // If we are dealing with insert/extract instructions, they must all have
6427 // constant indices; otherwise we should gather them rather than vectorize.
6428 // If this is an alternate-opcode node with 2 elements and gathered operands,
6429 // do not vectorize it.
6430 auto &&NotProfitableForVectorization = [&S, this,
6431 Depth](ArrayRef<Value *> VL) {
6432 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6433 return false;
6434 if (VectorizableTree.size() < MinTreeSize)
6435 return false;
6436 if (Depth >= RecursionMaxDepth - 1)
6437 return true;
6438 // Check whether all operands are extracts, are part of a vector node, or
6439 // can form a regular vectorizable node.
6440 SmallVector<unsigned, 2> InstsCount;
6441 for (Value *V : VL) {
6442 auto *I = cast<Instruction>(Val: V);
6443 InstsCount.push_back(Elt: count_if(Range: I->operand_values(), P: [](Value *Op) {
6444 return isa<Instruction>(Val: Op) || isVectorLikeInstWithConstOps(V: Op);
6445 }));
6446 }
6447 bool IsCommutative = isCommutative(I: S.MainOp) || isCommutative(I: S.AltOp);
6448 if ((IsCommutative &&
6449 std::accumulate(first: InstsCount.begin(), last: InstsCount.end(), init: 0) < 2) ||
6450 (!IsCommutative &&
6451 all_of(Range&: InstsCount, P: [](unsigned ICnt) { return ICnt < 2; })))
6452 return true;
6453 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6454 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6455 auto *I1 = cast<Instruction>(Val: VL.front());
6456 auto *I2 = cast<Instruction>(Val: VL.back());
6457 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6458 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
6459 Args: I2->getOperand(i: Op));
6460 if (static_cast<unsigned>(count_if(
6461 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6462 return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat);
6463 })) >= S.MainOp->getNumOperands() / 2)
6464 return false;
6465 if (S.MainOp->getNumOperands() > 2)
6466 return true;
6467 if (IsCommutative) {
6468 // Check permuted operands.
6469 Candidates.clear();
6470 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6471 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
6472 Args: I2->getOperand(i: (Op + 1) % E));
6473 if (any_of(
6474 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6475 return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat);
6476 }))
6477 return false;
6478 }
6479 return true;
6480 };
6481 SmallVector<unsigned> SortedIndices;
6482 BasicBlock *BB = nullptr;
6483 bool IsScatterVectorizeUserTE =
6484 UserTreeIdx.UserTE &&
6485 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6486 bool AreAllSameInsts =
6487 (S.getOpcode() && allSameBlock(VL)) ||
6488 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6489 VL.size() > 2 &&
6490 all_of(Range&: VL,
6491 P: [&BB](Value *V) {
6492 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
6493 if (!I)
6494 return doesNotNeedToBeScheduled(V);
6495 if (!BB)
6496 BB = I->getParent();
6497 return BB == I->getParent() && I->getNumOperands() == 2;
6498 }) &&
6499 BB &&
6500 sortPtrAccesses(VL, ElemTy: UserTreeIdx.UserTE->getMainOp()->getType(), DL: *DL, SE&: *SE,
6501 SortedIndices));
6502 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6503 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6504 Val: S.OpValue) &&
6505 !all_of(Range&: VL, P: isVectorLikeInstWithConstOps)) ||
6506 NotProfitableForVectorization(VL)) {
6507 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6508 if (TryToFindDuplicates(S))
6509 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6510 ReuseShuffleIndices: ReuseShuffleIndicies);
6511 return;
6512 }
6513
6514 // We now know that this is a vector of instructions of the same type from
6515 // the same block.
6516
6517 // Check if this is a duplicate of another entry.
6518 if (TreeEntry *E = getTreeEntry(V: S.OpValue)) {
6519 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6520 if (!E->isSame(VL)) {
6521 auto It = MultiNodeScalars.find(Val: S.OpValue);
6522 if (It != MultiNodeScalars.end()) {
6523 auto *TEIt = find_if(Range&: It->getSecond(),
6524 P: [&](TreeEntry *ME) { return ME->isSame(VL); });
6525 if (TEIt != It->getSecond().end())
6526 E = *TEIt;
6527 else
6528 E = nullptr;
6529 } else {
6530 E = nullptr;
6531 }
6532 }
6533 if (!E) {
6534 if (!doesNotNeedToBeScheduled(V: S.OpValue)) {
6535 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6536 if (TryToFindDuplicates(S))
6537 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6538 ReuseShuffleIndices: ReuseShuffleIndicies);
6539 return;
6540 }
6541 } else {
6542 // Record the reuse of the tree node. FIXME: currently this is only used
6543 // to properly draw the graph rather than for the actual vectorization.
6544 E->UserTreeIndices.push_back(Elt: UserTreeIdx);
6545 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6546 << ".\n");
6547 return;
6548 }
6549 }
6550
6551 // Check that none of the instructions in the bundle are already in the tree.
6552 for (Value *V : VL) {
6553 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(Val: V)) ||
6554 doesNotNeedToBeScheduled(V))
6555 continue;
6556 if (getTreeEntry(V)) {
6557 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6558 << ") is already in tree.\n");
6559 if (TryToFindDuplicates(S))
6560 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6561 ReuseShuffleIndices: ReuseShuffleIndicies);
6562 return;
6563 }
6564 }
6565
6566 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
6567 if (UserIgnoreList && !UserIgnoreList->empty()) {
6568 for (Value *V : VL) {
6569 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6570 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6571 if (TryToFindDuplicates(S))
6572 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6573 ReuseShuffleIndices: ReuseShuffleIndicies);
6574 return;
6575 }
6576 }
6577 }
6578
6579 // Special processing for sorted pointers for a ScatterVectorize node with
6580 // constant indices only.
6581 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6582 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6583 !(S.getOpcode() && allSameBlock(VL))) {
6584 assert(S.OpValue->getType()->isPointerTy() &&
6585 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6586 "Expected pointers only.");
6587 // Reset S to make it GetElementPtr kind of node.
6588 const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>);
6589 assert(It != VL.end() && "Expected at least one GEP.");
6590 S = getSameOpcode(VL: *It, TLI: *TLI);
6591 }
6592
6593 // Check that all of the users of the scalars that we want to vectorize are
6594 // schedulable.
6595 auto *VL0 = cast<Instruction>(Val: S.OpValue);
6596 BB = VL0->getParent();
6597
6598 if (!DT->isReachableFromEntry(A: BB)) {
6599 // Don't go into unreachable blocks. They may contain instructions with
6600 // dependency cycles which confuse the final scheduling.
6601 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6602 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6603 return;
6604 }
6605
6606 // Don't go into catchswitch blocks, which can happen with PHIs.
6607 // Such blocks can only have PHIs and the catchswitch. There is no
6608 // place to insert a shuffle if we need to, so just avoid that issue.
6609 if (isa<CatchSwitchInst>(Val: BB->getTerminator())) {
6610 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
6611 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6612 return;
6613 }
6614
6615 // Check that every instruction appears once in this bundle.
6616 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
6617 return;
6618
6619 // Perform specific checks for each particular instruction kind.
6620 OrdersType CurrentOrder;
6621 SmallVector<Value *> PointerOps;
6622 TreeEntry::EntryState State = getScalarsVectorizationState(
6623 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6624 if (State == TreeEntry::NeedToGather) {
6625 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6626 ReuseShuffleIndices: ReuseShuffleIndicies);
6627 return;
6628 }
6629
6630 auto &BSRef = BlocksSchedules[BB];
6631 if (!BSRef)
6632 BSRef = std::make_unique<BlockScheduling>(args&: BB);
6633
6634 BlockScheduling &BS = *BSRef;
6635
6636 std::optional<ScheduleData *> Bundle =
6637 BS.tryScheduleBundle(VL: UniqueValues, SLP: this, S);
6638#ifdef EXPENSIVE_CHECKS
6639 // Make sure we didn't break any internal invariants
6640 BS.verify();
6641#endif
6642 if (!Bundle) {
6643 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
6644 assert((!BS.getScheduleData(VL0) ||
6645 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6646 "tryScheduleBundle should cancelScheduling on failure");
6647 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6648 ReuseShuffleIndices: ReuseShuffleIndicies);
6649 return;
6650 }
6651 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
6652
6653 unsigned ShuffleOrOp = S.isAltShuffle() ?
6654 (unsigned) Instruction::ShuffleVector : S.getOpcode();
6655 switch (ShuffleOrOp) {
6656 case Instruction::PHI: {
6657 auto *PH = cast<PHINode>(Val: VL0);
6658
6659 TreeEntry *TE =
6660 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices: ReuseShuffleIndicies);
6661 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
6662
6663 // Keeps the reordered operands to avoid code duplication.
6664 SmallVector<ValueList, 2> OperandsVec;
6665 for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
6666 if (!DT->isReachableFromEntry(A: PH->getIncomingBlock(i: I))) {
6667 ValueList Operands(VL.size(), PoisonValue::get(T: PH->getType()));
6668 TE->setOperand(OpIdx: I, OpVL: Operands);
6669 OperandsVec.push_back(Elt: Operands);
6670 continue;
6671 }
6672 ValueList Operands;
6673 // Prepare the operand vector.
6674 for (Value *V : VL)
6675 Operands.push_back(Elt: cast<PHINode>(Val: V)->getIncomingValueForBlock(
6676 BB: PH->getIncomingBlock(i: I)));
6677 TE->setOperand(OpIdx: I, OpVL: Operands);
6678 OperandsVec.push_back(Elt: Operands);
6679 }
6680 for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
6681 buildTree_rec(VL: OperandsVec[OpIdx], Depth: Depth + 1, UserTreeIdx: {TE, OpIdx});
6682 return;
6683 }
6684 case Instruction::ExtractValue:
6685 case Instruction::ExtractElement: {
6686 if (CurrentOrder.empty()) {
6687 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
6688 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6689 ReuseShuffleIndices: ReuseShuffleIndicies);
6690 // This is a special case, as it does not gather, but at the same time
6691 // we are not extending buildTree_rec() towards the operands.
6692 ValueList Op0;
6693 Op0.assign(NumElts: VL.size(), Elt: VL0->getOperand(i: 0));
6694 VectorizableTree.back()->setOperand(OpIdx: 0, OpVL: Op0);
6695 return;
6696 }
6697 LLVM_DEBUG({
6698 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
6699 "with order";
6700 for (unsigned Idx : CurrentOrder)
6701 dbgs() << " " << Idx;
6702 dbgs() << "\n";
6703 });
6704 fixupOrderingIndices(Order: CurrentOrder);
6705 // Insert new order with initial value 0, if it does not exist,
6706 // otherwise return the iterator to the existing one.
6707 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6708 ReuseShuffleIndices: ReuseShuffleIndicies, ReorderIndices: CurrentOrder);
6709 // This is a special case, as it does not gather, but at the same time
6710 // we are not extending buildTree_rec() towards the operands.
6711 ValueList Op0;
6712 Op0.assign(NumElts: VL.size(), Elt: VL0->getOperand(i: 0));
6713 VectorizableTree.back()->setOperand(OpIdx: 0, OpVL: Op0);
6714 return;
6715 }
6716 case Instruction::InsertElement: {
6717 assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
6718
6719 auto OrdCompare = [](const std::pair<int, int> &P1,
6720 const std::pair<int, int> &P2) {
6721 return P1.first > P2.first;
6722 };
6723 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
6724 decltype(OrdCompare)>
6725 Indices(OrdCompare);
6726 for (int I = 0, E = VL.size(); I < E; ++I) {
6727 unsigned Idx = *getInsertIndex(InsertInst: VL[I]);
6728 Indices.emplace(args&: Idx, args&: I);
6729 }
6730 OrdersType CurrentOrder(VL.size(), VL.size());
6731 bool IsIdentity = true;
6732 for (int I = 0, E = VL.size(); I < E; ++I) {
6733 CurrentOrder[Indices.top().second] = I;
6734 IsIdentity &= Indices.top().second == I;
6735 Indices.pop();
6736 }
6737 if (IsIdentity)
6738 CurrentOrder.clear();
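// Illustrative example (added for exposition): for inserts whose indices are
// the permutation {1, 2, 0, 3}, the priority queue pops the lanes in index
// order and CurrentOrder becomes {1, 2, 0, 3}; for indices {0, 1, 2, 3} the
// order is the identity and is cleared.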
6739 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6740 ReuseShuffleIndices: std::nullopt, ReorderIndices: CurrentOrder);
6741 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
6742
6743 constexpr int NumOps = 2;
6744 ValueList VectorOperands[NumOps];
6745 for (int I = 0; I < NumOps; ++I) {
6746 for (Value *V : VL)
6747 VectorOperands[I].push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
6748
6749 TE->setOperand(OpIdx: I, OpVL: VectorOperands[I]);
6750 }
6751 buildTree_rec(VL: VectorOperands[NumOps - 1], Depth: Depth + 1, UserTreeIdx: {TE, NumOps - 1});
6752 return;
6753 }
6754 case Instruction::Load: {
6755 // Check that a vectorized load would load the same memory as a scalar
6756 // load. For example, we don't want to vectorize loads that are smaller
6757 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6758 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6759 // from such a struct, we read/write packed bits disagreeing with the
6760 // unvectorized version.
6761 TreeEntry *TE = nullptr;
6762 fixupOrderingIndices(Order: CurrentOrder);
6763 switch (State) {
6764 case TreeEntry::Vectorize:
6765 if (CurrentOrder.empty()) {
6766 // Original loads are consecutive and do not require reordering.
6767 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6768 ReuseShuffleIndices: ReuseShuffleIndicies);
6769 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
6770 } else {
6771 // Need to reorder.
6772 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6773 ReuseShuffleIndices: ReuseShuffleIndicies, ReorderIndices: CurrentOrder);
6774 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
6775 }
6776 TE->setOperandsInOrder();
6777 break;
6778 case TreeEntry::StridedVectorize:
6779 // Vectorizing non-consecutive loads as strided loads.
6780 if (CurrentOrder.empty()) {
6781 TE = newTreeEntry(VL, EntryState: TreeEntry::StridedVectorize, Bundle, S,
6782 UserTreeIdx, ReuseShuffleIndices: ReuseShuffleIndicies);
6783 } else {
6784 TE = newTreeEntry(VL, EntryState: TreeEntry::StridedVectorize, Bundle, S,
6785 UserTreeIdx, ReuseShuffleIndices: ReuseShuffleIndicies, ReorderIndices: CurrentOrder);
6786 }
6787 TE->setOperandsInOrder();
6788 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
6789 break;
6790 case TreeEntry::ScatterVectorize:
6791 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
6792 TE = newTreeEntry(VL, EntryState: TreeEntry::ScatterVectorize, Bundle, S,
6793 UserTreeIdx, ReuseShuffleIndices: ReuseShuffleIndicies);
6794 TE->setOperandsInOrder();
6795 buildTree_rec(VL: PointerOps, Depth: Depth + 1, UserTreeIdx: {TE, 0});
6796 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
6797 break;
6798 case TreeEntry::NeedToGather:
6799 llvm_unreachable("Unexpected loads state.");
6800 }
6801 return;
6802 }
6803 case Instruction::ZExt:
6804 case Instruction::SExt:
6805 case Instruction::FPToUI:
6806 case Instruction::FPToSI:
6807 case Instruction::FPExt:
6808 case Instruction::PtrToInt:
6809 case Instruction::IntToPtr:
6810 case Instruction::SIToFP:
6811 case Instruction::UIToFP:
6812 case Instruction::Trunc:
6813 case Instruction::FPTrunc:
6814 case Instruction::BitCast: {
6815 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
6816 u: std::make_pair(x: std::numeric_limits<unsigned>::min(),
6817 y: std::numeric_limits<unsigned>::max()));
6818 if (ShuffleOrOp == Instruction::ZExt ||
6819 ShuffleOrOp == Instruction::SExt) {
6820 CastMaxMinBWSizes = std::make_pair(
6821 x: std::max<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
6822 b: PrevMaxBW),
6823 y: std::min<unsigned>(
6824 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
6825 b: PrevMinBW));
6826 } else if (ShuffleOrOp == Instruction::Trunc) {
6827 CastMaxMinBWSizes = std::make_pair(
6828 x: std::max<unsigned>(
6829 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
6830 b: PrevMaxBW),
6831 y: std::min<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
6832 b: PrevMinBW));
6833 ExtraBitWidthNodes.insert(V: VectorizableTree.size() + 1);
6834 } else if (ShuffleOrOp == Instruction::SIToFP ||
6835 ShuffleOrOp == Instruction::UIToFP) {
6836 unsigned NumSignBits =
6837 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
6838 if (auto *OpI = dyn_cast<Instruction>(Val: VL0->getOperand(i: 0))) {
6839 APInt Mask = DB->getDemandedBits(I: OpI);
6840 NumSignBits = std::max(a: NumSignBits, b: Mask.countl_zero());
6841 }
6842 if (NumSignBits * 2 >=
6843 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
6844 ExtraBitWidthNodes.insert(V: VectorizableTree.size() + 1);
6845 }
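// Illustrative example (added for exposition): for a bundle of
// 'zext i8 %x to i32' instructions, CastMaxMinBWSizes becomes
// {max(32, PrevMaxBW), min(8, PrevMinBW)}, recording the widest destination
// and the narrowest source type seen so far for later bitwidth minimization.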
6846 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6847 ReuseShuffleIndices: ReuseShuffleIndicies);
6848 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
6849
6850 TE->setOperandsInOrder();
6851 for (unsigned I : seq<unsigned>(Begin: 0, End: VL0->getNumOperands())) {
6852 ValueList Operands;
6853 // Prepare the operand vector.
6854 for (Value *V : VL)
6855 Operands.push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
6856
6857 buildTree_rec(VL: Operands, Depth: Depth + 1, UserTreeIdx: {TE, I});
6858 }
6859 return;
6860 }
6861 case Instruction::ICmp:
6862 case Instruction::FCmp: {
6863 // Check that all of the compares have the same predicate.
6864 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
6865 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6866 ReuseShuffleIndices: ReuseShuffleIndicies);
6867 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
6868
6869 ValueList Left, Right;
6870 if (cast<CmpInst>(Val: VL0)->isCommutative()) {
6871 // Commutative predicate - collect + sort operands of the instructions
6872 // so that each side is more likely to have the same opcode.
6873 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
6874 "Commutative Predicate mismatch");
6875 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
6876 } else {
6877 // Collect operands - commute if it uses the swapped predicate.
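// E.g. (illustrative), if P0 is 'slt' and one lane is 'icmp sgt %a, %b',
// that lane contributes (LHS, RHS) = (%b, %a) so every lane can later be
// emitted with the single predicate P0.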
6878 for (Value *V : VL) {
6879 auto *Cmp = cast<CmpInst>(Val: V);
6880 Value *LHS = Cmp->getOperand(i_nocapture: 0);
6881 Value *RHS = Cmp->getOperand(i_nocapture: 1);
6882 if (Cmp->getPredicate() != P0)
6883 std::swap(a&: LHS, b&: RHS);
6884 Left.push_back(Elt: LHS);
6885 Right.push_back(Elt: RHS);
6886 }
6887 }
6888 TE->setOperand(OpIdx: 0, OpVL: Left);
6889 TE->setOperand(OpIdx: 1, OpVL: Right);
6890 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
6891 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
6892 if (ShuffleOrOp == Instruction::ICmp) {
6893 unsigned NumSignBits0 =
6894 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
6895 if (NumSignBits0 * 2 >=
6896 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
6897 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
6898 unsigned NumSignBits1 =
6899 ComputeNumSignBits(Op: VL0->getOperand(i: 1), DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
6900 if (NumSignBits1 * 2 >=
6901 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 1)->getType()))
6902 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 1)->Idx);
6903 }
6904 return;
6905 }
6906 case Instruction::Select:
6907 case Instruction::FNeg:
6908 case Instruction::Add:
6909 case Instruction::FAdd:
6910 case Instruction::Sub:
6911 case Instruction::FSub:
6912 case Instruction::Mul:
6913 case Instruction::FMul:
6914 case Instruction::UDiv:
6915 case Instruction::SDiv:
6916 case Instruction::FDiv:
6917 case Instruction::URem:
6918 case Instruction::SRem:
6919 case Instruction::FRem:
6920 case Instruction::Shl:
6921 case Instruction::LShr:
6922 case Instruction::AShr:
6923 case Instruction::And:
6924 case Instruction::Or:
6925 case Instruction::Xor: {
6926 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6927 ReuseShuffleIndices: ReuseShuffleIndicies);
6928 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
6929
6930 // Sort operands of the instructions so that each side is more likely to
6931 // have the same opcode.
6932 if (isa<BinaryOperator>(Val: VL0) && isCommutative(I: VL0)) {
6933 ValueList Left, Right;
6934 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
6935 TE->setOperand(OpIdx: 0, OpVL: Left);
6936 TE->setOperand(OpIdx: 1, OpVL: Right);
6937 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
6938 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
6939 return;
6940 }
6941
6942 TE->setOperandsInOrder();
6943 for (unsigned I : seq<unsigned>(Begin: 0, End: VL0->getNumOperands())) {
6944 ValueList Operands;
6945 // Prepare the operand vector.
6946 for (Value *V : VL)
6947 Operands.push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
6948
6949 buildTree_rec(VL: Operands, Depth: Depth + 1, UserTreeIdx: {TE, I});
6950 }
6951 return;
6952 }
6953 case Instruction::GetElementPtr: {
6954 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6955 ReuseShuffleIndices: ReuseShuffleIndicies);
6956 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
6957 SmallVector<ValueList, 2> Operands(2);
6958 // Prepare the operand vector for pointer operands.
6959 for (Value *V : VL) {
6960 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
6961 if (!GEP) {
6962 Operands.front().push_back(Elt: V);
6963 continue;
6964 }
6965 Operands.front().push_back(Elt: GEP->getPointerOperand());
6966 }
6967 TE->setOperand(OpIdx: 0, OpVL: Operands.front());
6968 // Need to cast all indices to the same type before vectorization to
6969      // avoid a crash.
6970 // Required to be able to find correct matches between different gather
6971 // nodes and reuse the vectorized values rather than trying to gather them
6972 // again.
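      // Illustrative example (hypothetical IR):
      // \code
      //   %g0 = getelementptr inbounds i32, ptr %p, i32 1
      //   %g1 = getelementptr inbounds i32, ptr %p, i64 2
      // \endcode
      // The index types differ, so both constant indices are folded/cast to
      // the common index type (here the pointer's index type) before the index
      // operand vector is built.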
6973 int IndexIdx = 1;
6974 Type *VL0Ty = VL0->getOperand(i: IndexIdx)->getType();
6975 Type *Ty = all_of(Range&: VL,
6976 P: [VL0Ty, IndexIdx](Value *V) {
6977 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
6978 if (!GEP)
6979 return true;
6980 return VL0Ty == GEP->getOperand(i_nocapture: IndexIdx)->getType();
6981 })
6982 ? VL0Ty
6983 : DL->getIndexType(PtrTy: cast<GetElementPtrInst>(Val: VL0)
6984 ->getPointerOperandType()
6985 ->getScalarType());
6986 // Prepare the operand vector.
6987 for (Value *V : VL) {
6988 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
6989 if (!I) {
6990 Operands.back().push_back(
6991 Elt: ConstantInt::get(Ty, V: 0, /*isSigned=*/IsSigned: false));
6992 continue;
6993 }
6994 auto *Op = I->getOperand(i_nocapture: IndexIdx);
6995 auto *CI = dyn_cast<ConstantInt>(Val: Op);
6996 if (!CI)
6997 Operands.back().push_back(Elt: Op);
6998 else
6999 Operands.back().push_back(Elt: ConstantFoldIntegerCast(
7000 C: CI, DestTy: Ty, IsSigned: CI->getValue().isSignBitSet(), DL: *DL));
7001 }
7002 TE->setOperand(OpIdx: IndexIdx, OpVL: Operands.back());
7003
7004 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7005 buildTree_rec(VL: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
7006 return;
7007 }
7008 case Instruction::Store: {
7009 // Check if the stores are consecutive or if we need to swizzle them.
7010 ValueList Operands(VL.size());
7011 auto *OIter = Operands.begin();
7012 for (Value *V : VL) {
7013 auto *SI = cast<StoreInst>(Val: V);
7014 *OIter = SI->getValueOperand();
7015 ++OIter;
7016 }
7017 // Check that the sorted pointer operands are consecutive.
7018 if (CurrentOrder.empty()) {
7019        // Original stores are consecutive and do not require reordering.
7020 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7021 ReuseShuffleIndices: ReuseShuffleIndicies);
7022 TE->setOperandsInOrder();
7023 buildTree_rec(VL: Operands, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7024 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7025 } else {
7026 fixupOrderingIndices(Order: CurrentOrder);
7027 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7028 ReuseShuffleIndices: ReuseShuffleIndicies, ReorderIndices: CurrentOrder);
7029 TE->setOperandsInOrder();
7030 buildTree_rec(VL: Operands, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7031 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7032 }
7033 return;
7034 }
7035 case Instruction::Call: {
7036 // Check if the calls are all to the same vectorizable intrinsic or
7037 // library function.
7038 CallInst *CI = cast<CallInst>(Val: VL0);
7039 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7040
7041 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7042 ReuseShuffleIndices: ReuseShuffleIndicies);
7043 // Sort operands of the instructions so that each side is more likely to
7044 // have the same opcode.
7045 if (isCommutative(I: VL0)) {
7046 ValueList Left, Right;
7047 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
7048 TE->setOperand(OpIdx: 0, OpVL: Left);
7049 TE->setOperand(OpIdx: 1, OpVL: Right);
7050 SmallVector<ValueList> Operands;
7051 for (unsigned I : seq<unsigned>(Begin: 2, End: CI->arg_size())) {
7052 Operands.emplace_back();
7053 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I))
7054 continue;
7055 for (Value *V : VL) {
7056 auto *CI2 = cast<CallInst>(Val: V);
7057 Operands.back().push_back(Elt: CI2->getArgOperand(i: I));
7058 }
7059 TE->setOperand(OpIdx: I, OpVL: Operands.back());
7060 }
7061 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7062 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
7063 for (unsigned I : seq<unsigned>(Begin: 2, End: CI->arg_size())) {
7064 if (Operands[I - 2].empty())
7065 continue;
7066 buildTree_rec(VL: Operands[I - 2], Depth: Depth + 1, UserTreeIdx: {TE, I});
7067 }
7068 return;
7069 }
7070 TE->setOperandsInOrder();
7071 for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) {
7072        // For scalar operands there is no need to create a tree entry, since
7073        // there is nothing to vectorize.
7074 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I))
7075 continue;
7076 ValueList Operands;
7077 // Prepare the operand vector.
7078 for (Value *V : VL) {
7079 auto *CI2 = cast<CallInst>(Val: V);
7080 Operands.push_back(Elt: CI2->getArgOperand(i: I));
7081 }
7082 buildTree_rec(VL: Operands, Depth: Depth + 1, UserTreeIdx: {TE, I});
7083 }
7084 return;
7085 }
7086 case Instruction::ShuffleVector: {
7087 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7088 ReuseShuffleIndices: ReuseShuffleIndicies);
7089 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7090
7091 // Reorder operands if reordering would enable vectorization.
7092 auto *CI = dyn_cast<CmpInst>(Val: VL0);
7093 if (isa<BinaryOperator>(Val: VL0) || CI) {
7094 ValueList Left, Right;
7095 if (!CI || all_of(Range&: VL, P: [](Value *V) {
7096 return cast<CmpInst>(Val: V)->isCommutative();
7097 })) {
7098 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
7099 } else {
7100 auto *MainCI = cast<CmpInst>(Val: S.MainOp);
7101 auto *AltCI = cast<CmpInst>(Val: S.AltOp);
7102 CmpInst::Predicate MainP = MainCI->getPredicate();
7103 CmpInst::Predicate AltP = AltCI->getPredicate();
7104 assert(MainP != AltP &&
7105 "Expected different main/alternate predicates.");
7106 // Collect operands - commute if it uses the swapped predicate or
7107 // alternate operation.
7108 for (Value *V : VL) {
7109 auto *Cmp = cast<CmpInst>(Val: V);
7110 Value *LHS = Cmp->getOperand(i_nocapture: 0);
7111 Value *RHS = Cmp->getOperand(i_nocapture: 1);
7112
7113 if (isAlternateInstruction(I: Cmp, MainOp: MainCI, AltOp: AltCI, TLI: *TLI)) {
7114 if (AltP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
7115 std::swap(a&: LHS, b&: RHS);
7116 } else {
7117 if (MainP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
7118 std::swap(a&: LHS, b&: RHS);
7119 }
7120 Left.push_back(Elt: LHS);
7121 Right.push_back(Elt: RHS);
7122 }
7123 }
7124 TE->setOperand(OpIdx: 0, OpVL: Left);
7125 TE->setOperand(OpIdx: 1, OpVL: Right);
7126 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7127 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
7128 return;
7129 }
7130
7131 TE->setOperandsInOrder();
7132 for (unsigned I : seq<unsigned>(Begin: 0, End: VL0->getNumOperands())) {
7133 ValueList Operands;
7134 // Prepare the operand vector.
7135 for (Value *V : VL)
7136 Operands.push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
7137
7138 buildTree_rec(VL: Operands, Depth: Depth + 1, UserTreeIdx: {TE, I});
7139 }
7140 return;
7141 }
7142 default:
7143 break;
7144 }
7145 llvm_unreachable("Unexpected vectorization of the instructions.");
7146}
7147
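// Illustrative behavior (hypothetical types): a homogeneous aggregate such as
// [4 x float] or { float, float, float, float } maps to N = 4, provided the
// implied <4 x float> fits into [MinVecRegSize, MaxVecRegSize] and matches the
// store size of the original type; a mixed aggregate such as { i32, float }
// returns 0.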
7148unsigned BoUpSLP::canMapToVector(Type *T) const {
7149 unsigned N = 1;
7150 Type *EltTy = T;
7151
7152 while (isa<StructType, ArrayType, FixedVectorType>(Val: EltTy)) {
7153 if (auto *ST = dyn_cast<StructType>(Val: EltTy)) {
7154 // Check that struct is homogeneous.
7155 for (const auto *Ty : ST->elements())
7156 if (Ty != *ST->element_begin())
7157 return 0;
7158 N *= ST->getNumElements();
7159 EltTy = *ST->element_begin();
7160 } else if (auto *AT = dyn_cast<ArrayType>(Val: EltTy)) {
7161 N *= AT->getNumElements();
7162 EltTy = AT->getElementType();
7163 } else {
7164 auto *VT = cast<FixedVectorType>(Val: EltTy);
7165 N *= VT->getNumElements();
7166 EltTy = VT->getElementType();
7167 }
7168 }
7169
7170 if (!isValidElementType(Ty: EltTy))
7171 return 0;
7172 uint64_t VTSize = DL->getTypeStoreSizeInBits(Ty: FixedVectorType::get(ElementType: EltTy, NumElts: N));
7173 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7174 VTSize != DL->getTypeStoreSizeInBits(Ty: T))
7175 return 0;
7176 return N;
7177}
7178
7179bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7180 SmallVectorImpl<unsigned> &CurrentOrder,
7181 bool ResizeAllowed) const {
7182 const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst, ExtractValueInst>);
7183 assert(It != VL.end() && "Expected at least one extract instruction.");
7184 auto *E0 = cast<Instruction>(Val: *It);
7185 assert(
7186 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7187 "Invalid opcode");
7188 // Check if all of the extracts come from the same vector and from the
7189 // correct offset.
7190 Value *Vec = E0->getOperand(i: 0);
7191
7192 CurrentOrder.clear();
7193
7194 // We have to extract from a vector/aggregate with the same number of elements.
7195 unsigned NElts;
7196 if (E0->getOpcode() == Instruction::ExtractValue) {
7197 NElts = canMapToVector(T: Vec->getType());
7198 if (!NElts)
7199 return false;
7200    // Check if the load can be rewritten as a load of a vector.
7201 LoadInst *LI = dyn_cast<LoadInst>(Val: Vec);
7202 if (!LI || !LI->isSimple() || !LI->hasNUses(N: VL.size()))
7203 return false;
7204 } else {
7205 NElts = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
7206 }
7207
7208 unsigned E = VL.size();
7209 if (!ResizeAllowed && NElts != E)
7210 return false;
7211 SmallVector<int> Indices(E, PoisonMaskElem);
7212 unsigned MinIdx = NElts, MaxIdx = 0;
7213 for (auto [I, V] : enumerate(First&: VL)) {
7214 auto *Inst = dyn_cast<Instruction>(Val: V);
7215 if (!Inst)
7216 continue;
7217 if (Inst->getOperand(i: 0) != Vec)
7218 return false;
7219 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst))
7220 if (isa<UndefValue>(Val: EE->getIndexOperand()))
7221 continue;
7222 std::optional<unsigned> Idx = getExtractIndex(E: Inst);
7223 if (!Idx)
7224 return false;
7225 const unsigned ExtIdx = *Idx;
7226 if (ExtIdx >= NElts)
7227 continue;
7228 Indices[I] = ExtIdx;
7229 if (MinIdx > ExtIdx)
7230 MinIdx = ExtIdx;
7231 if (MaxIdx < ExtIdx)
7232 MaxIdx = ExtIdx;
7233 }
7234 if (MaxIdx - MinIdx + 1 > E)
7235 return false;
7236 if (MaxIdx + 1 <= E)
7237 MinIdx = 0;
7238
7239 // Check that all of the indices extract from the correct offset.
7240 bool ShouldKeepOrder = true;
7241  // Assign to all items the initial value E so we can check if the extract
7242  // instruction index was used already.
7243  // Also, later we can check that all the indices are used and we have a
7244  // consecutive access in the extract instructions, by checking that no
7245  // element of CurrentOrder still has value E.
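  // Illustrative example: for extract indices <1, 0, 3, 2> (after rebasing by
  // MinIdx) CurrentOrder becomes {1, 0, 3, 2} and the function returns false;
  // for the identity order <0, 1, 2, 3> CurrentOrder is cleared and the
  // function returns true.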
7246 CurrentOrder.assign(NumElts: E, Elt: E);
7247 for (unsigned I = 0; I < E; ++I) {
7248 if (Indices[I] == PoisonMaskElem)
7249 continue;
7250 const unsigned ExtIdx = Indices[I] - MinIdx;
7251 if (CurrentOrder[ExtIdx] != E) {
7252 CurrentOrder.clear();
7253 return false;
7254 }
7255 ShouldKeepOrder &= ExtIdx == I;
7256 CurrentOrder[ExtIdx] = I;
7257 }
7258 if (ShouldKeepOrder)
7259 CurrentOrder.clear();
7260
7261 return ShouldKeepOrder;
7262}
7263
7264bool BoUpSLP::areAllUsersVectorized(
7265 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7266 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(V: I))) ||
7267 all_of(Range: I->users(), P: [this](User *U) {
7268 return ScalarToTreeEntry.contains(Val: U) ||
7269 isVectorLikeInstWithConstOps(V: U) ||
7270 (isa<ExtractElementInst>(Val: U) && MustGather.contains(Ptr: U));
7271 });
7272}
7273
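// Illustrative comparison (assuming the target provides such a mapping): for a
// bundle of calls to llvm.fabs.f32 widened to <4 x float>, the first cost is
// that of the llvm.fabs.v4f32 intrinsic and the second is that of a vector
// library call, if VFDatabase knows one; otherwise both returned costs are
// equal.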
7274static std::pair<InstructionCost, InstructionCost>
7275getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7276 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7277 ArrayRef<Type *> ArgTys) {
7278 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7279
7280 // Calculate the cost of the scalar and vector calls.
7281 FastMathFlags FMF;
7282 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: CI))
7283 FMF = FPCI->getFastMathFlags();
7284 SmallVector<const Value *> Arguments(CI->args());
7285 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7286 dyn_cast<IntrinsicInst>(Val: CI));
7287 auto IntrinsicCost =
7288 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind: TTI::TCK_RecipThroughput);
7289
7290 auto Shape = VFShape::get(FTy: CI->getFunctionType(),
7291 EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()),
7292 HasGlobalPred: false /*HasGlobalPred*/);
7293 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7294 auto LibCost = IntrinsicCost;
7295 if (!CI->isNoBuiltin() && VecFunc) {
7296 // Calculate the cost of the vector library call.
7297 // If the corresponding vector call is cheaper, return its cost.
7298 LibCost =
7299 TTI->getCallInstrCost(F: nullptr, RetTy: VecTy, Tys: ArgTys, CostKind: TTI::TCK_RecipThroughput);
7300 }
7301 return {IntrinsicCost, LibCost};
7302}
7303
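// Illustrative example: for Scalars = {add, sub, add, sub} with IsAltOp
// matching the subs (and no reordering/reuse indices), the resulting Mask is
// <0, Sz + 1, 2, Sz + 3>, i.e. even lanes come from the "main" vector and odd
// lanes from the "alternate" vector.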
7304void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7305 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7306 SmallVectorImpl<Value *> *OpScalars,
7307 SmallVectorImpl<Value *> *AltScalars) const {
7308 unsigned Sz = Scalars.size();
7309 Mask.assign(NumElts: Sz, Elt: PoisonMaskElem);
7310 SmallVector<int> OrderMask;
7311 if (!ReorderIndices.empty())
7312 inversePermutation(Indices: ReorderIndices, Mask&: OrderMask);
7313 for (unsigned I = 0; I < Sz; ++I) {
7314 unsigned Idx = I;
7315 if (!ReorderIndices.empty())
7316 Idx = OrderMask[I];
7317 auto *OpInst = cast<Instruction>(Val: Scalars[Idx]);
7318 if (IsAltOp(OpInst)) {
7319 Mask[I] = Sz + Idx;
7320 if (AltScalars)
7321 AltScalars->push_back(Elt: OpInst);
7322 } else {
7323 Mask[I] = Idx;
7324 if (OpScalars)
7325 OpScalars->push_back(Elt: OpInst);
7326 }
7327 }
7328 if (!ReuseShuffleIndices.empty()) {
7329 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7330 transform(Range: ReuseShuffleIndices, d_first: NewMask.begin(), F: [&Mask](int Idx) {
7331 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7332 });
7333 Mask.swap(RHS&: NewMask);
7334 }
7335}
7336
7337static bool isAlternateInstruction(const Instruction *I,
7338 const Instruction *MainOp,
7339 const Instruction *AltOp,
7340 const TargetLibraryInfo &TLI) {
7341 if (auto *MainCI = dyn_cast<CmpInst>(Val: MainOp)) {
7342 auto *AltCI = cast<CmpInst>(Val: AltOp);
7343 CmpInst::Predicate MainP = MainCI->getPredicate();
7344 CmpInst::Predicate AltP = AltCI->getPredicate();
7345 assert(MainP != AltP && "Expected different main/alternate predicates.");
7346 auto *CI = cast<CmpInst>(Val: I);
7347 if (isCmpSameOrSwapped(BaseCI: MainCI, CI, TLI))
7348 return false;
7349 if (isCmpSameOrSwapped(BaseCI: AltCI, CI, TLI))
7350 return true;
7351 CmpInst::Predicate P = CI->getPredicate();
7352 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(pred: P);
7353
7354 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7355 "CmpInst expected to match either main or alternate predicate or "
7356 "their swap.");
7357 (void)AltP;
7358 return MainP != P && MainP != SwappedP;
7359 }
7360 return I->getOpcode() == AltOp->getOpcode();
7361}
7362
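// Illustrative examples: Ops = {4, 4, 4, 4} yields {OK_UniformConstantValue,
// OP_PowerOf2}; Ops = {1, 2, 3, 4} yields {OK_NonUniformConstantValue,
// OP_None}; Ops that are all the same non-constant value yield
// {OK_UniformValue, OP_None}.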
7363TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7364 assert(!Ops.empty());
7365 const auto *Op0 = Ops.front();
7366
7367 const bool IsConstant = all_of(Range&: Ops, P: [](Value *V) {
7368 // TODO: We should allow undef elements here
7369 return isConstant(V) && !isa<UndefValue>(Val: V);
7370 });
7371 const bool IsUniform = all_of(Range&: Ops, P: [=](Value *V) {
7372 // TODO: We should allow undef elements here
7373 return V == Op0;
7374 });
7375 const bool IsPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
7376 // TODO: We should allow undef elements here
7377 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
7378 return CI->getValue().isPowerOf2();
7379 return false;
7380 });
7381 const bool IsNegatedPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
7382 // TODO: We should allow undef elements here
7383 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
7384 return CI->getValue().isNegatedPowerOf2();
7385 return false;
7386 });
7387
7388 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7389 if (IsConstant && IsUniform)
7390 VK = TTI::OK_UniformConstantValue;
7391 else if (IsConstant)
7392 VK = TTI::OK_NonUniformConstantValue;
7393 else if (IsUniform)
7394 VK = TTI::OK_UniformValue;
7395
7396 TTI::OperandValueProperties VP = TTI::OP_None;
7397 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7398 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7399
7400 return {.Kind: VK, .Properties: VP};
7401}
7402
7403namespace {
7404/// The base class for shuffle instruction emission and shuffle cost estimation.
7405class BaseShuffleAnalysis {
7406protected:
7407 /// Checks if the mask is an identity mask.
7408  /// \param IsStrict if true, the function returns false if the mask size
7409  /// does not match the vector size.
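  /// For example (illustrative): with a <4 x ty> vector, the mask <0, 1, 2, 3>
  /// is an identity mask for any \p IsStrict; the shorter mask <0, 1> is
  /// rejected when \p IsStrict is true but accepted when it is false, since it
  /// is an extract-subvector mask starting at index 0.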
7410 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7411 bool IsStrict) {
7412 int Limit = Mask.size();
7413 int VF = VecTy->getNumElements();
7414 int Index = -1;
7415 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Limit))
7416 return true;
7417 if (!IsStrict) {
7418 // Consider extract subvector starting from index 0.
7419 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
7420 Index == 0)
7421 return true;
7422 // All VF-size submasks are identity (e.g.
7423 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7424 if (Limit % VF == 0 && all_of(Range: seq<int>(Begin: 0, End: Limit / VF), P: [=](int Idx) {
7425 ArrayRef<int> Slice = Mask.slice(N: Idx * VF, M: VF);
7426 return all_of(Range&: Slice, P: [](int I) { return I == PoisonMaskElem; }) ||
7427 ShuffleVectorInst::isIdentityMask(Mask: Slice, NumSrcElts: VF);
7428 }))
7429 return true;
7430 }
7431 return false;
7432 }
7433
7434  /// Tries to combine 2 different masks into a single one.
7435 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7436 /// change the size of the vector, \p LocalVF is the original size of the
7437 /// shuffled vector.
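  /// For example (illustrative): with \p LocalVF == 4, \p Mask == <1, 0, 3, 2>
  /// and \p ExtMask == <2, 3, 0, 1>, the combined mask written back into
  /// \p Mask is <3, 2, 1, 0>, i.e. applying \p Mask and then \p ExtMask is
  /// folded into a single permutation.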
7438 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7439 ArrayRef<int> ExtMask) {
7440 unsigned VF = Mask.size();
7441 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7442 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7443 if (ExtMask[I] == PoisonMaskElem)
7444 continue;
7445 int MaskedIdx = Mask[ExtMask[I] % VF];
7446 NewMask[I] =
7447 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7448 }
7449 Mask.swap(RHS&: NewMask);
7450 }
7451
7452  /// Looks through shuffles trying to reduce the final number of shuffles in
7453  /// the code. The function looks through the previously emitted shuffle
7454  /// instructions and properly marks indices in the mask as undef.
7455 /// For example, given the code
7456 /// \code
7457 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7458 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7459 /// \endcode
7460  /// and, if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>,
7461  /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
7462  /// <0, 1, 2, 3> for the shuffle.
7463  /// If the 2 operands are of different sizes, the smaller one will be resized
7464  /// and the mask recalculated properly.
7465 /// For example, given the code
7466 /// \code
7467 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7468 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7469 /// \endcode
7470  /// and, if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>,
7471  /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
7472 /// <0, 1, 2, 3> for the shuffle.
7473 /// So, it tries to transform permutations to simple vector merge, if
7474 /// possible.
7475 /// \param V The input vector which must be shuffled using the given \p Mask.
7476 /// If the better candidate is found, \p V is set to this best candidate
7477 /// vector.
7478 /// \param Mask The input mask for the shuffle. If the best candidate is found
7479 /// during looking-through-shuffles attempt, it is updated accordingly.
7480 /// \param SinglePermute true if the shuffle operation is originally a
7481 /// single-value-permutation. In this case the look-through-shuffles procedure
7482 /// may look for resizing shuffles as the best candidates.
7483 /// \return true if the shuffle results in the non-resizing identity shuffle
7484 /// (and thus can be ignored), false - otherwise.
7485 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7486 bool SinglePermute) {
7487 Value *Op = V;
7488 ShuffleVectorInst *IdentityOp = nullptr;
7489 SmallVector<int> IdentityMask;
7490 while (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Op)) {
7491      // Exit if this is not a fixed vector type or a size-changing shuffle.
7492 auto *SVTy = dyn_cast<FixedVectorType>(Val: SV->getType());
7493 if (!SVTy)
7494 break;
7495 // Remember the identity or broadcast mask, if it is not a resizing
7496 // shuffle. If no better candidates are found, this Op and Mask will be
7497 // used in the final shuffle.
7498 if (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/false)) {
7499 if (!IdentityOp || !SinglePermute ||
7500 (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/true) &&
7501 !ShuffleVectorInst::isZeroEltSplatMask(Mask: IdentityMask,
7502 NumSrcElts: IdentityMask.size()))) {
7503 IdentityOp = SV;
7504          // Store the current mask in IdentityMask so that we do not lose
7505          // this info if IdentityOp is selected as the best candidate for
7506          // the permutation.
7507 IdentityMask.assign(RHS: Mask);
7508 }
7509 }
7510 // Remember the broadcast mask. If no better candidates are found, this Op
7511 // and Mask will be used in the final shuffle.
7512      // A zero splat can be used as identity too, since it might be used with
7513      // mask <0, 1, 2, ...>, i.e. an identity mask without extra reshuffling.
7514      // E.g., if we need to shuffle the vector with the mask <3, 1, 2, 0>,
7515      // which is expensive, and the analysis finds out that the source vector
7516      // is just a broadcast, the original mask can be transformed to the
7517      // identity mask <0, 1, 2, 3>.
7518 // \code
7519 // %0 = shuffle %v, poison, zeroinitalizer
7520 // %res = shuffle %0, poison, <3, 1, 2, 0>
7521 // \endcode
7522 // may be transformed to
7523 // \code
7524 // %0 = shuffle %v, poison, zeroinitalizer
7525 // %res = shuffle %0, poison, <0, 1, 2, 3>
7526 // \endcode
7527 if (SV->isZeroEltSplat()) {
7528 IdentityOp = SV;
7529 IdentityMask.assign(RHS: Mask);
7530 }
7531 int LocalVF = Mask.size();
7532 if (auto *SVOpTy =
7533 dyn_cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType()))
7534 LocalVF = SVOpTy->getNumElements();
7535 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7536 for (auto [Idx, I] : enumerate(First&: Mask)) {
7537 if (I == PoisonMaskElem ||
7538 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7539 continue;
7540 ExtMask[Idx] = SV->getMaskValue(Elt: I);
7541 }
7542 bool IsOp1Undef =
7543 isUndefVector(V: SV->getOperand(i_nocapture: 0),
7544 UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::FirstArg))
7545 .all();
7546 bool IsOp2Undef =
7547 isUndefVector(V: SV->getOperand(i_nocapture: 1),
7548 UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::SecondArg))
7549 .all();
7550 if (!IsOp1Undef && !IsOp2Undef) {
7551 // Update mask and mark undef elems.
7552 for (int &I : Mask) {
7553 if (I == PoisonMaskElem)
7554 continue;
7555 if (SV->getMaskValue(Elt: I % SV->getShuffleMask().size()) ==
7556 PoisonMaskElem)
7557 I = PoisonMaskElem;
7558 }
7559 break;
7560 }
7561 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7562 SV->getShuffleMask().end());
7563 combineMasks(LocalVF, Mask&: ShuffleMask, ExtMask: Mask);
7564 Mask.swap(RHS&: ShuffleMask);
7565 if (IsOp2Undef)
7566 Op = SV->getOperand(i_nocapture: 0);
7567 else
7568 Op = SV->getOperand(i_nocapture: 1);
7569 }
7570 if (auto *OpTy = dyn_cast<FixedVectorType>(Val: Op->getType());
7571 !OpTy || !isIdentityMask(Mask, VecTy: OpTy, IsStrict: SinglePermute) ||
7572 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())) {
7573 if (IdentityOp) {
7574 V = IdentityOp;
7575 assert(Mask.size() == IdentityMask.size() &&
7576 "Expected masks of same sizes.");
7577 // Clear known poison elements.
7578 for (auto [I, Idx] : enumerate(First&: Mask))
7579 if (Idx == PoisonMaskElem)
7580 IdentityMask[I] = PoisonMaskElem;
7581 Mask.swap(RHS&: IdentityMask);
7582 auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: V);
7583 return SinglePermute &&
7584 (isIdentityMask(Mask, VecTy: cast<FixedVectorType>(Val: V->getType()),
7585 /*IsStrict=*/true) ||
7586 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7587 Shuffle->isZeroEltSplat() &&
7588 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())));
7589 }
7590 V = Op;
7591 return false;
7592 }
7593 V = Op;
7594 return true;
7595 }
7596
7597  /// Smart shuffle instruction emission: walks through shuffle trees and
7598 /// tries to find the best matching vector for the actual shuffle
7599 /// instruction.
7600 template <typename T, typename ShuffleBuilderTy>
7601 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7602 ShuffleBuilderTy &Builder) {
7603 assert(V1 && "Expected at least one vector value.");
7604 if (V2)
7605 Builder.resizeToMatch(V1, V2);
7606 int VF = Mask.size();
7607 if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType()))
7608 VF = FTy->getNumElements();
7609 if (V2 &&
7610 !isUndefVector(V: V2, UseMask: buildUseMask(VF, Mask, MaskArg: UseMask::SecondArg)).all()) {
7611 // Peek through shuffles.
7612 Value *Op1 = V1;
7613 Value *Op2 = V2;
7614 int VF =
7615 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
7616 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7617 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7618 for (int I = 0, E = Mask.size(); I < E; ++I) {
7619 if (Mask[I] < VF)
7620 CombinedMask1[I] = Mask[I];
7621 else
7622 CombinedMask2[I] = Mask[I] - VF;
7623 }
7624 Value *PrevOp1;
7625 Value *PrevOp2;
7626 do {
7627 PrevOp1 = Op1;
7628 PrevOp2 = Op2;
7629 (void)peekThroughShuffles(V&: Op1, Mask&: CombinedMask1, /*SinglePermute=*/false);
7630 (void)peekThroughShuffles(V&: Op2, Mask&: CombinedMask2, /*SinglePermute=*/false);
7631 // Check if we have 2 resizing shuffles - need to peek through operands
7632 // again.
7633 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Val: Op1))
7634 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Val: Op2)) {
7635 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7636 for (auto [Idx, I] : enumerate(First&: CombinedMask1)) {
7637 if (I == PoisonMaskElem)
7638 continue;
7639 ExtMask1[Idx] = SV1->getMaskValue(Elt: I);
7640 }
7641 SmallBitVector UseMask1 = buildUseMask(
7642 VF: cast<FixedVectorType>(Val: SV1->getOperand(i_nocapture: 1)->getType())
7643 ->getNumElements(),
7644 Mask: ExtMask1, MaskArg: UseMask::SecondArg);
7645 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7646 for (auto [Idx, I] : enumerate(First&: CombinedMask2)) {
7647 if (I == PoisonMaskElem)
7648 continue;
7649 ExtMask2[Idx] = SV2->getMaskValue(Elt: I);
7650 }
7651 SmallBitVector UseMask2 = buildUseMask(
7652 VF: cast<FixedVectorType>(Val: SV2->getOperand(i_nocapture: 1)->getType())
7653 ->getNumElements(),
7654 Mask: ExtMask2, MaskArg: UseMask::SecondArg);
7655 if (SV1->getOperand(i_nocapture: 0)->getType() ==
7656 SV2->getOperand(i_nocapture: 0)->getType() &&
7657 SV1->getOperand(i_nocapture: 0)->getType() != SV1->getType() &&
7658 isUndefVector(V: SV1->getOperand(i_nocapture: 1), UseMask: UseMask1).all() &&
7659 isUndefVector(V: SV2->getOperand(i_nocapture: 1), UseMask: UseMask2).all()) {
7660 Op1 = SV1->getOperand(i_nocapture: 0);
7661 Op2 = SV2->getOperand(i_nocapture: 0);
7662 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7663 SV1->getShuffleMask().end());
7664 int LocalVF = ShuffleMask1.size();
7665 if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op1->getType()))
7666 LocalVF = FTy->getNumElements();
7667 combineMasks(LocalVF, Mask&: ShuffleMask1, ExtMask: CombinedMask1);
7668 CombinedMask1.swap(RHS&: ShuffleMask1);
7669 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7670 SV2->getShuffleMask().end());
7671 LocalVF = ShuffleMask2.size();
7672 if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op2->getType()))
7673 LocalVF = FTy->getNumElements();
7674 combineMasks(LocalVF, Mask&: ShuffleMask2, ExtMask: CombinedMask2);
7675 CombinedMask2.swap(RHS&: ShuffleMask2);
7676 }
7677 }
7678 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
7679 Builder.resizeToMatch(Op1, Op2);
7680 VF = std::max(a: cast<VectorType>(Val: Op1->getType())
7681 ->getElementCount()
7682 .getKnownMinValue(),
7683 b: cast<VectorType>(Val: Op2->getType())
7684 ->getElementCount()
7685 .getKnownMinValue());
7686 for (int I = 0, E = Mask.size(); I < E; ++I) {
7687 if (CombinedMask2[I] != PoisonMaskElem) {
7688 assert(CombinedMask1[I] == PoisonMaskElem &&
7689 "Expected undefined mask element");
7690 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
7691 }
7692 }
7693 if (Op1 == Op2 &&
7694 (ShuffleVectorInst::isIdentityMask(Mask: CombinedMask1, NumSrcElts: VF) ||
7695 (ShuffleVectorInst::isZeroEltSplatMask(Mask: CombinedMask1, NumSrcElts: VF) &&
7696 isa<ShuffleVectorInst>(Val: Op1) &&
7697 cast<ShuffleVectorInst>(Val: Op1)->getShuffleMask() ==
7698 ArrayRef(CombinedMask1))))
7699 return Builder.createIdentity(Op1);
7700 return Builder.createShuffleVector(
7701 Op1, Op1 == Op2 ? PoisonValue::get(T: Op1->getType()) : Op2,
7702 CombinedMask1);
7703 }
7704 if (isa<PoisonValue>(Val: V1))
7705 return Builder.createPoison(
7706 cast<VectorType>(Val: V1->getType())->getElementType(), Mask.size());
7707 SmallVector<int> NewMask(Mask.begin(), Mask.end());
7708 bool IsIdentity = peekThroughShuffles(V&: V1, Mask&: NewMask, /*SinglePermute=*/true);
7709 assert(V1 && "Expected non-null value after looking through shuffles.");
7710
7711 if (!IsIdentity)
7712 return Builder.createShuffleVector(V1, NewMask);
7713 return Builder.createIdentity(V1);
7714 }
7715};
7716} // namespace
7717
7718/// Returns the cost of the shuffle instructions with the given \p Kind, vector
7719/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
7720/// insert subvector pattern.
7721static InstructionCost
7722getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
7723 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
7724 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
7725 int Index = 0, VectorType *SubTp = nullptr,
7726 ArrayRef<const Value *> Args = std::nullopt) {
7727 if (Kind != TTI::SK_PermuteTwoSrc)
7728 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7729 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7730 int NumSubElts;
7731 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
7732 Mask, NumSrcElts, NumSubElts, Index)) {
7733 if (Index + NumSubElts > NumSrcElts &&
7734 Index + NumSrcElts <= static_cast<int>(Mask.size()))
7735 return TTI.getShuffleCost(
7736 Kind: TTI::SK_InsertSubvector,
7737 Tp: FixedVectorType::get(ElementType: Tp->getElementType(), NumElts: Mask.size()), Mask,
7738 CostKind: TTI::TCK_RecipThroughput, Index, SubTp: Tp);
7739 }
7740 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7741}
7742
7743/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
7744static std::pair<InstructionCost, InstructionCost>
7745getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
7746 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
7747 Type *ScalarTy, VectorType *VecTy) {
7748 InstructionCost ScalarCost = 0;
7749 InstructionCost VecCost = 0;
7750 // Here we differentiate two cases: (1) when Ptrs represent a regular
7751 // vectorization tree node (as they are pointer arguments of scattered
7752 // loads) or (2) when Ptrs are the arguments of loads or stores being
7753  // vectorized as a plain wide unit-stride load/store since all the
7754 // loads/stores are known to be from/to adjacent locations.
7755 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
7756 // Case 2: estimate costs for pointer related costs when vectorizing to
7757 // a wide load/store.
7758 // Scalar cost is estimated as a set of pointers with known relationship
7759 // between them.
7760 // For vector code we will use BasePtr as argument for the wide load/store
7761 // but we also need to account all the instructions which are going to
7762 // stay in vectorized code due to uses outside of these scalar
7763 // loads/stores.
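    // Illustrative example (hypothetical IR): for four consecutive loads of
    // p[0..3] folded into one wide load, the scalar side is costed as a
    // unit-stride pointer chain, while the vector side only pays for the
    // pointers that must stay around (e.g. GEPs with uses outside the
    // vectorized loads); if every pointer survives, both costs are TCC_Free.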
7764 ScalarCost = TTI.getPointersChainCost(
7765 Ptrs, Base: BasePtr, Info: TTI::PointersChainInfo::getUnitStride(), AccessTy: ScalarTy,
7766 CostKind);
7767
7768 SmallVector<const Value *> PtrsRetainedInVecCode;
7769 for (Value *V : Ptrs) {
7770 if (V == BasePtr) {
7771 PtrsRetainedInVecCode.push_back(Elt: V);
7772 continue;
7773 }
7774 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
7775      // For simplicity, assume Ptr stays in vectorized code if it's not a
7776      // GEP instruction. We don't care since its cost is considered free.
7777 // TODO: We should check for any uses outside of vectorizable tree
7778 // rather than just single use.
7779 if (!Ptr || !Ptr->hasOneUse())
7780 PtrsRetainedInVecCode.push_back(Elt: V);
7781 }
7782
7783 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
7784 // If all pointers stay in vectorized code then we don't have
7785 // any savings on that.
7786 return std::make_pair(x: TTI::TCC_Free, y: TTI::TCC_Free);
7787 }
7788 VecCost = TTI.getPointersChainCost(Ptrs: PtrsRetainedInVecCode, Base: BasePtr,
7789 Info: TTI::PointersChainInfo::getKnownStride(),
7790 AccessTy: VecTy, CostKind);
7791 } else {
7792 // Case 1: Ptrs are the arguments of loads that we are going to transform
7793 // into masked gather load intrinsic.
7794 // All the scalar GEPs will be removed as a result of vectorization.
7795 // For any external uses of some lanes extract element instructions will
7796 // be generated (which cost is estimated separately).
7797 TTI::PointersChainInfo PtrsInfo =
7798 all_of(Range&: Ptrs,
7799 P: [](const Value *V) {
7800 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
7801 return Ptr && !Ptr->hasAllConstantIndices();
7802 })
7803 ? TTI::PointersChainInfo::getUnknownStride()
7804 : TTI::PointersChainInfo::getKnownStride();
7805
7806 ScalarCost =
7807 TTI.getPointersChainCost(Ptrs, Base: BasePtr, Info: PtrsInfo, AccessTy: ScalarTy, CostKind);
7808 if (auto *BaseGEP = dyn_cast<GEPOperator>(Val: BasePtr)) {
7809 SmallVector<const Value *> Indices(BaseGEP->indices());
7810 VecCost = TTI.getGEPCost(PointeeType: BaseGEP->getSourceElementType(),
7811 Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: VecTy,
7812 CostKind);
7813 }
7814 }
7815
7816 return std::make_pair(x&: ScalarCost, y&: VecCost);
7817}
7818
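// A minimal sketch of the current transformation: a load node whose reorder
// indices describe a reversed consecutive access, e.g. loads of a[3], a[2],
// a[1], a[0], may be re-marked as StridedVectorize (stride -1) when TTI reports
// the strided load as cheaper than a wide load plus a reverse shuffle.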
7819void BoUpSLP::transformNodes() {
7820 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7821 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7822 TreeEntry &E = *TE.get();
7823 switch (E.getOpcode()) {
7824 case Instruction::Load: {
7825 Type *ScalarTy = E.getMainOp()->getType();
7826 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: E.Scalars.size());
7827 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E.Scalars);
7828 // Check if profitable to represent consecutive load + reverse as strided
7829 // load with stride -1.
7830 if (isReverseOrder(Order: E.ReorderIndices) &&
7831 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
7832 SmallVector<int> Mask;
7833 inversePermutation(Indices: E.ReorderIndices, Mask);
7834 auto *BaseLI = cast<LoadInst>(Val: E.Scalars.back());
7835 InstructionCost OriginalVecCost =
7836 TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: BaseLI->getAlign(),
7837 AddressSpace: BaseLI->getPointerAddressSpace(), CostKind,
7838 OpdInfo: TTI::OperandValueInfo()) +
7839 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
7840 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
7841 Opcode: Instruction::Load, DataTy: VecTy, Ptr: BaseLI->getPointerOperand(),
7842 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: BaseLI);
7843 if (StridedCost < OriginalVecCost)
7844 // Strided load is more profitable than consecutive load + reverse -
7845 // transform the node to strided load.
7846 E.State = TreeEntry::StridedVectorize;
7847 }
7848 break;
7849 }
7850 default:
7851 break;
7852 }
7853 }
7854}
7855
7856/// Merges shuffle masks and emits the final shuffle instruction, if required.
7857/// It supports shuffling of 2 input vectors. It implements lazy shuffle
7858/// emission: the actual shuffle instruction is generated only if it is
7859/// actually required. Otherwise, the shuffle instruction emission is delayed
7860/// till the end of the process, to reduce the number of emitted instructions
7861/// and to simplify further analysis/transformations.
7862class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7863 bool IsFinalized = false;
7864 SmallVector<int> CommonMask;
7865 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
7866 const TargetTransformInfo &TTI;
7867 InstructionCost Cost = 0;
7868 SmallDenseSet<Value *> VectorizedVals;
7869 BoUpSLP &R;
7870 SmallPtrSetImpl<Value *> &CheckedExtracts;
7871 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7872  /// While set, we are still trying to estimate the cost for the same nodes and
7873  /// can delay the actual cost estimation (virtual shuffle instruction emission).
7874  /// This may help better estimate the cost if the same nodes must be permuted,
7875  /// and allows moving most of the long-shuffle cost estimation to TTI.
7876 bool SameNodesEstimated = true;
7877
7878 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
7879 if (Ty->getScalarType()->isPointerTy()) {
7880 Constant *Res = ConstantExpr::getIntToPtr(
7881 C: ConstantInt::getAllOnesValue(
7882 Ty: IntegerType::get(C&: Ty->getContext(),
7883 NumBits: DL.getTypeStoreSizeInBits(Ty: Ty->getScalarType()))),
7884 Ty: Ty->getScalarType());
7885 if (auto *VTy = dyn_cast<VectorType>(Val: Ty))
7886 Res = ConstantVector::getSplat(EC: VTy->getElementCount(), Elt: Res);
7887 return Res;
7888 }
7889 return Constant::getAllOnesValue(Ty);
7890 }
7891
7892 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
7893 if ((!Root && allConstant(VL)) || all_of(Range&: VL, P: IsaPred<UndefValue>))
7894 return TTI::TCC_Free;
7895 auto *VecTy = FixedVectorType::get(ElementType: VL.front()->getType(), NumElts: VL.size());
7896 InstructionCost GatherCost = 0;
7897 SmallVector<Value *> Gathers(VL.begin(), VL.end());
7898 // Improve gather cost for gather of loads, if we can group some of the
7899 // loads into vector loads.
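    // Illustrative example (hypothetical input): for a gather of 8 loads where
    // the first 4 are consecutive, that slice may be costed as one vector load
    // (plus its GEP cost delta and, if needed, a subvector insert), while the
    // remaining scalars keep the per-element gather cost; the final result is
    // capped by the plain gather cost (BaseCost) below.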
7900 InstructionsState S = getSameOpcode(VL, TLI: *R.TLI);
7901 const unsigned Sz = R.DL->getTypeSizeInBits(Ty: VL.front()->getType());
7902 unsigned MinVF = R.getMinVF(Sz: 2 * Sz);
7903 if (VL.size() > 2 &&
7904 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
7905 (InVectors.empty() &&
7906 any_of(Range: seq<unsigned>(Begin: 0, End: VL.size() / MinVF),
7907 P: [&](unsigned Idx) {
7908 ArrayRef<Value *> SubVL = VL.slice(N: Idx * MinVF, M: MinVF);
7909 InstructionsState S = getSameOpcode(VL: SubVL, TLI: *R.TLI);
7910 return S.getOpcode() == Instruction::Load &&
7911 !S.isAltShuffle();
7912 }))) &&
7913 !all_of(Range&: Gathers, P: [&](Value *V) { return R.getTreeEntry(V); }) &&
7914 !isSplat(VL: Gathers)) {
7915 InstructionCost BaseCost = R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root);
7916 SetVector<Value *> VectorizedLoads;
7917 SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
7918 SmallVector<unsigned> ScatterVectorized;
7919 unsigned StartIdx = 0;
7920 unsigned VF = VL.size() / 2;
7921 for (; VF >= MinVF; VF /= 2) {
7922 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
7923 Cnt += VF) {
7924 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
7925 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
7926 InstructionsState SliceS = getSameOpcode(VL: Slice, TLI: *R.TLI);
7927 if (SliceS.getOpcode() != Instruction::Load ||
7928 SliceS.isAltShuffle())
7929 continue;
7930 }
7931 if (!VectorizedLoads.count(key: Slice.front()) &&
7932 !VectorizedLoads.count(key: Slice.back()) && allSameBlock(VL: Slice)) {
7933 SmallVector<Value *> PointerOps;
7934 OrdersType CurrentOrder;
7935 LoadsState LS = R.canVectorizeLoads(VL: Slice, VL0: Slice.front(),
7936 Order&: CurrentOrder, PointerOps);
7937 switch (LS) {
7938 case LoadsState::Vectorize:
7939 case LoadsState::ScatterVectorize:
7940 case LoadsState::StridedVectorize:
7941 // Mark the vectorized loads so that we don't vectorize them
7942 // again.
7943 // TODO: better handling of loads with reorders.
7944 if (((LS == LoadsState::Vectorize ||
7945 LS == LoadsState::StridedVectorize) &&
7946 CurrentOrder.empty()) ||
7947 (LS == LoadsState::StridedVectorize &&
7948 isReverseOrder(Order: CurrentOrder)))
7949 VectorizedStarts.emplace_back(Args&: Cnt, Args&: LS);
7950 else
7951 ScatterVectorized.push_back(Elt: Cnt);
7952 VectorizedLoads.insert(Start: Slice.begin(), End: Slice.end());
7953 // If we vectorized initial block, no need to try to vectorize
7954 // it again.
7955 if (Cnt == StartIdx)
7956 StartIdx += VF;
7957 break;
7958 case LoadsState::Gather:
7959 break;
7960 }
7961 }
7962 }
7963 // Check if the whole array was vectorized already - exit.
7964 if (StartIdx >= VL.size())
7965 break;
7966 // Found vectorizable parts - exit.
7967 if (!VectorizedLoads.empty())
7968 break;
7969 }
7970 if (!VectorizedLoads.empty()) {
7971 unsigned NumParts = TTI.getNumberOfParts(Tp: VecTy);
7972 bool NeedInsertSubvectorAnalysis =
7973 !NumParts || (VL.size() / VF) > NumParts;
7974 // Get the cost for gathered loads.
7975 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
7976 if (VectorizedLoads.contains(key: VL[I]))
7977 continue;
7978 GatherCost +=
7979 getBuildVectorCost(VL: VL.slice(N: I, M: std::min(a: End - I, b: VF)), Root);
7980 }
7981 // Exclude potentially vectorized loads from list of gathered
7982 // scalars.
7983 Gathers.assign(NumElts: Gathers.size(), Elt: PoisonValue::get(T: VL.front()->getType()));
7984 // The cost for vectorized loads.
7985 InstructionCost ScalarsCost = 0;
7986 for (Value *V : VectorizedLoads) {
7987 auto *LI = cast<LoadInst>(Val: V);
7988 ScalarsCost +=
7989 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LI->getType(),
7990 Alignment: LI->getAlign(), AddressSpace: LI->getPointerAddressSpace(),
7991 CostKind, OpdInfo: TTI::OperandValueInfo(), I: LI);
7992 }
7993 auto *LoadTy = FixedVectorType::get(ElementType: VL.front()->getType(), NumElts: VF);
7994 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
7995 auto *LI = cast<LoadInst>(Val: VL[P.first]);
7996 Align Alignment = LI->getAlign();
7997 GatherCost +=
7998 P.second == LoadsState::Vectorize
7999 ? TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadTy, Alignment,
8000 AddressSpace: LI->getPointerAddressSpace(), CostKind,
8001 OpdInfo: TTI::OperandValueInfo(), I: LI)
8002 : TTI.getStridedMemoryOpCost(
8003 Opcode: Instruction::Load, DataTy: LoadTy, Ptr: LI->getPointerOperand(),
8004 /*VariableMask=*/false, Alignment, CostKind, I: LI);
8005 // Estimate GEP cost.
8006 SmallVector<Value *> PointerOps(VF);
8007 for (auto [I, V] : enumerate(First: VL.slice(N: P.first, M: VF)))
8008 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
8009 auto [ScalarGEPCost, VectorGEPCost] =
8010 getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: LI->getPointerOperand(),
8011 Opcode: Instruction::Load, CostKind, ScalarTy: LI->getType(), VecTy: LoadTy);
8012 GatherCost += VectorGEPCost - ScalarGEPCost;
8013 }
8014 for (unsigned P : ScatterVectorized) {
8015 auto *LI0 = cast<LoadInst>(Val: VL[P]);
8016 ArrayRef<Value *> Slice = VL.slice(N: P, M: VF);
8017 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: Slice);
8018 GatherCost += TTI.getGatherScatterOpCost(
8019 Opcode: Instruction::Load, DataTy: LoadTy, Ptr: LI0->getPointerOperand(),
8020 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: LI0);
8021 // Estimate GEP cost.
8022 SmallVector<Value *> PointerOps(VF);
8023 for (auto [I, V] : enumerate(First&: Slice))
8024 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
8025 OrdersType Order;
8026 if (sortPtrAccesses(VL: PointerOps, ElemTy: LI0->getType(), DL: *R.DL, SE&: *R.SE,
8027 SortedIndices&: Order)) {
8028 // TODO: improve checks if GEPs can be vectorized.
8029 Value *Ptr0 = PointerOps.front();
8030 Type *ScalarTy = Ptr0->getType();
8031 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VF);
8032 auto [ScalarGEPCost, VectorGEPCost] =
8033 getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: Ptr0, Opcode: Instruction::GetElementPtr,
8034 CostKind, ScalarTy, VecTy);
8035 GatherCost += VectorGEPCost - ScalarGEPCost;
8036 if (!Order.empty()) {
8037 SmallVector<int> Mask;
8038 inversePermutation(Indices: Order, Mask);
8039 GatherCost += ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc,
8040 Tp: VecTy, Mask, CostKind);
8041 }
8042 } else {
8043 GatherCost += R.getGatherCost(VL: PointerOps, /*ForPoisonSrc=*/true);
8044 }
8045 }
8046 if (NeedInsertSubvectorAnalysis) {
8047 // Add the cost for the subvectors insert.
8048 SmallVector<int> ShuffleMask(VL.size());
8049 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8050 for (unsigned Idx : seq<unsigned>(Begin: 0, End: E))
8051 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8052 GatherCost += TTI.getShuffleCost(Kind: TTI::SK_InsertSubvector, Tp: VecTy,
8053 Mask: ShuffleMask, CostKind, Index: I, SubTp: LoadTy);
8054 }
8055 }
8056 GatherCost -= ScalarsCost;
8057 }
8058 GatherCost = std::min(a: BaseCost, b: GatherCost);
8059 } else if (!Root && isSplat(VL)) {
8060      // Found a broadcast of a single scalar; calculate the cost as the cost
8061      // of a broadcast.
8062 const auto *It = find_if_not(Range&: VL, P: IsaPred<UndefValue>);
8063 assert(It != VL.end() && "Expected at least one non-undef value.");
8064 // Add broadcast for non-identity shuffle only.
8065 bool NeedShuffle =
8066 count(Range&: VL, Element: *It) > 1 &&
8067 (VL.front() != *It || !all_of(Range: VL.drop_front(), P: IsaPred<UndefValue>));
8068 if (!NeedShuffle)
8069 return TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy,
8070 CostKind, Index: std::distance(first: VL.begin(), last: It),
8071 Op0: PoisonValue::get(T: VecTy), Op1: *It);
8072
8073 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8074 transform(Range&: VL, d_first: ShuffleMask.begin(), F: [](Value *V) {
8075 return isa<PoisonValue>(Val: V) ? PoisonMaskElem : 0;
8076 });
8077 InstructionCost InsertCost = TTI.getVectorInstrCost(
8078 Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: 0,
8079 Op0: PoisonValue::get(T: VecTy), Op1: *It);
8080 return InsertCost +
8081 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, Tp: VecTy,
8082 Mask: ShuffleMask, CostKind, /*Index=*/0,
8083 /*SubTp=*/nullptr, /*Args=*/*It);
8084 }
8085 return GatherCost +
8086 (all_of(Range&: Gathers, P: IsaPred<UndefValue>)
8087 ? TTI::TCC_Free
8088 : R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root && VL.equals(RHS: Gathers)));
8089 };
8090
8091 /// Compute the cost of creating a vector containing the extracted values from
8092 /// \p VL.
8093 InstructionCost
8094 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8095 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8096 unsigned NumParts) {
8097 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8098 unsigned NumElts =
8099 std::accumulate(first: VL.begin(), last: VL.end(), init: 0, binary_op: [](unsigned Sz, Value *V) {
8100 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
8101 if (!EE)
8102 return Sz;
8103 auto *VecTy = dyn_cast<FixedVectorType>(Val: EE->getVectorOperandType());
8104 if (!VecTy)
8105 return Sz;
8106 return std::max(a: Sz, b: VecTy->getNumElements());
8107 });
8108 unsigned NumSrcRegs = TTI.getNumberOfParts(
8109 Tp: FixedVectorType::get(ElementType: VL.front()->getType(), NumElts));
8110 if (NumSrcRegs == 0)
8111 NumSrcRegs = 1;
8112 // FIXME: this must be moved to TTI for better estimation.
8113 unsigned EltsPerVector = PowerOf2Ceil(A: std::max(
8114 a: divideCeil(Numerator: VL.size(), Denominator: NumParts), b: divideCeil(Numerator: NumElts, Denominator: NumSrcRegs)));
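    // Illustrative arithmetic (hypothetical sizes): with VL.size() == 8,
    // NumParts == 2, NumElts == 8 and NumSrcRegs == 2, EltsPerVector is
    // PowerOf2Ceil(max(8 / 2, 8 / 2)) == 4, so the extracts below are analyzed
    // in 4-element blocks.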
8115 auto CheckPerRegistersShuffle =
8116 [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
8117 DenseSet<int> RegIndices;
8118      // Check whether we are permuting the same one or two input vectors.
8119 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8120 int FirstRegId = -1;
8121 for (int &I : Mask) {
8122 if (I == PoisonMaskElem)
8123 continue;
8124 int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
8125 if (FirstRegId < 0)
8126 FirstRegId = RegId;
8127 RegIndices.insert(V: RegId);
8128 if (RegIndices.size() > 2)
8129 return std::nullopt;
8130 if (RegIndices.size() == 2)
8131 ShuffleKind = TTI::SK_PermuteTwoSrc;
8132 I = (I % NumElts) % EltsPerVector +
8133 (RegId == FirstRegId ? 0 : EltsPerVector);
8134 }
8135 return ShuffleKind;
8136 };
8137 InstructionCost Cost = 0;
8138
8139 // Process extracts in blocks of EltsPerVector to check if the source vector
8140 // operand can be re-used directly. If not, add the cost of creating a
8141 // shuffle to extract the values into a vector register.
8142 for (unsigned Part = 0; Part < NumParts; ++Part) {
8143 if (!ShuffleKinds[Part])
8144 continue;
8145 ArrayRef<int> MaskSlice =
8146 Mask.slice(N: Part * EltsPerVector,
8147 M: (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
8148 ? Mask.size() % EltsPerVector
8149 : EltsPerVector);
8150 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8151 copy(Range&: MaskSlice, Out: SubMask.begin());
8152 std::optional<TTI::ShuffleKind> RegShuffleKind =
8153 CheckPerRegistersShuffle(SubMask);
8154 if (!RegShuffleKind) {
8155 Cost += ::getShuffleCost(
8156 TTI, Kind: *ShuffleKinds[Part],
8157 Tp: FixedVectorType::get(ElementType: VL.front()->getType(), NumElts), Mask: MaskSlice);
8158 continue;
8159 }
8160 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8161 !ShuffleVectorInst::isIdentityMask(Mask: SubMask, NumSrcElts: EltsPerVector)) {
8162 Cost += ::getShuffleCost(
8163 TTI, Kind: *RegShuffleKind,
8164 Tp: FixedVectorType::get(ElementType: VL.front()->getType(), NumElts: EltsPerVector),
8165 Mask: SubMask);
8166 }
8167 }
8168 return Cost;
8169 }
8170 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
8171 /// shuffle emission.
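  /// For example (illustrative): with \p CommonMask == <poison, poison, poison,
  /// poison> and \p Mask == <2, poison, 1, poison>, lanes 0 and 2 were covered
  /// by the just-emitted shuffle, so \p CommonMask becomes
  /// <0, poison, 2, poison>.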
8172 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8173 ArrayRef<int> Mask) {
8174 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8175 if (Mask[Idx] != PoisonMaskElem)
8176 CommonMask[Idx] = Idx;
8177 }
8178 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
8179 /// mask \p Mask, register number \p Part, that includes \p SliceSize
8180 /// elements.
8181 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8182 ArrayRef<int> Mask, unsigned Part,
8183 unsigned SliceSize) {
8184 if (SameNodesEstimated) {
8185      // Delay the cost estimation if the same nodes are being reshuffled.
8186      // If we have already requested the cost of reshuffling E1 and E2 before,
8187      // there is no need to estimate another cost with the sub-Mask; instead,
8188      // include this sub-Mask into the CommonMask to estimate it later and
8189      // avoid double cost estimation.
8190 if ((InVectors.size() == 2 &&
8191 InVectors.front().get<const TreeEntry *>() == &E1 &&
8192 InVectors.back().get<const TreeEntry *>() == E2) ||
8193 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8194 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
8195 [](int Idx) { return Idx == PoisonMaskElem; }) &&
8196 "Expected all poisoned elements.");
8197 ArrayRef<int> SubMask =
8198 ArrayRef(Mask).slice(N: Part * SliceSize, M: SliceSize);
8199 copy(Range&: SubMask, Out: std::next(x: CommonMask.begin(), n: SliceSize * Part));
8200 return;
8201 }
8202 // Found non-matching nodes - need to estimate the cost for the matched
8203 // and transform mask.
8204 Cost += createShuffle(P1: InVectors.front(),
8205 P2: InVectors.size() == 1 ? nullptr : InVectors.back(),
8206 Mask: CommonMask);
8207 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
8208 }
8209 SameNodesEstimated = false;
8210 if (!E2 && InVectors.size() == 1) {
8211 unsigned VF = E1.getVectorFactor();
8212 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8213 VF = std::max(a: VF,
8214 b: cast<FixedVectorType>(Val: V1->getType())->getNumElements());
8215 } else {
8216 const auto *E = InVectors.front().get<const TreeEntry *>();
8217 VF = std::max(a: VF, b: E->getVectorFactor());
8218 }
8219 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8220 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8221 CommonMask[Idx] = Mask[Idx] + VF;
8222 Cost += createShuffle(P1: InVectors.front(), P2: &E1, Mask: CommonMask);
8223 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
8224 } else {
8225 Cost += createShuffle(P1: &E1, P2: E2, Mask);
8226 transformMaskAfterShuffle(CommonMask, Mask);
8227 }
8228 }
8229
8230 class ShuffleCostBuilder {
8231 const TargetTransformInfo &TTI;
8232
8233 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8234 int Index = -1;
8235 return Mask.empty() ||
8236 (VF == Mask.size() &&
8237 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) ||
8238 (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
8239 Index == 0);
8240 }
8241
8242 public:
8243 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8244 ~ShuffleCostBuilder() = default;
8245 InstructionCost createShuffleVector(Value *V1, Value *,
8246 ArrayRef<int> Mask) const {
8247 // Empty mask or identity mask are free.
8248 unsigned VF =
8249 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
8250 if (isEmptyOrIdentity(Mask, VF))
8251 return TTI::TCC_Free;
8252 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
8253 Tp: cast<VectorType>(Val: V1->getType()), Mask);
8254 }
8255 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8256      // An empty mask or an identity mask is free.
8257 unsigned VF =
8258 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
8259 if (isEmptyOrIdentity(Mask, VF))
8260 return TTI::TCC_Free;
8261 return TTI.getShuffleCost(Kind: TTI::SK_PermuteSingleSrc,
8262 Tp: cast<VectorType>(Val: V1->getType()), Mask);
8263 }
8264 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8265 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8266 return TTI::TCC_Free;
8267 }
8268 void resizeToMatch(Value *&, Value *&) const {}
8269 };
8270
8271  /// Smart shuffle instruction emission: walks through the shuffle trees and
8272  /// tries to find the best matching vector for the actual shuffle
8273  /// instruction.
8274 InstructionCost
8275 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8276 const PointerUnion<Value *, const TreeEntry *> &P2,
8277 ArrayRef<int> Mask) {
8278 ShuffleCostBuilder Builder(TTI);
8279 SmallVector<int> CommonMask(Mask.begin(), Mask.end());
8280 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8281 unsigned CommonVF = Mask.size();
8282 if (!V1 && !V2 && !P2.isNull()) {
8283 // Shuffle 2 entry nodes.
8284 const TreeEntry *E = P1.get<const TreeEntry *>();
8285 unsigned VF = E->getVectorFactor();
8286 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8287 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
8288 assert(all_of(Mask,
8289 [=](int Idx) {
8290 return Idx < 2 * static_cast<int>(CommonVF);
8291 }) &&
8292 "All elements in mask must be less than 2 * CommonVF.");
8293 if (E->Scalars.size() == E2->Scalars.size()) {
8294 SmallVector<int> EMask = E->getCommonMask();
8295 SmallVector<int> E2Mask = E2->getCommonMask();
8296 if (!EMask.empty() || !E2Mask.empty()) {
8297 for (int &Idx : CommonMask) {
8298 if (Idx == PoisonMaskElem)
8299 continue;
8300 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8301 Idx = EMask[Idx];
8302 else if (Idx >= static_cast<int>(CommonVF))
8303 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8304 E->Scalars.size();
8305 }
8306 }
8307 CommonVF = E->Scalars.size();
8308 }
8309 V1 = Constant::getNullValue(
8310 Ty: FixedVectorType::get(ElementType: E->Scalars.front()->getType(), NumElts: CommonVF));
8311 V2 = getAllOnesValue(
8312 DL: *R.DL, Ty: FixedVectorType::get(ElementType: E->Scalars.front()->getType(), NumElts: CommonVF));
8313 } else if (!V1 && P2.isNull()) {
8314 // Shuffle single entry node.
8315 const TreeEntry *E = P1.get<const TreeEntry *>();
8316 unsigned VF = E->getVectorFactor();
8317 CommonVF = VF;
8318 assert(
8319 all_of(Mask,
8320 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8321 "All elements in mask must be less than CommonVF.");
8322 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8323 SmallVector<int> EMask = E->getCommonMask();
8324 assert(!EMask.empty() && "Expected non-empty common mask.");
8325 for (int &Idx : CommonMask) {
8326 if (Idx != PoisonMaskElem)
8327 Idx = EMask[Idx];
8328 }
8329 CommonVF = E->Scalars.size();
8330 }
8331 V1 = Constant::getNullValue(
8332 Ty: FixedVectorType::get(ElementType: E->Scalars.front()->getType(), NumElts: CommonVF));
8333 // Not identity/broadcast? Try to see if the original vector is better.
8334 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8335 CommonVF == CommonMask.size() &&
8336 any_of(Range: enumerate(First&: CommonMask),
8337 P: [](const auto &&P) {
8338 return P.value() != PoisonMaskElem &&
8339 static_cast<unsigned>(P.value()) != P.index();
8340 }) &&
8341 any_of(Range&: CommonMask,
8342 P: [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8343 SmallVector<int> ReorderMask;
8344 inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask);
8345 ::addMask(Mask&: CommonMask, SubMask: ReorderMask);
8346 }
8347 } else if (V1 && P2.isNull()) {
8348 // Shuffle single vector.
8349 CommonVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
8350 assert(
8351 all_of(Mask,
8352 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8353 "All elements in mask must be less than CommonVF.");
8354 } else if (V1 && !V2) {
8355 // Shuffle vector and tree node.
8356 unsigned VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
8357 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8358 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
8359 assert(all_of(Mask,
8360 [=](int Idx) {
8361 return Idx < 2 * static_cast<int>(CommonVF);
8362 }) &&
8363 "All elements in mask must be less than 2 * CommonVF.");
8364 if (E2->Scalars.size() == VF && VF != CommonVF) {
8365 SmallVector<int> E2Mask = E2->getCommonMask();
8366 assert(!E2Mask.empty() && "Expected non-empty common mask.");
8367 for (int &Idx : CommonMask) {
8368 if (Idx == PoisonMaskElem)
8369 continue;
8370 if (Idx >= static_cast<int>(CommonVF))
8371 Idx = E2Mask[Idx - CommonVF] + VF;
8372 }
8373 CommonVF = VF;
8374 }
8375 V1 = Constant::getNullValue(
8376 Ty: FixedVectorType::get(ElementType: E2->Scalars.front()->getType(), NumElts: CommonVF));
8377 V2 = getAllOnesValue(
8378 DL: *R.DL,
8379 Ty: FixedVectorType::get(ElementType: E2->Scalars.front()->getType(), NumElts: CommonVF));
8380 } else if (!V1 && V2) {
8381 // Shuffle vector and tree node.
8382 unsigned VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements();
8383 const TreeEntry *E1 = P1.get<const TreeEntry *>();
8384 CommonVF = std::max(a: VF, b: E1->getVectorFactor());
8385 assert(all_of(Mask,
8386 [=](int Idx) {
8387 return Idx < 2 * static_cast<int>(CommonVF);
8388 }) &&
8389 "All elements in mask must be less than 2 * CommonVF.");
8390 if (E1->Scalars.size() == VF && VF != CommonVF) {
8391 SmallVector<int> E1Mask = E1->getCommonMask();
8392 assert(!E1Mask.empty() && "Expected non-empty common mask.");
8393 for (int &Idx : CommonMask) {
8394 if (Idx == PoisonMaskElem)
8395 continue;
8396 if (Idx >= static_cast<int>(CommonVF))
8397 Idx = E1Mask[Idx - CommonVF] + VF;
8398 else
8399 Idx = E1Mask[Idx];
8400 }
8401 CommonVF = VF;
8402 }
8403 V1 = Constant::getNullValue(
8404 Ty: FixedVectorType::get(ElementType: E1->Scalars.front()->getType(), NumElts: CommonVF));
8405 V2 = getAllOnesValue(
8406 DL: *R.DL,
8407 Ty: FixedVectorType::get(ElementType: E1->Scalars.front()->getType(), NumElts: CommonVF));
8408 } else {
8409 assert(V1 && V2 && "Expected both vectors.");
8410 unsigned VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
8411 CommonVF =
8412 std::max(a: VF, b: cast<FixedVectorType>(Val: V2->getType())->getNumElements());
8413 assert(all_of(Mask,
8414 [=](int Idx) {
8415 return Idx < 2 * static_cast<int>(CommonVF);
8416 }) &&
8417 "All elements in mask must be less than 2 * CommonVF.");
8418 if (V1->getType() != V2->getType()) {
8419 V1 = Constant::getNullValue(Ty: FixedVectorType::get(
8420 ElementType: cast<FixedVectorType>(Val: V1->getType())->getElementType(), NumElts: CommonVF));
8421 V2 = getAllOnesValue(
8422 DL: *R.DL, Ty: FixedVectorType::get(
8423 ElementType: cast<FixedVectorType>(Val: V1->getType())->getElementType(),
8424 NumElts: CommonVF));
8425 }
8426 }
8427 InVectors.front() = Constant::getNullValue(Ty: FixedVectorType::get(
8428 ElementType: cast<FixedVectorType>(Val: V1->getType())->getElementType(),
8429 NumElts: CommonMask.size()));
8430 if (InVectors.size() == 2)
8431 InVectors.pop_back();
8432 return BaseShuffleAnalysis::createShuffle<InstructionCost>(
8433 V1, V2, Mask: CommonMask, Builder);
8434 }
8435
8436public:
8437 ShuffleCostEstimator(TargetTransformInfo &TTI,
8438 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8439 SmallPtrSetImpl<Value *> &CheckedExtracts)
8440 : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
8441 R(R), CheckedExtracts(CheckedExtracts) {}
8442 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8443 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8444 unsigned NumParts, bool &UseVecBaseAsInput) {
8445 UseVecBaseAsInput = false;
8446 if (Mask.empty())
8447 return nullptr;
8448 Value *VecBase = nullptr;
8449 ArrayRef<Value *> VL = E->Scalars;
8450 // If the resulting type is scalarized, do not adjust the cost.
8451 if (NumParts == VL.size())
8452 return nullptr;
8453    // Check if the extracts can be considered reused, i.e. whether the same
8454    // extractelements were already vectorized in a previous node.
8455 bool PrevNodeFound = any_of(
8456 Range: ArrayRef(R.VectorizableTree).take_front(N: E->Idx),
8457 P: [&](const std::unique_ptr<TreeEntry> &TE) {
8458 return ((!TE->isAltShuffle() &&
8459 TE->getOpcode() == Instruction::ExtractElement) ||
8460 TE->State == TreeEntry::NeedToGather) &&
8461 all_of(Range: enumerate(First&: TE->Scalars), P: [&](auto &&Data) {
8462 return VL.size() > Data.index() &&
8463 (Mask[Data.index()] == PoisonMaskElem ||
8464 isa<UndefValue>(VL[Data.index()]) ||
8465 Data.value() == VL[Data.index()]);
8466 });
8467 });
8468 SmallPtrSet<Value *, 4> UniqueBases;
8469 unsigned SliceSize = VL.size() / NumParts;
8470 for (unsigned Part = 0; Part < NumParts; ++Part) {
8471 ArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: SliceSize);
8472 for (auto [I, V] : enumerate(First: VL.slice(N: Part * SliceSize, M: SliceSize))) {
8473 // Ignore non-extractelement scalars.
8474 if (isa<UndefValue>(Val: V) ||
8475 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8476 continue;
8477        // If all users of the instruction are going to be vectorized and the
8478        // instruction itself is not going to be vectorized, consider the
8479        // instruction dead and remove its cost from the final cost of the
8480        // vectorized tree.
8481        // Also, avoid adjusting the cost for extractelements with multiple uses
8482        // in different graph entries.
8483 auto *EE = cast<ExtractElementInst>(Val: V);
8484 VecBase = EE->getVectorOperand();
8485 UniqueBases.insert(Ptr: VecBase);
8486 const TreeEntry *VE = R.getTreeEntry(V);
8487 if (!CheckedExtracts.insert(Ptr: V).second ||
8488 !R.areAllUsersVectorized(I: cast<Instruction>(Val: V), VectorizedVals: &VectorizedVals) ||
8489 (VE && VE != E))
8490 continue;
8491 std::optional<unsigned> EEIdx = getExtractIndex(E: EE);
8492 if (!EEIdx)
8493 continue;
8494 unsigned Idx = *EEIdx;
8495 // Take credit for instruction that will become dead.
8496 if (EE->hasOneUse() || !PrevNodeFound) {
8497 Instruction *Ext = EE->user_back();
8498 if (isa<SExtInst, ZExtInst>(Val: Ext) &&
8499 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
8500 // Use getExtractWithExtendCost() to calculate the cost of
8501 // extractelement/ext pair.
8502 Cost -=
8503 TTI.getExtractWithExtendCost(Opcode: Ext->getOpcode(), Dst: Ext->getType(),
8504 VecTy: EE->getVectorOperandType(), Index: Idx);
8505 // Add back the cost of s|zext which is subtracted separately.
8506 Cost += TTI.getCastInstrCost(
8507 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: EE->getType(),
8508 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
8509 continue;
8510 }
8511 }
8512 Cost -= TTI.getVectorInstrCost(I: *EE, Val: EE->getVectorOperandType(),
8513 CostKind, Index: Idx);
8514 }
8515 }
8516    // Check that the gather of extractelements can be represented as just a
8517    // shuffle of one or two vectors from which the scalars are extracted.
8518    // We found a bunch of extractelement instructions that must be gathered
8519    // into a vector and can be represented as a permutation of elements from
8520    // a single input vector or from 2 input vectors.
8521    // The cost is skipped if the same extractelements were vectorized already.
8522 if (!PrevNodeFound)
8523 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8524 InVectors.assign(NumElts: 1, Elt: E);
8525 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
8526 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
8527 SameNodesEstimated = false;
8528 if (NumParts != 1 && UniqueBases.size() != 1) {
8529 UseVecBaseAsInput = true;
8530 VecBase = Constant::getNullValue(
8531 Ty: FixedVectorType::get(ElementType: VL.front()->getType(), NumElts: CommonMask.size()));
8532 }
8533 return VecBase;
8534 }
8535 /// Checks if the specified entry \p E needs to be delayed because of its
8536 /// dependency nodes.
8537 std::optional<InstructionCost>
8538 needToDelay(const TreeEntry *,
8539 ArrayRef<SmallVector<const TreeEntry *>>) const {
8540 // No need to delay the cost estimation during analysis.
8541 return std::nullopt;
8542 }
8543 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
8544 if (&E1 == &E2) {
8545 assert(all_of(Mask,
8546 [&](int Idx) {
8547 return Idx < static_cast<int>(E1.getVectorFactor());
8548 }) &&
8549 "Expected single vector shuffle mask.");
8550 add(E1, Mask);
8551 return;
8552 }
8553 if (InVectors.empty()) {
8554 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
8555 InVectors.assign(IL: {&E1, &E2});
8556 return;
8557 }
8558 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8559 auto *MaskVecTy =
8560 FixedVectorType::get(ElementType: E1.Scalars.front()->getType(), NumElts: Mask.size());
8561 unsigned NumParts = TTI.getNumberOfParts(Tp: MaskVecTy);
8562 if (NumParts == 0 || NumParts >= Mask.size())
8563 NumParts = 1;
8564 unsigned SliceSize = Mask.size() / NumParts;
8565 const auto *It =
8566 find_if(Range&: Mask, P: [](int Idx) { return Idx != PoisonMaskElem; });
8567 unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
8568 estimateNodesPermuteCost(E1, E2: &E2, Mask, Part, SliceSize);
8569 }
8570 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
8571 if (InVectors.empty()) {
8572 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
8573 InVectors.assign(NumElts: 1, Elt: &E1);
8574 return;
8575 }
8576 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8577 auto *MaskVecTy =
8578 FixedVectorType::get(ElementType: E1.Scalars.front()->getType(), NumElts: Mask.size());
8579 unsigned NumParts = TTI.getNumberOfParts(Tp: MaskVecTy);
8580 if (NumParts == 0 || NumParts >= Mask.size())
8581 NumParts = 1;
8582 unsigned SliceSize = Mask.size() / NumParts;
8583 const auto *It =
8584 find_if(Range&: Mask, P: [](int Idx) { return Idx != PoisonMaskElem; });
8585 unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
8586 estimateNodesPermuteCost(E1, E2: nullptr, Mask, Part, SliceSize);
8587 if (!SameNodesEstimated && InVectors.size() == 1)
8588 InVectors.emplace_back(Args: &E1);
8589 }
8590 /// Adds 2 input vectors and the mask for their shuffling.
8591 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
8592    // This can only be reached when shuffling 2 vectors with extractelements,
8593    // which is already handled in adjustExtracts.
8594 assert(InVectors.size() == 1 &&
8595 all_of(enumerate(CommonMask),
8596 [&](auto P) {
8597 if (P.value() == PoisonMaskElem)
8598 return Mask[P.index()] == PoisonMaskElem;
8599 auto *EI =
8600 cast<ExtractElementInst>(InVectors.front()
8601 .get<const TreeEntry *>()
8602 ->Scalars[P.index()]);
8603 return EI->getVectorOperand() == V1 ||
8604 EI->getVectorOperand() == V2;
8605 }) &&
8606 "Expected extractelement vectors.");
8607 }
8608 /// Adds another one input vector and the mask for the shuffling.
8609 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
8610 if (InVectors.empty()) {
8611 assert(CommonMask.empty() && !ForExtracts &&
8612 "Expected empty input mask/vectors.");
8613 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
8614 InVectors.assign(NumElts: 1, Elt: V1);
8615 return;
8616 }
8617 if (ForExtracts) {
8618      // No need to add the vectors here; adjustExtracts already handled them.
8619 assert(InVectors.size() == 1 &&
8620 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
8621 all_of(enumerate(CommonMask),
8622 [&](auto P) {
8623 Value *Scalar = InVectors.front()
8624 .get<const TreeEntry *>()
8625 ->Scalars[P.index()];
8626 if (P.value() == PoisonMaskElem)
8627 return P.value() == Mask[P.index()] ||
8628 isa<UndefValue>(Scalar);
8629 if (isa<Constant>(V1))
8630 return true;
8631 auto *EI = cast<ExtractElementInst>(Scalar);
8632 return EI->getVectorOperand() == V1;
8633 }) &&
8634 "Expected only tree entry for extractelement vectors.");
8635 return;
8636 }
8637 assert(!InVectors.empty() && !CommonMask.empty() &&
8638 "Expected only tree entries from extracts/reused buildvectors.");
8639 unsigned VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
8640 if (InVectors.size() == 2) {
8641 Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
8642 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
8643 VF = std::max<unsigned>(a: VF, b: CommonMask.size());
8644 } else if (const auto *InTE =
8645 InVectors.front().dyn_cast<const TreeEntry *>()) {
8646 VF = std::max(a: VF, b: InTE->getVectorFactor());
8647 } else {
8648 VF = std::max(
8649 a: VF, b: cast<FixedVectorType>(Val: InVectors.front().get<Value *>()->getType())
8650 ->getNumElements());
8651 }
8652 InVectors.push_back(Elt: V1);
8653 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8654 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8655 CommonMask[Idx] = Mask[Idx] + VF;
8656 }
8657 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
8658 Value *Root = nullptr) {
8659 Cost += getBuildVectorCost(VL, Root);
8660 if (!Root) {
8661 // FIXME: Need to find a way to avoid use of getNullValue here.
8662 SmallVector<Constant *> Vals;
8663 unsigned VF = VL.size();
8664 if (MaskVF != 0)
8665 VF = std::min(a: VF, b: MaskVF);
8666 for (Value *V : VL.take_front(N: VF)) {
8667 if (isa<UndefValue>(Val: V)) {
8668 Vals.push_back(Elt: cast<Constant>(Val: V));
8669 continue;
8670 }
8671 Vals.push_back(Elt: Constant::getNullValue(Ty: V->getType()));
8672 }
8673 return ConstantVector::get(V: Vals);
8674 }
8675 return ConstantVector::getSplat(
8676 EC: ElementCount::getFixed(
8677 MinVal: cast<FixedVectorType>(Val: Root->getType())->getNumElements()),
8678 Elt: getAllOnesValue(DL: *R.DL, Ty: VL.front()->getType()));
8679 }
8680 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
8681 /// Finalize emission of the shuffles.
8682 InstructionCost
8683 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
8684 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
8685 IsFinalized = true;
8686 if (Action) {
8687 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
8688 if (InVectors.size() == 2)
8689 Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
8690 else
8691 Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
8692 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8693 if (CommonMask[Idx] != PoisonMaskElem)
8694 CommonMask[Idx] = Idx;
8695 assert(VF > 0 &&
8696 "Expected vector length for the final value before action.");
8697 Value *V = Vec.get<Value *>();
8698 Action(V, CommonMask);
8699 InVectors.front() = V;
8700 }
8701 ::addMask(Mask&: CommonMask, SubMask: ExtMask, /*ExtendingManyInputs=*/true);
8702 if (CommonMask.empty()) {
8703 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
8704 return Cost;
8705 }
8706 return Cost +
8707 createShuffle(P1: InVectors.front(),
8708 P2: InVectors.size() == 2 ? InVectors.back() : nullptr,
8709 Mask: CommonMask);
8710 }
8711
8712 ~ShuffleCostEstimator() {
8713 assert((IsFinalized || CommonMask.empty()) &&
8714 "Shuffle construction must be finalized.");
8715 }
8716};
8717
8718const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
8719 unsigned Idx) const {
8720 Value *Op = E->getOperand(OpIdx: Idx).front();
8721 if (const TreeEntry *TE = getTreeEntry(V: Op)) {
8722 if (find_if(Range: TE->UserTreeIndices, P: [&](const EdgeInfo &EI) {
8723 return EI.EdgeIdx == Idx && EI.UserTE == E;
8724 }) != TE->UserTreeIndices.end())
8725 return TE;
8726 auto MIt = MultiNodeScalars.find(Val: Op);
8727 if (MIt != MultiNodeScalars.end()) {
8728 for (const TreeEntry *TE : MIt->second) {
8729 if (find_if(Range: TE->UserTreeIndices, P: [&](const EdgeInfo &EI) {
8730 return EI.EdgeIdx == Idx && EI.UserTE == E;
8731 }) != TE->UserTreeIndices.end())
8732 return TE;
8733 }
8734 }
8735 }
8736 const auto *It =
8737 find_if(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
8738 return TE->State == TreeEntry::NeedToGather &&
8739 find_if(Range&: TE->UserTreeIndices, P: [&](const EdgeInfo &EI) {
8740 return EI.EdgeIdx == Idx && EI.UserTE == E;
8741 }) != TE->UserTreeIndices.end();
8742 });
8743 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
8744 return It->get();
8745}
8746
8747TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
8748 if (TE.State == TreeEntry::ScatterVectorize ||
8749 TE.State == TreeEntry::StridedVectorize)
8750 return TTI::CastContextHint::GatherScatter;
8751 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
8752 !TE.isAltShuffle()) {
8753 if (TE.ReorderIndices.empty())
8754 return TTI::CastContextHint::Normal;
8755 SmallVector<int> Mask;
8756 inversePermutation(Indices: TE.ReorderIndices, Mask);
8757 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts: Mask.size()))
8758 return TTI::CastContextHint::Reversed;
8759 }
8760 return TTI::CastContextHint::None;
8761}
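// Illustrative example: a vectorized load node with ReorderIndices {3, 2, 1, 0}
// yields the inverse mask {3, 2, 1, 0}, which isReverseMask recognizes, so its
// user cast is costed with the Reversed hint; strided/scatter nodes always get
// the GatherScatter hint.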
8762
8763/// Builds the vector of argument types for the given call instruction with the
8764/// given intrinsic \p ID and the specified vector factor \p VF.
8765static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
8766 const Intrinsic::ID ID,
8767 const unsigned VF,
8768 unsigned MinBW) {
8769 SmallVector<Type *> ArgTys;
8770 for (auto [Idx, Arg] : enumerate(First: CI->args())) {
8771 if (ID != Intrinsic::not_intrinsic) {
8772 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx)) {
8773 ArgTys.push_back(Elt: Arg->getType());
8774 continue;
8775 }
8776 if (MinBW > 0) {
8777 ArgTys.push_back(Elt: FixedVectorType::get(
8778 ElementType: IntegerType::get(C&: CI->getContext(), NumBits: MinBW), NumElts: VF));
8779 continue;
8780 }
8781 }
8782 ArgTys.push_back(Elt: FixedVectorType::get(ElementType: Arg->getType(), NumElts: VF));
8783 }
8784 return ArgTys;
8785}
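// Illustrative example: for a vectorized llvm.ctlz call with VF == 4 and
// MinBW == 8, the data operand type becomes <4 x i8> while the i1
// is-zero-poison flag is kept scalar (it is reported as a scalar operand by
// isVectorIntrinsicWithScalarOpAtArg); with MinBW == 0 the original element
// type is simply widened to a <4 x Ty> vector.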
8786
8787InstructionCost
8788BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
8789 SmallPtrSetImpl<Value *> &CheckedExtracts) {
8790 ArrayRef<Value *> VL = E->Scalars;
8791
8792 Type *ScalarTy = VL[0]->getType();
8793 if (E->State != TreeEntry::NeedToGather) {
8794 if (auto *SI = dyn_cast<StoreInst>(Val: VL[0]))
8795 ScalarTy = SI->getValueOperand()->getType();
8796 else if (auto *CI = dyn_cast<CmpInst>(Val: VL[0]))
8797 ScalarTy = CI->getOperand(i_nocapture: 0)->getType();
8798 else if (auto *IE = dyn_cast<InsertElementInst>(Val: VL[0]))
8799 ScalarTy = IE->getOperand(i_nocapture: 1)->getType();
8800 }
8801 if (!isValidElementType(Ty: ScalarTy))
8802 return InstructionCost::getInvalid();
8803 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VL.size());
8804 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8805
8806 // If we have computed a smaller type for the expression, update VecTy so
8807 // that the costs will be accurate.
8808 auto It = MinBWs.find(Val: E);
8809 Type *OrigScalarTy = ScalarTy;
8810 if (It != MinBWs.end()) {
8811 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
8812 VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VL.size());
8813 }
8814 unsigned EntryVF = E->getVectorFactor();
8815 auto *FinalVecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: EntryVF);
8816
8817 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
8818 if (E->State == TreeEntry::NeedToGather) {
8819 if (allConstant(VL))
8820 return 0;
8821 if (isa<InsertElementInst>(Val: VL[0]))
8822 return InstructionCost::getInvalid();
8823 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
8824 E, Params&: *TTI, Params&: VectorizedVals, Params&: *this, Params&: CheckedExtracts);
8825 }
8826 InstructionCost CommonCost = 0;
8827 SmallVector<int> Mask;
8828 bool IsReverseOrder = isReverseOrder(Order: E->ReorderIndices);
8829 if (!E->ReorderIndices.empty() &&
8830 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
8831 SmallVector<int> NewMask;
8832 if (E->getOpcode() == Instruction::Store) {
8833 // For stores the order is actually a mask.
8834 NewMask.resize(N: E->ReorderIndices.size());
8835 copy(Range: E->ReorderIndices, Out: NewMask.begin());
8836 } else {
8837 inversePermutation(Indices: E->ReorderIndices, Mask&: NewMask);
8838 }
8839 ::addMask(Mask, SubMask: NewMask);
8840 }
8841 if (NeedToShuffleReuses)
8842 ::addMask(Mask, SubMask: E->ReuseShuffleIndices);
8843 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
8844 CommonCost =
8845 TTI->getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask);
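  // For example, reorder indices {1, 0, 3, 2} invert to the mask {1, 0, 3, 2},
  // which is not an identity, so a single-source permute of FinalVecTy is
  // charged here as the common cost.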
8846 assert((E->State == TreeEntry::Vectorize ||
8847 E->State == TreeEntry::ScatterVectorize ||
8848 E->State == TreeEntry::StridedVectorize) &&
8849 "Unhandled state");
8850 assert(E->getOpcode() &&
8851 ((allSameType(VL) && allSameBlock(VL)) ||
8852 (E->getOpcode() == Instruction::GetElementPtr &&
8853 E->getMainOp()->getType()->isPointerTy())) &&
8854 "Invalid VL");
8855 Instruction *VL0 = E->getMainOp();
8856 unsigned ShuffleOrOp =
8857 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
8858 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
8859 const unsigned Sz = UniqueValues.size();
8860 SmallBitVector UsedScalars(Sz, false);
8861 for (unsigned I = 0; I < Sz; ++I) {
8862 if (getTreeEntry(V: UniqueValues[I]) == E)
8863 continue;
8864 UsedScalars.set(I);
8865 }
8866 auto GetCastContextHint = [&](Value *V) {
8867 if (const TreeEntry *OpTE = getTreeEntry(V))
8868 return getCastContextHint(TE: *OpTE);
8869 InstructionsState SrcState = getSameOpcode(VL: E->getOperand(OpIdx: 0), TLI: *TLI);
8870 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
8871 return TTI::CastContextHint::GatherScatter;
8872 return TTI::CastContextHint::None;
8873 };
8874 auto GetCostDiff =
8875 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
8876 function_ref<InstructionCost(InstructionCost)> VectorCost) {
8877 // Calculate the cost of this instruction.
8878 InstructionCost ScalarCost = 0;
8879 if (isa<CastInst, CmpInst, SelectInst, CallInst>(Val: VL0)) {
8880          // For some instructions there is no need to calculate the cost of
8881          // each particular one; we can use the cost of a single instruction
8882          // multiplied by the total number of scalar instructions.
8883 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
8884 } else {
8885 for (unsigned I = 0; I < Sz; ++I) {
8886 if (UsedScalars.test(Idx: I))
8887 continue;
8888 ScalarCost += ScalarEltCost(I);
8889 }
8890 }
8891
8892 InstructionCost VecCost = VectorCost(CommonCost);
8893        // Check if the current node must be resized when the parent node is
8894        // not resized.
8895 if (!UnaryInstruction::isCast(Opcode: E->getOpcode()) && E->Idx != 0) {
8896 const EdgeInfo &EI = E->UserTreeIndices.front();
8897 if ((EI.UserTE->getOpcode() != Instruction::Select ||
8898 EI.EdgeIdx != 0) &&
8899 It != MinBWs.end()) {
8900 auto UserBWIt = MinBWs.find(Val: EI.UserTE);
8901 Type *UserScalarTy =
8902 EI.UserTE->getOperand(OpIdx: EI.EdgeIdx).front()->getType();
8903 if (UserBWIt != MinBWs.end())
8904 UserScalarTy = IntegerType::get(C&: ScalarTy->getContext(),
8905 NumBits: UserBWIt->second.first);
8906 if (ScalarTy != UserScalarTy) {
8907 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
8908 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: UserScalarTy);
8909 unsigned VecOpcode;
8910 auto *UserVecTy =
8911 FixedVectorType::get(ElementType: UserScalarTy, NumElts: E->getVectorFactor());
8912 if (BWSz > SrcBWSz)
8913 VecOpcode = Instruction::Trunc;
8914 else
8915 VecOpcode =
8916 It->second.second ? Instruction::SExt : Instruction::ZExt;
8917 TTI::CastContextHint CCH = GetCastContextHint(VL0);
8918 VecCost += TTI->getCastInstrCost(Opcode: VecOpcode, Dst: UserVecTy, Src: VecTy, CCH,
8919 CostKind);
8920 }
8921 }
8922 }
8923 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
8924 ScalarCost, "Calculated costs for Tree"));
8925 return VecCost - ScalarCost;
8926 };
8927 // Calculate cost difference from vectorizing set of GEPs.
8928 // Negative value means vectorizing is profitable.
8929 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
8930 assert((E->State == TreeEntry::Vectorize ||
8931 E->State == TreeEntry::StridedVectorize) &&
8932 "Entry state expected to be Vectorize or StridedVectorize here.");
8933 InstructionCost ScalarCost = 0;
8934 InstructionCost VecCost = 0;
8935 std::tie(args&: ScalarCost, args&: VecCost) = getGEPCosts(
8936 TTI: *TTI, Ptrs, BasePtr, Opcode: E->getOpcode(), CostKind, ScalarTy: OrigScalarTy, VecTy);
8937 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
8938 "Calculated GEPs cost for Tree"));
8939
8940 return VecCost - ScalarCost;
8941 };
8942
8943 switch (ShuffleOrOp) {
8944 case Instruction::PHI: {
8945 // Count reused scalars.
8946 InstructionCost ScalarCost = 0;
8947 SmallPtrSet<const TreeEntry *, 4> CountedOps;
8948 for (Value *V : UniqueValues) {
8949 auto *PHI = dyn_cast<PHINode>(Val: V);
8950 if (!PHI)
8951 continue;
8952
8953 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
8954 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
8955 Value *Op = PHI->getIncomingValue(i: I);
8956 Operands[I] = Op;
8957 }
8958 if (const TreeEntry *OpTE = getTreeEntry(V: Operands.front()))
8959 if (OpTE->isSame(VL: Operands) && CountedOps.insert(Ptr: OpTE).second)
8960 if (!OpTE->ReuseShuffleIndices.empty())
8961 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
8962 OpTE->Scalars.size());
8963 }
8964
8965 return CommonCost - ScalarCost;
8966 }
8967 case Instruction::ExtractValue:
8968 case Instruction::ExtractElement: {
8969 auto GetScalarCost = [&](unsigned Idx) {
8970 auto *I = cast<Instruction>(Val: UniqueValues[Idx]);
8971 VectorType *SrcVecTy;
8972 if (ShuffleOrOp == Instruction::ExtractElement) {
8973 auto *EE = cast<ExtractElementInst>(Val: I);
8974 SrcVecTy = EE->getVectorOperandType();
8975 } else {
8976 auto *EV = cast<ExtractValueInst>(Val: I);
8977 Type *AggregateTy = EV->getAggregateOperand()->getType();
8978 unsigned NumElts;
8979 if (auto *ATy = dyn_cast<ArrayType>(Val: AggregateTy))
8980 NumElts = ATy->getNumElements();
8981 else
8982 NumElts = AggregateTy->getStructNumElements();
8983 SrcVecTy = FixedVectorType::get(ElementType: OrigScalarTy, NumElts);
8984 }
8985 if (I->hasOneUse()) {
8986 Instruction *Ext = I->user_back();
8987 if ((isa<SExtInst>(Val: Ext) || isa<ZExtInst>(Val: Ext)) &&
8988 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
8989 // Use getExtractWithExtendCost() to calculate the cost of
8990 // extractelement/ext pair.
8991 InstructionCost Cost = TTI->getExtractWithExtendCost(
8992 Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: SrcVecTy, Index: *getExtractIndex(E: I));
8993 // Subtract the cost of s|zext which is subtracted separately.
8994 Cost -= TTI->getCastInstrCost(
8995 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: I->getType(),
8996 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
8997 return Cost;
8998 }
8999 }
9000 return TTI->getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: SrcVecTy,
9001 CostKind, Index: *getExtractIndex(E: I));
9002 };
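    // Illustrative example: for an extractelement whose only user is a zext
    // feeding GEPs, the extract+ext pair is costed in one query via
    // getExtractWithExtendCost, and the ext cost, which is accounted for
    // separately, is then subtracted.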
9003 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9004 return GetCostDiff(GetScalarCost, GetVectorCost);
9005 }
9006 case Instruction::InsertElement: {
9007 assert(E->ReuseShuffleIndices.empty() &&
9008 "Unique insertelements only are expected.");
9009 auto *SrcVecTy = cast<FixedVectorType>(Val: VL0->getType());
9010 unsigned const NumElts = SrcVecTy->getNumElements();
9011 unsigned const NumScalars = VL.size();
9012
9013 unsigned NumOfParts = TTI->getNumberOfParts(Tp: SrcVecTy);
9014
9015 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9016 unsigned OffsetBeg = *getInsertIndex(InsertInst: VL.front());
9017 unsigned OffsetEnd = OffsetBeg;
9018 InsertMask[OffsetBeg] = 0;
9019 for (auto [I, V] : enumerate(First: VL.drop_front())) {
9020 unsigned Idx = *getInsertIndex(InsertInst: V);
9021 if (OffsetBeg > Idx)
9022 OffsetBeg = Idx;
9023 else if (OffsetEnd < Idx)
9024 OffsetEnd = Idx;
9025 InsertMask[Idx] = I + 1;
9026 }
9027 unsigned VecScalarsSz = PowerOf2Ceil(A: NumElts);
9028 if (NumOfParts > 0)
9029 VecScalarsSz = PowerOf2Ceil(A: (NumElts + NumOfParts - 1) / NumOfParts);
9030 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9031 VecScalarsSz;
9032 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9033 unsigned InsertVecSz = std::min<unsigned>(
9034 a: PowerOf2Ceil(A: OffsetEnd - OffsetBeg + 1),
9035 b: ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9036 bool IsWholeSubvector =
9037 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
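    // Illustrative arithmetic: with NumElts == 8 and NumOfParts == 2,
    // VecScalarsSz == 4; inserts covering indices 2..5 give OffsetBeg == 2,
    // OffsetEnd == 5, VecSz == 8, Offset == 0, InsertVecSz == 4 and
    // IsWholeSubvector == false, and since 2 + 4 <= 8 no realignment of
    // OffsetBeg is required below.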
9038 // Check if we can safely insert a subvector. If it is not possible, just
9039 // generate a whole-sized vector and shuffle the source vector and the new
9040 // subvector.
9041 if (OffsetBeg + InsertVecSz > VecSz) {
9042 // Align OffsetBeg to generate correct mask.
9043 OffsetBeg = alignDown(Value: OffsetBeg, Align: VecSz, Skew: Offset);
9044 InsertVecSz = VecSz;
9045 }
9046
9047 APInt DemandedElts = APInt::getZero(numBits: NumElts);
9048 // TODO: Add support for Instruction::InsertValue.
9049 SmallVector<int> Mask;
9050 if (!E->ReorderIndices.empty()) {
9051 inversePermutation(Indices: E->ReorderIndices, Mask);
9052 Mask.append(NumInputs: InsertVecSz - Mask.size(), Elt: PoisonMaskElem);
9053 } else {
9054 Mask.assign(NumElts: VecSz, Elt: PoisonMaskElem);
9055 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: InsertVecSz), value: 0);
9056 }
9057 bool IsIdentity = true;
9058 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9059 Mask.swap(RHS&: PrevMask);
9060 for (unsigned I = 0; I < NumScalars; ++I) {
9061 unsigned InsertIdx = *getInsertIndex(InsertInst: VL[PrevMask[I]]);
9062 DemandedElts.setBit(InsertIdx);
9063 IsIdentity &= InsertIdx - OffsetBeg == I;
9064 Mask[InsertIdx - OffsetBeg] = I;
9065 }
9066 assert(Offset < NumElts && "Failed to find vector index offset");
9067
9068 InstructionCost Cost = 0;
9069 Cost -= TTI->getScalarizationOverhead(Ty: SrcVecTy, DemandedElts,
9070 /*Insert*/ true, /*Extract*/ false,
9071 CostKind);
9072
9073    // First cost - resize to the actual vector size if this is not an identity
9074    // shuffle or the vector needs to be shifted.
9075    // Do not calculate the cost if the actual size is the register size and
9076    // we can merge this shuffle with the following SK_Select.
9077 auto *InsertVecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: InsertVecSz);
9078 if (!IsIdentity)
9079 Cost += TTI->getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc,
9080 Tp: InsertVecTy, Mask);
9081 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: [E](Value *V) {
9082 return !is_contained(Range: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
9083 }));
9084    // Second cost - permutation with a subvector, if some elements come from
9085    // the initial vector, or the cost of inserting a subvector.
9086 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9087 // subvector of ActualVecTy.
9088 SmallBitVector InMask =
9089 isUndefVector(V: FirstInsert->getOperand(i: 0),
9090 UseMask: buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask));
9091 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9092 if (InsertVecSz != VecSz) {
9093 auto *ActualVecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VecSz);
9094 Cost += TTI->getShuffleCost(Kind: TTI::SK_InsertSubvector, Tp: ActualVecTy,
9095 Mask: std::nullopt, CostKind, Index: OffsetBeg - Offset,
9096 SubTp: InsertVecTy);
9097 } else {
9098 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9099 Mask[I] = InMask.test(Idx: I) ? PoisonMaskElem : I;
9100 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9101 I <= End; ++I)
9102 if (Mask[I] != PoisonMaskElem)
9103 Mask[I] = I + VecSz;
9104 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9105 Mask[I] =
9106 ((I >= InMask.size()) || InMask.test(Idx: I)) ? PoisonMaskElem : I;
9107 Cost +=
9108 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: InsertVecTy, Mask);
9109 }
9110 }
9111 return Cost;
9112 }
9113 case Instruction::ZExt:
9114 case Instruction::SExt:
9115 case Instruction::FPToUI:
9116 case Instruction::FPToSI:
9117 case Instruction::FPExt:
9118 case Instruction::PtrToInt:
9119 case Instruction::IntToPtr:
9120 case Instruction::SIToFP:
9121 case Instruction::UIToFP:
9122 case Instruction::Trunc:
9123 case Instruction::FPTrunc:
9124 case Instruction::BitCast: {
9125 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
9126 Type *SrcScalarTy = VL0->getOperand(i: 0)->getType();
9127 auto *SrcVecTy = FixedVectorType::get(ElementType: SrcScalarTy, NumElts: VL.size());
9128 unsigned Opcode = ShuffleOrOp;
9129 unsigned VecOpcode = Opcode;
9130 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9131 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9132 // Check if the values are candidates to demote.
9133 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy);
9134 if (SrcIt != MinBWs.end()) {
9135 SrcBWSz = SrcIt->second.first;
9136 SrcScalarTy = IntegerType::get(C&: F->getContext(), NumBits: SrcBWSz);
9137 SrcVecTy = FixedVectorType::get(ElementType: SrcScalarTy, NumElts: VL.size());
9138 }
9139 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
9140 if (BWSz == SrcBWSz) {
9141 VecOpcode = Instruction::BitCast;
9142 } else if (BWSz < SrcBWSz) {
9143 VecOpcode = Instruction::Trunc;
9144 } else if (It != MinBWs.end()) {
9145 assert(BWSz > SrcBWSz && "Invalid cast!");
9146 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9147 } else if (SrcIt != MinBWs.end()) {
9148 assert(BWSz > SrcBWSz && "Invalid cast!");
9149 VecOpcode =
9150 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9151 }
9152 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9153 !SrcIt->second.second) {
9154 VecOpcode = Instruction::UIToFP;
9155 }
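    // For example, if MinBWs demoted this node to i8 while its source operands
    // were demoted to i16, then BWSz(8) < SrcBWSz(16), so the vectorized cast
    // becomes a Trunc regardless of the scalar opcode; equal widths turn into
    // a BitCast, which is treated as a no-op below.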
9156 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9157 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9158 return TTI->getCastInstrCost(Opcode, Dst: VL0->getType(),
9159 Src: VL0->getOperand(i: 0)->getType(),
9160 CCH: TTI::getCastContextHint(I: VI), CostKind, I: VI);
9161 };
9162 auto GetVectorCost = [=](InstructionCost CommonCost) {
9163      // Do not count the cost here if minimum bitwidth is in effect and the
9164      // cast is just a bitcast (in that case it is a no-op).
9165 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9166 return CommonCost;
9167 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9168 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(i: 0));
9169 return CommonCost +
9170 TTI->getCastInstrCost(Opcode: VecOpcode, Dst: VecTy, Src: SrcVecTy, CCH, CostKind,
9171 I: VecOpcode == Opcode ? VI : nullptr);
9172 };
9173 return GetCostDiff(GetScalarCost, GetVectorCost);
9174 }
9175 case Instruction::FCmp:
9176 case Instruction::ICmp:
9177 case Instruction::Select: {
9178 CmpInst::Predicate VecPred, SwappedVecPred;
9179 auto MatchCmp = m_Cmp(Pred&: VecPred, L: m_Value(), R: m_Value());
9180 if (match(V: VL0, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) ||
9181 match(V: VL0, P: MatchCmp))
9182 SwappedVecPred = CmpInst::getSwappedPredicate(pred: VecPred);
9183 else
9184 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9185 ? CmpInst::BAD_FCMP_PREDICATE
9186 : CmpInst::BAD_ICMP_PREDICATE;
9187 auto GetScalarCost = [&](unsigned Idx) {
9188 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9189 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9190 ? CmpInst::BAD_FCMP_PREDICATE
9191 : CmpInst::BAD_ICMP_PREDICATE;
9192 auto MatchCmp = m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value());
9193 if ((!match(V: VI, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) &&
9194 !match(V: VI, P: MatchCmp)) ||
9195 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9196 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9197 ? CmpInst::BAD_FCMP_PREDICATE
9198 : CmpInst::BAD_ICMP_PREDICATE;
9199
9200 return TTI->getCmpSelInstrCost(Opcode: E->getOpcode(), ValTy: OrigScalarTy,
9201 CondTy: Builder.getInt1Ty(), VecPred: CurrentPred, CostKind,
9202 I: VI);
9203 };
9204 auto GetVectorCost = [&](InstructionCost CommonCost) {
9205 auto *MaskTy = FixedVectorType::get(ElementType: Builder.getInt1Ty(), NumElts: VL.size());
9206
9207 InstructionCost VecCost = TTI->getCmpSelInstrCost(
9208 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred, CostKind, I: VL0);
9209 // Check if it is possible and profitable to use min/max for selects
9210 // in VL.
9211 //
9212 auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
9213 if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
9214 IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
9215 {VecTy, VecTy});
9216 InstructionCost IntrinsicCost =
9217 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
9218        // If the selects are the only users of the compares, the compares will
9219        // be dead and we can subtract their cost.
9220 if (IntrinsicAndUse.second)
9221 IntrinsicCost -= TTI->getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: VecTy,
9222 CondTy: MaskTy, VecPred, CostKind);
9223 VecCost = std::min(a: VecCost, b: IntrinsicCost);
9224 }
9225 return VecCost + CommonCost;
9226 };
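    // For example, a bundle of `select (icmp slt a, b), a, b` scalars may be
    // reported as an smin by canConvertToMinOrMaxIntrinsic; the vector cost is
    // then the cheaper of the cmp+select form and the intrinsic form, with the
    // compare cost credited back when the selects are its only users.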
9227 return GetCostDiff(GetScalarCost, GetVectorCost);
9228 }
9229 case Instruction::FNeg:
9230 case Instruction::Add:
9231 case Instruction::FAdd:
9232 case Instruction::Sub:
9233 case Instruction::FSub:
9234 case Instruction::Mul:
9235 case Instruction::FMul:
9236 case Instruction::UDiv:
9237 case Instruction::SDiv:
9238 case Instruction::FDiv:
9239 case Instruction::URem:
9240 case Instruction::SRem:
9241 case Instruction::FRem:
9242 case Instruction::Shl:
9243 case Instruction::LShr:
9244 case Instruction::AShr:
9245 case Instruction::And:
9246 case Instruction::Or:
9247 case Instruction::Xor: {
9248 auto GetScalarCost = [&](unsigned Idx) {
9249 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9250 unsigned OpIdx = isa<UnaryOperator>(Val: VI) ? 0 : 1;
9251 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(V: VI->getOperand(i: 0));
9252 TTI::OperandValueInfo Op2Info =
9253 TTI::getOperandInfo(V: VI->getOperand(i: OpIdx));
9254 SmallVector<const Value *> Operands(VI->operand_values());
9255 return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: OrigScalarTy, CostKind,
9256 Opd1Info: Op1Info, Opd2Info: Op2Info, Args: Operands, CxtI: VI);
9257 };
9258 auto GetVectorCost = [=](InstructionCost CommonCost) {
9259 unsigned OpIdx = isa<UnaryOperator>(Val: VL0) ? 0 : 1;
9260 TTI::OperandValueInfo Op1Info = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
9261 TTI::OperandValueInfo Op2Info = getOperandInfo(Ops: E->getOperand(OpIdx));
9262 return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: VecTy, CostKind, Opd1Info: Op1Info,
9263 Opd2Info: Op2Info, Args: std::nullopt, CxtI: nullptr, TLibInfo: TLI) +
9264 CommonCost;
9265 };
9266 return GetCostDiff(GetScalarCost, GetVectorCost);
9267 }
9268 case Instruction::GetElementPtr: {
9269 return CommonCost + GetGEPCostDiff(VL, VL0);
9270 }
9271 case Instruction::Load: {
9272 auto GetScalarCost = [&](unsigned Idx) {
9273 auto *VI = cast<LoadInst>(Val: UniqueValues[Idx]);
9274 return TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: OrigScalarTy,
9275 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
9276 CostKind, OpdInfo: TTI::OperandValueInfo(), I: VI);
9277 };
9278 auto *LI0 = cast<LoadInst>(Val: VL0);
9279 auto GetVectorCost = [&](InstructionCost CommonCost) {
9280 InstructionCost VecLdCost;
9281 if (E->State == TreeEntry::Vectorize) {
9282 VecLdCost = TTI->getMemoryOpCost(
9283 Opcode: Instruction::Load, Src: VecTy, Alignment: LI0->getAlign(),
9284 AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
9285 } else if (E->State == TreeEntry::StridedVectorize) {
9286 Align CommonAlignment =
9287 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
9288 VecLdCost = TTI->getStridedMemoryOpCost(
9289 Opcode: Instruction::Load, DataTy: VecTy, Ptr: LI0->getPointerOperand(),
9290 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
9291 } else {
9292 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9293 Align CommonAlignment =
9294 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
9295 VecLdCost = TTI->getGatherScatterOpCost(
9296 Opcode: Instruction::Load, DataTy: VecTy, Ptr: LI0->getPointerOperand(),
9297 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
9298 }
9299 return VecLdCost + CommonCost;
9300 };
9301
9302 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9303    // If this node generates a masked gather load, it is not a terminal node;
9304    // hence the cost of the address operand is estimated separately.
9305 if (E->State == TreeEntry::ScatterVectorize)
9306 return Cost;
9307
9308    // Estimate the cost of the GEPs, since this tree node is a terminal node.
9309 SmallVector<Value *> PointerOps(VL.size());
9310 for (auto [I, V] : enumerate(First&: VL))
9311 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
9312 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9313 }
9314 case Instruction::Store: {
9315 bool IsReorder = !E->ReorderIndices.empty();
9316 auto GetScalarCost = [=](unsigned Idx) {
9317 auto *VI = cast<StoreInst>(Val: VL[Idx]);
9318 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: VI->getValueOperand());
9319 return TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: OrigScalarTy,
9320 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
9321 CostKind, OpdInfo: OpInfo, I: VI);
9322 };
9323 auto *BaseSI =
9324 cast<StoreInst>(Val: IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9325 auto GetVectorCost = [=](InstructionCost CommonCost) {
9326 // We know that we can merge the stores. Calculate the cost.
9327 TTI::OperandValueInfo OpInfo = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
9328 return TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
9329 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind,
9330 OpdInfo: OpInfo) +
9331 CommonCost;
9332 };
9333 SmallVector<Value *> PointerOps(VL.size());
9334 for (auto [I, V] : enumerate(First&: VL)) {
9335 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9336 PointerOps[Idx] = cast<StoreInst>(Val: V)->getPointerOperand();
9337 }
9338
9339 return GetCostDiff(GetScalarCost, GetVectorCost) +
9340 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9341 }
9342 case Instruction::Call: {
9343 auto GetScalarCost = [&](unsigned Idx) {
9344 auto *CI = cast<CallInst>(Val: UniqueValues[Idx]);
9345 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9346 if (ID != Intrinsic::not_intrinsic) {
9347 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9348 return TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
9349 }
9350 return TTI->getCallInstrCost(F: CI->getCalledFunction(),
9351 RetTy: CI->getFunctionType()->getReturnType(),
9352 Tys: CI->getFunctionType()->params(), CostKind);
9353 };
9354 auto GetVectorCost = [=](InstructionCost CommonCost) {
9355 auto *CI = cast<CallInst>(Val: VL0);
9356 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9357 SmallVector<Type *> ArgTys =
9358 buildIntrinsicArgTypes(CI, ID, VF: VecTy->getNumElements(),
9359 MinBW: It != MinBWs.end() ? It->second.first : 0);
9360 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9361 return std::min(a: VecCallCosts.first, b: VecCallCosts.second) + CommonCost;
9362 };
9363 return GetCostDiff(GetScalarCost, GetVectorCost);
9364 }
9365 case Instruction::ShuffleVector: {
9366 assert(E->isAltShuffle() &&
9367 ((Instruction::isBinaryOp(E->getOpcode()) &&
9368 Instruction::isBinaryOp(E->getAltOpcode())) ||
9369 (Instruction::isCast(E->getOpcode()) &&
9370 Instruction::isCast(E->getAltOpcode())) ||
9371 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9372 "Invalid Shuffle Vector Operand");
9373 // Try to find the previous shuffle node with the same operands and same
9374 // main/alternate ops.
9375 auto TryFindNodeWithEqualOperands = [=]() {
9376 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9377 if (TE.get() == E)
9378 break;
9379 if (TE->isAltShuffle() &&
9380 ((TE->getOpcode() == E->getOpcode() &&
9381 TE->getAltOpcode() == E->getAltOpcode()) ||
9382 (TE->getOpcode() == E->getAltOpcode() &&
9383 TE->getAltOpcode() == E->getOpcode())) &&
9384 TE->hasEqualOperands(TE: *E))
9385 return true;
9386 }
9387 return false;
9388 };
9389 auto GetScalarCost = [&](unsigned Idx) {
9390 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9391 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9392 (void)E;
9393 return TTI->getInstructionCost(U: VI, CostKind);
9394 };
9395    // Need to clear CommonCost since the final shuffle cost is included in the
9396    // vector cost.
9397 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
9398 // VecCost is equal to sum of the cost of creating 2 vectors
9399 // and the cost of creating shuffle.
9400 InstructionCost VecCost = 0;
9401 if (TryFindNodeWithEqualOperands()) {
9402 LLVM_DEBUG({
9403 dbgs() << "SLP: diamond match for alternate node found.\n";
9404 E->dump();
9405 });
9406        // No need to add new vector costs here since we are going to reuse the
9407        // same main/alternate vector ops and just do different shuffling.
9408 } else if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
9409 VecCost =
9410 TTIRef.getArithmeticInstrCost(Opcode: E->getOpcode(), Ty: VecTy, CostKind);
9411 VecCost +=
9412 TTIRef.getArithmeticInstrCost(Opcode: E->getAltOpcode(), Ty: VecTy, CostKind);
9413 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
9414 auto *MaskTy = FixedVectorType::get(ElementType: Builder.getInt1Ty(), NumElts: VL.size());
9415 VecCost = TTIRef.getCmpSelInstrCost(Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy,
9416 VecPred: CI0->getPredicate(), CostKind, I: VL0);
9417 VecCost += TTIRef.getCmpSelInstrCost(
9418 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy,
9419 VecPred: cast<CmpInst>(Val: E->getAltOp())->getPredicate(), CostKind,
9420 I: E->getAltOp());
9421 } else {
9422 Type *SrcSclTy = E->getMainOp()->getOperand(i: 0)->getType();
9423 auto *SrcTy = FixedVectorType::get(ElementType: SrcSclTy, NumElts: VL.size());
9424 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
9425 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
9426 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
9427 unsigned SrcBWSz =
9428 DL->getTypeSizeInBits(Ty: E->getMainOp()->getOperand(i: 0)->getType());
9429 if (SrcIt != MinBWs.end()) {
9430 SrcBWSz = SrcIt->second.first;
9431 SrcSclTy = IntegerType::get(C&: SrcSclTy->getContext(), NumBits: SrcBWSz);
9432 SrcTy = FixedVectorType::get(ElementType: SrcSclTy, NumElts: VL.size());
9433 }
9434 if (BWSz <= SrcBWSz) {
9435 if (BWSz < SrcBWSz)
9436 VecCost =
9437 TTIRef.getCastInstrCost(Opcode: Instruction::Trunc, Dst: VecTy, Src: SrcTy,
9438 CCH: TTI::CastContextHint::None, CostKind);
9439 LLVM_DEBUG({
9440 dbgs()
9441 << "SLP: alternate extension, which should be truncated.\n";
9442 E->dump();
9443 });
9444 return VecCost;
9445 }
9446 }
9447 VecCost = TTIRef.getCastInstrCost(Opcode: E->getOpcode(), Dst: VecTy, Src: SrcTy,
9448 CCH: TTI::CastContextHint::None, CostKind);
9449 VecCost +=
9450 TTIRef.getCastInstrCost(Opcode: E->getAltOpcode(), Dst: VecTy, Src: SrcTy,
9451 CCH: TTI::CastContextHint::None, CostKind);
9452 }
9453 SmallVector<int> Mask;
9454 E->buildAltOpShuffleMask(
9455 IsAltOp: [E](Instruction *I) {
9456 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9457 return I->getOpcode() == E->getAltOpcode();
9458 },
9459 Mask);
9460 VecCost += ::getShuffleCost(TTI: TTIRef, Kind: TargetTransformInfo::SK_PermuteTwoSrc,
9461 Tp: FinalVecTy, Mask);
9462      // Patterns like [fadd,fsub] can be combined into a single instruction
9463      // on x86. Reordering them into [fsub,fadd] blocks this pattern, so we
9464      // need to take their order into account when looking for the most used
9465      // order.
9466 unsigned Opcode0 = E->getOpcode();
9467 unsigned Opcode1 = E->getAltOpcode();
9468 // The opcode mask selects between the two opcodes.
9469 SmallBitVector OpcodeMask(E->Scalars.size(), false);
9470 for (unsigned Lane : seq<unsigned>(Begin: 0, End: E->Scalars.size()))
9471 if (cast<Instruction>(Val: E->Scalars[Lane])->getOpcode() == Opcode1)
9472 OpcodeMask.set(Lane);
9473 // If this pattern is supported by the target then we consider the
9474 // order.
9475 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9476 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9477 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9478 return AltVecCost < VecCost ? AltVecCost : VecCost;
9479 }
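      // For example, for an alternating [fadd,fsub] bundle the opcode mask has
      // the fsub lanes set; if the target reports the pattern as legal (e.g.
      // via an addsub-style instruction), the alternate-instruction cost above
      // may be cheaper than costing both opcodes plus the blending shuffle.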
9480 // TODO: Check the reverse order too.
9481 return VecCost;
9482 };
9483 return GetCostDiff(GetScalarCost, GetVectorCost);
9484 }
9485 default:
9486 llvm_unreachable("Unknown instruction");
9487 }
9488}
9489
9490bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9491 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
9492                    << VectorizableTree.size() << " is fully vectorizable.\n");
9493
9494 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
9495 SmallVector<int> Mask;
9496 return TE->State == TreeEntry::NeedToGather &&
9497 !any_of(Range: TE->Scalars,
9498 P: [this](Value *V) { return EphValues.contains(Ptr: V); }) &&
9499 (allConstant(VL: TE->Scalars) || isSplat(VL: TE->Scalars) ||
9500 TE->Scalars.size() < Limit ||
9501 ((TE->getOpcode() == Instruction::ExtractElement ||
9502 all_of(Range: TE->Scalars, P: IsaPred<ExtractElementInst, UndefValue>)) &&
9503 isFixedVectorShuffle(VL: TE->Scalars, Mask)) ||
9504 (TE->State == TreeEntry::NeedToGather &&
9505 TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
9506 };
9507
9508 // We only handle trees of heights 1 and 2.
9509 if (VectorizableTree.size() == 1 &&
9510 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9511 (ForReduction &&
9512 AreVectorizableGathers(VectorizableTree[0].get(),
9513 VectorizableTree[0]->Scalars.size()) &&
9514 VectorizableTree[0]->getVectorFactor() > 2)))
9515 return true;
9516
9517 if (VectorizableTree.size() != 2)
9518 return false;
9519
9520  // Handle splat and all-constant stores. Also try to vectorize tiny trees
9521  // whose second node is a gather with fewer scalar operands than the initial
9522  // tree entry (it may be profitable to shuffle the second gather), or whose
9523  // scalars are extractelements that form a shuffle.
9524 SmallVector<int> Mask;
9525 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9526 AreVectorizableGathers(VectorizableTree[1].get(),
9527 VectorizableTree[0]->Scalars.size()))
9528 return true;
9529
9530 // Gathering cost would be too much for tiny trees.
9531 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9532 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9533 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9534 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9535 return false;
9536
9537 return true;
9538}
9539
9540static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
9541 TargetTransformInfo *TTI,
9542 bool MustMatchOrInst) {
9543 // Look past the root to find a source value. Arbitrarily follow the
9544 // path through operand 0 of any 'or'. Also, peek through optional
9545 // shift-left-by-multiple-of-8-bits.
9546 Value *ZextLoad = Root;
9547 const APInt *ShAmtC;
9548 bool FoundOr = false;
9549 while (!isa<ConstantExpr>(Val: ZextLoad) &&
9550 (match(V: ZextLoad, P: m_Or(L: m_Value(), R: m_Value())) ||
9551 (match(V: ZextLoad, P: m_Shl(L: m_Value(), R: m_APInt(Res&: ShAmtC))) &&
9552 ShAmtC->urem(RHS: 8) == 0))) {
9553 auto *BinOp = cast<BinaryOperator>(Val: ZextLoad);
9554 ZextLoad = BinOp->getOperand(i_nocapture: 0);
9555 if (BinOp->getOpcode() == Instruction::Or)
9556 FoundOr = true;
9557 }
9558 // Check if the input is an extended load of the required or/shift expression.
9559 Value *Load;
9560 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9561 !match(V: ZextLoad, P: m_ZExt(Op: m_Value(V&: Load))) || !isa<LoadInst>(Val: Load))
9562 return false;
9563
9564 // Require that the total load bit width is a legal integer type.
9565 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
9566 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
9567 Type *SrcTy = Load->getType();
9568 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
9569 if (!TTI->isTypeLegal(Ty: IntegerType::get(C&: Root->getContext(), NumBits: LoadBitWidth)))
9570 return false;
9571
9572 // Everything matched - assume that we can fold the whole sequence using
9573 // load combining.
9574 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
9575 << *(cast<Instruction>(Root)) << "\n");
9576
9577 return true;
9578}
9579
9580bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
9581 if (RdxKind != RecurKind::Or)
9582 return false;
9583
9584 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9585 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9586 return isLoadCombineCandidateImpl(Root: FirstReduced, NumElts, TTI,
9587 /* MatchOr */ MustMatchOrInst: false);
9588}
9589
9590bool BoUpSLP::isLoadCombineCandidate() const {
9591 // Peek through a final sequence of stores and check if all operations are
9592 // likely to be load-combined.
9593 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9594 for (Value *Scalar : VectorizableTree[0]->Scalars) {
9595 Value *X;
9596 if (!match(V: Scalar, P: m_Store(ValueOp: m_Value(V&: X), PointerOp: m_Value())) ||
9597 !isLoadCombineCandidateImpl(Root: X, NumElts, TTI, /* MatchOr */ MustMatchOrInst: true))
9598 return false;
9599 }
9600 return true;
9601}
9602
9603bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
9604 // No need to vectorize inserts of gathered values.
9605 if (VectorizableTree.size() == 2 &&
9606 isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) &&
9607 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9608 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9609 !(isSplat(VL: VectorizableTree[1]->Scalars) ||
9610 allConstant(VL: VectorizableTree[1]->Scalars))))
9611 return true;
9612
9613  // If the graph includes only PHI nodes and gathers, it is definitely not
9614  // profitable for vectorization; we can skip it if the cost threshold is the
9615  // default. The cost of vectorized PHI nodes is almost always 0 plus the cost
9616  // of the gathers/buildvectors.
9617 constexpr int Limit = 4;
9618 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
9619 !VectorizableTree.empty() &&
9620 all_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
9621 return (TE->State == TreeEntry::NeedToGather &&
9622 TE->getOpcode() != Instruction::ExtractElement &&
9623 count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <= Limit) ||
9624 TE->getOpcode() == Instruction::PHI;
9625 }))
9626 return true;
9627
9628 // We can vectorize the tree if its size is greater than or equal to the
9629 // minimum size specified by the MinTreeSize command line option.
9630 if (VectorizableTree.size() >= MinTreeSize)
9631 return false;
9632
9633 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
9634 // can vectorize it if we can prove it fully vectorizable.
9635 if (isFullyVectorizableTinyTree(ForReduction))
9636 return false;
9637
9638  // Check if any of the gather nodes forms an insertelement buildvector
9639 // somewhere.
9640 bool IsAllowedSingleBVNode =
9641 VectorizableTree.size() > 1 ||
9642 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9643 !VectorizableTree.front()->isAltShuffle() &&
9644 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9645 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9646 allSameBlock(VL: VectorizableTree.front()->Scalars));
9647 if (any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
9648 return TE->State == TreeEntry::NeedToGather &&
9649 all_of(Range&: TE->Scalars, P: [&](Value *V) {
9650 return isa<ExtractElementInst, UndefValue>(Val: V) ||
9651 (IsAllowedSingleBVNode &&
9652 !V->hasNUsesOrMore(N: UsesLimit) &&
9653 any_of(Range: V->users(), P: IsaPred<InsertElementInst>));
9654 });
9655 }))
9656 return false;
9657
9658 assert(VectorizableTree.empty()
9659 ? ExternalUses.empty()
9660 : true && "We shouldn't have any external users");
9661
9662 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
9663 // vectorizable.
9664 return true;
9665}
9666
9667InstructionCost BoUpSLP::getSpillCost() const {
9668 // Walk from the bottom of the tree to the top, tracking which values are
9669 // live. When we see a call instruction that is not part of our tree,
9670 // query TTI to see if there is a cost to keeping values live over it
9671 // (for example, if spills and fills are required).
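  // For example, a call may force values that are live across it to be spilled
  // to the stack and reloaded afterwards; TTI::getCostOfKeepingLiveOverCall
  // models that penalty.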
9672 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9673 InstructionCost Cost = 0;
9674
9675 SmallPtrSet<Instruction *, 4> LiveValues;
9676 Instruction *PrevInst = nullptr;
9677
9678 // The entries in VectorizableTree are not necessarily ordered by their
9679 // position in basic blocks. Collect them and order them by dominance so later
9680 // instructions are guaranteed to be visited first. For instructions in
9681 // different basic blocks, we only scan to the beginning of the block, so
9682 // their order does not matter, as long as all instructions in a basic block
9683 // are grouped together. Using dominance ensures a deterministic order.
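  // For example, if block A dominates block B, A's DFS-in number is smaller, so
  // sorting by descending DFS-in numbers (and by comesBefore within a block)
  // visits instructions bottom-up, which is what the liveness walk below needs.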
9684 SmallVector<Instruction *, 16> OrderedScalars;
9685 for (const auto &TEPtr : VectorizableTree) {
9686 if (TEPtr->State != TreeEntry::Vectorize)
9687 continue;
9688 Instruction *Inst = dyn_cast<Instruction>(Val: TEPtr->Scalars[0]);
9689 if (!Inst)
9690 continue;
9691 OrderedScalars.push_back(Elt: Inst);
9692 }
9693 llvm::sort(C&: OrderedScalars, Comp: [&](Instruction *A, Instruction *B) {
9694 auto *NodeA = DT->getNode(BB: A->getParent());
9695 auto *NodeB = DT->getNode(BB: B->getParent());
9696 assert(NodeA && "Should only process reachable instructions");
9697 assert(NodeB && "Should only process reachable instructions");
9698 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9699 "Different nodes should have different DFS numbers");
9700 if (NodeA != NodeB)
9701 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9702 return B->comesBefore(Other: A);
9703 });
9704
9705 for (Instruction *Inst : OrderedScalars) {
9706 if (!PrevInst) {
9707 PrevInst = Inst;
9708 continue;
9709 }
9710
9711 // Update LiveValues.
9712 LiveValues.erase(Ptr: PrevInst);
9713 for (auto &J : PrevInst->operands()) {
9714 if (isa<Instruction>(Val: &*J) && getTreeEntry(V: &*J))
9715 LiveValues.insert(Ptr: cast<Instruction>(Val: &*J));
9716 }
9717
9718 LLVM_DEBUG({
9719 dbgs() << "SLP: #LV: " << LiveValues.size();
9720 for (auto *X : LiveValues)
9721 dbgs() << " " << X->getName();
9722 dbgs() << ", Looking at ";
9723 Inst->dump();
9724 });
9725
9726 // Now find the sequence of instructions between PrevInst and Inst.
9727 unsigned NumCalls = 0;
9728 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
9729 PrevInstIt =
9730 PrevInst->getIterator().getReverse();
9731 while (InstIt != PrevInstIt) {
9732 if (PrevInstIt == PrevInst->getParent()->rend()) {
9733 PrevInstIt = Inst->getParent()->rbegin();
9734 continue;
9735 }
9736
9737 auto NoCallIntrinsic = [this](Instruction *I) {
9738 if (auto *II = dyn_cast<IntrinsicInst>(Val: I)) {
9739 if (II->isAssumeLikeIntrinsic())
9740 return true;
9741 FastMathFlags FMF;
9742 SmallVector<Type *, 4> Tys;
9743 for (auto &ArgOp : II->args())
9744 Tys.push_back(Elt: ArgOp->getType());
9745 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: II))
9746 FMF = FPMO->getFastMathFlags();
9747 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
9748 FMF);
9749 InstructionCost IntrCost =
9750 TTI->getIntrinsicInstrCost(ICA, CostKind: TTI::TCK_RecipThroughput);
9751 InstructionCost CallCost = TTI->getCallInstrCost(
9752 F: nullptr, RetTy: II->getType(), Tys, CostKind: TTI::TCK_RecipThroughput);
9753 if (IntrCost < CallCost)
9754 return true;
9755 }
9756 return false;
9757 };
9758
9759 // Debug information does not impact spill cost.
9760 if (isa<CallBase>(Val: &*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
9761 &*PrevInstIt != PrevInst)
9762 NumCalls++;
9763
9764 ++PrevInstIt;
9765 }
9766
9767 if (NumCalls) {
9768 SmallVector<Type *, 4> V;
9769 for (auto *II : LiveValues) {
9770 auto *ScalarTy = II->getType();
9771 if (auto *VectorTy = dyn_cast<FixedVectorType>(Val: ScalarTy))
9772 ScalarTy = VectorTy->getElementType();
9773 V.push_back(Elt: FixedVectorType::get(ElementType: ScalarTy, NumElts: BundleWidth));
9774 }
9775 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(Tys: V);
9776 }
9777
9778 PrevInst = Inst;
9779 }
9780
9781 return Cost;
9782}
9783
9784/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
9785/// the buildvector sequence.
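/// For example, in a buildvector chain such as
///   %i0 = insertelement <2 x i32> poison, i32 %a, i32 0
///   %i1 = insertelement <2 x i32> %i0, i32 %b, i32 1
/// %i0 is considered first with respect to %i1, since %i1 is built on top of
/// %i0 through its vector operand.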
9786static bool isFirstInsertElement(const InsertElementInst *IE1,
9787 const InsertElementInst *IE2) {
9788 if (IE1 == IE2)
9789 return false;
9790 const auto *I1 = IE1;
9791 const auto *I2 = IE2;
9792 const InsertElementInst *PrevI1;
9793 const InsertElementInst *PrevI2;
9794 unsigned Idx1 = *getInsertIndex(InsertInst: IE1);
9795 unsigned Idx2 = *getInsertIndex(InsertInst: IE2);
9796 do {
9797 if (I2 == IE1)
9798 return true;
9799 if (I1 == IE2)
9800 return false;
9801 PrevI1 = I1;
9802 PrevI2 = I2;
9803 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
9804 getInsertIndex(InsertInst: I1).value_or(u&: Idx2) != Idx2)
9805 I1 = dyn_cast<InsertElementInst>(Val: I1->getOperand(i_nocapture: 0));
9806 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
9807 getInsertIndex(InsertInst: I2).value_or(u&: Idx1) != Idx1)
9808 I2 = dyn_cast<InsertElementInst>(Val: I2->getOperand(i_nocapture: 0));
9809 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
9810 llvm_unreachable("Two different buildvectors not expected.");
9811}
9812
9813namespace {
9814/// Returns the incoming Value * if the requested type is Value * too, or a
9815/// default-constructed value otherwise.
9816struct ValueSelect {
9817 template <typename U>
9818 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
9819 return V;
9820 }
9821 template <typename U>
9822 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
9823 return U();
9824 }
9825};
9826} // namespace
9827
9828/// Does the analysis of the provided shuffle masks and performs the requested
9829/// actions on the vectors with the given shuffle masks. It tries to do it in
9830/// several steps.
9831/// 1. If the Base vector is not an undef vector, resize the very first mask to
9832/// have a common VF and perform the action for 2 input vectors (including the
9833/// non-undef Base). Other shuffle masks are combined with the result of the
9834/// first stage and processed as a shuffle of 2 vectors.
9835/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
9836/// the action only for 1 vector with the given mask, if it is not the identity
9837/// mask.
9838/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
9839/// vectors, combining the masks properly between the steps.
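/// As a rough sketch of step 1: with VF = 4, a fully defined Base and a first
/// mask <0, poison, 2, poison>, the mask is rewritten to <4, 1, 6, 3>, i.e.
/// lanes 0 and 2 are taken from the (resized) first input vector, which is
/// placed at offset VF, while lanes 1 and 3 keep the corresponding Base
/// elements.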
9840template <typename T>
9841static T *performExtractsShuffleAction(
9842 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
9843 function_ref<unsigned(T *)> GetVF,
9844 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
9845 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
9846 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
9847 SmallVector<int> Mask(ShuffleMask.begin()->second);
9848 auto VMIt = std::next(ShuffleMask.begin());
9849 T *Prev = nullptr;
9850 SmallBitVector UseMask =
9851 buildUseMask(VF: Mask.size(), Mask, MaskArg: UseMask::UndefsAsMask);
9852 SmallBitVector IsBaseUndef = isUndefVector(V: Base, UseMask);
9853 if (!IsBaseUndef.all()) {
9854 // Base is not undef, need to combine it with the next subvectors.
9855 std::pair<T *, bool> Res =
9856 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
9857 SmallBitVector IsBasePoison = isUndefVector<true>(V: Base, UseMask);
9858 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
9859 if (Mask[Idx] == PoisonMaskElem)
9860 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
9861 else
9862 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
9863 }
9864 auto *V = ValueSelect::get<T *>(Base);
9865 (void)V;
9866 assert((!V || GetVF(V) == Mask.size()) &&
9867 "Expected base vector of VF number of elements.");
9868 Prev = Action(Mask, {nullptr, Res.first});
9869 } else if (ShuffleMask.size() == 1) {
9870    // Base is undef and only 1 vector is shuffled - perform the action only
9871    // for a single vector, if the mask is not the identity mask.
9872 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
9873 /*ForSingleMask=*/true);
9874 if (Res.second)
9875 // Identity mask is found.
9876 Prev = Res.first;
9877 else
9878 Prev = Action(Mask, {ShuffleMask.begin()->first});
9879 } else {
9880 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
9881 // shuffles step by step, combining shuffle between the steps.
9882 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
9883 unsigned Vec2VF = GetVF(VMIt->first);
9884 if (Vec1VF == Vec2VF) {
9885      // No need to resize the input vectors since they are of the same size;
9886      // we can shuffle them directly.
9887 ArrayRef<int> SecMask = VMIt->second;
9888 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9889 if (SecMask[I] != PoisonMaskElem) {
9890 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9891 Mask[I] = SecMask[I] + Vec1VF;
9892 }
9893 }
9894 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
9895 } else {
9896 // Vectors of different sizes - resize and reshuffle.
9897 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
9898 /*ForSingleMask=*/false);
9899 std::pair<T *, bool> Res2 =
9900 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
9901 ArrayRef<int> SecMask = VMIt->second;
9902 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9903 if (Mask[I] != PoisonMaskElem) {
9904 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9905 if (Res1.second)
9906 Mask[I] = I;
9907 } else if (SecMask[I] != PoisonMaskElem) {
9908 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9909 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
9910 }
9911 }
9912 Prev = Action(Mask, {Res1.first, Res2.first});
9913 }
9914 VMIt = std::next(VMIt);
9915 }
9916 bool IsBaseNotUndef = !IsBaseUndef.all();
9917 (void)IsBaseNotUndef;
9918 // Perform requested actions for the remaining masks/vectors.
9919 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
9920 // Shuffle other input vectors, if any.
9921 std::pair<T *, bool> Res =
9922 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
9923 ArrayRef<int> SecMask = VMIt->second;
9924 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9925 if (SecMask[I] != PoisonMaskElem) {
9926 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
9927 "Multiple uses of scalars.");
9928 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
9929 } else if (Mask[I] != PoisonMaskElem) {
9930 Mask[I] = I;
9931 }
9932 }
9933 Prev = Action(Mask, {Prev, Res.first});
9934 }
9935 return Prev;
9936}
9937
9938InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
9939 InstructionCost Cost = 0;
9940 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
9941 << VectorizableTree.size() << ".\n");
9942
9943 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
9944
9945 SmallPtrSet<Value *, 4> CheckedExtracts;
9946 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
9947 TreeEntry &TE = *VectorizableTree[I];
9948 if (TE.State == TreeEntry::NeedToGather) {
9949 if (const TreeEntry *E = getTreeEntry(V: TE.getMainOp());
9950 E && E->getVectorFactor() == TE.getVectorFactor() &&
9951 E->isSame(VL: TE.Scalars)) {
9952        // Some gather nodes might be absolutely the same as some vectorizable
9953        // nodes after reordering; we need to handle that.
9954 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
9955 << shortBundleName(TE.Scalars) << ".\n"
9956 << "SLP: Current total cost = " << Cost << "\n");
9957 continue;
9958 }
9959 }
9960
9961 InstructionCost C = getEntryCost(E: &TE, VectorizedVals, CheckedExtracts);
9962 Cost += C;
9963 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
9964 << shortBundleName(TE.Scalars) << ".\n"
9965 << "SLP: Current total cost = " << Cost << "\n");
9966 }
9967
9968 SmallPtrSet<Value *, 16> ExtractCostCalculated;
9969 InstructionCost ExtractCost = 0;
9970 SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
9971 SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
9972 SmallVector<APInt> DemandedElts;
9973 SmallDenseSet<Value *, 4> UsedInserts;
9974 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
9975 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
9976 for (ExternalUser &EU : ExternalUses) {
9977 // We only add extract cost once for the same scalar.
9978 if (!isa_and_nonnull<InsertElementInst>(Val: EU.User) &&
9979 !ExtractCostCalculated.insert(Ptr: EU.Scalar).second)
9980 continue;
9981
9982 // Uses by ephemeral values are free (because the ephemeral value will be
9983 // removed prior to code generation, and so the extraction will be
9984 // removed as well).
9985 if (EphValues.count(Ptr: EU.User))
9986 continue;
9987
9988    // No extract cost for a vector "scalar".
9989 if (isa<FixedVectorType>(Val: EU.Scalar->getType()))
9990 continue;
9991
9992 // If found user is an insertelement, do not calculate extract cost but try
9993 // to detect it as a final shuffled/identity match.
9994 if (auto *VU = dyn_cast_or_null<InsertElementInst>(Val: EU.User)) {
9995 if (auto *FTy = dyn_cast<FixedVectorType>(Val: VU->getType())) {
9996 if (!UsedInserts.insert(V: VU).second)
9997 continue;
9998 std::optional<unsigned> InsertIdx = getInsertIndex(InsertInst: VU);
9999 if (InsertIdx) {
10000 const TreeEntry *ScalarTE = getTreeEntry(V: EU.Scalar);
10001 auto *It = find_if(
10002 Range&: FirstUsers,
10003 P: [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10004 return areTwoInsertFromSameBuildVector(
10005 VU, V: cast<InsertElementInst>(Val: Pair.first),
10006 GetBaseOperand: [this](InsertElementInst *II) -> Value * {
10007 Value *Op0 = II->getOperand(i_nocapture: 0);
10008 if (getTreeEntry(V: II) && !getTreeEntry(V: Op0))
10009 return nullptr;
10010 return Op0;
10011 });
10012 });
10013 int VecId = -1;
10014 if (It == FirstUsers.end()) {
10015 (void)ShuffleMasks.emplace_back();
10016 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10017 if (Mask.empty())
10018 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
10019 // Find the insertvector, vectorized in tree, if any.
10020 Value *Base = VU;
10021 while (auto *IEBase = dyn_cast<InsertElementInst>(Val: Base)) {
10022 if (IEBase != EU.User &&
10023 (!IEBase->hasOneUse() ||
10024 getInsertIndex(InsertInst: IEBase).value_or(u&: *InsertIdx) == *InsertIdx))
10025 break;
10026 // Build the mask for the vectorized insertelement instructions.
10027 if (const TreeEntry *E = getTreeEntry(V: IEBase)) {
10028 VU = IEBase;
10029 do {
10030 IEBase = cast<InsertElementInst>(Val: Base);
10031 int Idx = *getInsertIndex(InsertInst: IEBase);
10032 assert(Mask[Idx] == PoisonMaskElem &&
10033 "InsertElementInstruction used already.");
10034 Mask[Idx] = Idx;
10035 Base = IEBase->getOperand(i_nocapture: 0);
10036 } while (E == getTreeEntry(V: Base));
10037 break;
10038 }
10039 Base = cast<InsertElementInst>(Val: Base)->getOperand(i_nocapture: 0);
10040 }
10041 FirstUsers.emplace_back(Args&: VU, Args&: ScalarTE);
10042 DemandedElts.push_back(Elt: APInt::getZero(numBits: FTy->getNumElements()));
10043 VecId = FirstUsers.size() - 1;
10044 auto It = MinBWs.find(Val: ScalarTE);
10045 if (It != MinBWs.end() &&
10046 VectorCasts
10047 .insert(V: std::make_pair(x&: ScalarTE, y: FTy->getElementType()))
10048 .second) {
10049 unsigned BWSz = It->second.first;
10050 unsigned DstBWSz = DL->getTypeSizeInBits(Ty: FTy->getElementType());
10051 unsigned VecOpcode;
10052 if (DstBWSz < BWSz)
10053 VecOpcode = Instruction::Trunc;
10054 else
10055 VecOpcode =
10056 It->second.second ? Instruction::SExt : Instruction::ZExt;
10057 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10058 InstructionCost C = TTI->getCastInstrCost(
10059 Opcode: VecOpcode, Dst: FTy,
10060 Src: FixedVectorType::get(
10061 ElementType: IntegerType::get(C&: FTy->getContext(), NumBits: BWSz),
10062 NumElts: FTy->getNumElements()),
10063 CCH: TTI::CastContextHint::None, CostKind);
10064 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10065 << " for extending externally used vector with "
10066 "non-equal minimum bitwidth.\n");
10067 Cost += C;
10068 }
10069 } else {
10070 if (isFirstInsertElement(IE1: VU, IE2: cast<InsertElementInst>(Val: It->first)))
10071 It->first = VU;
10072 VecId = std::distance(first: FirstUsers.begin(), last: It);
10073 }
10074 int InIdx = *InsertIdx;
10075 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10076 if (Mask.empty())
10077 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
10078 Mask[InIdx] = EU.Lane;
10079 DemandedElts[VecId].setBit(InIdx);
10080 continue;
10081 }
10082 }
10083 }
10084    // Leave the GEPs as is; they are free in most cases and it is better to
10085    // keep them as GEPs.
10086 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10087 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: EU.Scalar)) {
10088 if (!ValueToExtUses) {
10089 ValueToExtUses.emplace();
10090 for_each(Range: enumerate(First&: ExternalUses), F: [&](const auto &P) {
10091 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10092 });
10093 }
10094 // Can use original GEP, if no operands vectorized or they are marked as
10095 // externally used already.
10096 bool CanBeUsedAsGEP = all_of(Range: GEP->operands(), P: [&](Value *V) {
10097 if (!getTreeEntry(V))
10098 return true;
10099 auto It = ValueToExtUses->find(Val: V);
10100 if (It != ValueToExtUses->end()) {
10101 // Replace all uses to avoid compiler crash.
10102 ExternalUses[It->second].User = nullptr;
10103 return true;
10104 }
10105 return false;
10106 });
10107 if (CanBeUsedAsGEP) {
10108 ExtractCost += TTI->getInstructionCost(U: GEP, CostKind);
10109 ExternalUsesAsGEPs.insert(Ptr: EU.Scalar);
10110 continue;
10111 }
10112 }
10113
10114 // If we plan to rewrite the tree in a smaller type, we will need to sign
10115 // extend the extracted value back to the original type. Here, we account
10116 // for the extract and the added cost of the sign extend if needed.
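    // For example, if the bundle was narrowed to i8 while the external user
    // expects i32, the cost below is queried via getExtractWithExtendCost
    // rather than as a plain extractelement.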
10117 auto *VecTy = FixedVectorType::get(ElementType: EU.Scalar->getType(), NumElts: BundleWidth);
10118 auto It = MinBWs.find(Val: getTreeEntry(V: EU.Scalar));
10119 if (It != MinBWs.end()) {
10120 auto *MinTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
10121 unsigned Extend =
10122 It->second.second ? Instruction::SExt : Instruction::ZExt;
10123 VecTy = FixedVectorType::get(ElementType: MinTy, NumElts: BundleWidth);
10124 ExtractCost += TTI->getExtractWithExtendCost(Opcode: Extend, Dst: EU.Scalar->getType(),
10125 VecTy, Index: EU.Lane);
10126 } else {
10127 ExtractCost += TTI->getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
10128 CostKind, Index: EU.Lane);
10129 }
10130 }
10131 // Add reduced value cost, if resized.
10132 if (!VectorizedVals.empty()) {
10133 const TreeEntry &Root = *VectorizableTree.front().get();
10134 auto BWIt = MinBWs.find(Val: &Root);
10135 if (BWIt != MinBWs.end()) {
10136 Type *DstTy = Root.Scalars.front()->getType();
10137 unsigned OriginalSz = DL->getTypeSizeInBits(Ty: DstTy);
10138 unsigned SrcSz =
10139 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10140 if (OriginalSz != SrcSz) {
10141 unsigned Opcode = Instruction::Trunc;
10142 if (OriginalSz > SrcSz)
10143 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10144 Type *SrcTy = IntegerType::get(C&: DstTy->getContext(), NumBits: SrcSz);
10145 Cost += TTI->getCastInstrCost(Opcode, Dst: DstTy, Src: SrcTy,
10146 CCH: TTI::CastContextHint::None,
10147 CostKind: TTI::TCK_RecipThroughput);
10148 }
10149 }
10150 }
10151
10152 InstructionCost SpillCost = getSpillCost();
10153 Cost += SpillCost + ExtractCost;
10154 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10155 bool) {
10156 InstructionCost C = 0;
10157 unsigned VF = Mask.size();
10158 unsigned VecVF = TE->getVectorFactor();
10159 if (VF != VecVF &&
10160 (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10161 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF))) {
10162 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10163 std::copy(first: Mask.begin(), last: std::next(x: Mask.begin(), n: std::min(a: VF, b: VecVF)),
10164 result: OrigMask.begin());
10165 C = TTI->getShuffleCost(
10166 Kind: TTI::SK_PermuteSingleSrc,
10167 Tp: FixedVectorType::get(ElementType: TE->getMainOp()->getType(), NumElts: VecVF), Mask: OrigMask);
10168 LLVM_DEBUG(
10169 dbgs() << "SLP: Adding cost " << C
10170 << " for final shuffle of insertelement external users.\n";
10171 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10172 Cost += C;
10173 return std::make_pair(x&: TE, y: true);
10174 }
10175 return std::make_pair(x&: TE, y: false);
10176 };
10177 // Calculate the cost of the reshuffled vectors, if any.
10178 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10179 Value *Base = cast<Instruction>(Val: FirstUsers[I].first)->getOperand(i: 0);
10180 auto Vector = ShuffleMasks[I].takeVector();
10181 unsigned VF = 0;
10182 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10183 ArrayRef<const TreeEntry *> TEs) {
10184 assert((TEs.size() == 1 || TEs.size() == 2) &&
10185 "Expected exactly 1 or 2 tree entries.");
10186 if (TEs.size() == 1) {
10187 if (VF == 0)
10188 VF = TEs.front()->getVectorFactor();
10189 auto *FTy =
10190 FixedVectorType::get(ElementType: TEs.back()->Scalars.front()->getType(), NumElts: VF);
10191 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF) &&
10192 !all_of(Range: enumerate(First&: Mask), P: [=](const auto &Data) {
10193 return Data.value() == PoisonMaskElem ||
10194 (Data.index() < VF &&
10195 static_cast<int>(Data.index()) == Data.value());
10196 })) {
10197 InstructionCost C =
10198 TTI->getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: FTy, Mask);
10199 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10200 << " for final shuffle of insertelement "
10201 "external users.\n";
10202 TEs.front()->dump();
10203 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10204 Cost += C;
10205 }
10206 } else {
10207 if (VF == 0) {
10208 if (TEs.front() &&
10209 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10210 VF = TEs.front()->getVectorFactor();
10211 else
10212 VF = Mask.size();
10213 }
10214 auto *FTy =
10215 FixedVectorType::get(ElementType: TEs.back()->Scalars.front()->getType(), NumElts: VF);
10216 InstructionCost C =
10217 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: FTy, Mask);
10218 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10219 << " for final shuffle of vector node and external "
10220 "insertelement users.\n";
10221 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10222 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10223 Cost += C;
10224 }
10225 VF = Mask.size();
10226 return TEs.back();
10227 };
10228 (void)performExtractsShuffleAction<const TreeEntry>(
10229 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()), Base,
10230 GetVF: [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeAction: ResizeToVF,
10231 Action: EstimateShufflesCost);
10232 InstructionCost InsertCost = TTI->getScalarizationOverhead(
10233 Ty: cast<FixedVectorType>(Val: FirstUsers[I].first->getType()), DemandedElts: DemandedElts[I],
10234 /*Insert*/ true, /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
10235 Cost -= InsertCost;
10236 }
10237
10238 // Add the cost for reduced value resize (if required).
10239 if (ReductionBitWidth != 0) {
10240 assert(UserIgnoreList && "Expected reduction tree.");
10241 const TreeEntry &E = *VectorizableTree.front().get();
10242 auto It = MinBWs.find(Val: &E);
10243 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10244 unsigned SrcSize = It->second.first;
10245 unsigned DstSize = ReductionBitWidth;
10246 unsigned Opcode = Instruction::Trunc;
10247 if (SrcSize < DstSize)
10248 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10249 auto *SrcVecTy =
10250 FixedVectorType::get(ElementType: Builder.getIntNTy(N: SrcSize), NumElts: E.getVectorFactor());
10251 auto *DstVecTy =
10252 FixedVectorType::get(ElementType: Builder.getIntNTy(N: DstSize), NumElts: E.getVectorFactor());
10253 TTI::CastContextHint CCH = getCastContextHint(TE: E);
10254 InstructionCost CastCost;
10255 switch (E.getOpcode()) {
10256 case Instruction::SExt:
10257 case Instruction::ZExt:
10258 case Instruction::Trunc: {
10259 const TreeEntry *OpTE = getOperandEntry(E: &E, Idx: 0);
10260 CCH = getCastContextHint(TE: *OpTE);
10261 break;
10262 }
10263 default:
10264 break;
10265 }
10266 CastCost += TTI->getCastInstrCost(Opcode, Dst: DstVecTy, Src: SrcVecTy, CCH,
10267 CostKind: TTI::TCK_RecipThroughput);
10268 Cost += CastCost;
10269 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10270 << " for final resize for reduction from " << SrcVecTy
10271 << " to " << DstVecTy << "\n";
10272 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10273 }
10274 }
10275
10276#ifndef NDEBUG
10277 SmallString<256> Str;
10278 {
10279 raw_svector_ostream OS(Str);
10280 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10281 << "SLP: Extract Cost = " << ExtractCost << ".\n"
10282 << "SLP: Total Cost = " << Cost << ".\n";
10283 }
10284 LLVM_DEBUG(dbgs() << Str);
10285 if (ViewSLPTree)
10286 ViewGraph(G: this, Name: "SLP" + F->getName(), ShortNames: false, Title: Str);
10287#endif
10288
10289 return Cost;
10290}
10291
10292/// Tries to find extractelement instructions with constant indices from a
10293/// fixed vector type and gather such instructions into a bunch, which will
10294/// highly likely be detected as a shuffle of 1 or 2 input vectors. If the
10295/// attempt is successful, the matched scalars are replaced by poison values
10296/// in \p VL for future analysis.
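/// For example, gathering
///   { extractelement <4 x i32> %v, i32 0, extractelement <4 x i32> %v, i32 2 }
/// can be treated as a single-source shuffle of %v with mask <0, 2>, so both
/// scalars are replaced by poison in \p VL and the mask records the lanes.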
10297std::optional<TTI::ShuffleKind>
10298BoUpSLP::tryToGatherSingleRegisterExtractElements(
10299 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10300 // Scan list of gathered scalars for extractelements that can be represented
10301 // as shuffles.
10302 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
10303 SmallVector<int> UndefVectorExtracts;
10304 for (int I = 0, E = VL.size(); I < E; ++I) {
10305 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
10306 if (!EI) {
10307 if (isa<UndefValue>(Val: VL[I]))
10308 UndefVectorExtracts.push_back(Elt: I);
10309 continue;
10310 }
10311 auto *VecTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
10312 if (!VecTy || !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()))
10313 continue;
10314 std::optional<unsigned> Idx = getExtractIndex(E: EI);
10315 // Undefined index.
10316 if (!Idx) {
10317 UndefVectorExtracts.push_back(Elt: I);
10318 continue;
10319 }
10320 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10321 ExtractMask.reset(Idx: *Idx);
10322 if (isUndefVector(V: EI->getVectorOperand(), UseMask: ExtractMask).all()) {
10323 UndefVectorExtracts.push_back(Elt: I);
10324 continue;
10325 }
10326 VectorOpToIdx[EI->getVectorOperand()].push_back(Elt: I);
10327 }
10328 // Sort the vector operands by the maximum number of uses in extractelements.
10329 MapVector<unsigned, SmallVector<Value *>> VFToVector;
10330 for (const auto &Data : VectorOpToIdx)
10331 VFToVector[cast<FixedVectorType>(Val: Data.first->getType())->getNumElements()]
10332 .push_back(Elt: Data.first);
10333 for (auto &Data : VFToVector) {
10334 stable_sort(Range&: Data.second, C: [&VectorOpToIdx](Value *V1, Value *V2) {
10335 return VectorOpToIdx.find(Key: V1)->second.size() >
10336 VectorOpToIdx.find(Key: V2)->second.size();
10337 });
10338 }
10339 // Find the best pair of the vectors with the same number of elements or a
10340 // single vector.
10341 const int UndefSz = UndefVectorExtracts.size();
10342 unsigned SingleMax = 0;
10343 Value *SingleVec = nullptr;
10344 unsigned PairMax = 0;
10345 std::pair<Value *, Value *> PairVec(nullptr, nullptr);
10346 for (auto &Data : VFToVector) {
10347 Value *V1 = Data.second.front();
10348 if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
10349 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10350 SingleVec = V1;
10351 }
10352 Value *V2 = nullptr;
10353 if (Data.second.size() > 1)
10354 V2 = *std::next(x: Data.second.begin());
10355 if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
10356 UndefSz) {
10357 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
10358 PairVec = std::make_pair(x&: V1, y&: V2);
10359 }
10360 }
10361 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10362 return std::nullopt;
10363  // Check if it is better to perform a shuffle of 2 vectors or just of a
10364  // single vector.
10365 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10366 SmallVector<Value *> GatheredExtracts(
10367 VL.size(), PoisonValue::get(T: VL.front()->getType()));
10368 if (SingleMax >= PairMax && SingleMax) {
10369 for (int Idx : VectorOpToIdx[SingleVec])
10370 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
10371 } else {
10372 for (Value *V : {PairVec.first, PairVec.second})
10373 for (int Idx : VectorOpToIdx[V])
10374 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
10375 }
10376 // Add extracts from undefs too.
10377 for (int Idx : UndefVectorExtracts)
10378 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
10379  // Check that the gather of extractelements can be represented as just a
10380  // shuffle of one or two vectors from which the scalars are extracted.
10381 std::optional<TTI::ShuffleKind> Res =
10382 isFixedVectorShuffle(VL: GatheredExtracts, Mask);
10383 if (!Res) {
10384 // TODO: try to check other subsets if possible.
10385 // Restore the original VL if attempt was not successful.
10386 copy(Range&: SavedVL, Out: VL.begin());
10387 return std::nullopt;
10388 }
10389 // Restore unused scalars from mask, if some of the extractelements were not
10390 // selected for shuffle.
10391 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10392 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(Val: GatheredExtracts[I]) &&
10393 isa<UndefValue>(Val: GatheredExtracts[I])) {
10394 std::swap(a&: VL[I], b&: GatheredExtracts[I]);
10395 continue;
10396 }
10397 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
10398 if (!EI || !isa<FixedVectorType>(Val: EI->getVectorOperandType()) ||
10399 !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()) ||
10400 is_contained(Range&: UndefVectorExtracts, Element: I))
10401 continue;
10402 }
10403 return Res;
10404}
10405
10406/// Tries to find extractelement instructions with constant indices from a
10407/// fixed vector type and gather such instructions into a bunch, which will
10408/// highly likely be detected as a shuffle of 1 or 2 input vectors. If the
10409/// attempt is successful, the matched scalars are replaced by poison values
10410/// in \p VL for future analysis.
10411SmallVector<std::optional<TTI::ShuffleKind>>
10412BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10413 SmallVectorImpl<int> &Mask,
10414 unsigned NumParts) const {
10415  assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
10416 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10417 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
10418 unsigned SliceSize = VL.size() / NumParts;
10419 for (unsigned Part = 0; Part < NumParts; ++Part) {
10420 // Scan list of gathered scalars for extractelements that can be represented
10421 // as shuffles.
10422 MutableArrayRef<Value *> SubVL =
10423 MutableArrayRef(VL).slice(N: Part * SliceSize, M: SliceSize);
10424 SmallVector<int> SubMask;
10425 std::optional<TTI::ShuffleKind> Res =
10426 tryToGatherSingleRegisterExtractElements(VL: SubVL, Mask&: SubMask);
10427 ShufflesRes[Part] = Res;
10428 copy(Range&: SubMask, Out: std::next(x: Mask.begin(), n: Part * SliceSize));
10429 }
10430 if (none_of(Range&: ShufflesRes, P: [](const std::optional<TTI::ShuffleKind> &Res) {
10431 return Res.has_value();
10432 }))
10433 ShufflesRes.clear();
10434 return ShufflesRes;
10435}
10436
10437std::optional<TargetTransformInfo::ShuffleKind>
10438BoUpSLP::isGatherShuffledSingleRegisterEntry(
10439 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10440 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10441 Entries.clear();
10442 // TODO: currently checking only for Scalars in the tree entry, need to count
10443 // reused elements too for better cost estimation.
10444 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10445 const Instruction *TEInsertPt = &getLastInstructionInBundle(E: TEUseEI.UserTE);
10446 const BasicBlock *TEInsertBlock = nullptr;
10447 // Main node of PHI entries keeps the correct order of operands/incoming
10448 // blocks.
10449 if (auto *PHI = dyn_cast<PHINode>(Val: TEUseEI.UserTE->getMainOp())) {
10450 TEInsertBlock = PHI->getIncomingBlock(i: TEUseEI.EdgeIdx);
10451 TEInsertPt = TEInsertBlock->getTerminator();
10452 } else {
10453 TEInsertBlock = TEInsertPt->getParent();
10454 }
10455 if (!DT->isReachableFromEntry(A: TEInsertBlock))
10456 return std::nullopt;
10457 auto *NodeUI = DT->getNode(BB: TEInsertBlock);
10458 assert(NodeUI && "Should only process reachable instructions");
10459 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10460 auto CheckOrdering = [&](const Instruction *InsertPt) {
10461 // Argument InsertPt is an instruction where vector code for some other
10462 // tree entry (one that shares one or more scalars with TE) is going to be
10463 // generated. This lambda returns true if insertion point of vector code
10464 // for the TE dominates that point (otherwise dependency is the other way
10465 // around). The other node is not limited to be of a gather kind. Gather
10466 // nodes are not scheduled and their vector code is inserted before their
10467 // first user. If user is PHI, that is supposed to be at the end of a
10468 // predecessor block. Otherwise it is the last instruction among scalars of
10469 // the user node. So, instead of checking dependency between instructions
10470 // themselves, we check dependency between their insertion points for vector
10471 // code (since each scalar instruction ends up as a lane of a vector
10472 // instruction).
10473 const BasicBlock *InsertBlock = InsertPt->getParent();
10474 auto *NodeEUI = DT->getNode(BB: InsertBlock);
10475 if (!NodeEUI)
10476 return false;
10477 assert((NodeUI == NodeEUI) ==
10478 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10479 "Different nodes should have different DFS numbers");
10480 // Check the order of the gather nodes users.
10481 if (TEInsertPt->getParent() != InsertBlock &&
10482 (DT->dominates(A: NodeUI, B: NodeEUI) || !DT->dominates(A: NodeEUI, B: NodeUI)))
10483 return false;
10484 if (TEInsertPt->getParent() == InsertBlock &&
10485 TEInsertPt->comesBefore(Other: InsertPt))
10486 return false;
10487 return true;
10488 };
10489 // Find all tree entries used by the gathered values. If no common entries
10490 // found - not a shuffle.
10491  // Here we build a set of tree nodes for each gathered value and try to
10492  // find the intersection between these sets. If we have at least one common
10493  // tree node for each gathered value, we have just a permutation of a
10494  // single vector. If we have 2 different sets, we are in a situation where
10495  // we have a permutation of 2 input vectors.
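  // For example, if every gathered value comes from vectorized entry E1, the
  // gather is just a permutation of E1's vector; if the values split between
  // E1 and E2, it becomes a two-source shuffle of E1 and E2.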
10496 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10497 DenseMap<Value *, int> UsedValuesEntry;
10498 for (Value *V : VL) {
10499 if (isConstant(V))
10500 continue;
10501 // Build a list of tree entries where V is used.
10502 SmallPtrSet<const TreeEntry *, 4> VToTEs;
10503 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(Val: V)->second) {
10504 if (TEPtr == TE)
10505 continue;
10506 assert(any_of(TEPtr->Scalars,
10507 [&](Value *V) { return GatheredScalars.contains(V); }) &&
10508 "Must contain at least single gathered value.");
10509 assert(TEPtr->UserTreeIndices.size() == 1 &&
10510 "Expected only single user of a gather node.");
10511 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10512
10513 PHINode *UserPHI = dyn_cast<PHINode>(Val: UseEI.UserTE->getMainOp());
10514 const Instruction *InsertPt =
10515 UserPHI ? UserPHI->getIncomingBlock(i: UseEI.EdgeIdx)->getTerminator()
10516 : &getLastInstructionInBundle(E: UseEI.UserTE);
10517 if (TEInsertPt == InsertPt) {
10518        // If 2 gathers are operands of the same entry (regardless of whether
10519        // the user is a PHI or not), compare operand indices and use the
10520        // earlier one as the base.
10521 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10522 continue;
10523 // If the user instruction is used for some reason in different
10524 // vectorized nodes - make it depend on index.
10525 if (TEUseEI.UserTE != UseEI.UserTE &&
10526 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10527 continue;
10528 }
10529
10530      // Check if the user node of the TE comes after the user node of TEPtr;
10531      // otherwise TEPtr depends on TE.
10532 if ((TEInsertBlock != InsertPt->getParent() ||
10533 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10534 !CheckOrdering(InsertPt))
10535 continue;
10536 VToTEs.insert(Ptr: TEPtr);
10537 }
10538 if (const TreeEntry *VTE = getTreeEntry(V)) {
10539 if (ForOrder) {
10540 if (VTE->State != TreeEntry::Vectorize) {
10541 auto It = MultiNodeScalars.find(Val: V);
10542 if (It == MultiNodeScalars.end())
10543 continue;
10544 VTE = *It->getSecond().begin();
10545 // Iterate through all vectorized nodes.
10546 auto *MIt = find_if(Range&: It->getSecond(), P: [](const TreeEntry *MTE) {
10547 return MTE->State == TreeEntry::Vectorize;
10548 });
10549 if (MIt == It->getSecond().end())
10550 continue;
10551 VTE = *MIt;
10552 }
10553 }
10554 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
10555 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10556 continue;
10557 VToTEs.insert(Ptr: VTE);
10558 }
10559 if (VToTEs.empty())
10560 continue;
10561 if (UsedTEs.empty()) {
10562 // The first iteration, just insert the list of nodes to vector.
10563 UsedTEs.push_back(Elt: VToTEs);
10564 UsedValuesEntry.try_emplace(Key: V, Args: 0);
10565 } else {
10566      // Need to check if there are any previously used tree nodes which use V.
10567      // If there are no such nodes, consider that we have one more input
10568      // vector.
10569 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
10570 unsigned Idx = 0;
10571 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
10572 // Do we have a non-empty intersection of previously listed tree entries
10573 // and tree entries using current V?
10574 set_intersect(S1&: VToTEs, S2: Set);
10575 if (!VToTEs.empty()) {
10576 // Yes, write the new subset and continue analysis for the next
10577 // scalar.
10578 Set.swap(RHS&: VToTEs);
10579 break;
10580 }
10581 VToTEs = SavedVToTEs;
10582 ++Idx;
10583 }
10584 // No non-empty intersection found - need to add a second set of possible
10585 // source vectors.
10586 if (Idx == UsedTEs.size()) {
10587 // If the number of input vectors is greater than 2 - not a permutation,
10588 // fallback to the regular gather.
10589 // TODO: support multiple reshuffled nodes.
10590 if (UsedTEs.size() == 2)
10591 continue;
10592 UsedTEs.push_back(Elt: SavedVToTEs);
10593 Idx = UsedTEs.size() - 1;
10594 }
10595 UsedValuesEntry.try_emplace(Key: V, Args&: Idx);
10596 }
10597 }
10598
10599 if (UsedTEs.empty()) {
10600 Entries.clear();
10601 return std::nullopt;
10602 }
10603
10604 unsigned VF = 0;
10605 if (UsedTEs.size() == 1) {
10606 // Keep the order to avoid non-determinism.
10607 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
10608 UsedTEs.front().end());
10609 sort(C&: FirstEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
10610 return TE1->Idx < TE2->Idx;
10611 });
10612 // Try to find the perfect match in another gather node at first.
10613 auto *It = find_if(Range&: FirstEntries, P: [=](const TreeEntry *EntryPtr) {
10614 return EntryPtr->isSame(VL) || EntryPtr->isSame(VL: TE->Scalars);
10615 });
10616 if (It != FirstEntries.end() &&
10617 ((*It)->getVectorFactor() == VL.size() ||
10618 ((*It)->getVectorFactor() == TE->Scalars.size() &&
10619 TE->ReuseShuffleIndices.size() == VL.size() &&
10620 (*It)->isSame(VL: TE->Scalars)))) {
10621 Entries.push_back(Elt: *It);
10622 if ((*It)->getVectorFactor() == VL.size()) {
10623 std::iota(first: std::next(x: Mask.begin(), n: Part * VL.size()),
10624 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: 0);
10625 } else {
10626 SmallVector<int> CommonMask = TE->getCommonMask();
10627 copy(Range&: CommonMask, Out: Mask.begin());
10628 }
10629 // Clear undef scalars.
10630 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10631 if (isa<PoisonValue>(Val: VL[I]))
10632 Mask[I] = PoisonMaskElem;
10633 return TargetTransformInfo::SK_PermuteSingleSrc;
10634 }
10635 // No perfect match, just shuffle, so choose the first tree node from the
10636 // tree.
10637 Entries.push_back(Elt: FirstEntries.front());
10638 } else {
10639 // Try to find nodes with the same vector factor.
10640 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
10641 // Keep the order of tree nodes to avoid non-determinism.
10642 DenseMap<int, const TreeEntry *> VFToTE;
10643 for (const TreeEntry *TE : UsedTEs.front()) {
10644 unsigned VF = TE->getVectorFactor();
10645 auto It = VFToTE.find(Val: VF);
10646 if (It != VFToTE.end()) {
10647 if (It->second->Idx > TE->Idx)
10648 It->getSecond() = TE;
10649 continue;
10650 }
10651 VFToTE.try_emplace(Key: VF, Args&: TE);
10652 }
10653 // Same, keep the order to avoid non-determinism.
10654 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
10655 UsedTEs.back().end());
10656 sort(C&: SecondEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
10657 return TE1->Idx < TE2->Idx;
10658 });
10659 for (const TreeEntry *TE : SecondEntries) {
10660 auto It = VFToTE.find(Val: TE->getVectorFactor());
10661 if (It != VFToTE.end()) {
10662 VF = It->first;
10663 Entries.push_back(Elt: It->second);
10664 Entries.push_back(Elt: TE);
10665 break;
10666 }
10667 }
10668 // No 2 source vectors with the same vector factor - just choose 2 with max
10669 // index.
10670 if (Entries.empty()) {
10671 Entries.push_back(Elt: *llvm::max_element(
10672 Range&: UsedTEs.front(), C: [](const TreeEntry *TE1, const TreeEntry *TE2) {
10673 return TE1->Idx < TE2->Idx;
10674 }));
10675 Entries.push_back(Elt: SecondEntries.front());
10676 VF = std::max(a: Entries.front()->getVectorFactor(),
10677 b: Entries.back()->getVectorFactor());
10678 }
10679 }
10680
10681 bool IsSplatOrUndefs = isSplat(VL) || all_of(Range&: VL, P: IsaPred<UndefValue>);
10682  // Checks if the 2 PHIs are compatible in terms of a high possibility of
10683  // being vectorized.
10684 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
10685 auto *PHI = cast<PHINode>(Val: V);
10686 auto *PHI1 = cast<PHINode>(Val: V1);
10687    // Check that all incoming values are compatible/from the same parent (if
10688    // they are instructions).
10689    // The incoming values are compatible if they all are constants, or
10690    // instructions with the same/alternate opcodes from the same basic block.
10691 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
10692 Value *In = PHI->getIncomingValue(i: I);
10693 Value *In1 = PHI1->getIncomingValue(i: I);
10694 if (isConstant(V: In) && isConstant(V: In1))
10695 continue;
10696 if (!getSameOpcode(VL: {In, In1}, TLI: *TLI).getOpcode())
10697 return false;
10698 if (cast<Instruction>(Val: In)->getParent() !=
10699 cast<Instruction>(Val: In1)->getParent())
10700 return false;
10701 }
10702 return true;
10703 };
10704  // Check if the value can be ignored during analysis for shuffled gathers.
10705  // We suppose it is better to ignore instructions that do not form splats,
10706  // are not vectorized or not extractelements (these instructions will be
10707  // handled by extractelement processing) or may form a vector node in future.
10708 auto MightBeIgnored = [=](Value *V) {
10709 auto *I = dyn_cast<Instruction>(Val: V);
10710 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(Val: I) &&
10711 !isVectorLikeInstWithConstOps(V: I) &&
10712 !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList) && isSimple(I);
10713 };
10714  // Check that the neighbor instruction may form a full vector node with the
10715  // current instruction V. It is possible if they have the same/alternate
10716  // opcode and the same parent basic block.
10717 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
10718 Value *V1 = VL[Idx];
10719 bool UsedInSameVTE = false;
10720 auto It = UsedValuesEntry.find(Val: V1);
10721 if (It != UsedValuesEntry.end())
10722 UsedInSameVTE = It->second == UsedValuesEntry.find(Val: V)->second;
10723 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
10724 getSameOpcode(VL: {V, V1}, TLI: *TLI).getOpcode() &&
10725 cast<Instruction>(Val: V)->getParent() ==
10726 cast<Instruction>(Val: V1)->getParent() &&
10727 (!isa<PHINode>(Val: V1) || AreCompatiblePHIs(V, V1));
10728 };
10729 // Build a shuffle mask for better cost estimation and vector emission.
10730 SmallBitVector UsedIdxs(Entries.size());
10731 SmallVector<std::pair<unsigned, int>> EntryLanes;
10732 for (int I = 0, E = VL.size(); I < E; ++I) {
10733 Value *V = VL[I];
10734 auto It = UsedValuesEntry.find(Val: V);
10735 if (It == UsedValuesEntry.end())
10736 continue;
10737 // Do not try to shuffle scalars, if they are constants, or instructions
10738 // that can be vectorized as a result of the following vector build
10739 // vectorization.
10740 if (isConstant(V) || (MightBeIgnored(V) &&
10741 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
10742 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
10743 continue;
10744 unsigned Idx = It->second;
10745 EntryLanes.emplace_back(Args&: Idx, Args&: I);
10746 UsedIdxs.set(Idx);
10747 }
10748 // Iterate through all shuffled scalars and select entries, which can be used
10749 // for final shuffle.
10750 SmallVector<const TreeEntry *> TempEntries;
10751 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
10752 if (!UsedIdxs.test(Idx: I))
10753 continue;
10754    // Fix the entry number for the given scalar. If it is the first entry, set
10755    // Pair.first to 0, otherwise to 1 (currently we select at most 2 nodes).
10756    // These indices are used as the vector offset when calculating the final
10757    // shuffle mask.
10758 for (std::pair<unsigned, int> &Pair : EntryLanes)
10759 if (Pair.first == I)
10760 Pair.first = TempEntries.size();
10761 TempEntries.push_back(Elt: Entries[I]);
10762 }
10763 Entries.swap(RHS&: TempEntries);
10764 if (EntryLanes.size() == Entries.size() &&
10765 !VL.equals(RHS: ArrayRef(TE->Scalars)
10766 .slice(N: Part * VL.size(),
10767 M: std::min<int>(a: VL.size(), b: TE->Scalars.size())))) {
10768    // We may have here 1 or 2 entries only. If the number of scalars is equal
10769    // to the number of entries, there is no need to do the analysis, it is not
10770    // very profitable. Since VL is not the same as TE->Scalars, it means we
10771    // already have some shuffles before. Cut off the unprofitable case.
10772 Entries.clear();
10773 return std::nullopt;
10774 }
10775 // Build the final mask, check for the identity shuffle, if possible.
10776 bool IsIdentity = Entries.size() == 1;
10777 // Pair.first is the offset to the vector, while Pair.second is the index of
10778 // scalar in the list.
10779 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
10780 unsigned Idx = Part * VL.size() + Pair.second;
10781 Mask[Idx] =
10782 Pair.first * VF +
10783 (ForOrder ? std::distance(
10784 first: Entries[Pair.first]->Scalars.begin(),
10785 last: find(Range: Entries[Pair.first]->Scalars, Val: VL[Pair.second]))
10786 : Entries[Pair.first]->findLaneForValue(V: VL[Pair.second]));
10787 IsIdentity &= Mask[Idx] == Pair.second;
10788 }
10789 switch (Entries.size()) {
10790 case 1:
10791 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
10792 return TargetTransformInfo::SK_PermuteSingleSrc;
10793 break;
10794 case 2:
10795 if (EntryLanes.size() > 2 || VL.size() <= 2)
10796 return TargetTransformInfo::SK_PermuteTwoSrc;
10797 break;
10798 default:
10799 break;
10800 }
10801 Entries.clear();
10802 // Clear the corresponding mask elements.
10803 std::fill(first: std::next(x: Mask.begin(), n: Part * VL.size()),
10804 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: PoisonMaskElem);
10805 return std::nullopt;
10806}
10807
10808SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
10809BoUpSLP::isGatherShuffledEntry(
10810 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
10811 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
10812 bool ForOrder) {
10813 assert(NumParts > 0 && NumParts < VL.size() &&
10814 "Expected positive number of registers.");
10815 Entries.clear();
10816 // No need to check for the topmost gather node.
10817 if (TE == VectorizableTree.front().get())
10818 return {};
10819 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
10820 if (TE->isNonPowOf2Vec())
10821 return {};
10822 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
10823 assert(TE->UserTreeIndices.size() == 1 &&
10824 "Expected only single user of the gather node.");
10825 assert(VL.size() % NumParts == 0 &&
10826 "Number of scalars must be divisible by NumParts.");
10827 unsigned SliceSize = VL.size() / NumParts;
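  // For example, 8 scalars split into NumParts = 2 gives SliceSize = 4; each
  // 4-wide slice below is analyzed as an independent single-register shuffle.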
10828 SmallVector<std::optional<TTI::ShuffleKind>> Res;
10829 for (unsigned Part = 0; Part < NumParts; ++Part) {
10830 ArrayRef<Value *> SubVL = VL.slice(N: Part * SliceSize, M: SliceSize);
10831 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
10832 std::optional<TTI::ShuffleKind> SubRes =
10833 isGatherShuffledSingleRegisterEntry(TE, VL: SubVL, Mask, Entries&: SubEntries, Part,
10834 ForOrder);
10835 if (!SubRes)
10836 SubEntries.clear();
10837 Res.push_back(Elt: SubRes);
10838 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
10839 SubEntries.front()->getVectorFactor() == VL.size() &&
10840 (SubEntries.front()->isSame(VL: TE->Scalars) ||
10841 SubEntries.front()->isSame(VL))) {
10842 SmallVector<const TreeEntry *> LocalSubEntries;
10843 LocalSubEntries.swap(RHS&: SubEntries);
10844 Entries.clear();
10845 Res.clear();
10846 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
10847 // Clear undef scalars.
10848 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10849 if (isa<PoisonValue>(Val: VL[I]))
10850 Mask[I] = PoisonMaskElem;
10851 Entries.emplace_back(Args: 1, Args&: LocalSubEntries.front());
10852 Res.push_back(Elt: TargetTransformInfo::SK_PermuteSingleSrc);
10853 return Res;
10854 }
10855 }
10856 if (all_of(Range&: Res,
10857 P: [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
10858 Entries.clear();
10859 return {};
10860 }
10861 return Res;
10862}
10863
10864InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
10865 bool ForPoisonSrc) const {
10866 // Find the type of the operands in VL.
10867 Type *ScalarTy = VL[0]->getType();
10868 if (StoreInst *SI = dyn_cast<StoreInst>(Val: VL[0]))
10869 ScalarTy = SI->getValueOperand()->getType();
10870 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VL.size());
10871 bool DuplicateNonConst = false;
10872 // Find the cost of inserting/extracting values from the vector.
10873 // Check if the same elements are inserted several times and count them as
10874 // shuffle candidates.
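  // For example, gathering {%a, %b, %a, %c} costs inserts only for the first
  // occurrences of %a, %b and %c; the duplicated lane is then covered by the
  // final single-source shuffle with mask <0, 1, 0, 3>.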
10875 APInt ShuffledElements = APInt::getZero(numBits: VL.size());
10876 DenseMap<Value *, unsigned> UniqueElements;
10877 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10878 InstructionCost Cost;
10879 auto EstimateInsertCost = [&](unsigned I, Value *V) {
10880 if (!ForPoisonSrc)
10881 Cost +=
10882 TTI->getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind,
10883 Index: I, Op0: Constant::getNullValue(Ty: VecTy), Op1: V);
10884 };
10885 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10886 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
10887 Value *V = VL[I];
10888 // No need to shuffle duplicates for constants.
10889 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(Val: V)) {
10890 ShuffledElements.setBit(I);
10891 ShuffleMask[I] = isa<PoisonValue>(Val: V) ? PoisonMaskElem : I;
10892 continue;
10893 }
10894
10895 auto Res = UniqueElements.try_emplace(Key: V, Args&: I);
10896 if (Res.second) {
10897 EstimateInsertCost(I, V);
10898 ShuffleMask[I] = I;
10899 continue;
10900 }
10901
10902 DuplicateNonConst = true;
10903 ShuffledElements.setBit(I);
10904 ShuffleMask[I] = Res.first->second;
10905 }
10906 if (ForPoisonSrc)
10907 Cost =
10908 TTI->getScalarizationOverhead(Ty: VecTy, DemandedElts: ~ShuffledElements, /*Insert*/ true,
10909 /*Extract*/ false, CostKind);
10910 if (DuplicateNonConst)
10911 Cost += TTI->getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc,
10912 Tp: VecTy, Mask: ShuffleMask);
10913 return Cost;
10914}
10915
10916// Perform operand reordering on the instructions in VL and return the reordered
10917// operands in Left and Right.
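// For illustration (the operands below are hypothetical): given the bundle
// {%x + %a, %b + %x}, commutativity allows the reordering to produce
// Left = {%x, %x} and Right = {%a, %b}, so the splatted %x ends up in a single
// operand vector and the other operand becomes a plain gather.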
10918void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
10919 SmallVectorImpl<Value *> &Left,
10920 SmallVectorImpl<Value *> &Right,
10921 const BoUpSLP &R) {
10922 if (VL.empty())
10923 return;
10924 VLOperands Ops(VL, R);
10925 // Reorder the operands in place.
10926 Ops.reorder();
10927 Left = Ops.getVL(OpIdx: 0);
10928 Right = Ops.getVL(OpIdx: 1);
10929}
10930
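// Note (simplified, illustrative): for a scheduled bundle the instruction
// returned below is the bundle member that comes last in program order, found
// via the scheduler data; entries that do not need scheduling may instead use
// the first instruction, and a brute-force scan of the block is the fallback.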
10931Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
10932 auto &Res = EntryToLastInstruction.FindAndConstruct(Key: E);
10933 if (Res.second)
10934 return *Res.second;
10935 // Get the basic block this bundle is in. All instructions in the bundle
10936 // should be in this block (except for extractelement-like instructions with
10937 // constant indices).
10938 auto *Front = E->getMainOp();
10939 auto *BB = Front->getParent();
10940 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
10941 if (E->getOpcode() == Instruction::GetElementPtr &&
10942 !isa<GetElementPtrInst>(V))
10943 return true;
10944 auto *I = cast<Instruction>(V);
10945 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
10946 isVectorLikeInstWithConstOps(I);
10947 }));
10948
10949 auto FindLastInst = [&]() {
10950 Instruction *LastInst = Front;
10951 for (Value *V : E->Scalars) {
10952 auto *I = dyn_cast<Instruction>(Val: V);
10953 if (!I)
10954 continue;
10955 if (LastInst->getParent() == I->getParent()) {
10956 if (LastInst->comesBefore(Other: I))
10957 LastInst = I;
10958 continue;
10959 }
10960 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10961 !isa<GetElementPtrInst>(I)) ||
10962 (isVectorLikeInstWithConstOps(LastInst) &&
10963 isVectorLikeInstWithConstOps(I))) &&
10964 "Expected vector-like or non-GEP in GEP node insts only.");
10965 if (!DT->isReachableFromEntry(A: LastInst->getParent())) {
10966 LastInst = I;
10967 continue;
10968 }
10969 if (!DT->isReachableFromEntry(A: I->getParent()))
10970 continue;
10971 auto *NodeA = DT->getNode(BB: LastInst->getParent());
10972 auto *NodeB = DT->getNode(BB: I->getParent());
10973 assert(NodeA && "Should only process reachable instructions");
10974 assert(NodeB && "Should only process reachable instructions");
10975 assert((NodeA == NodeB) ==
10976 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10977 "Different nodes should have different DFS numbers");
10978 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
10979 LastInst = I;
10980 }
10981 BB = LastInst->getParent();
10982 return LastInst;
10983 };
10984
10985 auto FindFirstInst = [&]() {
10986 Instruction *FirstInst = Front;
10987 for (Value *V : E->Scalars) {
10988 auto *I = dyn_cast<Instruction>(Val: V);
10989 if (!I)
10990 continue;
10991 if (FirstInst->getParent() == I->getParent()) {
10992 if (I->comesBefore(Other: FirstInst))
10993 FirstInst = I;
10994 continue;
10995 }
10996 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10997 !isa<GetElementPtrInst>(I)) ||
10998 (isVectorLikeInstWithConstOps(FirstInst) &&
10999 isVectorLikeInstWithConstOps(I))) &&
11000 "Expected vector-like or non-GEP in GEP node insts only.");
11001 if (!DT->isReachableFromEntry(A: FirstInst->getParent())) {
11002 FirstInst = I;
11003 continue;
11004 }
11005 if (!DT->isReachableFromEntry(A: I->getParent()))
11006 continue;
11007 auto *NodeA = DT->getNode(BB: FirstInst->getParent());
11008 auto *NodeB = DT->getNode(BB: I->getParent());
11009 assert(NodeA && "Should only process reachable instructions");
11010 assert(NodeB && "Should only process reachable instructions");
11011 assert((NodeA == NodeB) ==
11012 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11013 "Different nodes should have different DFS numbers");
11014 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11015 FirstInst = I;
11016 }
11017 return FirstInst;
11018 };
11019
11020 // Set the insert point to the beginning of the basic block if the entry
11021 // should not be scheduled.
11022 if (doesNotNeedToSchedule(VL: E->Scalars) ||
11023 (E->State != TreeEntry::NeedToGather &&
11024 all_of(Range: E->Scalars, P: isVectorLikeInstWithConstOps))) {
11025 if ((E->getOpcode() == Instruction::GetElementPtr &&
11026 any_of(Range: E->Scalars,
11027 P: [](Value *V) {
11028 return !isa<GetElementPtrInst>(Val: V) && isa<Instruction>(Val: V);
11029 })) ||
11030 all_of(Range: E->Scalars,
11031 P: [](Value *V) {
11032 return !isVectorLikeInstWithConstOps(V) &&
11033 isUsedOutsideBlock(V);
11034 }) ||
11035 (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
11036 all_of(Range: E->Scalars, P: [](Value *V) {
11037 return isa<ExtractElementInst, UndefValue>(Val: V) ||
11038 areAllOperandsNonInsts(V);
11039 })))
11040 Res.second = FindLastInst();
11041 else
11042 Res.second = FindFirstInst();
11043 return *Res.second;
11044 }
11045
11046 // Find the last instruction. The common case should be that BB has been
11047 // scheduled, and the last instruction is VL.back(). So we start with
11048 // VL.back() and iterate over schedule data until we reach the end of the
11049 // bundle. The end of the bundle is marked by null ScheduleData.
11050 if (BlocksSchedules.count(Key: BB)) {
11051 Value *V = E->isOneOf(Op: E->Scalars.back());
11052 if (doesNotNeedToBeScheduled(V))
11053 V = *find_if_not(Range: E->Scalars, P: doesNotNeedToBeScheduled);
11054 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11055 if (Bundle && Bundle->isPartOfBundle())
11056 for (; Bundle; Bundle = Bundle->NextInBundle)
11057 if (Bundle->OpValue == Bundle->Inst)
11058 Res.second = Bundle->Inst;
11059 }
11060
11061 // LastInst can still be null at this point if there's either not an entry
11062 // for BB in BlocksSchedules or there's no ScheduleData available for
11063 // VL.back(). This can be the case if buildTree_rec aborts for various
11064 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11065 // size is reached, etc.). ScheduleData is initialized in the scheduling
11066 // "dry-run".
11067 //
11068 // If this happens, we can still find the last instruction by brute force. We
11069 // iterate forwards from Front (inclusive) until we either see all
11070 // instructions in the bundle or reach the end of the block. If Front is the
11071 // last instruction in program order, LastInst will be set to Front, and we
11072 // will visit all the remaining instructions in the block.
11073 //
11074 // One of the reasons we exit early from buildTree_rec is to place an upper
11075 // bound on compile-time. Thus, taking an additional compile-time hit here is
11076 // not ideal. However, this should be exceedingly rare since it requires that
11077 // we both exit early from buildTree_rec and that the bundle be out-of-order
11078 // (causing us to iterate all the way to the end of the block).
11079 if (!Res.second)
11080 Res.second = FindLastInst();
11081 assert(Res.second && "Failed to find last instruction in bundle");
11082 return *Res.second;
11083}
11084
11085void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11086 auto *Front = E->getMainOp();
11087 Instruction *LastInst = &getLastInstructionInBundle(E);
11088 assert(LastInst && "Failed to find last instruction in bundle");
11089 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11090 // If the instruction is a PHI, set the insert point after all the PHIs.
11091 bool IsPHI = isa<PHINode>(Val: LastInst);
11092 if (IsPHI)
11093 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11094 if (IsPHI || (E->State != TreeEntry::NeedToGather &&
11095 doesNotNeedToSchedule(VL: E->Scalars))) {
11096 Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: LastInstIt);
11097 } else {
11098 // Set the insertion point after the last instruction in the bundle. Set the
11099 // debug location to Front.
11100 Builder.SetInsertPoint(
11101 TheBB: LastInst->getParent(),
11102 IP: LastInst->getNextNonDebugInstruction()->getIterator());
11103 }
11104 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11105}
11106
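// Builds the gather sequence for VL by inserting the scalars one by one into a
// poison (or Root) vector: constants are inserted first, then the remaining
// unique scalars, and instructions that live in the insertion block or inside
// the current loop are postponed to the very end, so the loop-invariant prefix
// of the sequence has a better chance of being hoisted later.
// Illustrative example (values are made up, %ld assumed to be defined outside
// the loop): gathering {%ld, 7, %loop_phi} emits the insert of 7 first, then
// %ld, and finally the postponed %loop_phi.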
11107Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
11108 // List of instructions/lanes from the current block and/or the blocks that
11109 // are part of the current loop. These instructions will be inserted at the
11110 // end, to make it possible to optimize loops and hoist invariant
11111 // instructions out of the loop's body with better chances for success.
11112 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
11113 SmallSet<int, 4> PostponedIndices;
11114 Loop *L = LI->getLoopFor(BB: Builder.GetInsertBlock());
11115 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11116 SmallPtrSet<BasicBlock *, 4> Visited;
11117 while (InsertBB && InsertBB != InstBB && Visited.insert(Ptr: InsertBB).second)
11118 InsertBB = InsertBB->getSinglePredecessor();
11119 return InsertBB && InsertBB == InstBB;
11120 };
11121 for (int I = 0, E = VL.size(); I < E; ++I) {
11122 if (auto *Inst = dyn_cast<Instruction>(Val: VL[I]))
11123 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11124 getTreeEntry(V: Inst) ||
11125 (L && (!Root || L->isLoopInvariant(V: Root)) && L->contains(Inst))) &&
11126 PostponedIndices.insert(V: I).second)
11127 PostponedInsts.emplace_back(Args&: Inst, Args&: I);
11128 }
11129
11130 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11131 Type *Ty) {
11132 Value *Scalar = V;
11133 if (cast<VectorType>(Val: Vec->getType())->getElementType() != Ty) {
11134 assert(V->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11135 "Expected integer types only.");
11136 Vec = Builder.CreateIntCast(
11137 V: Vec,
11138 DestTy: VectorType::get(ElementType: Ty,
11139 EC: cast<VectorType>(Val: Vec->getType())->getElementCount()),
11140 isSigned: !isKnownNonNegative(V: Vec, SQ: SimplifyQuery(*DL)));
11141 }
11142
11143 Vec = Builder.CreateInsertElement(Vec, NewElt: Scalar, Idx: Builder.getInt32(C: Pos));
11144 auto *InsElt = dyn_cast<InsertElementInst>(Val: Vec);
11145 if (!InsElt)
11146 return Vec;
11147 GatherShuffleExtractSeq.insert(X: InsElt);
11148 CSEBlocks.insert(V: InsElt->getParent());
11149 // Add to our 'need-to-extract' list.
11150 if (isa<Instruction>(Val: V)) {
11151 if (TreeEntry *Entry = getTreeEntry(V)) {
11152 // Find which lane we need to extract.
11153 User *UserOp = nullptr;
11154 if (Scalar != V) {
11155 if (auto *SI = dyn_cast<Instruction>(Val: Scalar))
11156 UserOp = SI;
11157 } else {
11158 UserOp = InsElt;
11159 }
11160 if (UserOp) {
11161 unsigned FoundLane = Entry->findLaneForValue(V);
11162 ExternalUses.emplace_back(Args&: V, Args&: UserOp, Args&: FoundLane);
11163 }
11164 }
11165 }
11166 return Vec;
11167 };
11168 Value *Val0 =
11169 isa<StoreInst>(Val: VL[0]) ? cast<StoreInst>(Val: VL[0])->getValueOperand() : VL[0];
11170 Type *ScalarTy = Val0->getType();
11171 FixedVectorType *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VL.size());
11172 Value *Vec = Root ? Root : PoisonValue::get(T: VecTy);
11173 SmallVector<int> NonConsts;
11174 // Insert the constant values first.
11175 for (int I = 0, E = VL.size(); I < E; ++I) {
11176 if (PostponedIndices.contains(V: I))
11177 continue;
11178 if (!isConstant(V: VL[I])) {
11179 NonConsts.push_back(Elt: I);
11180 continue;
11181 }
11182 if (Root) {
11183 if (!isa<UndefValue>(Val: VL[I])) {
11184 NonConsts.push_back(Elt: I);
11185 continue;
11186 }
11187 if (isa<PoisonValue>(Val: VL[I]))
11188 continue;
11189 if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Root)) {
11190 if (SV->getMaskValue(Elt: I) == PoisonMaskElem)
11191 continue;
11192 }
11193 }
11194 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11195 }
11196 // Insert non-constant values.
11197 for (int I : NonConsts)
11198 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11199 // Append the instructions that are (or may be) part of the loop at the end,
11200 // to make it possible to hoist the non-loop-based instructions.
11201 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11202 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11203
11204 return Vec;
11205}
11206
11207 /// Merges shuffle masks and emits the final shuffle instruction, if required.
11208 /// It supports shuffling of 2 input vectors. It implements lazy shuffle
11209 /// emission: the actual shuffle instruction is generated only if it is
11210 /// actually required. Otherwise, the shuffle instruction emission is delayed
11211 /// until the end of the process, to reduce the number of emitted instructions
11212 /// and to simplify further analysis/transformations.
11213 /// The class will also look through the previously emitted shuffle
11214 /// instructions and properly mark indices in the mask as undef.
11215/// For example, given the code
11216/// \code
11217/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11218/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11219/// \endcode
11220 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
11221 /// will look through %s1 and %s2 and emit
11222/// \code
11223/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11224/// \endcode
11225/// instead.
11226 /// If the 2 operands are of different sizes, the smaller one will be resized
11227 /// and the mask recalculated properly.
11228/// For example, given the code
11229/// \code
11230/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11231/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11232/// \endcode
11233 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
11234 /// will look through %s1 and %s2 and emit
11235/// \code
11236/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11237/// \endcode
11238/// instead.
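/// A typical (illustrative) use of the builder, with hypothetical values and
/// masks, mirrors the pattern used elsewhere in this file:
/// \code
///   ShuffleInstructionBuilder ShuffleBuilder(Builder, R);
///   ShuffleBuilder.add(V, Mask);            // no shuffle emitted yet
///   Value *Res = ShuffleBuilder.finalize(ReuseShuffleIndices);
/// \endcode
/// The actual shufflevector instructions are only created inside finalize()
/// (or earlier, when a third input forces the first two to be combined).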
11239class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11240 bool IsFinalized = false;
11241 /// Combined mask for all applied operands and masks. It is built during
11242 /// analysis and actual emission of shuffle vector instructions.
11243 SmallVector<int> CommonMask;
11244 /// List of operands for the shuffle vector instruction. It holds at most 2
11245 /// operands. If a 3rd one is going to be added, the first 2 are combined into
11246 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
11247 /// resulting shuffle and the second operand is set to be the newly added
11248 /// operand. The \p CommonMask is transformed accordingly after that.
11249 SmallVector<Value *, 2> InVectors;
11250 IRBuilderBase &Builder;
11251 BoUpSLP &R;
11252
11253 class ShuffleIRBuilder {
11254 IRBuilderBase &Builder;
11255 /// Holds all of the instructions that we gathered.
11256 SetVector<Instruction *> &GatherShuffleExtractSeq;
11257 /// A list of blocks that we are going to CSE.
11258 DenseSet<BasicBlock *> &CSEBlocks;
11259 /// Data layout.
11260 const DataLayout &DL;
11261
11262 public:
11263 ShuffleIRBuilder(IRBuilderBase &Builder,
11264 SetVector<Instruction *> &GatherShuffleExtractSeq,
11265 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11266 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11267 CSEBlocks(CSEBlocks), DL(DL) {}
11268 ~ShuffleIRBuilder() = default;
11269 /// Creates shufflevector for the 2 operands with the given mask.
11270 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11271 if (V1->getType() != V2->getType()) {
11272 assert(V1->getType()->isIntOrIntVectorTy() &&
11273 V2->getType()->isIntOrIntVectorTy() &&
11274 "Expected integer vector types only.");
11275 if (V1->getType() != V2->getType()) {
11276 if (cast<VectorType>(Val: V2->getType())
11277 ->getElementType()
11278 ->getIntegerBitWidth() < cast<VectorType>(Val: V1->getType())
11279 ->getElementType()
11280 ->getIntegerBitWidth())
11281 V2 = Builder.CreateIntCast(
11282 V: V2, DestTy: V1->getType(), isSigned: !isKnownNonNegative(V: V2, SQ: SimplifyQuery(DL)));
11283 else
11284 V1 = Builder.CreateIntCast(
11285 V: V1, DestTy: V2->getType(), isSigned: !isKnownNonNegative(V: V1, SQ: SimplifyQuery(DL)));
11286 }
11287 }
11288 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11289 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
11290 GatherShuffleExtractSeq.insert(X: I);
11291 CSEBlocks.insert(V: I->getParent());
11292 }
11293 return Vec;
11294 }
11295 /// Creates a permutation of the single vector operand with the given mask,
11296 /// if it is not an identity mask.
11297 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11298 if (Mask.empty())
11299 return V1;
11300 unsigned VF = Mask.size();
11301 unsigned LocalVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
11302 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF))
11303 return V1;
11304 Value *Vec = Builder.CreateShuffleVector(V: V1, Mask);
11305 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
11306 GatherShuffleExtractSeq.insert(X: I);
11307 CSEBlocks.insert(V: I->getParent());
11308 }
11309 return Vec;
11310 }
11311 Value *createIdentity(Value *V) { return V; }
11312 Value *createPoison(Type *Ty, unsigned VF) {
11313 return PoisonValue::get(T: FixedVectorType::get(ElementType: Ty, NumElts: VF));
11314 }
11315 /// Resizes the 2 input vectors to match their sizes, if they are not equal
11316 /// yet. The smaller vector is resized to the size of the larger vector.
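/// For example (illustrative), pairing a <2 x i32> with a <4 x i32> widens the
/// former via a shufflevector with mask <0, 1, poison, poison>, so that both
/// operands end up as <4 x i32>.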
11317 void resizeToMatch(Value *&V1, Value *&V2) {
11318 if (V1->getType() == V2->getType())
11319 return;
11320 int V1VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
11321 int V2VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements();
11322 int VF = std::max(a: V1VF, b: V2VF);
11323 int MinVF = std::min(a: V1VF, b: V2VF);
11324 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11325 std::iota(first: IdentityMask.begin(), last: std::next(x: IdentityMask.begin(), n: MinVF),
11326 value: 0);
11327 Value *&Op = MinVF == V1VF ? V1 : V2;
11328 Op = Builder.CreateShuffleVector(V: Op, Mask: IdentityMask);
11329 if (auto *I = dyn_cast<Instruction>(Val: Op)) {
11330 GatherShuffleExtractSeq.insert(X: I);
11331 CSEBlocks.insert(V: I->getParent());
11332 }
11333 if (MinVF == V1VF)
11334 V1 = Op;
11335 else
11336 V2 = Op;
11337 }
11338 };
11339
11340 /// Smart shuffle instruction emission, walks through shuffle trees and
11341 /// tries to find the best matching vector for the actual shuffle
11342 /// instruction.
11343 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11344 assert(V1 && "Expected at least one vector value.");
11345 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11346 R.CSEBlocks, *R.DL);
11347 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11348 Builder&: ShuffleBuilder);
11349 }
11350
11351 /// Transforms the \p CommonMask mask according to the given \p Mask, so that
11352 /// it is correct after the shuffle has been emitted.
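/// For example (illustrative), once the shuffle for CommonMask
/// <1, poison, 3, 2> has been emitted, the mask is rewritten to
/// <0, poison, 2, 3>: every used element is now provided by the freshly
/// created vector at its own position.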
11353 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11354 ArrayRef<int> Mask) {
11355 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11356 if (Mask[Idx] != PoisonMaskElem)
11357 CommonMask[Idx] = Idx;
11358 }
11359
11360public:
11361 ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
11362 : Builder(Builder), R(R) {}
11363
11364 /// Adjusts extractelements after reusing them.
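/// Illustrative sketch: if the gathered bundle consists of extractelements
/// taken from two different <4 x float> sources and NumParts == 2, each
/// 4-element part is shuffled from its own base vector(s) and the parts are
/// then joined into one wide vector; \p Mask is rewritten accordingly and
/// \p UseVecBaseAsInput tells the caller to use the returned vector as the
/// single shuffle input. The <4 x float> type here is only an example.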
11365 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11366 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11367 unsigned NumParts, bool &UseVecBaseAsInput) {
11368 UseVecBaseAsInput = false;
11369 SmallPtrSet<Value *, 4> UniqueBases;
11370 Value *VecBase = nullptr;
11371 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11372 int Idx = Mask[I];
11373 if (Idx == PoisonMaskElem)
11374 continue;
11375 auto *EI = cast<ExtractElementInst>(Val: E->Scalars[I]);
11376 VecBase = EI->getVectorOperand();
11377 if (const TreeEntry *TE = R.getTreeEntry(V: VecBase))
11378 VecBase = TE->VectorizedValue;
11379 assert(VecBase && "Expected vectorized value.");
11380 UniqueBases.insert(Ptr: VecBase);
11381 // If the only use is vectorized, we can delete the extractelement
11382 // itself.
11383 if (!EI->hasOneUse() || (NumParts != 1 && count(Range: E->Scalars, Element: EI) > 1) ||
11384 any_of(Range: EI->users(), P: [&](User *U) {
11385 const TreeEntry *UTE = R.getTreeEntry(V: U);
11386 return !UTE || R.MultiNodeScalars.contains(Val: U) ||
11387 count_if(Range&: R.VectorizableTree,
11388 P: [&](const std::unique_ptr<TreeEntry> &TE) {
11389 return any_of(Range&: TE->UserTreeIndices,
11390 P: [&](const EdgeInfo &Edge) {
11391 return Edge.UserTE == UTE;
11392 }) &&
11393 is_contained(Range&: TE->Scalars, Element: EI);
11394 }) != 1;
11395 }))
11396 continue;
11397 R.eraseInstruction(I: EI);
11398 }
11399 if (NumParts == 1 || UniqueBases.size() == 1)
11400 return VecBase;
11401 UseVecBaseAsInput = true;
11402 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11403 for (auto [I, Idx] : enumerate(First&: Mask))
11404 if (Idx != PoisonMaskElem)
11405 Idx = I;
11406 };
11407 // Perform a multi-register vector shuffle, joining the parts into a single
11408 // virtual long vector.
11409 // Each part needs to be shuffled independently and then inserted into the
11410 // long virtual vector register, forming the original vector.
11411 Value *Vec = nullptr;
11412 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11413 unsigned SliceSize = E->Scalars.size() / NumParts;
11414 for (unsigned Part = 0; Part < NumParts; ++Part) {
11415 ArrayRef<Value *> VL =
11416 ArrayRef(E->Scalars).slice(N: Part * SliceSize, M: SliceSize);
11417 MutableArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: SliceSize);
11418 constexpr int MaxBases = 2;
11419 SmallVector<Value *, MaxBases> Bases(MaxBases);
11420#ifndef NDEBUG
11421 int PrevSize = 0;
11422#endif // NDEBUG
11423 for (const auto [I, V] : enumerate(First&: VL)) {
11424 if (SubMask[I] == PoisonMaskElem)
11425 continue;
11426 Value *VecOp = cast<ExtractElementInst>(Val: V)->getVectorOperand();
11427 if (const TreeEntry *TE = R.getTreeEntry(V: VecOp))
11428 VecOp = TE->VectorizedValue;
11429 assert(VecOp && "Expected vectorized value.");
11430 const int Size =
11431 cast<FixedVectorType>(Val: VecOp->getType())->getNumElements();
11432#ifndef NDEBUG
11433 assert((PrevSize == Size || PrevSize == 0) &&
11434 "Expected vectors of the same size.");
11435 PrevSize = Size;
11436#endif // NDEBUG
11437 Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
11438 }
11439 if (!Bases.front())
11440 continue;
11441 Value *SubVec;
11442 if (Bases.back()) {
11443 SubVec = createShuffle(V1: Bases.front(), V2: Bases.back(), Mask: SubMask);
11444 TransformToIdentity(SubMask);
11445 } else {
11446 SubVec = Bases.front();
11447 }
11448 if (!Vec) {
11449 Vec = SubVec;
11450 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11451 [&](unsigned P) {
11452 ArrayRef<int> SubMask =
11453 Mask.slice(P * SliceSize, SliceSize);
11454 return all_of(SubMask, [](int Idx) {
11455 return Idx == PoisonMaskElem;
11456 });
11457 })) &&
11458 "Expected first part or all previous parts masked.");
11459 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
11460 } else {
11461 unsigned VF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
11462 if (Vec->getType() != SubVec->getType()) {
11463 unsigned SubVecVF =
11464 cast<FixedVectorType>(Val: SubVec->getType())->getNumElements();
11465 VF = std::max(a: VF, b: SubVecVF);
11466 }
11467 // Adjust SubMask.
11468 for (int &Idx : SubMask)
11469 if (Idx != PoisonMaskElem)
11470 Idx += VF;
11471 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
11472 Vec = createShuffle(V1: Vec, V2: SubVec, Mask: VecMask);
11473 TransformToIdentity(VecMask);
11474 }
11475 }
11476 copy(Range&: VecMask, Out: Mask.begin());
11477 return Vec;
11478 }
11479 /// Checks if the specified entry \p E needs to be delayed because of its
11480 /// dependency nodes.
11481 std::optional<Value *>
11482 needToDelay(const TreeEntry *E,
11483 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
11484 // No need to delay emission if all deps are ready.
11485 if (all_of(Range&: Deps, P: [](ArrayRef<const TreeEntry *> TEs) {
11486 return all_of(
11487 Range&: TEs, P: [](const TreeEntry *TE) { return TE->VectorizedValue; });
11488 }))
11489 return std::nullopt;
11490 // Postpone the gather emission; it will be emitted after the end of the
11491 // process to keep the correct order.
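// The value created below is only a type-correct placeholder (an aligned
// load from a poison pointer); it is expected to be replaced once the
// postponed gather is actually emitted.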
11492 auto *VecTy = FixedVectorType::get(ElementType: E->Scalars.front()->getType(),
11493 NumElts: E->getVectorFactor());
11494 return Builder.CreateAlignedLoad(
11495 Ty: VecTy, Ptr: PoisonValue::get(T: PointerType::getUnqual(C&: VecTy->getContext())),
11496 Align: MaybeAlign());
11497 }
11498 /// Adds 2 input vectors (in form of tree entries) and the mask for their
11499 /// shuffling.
11500 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
11501 add(V1: E1.VectorizedValue, V2: E2.VectorizedValue, Mask);
11502 }
11503 /// Adds single input vector (in form of tree entry) and the mask for its
11504 /// shuffling.
11505 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
11506 add(V1: E1.VectorizedValue, Mask);
11507 }
11508 /// Adds 2 input vectors and the mask for their shuffling.
11509 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
11510 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
11511 if (InVectors.empty()) {
11512 InVectors.push_back(Elt: V1);
11513 InVectors.push_back(Elt: V2);
11514 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
11515 return;
11516 }
11517 Value *Vec = InVectors.front();
11518 if (InVectors.size() == 2) {
11519 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
11520 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
11521 } else if (cast<FixedVectorType>(Val: Vec->getType())->getNumElements() !=
11522 Mask.size()) {
11523 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
11524 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
11525 }
11526 V1 = createShuffle(V1, V2, Mask);
11527 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11528 if (Mask[Idx] != PoisonMaskElem)
11529 CommonMask[Idx] = Idx + Sz;
11530 InVectors.front() = Vec;
11531 if (InVectors.size() == 2)
11532 InVectors.back() = V1;
11533 else
11534 InVectors.push_back(Elt: V1);
11535 }
11536 /// Adds one more input vector and the mask for the shuffling.
11537 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
11538 if (InVectors.empty()) {
11539 if (!isa<FixedVectorType>(Val: V1->getType())) {
11540 V1 = createShuffle(V1, V2: nullptr, Mask: CommonMask);
11541 CommonMask.assign(NumElts: Mask.size(), Elt: PoisonMaskElem);
11542 transformMaskAfterShuffle(CommonMask, Mask);
11543 }
11544 InVectors.push_back(Elt: V1);
11545 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
11546 return;
11547 }
11548 const auto *It = find(Range&: InVectors, Val: V1);
11549 if (It == InVectors.end()) {
11550 if (InVectors.size() == 2 ||
11551 InVectors.front()->getType() != V1->getType() ||
11552 !isa<FixedVectorType>(Val: V1->getType())) {
11553 Value *V = InVectors.front();
11554 if (InVectors.size() == 2) {
11555 V = createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
11556 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
11557 } else if (cast<FixedVectorType>(Val: V->getType())->getNumElements() !=
11558 CommonMask.size()) {
11559 V = createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
11560 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
11561 }
11562 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11563 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
11564 CommonMask[Idx] =
11565 V->getType() != V1->getType()
11566 ? Idx + Sz
11567 : Mask[Idx] + cast<FixedVectorType>(Val: V1->getType())
11568 ->getNumElements();
11569 if (V->getType() != V1->getType())
11570 V1 = createShuffle(V1, V2: nullptr, Mask);
11571 InVectors.front() = V;
11572 if (InVectors.size() == 2)
11573 InVectors.back() = V1;
11574 else
11575 InVectors.push_back(Elt: V1);
11576 return;
11577 }
11578 // Check if the second vector is really required, i.e. if it provides
11579 // elements that are not already covered by the first one.
11580 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11581 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
11582 InVectors.push_back(Elt: V1);
11583 break;
11584 }
11585 }
11586 int VF = CommonMask.size();
11587 if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType()))
11588 VF = FTy->getNumElements();
11589 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11590 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
11591 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
11592 }
11593 /// Adds one more input vector and the mask for the shuffling.
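/// For example (illustrative), Order = {2, 0, 1} is inverted into the mask
/// <1, 2, 0> before being passed to add().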
11594 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
11595 SmallVector<int> NewMask;
11596 inversePermutation(Indices: Order, Mask&: NewMask);
11597 add(V1, Mask: NewMask);
11598 }
11599 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
11600 Value *Root = nullptr) {
11601 return R.gather(VL, Root);
11602 }
11603 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
11604 /// Finalize emission of the shuffles.
11605 /// \param Action the action (if any) to be performed before the final
11606 /// application of the \p ExtMask mask.
11607 Value *
11608 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
11609 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
11610 IsFinalized = true;
11611 if (Action) {
11612 Value *Vec = InVectors.front();
11613 if (InVectors.size() == 2) {
11614 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
11615 InVectors.pop_back();
11616 } else {
11617 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
11618 }
11619 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11620 if (CommonMask[Idx] != PoisonMaskElem)
11621 CommonMask[Idx] = Idx;
11622 assert(VF > 0 &&
11623 "Expected vector length for the final value before action.");
11624 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
11625 if (VecVF < VF) {
11626 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
11627 std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
11628 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: ResizeMask);
11629 }
11630 Action(Vec, CommonMask);
11631 InVectors.front() = Vec;
11632 }
11633 if (!ExtMask.empty()) {
11634 if (CommonMask.empty()) {
11635 CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
11636 } else {
11637 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
11638 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
11639 if (ExtMask[I] == PoisonMaskElem)
11640 continue;
11641 NewMask[I] = CommonMask[ExtMask[I]];
11642 }
11643 CommonMask.swap(RHS&: NewMask);
11644 }
11645 }
11646 if (CommonMask.empty()) {
11647 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
11648 return InVectors.front();
11649 }
11650 if (InVectors.size() == 2)
11651 return createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
11652 return createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
11653 }
11654
11655 ~ShuffleInstructionBuilder() {
11656 assert((IsFinalized || CommonMask.empty()) &&
11657 "Shuffle construction must be finalized.");
11658 }
11659};
11660
11661Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
11662 bool PostponedPHIs) {
11663 ValueList &VL = E->getOperand(OpIdx: NodeIdx);
11664 const unsigned VF = VL.size();
11665 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
11666 // Special processing for a GEP bundle, which may include non-GEP values.
11667 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
11668 const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>);
11669 if (It != VL.end())
11670 S = getSameOpcode(VL: *It, TLI: *TLI);
11671 }
11672 if (S.getOpcode()) {
11673 auto CheckSameVE = [&](const TreeEntry *VE) {
11674 return VE->isSame(VL) &&
11675 (any_of(Range: VE->UserTreeIndices,
11676 P: [E, NodeIdx](const EdgeInfo &EI) {
11677 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11678 }) ||
11679 any_of(Range&: VectorizableTree,
11680 P: [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
11681 return TE->isOperandGatherNode(UserEI: {E, NodeIdx}) &&
11682 VE->isSame(VL: TE->Scalars);
11683 }));
11684 };
11685 TreeEntry *VE = getTreeEntry(V: S.OpValue);
11686 bool IsSameVE = VE && CheckSameVE(VE);
11687 if (!IsSameVE) {
11688 auto It = MultiNodeScalars.find(Val: S.OpValue);
11689 if (It != MultiNodeScalars.end()) {
11690 auto *I = find_if(Range&: It->getSecond(), P: [&](const TreeEntry *TE) {
11691 return TE != VE && CheckSameVE(TE);
11692 });
11693 if (I != It->getSecond().end()) {
11694 VE = *I;
11695 IsSameVE = true;
11696 }
11697 }
11698 }
11699 if (IsSameVE) {
11700 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
11701 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
11702 ShuffleBuilder.add(V1: V, Mask);
11703 return ShuffleBuilder.finalize(ExtMask: std::nullopt);
11704 };
11705 Value *V = vectorizeTree(E: VE, PostponedPHIs);
11706 if (VF != cast<FixedVectorType>(Val: V->getType())->getNumElements()) {
11707 if (!VE->ReuseShuffleIndices.empty()) {
11708 // Reshuffle to get only unique values.
11709 // If some of the scalars are duplicated in the vectorization
11710 // tree entry, we do not vectorize them but instead generate a
11711 // mask for the reuses. But if there are several users of the
11712 // same entry, they may have different vectorization factors.
11713 // This is especially important for PHI nodes. In this case, we
11714 // need to adapt the resulting instruction for the user
11715 // vectorization factor and have to reshuffle it again to take
11716 // only unique elements of the vector. Without this code the
11717 // function would incorrectly return a reduced vector instruction with
11718 // repeated elements rather than with the unique ones.
11719
11720 // block:
11721 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
11722 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
11723 // ... (use %2)
11724 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
11725 // br %block
11726 SmallVector<int> Mask(VF, PoisonMaskElem);
11727 for (auto [I, V] : enumerate(First&: VL)) {
11728 if (isa<PoisonValue>(Val: V))
11729 continue;
11730 Mask[I] = VE->findLaneForValue(V);
11731 }
11732 V = FinalShuffle(V, Mask);
11733 } else {
11734 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
11735 "Expected vectorization factor less "
11736 "than original vector size.");
11737 SmallVector<int> UniformMask(VF, 0);
11738 std::iota(first: UniformMask.begin(), last: UniformMask.end(), value: 0);
11739 V = FinalShuffle(V, UniformMask);
11740 }
11741 }
11742 // Need to update the operand gather node if the operand is actually not a
11743 // vectorized node but a buildvector/gather node that matches one of the
11744 // vectorized nodes.
11745 if (find_if(Range&: VE->UserTreeIndices, P: [&](const EdgeInfo &EI) {
11746 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11747 }) == VE->UserTreeIndices.end()) {
11748 auto *It = find_if(
11749 Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
11750 return TE->State == TreeEntry::NeedToGather &&
11751 TE->UserTreeIndices.front().UserTE == E &&
11752 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
11753 });
11754 assert(It != VectorizableTree.end() && "Expected gather node operand.");
11755 (*It)->VectorizedValue = V;
11756 }
11757 return V;
11758 }
11759 }
11760
11761 // Find the corresponding gather entry and vectorize it.
11762 // This allows us to be more accurate with tree/graph transformations and
11763 // checks the correctness of the transformations in many cases.
11764 auto *I = find_if(Range&: VectorizableTree,
11765 P: [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
11766 return TE->isOperandGatherNode(UserEI: {E, NodeIdx});
11767 });
11768 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
11769 assert(I->get()->UserTreeIndices.size() == 1 &&
11770 "Expected only single user for the gather node.");
11771 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
11772 return vectorizeTree(E: I->get(), PostponedPHIs);
11773}
11774
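// Templated driver for emitting a gather/buildvector node. Roughly (a
// simplified summary of the logic below): first try to reuse the vectors the
// scalars are extracted from, then shuffles of already vectorized tree
// entries, and only then insert the remaining scalars as constants plus a
// gather of the non-constant part; if undef elements had to be replaced by a
// possibly-poisonous broadcast, the final value is frozen.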
11775template <typename BVTy, typename ResTy, typename... Args>
11776ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
11777 assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
11778 unsigned VF = E->getVectorFactor();
11779
11780 bool NeedFreeze = false;
11781 SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
11782 E->ReuseShuffleIndices.end());
11783 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
11784 // Build a mask out of the reorder indices and reorder scalars per this
11785 // mask.
11786 SmallVector<int> ReorderMask;
11787 inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask);
11788 if (!ReorderMask.empty())
11789 reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
11790 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
11791 unsigned I, unsigned SliceSize) {
11792 if (!isSplat(VL: E->Scalars) || none_of(E->Scalars, [](Value *V) {
11793 return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
11794 }))
11795 return false;
11796 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
11797 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
11798 if (UserTE->getNumOperands() != 2)
11799 return false;
11800 auto *It =
11801 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
11802 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
11803 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
11804 }) != TE->UserTreeIndices.end();
11805 });
11806 if (It == VectorizableTree.end())
11807 return false;
11808 int Idx;
11809 if ((Mask.size() < InputVF &&
11810 ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: InputVF, Index&: Idx) &&
11811 Idx == 0) ||
11812 (Mask.size() == InputVF &&
11813 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))) {
11814 std::iota(first: std::next(x: Mask.begin(), n: I * SliceSize),
11815 last: std::next(x: Mask.begin(), n: (I + 1) * SliceSize), value: 0);
11816 } else {
11817 unsigned IVal =
11818 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
11819 std::fill(first: std::next(x: Mask.begin(), n: I * SliceSize),
11820 last: std::next(x: Mask.begin(), n: (I + 1) * SliceSize), value: IVal);
11821 }
11822 return true;
11823 };
11824 BVTy ShuffleBuilder(Params...);
11825 ResTy Res = ResTy();
11826 SmallVector<int> Mask;
11827 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
11828 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
11829 Value *ExtractVecBase = nullptr;
11830 bool UseVecBaseAsInput = false;
11831 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
11832 SmallVector<SmallVector<const TreeEntry *>> Entries;
11833 Type *ScalarTy = GatheredScalars.front()->getType();
11834 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: GatheredScalars.size());
11835 unsigned NumParts = TTI->getNumberOfParts(Tp: VecTy);
11836 if (NumParts == 0 || NumParts >= GatheredScalars.size())
11837 NumParts = 1;
11838 if (!all_of(Range&: GatheredScalars, P: IsaPred<UndefValue>)) {
11839 // Check for gathered extracts.
11840 bool Resized = false;
11841 ExtractShuffles =
11842 tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
11843 if (!ExtractShuffles.empty()) {
11844 SmallVector<const TreeEntry *> ExtractEntries;
11845 for (auto [Idx, I] : enumerate(First&: ExtractMask)) {
11846 if (I == PoisonMaskElem)
11847 continue;
11848 if (const auto *TE = getTreeEntry(
11849 V: cast<ExtractElementInst>(Val: E->Scalars[Idx])->getVectorOperand()))
11850 ExtractEntries.push_back(Elt: TE);
11851 }
11852 if (std::optional<ResTy> Delayed =
11853 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
11854 // Delay emission of gathers which are not ready yet.
11855 PostponedGathers.insert(X: E);
11856 // Postpone the gather emission; it will be emitted after the end of the
11857 // process to keep the correct order.
11858 return *Delayed;
11859 }
11860 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
11861 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
11862 ExtractVecBase = VecBase;
11863 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(Val: VecBase->getType()))
11864 if (VF == VecBaseTy->getNumElements() &&
11865 GatheredScalars.size() != VF) {
11866 Resized = true;
11867 GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
11868 Elt: PoisonValue::get(T: ScalarTy));
11869 }
11870 }
11871 }
11872 // Gather extracts only after we have checked for fully matched gathers.
11873 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
11874 E->isAltShuffle() ||
11875 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
11876 isSplat(VL: E->Scalars) ||
11877 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
11878 GatherShuffles =
11879 isGatherShuffledEntry(TE: E, VL: GatheredScalars, Mask, Entries, NumParts);
11880 }
11881 if (!GatherShuffles.empty()) {
11882 if (std::optional<ResTy> Delayed =
11883 ShuffleBuilder.needToDelay(E, Entries)) {
11884 // Delay emission of gathers which are not ready yet.
11885 PostponedGathers.insert(X: E);
11886 // Postpone the gather emission; it will be emitted after the end of the
11887 // process to keep the correct order.
11888 return *Delayed;
11889 }
11890 if (GatherShuffles.size() == 1 &&
11891 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
11892 Entries.front().front()->isSame(VL: E->Scalars)) {
11893 // Perfect match in the graph, will reuse the previously vectorized
11894 // node. Cost is 0.
11895 LLVM_DEBUG(
11896 dbgs()
11897 << "SLP: perfect diamond match for gather bundle "
11898 << shortBundleName(E->Scalars) << ".\n");
11899 // Restore the mask for previous partially matched values.
11900 Mask.resize(N: E->Scalars.size());
11901 const TreeEntry *FrontTE = Entries.front().front();
11902 if (FrontTE->ReorderIndices.empty() &&
11903 ((FrontTE->ReuseShuffleIndices.empty() &&
11904 E->Scalars.size() == FrontTE->Scalars.size()) ||
11905 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
11906 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
11907 } else {
11908 for (auto [I, V] : enumerate(First: E->Scalars)) {
11909 if (isa<PoisonValue>(Val: V)) {
11910 Mask[I] = PoisonMaskElem;
11911 continue;
11912 }
11913 Mask[I] = FrontTE->findLaneForValue(V);
11914 }
11915 }
11916 ShuffleBuilder.add(*FrontTE, Mask);
11917 Res = ShuffleBuilder.finalize(E->getCommonMask());
11918 return Res;
11919 }
11920 if (!Resized) {
11921 if (GatheredScalars.size() != VF &&
11922 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
11923 return any_of(TEs, [&](const TreeEntry *TE) {
11924 return TE->getVectorFactor() == VF;
11925 });
11926 }))
11927 GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
11928 Elt: PoisonValue::get(T: ScalarTy));
11929 }
11930 // Remove shuffled elements from list of gathers.
11931 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11932 if (Mask[I] != PoisonMaskElem)
11933 GatheredScalars[I] = PoisonValue::get(T: ScalarTy);
11934 }
11935 }
11936 }
11937 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
11938 SmallVectorImpl<int> &ReuseMask,
11939 bool IsRootPoison) {
11940 // For splats we can emit broadcasts instead of gathers, so try to find
11941 // such sequences.
11942 bool IsSplat = IsRootPoison && isSplat(VL: Scalars) &&
11943 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
11944 Scalars.append(NumInputs: VF - Scalars.size(), Elt: PoisonValue::get(T: ScalarTy));
11945 SmallVector<int> UndefPos;
11946 DenseMap<Value *, unsigned> UniquePositions;
11947 // Gather unique non-const values and all constant values.
11948 // For repeated values, just shuffle them.
11949 int NumNonConsts = 0;
11950 int SinglePos = 0;
11951 for (auto [I, V] : enumerate(First&: Scalars)) {
11952 if (isa<UndefValue>(Val: V)) {
11953 if (!isa<PoisonValue>(Val: V)) {
11954 ReuseMask[I] = I;
11955 UndefPos.push_back(Elt: I);
11956 }
11957 continue;
11958 }
11959 if (isConstant(V)) {
11960 ReuseMask[I] = I;
11961 continue;
11962 }
11963 ++NumNonConsts;
11964 SinglePos = I;
11965 Value *OrigV = V;
11966 Scalars[I] = PoisonValue::get(T: ScalarTy);
11967 if (IsSplat) {
11968 Scalars.front() = OrigV;
11969 ReuseMask[I] = 0;
11970 } else {
11971 const auto Res = UniquePositions.try_emplace(Key: OrigV, Args&: I);
11972 Scalars[Res.first->second] = OrigV;
11973 ReuseMask[I] = Res.first->second;
11974 }
11975 }
11976 if (NumNonConsts == 1) {
11977 // Restore single insert element.
11978 if (IsSplat) {
11979 ReuseMask.assign(NumElts: VF, Elt: PoisonMaskElem);
11980 std::swap(a&: Scalars.front(), b&: Scalars[SinglePos]);
11981 if (!UndefPos.empty() && UndefPos.front() == 0)
11982 Scalars.front() = UndefValue::get(T: ScalarTy);
11983 }
11984 ReuseMask[SinglePos] = SinglePos;
11985 } else if (!UndefPos.empty() && IsSplat) {
11986 // For undef values, try to replace them with a simple broadcast.
11987 // We can do it if the broadcasted value is guaranteed to be
11988 // non-poisonous, or by freezing the incoming scalar value first.
11989 auto *It = find_if(Scalars, [this, E](Value *V) {
11990 return !isa<UndefValue>(Val: V) &&
11991 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
11992 (E->UserTreeIndices.size() == 1 &&
11993 any_of(V->uses(), [E](const Use &U) {
11994 // Check if the value is already used in the same operation in
11995 // one of the nodes.
11996 return E->UserTreeIndices.front().EdgeIdx !=
11997 U.getOperandNo() &&
11998 is_contained(
11999 Range&: E->UserTreeIndices.front().UserTE->Scalars,
12000 Element: U.getUser());
12001 })));
12002 });
12003 if (It != Scalars.end()) {
12004 // Replace undefs by the non-poisoned scalars and emit broadcast.
12005 int Pos = std::distance(Scalars.begin(), It);
12006 for (int I : UndefPos) {
12007 // Set the undef position to the non-poisoned scalar.
12008 ReuseMask[I] = Pos;
12009 // Replace the undef with poison; in the mask it is already replaced
12010 // by the non-poisoned scalar.
12011 if (I != Pos)
12012 Scalars[I] = PoisonValue::get(T: ScalarTy);
12013 }
12014 } else {
12015 // Replace undefs by the poisons, emit broadcast and then emit
12016 // freeze.
12017 for (int I : UndefPos) {
12018 ReuseMask[I] = PoisonMaskElem;
12019 if (isa<UndefValue>(Val: Scalars[I]))
12020 Scalars[I] = PoisonValue::get(T: ScalarTy);
12021 }
12022 NeedFreeze = true;
12023 }
12024 }
12025 };
12026 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12027 bool IsNonPoisoned = true;
12028 bool IsUsedInExpr = true;
12029 Value *Vec1 = nullptr;
12030 if (!ExtractShuffles.empty()) {
12031 // A gather of extractelements can be represented as just a shuffle of
12032 // one or two vectors that the scalars are extracted from.
12033 // Find the input vectors.
12034 Value *Vec2 = nullptr;
12035 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12036 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12037 ExtractMask[I] = PoisonMaskElem;
12038 }
12039 if (UseVecBaseAsInput) {
12040 Vec1 = ExtractVecBase;
12041 } else {
12042 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12043 if (ExtractMask[I] == PoisonMaskElem)
12044 continue;
12045 if (isa<UndefValue>(Val: E->Scalars[I]))
12046 continue;
12047 auto *EI = cast<ExtractElementInst>(Val: E->Scalars[I]);
12048 Value *VecOp = EI->getVectorOperand();
12049 if (const auto *TE = getTreeEntry(V: VecOp))
12050 if (TE->VectorizedValue)
12051 VecOp = TE->VectorizedValue;
12052 if (!Vec1) {
12053 Vec1 = VecOp;
12054 } else if (Vec1 != VecOp) {
12055 assert((!Vec2 || Vec2 == VecOp) &&
12056 "Expected only 1 or 2 vectors shuffle.");
12057 Vec2 = VecOp;
12058 }
12059 }
12060 }
12061 if (Vec2) {
12062 IsUsedInExpr = false;
12063 IsNonPoisoned &=
12064 isGuaranteedNotToBePoison(V: Vec1) && isGuaranteedNotToBePoison(V: Vec2);
12065 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12066 } else if (Vec1) {
12067 IsUsedInExpr &= FindReusedSplat(
12068 ExtractMask,
12069 cast<FixedVectorType>(Val: Vec1->getType())->getNumElements(), 0,
12070 ExtractMask.size());
12071 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12072 IsNonPoisoned &= isGuaranteedNotToBePoison(V: Vec1);
12073 } else {
12074 IsUsedInExpr = false;
12075 ShuffleBuilder.add(PoisonValue::get(T: FixedVectorType::get(
12076 ElementType: ScalarTy, NumElts: GatheredScalars.size())),
12077 ExtractMask, /*ForExtracts=*/true);
12078 }
12079 }
12080 if (!GatherShuffles.empty()) {
12081 unsigned SliceSize = E->Scalars.size() / NumParts;
12082 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12083 for (const auto [I, TEs] : enumerate(First&: Entries)) {
12084 if (TEs.empty()) {
12085 assert(!GatherShuffles[I] &&
12086 "No shuffles with empty entries list expected.");
12087 continue;
12088 }
12089 assert((TEs.size() == 1 || TEs.size() == 2) &&
12090 "Expected shuffle of 1 or 2 entries.");
12091 auto SubMask = ArrayRef(Mask).slice(N: I * SliceSize, M: SliceSize);
12092 VecMask.assign(NumElts: VecMask.size(), Elt: PoisonMaskElem);
12093 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: I * SliceSize));
12094 if (TEs.size() == 1) {
12095 IsUsedInExpr &= FindReusedSplat(
12096 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12097 ShuffleBuilder.add(*TEs.front(), VecMask);
12098 if (TEs.front()->VectorizedValue)
12099 IsNonPoisoned &=
12100 isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue);
12101 } else {
12102 IsUsedInExpr = false;
12103 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12104 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12105 IsNonPoisoned &=
12106 isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue) &&
12107 isGuaranteedNotToBePoison(V: TEs.back()->VectorizedValue);
12108 }
12109 }
12110 }
12111 // Try to figure out the best way to combine the values: build a shuffle
12112 // and insert elements, or just build several shuffles.
12113 // Insert non-constant scalars.
12114 SmallVector<Value *> NonConstants(GatheredScalars);
12115 int EMSz = ExtractMask.size();
12116 int MSz = Mask.size();
12117 // Try to build a constant vector and shuffle with it only if currently we
12118 // have a single permutation and more than 1 scalar constant.
12119 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12120 bool IsIdentityShuffle =
12121 ((UseVecBaseAsInput ||
12122 all_of(ExtractShuffles,
12123 [](const std::optional<TTI::ShuffleKind> &SK) {
12124 return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
12125 TTI::SK_PermuteSingleSrc;
12126 })) &&
12127 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12128 ShuffleVectorInst::isIdentityMask(Mask: ExtractMask, NumSrcElts: EMSz)) ||
12129 (!GatherShuffles.empty() &&
12130 all_of(GatherShuffles,
12131 [](const std::optional<TTI::ShuffleKind> &SK) {
12132 return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
12133 TTI::SK_PermuteSingleSrc;
12134 }) &&
12135 none_of(Mask, [&](int I) { return I >= MSz; }) &&
12136 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: MSz));
12137 bool EnoughConstsForShuffle =
12138 IsSingleShuffle &&
12139 (none_of(GatheredScalars,
12140 [](Value *V) {
12141 return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
12142 }) ||
12143 any_of(GatheredScalars,
12144 [](Value *V) {
12145 return isa<Constant>(Val: V) && !isa<UndefValue>(Val: V);
12146 })) &&
12147 (!IsIdentityShuffle ||
12148 (GatheredScalars.size() == 2 &&
12149 any_of(GatheredScalars,
12150 [](Value *V) { return !isa<UndefValue>(Val: V); })) ||
12151 count_if(GatheredScalars, [](Value *V) {
12152 return isa<Constant>(Val: V) && !isa<PoisonValue>(Val: V);
12153 }) > 1);
12154 // The NonConstants array contains just the non-constant values, while
12155 // GatheredScalars contains only the constants for the final vector and shuffle.
12156 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12157 if (EnoughConstsForShuffle && isa<Constant>(Val: GatheredScalars[I]))
12158 NonConstants[I] = PoisonValue::get(T: ScalarTy);
12159 else
12160 GatheredScalars[I] = PoisonValue::get(T: ScalarTy);
12161 }
12162 // Generate constants for final shuffle and build a mask for them.
12163 if (!all_of(Range&: GatheredScalars, P: IsaPred<PoisonValue>)) {
12164 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12165 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12166 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12167 ShuffleBuilder.add(BV, BVMask);
12168 }
12169 if (all_of(NonConstants, [=](Value *V) {
12170 return isa<PoisonValue>(Val: V) ||
12171 (IsSingleShuffle &&
12172 ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(Val: V));
12173 }))
12174 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12175 else
12176 Res = ShuffleBuilder.finalize(
12177 E->ReuseShuffleIndices, E->Scalars.size(),
12178 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12179 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12180 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12181 });
12182 } else if (!allConstant(VL: GatheredScalars)) {
12183 // Gather unique scalars and all constants.
12184 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12185 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12186 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12187 ShuffleBuilder.add(BV, ReuseMask);
12188 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12189 } else {
12190 // Gather all constants.
12191 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12192 for (auto [I, V] : enumerate(First: E->Scalars)) {
12193 if (!isa<PoisonValue>(Val: V))
12194 Mask[I] = I;
12195 }
12196 Value *BV = ShuffleBuilder.gather(E->Scalars);
12197 ShuffleBuilder.add(BV, Mask);
12198 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12199 }
12200
12201 if (NeedFreeze)
12202 Res = ShuffleBuilder.createFreeze(Res);
12203 return Res;
12204}
12205
12206Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
12207 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Params&: Builder,
12208 Params&: *this);
12209}
12210
12211Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12212 IRBuilderBase::InsertPointGuard Guard(Builder);
12213
12214 if (E->VectorizedValue &&
12215 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12216 E->isAltShuffle())) {
12217 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12218 return E->VectorizedValue;
12219 }
12220
12221 if (E->State == TreeEntry::NeedToGather) {
12222 // Set insert point for non-reduction initial nodes.
12223 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12224 setInsertPointAfterBundle(E);
12225 Value *Vec = createBuildVector(E);
12226 E->VectorizedValue = Vec;
12227 return Vec;
12228 }
12229
12230 bool IsReverseOrder = isReverseOrder(Order: E->ReorderIndices);
12231 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12232 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
12233 if (E->getOpcode() == Instruction::Store) {
12234 ArrayRef<int> Mask =
12235 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12236 E->ReorderIndices.size());
12237 ShuffleBuilder.add(V1: V, Mask);
12238 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12239 ShuffleBuilder.addOrdered(V1: V, Order: std::nullopt);
12240 } else {
12241 ShuffleBuilder.addOrdered(V1: V, Order: E->ReorderIndices);
12242 }
12243 return ShuffleBuilder.finalize(ExtMask: E->ReuseShuffleIndices);
12244 };
12245
12246 assert((E->State == TreeEntry::Vectorize ||
12247 E->State == TreeEntry::ScatterVectorize ||
12248 E->State == TreeEntry::StridedVectorize) &&
12249 "Unhandled state");
12250 unsigned ShuffleOrOp =
12251 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12252 Instruction *VL0 = E->getMainOp();
12253 Type *ScalarTy = VL0->getType();
12254 if (auto *Store = dyn_cast<StoreInst>(Val: VL0))
12255 ScalarTy = Store->getValueOperand()->getType();
12256 else if (auto *IE = dyn_cast<InsertElementInst>(Val: VL0))
12257 ScalarTy = IE->getOperand(i_nocapture: 1)->getType();
12258 auto It = MinBWs.find(Val: E);
12259 if (It != MinBWs.end())
12260 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
12261 auto GetOperandSignedness = [&](unsigned Idx) {
12262 const TreeEntry *OpE = getOperandEntry(E, Idx);
12263 bool IsSigned = false;
12264 auto It = MinBWs.find(Val: OpE);
12265 if (It != MinBWs.end())
12266 IsSigned = It->second.second;
12267 else
12268 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
12269 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
12270 });
12271 return IsSigned;
12272 };
12273 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: E->Scalars.size());
12274 switch (ShuffleOrOp) {
12275 case Instruction::PHI: {
12276 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12277 E != VectorizableTree.front().get() ||
12278 !E->UserTreeIndices.empty()) &&
12279 "PHI reordering is free.");
12280 if (PostponedPHIs && E->VectorizedValue)
12281 return E->VectorizedValue;
12282 auto *PH = cast<PHINode>(Val: VL0);
12283 Builder.SetInsertPoint(TheBB: PH->getParent(),
12284 IP: PH->getParent()->getFirstNonPHIIt());
12285 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12286 if (PostponedPHIs || !E->VectorizedValue) {
12287 PHINode *NewPhi = Builder.CreatePHI(Ty: VecTy, NumReservedValues: PH->getNumIncomingValues());
12288 E->PHI = NewPhi;
12289 Value *V = NewPhi;
12290
12291 // Adjust insertion point once all PHI's have been generated.
12292 Builder.SetInsertPoint(TheBB: PH->getParent(),
12293 IP: PH->getParent()->getFirstInsertionPt());
12294 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12295
12296 V = FinalShuffle(V, E, VecTy);
12297
12298 E->VectorizedValue = V;
12299 if (PostponedPHIs)
12300 return V;
12301 }
12302 PHINode *NewPhi = cast<PHINode>(Val: E->PHI);
    // If the phi node is fully emitted, exit.
12304 if (NewPhi->getNumIncomingValues() != 0)
12305 return NewPhi;
12306
12307 // PHINodes may have multiple entries from the same block. We want to
12308 // visit every block once.
12309 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12310
12311 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) {
12312 ValueList Operands;
12313 BasicBlock *IBB = PH->getIncomingBlock(i: I);
12314
12315 // Stop emission if all incoming values are generated.
12316 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12317 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12318 return NewPhi;
12319 }
12320
12321 if (!VisitedBBs.insert(Ptr: IBB).second) {
12322 NewPhi->addIncoming(V: NewPhi->getIncomingValueForBlock(BB: IBB), BB: IBB);
12323 continue;
12324 }
12325
12326 Builder.SetInsertPoint(IBB->getTerminator());
12327 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12328 Value *Vec = vectorizeOperand(E, NodeIdx: I, /*PostponedPHIs=*/true);
12329 if (VecTy != Vec->getType()) {
12330 assert((It != MinBWs.end() ||
12331 getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
12332 MinBWs.contains(getOperandEntry(E, I))) &&
12333 "Expected item in MinBWs.");
12334 Vec = Builder.CreateIntCast(V: Vec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
12335 }
12336 NewPhi->addIncoming(V: Vec, BB: IBB);
12337 }
12338
12339 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12340 "Invalid number of incoming values");
12341 return NewPhi;
12342 }
12343
12344 case Instruction::ExtractElement: {
12345 Value *V = E->getSingleOperand(OpIdx: 0);
12346 if (const TreeEntry *TE = getTreeEntry(V))
12347 V = TE->VectorizedValue;
12348 setInsertPointAfterBundle(E);
12349 V = FinalShuffle(V, E, VecTy);
12350 E->VectorizedValue = V;
12351 return V;
12352 }
12353 case Instruction::ExtractValue: {
12354 auto *LI = cast<LoadInst>(Val: E->getSingleOperand(OpIdx: 0));
12355 Builder.SetInsertPoint(LI);
12356 Value *Ptr = LI->getPointerOperand();
12357 LoadInst *V = Builder.CreateAlignedLoad(Ty: VecTy, Ptr, Align: LI->getAlign());
12358 Value *NewV = propagateMetadata(I: V, VL: E->Scalars);
12359 NewV = FinalShuffle(NewV, E, VecTy);
12360 E->VectorizedValue = NewV;
12361 return NewV;
12362 }
12363 case Instruction::InsertElement: {
12364 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12365 Builder.SetInsertPoint(cast<Instruction>(Val: E->Scalars.back()));
12366 Value *V = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
12367 ArrayRef<Value *> Op = E->getOperand(OpIdx: 1);
12368 Type *ScalarTy = Op.front()->getType();
12369 if (cast<VectorType>(Val: V->getType())->getElementType() != ScalarTy) {
12370 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12371 std::pair<unsigned, bool> Res = MinBWs.lookup(Val: getOperandEntry(E, Idx: 1));
12372 assert(Res.first > 0 && "Expected item in MinBWs.");
12373 V = Builder.CreateIntCast(
12374 V,
12375 DestTy: FixedVectorType::get(
12376 ElementType: ScalarTy,
12377 NumElts: cast<FixedVectorType>(Val: V->getType())->getNumElements()),
12378 isSigned: Res.second);
12379 }
12380
12381 // Create InsertVector shuffle if necessary
12382 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range&: E->Scalars, P: [E](Value *V) {
12383 return !is_contained(Range&: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
12384 }));
12385 const unsigned NumElts =
12386 cast<FixedVectorType>(Val: FirstInsert->getType())->getNumElements();
12387 const unsigned NumScalars = E->Scalars.size();
12388
12389 unsigned Offset = *getInsertIndex(InsertInst: VL0);
12390 assert(Offset < NumElts && "Failed to find vector index offset");
12391
12392 // Create shuffle to resize vector
12393 SmallVector<int> Mask;
12394 if (!E->ReorderIndices.empty()) {
12395 inversePermutation(Indices: E->ReorderIndices, Mask);
12396 Mask.append(NumInputs: NumElts - NumScalars, Elt: PoisonMaskElem);
12397 } else {
12398 Mask.assign(NumElts, Elt: PoisonMaskElem);
12399 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: NumScalars), value: 0);
12400 }
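    // Illustrative state at this point: with NumScalars = 4, NumElts = 8 and
    // no reordering, Mask is {0, 1, 2, 3, P, P, P, P} (P = poison), i.e. the
    // scalars occupy the low lanes of the wide destination vector.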
12401 // Create InsertVector shuffle if necessary
12402 bool IsIdentity = true;
12403 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12404 Mask.swap(RHS&: PrevMask);
12405 for (unsigned I = 0; I < NumScalars; ++I) {
12406 Value *Scalar = E->Scalars[PrevMask[I]];
12407 unsigned InsertIdx = *getInsertIndex(InsertInst: Scalar);
12408 IsIdentity &= InsertIdx - Offset == I;
12409 Mask[InsertIdx - Offset] = I;
12410 }
12411 if (!IsIdentity || NumElts != NumScalars) {
12412 Value *V2 = nullptr;
12413 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12414 SmallVector<int> InsertMask(Mask);
12415 if (NumElts != NumScalars && Offset == 0) {
12416 // Follow all insert element instructions from the current buildvector
12417 // sequence.
12418 InsertElementInst *Ins = cast<InsertElementInst>(Val: VL0);
12419 do {
12420 std::optional<unsigned> InsertIdx = getInsertIndex(InsertInst: Ins);
12421 if (!InsertIdx)
12422 break;
12423 if (InsertMask[*InsertIdx] == PoisonMaskElem)
12424 InsertMask[*InsertIdx] = *InsertIdx;
12425 if (!Ins->hasOneUse())
12426 break;
12427 Ins = dyn_cast_or_null<InsertElementInst>(
12428 Val: Ins->getUniqueUndroppableUser());
12429 } while (Ins);
12430 SmallBitVector UseMask =
12431 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
12432 SmallBitVector IsFirstPoison =
12433 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
12434 SmallBitVector IsFirstUndef =
12435 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
12436 if (!IsFirstPoison.all()) {
12437 unsigned Idx = 0;
12438 for (unsigned I = 0; I < NumElts; I++) {
12439 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I) &&
12440 IsFirstUndef.test(Idx: I)) {
12441 if (IsVNonPoisonous) {
12442 InsertMask[I] = I < NumScalars ? I : 0;
12443 continue;
12444 }
12445 if (!V2)
12446 V2 = UndefValue::get(T: V->getType());
12447 if (Idx >= NumScalars)
12448 Idx = NumScalars - 1;
12449 InsertMask[I] = NumScalars + Idx;
12450 ++Idx;
12451 } else if (InsertMask[I] != PoisonMaskElem &&
12452 Mask[I] == PoisonMaskElem) {
12453 InsertMask[I] = PoisonMaskElem;
12454 }
12455 }
12456 } else {
12457 InsertMask = Mask;
12458 }
12459 }
12460 if (!V2)
12461 V2 = PoisonValue::get(T: V->getType());
12462 V = Builder.CreateShuffleVector(V1: V, V2, Mask: InsertMask);
12463 if (auto *I = dyn_cast<Instruction>(Val: V)) {
12464 GatherShuffleExtractSeq.insert(X: I);
12465 CSEBlocks.insert(V: I->getParent());
12466 }
12467 }
12468
12469 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
12470 for (unsigned I = 0; I < NumElts; I++) {
12471 if (Mask[I] != PoisonMaskElem)
12472 InsertMask[Offset + I] = I;
12473 }
12474 SmallBitVector UseMask =
12475 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
12476 SmallBitVector IsFirstUndef =
12477 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
12478 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
12479 NumElts != NumScalars) {
12480 if (IsFirstUndef.all()) {
12481 if (!ShuffleVectorInst::isIdentityMask(Mask: InsertMask, NumSrcElts: NumElts)) {
12482 SmallBitVector IsFirstPoison =
12483 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
12484 if (!IsFirstPoison.all()) {
12485 for (unsigned I = 0; I < NumElts; I++) {
12486 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I))
12487 InsertMask[I] = I + NumElts;
12488 }
12489 }
12490 V = Builder.CreateShuffleVector(
12491 V1: V,
12492 V2: IsFirstPoison.all() ? PoisonValue::get(T: V->getType())
12493 : FirstInsert->getOperand(i: 0),
12494 Mask: InsertMask, Name: cast<Instruction>(Val: E->Scalars.back())->getName());
12495 if (auto *I = dyn_cast<Instruction>(Val: V)) {
12496 GatherShuffleExtractSeq.insert(X: I);
12497 CSEBlocks.insert(V: I->getParent());
12498 }
12499 }
12500 } else {
12501 SmallBitVector IsFirstPoison =
12502 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
12503 for (unsigned I = 0; I < NumElts; I++) {
12504 if (InsertMask[I] == PoisonMaskElem)
12505 InsertMask[I] = IsFirstPoison.test(Idx: I) ? PoisonMaskElem : I;
12506 else
12507 InsertMask[I] += NumElts;
12508 }
12509 V = Builder.CreateShuffleVector(
12510 V1: FirstInsert->getOperand(i: 0), V2: V, Mask: InsertMask,
12511 Name: cast<Instruction>(Val: E->Scalars.back())->getName());
12512 if (auto *I = dyn_cast<Instruction>(Val: V)) {
12513 GatherShuffleExtractSeq.insert(X: I);
12514 CSEBlocks.insert(V: I->getParent());
12515 }
12516 }
12517 }
12518
12519 ++NumVectorInstructions;
12520 E->VectorizedValue = V;
12521 return V;
12522 }
12523 case Instruction::ZExt:
12524 case Instruction::SExt:
12525 case Instruction::FPToUI:
12526 case Instruction::FPToSI:
12527 case Instruction::FPExt:
12528 case Instruction::PtrToInt:
12529 case Instruction::IntToPtr:
12530 case Instruction::SIToFP:
12531 case Instruction::UIToFP:
12532 case Instruction::Trunc:
12533 case Instruction::FPTrunc:
12534 case Instruction::BitCast: {
12535 setInsertPointAfterBundle(E);
12536
12537 Value *InVec = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12538 if (E->VectorizedValue) {
12539 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12540 return E->VectorizedValue;
12541 }
12542
12543 auto *CI = cast<CastInst>(Val: VL0);
12544 Instruction::CastOps VecOpcode = CI->getOpcode();
12545 Type *SrcScalarTy = cast<VectorType>(Val: InVec->getType())->getElementType();
12546 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
12547 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
12548 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
12549 SrcScalarTy != CI->getOperand(i_nocapture: 0)->getType())) {
12550 // Check if the values are candidates to demote.
12551 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy);
12552 if (SrcIt != MinBWs.end())
12553 SrcBWSz = SrcIt->second.first;
12554 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
12555 if (BWSz == SrcBWSz) {
12556 VecOpcode = Instruction::BitCast;
12557 } else if (BWSz < SrcBWSz) {
12558 VecOpcode = Instruction::Trunc;
12559 } else if (It != MinBWs.end()) {
12560 assert(BWSz > SrcBWSz && "Invalid cast!");
12561 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12562 } else if (SrcIt != MinBWs.end()) {
12563 assert(BWSz > SrcBWSz && "Invalid cast!");
12564 VecOpcode =
12565 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12566 }
12567 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
12568 !SrcIt->second.second) {
12569 VecOpcode = Instruction::UIToFP;
12570 }
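    // For example, if a scalar zext from i8 to i32 had its result demoted via
    // MinBWs to i8, the source and destination widths now match and the cast
    // degenerates into a bitcast (no cast instruction is emitted below); if
    // the result was demoted to i16 instead, an extension to the narrower
    // vector type is emitted, with signedness taken from MinBWs.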
12571 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12572 ? InVec
12573 : Builder.CreateCast(Op: VecOpcode, V: InVec, DestTy: VecTy);
12574 V = FinalShuffle(V, E, VecTy);
12575
12576 E->VectorizedValue = V;
12577 ++NumVectorInstructions;
12578 return V;
12579 }
12580 case Instruction::FCmp:
12581 case Instruction::ICmp: {
12582 setInsertPointAfterBundle(E);
12583
12584 Value *L = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12585 if (E->VectorizedValue) {
12586 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12587 return E->VectorizedValue;
12588 }
12589 Value *R = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
12590 if (E->VectorizedValue) {
12591 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12592 return E->VectorizedValue;
12593 }
12594 if (L->getType() != R->getType()) {
12595 assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12596 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12597 MinBWs.contains(getOperandEntry(E, 0)) ||
12598 MinBWs.contains(getOperandEntry(E, 1))) &&
12599 "Expected item in MinBWs.");
12600 if (cast<VectorType>(Val: L->getType())
12601 ->getElementType()
12602 ->getIntegerBitWidth() < cast<VectorType>(Val: R->getType())
12603 ->getElementType()
12604 ->getIntegerBitWidth()) {
12605 Type *CastTy = R->getType();
12606 L = Builder.CreateIntCast(V: L, DestTy: CastTy, isSigned: GetOperandSignedness(0));
12607 } else {
12608 Type *CastTy = L->getType();
12609 R = Builder.CreateIntCast(V: R, DestTy: CastTy, isSigned: GetOperandSignedness(1));
12610 }
12611 }
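    // For example, if one compare operand was demoted to <4 x i8> and the
    // other to <4 x i16>, the narrower side is int-cast to <4 x i16> so both
    // operands of the vector compare have the same type.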
12612
12613 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
12614 Value *V = Builder.CreateCmp(Pred: P0, LHS: L, RHS: R);
12615 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
12616 // Do not cast for cmps.
12617 VecTy = cast<FixedVectorType>(Val: V->getType());
12618 V = FinalShuffle(V, E, VecTy);
12619
12620 E->VectorizedValue = V;
12621 ++NumVectorInstructions;
12622 return V;
12623 }
12624 case Instruction::Select: {
12625 setInsertPointAfterBundle(E);
12626
12627 Value *Cond = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12628 if (E->VectorizedValue) {
12629 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12630 return E->VectorizedValue;
12631 }
12632 Value *True = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
12633 if (E->VectorizedValue) {
12634 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12635 return E->VectorizedValue;
12636 }
12637 Value *False = vectorizeOperand(E, NodeIdx: 2, PostponedPHIs);
12638 if (E->VectorizedValue) {
12639 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12640 return E->VectorizedValue;
12641 }
12642 if (True->getType() != VecTy || False->getType() != VecTy) {
12643 assert((It != MinBWs.end() ||
12644 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12645 getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
12646 MinBWs.contains(getOperandEntry(E, 1)) ||
12647 MinBWs.contains(getOperandEntry(E, 2))) &&
12648 "Expected item in MinBWs.");
12649 if (True->getType() != VecTy)
12650 True = Builder.CreateIntCast(V: True, DestTy: VecTy, isSigned: GetOperandSignedness(1));
12651 if (False->getType() != VecTy)
12652 False = Builder.CreateIntCast(V: False, DestTy: VecTy, isSigned: GetOperandSignedness(2));
12653 }
12654
12655 Value *V = Builder.CreateSelect(C: Cond, True, False);
12656 V = FinalShuffle(V, E, VecTy);
12657
12658 E->VectorizedValue = V;
12659 ++NumVectorInstructions;
12660 return V;
12661 }
12662 case Instruction::FNeg: {
12663 setInsertPointAfterBundle(E);
12664
12665 Value *Op = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12666
12667 if (E->VectorizedValue) {
12668 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12669 return E->VectorizedValue;
12670 }
12671
12672 Value *V = Builder.CreateUnOp(
12673 Opc: static_cast<Instruction::UnaryOps>(E->getOpcode()), V: Op);
12674 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
12675 if (auto *I = dyn_cast<Instruction>(Val: V))
12676 V = propagateMetadata(I, VL: E->Scalars);
12677
12678 V = FinalShuffle(V, E, VecTy);
12679
12680 E->VectorizedValue = V;
12681 ++NumVectorInstructions;
12682
12683 return V;
12684 }
12685 case Instruction::Add:
12686 case Instruction::FAdd:
12687 case Instruction::Sub:
12688 case Instruction::FSub:
12689 case Instruction::Mul:
12690 case Instruction::FMul:
12691 case Instruction::UDiv:
12692 case Instruction::SDiv:
12693 case Instruction::FDiv:
12694 case Instruction::URem:
12695 case Instruction::SRem:
12696 case Instruction::FRem:
12697 case Instruction::Shl:
12698 case Instruction::LShr:
12699 case Instruction::AShr:
12700 case Instruction::And:
12701 case Instruction::Or:
12702 case Instruction::Xor: {
12703 setInsertPointAfterBundle(E);
12704
12705 Value *LHS = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12706 if (E->VectorizedValue) {
12707 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12708 return E->VectorizedValue;
12709 }
12710 Value *RHS = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
12711 if (E->VectorizedValue) {
12712 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12713 return E->VectorizedValue;
12714 }
12715 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
12716 assert((It != MinBWs.end() ||
12717 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12718 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12719 MinBWs.contains(getOperandEntry(E, 0)) ||
12720 MinBWs.contains(getOperandEntry(E, 1))) &&
12721 "Expected item in MinBWs.");
12722 if (LHS->getType() != VecTy)
12723 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: GetOperandSignedness(0));
12724 if (RHS->getType() != VecTy)
12725 RHS = Builder.CreateIntCast(V: RHS, DestTy: VecTy, isSigned: GetOperandSignedness(1));
12726 }
12727
12728 Value *V = Builder.CreateBinOp(
12729 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
12730 RHS);
12731 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0, IncludeWrapFlags: It == MinBWs.end());
12732 if (auto *I = dyn_cast<Instruction>(Val: V)) {
12733 V = propagateMetadata(I, VL: E->Scalars);
12734 // Drop nuw flags for abs(sub(commutative), true).
12735 if (!MinBWs.contains(Val: E) && ShuffleOrOp == Instruction::Sub &&
12736 any_of(Range&: E->Scalars, P: [](Value *V) {
12737 return isCommutative(I: cast<Instruction>(Val: V));
12738 }))
12739 I->setHasNoUnsignedWrap(/*b=*/false);
12740 }
12741
12742 V = FinalShuffle(V, E, VecTy);
12743
12744 E->VectorizedValue = V;
12745 ++NumVectorInstructions;
12746
12747 return V;
12748 }
12749 case Instruction::Load: {
12750 // Loads are inserted at the head of the tree because we don't want to
12751 // sink them all the way down past store instructions.
12752 setInsertPointAfterBundle(E);
12753
12754 LoadInst *LI = cast<LoadInst>(Val: VL0);
12755 Instruction *NewLI;
12756 Value *PO = LI->getPointerOperand();
12757 if (E->State == TreeEntry::Vectorize) {
12758 NewLI = Builder.CreateAlignedLoad(Ty: VecTy, Ptr: PO, Align: LI->getAlign());
12759 } else if (E->State == TreeEntry::StridedVectorize) {
12760 Value *Ptr0 = cast<LoadInst>(Val: E->Scalars.front())->getPointerOperand();
12761 Value *PtrN = cast<LoadInst>(Val: E->Scalars.back())->getPointerOperand();
12762 PO = IsReverseOrder ? PtrN : Ptr0;
12763 std::optional<int> Diff = getPointersDiff(
12764 ElemTyA: VL0->getType(), PtrA: Ptr0, ElemTyB: VL0->getType(), PtrB: PtrN, DL: *DL, SE&: *SE);
12765 Type *StrideTy = DL->getIndexType(PtrTy: PO->getType());
12766 Value *StrideVal;
12767 if (Diff) {
12768 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
12769 StrideVal =
12770 ConstantInt::get(Ty: StrideTy, V: (IsReverseOrder ? -1 : 1) * Stride *
12771 DL->getTypeAllocSize(Ty: ScalarTy));
12772 } else {
12773 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
12774 transform(Range&: E->Scalars, d_first: PointerOps.begin(), F: [](Value *V) {
12775 return cast<LoadInst>(Val: V)->getPointerOperand();
12776 });
12777 OrdersType Order;
12778 std::optional<Value *> Stride =
12779 calculateRtStride(PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order,
12780 Inst: &*Builder.GetInsertPoint());
12781 Value *NewStride =
12782 Builder.CreateIntCast(V: *Stride, DestTy: StrideTy, /*isSigned=*/true);
12783 StrideVal = Builder.CreateMul(
12784 LHS: NewStride,
12785 RHS: ConstantInt::get(
12786 Ty: StrideTy,
12787 V: (IsReverseOrder ? -1 : 1) *
12788 static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))));
12789 }
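      // The stride is in bytes and is negated for reversed orders. For
      // example, four consecutive i32 loads vectorized in reverse order start
      // from the last pointer with a stride of -4, producing roughly:
      //   llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr %pN, i64 -4,
      //                                                  <all-true mask>, i32 4)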
12790 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
12791 auto *Inst = Builder.CreateIntrinsic(
12792 Intrinsic::experimental_vp_strided_load,
12793 {VecTy, PO->getType(), StrideTy},
12794 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
12795 Builder.getInt32(E->Scalars.size())});
12796 Inst->addParamAttr(
12797 /*ArgNo=*/0,
12798 Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
12799 NewLI = Inst;
12800 } else {
12801 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
12802 Value *VecPtr = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12803 if (E->VectorizedValue) {
12804 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12805 return E->VectorizedValue;
12806 }
12807 // Use the minimum alignment of the gathered loads.
12808 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
12809 NewLI = Builder.CreateMaskedGather(Ty: VecTy, Ptrs: VecPtr, Alignment: CommonAlignment);
12810 }
12811 Value *V = propagateMetadata(I: NewLI, VL: E->Scalars);
12812
12813 V = FinalShuffle(V, E, VecTy);
12814 E->VectorizedValue = V;
12815 ++NumVectorInstructions;
12816 return V;
12817 }
12818 case Instruction::Store: {
12819 auto *SI = cast<StoreInst>(Val: VL0);
12820
12821 setInsertPointAfterBundle(E);
12822
12823 Value *VecValue = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12824 if (VecValue->getType() != VecTy)
12825 VecValue =
12826 Builder.CreateIntCast(V: VecValue, DestTy: VecTy, isSigned: GetOperandSignedness(0));
12827 VecValue = FinalShuffle(VecValue, E, VecTy);
12828
12829 Value *Ptr = SI->getPointerOperand();
12830 StoreInst *ST =
12831 Builder.CreateAlignedStore(Val: VecValue, Ptr, Align: SI->getAlign());
12832
12833 Value *V = propagateMetadata(I: ST, VL: E->Scalars);
12834
12835 E->VectorizedValue = V;
12836 ++NumVectorInstructions;
12837 return V;
12838 }
12839 case Instruction::GetElementPtr: {
12840 auto *GEP0 = cast<GetElementPtrInst>(Val: VL0);
12841 setInsertPointAfterBundle(E);
12842
12843 Value *Op0 = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12844 if (E->VectorizedValue) {
12845 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12846 return E->VectorizedValue;
12847 }
12848
12849 SmallVector<Value *> OpVecs;
12850 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
12851 Value *OpVec = vectorizeOperand(E, NodeIdx: J, PostponedPHIs);
12852 if (E->VectorizedValue) {
12853 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12854 return E->VectorizedValue;
12855 }
12856 OpVecs.push_back(Elt: OpVec);
12857 }
12858
12859 Value *V = Builder.CreateGEP(Ty: GEP0->getSourceElementType(), Ptr: Op0, IdxList: OpVecs);
12860 if (Instruction *I = dyn_cast<GetElementPtrInst>(Val: V)) {
12861 SmallVector<Value *> GEPs;
12862 for (Value *V : E->Scalars) {
12863 if (isa<GetElementPtrInst>(Val: V))
12864 GEPs.push_back(Elt: V);
12865 }
12866 V = propagateMetadata(I, VL: GEPs);
12867 }
12868
12869 V = FinalShuffle(V, E, VecTy);
12870
12871 E->VectorizedValue = V;
12872 ++NumVectorInstructions;
12873
12874 return V;
12875 }
12876 case Instruction::Call: {
12877 CallInst *CI = cast<CallInst>(Val: VL0);
12878 setInsertPointAfterBundle(E);
12879
12880 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12881
12882 SmallVector<Type *> ArgTys =
12883 buildIntrinsicArgTypes(CI, ID, VF: VecTy->getNumElements(),
12884 MinBW: It != MinBWs.end() ? It->second.first : 0);
12885 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
12886 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
12887 VecCallCosts.first <= VecCallCosts.second;
12888
12889 Value *ScalarArg = nullptr;
12890 SmallVector<Value *> OpVecs;
12891 SmallVector<Type *, 2> TysForDecl;
12892 // Add return type if intrinsic is overloaded on it.
12893 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: -1))
12894 TysForDecl.push_back(Elt: VecTy);
12895 auto *CEI = cast<CallInst>(Val: VL0);
12896 for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) {
12897 ValueList OpVL;
      // Some intrinsics have scalar arguments. Such an argument should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
        ScalarArg = CEI->getArgOperand(I);
        // If we decided to reduce the bitwidth of the abs intrinsic, its second
        // argument must be set to false (do not return poison if the value is
        // the signed minimum).
12904 if (ID == Intrinsic::abs && It != MinBWs.end() &&
12905 It->second.first < DL->getTypeSizeInBits(Ty: CEI->getType()))
12906 ScalarArg = Builder.getFalse();
12907 OpVecs.push_back(Elt: ScalarArg);
12908 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I))
12909 TysForDecl.push_back(Elt: ScalarArg->getType());
12910 continue;
12911 }
12912
12913 Value *OpVec = vectorizeOperand(E, NodeIdx: I, PostponedPHIs);
12914 if (E->VectorizedValue) {
12915 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12916 return E->VectorizedValue;
12917 }
12918 ScalarArg = CEI->getArgOperand(i: I);
12919 if (cast<VectorType>(Val: OpVec->getType())->getElementType() !=
12920 ScalarArg->getType() &&
12921 It == MinBWs.end()) {
12922 auto *CastTy = FixedVectorType::get(ElementType: ScalarArg->getType(),
12923 NumElts: VecTy->getNumElements());
12924 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: CastTy, isSigned: GetOperandSignedness(I));
12925 } else if (It != MinBWs.end()) {
12926 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
12927 }
12928 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
12929 OpVecs.push_back(Elt: OpVec);
12930 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I))
12931 TysForDecl.push_back(Elt: OpVec->getType());
12932 }
12933
12934 Function *CF;
12935 if (!UseIntrinsic) {
12936 VFShape Shape =
12937 VFShape::get(FTy: CI->getFunctionType(),
12938 EC: ElementCount::getFixed(
12939 MinVal: static_cast<unsigned>(VecTy->getNumElements())),
12940 HasGlobalPred: false /*HasGlobalPred*/);
12941 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
12942 } else {
12943 CF = Intrinsic::getDeclaration(M: F->getParent(), id: ID, Tys: TysForDecl);
12944 }
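    // CF is either the target's vector library function for this call (via
    // the VFDatabase/TLI mappings) or the declaration of the matching vector
    // intrinsic, depending on which variant was costed as profitable above.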
12945
12946 SmallVector<OperandBundleDef, 1> OpBundles;
12947 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
12948 Value *V = Builder.CreateCall(Callee: CF, Args: OpVecs, OpBundles);
12949
12950 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
12951 V = FinalShuffle(V, E, VecTy);
12952
12953 E->VectorizedValue = V;
12954 ++NumVectorInstructions;
12955 return V;
12956 }
12957 case Instruction::ShuffleVector: {
12958 assert(E->isAltShuffle() &&
12959 ((Instruction::isBinaryOp(E->getOpcode()) &&
12960 Instruction::isBinaryOp(E->getAltOpcode())) ||
12961 (Instruction::isCast(E->getOpcode()) &&
12962 Instruction::isCast(E->getAltOpcode())) ||
12963 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
12964 "Invalid Shuffle Vector Operand");
12965
12966 Value *LHS = nullptr, *RHS = nullptr;
12967 if (Instruction::isBinaryOp(Opcode: E->getOpcode()) || isa<CmpInst>(Val: VL0)) {
12968 setInsertPointAfterBundle(E);
12969 LHS = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12970 if (E->VectorizedValue) {
12971 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12972 return E->VectorizedValue;
12973 }
12974 RHS = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
12975 } else {
12976 setInsertPointAfterBundle(E);
12977 LHS = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12978 }
12979 if (E->VectorizedValue) {
12980 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12981 return E->VectorizedValue;
12982 }
12983 if (LHS && RHS &&
12984 ((Instruction::isBinaryOp(Opcode: E->getOpcode()) &&
12985 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
12986 (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()))) {
12987 assert((It != MinBWs.end() ||
12988 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12989 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12990 MinBWs.contains(getOperandEntry(E, 0)) ||
12991 MinBWs.contains(getOperandEntry(E, 1))) &&
12992 "Expected item in MinBWs.");
12993 Type *CastTy = VecTy;
12994 if (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()) {
12995 if (cast<VectorType>(Val: LHS->getType())
12996 ->getElementType()
12997 ->getIntegerBitWidth() < cast<VectorType>(Val: RHS->getType())
12998 ->getElementType()
12999 ->getIntegerBitWidth())
13000 CastTy = RHS->getType();
13001 else
13002 CastTy = LHS->getType();
13003 }
13004 if (LHS->getType() != CastTy)
13005 LHS = Builder.CreateIntCast(V: LHS, DestTy: CastTy, isSigned: GetOperandSignedness(0));
13006 if (RHS->getType() != CastTy)
13007 RHS = Builder.CreateIntCast(V: RHS, DestTy: CastTy, isSigned: GetOperandSignedness(1));
13008 }
13009
13010 Value *V0, *V1;
13011 if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
13012 V0 = Builder.CreateBinOp(
13013 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13014 V1 = Builder.CreateBinOp(
13015 Opc: static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13016 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
13017 V0 = Builder.CreateCmp(Pred: CI0->getPredicate(), LHS, RHS);
13018 auto *AltCI = cast<CmpInst>(Val: E->getAltOp());
13019 CmpInst::Predicate AltPred = AltCI->getPredicate();
13020 V1 = Builder.CreateCmp(Pred: AltPred, LHS, RHS);
13021 } else {
13022 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13023 unsigned SrcBWSz = DL->getTypeSizeInBits(
13024 Ty: cast<VectorType>(Val: LHS->getType())->getElementType());
13025 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
13026 if (BWSz <= SrcBWSz) {
13027 if (BWSz < SrcBWSz)
13028 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: It->second.first);
13029 assert(LHS->getType() == VecTy && "Expected same type as operand.");
13030 if (auto *I = dyn_cast<Instruction>(Val: LHS))
13031 LHS = propagateMetadata(I, VL: E->Scalars);
13032 E->VectorizedValue = LHS;
13033 ++NumVectorInstructions;
13034 return LHS;
13035 }
13036 }
13037 V0 = Builder.CreateCast(
13038 Op: static_cast<Instruction::CastOps>(E->getOpcode()), V: LHS, DestTy: VecTy);
13039 V1 = Builder.CreateCast(
13040 Op: static_cast<Instruction::CastOps>(E->getAltOpcode()), V: LHS, DestTy: VecTy);
13041 }
    // Add V0 and V1 to later analysis to try to find and remove matching
    // instructions, if any.
13044 for (Value *V : {V0, V1}) {
13045 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13046 GatherShuffleExtractSeq.insert(X: I);
13047 CSEBlocks.insert(V: I->getParent());
13048 }
13049 }
13050
13051 // Create shuffle to take alternate operations from the vector.
13052 // Also, gather up main and alt scalar ops to propagate IR flags to
13053 // each vector operation.
13054 ValueList OpScalars, AltScalars;
13055 SmallVector<int> Mask;
13056 E->buildAltOpShuffleMask(
13057 IsAltOp: [E, this](Instruction *I) {
13058 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13059 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
13060 TLI: *TLI);
13061 },
13062 Mask, OpScalars: &OpScalars, AltScalars: &AltScalars);
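    // Illustrative example: for scalars {add, sub, add, sub}, V0 is the vector
    // add, V1 is the vector sub, and Mask becomes {0, 5, 2, 7}, so each lane
    // of the final shuffle takes its result from the matching operation.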
13063
13064 propagateIRFlags(I: V0, VL: OpScalars, OpValue: E->getMainOp(), IncludeWrapFlags: It == MinBWs.end());
13065 propagateIRFlags(I: V1, VL: AltScalars, OpValue: E->getAltOp(), IncludeWrapFlags: It == MinBWs.end());
13066 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13067 // Drop nuw flags for abs(sub(commutative), true).
13068 if (auto *I = dyn_cast<Instruction>(Val: Vec);
13069 I && Opcode == Instruction::Sub && !MinBWs.contains(Val: E) &&
13070 any_of(Range&: E->Scalars, P: [](Value *V) {
13071 auto *IV = cast<Instruction>(Val: V);
13072 return IV->getOpcode() == Instruction::Sub &&
13073 isCommutative(I: cast<Instruction>(Val: IV));
13074 }))
13075 I->setHasNoUnsignedWrap(/*b=*/false);
13076 };
13077 DropNuwFlag(V0, E->getOpcode());
13078 DropNuwFlag(V1, E->getAltOpcode());
13079
13080 Value *V = Builder.CreateShuffleVector(V1: V0, V2: V1, Mask);
13081 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13082 V = propagateMetadata(I, VL: E->Scalars);
13083 GatherShuffleExtractSeq.insert(X: I);
13084 CSEBlocks.insert(V: I->getParent());
13085 }
13086
13087 E->VectorizedValue = V;
13088 ++NumVectorInstructions;
13089
13090 return V;
13091 }
13092 default:
13093 llvm_unreachable("unknown inst");
13094 }
13095 return nullptr;
13096}
13097
13098Value *BoUpSLP::vectorizeTree() {
13099 ExtraValueToDebugLocsMap ExternallyUsedValues;
13100 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13101 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13102}
13103
13104namespace {
13105/// Data type for handling buildvector sequences with the reused scalars from
13106/// other tree entries.
13107struct ShuffledInsertData {
13108 /// List of insertelements to be replaced by shuffles.
13109 SmallVector<InsertElementInst *> InsertElements;
13110 /// The parent vectors and shuffle mask for the given list of inserts.
13111 MapVector<Value *, SmallVector<int>> ValueMasks;
13112};
13113} // namespace
13114
13115Value *BoUpSLP::vectorizeTree(
13116 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13117 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13118 Instruction *ReductionRoot) {
13119 // All blocks must be scheduled before any instructions are inserted.
13120 for (auto &BSIter : BlocksSchedules) {
13121 scheduleBlock(BS: BSIter.second.get());
13122 }
  // Clear the Entry-to-LastInstruction table. It can be invalidated by
  // scheduling, so it needs to be rebuilt.
13125 EntryToLastInstruction.clear();
13126
13127 if (ReductionRoot)
13128 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
13129 IP: ReductionRoot->getIterator());
13130 else
13131 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
13132
13133 // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
13134 (void)vectorizeTree(E: VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13135 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13136 if (TE->State == TreeEntry::Vectorize &&
13137 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13138 TE->VectorizedValue)
13139 (void)vectorizeTree(E: TE.get(), /*PostponedPHIs=*/false);
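  // PHIs are emitted in two passes: the first pass creates empty vector PHIs
  // so that cyclic uses (e.g. a loop-carried value feeding its own update) can
  // refer to them, and the pass above then fills in the incoming values.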
  // Run through the list of postponed gathers and emit them, replacing the
  // temporarily emitted allocas with actual vector instructions.
13142 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13143 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13144 for (const TreeEntry *E : PostponedNodes) {
13145 auto *TE = const_cast<TreeEntry *>(E);
13146 if (auto *VecTE = getTreeEntry(V: TE->Scalars.front()))
13147 if (VecTE->isSame(VL: TE->UserTreeIndices.front().UserTE->getOperand(
13148 OpIdx: TE->UserTreeIndices.front().EdgeIdx)))
        // Found a gather node which is exactly the same as one of the
        // vectorized nodes. This may happen after reordering.
13151 continue;
13152 auto *PrevVec = cast<Instruction>(Val&: TE->VectorizedValue);
13153 TE->VectorizedValue = nullptr;
13154 auto *UserI =
13155 cast<Instruction>(Val&: TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were some
    // unresolved dependencies at the moment the stub instruction was emitted.
    // If any of these dependencies turns out to be an operand of another PHI
    // coming from this same block, the position of the stub instruction
    // becomes invalid, because the source vector that is supposed to feed this
    // gather node was inserted at the end of the block [after the stub
    // instruction]. So we need to adjust the insertion point again to the end
    // of the block.
13164 if (isa<PHINode>(Val: UserI)) {
13165 // Insert before all users.
13166 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13167 for (User *U : PrevVec->users()) {
13168 if (U == UserI)
13169 continue;
13170 auto *UI = dyn_cast<Instruction>(Val: U);
13171 if (!UI || isa<PHINode>(Val: UI) || UI->getParent() != InsertPt->getParent())
13172 continue;
13173 if (UI->comesBefore(Other: InsertPt))
13174 InsertPt = UI;
13175 }
13176 Builder.SetInsertPoint(InsertPt);
13177 } else {
13178 Builder.SetInsertPoint(PrevVec);
13179 }
13180 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13181 Value *Vec = vectorizeTree(E: TE, /*PostponedPHIs=*/false);
13182 if (Vec->getType() != PrevVec->getType()) {
13183 assert(Vec->getType()->isIntOrIntVectorTy() &&
13184 PrevVec->getType()->isIntOrIntVectorTy() &&
13185 "Expected integer vector types only.");
13186 std::optional<bool> IsSigned;
13187 for (Value *V : TE->Scalars) {
13188 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13189 auto It = MinBWs.find(Val: BaseTE);
13190 if (It != MinBWs.end()) {
13191 IsSigned = IsSigned.value_or(u: false) || It->second.second;
13192 if (*IsSigned)
13193 break;
13194 }
13195 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(Val: V)) {
13196 auto It = MinBWs.find(Val: MNTE);
13197 if (It != MinBWs.end()) {
13198 IsSigned = IsSigned.value_or(u: false) || It->second.second;
13199 if (*IsSigned)
13200 break;
13201 }
13202 }
13203 if (IsSigned.value_or(u: false))
13204 break;
13205 // Scan through gather nodes.
13206 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) {
13207 auto It = MinBWs.find(Val: BVE);
13208 if (It != MinBWs.end()) {
13209 IsSigned = IsSigned.value_or(u: false) || It->second.second;
13210 if (*IsSigned)
13211 break;
13212 }
13213 }
13214 if (IsSigned.value_or(u: false))
13215 break;
13216 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V)) {
13217 IsSigned =
13218 IsSigned.value_or(u: false) ||
13219 !isKnownNonNegative(V: EE->getVectorOperand(), SQ: SimplifyQuery(*DL));
13220 continue;
13221 }
13222 if (IsSigned.value_or(u: false))
13223 break;
13224 }
13225 }
13226 if (IsSigned.value_or(u: false)) {
13227 // Final attempt - check user node.
13228 auto It = MinBWs.find(Val: TE->UserTreeIndices.front().UserTE);
13229 if (It != MinBWs.end())
13230 IsSigned = It->second.second;
13231 }
13232 assert(IsSigned &&
13233 "Expected user node or perfect diamond match in MinBWs.");
13234 Vec = Builder.CreateIntCast(V: Vec, DestTy: PrevVec->getType(), isSigned: *IsSigned);
13235 }
13236 PrevVec->replaceAllUsesWith(V: Vec);
13237 PostponedValues.try_emplace(Key: Vec).first->second.push_back(Elt: TE);
    // Replace the stub vector node if it was already used for one of the
    // buildvector nodes.
13240 auto It = PostponedValues.find(Val: PrevVec);
13241 if (It != PostponedValues.end()) {
13242 for (TreeEntry *VTE : It->getSecond())
13243 VTE->VectorizedValue = Vec;
13244 }
13245 eraseInstruction(I: PrevVec);
13246 }
13247
  LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
                    << " values.\n");
13250
13251 SmallVector<ShuffledInsertData> ShuffledInserts;
13252 // Maps vector instruction to original insertelement instruction
13253 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13254 // Maps extract Scalar to the corresponding extractelement instruction in the
13255 // basic block. Only one extractelement per block should be emitted.
13256 DenseMap<Value *,
13257 DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13258 ScalarToEEs;
13259 SmallDenseSet<Value *, 4> UsedInserts;
13260 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13261 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13262 // Extract all of the elements with the external uses.
13263 for (const auto &ExternalUse : ExternalUses) {
13264 Value *Scalar = ExternalUse.Scalar;
13265 llvm::User *User = ExternalUse.User;
13266
    // Skip users that we have already RAUWed. This happens when one
    // instruction has multiple uses of the same value.
13269 if (User && !is_contained(Range: Scalar->users(), Element: User))
13270 continue;
13271 TreeEntry *E = getTreeEntry(V: Scalar);
13272 assert(E && "Invalid scalar");
13273 assert(E->State != TreeEntry::NeedToGather &&
13274 "Extracting from a gather list");
13275 // Non-instruction pointers are not deleted, just skip them.
13276 if (E->getOpcode() == Instruction::GetElementPtr &&
13277 !isa<GetElementPtrInst>(Val: Scalar))
13278 continue;
13279
13280 Value *Vec = E->VectorizedValue;
13281 assert(Vec && "Can't find vectorizable value");
13282
13283 Value *Lane = Builder.getInt32(C: ExternalUse.Lane);
13284 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13285 if (Scalar->getType() != Vec->getType()) {
13286 Value *Ex = nullptr;
13287 Value *ExV = nullptr;
13288 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Scalar);
13289 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(Ptr: GEP);
13290 auto It = ScalarToEEs.find(Val: Scalar);
13291 if (It != ScalarToEEs.end()) {
13292 // No need to emit many extracts, just move the only one in the
13293 // current block.
13294 auto EEIt = It->second.find(Val: Builder.GetInsertBlock());
13295 if (EEIt != It->second.end()) {
13296 Instruction *I = EEIt->second.first;
13297 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13298 Builder.GetInsertPoint()->comesBefore(Other: I)) {
13299 I->moveBefore(BB&: *Builder.GetInsertPoint()->getParent(),
13300 I: Builder.GetInsertPoint());
13301 if (auto *CI = EEIt->second.second)
13302 CI->moveAfter(MovePos: I);
13303 }
13304 Ex = I;
13305 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13306 }
13307 }
13308 if (!Ex) {
13309 // "Reuse" the existing extract to improve final codegen.
13310 if (auto *ES = dyn_cast<ExtractElementInst>(Val: Scalar)) {
13311 Value *V = ES->getVectorOperand();
13312 if (const TreeEntry *ETE = getTreeEntry(V))
13313 V = ETE->VectorizedValue;
13314 Ex = Builder.CreateExtractElement(Vec: V, Idx: ES->getIndexOperand());
13315 } else if (ReplaceGEP) {
13316 // Leave the GEPs as is, they are free in most cases and better to
13317 // keep them as GEPs.
13318 auto *CloneGEP = GEP->clone();
13319 CloneGEP->insertBefore(BB&: *Builder.GetInsertBlock(),
13320 InsertPos: Builder.GetInsertPoint());
13321 if (GEP->hasName())
13322 CloneGEP->takeName(V: GEP);
13323 Ex = CloneGEP;
13324 } else {
13325 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
13326 }
13327 // If necessary, sign-extend or zero-extend ScalarRoot
13328 // to the larger type.
13329 ExV = Ex;
13330 if (Scalar->getType() != Ex->getType())
13331 ExV = Builder.CreateIntCast(V: Ex, DestTy: Scalar->getType(),
13332 isSigned: MinBWs.find(Val: E)->second.second);
13333 if (auto *I = dyn_cast<Instruction>(Val: Ex))
13334 ScalarToEEs[Scalar].try_emplace(
13335 Key: Builder.GetInsertBlock(),
13336 Args: std::make_pair(x&: I, y: cast<Instruction>(Val: ExV)));
13337 }
        // The 'then' branch of the previous 'if' may produce constants, since
        // operand 0 might be a constant.
13340 if (auto *ExI = dyn_cast<Instruction>(Val: Ex)) {
13341 GatherShuffleExtractSeq.insert(X: ExI);
13342 CSEBlocks.insert(V: ExI->getParent());
13343 }
13344 return ExV;
13345 }
13346 assert(isa<FixedVectorType>(Scalar->getType()) &&
13347 isa<InsertElementInst>(Scalar) &&
13348 "In-tree scalar of vector type is not insertelement?");
13349 auto *IE = cast<InsertElementInst>(Val: Scalar);
13350 VectorToInsertElement.try_emplace(Key: Vec, Args&: IE);
13351 return Vec;
13352 };
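    // ExtractAndExtendIfNeeded materializes the scalar from the vectorized
    // value: it reuses an existing extractelement in the current block when
    // possible, otherwise emits a new one, and int-casts the result back to
    // the original scalar type if the node was demoted via MinBWs.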
    // If User == nullptr, the Scalar remains as a scalar in the vectorized
    // instructions or is used as an extra argument. Generate an ExtractElement
    // instruction and update the record for this scalar in
    // ExternallyUsedValues.
13356 if (!User) {
13357 if (!ScalarsWithNullptrUser.insert(V: Scalar).second)
13358 continue;
13359 assert((ExternallyUsedValues.count(Scalar) ||
13360 any_of(Scalar->users(),
13361 [&](llvm::User *U) {
13362 if (ExternalUsesAsGEPs.contains(U))
13363 return true;
13364 TreeEntry *UseEntry = getTreeEntry(U);
13365 return UseEntry &&
13366 (UseEntry->State == TreeEntry::Vectorize ||
13367 UseEntry->State ==
13368 TreeEntry::StridedVectorize) &&
13369 (E->State == TreeEntry::Vectorize ||
13370 E->State == TreeEntry::StridedVectorize) &&
13371 doesInTreeUserNeedToExtract(
13372 Scalar,
13373 cast<Instruction>(UseEntry->Scalars.front()),
13374 TLI);
13375 })) &&
13376 "Scalar with nullptr User must be registered in "
13377 "ExternallyUsedValues map or remain as scalar in vectorized "
13378 "instructions");
13379 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
13380 if (auto *PHI = dyn_cast<PHINode>(Val: VecI))
13381 Builder.SetInsertPoint(TheBB: PHI->getParent(),
13382 IP: PHI->getParent()->getFirstNonPHIIt());
13383 else
13384 Builder.SetInsertPoint(TheBB: VecI->getParent(),
13385 IP: std::next(x: VecI->getIterator()));
13386 } else {
13387 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
13388 }
13389 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13390 // Required to update internally referenced instructions.
13391 Scalar->replaceAllUsesWith(V: NewInst);
13392 ReplacedExternals.emplace_back(Args&: Scalar, Args&: NewInst);
13393 continue;
13394 }
13395
13396 if (auto *VU = dyn_cast<InsertElementInst>(Val: User)) {
13397 // Skip if the scalar is another vector op or Vec is not an instruction.
13398 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Val: Vec)) {
13399 if (auto *FTy = dyn_cast<FixedVectorType>(Val: User->getType())) {
13400 if (!UsedInserts.insert(V: VU).second)
13401 continue;
13402 // Need to use original vector, if the root is truncated.
13403 auto BWIt = MinBWs.find(Val: E);
13404 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13405 auto *ScalarTy = FTy->getElementType();
13406 auto Key = std::make_pair(x&: Vec, y&: ScalarTy);
13407 auto VecIt = VectorCasts.find(Val: Key);
13408 if (VecIt == VectorCasts.end()) {
13409 IRBuilderBase::InsertPointGuard Guard(Builder);
13410 if (auto *IVec = dyn_cast<Instruction>(Val: Vec))
13411 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
13412 Vec = Builder.CreateIntCast(
13413 V: Vec,
13414 DestTy: FixedVectorType::get(
13415 ElementType: ScalarTy,
13416 NumElts: cast<FixedVectorType>(Val: Vec->getType())->getNumElements()),
13417 isSigned: BWIt->second.second);
13418 VectorCasts.try_emplace(Key, Args&: Vec);
13419 } else {
13420 Vec = VecIt->second;
13421 }
13422 }
13423
13424 std::optional<unsigned> InsertIdx = getInsertIndex(InsertInst: VU);
13425 if (InsertIdx) {
13426 auto *It =
13427 find_if(Range&: ShuffledInserts, P: [VU](const ShuffledInsertData &Data) {
13428 // Checks if 2 insertelements are from the same buildvector.
13429 InsertElementInst *VecInsert = Data.InsertElements.front();
13430 return areTwoInsertFromSameBuildVector(
13431 VU, V: VecInsert,
13432 GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); });
13433 });
13434 unsigned Idx = *InsertIdx;
13435 if (It == ShuffledInserts.end()) {
13436 (void)ShuffledInserts.emplace_back();
13437 It = std::next(x: ShuffledInserts.begin(),
13438 n: ShuffledInserts.size() - 1);
13439 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13440 if (Mask.empty())
13441 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
13442 // Find the insertvector, vectorized in tree, if any.
13443 Value *Base = VU;
13444 while (auto *IEBase = dyn_cast<InsertElementInst>(Val: Base)) {
13445 if (IEBase != User &&
13446 (!IEBase->hasOneUse() ||
13447 getInsertIndex(InsertInst: IEBase).value_or(u&: Idx) == Idx))
13448 break;
13449 // Build the mask for the vectorized insertelement instructions.
13450 if (const TreeEntry *E = getTreeEntry(V: IEBase)) {
13451 do {
13452 IEBase = cast<InsertElementInst>(Val: Base);
13453 int IEIdx = *getInsertIndex(InsertInst: IEBase);
13454 assert(Mask[IEIdx] == PoisonMaskElem &&
13455 "InsertElementInstruction used already.");
13456 Mask[IEIdx] = IEIdx;
13457 Base = IEBase->getOperand(i_nocapture: 0);
13458 } while (E == getTreeEntry(V: Base));
13459 break;
13460 }
13461 Base = cast<InsertElementInst>(Val: Base)->getOperand(i_nocapture: 0);
              // After vectorization the def-use chain has changed, so we need
              // to look through the original insertelement instructions if
              // they were replaced by vector instructions.
13465 auto It = VectorToInsertElement.find(Val: Base);
13466 if (It != VectorToInsertElement.end())
13467 Base = It->second;
13468 }
13469 }
13470 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13471 if (Mask.empty())
13472 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
13473 Mask[Idx] = ExternalUse.Lane;
13474 It->InsertElements.push_back(Elt: cast<InsertElementInst>(Val: User));
13475 continue;
13476 }
13477 }
13478 }
13479 }
13480
13481 // Generate extracts for out-of-tree users.
13482 // Find the insertion point for the extractelement lane.
13483 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
13484 if (PHINode *PH = dyn_cast<PHINode>(Val: User)) {
13485 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) {
13486 if (PH->getIncomingValue(i: I) == Scalar) {
13487 Instruction *IncomingTerminator =
13488 PH->getIncomingBlock(i: I)->getTerminator();
13489 if (isa<CatchSwitchInst>(Val: IncomingTerminator)) {
13490 Builder.SetInsertPoint(TheBB: VecI->getParent(),
13491 IP: std::next(x: VecI->getIterator()));
13492 } else {
13493 Builder.SetInsertPoint(PH->getIncomingBlock(i: I)->getTerminator());
13494 }
13495 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13496 PH->setOperand(i_nocapture: I, Val_nocapture: NewInst);
13497 }
13498 }
13499 } else {
13500 Builder.SetInsertPoint(cast<Instruction>(Val: User));
13501 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13502 User->replaceUsesOfWith(From: Scalar, To: NewInst);
13503 }
13504 } else {
13505 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
13506 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13507 User->replaceUsesOfWith(From: Scalar, To: NewInst);
13508 }
13509
13510 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
13511 }
13512
13513 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
13514 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
13515 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
13516 int VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
13517 for (int I = 0, E = Mask.size(); I < E; ++I) {
13518 if (Mask[I] < VF)
13519 CombinedMask1[I] = Mask[I];
13520 else
13521 CombinedMask2[I] = Mask[I] - VF;
13522 }
13523 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
13524 ShuffleBuilder.add(V1, Mask: CombinedMask1);
13525 if (V2)
13526 ShuffleBuilder.add(V1: V2, Mask: CombinedMask2);
13527 return ShuffleBuilder.finalize(ExtMask: std::nullopt);
13528 };
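  // Illustrative example: a combined mask {0, 5, 2, 7} over two 4-lane inputs
  // is split by CreateShuffle into {0, P, 2, P} for V1 and {P, 1, P, 3} for V2
  // (P = poison) before being handed to the shuffle builder.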
13529
13530 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
13531 bool ForSingleMask) {
13532 unsigned VF = Mask.size();
13533 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
13534 if (VF != VecVF) {
13535 if (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
13536 Vec = CreateShuffle(Vec, nullptr, Mask);
13537 return std::make_pair(x&: Vec, y: true);
13538 }
13539 if (!ForSingleMask) {
13540 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
13541 for (unsigned I = 0; I < VF; ++I) {
13542 if (Mask[I] != PoisonMaskElem)
13543 ResizeMask[Mask[I]] = Mask[I];
13544 }
13545 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
13546 }
13547 }
13548
13549 return std::make_pair(x&: Vec, y: false);
13550 };
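  // ResizeToVF adjusts an already vectorized value to the vector factor of
  // the mask, emitting an extra resizing shuffle when the sizes do not match.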
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
13553 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
13554 // Find the first and the last instruction in the list of insertelements.
13555 sort(C&: ShuffledInserts[I].InsertElements, Comp: isFirstInsertElement);
13556 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
13557 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
13558 Builder.SetInsertPoint(LastInsert);
13559 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
13560 Value *NewInst = performExtractsShuffleAction<Value>(
13561 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()),
13562 Base: FirstInsert->getOperand(i_nocapture: 0),
13563 GetVF: [](Value *Vec) {
13564 return cast<VectorType>(Val: Vec->getType())
13565 ->getElementCount()
13566 .getKnownMinValue();
13567 },
13568 ResizeAction: ResizeToVF,
13569 Action: [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
13570 ArrayRef<Value *> Vals) {
13571 assert((Vals.size() == 1 || Vals.size() == 2) &&
13572 "Expected exactly 1 or 2 input values.");
13573 if (Vals.size() == 1) {
13574 // Do not create shuffle if the mask is a simple identity
13575 // non-resizing mask.
13576 if (Mask.size() != cast<FixedVectorType>(Val: Vals.front()->getType())
13577 ->getNumElements() ||
13578 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
13579 return CreateShuffle(Vals.front(), nullptr, Mask);
13580 return Vals.front();
13581 }
13582 return CreateShuffle(Vals.front() ? Vals.front()
13583 : FirstInsert->getOperand(i_nocapture: 0),
13584 Vals.back(), Mask);
13585 });
13586 auto It = ShuffledInserts[I].InsertElements.rbegin();
13587 // Rebuild buildvector chain.
13588 InsertElementInst *II = nullptr;
13589 if (It != ShuffledInserts[I].InsertElements.rend())
13590 II = *It;
13591 SmallVector<Instruction *> Inserts;
13592 while (It != ShuffledInserts[I].InsertElements.rend()) {
13593 assert(II && "Must be an insertelement instruction.");
13594 if (*It == II)
13595 ++It;
13596 else
13597 Inserts.push_back(Elt: cast<Instruction>(Val: II));
13598 II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
13599 }
13600 for (Instruction *II : reverse(C&: Inserts)) {
13601 II->replaceUsesOfWith(From: II->getOperand(i: 0), To: NewInst);
13602 if (auto *NewI = dyn_cast<Instruction>(Val: NewInst))
13603 if (II->getParent() == NewI->getParent() && II->comesBefore(Other: NewI))
13604 II->moveAfter(MovePos: NewI);
13605 NewInst = II;
13606 }
13607 LastInsert->replaceAllUsesWith(V: NewInst);
13608 for (InsertElementInst *IE : reverse(C&: ShuffledInserts[I].InsertElements)) {
13609 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 0),
13610 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 0)->getType()));
13611 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 1),
13612 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 1)->getType()));
13613 eraseInstruction(I: IE);
13614 }
13615 CSEBlocks.insert(V: LastInsert->getParent());
13616 }
13617
13618 SmallVector<Instruction *> RemovedInsts;
13619 // For each vectorized value:
13620 for (auto &TEPtr : VectorizableTree) {
13621 TreeEntry *Entry = TEPtr.get();
13622
13623 // No need to handle users of gathered values.
13624 if (Entry->State == TreeEntry::NeedToGather)
13625 continue;
13626
13627 assert(Entry->VectorizedValue && "Can't find vectorizable value");
13628
13629 // For each lane:
13630 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
13631 Value *Scalar = Entry->Scalars[Lane];
13632
13633 if (Entry->getOpcode() == Instruction::GetElementPtr &&
13634 !isa<GetElementPtrInst>(Val: Scalar))
13635 continue;
13636#ifndef NDEBUG
13637 Type *Ty = Scalar->getType();
13638 if (!Ty->isVoidTy()) {
13639 for (User *U : Scalar->users()) {
13640 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
13641
13642 // It is legal to delete users in the ignorelist.
13643 assert((getTreeEntry(U) ||
13644 (UserIgnoreList && UserIgnoreList->contains(U)) ||
13645 (isa_and_nonnull<Instruction>(U) &&
13646 isDeleted(cast<Instruction>(U)))) &&
13647 "Deleting out-of-tree value");
13648 }
13649 }
13650#endif
13651 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
13652 eraseInstruction(I: cast<Instruction>(Val: Scalar));
13653 // Retain to-be-deleted instructions for some debug-info
13654 // bookkeeping. NOTE: eraseInstruction only marks the instruction for
13655 // deletion - instructions are not deleted until later.
13656 RemovedInsts.push_back(Elt: cast<Instruction>(Val: Scalar));
13657 }
13658 }
13659
13660 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
13661 // new vector instruction.
13662 if (auto *V = dyn_cast<Instruction>(Val&: VectorizableTree[0]->VectorizedValue))
13663 V->mergeDIAssignID(SourceInstructions: RemovedInsts);
13664
13665 Builder.ClearInsertionPoint();
13666 InstrElementSize.clear();
13667
13668 const TreeEntry &RootTE = *VectorizableTree.front().get();
13669 Value *Vec = RootTE.VectorizedValue;
13670 if (auto It = MinBWs.find(Val: &RootTE); ReductionBitWidth != 0 &&
13671 It != MinBWs.end() &&
13672 ReductionBitWidth != It->second.first) {
13673 IRBuilder<>::InsertPointGuard Guard(Builder);
13674 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
13675 IP: ReductionRoot->getIterator());
13676 Vec = Builder.CreateIntCast(
13677 V: Vec,
13678 DestTy: VectorType::get(ElementType: Builder.getIntNTy(N: ReductionBitWidth),
13679 EC: cast<VectorType>(Val: Vec->getType())->getElementCount()),
13680 isSigned: It->second.second);
13681 }
13682 return Vec;
13683}
13684
13685void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequence instructions.\n");
13688 // LICM InsertElementInst sequences.
13689 for (Instruction *I : GatherShuffleExtractSeq) {
13690 if (isDeleted(I))
13691 continue;
13692
13693 // Check if this block is inside a loop.
13694 Loop *L = LI->getLoopFor(BB: I->getParent());
13695 if (!L)
13696 continue;
13697
13698 // Check if it has a preheader.
13699 BasicBlock *PreHeader = L->getLoopPreheader();
13700 if (!PreHeader)
13701 continue;
13702
    // If the vector or the element that we insert into it are
    // instructions that are defined inside the loop, then we can't
    // hoist this instruction out of the loop.
13706 if (any_of(Range: I->operands(), P: [L](Value *V) {
13707 auto *OpI = dyn_cast<Instruction>(Val: V);
13708 return OpI && L->contains(Inst: OpI);
13709 }))
13710 continue;
13711
13712 // We can hoist this instruction. Move it to the pre-header.
13713 I->moveBefore(MovePos: PreHeader->getTerminator());
13714 CSEBlocks.insert(V: PreHeader);
13715 }
13716
13717 // Make a list of all reachable blocks in our CSE queue.
13718 SmallVector<const DomTreeNode *, 8> CSEWorkList;
13719 CSEWorkList.reserve(N: CSEBlocks.size());
13720 for (BasicBlock *BB : CSEBlocks)
13721 if (DomTreeNode *N = DT->getNode(BB)) {
13722 assert(DT->isReachableFromEntry(N));
13723 CSEWorkList.push_back(Elt: N);
13724 }
13725
13726 // Sort blocks by domination. This ensures we visit a block after all blocks
13727 // dominating it are visited.
13728 llvm::sort(C&: CSEWorkList, Comp: [](const DomTreeNode *A, const DomTreeNode *B) {
13729 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
13730 "Different nodes should have different DFS numbers");
13731 return A->getDFSNumIn() < B->getDFSNumIn();
13732 });
13733
  // Less defined shuffles can be replaced by more defined copies.
  // Given two shuffles, one is less defined than the other if it has the same
  // vector operands and each of its mask indices is either undef or matches
  // the corresponding index of the other one. E.g. shuffle %0, poison,
  // <0, 0, 0, undef> is less defined than shuffle %0, poison, <0, 0, 0, 0>.
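  // For illustration only (hypothetical shuffles with the same vector
  // operands, not taken from a real test):
  //   SI1: shufflevector <4 x i32> %0, <4 x i32> poison, <0, 1, undef, undef>
  //   SI2: shufflevector <4 x i32> %0, <4 x i32> poison, <0, 1, 2, undef>
  // SI1 is less defined than SI2, so uses of SI1 can be redirected to SI2
  // (with the masks merged), provided the trailing undefs do not change the
  // number of vector registers actually used.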
13739 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
13740 SmallVectorImpl<int> &NewMask) {
13741 if (I1->getType() != I2->getType())
13742 return false;
13743 auto *SI1 = dyn_cast<ShuffleVectorInst>(Val: I1);
13744 auto *SI2 = dyn_cast<ShuffleVectorInst>(Val: I2);
13745 if (!SI1 || !SI2)
13746 return I1->isIdenticalTo(I: I2);
13747 if (SI1->isIdenticalTo(I: SI2))
13748 return true;
13749 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
13750 if (SI1->getOperand(i_nocapture: I) != SI2->getOperand(i_nocapture: I))
13751 return false;
13752 // Check if the second instruction is more defined than the first one.
13753 NewMask.assign(in_start: SI2->getShuffleMask().begin(), in_end: SI2->getShuffleMask().end());
13754 ArrayRef<int> SM1 = SI1->getShuffleMask();
13755 // Count trailing undefs in the mask to check the final number of used
13756 // registers.
13757 unsigned LastUndefsCnt = 0;
13758 for (int I = 0, E = NewMask.size(); I < E; ++I) {
13759 if (SM1[I] == PoisonMaskElem)
13760 ++LastUndefsCnt;
13761 else
13762 LastUndefsCnt = 0;
13763 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
13764 NewMask[I] != SM1[I])
13765 return false;
13766 if (NewMask[I] == PoisonMaskElem)
13767 NewMask[I] = SM1[I];
13768 }
13769 // Check if the last undefs actually change the final number of used vector
13770 // registers.
13771 return SM1.size() - LastUndefsCnt > 1 &&
13772 TTI->getNumberOfParts(Tp: SI1->getType()) ==
13773 TTI->getNumberOfParts(
13774 Tp: FixedVectorType::get(ElementType: SI1->getType()->getElementType(),
13775 NumElts: SM1.size() - LastUndefsCnt));
13776 };
13777 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
13778 // instructions. TODO: We can further optimize this scan if we split the
13779 // instructions into different buckets based on the insert lane.
13780 SmallVector<Instruction *, 16> Visited;
13781 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
13782 assert(*I &&
13783 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
13784 "Worklist not sorted properly!");
13785 BasicBlock *BB = (*I)->getBlock();
13786 // For all instructions in blocks containing gather sequences:
13787 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
13788 if (isDeleted(I: &In))
13789 continue;
13790 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(Val: &In) &&
13791 !GatherShuffleExtractSeq.contains(key: &In))
13792 continue;
13793
13794 // Check if we can replace this instruction with any of the
13795 // visited instructions.
13796 bool Replaced = false;
13797 for (Instruction *&V : Visited) {
13798 SmallVector<int> NewMask;
13799 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
13800 DT->dominates(A: V->getParent(), B: In.getParent())) {
13801 In.replaceAllUsesWith(V);
13802 eraseInstruction(I: &In);
13803 if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: V))
13804 if (!NewMask.empty())
13805 SI->setShuffleMask(NewMask);
13806 Replaced = true;
13807 break;
13808 }
13809 if (isa<ShuffleVectorInst>(Val: In) && isa<ShuffleVectorInst>(Val: V) &&
13810 GatherShuffleExtractSeq.contains(key: V) &&
13811 IsIdenticalOrLessDefined(V, &In, NewMask) &&
13812 DT->dominates(A: In.getParent(), B: V->getParent())) {
13813 In.moveAfter(MovePos: V);
13814 V->replaceAllUsesWith(V: &In);
13815 eraseInstruction(I: V);
13816 if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: &In))
13817 if (!NewMask.empty())
13818 SI->setShuffleMask(NewMask);
13819 V = &In;
13820 Replaced = true;
13821 break;
13822 }
13823 }
13824 if (!Replaced) {
13825 assert(!is_contained(Visited, &In));
13826 Visited.push_back(Elt: &In);
13827 }
13828 }
13829 }
13830 CSEBlocks.clear();
13831 GatherShuffleExtractSeq.clear();
13832}
13833
13834BoUpSLP::ScheduleData *
13835BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
13836 ScheduleData *Bundle = nullptr;
13837 ScheduleData *PrevInBundle = nullptr;
13838 for (Value *V : VL) {
13839 if (doesNotNeedToBeScheduled(V))
13840 continue;
13841 ScheduleData *BundleMember = getScheduleData(V);
13842 assert(BundleMember &&
13843 "no ScheduleData for bundle member "
13844 "(maybe not in same basic block)");
13845 assert(BundleMember->isSchedulingEntity() &&
13846 "bundle member already part of other bundle");
13847 if (PrevInBundle) {
13848 PrevInBundle->NextInBundle = BundleMember;
13849 } else {
13850 Bundle = BundleMember;
13851 }
13852
13853 // Group the instructions to a bundle.
13854 BundleMember->FirstInBundle = Bundle;
13855 PrevInBundle = BundleMember;
13856 }
13857 assert(Bundle && "Failed to find schedule bundle");
13858 return Bundle;
13859}
13860
// Groups the instructions into a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
13863std::optional<BoUpSLP::ScheduleData *>
13864BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
13865 const InstructionsState &S) {
13866 // No need to schedule PHIs, insertelement, extractelement and extractvalue
13867 // instructions.
13868 if (isa<PHINode>(Val: S.OpValue) || isVectorLikeInstWithConstOps(V: S.OpValue) ||
13869 doesNotNeedToSchedule(VL))
13870 return nullptr;
13871
13872 // Initialize the instruction bundle.
13873 Instruction *OldScheduleEnd = ScheduleEnd;
13874 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
13875
13876 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
13877 ScheduleData *Bundle) {
    // If the scheduling region got new instructions at the lower end (or it is
    // a new region for the first bundle), all dependencies need to be
    // recalculated. This seldom has to be done a second time after adding the
    // initial bundle to the region.
13883 if (ScheduleEnd != OldScheduleEnd) {
13884 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
13885 doForAllOpcodes(V: I, Action: [](ScheduleData *SD) { SD->clearDependencies(); });
13886 ReSchedule = true;
13887 }
13888 if (Bundle) {
13889 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
13890 << " in block " << BB->getName() << "\n");
13891 calculateDependencies(SD: Bundle, /*InsertInReadyList=*/true, SLP);
13892 }
13893
13894 if (ReSchedule) {
13895 resetSchedule();
13896 initialFillReadyList(ReadyList&: ReadyInsts);
13897 }
13898
    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready", there are no cyclic
    // dependencies and we can schedule it. Note that it's important that we
    // don't "schedule" the bundle yet (see cancelScheduling).
13903 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
13904 !ReadyInsts.empty()) {
13905 ScheduleData *Picked = ReadyInsts.pop_back_val();
13906 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
13907 "must be ready to schedule");
13908 schedule(SD: Picked, ReadyList&: ReadyInsts);
13909 }
13910 };
13911
13912 // Make sure that the scheduling region contains all
13913 // instructions of the bundle.
13914 for (Value *V : VL) {
13915 if (doesNotNeedToBeScheduled(V))
13916 continue;
13917 if (!extendSchedulingRegion(V, S)) {
      // The scheduling region got new instructions at the lower end (or it is
      // a new region for the first bundle), which makes it necessary to
      // recalculate all dependencies.
      // Otherwise the compiler may crash trying to incorrectly calculate
      // dependencies and emit instructions in the wrong order during the
      // actual scheduling.
13924 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
13925 return std::nullopt;
13926 }
13927 }
13928
13929 bool ReSchedule = false;
13930 for (Value *V : VL) {
13931 if (doesNotNeedToBeScheduled(V))
13932 continue;
13933 ScheduleData *BundleMember = getScheduleData(V);
13934 assert(BundleMember &&
13935 "no ScheduleData for bundle member (maybe not in same basic block)");
13936
    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
13939 ReadyInsts.remove(X: BundleMember);
13940
13941 if (!BundleMember->IsScheduled)
13942 continue;
13943 // A bundle member was scheduled as single instruction before and now
13944 // needs to be scheduled as part of the bundle. We just get rid of the
13945 // existing schedule.
13946 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
13947 << " was already scheduled\n");
13948 ReSchedule = true;
13949 }
13950
13951 auto *Bundle = buildBundle(VL);
13952 TryScheduleBundleImpl(ReSchedule, Bundle);
13953 if (!Bundle->isReady()) {
13954 cancelScheduling(VL, OpValue: S.OpValue);
13955 return std::nullopt;
13956 }
13957 return Bundle;
13958}
13959
13960void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
13961 Value *OpValue) {
13962 if (isa<PHINode>(Val: OpValue) || isVectorLikeInstWithConstOps(V: OpValue) ||
13963 doesNotNeedToSchedule(VL))
13964 return;
13965
13966 if (doesNotNeedToBeScheduled(V: OpValue))
13967 OpValue = *find_if_not(Range&: VL, P: doesNotNeedToBeScheduled);
13968 ScheduleData *Bundle = getScheduleData(V: OpValue);
13969 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
13970 assert(!Bundle->IsScheduled &&
13971 "Can't cancel bundle which is already scheduled");
13972 assert(Bundle->isSchedulingEntity() &&
13973 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
13974 "tried to unbundle something which is not a bundle");
13975
13976 // Remove the bundle from the ready list.
13977 if (Bundle->isReady())
13978 ReadyInsts.remove(X: Bundle);
13979
13980 // Un-bundle: make single instructions out of the bundle.
13981 ScheduleData *BundleMember = Bundle;
13982 while (BundleMember) {
13983 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
13984 BundleMember->FirstInBundle = BundleMember;
13985 ScheduleData *Next = BundleMember->NextInBundle;
13986 BundleMember->NextInBundle = nullptr;
13987 BundleMember->TE = nullptr;
13988 if (BundleMember->unscheduledDepsInBundle() == 0) {
13989 ReadyInsts.insert(X: BundleMember);
13990 }
13991 BundleMember = Next;
13992 }
13993}
13994
13995BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
13996 // Allocate a new ScheduleData for the instruction.
13997 if (ChunkPos >= ChunkSize) {
13998 ScheduleDataChunks.push_back(Elt: std::make_unique<ScheduleData[]>(num: ChunkSize));
13999 ChunkPos = 0;
14000 }
14001 return &(ScheduleDataChunks.back()[ChunkPos++]);
14002}
14003
14004bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
14005 const InstructionsState &S) {
14006 if (getScheduleData(V, Key: isOneOf(S, Op: V)))
14007 return true;
14008 Instruction *I = dyn_cast<Instruction>(Val: V);
14009 assert(I && "bundle member must be an instruction");
14010 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
14011 !doesNotNeedToBeScheduled(I) &&
14012 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14013 "be scheduled");
14014 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
14015 ScheduleData *ISD = getScheduleData(I);
14016 if (!ISD)
14017 return false;
14018 assert(isInSchedulingRegion(ISD) &&
14019 "ScheduleData not in scheduling region");
14020 ScheduleData *SD = allocateScheduleDataChunks();
14021 SD->Inst = I;
14022 SD->init(BlockSchedulingRegionID: SchedulingRegionID, OpVal: S.OpValue);
14023 ExtraScheduleDataMap[I][S.OpValue] = SD;
14024 return true;
14025 };
14026 if (CheckScheduleForI(I))
14027 return true;
14028 if (!ScheduleStart) {
14029 // It's the first instruction in the new region.
14030 initScheduleData(FromI: I, ToI: I->getNextNode(), PrevLoadStore: nullptr, NextLoadStore: nullptr);
14031 ScheduleStart = I;
14032 ScheduleEnd = I->getNextNode();
14033 if (isOneOf(S, Op: I) != I)
14034 CheckScheduleForI(I);
14035 assert(ScheduleEnd && "tried to vectorize a terminator?");
14036 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
14037 return true;
14038 }
14039 // Search up and down at the same time, because we don't know if the new
14040 // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that it's not
  // counted against the budget. Otherwise debug info could affect codegen.
14043 BasicBlock::reverse_iterator UpIter =
14044 ++ScheduleStart->getIterator().getReverse();
14045 BasicBlock::reverse_iterator UpperEnd = BB->rend();
14046 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
14047 BasicBlock::iterator LowerEnd = BB->end();
14048 auto IsAssumeLikeIntr = [](const Instruction &I) {
14049 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
14050 return II->isAssumeLikeIntrinsic();
14051 return false;
14052 };
14053 UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
14054 DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
14055 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
14056 &*DownIter != I) {
14057 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14058 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
14059 return false;
14060 }
14061
14062 ++UpIter;
14063 ++DownIter;
14064
14065 UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
14066 DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
14067 }
14068 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
14069 assert(I->getParent() == ScheduleStart->getParent() &&
14070 "Instruction is in wrong basic block.");
14071 initScheduleData(FromI: I, ToI: ScheduleStart, PrevLoadStore: nullptr, NextLoadStore: FirstLoadStoreInRegion);
14072 ScheduleStart = I;
14073 if (isOneOf(S, Op: I) != I)
14074 CheckScheduleForI(I);
14075 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
14076 << "\n");
14077 return true;
14078 }
14079 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
14080 "Expected to reach top of the basic block or instruction down the "
14081 "lower end.");
14082 assert(I->getParent() == ScheduleEnd->getParent() &&
14083 "Instruction is in wrong basic block.");
14084 initScheduleData(FromI: ScheduleEnd, ToI: I->getNextNode(), PrevLoadStore: LastLoadStoreInRegion,
14085 NextLoadStore: nullptr);
14086 ScheduleEnd = I->getNextNode();
14087 if (isOneOf(S, Op: I) != I)
14088 CheckScheduleForI(I);
14089 assert(ScheduleEnd && "tried to vectorize a terminator?");
14090 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
14091 return true;
14092}
14093
14094void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14095 Instruction *ToI,
14096 ScheduleData *PrevLoadStore,
14097 ScheduleData *NextLoadStore) {
14098 ScheduleData *CurrentLoadStore = PrevLoadStore;
14099 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14100 // No need to allocate data for non-schedulable instructions.
14101 if (doesNotNeedToBeScheduled(V: I))
14102 continue;
14103 ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
14104 if (!SD) {
14105 SD = allocateScheduleDataChunks();
14106 ScheduleDataMap[I] = SD;
14107 SD->Inst = I;
14108 }
14109 assert(!isInSchedulingRegion(SD) &&
14110 "new ScheduleData already in scheduling region");
14111 SD->init(BlockSchedulingRegionID: SchedulingRegionID, OpVal: I);
14112
14113 if (I->mayReadOrWriteMemory() &&
14114 (!isa<IntrinsicInst>(Val: I) ||
14115 (cast<IntrinsicInst>(Val: I)->getIntrinsicID() != Intrinsic::sideeffect &&
14116 cast<IntrinsicInst>(Val: I)->getIntrinsicID() !=
14117 Intrinsic::pseudoprobe))) {
14118 // Update the linked list of memory accessing instructions.
14119 if (CurrentLoadStore) {
14120 CurrentLoadStore->NextLoadStore = SD;
14121 } else {
14122 FirstLoadStoreInRegion = SD;
14123 }
14124 CurrentLoadStore = SD;
14125 }
14126
14127 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14128 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14129 RegionHasStackSave = true;
14130 }
14131 if (NextLoadStore) {
14132 if (CurrentLoadStore)
14133 CurrentLoadStore->NextLoadStore = NextLoadStore;
14134 } else {
14135 LastLoadStoreInRegion = CurrentLoadStore;
14136 }
14137}
14138
14139void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14140 bool InsertInReadyList,
14141 BoUpSLP *SLP) {
14142 assert(SD->isSchedulingEntity());
14143
14144 SmallVector<ScheduleData *, 10> WorkList;
14145 WorkList.push_back(Elt: SD);
14146
14147 while (!WorkList.empty()) {
14148 ScheduleData *SD = WorkList.pop_back_val();
14149 for (ScheduleData *BundleMember = SD; BundleMember;
14150 BundleMember = BundleMember->NextInBundle) {
14151 assert(isInSchedulingRegion(BundleMember));
14152 if (BundleMember->hasValidDependencies())
14153 continue;
14154
14155 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
14156 << "\n");
14157 BundleMember->Dependencies = 0;
14158 BundleMember->resetUnscheduledDeps();
14159
14160 // Handle def-use chain dependencies.
14161 if (BundleMember->OpValue != BundleMember->Inst) {
14162 if (ScheduleData *UseSD = getScheduleData(I: BundleMember->Inst)) {
14163 BundleMember->Dependencies++;
14164 ScheduleData *DestBundle = UseSD->FirstInBundle;
14165 if (!DestBundle->IsScheduled)
14166 BundleMember->incrementUnscheduledDeps(Incr: 1);
14167 if (!DestBundle->hasValidDependencies())
14168 WorkList.push_back(Elt: DestBundle);
14169 }
14170 } else {
14171 for (User *U : BundleMember->Inst->users()) {
14172 if (ScheduleData *UseSD = getScheduleData(I: cast<Instruction>(Val: U))) {
14173 BundleMember->Dependencies++;
14174 ScheduleData *DestBundle = UseSD->FirstInBundle;
14175 if (!DestBundle->IsScheduled)
14176 BundleMember->incrementUnscheduledDeps(Incr: 1);
14177 if (!DestBundle->hasValidDependencies())
14178 WorkList.push_back(Elt: DestBundle);
14179 }
14180 }
14181 }
14182
14183 auto MakeControlDependent = [&](Instruction *I) {
14184 auto *DepDest = getScheduleData(I);
14185 assert(DepDest && "must be in schedule window");
14186 DepDest->ControlDependencies.push_back(Elt: BundleMember);
14187 BundleMember->Dependencies++;
14188 ScheduleData *DestBundle = DepDest->FirstInBundle;
14189 if (!DestBundle->IsScheduled)
14190 BundleMember->incrementUnscheduledDeps(Incr: 1);
14191 if (!DestBundle->hasValidDependencies())
14192 WorkList.push_back(Elt: DestBundle);
14193 };
14194
      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
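      // For illustration only (hypothetical IR): given
      //   call void @may_not_return()
      //   store i32 %v, ptr %p
      // the store is not safe to speculate and must not be scheduled above
      // the call, because the call might never transfer execution to it.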
14198 if (!isGuaranteedToTransferExecutionToSuccessor(I: BundleMember->Inst)) {
14199 for (Instruction *I = BundleMember->Inst->getNextNode();
14200 I != ScheduleEnd; I = I->getNextNode()) {
14201 if (isSafeToSpeculativelyExecute(I, CtxI: &*BB->begin(), AC: SLP->AC))
14202 continue;
14203
14204 // Add the dependency
14205 MakeControlDependent(I);
14206
14207 if (!isGuaranteedToTransferExecutionToSuccessor(I))
14208 // Everything past here must be control dependent on I.
14209 break;
14210 }
14211 }
14212
14213 if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave. We also need to prevent any alloca
        // from being reordered above a preceding stackrestore.
14217 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14218 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14219 for (Instruction *I = BundleMember->Inst->getNextNode();
14220 I != ScheduleEnd; I = I->getNextNode()) {
14221 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14222 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14223 // Any allocas past here must be control dependent on I, and I
              // must be memory dependent on BundleMember->Inst.
14225 break;
14226
14227 if (!isa<AllocaInst>(Val: I))
14228 continue;
14229
14230 // Add the dependency
14231 MakeControlDependent(I);
14232 }
14233 }
14234
        // In addition to the cases handled just above, we need to prevent
        // allocas and loads/stores from moving below a stacksave or a
        // stackrestore. Avoiding moving allocas below a stackrestore is
        // currently thought to be merely conservative, but moving loads/stores
        // below a stackrestore can lead to incorrect code.
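        // For illustration only (hypothetical IR): given
        //   %ss = call ptr @llvm.stacksave()
        //   %a  = alloca i32
        //   store i32 0, ptr %a
        //   call void @llvm.stackrestore(ptr %ss)
        // the store must not sink below the stackrestore, since the restore
        // may deallocate the stack memory that %a occupies.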
14240 if (isa<AllocaInst>(Val: BundleMember->Inst) ||
14241 BundleMember->Inst->mayReadOrWriteMemory()) {
14242 for (Instruction *I = BundleMember->Inst->getNextNode();
14243 I != ScheduleEnd; I = I->getNextNode()) {
14244 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
14245 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14246 continue;
14247
14248 // Add the dependency
14249 MakeControlDependent(I);
14250 break;
14251 }
14252 }
14253 }
14254
14255 // Handle the memory dependencies (if any).
14256 ScheduleData *DepDest = BundleMember->NextLoadStore;
14257 if (!DepDest)
14258 continue;
14259 Instruction *SrcInst = BundleMember->Inst;
      assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non-memory-affecting bundle?");
14262 MemoryLocation SrcLoc = getLocation(I: SrcInst);
14263 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14264 unsigned NumAliased = 0;
14265 unsigned DistToSrc = 1;
14266
14267 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14268 assert(isInSchedulingRegion(DepDest));
14269
14270 // We have two limits to reduce the complexity:
14271 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14272 // SLP->isAliased (which is the expensive part in this loop).
14273 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14274 // the whole loop (even if the loop is fast, it's quadratic).
14275 // It's important for the loop break condition (see below) to
14276 // check this limit even between two read-only instructions.
14277 if (DistToSrc >= MaxMemDepDistance ||
14278 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14279 (NumAliased >= AliasedCheckLimit ||
14280 SLP->isAliased(Loc1: SrcLoc, Inst1: SrcInst, Inst2: DepDest->Inst)))) {
14281
14282 // We increment the counter only if the locations are aliased
14283 // (instead of counting all alias checks). This gives a better
14284 // balance between reduced runtime and accurate dependencies.
14285 NumAliased++;
14286
14287 DepDest->MemoryDependencies.push_back(Elt: BundleMember);
14288 BundleMember->Dependencies++;
14289 ScheduleData *DestBundle = DepDest->FirstInBundle;
14290 if (!DestBundle->IsScheduled) {
14291 BundleMember->incrementUnscheduledDeps(Incr: 1);
14292 }
14293 if (!DestBundle->hasValidDependencies()) {
14294 WorkList.push_back(Elt: DestBundle);
14295 }
14296 }
14297
14298 // Example, explaining the loop break condition: Let's assume our
14299 // starting instruction is i0 and MaxMemDepDistance = 3.
14300 //
14301 // +--------v--v--v
14302 // i0,i1,i2,i3,i4,i5,i6,i7,i8
14303 // +--------^--^--^
14304 //
14305 // MaxMemDepDistance let us stop alias-checking at i3 and we add
14306 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
14307 // Previously we already added dependencies from i3 to i6,i7,i8
14308 // (because of MaxMemDepDistance). As we added a dependency from
14309 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
14310 // and we can abort this loop at i6.
14311 if (DistToSrc >= 2 * MaxMemDepDistance)
14312 break;
14313 DistToSrc++;
14314 }
14315 }
14316 if (InsertInReadyList && SD->isReady()) {
14317 ReadyInsts.insert(X: SD);
14318 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
14319 << "\n");
14320 }
14321 }
14322}
14323
14324void BoUpSLP::BlockScheduling::resetSchedule() {
14325 assert(ScheduleStart &&
14326 "tried to reset schedule on block which has not been scheduled");
14327 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14328 doForAllOpcodes(V: I, Action: [&](ScheduleData *SD) {
14329 assert(isInSchedulingRegion(SD) &&
14330 "ScheduleData not in scheduling region");
14331 SD->IsScheduled = false;
14332 SD->resetUnscheduledDeps();
14333 });
14334 }
14335 ReadyInsts.clear();
14336}
14337
14338void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14339 if (!BS->ScheduleStart)
14340 return;
14341
14342 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
14343
14344 // A key point - if we got here, pre-scheduling was able to find a valid
14345 // scheduling of the sub-graph of the scheduling window which consists
14346 // of all vector bundles and their transitive users. As such, we do not
14347 // need to reschedule anything *outside of* that subgraph.
14348
14349 BS->resetSchedule();
14350
14351 // For the real scheduling we use a more sophisticated ready-list: it is
14352 // sorted by the original instruction location. This lets the final schedule
14353 // be as close as possible to the original instruction order.
14354 // WARNING: If changing this order causes a correctness issue, that means
14355 // there is some missing dependence edge in the schedule data graph.
14356 struct ScheduleDataCompare {
14357 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
14358 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14359 }
14360 };
14361 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14362
14363 // Ensure that all dependency data is updated (for nodes in the sub-graph)
14364 // and fill the ready-list with initial instructions.
14365 int Idx = 0;
14366 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
14367 I = I->getNextNode()) {
14368 BS->doForAllOpcodes(V: I, Action: [this, &Idx, BS](ScheduleData *SD) {
14369 TreeEntry *SDTE = getTreeEntry(V: SD->Inst);
14370 (void)SDTE;
14371 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
14372 SD->isPartOfBundle() ==
14373 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
14374 "scheduler and vectorizer bundle mismatch");
14375 SD->FirstInBundle->SchedulingPriority = Idx++;
14376
14377 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14378 BS->calculateDependencies(SD, InsertInReadyList: false, SLP: this);
14379 });
14380 }
14381 BS->initialFillReadyList(ReadyList&: ReadyInsts);
14382
14383 Instruction *LastScheduledInst = BS->ScheduleEnd;
14384
14385 // Do the "real" scheduling.
14386 while (!ReadyInsts.empty()) {
14387 ScheduleData *Picked = *ReadyInsts.begin();
14388 ReadyInsts.erase(position: ReadyInsts.begin());
14389
14390 // Move the scheduled instruction(s) to their dedicated places, if not
14391 // there yet.
14392 for (ScheduleData *BundleMember = Picked; BundleMember;
14393 BundleMember = BundleMember->NextInBundle) {
14394 Instruction *PickedInst = BundleMember->Inst;
14395 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
14396 PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
14397 LastScheduledInst = PickedInst;
14398 }
14399
14400 BS->schedule(SD: Picked, ReadyList&: ReadyInsts);
14401 }
14402
14403 // Check that we didn't break any of our invariants.
14404#ifdef EXPENSIVE_CHECKS
14405 BS->verify();
14406#endif
14407
14408#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
14409 // Check that all schedulable entities got scheduled
14410 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
14411 BS->doForAllOpcodes(V: I, Action: [&](ScheduleData *SD) {
14412 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
14413 assert(SD->IsScheduled && "must be scheduled at this point");
14414 }
14415 });
14416 }
14417#endif
14418
14419 // Avoid duplicate scheduling of the block.
14420 BS->ScheduleStart = nullptr;
14421}
14422
14423unsigned BoUpSLP::getVectorElementSize(Value *V) {
14424 // If V is a store, just return the width of the stored value (or value
14425 // truncated just before storing) without traversing the expression tree.
14426 // This is the common case.
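  // For illustration only (hypothetical IR): for
  //   %t = trunc i32 %x to i8
  //   store i8 %t, ptr %p
  // the returned element size is 8, the width of the stored (truncated)
  // value, not the 32 bits of %x.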
14427 if (auto *Store = dyn_cast<StoreInst>(Val: V))
14428 return DL->getTypeSizeInBits(Ty: Store->getValueOperand()->getType());
14429
14430 if (auto *IEI = dyn_cast<InsertElementInst>(Val: V))
14431 return getVectorElementSize(V: IEI->getOperand(i_nocapture: 1));
14432
14433 auto E = InstrElementSize.find(Val: V);
14434 if (E != InstrElementSize.end())
14435 return E->second;
14436
14437 // If V is not a store, we can traverse the expression tree to find loads
14438 // that feed it. The type of the loaded value may indicate a more suitable
14439 // width than V's type. We want to base the vector element size on the width
14440 // of memory operations where possible.
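  // For illustration only (hypothetical IR): for
  //   %a = load i16, ptr %p
  //   %b = sext i16 %a to i32
  //   %c = add i32 %b, %d
  // the walk from %c reaches the i16 load, so 16 is preferred over the
  // 32-bit width of %c itself.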
14441 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
14442 SmallPtrSet<Instruction *, 16> Visited;
14443 if (auto *I = dyn_cast<Instruction>(Val: V)) {
14444 Worklist.emplace_back(Args&: I, Args: I->getParent(), Args: 0);
14445 Visited.insert(Ptr: I);
14446 }
14447
14448 // Traverse the expression tree in bottom-up order looking for loads. If we
14449 // encounter an instruction we don't yet handle, we give up.
14450 auto Width = 0u;
14451 Value *FirstNonBool = nullptr;
14452 while (!Worklist.empty()) {
14453 auto [I, Parent, Level] = Worklist.pop_back_val();
14454
14455 // We should only be looking at scalar instructions here. If the current
14456 // instruction has a vector type, skip.
14457 auto *Ty = I->getType();
14458 if (isa<VectorType>(Val: Ty))
14459 continue;
14460 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
14461 FirstNonBool = I;
14462 if (Level > RecursionMaxDepth)
14463 continue;
14464
    // If the current instruction is a load, update Width to reflect the
    // width of the loaded value.
14467 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: I))
14468 Width = std::max<unsigned>(a: Width, b: DL->getTypeSizeInBits(Ty));
14469
    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and is either from the same basic
    // block as the user or the user is a PHI node, we add it to the worklist.
14474 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
14475 BinaryOperator, UnaryOperator>(Val: I)) {
14476 for (Use &U : I->operands()) {
14477 if (auto *J = dyn_cast<Instruction>(Val: U.get()))
14478 if (Visited.insert(Ptr: J).second &&
14479 (isa<PHINode>(Val: I) || J->getParent() == Parent)) {
14480 Worklist.emplace_back(Args&: J, Args: J->getParent(), Args: Level + 1);
14481 continue;
14482 }
14483 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
14484 FirstNonBool = U.get();
14485 }
14486 } else {
14487 break;
14488 }
14489 }
14490
14491 // If we didn't encounter a memory access in the expression tree, or if we
14492 // gave up for some reason, just return the width of V. Otherwise, return the
14493 // maximum width we found.
14494 if (!Width) {
14495 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
14496 V = FirstNonBool;
14497 Width = DL->getTypeSizeInBits(Ty: V->getType());
14498 }
14499
14500 for (Instruction *I : Visited)
14501 InstrElementSize[I] = Width;
14502
14503 return Width;
14504}
14505
14506bool BoUpSLP::collectValuesToDemote(
14507 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
14508 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
14509 unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
14510 bool IsTruncRoot) const {
14511 // We can always demote constants.
14512 if (all_of(Range: E.Scalars, P: IsaPred<Constant>))
14513 return true;
14514
14515 unsigned OrigBitWidth = DL->getTypeSizeInBits(Ty: E.Scalars.front()->getType());
14516 if (OrigBitWidth == BitWidth) {
14517 MaxDepthLevel = 1;
14518 return true;
14519 }
14520
14521 // If the value is not a vectorized instruction in the expression and not used
14522 // by the insertelement instruction and not used in multiple vector nodes, it
14523 // cannot be demoted.
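  // For illustration only (hypothetical IR): in
  //   %x = zext i8 %a to i32
  //   %y = zext i8 %b to i32
  //   %s = add i32 %x, %y
  //   %t = trunc i32 %s to i8
  // only the low 8 bits of %s are used, so the add (and its operands) can be
  // demoted to i8.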
14524 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
14525 if (MultiNodeScalars.contains(Val: V))
14526 return false;
14527 if (OrigBitWidth > BitWidth) {
14528 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
14529 if (MaskedValueIsZero(V, Mask, DL: SimplifyQuery(*DL)))
14530 return true;
14531 }
14532 auto NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
14533 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
14534 bool IsSigned = !isKnownNonNegative(V, SQ: SimplifyQuery(*DL));
14535 if (IsSigned)
14536 ++BitWidth1;
14537 if (auto *I = dyn_cast<Instruction>(Val: V)) {
14538 APInt Mask = DB->getDemandedBits(I);
14539 unsigned BitWidth2 =
14540 std::max<unsigned>(a: 1, b: Mask.getBitWidth() - Mask.countl_zero());
14541 while (!IsSigned && BitWidth2 < OrigBitWidth) {
14542 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth2 - 1);
14543 if (MaskedValueIsZero(V, Mask, DL: SimplifyQuery(*DL)))
14544 break;
14545 BitWidth2 *= 2;
14546 }
14547 BitWidth1 = std::min(a: BitWidth1, b: BitWidth2);
14548 }
14549 BitWidth = std::max(a: BitWidth, b: BitWidth1);
14550 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
14551 };
14552 using namespace std::placeholders;
14553 auto FinalAnalysis = [&]() {
14554 if (!IsProfitableToDemote)
14555 return false;
14556 bool Res = all_of(
14557 Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1, args: std::ref(t&: BitWidth)));
14558 // Gather demoted constant operands.
14559 if (Res && E.State == TreeEntry::NeedToGather &&
14560 all_of(Range: E.Scalars, P: IsaPred<Constant>))
14561 ToDemote.push_back(Elt: E.Idx);
14562 return Res;
14563 };
14564 // TODO: improve handling of gathered values and others.
14565 if (E.State == TreeEntry::NeedToGather || !Visited.insert(V: &E).second ||
14566 any_of(Range: E.Scalars, P: [&](Value *V) {
14567 return all_of(Range: V->users(), P: [&](User *U) {
14568 return isa<InsertElementInst>(Val: U) && !getTreeEntry(V: U);
14569 });
14570 }))
14571 return FinalAnalysis();
14572
14573 if (any_of(Range: E.Scalars, P: [&](Value *V) {
14574 return !all_of(Range: V->users(), P: [=](User *U) {
14575 return getTreeEntry(V: U) ||
14576 (UserIgnoreList && UserIgnoreList->contains(V: U)) ||
14577 (!isa<CmpInst>(Val: U) && U->getType()->isSized() &&
14578 !U->getType()->isScalableTy() &&
14579 DL->getTypeSizeInBits(Ty: U->getType()) <= BitWidth);
14580 }) && !IsPotentiallyTruncated(V, BitWidth);
14581 }))
14582 return false;
14583
14584 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
14585 bool &NeedToExit) {
14586 NeedToExit = false;
14587 unsigned InitLevel = MaxDepthLevel;
14588 for (const TreeEntry *Op : Operands) {
14589 unsigned Level = InitLevel;
14590 if (!collectValuesToDemote(E: *Op, IsProfitableToDemoteRoot, BitWidth,
14591 ToDemote, Visited, MaxDepthLevel&: Level, IsProfitableToDemote,
14592 IsTruncRoot)) {
14593 if (!IsProfitableToDemote)
14594 return false;
14595 NeedToExit = true;
14596 if (!FinalAnalysis())
14597 return false;
14598 continue;
14599 }
14600 MaxDepthLevel = std::max(a: MaxDepthLevel, b: Level);
14601 }
14602 return true;
14603 };
14604 auto AttemptCheckBitwidth =
14605 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
14607 NeedToExit = false;
14608 unsigned BestFailBitwidth = 0;
14609 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
14610 if (Checker(BitWidth, OrigBitWidth))
14611 return true;
14612 if (BestFailBitwidth == 0 && FinalAnalysis())
14613 BestFailBitwidth = BitWidth;
14614 }
14615 if (BitWidth >= OrigBitWidth) {
14616 if (BestFailBitwidth == 0) {
14617 BitWidth = OrigBitWidth;
14618 return false;
14619 }
14620 MaxDepthLevel = 1;
14621 BitWidth = BestFailBitwidth;
14622 NeedToExit = true;
14623 return true;
14624 }
14625 return false;
14626 };
14627 auto TryProcessInstruction =
14628 [&](unsigned &BitWidth,
14629 ArrayRef<const TreeEntry *> Operands = std::nullopt,
14630 function_ref<bool(unsigned, unsigned)> Checker = {}) {
14631 if (Operands.empty()) {
14632 if (!IsTruncRoot)
14633 MaxDepthLevel = 1;
14634 (void)for_each(Range: E.Scalars, F: std::bind(f&: IsPotentiallyTruncated, args: _1,
14635 args: std::ref(t&: BitWidth)));
14636 } else {
          // If the entry has several vectorized uses, check if we can
          // truncate it; otherwise exit.
14639 if (E.UserTreeIndices.size() > 1 &&
14640 !all_of(Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1,
14641 args: std::ref(t&: BitWidth))))
14642 return false;
14643 bool NeedToExit = false;
14644 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
14645 return false;
14646 if (NeedToExit)
14647 return true;
14648 if (!ProcessOperands(Operands, NeedToExit))
14649 return false;
14650 if (NeedToExit)
14651 return true;
14652 }
14653
14654 ++MaxDepthLevel;
14655 // Record the entry that we can demote.
14656 ToDemote.push_back(Elt: E.Idx);
14657 return IsProfitableToDemote;
14658 };
14659 switch (E.getOpcode()) {
14660
14661 // We can always demote truncations and extensions. Since truncations can
14662 // seed additional demotion, we save the truncated value.
14663 case Instruction::Trunc:
14664 if (IsProfitableToDemoteRoot)
14665 IsProfitableToDemote = true;
14666 return TryProcessInstruction(BitWidth);
14667 case Instruction::ZExt:
14668 case Instruction::SExt:
14669 IsProfitableToDemote = true;
14670 return TryProcessInstruction(BitWidth);
14671
14672 // We can demote certain binary operations if we can demote both of their
14673 // operands.
14674 case Instruction::Add:
14675 case Instruction::Sub:
14676 case Instruction::Mul:
14677 case Instruction::And:
14678 case Instruction::Or:
14679 case Instruction::Xor: {
14680 return TryProcessInstruction(
14681 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)});
14682 }
14683 case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // in-range amount, we can always perform the SHL in a smaller type.
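    // For illustration only (hypothetical values): truncating the result of
    //   %r = shl i32 %x, 3
    // to i16 is fine because the shift amount 3 is known to be less than 16,
    // whereas a shift by 20 could not be performed in i16.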
14686 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
14687 return all_of(Range: E.Scalars, P: [&](Value *V) {
14688 auto *I = cast<Instruction>(Val: V);
14689 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
14690 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth);
14691 });
14692 };
14693 return TryProcessInstruction(
14694 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, ShlChecker);
14695 }
14696 case Instruction::LShr: {
14697 // If this is a truncate of a logical shr, we can truncate it to a smaller
14698 // lshr iff we know that the bits we would otherwise be shifting in are
14699 // already zeros.
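    // For illustration only (hypothetical values): truncating
    //   %r = lshr i32 %x, 4
    // to i16 requires (per the check below) that bits 16..31 of %x are known
    // to be zero, so the bits shifted into the low 16 bits are already zeros.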
14700 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14701 return all_of(Range: E.Scalars, P: [&](Value *V) {
14702 auto *I = cast<Instruction>(Val: V);
14703 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
14704 APInt ShiftedBits = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
14705 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
14706 MaskedValueIsZero(V: I->getOperand(i: 0), Mask: ShiftedBits,
14707 DL: SimplifyQuery(*DL));
14708 });
14709 };
14710 return TryProcessInstruction(
14711 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
14712 LShrChecker);
14713 }
14714 case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits between the sign bit of the
    // original type and the sign bit of the truncated type are the same.
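    // For illustration only (hypothetical values): truncating
    //   %r = ashr i32 %x, 2
    // to i16 is possible if %x has more than 16 sign bits, i.e. bits 15..31
    // are all copies of the sign bit, so the i16 ashr produces the same low
    // 16 bits.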
14718 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14719 return all_of(Range: E.Scalars, P: [&](Value *V) {
14720 auto *I = cast<Instruction>(Val: V);
14721 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
14722 unsigned ShiftedBits = OrigBitWidth - BitWidth;
14723 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
14724 ShiftedBits < ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, Depth: 0, AC,
14725 CxtI: nullptr, DT);
14726 });
14727 };
14728 return TryProcessInstruction(
14729 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
14730 AShrChecker);
14731 }
14732 case Instruction::UDiv:
14733 case Instruction::URem: {
14734 // UDiv and URem can be truncated if all the truncated bits are zero.
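    // For illustration only (hypothetical values): a udiv of two i32 operands
    // whose upper 16 bits are all known to be zero yields the same result
    // when performed in i16, so it can be demoted together with its operands.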
14735 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14736 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
14737 return all_of(Range: E.Scalars, P: [&](Value *V) {
14738 auto *I = cast<Instruction>(Val: V);
14739 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
14740 return MaskedValueIsZero(V: I->getOperand(i: 0), Mask, DL: SimplifyQuery(*DL)) &&
14741 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, DL: SimplifyQuery(*DL));
14742 });
14743 };
14744 return TryProcessInstruction(
14745 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, Checker);
14746 }
14747
14748 // We can demote selects if we can demote their true and false values.
14749 case Instruction::Select: {
14750 return TryProcessInstruction(
14751 BitWidth, {getOperandEntry(E: &E, Idx: 1), getOperandEntry(E: &E, Idx: 2)});
14752 }
14753
14754 // We can demote phis if we can demote all their incoming operands. Note that
14755 // we don't need to worry about cycles since we ensure single use above.
14756 case Instruction::PHI: {
14757 const unsigned NumOps = E.getNumOperands();
14758 SmallVector<const TreeEntry *> Ops(NumOps);
14759 transform(Range: seq<unsigned>(Begin: 0, End: NumOps), d_first: Ops.begin(),
14760 F: std::bind(f: &BoUpSLP::getOperandEntry, args: this, args: &E, args: _1));
14761
14762 return TryProcessInstruction(BitWidth, Ops);
14763 }
14764
14765 case Instruction::Call: {
14766 auto *IC = dyn_cast<IntrinsicInst>(Val: E.getMainOp());
14767 if (!IC)
14768 break;
14769 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: IC, TLI);
14770 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
14771 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
14772 break;
14773 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(E: &E, Idx: 0));
14774 function_ref<bool(unsigned, unsigned)> CallChecker;
14775 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14776 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
14777 return all_of(Range: E.Scalars, P: [&](Value *V) {
14778 auto *I = cast<Instruction>(Val: V);
14779 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
14780 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
14781 return MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
14782 DL: SimplifyQuery(*DL)) &&
14783 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, DL: SimplifyQuery(*DL));
14784 }
14785 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
14786 "Expected min/max intrinsics only.");
14787 unsigned SignBits = OrigBitWidth - BitWidth;
14788 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
14789 return SignBits <= ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, Depth: 0, AC,
14790 CxtI: nullptr, DT) &&
14791 (!isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL)) ||
14792 MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
14793 DL: SimplifyQuery(*DL))) &&
14794 SignBits <= ComputeNumSignBits(Op: I->getOperand(i: 1), DL: *DL, Depth: 0, AC,
14795 CxtI: nullptr, DT) &&
14796 (!isKnownNonNegative(V: I->getOperand(i: 1), SQ: SimplifyQuery(*DL)) ||
14797 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, DL: SimplifyQuery(*DL)));
14798 });
14799 };
14800 if (ID != Intrinsic::abs) {
14801 Operands.push_back(Elt: getOperandEntry(E: &E, Idx: 1));
14802 CallChecker = CompChecker;
14803 }
14804 InstructionCost BestCost =
14805 std::numeric_limits<InstructionCost::CostType>::max();
14806 unsigned BestBitWidth = BitWidth;
14807 unsigned VF = E.Scalars.size();
14808 // Choose the best bitwidth based on cost estimations.
14809 auto Checker = [&](unsigned BitWidth, unsigned) {
14810 unsigned MinBW = PowerOf2Ceil(A: BitWidth);
14811 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(CI: IC, ID, VF, MinBW);
14812 auto VecCallCosts = getVectorCallCosts(
14813 CI: IC,
14814 VecTy: FixedVectorType::get(ElementType: IntegerType::get(C&: IC->getContext(), NumBits: MinBW), NumElts: VF),
14815 TTI, TLI, ArgTys);
14816 InstructionCost Cost = std::min(a: VecCallCosts.first, b: VecCallCosts.second);
14817 if (Cost < BestCost) {
14818 BestCost = Cost;
14819 BestBitWidth = BitWidth;
14820 }
14821 return false;
14822 };
14823 [[maybe_unused]] bool NeedToExit;
14824 (void)AttemptCheckBitwidth(Checker, NeedToExit);
14825 BitWidth = BestBitWidth;
14826 return TryProcessInstruction(BitWidth, Operands, CallChecker);
14827 }
14828
14829 // Otherwise, conservatively give up.
14830 default:
14831 break;
14832 }
14833 MaxDepthLevel = 1;
14834 return FinalAnalysis();
14835}
14836
14837static RecurKind getRdxKind(Value *V);
14838
14839void BoUpSLP::computeMinimumValueSizes() {
14840 // We only attempt to truncate integer expressions.
14841 bool IsStoreOrInsertElt =
14842 VectorizableTree.front()->getOpcode() == Instruction::Store ||
14843 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
14844 if ((IsStoreOrInsertElt || UserIgnoreList) &&
14845 ExtraBitWidthNodes.size() <= 1 &&
14846 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
14847 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
14848 return;
14849
14850 unsigned NodeIdx = 0;
14851 if (IsStoreOrInsertElt &&
14852 VectorizableTree.front()->State != TreeEntry::NeedToGather)
14853 NodeIdx = 1;
14854
14855 // Ensure the roots of the vectorizable tree don't form a cycle.
14856 if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
14857 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
14858 (NodeIdx != 0 && any_of(Range&: VectorizableTree[NodeIdx]->UserTreeIndices,
14859 P: [NodeIdx](const EdgeInfo &EI) {
14860 return EI.UserTE->Idx >
14861 static_cast<int>(NodeIdx);
14862 })))
14863 return;
14864
  // If the first value node for a store/insertelement is a sext/zext/trunc,
  // skip it and resize to the final type.
14867 bool IsTruncRoot = false;
14868 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
14869 SmallVector<unsigned> RootDemotes;
14870 if (NodeIdx != 0 &&
14871 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
14872 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
14873 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
14874 IsTruncRoot = true;
14875 RootDemotes.push_back(Elt: NodeIdx);
14876 IsProfitableToDemoteRoot = true;
14877 ++NodeIdx;
14878 }
14879
  // If the reduction was already analyzed and is not profitable, exit.
14881 if (AnalyzedMinBWVals.contains(V: VectorizableTree[NodeIdx]->Scalars.front()))
14882 return;
14883
14884 SmallVector<unsigned> ToDemote;
14885 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
14886 bool IsProfitableToDemoteRoot, unsigned Opcode,
14887 unsigned Limit, bool IsTruncRoot,
14888 bool IsSignedCmp) {
14889 ToDemote.clear();
14890 unsigned VF = E.getVectorFactor();
14891 auto *TreeRootIT = dyn_cast<IntegerType>(Val: E.Scalars.front()->getType());
14892 if (!TreeRootIT || !Opcode)
14893 return 0u;
14894
14895 if (any_of(Range: E.Scalars,
14896 P: [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
14897 return 0u;
14898
14899 unsigned NumParts =
14900 TTI->getNumberOfParts(Tp: FixedVectorType::get(ElementType: TreeRootIT, NumElts: VF));
14901
14902 // The maximum bit width required to represent all the values that can be
14903 // demoted without loss of precision. It would be safe to truncate the roots
14904 // of the expression to this width.
14905 unsigned MaxBitWidth = 1u;
14906
    // True if the roots can be zero-extended back to their original type
    // rather than sign-extended. If the leading bits are not demanded, we can
    // safely zero-extend, so IsKnownPositive is true only if the sign bit of
    // every root is known to be zero (and this is not a signed compare).
14913 bool IsKnownPositive = !IsSignedCmp && all_of(Range: E.Scalars, P: [&](Value *R) {
14914 KnownBits Known = computeKnownBits(V: R, DL: *DL);
14915 return Known.isNonNegative();
14916 });
14917
14918 // We first check if all the bits of the roots are demanded. If they're not,
14919 // we can truncate the roots to this narrower type.
14920 for (Value *Root : E.Scalars) {
14921 unsigned NumSignBits = ComputeNumSignBits(Op: Root, DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
14922 TypeSize NumTypeBits = DL->getTypeSizeInBits(Ty: Root->getType());
14923 unsigned BitWidth1 = NumTypeBits - NumSignBits;
14924 // If we can't prove that the sign bit is zero, we must add one to the
14925 // maximum bit width to account for the unknown sign bit. This preserves
14926 // the existing sign bit so we can safely sign-extend the root back to the
14927 // original type. Otherwise, if we know the sign bit is zero, we will
14928 // zero-extend the root instead.
14929 //
14930 // FIXME: This is somewhat suboptimal, as there will be cases where adding
14931 // one to the maximum bit width will yield a larger-than-necessary
14932 // type. In general, we need to add an extra bit only if we can't
14933 // prove that the upper bit of the original type is equal to the
14934 // upper bit of the proposed smaller type. If these two bits are
14935 // the same (either zero or one) we know that sign-extending from
14936 // the smaller type will result in the same value. Here, since we
14937 // can't yet prove this, we are just making the proposed smaller
14938 // type larger to ensure correctness.
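      // For illustration only (hypothetical numbers): a root of type i32 with
      // 25 known sign bits gives BitWidth1 = 32 - 25 = 7; if the sign bit is
      // not known to be zero, one extra bit is added, giving 8.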
14939 if (!IsKnownPositive)
14940 ++BitWidth1;
14941
14942 APInt Mask = DB->getDemandedBits(I: cast<Instruction>(Val: Root));
14943 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
14944 MaxBitWidth =
14945 std::max<unsigned>(a: std::min(a: BitWidth1, b: BitWidth2), b: MaxBitWidth);
14946 }
14947
14948 if (MaxBitWidth < 8 && MaxBitWidth > 1)
14949 MaxBitWidth = 8;
14950
    // If the original type is large but the reduced type does not decrease
    // the number of vector registers used, ignore it.
14953 if (NumParts > 1 &&
14954 NumParts ==
14955 TTI->getNumberOfParts(Tp: FixedVectorType::get(
14956 ElementType: IntegerType::get(C&: F->getContext(), NumBits: bit_ceil(Value: MaxBitWidth)), NumElts: VF)))
14957 return 0u;
14958
14959 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
14960 Opcode == Instruction::SExt ||
14961 Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the entries that can be demoted in ToDemote.
14965 DenseSet<const TreeEntry *> Visited;
14966 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
14967 bool NeedToDemote = IsProfitableToDemote;
14968
14969 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, BitWidth&: MaxBitWidth,
14970 ToDemote, Visited, MaxDepthLevel, IsProfitableToDemote&: NeedToDemote,
14971 IsTruncRoot) ||
14972 (MaxDepthLevel <= Limit &&
14973 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
14974 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
14975 DL->getTypeSizeInBits(Ty: TreeRootIT) /
14976 DL->getTypeSizeInBits(Ty: cast<Instruction>(Val: E.Scalars.front())
14977 ->getOperand(i: 0)
14978 ->getType()) >
14979 2)))))
14980 return 0u;
14981 // Round MaxBitWidth up to the next power-of-two.
14982 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
14983
14984 return MaxBitWidth;
14985 };
14986
14987 // If we can truncate the root, we must collect additional values that might
14988 // be demoted as a result. That is, those seeded by truncations we will
14989 // modify.
14990 // Add reduction ops sizes, if any.
14991 if (UserIgnoreList &&
14992 isa<IntegerType>(Val: VectorizableTree.front()->Scalars.front()->getType())) {
14993 for (Value *V : *UserIgnoreList) {
14994 auto NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
14995 auto NumTypeBits = DL->getTypeSizeInBits(Ty: V->getType());
14996 unsigned BitWidth1 = NumTypeBits - NumSignBits;
14997 if (!isKnownNonNegative(V, SQ: SimplifyQuery(*DL)))
14998 ++BitWidth1;
14999 unsigned BitWidth2 = BitWidth1;
15000 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: ::getRdxKind(V))) {
15001 auto Mask = DB->getDemandedBits(I: cast<Instruction>(Val: V));
15002 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15003 }
15004 ReductionBitWidth =
15005 std::max(a: std::min(a: BitWidth1, b: BitWidth2), b: ReductionBitWidth);
15006 }
15007 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15008 ReductionBitWidth = 8;
15009
15010 ReductionBitWidth = bit_ceil(Value: ReductionBitWidth);
15011 }
15012 bool IsTopRoot = NodeIdx == 0;
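// Walk past any leading vectorizable Trunc nodes: record them as demotion
// root candidates in RootDemotes and remember that the remaining chain is
// rooted at a truncation.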
15013 while (NodeIdx < VectorizableTree.size() &&
15014 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15015 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15016 RootDemotes.push_back(Elt: NodeIdx);
15017 ++NodeIdx;
15018 IsTruncRoot = true;
15019 }
15020 bool IsSignedCmp = false;
15021 while (NodeIdx < VectorizableTree.size()) {
15022 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
15023 unsigned Limit = 2;
15024 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15025 if (IsTopRoot &&
15026 ReductionBitWidth ==
15027 DL->getTypeSizeInBits(
15028 Ty: VectorizableTree.front()->Scalars.front()->getType()))
15029 Limit = 3;
15030 unsigned MaxBitWidth = ComputeMaxBitWidth(
15031 *VectorizableTree[NodeIdx].get(), IsTopRoot, IsProfitableToDemoteRoot,
15032 Opcode, Limit, IsTruncRoot, IsSignedCmp);
15033 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
15034 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15035 ReductionBitWidth = bit_ceil(Value: MaxBitWidth);
15036 else if (MaxBitWidth == 0)
15037 ReductionBitWidth = 0;
15038 }
15039
15040 for (unsigned Idx : RootDemotes) {
15041 if (all_of(Range&: VectorizableTree[Idx]->Scalars, P: [&](Value *V) {
15042 uint32_t OrigBitWidth = DL->getTypeSizeInBits(Ty: V->getType());
15043 if (OrigBitWidth > MaxBitWidth) {
15044 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: MaxBitWidth);
15045 return MaskedValueIsZero(V, Mask, DL: SimplifyQuery(*DL));
15046 }
15047 return false;
15048 }))
15049 ToDemote.push_back(Elt: Idx);
15050 }
15051 RootDemotes.clear();
15052 IsTopRoot = false;
15053 IsProfitableToDemoteRoot = true;
15054
15055 if (ExtraBitWidthNodes.empty()) {
15056 NodeIdx = VectorizableTree.size();
15057 } else {
15058 unsigned NewIdx = 0;
15059 do {
15060 NewIdx = *ExtraBitWidthNodes.begin();
15061 ExtraBitWidthNodes.erase(I: ExtraBitWidthNodes.begin());
15062 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
15063 NodeIdx = NewIdx;
15064 IsTruncRoot =
15065 NodeIdx < VectorizableTree.size() &&
15066 any_of(Range&: VectorizableTree[NodeIdx]->UserTreeIndices,
15067 P: [](const EdgeInfo &EI) {
15068 return EI.EdgeIdx == 0 &&
15069 EI.UserTE->getOpcode() == Instruction::Trunc &&
15070 !EI.UserTE->isAltShuffle();
15071 });
15072 IsSignedCmp =
15073 NodeIdx < VectorizableTree.size() &&
15074 any_of(Range&: VectorizableTree[NodeIdx]->UserTreeIndices,
15075 P: [](const EdgeInfo &EI) {
15076 return EI.UserTE->getOpcode() == Instruction::ICmp &&
15077 any_of(Range&: EI.UserTE->Scalars, P: [](Value *V) {
15078 auto *IC = dyn_cast<ICmpInst>(Val: V);
15079 return IC && IC->isSigned();
15080 });
15081 });
15082 }
15083
15084 // If the maximum bit width we compute is less than the width of the roots'
15085 // type, we can proceed with the narrowing. Otherwise, do nothing.
15086 if (MaxBitWidth == 0 ||
15087 MaxBitWidth >=
15088 cast<IntegerType>(Val: TreeRoot.front()->getType())->getBitWidth()) {
15089 if (UserIgnoreList)
15090 AnalyzedMinBWVals.insert(I: TreeRoot.begin(), E: TreeRoot.end());
15091 continue;
15092 }
15093
15094 // Finally, map the values we can demote to the maximum bit width we
15095 // computed.
15096 for (unsigned Idx : ToDemote) {
15097 TreeEntry *TE = VectorizableTree[Idx].get();
15098 if (MinBWs.contains(Val: TE))
15099 continue;
15100 bool IsSigned = TE->getOpcode() == Instruction::SExt ||
15101 any_of(Range&: TE->Scalars, P: [&](Value *R) {
15102 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
15103 });
15104 MinBWs.try_emplace(Key: TE, Args&: MaxBitWidth, Args&: IsSigned);
15105 }
15106 }
15107}
15108
15109PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
15110 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
15111 auto *TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
15112 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(IR&: F);
15113 auto *AA = &AM.getResult<AAManager>(IR&: F);
15114 auto *LI = &AM.getResult<LoopAnalysis>(IR&: F);
15115 auto *DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
15116 auto *AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
15117 auto *DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
15118 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
15119
15120 bool Changed = runImpl(F, SE_: SE, TTI_: TTI, TLI_: TLI, AA_: AA, LI_: LI, DT_: DT, AC_: AC, DB_: DB, ORE_: ORE);
15121 if (!Changed)
15122 return PreservedAnalyses::all();
15123
15124 PreservedAnalyses PA;
15125 PA.preserveSet<CFGAnalyses>();
15126 return PA;
15127}
15128
15129bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
15130 TargetTransformInfo *TTI_,
15131 TargetLibraryInfo *TLI_, AAResults *AA_,
15132 LoopInfo *LI_, DominatorTree *DT_,
15133 AssumptionCache *AC_, DemandedBits *DB_,
15134 OptimizationRemarkEmitter *ORE_) {
15135 if (!RunSLPVectorization)
15136 return false;
15137 SE = SE_;
15138 TTI = TTI_;
15139 TLI = TLI_;
15140 AA = AA_;
15141 LI = LI_;
15142 DT = DT_;
15143 AC = AC_;
15144 DB = DB_;
15145 DL = &F.getParent()->getDataLayout();
15146
15147 Stores.clear();
15148 GEPs.clear();
15149 bool Changed = false;
15150
15151 // If the target claims to have no vector registers, don't attempt
15152 // vectorization.
15153 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true))) {
15154 LLVM_DEBUG(
15155 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
15156 return false;
15157 }
15158
15159 // Don't vectorize when the attribute NoImplicitFloat is used.
15160 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
15161 return false;
15162
15163 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
15164
15165 // Use the bottom-up SLP vectorizer to construct chains that start with
15166 // store instructions.
15167 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
15168
15169 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
15170 // delete instructions.
15171
15172 // Update DFS numbers now so that we can use them for ordering.
15173 DT->updateDFSNumbers();
15174
15175 // Scan the blocks in the function in post order.
15176 for (auto *BB : post_order(G: &F.getEntryBlock())) {
15177 // Start new block - clear the list of reduction roots.
15178 R.clearReductionData();
15179 collectSeedInstructions(BB);
15180
15181 // Vectorize trees that end at stores.
15182 if (!Stores.empty()) {
15183 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
15184 << " underlying objects.\n");
15185 Changed |= vectorizeStoreChains(R);
15186 }
15187
15188 // Vectorize trees that end at reductions.
15189 Changed |= vectorizeChainsInBlock(BB, R);
15190
15191 // Vectorize the index computations of getelementptr instructions. This
15192 // is primarily intended to catch gather-like idioms ending at
15193 // non-consecutive loads.
15194 if (!GEPs.empty()) {
15195 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
15196 << " underlying objects.\n");
15197 Changed |= vectorizeGEPIndices(BB, R);
15198 }
15199 }
15200
15201 if (Changed) {
15202 R.optimizeGatherSequence();
15203 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
15204 }
15205 return Changed;
15206}
15207
15208bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15209 unsigned Idx, unsigned MinVF) {
15210 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
15211 << "\n");
15212 const unsigned Sz = R.getVectorElementSize(V: Chain[0]);
15213 unsigned VF = Chain.size();
15214
15215 if (!isPowerOf2_32(Value: Sz) || !isPowerOf2_32(Value: VF) || VF < 2 || VF < MinVF) {
15216 // Check if vectorizing with a non-power-of-2 VF should be considered. At
15217 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15218 // all vector lanes are used.
15219 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15220 return false;
15221 }
15222
15223 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15224 << "\n");
15225
15226 R.buildTree(Roots: Chain);
15227 if (R.isTreeTinyAndNotFullyVectorizable())
15228 return false;
15229 if (R.isLoadCombineCandidate())
15230 return false;
15231 R.reorderTopToBottom();
15232 R.reorderBottomToTop();
15233 R.buildExternalUses();
15234
15235 R.computeMinimumValueSizes();
15236 R.transformNodes();
15237
15238 InstructionCost Cost = R.getTreeCost();
15239
15240 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
15241 if (Cost < -SLPCostThreshold) {
15242 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
15243
15244 using namespace ore;
15245
15246 R.getORE()->emit(OptDiag&: OptimizationRemark(SV_NAME, "StoresVectorized",
15247 cast<StoreInst>(Val: Chain[0]))
15248 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
15249 << " and with tree size "
15250 << NV("TreeSize", R.getTreeSize()));
15251
15252 R.vectorizeTree();
15253 return true;
15254 }
15255
15256 return false;
15257}
15258
15259bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
15260 BoUpSLP &R) {
15261 // We may run into multiple chains that merge into a single chain. We mark the
15262 // stores that we vectorized so that we don't visit the same store twice.
15263 BoUpSLP::ValueSet VectorizedStores;
15264 bool Changed = false;
15265
15266 // Stores the pairs of stores (first_store, last_store) in a range that we
15267 // have already tried to vectorize. This allows skipping store ranges whose
15268 // vectorization attempts were unsuccessful.
15269 DenseSet<std::pair<Value *, Value *>> TriedSequences;
15270 struct StoreDistCompare {
15271 bool operator()(const std::pair<unsigned, int> &Op1,
15272 const std::pair<unsigned, int> &Op2) const {
15273 return Op1.second < Op2.second;
15274 }
15275 };
15276 // A set of pairs (index of the store in the Stores array ref, distance of the
15277 // store address relative to the base store address, in units).
15278 using StoreIndexToDistSet =
15279 std::set<std::pair<unsigned, int>, StoreDistCompare>;
15280 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
15281 int PrevDist = -1;
15282 BoUpSLP::ValueList Operands;
15283 // Collect the chain into a list.
15284 for (auto [Idx, Data] : enumerate(First: Set)) {
15285 if (Operands.empty() || Data.second - PrevDist == 1) {
15286 Operands.push_back(Elt: Stores[Data.first]);
15287 PrevDist = Data.second;
15288 if (Idx != Set.size() - 1)
15289 continue;
15290 }
15291 auto E = make_scope_exit(F: [&, &DataVar = Data]() {
15292 Operands.clear();
15293 Operands.push_back(Elt: Stores[DataVar.first]);
15294 PrevDist = DataVar.second;
15295 });
15296
15297 if (Operands.size() <= 1)
15298 continue;
15299
15300 unsigned MaxVecRegSize = R.getMaxVecRegSize();
15301 unsigned EltSize = R.getVectorElementSize(V: Operands[0]);
15302 unsigned MaxElts = llvm::bit_floor(Value: MaxVecRegSize / EltSize);
15303
15304 unsigned MaxVF =
15305 std::min(a: R.getMaximumVF(ElemWidth: EltSize, Opcode: Instruction::Store), b: MaxElts);
15306 auto *Store = cast<StoreInst>(Val: Operands[0]);
15307 Type *StoreTy = Store->getValueOperand()->getType();
15308 Type *ValueTy = StoreTy;
15309 if (auto *Trunc = dyn_cast<TruncInst>(Val: Store->getValueOperand()))
15310 ValueTy = Trunc->getSrcTy();
15311 unsigned MinVF = PowerOf2Ceil(A: TTI->getStoreMinimumVF(
15312 VF: R.getMinVF(Sz: DL->getTypeStoreSizeInBits(Ty: StoreTy)), ScalarMemTy: StoreTy, ScalarValTy: ValueTy));
15313
15314 if (MaxVF < MinVF) {
15315 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
15316 << ") < "
15317 << "MinVF (" << MinVF << ")\n");
15318 continue;
15319 }
15320
15321 unsigned NonPowerOf2VF = 0;
15322 if (VectorizeNonPowerOf2) {
15323 // First try vectorizing with a non-power-of-2 VF. At the moment, only
15324 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
15325 // lanes are used.
15326 unsigned CandVF = Operands.size();
15327 if (isPowerOf2_32(Value: CandVF + 1) && CandVF <= MaxVF)
15328 NonPowerOf2VF = CandVF;
15329 }
15330
15331 unsigned Sz = 1 + Log2_32(Value: MaxVF) - Log2_32(Value: MinVF);
15332 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
15333 unsigned Size = MinVF;
15334 for_each(Range: reverse(C&: CandidateVFs), F: [&](unsigned &VF) {
15335 VF = Size > MaxVF ? NonPowerOf2VF : Size;
15336 Size *= 2;
15337 });
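// For illustration only (hypothetical values): with MinVF = 2, MaxVF = 16 and
// NonPowerOf2VF = 7, CandidateVFs ends up as {7, 16, 8, 4, 2}, i.e. the
// optional non-power-of-2 factor first, followed by the powers of two from
// MaxVF down to MinVF, which is the order the loop below tries them in.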
15338 unsigned StartIdx = 0;
15339 for (unsigned Size : CandidateVFs) {
15340 for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
15341 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(N: Cnt, M: Size);
15342 assert(
15343 all_of(
15344 Slice,
15345 [&](Value *V) {
15346 return cast<StoreInst>(V)->getValueOperand()->getType() ==
15347 cast<StoreInst>(Slice.front())
15348 ->getValueOperand()
15349 ->getType();
15350 }) &&
15351 "Expected all operands of same type.");
15352 if (!VectorizedStores.count(Ptr: Slice.front()) &&
15353 !VectorizedStores.count(Ptr: Slice.back()) &&
15354 TriedSequences.insert(V: std::make_pair(x: Slice.front(), y: Slice.back()))
15355 .second &&
15356 vectorizeStoreChain(Chain: Slice, R, Idx: Cnt, MinVF)) {
15357 // Mark the vectorized stores so that we don't vectorize them again.
15358 VectorizedStores.insert(I: Slice.begin(), E: Slice.end());
15359 Changed = true;
15360 // If we vectorized the initial block, there is no need to try to
15361 // vectorize it again.
15362 if (Cnt == StartIdx)
15363 StartIdx += Size;
15364 Cnt += Size;
15365 continue;
15366 }
15367 ++Cnt;
15368 }
15369 // Check if the whole array was vectorized already - exit.
15370 if (StartIdx >= Operands.size())
15371 break;
15372 }
15373 }
15374 };
15375
15376 // Stores pairs (first: index of the store in the Stores array ref whose
15377 // address is taken as the base, second: sorted set of {index, dist} pairs,
15378 // which are indices of stores in the set and their store location distances
15379 // relative to the base address).
15380
15381 // Need to store the index of the very first store separately, since the set
15382 // may be reordered after the insertion and the first store may be moved. This
15383 // container reduces the number of calls to the getPointersDiff() function.
15384 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
15385 // Inserts the specified store SI with the given index Idx into the set of
15386 // stores. If a store with the same distance has already been found, stop the
15387 // insertion and try to vectorize the stores found so far. If some stores from
15388 // this sequence were not vectorized, try to vectorize them together with the
15389 // new store later. This logic is applied only to the stores that come before
15390 // the previous store with the same distance.
15391 // Example:
15392 // 1. store x, %p
15393 // 2. store y, %p+1
15394 // 3. store z, %p+2
15395 // 4. store a, %p
15396 // 5. store b, %p+3
15397 // - Scan this from the last to first store. The very first bunch of stores is
15398 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
15399 // vector).
15400 // - The next store in the list - #1 - has the same distance from store #5 as
15401 // the store #4.
15402 // - Try to vectorize sequence of stores 4,2,3,5.
15403 // - If all these stores are vectorized - just drop them.
15404 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
15405 // - Start new stores sequence.
15406 // The new bunch of stores is {1, {1, 0}}.
15407 // - Add the stores from the previous sequence that were not vectorized.
15408 // Here we consider the stores in reverse order, rather than the order in
15409 // which they appear in the IR (Stores are reversed already, see vectorizeStoreChains()).
15410 // Store #3 can be added -> comes after store #4 with the same distance as
15411 // store #1.
15412 // Store #5 cannot be added - comes before store #4.
15413 // This logic improves compile time: we assume that the stores coming after a
15414 // previous store with the same distance most likely have memory dependencies,
15415 // so there is no need to waste compile time trying to vectorize them.
15416 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
15417 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
15418 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
15419 std::optional<int> Diff = getPointersDiff(
15420 ElemTyA: Stores[Set.first]->getValueOperand()->getType(),
15421 PtrA: Stores[Set.first]->getPointerOperand(),
15422 ElemTyB: SI->getValueOperand()->getType(), PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE,
15423 /*StrictCheck=*/true);
15424 if (!Diff)
15425 continue;
15426 auto It = Set.second.find(x: std::make_pair(x&: Idx, y&: *Diff));
15427 if (It == Set.second.end()) {
15428 Set.second.emplace(args&: Idx, args&: *Diff);
15429 return;
15430 }
15431 // Try to vectorize the first found set to avoid duplicate analysis.
15432 TryToVectorize(Set.second);
15433 StoreIndexToDistSet PrevSet;
15434 PrevSet.swap(x&: Set.second);
15435 Set.first = Idx;
15436 Set.second.emplace(args&: Idx, args: 0);
15437 // Insert stores that followed previous match to try to vectorize them
15438 // with this store.
15439 unsigned StartIdx = It->first + 1;
15440 SmallBitVector UsedStores(Idx - StartIdx);
15441 // Distances to previously found dup store (or this store, since they
15442 // store to the same addresses).
15443 SmallVector<int> Dists(Idx - StartIdx, 0);
15444 for (const std::pair<unsigned, int> &Pair : reverse(C&: PrevSet)) {
15445 // Do not try to vectorize sequences we have already tried.
15446 if (Pair.first <= It->first ||
15447 VectorizedStores.contains(Ptr: Stores[Pair.first]))
15448 break;
15449 unsigned BI = Pair.first - StartIdx;
15450 UsedStores.set(BI);
15451 Dists[BI] = Pair.second - It->second;
15452 }
15453 for (unsigned I = StartIdx; I < Idx; ++I) {
15454 unsigned BI = I - StartIdx;
15455 if (UsedStores.test(Idx: BI))
15456 Set.second.emplace(args&: I, args&: Dists[BI]);
15457 }
15458 return;
15459 }
15460 auto &Res = SortedStores.emplace_back();
15461 Res.first = Idx;
15462 Res.second.emplace(args&: Idx, args: 0);
15463 };
15464 StoreInst *PrevStore = Stores.front();
15465 for (auto [I, SI] : enumerate(First&: Stores)) {
15466 // Check that we do not try to vectorize stores of different types.
15467 if (PrevStore->getValueOperand()->getType() !=
15468 SI->getValueOperand()->getType()) {
15469 for (auto &Set : SortedStores)
15470 TryToVectorize(Set.second);
15471 SortedStores.clear();
15472 PrevStore = SI;
15473 }
15474 FillStoresSet(I, SI);
15475 }
15476
15477 // Final vectorization attempt.
15478 for (auto &Set : SortedStores)
15479 TryToVectorize(Set.second);
15480
15481 return Changed;
15482}
15483
15484void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
15485 // Initialize the collections. We will make a single pass over the block.
15486 Stores.clear();
15487 GEPs.clear();
15488
15489 // Visit the store and getelementptr instructions in BB and organize them in
15490 // Stores and GEPs according to the underlying objects of their pointer
15491 // operands.
15492 for (Instruction &I : *BB) {
15493 // Ignore store instructions that are volatile or have a pointer operand
15494 // that doesn't point to a scalar type.
15495 if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
15496 if (!SI->isSimple())
15497 continue;
15498 if (!isValidElementType(Ty: SI->getValueOperand()->getType()))
15499 continue;
15500 Stores[getUnderlyingObject(V: SI->getPointerOperand())].push_back(Elt: SI);
15501 }
15502
15503 // Ignore getelementptr instructions that have more than one index, a
15504 // constant index, or a pointer operand that doesn't point to a scalar
15505 // type.
15506 else if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) {
15507 if (GEP->getNumIndices() != 1)
15508 continue;
15509 Value *Idx = GEP->idx_begin()->get();
15510 if (isa<Constant>(Val: Idx))
15511 continue;
15512 if (!isValidElementType(Ty: Idx->getType()))
15513 continue;
15514 if (GEP->getType()->isVectorTy())
15515 continue;
15516 GEPs[GEP->getPointerOperand()].push_back(Elt: GEP);
15517 }
15518 }
15519}
15520
15521bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
15522 bool MaxVFOnly) {
15523 if (VL.size() < 2)
15524 return false;
15525
15526 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
15527 << VL.size() << ".\n");
15528
15529 // Check that all of the parts are instructions of the same type;
15530 // we permit an alternate opcode via InstructionsState.
15531 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
15532 if (!S.getOpcode())
15533 return false;
15534
15535 Instruction *I0 = cast<Instruction>(Val: S.OpValue);
15536 // Make sure invalid types (including vector types) are rejected before
15537 // determining the vectorization factor for scalar instructions.
15538 for (Value *V : VL) {
15539 Type *Ty = V->getType();
15540 if (!isa<InsertElementInst>(Val: V) && !isValidElementType(Ty)) {
15541 // NOTE: the following will give the user an internal LLVM type name, which
15542 // may not be useful.
15543 R.getORE()->emit(RemarkBuilder: [&]() {
15544 std::string TypeStr;
15545 llvm::raw_string_ostream rso(TypeStr);
15546 Ty->print(O&: rso);
15547 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
15548 << "Cannot SLP vectorize list: type "
15549 << rso.str() + " is unsupported by vectorizer";
15550 });
15551 return false;
15552 }
15553 }
15554
15555 unsigned Sz = R.getVectorElementSize(V: I0);
15556 unsigned MinVF = R.getMinVF(Sz);
15557 unsigned MaxVF = std::max<unsigned>(a: llvm::bit_floor(Value: VL.size()), b: MinVF);
15558 MaxVF = std::min(a: R.getMaximumVF(ElemWidth: Sz, Opcode: S.getOpcode()), b: MaxVF);
15559 if (MaxVF < 2) {
15560 R.getORE()->emit(RemarkBuilder: [&]() {
15561 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
15562 << "Cannot SLP vectorize list: vectorization factor "
15563 << "less than 2 is not supported";
15564 });
15565 return false;
15566 }
15567
15568 bool Changed = false;
15569 bool CandidateFound = false;
15570 InstructionCost MinCost = SLPCostThreshold.getValue();
15571 Type *ScalarTy = VL[0]->getType();
15572 if (auto *IE = dyn_cast<InsertElementInst>(Val: VL[0]))
15573 ScalarTy = IE->getOperand(i_nocapture: 1)->getType();
15574
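// For illustration only (hypothetical values): with 7 scalars, MinVF = 2 and
// MaxVF = 4, the loops below try VF = 4 and then VF = 2, sliding a window of
// ActualVF operations over VL and vectorizing each profitable slice.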
15575 unsigned NextInst = 0, MaxInst = VL.size();
15576 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
15577 // No actual vectorization should happen if the number of parts is the same
15578 // as the provided vectorization factor (i.e. the scalar type is used for the
15579 // vector code during codegen).
15580 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VF);
15581 if (TTI->getNumberOfParts(Tp: VecTy) == VF)
15582 continue;
15583 for (unsigned I = NextInst; I < MaxInst; ++I) {
15584 unsigned ActualVF = std::min(a: MaxInst - I, b: VF);
15585
15586 if (!isPowerOf2_32(Value: ActualVF))
15587 continue;
15588
15589 if (MaxVFOnly && ActualVF < MaxVF)
15590 break;
15591 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
15592 break;
15593
15594 ArrayRef<Value *> Ops = VL.slice(N: I, M: ActualVF);
15595 // Check that a previous iteration of this loop did not delete the Value.
15596 if (llvm::any_of(Range&: Ops, P: [&R](Value *V) {
15597 auto *I = dyn_cast<Instruction>(Val: V);
15598 return I && R.isDeleted(I);
15599 }))
15600 continue;
15601
15602 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
15603 << "\n");
15604
15605 R.buildTree(Roots: Ops);
15606 if (R.isTreeTinyAndNotFullyVectorizable())
15607 continue;
15608 R.reorderTopToBottom();
15609 R.reorderBottomToTop(
15610 /*IgnoreReorder=*/!isa<InsertElementInst>(Val: Ops.front()) &&
15611 !R.doesRootHaveInTreeUses());
15612 R.buildExternalUses();
15613
15614 R.computeMinimumValueSizes();
15615 R.transformNodes();
15616 InstructionCost Cost = R.getTreeCost();
15617 CandidateFound = true;
15618 MinCost = std::min(a: MinCost, b: Cost);
15619
15620 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
15621 << " for VF=" << ActualVF << "\n");
15622 if (Cost < -SLPCostThreshold) {
15623 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
15624 R.getORE()->emit(OptDiag&: OptimizationRemark(SV_NAME, "VectorizedList",
15625 cast<Instruction>(Val: Ops[0]))
15626 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
15627 << " and with tree size "
15628 << ore::NV("TreeSize", R.getTreeSize()));
15629
15630 R.vectorizeTree();
15631 // Move to the next bundle.
15632 I += VF - 1;
15633 NextInst = I + 1;
15634 Changed = true;
15635 }
15636 }
15637 }
15638
15639 if (!Changed && CandidateFound) {
15640 R.getORE()->emit(RemarkBuilder: [&]() {
15641 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
15642 << "List vectorization was possible but not beneficial with cost "
15643 << ore::NV("Cost", MinCost) << " >= "
15644 << ore::NV("Treshold", -SLPCostThreshold);
15645 });
15646 } else if (!Changed) {
15647 R.getORE()->emit(RemarkBuilder: [&]() {
15648 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
15649 << "Cannot SLP vectorize list: vectorization was impossible"
15650 << " with available vectorization factors";
15651 });
15652 }
15653 return Changed;
15654}
15655
15656bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
15657 if (!I)
15658 return false;
15659
15660 if (!isa<BinaryOperator, CmpInst>(Val: I) || isa<VectorType>(Val: I->getType()))
15661 return false;
15662
15663 Value *P = I->getParent();
15664
15665 // Vectorize in current basic block only.
15666 auto *Op0 = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
15667 auto *Op1 = dyn_cast<Instruction>(Val: I->getOperand(i: 1));
15668 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
15669 return false;
15670
15671 // First collect all possible candidates
15672 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
15673 Candidates.emplace_back(Args&: Op0, Args&: Op1);
15674
15675 auto *A = dyn_cast<BinaryOperator>(Val: Op0);
15676 auto *B = dyn_cast<BinaryOperator>(Val: Op1);
15677 // Try to skip B.
15678 if (A && B && B->hasOneUse()) {
15679 auto *B0 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 0));
15680 auto *B1 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 1));
15681 if (B0 && B0->getParent() == P)
15682 Candidates.emplace_back(Args&: A, Args&: B0);
15683 if (B1 && B1->getParent() == P)
15684 Candidates.emplace_back(Args&: A, Args&: B1);
15685 }
15686 // Try to skip A.
15687 if (B && A && A->hasOneUse()) {
15688 auto *A0 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 0));
15689 auto *A1 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 1));
15690 if (A0 && A0->getParent() == P)
15691 Candidates.emplace_back(Args&: A0, Args&: B);
15692 if (A1 && A1->getParent() == P)
15693 Candidates.emplace_back(Args&: A1, Args&: B);
15694 }
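// For illustration: if I is add(A, B), where both A and B are binary operators
// in this block and B = add(B0, B1) has a single use (with B0/B1 also binary
// operators in this block), the candidate pairs are (A, B), (A, B0) and
// (A, B1); the symmetric case tries to skip A instead.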
15695
15696 if (Candidates.size() == 1)
15697 return tryToVectorizeList(VL: {Op0, Op1}, R);
15698
15699 // We have multiple options. Try to pick the single best.
15700 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
15701 if (!BestCandidate)
15702 return false;
15703 return tryToVectorizeList(
15704 VL: {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
15705}
15706
15707namespace {
15708
15709/// Model horizontal reductions.
15710///
15711/// A horizontal reduction is a tree of reduction instructions that has values
15712/// that can be put into a vector as its leaves. For example:
15713///
15714/// mul mul mul mul
15715/// \ / \ /
15716/// + +
15717/// \ /
15718/// +
15719/// This tree has "mul" as its leaf values and "+" as its reduction
15720/// instructions. A reduction can feed into a store or a binary operation
15721/// feeding a phi.
15722/// ...
15723/// \ /
15724/// +
15725/// |
15726/// phi +=
15727///
15728/// Or:
15729/// ...
15730/// \ /
15731/// +
15732/// |
15733/// *p =
15734///
15735class HorizontalReduction {
15736 using ReductionOpsType = SmallVector<Value *, 16>;
15737 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
15738 ReductionOpsListType ReductionOps;
15739 /// List of possibly reduced values.
15740 SmallVector<SmallVector<Value *>> ReducedVals;
15741 /// Maps reduced value to the corresponding reduction operation.
15742 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
15743 // Use a MapVector to make the output stable.
15744 MapVector<Instruction *, Value *> ExtraArgs;
15745 WeakTrackingVH ReductionRoot;
15746 /// The type of reduction operation.
15747 RecurKind RdxKind;
15748 /// Checks if the optimization of original scalar identity operations on
15749 /// matched horizontal reductions is enabled and allowed.
15750 bool IsSupportedHorRdxIdentityOp = false;
15751
15752 static bool isCmpSelMinMax(Instruction *I) {
15753 return match(V: I, P: m_Select(C: m_Cmp(), L: m_Value(), R: m_Value())) &&
15754 RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: getRdxKind(V: I));
15755 }
15756
15757 // And/or are potentially poison-safe logical patterns like:
15758 // select x, y, false
15759 // select x, true, y
15760 static bool isBoolLogicOp(Instruction *I) {
15761 return isa<SelectInst>(Val: I) &&
15762 (match(V: I, P: m_LogicalAnd()) || match(V: I, P: m_LogicalOr()));
15763 }
15764
15765 /// Checks if instruction is associative and can be vectorized.
15766 static bool isVectorizable(RecurKind Kind, Instruction *I) {
15767 if (Kind == RecurKind::None)
15768 return false;
15769
15770 // Integer ops that map to select instructions or intrinsics are fine.
15771 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
15772 isBoolLogicOp(I))
15773 return true;
15774
15775 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
15776 // FP min/max are associative except for NaN and -0.0. We do not
15777 // have to rule out -0.0 here because the intrinsic semantics do not
15778 // specify a fixed result for it.
15779 return I->getFastMathFlags().noNaNs();
15780 }
15781
15782 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
15783 return true;
15784
15785 return I->isAssociative();
15786 }
15787
15788 static Value *getRdxOperand(Instruction *I, unsigned Index) {
15789 // Poison-safe 'or' takes the form: select X, true, Y
15790 // To make that work with the normal operand processing, we skip the
15791 // true value operand.
15792 // TODO: Change the code and data structures to handle this without a hack.
15793 if (getRdxKind(V: I) == RecurKind::Or && isa<SelectInst>(Val: I) && Index == 1)
15794 return I->getOperand(i: 2);
15795 return I->getOperand(i: Index);
15796 }
15797
15798 /// Creates reduction operation with the current opcode.
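/// For illustration, for RecurKind::SMax with UseSelect the emitted IR is
/// roughly:
///   %cmp = icmp sgt %lhs, %rhs
///   %res = select %cmp, %lhs, %rhs
/// and without UseSelect a single call to the llvm.smax intrinsic is created.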
15799 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
15800 Value *RHS, const Twine &Name, bool UseSelect) {
15801 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
15802 switch (Kind) {
15803 case RecurKind::Or:
15804 if (UseSelect &&
15805 LHS->getType() == CmpInst::makeCmpResultType(opnd_type: LHS->getType()))
15806 return Builder.CreateSelect(C: LHS, True: Builder.getTrue(), False: RHS, Name);
15807 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
15808 Name);
15809 case RecurKind::And:
15810 if (UseSelect &&
15811 LHS->getType() == CmpInst::makeCmpResultType(opnd_type: LHS->getType()))
15812 return Builder.CreateSelect(C: LHS, True: RHS, False: Builder.getFalse(), Name);
15813 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
15814 Name);
15815 case RecurKind::Add:
15816 case RecurKind::Mul:
15817 case RecurKind::Xor:
15818 case RecurKind::FAdd:
15819 case RecurKind::FMul:
15820 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
15821 Name);
15822 case RecurKind::FMax:
15823 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: maxnum, LHS, RHS);
15824 case RecurKind::FMin:
15825 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: minnum, LHS, RHS);
15826 case RecurKind::FMaximum:
15827 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: maximum, LHS, RHS);
15828 case RecurKind::FMinimum:
15829 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: minimum, LHS, RHS);
15830 case RecurKind::SMax:
15831 if (UseSelect) {
15832 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
15833 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
15834 }
15835 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: smax, LHS, RHS);
15836 case RecurKind::SMin:
15837 if (UseSelect) {
15838 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
15839 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
15840 }
15841 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: smin, LHS, RHS);
15842 case RecurKind::UMax:
15843 if (UseSelect) {
15844 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
15845 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
15846 }
15847 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: umax, LHS, RHS);
15848 case RecurKind::UMin:
15849 if (UseSelect) {
15850 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
15851 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
15852 }
15853 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: umin, LHS, RHS);
15854 default:
15855 llvm_unreachable("Unknown reduction operation.");
15856 }
15857 }
15858
15859 /// Creates reduction operation with the current opcode with the IR flags
15860 /// from \p ReductionOps, dropping nuw/nsw flags.
15861 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
15862 Value *RHS, const Twine &Name,
15863 const ReductionOpsListType &ReductionOps) {
15864 bool UseSelect = ReductionOps.size() == 2 ||
15865 // Logical or/and.
15866 (ReductionOps.size() == 1 &&
15867 any_of(Range: ReductionOps.front(), P: IsaPred<SelectInst>));
15868 assert((!UseSelect || ReductionOps.size() != 2 ||
15869 isa<SelectInst>(ReductionOps[1][0])) &&
15870 "Expected cmp + select pairs for reduction");
15871 Value *Op = createOp(Builder, Kind: RdxKind, LHS, RHS, Name, UseSelect);
15872 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: RdxKind)) {
15873 if (auto *Sel = dyn_cast<SelectInst>(Val: Op)) {
15874 propagateIRFlags(I: Sel->getCondition(), VL: ReductionOps[0], OpValue: nullptr,
15875 /*IncludeWrapFlags=*/false);
15876 propagateIRFlags(I: Op, VL: ReductionOps[1], OpValue: nullptr,
15877 /*IncludeWrapFlags=*/false);
15878 return Op;
15879 }
15880 }
15881 propagateIRFlags(I: Op, VL: ReductionOps[0], OpValue: nullptr, /*IncludeWrapFlags=*/false);
15882 return Op;
15883 }
15884
15885public:
15886 static RecurKind getRdxKind(Value *V) {
15887 auto *I = dyn_cast<Instruction>(Val: V);
15888 if (!I)
15889 return RecurKind::None;
15890 if (match(V: I, P: m_Add(L: m_Value(), R: m_Value())))
15891 return RecurKind::Add;
15892 if (match(V: I, P: m_Mul(L: m_Value(), R: m_Value())))
15893 return RecurKind::Mul;
15894 if (match(V: I, P: m_And(L: m_Value(), R: m_Value())) ||
15895 match(V: I, P: m_LogicalAnd(L: m_Value(), R: m_Value())))
15896 return RecurKind::And;
15897 if (match(V: I, P: m_Or(L: m_Value(), R: m_Value())) ||
15898 match(V: I, P: m_LogicalOr(L: m_Value(), R: m_Value())))
15899 return RecurKind::Or;
15900 if (match(V: I, P: m_Xor(L: m_Value(), R: m_Value())))
15901 return RecurKind::Xor;
15902 if (match(V: I, P: m_FAdd(L: m_Value(), R: m_Value())))
15903 return RecurKind::FAdd;
15904 if (match(V: I, P: m_FMul(L: m_Value(), R: m_Value())))
15905 return RecurKind::FMul;
15906
15907 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
15908 return RecurKind::FMax;
15909 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
15910 return RecurKind::FMin;
15911
15912 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
15913 return RecurKind::FMaximum;
15914 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
15915 return RecurKind::FMinimum;
15916 // This matches either cmp+select or intrinsics. SLP is expected to handle
15917 // either form.
15918 // TODO: If we are canonicalizing to intrinsics, we can remove several
15919 // special-case paths that deal with selects.
15920 if (match(V: I, P: m_SMax(L: m_Value(), R: m_Value())))
15921 return RecurKind::SMax;
15922 if (match(V: I, P: m_SMin(L: m_Value(), R: m_Value())))
15923 return RecurKind::SMin;
15924 if (match(V: I, P: m_UMax(L: m_Value(), R: m_Value())))
15925 return RecurKind::UMax;
15926 if (match(V: I, P: m_UMin(L: m_Value(), R: m_Value())))
15927 return RecurKind::UMin;
15928
15929 if (auto *Select = dyn_cast<SelectInst>(Val: I)) {
15930 // Try harder: look for a min/max pattern based on instructions producing
15931 // the same values, such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
15932 // During the intermediate stages of SLP, it's very common to have a
15933 // pattern like this (since optimizeGatherSequence is run only once
15934 // at the end):
15935 // %1 = extractelement <2 x i32> %a, i32 0
15936 // %2 = extractelement <2 x i32> %a, i32 1
15937 // %cond = icmp sgt i32 %1, %2
15938 // %3 = extractelement <2 x i32> %a, i32 0
15939 // %4 = extractelement <2 x i32> %a, i32 1
15940 // %select = select i1 %cond, i32 %3, i32 %4
15941 CmpInst::Predicate Pred;
15942 Instruction *L1;
15943 Instruction *L2;
15944
15945 Value *LHS = Select->getTrueValue();
15946 Value *RHS = Select->getFalseValue();
15947 Value *Cond = Select->getCondition();
15948
15949 // TODO: Support inverse predicates.
15950 if (match(V: Cond, P: m_Cmp(Pred, L: m_Specific(V: LHS), R: m_Instruction(I&: L2)))) {
15951 if (!isa<ExtractElementInst>(Val: RHS) ||
15952 !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
15953 return RecurKind::None;
15954 } else if (match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Specific(V: RHS)))) {
15955 if (!isa<ExtractElementInst>(Val: LHS) ||
15956 !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)))
15957 return RecurKind::None;
15958 } else {
15959 if (!isa<ExtractElementInst>(Val: LHS) || !isa<ExtractElementInst>(Val: RHS))
15960 return RecurKind::None;
15961 if (!match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Instruction(I&: L2))) ||
15962 !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)) ||
15963 !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
15964 return RecurKind::None;
15965 }
15966
15967 switch (Pred) {
15968 default:
15969 return RecurKind::None;
15970 case CmpInst::ICMP_SGT:
15971 case CmpInst::ICMP_SGE:
15972 return RecurKind::SMax;
15973 case CmpInst::ICMP_SLT:
15974 case CmpInst::ICMP_SLE:
15975 return RecurKind::SMin;
15976 case CmpInst::ICMP_UGT:
15977 case CmpInst::ICMP_UGE:
15978 return RecurKind::UMax;
15979 case CmpInst::ICMP_ULT:
15980 case CmpInst::ICMP_ULE:
15981 return RecurKind::UMin;
15982 }
15983 }
15984 return RecurKind::None;
15985 }
15986
15987 /// Get the index of the first operand.
15988 static unsigned getFirstOperandIndex(Instruction *I) {
15989 return isCmpSelMinMax(I) ? 1 : 0;
15990 }
15991
15992private:
15993 /// Total number of operands in the reduction operation.
15994 static unsigned getNumberOfOperands(Instruction *I) {
15995 return isCmpSelMinMax(I) ? 3 : 2;
15996 }
15997
15998 /// Checks if the instruction is in basic block \p BB.
15999 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
16000 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
16001 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
16002 auto *Sel = cast<SelectInst>(Val: I);
16003 auto *Cmp = dyn_cast<Instruction>(Val: Sel->getCondition());
16004 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
16005 }
16006 return I->getParent() == BB;
16007 }
16008
16009 /// Expected number of uses for reduction operations/reduced values.
16010 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
16011 if (IsCmpSelMinMax) {
16012 // The SelectInst must be used twice, while the condition op must have a
16013 // single use only.
16014 if (auto *Sel = dyn_cast<SelectInst>(Val: I))
16015 return Sel->hasNUses(N: 2) && Sel->getCondition()->hasOneUse();
16016 return I->hasNUses(N: 2);
16017 }
16018
16019 // Arithmetic reduction operation must be used once only.
16020 return I->hasOneUse();
16021 }
16022
16023 /// Initializes the list of reduction operations.
16024 void initReductionOps(Instruction *I) {
16025 if (isCmpSelMinMax(I))
16026 ReductionOps.assign(NumElts: 2, Elt: ReductionOpsType());
16027 else
16028 ReductionOps.assign(NumElts: 1, Elt: ReductionOpsType());
16029 }
16030
16031 /// Add all reduction operations for the reduction instruction \p I.
16032 void addReductionOps(Instruction *I) {
16033 if (isCmpSelMinMax(I)) {
16034 ReductionOps[0].emplace_back(Args: cast<SelectInst>(Val: I)->getCondition());
16035 ReductionOps[1].emplace_back(Args&: I);
16036 } else {
16037 ReductionOps[0].emplace_back(Args&: I);
16038 }
16039 }
16040
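/// Checks whether the group of reduced values \p Data looks profitable to
/// reduce: more than one value, a constant, or a single non-load instruction
/// whose opcode is valid for alternation.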
16041 static bool isGoodForReduction(ArrayRef<Value *> Data) {
16042 int Sz = Data.size();
16043 auto *I = dyn_cast<Instruction>(Val: Data.front());
16044 return Sz > 1 || isConstant(V: Data.front()) ||
16045 (I && !isa<LoadInst>(Val: I) && isValidForAlternation(Opcode: I->getOpcode()));
16046 }
16047
16048public:
16049 HorizontalReduction() = default;
16050
16051 /// Try to find a reduction tree.
16052 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
16053 ScalarEvolution &SE, const DataLayout &DL,
16054 const TargetLibraryInfo &TLI) {
16055 RdxKind = HorizontalReduction::getRdxKind(V: Root);
16056 if (!isVectorizable(Kind: RdxKind, I: Root))
16057 return false;
16058
16059 // Analyze "regular" integer/FP types for reductions - no target-specific
16060 // types or pointers.
16061 Type *Ty = Root->getType();
16062 if (!isValidElementType(Ty) || Ty->isPointerTy())
16063 return false;
16064
16065 // Though the ultimate reduction may have multiple uses, its condition must
16066 // have only a single use.
16067 if (auto *Sel = dyn_cast<SelectInst>(Val: Root))
16068 if (!Sel->getCondition()->hasOneUse())
16069 return false;
16070
16071 ReductionRoot = Root;
16072
16073 // Iterate through all the operands of the possible reduction tree and
16074 // gather all the reduced values, sorting them by their value id.
16075 BasicBlock *BB = Root->getParent();
16076 bool IsCmpSelMinMax = isCmpSelMinMax(I: Root);
16077 SmallVector<Instruction *> Worklist(1, Root);
16078 // Checks if the operands of the \p TreeN instruction are also reduction
16079 // operations or should be treated as reduced values or an extra argument,
16080 // which is not part of the reduction.
16081 auto CheckOperands = [&](Instruction *TreeN,
16082 SmallVectorImpl<Value *> &ExtraArgs,
16083 SmallVectorImpl<Value *> &PossibleReducedVals,
16084 SmallVectorImpl<Instruction *> &ReductionOps) {
16085 for (int I = getFirstOperandIndex(I: TreeN),
16086 End = getNumberOfOperands(I: TreeN);
16087 I < End; ++I) {
16088 Value *EdgeVal = getRdxOperand(I: TreeN, Index: I);
16089 ReducedValsToOps[EdgeVal].push_back(Elt: TreeN);
16090 auto *EdgeInst = dyn_cast<Instruction>(Val: EdgeVal);
16091 // Edge has wrong parent - mark as an extra argument.
16092 if (EdgeInst && !isVectorLikeInstWithConstOps(V: EdgeInst) &&
16093 !hasSameParent(I: EdgeInst, BB)) {
16094 ExtraArgs.push_back(Elt: EdgeVal);
16095 continue;
16096 }
16097 // If the edge is not an instruction, differs from the main reduction
16098 // opcode, or has too many uses, treat it as a possible reduced value.
16099 // Also, do not try to reduce constant values if the operation is not
16100 // foldable.
16101 if (!EdgeInst || getRdxKind(V: EdgeInst) != RdxKind ||
16102 IsCmpSelMinMax != isCmpSelMinMax(I: EdgeInst) ||
16103 !hasRequiredNumberOfUses(IsCmpSelMinMax, I: EdgeInst) ||
16104 !isVectorizable(Kind: RdxKind, I: EdgeInst) ||
16105 (R.isAnalyzedReductionRoot(I: EdgeInst) &&
16106 all_of(Range: EdgeInst->operands(), P: IsaPred<Constant>))) {
16107 PossibleReducedVals.push_back(Elt: EdgeVal);
16108 continue;
16109 }
16110 ReductionOps.push_back(Elt: EdgeInst);
16111 }
16112 };
16113 // Try to regroup the reduced values so that reducing them becomes more
16114 // profitable. Values are grouped by their value ids, instructions by their
16115 // instruction op id and/or alternate op id, plus extra analysis is done for
16116 // loads (grouping them by the distance between pointers) and cmp
16117 // instructions (grouping them by the predicate).
16118 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
16119 PossibleReducedVals;
16120 initReductionOps(I: Root);
16121 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
16122 SmallSet<size_t, 2> LoadKeyUsed;
16123 SmallPtrSet<Value *, 4> DoNotReverseVals;
16124
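// Computes a hash subkey for a load so that loads which are likely to form a
// consecutive or otherwise pointer-compatible sequence (based on the pointer
// difference or pointer compatibility with previously seen loads) receive the
// same subkey when grouping the possible reduced values.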
16125 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
16126 Value *Ptr = getUnderlyingObject(V: LI->getPointerOperand());
16127 if (LoadKeyUsed.contains(V: Key)) {
16128 auto LIt = LoadsMap.find(Val: Ptr);
16129 if (LIt != LoadsMap.end()) {
16130 for (LoadInst *RLI : LIt->second) {
16131 if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
16132 ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL, SE,
16133 /*StrictCheck=*/true))
16134 return hash_value(ptr: RLI->getPointerOperand());
16135 }
16136 for (LoadInst *RLI : LIt->second) {
16137 if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
16138 Ptr2: LI->getPointerOperand(), TLI)) {
16139 hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
16140 DoNotReverseVals.insert(Ptr: RLI);
16141 return SubKey;
16142 }
16143 }
16144 if (LIt->second.size() > 2) {
16145 hash_code SubKey =
16146 hash_value(ptr: LIt->second.back()->getPointerOperand());
16147 DoNotReverseVals.insert(Ptr: LIt->second.back());
16148 return SubKey;
16149 }
16150 }
16151 }
16152 LoadKeyUsed.insert(V: Key);
16153 LoadsMap.try_emplace(Key: Ptr).first->second.push_back(Elt: LI);
16154 return hash_value(ptr: LI->getPointerOperand());
16155 };
16156
16157 while (!Worklist.empty()) {
16158 Instruction *TreeN = Worklist.pop_back_val();
16159 SmallVector<Value *> Args;
16160 SmallVector<Value *> PossibleRedVals;
16161 SmallVector<Instruction *> PossibleReductionOps;
16162 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
16163 // If there are too many extra arguments, mark the instruction itself as a
16164 // reduction value, not a reduction operation.
16165 if (Args.size() < 2) {
16166 addReductionOps(I: TreeN);
16167 // Add extra args.
16168 if (!Args.empty()) {
16169 assert(Args.size() == 1 && "Expected only single argument.");
16170 ExtraArgs[TreeN] = Args.front();
16171 }
16172 // Add reduction values. The values are sorted for better vectorization
16173 // results.
16174 for (Value *V : PossibleRedVals) {
16175 size_t Key, Idx;
16176 std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
16177 /*AllowAlternate=*/false);
16178 ++PossibleReducedVals[Key][Idx]
16179 .insert(KV: std::make_pair(x&: V, y: 0))
16180 .first->second;
16181 }
16182 Worklist.append(in_start: PossibleReductionOps.rbegin(),
16183 in_end: PossibleReductionOps.rend());
16184 } else {
16185 size_t Key, Idx;
16186 std::tie(args&: Key, args&: Idx) = generateKeySubkey(V: TreeN, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
16187 /*AllowAlternate=*/false);
16188 ++PossibleReducedVals[Key][Idx]
16189 .insert(KV: std::make_pair(x&: TreeN, y: 0))
16190 .first->second;
16191 }
16192 }
16193 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
16194 // Sort the values by the total number of value kinds so that the reduction
16195 // starts from the longest possible sequences of reduced values.
16196 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
16197 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
16198 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
16199 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
16200 It != E; ++It) {
16201 PossibleRedValsVect.emplace_back();
16202 auto RedValsVect = It->second.takeVector();
16203 stable_sort(Range&: RedValsVect, C: llvm::less_second());
16204 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
16205 PossibleRedValsVect.back().append(NumInputs: Data.second, Elt: Data.first);
16206 }
16207 stable_sort(Range&: PossibleRedValsVect, C: [](const auto &P1, const auto &P2) {
16208 return P1.size() > P2.size();
16209 });
16210 int NewIdx = -1;
16211 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
16212 if (isGoodForReduction(Data) ||
16213 (isa<LoadInst>(Val: Data.front()) && NewIdx >= 0 &&
16214 isa<LoadInst>(Val: ReducedVals[NewIdx].front()) &&
16215 getUnderlyingObject(
16216 V: cast<LoadInst>(Val: Data.front())->getPointerOperand()) ==
16217 getUnderlyingObject(V: cast<LoadInst>(Val: ReducedVals[NewIdx].front())
16218 ->getPointerOperand()))) {
16219 if (NewIdx < 0) {
16220 NewIdx = ReducedVals.size();
16221 ReducedVals.emplace_back();
16222 }
16223 if (DoNotReverseVals.contains(Ptr: Data.front()))
16224 ReducedVals[NewIdx].append(in_start: Data.begin(), in_end: Data.end());
16225 else
16226 ReducedVals[NewIdx].append(in_start: Data.rbegin(), in_end: Data.rend());
16227 } else {
16228 ReducedVals.emplace_back().append(in_start: Data.rbegin(), in_end: Data.rend());
16229 }
16230 }
16231 }
16232 // Sort the groups of reduced values by the number of values with the same or
16233 // alternate opcode and/or the same pointer operand.
16234 stable_sort(Range&: ReducedVals, C: [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
16235 return P1.size() > P2.size();
16236 });
16237 return true;
16238 }
16239
16240 /// Attempt to vectorize the tree found by matchAssociativeReduction.
16241 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
16242 const TargetLibraryInfo &TLI) {
16243 constexpr int ReductionLimit = 4;
16244 constexpr unsigned RegMaxNumber = 4;
16245 constexpr unsigned RedValsMaxNumber = 128;
16246 // If there are a sufficient number of reduction values, reduce
16247 // to a nearby power-of-2. We can safely generate oversized
16248 // vectors and rely on the backend to split them to legal sizes.
16249 unsigned NumReducedVals =
16250 std::accumulate(first: ReducedVals.begin(), last: ReducedVals.end(), init: 0,
16251 binary_op: [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
16252 if (!isGoodForReduction(Data: Vals))
16253 return Num;
16254 return Num + Vals.size();
16255 });
16256 if (NumReducedVals < ReductionLimit &&
16257 (!AllowHorRdxIdenityOptimization ||
16258 all_of(Range&: ReducedVals, P: [](ArrayRef<Value *> RedV) {
16259 return RedV.size() < 2 || !allConstant(VL: RedV) || !isSplat(VL: RedV);
16260 }))) {
16261 for (ReductionOpsType &RdxOps : ReductionOps)
16262 for (Value *RdxOp : RdxOps)
16263 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
16264 return nullptr;
16265 }
16266
16267 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
16268 TargetFolder(DL));
16269 Builder.SetInsertPoint(cast<Instruction>(Val&: ReductionRoot));
16270
16271 // Track the reduced values in case they are replaced by extractelement
16272 // instructions because of the vectorization.
16273 DenseMap<Value *, WeakTrackingVH> TrackedVals(
16274 ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
16275 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
16276 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
16277 ExternallyUsedValues.reserve(NumEntries: ExtraArgs.size() + 1);
16278 // The same extra argument may be used several times, so log each attempt
16279 // to use it.
16280 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
16281 assert(Pair.first && "DebugLoc must be set.");
16282 ExternallyUsedValues[Pair.second].push_back(Elt: Pair.first);
16283 TrackedVals.try_emplace(Key: Pair.second, Args: Pair.second);
16284 }
16285
16286 // The compare instruction of a min/max is the insertion point for new
16287 // instructions and may be replaced with a new compare instruction.
16288 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
16289 assert(isa<SelectInst>(RdxRootInst) &&
16290 "Expected min/max reduction to have select root instruction");
16291 Value *ScalarCond = cast<SelectInst>(Val: RdxRootInst)->getCondition();
16292 assert(isa<Instruction>(ScalarCond) &&
16293 "Expected min/max reduction to have compare condition");
16294 return cast<Instruction>(Val: ScalarCond);
16295 };
16296
16297 // Return new VectorizedTree, based on previous value.
16298 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
16299 if (VectorizedTree) {
16300 // Update the final value in the reduction.
16301 Builder.SetCurrentDebugLocation(
16302 cast<Instruction>(Val: ReductionOps.front().front())->getDebugLoc());
16303 if ((isa<PoisonValue>(Val: VectorizedTree) && !isa<PoisonValue>(Val: Res)) ||
16304 (isGuaranteedNotToBePoison(V: Res) &&
16305 !isGuaranteedNotToBePoison(V: VectorizedTree))) {
16306 auto It = ReducedValsToOps.find(Val: Res);
16307 if (It != ReducedValsToOps.end() &&
16308 any_of(Range&: It->getSecond(),
16309 P: [](Instruction *I) { return isBoolLogicOp(I); }))
16310 std::swap(a&: VectorizedTree, b&: Res);
16311 }
16312
16313 return createOp(Builder, RdxKind, LHS: VectorizedTree, RHS: Res, Name: "op.rdx",
16314 ReductionOps);
16315 }
16316 // Initialize the final value in the reduction.
16317 return Res;
16318 };
16319 bool AnyBoolLogicOp =
16320 any_of(Range&: ReductionOps.back(), P: [](Value *V) {
16321 return isBoolLogicOp(I: cast<Instruction>(Val: V));
16322 });
16323 // The reduction root is used as the insertion point for new instructions,
16324 // so set it as externally used to prevent it from being deleted.
16325 ExternallyUsedValues[ReductionRoot];
16326 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
16327 ReductionOps.front().size());
16328 for (ReductionOpsType &RdxOps : ReductionOps)
16329 for (Value *RdxOp : RdxOps) {
16330 if (!RdxOp)
16331 continue;
16332 IgnoreList.insert(V: RdxOp);
16333 }
16334 // Intersect the fast-math-flags from all reduction operations.
16335 FastMathFlags RdxFMF;
16336 RdxFMF.set();
16337 for (Value *U : IgnoreList)
16338 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: U))
16339 RdxFMF &= FPMO->getFastMathFlags();
16340 bool IsCmpSelMinMax = isCmpSelMinMax(I: cast<Instruction>(Val&: ReductionRoot));
16341
16342 // Need to track the reduced values, as they may be changed during
16343 // vectorization of subvectors.
16344 for (ArrayRef<Value *> Candidates : ReducedVals)
16345 for (Value *V : Candidates)
16346 TrackedVals.try_emplace(Key: V, Args&: V);
16347
16348 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
16349 // List of the values that were reduced in other trees as part of gather
16350 // nodes and thus require an extract if fully vectorized in other trees.
16351 SmallPtrSet<Value *, 4> RequiredExtract;
16352 Value *VectorizedTree = nullptr;
16353 bool CheckForReusedReductionOps = false;
16354 // Try to vectorize elements based on their type.
16355 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
16356 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
16357 InstructionsState S = getSameOpcode(VL: OrigReducedVals, TLI);
16358 SmallVector<Value *> Candidates;
16359 Candidates.reserve(N: 2 * OrigReducedVals.size());
16360 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
16361 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
16362 Value *RdxVal = TrackedVals.find(Val: OrigReducedVals[Cnt])->second;
16363 // Check if the reduction value was not overridden by the extractelement
16364 // instruction because of the vectorization, and exclude it if it is not
16365 // compatible with other values.
16366 // Also check if the instruction was folded to a constant/other value.
16367 auto *Inst = dyn_cast<Instruction>(Val: RdxVal);
16368 if ((Inst && isVectorLikeInstWithConstOps(V: Inst) &&
16369 (!S.getOpcode() || !S.isOpcodeOrAlt(I: Inst))) ||
16370 (S.getOpcode() && !Inst))
16371 continue;
16372 Candidates.push_back(Elt: RdxVal);
16373 TrackedToOrig.try_emplace(Key: RdxVal, Args: OrigReducedVals[Cnt]);
16374 }
16375 bool ShuffledExtracts = false;
16376 // Try to handle shuffled extractelements.
16377 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
16378 I + 1 < E) {
16379 InstructionsState NextS = getSameOpcode(VL: ReducedVals[I + 1], TLI);
16380 if (NextS.getOpcode() == Instruction::ExtractElement &&
16381 !NextS.isAltShuffle()) {
16382 SmallVector<Value *> CommonCandidates(Candidates);
16383 for (Value *RV : ReducedVals[I + 1]) {
16384 Value *RdxVal = TrackedVals.find(Val: RV)->second;
          // Check whether the reduction value was overridden by an
          // extractelement instruction because of earlier vectorization and
          // exclude it if it is not compatible with the other values.
16388 if (auto *Inst = dyn_cast<Instruction>(Val: RdxVal))
16389 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(I: Inst))
16390 continue;
16391 CommonCandidates.push_back(Elt: RdxVal);
16392 TrackedToOrig.try_emplace(Key: RdxVal, Args&: RV);
16393 }
16394 SmallVector<int> Mask;
16395 if (isFixedVectorShuffle(VL: CommonCandidates, Mask)) {
16396 ++I;
16397 Candidates.swap(RHS&: CommonCandidates);
16398 ShuffledExtracts = true;
16399 }
16400 }
16401 }
16402
16403 // Emit code for constant values.
16404 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
16405 allConstant(VL: Candidates)) {
16406 Value *Res = Candidates.front();
16407 ++VectorizedVals.try_emplace(Key: Candidates.front(), Args: 0).first->getSecond();
16408 for (Value *VC : ArrayRef(Candidates).drop_front()) {
16409 Res = createOp(Builder, RdxKind, LHS: Res, RHS: VC, Name: "const.rdx", ReductionOps);
16410 ++VectorizedVals.try_emplace(Key: VC, Args: 0).first->getSecond();
16411 if (auto *ResI = dyn_cast<Instruction>(Val: Res))
16412 V.analyzedReductionRoot(I: ResI);
16413 }
16414 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
16415 continue;
16416 }
16417
16418 unsigned NumReducedVals = Candidates.size();
16419 if (NumReducedVals < ReductionLimit &&
16420 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
16421 !isSplat(VL: Candidates)))
16422 continue;
16423
16424 // Check if we support repeated scalar values processing (optimization of
16425 // original scalar identity operations on matched horizontal reductions).
16426 IsSupportedHorRdxIdentityOp =
16427 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
16428 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
16429 // Gather same values.
16430 MapVector<Value *, unsigned> SameValuesCounter;
16431 if (IsSupportedHorRdxIdentityOp)
16432 for (Value *V : Candidates)
16433 ++SameValuesCounter.insert(KV: std::make_pair(x&: V, y: 0)).first->second;
      // Used to check whether the reduced values are used the same number of
      // times. In that case the compiler may produce better code. E.g. if the
      // reduced values are aabbccdd (8 x values), then the first node of the
      // tree will have a node for 4 x abcd + shuffle <4 x abcd>,
      // <0, 0, 1, 1, 2, 2, 3, 3>, and the final reduction will be performed
      // on <8 x aabbccdd>. Instead, the compiler may build the <4 x abcd> tree
      // immediately and compute reduction(4 x abcd) * 2.
      // Currently this only handles add/fadd/xor; and/or/min/max do not
      // require this analysis, and other operations may require an extra
      // estimation of the profitability.
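      // Illustrative sketch (hypothetical IR) of that optimization for an add
      // reduction of aabbccdd with a common scale factor of 2:
      //   %v   = <4 x i32> <a, b, c, d>
      //   %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
      //   %res = mul i32 %red, 2
      // instead of reducing the full <8 x i32> <a, a, b, b, c, c, d, d>.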
16444 bool SameScaleFactor = false;
16445 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
16446 SameValuesCounter.size() != Candidates.size();
16447 if (OptReusedScalars) {
16448 SameScaleFactor =
16449 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
16450 RdxKind == RecurKind::Xor) &&
16451 all_of(Range: drop_begin(RangeOrContainer&: SameValuesCounter),
16452 P: [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
16453 return P.second == SameValuesCounter.front().second;
16454 });
16455 Candidates.resize(N: SameValuesCounter.size());
16456 transform(Range&: SameValuesCounter, d_first: Candidates.begin(),
16457 F: [](const auto &P) { return P.first; });
16458 NumReducedVals = Candidates.size();
16459 // Have a reduction of the same element.
16460 if (NumReducedVals == 1) {
16461 Value *OrigV = TrackedToOrig.find(Val: Candidates.front())->second;
16462 unsigned Cnt = SameValuesCounter.lookup(Key: OrigV);
16463 Value *RedVal =
16464 emitScaleForReusedOps(VectorizedValue: Candidates.front(), Builder, Cnt);
16465 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
16466 VectorizedVals.try_emplace(Key: OrigV, Args&: Cnt);
16467 continue;
16468 }
16469 }
16470
16471 unsigned MaxVecRegSize = V.getMaxVecRegSize();
16472 unsigned EltSize = V.getVectorElementSize(V: Candidates[0]);
16473 unsigned MaxElts =
16474 RegMaxNumber * llvm::bit_floor(Value: MaxVecRegSize / EltSize);
16475
16476 unsigned ReduxWidth = std::min<unsigned>(
16477 a: llvm::bit_floor(Value: NumReducedVals),
16478 b: std::clamp<unsigned>(val: MaxElts, lo: RedValsMaxNumber,
16479 hi: RegMaxNumber * RedValsMaxNumber));
16480 unsigned Start = 0;
16481 unsigned Pos = Start;
      // Restarts the vectorization attempt with a lower vectorization factor.
16483 unsigned PrevReduxWidth = ReduxWidth;
16484 bool CheckForReusedReductionOpsLocal = false;
16485 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
16486 &CheckForReusedReductionOpsLocal,
16487 &PrevReduxWidth, &V,
16488 &IgnoreList](bool IgnoreVL = false) {
16489 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(Vals: IgnoreList);
16490 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, it is worth
          // trying again with a smaller number of reduction ops.
16493 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
16494 }
16495 ++Pos;
16496 if (Pos < NumReducedVals - ReduxWidth + 1)
16497 return IsAnyRedOpGathered;
16498 Pos = Start;
16499 ReduxWidth /= 2;
16500 return IsAnyRedOpGathered;
16501 };
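      // E.g. (hypothetical), with NumReducedVals == 7 and ReduxWidth == 4, the
      // candidate windows start at Pos == 0, 1, 2, 3; once those are
      // exhausted, Pos resets to Start and ReduxWidth is halved to 2 for
      // another round of attempts.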
16502 bool AnyVectorized = false;
16503 while (Pos < NumReducedVals - ReduxWidth + 1 &&
16504 ReduxWidth >= ReductionLimit) {
        // Dependency in the tree of the reduction ops - drop this attempt and
        // try later.
16507 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
16508 Start == 0) {
16509 CheckForReusedReductionOps = true;
16510 break;
16511 }
16512 PrevReduxWidth = ReduxWidth;
16513 ArrayRef<Value *> VL(std::next(x: Candidates.begin(), n: Pos), ReduxWidth);
        // Already being analyzed - skip.
16515 if (V.areAnalyzedReductionVals(VL)) {
16516 (void)AdjustReducedVals(/*IgnoreVL=*/true);
16517 continue;
16518 }
16519 // Early exit if any of the reduction values were deleted during
16520 // previous vectorization attempts.
16521 if (any_of(Range&: VL, P: [&V](Value *RedVal) {
16522 auto *RedValI = dyn_cast<Instruction>(Val: RedVal);
16523 if (!RedValI)
16524 return false;
16525 return V.isDeleted(I: RedValI);
16526 }))
16527 break;
16528 V.buildTree(Roots: VL, UserIgnoreLst: IgnoreList);
16529 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
16530 if (!AdjustReducedVals())
16531 V.analyzedReductionVals(VL);
16532 continue;
16533 }
16534 if (V.isLoadCombineReductionCandidate(RdxKind)) {
16535 if (!AdjustReducedVals())
16536 V.analyzedReductionVals(VL);
16537 continue;
16538 }
16539 V.reorderTopToBottom();
16540 // No need to reorder the root node at all.
16541 V.reorderBottomToTop(/*IgnoreReorder=*/true);
        // Keep the other extracted reduction values if they are used in the
        // vectorization trees.
16544 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
16545 ExternallyUsedValues);
16546 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
16547 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
16548 continue;
16549 for (Value *V : ReducedVals[Cnt])
16550 if (isa<Instruction>(Val: V))
16551 LocalExternallyUsedValues[TrackedVals[V]];
16552 }
16553 if (!IsSupportedHorRdxIdentityOp) {
16554 // Number of uses of the candidates in the vector of values.
16555 assert(SameValuesCounter.empty() &&
16556 "Reused values counter map is not empty");
16557 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16558 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16559 continue;
16560 Value *V = Candidates[Cnt];
16561 Value *OrigV = TrackedToOrig.find(Val: V)->second;
16562 ++SameValuesCounter[OrigV];
16563 }
16564 }
16565 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
16566 // Gather externally used values.
16567 SmallPtrSet<Value *, 4> Visited;
16568 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16569 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16570 continue;
16571 Value *RdxVal = Candidates[Cnt];
16572 if (!Visited.insert(Ptr: RdxVal).second)
16573 continue;
16574 // Check if the scalar was vectorized as part of the vectorization
16575 // tree but not the top node.
16576 if (!VLScalars.contains(Ptr: RdxVal) && V.isVectorized(V: RdxVal)) {
16577 LocalExternallyUsedValues[RdxVal];
16578 continue;
16579 }
16580 Value *OrigV = TrackedToOrig.find(Val: RdxVal)->second;
16581 unsigned NumOps =
16582 VectorizedVals.lookup(Val: RdxVal) + SameValuesCounter[OrigV];
16583 if (NumOps != ReducedValsToOps.find(Val: OrigV)->second.size())
16584 LocalExternallyUsedValues[RdxVal];
16585 }
16586 // Do not need the list of reused scalars in regular mode anymore.
16587 if (!IsSupportedHorRdxIdentityOp)
16588 SameValuesCounter.clear();
16589 for (Value *RdxVal : VL)
16590 if (RequiredExtract.contains(Ptr: RdxVal))
16591 LocalExternallyUsedValues[RdxVal];
        // Update LocalExternallyUsedValues for the scalars replaced by
        // extractelement instructions.
16594 DenseMap<Value *, Value *> ReplacementToExternal;
16595 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
16596 ReplacementToExternal.try_emplace(Key: Pair.second, Args: Pair.first);
16597 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
16598 Value *Ext = Pair.first;
16599 auto RIt = ReplacementToExternal.find(Val: Ext);
16600 while (RIt != ReplacementToExternal.end()) {
16601 Ext = RIt->second;
16602 RIt = ReplacementToExternal.find(Val: Ext);
16603 }
16604 auto *It = ExternallyUsedValues.find(Key: Ext);
16605 if (It == ExternallyUsedValues.end())
16606 continue;
16607 LocalExternallyUsedValues[Pair.second].append(RHS: It->second);
16608 }
16609 V.buildExternalUses(ExternallyUsedValues: LocalExternallyUsedValues);
16610
16611 V.computeMinimumValueSizes();
16612 V.transformNodes();
16613
16614 // Estimate cost.
16615 InstructionCost TreeCost = V.getTreeCost(VectorizedVals: VL);
16616 InstructionCost ReductionCost =
16617 getReductionCost(TTI, ReducedVals: VL, IsCmpSelMinMax, ReduxWidth, FMF: RdxFMF);
16618 InstructionCost Cost = TreeCost + ReductionCost;
16619 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16620 << " for reduction\n");
16621 if (!Cost.isValid())
16622 break;
16623 if (Cost >= -SLPCostThreshold) {
16624 V.getORE()->emit(RemarkBuilder: [&]() {
16625 return OptimizationRemarkMissed(
16626 SV_NAME, "HorSLPNotBeneficial",
16627 ReducedValsToOps.find(Val: VL[0])->second.front())
16628 << "Vectorizing horizontal reduction is possible "
16629 << "but not beneficial with cost " << ore::NV("Cost", Cost)
16630 << " and threshold "
16631 << ore::NV("Threshold", -SLPCostThreshold);
16632 });
16633 if (!AdjustReducedVals())
16634 V.analyzedReductionVals(VL);
16635 continue;
16636 }
16637
16638 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
16639 << Cost << ". (HorRdx)\n");
16640 V.getORE()->emit(RemarkBuilder: [&]() {
16641 return OptimizationRemark(
16642 SV_NAME, "VectorizedHorizontalReduction",
16643 ReducedValsToOps.find(Val: VL[0])->second.front())
16644 << "Vectorized horizontal reduction with cost "
16645 << ore::NV("Cost", Cost) << " and with tree size "
16646 << ore::NV("TreeSize", V.getTreeSize());
16647 });
16648
16649 Builder.setFastMathFlags(RdxFMF);
16650
16651 // Emit a reduction. If the root is a select (min/max idiom), the insert
16652 // point is the compare condition of that select.
16653 Instruction *RdxRootInst = cast<Instruction>(Val&: ReductionRoot);
16654 Instruction *InsertPt = RdxRootInst;
16655 if (IsCmpSelMinMax)
16656 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
16657
16658 // Vectorize a tree.
16659 Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues: LocalExternallyUsedValues,
16660 ReplacedExternals, ReductionRoot: InsertPt);
16661
16662 Builder.SetInsertPoint(InsertPt);
16663
16664 // To prevent poison from leaking across what used to be sequential,
16665 // safe, scalar boolean logic operations, the reduction operand must be
16666 // frozen.
16667 if ((isBoolLogicOp(I: RdxRootInst) ||
16668 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
16669 !isGuaranteedNotToBePoison(V: VectorizedRoot))
16670 VectorizedRoot = Builder.CreateFreeze(V: VectorizedRoot);
16671
16672 // Emit code to correctly handle reused reduced values, if required.
16673 if (OptReusedScalars && !SameScaleFactor) {
16674 VectorizedRoot =
16675 emitReusedOps(VectorizedValue: VectorizedRoot, Builder, VL: V.getRootNodeScalars(),
16676 SameValuesCounter, TrackedToOrig);
16677 }
16678
16679 Value *ReducedSubTree =
16680 emitReduction(VectorizedValue: VectorizedRoot, Builder, ReduxWidth, TTI);
16681 if (ReducedSubTree->getType() != VL.front()->getType()) {
16682 ReducedSubTree = Builder.CreateIntCast(
16683 V: ReducedSubTree, DestTy: VL.front()->getType(), isSigned: any_of(Range&: VL, P: [&](Value *R) {
16684 KnownBits Known = computeKnownBits(
16685 V: R, DL: cast<Instruction>(Val: ReductionOps.front().front())
16686 ->getModule()
16687 ->getDataLayout());
16688 return !Known.isNonNegative();
16689 }));
16690 }
16691
        // Improved analysis for add/fadd/xor reductions with the same scale
        // factor for all operands of the reduction. We can emit scalar ops for
        // them instead.
16695 if (OptReusedScalars && SameScaleFactor)
16696 ReducedSubTree = emitScaleForReusedOps(
16697 VectorizedValue: ReducedSubTree, Builder, Cnt: SameValuesCounter.front().second);
16698
16699 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
16700 // Count vectorized reduced values to exclude them from final reduction.
16701 for (Value *RdxVal : VL) {
16702 Value *OrigV = TrackedToOrig.find(Val: RdxVal)->second;
16703 if (IsSupportedHorRdxIdentityOp) {
16704 VectorizedVals.try_emplace(Key: OrigV, Args&: SameValuesCounter[RdxVal]);
16705 continue;
16706 }
16707 ++VectorizedVals.try_emplace(Key: OrigV, Args: 0).first->getSecond();
16708 if (!V.isVectorized(V: RdxVal))
16709 RequiredExtract.insert(Ptr: RdxVal);
16710 }
16711 Pos += ReduxWidth;
16712 Start = Pos;
16713 ReduxWidth = llvm::bit_floor(Value: NumReducedVals - Pos);
16714 AnyVectorized = true;
16715 }
16716 if (OptReusedScalars && !AnyVectorized) {
16717 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
16718 Value *RedVal = emitScaleForReusedOps(VectorizedValue: P.first, Builder, Cnt: P.second);
16719 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
16720 Value *OrigV = TrackedToOrig.find(Val: P.first)->second;
16721 VectorizedVals.try_emplace(Key: OrigV, Args: P.second);
16722 }
16723 continue;
16724 }
16725 }
16726 if (VectorizedTree) {
16727 // Reorder operands of bool logical op in the natural order to avoid
16728 // possible problem with poison propagation. If not possible to reorder
16729 // (both operands are originally RHS), emit an extra freeze instruction
16730 // for the LHS operand.
16731 // I.e., if we have original code like this:
16732 // RedOp1 = select i1 ?, i1 LHS, i1 false
16733 // RedOp2 = select i1 RHS, i1 ?, i1 false
16734
16735 // Then, we swap LHS/RHS to create a new op that matches the poison
16736 // semantics of the original code.
16737
16738 // If we have original code like this and both values could be poison:
16739 // RedOp1 = select i1 ?, i1 LHS, i1 false
16740 // RedOp2 = select i1 ?, i1 RHS, i1 false
16741
16742 // Then, we must freeze LHS in the new op.
16743 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
16744 Instruction *RedOp1,
16745 Instruction *RedOp2,
16746 bool InitStep) {
16747 if (!AnyBoolLogicOp)
16748 return;
16749 if (isBoolLogicOp(I: RedOp1) &&
16750 ((!InitStep && LHS == VectorizedTree) ||
16751 getRdxOperand(I: RedOp1, Index: 0) == LHS || isGuaranteedNotToBePoison(V: LHS)))
16752 return;
16753 if (isBoolLogicOp(I: RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
16754 getRdxOperand(I: RedOp2, Index: 0) == RHS ||
16755 isGuaranteedNotToBePoison(V: RHS))) {
16756 std::swap(a&: LHS, b&: RHS);
16757 return;
16758 }
16759 if (LHS != VectorizedTree)
16760 LHS = Builder.CreateFreeze(V: LHS);
16761 };
      // Finish the reduction.
      // Need to add the extra arguments and the possibly not-vectorized
      // reduction values.
      // Try to avoid dependencies between the scalar remainders after
      // reductions.
16767 auto FinalGen =
16768 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
16769 bool InitStep) {
16770 unsigned Sz = InstVals.size();
16771 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
16772 Sz % 2);
16773 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
16774 Instruction *RedOp = InstVals[I + 1].first;
16775 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
16776 Value *RdxVal1 = InstVals[I].second;
16777 Value *StableRdxVal1 = RdxVal1;
16778 auto It1 = TrackedVals.find(Val: RdxVal1);
16779 if (It1 != TrackedVals.end())
16780 StableRdxVal1 = It1->second;
16781 Value *RdxVal2 = InstVals[I + 1].second;
16782 Value *StableRdxVal2 = RdxVal2;
16783 auto It2 = TrackedVals.find(Val: RdxVal2);
16784 if (It2 != TrackedVals.end())
16785 StableRdxVal2 = It2->second;
16786 // To prevent poison from leaking across what used to be
16787 // sequential, safe, scalar boolean logic operations, the
16788 // reduction operand must be frozen.
16789 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
16790 RedOp, InitStep);
16791 Value *ExtraRed = createOp(Builder, RdxKind, LHS: StableRdxVal1,
16792 RHS: StableRdxVal2, Name: "op.rdx", ReductionOps);
16793 ExtraReds[I / 2] = std::make_pair(x: InstVals[I].first, y&: ExtraRed);
16794 }
16795 if (Sz % 2 == 1)
16796 ExtraReds[Sz / 2] = InstVals.back();
16797 return ExtraReds;
16798 };
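      // E.g. (hypothetical), for leftover scalars [r0, r1, r2, r3, r4] one
      // FinalGen pass emits r0+r1 and r2+r3 and keeps r4, producing
      // [r0+r1, r2+r3, r4]; the loop below repeats this until a single value
      // remains, keeping the dependency chains between the scalar remainders
      // roughly logarithmic instead of linear.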
16799 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
16800 ExtraReductions.emplace_back(Args: cast<Instruction>(Val&: ReductionRoot),
16801 Args&: VectorizedTree);
16802 SmallPtrSet<Value *, 8> Visited;
16803 for (ArrayRef<Value *> Candidates : ReducedVals) {
16804 for (Value *RdxVal : Candidates) {
16805 if (!Visited.insert(Ptr: RdxVal).second)
16806 continue;
16807 unsigned NumOps = VectorizedVals.lookup(Val: RdxVal);
16808 for (Instruction *RedOp :
16809 ArrayRef(ReducedValsToOps.find(Val: RdxVal)->second)
16810 .drop_back(N: NumOps))
16811 ExtraReductions.emplace_back(Args&: RedOp, Args&: RdxVal);
16812 }
16813 }
16814 for (auto &Pair : ExternallyUsedValues) {
16815 // Add each externally used value to the final reduction.
16816 for (auto *I : Pair.second)
16817 ExtraReductions.emplace_back(Args&: I, Args&: Pair.first);
16818 }
      // Iterate through all non-vectorized reduction values/extra arguments.
16820 bool InitStep = true;
16821 while (ExtraReductions.size() > 1) {
16822 VectorizedTree = ExtraReductions.front().second;
16823 SmallVector<std::pair<Instruction *, Value *>> NewReds =
16824 FinalGen(ExtraReductions, InitStep);
16825 ExtraReductions.swap(RHS&: NewReds);
16826 InitStep = false;
16827 }
16828 VectorizedTree = ExtraReductions.front().second;
16829
16830 ReductionRoot->replaceAllUsesWith(V: VectorizedTree);
16831
16832 // The original scalar reduction is expected to have no remaining
16833 // uses outside the reduction tree itself. Assert that we got this
16834 // correct, replace internal uses with undef, and mark for eventual
16835 // deletion.
16836#ifndef NDEBUG
16837 SmallSet<Value *, 4> IgnoreSet;
16838 for (ArrayRef<Value *> RdxOps : ReductionOps)
16839 IgnoreSet.insert(I: RdxOps.begin(), E: RdxOps.end());
16840#endif
16841 for (ArrayRef<Value *> RdxOps : ReductionOps) {
16842 for (Value *Ignore : RdxOps) {
16843 if (!Ignore)
16844 continue;
16845#ifndef NDEBUG
16846 for (auto *U : Ignore->users()) {
16847 assert(IgnoreSet.count(U) &&
                 "All users must be in the reduction ops list.");
16849 }
16850#endif
16851 if (!Ignore->use_empty()) {
16852 Value *Undef = UndefValue::get(T: Ignore->getType());
16853 Ignore->replaceAllUsesWith(V: Undef);
16854 }
16855 V.eraseInstruction(I: cast<Instruction>(Val: Ignore));
16856 }
16857 }
16858 } else if (!CheckForReusedReductionOps) {
16859 for (ReductionOpsType &RdxOps : ReductionOps)
16860 for (Value *RdxOp : RdxOps)
16861 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
16862 }
16863 return VectorizedTree;
16864 }
16865
16866private:
16867 /// Calculate the cost of a reduction.
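  /// \returns the estimated vector cost minus the estimated scalar cost, so a
  /// negative result means the vectorized reduction is expected to be cheaper
  /// than the scalar code it replaces.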
16868 InstructionCost getReductionCost(TargetTransformInfo *TTI,
16869 ArrayRef<Value *> ReducedVals,
16870 bool IsCmpSelMinMax, unsigned ReduxWidth,
16871 FastMathFlags FMF) {
16872 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
16873 Type *ScalarTy = ReducedVals.front()->getType();
16874 FixedVectorType *VectorTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: ReduxWidth);
16875 InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at compile time.
16878 bool AllConsts = allConstant(VL: ReducedVals);
16879 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
16880 InstructionCost Cost = 0;
16881 // Scalar cost is repeated for N-1 elements.
16882 int Cnt = ReducedVals.size();
16883 for (Value *RdxVal : ReducedVals) {
16884 if (Cnt == 1)
16885 break;
16886 --Cnt;
16887 if (RdxVal->hasNUsesOrMore(N: IsCmpSelMinMax ? 3 : 2)) {
16888 Cost += GenCostFn();
16889 continue;
16890 }
16891 InstructionCost ScalarCost = 0;
16892 for (User *U : RdxVal->users()) {
16893 auto *RdxOp = cast<Instruction>(Val: U);
16894 if (hasRequiredNumberOfUses(IsCmpSelMinMax, I: RdxOp)) {
16895 ScalarCost += TTI->getInstructionCost(U: RdxOp, CostKind);
16896 continue;
16897 }
16898 ScalarCost = InstructionCost::getInvalid();
16899 break;
16900 }
16901 if (ScalarCost.isValid())
16902 Cost += ScalarCost;
16903 else
16904 Cost += GenCostFn();
16905 }
16906 return Cost;
16907 };
16908 switch (RdxKind) {
16909 case RecurKind::Add:
16910 case RecurKind::Mul:
16911 case RecurKind::Or:
16912 case RecurKind::And:
16913 case RecurKind::Xor:
16914 case RecurKind::FAdd:
16915 case RecurKind::FMul: {
16916 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
16917 if (!AllConsts)
16918 VectorCost =
16919 TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VectorTy, FMF, CostKind);
16920 ScalarCost = EvaluateScalarCost([&]() {
16921 return TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: ScalarTy, CostKind);
16922 });
16923 break;
16924 }
16925 case RecurKind::FMax:
16926 case RecurKind::FMin:
16927 case RecurKind::FMaximum:
16928 case RecurKind::FMinimum:
16929 case RecurKind::SMax:
16930 case RecurKind::SMin:
16931 case RecurKind::UMax:
16932 case RecurKind::UMin: {
16933 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
16934 if (!AllConsts)
16935 VectorCost = TTI->getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF, CostKind);
16936 ScalarCost = EvaluateScalarCost([&]() {
16937 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
16938 return TTI->getIntrinsicInstrCost(ICA, CostKind);
16939 });
16940 break;
16941 }
16942 default:
16943 llvm_unreachable("Expected arithmetic or min/max reduction operation");
16944 }
16945
16946 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
16947 << " for reduction of " << shortBundleName(ReducedVals)
16948 << " (It is a splitting reduction)\n");
16949 return VectorCost - ScalarCost;
16950 }
16951
16952 /// Emit a horizontal reduction of the vectorized value.
16953 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
16954 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
16955 assert(VectorizedValue && "Need to have a vectorized tree node");
16956 assert(isPowerOf2_32(ReduxWidth) &&
16957 "We only handle power-of-two reductions for now");
16958 assert(RdxKind != RecurKind::FMulAdd &&
16959 "A call to the llvm.fmuladd intrinsic is not handled yet");
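    // For example, an integer add reduction of <8 x i32> %v is typically
    // lowered through the generic reduction intrinsic, roughly:
    //   %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v)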
16960
16961 ++NumVectorInstructions;
16962 return createSimpleTargetReduction(B&: Builder, Src: VectorizedValue, RdxKind);
16963 }
16964
16965 /// Emits optimized code for unique scalar value reused \p Cnt times.
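  /// For example (hypothetical values), for RecurKind::Add with Cnt == 3 and
  /// a scalar value %x this emits 'mul %x, 3', for RecurKind::FAdd it emits
  /// 'fmul %x, 3.0', and for RecurKind::Xor the result is %x for an odd Cnt
  /// and zero for an even Cnt.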
16966 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
16967 unsigned Cnt) {
16968 assert(IsSupportedHorRdxIdentityOp &&
16969 "The optimization of matched scalar identity horizontal reductions "
16970 "must be supported.");
16971 switch (RdxKind) {
16972 case RecurKind::Add: {
16973 // res = mul vv, n
16974 Value *Scale = ConstantInt::get(Ty: VectorizedValue->getType(), V: Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
16976 << VectorizedValue << ". (HorRdx)\n");
16977 return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
16978 }
16979 case RecurKind::Xor: {
16980 // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
16982 << ". (HorRdx)\n");
16983 if (Cnt % 2 == 0)
16984 return Constant::getNullValue(Ty: VectorizedValue->getType());
16985 return VectorizedValue;
16986 }
16987 case RecurKind::FAdd: {
16988 // res = fmul v, n
16989 Value *Scale = ConstantFP::get(Ty: VectorizedValue->getType(), V: Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
16991 << VectorizedValue << ". (HorRdx)\n");
16992 return Builder.CreateFMul(L: VectorizedValue, R: Scale);
16993 }
16994 case RecurKind::And:
16995 case RecurKind::Or:
16996 case RecurKind::SMax:
16997 case RecurKind::SMin:
16998 case RecurKind::UMax:
16999 case RecurKind::UMin:
17000 case RecurKind::FMax:
17001 case RecurKind::FMin:
17002 case RecurKind::FMaximum:
17003 case RecurKind::FMinimum:
17004 // res = vv
17005 return VectorizedValue;
17006 case RecurKind::Mul:
17007 case RecurKind::FMul:
17008 case RecurKind::FMulAdd:
17009 case RecurKind::IAnyOf:
17010 case RecurKind::FAnyOf:
17011 case RecurKind::None:
17012 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
17013 }
17014 return nullptr;
17015 }
17016
17017 /// Emits actual operation for the scalar identity values, found during
17018 /// horizontal reduction analysis.
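  /// For example (hypothetical values), for an add reduction whose root
  /// scalars are <a, b, c, d> with repeat counts {a: 2, b: 1, c: 1, d: 1}, the
  /// vectorized value is multiplied element-wise by <2, 1, 1, 1> before the
  /// final reduction is emitted.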
17019 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17020 ArrayRef<Value *> VL,
17021 const MapVector<Value *, unsigned> &SameValuesCounter,
17022 const DenseMap<Value *, Value *> &TrackedToOrig) {
17023 assert(IsSupportedHorRdxIdentityOp &&
17024 "The optimization of matched scalar identity horizontal reductions "
17025 "must be supported.");
17026 auto *VTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
17027 if (VTy->getElementType() != VL.front()->getType()) {
17028 VectorizedValue = Builder.CreateIntCast(
17029 V: VectorizedValue,
17030 DestTy: FixedVectorType::get(ElementType: VL.front()->getType(), NumElts: VTy->getNumElements()),
17031 isSigned: any_of(Range&: VL, P: [&](Value *R) {
17032 KnownBits Known = computeKnownBits(
17033 V: R, DL: cast<Instruction>(Val: ReductionOps.front().front())
17034 ->getModule()
17035 ->getDataLayout());
17036 return !Known.isNonNegative();
17037 }));
17038 }
17039 switch (RdxKind) {
17040 case RecurKind::Add: {
17041 // root = mul prev_root, <1, 1, n, 1>
17042 SmallVector<Constant *> Vals;
17043 for (Value *V : VL) {
17044 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.find(Val: V)->second);
17045 Vals.push_back(Elt: ConstantInt::get(Ty: V->getType(), V: Cnt, /*IsSigned=*/false));
17046 }
17047 auto *Scale = ConstantVector::get(V: Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
17049 << VectorizedValue << ". (HorRdx)\n");
17050 return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
17051 }
17052 case RecurKind::And:
17053 case RecurKind::Or:
17054 // No need for multiple or/and(s).
17055 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
17056 << ". (HorRdx)\n");
17057 return VectorizedValue;
17058 case RecurKind::SMax:
17059 case RecurKind::SMin:
17060 case RecurKind::UMax:
17061 case RecurKind::UMin:
17062 case RecurKind::FMax:
17063 case RecurKind::FMin:
17064 case RecurKind::FMaximum:
17065 case RecurKind::FMinimum:
17066 // No need for multiple min/max(s) of the same value.
17067 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
17068 << ". (HorRdx)\n");
17069 return VectorizedValue;
17070 case RecurKind::Xor: {
      // Replace values with an even number of repeats with 0, since
      // x xor x = 0.
      // E.g. root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf,
      // 6, 7>, if the 4th and 6th elements have an even number of repeats.
17075 SmallVector<int> Mask(
17076 cast<FixedVectorType>(Val: VectorizedValue->getType())->getNumElements(),
17077 PoisonMaskElem);
17078 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
17079 bool NeedShuffle = false;
17080 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
17081 Value *V = VL[I];
17082 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.find(Val: V)->second);
17083 if (Cnt % 2 == 0) {
17084 Mask[I] = VF;
17085 NeedShuffle = true;
17086 }
17087 }
17088 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
17089 : Mask) dbgs()
17090 << I << " ";
17091 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
17092 if (NeedShuffle)
17093 VectorizedValue = Builder.CreateShuffleVector(
17094 V1: VectorizedValue,
17095 V2: ConstantVector::getNullValue(Ty: VectorizedValue->getType()), Mask);
17096 return VectorizedValue;
17097 }
17098 case RecurKind::FAdd: {
17099 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
17100 SmallVector<Constant *> Vals;
17101 for (Value *V : VL) {
17102 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.find(Val: V)->second);
17103 Vals.push_back(Elt: ConstantFP::get(Ty: V->getType(), V: Cnt));
17104 }
17105 auto *Scale = ConstantVector::get(V: Vals);
17106 return Builder.CreateFMul(L: VectorizedValue, R: Scale);
17107 }
17108 case RecurKind::Mul:
17109 case RecurKind::FMul:
17110 case RecurKind::FMulAdd:
17111 case RecurKind::IAnyOf:
17112 case RecurKind::FAnyOf:
17113 case RecurKind::None:
17114 llvm_unreachable("Unexpected reduction kind for reused scalars.");
17115 }
17116 return nullptr;
17117 }
17118};
17119} // end anonymous namespace
17120
17121/// Gets recurrence kind from the specified value.
17122static RecurKind getRdxKind(Value *V) {
17123 return HorizontalReduction::getRdxKind(V);
17124}
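/// \returns the total number of scalar elements in the homogeneous aggregate
/// built by \p InsertInst, or std::nullopt if the aggregate is not
/// homogeneous. For example, an insertvalue chain building
/// {<2 x float>, <2 x float>} yields 4, while a struct that mixes element
/// types is rejected.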
17125static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
17126 if (auto *IE = dyn_cast<InsertElementInst>(Val: InsertInst))
17127 return cast<FixedVectorType>(Val: IE->getType())->getNumElements();
17128
17129 unsigned AggregateSize = 1;
17130 auto *IV = cast<InsertValueInst>(Val: InsertInst);
17131 Type *CurrentType = IV->getType();
17132 do {
17133 if (auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
17134 for (auto *Elt : ST->elements())
17135 if (Elt != ST->getElementType(N: 0)) // check homogeneity
17136 return std::nullopt;
17137 AggregateSize *= ST->getNumElements();
17138 CurrentType = ST->getElementType(N: 0);
17139 } else if (auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
17140 AggregateSize *= AT->getNumElements();
17141 CurrentType = AT->getElementType();
17142 } else if (auto *VT = dyn_cast<FixedVectorType>(Val: CurrentType)) {
17143 AggregateSize *= VT->getNumElements();
17144 return AggregateSize;
17145 } else if (CurrentType->isSingleValueType()) {
17146 return AggregateSize;
17147 } else {
17148 return std::nullopt;
17149 }
17150 } while (true);
17151}
17152
17153static void findBuildAggregate_rec(Instruction *LastInsertInst,
17154 TargetTransformInfo *TTI,
17155 SmallVectorImpl<Value *> &BuildVectorOpds,
17156 SmallVectorImpl<Value *> &InsertElts,
17157 unsigned OperandOffset) {
17158 do {
17159 Value *InsertedOperand = LastInsertInst->getOperand(i: 1);
17160 std::optional<unsigned> OperandIndex =
17161 getInsertIndex(InsertInst: LastInsertInst, Offset: OperandOffset);
17162 if (!OperandIndex)
17163 return;
17164 if (isa<InsertElementInst, InsertValueInst>(Val: InsertedOperand)) {
17165 findBuildAggregate_rec(LastInsertInst: cast<Instruction>(Val: InsertedOperand), TTI,
17166 BuildVectorOpds, InsertElts, OperandOffset: *OperandIndex);
17167
17168 } else {
17169 BuildVectorOpds[*OperandIndex] = InsertedOperand;
17170 InsertElts[*OperandIndex] = LastInsertInst;
17171 }
17172 LastInsertInst = dyn_cast<Instruction>(Val: LastInsertInst->getOperand(i: 0));
17173 } while (LastInsertInst != nullptr &&
17174 isa<InsertValueInst, InsertElementInst>(Val: LastInsertInst) &&
17175 LastInsertInst->hasOneUse());
17176}
17177
17178/// Recognize construction of vectors like
17179/// %ra = insertelement <4 x float> poison, float %s0, i32 0
17180/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
17181/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
17182/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
17183/// starting from the last insertelement or insertvalue instruction.
17184///
17185/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
17186/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
17187/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
17188///
17189/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
17190///
17191/// \return true if it matches.
17192static bool findBuildAggregate(Instruction *LastInsertInst,
17193 TargetTransformInfo *TTI,
17194 SmallVectorImpl<Value *> &BuildVectorOpds,
17195 SmallVectorImpl<Value *> &InsertElts) {
17196
17197 assert((isa<InsertElementInst>(LastInsertInst) ||
17198 isa<InsertValueInst>(LastInsertInst)) &&
17199 "Expected insertelement or insertvalue instruction!");
17200
17201 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
17202 "Expected empty result vectors!");
17203
17204 std::optional<unsigned> AggregateSize = getAggregateSize(InsertInst: LastInsertInst);
17205 if (!AggregateSize)
17206 return false;
17207 BuildVectorOpds.resize(N: *AggregateSize);
17208 InsertElts.resize(N: *AggregateSize);
17209
17210 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, OperandOffset: 0);
17211 llvm::erase(C&: BuildVectorOpds, V: nullptr);
17212 llvm::erase(C&: InsertElts, V: nullptr);
17213 if (BuildVectorOpds.size() >= 2)
17214 return true;
17215
17216 return false;
17217}
17218
17219/// Try and get a reduction instruction from a phi node.
17220///
17221/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
17222/// if they come from either \p ParentBB or a containing loop latch.
17223///
17224/// \returns A candidate reduction value if possible, or \code nullptr \endcode
17225/// if not possible.
17226static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
17227 BasicBlock *ParentBB, LoopInfo *LI) {
17228 // There are situations where the reduction value is not dominated by the
17229 // reduction phi. Vectorizing such cases has been reported to cause
17230 // miscompiles. See PR25787.
17231 auto DominatedReduxValue = [&](Value *R) {
17232 return isa<Instruction>(Val: R) &&
17233 DT->dominates(A: P->getParent(), B: cast<Instruction>(Val: R)->getParent());
17234 };
17235
17236 Instruction *Rdx = nullptr;
17237
17238 // Return the incoming value if it comes from the same BB as the phi node.
17239 if (P->getIncomingBlock(i: 0) == ParentBB) {
17240 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
17241 } else if (P->getIncomingBlock(i: 1) == ParentBB) {
17242 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
17243 }
17244
17245 if (Rdx && DominatedReduxValue(Rdx))
17246 return Rdx;
17247
17248 // Otherwise, check whether we have a loop latch to look at.
17249 Loop *BBL = LI->getLoopFor(BB: ParentBB);
17250 if (!BBL)
17251 return nullptr;
17252 BasicBlock *BBLatch = BBL->getLoopLatch();
17253 if (!BBLatch)
17254 return nullptr;
17255
17256 // There is a loop latch, return the incoming value if it comes from
17257 // that. This reduction pattern occasionally turns up.
17258 if (P->getIncomingBlock(i: 0) == BBLatch) {
17259 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
17260 } else if (P->getIncomingBlock(i: 1) == BBLatch) {
17261 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
17262 }
17263
17264 if (Rdx && DominatedReduxValue(Rdx))
17265 return Rdx;
17266
17267 return nullptr;
17268}
17269
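/// Matches a reduction binary operation: either a plain binary operator or
/// one of the supported min/max intrinsics, extracting the two operands into
/// \p V0 and \p V1. E.g. '%m = call i32 @llvm.smax.i32(i32 %a, i32 %b)'
/// yields V0 == %a and V1 == %b.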
17270static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
17271 if (match(V: I, P: m_BinOp(L: m_Value(V&: V0), R: m_Value(V&: V1))))
17272 return true;
17273 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V&: V0), m_Value(V&: V1))))
17274 return true;
17275 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V&: V0), m_Value(V&: V1))))
17276 return true;
17277 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V&: V0), m_Value(V&: V1))))
17278 return true;
17279 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
17280 return true;
17281 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
17282 return true;
17283 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
17284 return true;
17285 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
17286 return true;
17287 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
17288 return true;
17289 return false;
17290}
17291
17292/// We could have an initial reduction that is not an add.
17293/// r *= v1 + v2 + v3 + v4
17294/// In such a case start looking for a tree rooted in the first '+'.
/// \returns the new root if found, which may be nullptr if not an instruction.
17296static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
17297 Instruction *Root) {
17298 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
17299 isa<IntrinsicInst>(Root)) &&
17300 "Expected binop, select, or intrinsic for reduction matching");
17301 Value *LHS =
17302 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root));
17303 Value *RHS =
17304 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root) + 1);
17305 if (LHS == Phi)
17306 return dyn_cast<Instruction>(Val: RHS);
17307 if (RHS == Phi)
17308 return dyn_cast<Instruction>(Val: LHS);
17309 return nullptr;
17310}
17311
/// \returns the first operand of \p I that does not match \p Phi. If the
/// operand is not an instruction, returns nullptr.
17314static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
17315 Value *Op0 = nullptr;
17316 Value *Op1 = nullptr;
17317 if (!matchRdxBop(I, V0&: Op0, V1&: Op1))
17318 return nullptr;
17319 return dyn_cast<Instruction>(Val: Op0 == Phi ? Op1 : Op0);
17320}
17321
/// \returns true if \p I is a candidate instruction for reduction vectorization.
17323static bool isReductionCandidate(Instruction *I) {
17324 bool IsSelect = match(V: I, P: m_Select(C: m_Value(), L: m_Value(), R: m_Value()));
17325 Value *B0 = nullptr, *B1 = nullptr;
17326 bool IsBinop = matchRdxBop(I, V0&: B0, V1&: B1);
17327 return IsBinop || IsSelect;
17328}
17329
17330bool SLPVectorizerPass::vectorizeHorReduction(
17331 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
17332 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
17333 if (!ShouldVectorizeHor)
17334 return false;
17335 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Val: Root);
17336
17337 if (Root->getParent() != BB || isa<PHINode>(Val: Root))
17338 return false;
17339
17340 // If we can find a secondary reduction root, use that instead.
17341 auto SelectRoot = [&]() {
17342 if (TryOperandsAsNewSeeds && isReductionCandidate(I: Root) &&
17343 HorizontalReduction::getRdxKind(V: Root) != RecurKind::None)
17344 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(Phi: P, Root))
17345 return NewRoot;
17346 return Root;
17347 };
17348
17349 // Start analysis starting from Root instruction. If horizontal reduction is
17350 // found, try to vectorize it. If it is not a horizontal reduction or
17351 // vectorization is not possible or not effective, and currently analyzed
17352 // instruction is a binary operation, try to vectorize the operands, using
17353 // pre-order DFS traversal order. If the operands were not vectorized, repeat
17354 // the same procedure considering each operand as a possible root of the
17355 // horizontal reduction.
  // Interrupt the process if the Root instruction itself was vectorized or all
  // sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
  // If a horizontal reduction was not matched or vectorized, we collect
  // instructions for possible later vectorization attempts.
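  // E.g. (hypothetical), for a root 'add (add %a, %b), (add %c, %d)' the whole
  // a+b+c+d reduction is tried first; if it is not vectorized, the root may be
  // recorded as a postponed seed and the inner adds are pushed onto the work
  // queue below to be retried as reduction roots.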
17360 std::queue<std::pair<Instruction *, unsigned>> Stack;
17361 Stack.emplace(args: SelectRoot(), args: 0);
17362 SmallPtrSet<Value *, 8> VisitedInstrs;
17363 bool Res = false;
17364 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
17365 if (R.isAnalyzedReductionRoot(I: Inst))
17366 return nullptr;
17367 if (!isReductionCandidate(I: Inst))
17368 return nullptr;
17369 HorizontalReduction HorRdx;
17370 if (!HorRdx.matchAssociativeReduction(R, Root: Inst, SE&: *SE, DL: *DL, TLI: *TLI))
17371 return nullptr;
17372 return HorRdx.tryToReduce(V&: R, DL: *DL, TTI, TLI: *TLI);
17373 };
17374 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
17375 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
17376 FutureSeed = getNonPhiOperand(I: Root, Phi: P);
17377 if (!FutureSeed)
17378 return false;
17379 }
17380 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
17381 // analysis is done separately.
17382 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Val: FutureSeed))
17383 PostponedInsts.push_back(Elt: FutureSeed);
17384 return true;
17385 };
17386
17387 while (!Stack.empty()) {
17388 Instruction *Inst;
17389 unsigned Level;
17390 std::tie(args&: Inst, args&: Level) = Stack.front();
17391 Stack.pop();
    // Do not try to analyze an instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while the stack was populated before that happened.
17395 if (R.isDeleted(I: Inst))
17396 continue;
17397 if (Value *VectorizedV = TryToReduce(Inst)) {
17398 Res = true;
17399 if (auto *I = dyn_cast<Instruction>(Val: VectorizedV)) {
17400 // Try to find another reduction.
17401 Stack.emplace(args&: I, args&: Level);
17402 continue;
17403 }
17404 } else {
17405 // We could not vectorize `Inst` so try to use it as a future seed.
17406 if (!TryAppendToPostponedInsts(Inst)) {
17407 assert(Stack.empty() && "Expected empty stack");
17408 break;
17409 }
17410 }
17411
17412 // Try to vectorize operands.
17413 // Continue analysis for the instruction from the same basic block only to
17414 // save compile time.
17415 if (++Level < RecursionMaxDepth)
17416 for (auto *Op : Inst->operand_values())
17417 if (VisitedInstrs.insert(Ptr: Op).second)
17418 if (auto *I = dyn_cast<Instruction>(Val: Op))
17419 // Do not try to vectorize CmpInst operands, this is done
17420 // separately.
17421 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(Val: I) &&
17422 !R.isDeleted(I) && I->getParent() == BB)
17423 Stack.emplace(args&: I, args&: Level);
17424 }
17425 return Res;
17426}
17427
17428bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
17429 BasicBlock *BB, BoUpSLP &R,
17430 TargetTransformInfo *TTI) {
17431 SmallVector<WeakTrackingVH> PostponedInsts;
17432 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
17433 Res |= tryToVectorize(Insts: PostponedInsts, R);
17434 return Res;
17435}
17436
17437bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
17438 BoUpSLP &R) {
17439 bool Res = false;
17440 for (Value *V : Insts)
17441 if (auto *Inst = dyn_cast<Instruction>(Val: V); Inst && !R.isDeleted(I: Inst))
17442 Res |= tryToVectorize(I: Inst, R);
17443 return Res;
17444}
17445
17446bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
17447 BasicBlock *BB, BoUpSLP &R) {
17448 if (!R.canMapToVector(T: IVI->getType()))
17449 return false;
17450
17451 SmallVector<Value *, 16> BuildVectorOpds;
17452 SmallVector<Value *, 16> BuildVectorInsts;
17453 if (!findBuildAggregate(LastInsertInst: IVI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts))
17454 return false;
17455
17456 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // An aggregate value is unlikely to be processed in a vector register.
17458 return tryToVectorizeList(VL: BuildVectorOpds, R);
17459}
17460
17461bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
17462 BasicBlock *BB, BoUpSLP &R) {
17463 SmallVector<Value *, 16> BuildVectorInsts;
17464 SmallVector<Value *, 16> BuildVectorOpds;
17465 SmallVector<int> Mask;
17466 if (!findBuildAggregate(LastInsertInst: IEI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts) ||
17467 (llvm::all_of(Range&: BuildVectorOpds, P: IsaPred<ExtractElementInst, UndefValue>) &&
17468 isFixedVectorShuffle(VL: BuildVectorOpds, Mask)))
17469 return false;
17470
17471 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
17472 return tryToVectorizeList(VL: BuildVectorInsts, R);
17473}
17474
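/// Sorts the instructions in \p Incoming with \p Comparator, groups adjacent
/// compatible instructions (as decided by \p AreCompatible), and tries to
/// vectorize each group via \p TryToVectorizeHelper; small leftover groups
/// are collected and retried together, possibly with smaller vector factors.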
17475template <typename T>
17476static bool tryToVectorizeSequence(
17477 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
17478 function_ref<bool(T *, T *)> AreCompatible,
17479 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
17480 bool MaxVFOnly, BoUpSLP &R) {
17481 bool Changed = false;
17482 // Sort by type, parent, operands.
17483 stable_sort(Incoming, Comparator);
17484
  // Try to vectorize elements based on their type.
17486 SmallVector<T *> Candidates;
17487 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
17488 // Look for the next elements with the same type, parent and operand
17489 // kinds.
17490 auto *SameTypeIt = IncIt;
17491 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
17492 ++SameTypeIt;
17493
17494 // Try to vectorize them.
17495 unsigned NumElts = (SameTypeIt - IncIt);
17496 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
17497 << NumElts << ")\n");
    // The vectorization is a 3-stage attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes at the
    // maximal register size first.
    // 2. Try to vectorize the remaining instructions with the same type, if
    // possible. This may produce better results than vectorizing only
    // instructions with the same/alternate opcodes.
    // 3. As a final attempt, try to vectorize all instructions with the
    // same/alternate ops only; this may result in some extra final
    // vectorization.
17507 if (NumElts > 1 &&
17508 TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
      // Success; start over because instructions might have been changed.
17510 Changed = true;
17511 } else {
      /// \returns the minimum number of elements that we will attempt to
      /// vectorize.
17514 auto GetMinNumElements = [&R](Value *V) {
17515 unsigned EltSize = R.getVectorElementSize(V);
17516 return std::max(a: 2U, b: R.getMaxVecRegSize() / EltSize);
17517 };
17518 if (NumElts < GetMinNumElements(*IncIt) &&
17519 (Candidates.empty() ||
17520 Candidates.front()->getType() == (*IncIt)->getType())) {
17521 Candidates.append(IncIt, std::next(IncIt, NumElts));
17522 }
17523 }
17524 // Final attempt to vectorize instructions with the same types.
17525 if (Candidates.size() > 1 &&
17526 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
17527 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success; start over because instructions might have been changed.
17529 Changed = true;
17530 } else if (MaxVFOnly) {
17531 // Try to vectorize using small vectors.
17532 for (auto *It = Candidates.begin(), *End = Candidates.end();
17533 It != End;) {
17534 auto *SameTypeIt = It;
17535 while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
17536 ++SameTypeIt;
17537 unsigned NumElts = (SameTypeIt - It);
17538 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
17539 /*MaxVFOnly=*/false))
17540 Changed = true;
17541 It = SameTypeIt;
17542 }
17543 }
17544 Candidates.clear();
17545 }
17546
17547 // Start over at the next instruction of a different type (or the end).
17548 IncIt = SameTypeIt;
17549 }
17550 return Changed;
17551}
17552
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the two cmps have the same/swapped predicates and
/// compatible corresponding operands. If IsCompatibility is false, the
/// function implements a strict weak ordering relation between the two cmp
/// instructions, returning true if the first instruction is "less" than the
/// second, i.e. its predicate is less than the predicate of the second or the
/// operand IDs are less than the operand IDs of the second cmp instruction.
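/// For example (hypothetical), 'icmp slt i32 %a, %b' and 'icmp sgt i32 %b, %a'
/// share the same base predicate after swapping, so compareCmp<true> treats
/// them as compatible.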
17560template <bool IsCompatibility>
17561static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
17562 const DominatorTree &DT) {
17563 assert(isValidElementType(V->getType()) &&
17564 isValidElementType(V2->getType()) &&
17565 "Expected valid element types only.");
17566 if (V == V2)
17567 return IsCompatibility;
17568 auto *CI1 = cast<CmpInst>(Val: V);
17569 auto *CI2 = cast<CmpInst>(Val: V2);
17570 if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() <
17571 CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
17572 return !IsCompatibility;
17573 if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() >
17574 CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
17575 return false;
17576 CmpInst::Predicate Pred1 = CI1->getPredicate();
17577 CmpInst::Predicate Pred2 = CI2->getPredicate();
17578 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(pred: Pred1);
17579 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(pred: Pred2);
17580 CmpInst::Predicate BasePred1 = std::min(a: Pred1, b: SwapPred1);
17581 CmpInst::Predicate BasePred2 = std::min(a: Pred2, b: SwapPred2);
17582 if (BasePred1 < BasePred2)
17583 return !IsCompatibility;
17584 if (BasePred1 > BasePred2)
17585 return false;
17586 // Compare operands.
17587 bool CI1Preds = Pred1 == BasePred1;
17588 bool CI2Preds = Pred2 == BasePred1;
17589 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
17590 auto *Op1 = CI1->getOperand(i_nocapture: CI1Preds ? I : E - I - 1);
17591 auto *Op2 = CI2->getOperand(i_nocapture: CI2Preds ? I : E - I - 1);
17592 if (Op1 == Op2)
17593 continue;
17594 if (Op1->getValueID() < Op2->getValueID())
17595 return !IsCompatibility;
17596 if (Op1->getValueID() > Op2->getValueID())
17597 return false;
17598 if (auto *I1 = dyn_cast<Instruction>(Val: Op1))
17599 if (auto *I2 = dyn_cast<Instruction>(Val: Op2)) {
17600 if (IsCompatibility) {
17601 if (I1->getParent() != I2->getParent())
17602 return false;
17603 } else {
17604 // Try to compare nodes with same parent.
17605 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(BB: I1->getParent());
17606 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(BB: I2->getParent());
17607 if (!NodeI1)
17608 return NodeI2 != nullptr;
17609 if (!NodeI2)
17610 return false;
17611 assert((NodeI1 == NodeI2) ==
17612 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
17613 "Different nodes should have different DFS numbers");
17614 if (NodeI1 != NodeI2)
17615 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
17616 }
17617 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI);
17618 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
17619 continue;
17620 if (IsCompatibility)
17621 return false;
17622 if (I1->getOpcode() != I2->getOpcode())
17623 return I1->getOpcode() < I2->getOpcode();
17624 }
17625 }
17626 return IsCompatibility;
17627}
17628
17629template <typename ItT>
17630bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
17631 BasicBlock *BB, BoUpSLP &R) {
17632 bool Changed = false;
17633 // Try to find reductions first.
17634 for (CmpInst *I : CmpInsts) {
17635 if (R.isDeleted(I))
17636 continue;
17637 for (Value *Op : I->operands())
17638 if (auto *RootOp = dyn_cast<Instruction>(Val: Op))
17639 Changed |= vectorizeRootInstruction(P: nullptr, Root: RootOp, BB, R, TTI);
17640 }
17641 // Try to vectorize operands as vector bundles.
17642 for (CmpInst *I : CmpInsts) {
17643 if (R.isDeleted(I))
17644 continue;
17645 Changed |= tryToVectorize(I, R);
17646 }
17647 // Try to vectorize list of compares.
17648 // Sort by type, compare predicate, etc.
17649 auto CompareSorter = [&](Value *V, Value *V2) {
17650 if (V == V2)
17651 return false;
17652 return compareCmp<false>(V, V2, TLI&: *TLI, DT: *DT);
17653 };
17654
17655 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
17656 if (V1 == V2)
17657 return true;
17658 return compareCmp<true>(V: V1, V2, TLI&: *TLI, DT: *DT);
17659 };
17660
17661 SmallVector<Value *> Vals;
17662 for (Instruction *V : CmpInsts)
17663 if (!R.isDeleted(I: V) && isValidElementType(Ty: V->getType()))
17664 Vals.push_back(Elt: V);
17665 if (Vals.size() <= 1)
17666 return Changed;
17667 Changed |= tryToVectorizeSequence<Value>(
17668 Vals, CompareSorter, AreCompatibleCompares,
17669 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
17670 // Exclude possible reductions from other blocks.
17671 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
17672 return any_of(V->users(), [V](User *U) {
17673 auto *Select = dyn_cast<SelectInst>(Val: U);
17674 return Select &&
17675 Select->getParent() != cast<Instruction>(Val: V)->getParent();
17676 });
17677 });
17678 if (ArePossiblyReducedInOtherBlock)
17679 return false;
17680 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
17681 },
17682 /*MaxVFOnly=*/true, R);
17683 return Changed;
17684}
17685
17686bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
17687 BasicBlock *BB, BoUpSLP &R) {
17688 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
17689 "This function only accepts Insert instructions");
17690 bool OpsChanged = false;
17691 SmallVector<WeakTrackingVH> PostponedInsts;
17692 // pass1 - try to vectorize reductions only
17693 for (auto *I : reverse(C&: Instructions)) {
17694 if (R.isDeleted(I))
17695 continue;
17696 OpsChanged |= vectorizeHorReduction(P: nullptr, Root: I, BB, R, TTI, PostponedInsts);
17697 }
17698 // pass2 - try to match and vectorize a buildvector sequence.
17699 for (auto *I : reverse(C&: Instructions)) {
17700 if (R.isDeleted(I) || isa<CmpInst>(Val: I))
17701 continue;
17702 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
17703 OpsChanged |= vectorizeInsertValueInst(IVI: LastInsertValue, BB, R);
17704 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
17705 OpsChanged |= vectorizeInsertElementInst(IEI: LastInsertElem, BB, R);
17706 }
17707 }
17708 // Now try to vectorize postponed instructions.
17709 OpsChanged |= tryToVectorize(Insts: PostponedInsts, R);
17710
17711 Instructions.clear();
17712 return OpsChanged;
17713}
17714
17715bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
17716 bool Changed = false;
17717 SmallVector<Value *, 4> Incoming;
17718 SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. This helps to better identify the chains that can be profitably
  // vectorized.
17722 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
17723 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
17724 assert(isValidElementType(V1->getType()) &&
17725 isValidElementType(V2->getType()) &&
17726 "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers; we don't care about other types.
17729 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
17730 return true;
17731 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
17732 return false;
17733 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
17734 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
17735 if (Opcodes1.size() < Opcodes2.size())
17736 return true;
17737 if (Opcodes1.size() > Opcodes2.size())
17738 return false;
17739 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
17740 {
17741 // Instructions come first.
17742 auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]);
17743 auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I]);
17744 if (I1 && I2) {
17745 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent());
17746 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent());
17747 if (!NodeI1)
17748 return NodeI2 != nullptr;
17749 if (!NodeI2)
17750 return false;
17751 assert((NodeI1 == NodeI2) ==
17752 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
17753 "Different nodes should have different DFS numbers");
17754 if (NodeI1 != NodeI2)
17755 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
17756 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
17757 if (S.getOpcode() && !S.isAltShuffle())
17758 continue;
17759 return I1->getOpcode() < I2->getOpcode();
17760 }
17761 if (I1)
17762 return true;
17763 if (I2)
17764 return false;
17765 }
17766 {
17767 // Non-undef constants come next.
17768 bool C1 = isa<Constant>(Val: Opcodes1[I]) && !isa<UndefValue>(Val: Opcodes1[I]);
17769 bool C2 = isa<Constant>(Val: Opcodes2[I]) && !isa<UndefValue>(Val: Opcodes2[I]);
17770 if (C1 && C2)
17771 continue;
17772 if (C1)
17773 return true;
17774 if (C2)
17775 return false;
17776 }
17777 bool U1 = isa<UndefValue>(Val: Opcodes1[I]);
17778 bool U2 = isa<UndefValue>(Val: Opcodes2[I]);
17779 {
17780 // Non-constant non-instructions come next.
17781 if (!U1 && !U2) {
17782 auto ValID1 = Opcodes1[I]->getValueID();
17783 auto ValID2 = Opcodes2[I]->getValueID();
17784 if (ValID1 == ValID2)
17785 continue;
17786 if (ValID1 < ValID2)
17787 return true;
17788 if (ValID1 > ValID2)
17789 return false;
17790 }
17791 if (!U1)
17792 return true;
17793 if (!U2)
17794 return false;
17795 }
17796 // Undefs come last.
17797 assert(U1 && U2 && "The only thing left should be undef & undef.");
17798 continue;
17799 }
17800 return false;
17801 };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (I1->getParent() != I2->getParent())
            return false;
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S.getOpcode())
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      PHINode *P = dyn_cast<PHINode>(&I);
      if (!P)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is
  // true, also vectorizes `PostProcessCmps`.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if `I` is an instruction whose result is unused, such as a
  // terminator, a store, or a call with an ignored return value. Only CallInst
  // and InvokeInst may have a non-void type here; any other instruction must
  // be void-typed to qualify.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable vector type. The number of elements is
    // unknown at compile time for such types.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(It))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
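        // For example, a chain of fast-math fadds feeding the PHI across the
        // loop back-edge:
        //   %sum = phi float [ 0.0, %entry ], [ %sum.3, %loop ]
        //   %sum.0 = fadd fast float %sum, %a0
        //   %sum.1 = fadd fast float %sum.0, %a1
        //   %sum.2 = fadd fast float %sum.1, %a2
        //   %sum.3 = fadd fast float %sum.2, %a3
        // is the kind of horizontal reduction matched here, rooted at %sum.3.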
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        // TODO: Collect the skipped incoming values and try to vectorize them
        // after processing BB.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI))
          Changed |= vectorizeRootInstruction(nullptr, PI,
                                              P->getIncomingBlock(I), R, TTI);
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize the chain in the store, if this is the only store
        // to the address in the block.
        // TODO: This is just a temporary solution to save compile time. Need
        // to investigate if we can safely turn on slp-vectorize-hor-store
        // instead to allow lookup for reduction chains in all non-vectorized
        // stores (need to check side effects and compile time).
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
        }
      }
      // Start vectorization of the post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}

bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done. We
    // are trying to vectorize the index computations, so the maximum number of
    // elements is based on the size of the index expression, rather than the
    // size of the GEP itself (the target's pointer size).
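    // For example, with a 128-bit vector register and i64 index expressions,
    // MaxElts = 128 / 64 = 2, so the GEP list is processed two at a time.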
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them (they are then marked as deleted), or their
      // index may have been folded to a constant. Remove such getelementptrs
      // from the set of candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
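      // For example, for
      //   %g0 = getelementptr inbounds i32, ptr %p, i64 %i
      //   %g1 = getelementptr inbounds i32, ptr %p, i64 %i.next
      // where %i.next = add i64 %i, 1, the SCEV difference of the two
      // addresses is the constant 4, so both getelementptrs are removed from
      // the candidate set.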
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}

bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointer and value operand. Value operands must be
  // compatible (have the same opcode and the same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
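  // For example, two stores whose stored values are 'add i32' instructions
  // from the same block compare as compatible and end up grouped next to each
  // other after sorting.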
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        if (S.getOpcode())
          return false;
        return I1->getOpcode() < I2->getOpcode();
      }
    if (isa<Constant>(V->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse the stores to perform a bottom-to-top analysis. This matters
    // when the same address is stored to several times; in that case we must
    // follow the order of the stores (reversed, to respect the memory
    // dependencies).
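    // For example, if a block contains
    //   store i32 %a, ptr %p
    //   store i32 %b, ptr %p
    // the reversed list visits the store of %b first.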
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}