//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number"));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
    "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
// even if we match a reduction but do not vectorize in the end.
static cl::opt<bool> AllowHorRdxIdenityOptimization(
    "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
    cl::desc("Allow optimization of original scalar identity operations on "
             "matched horizontal reductions."));

static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when it is probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand ordering this is used less frequently, hence
// the impact of a higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is a runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.
static constexpr int UsesLimit = 8;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important things to filter here are types which are invalid in
/// LLVM vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of the vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug output.
static std::string shortBundleName(ArrayRef<Value *> VL) {
  std::string Result;
  raw_string_ostream OS(Result);
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  OS.flush();
  return Result;
}
#endif

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  if (all_of(VL, isVectorLikeInstWithConstOps))
    return true;

  BasicBlock *BB = I0->getParent();
  for (int I = 1, E = VL.size(); I < E; I++) {
    auto *II = dyn_cast<Instruction>(VL[I]);
    if (!II)
      return false;

    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal
  // integer/FP constants.
  return all_of(VL, isConstant);
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
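/// For example, {%a, undef, %a} is a splat, while {%a, %b, %a} is not, and
/// neither is an all-undef list (there is no non-undef value to splat).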
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}

/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
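/// For example, a `sub i32 %a, %b` whose only users are `icmp eq/ne` against
/// zero (or suitable calls to llvm.abs) is treated as commutative, because
/// swapping the subtraction operands does not change the result of such users.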
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  ICmpInst::Predicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}

/// \returns inserting index of InsertElement or InsertValue instruction,
/// using Offset as base offset for index.
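/// For example, `insertelement <4 x i32> %v, i32 %x, i32 3` (with Offset 0)
/// yields index 3, and `insertvalue [2 x [2 x i32]] %agg, i32 %x, 1, 0`
/// yields index 1 * 2 + 0 = 2.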
static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
                                              unsigned Offset = 0) {
  int Index = Offset;
  if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }

  const auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}

namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg,    ///< The mask is expected to be for permutation of 1-2 vectors,
               ///< check for the mask elements for the first argument (mask
               ///< indices are in range [0:VF)).
  SecondArg,   ///< The mask is expected to be for permutation of 2 vectors, check
               ///< for the mask elements for the second argument (mask indices
               ///< are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as used. Non-undef
               ///< elements are considered unused since they're already marked
               ///< as used in the mask.
};
} // namespace

/// Prepares a use bitset for the given mask either for the first argument or
/// for the second.
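/// For example, with VF = 4 and Mask = <0, 5, 1, poison>, UseMask::FirstArg
/// clears bits 0 and 1 (those lanes of the first vector are used), while
/// UseMask::SecondArg clears only bit 1 (element 5 maps to lane 1 of the
/// second vector).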
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}

/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getInsertIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}

/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
///                                                         i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  auto *EI0 = cast<ExtractElementInst>(*It);
  if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
    return std::nullopt;
  unsigned Size =
      cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
      return std::nullopt;
    if (isa<UndefValue>(EI->getIndexOperand()))
      continue;
    auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
    if (!Idx)
      return std::nullopt;
    // Undefined behavior if Idx is negative or >= Size.
    if (Idx->getValue().uge(Size))
      continue;
    unsigned IntIdx = Idx->getValue().getZExtValue();
    Mask[I] = IntIdx;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (IntIdx != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector; otherwise
  // we have a permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}

/// \returns the constant index if the Extract{Value,Element} instruction \p E
/// extracts a single known element, std::nullopt otherwise.
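/// For example, `extractelement <4 x i8> %v, i32 2` yields 2 and
/// `extractvalue {i32, i64} %agg, 1` yields 1; a non-constant index or an
/// extractvalue with multiple indices yields std::nullopt.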
static std::optional<unsigned> getExtractIndex(Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}

namespace {

/// Main data required for vectorization of instructions.
struct InstructionsState {
  /// The very first instruction in the list with the main opcode.
  Value *OpValue = nullptr;

  /// The main/alternate instruction.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const {
    return MainOp ? MainOp->getOpcode() : 0;
  }

  unsigned getAltOpcode() const {
    return AltOp ? AltOp->getOpcode() : 0;
  }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return AltOp != MainOp; }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  InstructionsState() = delete;
  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
};

} // end anonymous namespace

/// Chooses the correct key for scheduling data. If \p Op has the same (or
/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
/// OpValue.
static Value *isOneOf(const InstructionsState &S, Value *Op) {
  auto *I = dyn_cast<Instruction>(Op);
  if (I && S.isOpcodeOrAlt(I))
    return Op;
  return S.OpValue;
}

/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// An example of an unsupported opcode is SDIV, which can potentially cause UB
/// if the "shuffled out" lane would result in division by zero.
static bool isValidForAlternation(unsigned Opcode) {
  if (Instruction::isIntDivRem(Opcode))
    return false;

  return true;
}

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex = 0);

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
         getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
}

/// \returns true if a compare instruction \p CI has similar "look" and
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
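/// For example, `icmp sgt i32 %b, %a` is considered the same as a base
/// `icmp slt i32 %a, %b`, since both the predicate and the operands are
/// swapped.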
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}

/// \returns an analysis of the instructions in \p VL described in
/// InstructionsState, i.e. the opcode with which we suppose the whole list
/// could be vectorized even if its structure is diverse.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI,
                                       unsigned BaseIndex) {
  // Make sure these are all Instructions.
  if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);

  bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
  bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
  bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
  CmpInst::Predicate BasePred =
      IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
              : CmpInst::BAD_ICMP_PREDICATE;
  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
  unsigned AltOpcode = Opcode;
  unsigned AltIndex = BaseIndex;

  bool SwappedPredsCompatible = [&]() {
    if (!IsCmpOp)
      return false;
    SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);
      if (!I)
        return false;
      CmpInst::Predicate CurrentPred = I->getPredicate();
      CmpInst::Predicate SwappedCurrentPred =
          CmpInst::getSwappedPredicate(CurrentPred);
      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    // If the total number of predicates is > 2, but only 2 remain when swapped
    // predicates are treated as compatible, consider the swappable predicates
    // as compatible opcodes, not alternate.
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
  }();
  // Check for one alternate opcode from another BinaryOperator.
  // TODO - generalize to support all operators (types, calls etc.).
  auto *IBase = cast<Instruction>(VL[BaseIndex]);
  Intrinsic::ID BaseID = 0;
  SmallVector<VFInfo> BaseMappings;
  if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
    BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI);
    BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
    if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
      return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }
  for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
    auto *I = cast<Instruction>(VL[Cnt]);
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
          isValidForAlternation(Opcode)) {
        AltOpcode = InstOpcode;
        AltIndex = Cnt;
        continue;
      }
    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op0 = IBase->getOperand(0);
      Type *Ty0 = Op0->getType();
      Value *Op1 = I->getOperand(0);
      Type *Ty1 = Op1->getType();
      if (Ty0 == Ty1) {
        if (InstOpcode == Opcode || InstOpcode == AltOpcode)
          continue;
        if (Opcode == AltOpcode) {
          assert(isValidForAlternation(Opcode) &&
                 isValidForAlternation(InstOpcode) &&
                 "Cast isn't safe for alternation, logic needs to be updated!");
          AltOpcode = InstOpcode;
          AltIndex = Cnt;
          continue;
        }
      }
    } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();
      if (Ty0 == Ty1) {
        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        // Check for compatible operands. If the corresponding operands are not
        // compatible, we need to perform alternate vectorization.
        CmpInst::Predicate CurrentPred = Inst->getPredicate();
        CmpInst::Predicate SwappedCurrentPred =
            CmpInst::getSwappedPredicate(CurrentPred);

        if ((E == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
          continue;
        auto *AltInst = cast<CmpInst>(VL[AltIndex]);
        if (AltIndex != BaseIndex) {
          if (isCmpSameOrSwapped(AltInst, Inst, TLI))
            continue;
        } else if (BasePred != CurrentPred) {
          assert(
              isValidForAlternation(InstOpcode) &&
              "CmpInst isn't safe for alternation, logic needs to be updated!");
          AltIndex = Cnt;
          continue;
        }
        CmpInst::Predicate AltPred = AltInst->getPredicate();
        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;
      }
    } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
            Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
        if (!isVectorLikeInstWithConstOps(EI))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(IBase);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(IBase);
        if (Call->getCalledFunction() != CallBase->getCalledFunction())
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
                        CallBase->op_begin() +
                            CallBase->getBundleOperandsStartIndex()))
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI);
        if (ID != BaseID)
          return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        if (!ID) {
          SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
          if (Mappings.size() != BaseMappings.size() ||
              Mappings.front().ISA != BaseMappings.front().ISA ||
              Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
              Mappings.front().VectorName != BaseMappings.front().VectorName ||
              Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
              Mappings.front().Shape.Parameters !=
                  BaseMappings.front().Shape.Parameters)
            return InstructionsState(VL[BaseIndex], nullptr, nullptr);
        }
      }
      continue;
    }
    return InstructionsState(VL[BaseIndex], nullptr, nullptr);
  }

  return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
                           cast<Instruction>(VL[AltIndex]));
}

/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL.front()->getType();
  return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
}

/// \returns True if in-tree use also needs extract. This refers to a possible
/// scalar operand in a vectorized instruction.
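/// For example, if \p Scalar is the pointer operand of a store \p UserInst,
/// the store still needs the scalar pointer after vectorization, so an
/// extract is required.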
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                        TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    return any_of(enumerate(CI->args()), [&](auto &&Arg) {
      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
             Arg.value().get() == Scalar;
    });
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

/// Shuffles \p Mask in accordance with the given \p SubMask.
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
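/// A rough illustration: with Mask = {3, 2, 1, 0} and SubMask = {1, 3, -1, -1}
/// (poison as -1), the resulting mask is {2, 0, -1, -1}, i.e. SubMask selects
/// positions within the already-shuffled Mask.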
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() &&
        std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
                    [](int Idx) { return Idx == PoisonMaskElem; }))) &&
      "SubMask with many inputs support must be larger than the mask.");
  if (Mask.empty()) {
    Mask.append(SubMask.begin(), SubMask.end());
    return;
  }
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}

/// Order may have elements assigned a special value (size) which is out of
/// bounds. Such indices only appear in places which correspond to undef values
/// (see canReuseExtract for details) and are used to keep undef values from
/// affecting the operand ordering.
/// The first loop below simply finds all unused indices and then the next loop
/// nest assigns these indices for undef values positions.
/// As an example below Order has two undef positions and they have assigned
/// values 3 and 7 respectively:
/// before: 6 9 5 4 9 2 1 0
/// after:  6 3 5 4 7 2 1 0
static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
  const unsigned Sz = Order.size();
  SmallBitVector UnusedIndices(Sz, /*t=*/true);
  SmallBitVector MaskedIndices(Sz);
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      UnusedIndices.reset(Order[I]);
    else
      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;
  assert(UnusedIndices.count() == MaskedIndices.count() &&
         "Non-synced masked/available indices.");
  int Idx = UnusedIndices.find_first();
  int MIdx = MaskedIndices.find_first();
  while (MIdx >= 0) {
    assert(Idx >= 0 && "Indices must be synced.");
    Order[MIdx] = Idx;
    Idx = UnusedIndices.find_next(Idx);
    MIdx = MaskedIndices.find_next(MIdx);
  }
}

namespace llvm {

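/// Builds \p Mask as the inverse of the permutation given by \p Indices, i.e.
/// Mask[Indices[I]] = I for every position I. For example, Indices = {2, 0, 1}
/// produces Mask = {1, 2, 0}.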
static void inversePermutation(ArrayRef<unsigned> Indices,
                               SmallVectorImpl<int> &Mask) {
  Mask.clear();
  const unsigned E = Indices.size();
  Mask.resize(E, PoisonMaskElem);
  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
}

/// Reorders the list of scalars in accordance with the given \p Mask.
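/// For example, Scalars = {%a, %b, %c, %d} with Mask = {3, 0, 1, 2} becomes
/// {%b, %c, %d, %a} (each element moves to the lane given by its mask entry).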
965static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
966 ArrayRef<int> Mask) {
967 assert(!Mask.empty() && "Expected non-empty mask.");
968 SmallVector<Value *> Prev(Scalars.size(),
969 UndefValue::get(T: Scalars.front()->getType()));
970 Prev.swap(RHS&: Scalars);
971 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
972 if (Mask[I] != PoisonMaskElem)
973 Scalars[Mask[I]] = Prev[I];
974}
975
976/// Checks if the provided value does not require scheduling. It does not
977/// require scheduling if this is not an instruction or it is an instruction
978/// that does not read/write memory and all operands are either not instructions
979/// or phi nodes or instructions from different blocks.
980static bool areAllOperandsNonInsts(Value *V) {
981 auto *I = dyn_cast<Instruction>(Val: V);
982 if (!I)
983 return true;
984 return !mayHaveNonDefUseDependency(I: *I) &&
985 all_of(Range: I->operands(), P: [I](Value *V) {
986 auto *IO = dyn_cast<Instruction>(Val: V);
987 if (!IO)
988 return true;
989 return isa<PHINode>(Val: IO) || IO->getParent() != I->getParent();
990 });
991}
992
993/// Checks if the provided value does not require scheduling. It does not
994/// require scheduling if this is not an instruction or it is an instruction
995/// that does not read/write memory and all users are phi nodes or instructions
996/// from the different blocks.
997static bool isUsedOutsideBlock(Value *V) {
998 auto *I = dyn_cast<Instruction>(Val: V);
999 if (!I)
1000 return true;
1001 // Limits the number of uses to save compile time.
1002 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(N: UsesLimit) &&
1003 all_of(Range: I->users(), P: [I](User *U) {
1004 auto *IU = dyn_cast<Instruction>(Val: U);
1005 if (!IU)
1006 return true;
1007 return IU->getParent() != I->getParent() || isa<PHINode>(Val: IU);
1008 });
1009}
1010
1011/// Checks if the specified value does not require scheduling. It does not
1012/// require scheduling if all operands and all users do not need to be scheduled
1013/// in the current basic block.
1014static bool doesNotNeedToBeScheduled(Value *V) {
1015 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1016}
1017
1018/// Checks if the specified array of instructions does not require scheduling.
1019/// It is so if all either instructions have operands that do not require
1020/// scheduling or their users do not require scheduling since they are phis or
1021/// in other basic blocks.
1022static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1023 return !VL.empty() &&
1024 (all_of(Range&: VL, P: isUsedOutsideBlock) || all_of(Range&: VL, P: areAllOperandsNonInsts));
1025}
1026
1027namespace slpvectorizer {
1028
1029/// Bottom Up SLP Vectorizer.
1030class BoUpSLP {
1031 struct TreeEntry;
1032 struct ScheduleData;
1033 class ShuffleCostEstimator;
1034 class ShuffleInstructionBuilder;
1035
1036public:
1037 /// Tracks the state we can represent the loads in the given sequence.
1038 enum class LoadsState {
1039 Gather,
1040 Vectorize,
1041 ScatterVectorize,
1042 StridedVectorize
1043 };
1044
1045 using ValueList = SmallVector<Value *, 8>;
1046 using InstrList = SmallVector<Instruction *, 16>;
1047 using ValueSet = SmallPtrSet<Value *, 16>;
1048 using StoreList = SmallVector<StoreInst *, 8>;
1049 using ExtraValueToDebugLocsMap =
1050 MapVector<Value *, SmallVector<Instruction *, 2>>;
1051 using OrdersType = SmallVector<unsigned, 4>;
1052
1053 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1054 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1055 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1056 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1057 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1058 AC(AC), DB(DB), DL(DL), ORE(ORE),
1059 Builder(Se->getContext(), TargetFolder(*DL)) {
1060 CodeMetrics::collectEphemeralValues(L: F, AC, EphValues);
1061 // Use the vector register size specified by the target unless overridden
1062 // by a command-line option.
1063 // TODO: It would be better to limit the vectorization factor based on
1064 // data type rather than just register size. For example, x86 AVX has
1065 // 256-bit registers, but it does not support integer operations
1066 // at that width (that requires AVX2).
1067 if (MaxVectorRegSizeOption.getNumOccurrences())
1068 MaxVecRegSize = MaxVectorRegSizeOption;
1069 else
1070 MaxVecRegSize =
1071 TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
1072 .getFixedValue();
1073
1074 if (MinVectorRegSizeOption.getNumOccurrences())
1075 MinVecRegSize = MinVectorRegSizeOption;
1076 else
1077 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1078 }
1079
1080 /// Vectorize the tree that starts with the elements in \p VL.
1081 /// Returns the vectorized root.
1082 Value *vectorizeTree();
1083
1084 /// Vectorize the tree but with the list of externally used values \p
1085 /// ExternallyUsedValues. Values in this MapVector can be replaced but the
1086 /// generated extractvalue instructions.
1087 /// \param ReplacedExternals containd list of replaced external values
1088 /// {scalar, replace} after emitting extractelement for external uses.
1089 Value *
1090 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1091 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1092 Instruction *ReductionRoot = nullptr);
1093
1094 /// \returns the cost incurred by unwanted spills and fills, caused by
1095 /// holding live values over call sites.
1096 InstructionCost getSpillCost() const;
1097
1098 /// \returns the vectorization cost of the subtree that starts at \p VL.
1099 /// A negative number means that this is profitable.
1100 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1101
1102 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1103 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1104 void buildTree(ArrayRef<Value *> Roots,
1105 const SmallDenseSet<Value *> &UserIgnoreLst);
1106
1107 /// Construct a vectorizable tree that starts at \p Roots.
1108 void buildTree(ArrayRef<Value *> Roots);
1109
1110 /// Returns whether the root node has in-tree uses.
1111 bool doesRootHaveInTreeUses() const {
1112 return !VectorizableTree.empty() &&
1113 !VectorizableTree.front()->UserTreeIndices.empty();
1114 }
1115
1116 /// Return the scalars of the root node.
1117 ArrayRef<Value *> getRootNodeScalars() const {
1118 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1119 return VectorizableTree.front()->Scalars;
1120 }
1121
1122 /// Builds external uses of the vectorized scalars, i.e. the list of
1123 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1124 /// ExternallyUsedValues contains additional list of external uses to handle
1125 /// vectorization of reductions.
1126 void
1127 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1128
1129 /// Transforms graph nodes to target specific representations, if profitable.
1130 void transformNodes();
1131
1132 /// Clear the internal data structures that are created by 'buildTree'.
1133 void deleteTree() {
1134 VectorizableTree.clear();
1135 ScalarToTreeEntry.clear();
1136 MultiNodeScalars.clear();
1137 MustGather.clear();
1138 EntryToLastInstruction.clear();
1139 ExternalUses.clear();
1140 ExternalUsesAsGEPs.clear();
1141 for (auto &Iter : BlocksSchedules) {
1142 BlockScheduling *BS = Iter.second.get();
1143 BS->clear();
1144 }
1145 MinBWs.clear();
1146 ReductionBitWidth = 0;
1147 CastMaxMinBWSizes.reset();
1148 ExtraBitWidthNodes.clear();
1149 InstrElementSize.clear();
1150 UserIgnoreList = nullptr;
1151 PostponedGathers.clear();
1152 ValueToGatherNodes.clear();
1153 }
1154
1155 unsigned getTreeSize() const { return VectorizableTree.size(); }
1156
1157 /// Perform LICM and CSE on the newly generated gather sequences.
1158 void optimizeGatherSequence();
1159
1160 /// Checks if the specified gather tree entry \p TE can be represented as a
1161 /// shuffled vector entry + (possibly) permutation with other gathers. It
1162 /// implements the checks only for possibly ordered scalars (Loads,
1163 /// ExtractElement, ExtractValue), which can be part of the graph.
1164 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1165
1166 /// Sort loads into increasing pointers offsets to allow greater clustering.
1167 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1168
1169 /// Gets reordering data for the given tree entry. If the entry is vectorized
1170 /// - just return ReorderIndices, otherwise check if the scalars can be
1171 /// reordered and return the most optimal order.
1172 /// \return std::nullopt if ordering is not important, empty order, if
1173 /// identity order is important, or the actual order.
1174 /// \param TopToBottom If true, include the order of vectorized stores and
1175 /// insertelement nodes, otherwise skip them.
1176 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1177 bool TopToBottom);
1178
1179 /// Reorders the current graph to the most profitable order starting from the
1180 /// root node to the leaf nodes. The best order is chosen only from the nodes
1181 /// of the same size (vectorization factor). Smaller nodes are considered
1182 /// parts of subgraph with smaller VF and they are reordered independently. We
1183 /// can make it because we still need to extend smaller nodes to the wider VF
1184 /// and we can merge reordering shuffles with the widening shuffles.
1185 void reorderTopToBottom();
1186
1187 /// Reorders the current graph to the most profitable order starting from
1188 /// leaves to the root. It allows to rotate small subgraphs and reduce the
1189 /// number of reshuffles if the leaf nodes use the same order. In this case we
1190 /// can merge the orders and just shuffle user node instead of shuffling its
1191 /// operands. Plus, even the leaf nodes have different orders, it allows to
1192 /// sink reordering in the graph closer to the root node and merge it later
1193 /// during analysis.
1194 void reorderBottomToTop(bool IgnoreReorder = false);
1195
1196 /// \return The vector element size in bits to use when vectorizing the
1197 /// expression tree ending at \p V. If V is a store, the size is the width of
1198 /// the stored value. Otherwise, the size is the width of the largest loaded
1199 /// value reaching V. This method is used by the vectorizer to calculate
1200 /// vectorization factors.
1201 unsigned getVectorElementSize(Value *V);
1202
1203 /// Compute the minimum type sizes required to represent the entries in a
1204 /// vectorizable tree.
1205 void computeMinimumValueSizes();
1206
1207 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1208 unsigned getMaxVecRegSize() const {
1209 return MaxVecRegSize;
1210 }
1211
1212 // \returns minimum vector register size as set by cl::opt.
1213 unsigned getMinVecRegSize() const {
1214 return MinVecRegSize;
1215 }
1216
1217 unsigned getMinVF(unsigned Sz) const {
1218 return std::max(a: 2U, b: getMinVecRegSize() / Sz);
1219 }
1220
1221 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1222 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1223 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1224 return MaxVF ? MaxVF : UINT_MAX;
1225 }
1226
1227 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1228 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1229 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1230 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1231 ///
1232 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1233 unsigned canMapToVector(Type *T) const;
1234
1235 /// \returns True if the VectorizableTree is both tiny and not fully
1236 /// vectorizable. We do not vectorize such trees.
1237 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1238
1239 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1240 /// can be load combined in the backend. Load combining may not be allowed in
1241 /// the IR optimizer, so we do not want to alter the pattern. For example,
1242 /// partially transforming a scalar bswap() pattern into vector code is
1243 /// effectively impossible for the backend to undo.
1244 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1245 /// may not be necessary.
1246 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1247
1248 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1249 /// can be load combined in the backend. Load combining may not be allowed in
1250 /// the IR optimizer, so we do not want to alter the pattern. For example,
1251 /// partially transforming a scalar bswap() pattern into vector code is
1252 /// effectively impossible for the backend to undo.
1253 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1254 /// may not be necessary.
1255 bool isLoadCombineCandidate() const;
1256
1257 /// Checks if the given array of loads can be represented as a vectorized,
1258 /// scatter or just simple gather.
1259 /// \param VL list of loads.
1260 /// \param VL0 main load value.
1261 /// \param Order returned order of load instructions.
1262 /// \param PointerOps returned list of pointer operands.
1263 /// \param TryRecursiveCheck used to check if long masked gather can be
1264 /// represented as a serie of loads/insert subvector, if profitable.
1265 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1266 SmallVectorImpl<unsigned> &Order,
1267 SmallVectorImpl<Value *> &PointerOps,
1268 bool TryRecursiveCheck = true) const;
1269
1270 OptimizationRemarkEmitter *getORE() { return ORE; }
1271
1272 /// This structure holds any data we need about the edges being traversed
1273 /// during buildTree_rec(). We keep track of:
1274 /// (i) the user TreeEntry index, and
1275 /// (ii) the index of the edge.
1276 struct EdgeInfo {
1277 EdgeInfo() = default;
1278 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1279 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1280 /// The user TreeEntry.
1281 TreeEntry *UserTE = nullptr;
1282 /// The operand index of the use.
1283 unsigned EdgeIdx = UINT_MAX;
1284#ifndef NDEBUG
1285 friend inline raw_ostream &operator<<(raw_ostream &OS,
1286 const BoUpSLP::EdgeInfo &EI) {
1287 EI.dump(OS);
1288 return OS;
1289 }
1290 /// Debug print.
1291 void dump(raw_ostream &OS) const {
1292 OS << "{User:" << (UserTE ? std::to_string(val: UserTE->Idx) : "null")
1293 << " EdgeIdx:" << EdgeIdx << "}";
1294 }
1295 LLVM_DUMP_METHOD void dump() const { dump(OS&: dbgs()); }
1296#endif
1297 bool operator == (const EdgeInfo &Other) const {
1298 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1299 }
1300 };
1301
1302 /// A helper class used for scoring candidates for two consecutive lanes.
1303 class LookAheadHeuristics {
1304 const TargetLibraryInfo &TLI;
1305 const DataLayout &DL;
1306 ScalarEvolution &SE;
1307 const BoUpSLP &R;
1308 int NumLanes; // Total number of lanes (aka vectorization factor).
1309 int MaxLevel; // The maximum recursion depth for accumulating score.
1310
1311 public:
1312 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1313 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1314 int MaxLevel)
1315 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1316 MaxLevel(MaxLevel) {}
1317
1318 // The hard-coded scores listed here are not very important, though it shall
1319 // be higher for better matches to improve the resulting cost. When
1320 // computing the scores of matching one sub-tree with another, we are
1321 // basically counting the number of values that are matching. So even if all
1322 // scores are set to 1, we would still get a decent matching result.
1323 // However, sometimes we have to break ties. For example we may have to
1324 // choose between matching loads vs matching opcodes. This is what these
1325 // scores are helping us with: they provide the order of preference. Also,
1326 // this is important if the scalar is externally used or used in another
1327 // tree entry node in the different lane.
1328
1329 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1330 static const int ScoreConsecutiveLoads = 4;
1331 /// The same load multiple times. This should have a better score than
1332 /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it
1333 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1334 /// a vector load and 1.0 for a broadcast.
1335 static const int ScoreSplatLoads = 3;
1336 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1337 static const int ScoreReversedLoads = 3;
1338 /// A load candidate for masked gather.
1339 static const int ScoreMaskedGatherCandidate = 1;
1340 /// ExtractElementInst from same vector and consecutive indexes.
1341 static const int ScoreConsecutiveExtracts = 4;
1342 /// ExtractElementInst from same vector and reversed indices.
1343 static const int ScoreReversedExtracts = 3;
1344 /// Constants.
1345 static const int ScoreConstants = 2;
1346 /// Instructions with the same opcode.
1347 static const int ScoreSameOpcode = 2;
1348 /// Instructions with alt opcodes (e.g, add + sub).
1349 static const int ScoreAltOpcodes = 1;
1350 /// Identical instructions (a.k.a. splat or broadcast).
1351 static const int ScoreSplat = 1;
1352 /// Matching with an undef is preferable to failing.
1353 static const int ScoreUndef = 1;
1354 /// Score for failing to find a decent match.
1355 static const int ScoreFail = 0;
1356 /// Score if all users are vectorized.
1357 static const int ScoreAllUserVectorized = 1;
1358
1359 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1360 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1361 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1362 /// MainAltOps.
1363 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1364 ArrayRef<Value *> MainAltOps) const {
1365 if (!isValidElementType(Ty: V1->getType()) ||
1366 !isValidElementType(Ty: V2->getType()))
1367 return LookAheadHeuristics::ScoreFail;
1368
1369 if (V1 == V2) {
1370 if (isa<LoadInst>(Val: V1)) {
1371 // Retruns true if the users of V1 and V2 won't need to be extracted.
1372 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1373 // Bail out if we have too many uses to save compilation time.
1374 if (V1->hasNUsesOrMore(N: UsesLimit) || V2->hasNUsesOrMore(N: UsesLimit))
1375 return false;
1376
1377 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1378 return llvm::all_of(Range: V->users(), P: [U1, U2, this](Value *U) {
1379 return U == U1 || U == U2 || R.getTreeEntry(V: U) != nullptr;
1380 });
1381 };
1382 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1383 };
1384 // A broadcast of a load can be cheaper on some targets.
1385 if (R.TTI->isLegalBroadcastLoad(ElementTy: V1->getType(),
1386 NumElements: ElementCount::getFixed(MinVal: NumLanes)) &&
1387 ((int)V1->getNumUses() == NumLanes ||
1388 AllUsersAreInternal(V1, V2)))
1389 return LookAheadHeuristics::ScoreSplatLoads;
1390 }
1391 return LookAheadHeuristics::ScoreSplat;
1392 }
1393
1394 auto *LI1 = dyn_cast<LoadInst>(Val: V1);
1395 auto *LI2 = dyn_cast<LoadInst>(Val: V2);
1396 if (LI1 && LI2) {
1397 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1398 !LI2->isSimple())
1399 return LookAheadHeuristics::ScoreFail;
1400
1401 std::optional<int> Dist = getPointersDiff(
1402 ElemTyA: LI1->getType(), PtrA: LI1->getPointerOperand(), ElemTyB: LI2->getType(),
1403 PtrB: LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1404 if (!Dist || *Dist == 0) {
1405 if (getUnderlyingObject(V: LI1->getPointerOperand()) ==
1406 getUnderlyingObject(V: LI2->getPointerOperand()) &&
1407 R.TTI->isLegalMaskedGather(
1408 DataType: FixedVectorType::get(ElementType: LI1->getType(), NumElts: NumLanes),
1409 Alignment: LI1->getAlign()))
1410 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1411 return LookAheadHeuristics::ScoreFail;
1412 }
1413 // The distance is too large - still may be profitable to use masked
1414 // loads/gathers.
1415 if (std::abs(x: *Dist) > NumLanes / 2)
1416 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
      // This will still detect consecutive loads, but we might have "holes"
      // in some cases. It is ok for non-power-of-2 vectorization and may
      // produce better results. It should not affect current vectorization.
1420 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1421 : LookAheadHeuristics::ScoreReversedLoads;
1422 }
1423
1424 auto *C1 = dyn_cast<Constant>(Val: V1);
1425 auto *C2 = dyn_cast<Constant>(Val: V2);
1426 if (C1 && C2)
1427 return LookAheadHeuristics::ScoreConstants;
1428
    // Extracts from consecutive indices of the same vector get a better score
    // because the extracts could be optimized away.
1431 Value *EV1;
1432 ConstantInt *Ex1Idx;
1433 if (match(V: V1, P: m_ExtractElt(Val: m_Value(V&: EV1), Idx: m_ConstantInt(CI&: Ex1Idx)))) {
1434 // Undefs are always profitable for extractelements.
1435 // Compiler can easily combine poison and extractelement <non-poison> or
1436 // undef and extractelement <poison>. But combining undef +
1437 // extractelement <non-poison-but-may-produce-poison> requires some
1438 // extra operations.
1439 if (isa<UndefValue>(Val: V2))
1440 return (isa<PoisonValue>(Val: V2) || isUndefVector(V: EV1).all())
1441 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1442 : LookAheadHeuristics::ScoreSameOpcode;
1443 Value *EV2 = nullptr;
1444 ConstantInt *Ex2Idx = nullptr;
1445 if (match(V: V2,
1446 P: m_ExtractElt(Val: m_Value(V&: EV2), Idx: m_CombineOr(L: m_ConstantInt(CI&: Ex2Idx),
1447 R: m_Undef())))) {
1448 // Undefs are always profitable for extractelements.
1449 if (!Ex2Idx)
1450 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1451 if (isUndefVector(V: EV2).all() && EV2->getType() == EV1->getType())
1452 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1453 if (EV2 == EV1) {
1454 int Idx1 = Ex1Idx->getZExtValue();
1455 int Idx2 = Ex2Idx->getZExtValue();
1456 int Dist = Idx2 - Idx1;
1457 // The distance is too large - still may be profitable to use
1458 // shuffles.
1459 if (std::abs(x: Dist) == 0)
1460 return LookAheadHeuristics::ScoreSplat;
1461 if (std::abs(x: Dist) > NumLanes / 2)
1462 return LookAheadHeuristics::ScoreSameOpcode;
1463 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1464 : LookAheadHeuristics::ScoreReversedExtracts;
1465 }
1466 return LookAheadHeuristics::ScoreAltOpcodes;
1467 }
1468 return LookAheadHeuristics::ScoreFail;
1469 }
1470
1471 auto *I1 = dyn_cast<Instruction>(Val: V1);
1472 auto *I2 = dyn_cast<Instruction>(Val: V2);
1473 if (I1 && I2) {
1474 if (I1->getParent() != I2->getParent())
1475 return LookAheadHeuristics::ScoreFail;
1476 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1477 Ops.push_back(Elt: I1);
1478 Ops.push_back(Elt: I2);
1479 InstructionsState S = getSameOpcode(VL: Ops, TLI);
1480 // Note: Only consider instructions with <= 2 operands to avoid
1481 // complexity explosion.
1482 if (S.getOpcode() &&
1483 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1484 !S.isAltShuffle()) &&
1485 all_of(Range&: Ops, P: [&S](Value *V) {
1486 return cast<Instruction>(Val: V)->getNumOperands() ==
1487 S.MainOp->getNumOperands();
1488 }))
1489 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1490 : LookAheadHeuristics::ScoreSameOpcode;
1491 }
1492
1493 if (isa<UndefValue>(Val: V2))
1494 return LookAheadHeuristics::ScoreUndef;
1495
1496 return LookAheadHeuristics::ScoreFail;
1497 }
1498
1499 /// Go through the operands of \p LHS and \p RHS recursively until
  /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1501 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1502 /// of \p U1 and \p U2), except at the beginning of the recursion where
1503 /// these are set to nullptr.
1504 ///
1505 /// For example:
1506 /// \verbatim
1507 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1508 /// \ / \ / \ / \ /
1509 /// + + + +
1510 /// G1 G2 G3 G4
1511 /// \endverbatim
1512 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1513 /// each level recursively, accumulating the score. It starts from matching
1514 /// the additions at level 0, then moves on to the loads (level 1). The
1515 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1516 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1517 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1518 /// Please note that the order of the operands does not matter, as we
1519 /// evaluate the score of all profitable combinations of operands. In
1520 /// other words the score of G1 and G4 is the same as G1 and G2. This
1521 /// heuristic is based on ideas described in:
1522 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1523 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1524 /// Luís F. W. Góes
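  /// As a rough illustration (assuming MaxLevel >= 2 and simple, consecutive
  /// loads as drawn above): matching G1 with G2 scores ScoreSameOpcode (2) for
  /// the two additions, plus ScoreConsecutiveLoads (4) for {A[0],A[1]} and
  /// another 4 for {B[0],B[1]}, roughly 10 in total, while matching G1 with G3
  /// only scores the 2 for the additions because none of the load pairs match.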
1525 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1526 Instruction *U2, int CurrLevel,
1527 ArrayRef<Value *> MainAltOps) const {
1528
    // Get the shallow score of LHS and RHS.
1530 int ShallowScoreAtThisLevel =
1531 getShallowScore(V1: LHS, V2: RHS, U1, U2, MainAltOps);
1532
    // Return the current (shallow) score early if we have reached MaxLevel,
    // if LHS and RHS are not instructions, if they are the same instruction
    // (splat), if they failed to match, or if they are matching loads,
    // extractelements or instructions with more than two operands, whose
    // shallow score is already good enough.
1539 auto *I1 = dyn_cast<Instruction>(Val: LHS);
1540 auto *I2 = dyn_cast<Instruction>(Val: RHS);
1541 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1542 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1543 (((isa<LoadInst>(Val: I1) && isa<LoadInst>(Val: I2)) ||
1544 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1545 (isa<ExtractElementInst>(Val: I1) && isa<ExtractElementInst>(Val: I2))) &&
1546 ShallowScoreAtThisLevel))
1547 return ShallowScoreAtThisLevel;
1548 assert(I1 && I2 && "Should have early exited.");
1549
1550 // Contains the I2 operand indexes that got matched with I1 operands.
1551 SmallSet<unsigned, 4> Op2Used;
1552
1553 // Recursion towards the operands of I1 and I2. We are trying all possible
1554 // operand pairs, and keeping track of the best score.
1555 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1556 OpIdx1 != NumOperands1; ++OpIdx1) {
      // Try to pair the operand OpIdx1 of I1 with the best operand of I2.
1558 int MaxTmpScore = 0;
1559 unsigned MaxOpIdx2 = 0;
1560 bool FoundBest = false;
1561 // If I2 is commutative try all combinations.
1562 unsigned FromIdx = isCommutative(I: I2) ? 0 : OpIdx1;
1563 unsigned ToIdx = isCommutative(I: I2)
1564 ? I2->getNumOperands()
1565 : std::min(a: I2->getNumOperands(), b: OpIdx1 + 1);
1566 assert(FromIdx <= ToIdx && "Bad index");
1567 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1568 // Skip operands already paired with OpIdx1.
1569 if (Op2Used.count(V: OpIdx2))
1570 continue;
1571 // Recursively calculate the cost at each level
1572 int TmpScore =
1573 getScoreAtLevelRec(LHS: I1->getOperand(i: OpIdx1), RHS: I2->getOperand(i: OpIdx2),
1574 U1: I1, U2: I2, CurrLevel: CurrLevel + 1, MainAltOps: std::nullopt);
1575 // Look for the best score.
1576 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1577 TmpScore > MaxTmpScore) {
1578 MaxTmpScore = TmpScore;
1579 MaxOpIdx2 = OpIdx2;
1580 FoundBest = true;
1581 }
1582 }
1583 if (FoundBest) {
1584 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1585 Op2Used.insert(V: MaxOpIdx2);
1586 ShallowScoreAtThisLevel += MaxTmpScore;
1587 }
1588 }
1589 return ShallowScoreAtThisLevel;
1590 }
1591 };
1592 /// A helper data structure to hold the operands of a vector of instructions.
1593 /// This supports a fixed vector length for all operand vectors.
1594 class VLOperands {
1595 /// For each operand we need (i) the value, and (ii) the opcode that it
1596 /// would be attached to if the expression was in a left-linearized form.
1597 /// This is required to avoid illegal operand reordering.
1598 /// For example:
1599 /// \verbatim
1600 /// 0 Op1
1601 /// |/
1602 /// Op1 Op2 Linearized + Op2
1603 /// \ / ----------> |/
1604 /// - -
1605 ///
1606 /// Op1 - Op2 (0 + Op1) - Op2
1607 /// \endverbatim
1608 ///
1609 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1610 ///
1611 /// Another way to think of this is to track all the operations across the
1612 /// path from the operand all the way to the root of the tree and to
1613 /// calculate the operation that corresponds to this path. For example, the
1614 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1615 /// corresponding operation is a '-' (which matches the one in the
1616 /// linearized tree, as shown above).
1617 ///
1618 /// For lack of a better term, we refer to this operation as Accumulated
1619 /// Path Operation (APO).
1620 struct OperandData {
1621 OperandData() = default;
1622 OperandData(Value *V, bool APO, bool IsUsed)
1623 : V(V), APO(APO), IsUsed(IsUsed) {}
1624 /// The operand value.
1625 Value *V = nullptr;
1626 /// TreeEntries only allow a single opcode, or an alternate sequence of
1627 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
1628 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1629 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1630 /// (e.g., Add/Mul)
1631 bool APO = false;
1632 /// Helper data for the reordering function.
1633 bool IsUsed = false;
1634 };
1635
1636 /// During operand reordering, we are trying to select the operand at lane
1637 /// that matches best with the operand at the neighboring lane. Our
1638 /// selection is based on the type of value we are looking for. For example,
1639 /// if the neighboring lane has a load, we need to look for a load that is
1640 /// accessing a consecutive address. These strategies are summarized in the
1641 /// 'ReorderingMode' enumerator.
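    /// For illustration only: if the operand already selected in the
    /// neighboring lane is load(A[i]), the mode is Load and getBestOperand()
    /// will prefer a candidate that loads A[i+1] (or A[i-1]) over one that
    /// merely shares the opcode.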
1642 enum class ReorderingMode {
1643 Load, ///< Matching loads to consecutive memory addresses
1644 Opcode, ///< Matching instructions based on opcode (same or alternate)
1645 Constant, ///< Matching constants
1646 Splat, ///< Matching the same instruction multiple times (broadcast)
1647 Failed, ///< We failed to create a vectorizable group
1648 };
1649
1650 using OperandDataVec = SmallVector<OperandData, 2>;
1651
1652 /// A vector of operand vectors.
1653 SmallVector<OperandDataVec, 4> OpsVec;
1654
1655 const TargetLibraryInfo &TLI;
1656 const DataLayout &DL;
1657 ScalarEvolution &SE;
1658 const BoUpSLP &R;
1659
1660 /// \returns the operand data at \p OpIdx and \p Lane.
1661 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1662 return OpsVec[OpIdx][Lane];
1663 }
1664
1665 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1666 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1667 return OpsVec[OpIdx][Lane];
1668 }
1669
1670 /// Clears the used flag for all entries.
1671 void clearUsed() {
1672 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1673 OpIdx != NumOperands; ++OpIdx)
1674 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1675 ++Lane)
1676 OpsVec[OpIdx][Lane].IsUsed = false;
1677 }
1678
1679 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1680 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1681 std::swap(a&: OpsVec[OpIdx1][Lane], b&: OpsVec[OpIdx2][Lane]);
1682 }
1683
    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane lane for which we are looking
    /// for the best candidate.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score due to possible broadcasting of the
    /// elements in the lane. It is more profitable to have a power-of-2
    /// number of unique elements in the lane, since such a lane will be
    /// vectorized with higher probability after removing duplicates.
    /// Currently the SLP vectorizer supports only vectorization of a
    /// power-of-2 number of unique scalars.
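    /// A minimal worked example (illustrative only): with 4 lanes, if the
    /// other lanes already contain 2 unique instructions, a candidate that is
    /// one of them keeps the count at 2 (a power of two), while keeping the
    /// current operand would raise it to 3; the score is then
    /// (PowerOf2Ceil(3) - 3) - (PowerOf2Ceil(2) - 2) = 1 - 0 = 1 in favor of
    /// the candidate.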
1693 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1694 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
1695 if (!isa<Instruction>(Val: IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1696 return 0;
1697 SmallPtrSet<Value *, 4> Uniques;
1698 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1699 if (Ln == Lane)
1700 continue;
1701 Value *OpIdxLnV = getData(OpIdx, Lane: Ln).V;
1702 if (!isa<Instruction>(Val: OpIdxLnV))
1703 return 0;
1704 Uniques.insert(Ptr: OpIdxLnV);
1705 }
1706 int UniquesCount = Uniques.size();
1707 int UniquesCntWithIdxLaneV =
1708 Uniques.contains(Ptr: IdxLaneV) ? UniquesCount : UniquesCount + 1;
1709 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1710 int UniquesCntWithOpIdxLaneV =
1711 Uniques.contains(Ptr: OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1712 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1713 return 0;
1714 return (PowerOf2Ceil(A: UniquesCntWithOpIdxLaneV) -
1715 UniquesCntWithOpIdxLaneV) -
1716 (PowerOf2Ceil(A: UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1717 }
1718
    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane lane for which we are looking
    /// for the best candidate.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score for the scalar whose users are all
    /// vectorized.
1725 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1726 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
1727 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1728 // Do not care about number of uses for vector-like instructions
1729 // (extractelement/extractvalue with constant indices), they are extracts
1730 // themselves and already externally used. Vectorization of such
1731 // instructions does not add extra extractelement instruction, just may
1732 // remove it.
1733 if (isVectorLikeInstWithConstOps(V: IdxLaneV) &&
1734 isVectorLikeInstWithConstOps(V: OpIdxLaneV))
1735 return LookAheadHeuristics::ScoreAllUserVectorized;
1736 auto *IdxLaneI = dyn_cast<Instruction>(Val: IdxLaneV);
1737 if (!IdxLaneI || !isa<Instruction>(Val: OpIdxLaneV))
1738 return 0;
1739 return R.areAllUsersVectorized(I: IdxLaneI)
1740 ? LookAheadHeuristics::ScoreAllUserVectorized
1741 : 0;
1742 }
1743
    /// Score scaling factor for fully compatible instructions but with a
    /// different number of external uses. Allows better selection of the
    /// instructions with fewer external uses.
1747 static const int ScoreScaleFactor = 10;
1748
    /// \Returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match; the more they match, the higher the
    /// score. This helps break ties in an informed way when we cannot decide
    /// on the order of the operands by just considering the immediate
    /// predecessors.
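    /// For illustration (not a guarantee of the exact totals): a look-ahead
    /// score of 4 (consecutive loads), a splat score of 1 and all users
    /// vectorized combine to (4 + 1) * ScoreScaleFactor + ScoreAllUserVectorized
    /// = 5 * 10 + 1 = 51, so the external-use bonus only breaks ties between
    /// otherwise equally scored candidates.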
1754 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1755 int Lane, unsigned OpIdx, unsigned Idx,
1756 bool &IsUsed) {
1757 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1758 LookAheadMaxDepth);
1759 // Keep track of the instruction stack as we recurse into the operands
1760 // during the look-ahead score exploration.
1761 int Score =
1762 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1763 /*CurrLevel=*/1, MainAltOps);
1764 if (Score) {
1765 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1766 if (Score <= -SplatScore) {
1767 // Set the minimum score for splat-like sequence to avoid setting
1768 // failed state.
1769 Score = 1;
1770 } else {
1771 Score += SplatScore;
1772 // Scale score to see the difference between different operands
1773 // and similar operands but all vectorized/not all vectorized
1774 // uses. It does not affect actual selection of the best
1775 // compatible operand in general, just allows to select the
1776 // operand with all vectorized uses.
1777 Score *= ScoreScaleFactor;
1778 Score += getExternalUseScore(Lane, OpIdx, Idx);
1779 IsUsed = true;
1780 }
1781 }
1782 return Score;
1783 }
1784
1785 /// Best defined scores per lanes between the passes. Used to choose the
1786 /// best operand (with the highest score) between the passes.
1787 /// The key - {Operand Index, Lane}.
1788 /// The value - the best score between the passes for the lane and the
1789 /// operand.
1790 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1791 BestScoresPerLanes;
1792
    // Search all operands in Ops[*][Lane] for the one that best matches
    // Ops[OpIdx][LastLane] and return its operand index.
    // If no good match can be found, return std::nullopt.
1796 std::optional<unsigned>
1797 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1798 ArrayRef<ReorderingMode> ReorderingModes,
1799 ArrayRef<Value *> MainAltOps) {
1800 unsigned NumOperands = getNumOperands();
1801
1802 // The operand of the previous lane at OpIdx.
1803 Value *OpLastLane = getData(OpIdx, Lane: LastLane).V;
1804
1805 // Our strategy mode for OpIdx.
1806 ReorderingMode RMode = ReorderingModes[OpIdx];
1807 if (RMode == ReorderingMode::Failed)
1808 return std::nullopt;
1809
1810 // The linearized opcode of the operand at OpIdx, Lane.
1811 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1812
1813 // The best operand index and its score.
1814 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1815 // are using the score to differentiate between the two.
1816 struct BestOpData {
1817 std::optional<unsigned> Idx;
1818 unsigned Score = 0;
1819 } BestOp;
1820 BestOp.Score =
1821 BestScoresPerLanes.try_emplace(Key: std::make_pair(x&: OpIdx, y&: Lane), Args: 0)
1822 .first->second;
1823
      // Track if the operand must be marked as used. If the operand is set to
      // Score 1 explicitly (because of non-power-of-2 unique scalars), we may
      // want to re-estimate the operands again on the following iterations.
1827 bool IsUsed =
1828 RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1829 // Iterate through all unused operands and look for the best.
1830 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1831 // Get the operand at Idx and Lane.
1832 OperandData &OpData = getData(OpIdx: Idx, Lane);
1833 Value *Op = OpData.V;
1834 bool OpAPO = OpData.APO;
1835
1836 // Skip already selected operands.
1837 if (OpData.IsUsed)
1838 continue;
1839
1840 // Skip if we are trying to move the operand to a position with a
1841 // different opcode in the linearized tree form. This would break the
1842 // semantics.
1843 if (OpAPO != OpIdxAPO)
1844 continue;
1845
1846 // Look for an operand that matches the current mode.
1847 switch (RMode) {
1848 case ReorderingMode::Load:
1849 case ReorderingMode::Constant:
1850 case ReorderingMode::Opcode: {
1851 bool LeftToRight = Lane > LastLane;
1852 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1853 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1854 int Score = getLookAheadScore(LHS: OpLeft, RHS: OpRight, MainAltOps, Lane,
1855 OpIdx, Idx, IsUsed);
1856 if (Score > static_cast<int>(BestOp.Score)) {
1857 BestOp.Idx = Idx;
1858 BestOp.Score = Score;
1859 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] = Score;
1860 }
1861 break;
1862 }
1863 case ReorderingMode::Splat:
1864 if (Op == OpLastLane)
1865 BestOp.Idx = Idx;
1866 break;
1867 case ReorderingMode::Failed:
1868 llvm_unreachable("Not expected Failed reordering mode.");
1869 }
1870 }
1871
1872 if (BestOp.Idx) {
1873 getData(OpIdx: *BestOp.Idx, Lane).IsUsed = IsUsed;
1874 return BestOp.Idx;
1875 }
1876 // If we could not find a good match return std::nullopt.
1877 return std::nullopt;
1878 }
1879
1880 /// Helper for reorderOperandVecs.
    /// \returns the lane that we should start reordering from. This is the one
    /// which has the least number of operands that can freely move about, or
    /// which is the least profitable to reorder because it already has the
    /// most optimal set of operands.
1884 unsigned getBestLaneToStartReordering() const {
1885 unsigned Min = UINT_MAX;
1886 unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> is used to implement a simple voting
      // algorithm to choose the lane with the least number of operands that
      // can freely move about, or that is the least profitable to reorder
      // because it already has the most optimal set of operands. The first
      // unsigned is a counter for voting, the second unsigned is the counter
      // of lanes with instructions with same/alternate opcodes and the same
      // parent basic block.
1893 MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
1894 // Try to be closer to the original results, if we have multiple lanes
1895 // with same cost. If 2 lanes have the same cost, use the one with the
1896 // lowest index.
1897 for (int I = getNumLanes(); I > 0; --I) {
1898 unsigned Lane = I - 1;
1899 OperandsOrderData NumFreeOpsHash =
1900 getMaxNumOperandsThatCanBeReordered(Lane);
1901 // Compare the number of operands that can move and choose the one with
1902 // the least number.
1903 if (NumFreeOpsHash.NumOfAPOs < Min) {
1904 Min = NumFreeOpsHash.NumOfAPOs;
1905 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1906 HashMap.clear();
1907 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
1908 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1909 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1910 // Select the most optimal lane in terms of number of operands that
1911 // should be moved around.
1912 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1913 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
1914 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1915 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1916 auto *It = HashMap.find(Key: NumFreeOpsHash.Hash);
1917 if (It == HashMap.end())
1918 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
1919 else
1920 ++It->second.first;
1921 }
1922 }
1923 // Select the lane with the minimum counter.
1924 unsigned BestLane = 0;
1925 unsigned CntMin = UINT_MAX;
1926 for (const auto &Data : reverse(C&: HashMap)) {
1927 if (Data.second.first < CntMin) {
1928 CntMin = Data.second.first;
1929 BestLane = Data.second.second;
1930 }
1931 }
1932 return BestLane;
1933 }
1934
1935 /// Data structure that helps to reorder operands.
1936 struct OperandsOrderData {
1937 /// The best number of operands with the same APOs, which can be
1938 /// reordered.
1939 unsigned NumOfAPOs = UINT_MAX;
1940 /// Number of operands with the same/alternate instruction opcode and
1941 /// parent.
1942 unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash for the actual operand ordering.
      /// Used to count operands, actually their position id and opcode
      /// value. It is used in the voting mechanism to find the lane with the
      /// least number of operands that can freely move about, or that is the
      /// least profitable to reorder because it already has the most optimal
      /// set of operands. Could be replaced with a SmallVector<unsigned>, but
      /// a hash code is faster and requires less memory.
1950 unsigned Hash = 0;
1951 };
    /// \returns the maximum number of operands that are allowed to be
    /// reordered for \p Lane and the number of compatible instructions (with
    /// the same parent/opcode). This is used as a heuristic for selecting the
    /// first lane to start operand reordering.
1956 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1957 unsigned CntTrue = 0;
1958 unsigned NumOperands = getNumOperands();
1959 // Operands with the same APO can be reordered. We therefore need to count
1960 // how many of them we have for each APO, like this: Cnt[APO] = x.
1961 // Since we only have two APOs, namely true and false, we can avoid using
1962 // a map. Instead we can simply count the number of operands that
1963 // correspond to one of them (in this case the 'true' APO), and calculate
1964 // the other by subtracting it from the total number of operands.
1965 // Operands with the same instruction opcode and parent are more
1966 // profitable since we don't need to move them in many cases, with a high
1967 // probability such lane already can be vectorized effectively.
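      // For example (illustrative): a lane holding a 'sub' has one operand
      // with APO == true and one with APO == false, so NumOfAPOs ends up as
      // max(1, 1) == 1, whereas a lane holding an 'add' has both APOs false
      // and NumOfAPOs == max(0, 2) == 2; the 'sub' lane is therefore the more
      // constrained starting point.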
1968 bool AllUndefs = true;
1969 unsigned NumOpsWithSameOpcodeParent = 0;
1970 Instruction *OpcodeI = nullptr;
1971 BasicBlock *Parent = nullptr;
1972 unsigned Hash = 0;
1973 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
1974 const OperandData &OpData = getData(OpIdx, Lane);
1975 if (OpData.APO)
1976 ++CntTrue;
1977 // Use Boyer-Moore majority voting for finding the majority opcode and
1978 // the number of times it occurs.
1979 if (auto *I = dyn_cast<Instruction>(Val: OpData.V)) {
1980 if (!OpcodeI || !getSameOpcode(VL: {OpcodeI, I}, TLI).getOpcode() ||
1981 I->getParent() != Parent) {
1982 if (NumOpsWithSameOpcodeParent == 0) {
1983 NumOpsWithSameOpcodeParent = 1;
1984 OpcodeI = I;
1985 Parent = I->getParent();
1986 } else {
1987 --NumOpsWithSameOpcodeParent;
1988 }
1989 } else {
1990 ++NumOpsWithSameOpcodeParent;
1991 }
1992 }
1993 Hash = hash_combine(
1994 args: Hash, args: hash_value(value: (OpIdx + 1) * (OpData.V->getValueID() + 1)));
1995 AllUndefs = AllUndefs && isa<UndefValue>(Val: OpData.V);
1996 }
1997 if (AllUndefs)
1998 return {};
1999 OperandsOrderData Data;
2000 Data.NumOfAPOs = std::max(a: CntTrue, b: NumOperands - CntTrue);
2001 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2002 Data.Hash = Hash;
2003 return Data;
2004 }
2005
2006 /// Go through the instructions in VL and append their operands.
2007 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2008 assert(!VL.empty() && "Bad VL");
2009 assert((empty() || VL.size() == getNumLanes()) &&
2010 "Expected same number of lanes");
2011 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2012 unsigned NumOperands = cast<Instruction>(Val: VL[0])->getNumOperands();
2013 constexpr unsigned IntrinsicNumOperands = 2;
2014 if (isa<IntrinsicInst>(Val: VL[0]))
2015 NumOperands = IntrinsicNumOperands;
2016 OpsVec.resize(N: NumOperands);
2017 unsigned NumLanes = VL.size();
2018 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2019 OpsVec[OpIdx].resize(N: NumLanes);
2020 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2021 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2022 // Our tree has just 3 nodes: the root and two operands.
2023 // It is therefore trivial to get the APO. We only need to check the
2024 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2025 // RHS operand. The LHS operand of both add and sub is never attached
        // to an inverse operation in the linearized form, therefore its APO
2027 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2028
2029 // Since operand reordering is performed on groups of commutative
2030 // operations or alternating sequences (e.g., +, -), we can safely
2031 // tell the inverse operations by checking commutativity.
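          // For example (illustrative): for a bundle of 'sub' instructions
          // the operand at OpIdx 0 gets APO == false and the operand at
          // OpIdx 1 gets APO == true, while for a bundle of 'add' instructions
          // both operands get APO == false.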
2032 bool IsInverseOperation = !isCommutative(I: cast<Instruction>(Val: VL[Lane]));
2033 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2034 OpsVec[OpIdx][Lane] = {cast<Instruction>(Val: VL[Lane])->getOperand(i: OpIdx),
2035 APO, false};
2036 }
2037 }
2038 }
2039
2040 /// \returns the number of operands.
2041 unsigned getNumOperands() const { return OpsVec.size(); }
2042
2043 /// \returns the number of lanes.
2044 unsigned getNumLanes() const { return OpsVec[0].size(); }
2045
2046 /// \returns the operand value at \p OpIdx and \p Lane.
2047 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2048 return getData(OpIdx, Lane).V;
2049 }
2050
2051 /// \returns true if the data structure is empty.
2052 bool empty() const { return OpsVec.empty(); }
2053
2054 /// Clears the data.
2055 void clear() { OpsVec.clear(); }
2056
2057 /// \Returns true if there are enough operands identical to \p Op to fill
2058 /// the whole vector.
    /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
2060 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2061 bool OpAPO = getData(OpIdx, Lane).APO;
2062 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2063 if (Ln == Lane)
2064 continue;
2065 // This is set to true if we found a candidate for broadcast at Lane.
2066 bool FoundCandidate = false;
2067 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2068 OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
2069 if (Data.APO != OpAPO || Data.IsUsed)
2070 continue;
2071 if (Data.V == Op) {
2072 FoundCandidate = true;
2073 Data.IsUsed = true;
2074 break;
2075 }
2076 }
2077 if (!FoundCandidate)
2078 return false;
2079 }
2080 return true;
2081 }
2082
2083 public:
2084 /// Initialize with all the operands of the instruction vector \p RootVL.
2085 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2086 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) {
2087 // Append all the operands of RootVL.
2088 appendOperandsOfVL(VL: RootVL);
2089 }
2090
    /// \Returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
2093 ValueList getVL(unsigned OpIdx) const {
2094 ValueList OpVL(OpsVec[OpIdx].size());
2095 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2096 "Expected same num of lanes across all operands");
2097 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2098 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2099 return OpVL;
2100 }
2101
    // Performs operand reordering for 2 or more operands.
    // The original operands are in OpsVec[OpIdx][Lane] and the reordering is
    // performed in place, so OpsVec also holds the reordered operands.
2105 void reorder() {
2106 unsigned NumOperands = getNumOperands();
2107 unsigned NumLanes = getNumLanes();
2108 // Each operand has its own mode. We are using this mode to help us select
2109 // the instructions for each lane, so that they match best with the ones
2110 // we have selected so far.
2111 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2112
2113 // This is a greedy single-pass algorithm. We are going over each lane
2114 // once and deciding on the best order right away with no back-tracking.
2115 // However, in order to increase its effectiveness, we start with the lane
2116 // that has operands that can move the least. For example, given the
2117 // following lanes:
2118 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2119 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2120 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2121 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2122 // we will start at Lane 1, since the operands of the subtraction cannot
2123 // be reordered. Then we will visit the rest of the lanes in a circular
2124 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2125
2126 // Find the first lane that we will start our search from.
2127 unsigned FirstLane = getBestLaneToStartReordering();
2128
2129 // Initialize the modes.
2130 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2131 Value *OpLane0 = getValue(OpIdx, Lane: FirstLane);
2132 // Keep track if we have instructions with all the same opcode on one
2133 // side.
2134 if (isa<LoadInst>(Val: OpLane0))
2135 ReorderingModes[OpIdx] = ReorderingMode::Load;
2136 else if (isa<Instruction>(Val: OpLane0)) {
2137 // Check if OpLane0 should be broadcast.
2138 if (shouldBroadcast(Op: OpLane0, OpIdx, Lane: FirstLane))
2139 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2140 else
2141 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(Val: OpLane0))
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
2145 else if (isa<Argument>(Val: OpLane0))
2146 // Our best hope is a Splat. It may save some cost in some cases.
2147 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2148 else
2149 // NOTE: This should be unreachable.
2150 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2151 }
2152
      // Check that we don't have the same operands. There is no need to
      // reorder if the operands are just a perfect-diamond or a
      // shuffled-diamond match. Do not skip the reordering for possible
      // broadcasts or for a non-power-of-2 number of scalars (just for now).
2157 auto &&SkipReordering = [this]() {
2158 SmallPtrSet<Value *, 4> UniqueValues;
2159 ArrayRef<OperandData> Op0 = OpsVec.front();
2160 for (const OperandData &Data : Op0)
2161 UniqueValues.insert(Ptr: Data.V);
2162 for (ArrayRef<OperandData> Op : drop_begin(RangeOrContainer&: OpsVec, N: 1)) {
2163 if (any_of(Range&: Op, P: [&UniqueValues](const OperandData &Data) {
2164 return !UniqueValues.contains(Ptr: Data.V);
2165 }))
2166 return false;
2167 }
2168 // TODO: Check if we can remove a check for non-power-2 number of
2169 // scalars after full support of non-power-2 vectorization.
2170 return UniqueValues.size() != 2 && isPowerOf2_32(Value: UniqueValues.size());
2171 };
2172
2173 // If the initial strategy fails for any of the operand indexes, then we
2174 // perform reordering again in a second pass. This helps avoid assigning
2175 // high priority to the failed strategy, and should improve reordering for
2176 // the non-failed operand indexes.
2177 for (int Pass = 0; Pass != 2; ++Pass) {
        // Check if there is no need to reorder the operands because they form
        // a perfect or shuffled diamond match.
        // This is needed to avoid counting extra external-use cost for
        // shuffled matches, which may cause regressions.
2182 if (SkipReordering())
2183 break;
2184 // Skip the second pass if the first pass did not fail.
2185 bool StrategyFailed = false;
2186 // Mark all operand data as free to use.
2187 clearUsed();
2188 // We keep the original operand order for the FirstLane, so reorder the
2189 // rest of the lanes. We are visiting the nodes in a circular fashion,
2190 // using FirstLane as the center point and increasing the radius
2191 // distance.
2192 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2193 for (unsigned I = 0; I < NumOperands; ++I)
2194 MainAltOps[I].push_back(Elt: getData(OpIdx: I, Lane: FirstLane).V);
2195
2196 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2197 // Visit the lane on the right and then the lane on the left.
2198 for (int Direction : {+1, -1}) {
2199 int Lane = FirstLane + Direction * Distance;
2200 if (Lane < 0 || Lane >= (int)NumLanes)
2201 continue;
2202 int LastLane = Lane - Direction;
2203 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2204 "Out of bounds");
2205 // Look for a good match for each operand.
2206 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
              // Search for the operand that best matches the one at OpIdx in
              // LastLane.
2208 std::optional<unsigned> BestIdx = getBestOperand(
2209 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps: MainAltOps[OpIdx]);
2210 // By not selecting a value, we allow the operands that follow to
2211 // select a better matching value. We will get a non-null value in
2212 // the next run of getBestOperand().
2213 if (BestIdx) {
2214 // Swap the current operand with the one returned by
2215 // getBestOperand().
2216 swap(OpIdx1: OpIdx, OpIdx2: *BestIdx, Lane);
2217 } else {
2218 // We failed to find a best operand, set mode to 'Failed'.
2219 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2220 // Enable the second pass.
2221 StrategyFailed = true;
2222 }
2223 // Try to get the alternate opcode and follow it during analysis.
2224 if (MainAltOps[OpIdx].size() != 2) {
2225 OperandData &AltOp = getData(OpIdx, Lane);
2226 InstructionsState OpS =
2227 getSameOpcode(VL: {MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2228 if (OpS.getOpcode() && OpS.isAltShuffle())
2229 MainAltOps[OpIdx].push_back(Elt: AltOp.V);
2230 }
2231 }
2232 }
2233 }
2234 // Skip second pass if the strategy did not fail.
2235 if (!StrategyFailed)
2236 break;
2237 }
2238 }
2239
2240#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2241 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2242 switch (RMode) {
2243 case ReorderingMode::Load:
2244 return "Load";
2245 case ReorderingMode::Opcode:
2246 return "Opcode";
2247 case ReorderingMode::Constant:
2248 return "Constant";
2249 case ReorderingMode::Splat:
2250 return "Splat";
2251 case ReorderingMode::Failed:
2252 return "Failed";
2253 }
2254 llvm_unreachable("Unimplemented Reordering Type");
2255 }
2256
2257 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2258 raw_ostream &OS) {
2259 return OS << getModeStr(RMode);
2260 }
2261
2262 /// Debug print.
2263 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2264 printMode(RMode, OS&: dbgs());
2265 }
2266
2267 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2268 return printMode(RMode, OS);
2269 }
2270
2271 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2272 const unsigned Indent = 2;
2273 unsigned Cnt = 0;
2274 for (const OperandDataVec &OpDataVec : OpsVec) {
2275 OS << "Operand " << Cnt++ << "\n";
2276 for (const OperandData &OpData : OpDataVec) {
2277 OS.indent(NumSpaces: Indent) << "{";
2278 if (Value *V = OpData.V)
2279 OS << *V;
2280 else
2281 OS << "null";
2282 OS << ", APO:" << OpData.APO << "}\n";
2283 }
2284 OS << "\n";
2285 }
2286 return OS;
2287 }
2288
2289 /// Debug print.
2290 LLVM_DUMP_METHOD void dump() const { print(OS&: dbgs()); }
2291#endif
2292 };
2293
  /// Evaluate each pair in \p Candidates and return the index into
  /// \p Candidates of the pair with the highest score, i.e. the one deemed to
  /// have the best chance to form the root of a profitable tree to vectorize.
  /// Return std::nullopt if no candidate scored above
  /// LookAheadHeuristics::ScoreFail.
  /// \param Limit The lower limit of the score that is considered good enough.
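  /// For illustration (assuming the candidates are simple loads): given
  /// Candidates = {{load A[0], load A[1]}, {load A[0], load B[0]}}, index 0 is
  /// returned, since the consecutive loads score higher than the unrelated
  /// pair, provided that score exceeds \p Limit.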
2299 std::optional<int>
2300 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2301 int Limit = LookAheadHeuristics::ScoreFail) const {
2302 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2303 RootLookAheadMaxDepth);
2304 int BestScore = Limit;
2305 std::optional<int> Index;
2306 for (int I : seq<int>(Begin: 0, End: Candidates.size())) {
2307 int Score = LookAhead.getScoreAtLevelRec(LHS: Candidates[I].first,
2308 RHS: Candidates[I].second,
2309 /*U1=*/nullptr, /*U2=*/nullptr,
2310 /*Level=*/CurrLevel: 1, MainAltOps: std::nullopt);
2311 if (Score > BestScore) {
2312 BestScore = Score;
2313 Index = I;
2314 }
2315 }
2316 return Index;
2317 }
2318
2319 /// Checks if the instruction is marked for deletion.
2320 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(V: I); }
2321
2322 /// Removes an instruction from its block and eventually deletes it.
2323 /// It's like Instruction::eraseFromParent() except that the actual deletion
2324 /// is delayed until BoUpSLP is destructed.
2325 void eraseInstruction(Instruction *I) {
2326 DeletedInstructions.insert(V: I);
2327 }
2328
  /// Checks if the instruction was already analyzed for being a possible
  /// reduction root.
2331 bool isAnalyzedReductionRoot(Instruction *I) const {
2332 return AnalyzedReductionsRoots.count(Ptr: I);
2333 }
  /// Register the given instruction as already analyzed for being a possible
  /// reduction root.
2336 void analyzedReductionRoot(Instruction *I) {
2337 AnalyzedReductionsRoots.insert(Ptr: I);
2338 }
2339 /// Checks if the provided list of reduced values was checked already for
2340 /// vectorization.
2341 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2342 return AnalyzedReductionVals.contains(V: hash_value(S: VL));
2343 }
  /// Adds the list of reduced values to the list of values already checked
  /// for vectorization.
2346 void analyzedReductionVals(ArrayRef<Value *> VL) {
2347 AnalyzedReductionVals.insert(V: hash_value(S: VL));
2348 }
2349 /// Clear the list of the analyzed reduction root instructions.
2350 void clearReductionData() {
2351 AnalyzedReductionsRoots.clear();
2352 AnalyzedReductionVals.clear();
2353 AnalyzedMinBWVals.clear();
2354 }
2355 /// Checks if the given value is gathered in one of the nodes.
2356 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2357 return any_of(Range: MustGather, P: [&](Value *V) { return Vals.contains(V); });
2358 }
2359
2360 /// Check if the value is vectorized in the tree.
2361 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2362
2363 ~BoUpSLP();
2364
2365private:
  /// Determine if a node \p E can be demoted to a smaller type with a
2367 /// truncation. We collect the entries that will be demoted in ToDemote.
2368 /// \param E Node for analysis
2369 /// \param ToDemote indices of the nodes to be demoted.
2370 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2371 unsigned &BitWidth,
2372 SmallVectorImpl<unsigned> &ToDemote,
2373 DenseSet<const TreeEntry *> &Visited,
2374 unsigned &MaxDepthLevel,
2375 bool &IsProfitableToDemote,
2376 bool IsTruncRoot) const;
2377
  /// Check if the operands on the edges \p Edges of the \p UserTE allow
  /// reordering (i.e. the operands can be reordered because they have only one
  /// user and are reorderable).
2381 /// \param ReorderableGathers List of all gather nodes that require reordering
  /// (e.g., gather of extractelements or partially vectorizable loads).
2383 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2384 /// reordering, subset of \p NonVectorized.
2385 bool
2386 canReorderOperands(TreeEntry *UserTE,
2387 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2388 ArrayRef<TreeEntry *> ReorderableGathers,
2389 SmallVectorImpl<TreeEntry *> &GatherOps);
2390
2391 /// Checks if the given \p TE is a gather node with clustered reused scalars
2392 /// and reorders it per given \p Mask.
2393 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2394
2395 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2396 /// if any. If it is not vectorized (gather node), returns nullptr.
2397 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2398 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2399 TreeEntry *TE = nullptr;
2400 const auto *It = find_if(Range&: VL, P: [&](Value *V) {
2401 TE = getTreeEntry(V);
2402 if (TE && is_contained(Range&: TE->UserTreeIndices, Element: EdgeInfo(UserTE, OpIdx)))
2403 return true;
2404 auto It = MultiNodeScalars.find(Val: V);
2405 if (It != MultiNodeScalars.end()) {
2406 for (TreeEntry *E : It->second) {
2407 if (is_contained(Range&: E->UserTreeIndices, Element: EdgeInfo(UserTE, OpIdx))) {
2408 TE = E;
2409 return true;
2410 }
2411 }
2412 }
2413 return false;
2414 });
2415 if (It != VL.end()) {
2416 assert(TE->isSame(VL) && "Expected same scalars.");
2417 return TE;
2418 }
2419 return nullptr;
2420 }
2421
2422 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2423 /// if any. If it is not vectorized (gather node), returns nullptr.
2424 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2425 unsigned OpIdx) const {
2426 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2427 UserTE: const_cast<TreeEntry *>(UserTE), OpIdx);
2428 }
2429
2430 /// Checks if all users of \p I are the part of the vectorization tree.
2431 bool areAllUsersVectorized(
2432 Instruction *I,
2433 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2434
2435 /// Return information about the vector formed for the specified index
2436 /// of a vector of (the same) instruction.
2437 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2438
  /// \returns the graph entry for the \p Idx operand of the \p E entry.
2440 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2441
2442 /// \returns Cast context for the given graph node.
2443 TargetTransformInfo::CastContextHint
2444 getCastContextHint(const TreeEntry &TE) const;
2445
2446 /// \returns the cost of the vectorizable entry.
2447 InstructionCost getEntryCost(const TreeEntry *E,
2448 ArrayRef<Value *> VectorizedVals,
2449 SmallPtrSetImpl<Value *> &CheckedExtracts);
2450
2451 /// This is the recursive part of buildTree.
2452 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2453 const EdgeInfo &EI);
2454
2455 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2456 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2457 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2458 /// returns false, setting \p CurrentOrder to either an empty vector or a
  /// non-identity permutation that allows reusing the extract instructions.
2460 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2461 /// extract order.
2462 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2463 SmallVectorImpl<unsigned> &CurrentOrder,
2464 bool ResizeAllowed = false) const;
2465
2466 /// Vectorize a single entry in the tree.
  /// \param PostponedPHIs true if the emission of phi nodes needs to be
  /// postponed to avoid issues with def-use order.
2469 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2470
2471 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2472 /// \p E.
  /// \param PostponedPHIs true if the emission of phi nodes needs to be
  /// postponed to avoid issues with def-use order.
2475 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2476
2477 /// Create a new vector from a list of scalar values. Produces a sequence
2478 /// which exploits values reused across lanes, and arranges the inserts
2479 /// for ease of later optimization.
2480 template <typename BVTy, typename ResTy, typename... Args>
2481 ResTy processBuildVector(const TreeEntry *E, Args &...Params);
2482
2483 /// Create a new vector from a list of scalar values. Produces a sequence
2484 /// which exploits values reused across lanes, and arranges the inserts
2485 /// for ease of later optimization.
2486 Value *createBuildVector(const TreeEntry *E);
2487
2488 /// Returns the instruction in the bundle, which can be used as a base point
2489 /// for scheduling. Usually it is the last instruction in the bundle, except
2490 /// for the case when all operands are external (in this case, it is the first
2491 /// instruction in the list).
2492 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2493
2494 /// Tries to find extractelement instructions with constant indices from fixed
2495 /// vector type and gather such instructions into a bunch, which highly likely
2496 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2497 /// was successful, the matched scalars are replaced by poison values in \p VL
2498 /// for future analysis.
2499 std::optional<TargetTransformInfo::ShuffleKind>
2500 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2501 SmallVectorImpl<int> &Mask) const;
2502
2503 /// Tries to find extractelement instructions with constant indices from fixed
2504 /// vector type and gather such instructions into a bunch, which highly likely
2505 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2506 /// was successful, the matched scalars are replaced by poison values in \p VL
2507 /// for future analysis.
2508 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2509 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2510 SmallVectorImpl<int> &Mask,
2511 unsigned NumParts) const;
2512
2513 /// Checks if the gathered \p VL can be represented as a single register
2514 /// shuffle(s) of previous tree entries.
2515 /// \param TE Tree entry checked for permutation.
  /// \param VL List of scalars (a subset of the TE scalars), checked for
  /// permutations. Must form a single-register vector.
2518 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2519 /// commands to build the mask using the original vector value, without
2520 /// relying on the potential reordering.
2521 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2522 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2523 std::optional<TargetTransformInfo::ShuffleKind>
2524 isGatherShuffledSingleRegisterEntry(
2525 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2526 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2527 bool ForOrder);
2528
2529 /// Checks if the gathered \p VL can be represented as multi-register
2530 /// shuffle(s) of previous tree entries.
2531 /// \param TE Tree entry checked for permutation.
  /// \param VL List of scalars (a subset of the TE scalars), checked for
  /// permutations.
2534 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2535 /// commands to build the mask using the original vector value, without
2536 /// relying on the potential reordering.
2537 /// \returns per-register series of ShuffleKind, if gathered values can be
2538 /// represented as shuffles of previous tree entries. \p Mask is filled with
2539 /// the shuffle mask (also on per-register base).
2540 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2541 isGatherShuffledEntry(
2542 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2543 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2544 unsigned NumParts, bool ForOrder = false);
2545
2546 /// \returns the scalarization cost for this list of values. Assuming that
2547 /// this subtree gets vectorized, we may need to extract the values from the
2548 /// roots. This method calculates the cost of extracting the values.
2549 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2550 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc) const;
2551
2552 /// Set the Builder insert point to one after the last instruction in
2553 /// the bundle
2554 void setInsertPointAfterBundle(const TreeEntry *E);
2555
  /// \returns a vector from a collection of scalars in \p VL. If \p Root is
  /// not specified, the starting vector value is poison.
2558 Value *gather(ArrayRef<Value *> VL, Value *Root);
2559
  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even if the tree height is tiny.
2562 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2563
2564 /// Reorder commutative or alt operands to get better probability of
2565 /// generating vectorized code.
2566 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2567 SmallVectorImpl<Value *> &Left,
2568 SmallVectorImpl<Value *> &Right,
2569 const BoUpSLP &R);
2570
2571 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2572 /// users of \p TE and collects the stores. It returns the map from the store
2573 /// pointers to the collected stores.
2574 DenseMap<Value *, SmallVector<StoreInst *>>
2575 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2576
2577 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2578 /// stores in \p StoresVec can form a vector instruction. If so it returns
2579 /// true and populates \p ReorderIndices with the shuffle indices of the
2580 /// stores when compared to the sorted vector.
2581 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2582 OrdersType &ReorderIndices) const;
2583
2584 /// Iterates through the users of \p TE, looking for scalar stores that can be
2585 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2586 /// their order and builds an order index vector for each store bundle. It
2587 /// returns all these order vectors found.
2588 /// We run this after the tree has formed, otherwise we may come across user
2589 /// instructions that are not yet in the tree.
2590 SmallVector<OrdersType, 1>
2591 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2592
2593 struct TreeEntry {
2594 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2595 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2596
2597 /// \returns Common mask for reorder indices and reused scalars.
2598 SmallVector<int> getCommonMask() const {
2599 SmallVector<int> Mask;
2600 inversePermutation(Indices: ReorderIndices, Mask);
2601 ::addMask(Mask, SubMask: ReuseShuffleIndices);
2602 return Mask;
2603 }
2604
2605 /// \returns true if the scalars in VL are equal to this entry.
2606 bool isSame(ArrayRef<Value *> VL) const {
2607 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2608 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2609 return std::equal(first1: VL.begin(), last1: VL.end(), first2: Scalars.begin());
2610 return VL.size() == Mask.size() &&
2611 std::equal(first1: VL.begin(), last1: VL.end(), first2: Mask.begin(),
2612 binary_pred: [Scalars](Value *V, int Idx) {
2613 return (isa<UndefValue>(Val: V) &&
2614 Idx == PoisonMaskElem) ||
2615 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2616 });
2617 };
2618 if (!ReorderIndices.empty()) {
2619 // TODO: implement matching if the nodes are just reordered, still can
2620 // treat the vector as the same if the list of scalars matches VL
2621 // directly, without reordering.
2622 SmallVector<int> Mask;
2623 inversePermutation(Indices: ReorderIndices, Mask);
2624 if (VL.size() == Scalars.size())
2625 return IsSame(Scalars, Mask);
2626 if (VL.size() == ReuseShuffleIndices.size()) {
2627 ::addMask(Mask, SubMask: ReuseShuffleIndices);
2628 return IsSame(Scalars, Mask);
2629 }
2630 return false;
2631 }
2632 return IsSame(Scalars, ReuseShuffleIndices);
2633 }
2634
2635 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2636 return State == TreeEntry::NeedToGather &&
2637 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2638 UserTreeIndices.front().UserTE == UserEI.UserTE;
2639 }
2640
2641 /// \returns true if current entry has same operands as \p TE.
2642 bool hasEqualOperands(const TreeEntry &TE) const {
2643 if (TE.getNumOperands() != getNumOperands())
2644 return false;
2645 SmallBitVector Used(getNumOperands());
2646 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2647 unsigned PrevCount = Used.count();
2648 for (unsigned K = 0; K < E; ++K) {
2649 if (Used.test(Idx: K))
2650 continue;
2651 if (getOperand(OpIdx: K) == TE.getOperand(OpIdx: I)) {
2652 Used.set(K);
2653 break;
2654 }
2655 }
2656 // Check if we actually found the matching operand.
2657 if (PrevCount == Used.count())
2658 return false;
2659 }
2660 return true;
2661 }
2662
    /// \return Final vectorization factor for the node. Defined by the total
    /// number of vectorized scalars, including those used several times in the
    /// entry and counted in the \a ReuseShuffleIndices, if any.
2666 unsigned getVectorFactor() const {
2667 if (!ReuseShuffleIndices.empty())
2668 return ReuseShuffleIndices.size();
2669 return Scalars.size();
2670 };
2671
2672 /// A vector of scalars.
2673 ValueList Scalars;
2674
2675 /// The Scalars are vectorized into this value. It is initialized to Null.
2676 WeakTrackingVH VectorizedValue = nullptr;
2677
2678 /// New vector phi instructions emitted for the vectorized phi nodes.
2679 PHINode *PHI = nullptr;
2680
2681 /// Do we need to gather this sequence or vectorize it
2682 /// (either with vector instruction or with scatter/gather
2683 /// intrinsics for store/load)?
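    /// Roughly: Vectorize uses regular wide instructions, ScatterVectorize uses
    /// masked gather/scatter intrinsics, StridedVectorize uses strided memory
    /// accesses, and NeedToGather builds the vector from the scalars.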
2684 enum EntryState {
2685 Vectorize,
2686 ScatterVectorize,
2687 StridedVectorize,
2688 NeedToGather
2689 };
2690 EntryState State;
2691
2692 /// Does this sequence require some shuffling?
2693 SmallVector<int, 4> ReuseShuffleIndices;
2694
2695 /// Does this entry require reordering?
2696 SmallVector<unsigned, 4> ReorderIndices;
2697
2698 /// Points back to the VectorizableTree.
2699 ///
2700 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2701 /// to be a pointer and needs to be able to initialize the child iterator.
2702 /// Thus we need a reference back to the container to translate the indices
2703 /// to entries.
2704 VecTreeTy &Container;
2705
2706 /// The TreeEntry index containing the user of this entry. We can actually
2707 /// have multiple users so the data structure is not truly a tree.
2708 SmallVector<EdgeInfo, 1> UserTreeIndices;
2709
2710 /// The index of this treeEntry in VectorizableTree.
2711 int Idx = -1;
2712
2713 private:
2714    /// The operands of each instruction in each lane: Operands[op_index][lane].
2715 /// Note: This helps avoid the replication of the code that performs the
2716 /// reordering of operands during buildTree_rec() and vectorizeTree().
2717 SmallVector<ValueList, 2> Operands;
2718
2719 /// The main/alternate instruction.
2720 Instruction *MainOp = nullptr;
2721 Instruction *AltOp = nullptr;
2722
2723 public:
2724 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2725 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2726 if (Operands.size() < OpIdx + 1)
2727 Operands.resize(N: OpIdx + 1);
2728 assert(Operands[OpIdx].empty() && "Already resized?");
2729 assert(OpVL.size() <= Scalars.size() &&
2730 "Number of operands is greater than the number of scalars.");
2731 Operands[OpIdx].resize(N: OpVL.size());
2732 copy(Range&: OpVL, Out: Operands[OpIdx].begin());
2733 }
2734
2735 /// Set the operands of this bundle in their original order.
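    /// For example, for a bundle of two adds {a0 + b0, a1 + b1} this produces
    /// Operands[0] = {a0, a1} and Operands[1] = {b0, b1}.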
2736 void setOperandsInOrder() {
2737 assert(Operands.empty() && "Already initialized?");
2738 auto *I0 = cast<Instruction>(Val: Scalars[0]);
2739 Operands.resize(N: I0->getNumOperands());
2740 unsigned NumLanes = Scalars.size();
2741 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2742 OpIdx != NumOperands; ++OpIdx) {
2743 Operands[OpIdx].resize(N: NumLanes);
2744 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2745 auto *I = cast<Instruction>(Val: Scalars[Lane]);
2746 assert(I->getNumOperands() == NumOperands &&
2747 "Expected same number of operands");
2748 Operands[OpIdx][Lane] = I->getOperand(i: OpIdx);
2749 }
2750 }
2751 }
2752
2753 /// Reorders operands of the node to the given mask \p Mask.
2754 void reorderOperands(ArrayRef<int> Mask) {
2755 for (ValueList &Operand : Operands)
2756 reorderScalars(Scalars&: Operand, Mask);
2757 }
2758
2759 /// \returns the \p OpIdx operand of this TreeEntry.
2760 ValueList &getOperand(unsigned OpIdx) {
2761 assert(OpIdx < Operands.size() && "Off bounds");
2762 return Operands[OpIdx];
2763 }
2764
2765 /// \returns the \p OpIdx operand of this TreeEntry.
2766 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2767 assert(OpIdx < Operands.size() && "Off bounds");
2768 return Operands[OpIdx];
2769 }
2770
2771 /// \returns the number of operands.
2772 unsigned getNumOperands() const { return Operands.size(); }
2773
2774 /// \return the single \p OpIdx operand.
2775 Value *getSingleOperand(unsigned OpIdx) const {
2776 assert(OpIdx < Operands.size() && "Off bounds");
2777 assert(!Operands[OpIdx].empty() && "No operand available");
2778 return Operands[OpIdx][0];
2779 }
2780
2781 /// Some of the instructions in the list have alternate opcodes.
2782 bool isAltShuffle() const { return MainOp != AltOp; }
2783
2784 bool isOpcodeOrAlt(Instruction *I) const {
2785 unsigned CheckedOpcode = I->getOpcode();
2786 return (getOpcode() == CheckedOpcode ||
2787 getAltOpcode() == CheckedOpcode);
2788 }
2789
2790 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2791 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
2792 /// \p OpValue.
2793 Value *isOneOf(Value *Op) const {
2794 auto *I = dyn_cast<Instruction>(Val: Op);
2795 if (I && isOpcodeOrAlt(I))
2796 return Op;
2797 return MainOp;
2798 }
2799
2800 void setOperations(const InstructionsState &S) {
2801 MainOp = S.MainOp;
2802 AltOp = S.AltOp;
2803 }
2804
2805 Instruction *getMainOp() const {
2806 return MainOp;
2807 }
2808
2809 Instruction *getAltOp() const {
2810 return AltOp;
2811 }
2812
2813 /// The main/alternate opcodes for the list of instructions.
2814 unsigned getOpcode() const {
2815 return MainOp ? MainOp->getOpcode() : 0;
2816 }
2817
2818 unsigned getAltOpcode() const {
2819 return AltOp ? AltOp->getOpcode() : 0;
2820 }
2821
2822    /// When ReorderIndices and ReuseShuffleIndices are empty, just returns the
2823    /// position of \p V within Scalars. Otherwise remaps it through those indices.
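    /// For example, with Scalars = {a, b}, empty reorder indices and
    /// ReuseShuffleIndices = {1, 0, 1, 0}, a lookup of b finds lane 1 in Scalars
    /// and returns 0, the first position of that lane in the reuse mask.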
2824 int findLaneForValue(Value *V) const {
2825 unsigned FoundLane = std::distance(first: Scalars.begin(), last: find(Range: Scalars, Val: V));
2826 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2827 if (!ReorderIndices.empty())
2828 FoundLane = ReorderIndices[FoundLane];
2829 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2830 if (!ReuseShuffleIndices.empty()) {
2831 FoundLane = std::distance(first: ReuseShuffleIndices.begin(),
2832 last: find(Range: ReuseShuffleIndices, Val: FoundLane));
2833 }
2834 return FoundLane;
2835 }
2836
2837 /// Build a shuffle mask for graph entry which represents a merge of main
2838 /// and alternate operations.
2839 void
2840 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2841 SmallVectorImpl<int> &Mask,
2842 SmallVectorImpl<Value *> *OpScalars = nullptr,
2843 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2844
2845 /// Return true if this is a non-power-of-2 node.
2846 bool isNonPowOf2Vec() const {
2847 bool IsNonPowerOf2 = !isPowerOf2_32(Value: Scalars.size());
2848 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
2849 "Reshuffling not supported with non-power-of-2 vectors yet.");
2850 return IsNonPowerOf2;
2851 }
2852
2853#ifndef NDEBUG
2854 /// Debug printer.
2855 LLVM_DUMP_METHOD void dump() const {
2856 dbgs() << Idx << ".\n";
2857 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2858 dbgs() << "Operand " << OpI << ":\n";
2859 for (const Value *V : Operands[OpI])
2860 dbgs().indent(NumSpaces: 2) << *V << "\n";
2861 }
2862 dbgs() << "Scalars: \n";
2863 for (Value *V : Scalars)
2864 dbgs().indent(NumSpaces: 2) << *V << "\n";
2865 dbgs() << "State: ";
2866 switch (State) {
2867 case Vectorize:
2868 dbgs() << "Vectorize\n";
2869 break;
2870 case ScatterVectorize:
2871 dbgs() << "ScatterVectorize\n";
2872 break;
2873 case StridedVectorize:
2874 dbgs() << "StridedVectorize\n";
2875 break;
2876 case NeedToGather:
2877 dbgs() << "NeedToGather\n";
2878 break;
2879 }
2880 dbgs() << "MainOp: ";
2881 if (MainOp)
2882 dbgs() << *MainOp << "\n";
2883 else
2884 dbgs() << "NULL\n";
2885 dbgs() << "AltOp: ";
2886 if (AltOp)
2887 dbgs() << *AltOp << "\n";
2888 else
2889 dbgs() << "NULL\n";
2890 dbgs() << "VectorizedValue: ";
2891 if (VectorizedValue)
2892 dbgs() << *VectorizedValue << "\n";
2893 else
2894 dbgs() << "NULL\n";
2895 dbgs() << "ReuseShuffleIndices: ";
2896 if (ReuseShuffleIndices.empty())
2897 dbgs() << "Empty";
2898 else
2899 for (int ReuseIdx : ReuseShuffleIndices)
2900 dbgs() << ReuseIdx << ", ";
2901 dbgs() << "\n";
2902 dbgs() << "ReorderIndices: ";
2903 for (unsigned ReorderIdx : ReorderIndices)
2904 dbgs() << ReorderIdx << ", ";
2905 dbgs() << "\n";
2906 dbgs() << "UserTreeIndices: ";
2907 for (const auto &EInfo : UserTreeIndices)
2908 dbgs() << EInfo << ", ";
2909 dbgs() << "\n";
2910 }
2911#endif
2912 };
2913
2914#ifndef NDEBUG
2915 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2916 InstructionCost VecCost, InstructionCost ScalarCost,
2917 StringRef Banner) const {
2918 dbgs() << "SLP: " << Banner << ":\n";
2919 E->dump();
2920 dbgs() << "SLP: Costs:\n";
2921 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2922 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2923 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2924 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2925 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2926 }
2927#endif
2928
2929 /// Create a new VectorizableTree entry.
2930 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2931 std::optional<ScheduleData *> Bundle,
2932 const InstructionsState &S,
2933 const EdgeInfo &UserTreeIdx,
2934 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2935 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2936 TreeEntry::EntryState EntryState =
2937 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
2938 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
2939 ReuseShuffleIndices, ReorderIndices);
2940 }
2941
2942 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
2943 TreeEntry::EntryState EntryState,
2944 std::optional<ScheduleData *> Bundle,
2945 const InstructionsState &S,
2946 const EdgeInfo &UserTreeIdx,
2947 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
2948 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
2949 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
2950 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
2951 "Need to vectorize gather entry?");
2952 VectorizableTree.push_back(Elt: std::make_unique<TreeEntry>(args&: VectorizableTree));
2953 TreeEntry *Last = VectorizableTree.back().get();
2954 Last->Idx = VectorizableTree.size() - 1;
2955 Last->State = EntryState;
2956 Last->ReuseShuffleIndices.append(in_start: ReuseShuffleIndices.begin(),
2957 in_end: ReuseShuffleIndices.end());
2958 if (ReorderIndices.empty()) {
2959 Last->Scalars.assign(in_start: VL.begin(), in_end: VL.end());
2960 Last->setOperations(S);
2961 } else {
2962 // Reorder scalars and build final mask.
2963 Last->Scalars.assign(NumElts: VL.size(), Elt: nullptr);
2964 transform(Range&: ReorderIndices, d_first: Last->Scalars.begin(),
2965 F: [VL](unsigned Idx) -> Value * {
2966 if (Idx >= VL.size())
2967 return UndefValue::get(T: VL.front()->getType());
2968 return VL[Idx];
2969 });
2970 InstructionsState S = getSameOpcode(VL: Last->Scalars, TLI: *TLI);
2971 Last->setOperations(S);
2972 Last->ReorderIndices.append(in_start: ReorderIndices.begin(), in_end: ReorderIndices.end());
2973 }
2974 if (Last->State != TreeEntry::NeedToGather) {
2975 for (Value *V : VL) {
2976 const TreeEntry *TE = getTreeEntry(V);
2977 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
2978 "Scalar already in tree!");
2979 if (TE) {
2980 if (TE != Last)
2981 MultiNodeScalars.try_emplace(Key: V).first->getSecond().push_back(Elt: Last);
2982 continue;
2983 }
2984 ScalarToTreeEntry[V] = Last;
2985 }
2986 // Update the scheduler bundle to point to this TreeEntry.
2987 ScheduleData *BundleMember = *Bundle;
2988 assert((BundleMember || isa<PHINode>(S.MainOp) ||
2989 isVectorLikeInstWithConstOps(S.MainOp) ||
2990 doesNotNeedToSchedule(VL)) &&
2991 "Bundle and VL out of sync");
2992 if (BundleMember) {
2993 for (Value *V : VL) {
2994 if (doesNotNeedToBeScheduled(V))
2995 continue;
2996 if (!BundleMember)
2997 continue;
2998 BundleMember->TE = Last;
2999 BundleMember = BundleMember->NextInBundle;
3000 }
3001 }
3002 assert(!BundleMember && "Bundle and VL out of sync");
3003 } else {
3004 // Build a map for gathered scalars to the nodes where they are used.
3005 bool AllConstsOrCasts = true;
3006 for (Value *V : VL)
3007 if (!isConstant(V)) {
3008 auto *I = dyn_cast<CastInst>(Val: V);
3009 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3010 ValueToGatherNodes.try_emplace(Key: V).first->getSecond().insert(Ptr: Last);
3011 }
3012 if (AllConstsOrCasts)
3013 CastMaxMinBWSizes =
3014 std::make_pair(x: std::numeric_limits<unsigned>::max(), y: 1);
3015 MustGather.insert(I: VL.begin(), E: VL.end());
3016 }
3017
3018 if (UserTreeIdx.UserTE) {
3019 Last->UserTreeIndices.push_back(Elt: UserTreeIdx);
3020 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3021 "Reordering isn't implemented for non-power-of-2 nodes yet");
3022 }
3023 return Last;
3024 }
3025
3026 /// -- Vectorization State --
3027 /// Holds all of the tree entries.
3028 TreeEntry::VecTreeTy VectorizableTree;
3029
3030#ifndef NDEBUG
3031 /// Debug printer.
3032 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3033 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3034 VectorizableTree[Id]->dump();
3035 dbgs() << "\n";
3036 }
3037 }
3038#endif
3039
3040 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(Val: V); }
3041
3042 const TreeEntry *getTreeEntry(Value *V) const {
3043 return ScalarToTreeEntry.lookup(Val: V);
3044 }
3045
3046  /// Check that the operand node of an alternate node does not generate a
3047  /// buildvector sequence. If it does, it is probably not worth building an
3048  /// alternate shuffle when the number of buildvector operands plus the
3049  /// alternate instruction exceeds the number of buildvector instructions.
3050 /// \param S the instructions state of the analyzed values.
3051 /// \param VL list of the instructions with alternate opcodes.
3052 bool areAltOperandsProfitable(const InstructionsState &S,
3053 ArrayRef<Value *> VL) const;
3054
3055 /// Checks if the specified list of the instructions/values can be vectorized
3056 /// and fills required data before actual scheduling of the instructions.
3057 TreeEntry::EntryState getScalarsVectorizationState(
3058 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3059 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3060
3061 /// Maps a specific scalar to its tree entry.
3062 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3063
3064  /// Maps scalars that are used in several vectorized nodes to the list of
3065  /// those nodes.
3066 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3067
3068 /// Maps a value to the proposed vectorizable size.
3069 SmallDenseMap<Value *, unsigned> InstrElementSize;
3070
3071 /// A list of scalars that we found that we need to keep as scalars.
3072 ValueSet MustGather;
3073
3074  /// A map between the vectorized entries and the last instructions in the
3075  /// bundles. The bundles are built in use order, not in the def order of the
3076  /// instructions, so we cannot rely on the last instruction in a bundle also
3077  /// being the last instruction in program order during the vectorization
3078  /// process (the basic blocks are modified); these instructions need to be
3079  /// gathered beforehand.
3080 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3081
3082  /// List of gather nodes that depend on other gather/vector nodes and should
3083  /// be emitted after the vector instruction emission process to correctly
3084  /// handle the order of the vector instructions and shuffles.
3085 SetVector<const TreeEntry *> PostponedGathers;
3086
3087 using ValueToGatherNodesMap =
3088 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3089 ValueToGatherNodesMap ValueToGatherNodes;
3090
3091 /// This POD struct describes one external user in the vectorized tree.
3092 struct ExternalUser {
3093 ExternalUser(Value *S, llvm::User *U, int L)
3094 : Scalar(S), User(U), Lane(L) {}
3095
3096 // Which scalar in our function.
3097 Value *Scalar;
3098
3099 // Which user that uses the scalar.
3100 llvm::User *User;
3101
3102 // Which lane does the scalar belong to.
3103 int Lane;
3104 };
3105 using UserList = SmallVector<ExternalUser, 16>;
3106
3107 /// Checks if two instructions may access the same memory.
3108 ///
3109 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3110 /// is invariant in the calling loop.
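  /// The result is cached for both (Inst1, Inst2) and (Inst2, Inst1), so the
  /// query order does not matter for subsequent lookups.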
3111 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3112 Instruction *Inst2) {
3113 if (!Loc1.Ptr || !isSimple(I: Inst1) || !isSimple(I: Inst2))
3114 return true;
3115 // First check if the result is already in the cache.
3116 AliasCacheKey Key = std::make_pair(x&: Inst1, y&: Inst2);
3117 auto It = AliasCache.find(Val: Key);
3118 if (It != AliasCache.end())
3119 return It->second;
3120 bool Aliased = isModOrRefSet(MRI: BatchAA.getModRefInfo(I: Inst2, OptLoc: Loc1));
3121 // Store the result in the cache.
3122 AliasCache.try_emplace(Key, Args&: Aliased);
3123 AliasCache.try_emplace(Key: std::make_pair(x&: Inst2, y&: Inst1), Args&: Aliased);
3124 return Aliased;
3125 }
3126
3127 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3128
3129 /// Cache for alias results.
3130 /// TODO: consider moving this to the AliasAnalysis itself.
3131 DenseMap<AliasCacheKey, bool> AliasCache;
3132
3133 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3134 // globally through SLP because we don't perform any action which
3135 // invalidates capture results.
3136 BatchAAResults BatchAA;
3137
3138 /// Temporary store for deleted instructions. Instructions will be deleted
3139 /// eventually when the BoUpSLP is destructed. The deferral is required to
3140 /// ensure that there are no incorrect collisions in the AliasCache, which
3141 /// can happen if a new instruction is allocated at the same address as a
3142 /// previously deleted instruction.
3143 DenseSet<Instruction *> DeletedInstructions;
3144
3145  /// Set of instructions that have already been analyzed as reduction roots.
3146 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3147
3148 /// Set of hashes for the list of reduction values already being analyzed.
3149 DenseSet<size_t> AnalyzedReductionVals;
3150
3151  /// Values that have already been analyzed for minimal bitwidth and found to
3152  /// be non-profitable.
3153 DenseSet<Value *> AnalyzedMinBWVals;
3154
3155  /// A list of values that need to be extracted out of the tree.
3156 /// This list holds pairs of (Internal Scalar : External User). External User
3157 /// can be nullptr, it means that this Internal Scalar will be used later,
3158 /// after vectorization.
3159 UserList ExternalUses;
3160
3161  /// A list of GEPs which can be replaced by scalar GEPs instead of
3162 /// extractelement instructions.
3163 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3164
3165 /// Values used only by @llvm.assume calls.
3166 SmallPtrSet<const Value *, 32> EphValues;
3167
3168 /// Holds all of the instructions that we gathered, shuffle instructions and
3169 /// extractelements.
3170 SetVector<Instruction *> GatherShuffleExtractSeq;
3171
3172 /// A list of blocks that we are going to CSE.
3173 DenseSet<BasicBlock *> CSEBlocks;
3174
3175 /// Contains all scheduling relevant data for an instruction.
3176 /// A ScheduleData either represents a single instruction or a member of an
3177 /// instruction bundle (= a group of instructions which is combined into a
3178 /// vector instruction).
3179 struct ScheduleData {
3180 // The initial value for the dependency counters. It means that the
3181 // dependencies are not calculated yet.
3182 enum { InvalidDeps = -1 };
3183
3184 ScheduleData() = default;
3185
3186 void init(int BlockSchedulingRegionID, Value *OpVal) {
3187 FirstInBundle = this;
3188 NextInBundle = nullptr;
3189 NextLoadStore = nullptr;
3190 IsScheduled = false;
3191 SchedulingRegionID = BlockSchedulingRegionID;
3192 clearDependencies();
3193 OpValue = OpVal;
3194 TE = nullptr;
3195 }
3196
3197 /// Verify basic self consistency properties
3198 void verify() {
3199 if (hasValidDependencies()) {
3200 assert(UnscheduledDeps <= Dependencies && "invariant");
3201 } else {
3202 assert(UnscheduledDeps == Dependencies && "invariant");
3203 }
3204
3205 if (IsScheduled) {
3206 assert(isSchedulingEntity() &&
3207 "unexpected scheduled state");
3208 for (const ScheduleData *BundleMember = this; BundleMember;
3209 BundleMember = BundleMember->NextInBundle) {
3210 assert(BundleMember->hasValidDependencies() &&
3211 BundleMember->UnscheduledDeps == 0 &&
3212 "unexpected scheduled state");
3213 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3214 "only bundle is marked scheduled");
3215 }
3216 }
3217
3218 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3219 "all bundle members must be in same basic block");
3220 }
3221
3222 /// Returns true if the dependency information has been calculated.
3223    /// Note that dependency validity can vary between instructions within
3224 /// a single bundle.
3225 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3226
3227 /// Returns true for single instructions and for bundle representatives
3228 /// (= the head of a bundle).
3229 bool isSchedulingEntity() const { return FirstInBundle == this; }
3230
3231 /// Returns true if it represents an instruction bundle and not only a
3232 /// single instruction.
3233 bool isPartOfBundle() const {
3234 return NextInBundle != nullptr || FirstInBundle != this || TE;
3235 }
3236
3237 /// Returns true if it is ready for scheduling, i.e. it has no more
3238 /// unscheduled depending instructions/bundles.
3239 bool isReady() const {
3240 assert(isSchedulingEntity() &&
3241 "can't consider non-scheduling entity for ready list");
3242 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3243 }
3244
3245 /// Modifies the number of unscheduled dependencies for this instruction,
3246 /// and returns the number of remaining dependencies for the containing
3247 /// bundle.
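    /// Typically called with Incr == -1 from schedule() when one of the
    /// dependent instructions counted in \a Dependencies has been scheduled.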
3248 int incrementUnscheduledDeps(int Incr) {
3249 assert(hasValidDependencies() &&
3250 "increment of unscheduled deps would be meaningless");
3251 UnscheduledDeps += Incr;
3252 return FirstInBundle->unscheduledDepsInBundle();
3253 }
3254
3255 /// Sets the number of unscheduled dependencies to the number of
3256 /// dependencies.
3257 void resetUnscheduledDeps() {
3258 UnscheduledDeps = Dependencies;
3259 }
3260
3261 /// Clears all dependency information.
3262 void clearDependencies() {
3263 Dependencies = InvalidDeps;
3264 resetUnscheduledDeps();
3265 MemoryDependencies.clear();
3266 ControlDependencies.clear();
3267 }
3268
3269 int unscheduledDepsInBundle() const {
3270 assert(isSchedulingEntity() && "only meaningful on the bundle");
3271 int Sum = 0;
3272 for (const ScheduleData *BundleMember = this; BundleMember;
3273 BundleMember = BundleMember->NextInBundle) {
3274 if (BundleMember->UnscheduledDeps == InvalidDeps)
3275 return InvalidDeps;
3276 Sum += BundleMember->UnscheduledDeps;
3277 }
3278 return Sum;
3279 }
3280
3281 void dump(raw_ostream &os) const {
3282 if (!isSchedulingEntity()) {
3283 os << "/ " << *Inst;
3284 } else if (NextInBundle) {
3285 os << '[' << *Inst;
3286 ScheduleData *SD = NextInBundle;
3287 while (SD) {
3288 os << ';' << *SD->Inst;
3289 SD = SD->NextInBundle;
3290 }
3291 os << ']';
3292 } else {
3293 os << *Inst;
3294 }
3295 }
3296
3297 Instruction *Inst = nullptr;
3298
3299 /// Opcode of the current instruction in the schedule data.
3300 Value *OpValue = nullptr;
3301
3302 /// The TreeEntry that this instruction corresponds to.
3303 TreeEntry *TE = nullptr;
3304
3305 /// Points to the head in an instruction bundle (and always to this for
3306 /// single instructions).
3307 ScheduleData *FirstInBundle = nullptr;
3308
3309    /// Singly linked list of all instructions in a bundle. Null if it is a
3310 /// single instruction.
3311 ScheduleData *NextInBundle = nullptr;
3312
3313    /// Singly linked list of all memory instructions (e.g. load, store, call)
3314 /// in the block - until the end of the scheduling region.
3315 ScheduleData *NextLoadStore = nullptr;
3316
3317 /// The dependent memory instructions.
3318 /// This list is derived on demand in calculateDependencies().
3319 SmallVector<ScheduleData *, 4> MemoryDependencies;
3320
3321 /// List of instructions which this instruction could be control dependent
3322 /// on. Allowing such nodes to be scheduled below this one could introduce
3323 /// a runtime fault which didn't exist in the original program.
3324 /// ex: this is a load or udiv following a readonly call which inf loops
3325 SmallVector<ScheduleData *, 4> ControlDependencies;
3326
3327 /// This ScheduleData is in the current scheduling region if this matches
3328 /// the current SchedulingRegionID of BlockScheduling.
3329 int SchedulingRegionID = 0;
3330
3331 /// Used for getting a "good" final ordering of instructions.
3332 int SchedulingPriority = 0;
3333
3334    /// The number of dependencies. It consists of the number of users of the
3335    /// instruction plus the number of dependent memory instructions (if any).
3336 /// This value is calculated on demand.
3337 /// If InvalidDeps, the number of dependencies is not calculated yet.
3338 int Dependencies = InvalidDeps;
3339
3340 /// The number of dependencies minus the number of dependencies of scheduled
3341 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3342 /// for scheduling.
3343 /// Note that this is negative as long as Dependencies is not calculated.
3344 int UnscheduledDeps = InvalidDeps;
3345
3346 /// True if this instruction is scheduled (or considered as scheduled in the
3347 /// dry-run).
3348 bool IsScheduled = false;
3349 };
3350
3351#ifndef NDEBUG
3352 friend inline raw_ostream &operator<<(raw_ostream &os,
3353 const BoUpSLP::ScheduleData &SD) {
3354 SD.dump(os);
3355 return os;
3356 }
3357#endif
3358
3359 friend struct GraphTraits<BoUpSLP *>;
3360 friend struct DOTGraphTraits<BoUpSLP *>;
3361
3362 /// Contains all scheduling data for a basic block.
3363  /// It does not schedule instructions which are not memory read/write
3364  /// instructions and whose operands are either constants, arguments, phis,
3365  /// or instructions from other blocks, or whose users are phis or belong to
3366  /// other blocks. The resulting vector instructions can be placed at the
3367  /// beginning of the basic block without scheduling (if the operands do not
3368  /// need to be scheduled) or at the end of the block (if the users are
3369  /// outside of the block). This saves some compile time and memory used by
3370  /// the compiler.
3371  /// ScheduleData is assigned to each instruction between the boundaries of
3372  /// the tree entry, even to those which are not part of the graph. This is
3373  /// required to correctly follow the dependencies between the instructions
3374  /// and to schedule them correctly. ScheduleData is not allocated for
3375  /// instructions which do not require scheduling, such as phis, nodes with
3376  /// only extractelements/insertelements, or nodes whose instructions have
3377  /// uses/operands outside of the block.
3378 struct BlockScheduling {
3379 BlockScheduling(BasicBlock *BB)
3380 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3381
3382 void clear() {
3383 ReadyInsts.clear();
3384 ScheduleStart = nullptr;
3385 ScheduleEnd = nullptr;
3386 FirstLoadStoreInRegion = nullptr;
3387 LastLoadStoreInRegion = nullptr;
3388 RegionHasStackSave = false;
3389
3390 // Reduce the maximum schedule region size by the size of the
3391 // previous scheduling run.
3392 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3393 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3394 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3395 ScheduleRegionSize = 0;
3396
3397 // Make a new scheduling region, i.e. all existing ScheduleData is not
3398 // in the new region yet.
3399 ++SchedulingRegionID;
3400 }
3401
3402 ScheduleData *getScheduleData(Instruction *I) {
3403 if (BB != I->getParent())
3404        // Avoid lookup if it can't possibly be in the map.
3405 return nullptr;
3406 ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
3407 if (SD && isInSchedulingRegion(SD))
3408 return SD;
3409 return nullptr;
3410 }
3411
3412 ScheduleData *getScheduleData(Value *V) {
3413 if (auto *I = dyn_cast<Instruction>(Val: V))
3414 return getScheduleData(I);
3415 return nullptr;
3416 }
3417
3418 ScheduleData *getScheduleData(Value *V, Value *Key) {
3419 if (V == Key)
3420 return getScheduleData(V);
3421 auto I = ExtraScheduleDataMap.find(Val: V);
3422 if (I != ExtraScheduleDataMap.end()) {
3423 ScheduleData *SD = I->second.lookup(Val: Key);
3424 if (SD && isInSchedulingRegion(SD))
3425 return SD;
3426 }
3427 return nullptr;
3428 }
3429
3430 bool isInSchedulingRegion(ScheduleData *SD) const {
3431 return SD->SchedulingRegionID == SchedulingRegionID;
3432 }
3433
3434 /// Marks an instruction as scheduled and puts all dependent ready
3435 /// instructions into the ready-list.
3436 template <typename ReadyListType>
3437 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3438 SD->IsScheduled = true;
3439 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3440
3441 for (ScheduleData *BundleMember = SD; BundleMember;
3442 BundleMember = BundleMember->NextInBundle) {
3443 if (BundleMember->Inst != BundleMember->OpValue)
3444 continue;
3445
3446 // Handle the def-use chain dependencies.
3447
3448 // Decrement the unscheduled counter and insert to ready list if ready.
3449 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3450 doForAllOpcodes(V: I, Action: [&ReadyList](ScheduleData *OpDef) {
3451 if (OpDef && OpDef->hasValidDependencies() &&
3452 OpDef->incrementUnscheduledDeps(Incr: -1) == 0) {
3453 // There are no more unscheduled dependencies after
3454 // decrementing, so we can put the dependent instruction
3455 // into the ready list.
3456 ScheduleData *DepBundle = OpDef->FirstInBundle;
3457 assert(!DepBundle->IsScheduled &&
3458 "already scheduled bundle gets ready");
3459 ReadyList.insert(DepBundle);
3460 LLVM_DEBUG(dbgs()
3461 << "SLP: gets ready (def): " << *DepBundle << "\n");
3462 }
3463 });
3464 };
3465
3466 // If BundleMember is a vector bundle, its operands may have been
3467 // reordered during buildTree(). We therefore need to get its operands
3468 // through the TreeEntry.
3469 if (TreeEntry *TE = BundleMember->TE) {
3470 // Need to search for the lane since the tree entry can be reordered.
3471 int Lane = std::distance(first: TE->Scalars.begin(),
3472 last: find(Range&: TE->Scalars, Val: BundleMember->Inst));
3473 assert(Lane >= 0 && "Lane not set");
3474
3475 // Since vectorization tree is being built recursively this assertion
3476 // ensures that the tree entry has all operands set before reaching
3477 // this code. Couple of exceptions known at the moment are extracts
3478 // where their second (immediate) operand is not added. Since
3479 // immediates do not affect scheduler behavior this is considered
3480 // okay.
3481 auto *In = BundleMember->Inst;
3482 assert(
3483 In &&
3484 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3485 In->getNumOperands() == TE->getNumOperands()) &&
3486 "Missed TreeEntry operands?");
3487 (void)In; // fake use to avoid build failure when assertions disabled
3488
3489 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3490 OpIdx != NumOperands; ++OpIdx)
3491 if (auto *I = dyn_cast<Instruction>(Val: TE->getOperand(OpIdx)[Lane]))
3492 DecrUnsched(I);
3493 } else {
3494 // If BundleMember is a stand-alone instruction, no operand reordering
3495 // has taken place, so we directly access its operands.
3496 for (Use &U : BundleMember->Inst->operands())
3497 if (auto *I = dyn_cast<Instruction>(Val: U.get()))
3498 DecrUnsched(I);
3499 }
3500 // Handle the memory dependencies.
3501 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3502 if (MemoryDepSD->hasValidDependencies() &&
3503 MemoryDepSD->incrementUnscheduledDeps(Incr: -1) == 0) {
3504 // There are no more unscheduled dependencies after decrementing,
3505 // so we can put the dependent instruction into the ready list.
3506 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3507 assert(!DepBundle->IsScheduled &&
3508 "already scheduled bundle gets ready");
3509 ReadyList.insert(DepBundle);
3510 LLVM_DEBUG(dbgs()
3511 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3512 }
3513 }
3514 // Handle the control dependencies.
3515 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3516 if (DepSD->incrementUnscheduledDeps(Incr: -1) == 0) {
3517 // There are no more unscheduled dependencies after decrementing,
3518 // so we can put the dependent instruction into the ready list.
3519 ScheduleData *DepBundle = DepSD->FirstInBundle;
3520 assert(!DepBundle->IsScheduled &&
3521 "already scheduled bundle gets ready");
3522 ReadyList.insert(DepBundle);
3523 LLVM_DEBUG(dbgs()
3524 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3525 }
3526 }
3527 }
3528 }
3529
3530 /// Verify basic self consistency properties of the data structure.
3531 void verify() {
3532 if (!ScheduleStart)
3533 return;
3534
3535 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3536 ScheduleStart->comesBefore(ScheduleEnd) &&
3537 "Not a valid scheduling region?");
3538
3539 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3540 auto *SD = getScheduleData(I);
3541 if (!SD)
3542 continue;
3543 assert(isInSchedulingRegion(SD) &&
3544 "primary schedule data not in window?");
3545 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3546 "entire bundle in window!");
3547 (void)SD;
3548 doForAllOpcodes(V: I, Action: [](ScheduleData *SD) { SD->verify(); });
3549 }
3550
3551 for (auto *SD : ReadyInsts) {
3552 assert(SD->isSchedulingEntity() && SD->isReady() &&
3553 "item in ready list not ready?");
3554 (void)SD;
3555 }
3556 }
3557
3558 void doForAllOpcodes(Value *V,
3559 function_ref<void(ScheduleData *SD)> Action) {
3560 if (ScheduleData *SD = getScheduleData(V))
3561 Action(SD);
3562 auto I = ExtraScheduleDataMap.find(Val: V);
3563 if (I != ExtraScheduleDataMap.end())
3564 for (auto &P : I->second)
3565 if (isInSchedulingRegion(SD: P.second))
3566 Action(P.second);
3567 }
3568
3569 /// Put all instructions into the ReadyList which are ready for scheduling.
3570 template <typename ReadyListType>
3571 void initialFillReadyList(ReadyListType &ReadyList) {
3572 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3573 doForAllOpcodes(V: I, Action: [&](ScheduleData *SD) {
3574 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3575 SD->isReady()) {
3576 ReadyList.insert(SD);
3577 LLVM_DEBUG(dbgs()
3578 << "SLP: initially in ready list: " << *SD << "\n");
3579 }
3580 });
3581 }
3582 }
3583
3584 /// Build a bundle from the ScheduleData nodes corresponding to the
3585 /// scalar instruction for each lane.
3586 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3587
3588 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3589 /// cyclic dependencies. This is only a dry-run, no instructions are
3590 /// actually moved at this stage.
3591 /// \returns the scheduling bundle. The returned Optional value is not
3592 /// std::nullopt if \p VL is allowed to be scheduled.
3593 std::optional<ScheduleData *>
3594 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3595 const InstructionsState &S);
3596
3597 /// Un-bundles a group of instructions.
3598 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3599
3600 /// Allocates schedule data chunk.
3601 ScheduleData *allocateScheduleDataChunks();
3602
3603 /// Extends the scheduling region so that V is inside the region.
3604 /// \returns true if the region size is within the limit.
3605 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3606
3607 /// Initialize the ScheduleData structures for new instructions in the
3608 /// scheduling region.
3609 void initScheduleData(Instruction *FromI, Instruction *ToI,
3610 ScheduleData *PrevLoadStore,
3611 ScheduleData *NextLoadStore);
3612
3613 /// Updates the dependency information of a bundle and of all instructions/
3614 /// bundles which depend on the original bundle.
3615 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3616 BoUpSLP *SLP);
3617
3618    /// Sets all instructions in the scheduling region to un-scheduled.
3619 void resetSchedule();
3620
3621 BasicBlock *BB;
3622
3623 /// Simple memory allocation for ScheduleData.
3624 SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3625
3626 /// The size of a ScheduleData array in ScheduleDataChunks.
3627 int ChunkSize;
3628
3629 /// The allocator position in the current chunk, which is the last entry
3630 /// of ScheduleDataChunks.
3631 int ChunkPos;
3632
3633 /// Attaches ScheduleData to Instruction.
3634 /// Note that the mapping survives during all vectorization iterations, i.e.
3635 /// ScheduleData structures are recycled.
3636 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3637
3638 /// Attaches ScheduleData to Instruction with the leading key.
3639 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3640 ExtraScheduleDataMap;
3641
3642 /// The ready-list for scheduling (only used for the dry-run).
3643 SetVector<ScheduleData *> ReadyInsts;
3644
3645 /// The first instruction of the scheduling region.
3646 Instruction *ScheduleStart = nullptr;
3647
3648 /// The first instruction _after_ the scheduling region.
3649 Instruction *ScheduleEnd = nullptr;
3650
3651 /// The first memory accessing instruction in the scheduling region
3652 /// (can be null).
3653 ScheduleData *FirstLoadStoreInRegion = nullptr;
3654
3655 /// The last memory accessing instruction in the scheduling region
3656 /// (can be null).
3657 ScheduleData *LastLoadStoreInRegion = nullptr;
3658
3659 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3660 /// region? Used to optimize the dependence calculation for the
3661 /// common case where there isn't.
3662 bool RegionHasStackSave = false;
3663
3664 /// The current size of the scheduling region.
3665 int ScheduleRegionSize = 0;
3666
3667 /// The maximum size allowed for the scheduling region.
3668 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3669
3670 /// The ID of the scheduling region. For a new vectorization iteration this
3671 /// is incremented which "removes" all ScheduleData from the region.
3672 /// Make sure that the initial SchedulingRegionID is greater than the
3673 /// initial SchedulingRegionID in ScheduleData (which is 0).
3674 int SchedulingRegionID = 1;
3675 };
3676
3677 /// Attaches the BlockScheduling structures to basic blocks.
3678 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3679
3680 /// Performs the "real" scheduling. Done before vectorization is actually
3681 /// performed in a basic block.
3682 void scheduleBlock(BlockScheduling *BS);
3683
3684 /// List of users to ignore during scheduling and that don't need extracting.
3685 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3686
3687 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3688 /// sorted SmallVectors of unsigned.
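  /// The empty and tombstone keys use the sentinel indices ~1U and ~2U, which
  /// are not expected to occur in real order vectors.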
3689 struct OrdersTypeDenseMapInfo {
3690 static OrdersType getEmptyKey() {
3691 OrdersType V;
3692 V.push_back(Elt: ~1U);
3693 return V;
3694 }
3695
3696 static OrdersType getTombstoneKey() {
3697 OrdersType V;
3698 V.push_back(Elt: ~2U);
3699 return V;
3700 }
3701
3702 static unsigned getHashValue(const OrdersType &V) {
3703 return static_cast<unsigned>(hash_combine_range(first: V.begin(), last: V.end()));
3704 }
3705
3706 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3707 return LHS == RHS;
3708 }
3709 };
3710
3711 // Analysis and block reference.
3712 Function *F;
3713 ScalarEvolution *SE;
3714 TargetTransformInfo *TTI;
3715 TargetLibraryInfo *TLI;
3716 LoopInfo *LI;
3717 DominatorTree *DT;
3718 AssumptionCache *AC;
3719 DemandedBits *DB;
3720 const DataLayout *DL;
3721 OptimizationRemarkEmitter *ORE;
3722
3723 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3724 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3725
3726 /// Instruction builder to construct the vectorized tree.
3727 IRBuilder<TargetFolder> Builder;
3728
3729 /// A map of scalar integer values to the smallest bit width with which they
3730 /// can legally be represented. The values map to (width, signed) pairs,
3731 /// where "width" indicates the minimum bit width and "signed" is True if the
3732 /// value must be signed-extended, rather than zero-extended, back to its
3733 /// original width.
3734 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
3735
3736 /// Final size of the reduced vector, if the current graph represents the
3737 /// input for the reduction and it was possible to narrow the size of the
3738 /// reduction.
3739 unsigned ReductionBitWidth = 0;
3740
3741 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
3742 /// type sizes, used in the tree.
3743 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
3744
3745  /// Indices of the vectorized nodes, which are supposed to be the roots of a
3746  /// new bitwidth analysis attempt, like trunc, IToFP or ICmp.
3747 DenseSet<unsigned> ExtraBitWidthNodes;
3748};
3749
3750} // end namespace slpvectorizer
3751
3752template <> struct GraphTraits<BoUpSLP *> {
3753 using TreeEntry = BoUpSLP::TreeEntry;
3754
3755 /// NodeRef has to be a pointer per the GraphWriter.
3756 using NodeRef = TreeEntry *;
3757
3758 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
3759
3760 /// Add the VectorizableTree to the index iterator to be able to return
3761 /// TreeEntry pointers.
3762 struct ChildIteratorType
3763 : public iterator_adaptor_base<
3764 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
3765 ContainerTy &VectorizableTree;
3766
3767 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
3768 ContainerTy &VT)
3769 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3770
3771 NodeRef operator*() { return I->UserTE; }
3772 };
3773
3774 static NodeRef getEntryNode(BoUpSLP &R) {
3775 return R.VectorizableTree[0].get();
3776 }
3777
3778 static ChildIteratorType child_begin(NodeRef N) {
3779 return {N->UserTreeIndices.begin(), N->Container};
3780 }
3781
3782 static ChildIteratorType child_end(NodeRef N) {
3783 return {N->UserTreeIndices.end(), N->Container};
3784 }
3785
3786 /// For the node iterator we just need to turn the TreeEntry iterator into a
3787 /// TreeEntry* iterator so that it dereferences to NodeRef.
3788 class nodes_iterator {
3789 using ItTy = ContainerTy::iterator;
3790 ItTy It;
3791
3792 public:
3793 nodes_iterator(const ItTy &It2) : It(It2) {}
3794 NodeRef operator*() { return It->get(); }
3795 nodes_iterator operator++() {
3796 ++It;
3797 return *this;
3798 }
3799 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3800 };
3801
3802 static nodes_iterator nodes_begin(BoUpSLP *R) {
3803 return nodes_iterator(R->VectorizableTree.begin());
3804 }
3805
3806 static nodes_iterator nodes_end(BoUpSLP *R) {
3807 return nodes_iterator(R->VectorizableTree.end());
3808 }
3809
3810 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3811};
3812
3813template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3814 using TreeEntry = BoUpSLP::TreeEntry;
3815
3816 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3817
3818 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3819 std::string Str;
3820 raw_string_ostream OS(Str);
3821 OS << Entry->Idx << ".\n";
3822 if (isSplat(VL: Entry->Scalars))
3823 OS << "<splat> ";
3824 for (auto *V : Entry->Scalars) {
3825 OS << *V;
3826 if (llvm::any_of(Range: R->ExternalUses, P: [&](const BoUpSLP::ExternalUser &EU) {
3827 return EU.Scalar == V;
3828 }))
3829 OS << " <extract>";
3830 OS << "\n";
3831 }
3832 return Str;
3833 }
3834
3835 static std::string getNodeAttributes(const TreeEntry *Entry,
3836 const BoUpSLP *) {
3837 if (Entry->State == TreeEntry::NeedToGather)
3838 return "color=red";
3839 if (Entry->State == TreeEntry::ScatterVectorize ||
3840 Entry->State == TreeEntry::StridedVectorize)
3841 return "color=blue";
3842 return "";
3843 }
3844};
3845
3846} // end namespace llvm
3847
3848BoUpSLP::~BoUpSLP() {
3849 SmallVector<WeakTrackingVH> DeadInsts;
3850 for (auto *I : DeletedInstructions) {
3851 for (Use &U : I->operands()) {
3852 auto *Op = dyn_cast<Instruction>(Val: U.get());
3853 if (Op && !DeletedInstructions.count(V: Op) && Op->hasOneUser() &&
3854 wouldInstructionBeTriviallyDead(I: Op, TLI))
3855 DeadInsts.emplace_back(Args&: Op);
3856 }
3857 I->dropAllReferences();
3858 }
3859 for (auto *I : DeletedInstructions) {
3860 assert(I->use_empty() &&
3861 "trying to erase instruction with users.");
3862 I->eraseFromParent();
3863 }
3864
3865 // Cleanup any dead scalar code feeding the vectorized instructions
3866 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
3867
3868#ifdef EXPENSIVE_CHECKS
3869 // If we could guarantee that this call is not extremely slow, we could
3870 // remove the ifdef limitation (see PR47712).
3871 assert(!verifyFunction(*F, &dbgs()));
3872#endif
3873}
3874
3875/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3876/// contains the original mask for the scalars reused in the node. The procedure
3877/// transforms this mask in accordance with the given \p Mask.
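/// In effect, each original element Reuses[I] is moved to position Mask[I];
/// positions not written through the mask keep their previous value.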
3878static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
3879 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3880 "Expected non-empty mask.");
3881 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3882 Prev.swap(RHS&: Reuses);
3883 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3884 if (Mask[I] != PoisonMaskElem)
3885 Reuses[Mask[I]] = Prev[I];
3886}
3887
3888/// Reorders the given \p Order according to the given \p Mask. \p Order is
3889/// the original order of the scalars. The procedure transforms the provided
3890/// order in accordance with the given \p Mask. If the resulting \p Order is
3891/// just an identity order, \p Order is cleared.
3892static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
3893 bool BottomOrder = false) {
3894 assert(!Mask.empty() && "Expected non-empty mask.");
3895 unsigned Sz = Mask.size();
3896 if (BottomOrder) {
3897 SmallVector<unsigned> PrevOrder;
3898 if (Order.empty()) {
3899 PrevOrder.resize(N: Sz);
3900 std::iota(first: PrevOrder.begin(), last: PrevOrder.end(), value: 0);
3901 } else {
3902 PrevOrder.swap(RHS&: Order);
3903 }
3904 Order.assign(NumElts: Sz, Elt: Sz);
3905 for (unsigned I = 0; I < Sz; ++I)
3906 if (Mask[I] != PoisonMaskElem)
3907 Order[I] = PrevOrder[Mask[I]];
3908 if (all_of(Range: enumerate(First&: Order), P: [&](const auto &Data) {
3909 return Data.value() == Sz || Data.index() == Data.value();
3910 })) {
3911 Order.clear();
3912 return;
3913 }
3914 fixupOrderingIndices(Order);
3915 return;
3916 }
3917 SmallVector<int> MaskOrder;
3918 if (Order.empty()) {
3919 MaskOrder.resize(N: Sz);
3920 std::iota(first: MaskOrder.begin(), last: MaskOrder.end(), value: 0);
3921 } else {
3922 inversePermutation(Indices: Order, Mask&: MaskOrder);
3923 }
3924 reorderReuses(Reuses&: MaskOrder, Mask);
3925 if (ShuffleVectorInst::isIdentityMask(Mask: MaskOrder, NumSrcElts: Sz)) {
3926 Order.clear();
3927 return;
3928 }
3929 Order.assign(NumElts: Sz, Elt: Sz);
3930 for (unsigned I = 0; I < Sz; ++I)
3931 if (MaskOrder[I] != PoisonMaskElem)
3932 Order[MaskOrder[I]] = I;
3933 fixupOrderingIndices(Order);
3934}
3935
3936std::optional<BoUpSLP::OrdersType>
3937BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
3938 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
3939 // Try to find subvector extract/insert patterns and reorder only such
3940 // patterns.
3941 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
3942 Type *ScalarTy = GatheredScalars.front()->getType();
3943 int NumScalars = GatheredScalars.size();
3944 if (!isValidElementType(Ty: ScalarTy))
3945 return std::nullopt;
3946 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: NumScalars);
3947 int NumParts = TTI->getNumberOfParts(Tp: VecTy);
3948 if (NumParts == 0 || NumParts >= NumScalars)
3949 NumParts = 1;
3950 SmallVector<int> ExtractMask;
3951 SmallVector<int> Mask;
3952 SmallVector<SmallVector<const TreeEntry *>> Entries;
3953 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
3954 tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
3955 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
3956 isGatherShuffledEntry(TE: &TE, VL: GatheredScalars, Mask, Entries, NumParts,
3957 /*ForOrder=*/true);
3958 // No shuffled operands - ignore.
3959 if (GatherShuffles.empty() && ExtractShuffles.empty())
3960 return std::nullopt;
3961 OrdersType CurrentOrder(NumScalars, NumScalars);
3962 if (GatherShuffles.size() == 1 &&
3963 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
3964 Entries.front().front()->isSame(VL: TE.Scalars)) {
3965 // Perfect match in the graph, will reuse the previously vectorized
3966 // node. Cost is 0.
3967 std::iota(first: CurrentOrder.begin(), last: CurrentOrder.end(), value: 0);
3968 return CurrentOrder;
3969 }
3970 auto IsSplatMask = [](ArrayRef<int> Mask) {
3971 int SingleElt = PoisonMaskElem;
3972 return all_of(Range&: Mask, P: [&](int I) {
3973 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
3974 SingleElt = I;
3975 return I == PoisonMaskElem || I == SingleElt;
3976 });
3977 };
3978 // Exclusive broadcast mask - ignore.
3979 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
3980 (Entries.size() != 1 ||
3981 Entries.front().front()->ReorderIndices.empty())) ||
3982 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
3983 return std::nullopt;
3984 SmallBitVector ShuffledSubMasks(NumParts);
3985 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
3986 ArrayRef<int> Mask, int PartSz, int NumParts,
3987 function_ref<unsigned(unsigned)> GetVF) {
3988 for (int I : seq<int>(Begin: 0, End: NumParts)) {
3989 if (ShuffledSubMasks.test(Idx: I))
3990 continue;
3991 const int VF = GetVF(I);
3992 if (VF == 0)
3993 continue;
3994 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(N: I * PartSz, M: PartSz);
3995 // Shuffle of at least 2 vectors - ignore.
3996 if (any_of(Range&: Slice, P: [&](int I) { return I != NumScalars; })) {
3997 std::fill(first: Slice.begin(), last: Slice.end(), value: NumScalars);
3998 ShuffledSubMasks.set(I);
3999 continue;
4000 }
4001 // Try to include as much elements from the mask as possible.
4002 int FirstMin = INT_MAX;
4003      bool SecondVecFound = false;
4004 for (int K : seq<int>(Begin: 0, End: PartSz)) {
4005 int Idx = Mask[I * PartSz + K];
4006 if (Idx == PoisonMaskElem) {
4007 Value *V = GatheredScalars[I * PartSz + K];
4008 if (isConstant(V) && !isa<PoisonValue>(Val: V)) {
4009 SecondVecFound = true;
4010 break;
4011 }
4012 continue;
4013 }
4014 if (Idx < VF) {
4015 if (FirstMin > Idx)
4016 FirstMin = Idx;
4017 } else {
4018 SecondVecFound = true;
4019 break;
4020 }
4021 }
4022 FirstMin = (FirstMin / PartSz) * PartSz;
4023 // Shuffle of at least 2 vectors - ignore.
4024 if (SecondVecFound) {
4025 std::fill(first: Slice.begin(), last: Slice.end(), value: NumScalars);
4026 ShuffledSubMasks.set(I);
4027 continue;
4028 }
4029 for (int K : seq<int>(Begin: 0, End: PartSz)) {
4030 int Idx = Mask[I * PartSz + K];
4031 if (Idx == PoisonMaskElem)
4032 continue;
4033 Idx -= FirstMin;
4034 if (Idx >= PartSz) {
4035 SecondVecFound = true;
4036 break;
4037 }
4038 if (CurrentOrder[I * PartSz + Idx] >
4039 static_cast<unsigned>(I * PartSz + K) &&
4040 CurrentOrder[I * PartSz + Idx] !=
4041 static_cast<unsigned>(I * PartSz + Idx))
4042 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4043 }
4044 // Shuffle of at least 2 vectors - ignore.
4045 if (SecondVecFound) {
4046 std::fill(first: Slice.begin(), last: Slice.end(), value: NumScalars);
4047 ShuffledSubMasks.set(I);
4048 continue;
4049 }
4050 }
4051 };
4052 int PartSz = NumScalars / NumParts;
4053 if (!ExtractShuffles.empty())
4054 TransformMaskToOrder(
4055 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4056 if (!ExtractShuffles[I])
4057 return 0U;
4058 unsigned VF = 0;
4059 for (unsigned Idx : seq<unsigned>(Begin: 0, End: PartSz)) {
4060 int K = I * PartSz + Idx;
4061 if (ExtractMask[K] == PoisonMaskElem)
4062 continue;
4063 if (!TE.ReuseShuffleIndices.empty())
4064 K = TE.ReuseShuffleIndices[K];
4065 if (!TE.ReorderIndices.empty())
4066 K = std::distance(first: TE.ReorderIndices.begin(),
4067 last: find(Range: TE.ReorderIndices, Val: K));
4068 auto *EI = dyn_cast<ExtractElementInst>(Val: TE.Scalars[K]);
4069 if (!EI)
4070 continue;
4071 VF = std::max(a: VF, b: cast<VectorType>(Val: EI->getVectorOperandType())
4072 ->getElementCount()
4073 .getKnownMinValue());
4074 }
4075 return VF;
4076 });
4077 // Check special corner case - single shuffle of the same entry.
4078 if (GatherShuffles.size() == 1 && NumParts != 1) {
4079 if (ShuffledSubMasks.any())
4080 return std::nullopt;
4081 PartSz = NumScalars;
4082 NumParts = 1;
4083 }
4084 if (!Entries.empty())
4085 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4086 if (!GatherShuffles[I])
4087 return 0U;
4088 return std::max(a: Entries[I].front()->getVectorFactor(),
4089 b: Entries[I].back()->getVectorFactor());
4090 });
4091 int NumUndefs =
4092 count_if(Range&: CurrentOrder, P: [&](int Idx) { return Idx == NumScalars; });
4093 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4094 return std::nullopt;
4095 return std::move(CurrentOrder);
4096}
4097
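/// Returns true if \p Ptr1 and \p Ptr2 are based on the same underlying object
/// and both are simple two-operand GEPs; when \p CompareOpcodes is set, their
/// index operands must additionally either both be constants or form a
/// compatible (same or alternate) opcode pair, e.g. p[i] and p[j] indexing the
/// same base pointer p.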
4098static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4099 const TargetLibraryInfo &TLI,
4100 bool CompareOpcodes = true) {
4101 if (getUnderlyingObject(V: Ptr1) != getUnderlyingObject(V: Ptr2))
4102 return false;
4103 auto *GEP1 = dyn_cast<GetElementPtrInst>(Val: Ptr1);
4104 if (!GEP1)
4105 return false;
4106 auto *GEP2 = dyn_cast<GetElementPtrInst>(Val: Ptr2);
4107 if (!GEP2)
4108 return false;
4109 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4110 ((isConstant(V: GEP1->getOperand(i_nocapture: 1)) &&
4111 isConstant(V: GEP2->getOperand(i_nocapture: 1))) ||
4112 !CompareOpcodes ||
4113 getSameOpcode(VL: {GEP1->getOperand(i_nocapture: 1), GEP2->getOperand(i_nocapture: 1)}, TLI)
4114 .getOpcode());
4115}
4116
4117/// Calculates the minimal alignment common to all values in \p VL.
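/// For example, values aligned to 16, 8 and 4 bytes yield a common alignment
/// of 4 bytes.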
4118template <typename T>
4119static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4120 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4121 for (Value *V : VL.drop_front())
4122 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4123 return CommonAlignment;
4124}
4125
4126/// Check if \p Order represents reverse order.
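/// For example, {3, 2, 1, 0} is a reverse order for 4 elements; entries equal
/// to the order size are treated as unspecified slots and accepted as well.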
4127static bool isReverseOrder(ArrayRef<unsigned> Order) {
4128 unsigned Sz = Order.size();
4129 return !Order.empty() && all_of(Range: enumerate(First&: Order), P: [&](const auto &Pair) {
4130 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4131 });
4132}
4133
4134/// Checks if the provided list of pointers \p PointerOps represents strided
4135/// pointers for type \p ElemTy. If they do not, std::nullopt is returned.
4136/// Otherwise, if \p Inst is not specified, an engaged optional holding nullptr
4137/// is returned to show that the pointers represent strided pointers. If \p Inst
4138/// is specified, the runtime stride is materialized before the given \p Inst.
4139/// \returns std::nullopt if the pointers do not have a runtime stride;
4140/// nullptr or the actual stride value otherwise.
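/// For example (illustrative), pointers to elements {p[0], p[n], p[2*n], p[3*n]}
/// of an i32 array, where n is not a compile-time constant, are recognized as
/// strided with a runtime stride of n elements.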
4141static std::optional<Value *>
4142calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4143 const DataLayout &DL, ScalarEvolution &SE,
4144 SmallVectorImpl<unsigned> &SortedIndices,
4145 Instruction *Inst = nullptr) {
4146 SmallVector<const SCEV *> SCEVs;
4147 const SCEV *PtrSCEVLowest = nullptr;
4148 const SCEV *PtrSCEVHighest = nullptr;
4149 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4150 // addresses).
4151 for (Value *Ptr : PointerOps) {
4152 const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
4153 if (!PtrSCEV)
4154 return std::nullopt;
4155 SCEVs.push_back(Elt: PtrSCEV);
4156 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4157 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4158 continue;
4159 }
4160 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
4161 if (isa<SCEVCouldNotCompute>(Val: Diff))
4162 return std::nullopt;
4163 if (Diff->isNonConstantNegative()) {
4164 PtrSCEVLowest = PtrSCEV;
4165 continue;
4166 }
4167 const SCEV *Diff1 = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEV);
4168 if (isa<SCEVCouldNotCompute>(Val: Diff1))
4169 return std::nullopt;
4170 if (Diff1->isNonConstantNegative()) {
4171 PtrSCEVHighest = PtrSCEV;
4172 continue;
4173 }
4174 }
4175 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4176 const SCEV *Dist = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEVLowest);
4177 if (isa<SCEVCouldNotCompute>(Val: Dist))
4178 return std::nullopt;
4179 int Size = DL.getTypeStoreSize(Ty: ElemTy);
4180 auto TryGetStride = [&](const SCEV *Dist,
4181 const SCEV *Multiplier) -> const SCEV * {
4182 if (const auto *M = dyn_cast<SCEVMulExpr>(Val: Dist)) {
4183 if (M->getOperand(i: 0) == Multiplier)
4184 return M->getOperand(i: 1);
4185 if (M->getOperand(i: 1) == Multiplier)
4186 return M->getOperand(i: 0);
4187 return nullptr;
4188 }
4189 if (Multiplier == Dist)
4190 return SE.getConstant(Ty: Dist->getType(), V: 1);
4191 return SE.getUDivExactExpr(LHS: Dist, RHS: Multiplier);
4192 };
  // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4194 const SCEV *Stride = nullptr;
4195 if (Size != 1 || SCEVs.size() > 2) {
4196 const SCEV *Sz = SE.getConstant(Ty: Dist->getType(), V: Size * (SCEVs.size() - 1));
4197 Stride = TryGetStride(Dist, Sz);
4198 if (!Stride)
4199 return std::nullopt;
4200 }
4201 if (!Stride || isa<SCEVConstant>(Val: Stride))
4202 return std::nullopt;
  // Iterate through all pointers and check if all distances are
  // unique multiples of Stride.
4205 using DistOrdPair = std::pair<int64_t, int>;
4206 auto Compare = llvm::less_first();
4207 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4208 int Cnt = 0;
4209 bool IsConsecutive = true;
4210 for (const SCEV *PtrSCEV : SCEVs) {
4211 unsigned Dist = 0;
4212 if (PtrSCEV != PtrSCEVLowest) {
4213 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
4214 const SCEV *Coeff = TryGetStride(Diff, Stride);
4215 if (!Coeff)
4216 return std::nullopt;
4217 const auto *SC = dyn_cast<SCEVConstant>(Val: Coeff);
4218 if (!SC || isa<SCEVCouldNotCompute>(Val: SC))
4219 return std::nullopt;
4220 if (!SE.getMinusSCEV(LHS: PtrSCEV, RHS: SE.getAddExpr(LHS: PtrSCEVLowest,
4221 RHS: SE.getMulExpr(LHS: Stride, RHS: SC)))
4222 ->isZero())
4223 return std::nullopt;
4224 Dist = SC->getAPInt().getZExtValue();
4225 }
4226 // If the strides are not the same or repeated, we can't vectorize.
4227 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4228 return std::nullopt;
4229 auto Res = Offsets.emplace(args&: Dist, args&: Cnt);
4230 if (!Res.second)
4231 return std::nullopt;
4232 // Consecutive order if the inserted element is the last one.
4233 IsConsecutive = IsConsecutive && std::next(x: Res.first) == Offsets.end();
4234 ++Cnt;
4235 }
4236 if (Offsets.size() != SCEVs.size())
4237 return std::nullopt;
4238 SortedIndices.clear();
4239 if (!IsConsecutive) {
4240 // Fill SortedIndices array only if it is non-consecutive.
4241 SortedIndices.resize(N: PointerOps.size());
4242 Cnt = 0;
4243 for (const std::pair<int64_t, int> &Pair : Offsets) {
4244 SortedIndices[Cnt] = Pair.second;
4245 ++Cnt;
4246 }
4247 }
4248 if (!Inst)
4249 return nullptr;
4250 SCEVExpander Expander(SE, DL, "strided-load-vec");
4251 return Expander.expandCodeFor(SH: Stride, Ty: Stride->getType(), I: Inst);
4252}
4253
4254BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4255 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4256 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
  // Check that a vectorized load would load the same memory as a scalar
  // load. For example, we don't want to vectorize loads that are smaller
  // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>},
  // LLVM treats loading/storing it as an i8 struct. If we vectorize
  // loads/stores from such a struct, we read/write packed bits, disagreeing
  // with the unvectorized version.
4263 Type *ScalarTy = VL0->getType();
4264
4265 if (DL->getTypeSizeInBits(Ty: ScalarTy) != DL->getTypeAllocSizeInBits(Ty: ScalarTy))
4266 return LoadsState::Gather;
4267
4268 // Make sure all loads in the bundle are simple - we can't vectorize
4269 // atomic or volatile loads.
4270 PointerOps.clear();
4271 const unsigned Sz = VL.size();
4272 PointerOps.resize(N: Sz);
4273 auto *POIter = PointerOps.begin();
4274 for (Value *V : VL) {
4275 auto *L = cast<LoadInst>(Val: V);
4276 if (!L->isSimple())
4277 return LoadsState::Gather;
4278 *POIter = L->getPointerOperand();
4279 ++POIter;
4280 }
4281
4282 Order.clear();
4283 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: Sz);
4284 // Check the order of pointer operands or that all pointers are the same.
4285 bool IsSorted = sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order);
4286 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4287 if (!Order.empty() && !isPowerOf2_32(Value: VL.size())) {
4288 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4289 "supported with VectorizeNonPowerOf2");
4290 return LoadsState::Gather;
4291 }
4292
4293 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4294 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(Ty: VecTy) &&
4295 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment) &&
4296 calculateRtStride(PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
4297 return LoadsState::StridedVectorize;
4298 if (IsSorted || all_of(Range&: PointerOps, P: [&](Value *P) {
4299 return arePointersCompatible(Ptr1: P, Ptr2: PointerOps.front(), TLI: *TLI);
4300 })) {
4301 if (IsSorted) {
4302 Value *Ptr0;
4303 Value *PtrN;
4304 if (Order.empty()) {
4305 Ptr0 = PointerOps.front();
4306 PtrN = PointerOps.back();
4307 } else {
4308 Ptr0 = PointerOps[Order.front()];
4309 PtrN = PointerOps[Order.back()];
4310 }
4311 std::optional<int> Diff =
4312 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
4313 // Check that the sorted loads are consecutive.
4314 if (static_cast<unsigned>(*Diff) == Sz - 1)
4315 return LoadsState::Vectorize;
      // Quick check whether this may be a strided access: the total distance
      // must be divisible by the number of gaps (Sz - 1).
4317 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
      // Try to generate a strided load node if:
      // 1. A target with strided load support is detected.
      // 2. The number of loads is greater than MinProfitableStridedLoads,
      // or the potential stride <= MaxProfitableLoadStride and the
      // potential stride is a power-of-2 (to avoid perf regressions for the
      // very small number of loads) and the max distance > the number of
      // loads, or the potential stride is -1.
      // 3. The loads are ordered, or the number of unordered loads <=
      // MaxProfitableUnorderedLoads, or the loads are in reversed order.
      // (This check is to avoid extra costs for very expensive shuffles.)
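      // Illustrative example (assuming the thresholds above are satisfied):
      // 4 x i32 loads at byte offsets {0, 8, 16, 24} give *Diff == 6 elements
      // and Stride == 2, so they may be emitted as a single strided load.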
4328 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4329 (static_cast<unsigned>(std::abs(x: *Diff)) <=
4330 MaxProfitableLoadStride * Sz &&
4331 isPowerOf2_32(Value: std::abs(x: *Diff)))) &&
4332 static_cast<unsigned>(std::abs(x: *Diff)) > Sz) ||
4333 *Diff == -(static_cast<int>(Sz) - 1))) {
4334 int Stride = *Diff / static_cast<int>(Sz - 1);
4335 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4336 Align Alignment =
4337 cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()])
4338 ->getAlign();
4339 if (TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment)) {
            // Iterate through all pointers and check if all distances are
            // unique multiples of Stride.
4342 SmallSet<int, 4> Dists;
4343 for (Value *Ptr : PointerOps) {
4344 int Dist = 0;
4345 if (Ptr == PtrN)
4346 Dist = *Diff;
4347 else if (Ptr != Ptr0)
4348 Dist =
4349 *getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: Ptr, DL: *DL, SE&: *SE);
4350 // If the strides are not the same or repeated, we can't
4351 // vectorize.
4352 if (((Dist / Stride) * Stride) != Dist ||
4353 !Dists.insert(V: Dist).second)
4354 break;
4355 }
4356 if (Dists.size() == Sz)
4357 return LoadsState::StridedVectorize;
4358 }
4359 }
4360 }
4361 }
4362 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4363 unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy);
4364 unsigned MinVF = getMinVF(Sz);
4365 unsigned MaxVF = std::max<unsigned>(a: bit_floor(Value: VL.size() / 2), b: MinVF);
4366 MaxVF = std::min(a: getMaximumVF(ElemWidth: Sz, Opcode: Instruction::Load), b: MaxVF);
4367 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4368 unsigned VectorizedCnt = 0;
4369 SmallVector<LoadsState> States;
4370 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4371 Cnt += VF, ++VectorizedCnt) {
4372 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
4373 SmallVector<unsigned> Order;
4374 SmallVector<Value *> PointerOps;
4375 LoadsState LS =
4376 canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, PointerOps,
4377 /*TryRecursiveCheck=*/false);
4378 // Check that the sorted loads are consecutive.
4379 if (LS == LoadsState::Gather)
4380 break;
        // If reordering is needed, treat it as a high-cost masked gather for
        // now.
4382 if ((LS == LoadsState::Vectorize ||
4383 LS == LoadsState::StridedVectorize) &&
4384 !Order.empty() && !isReverseOrder(Order))
4385 LS = LoadsState::ScatterVectorize;
4386 States.push_back(Elt: LS);
4387 }
      // Can be vectorized later as a series of loads/insertelements.
4389 if (VectorizedCnt == VL.size() / VF) {
        // Compare masked gather cost and loads + insertsubvector costs.
4391 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4392 InstructionCost MaskedGatherCost = TTI.getGatherScatterOpCost(
4393 Opcode: Instruction::Load, DataTy: VecTy,
4394 Ptr: cast<LoadInst>(Val: VL0)->getPointerOperand(),
4395 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
4396 InstructionCost VecLdCost = 0;
4397 auto *SubVecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VF);
4398 for (auto [I, LS] : enumerate(First&: States)) {
4399 auto *LI0 = cast<LoadInst>(Val: VL[I * VF]);
4400 switch (LS) {
4401 case LoadsState::Vectorize:
4402 VecLdCost += TTI.getMemoryOpCost(
4403 Opcode: Instruction::Load, Src: SubVecTy, Alignment: LI0->getAlign(),
4404 AddressSpace: LI0->getPointerAddressSpace(), CostKind,
4405 OpdInfo: TTI::OperandValueInfo());
4406 break;
4407 case LoadsState::StridedVectorize:
4408 VecLdCost += TTI.getStridedMemoryOpCost(
4409 Opcode: Instruction::Load, DataTy: SubVecTy, Ptr: LI0->getPointerOperand(),
4410 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
4411 break;
4412 case LoadsState::ScatterVectorize:
4413 VecLdCost += TTI.getGatherScatterOpCost(
4414 Opcode: Instruction::Load, DataTy: SubVecTy, Ptr: LI0->getPointerOperand(),
4415 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
4416 break;
4417 case LoadsState::Gather:
4418 llvm_unreachable(
4419 "Expected only consecutive, strided or masked gather loads.");
4420 }
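          // E.g., with VL.size() == 8, VF == 4 and I == 1, the mask is
          // <0, 1, 2, 3, 8, 9, 10, 11>, i.e. insert the second sub-vector
          // into the upper half of the result.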
4421 SmallVector<int> ShuffleMask(VL.size());
4422 for (int Idx : seq<int>(Begin: 0, End: VL.size()))
4423 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4424 VecLdCost +=
4425 TTI.getShuffleCost(Kind: TTI ::SK_InsertSubvector, Tp: VecTy,
4426 Mask: ShuffleMask, CostKind, Index: I * VF, SubTp: SubVecTy);
4427 }
        // If the masked gather cost is higher, it is better to vectorize as
        // loads + shuffles, so consider it a gather node. It will be better
        // estimated later.
4431 if (MaskedGatherCost > VecLdCost)
4432 return true;
4433 }
4434 }
4435 return false;
4436 };
    // TODO: need to improve analysis of the pointers; if not all of them are
    // GEPs, or they have more than 2 operands, we end up with a gather node,
    // which just increases the cost.
4440 Loop *L = LI->getLoopFor(BB: cast<LoadInst>(Val: VL0)->getParent());
4441 bool ProfitableGatherPointers =
4442 L && Sz > 2 &&
4443 static_cast<unsigned>(count_if(Range&: PointerOps, P: [L](Value *V) {
4444 return L->isLoopInvariant(V);
4445 })) <= Sz / 2;
4446 if (ProfitableGatherPointers || all_of(Range&: PointerOps, P: [IsSorted](Value *P) {
4447 auto *GEP = dyn_cast<GetElementPtrInst>(Val: P);
4448 return (IsSorted && !GEP && doesNotNeedToBeScheduled(V: P)) ||
4449 (GEP && GEP->getNumOperands() == 2 &&
4450 isa<Constant, Instruction>(Val: GEP->getOperand(i_nocapture: 1)));
4451 })) {
4452 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4453 if (TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) &&
4454 !TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment)) {
4455 // Check if potential masked gather can be represented as series
4456 // of loads + insertsubvectors.
4457 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
          // If the masked gather cost is higher, it is better to vectorize as
          // loads + shuffles, so consider it a gather node. It will be better
          // estimated later.
4461 return LoadsState::Gather;
4462 }
4463 return LoadsState::ScatterVectorize;
4464 }
4465 }
4466 }
4467
4468 return LoadsState::Gather;
4469}
4470
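/// Groups the pointers in \p VL by their underlying base object and sorts each
/// group by distance from that base. On success, \p SortedIndices is filled
/// with a permutation that places pointers with the same base next to each
/// other; returns false if the pointers cannot be usefully clustered or no
/// base ends up with a consecutive run of offsets.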
4471static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4472 const DataLayout &DL, ScalarEvolution &SE,
4473 SmallVectorImpl<unsigned> &SortedIndices) {
4474 assert(llvm::all_of(
4475 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4476 "Expected list of pointer operands.");
  // Map from bases to a vector of (Ptr, Offset, OrigIdx). Each Ptr is inserted
  // into the vector of its base; the vectors are then sorted by offset, and
  // the returned indices keep pointers with the same base next to one another.
4480 MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4481 Bases[VL[0]].push_back(Elt: std::make_tuple(args: VL[0], args: 0U, args: 0U));
4482
4483 unsigned Cnt = 1;
4484 for (Value *Ptr : VL.drop_front()) {
4485 bool Found = any_of(Range&: Bases, P: [&](auto &Base) {
4486 std::optional<int> Diff =
4487 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4488 /*StrictCheck=*/true);
4489 if (!Diff)
4490 return false;
4491
4492 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4493 return true;
4494 });
4495
4496 if (!Found) {
4497 // If we haven't found enough to usefully cluster, return early.
4498 if (Bases.size() > VL.size() / 2 - 1)
4499 return false;
4500
4501 // Not found already - add a new Base
4502 Bases[Ptr].emplace_back(Args&: Ptr, Args: 0, Args: Cnt++);
4503 }
4504 }
4505
  // For each of the bases sort the pointers by Offset and check if the
  // pointers of any base become consecutive.
4508 bool AnyConsecutive = false;
4509 for (auto &Base : Bases) {
4510 auto &Vec = Base.second;
4511 if (Vec.size() > 1) {
4512 llvm::stable_sort(Range&: Vec, C: [](const std::tuple<Value *, int, unsigned> &X,
4513 const std::tuple<Value *, int, unsigned> &Y) {
4514 return std::get<1>(t: X) < std::get<1>(t: Y);
4515 });
4516 int InitialOffset = std::get<1>(t&: Vec[0]);
4517 AnyConsecutive |= all_of(Range: enumerate(First&: Vec), P: [InitialOffset](const auto &P) {
4518 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4519 });
4520 }
4521 }
4522
  // Fill the SortedIndices array only if it looks worthwhile to sort the
  // pointers.
4524 SortedIndices.clear();
4525 if (!AnyConsecutive)
4526 return false;
4527
4528 for (auto &Base : Bases) {
4529 for (auto &T : Base.second)
4530 SortedIndices.push_back(Elt: std::get<2>(t&: T));
4531 }
4532
4533 assert(SortedIndices.size() == VL.size() &&
4534 "Expected SortedIndices to be the size of VL");
4535 return true;
4536}
4537
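// Tries to find a (partial) order for a gather node of simple loads by
// clustering their pointer operands (see clusterSortPtrAccesses above);
// returns std::nullopt if any scalar is not a simple load or no useful order
// is found.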
4538std::optional<BoUpSLP::OrdersType>
4539BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4540 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4541 Type *ScalarTy = TE.Scalars[0]->getType();
4542
4543 SmallVector<Value *> Ptrs;
4544 Ptrs.reserve(N: TE.Scalars.size());
4545 for (Value *V : TE.Scalars) {
4546 auto *L = dyn_cast<LoadInst>(Val: V);
4547 if (!L || !L->isSimple())
4548 return std::nullopt;
4549 Ptrs.push_back(Elt: L->getPointerOperand());
4550 }
4551
4552 BoUpSLP::OrdersType Order;
4553 if (clusterSortPtrAccesses(VL: Ptrs, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
4554 return std::move(Order);
4555 return std::nullopt;
4556}
4557
4558/// Check if two insertelement instructions are from the same buildvector.
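/// E.g., %i0 = insertelement <4 x i32> poison, i32 %a, i32 0 followed by
/// %i1 = insertelement <4 x i32> %i0, i32 %b, i32 1 forms one buildvector
/// sequence, so %i0 and %i1 are treated as coming from the same buildvector
/// (assuming %i0 has no other uses).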
4559static bool areTwoInsertFromSameBuildVector(
4560 InsertElementInst *VU, InsertElementInst *V,
4561 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Instructions must be from the same basic block.
4563 if (VU->getParent() != V->getParent())
4564 return false;
  // The insertelements must produce the same vector type.
4566 if (VU->getType() != V->getType())
4567 return false;
  // Inserts with multiple uses are separate nodes.
4569 if (!VU->hasOneUse() && !V->hasOneUse())
4570 return false;
4571 auto *IE1 = VU;
4572 auto *IE2 = V;
4573 std::optional<unsigned> Idx1 = getInsertIndex(InsertInst: IE1);
4574 std::optional<unsigned> Idx2 = getInsertIndex(InsertInst: IE2);
4575 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4576 return false;
4577 // Go through the vector operand of insertelement instructions trying to find
4578 // either VU as the original vector for IE2 or V as the original vector for
4579 // IE1.
4580 SmallBitVector ReusedIdx(
4581 cast<VectorType>(Val: VU->getType())->getElementCount().getKnownMinValue());
4582 bool IsReusedIdx = false;
4583 do {
4584 if (IE2 == VU && !IE1)
4585 return VU->hasOneUse();
4586 if (IE1 == V && !IE2)
4587 return V->hasOneUse();
4588 if (IE1 && IE1 != V) {
4589 unsigned Idx1 = getInsertIndex(InsertInst: IE1).value_or(u&: *Idx2);
4590 IsReusedIdx |= ReusedIdx.test(Idx: Idx1);
4591 ReusedIdx.set(Idx1);
4592 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4593 IE1 = nullptr;
4594 else
4595 IE1 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE1));
4596 }
4597 if (IE2 && IE2 != VU) {
4598 unsigned Idx2 = getInsertIndex(InsertInst: IE2).value_or(u&: *Idx1);
4599 IsReusedIdx |= ReusedIdx.test(Idx: Idx2);
4600 ReusedIdx.set(Idx2);
4601 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4602 IE2 = nullptr;
4603 else
4604 IE2 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE2));
4605 }
4606 } while (!IsReusedIdx && (IE1 || IE2));
4607 return false;
4608}
4609
4610std::optional<BoUpSLP::OrdersType>
4611BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4612 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4613 if (TE.isNonPowOf2Vec())
4614 return std::nullopt;
4615
  // No need to reorder if we have to shuffle the reuses anyway; the node
  // still needs to be shuffled.
4618 if (!TE.ReuseShuffleIndices.empty()) {
4619 if (isSplat(VL: TE.Scalars))
4620 return std::nullopt;
4621 // Check if reuse shuffle indices can be improved by reordering.
    // For this, check that the reuse mask is "clustered", i.e. each scalar
    // value is used once in each submask of size <number_of_scalars>.
4624 // Example: 4 scalar values.
4625 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4626 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4627 // element 3 is used twice in the second submask.
4628 unsigned Sz = TE.Scalars.size();
4629 if (TE.State == TreeEntry::NeedToGather) {
4630 if (std::optional<OrdersType> CurrentOrder =
4631 findReusedOrderedScalars(TE)) {
4632 SmallVector<int> Mask;
4633 fixupOrderingIndices(Order: *CurrentOrder);
4634 inversePermutation(Indices: *CurrentOrder, Mask);
4635 ::addMask(Mask, SubMask: TE.ReuseShuffleIndices);
4636 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4637 unsigned Sz = TE.Scalars.size();
4638 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4639 for (auto [I, Idx] : enumerate(First: ArrayRef(Mask).slice(N: K * Sz, M: Sz)))
4640 if (Idx != PoisonMaskElem)
4641 Res[Idx + K * Sz] = I + K * Sz;
4642 }
4643 return std::move(Res);
4644 }
4645 }
4646 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4647 TTI->getNumberOfParts(Tp: FixedVectorType::get(
4648 ElementType: TE.Scalars.front()->getType(), NumElts: 2 * TE.getVectorFactor())) == 1)
4649 return std::nullopt;
4650 if (!ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
4651 VF: Sz)) {
4652 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4653 if (TE.ReorderIndices.empty())
4654 std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
4655 else
4656 inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
4657 ::addMask(Mask&: ReorderMask, SubMask: TE.ReuseShuffleIndices);
4658 unsigned VF = ReorderMask.size();
4659 OrdersType ResOrder(VF, VF);
4660 unsigned NumParts = VF / Sz;
4661 SmallBitVector UsedVals(NumParts);
4662 for (unsigned I = 0; I < VF; I += Sz) {
4663 int Val = PoisonMaskElem;
4664 unsigned UndefCnt = 0;
4665 if (any_of(Range: ArrayRef(ReorderMask).slice(N: I, M: Sz),
4666 P: [&](int Idx) {
4667 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4668 Val = Idx;
4669 if (Idx == PoisonMaskElem)
4670 ++UndefCnt;
4671 return Idx != PoisonMaskElem && Idx != Val;
4672 }) ||
4673 Val >= static_cast<int>(NumParts) || UsedVals.test(Idx: Val) ||
4674 UndefCnt > Sz / 2)
4675 return std::nullopt;
4676 UsedVals.set(Val);
4677 for (unsigned K = 0; K < NumParts; ++K)
4678 ResOrder[Val + Sz * K] = I + K;
4679 }
4680 return std::move(ResOrder);
4681 }
4682 unsigned VF = TE.getVectorFactor();
4683 // Try build correct order for extractelement instructions.
4684 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4685 TE.ReuseShuffleIndices.end());
4686 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4687 all_of(Range: TE.Scalars, P: [Sz](Value *V) {
4688 std::optional<unsigned> Idx = getExtractIndex(E: cast<Instruction>(Val: V));
4689 return Idx && *Idx < Sz;
4690 })) {
4691 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4692 if (TE.ReorderIndices.empty())
4693 std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
4694 else
4695 inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
4696 for (unsigned I = 0; I < VF; ++I) {
4697 int &Idx = ReusedMask[I];
4698 if (Idx == PoisonMaskElem)
4699 continue;
4700 Value *V = TE.Scalars[ReorderMask[Idx]];
4701 std::optional<unsigned> EI = getExtractIndex(E: cast<Instruction>(Val: V));
4702 Idx = std::distance(first: ReorderMask.begin(), last: find(Range&: ReorderMask, Val: *EI));
4703 }
4704 }
    // Build the order of VF size; the reuses shuffles need to be reordered,
    // and they are always of VF size.
4707 OrdersType ResOrder(VF);
4708 std::iota(first: ResOrder.begin(), last: ResOrder.end(), value: 0);
4709 auto *It = ResOrder.begin();
4710 for (unsigned K = 0; K < VF; K += Sz) {
4711 OrdersType CurrentOrder(TE.ReorderIndices);
4712 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(N: K, M: Sz)};
4713 if (SubMask.front() == PoisonMaskElem)
4714 std::iota(first: SubMask.begin(), last: SubMask.end(), value: 0);
4715 reorderOrder(Order&: CurrentOrder, Mask: SubMask);
4716 transform(Range&: CurrentOrder, d_first: It, F: [K](unsigned Pos) { return Pos + K; });
4717 std::advance(i&: It, n: Sz);
4718 }
4719 if (TE.State == TreeEntry::NeedToGather &&
4720 all_of(Range: enumerate(First&: ResOrder),
4721 P: [](const auto &Data) { return Data.index() == Data.value(); }))
4722 return std::nullopt; // No need to reorder.
4723 return std::move(ResOrder);
4724 }
4725 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4726 any_of(Range: TE.UserTreeIndices,
4727 P: [](const EdgeInfo &EI) {
4728 return !Instruction::isBinaryOp(Opcode: EI.UserTE->getOpcode());
4729 }) &&
4730 (TE.ReorderIndices.empty() || isReverseOrder(Order: TE.ReorderIndices)))
4731 return std::nullopt;
4732 if ((TE.State == TreeEntry::Vectorize ||
4733 TE.State == TreeEntry::StridedVectorize) &&
4734 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: TE.getMainOp()) ||
4735 (TopToBottom && isa<StoreInst, InsertElementInst>(Val: TE.getMainOp()))) &&
4736 !TE.isAltShuffle())
4737 return TE.ReorderIndices;
4738 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4739 auto PHICompare = [&](unsigned I1, unsigned I2) {
4740 Value *V1 = TE.Scalars[I1];
4741 Value *V2 = TE.Scalars[I2];
4742 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
4743 return false;
4744 if (V1->getNumUses() < V2->getNumUses())
4745 return true;
4746 if (V1->getNumUses() > V2->getNumUses())
4747 return false;
4748 auto *FirstUserOfPhi1 = cast<Instruction>(Val: *V1->user_begin());
4749 auto *FirstUserOfPhi2 = cast<Instruction>(Val: *V2->user_begin());
4750 if (auto *IE1 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi1))
4751 if (auto *IE2 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi2)) {
4752 if (!areTwoInsertFromSameBuildVector(
4753 VU: IE1, V: IE2,
4754 GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); }))
4755 return I1 < I2;
4756 return getInsertIndex(InsertInst: IE1) < getInsertIndex(InsertInst: IE2);
4757 }
4758 if (auto *EE1 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi1))
4759 if (auto *EE2 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi2)) {
4760 if (EE1->getOperand(i_nocapture: 0) != EE2->getOperand(i_nocapture: 0))
4761 return I1 < I2;
4762 return getInsertIndex(InsertInst: EE1) < getInsertIndex(InsertInst: EE2);
4763 }
4764 return I1 < I2;
4765 };
4766 auto IsIdentityOrder = [](const OrdersType &Order) {
4767 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Order.size()))
4768 if (Idx != Order[Idx])
4769 return false;
4770 return true;
4771 };
4772 if (!TE.ReorderIndices.empty())
4773 return TE.ReorderIndices;
4774 DenseMap<unsigned, unsigned> PhiToId;
4775 SmallVector<unsigned> Phis(TE.Scalars.size());
4776 std::iota(first: Phis.begin(), last: Phis.end(), value: 0);
4777 OrdersType ResOrder(TE.Scalars.size());
4778 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4779 PhiToId[Id] = Id;
4780 stable_sort(Range&: Phis, C: PHICompare);
4781 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4782 ResOrder[Id] = PhiToId[Phis[Id]];
4783 if (IsIdentityOrder(ResOrder))
4784 return std::nullopt; // No need to reorder.
4785 return std::move(ResOrder);
4786 }
4787 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4788 allSameType(VL: TE.Scalars)) {
4789 // TODO: add analysis of other gather nodes with extractelement
4790 // instructions and other values/instructions, not only undefs.
4791 if ((TE.getOpcode() == Instruction::ExtractElement ||
4792 (all_of(Range: TE.Scalars, P: IsaPred<UndefValue, ExtractElementInst>) &&
4793 any_of(Range: TE.Scalars, P: IsaPred<ExtractElementInst>))) &&
4794 all_of(Range: TE.Scalars, P: [](Value *V) {
4795 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
4796 return !EE || isa<FixedVectorType>(Val: EE->getVectorOperandType());
4797 })) {
4798 // Check that gather of extractelements can be represented as
4799 // just a shuffle of a single vector.
4800 OrdersType CurrentOrder;
4801 bool Reuse = canReuseExtract(VL: TE.Scalars, OpValue: TE.getMainOp(), CurrentOrder,
4802 /*ResizeAllowed=*/true);
4803 if (Reuse || !CurrentOrder.empty())
4804 return std::move(CurrentOrder);
4805 }
4806 // If the gather node is <undef, v, .., poison> and
4807 // insertelement poison, v, 0 [+ permute]
4808 // is cheaper than
4809 // insertelement poison, v, n - try to reorder.
    // If rotating the whole graph, exclude the permute cost, since the whole
    // graph might be transformed.
4812 int Sz = TE.Scalars.size();
4813 if (isSplat(VL: TE.Scalars) && !allConstant(VL: TE.Scalars) &&
4814 count_if(Range: TE.Scalars, P: IsaPred<UndefValue>) == Sz - 1) {
4815 const auto *It =
4816 find_if(Range: TE.Scalars, P: [](Value *V) { return !isConstant(V); });
4817 if (It == TE.Scalars.begin())
4818 return OrdersType();
4819 auto *Ty = FixedVectorType::get(ElementType: TE.Scalars.front()->getType(), NumElts: Sz);
4820 if (It != TE.Scalars.end()) {
4821 OrdersType Order(Sz, Sz);
4822 unsigned Idx = std::distance(first: TE.Scalars.begin(), last: It);
4823 Order[Idx] = 0;
4824 fixupOrderingIndices(Order);
4825 SmallVector<int> Mask;
4826 inversePermutation(Indices: Order, Mask);
4827 InstructionCost PermuteCost =
4828 TopToBottom
4829 ? 0
4830 : TTI->getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: Ty, Mask);
4831 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4832 Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: 0,
4833 Op0: PoisonValue::get(T: Ty), Op1: *It);
4834 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4835 Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: Idx,
4836 Op0: PoisonValue::get(T: Ty), Op1: *It);
4837 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4838 OrdersType Order(Sz, Sz);
4839 Order[Idx] = 0;
4840 return std::move(Order);
4841 }
4842 }
4843 }
4844 if (isSplat(VL: TE.Scalars))
4845 return std::nullopt;
4846 if (TE.Scalars.size() >= 4)
4847 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4848 return Order;
4849 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4850 return CurrentOrder;
4851 }
4852 return std::nullopt;
4853}
4854
4855/// Checks if the given mask is a "clustered" mask with the same clusters of
4856/// size \p Sz, which are not identity submasks.
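/// E.g., for \p Sz == 4 the mask <1, 0, 3, 2, 1, 0, 3, 2> is a repeated
/// non-identity cluster, while <0, 1, 2, 3, 0, 1, 2, 3> is not, because its
/// cluster is an identity submask.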
4857static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
4858 unsigned Sz) {
4859 ArrayRef<int> FirstCluster = Mask.slice(N: 0, M: Sz);
4860 if (ShuffleVectorInst::isIdentityMask(Mask: FirstCluster, NumSrcElts: Sz))
4861 return false;
4862 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4863 ArrayRef<int> Cluster = Mask.slice(N: I, M: Sz);
4864 if (Cluster != FirstCluster)
4865 return false;
4866 }
4867 return true;
4868}
4869
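// Applies \p Mask to the reuses mask of \p TE and, for gathered nodes whose
// single-source reuse mask is a repeated non-identity cluster, folds the
// reordering into the scalars themselves so that the reuse mask becomes a
// repeated identity submask.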
4870void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4871 // Reorder reuses mask.
4872 reorderReuses(Reuses&: TE.ReuseShuffleIndices, Mask);
4873 const unsigned Sz = TE.Scalars.size();
  // For vectorized nodes and non-clustered reuses, no need to do anything
  // else.
4875 if (TE.State != TreeEntry::NeedToGather ||
4876 !ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
4877 VF: Sz) ||
4878 !isRepeatedNonIdentityClusteredMask(Mask: TE.ReuseShuffleIndices, Sz))
4879 return;
4880 SmallVector<int> NewMask;
4881 inversePermutation(Indices: TE.ReorderIndices, Mask&: NewMask);
4882 addMask(Mask&: NewMask, SubMask: TE.ReuseShuffleIndices);
4883 // Clear reorder since it is going to be applied to the new mask.
4884 TE.ReorderIndices.clear();
4885 // Try to improve gathered nodes with clustered reuses, if possible.
4886 ArrayRef<int> Slice = ArrayRef(NewMask).slice(N: 0, M: Sz);
4887 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4888 inversePermutation(Indices: NewOrder, Mask&: NewMask);
4889 reorderScalars(Scalars&: TE.Scalars, Mask: NewMask);
4890 // Fill the reuses mask with the identity submasks.
4891 for (auto *It = TE.ReuseShuffleIndices.begin(),
4892 *End = TE.ReuseShuffleIndices.end();
4893 It != End; std::advance(i&: It, n: Sz))
4894 std::iota(first: It, last: std::next(x: It, n: Sz), value: 0);
4895}
4896
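/// Fills the unset entries (equal to the order size) of \p Order: from the
/// corresponding entries of \p SecondaryOrder if provided, otherwise with the
/// identity index, skipping any target index that is already used elsewhere
/// in \p Order.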
4897static void combineOrders(MutableArrayRef<unsigned> Order,
4898 ArrayRef<unsigned> SecondaryOrder) {
4899 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
4900 "Expected same size of orders");
4901 unsigned Sz = Order.size();
4902 SmallBitVector UsedIndices(Sz);
4903 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz)) {
4904 if (Order[Idx] != Sz)
4905 UsedIndices.set(Order[Idx]);
4906 }
4907 if (SecondaryOrder.empty()) {
4908 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
4909 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
4910 Order[Idx] = Idx;
4911 } else {
4912 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
4913 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
4914 !UsedIndices.test(Idx: SecondaryOrder[Idx]))
4915 Order[Idx] = SecondaryOrder[Idx];
4916 }
4917}
4918
4919void BoUpSLP::reorderTopToBottom() {
4920 // Maps VF to the graph nodes.
4921 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
4922 // ExtractElement gather nodes which can be vectorized and need to handle
4923 // their ordering.
4924 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
4925
  // Phi nodes can have a preferred ordering based on their result users.
4927 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
4928
4929 // AltShuffles can also have a preferred ordering that leads to fewer
4930 // instructions, e.g., the addsub instruction in x86.
4931 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
4932
4933 // Maps a TreeEntry to the reorder indices of external users.
4934 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
4935 ExternalUserReorderMap;
4936 // Find all reorderable nodes with the given VF.
  // Currently these are vectorized stores, loads, extracts + some gathering
  // of extracts.
4939 for_each(Range&: VectorizableTree, F: [&, &TTIRef = *TTI](
4940 const std::unique_ptr<TreeEntry> &TE) {
4941 // Look for external users that will probably be vectorized.
4942 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
4943 findExternalStoreUsersReorderIndices(TE: TE.get());
4944 if (!ExternalUserReorderIndices.empty()) {
4945 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
4946 ExternalUserReorderMap.try_emplace(Key: TE.get(),
4947 Args: std::move(ExternalUserReorderIndices));
4948 }
4949
4950 // Patterns like [fadd,fsub] can be combined into a single instruction in
4951 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
4952 // to take into account their order when looking for the most used order.
4953 if (TE->isAltShuffle()) {
4954 VectorType *VecTy =
4955 FixedVectorType::get(ElementType: TE->Scalars[0]->getType(), NumElts: TE->Scalars.size());
4956 unsigned Opcode0 = TE->getOpcode();
4957 unsigned Opcode1 = TE->getAltOpcode();
4958 // The opcode mask selects between the two opcodes.
4959 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
4960 for (unsigned Lane : seq<unsigned>(Begin: 0, End: TE->Scalars.size()))
4961 if (cast<Instruction>(Val: TE->Scalars[Lane])->getOpcode() == Opcode1)
4962 OpcodeMask.set(Lane);
4963 // If this pattern is supported by the target then we consider the order.
4964 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
4965 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
4966 AltShufflesToOrders.try_emplace(Key: TE.get(), Args: OrdersType());
4967 }
4968 // TODO: Check the reverse order too.
4969 }
4970
4971 if (std::optional<OrdersType> CurrentOrder =
4972 getReorderingData(TE: *TE, /*TopToBottom=*/true)) {
      // Do not include ordering for nodes used in the alt opcode
      // vectorization; it is better to reorder them during the bottom-to-top
      // stage. If we follow the order here, it causes reordering of the whole
      // graph, though actually it is profitable just to reorder the subgraph
      // that starts from the alternate opcode vectorization node. Such nodes
      // already end up with a shuffle instruction and it is enough to change
      // this shuffle rather than rotate the scalars for the whole graph.
4980 unsigned Cnt = 0;
4981 const TreeEntry *UserTE = TE.get();
4982 while (UserTE && Cnt < RecursionMaxDepth) {
4983 if (UserTE->UserTreeIndices.size() != 1)
4984 break;
4985 if (all_of(Range: UserTE->UserTreeIndices, P: [](const EdgeInfo &EI) {
4986 return EI.UserTE->State == TreeEntry::Vectorize &&
4987 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
4988 }))
4989 return;
4990 UserTE = UserTE->UserTreeIndices.back().UserTE;
4991 ++Cnt;
4992 }
4993 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
4994 if (!(TE->State == TreeEntry::Vectorize ||
4995 TE->State == TreeEntry::StridedVectorize) ||
4996 !TE->ReuseShuffleIndices.empty())
4997 GathersToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
4998 if (TE->State == TreeEntry::Vectorize &&
4999 TE->getOpcode() == Instruction::PHI)
5000 PhisToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
5001 }
5002 });
5003
5004 // Reorder the graph nodes according to their vectorization factor.
5005 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5006 VF /= 2) {
5007 auto It = VFToOrderedEntries.find(Val: VF);
5008 if (It == VFToOrderedEntries.end())
5009 continue;
    // Try to find the most profitable order. We are just looking for the most
    // used order and reorder the scalar elements in the nodes according to
    // this most used order.
5013 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5014 // All operands are reordered and used only in this node - propagate the
5015 // most used order to the user node.
5016 MapVector<OrdersType, unsigned,
5017 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5018 OrdersUses;
5019 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5020 for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes; they still need to be extended and
      // shuffled, just merge the reordering shuffle and the reuse shuffle.
5023 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(Val: OpTE))
5024 continue;
5025 // Count number of orders uses.
5026 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5027 &PhisToOrders]() -> const OrdersType & {
5028 if (OpTE->State == TreeEntry::NeedToGather ||
5029 !OpTE->ReuseShuffleIndices.empty()) {
5030 auto It = GathersToOrders.find(Val: OpTE);
5031 if (It != GathersToOrders.end())
5032 return It->second;
5033 }
5034 if (OpTE->isAltShuffle()) {
5035 auto It = AltShufflesToOrders.find(Val: OpTE);
5036 if (It != AltShufflesToOrders.end())
5037 return It->second;
5038 }
5039 if (OpTE->State == TreeEntry::Vectorize &&
5040 OpTE->getOpcode() == Instruction::PHI) {
5041 auto It = PhisToOrders.find(Val: OpTE);
5042 if (It != PhisToOrders.end())
5043 return It->second;
5044 }
5045 return OpTE->ReorderIndices;
5046 }();
5047 // First consider the order of the external scalar users.
5048 auto It = ExternalUserReorderMap.find(Val: OpTE);
5049 if (It != ExternalUserReorderMap.end()) {
5050 const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars, use the natural
        // order; it is an attempt to reorder a node with reused scalars but
        // with external uses.
5054 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5055 OrdersUses.insert(KV: std::make_pair(x: OrdersType(), y: 0)).first->second +=
5056 ExternalUserReorderIndices.size();
5057 } else {
5058 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5059 ++OrdersUses.insert(KV: std::make_pair(x: ExtOrder, y: 0)).first->second;
5060 }
5061 // No other useful reorder data in this entry.
5062 if (Order.empty())
5063 continue;
5064 }
5065 // Stores actually store the mask, not the order, need to invert.
5066 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5067 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5068 SmallVector<int> Mask;
5069 inversePermutation(Indices: Order, Mask);
5070 unsigned E = Order.size();
5071 OrdersType CurrentOrder(E, E);
5072 transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
5073 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5074 });
5075 fixupOrderingIndices(Order: CurrentOrder);
5076 ++OrdersUses.insert(KV: std::make_pair(x&: CurrentOrder, y: 0)).first->second;
5077 } else {
5078 ++OrdersUses.insert(KV: std::make_pair(x: Order, y: 0)).first->second;
5079 }
5080 }
5081 if (OrdersUses.empty())
5082 continue;
5083 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5084 const unsigned Sz = Order.size();
5085 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
5086 if (Idx != Order[Idx] && Order[Idx] != Sz)
5087 return false;
5088 return true;
5089 };
5090 // Choose the most used order.
5091 unsigned IdentityCnt = 0;
5092 unsigned FilledIdentityCnt = 0;
5093 OrdersType IdentityOrder(VF, VF);
5094 for (auto &Pair : OrdersUses) {
5095 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5096 if (!Pair.first.empty())
5097 FilledIdentityCnt += Pair.second;
5098 IdentityCnt += Pair.second;
5099 combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
5100 }
5101 }
5102 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5103 unsigned Cnt = IdentityCnt;
5104 for (auto &Pair : OrdersUses) {
      // Prefer the identity order. But if a filled identity (non-empty order)
      // was found with the same number of uses as the new candidate order,
      // we can choose this candidate order.
5108 if (Cnt < Pair.second ||
5109 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5110 Cnt == Pair.second && !BestOrder.empty() &&
5111 IsIdentityOrder(BestOrder))) {
5112 combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
5113 BestOrder = Pair.first;
5114 Cnt = Pair.second;
5115 } else {
5116 combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
5117 }
5118 }
5119 // Set order of the user node.
5120 if (IsIdentityOrder(BestOrder))
5121 continue;
5122 fixupOrderingIndices(Order: BestOrder);
5123 SmallVector<int> Mask;
5124 inversePermutation(Indices: BestOrder, Mask);
5125 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5126 unsigned E = BestOrder.size();
5127 transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
5128 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5129 });
5130 // Do an actual reordering, if profitable.
5131 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5132 // Just do the reordering for the nodes with the given VF.
5133 if (TE->Scalars.size() != VF) {
5134 if (TE->ReuseShuffleIndices.size() == VF) {
5135 // Need to reorder the reuses masks of the operands with smaller VF to
5136 // be able to find the match between the graph nodes and scalar
5137 // operands of the given node during vectorization/cost estimation.
5138 assert(all_of(TE->UserTreeIndices,
5139 [VF, &TE](const EdgeInfo &EI) {
5140 return EI.UserTE->Scalars.size() == VF ||
5141 EI.UserTE->Scalars.size() ==
5142 TE->Scalars.size();
5143 }) &&
5144 "All users must be of VF size.");
5145 // Update ordering of the operands with the smaller VF than the given
5146 // one.
5147 reorderNodeWithReuses(TE&: *TE, Mask);
5148 }
5149 continue;
5150 }
5151 if ((TE->State == TreeEntry::Vectorize ||
5152 TE->State == TreeEntry::StridedVectorize) &&
5153 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
5154 InsertElementInst>(Val: TE->getMainOp()) &&
5155 !TE->isAltShuffle()) {
5156 // Build correct orders for extract{element,value}, loads and
5157 // stores.
5158 reorderOrder(Order&: TE->ReorderIndices, Mask);
5159 if (isa<InsertElementInst, StoreInst>(Val: TE->getMainOp()))
5160 TE->reorderOperands(Mask);
5161 } else {
5162 // Reorder the node and its operands.
5163 TE->reorderOperands(Mask);
5164 assert(TE->ReorderIndices.empty() &&
5165 "Expected empty reorder sequence.");
5166 reorderScalars(Scalars&: TE->Scalars, Mask);
5167 }
5168 if (!TE->ReuseShuffleIndices.empty()) {
5169 // Apply reversed order to keep the original ordering of the reused
5170 // elements to avoid extra reorder indices shuffling.
5171 OrdersType CurrentOrder;
5172 reorderOrder(Order&: CurrentOrder, Mask: MaskOrder);
5173 SmallVector<int> NewReuses;
5174 inversePermutation(Indices: CurrentOrder, Mask&: NewReuses);
5175 addMask(Mask&: NewReuses, SubMask: TE->ReuseShuffleIndices);
5176 TE->ReuseShuffleIndices.swap(RHS&: NewReuses);
5177 }
5178 }
5179 }
5180}
5181
5182bool BoUpSLP::canReorderOperands(
5183 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5184 ArrayRef<TreeEntry *> ReorderableGathers,
5185 SmallVectorImpl<TreeEntry *> &GatherOps) {
5186 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5187 if (UserTE->isNonPowOf2Vec())
5188 return false;
5189
5190 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5191 if (any_of(Range&: Edges, P: [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5192 return OpData.first == I &&
5193 (OpData.second->State == TreeEntry::Vectorize ||
5194 OpData.second->State == TreeEntry::StridedVectorize);
5195 }))
5196 continue;
5197 if (TreeEntry *TE = getVectorizedOperand(UserTE, OpIdx: I)) {
5198 // Do not reorder if operand node is used by many user nodes.
5199 if (any_of(Range&: TE->UserTreeIndices,
5200 P: [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5201 return false;
5202 // Add the node to the list of the ordered nodes with the identity
5203 // order.
5204 Edges.emplace_back(Args&: I, Args&: TE);
5205 // Add ScatterVectorize nodes to the list of operands, where just
5206 // reordering of the scalars is required. Similar to the gathers, so
5207 // simply add to the list of gathered ops.
5208 // If there are reused scalars, process this node as a regular vectorize
5209 // node, just reorder reuses mask.
5210 if (TE->State != TreeEntry::Vectorize &&
5211 TE->State != TreeEntry::StridedVectorize &&
5212 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5213 GatherOps.push_back(Elt: TE);
5214 continue;
5215 }
5216 TreeEntry *Gather = nullptr;
5217 if (count_if(Range&: ReorderableGathers,
5218 P: [&Gather, UserTE, I](TreeEntry *TE) {
5219 assert(TE->State != TreeEntry::Vectorize &&
5220 TE->State != TreeEntry::StridedVectorize &&
5221 "Only non-vectorized nodes are expected.");
5222 if (any_of(Range&: TE->UserTreeIndices,
5223 P: [UserTE, I](const EdgeInfo &EI) {
5224 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5225 })) {
5226 assert(TE->isSame(UserTE->getOperand(I)) &&
5227 "Operand entry does not match operands.");
5228 Gather = TE;
5229 return true;
5230 }
5231 return false;
5232 }) > 1 &&
5233 !allConstant(VL: UserTE->getOperand(OpIdx: I)))
5234 return false;
5235 if (Gather)
5236 GatherOps.push_back(Elt: Gather);
5237 }
5238 return true;
5239}
5240
5241void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5242 SetVector<TreeEntry *> OrderedEntries;
5243 DenseSet<const TreeEntry *> GathersToOrders;
5244 // Find all reorderable leaf nodes with the given VF.
  // Currently these are vectorized loads, extracts without alternate operands
  // + some gathering of extracts.
5247 SmallVector<TreeEntry *> NonVectorized;
5248 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5249 if (TE->State != TreeEntry::Vectorize &&
5250 TE->State != TreeEntry::StridedVectorize)
5251 NonVectorized.push_back(Elt: TE.get());
5252 if (std::optional<OrdersType> CurrentOrder =
5253 getReorderingData(TE: *TE, /*TopToBottom=*/false)) {
5254 OrderedEntries.insert(X: TE.get());
5255 if (!(TE->State == TreeEntry::Vectorize ||
5256 TE->State == TreeEntry::StridedVectorize) ||
5257 !TE->ReuseShuffleIndices.empty())
5258 GathersToOrders.insert(V: TE.get());
5259 }
5260 }
5261
  // 1. Propagate the order to the graph nodes which use only reordered nodes.
  // I.e., if a node has operands that are reordered, try to keep at least one
  // operand in the natural order and reorder the others + reorder the user
  // node itself.
5266 SmallPtrSet<const TreeEntry *, 4> Visited;
5267 while (!OrderedEntries.empty()) {
5268 // 1. Filter out only reordered nodes.
5269 // 2. If the entry has multiple uses - skip it and jump to the next node.
5270 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
5271 SmallVector<TreeEntry *> Filtered;
5272 for (TreeEntry *TE : OrderedEntries) {
5273 if (!(TE->State == TreeEntry::Vectorize ||
5274 TE->State == TreeEntry::StridedVectorize ||
5275 (TE->State == TreeEntry::NeedToGather &&
5276 GathersToOrders.contains(V: TE))) ||
5277 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5278 !all_of(Range: drop_begin(RangeOrContainer&: TE->UserTreeIndices),
5279 P: [TE](const EdgeInfo &EI) {
5280 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5281 }) ||
5282 !Visited.insert(Ptr: TE).second) {
5283 Filtered.push_back(Elt: TE);
5284 continue;
5285 }
5286 // Build a map between user nodes and their operands order to speedup
5287 // search. The graph currently does not provide this dependency directly.
5288 for (EdgeInfo &EI : TE->UserTreeIndices) {
5289 TreeEntry *UserTE = EI.UserTE;
5290 auto It = Users.find(Val: UserTE);
5291 if (It == Users.end())
5292 It = Users.insert(KV: {UserTE, {}}).first;
5293 It->second.emplace_back(Args&: EI.EdgeIdx, Args&: TE);
5294 }
5295 }
5296 // Erase filtered entries.
5297 for (TreeEntry *TE : Filtered)
5298 OrderedEntries.remove(X: TE);
5299 SmallVector<
5300 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5301 UsersVec(Users.begin(), Users.end());
5302 sort(C&: UsersVec, Comp: [](const auto &Data1, const auto &Data2) {
5303 return Data1.first->Idx > Data2.first->Idx;
5304 });
5305 for (auto &Data : UsersVec) {
5306 // Check that operands are used only in the User node.
5307 SmallVector<TreeEntry *> GatherOps;
5308 if (!canReorderOperands(UserTE: Data.first, Edges&: Data.second, ReorderableGathers: NonVectorized,
5309 GatherOps)) {
5310 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5311 OrderedEntries.remove(X: Op.second);
5312 continue;
5313 }
5314 // All operands are reordered and used only in this node - propagate the
5315 // most used order to the user node.
5316 MapVector<OrdersType, unsigned,
5317 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5318 OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, even though it might
      // not be profitable.
5322 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5323 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
5324 for (const auto &Op : Data.second) {
5325 TreeEntry *OpTE = Op.second;
5326 if (!VisitedOps.insert(Ptr: OpTE).second)
5327 continue;
5328 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(V: OpTE))
5329 continue;
5330 const auto Order = [&]() -> const OrdersType {
5331 if (OpTE->State == TreeEntry::NeedToGather ||
5332 !OpTE->ReuseShuffleIndices.empty())
5333 return getReorderingData(TE: *OpTE, /*TopToBottom=*/false)
5334 .value_or(u: OrdersType(1));
5335 return OpTE->ReorderIndices;
5336 }();
        // The order is only a size-1 placeholder (no useful reordering data);
        // skip it in favor of fully non-ordered orders.
5339 if (Order.size() == 1)
5340 continue;
5341 unsigned NumOps = count_if(
5342 Range&: Data.second, P: [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5343 return P.second == OpTE;
5344 });
5345 // Stores actually store the mask, not the order, need to invert.
5346 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5347 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5348 SmallVector<int> Mask;
5349 inversePermutation(Indices: Order, Mask);
5350 unsigned E = Order.size();
5351 OrdersType CurrentOrder(E, E);
5352 transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
5353 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5354 });
5355 fixupOrderingIndices(Order: CurrentOrder);
5356 OrdersUses.insert(KV: std::make_pair(x&: CurrentOrder, y: 0)).first->second +=
5357 NumOps;
5358 } else {
5359 OrdersUses.insert(KV: std::make_pair(x: Order, y: 0)).first->second += NumOps;
5360 }
5361 auto Res = OrdersUses.insert(KV: std::make_pair(x: OrdersType(), y: 0));
5362 const auto AllowsReordering = [&](const TreeEntry *TE) {
5363 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5364 if (TE->isNonPowOf2Vec())
5365 return false;
5366 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5367 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5368 (IgnoreReorder && TE->Idx == 0))
5369 return true;
5370 if (TE->State == TreeEntry::NeedToGather) {
5371 if (GathersToOrders.contains(V: TE))
5372 return !getReorderingData(TE: *TE, /*TopToBottom=*/false)
5373 .value_or(u: OrdersType(1))
5374 .empty();
5375 return true;
5376 }
5377 return false;
5378 };
5379 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5380 TreeEntry *UserTE = EI.UserTE;
5381 if (!VisitedUsers.insert(Ptr: UserTE).second)
5382 continue;
5383 // May reorder user node if it requires reordering, has reused
5384 // scalars, is an alternate op vectorize node or its op nodes require
5385 // reordering.
5386 if (AllowsReordering(UserTE))
5387 continue;
5388 // Check if users allow reordering.
5389 // Currently look up just 1 level of operands to avoid increase of
5390 // the compile time.
5391 // Profitable to reorder if definitely more operands allow
5392 // reordering rather than those with natural order.
5393 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
5394 if (static_cast<unsigned>(count_if(
5395 Range&: Ops, P: [UserTE, &AllowsReordering](
5396 const std::pair<unsigned, TreeEntry *> &Op) {
5397 return AllowsReordering(Op.second) &&
5398 all_of(Range&: Op.second->UserTreeIndices,
5399 P: [UserTE](const EdgeInfo &EI) {
5400 return EI.UserTE == UserTE;
5401 });
5402 })) <= Ops.size() / 2)
5403 ++Res.first->second;
5404 }
5405 }
5406 if (OrdersUses.empty()) {
5407 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5408 OrderedEntries.remove(X: Op.second);
5409 continue;
5410 }
5411 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5412 const unsigned Sz = Order.size();
5413 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
5414 if (Idx != Order[Idx] && Order[Idx] != Sz)
5415 return false;
5416 return true;
5417 };
5418 // Choose the most used order.
5419 unsigned IdentityCnt = 0;
5420 unsigned VF = Data.second.front().second->getVectorFactor();
5421 OrdersType IdentityOrder(VF, VF);
5422 for (auto &Pair : OrdersUses) {
5423 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5424 IdentityCnt += Pair.second;
5425 combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
5426 }
5427 }
5428 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5429 unsigned Cnt = IdentityCnt;
5430 for (auto &Pair : OrdersUses) {
        // Prefer the identity order. But if a filled identity (non-empty
        // order) was found with the same number of uses as the new candidate
        // order, we can choose this candidate order.
5434 if (Cnt < Pair.second) {
5435 combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
5436 BestOrder = Pair.first;
5437 Cnt = Pair.second;
5438 } else {
5439 combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
5440 }
5441 }
5442 // Set order of the user node.
5443 if (IsIdentityOrder(BestOrder)) {
5444 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5445 OrderedEntries.remove(X: Op.second);
5446 continue;
5447 }
5448 fixupOrderingIndices(Order: BestOrder);
5449 // Erase operands from OrderedEntries list and adjust their orders.
5450 VisitedOps.clear();
5451 SmallVector<int> Mask;
5452 inversePermutation(Indices: BestOrder, Mask);
5453 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5454 unsigned E = BestOrder.size();
5455 transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
5456 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5457 });
5458 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5459 TreeEntry *TE = Op.second;
5460 OrderedEntries.remove(X: TE);
5461 if (!VisitedOps.insert(Ptr: TE).second)
5462 continue;
5463 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5464 reorderNodeWithReuses(TE&: *TE, Mask);
5465 continue;
5466 }
5467 // Gathers are processed separately.
5468 if (TE->State != TreeEntry::Vectorize &&
5469 TE->State != TreeEntry::StridedVectorize &&
5470 (TE->State != TreeEntry::ScatterVectorize ||
5471 TE->ReorderIndices.empty()))
5472 continue;
5473 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5474 TE->ReorderIndices.empty()) &&
5475 "Non-matching sizes of user/operand entries.");
5476 reorderOrder(Order&: TE->ReorderIndices, Mask);
5477 if (IgnoreReorder && TE == VectorizableTree.front().get())
5478 IgnoreReorder = false;
5479 }
      // For gathers, just reorder their scalars.
5481 for (TreeEntry *Gather : GatherOps) {
5482 assert(Gather->ReorderIndices.empty() &&
5483 "Unexpected reordering of gathers.");
5484 if (!Gather->ReuseShuffleIndices.empty()) {
5485 // Just reorder reuses indices.
5486 reorderReuses(Reuses&: Gather->ReuseShuffleIndices, Mask);
5487 continue;
5488 }
5489 reorderScalars(Scalars&: Gather->Scalars, Mask);
5490 OrderedEntries.remove(X: Gather);
5491 }
5492 // Reorder operands of the user node and set the ordering for the user
5493 // node itself.
5494 if (Data.first->State != TreeEntry::Vectorize ||
5495 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5496 Val: Data.first->getMainOp()) ||
5497 Data.first->isAltShuffle())
5498 Data.first->reorderOperands(Mask);
5499 if (!isa<InsertElementInst, StoreInst>(Val: Data.first->getMainOp()) ||
5500 Data.first->isAltShuffle() ||
5501 Data.first->State == TreeEntry::StridedVectorize) {
5502 reorderScalars(Scalars&: Data.first->Scalars, Mask);
5503 reorderOrder(Order&: Data.first->ReorderIndices, Mask: MaskOrder,
5504 /*BottomOrder=*/true);
5505 if (Data.first->ReuseShuffleIndices.empty() &&
5506 !Data.first->ReorderIndices.empty() &&
5507 !Data.first->isAltShuffle()) {
5508 // Insert user node to the list to try to sink reordering deeper in
5509 // the graph.
5510 OrderedEntries.insert(X: Data.first);
5511 }
5512 } else {
5513 reorderOrder(Order&: Data.first->ReorderIndices, Mask);
5514 }
5515 }
5516 }
5517 // If the reordering is unnecessary, just remove the reorder.
5518 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5519 VectorizableTree.front()->ReuseShuffleIndices.empty())
5520 VectorizableTree.front()->ReorderIndices.clear();
5521}
5522
5523void BoUpSLP::buildExternalUses(
5524 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5525 DenseMap<Value *, unsigned> ScalarToExtUses;
5526 // Collect the values that we need to extract from the tree.
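// For example (illustration only): if a vectorized add also feeds a call that
// is not part of the tree, an entry (Scalar, User = the call, FoundLane) is
// recorded in ExternalUses so that an extractelement for that lane can be
// emitted later; extra args coming from ExternallyUsedValues are recorded
// with a null user.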
5527 for (auto &TEPtr : VectorizableTree) {
5528 TreeEntry *Entry = TEPtr.get();
5529
5530 // No need to handle users of gathered values.
5531 if (Entry->State == TreeEntry::NeedToGather)
5532 continue;
5533
5534 // For each lane:
5535 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5536 Value *Scalar = Entry->Scalars[Lane];
5537 if (!isa<Instruction>(Val: Scalar))
5538 continue;
5539 // If all uses were already marked for replacement, no need to do it again.
5540 auto It = ScalarToExtUses.find(Val: Scalar);
5541 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5542 continue;
5543
5544 // Check if the scalar is externally used as an extra arg.
5545 const auto *ExtI = ExternallyUsedValues.find(Key: Scalar);
5546 if (ExtI != ExternallyUsedValues.end()) {
5547 int FoundLane = Entry->findLaneForValue(V: Scalar);
5548 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5549 << FoundLane << " from " << *Scalar << ".\n");
5550 ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size());
5551 ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: FoundLane);
5552 continue;
5553 }
5554 for (User *U : Scalar->users()) {
5555 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5556
5557 Instruction *UserInst = dyn_cast<Instruction>(Val: U);
5558 if (!UserInst || isDeleted(I: UserInst))
5559 continue;
5560
5561 // Ignore users in the user ignore list.
5562 if (UserIgnoreList && UserIgnoreList->contains(V: UserInst))
5563 continue;
5564
5565 // Skip in-tree scalars that become vectors
5566 if (TreeEntry *UseEntry = getTreeEntry(V: U)) {
5567 // Some in-tree scalars will remain as scalar in vectorized
5568 // instructions. If that is the case, the one in FoundLane will
5569 // be used.
5570 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5571 !doesInTreeUserNeedToExtract(
5572 Scalar, UserInst: cast<Instruction>(Val: UseEntry->Scalars.front()), TLI)) {
5573 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5574 << ".\n");
5575 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
5576 continue;
5577 }
5578 U = nullptr;
5579 if (It != ScalarToExtUses.end()) {
5580 ExternalUses[It->second].User = nullptr;
5581 break;
5582 }
5583 }
5584
5585 int FoundLane = Entry->findLaneForValue(V: Scalar);
5586 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5587 << " from lane " << FoundLane << " from " << *Scalar
5588 << ".\n");
5589 It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first;
5590 ExternalUses.emplace_back(Args&: Scalar, Args&: U, Args&: FoundLane);
5591 if (!U)
5592 break;
5593 }
5594 }
5595 }
5596}
5597
5598DenseMap<Value *, SmallVector<StoreInst *>>
5599BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5600 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
5601 for (unsigned Lane : seq<unsigned>(Begin: 0, End: TE->Scalars.size())) {
5602 Value *V = TE->Scalars[Lane];
5603 // To save compilation time we don't visit if we have too many users.
5604 if (V->hasNUsesOrMore(N: UsesLimit))
5605 break;
5606
5607 // Collect stores per pointer object.
5608 for (User *U : V->users()) {
5609 auto *SI = dyn_cast<StoreInst>(Val: U);
5610 if (SI == nullptr || !SI->isSimple() ||
5611 !isValidElementType(Ty: SI->getValueOperand()->getType()))
5612 continue;
5613 // Skip stores that are already vectorized (already have a tree entry).
5614 if (getTreeEntry(V: U))
5615 continue;
5616
5617 Value *Ptr = getUnderlyingObject(V: SI->getPointerOperand());
5618 auto &StoresVec = PtrToStoresMap[Ptr];
5619 // For now just keep one store per pointer object per lane.
5620 // TODO: Extend this to support multiple stores per pointer per lane
5621 if (StoresVec.size() > Lane)
5622 continue;
5623 // Skip if in different BBs.
5624 if (!StoresVec.empty() &&
5625 SI->getParent() != StoresVec.back()->getParent())
5626 continue;
5627 // Make sure that the stores are of the same type.
5628 if (!StoresVec.empty() &&
5629 SI->getValueOperand()->getType() !=
5630 StoresVec.back()->getValueOperand()->getType())
5631 continue;
5632 StoresVec.push_back(Elt: SI);
5633 }
5634 }
5635 return PtrToStoresMap;
5636}
5637
5638bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5639 OrdersType &ReorderIndices) const {
5640 // We check whether the stores in StoresVec can form a vector by sorting them
5641 // and checking whether they are consecutive.
5642
5643 // To avoid calling getPointersDiff() while sorting we create a vector of
5644 // pairs {store, offset from first} and sort this instead.
5645 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5646 StoreInst *S0 = StoresVec[0];
5647 StoreOffsetVec[0] = {S0, 0};
5648 Type *S0Ty = S0->getValueOperand()->getType();
5649 Value *S0Ptr = S0->getPointerOperand();
5650 for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoresVec.size())) {
5651 StoreInst *SI = StoresVec[Idx];
5652 std::optional<int> Diff =
5653 getPointersDiff(ElemTyA: S0Ty, PtrA: S0Ptr, ElemTyB: SI->getValueOperand()->getType(),
5654 PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE,
5655 /*StrictCheck=*/true);
5656 // We failed to compare the pointers so just abandon this StoresVec.
5657 if (!Diff)
5658 return false;
5659 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5660 }
5661
5662 // Sort the vector based on the pointers. We create a copy because we may
5663 // need the original later for calculating the reorder (shuffle) indices.
5664 stable_sort(Range&: StoreOffsetVec, C: [](const std::pair<StoreInst *, int> &Pair1,
5665 const std::pair<StoreInst *, int> &Pair2) {
5666 int Offset1 = Pair1.second;
5667 int Offset2 = Pair2.second;
5668 return Offset1 < Offset2;
5669 });
5670
5671 // Check if the stores are consecutive by checking if their difference is 1.
5672 for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoreOffsetVec.size()))
5673 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5674 return false;
5675
5676 // Calculate the shuffle indices: for each store, record its position in the
5677 // sorted StoreOffsetVec.
5678 ReorderIndices.reserve(N: StoresVec.size());
5679 for (StoreInst *SI : StoresVec) {
5680 unsigned Idx = find_if(Range&: StoreOffsetVec,
5681 P: [SI](const std::pair<StoreInst *, int> &Pair) {
5682 return Pair.first == SI;
5683 }) -
5684 StoreOffsetVec.begin();
5685 ReorderIndices.push_back(Elt: Idx);
5686 }
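// Worked example (illustrative): for StoresVec = {S0, S1, S2} with offsets
// {0, +2, +1} relative to S0, the sorted StoreOffsetVec is
// {(S0,0), (S2,1), (S1,2)}; the offsets are consecutive, and the resulting
// ReorderIndices are {0, 2, 1}, the position of each original store in the
// sorted vector.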
5687 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
5688 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
5689 // same convention here.
5690 auto IsIdentityOrder = [](const OrdersType &Order) {
5691 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Order.size()))
5692 if (Idx != Order[Idx])
5693 return false;
5694 return true;
5695 };
5696 if (IsIdentityOrder(ReorderIndices))
5697 ReorderIndices.clear();
5698
5699 return true;
5700}
5701
5702#ifndef NDEBUG
5703LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
5704 for (unsigned Idx : Order)
5705 dbgs() << Idx << ", ";
5706 dbgs() << "\n";
5707}
5708#endif
5709
5710SmallVector<BoUpSLP::OrdersType, 1>
5711BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
5712 unsigned NumLanes = TE->Scalars.size();
5713
5714 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
5715 collectUserStores(TE);
5716
5717 // Holds the reorder indices for each candidate store vector that is a user of
5718 // the current TreeEntry.
5719 SmallVector<OrdersType, 1> ExternalReorderIndices;
5720
5721 // Now inspect the stores collected per pointer and look for vectorization
5722 // candidates. For each candidate calculate the reorder index vector and push
5723 // it into `ExternalReorderIndices`
5724 for (const auto &Pair : PtrToStoresMap) {
5725 auto &StoresVec = Pair.second;
5726 // If we have fewer than NumLanes stores, then we can't form a vector.
5727 if (StoresVec.size() != NumLanes)
5728 continue;
5729
5730 // If the stores are not consecutive then abandon this StoresVec.
5731 OrdersType ReorderIndices;
5732 if (!canFormVector(StoresVec, ReorderIndices))
5733 continue;
5734
5735 // We now know that the scalars in StoresVec can form a vector instruction,
5736 // so set the reorder indices.
5737 ExternalReorderIndices.push_back(Elt: ReorderIndices);
5738 }
5739 return ExternalReorderIndices;
5740}
5741
5742void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
5743 const SmallDenseSet<Value *> &UserIgnoreLst) {
5744 deleteTree();
5745 UserIgnoreList = &UserIgnoreLst;
5746 if (!allSameType(VL: Roots))
5747 return;
5748 buildTree_rec(Roots, Depth: 0, EI: EdgeInfo());
5749}
5750
5751void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
5752 deleteTree();
5753 if (!allSameType(VL: Roots))
5754 return;
5755 buildTree_rec(Roots, Depth: 0, EI: EdgeInfo());
5756}
5757
5758/// \return true if the specified list of values has only one instruction that
5759/// requires scheduling, false otherwise.
5760#ifndef NDEBUG
5761static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
5762 Value *NeedsScheduling = nullptr;
5763 for (Value *V : VL) {
5764 if (doesNotNeedToBeScheduled(V))
5765 continue;
5766 if (!NeedsScheduling) {
5767 NeedsScheduling = V;
5768 continue;
5769 }
5770 return false;
5771 }
5772 return NeedsScheduling;
5773}
5774#endif
5775
5776 /// Generates a key/subkey pair for the given value to provide effective
5777 /// sorting of the values and better detection of vectorizable value
5778 /// sequences. The keys are used for sorting the values themselves, and the
5779 /// subkeys for sorting within the resulting value subgroups.
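/// For example (illustrative): two simple loads of the same type hash to the
/// same key (derived from the Load opcode and the type), and their subkeys
/// come from the LoadsSubkeyGenerator callback (typically based on the
/// distance between the pointers), so loads that can form one vector tend to
/// share a subkey. A volatile or atomic load instead gets a unique key/subkey
/// based on the instruction itself.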
5780static std::pair<size_t, size_t> generateKeySubkey(
5781 Value *V, const TargetLibraryInfo *TLI,
5782 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
5783 bool AllowAlternate) {
5784 hash_code Key = hash_value(value: V->getValueID() + 2);
5785 hash_code SubKey = hash_value(value: 0);
5786 // Sort the loads by the distance between the pointers.
5787 if (auto *LI = dyn_cast<LoadInst>(Val: V)) {
5788 Key = hash_combine(args: LI->getType(), args: hash_value(value: Instruction::Load), args: Key);
5789 if (LI->isSimple())
5790 SubKey = hash_value(code: LoadsSubkeyGenerator(Key, LI));
5791 else
5792 Key = SubKey = hash_value(ptr: LI);
5793 } else if (isVectorLikeInstWithConstOps(V)) {
5794 // Sort extracts by the vector operands.
5795 if (isa<ExtractElementInst, UndefValue>(Val: V))
5796 Key = hash_value(value: Value::UndefValueVal + 1);
5797 if (auto *EI = dyn_cast<ExtractElementInst>(Val: V)) {
5798 if (!isUndefVector(V: EI->getVectorOperand()).all() &&
5799 !isa<UndefValue>(Val: EI->getIndexOperand()))
5800 SubKey = hash_value(ptr: EI->getVectorOperand());
5801 }
5802 } else if (auto *I = dyn_cast<Instruction>(Val: V)) {
5803 // Sort other instructions just by the opcodes except for CMPInst.
5804 // For CMP also sort by the predicate kind.
5805 if ((isa<BinaryOperator, CastInst>(Val: I)) &&
5806 isValidForAlternation(Opcode: I->getOpcode())) {
5807 if (AllowAlternate)
5808 Key = hash_value(value: isa<BinaryOperator>(Val: I) ? 1 : 0);
5809 else
5810 Key = hash_combine(args: hash_value(value: I->getOpcode()), args: Key);
5811 SubKey = hash_combine(
5812 args: hash_value(value: I->getOpcode()), args: hash_value(ptr: I->getType()),
5813 args: hash_value(ptr: isa<BinaryOperator>(Val: I)
5814 ? I->getType()
5815 : cast<CastInst>(Val: I)->getOperand(i_nocapture: 0)->getType()));
5816 // For casts, look through the only operand to improve compile time.
5817 if (isa<CastInst>(Val: I)) {
5818 std::pair<size_t, size_t> OpVals =
5819 generateKeySubkey(V: I->getOperand(i: 0), TLI, LoadsSubkeyGenerator,
5820 /*AllowAlternate=*/true);
5821 Key = hash_combine(args: OpVals.first, args: Key);
5822 SubKey = hash_combine(args: OpVals.first, args: SubKey);
5823 }
5824 } else if (auto *CI = dyn_cast<CmpInst>(Val: I)) {
5825 CmpInst::Predicate Pred = CI->getPredicate();
5826 if (CI->isCommutative())
5827 Pred = std::min(a: Pred, b: CmpInst::getInversePredicate(pred: Pred));
5828 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(pred: Pred);
5829 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: Pred),
5830 args: hash_value(value: SwapPred),
5831 args: hash_value(ptr: CI->getOperand(i_nocapture: 0)->getType()));
5832 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
5833 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI);
5834 if (isTriviallyVectorizable(ID)) {
5835 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: ID));
5836 } else if (!VFDatabase(*Call).getMappings(CI: *Call).empty()) {
5837 SubKey = hash_combine(args: hash_value(value: I->getOpcode()),
5838 args: hash_value(ptr: Call->getCalledFunction()));
5839 } else {
5840 Key = hash_combine(args: hash_value(ptr: Call), args: Key);
5841 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(ptr: Call));
5842 }
5843 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
5844 SubKey = hash_combine(args: hash_value(value: Op.Begin), args: hash_value(value: Op.End),
5845 args: hash_value(ptr: Op.Tag), args: SubKey);
5846 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
5847 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Val: Gep->getOperand(i_nocapture: 1)))
5848 SubKey = hash_value(ptr: Gep->getPointerOperand());
5849 else
5850 SubKey = hash_value(ptr: Gep);
5851 } else if (BinaryOperator::isIntDivRem(Opcode: I->getOpcode()) &&
5852 !isa<ConstantInt>(Val: I->getOperand(i: 1))) {
5853 // Do not try to vectorize instructions with potentially high cost.
5854 SubKey = hash_value(ptr: I);
5855 } else {
5856 SubKey = hash_value(value: I->getOpcode());
5857 }
5858 Key = hash_combine(args: hash_value(ptr: I->getParent()), args: Key);
5859 }
5860 return std::make_pair(x&: Key, y&: SubKey);
5861}
5862
5863/// Checks if the specified instruction \p I is an alternate operation for
5864/// the given \p MainOp and \p AltOp instructions.
5865static bool isAlternateInstruction(const Instruction *I,
5866 const Instruction *MainOp,
5867 const Instruction *AltOp,
5868 const TargetLibraryInfo &TLI);
5869
5870bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
5871 ArrayRef<Value *> VL) const {
5872 unsigned Opcode0 = S.getOpcode();
5873 unsigned Opcode1 = S.getAltOpcode();
5874 // The opcode mask selects between the two opcodes.
5875 SmallBitVector OpcodeMask(VL.size(), false);
5876 for (unsigned Lane : seq<unsigned>(Begin: 0, End: VL.size()))
5877 if (cast<Instruction>(Val: VL[Lane])->getOpcode() == Opcode1)
5878 OpcodeMask.set(Lane);
5879 // If this pattern is supported by the target then consider it profitable.
5880 if (TTI->isLegalAltInstr(VecTy: FixedVectorType::get(ElementType: S.MainOp->getType(), NumElts: VL.size()),
5881 Opcode0, Opcode1, OpcodeMask))
5882 return true;
5883 SmallVector<ValueList> Operands;
5884 for (unsigned I : seq<unsigned>(Begin: 0, End: S.MainOp->getNumOperands())) {
5885 Operands.emplace_back();
5886 // Prepare the operand vector.
5887 for (Value *V : VL)
5888 Operands.back().push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
5889 }
5890 if (Operands.size() == 2) {
5891 // Try to find the best operand candidates.
5892 for (unsigned I : seq<unsigned>(Begin: 0, End: VL.size() - 1)) {
5893 SmallVector<std::pair<Value *, Value *>> Candidates(3);
5894 Candidates[0] = std::make_pair(x&: Operands[0][I], y&: Operands[0][I + 1]);
5895 Candidates[1] = std::make_pair(x&: Operands[0][I], y&: Operands[1][I + 1]);
5896 Candidates[2] = std::make_pair(x&: Operands[1][I], y&: Operands[0][I + 1]);
5897 std::optional<int> Res = findBestRootPair(Candidates);
5898 switch (Res.value_or(u: 0)) {
5899 case 0:
5900 break;
5901 case 1:
5902 std::swap(a&: Operands[0][I + 1], b&: Operands[1][I + 1]);
5903 break;
5904 case 2:
5905 std::swap(a&: Operands[0][I], b&: Operands[1][I]);
5906 break;
5907 default:
5908 llvm_unreachable("Unexpected index.");
5909 }
5910 }
5911 }
5912 DenseSet<unsigned> UniqueOpcodes;
5913 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
5914 unsigned NonInstCnt = 0;
5915 // Estimate the number of instructions required for the vectorized node and
5916 // for the buildvector node.
5917 unsigned UndefCnt = 0;
5918 // Count the number of extra shuffles required for the vector nodes.
5919 unsigned ExtraShuffleInsts = 0;
5920 // Check whether the operands contain the same values; if so, they form either
5921 // a perfect diamond match or a shuffled match.
5922 if (Operands.size() == 2) {
5923 // Do not count same operands twice.
5924 if (Operands.front() == Operands.back()) {
5925 Operands.erase(CI: Operands.begin());
5926 } else if (!allConstant(VL: Operands.front()) &&
5927 all_of(Range&: Operands.front(), P: [&](Value *V) {
5928 return is_contained(Range&: Operands.back(), Element: V);
5929 })) {
5930 Operands.erase(CI: Operands.begin());
5931 ++ExtraShuffleInsts;
5932 }
5933 }
5934 const Loop *L = LI->getLoopFor(BB: S.MainOp->getParent());
5935 // Vectorize the node if:
5936 // 1. At least one operand is constant or a splat.
5937 // 2. The operands have many loop invariants (while the instructions
5938 // themselves are not loop invariant).
5939 // 3. At least one unique operand is expected to be vectorized.
5940 return none_of(Range&: Operands,
5941 P: [&](ArrayRef<Value *> Op) {
5942 if (allConstant(VL: Op) ||
5943 (!isSplat(VL: Op) && allSameBlock(VL: Op) && allSameType(VL: Op) &&
5944 getSameOpcode(VL: Op, TLI: *TLI).MainOp))
5945 return false;
5946 DenseMap<Value *, unsigned> Uniques;
5947 for (Value *V : Op) {
5948 if (isa<Constant, ExtractElementInst>(Val: V) ||
5949 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
5950 if (isa<UndefValue>(Val: V))
5951 ++UndefCnt;
5952 continue;
5953 }
5954 auto Res = Uniques.try_emplace(Key: V, Args: 0);
5955 // Found first duplicate - need to add shuffle.
5956 if (!Res.second && Res.first->second == 1)
5957 ++ExtraShuffleInsts;
5958 ++Res.first->getSecond();
5959 if (auto *I = dyn_cast<Instruction>(Val: V))
5960 UniqueOpcodes.insert(V: I->getOpcode());
5961 else if (Res.second)
5962 ++NonInstCnt;
5963 }
5964 return none_of(Range&: Uniques, P: [&](const auto &P) {
5965 return P.first->hasNUsesOrMore(P.second + 1) &&
5966 none_of(P.first->users(), [&](User *U) {
5967 return getTreeEntry(V: U) || Uniques.contains(Val: U);
5968 });
5969 });
5970 }) ||
5971 // Do not vectorize the node if the estimated number of vector instructions
5972 // exceeds the estimated number of buildvector instructions. The vector cost
5973 // is the number of vector instructions plus the number of vector
5974 // instructions needed for the operands (buildvectors). The buildvector cost
5975 // is simply number_of_operands * number_of_scalars.
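// Illustrative numbers (added for exposition): for a 4-wide alternate node
// whose main opcode has 2 operands, the buildvector estimate is 2 * 4 = 8,
// so this part of the check only allows vectorization when
// UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts + NumAltInsts (3)
// stays below 8, and the operand values are not dominated by undefs.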
5976 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
5977 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
5978 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
5979}
5980
5981BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
5982 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
5983 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
5984 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
5985
5986 unsigned ShuffleOrOp =
5987 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
5988 auto *VL0 = cast<Instruction>(Val: S.OpValue);
5989 switch (ShuffleOrOp) {
5990 case Instruction::PHI: {
5991 // Check for terminator values (e.g. invoke).
5992 for (Value *V : VL)
5993 for (Value *Incoming : cast<PHINode>(Val: V)->incoming_values()) {
5994 Instruction *Term = dyn_cast<Instruction>(Val: Incoming);
5995 if (Term && Term->isTerminator()) {
5996 LLVM_DEBUG(dbgs()
5997 << "SLP: Need to swizzle PHINodes (terminator use).\n");
5998 return TreeEntry::NeedToGather;
5999 }
6000 }
6001
6002 return TreeEntry::Vectorize;
6003 }
6004 case Instruction::ExtractValue:
6005 case Instruction::ExtractElement: {
6006 bool Reuse = canReuseExtract(VL, OpValue: VL0, CurrentOrder);
6007 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6008 if (!isPowerOf2_32(Value: VL.size()))
6009 return TreeEntry::NeedToGather;
6010 if (Reuse || !CurrentOrder.empty())
6011 return TreeEntry::Vectorize;
6012 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6013 return TreeEntry::NeedToGather;
6014 }
6015 case Instruction::InsertElement: {
6016 // Check that we have a buildvector and not a shuffle of 2 or more
6017 // different vectors.
6018 ValueSet SourceVectors;
6019 for (Value *V : VL) {
6020 SourceVectors.insert(Ptr: cast<Instruction>(Val: V)->getOperand(i: 0));
6021 assert(getInsertIndex(V) != std::nullopt &&
6022 "Non-constant or undef index?");
6023 }
6024
6025 if (count_if(Range&: VL, P: [&SourceVectors](Value *V) {
6026 return !SourceVectors.contains(Ptr: V);
6027 }) >= 2) {
6028 // Found 2nd source vector - cancel.
6029 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6030 "different source vectors.\n");
6031 return TreeEntry::NeedToGather;
6032 }
6033
6034 return TreeEntry::Vectorize;
6035 }
6036 case Instruction::Load: {
6037 // Check that a vectorized load would load the same memory as a scalar
6038 // load. For example, we don't want to vectorize loads that are smaller
6039 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6040 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6041 // from such a struct, we read/write packed bits disagreeing with the
6042 // unvectorized version.
6043 switch (canVectorizeLoads(VL, VL0, Order&: CurrentOrder, PointerOps)) {
6044 case LoadsState::Vectorize:
6045 return TreeEntry::Vectorize;
6046 case LoadsState::ScatterVectorize:
6047 return TreeEntry::ScatterVectorize;
6048 case LoadsState::StridedVectorize:
6049 return TreeEntry::StridedVectorize;
6050 case LoadsState::Gather:
6051#ifndef NDEBUG
6052 Type *ScalarTy = VL0->getType();
6053 if (DL->getTypeSizeInBits(Ty: ScalarTy) !=
6054 DL->getTypeAllocSizeInBits(Ty: ScalarTy))
6055 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6056 else if (any_of(Range&: VL,
6057 P: [](Value *V) { return !cast<LoadInst>(Val: V)->isSimple(); }))
6058 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6059 else
6060 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6061#endif // NDEBUG
6062 return TreeEntry::NeedToGather;
6063 }
6064 llvm_unreachable("Unexpected state of loads");
6065 }
6066 case Instruction::ZExt:
6067 case Instruction::SExt:
6068 case Instruction::FPToUI:
6069 case Instruction::FPToSI:
6070 case Instruction::FPExt:
6071 case Instruction::PtrToInt:
6072 case Instruction::IntToPtr:
6073 case Instruction::SIToFP:
6074 case Instruction::UIToFP:
6075 case Instruction::Trunc:
6076 case Instruction::FPTrunc:
6077 case Instruction::BitCast: {
6078 Type *SrcTy = VL0->getOperand(i: 0)->getType();
6079 for (Value *V : VL) {
6080 Type *Ty = cast<Instruction>(Val: V)->getOperand(i: 0)->getType();
6081 if (Ty != SrcTy || !isValidElementType(Ty)) {
6082 LLVM_DEBUG(
6083 dbgs() << "SLP: Gathering casts with different src types.\n");
6084 return TreeEntry::NeedToGather;
6085 }
6086 }
6087 return TreeEntry::Vectorize;
6088 }
6089 case Instruction::ICmp:
6090 case Instruction::FCmp: {
6091 // Check that all of the compares have the same predicate.
6092 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
6093 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(pred: P0);
6094 Type *ComparedTy = VL0->getOperand(i: 0)->getType();
6095 for (Value *V : VL) {
6096 CmpInst *Cmp = cast<CmpInst>(Val: V);
6097 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6098 Cmp->getOperand(i_nocapture: 0)->getType() != ComparedTy) {
6099 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6100 return TreeEntry::NeedToGather;
6101 }
6102 }
6103 return TreeEntry::Vectorize;
6104 }
6105 case Instruction::Select:
6106 case Instruction::FNeg:
6107 case Instruction::Add:
6108 case Instruction::FAdd:
6109 case Instruction::Sub:
6110 case Instruction::FSub:
6111 case Instruction::Mul:
6112 case Instruction::FMul:
6113 case Instruction::UDiv:
6114 case Instruction::SDiv:
6115 case Instruction::FDiv:
6116 case Instruction::URem:
6117 case Instruction::SRem:
6118 case Instruction::FRem:
6119 case Instruction::Shl:
6120 case Instruction::LShr:
6121 case Instruction::AShr:
6122 case Instruction::And:
6123 case Instruction::Or:
6124 case Instruction::Xor:
6125 return TreeEntry::Vectorize;
6126 case Instruction::GetElementPtr: {
6127 // We don't combine GEPs with complicated (nested) indexing.
6128 for (Value *V : VL) {
6129 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
6130 if (!I)
6131 continue;
6132 if (I->getNumOperands() != 2) {
6133 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6134 return TreeEntry::NeedToGather;
6135 }
6136 }
6137
6138 // We can't combine several GEPs into one vector if they operate on
6139 // different types.
6140 Type *Ty0 = cast<GEPOperator>(Val: VL0)->getSourceElementType();
6141 for (Value *V : VL) {
6142 auto *GEP = dyn_cast<GEPOperator>(Val: V);
6143 if (!GEP)
6144 continue;
6145 Type *CurTy = GEP->getSourceElementType();
6146 if (Ty0 != CurTy) {
6147 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6148 return TreeEntry::NeedToGather;
6149 }
6150 }
6151
6152 // We don't combine GEPs with non-constant indexes.
6153 Type *Ty1 = VL0->getOperand(i: 1)->getType();
6154 for (Value *V : VL) {
6155 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
6156 if (!I)
6157 continue;
6158 auto *Op = I->getOperand(i_nocapture: 1);
6159 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
6160 (Op->getType() != Ty1 &&
6161 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
6162 Op->getType()->getScalarSizeInBits() >
6163 DL->getIndexSizeInBits(
6164 AS: V->getType()->getPointerAddressSpace())))) {
6165 LLVM_DEBUG(
6166 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6167 return TreeEntry::NeedToGather;
6168 }
6169 }
6170
6171 return TreeEntry::Vectorize;
6172 }
6173 case Instruction::Store: {
6174 // Check if the stores are consecutive or if we need to swizzle them.
6175 llvm::Type *ScalarTy = cast<StoreInst>(Val: VL0)->getValueOperand()->getType();
6176 // Avoid types that are padded when being allocated as scalars, while
6177 // being packed together in a vector (such as i1).
6178 if (DL->getTypeSizeInBits(Ty: ScalarTy) !=
6179 DL->getTypeAllocSizeInBits(Ty: ScalarTy)) {
6180 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6181 return TreeEntry::NeedToGather;
6182 }
6183 // Make sure all stores in the bundle are simple - we can't vectorize
6184 // atomic or volatile stores.
6185 for (Value *V : VL) {
6186 auto *SI = cast<StoreInst>(Val: V);
6187 if (!SI->isSimple()) {
6188 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6189 return TreeEntry::NeedToGather;
6190 }
6191 PointerOps.push_back(Elt: SI->getPointerOperand());
6192 }
6193
6194 // Check the order of pointer operands.
6195 if (llvm::sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: CurrentOrder)) {
6196 Value *Ptr0;
6197 Value *PtrN;
6198 if (CurrentOrder.empty()) {
6199 Ptr0 = PointerOps.front();
6200 PtrN = PointerOps.back();
6201 } else {
6202 Ptr0 = PointerOps[CurrentOrder.front()];
6203 PtrN = PointerOps[CurrentOrder.back()];
6204 }
6205 std::optional<int> Dist =
6206 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
6207 // Check that the sorted pointer operands are consecutive.
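// E.g. (illustrative), four i32 stores at byte offsets {0, 4, 8, 12} give an
// element distance of 3 between the first and last sorted pointers, which
// equals VL.size() - 1, so the bundle is treated as consecutive.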
6208 if (Dist && static_cast<unsigned>(*Dist) == VL.size() - 1)
6209 return TreeEntry::Vectorize;
6210 }
6211
6212 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6213 return TreeEntry::NeedToGather;
6214 }
6215 case Instruction::Call: {
6216 // Check if the calls are all to the same vectorizable intrinsic or
6217 // library function.
6218 CallInst *CI = cast<CallInst>(Val: VL0);
6219 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6220
6221 VFShape Shape = VFShape::get(
6222 FTy: CI->getFunctionType(),
6223 EC: ElementCount::getFixed(MinVal: static_cast<unsigned int>(VL.size())),
6224 HasGlobalPred: false /*HasGlobalPred*/);
6225 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6226
6227 if (!VecFunc && !isTriviallyVectorizable(ID)) {
6228 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6229 return TreeEntry::NeedToGather;
6230 }
6231 Function *F = CI->getCalledFunction();
6232 unsigned NumArgs = CI->arg_size();
6233 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6234 for (unsigned J = 0; J != NumArgs; ++J)
6235 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J))
6236 ScalarArgs[J] = CI->getArgOperand(i: J);
6237 for (Value *V : VL) {
6238 CallInst *CI2 = dyn_cast<CallInst>(Val: V);
6239 if (!CI2 || CI2->getCalledFunction() != F ||
6240 getVectorIntrinsicIDForCall(CI: CI2, TLI) != ID ||
6241 (VecFunc &&
6242 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6243 !CI->hasIdenticalOperandBundleSchema(Other: *CI2)) {
6244 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6245 << "\n");
6246 return TreeEntry::NeedToGather;
6247 }
6248 // Some intrinsics have scalar arguments, and those must be identical across
6249 // the bundle for the calls to be vectorized.
6250 for (unsigned J = 0; J != NumArgs; ++J) {
6251 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J)) {
6252 Value *A1J = CI2->getArgOperand(i: J);
6253 if (ScalarArgs[J] != A1J) {
6254 LLVM_DEBUG(dbgs()
6255 << "SLP: mismatched arguments in call:" << *CI
6256 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6257 return TreeEntry::NeedToGather;
6258 }
6259 }
6260 }
6261 // Verify that the bundle operands are identical between the two calls.
6262 if (CI->hasOperandBundles() &&
6263 !std::equal(first1: CI->op_begin() + CI->getBundleOperandsStartIndex(),
6264 last1: CI->op_begin() + CI->getBundleOperandsEndIndex(),
6265 first2: CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6266 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6267 << "!=" << *V << '\n');
6268 return TreeEntry::NeedToGather;
6269 }
6270 }
6271
6272 return TreeEntry::Vectorize;
6273 }
6274 case Instruction::ShuffleVector: {
6275 // If this is not an alternate sequence of opcode like add-sub
6276 // then do not vectorize this instruction.
6277 if (!S.isAltShuffle()) {
6278 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6279 return TreeEntry::NeedToGather;
6280 }
6281 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6282 LLVM_DEBUG(
6283 dbgs()
6284 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6285 "the whole alt sequence is not profitable.\n");
6286 return TreeEntry::NeedToGather;
6287 }
6288
6289 return TreeEntry::Vectorize;
6290 }
6291 default:
6292 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6293 return TreeEntry::NeedToGather;
6294 }
6295}
6296
6297void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6298 const EdgeInfo &UserTreeIdx) {
6299 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6300
6301 SmallVector<int> ReuseShuffleIndicies;
6302 SmallVector<Value *> UniqueValues;
6303 SmallVector<Value *> NonUniqueValueVL;
6304 auto TryToFindDuplicates = [&](const InstructionsState &S,
6305 bool DoNotFail = false) {
6306 // Check that every instruction appears once in this bundle.
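// For example (illustration only): VL = {A, B, A, C} yields
// UniqueValues = {A, B, C} and ReuseShuffleIndicies = {0, 1, 0, 2}.
// Constant lanes are always appended as new unique values, and undef lanes
// are encoded as PoisonMaskElem.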
6307 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6308 for (Value *V : VL) {
6309 if (isConstant(V)) {
6310 ReuseShuffleIndicies.emplace_back(
6311 Args: isa<UndefValue>(Val: V) ? PoisonMaskElem : UniqueValues.size());
6312 UniqueValues.emplace_back(Args&: V);
6313 continue;
6314 }
6315 auto Res = UniquePositions.try_emplace(Key: V, Args: UniqueValues.size());
6316 ReuseShuffleIndicies.emplace_back(Args&: Res.first->second);
6317 if (Res.second)
6318 UniqueValues.emplace_back(Args&: V);
6319 }
6320 size_t NumUniqueScalarValues = UniqueValues.size();
6321 if (NumUniqueScalarValues == VL.size()) {
6322 ReuseShuffleIndicies.clear();
6323 } else {
6324 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6325 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6326 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6327 "for nodes with padding.\n");
6328 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6329 return false;
6330 }
6331 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6332 if (NumUniqueScalarValues <= 1 ||
6333 (UniquePositions.size() == 1 && all_of(Range&: UniqueValues,
6334 P: [](Value *V) {
6335 return isa<UndefValue>(Val: V) ||
6336 !isConstant(V);
6337 })) ||
6338 !llvm::has_single_bit<uint32_t>(Value: NumUniqueScalarValues)) {
6339 if (DoNotFail && UniquePositions.size() > 1 &&
6340 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6341 all_of(Range&: UniqueValues, P: [=](Value *V) {
6342 return isa<ExtractElementInst>(Val: V) ||
6343 areAllUsersVectorized(I: cast<Instruction>(Val: V),
6344 VectorizedVals: UserIgnoreList);
6345 })) {
6346 unsigned PWSz = PowerOf2Ceil(A: UniqueValues.size());
6347 if (PWSz == VL.size()) {
6348 ReuseShuffleIndicies.clear();
6349 } else {
6350 NonUniqueValueVL.assign(in_start: UniqueValues.begin(), in_end: UniqueValues.end());
6351 NonUniqueValueVL.append(NumInputs: PWSz - UniqueValues.size(),
6352 Elt: UniqueValues.back());
6353 VL = NonUniqueValueVL;
6354 }
6355 return true;
6356 }
6357 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6358 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6359 return false;
6360 }
6361 VL = UniqueValues;
6362 }
6363 return true;
6364 };
6365
6366 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
6367
6368 // Don't vectorize ephemeral values.
6369 if (!EphValues.empty()) {
6370 for (Value *V : VL) {
6371 if (EphValues.count(Ptr: V)) {
6372 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6373 << ") is ephemeral.\n");
6374 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6375 return;
6376 }
6377 }
6378 }
6379
6380 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6381 // a load), in which case peek through to include it in the tree, without
6382 // ballooning over-budget.
6383 if (Depth >= RecursionMaxDepth &&
6384 !(S.MainOp && isa<Instruction>(Val: S.MainOp) && S.MainOp == S.AltOp &&
6385 VL.size() >= 4 &&
6386 (match(V: S.MainOp, P: m_Load(Op: m_Value())) || all_of(Range&: VL, P: [&S](const Value *I) {
6387 return match(V: I,
6388 P: m_OneUse(SubPattern: m_ZExtOrSExt(Op: m_OneUse(SubPattern: m_Load(Op: m_Value()))))) &&
6389 cast<Instruction>(Val: I)->getOpcode() ==
6390 cast<Instruction>(Val: S.MainOp)->getOpcode();
6391 })))) {
6392 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6393 if (TryToFindDuplicates(S))
6394 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6395 ReuseShuffleIndices: ReuseShuffleIndicies);
6396 return;
6397 }
6398
6399 // Don't handle scalable vectors
6400 if (S.getOpcode() == Instruction::ExtractElement &&
6401 isa<ScalableVectorType>(
6402 Val: cast<ExtractElementInst>(Val: S.OpValue)->getVectorOperandType())) {
6403 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6404 if (TryToFindDuplicates(S))
6405 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6406 ReuseShuffleIndices: ReuseShuffleIndicies);
6407 return;
6408 }
6409
6410 // Don't handle vectors.
6411 if (S.OpValue->getType()->isVectorTy() &&
6412 !isa<InsertElementInst>(Val: S.OpValue)) {
6413 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6414 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6415 return;
6416 }
6417
6418 if (StoreInst *SI = dyn_cast<StoreInst>(Val: S.OpValue))
6419 if (SI->getValueOperand()->getType()->isVectorTy()) {
6420 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6421 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6422 return;
6423 }
6424
6425 // If all of the operands are identical or constant we have a simple solution.
6426 // If we are dealing with insert/extract instructions, they must all have
6427 // constant indices; otherwise we should gather them rather than vectorize.
6428 // If this is an alternate-opcode node with 2 elements and gathered operands,
6429 // do not vectorize it.
6430 auto &&NotProfitableForVectorization = [&S, this,
6431 Depth](ArrayRef<Value *> VL) {
6432 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6433 return false;
6434 if (VectorizableTree.size() < MinTreeSize)
6435 return false;
6436 if (Depth >= RecursionMaxDepth - 1)
6437 return true;
6438 // Check whether all operands are extracts, are part of a vector node, or
6439 // can form a regular vectorizable node.
6440 SmallVector<unsigned, 2> InstsCount;
6441 for (Value *V : VL) {
6442 auto *I = cast<Instruction>(Val: V);
6443 InstsCount.push_back(Elt: count_if(Range: I->operand_values(), P: [](Value *Op) {
6444 return isa<Instruction>(Val: Op) || isVectorLikeInstWithConstOps(V: Op);
6445 }));
6446 }
6447 bool IsCommutative = isCommutative(I: S.MainOp) || isCommutative(I: S.AltOp);
6448 if ((IsCommutative &&
6449 std::accumulate(first: InstsCount.begin(), last: InstsCount.end(), init: 0) < 2) ||
6450 (!IsCommutative &&
6451 all_of(Range&: InstsCount, P: [](unsigned ICnt) { return ICnt < 2; })))
6452 return true;
6453 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6454 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6455 auto *I1 = cast<Instruction>(Val: VL.front());
6456 auto *I2 = cast<Instruction>(Val: VL.back());
6457 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6458 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
6459 Args: I2->getOperand(i: Op));
6460 if (static_cast<unsigned>(count_if(
6461 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6462 return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat);
6463 })) >= S.MainOp->getNumOperands() / 2)
6464 return false;
6465 if (S.MainOp->getNumOperands() > 2)
6466 return true;
6467 if (IsCommutative) {
6468 // Check permuted operands.
6469 Candidates.clear();
6470 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6471 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
6472 Args: I2->getOperand(i: (Op + 1) % E));
6473 if (any_of(
6474 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6475 return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat);
6476 }))
6477 return false;
6478 }
6479 return true;
6480 };
6481 SmallVector<unsigned> SortedIndices;
6482 BasicBlock *BB = nullptr;
6483 bool IsScatterVectorizeUserTE =
6484 UserTreeIdx.UserTE &&
6485 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6486 bool AreAllSameInsts =
6487 (S.getOpcode() && allSameBlock(VL)) ||
6488 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6489 VL.size() > 2 &&
6490 all_of(Range&: VL,
6491 P: [&BB](Value *V) {
6492 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
6493 if (!I)
6494 return doesNotNeedToBeScheduled(V);
6495 if (!BB)
6496 BB = I->getParent();
6497 return BB == I->getParent() && I->getNumOperands() == 2;
6498 }) &&
6499 BB &&
6500 sortPtrAccesses(VL, ElemTy: UserTreeIdx.UserTE->getMainOp()->getType(), DL: *DL, SE&: *SE,
6501 SortedIndices));
6502 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6503 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6504 Val: S.OpValue) &&
6505 !all_of(Range&: VL, P: isVectorLikeInstWithConstOps)) ||
6506 NotProfitableForVectorization(VL)) {
6507 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6508 if (TryToFindDuplicates(S))
6509 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6510 ReuseShuffleIndices: ReuseShuffleIndicies);
6511 return;
6512 }
6513
6514 // We now know that this is a vector of instructions of the same type from
6515 // the same block.
6516
6517 // Check if this is a duplicate of another entry.
6518 if (TreeEntry *E = getTreeEntry(V: S.OpValue)) {
6519 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6520 if (!E->isSame(VL)) {
6521 auto It = MultiNodeScalars.find(Val: S.OpValue);
6522 if (It != MultiNodeScalars.end()) {
6523 auto *TEIt = find_if(Range&: It->getSecond(),
6524 P: [&](TreeEntry *ME) { return ME->isSame(VL); });
6525 if (TEIt != It->getSecond().end())
6526 E = *TEIt;
6527 else
6528 E = nullptr;
6529 } else {
6530 E = nullptr;
6531 }
6532 }
6533 if (!E) {
6534 if (!doesNotNeedToBeScheduled(V: S.OpValue)) {
6535 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6536 if (TryToFindDuplicates(S))
6537 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6538 ReuseShuffleIndices: ReuseShuffleIndicies);
6539 return;
6540 }
6541 } else {
6542 // Record the reuse of the tree node. FIXME: currently this is only used
6543 // to properly draw the graph rather than for the actual vectorization.
6544 E->UserTreeIndices.push_back(Elt: UserTreeIdx);
6545 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6546 << ".\n");
6547 return;
6548 }
6549 }
6550
6551 // Check that none of the instructions in the bundle are already in the tree.
6552 for (Value *V : VL) {
6553 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(Val: V)) ||
6554 doesNotNeedToBeScheduled(V))
6555 continue;
6556 if (getTreeEntry(V)) {
6557 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6558 << ") is already in tree.\n");
6559 if (TryToFindDuplicates(S))
6560 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6561 ReuseShuffleIndices: ReuseShuffleIndicies);
6562 return;
6563 }
6564 }
6565
6566 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
6567 if (UserIgnoreList && !UserIgnoreList->empty()) {
6568 for (Value *V : VL) {
6569 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6570 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6571 if (TryToFindDuplicates(S))
6572 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6573 ReuseShuffleIndices: ReuseShuffleIndicies);
6574 return;
6575 }
6576 }
6577 }
6578
6579 // Special processing for sorted pointers for a ScatterVectorize node with
6580 // constant indices only.
6581 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6582 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6583 !(S.getOpcode() && allSameBlock(VL))) {
6584 assert(S.OpValue->getType()->isPointerTy() &&
6585 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6586 "Expected pointers only.");
6587 // Reset S to make it GetElementPtr kind of node.
6588 const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>);
6589 assert(It != VL.end() && "Expected at least one GEP.");
6590 S = getSameOpcode(VL: *It, TLI: *TLI);
6591 }
6592
6593 // Check that all of the users of the scalars that we want to vectorize are
6594 // schedulable.
6595 auto *VL0 = cast<Instruction>(Val: S.OpValue);
6596 BB = VL0->getParent();
6597
6598 if (!DT->isReachableFromEntry(A: BB)) {
6599 // Don't go into unreachable blocks. They may contain instructions with
6600 // dependency cycles which confuse the final scheduling.
6601 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6602 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6603 return;
6604 }
6605
6606 // Don't go into catchswitch blocks, which can happen with PHIs.
6607 // Such blocks can only have PHIs and the catchswitch. There is no
6608 // place to insert a shuffle if we need to, so just avoid that issue.
6609 if (isa<CatchSwitchInst>(Val: BB->getTerminator())) {
6610 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
6611 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6612 return;
6613 }
6614
6615 // Check that every instruction appears once in this bundle.
6616 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
6617 return;
6618
6619 // Perform specific checks for each particular instruction kind.
6620 OrdersType CurrentOrder;
6621 SmallVector<Value *> PointerOps;
6622 TreeEntry::EntryState State = getScalarsVectorizationState(
6623 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6624 if (State == TreeEntry::NeedToGather) {
6625 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6626 ReuseShuffleIndices: ReuseShuffleIndicies);
6627 return;
6628 }
6629
6630 auto &BSRef = BlocksSchedules[BB];
6631 if (!BSRef)
6632 BSRef = std::make_unique<BlockScheduling>(args&: BB);
6633
6634 BlockScheduling &BS = *BSRef;
6635
6636 std::optional<ScheduleData *> Bundle =
6637 BS.tryScheduleBundle(VL: UniqueValues, SLP: this, S);
6638#ifdef EXPENSIVE_CHECKS
6639 // Make sure we didn't break any internal invariants
6640 BS.verify();
6641#endif
6642 if (!Bundle) {
6643 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
6644 assert((!BS.getScheduleData(VL0) ||
6645 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6646 "tryScheduleBundle should cancelScheduling on failure");
6647 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6648 ReuseShuffleIndices: ReuseShuffleIndicies);
6649 return;
6650 }
6651 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
6652
6653 unsigned ShuffleOrOp = S.isAltShuffle() ?
6654 (unsigned) Instruction::ShuffleVector : S.getOpcode();
6655 switch (ShuffleOrOp) {
6656 case Instruction::PHI: {
6657 auto *PH = cast<PHINode>(Val: VL0);
6658
6659 TreeEntry *TE =
6660 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices: ReuseShuffleIndicies);
6661 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
6662
6663 // Keeps the reordered operands to avoid code duplication.
6664 SmallVector<ValueList, 2> OperandsVec;
6665 for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
6666 if (!DT->isReachableFromEntry(A: PH->getIncomingBlock(i: I))) {
6667 ValueList Operands(VL.size(), PoisonValue::get(T: PH->getType()));
6668 TE->setOperand(OpIdx: I, OpVL: Operands);
6669 OperandsVec.push_back(Elt: Operands);
6670 continue;
6671 }
6672 ValueList Operands;
6673 // Prepare the operand vector.
6674 for (Value *V : VL)
6675 Operands.push_back(Elt: cast<PHINode>(Val: V)->getIncomingValueForBlock(
6676 BB: PH->getIncomingBlock(i: I)));
6677 TE->setOperand(OpIdx: I, OpVL: Operands);
6678 OperandsVec.push_back(Elt: Operands);
6679 }
6680 for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
6681 buildTree_rec(VL: OperandsVec[OpIdx], Depth: Depth + 1, UserTreeIdx: {TE, OpIdx});
6682 return;
6683 }
6684 case Instruction::ExtractValue:
6685 case Instruction::ExtractElement: {
6686 if (CurrentOrder.empty()) {
6687 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
6688 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6689 ReuseShuffleIndices: ReuseShuffleIndicies);
6690 // This is a special case, as it does not gather, but at the same time
6691 // we are not extending buildTree_rec() towards the operands.
6692 ValueList Op0;
6693 Op0.assign(NumElts: VL.size(), Elt: VL0->getOperand(i: 0));
6694 VectorizableTree.back()->setOperand(OpIdx: 0, OpVL: Op0);
6695 return;
6696 }
6697 LLVM_DEBUG({
6698 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
6699 "with order";
6700 for (unsigned Idx : CurrentOrder)
6701 dbgs() << " " << Idx;
6702 dbgs() << "\n";
6703 });
6704 fixupOrderingIndices(Order: CurrentOrder);
6705 // Insert new order with initial value 0, if it does not exist,
6706 // otherwise return the iterator to the existing one.
6707 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6708 ReuseShuffleIndices: ReuseShuffleIndicies, ReorderIndices: CurrentOrder);
6709 // This is a special case, as it does not gather, but at the same time
6710 // we are not extending buildTree_rec() towards the operands.
6711 ValueList Op0;
6712 Op0.assign(NumElts: VL.size(), Elt: VL0->getOperand(i: 0));
6713 VectorizableTree.back()->setOperand(OpIdx: 0, OpVL: Op0);
6714 return;
6715 }
6716 case Instruction::InsertElement: {
6717 assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
6718
6719 auto OrdCompare = [](const std::pair<int, int> &P1,
6720 const std::pair<int, int> &P2) {
6721 return P1.first > P2.first;
6722 };
6723 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
6724 decltype(OrdCompare)>
6725 Indices(OrdCompare);
6726 for (int I = 0, E = VL.size(); I < E; ++I) {
6727 unsigned Idx = *getInsertIndex(InsertInst: VL[I]);
6728 Indices.emplace(args&: Idx, args&: I);
6729 }
6730 OrdersType CurrentOrder(VL.size(), VL.size());
6731 bool IsIdentity = true;
6732 for (int I = 0, E = VL.size(); I < E; ++I) {
6733 CurrentOrder[Indices.top().second] = I;
6734 IsIdentity &= Indices.top().second == I;
6735 Indices.pop();
6736 }
6737 if (IsIdentity)
6738 CurrentOrder.clear();
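// Illustrative example (added for exposition): for inserts whose indices are
// the permutation {1, 2, 0, 3}, the priority queue pops the lanes in index
// order and CurrentOrder becomes {1, 2, 0, 3}; for indices {0, 1, 2, 3} the
// order is the identity and is cleared.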
6739 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6740 ReuseShuffleIndices: std::nullopt, ReorderIndices: CurrentOrder);
6741 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
6742
6743 constexpr int NumOps = 2;
6744 ValueList VectorOperands[NumOps];
6745 for (int I = 0; I < NumOps; ++I) {
6746 for (Value *V : VL)
6747 VectorOperands[I].push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
6748
6749 TE->setOperand(OpIdx: I, OpVL: VectorOperands[I]);
6750 }
6751 buildTree_rec(VL: VectorOperands[NumOps - 1], Depth: Depth + 1, UserTreeIdx: {TE, NumOps - 1});
6752 return;
6753 }
6754 case Instruction::Load: {
6755 // Check that a vectorized load would load the same memory as a scalar
6756 // load. For example, we don't want to vectorize loads that are smaller
6757 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6758 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6759 // from such a struct, we read/write packed bits disagreeing with the
6760 // unvectorized version.
6761 TreeEntry *TE = nullptr;
6762 fixupOrderingIndices(Order: CurrentOrder);
6763 switch (State) {
6764 case TreeEntry::Vectorize:
6765 if (CurrentOrder.empty()) {
6766 // Original loads are consecutive and do not require reordering.
6767 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6768 ReuseShuffleIndices: ReuseShuffleIndicies);
6769 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
6770 } else {
6771 // Need to reorder.
6772 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6773 ReuseShuffleIndices: ReuseShuffleIndicies, ReorderIndices: CurrentOrder);
6774 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
6775 }
6776 TE->setOperandsInOrder();
6777 break;
6778 case TreeEntry::StridedVectorize:
6779 // Vectorizing non-consecutive loads as strided loads.
6780 if (CurrentOrder.empty()) {
6781 TE = newTreeEntry(VL, EntryState: TreeEntry::StridedVectorize, Bundle, S,
6782 UserTreeIdx, ReuseShuffleIndices: ReuseShuffleIndicies);
6783 } else {
6784 TE = newTreeEntry(VL, EntryState: TreeEntry::StridedVectorize, Bundle, S,
6785 UserTreeIdx, ReuseShuffleIndices: ReuseShuffleIndicies, ReorderIndices: CurrentOrder);
6786 }
6787 TE->setOperandsInOrder();
6788 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
6789 break;
6790 case TreeEntry::ScatterVectorize:
6791 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
6792 TE = newTreeEntry(VL, EntryState: TreeEntry::ScatterVectorize, Bundle, S,
6793 UserTreeIdx, ReuseShuffleIndices: ReuseShuffleIndicies);
6794 TE->setOperandsInOrder();
6795 buildTree_rec(VL: PointerOps, Depth: Depth + 1, UserTreeIdx: {TE, 0});
6796 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
6797 break;
6798 case TreeEntry::NeedToGather:
6799 llvm_unreachable("Unexpected loads state.");
6800 }
6801 return;
6802 }
6803 case Instruction::ZExt:
6804 case Instruction::SExt:
6805 case Instruction::FPToUI:
6806 case Instruction::FPToSI:
6807 case Instruction::FPExt:
6808 case Instruction::PtrToInt:
6809 case Instruction::IntToPtr:
6810 case Instruction::SIToFP:
6811 case Instruction::UIToFP:
6812 case Instruction::Trunc:
6813 case Instruction::FPTrunc:
6814 case Instruction::BitCast: {
6815 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
6816 u: std::make_pair(x: std::numeric_limits<unsigned>::min(),
6817 y: std::numeric_limits<unsigned>::max()));
6818 if (ShuffleOrOp == Instruction::ZExt ||
6819 ShuffleOrOp == Instruction::SExt) {
6820 CastMaxMinBWSizes = std::make_pair(
6821 x: std::max<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
6822 b: PrevMaxBW),
6823 y: std::min<unsigned>(
6824 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
6825 b: PrevMinBW));
6826 } else if (ShuffleOrOp == Instruction::Trunc) {
6827 CastMaxMinBWSizes = std::make_pair(
6828 x: std::max<unsigned>(
6829 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
6830 b: PrevMaxBW),
6831 y: std::min<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
6832 b: PrevMinBW));
6833 ExtraBitWidthNodes.insert(V: VectorizableTree.size() + 1);
6834 } else if (ShuffleOrOp == Instruction::SIToFP ||
6835 ShuffleOrOp == Instruction::UIToFP) {
6836 unsigned NumSignBits =
6837 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
6838 if (auto *OpI = dyn_cast<Instruction>(Val: VL0->getOperand(i: 0))) {
6839 APInt Mask = DB->getDemandedBits(I: OpI);
6840 NumSignBits = std::max(a: NumSignBits, b: Mask.countl_zero());
6841 }
6842 if (NumSignBits * 2 >=
6843 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
6844 ExtraBitWidthNodes.insert(V: VectorizableTree.size() + 1);
6845 }
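// Illustrative example (added for exposition): for a bundle of
// 'zext i8 %x to i32' instructions, CastMaxMinBWSizes becomes
// {max(32, PrevMaxBW), min(8, PrevMinBW)}, recording the widest destination
// and the narrowest source type seen so far for later bitwidth minimization.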
6846 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6847 ReuseShuffleIndices: ReuseShuffleIndicies);
6848 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
6849
6850 TE->setOperandsInOrder();
6851 for (unsigned I : seq<unsigned>(Begin: 0, End: VL0->getNumOperands())) {
6852 ValueList Operands;
6853 // Prepare the operand vector.
6854 for (Value *V : VL)
6855 Operands.push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
6856
6857 buildTree_rec(VL: Operands, Depth: Depth + 1, UserTreeIdx: {TE, I});
6858 }
6859 return;
6860 }
6861 case Instruction::ICmp:
6862 case Instruction::FCmp: {
6863 // Check that all of the compares have the same predicate.
6864 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
6865 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6866 ReuseShuffleIndices: ReuseShuffleIndicies);
6867 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
6868
6869 ValueList Left, Right;
6870 if (cast<CmpInst>(Val: VL0)->isCommutative()) {
6871 // Commutative predicate - collect + sort operands of the instructions
6872 // so that each side is more likely to have the same opcode.
6873 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
6874 "Commutative Predicate mismatch");
6875 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
6876 } else {
6877 // Collect operands - commute if it uses the swapped predicate.
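// E.g. (illustrative), if P0 is 'slt' and one lane is 'icmp sgt %a, %b',
// that lane contributes (LHS, RHS) = (%b, %a) so every lane can later be
// emitted with the single predicate P0.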
6878 for (Value *V : VL) {
6879 auto *Cmp = cast<CmpInst>(Val: V);
6880 Value *LHS = Cmp->getOperand(i_nocapture: 0);
6881 Value *RHS = Cmp->getOperand(i_nocapture: 1);
6882 if (Cmp->getPredicate() != P0)
6883 std::swap(a&: LHS, b&: RHS);
6884 Left.push_back(Elt: LHS);
6885 Right.push_back(Elt: RHS);
6886 }
6887 }
6888 TE->setOperand(OpIdx: 0, OpVL: Left);
6889 TE->setOperand(OpIdx: 1, OpVL: Right);
6890 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
6891 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
6892 if (ShuffleOrOp == Instruction::ICmp) {
6893 unsigned NumSignBits0 =
6894 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
6895 if (NumSignBits0 * 2 >=
6896 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
6897 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
6898 unsigned NumSignBits1 =
6899 ComputeNumSignBits(Op: VL0->getOperand(i: 1), DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
6900 if (NumSignBits1 * 2 >=
6901 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 1)->getType()))
6902 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 1)->Idx);
6903 }
6904 return;
6905 }
6906 case Instruction::Select:
6907 case Instruction::FNeg:
6908 case Instruction::Add:
6909 case Instruction::FAdd:
6910 case Instruction::Sub:
6911 case Instruction::FSub:
6912 case Instruction::Mul:
6913 case Instruction::FMul:
6914 case Instruction::UDiv:
6915 case Instruction::SDiv:
6916 case Instruction::FDiv:
6917 case Instruction::URem:
6918 case Instruction::SRem:
6919 case Instruction::FRem:
6920 case Instruction::Shl:
6921 case Instruction::LShr:
6922 case Instruction::AShr:
6923 case Instruction::And:
6924 case Instruction::Or:
6925 case Instruction::Xor: {
6926 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6927 ReuseShuffleIndices: ReuseShuffleIndicies);
6928 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
6929
6930 // Sort operands of the instructions so that each side is more likely to
6931 // have the same opcode.
6932 if (isa<BinaryOperator>(Val: VL0) && isCommutative(I: VL0)) {
6933 ValueList Left, Right;
6934 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
6935 TE->setOperand(OpIdx: 0, OpVL: Left);
6936 TE->setOperand(OpIdx: 1, OpVL: Right);
6937 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
6938 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
6939 return;
6940 }
6941
6942 TE->setOperandsInOrder();
6943 for (unsigned I : seq<unsigned>(Begin: 0, End: VL0->getNumOperands())) {
6944 ValueList Operands;
6945 // Prepare the operand vector.
6946 for (Value *V : VL)
6947 Operands.push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
6948
6949 buildTree_rec(VL: Operands, Depth: Depth + 1, UserTreeIdx: {TE, I});
6950 }
6951 return;
6952 }
6953 case Instruction::GetElementPtr: {
6954 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6955 ReuseShuffleIndices: ReuseShuffleIndicies);
6956 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
6957 SmallVector<ValueList, 2> Operands(2);
6958 // Prepare the operand vector for pointer operands.
6959 for (Value *V : VL) {
6960 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
6961 if (!GEP) {
6962 Operands.front().push_back(Elt: V);
6963 continue;
6964 }
6965 Operands.front().push_back(Elt: GEP->getPointerOperand());
6966 }
6967 TE->setOperand(OpIdx: 0, OpVL: Operands.front());
6968 // Need to cast all indices to the same type before vectorization to
6969      // avoid a crash.
6970 // Required to be able to find correct matches between different gather
6971 // nodes and reuse the vectorized values rather than trying to gather them
6972 // again.
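      // Illustrative example (hypothetical IR):
      // \code
      //   %g0 = getelementptr inbounds i32, ptr %p, i32 1
      //   %g1 = getelementptr inbounds i32, ptr %p, i64 2
      // \endcode
      // The index types differ, so both constant indices are folded/cast to
      // the common index type (here the pointer's index type) before the index
      // operand vector is built.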
6973 int IndexIdx = 1;
6974 Type *VL0Ty = VL0->getOperand(i: IndexIdx)->getType();
6975 Type *Ty = all_of(Range&: VL,
6976 P: [VL0Ty, IndexIdx](Value *V) {
6977 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
6978 if (!GEP)
6979 return true;
6980 return VL0Ty == GEP->getOperand(i_nocapture: IndexIdx)->getType();
6981 })
6982 ? VL0Ty
6983 : DL->getIndexType(PtrTy: cast<GetElementPtrInst>(Val: VL0)
6984 ->getPointerOperandType()
6985 ->getScalarType());
6986 // Prepare the operand vector.
6987 for (Value *V : VL) {
6988 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
6989 if (!I) {
6990 Operands.back().push_back(
6991 Elt: ConstantInt::get(Ty, V: 0, /*isSigned=*/IsSigned: false));
6992 continue;
6993 }
6994 auto *Op = I->getOperand(i_nocapture: IndexIdx);
6995 auto *CI = dyn_cast<ConstantInt>(Val: Op);
6996 if (!CI)
6997 Operands.back().push_back(Elt: Op);
6998 else
6999 Operands.back().push_back(Elt: ConstantFoldIntegerCast(
7000 C: CI, DestTy: Ty, IsSigned: CI->getValue().isSignBitSet(), DL: *DL));
7001 }
7002 TE->setOperand(OpIdx: IndexIdx, OpVL: Operands.back());
7003
7004 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7005 buildTree_rec(VL: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
7006 return;
7007 }
7008 case Instruction::Store: {
7009 // Check if the stores are consecutive or if we need to swizzle them.
7010 ValueList Operands(VL.size());
7011 auto *OIter = Operands.begin();
7012 for (Value *V : VL) {
7013 auto *SI = cast<StoreInst>(Val: V);
7014 *OIter = SI->getValueOperand();
7015 ++OIter;
7016 }
7017 // Check that the sorted pointer operands are consecutive.
7018 if (CurrentOrder.empty()) {
7019        // Original stores are consecutive and do not require reordering.
7020 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7021 ReuseShuffleIndices: ReuseShuffleIndicies);
7022 TE->setOperandsInOrder();
7023 buildTree_rec(VL: Operands, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7024 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7025 } else {
7026 fixupOrderingIndices(Order: CurrentOrder);
7027 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7028 ReuseShuffleIndices: ReuseShuffleIndicies, ReorderIndices: CurrentOrder);
7029 TE->setOperandsInOrder();
7030 buildTree_rec(VL: Operands, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7031 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7032 }
7033 return;
7034 }
7035 case Instruction::Call: {
7036 // Check if the calls are all to the same vectorizable intrinsic or
7037 // library function.
7038 CallInst *CI = cast<CallInst>(Val: VL0);
7039 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7040
7041 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7042 ReuseShuffleIndices: ReuseShuffleIndicies);
7043 // Sort operands of the instructions so that each side is more likely to
7044 // have the same opcode.
7045 if (isCommutative(I: VL0)) {
7046 ValueList Left, Right;
7047 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
7048 TE->setOperand(OpIdx: 0, OpVL: Left);
7049 TE->setOperand(OpIdx: 1, OpVL: Right);
7050 SmallVector<ValueList> Operands;
7051 for (unsigned I : seq<unsigned>(Begin: 2, End: CI->arg_size())) {
7052 Operands.emplace_back();
7053 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I))
7054 continue;
7055 for (Value *V : VL) {
7056 auto *CI2 = cast<CallInst>(Val: V);
7057 Operands.back().push_back(Elt: CI2->getArgOperand(i: I));
7058 }
7059 TE->setOperand(OpIdx: I, OpVL: Operands.back());
7060 }
7061 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7062 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
7063 for (unsigned I : seq<unsigned>(Begin: 2, End: CI->arg_size())) {
7064 if (Operands[I - 2].empty())
7065 continue;
7066 buildTree_rec(VL: Operands[I - 2], Depth: Depth + 1, UserTreeIdx: {TE, I});
7067 }
7068 return;
7069 }
7070 TE->setOperandsInOrder();
7071 for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) {
7072        // For scalar operands there is no need to create a tree entry, since
7073        // there is nothing to vectorize.
7074 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I))
7075 continue;
7076 ValueList Operands;
7077 // Prepare the operand vector.
7078 for (Value *V : VL) {
7079 auto *CI2 = cast<CallInst>(Val: V);
7080 Operands.push_back(Elt: CI2->getArgOperand(i: I));
7081 }
7082 buildTree_rec(VL: Operands, Depth: Depth + 1, UserTreeIdx: {TE, I});
7083 }
7084 return;
7085 }
7086 case Instruction::ShuffleVector: {
7087 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7088 ReuseShuffleIndices: ReuseShuffleIndicies);
7089 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7090
7091 // Reorder operands if reordering would enable vectorization.
7092 auto *CI = dyn_cast<CmpInst>(Val: VL0);
7093 if (isa<BinaryOperator>(Val: VL0) || CI) {
7094 ValueList Left, Right;
7095 if (!CI || all_of(Range&: VL, P: [](Value *V) {
7096 return cast<CmpInst>(Val: V)->isCommutative();
7097 })) {
7098 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
7099 } else {
7100 auto *MainCI = cast<CmpInst>(Val: S.MainOp);
7101 auto *AltCI = cast<CmpInst>(Val: S.AltOp);
7102 CmpInst::Predicate MainP = MainCI->getPredicate();
7103 CmpInst::Predicate AltP = AltCI->getPredicate();
7104 assert(MainP != AltP &&
7105 "Expected different main/alternate predicates.");
7106 // Collect operands - commute if it uses the swapped predicate or
7107 // alternate operation.
7108 for (Value *V : VL) {
7109 auto *Cmp = cast<CmpInst>(Val: V);
7110 Value *LHS = Cmp->getOperand(i_nocapture: 0);
7111 Value *RHS = Cmp->getOperand(i_nocapture: 1);
7112
7113 if (isAlternateInstruction(I: Cmp, MainOp: MainCI, AltOp: AltCI, TLI: *TLI)) {
7114 if (AltP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
7115 std::swap(a&: LHS, b&: RHS);
7116 } else {
7117 if (MainP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
7118 std::swap(a&: LHS, b&: RHS);
7119 }
7120 Left.push_back(Elt: LHS);
7121 Right.push_back(Elt: RHS);
7122 }
7123 }
7124 TE->setOperand(OpIdx: 0, OpVL: Left);
7125 TE->setOperand(OpIdx: 1, OpVL: Right);
7126 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7127 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
7128 return;
7129 }
7130
7131 TE->setOperandsInOrder();
7132 for (unsigned I : seq<unsigned>(Begin: 0, End: VL0->getNumOperands())) {
7133 ValueList Operands;
7134 // Prepare the operand vector.
7135 for (Value *V : VL)
7136 Operands.push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
7137
7138 buildTree_rec(VL: Operands, Depth: Depth + 1, UserTreeIdx: {TE, I});
7139 }
7140 return;
7141 }
7142 default:
7143 break;
7144 }
7145 llvm_unreachable("Unexpected vectorization of the instructions.");
7146}
7147
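// Illustrative behavior (hypothetical types): a homogeneous aggregate such as
// [4 x float] or { float, float, float, float } maps to N = 4, provided the
// implied <4 x float> fits into [MinVecRegSize, MaxVecRegSize] and matches the
// store size of the original type; a mixed aggregate such as { i32, float }
// returns 0.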
7148unsigned BoUpSLP::canMapToVector(Type *T) const {
7149 unsigned N = 1;
7150 Type *EltTy = T;
7151
7152 while (isa<StructType, ArrayType, FixedVectorType>(Val: EltTy)) {
7153 if (auto *ST = dyn_cast<StructType>(Val: EltTy)) {
7154 // Check that struct is homogeneous.
7155 for (const auto *Ty : ST->elements())
7156 if (Ty != *ST->element_begin())
7157 return 0;
7158 N *= ST->getNumElements();
7159 EltTy = *ST->element_begin();
7160 } else if (auto *AT = dyn_cast<ArrayType>(Val: EltTy)) {
7161 N *= AT->getNumElements();
7162 EltTy = AT->getElementType();
7163 } else {
7164 auto *VT = cast<FixedVectorType>(Val: EltTy);
7165 N *= VT->getNumElements();
7166 EltTy = VT->getElementType();
7167 }
7168 }
7169
7170 if (!isValidElementType(Ty: EltTy))
7171 return 0;
7172 uint64_t VTSize = DL->getTypeStoreSizeInBits(Ty: FixedVectorType::get(ElementType: EltTy, NumElts: N));
7173 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7174 VTSize != DL->getTypeStoreSizeInBits(Ty: T))
7175 return 0;
7176 return N;
7177}
7178
7179bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7180 SmallVectorImpl<unsigned> &CurrentOrder,
7181 bool ResizeAllowed) const {
7182 const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst, ExtractValueInst>);
7183 assert(It != VL.end() && "Expected at least one extract instruction.");
7184 auto *E0 = cast<Instruction>(Val: *It);
7185 assert(
7186 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7187 "Invalid opcode");
7188 // Check if all of the extracts come from the same vector and from the
7189 // correct offset.
7190 Value *Vec = E0->getOperand(i: 0);
7191
7192 CurrentOrder.clear();
7193
7194 // We have to extract from a vector/aggregate with the same number of elements.
7195 unsigned NElts;
7196 if (E0->getOpcode() == Instruction::ExtractValue) {
7197 NElts = canMapToVector(T: Vec->getType());
7198 if (!NElts)
7199 return false;
7200    // Check if the load can be rewritten as a load of a vector.
7201 LoadInst *LI = dyn_cast<LoadInst>(Val: Vec);
7202 if (!LI || !LI->isSimple() || !LI->hasNUses(N: VL.size()))
7203 return false;
7204 } else {
7205 NElts = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
7206 }
7207
7208 unsigned E = VL.size();
7209 if (!ResizeAllowed && NElts != E)
7210 return false;
7211 SmallVector<int> Indices(E, PoisonMaskElem);
7212 unsigned MinIdx = NElts, MaxIdx = 0;
7213 for (auto [I, V] : enumerate(First&: VL)) {
7214 auto *Inst = dyn_cast<Instruction>(Val: V);
7215 if (!Inst)
7216 continue;
7217 if (Inst->getOperand(i: 0) != Vec)
7218 return false;
7219 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst))
7220 if (isa<UndefValue>(Val: EE->getIndexOperand()))
7221 continue;
7222 std::optional<unsigned> Idx = getExtractIndex(E: Inst);
7223 if (!Idx)
7224 return false;
7225 const unsigned ExtIdx = *Idx;
7226 if (ExtIdx >= NElts)
7227 continue;
7228 Indices[I] = ExtIdx;
7229 if (MinIdx > ExtIdx)
7230 MinIdx = ExtIdx;
7231 if (MaxIdx < ExtIdx)
7232 MaxIdx = ExtIdx;
7233 }
7234 if (MaxIdx - MinIdx + 1 > E)
7235 return false;
7236 if (MaxIdx + 1 <= E)
7237 MinIdx = 0;
7238
7239 // Check that all of the indices extract from the correct offset.
7240 bool ShouldKeepOrder = true;
7241  // Assign to all items the initial value E so we can check if the extract
7242  // instruction index was used already.
7243  // Also, later we can check that all the indices are used and we have a
7244  // consecutive access in the extract instructions, by checking that no
7245  // element of CurrentOrder still has value E.
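  // Illustrative example: for extract indices <1, 0, 3, 2> (after rebasing by
  // MinIdx) CurrentOrder becomes {1, 0, 3, 2} and the function returns false;
  // for the identity order <0, 1, 2, 3> CurrentOrder is cleared and the
  // function returns true.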
7246 CurrentOrder.assign(NumElts: E, Elt: E);
7247 for (unsigned I = 0; I < E; ++I) {
7248 if (Indices[I] == PoisonMaskElem)
7249 continue;
7250 const unsigned ExtIdx = Indices[I] - MinIdx;
7251 if (CurrentOrder[ExtIdx] != E) {
7252 CurrentOrder.clear();
7253 return false;
7254 }
7255 ShouldKeepOrder &= ExtIdx == I;
7256 CurrentOrder[ExtIdx] = I;
7257 }
7258 if (ShouldKeepOrder)
7259 CurrentOrder.clear();
7260
7261 return ShouldKeepOrder;
7262}
7263
7264bool BoUpSLP::areAllUsersVectorized(
7265 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7266 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(V: I))) ||
7267 all_of(Range: I->users(), P: [this](User *U) {
7268 return ScalarToTreeEntry.contains(Val: U) ||
7269 isVectorLikeInstWithConstOps(V: U) ||
7270 (isa<ExtractElementInst>(Val: U) && MustGather.contains(Ptr: U));
7271 });
7272}
7273
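// Illustrative comparison (assuming the target provides such a mapping): for a
// bundle of calls to llvm.fabs.f32 widened to <4 x float>, the first cost is
// that of the llvm.fabs.v4f32 intrinsic and the second is that of a vector
// library call, if VFDatabase knows one; otherwise both returned costs are
// equal.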
7274static std::pair<InstructionCost, InstructionCost>
7275getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7276 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7277 ArrayRef<Type *> ArgTys) {
7278 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7279
7280 // Calculate the cost of the scalar and vector calls.
7281 FastMathFlags FMF;
7282 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: CI))
7283 FMF = FPCI->getFastMathFlags();
7284 SmallVector<const Value *> Arguments(CI->args());
7285 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7286 dyn_cast<IntrinsicInst>(Val: CI));
7287 auto IntrinsicCost =
7288 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind: TTI::TCK_RecipThroughput);
7289
7290 auto Shape = VFShape::get(FTy: CI->getFunctionType(),
7291 EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()),
7292 HasGlobalPred: false /*HasGlobalPred*/);
7293 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7294 auto LibCost = IntrinsicCost;
7295 if (!CI->isNoBuiltin() && VecFunc) {
7296 // Calculate the cost of the vector library call.
7297 // If the corresponding vector call is cheaper, return its cost.
7298 LibCost =
7299 TTI->getCallInstrCost(F: nullptr, RetTy: VecTy, Tys: ArgTys, CostKind: TTI::TCK_RecipThroughput);
7300 }
7301 return {IntrinsicCost, LibCost};
7302}
7303
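// Illustrative example: for Scalars = {add, sub, add, sub} with IsAltOp
// matching the subs (and no reordering/reuse indices), the resulting Mask is
// <0, Sz + 1, 2, Sz + 3>, i.e. even lanes come from the "main" vector and odd
// lanes from the "alternate" vector.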
7304void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7305 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7306 SmallVectorImpl<Value *> *OpScalars,
7307 SmallVectorImpl<Value *> *AltScalars) const {
7308 unsigned Sz = Scalars.size();
7309 Mask.assign(NumElts: Sz, Elt: PoisonMaskElem);
7310 SmallVector<int> OrderMask;
7311 if (!ReorderIndices.empty())
7312 inversePermutation(Indices: ReorderIndices, Mask&: OrderMask);
7313 for (unsigned I = 0; I < Sz; ++I) {
7314 unsigned Idx = I;
7315 if (!ReorderIndices.empty())
7316 Idx = OrderMask[I];
7317 auto *OpInst = cast<Instruction>(Val: Scalars[Idx]);
7318 if (IsAltOp(OpInst)) {
7319 Mask[I] = Sz + Idx;
7320 if (AltScalars)
7321 AltScalars->push_back(Elt: OpInst);
7322 } else {
7323 Mask[I] = Idx;
7324 if (OpScalars)
7325 OpScalars->push_back(Elt: OpInst);
7326 }
7327 }
7328 if (!ReuseShuffleIndices.empty()) {
7329 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7330 transform(Range: ReuseShuffleIndices, d_first: NewMask.begin(), F: [&Mask](int Idx) {
7331 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7332 });
7333 Mask.swap(RHS&: NewMask);
7334 }
7335}
7336
7337static bool isAlternateInstruction(const Instruction *I,
7338 const Instruction *MainOp,
7339 const Instruction *AltOp,
7340 const TargetLibraryInfo &TLI) {
7341 if (auto *MainCI = dyn_cast<CmpInst>(Val: MainOp)) {
7342 auto *AltCI = cast<CmpInst>(Val: AltOp);
7343 CmpInst::Predicate MainP = MainCI->getPredicate();
7344 CmpInst::Predicate AltP = AltCI->getPredicate();
7345 assert(MainP != AltP && "Expected different main/alternate predicates.");
7346 auto *CI = cast<CmpInst>(Val: I);
7347 if (isCmpSameOrSwapped(BaseCI: MainCI, CI, TLI))
7348 return false;
7349 if (isCmpSameOrSwapped(BaseCI: AltCI, CI, TLI))
7350 return true;
7351 CmpInst::Predicate P = CI->getPredicate();
7352 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(pred: P);
7353
7354 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7355 "CmpInst expected to match either main or alternate predicate or "
7356 "their swap.");
7357 (void)AltP;
7358 return MainP != P && MainP != SwappedP;
7359 }
7360 return I->getOpcode() == AltOp->getOpcode();
7361}
7362
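// Illustrative examples: Ops = {4, 4, 4, 4} yields {OK_UniformConstantValue,
// OP_PowerOf2}; Ops = {1, 2, 3, 4} yields {OK_NonUniformConstantValue,
// OP_None}; Ops that are all the same non-constant value yield
// {OK_UniformValue, OP_None}.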
7363TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7364 assert(!Ops.empty());
7365 const auto *Op0 = Ops.front();
7366
7367 const bool IsConstant = all_of(Range&: Ops, P: [](Value *V) {
7368 // TODO: We should allow undef elements here
7369 return isConstant(V) && !isa<UndefValue>(Val: V);
7370 });
7371 const bool IsUniform = all_of(Range&: Ops, P: [=](Value *V) {
7372 // TODO: We should allow undef elements here
7373 return V == Op0;
7374 });
7375 const bool IsPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
7376 // TODO: We should allow undef elements here
7377 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
7378 return CI->getValue().isPowerOf2();
7379 return false;
7380 });
7381 const bool IsNegatedPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
7382 // TODO: We should allow undef elements here
7383 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
7384 return CI->getValue().isNegatedPowerOf2();
7385 return false;
7386 });
7387
7388 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7389 if (IsConstant && IsUniform)
7390 VK = TTI::OK_UniformConstantValue;
7391 else if (IsConstant)
7392 VK = TTI::OK_NonUniformConstantValue;
7393 else if (IsUniform)
7394 VK = TTI::OK_UniformValue;
7395
7396 TTI::OperandValueProperties VP = TTI::OP_None;
7397 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7398 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7399
7400 return {.Kind: VK, .Properties: VP};
7401}
7402
7403namespace {
7404/// The base class for shuffle instruction emission and shuffle cost estimation.
7405class BaseShuffleAnalysis {
7406protected:
7407 /// Checks if the mask is an identity mask.
7408  /// \param IsStrict if true, the function returns false if the mask size
7409  /// does not match the vector size.
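  /// For example (illustrative): with a <4 x ty> vector, the mask <0, 1, 2, 3>
  /// is an identity mask for any \p IsStrict; the shorter mask <0, 1> is
  /// rejected when \p IsStrict is true but accepted when it is false, since it
  /// is an extract-subvector mask starting at index 0.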
7410 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7411 bool IsStrict) {
7412 int Limit = Mask.size();
7413 int VF = VecTy->getNumElements();
7414 int Index = -1;
7415 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Limit))
7416 return true;
7417 if (!IsStrict) {
7418 // Consider extract subvector starting from index 0.
7419 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
7420 Index == 0)
7421 return true;
7422 // All VF-size submasks are identity (e.g.
7423 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7424 if (Limit % VF == 0 && all_of(Range: seq<int>(Begin: 0, End: Limit / VF), P: [=](int Idx) {
7425 ArrayRef<int> Slice = Mask.slice(N: Idx * VF, M: VF);
7426 return all_of(Range&: Slice, P: [](int I) { return I == PoisonMaskElem; }) ||
7427 ShuffleVectorInst::isIdentityMask(Mask: Slice, NumSrcElts: VF);
7428 }))
7429 return true;
7430 }
7431 return false;
7432 }
7433
7434  /// Tries to combine 2 different masks into a single one.
7435 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7436 /// change the size of the vector, \p LocalVF is the original size of the
7437 /// shuffled vector.
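  /// For example (illustrative): with \p LocalVF == 4, \p Mask == <1, 0, 3, 2>
  /// and \p ExtMask == <2, 3, 0, 1>, the combined mask written back into
  /// \p Mask is <3, 2, 1, 0>, i.e. applying \p Mask and then \p ExtMask is
  /// folded into a single permutation.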
7438 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7439 ArrayRef<int> ExtMask) {
7440 unsigned VF = Mask.size();
7441 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7442 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7443 if (ExtMask[I] == PoisonMaskElem)
7444 continue;
7445 int MaskedIdx = Mask[ExtMask[I] % VF];
7446 NewMask[I] =
7447 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7448 }
7449 Mask.swap(RHS&: NewMask);
7450 }
7451
7452  /// Looks through shuffles trying to reduce the final number of shuffles in
7453  /// the code. The function looks through the previously emitted shuffle
7454  /// instructions and properly marks indices in the mask as undef.
7455 /// For example, given the code
7456 /// \code
7457 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7458 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7459 /// \endcode
7460  /// and, if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>,
7461  /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
7462  /// <0, 1, 2, 3> for the shuffle.
7463  /// If the 2 operands are of different sizes, the smaller one will be resized
7464  /// and the mask recalculated properly.
7465 /// For example, given the code
7466 /// \code
7467 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7468 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7469 /// \endcode
7470  /// and, if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>,
7471  /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
7472 /// <0, 1, 2, 3> for the shuffle.
7473 /// So, it tries to transform permutations to simple vector merge, if
7474 /// possible.
7475 /// \param V The input vector which must be shuffled using the given \p Mask.
7476 /// If the better candidate is found, \p V is set to this best candidate
7477 /// vector.
7478 /// \param Mask The input mask for the shuffle. If the best candidate is found
7479 /// during looking-through-shuffles attempt, it is updated accordingly.
7480 /// \param SinglePermute true if the shuffle operation is originally a
7481 /// single-value-permutation. In this case the look-through-shuffles procedure
7482 /// may look for resizing shuffles as the best candidates.
7483 /// \return true if the shuffle results in the non-resizing identity shuffle
7484 /// (and thus can be ignored), false - otherwise.
7485 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7486 bool SinglePermute) {
7487 Value *Op = V;
7488 ShuffleVectorInst *IdentityOp = nullptr;
7489 SmallVector<int> IdentityMask;
7490 while (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Op)) {
7491      // Exit if this is not a fixed vector type or a size-changing shuffle.
7492 auto *SVTy = dyn_cast<FixedVectorType>(Val: SV->getType());
7493 if (!SVTy)
7494 break;
7495 // Remember the identity or broadcast mask, if it is not a resizing
7496 // shuffle. If no better candidates are found, this Op and Mask will be
7497 // used in the final shuffle.
7498 if (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/false)) {
7499 if (!IdentityOp || !SinglePermute ||
7500 (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/true) &&
7501 !ShuffleVectorInst::isZeroEltSplatMask(Mask: IdentityMask,
7502 NumSrcElts: IdentityMask.size()))) {
7503 IdentityOp = SV;
7504          // Store the current mask in IdentityMask so that we do not lose
7505          // this info if IdentityOp is selected as the best candidate for
7506          // the permutation.
7507 IdentityMask.assign(RHS: Mask);
7508 }
7509 }
7510 // Remember the broadcast mask. If no better candidates are found, this Op
7511 // and Mask will be used in the final shuffle.
7512      // A zero splat can be used as identity too, since it might be used with
7513      // mask <0, 1, 2, ...>, i.e. an identity mask without extra reshuffling.
7514      // E.g., if we need to shuffle the vector with the mask <3, 1, 2, 0>,
7515      // which is expensive, and the analysis finds out that the source vector
7516      // is just a broadcast, the original mask can be transformed to the
7517      // identity mask <0, 1, 2, 3>.
7518 // \code
7519 // %0 = shuffle %v, poison, zeroinitalizer
7520 // %res = shuffle %0, poison, <3, 1, 2, 0>
7521 // \endcode
7522 // may be transformed to
7523 // \code
7524 // %0 = shuffle %v, poison, zeroinitalizer
7525 // %res = shuffle %0, poison, <0, 1, 2, 3>
7526 // \endcode
7527 if (SV->isZeroEltSplat()) {
7528 IdentityOp = SV;
7529 IdentityMask.assign(RHS: Mask);
7530 }
7531 int LocalVF = Mask.size();
7532 if (auto *SVOpTy =
7533 dyn_cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType()))
7534 LocalVF = SVOpTy->getNumElements();
7535 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7536 for (auto [Idx, I] : enumerate(First&: Mask)) {
7537 if (I == PoisonMaskElem ||
7538 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7539 continue;
7540 ExtMask[Idx] = SV->getMaskValue(Elt: I);
7541 }
7542 bool IsOp1Undef =
7543 isUndefVector(V: SV->getOperand(i_nocapture: 0),
7544 UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::FirstArg))
7545 .all();
7546 bool IsOp2Undef =
7547 isUndefVector(V: SV->getOperand(i_nocapture: 1),
7548 UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::SecondArg))
7549 .all();
7550 if (!IsOp1Undef && !IsOp2Undef) {
7551 // Update mask and mark undef elems.
7552 for (int &I : Mask) {
7553 if (I == PoisonMaskElem)
7554 continue;
7555 if (SV->getMaskValue(Elt: I % SV->getShuffleMask().size()) ==
7556 PoisonMaskElem)
7557 I = PoisonMaskElem;
7558 }
7559 break;
7560 }
7561 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7562 SV->getShuffleMask().end());
7563 combineMasks(LocalVF, Mask&: ShuffleMask, ExtMask: Mask);
7564 Mask.swap(RHS&: ShuffleMask);
7565 if (IsOp2Undef)
7566 Op = SV->getOperand(i_nocapture: 0);
7567 else
7568 Op = SV->getOperand(i_nocapture: 1);
7569 }
7570 if (auto *OpTy = dyn_cast<FixedVectorType>(Val: Op->getType());
7571 !OpTy || !isIdentityMask(Mask, VecTy: OpTy, IsStrict: SinglePermute) ||
7572 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())) {
7573 if (IdentityOp) {
7574 V = IdentityOp;
7575 assert(Mask.size() == IdentityMask.size() &&
7576 "Expected masks of same sizes.");
7577 // Clear known poison elements.
7578 for (auto [I, Idx] : enumerate(First&: Mask))
7579 if (Idx == PoisonMaskElem)
7580 IdentityMask[I] = PoisonMaskElem;
7581 Mask.swap(RHS&: IdentityMask);
7582 auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: V);
7583 return SinglePermute &&
7584 (isIdentityMask(Mask, VecTy: cast<FixedVectorType>(Val: V->getType()),
7585 /*IsStrict=*/true) ||
7586 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7587 Shuffle->isZeroEltSplat() &&
7588 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())));
7589 }
7590 V = Op;
7591 return false;
7592 }
7593 V = Op;
7594 return true;
7595 }
7596
7597  /// Smart shuffle instruction emission: walks through shuffle trees and
7598 /// tries to find the best matching vector for the actual shuffle
7599 /// instruction.
7600 template <typename T, typename ShuffleBuilderTy>
7601 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7602 ShuffleBuilderTy &Builder) {
7603 assert(V1 && "Expected at least one vector value.");
7604 if (V2)
7605 Builder.resizeToMatch(V1, V2);
7606 int VF = Mask.size();
7607 if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType()))
7608 VF = FTy->getNumElements();
7609 if (V2 &&
7610 !isUndefVector(V: V2, UseMask: buildUseMask(VF, Mask, MaskArg: UseMask::SecondArg)).all()) {
7611 // Peek through shuffles.
7612 Value *Op1 = V1;
7613 Value *Op2 = V2;
7614 int VF =
7615 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
7616 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7617 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7618 for (int I = 0, E = Mask.size(); I < E; ++I) {
7619 if (Mask[I] < VF)
7620 CombinedMask1[I] = Mask[I];
7621 else
7622 CombinedMask2[I] = Mask[I] - VF;
7623 }
7624 Value *PrevOp1;
7625 Value *PrevOp2;
7626 do {
7627 PrevOp1 = Op1;
7628 PrevOp2 = Op2;
7629 (void)peekThroughShuffles(V&: Op1, Mask&: CombinedMask1, /*SinglePermute=*/false);
7630 (void)peekThroughShuffles(V&: Op2, Mask&: CombinedMask2, /*SinglePermute=*/false);
7631 // Check if we have 2 resizing shuffles - need to peek through operands
7632 // again.
7633 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Val: Op1))
7634 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Val: Op2)) {
7635 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7636 for (auto [Idx, I] : enumerate(First&: CombinedMask1)) {
7637 if (I == PoisonMaskElem)
7638 continue;
7639 ExtMask1[Idx] = SV1->getMaskValue(Elt: I);
7640 }
7641 SmallBitVector UseMask1 = buildUseMask(
7642 VF: cast<FixedVectorType>(Val: SV1->getOperand(i_nocapture: 1)->getType())
7643 ->getNumElements(),
7644 Mask: ExtMask1, MaskArg: UseMask::SecondArg);
7645 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7646 for (auto [Idx, I] : enumerate(First&: CombinedMask2)) {
7647 if (I == PoisonMaskElem)
7648 continue;
7649 ExtMask2[Idx] = SV2->getMaskValue(Elt: I);
7650 }
7651 SmallBitVector UseMask2 = buildUseMask(
7652 VF: cast<FixedVectorType>(Val: SV2->getOperand(i_nocapture: 1)->getType())
7653 ->getNumElements(),
7654 Mask: ExtMask2, MaskArg: UseMask::SecondArg);
7655 if (SV1->getOperand(i_nocapture: 0)->getType() ==
7656 SV2->getOperand(i_nocapture: 0)->getType() &&
7657 SV1->getOperand(i_nocapture: 0)->getType() != SV1->getType() &&
7658 isUndefVector(V: SV1->getOperand(i_nocapture: 1), UseMask: UseMask1).all() &&
7659 isUndefVector(V: SV2->getOperand(i_nocapture: 1), UseMask: UseMask2).all()) {
7660 Op1 = SV1->getOperand(i_nocapture: 0);
7661 Op2 = SV2->getOperand(i_nocapture: 0);
7662 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7663 SV1->getShuffleMask().end());
7664 int LocalVF = ShuffleMask1.size();
7665 if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op1->getType()))
7666 LocalVF = FTy->getNumElements();
7667 combineMasks(LocalVF, Mask&: ShuffleMask1, ExtMask: CombinedMask1);
7668 CombinedMask1.swap(RHS&: ShuffleMask1);
7669 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7670 SV2->getShuffleMask().end());
7671 LocalVF = ShuffleMask2.size();
7672 if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op2->getType()))
7673 LocalVF = FTy->getNumElements();
7674 combineMasks(LocalVF, Mask&: ShuffleMask2, ExtMask: CombinedMask2);
7675 CombinedMask2.swap(RHS&: ShuffleMask2);
7676 }
7677 }
7678 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
7679 Builder.resizeToMatch(Op1, Op2);
7680 VF = std::max(a: cast<VectorType>(Val: Op1->getType())
7681 ->getElementCount()
7682 .getKnownMinValue(),
7683 b: cast<VectorType>(Val: Op2->getType())
7684 ->getElementCount()
7685 .getKnownMinValue());
7686 for (int I = 0, E = Mask.size(); I < E; ++I) {
7687 if (CombinedMask2[I] != PoisonMaskElem) {
7688 assert(CombinedMask1[I] == PoisonMaskElem &&
7689 "Expected undefined mask element");
7690 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
7691 }
7692 }
7693 if (Op1 == Op2 &&
7694 (ShuffleVectorInst::isIdentityMask(Mask: CombinedMask1, NumSrcElts: VF) ||
7695 (ShuffleVectorInst::isZeroEltSplatMask(Mask: CombinedMask1, NumSrcElts: VF) &&
7696 isa<ShuffleVectorInst>(Val: Op1) &&
7697 cast<ShuffleVectorInst>(Val: Op1)->getShuffleMask() ==
7698 ArrayRef(CombinedMask1))))
7699 return Builder.createIdentity(Op1);
7700 return Builder.createShuffleVector(
7701 Op1, Op1 == Op2 ? PoisonValue::get(T: Op1->getType()) : Op2,
7702 CombinedMask1);
7703 }
7704 if (isa<PoisonValue>(Val: V1))
7705 return Builder.createPoison(
7706 cast<VectorType>(Val: V1->getType())->getElementType(), Mask.size());
7707 SmallVector<int> NewMask(Mask.begin(), Mask.end());
7708 bool IsIdentity = peekThroughShuffles(V&: V1, Mask&: NewMask, /*SinglePermute=*/true);
7709 assert(V1 && "Expected non-null value after looking through shuffles.");
7710
7711 if (!IsIdentity)
7712 return Builder.createShuffleVector(V1, NewMask);
7713 return Builder.createIdentity(V1);
7714 }
7715};
7716} // namespace
7717
7718/// Returns the cost of the shuffle instructions with the given \p Kind, vector
7719/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
7720/// insert subvector pattern.
7721static InstructionCost
7722getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
7723 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
7724 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
7725 int Index = 0, VectorType *SubTp = nullptr,
7726 ArrayRef<const Value *> Args = std::nullopt) {
7727 if (Kind != TTI::SK_PermuteTwoSrc)
7728 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7729 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7730 int NumSubElts;
7731 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
7732 Mask, NumSrcElts, NumSubElts, Index)) {
7733 if (Index + NumSubElts > NumSrcElts &&
7734 Index + NumSrcElts <= static_cast<int>(Mask.size()))
7735 return TTI.getShuffleCost(
7736 Kind: TTI::SK_InsertSubvector,
7737 Tp: FixedVectorType::get(ElementType: Tp->getElementType(), NumElts: Mask.size()), Mask,
7738 CostKind: TTI::TCK_RecipThroughput, Index, SubTp: Tp);
7739 }
7740 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7741}
7742
7743/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
7744static std::pair<InstructionCost, InstructionCost>
7745getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
7746 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
7747 Type *ScalarTy, VectorType *VecTy) {
7748 InstructionCost ScalarCost = 0;
7749 InstructionCost VecCost = 0;
7750 // Here we differentiate two cases: (1) when Ptrs represent a regular
7751 // vectorization tree node (as they are pointer arguments of scattered
7752 // loads) or (2) when Ptrs are the arguments of loads or stores being
7753  // vectorized as a plain wide unit-stride load/store since all the
7754 // loads/stores are known to be from/to adjacent locations.
7755 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
7756 // Case 2: estimate costs for pointer related costs when vectorizing to
7757 // a wide load/store.
7758 // Scalar cost is estimated as a set of pointers with known relationship
7759 // between them.
7760 // For vector code we will use BasePtr as argument for the wide load/store
7761 // but we also need to account all the instructions which are going to
7762 // stay in vectorized code due to uses outside of these scalar
7763 // loads/stores.
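    // Illustrative example (hypothetical IR): for four consecutive loads of
    // p[0..3] folded into one wide load, the scalar side is costed as a
    // unit-stride pointer chain, while the vector side only pays for the
    // pointers that must stay around (e.g. GEPs with uses outside the
    // vectorized loads); if every pointer survives, both costs are TCC_Free.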
7764 ScalarCost = TTI.getPointersChainCost(
7765 Ptrs, Base: BasePtr, Info: TTI::PointersChainInfo::getUnitStride(), AccessTy: ScalarTy,
7766 CostKind);
7767
7768 SmallVector<const Value *> PtrsRetainedInVecCode;
7769 for (Value *V : Ptrs) {
7770 if (V == BasePtr) {
7771 PtrsRetainedInVecCode.push_back(Elt: V);
7772 continue;
7773 }
7774 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
7775      // For simplicity, assume Ptr stays in vectorized code if it's not a
7776      // GEP instruction. We don't care since its cost is considered free.
7777 // TODO: We should check for any uses outside of vectorizable tree
7778 // rather than just single use.
7779 if (!Ptr || !Ptr->hasOneUse())
7780 PtrsRetainedInVecCode.push_back(Elt: V);
7781 }
7782
7783 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
7784 // If all pointers stay in vectorized code then we don't have
7785 // any savings on that.
7786 return std::make_pair(x: TTI::TCC_Free, y: TTI::TCC_Free);
7787 }
7788 VecCost = TTI.getPointersChainCost(Ptrs: PtrsRetainedInVecCode, Base: BasePtr,
7789 Info: TTI::PointersChainInfo::getKnownStride(),
7790 AccessTy: VecTy, CostKind);
7791 } else {
7792 // Case 1: Ptrs are the arguments of loads that we are going to transform
7793 // into masked gather load intrinsic.
7794 // All the scalar GEPs will be removed as a result of vectorization.
7795 // For any external uses of some lanes extract element instructions will
7796 // be generated (which cost is estimated separately).
7797 TTI::PointersChainInfo PtrsInfo =
7798 all_of(Range&: Ptrs,
7799 P: [](const Value *V) {
7800 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
7801 return Ptr && !Ptr->hasAllConstantIndices();
7802 })
7803 ? TTI::PointersChainInfo::getUnknownStride()
7804 : TTI::PointersChainInfo::getKnownStride();
7805
7806 ScalarCost =
7807 TTI.getPointersChainCost(Ptrs, Base: BasePtr, Info: PtrsInfo, AccessTy: ScalarTy, CostKind);
7808 if (auto *BaseGEP = dyn_cast<GEPOperator>(Val: BasePtr)) {
7809 SmallVector<const Value *> Indices(BaseGEP->indices());
7810 VecCost = TTI.getGEPCost(PointeeType: BaseGEP->getSourceElementType(),
7811 Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: VecTy,
7812 CostKind);
7813 }
7814 }
7815
7816 return std::make_pair(x&: ScalarCost, y&: VecCost);
7817}
7818
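// A minimal sketch of the current transformation: a load node whose reorder
// indices describe a reversed consecutive access, e.g. loads of a[3], a[2],
// a[1], a[0], may be re-marked as StridedVectorize (stride -1) when TTI reports
// the strided load as cheaper than a wide load plus a reverse shuffle.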
7819void BoUpSLP::transformNodes() {
7820 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7821 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7822 TreeEntry &E = *TE.get();
7823 switch (E.getOpcode()) {
7824 case Instruction::Load: {
7825 Type *ScalarTy = E.getMainOp()->getType();
7826 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: E.Scalars.size());
7827 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E.Scalars);
7828 // Check if profitable to represent consecutive load + reverse as strided
7829 // load with stride -1.
7830 if (isReverseOrder(Order: E.ReorderIndices) &&
7831 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
7832 SmallVector<int> Mask;
7833 inversePermutation(Indices: E.ReorderIndices, Mask);
7834 auto *BaseLI = cast<LoadInst>(Val: E.Scalars.back());
7835 InstructionCost OriginalVecCost =
7836 TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: BaseLI->getAlign(),
7837 AddressSpace: BaseLI->getPointerAddressSpace(), CostKind,
7838 OpdInfo: TTI::OperandValueInfo()) +
7839 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
7840 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
7841 Opcode: Instruction::Load, DataTy: VecTy, Ptr: BaseLI->getPointerOperand(),
7842 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: BaseLI);
7843 if (StridedCost < OriginalVecCost)
7844 // Strided load is more profitable than consecutive load + reverse -
7845 // transform the node to strided load.
7846 E.State = TreeEntry::StridedVectorize;
7847 }
7848 break;
7849 }
7850 default:
7851 break;
7852 }
7853 }
7854}
7855
7856/// Merges shuffle masks and emits the final shuffle instruction, if required.
7857/// It supports shuffling of 2 input vectors. It implements lazy shuffle
7858/// emission: the actual shuffle instruction is generated only if it is
7859/// actually required. Otherwise, the shuffle instruction emission is delayed
7860/// till the end of the process, to reduce the number of emitted instructions
7861/// and to simplify further analysis/transformations.
7862class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
7863 bool IsFinalized = false;
7864 SmallVector<int> CommonMask;
7865 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
7866 const TargetTransformInfo &TTI;
7867 InstructionCost Cost = 0;
7868 SmallDenseSet<Value *> VectorizedVals;
7869 BoUpSLP &R;
7870 SmallPtrSetImpl<Value *> &CheckedExtracts;
7871 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7872  /// While set, we are still trying to estimate the cost for the same nodes and
7873  /// can delay the actual cost estimation (virtual shuffle instruction emission).
7874  /// This may help better estimate the cost if the same nodes must be permuted,
7875  /// and allows moving most of the long-shuffle cost estimation to TTI.
7876 bool SameNodesEstimated = true;
7877
7878 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
7879 if (Ty->getScalarType()->isPointerTy()) {
7880 Constant *Res = ConstantExpr::getIntToPtr(
7881 C: ConstantInt::getAllOnesValue(
7882 Ty: IntegerType::get(C&: Ty->getContext(),
7883 NumBits: DL.getTypeStoreSizeInBits(Ty: Ty->getScalarType()))),
7884 Ty: Ty->getScalarType());
7885 if (auto *VTy = dyn_cast<VectorType>(Val: Ty))
7886 Res = ConstantVector::getSplat(EC: VTy->getElementCount(), Elt: Res);
7887 return Res;
7888 }
7889 return Constant::getAllOnesValue(Ty);
7890 }
7891
7892 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
7893 if ((!Root && allConstant(VL)) || all_of(Range&: VL, P: IsaPred<UndefValue>))
7894 return TTI::TCC_Free;
7895 auto *VecTy = FixedVectorType::get(ElementType: VL.front()->getType(), NumElts: VL.size());
7896 InstructionCost GatherCost = 0;
7897 SmallVector<Value *> Gathers(VL.begin(), VL.end());
7898 // Improve gather cost for gather of loads, if we can group some of the
7899 // loads into vector loads.
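    // Illustrative example (hypothetical input): for a gather of 8 loads where
    // the first 4 are consecutive, that slice may be costed as one vector load
    // (plus its GEP cost delta and, if needed, a subvector insert), while the
    // remaining scalars keep the per-element gather cost; the final result is
    // capped by the plain gather cost (BaseCost) below.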
7900 InstructionsState S = getSameOpcode(VL, TLI: *R.TLI);
7901 const unsigned Sz = R.DL->getTypeSizeInBits(Ty: VL.front()->getType());
7902 unsigned MinVF = R.getMinVF(Sz: 2 * Sz);
7903 if (VL.size() > 2 &&
7904 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
7905 (InVectors.empty() &&
7906 any_of(Range: seq<unsigned>(Begin: 0, End: VL.size() / MinVF),
7907 P: [&](unsigned Idx) {
7908 ArrayRef<Value *> SubVL = VL.slice(N: Idx * MinVF, M: MinVF);
7909 InstructionsState S = getSameOpcode(VL: SubVL, TLI: *R.TLI);
7910 return S.getOpcode() == Instruction::Load &&
7911 !S.isAltShuffle();
7912 }))) &&
7913 !all_of(Range&: Gathers, P: [&](Value *V) { return R.getTreeEntry(V); }) &&
7914 !isSplat(VL: Gathers)) {
7915 InstructionCost BaseCost = R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root);
7916 SetVector<Value *> VectorizedLoads;
7917 SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
7918 SmallVector<unsigned> ScatterVectorized;
7919 unsigned StartIdx = 0;
7920 unsigned VF = VL.size() / 2;
7921 for (; VF >= MinVF; VF /= 2) {
7922 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
7923 Cnt += VF) {
7924 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
7925 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
7926 InstructionsState SliceS = getSameOpcode(VL: Slice, TLI: *R.TLI);
7927 if (SliceS.getOpcode() != Instruction::Load ||
7928 SliceS.isAltShuffle())
7929 continue;
7930 }
7931 if (!VectorizedLoads.count(key: Slice.front()) &&
7932 !VectorizedLoads.count(key: Slice.back()) && allSameBlock(VL: Slice)) {
7933 SmallVector<Value *> PointerOps;
7934 OrdersType CurrentOrder;
7935 LoadsState LS = R.canVectorizeLoads(VL: Slice, VL0: Slice.front(),
7936 Order&: CurrentOrder, PointerOps);
7937 switch (LS) {
7938 case LoadsState::Vectorize:
7939 case LoadsState::ScatterVectorize:
7940 case LoadsState::StridedVectorize:
7941 // Mark the vectorized loads so that we don't vectorize them
7942 // again.
7943 // TODO: better handling of loads with reorders.
7944 if (((LS == LoadsState::Vectorize ||
7945 LS == LoadsState::StridedVectorize) &&
7946 CurrentOrder.empty()) ||
7947 (LS == LoadsState::StridedVectorize &&
7948 isReverseOrder(Order: CurrentOrder)))
7949 VectorizedStarts.emplace_back(Args&: Cnt, Args&: LS);
7950 else
7951 ScatterVectorized.push_back(Elt: Cnt);
7952 VectorizedLoads.insert(Start: Slice.begin(), End: Slice.end());
7953 // If we vectorized initial block, no need to try to vectorize
7954 // it again.
7955 if (Cnt == StartIdx)
7956 StartIdx += VF;
7957 break;
7958 case LoadsState::Gather:
7959 break;
7960 }
7961 }
7962 }
7963 // Check if the whole array was vectorized already - exit.
7964 if (StartIdx >= VL.size())
7965 break;
7966 // Found vectorizable parts - exit.
7967 if (!VectorizedLoads.empty())
7968 break;
7969 }
7970 if (!VectorizedLoads.empty()) {
7971 unsigned NumParts = TTI.getNumberOfParts(Tp: VecTy);
7972 bool NeedInsertSubvectorAnalysis =
7973 !NumParts || (VL.size() / VF) > NumParts;
7974 // Get the cost for gathered loads.
7975 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
7976 if (VectorizedLoads.contains(key: VL[I]))
7977 continue;
7978 GatherCost +=
7979 getBuildVectorCost(VL: VL.slice(N: I, M: std::min(a: End - I, b: VF)), Root);
7980 }
7981 // Exclude potentially vectorized loads from list of gathered
7982 // scalars.
7983 Gathers.assign(NumElts: Gathers.size(), Elt: PoisonValue::get(T: VL.front()->getType()));
7984 // The cost for vectorized loads.
7985 InstructionCost ScalarsCost = 0;
7986 for (Value *V : VectorizedLoads) {
7987 auto *LI = cast<LoadInst>(Val: V);
7988 ScalarsCost +=
7989 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LI->getType(),
7990 Alignment: LI->getAlign(), AddressSpace: LI->getPointerAddressSpace(),
7991 CostKind, OpdInfo: TTI::OperandValueInfo(), I: LI);
7992 }
7993 auto *LoadTy = FixedVectorType::get(ElementType: VL.front()->getType(), NumElts: VF);
7994 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
7995 auto *LI = cast<LoadInst>(Val: VL[P.first]);
7996 Align Alignment = LI->getAlign();
7997 GatherCost +=
7998 P.second == LoadsState::Vectorize
7999 ? TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadTy, Alignment,
8000 AddressSpace: LI->getPointerAddressSpace(), CostKind,
8001 OpdInfo: TTI::OperandValueInfo(), I: LI)
8002 : TTI.getStridedMemoryOpCost(
8003 Opcode: Instruction::Load, DataTy: LoadTy, Ptr: LI->getPointerOperand(),
8004 /*VariableMask=*/false, Alignment, CostKind, I: LI);
8005 // Estimate GEP cost.
8006 SmallVector<Value *> PointerOps(VF);
8007 for (auto [I, V] : enumerate(First: VL.slice(N: P.first, M: VF)))
8008 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
8009 auto [ScalarGEPCost, VectorGEPCost] =
8010 getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: LI->getPointerOperand(),
8011 Opcode: Instruction::Load, CostKind, ScalarTy: LI->getType(), VecTy: LoadTy);
8012 GatherCost += VectorGEPCost - ScalarGEPCost;
8013 }
8014 for (unsigned P : ScatterVectorized) {
8015 auto *LI0 = cast<LoadInst>(Val: VL[P]);
8016 ArrayRef<Value *> Slice = VL.slice(N: P, M: VF);
8017 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: Slice);
8018 GatherCost += TTI.getGatherScatterOpCost(
8019 Opcode: Instruction::Load, DataTy: LoadTy, Ptr: LI0->getPointerOperand(),
8020 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: LI0);
8021 // Estimate GEP cost.
8022 SmallVector<Value *> PointerOps(VF);
8023 for (auto [I, V] : enumerate(First&: Slice))
8024 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
8025 OrdersType Order;
8026 if (sortPtrAccesses(VL: PointerOps, ElemTy: LI0->getType(), DL: *R.DL, SE&: *R.SE,
8027 SortedIndices&: Order)) {
8028 // TODO: improve checks if GEPs can be vectorized.
8029 Value *Ptr0 = PointerOps.front();
8030 Type *ScalarTy = Ptr0->getType();
8031 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VF);
8032 auto [ScalarGEPCost, VectorGEPCost] =
8033 getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: Ptr0, Opcode: Instruction::GetElementPtr,
8034 CostKind, ScalarTy, VecTy);
8035 GatherCost += VectorGEPCost - ScalarGEPCost;
8036 if (!Order.empty()) {
8037 SmallVector<int> Mask;
8038 inversePermutation(Indices: Order, Mask);
8039 GatherCost += ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc,
8040 Tp: VecTy, Mask, CostKind);
8041 }
8042 } else {
8043 GatherCost += R.getGatherCost(VL: PointerOps, /*ForPoisonSrc=*/true);
8044 }
8045 }
8046 if (NeedInsertSubvectorAnalysis) {
8047 // Add the cost for the subvectors insert.
8048 SmallVector<int> ShuffleMask(VL.size());
8049 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8050 for (unsigned Idx : seq<unsigned>(Begin: 0, End: E))
8051 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8052 GatherCost += TTI.getShuffleCost(Kind: TTI::SK_InsertSubvector, Tp: VecTy,
8053 Mask: ShuffleMask, CostKind, Index: I, SubTp: LoadTy);
8054 }
8055 }
8056 GatherCost -= ScalarsCost;
8057 }
8058 GatherCost = std::min(a: BaseCost, b: GatherCost);
8059 } else if (!Root && isSplat(VL)) {
8060      // Found a broadcast of a single scalar; calculate the cost as the cost
8061      // of a broadcast.
8062 const auto *It = find_if_not(Range&: VL, P: IsaPred<UndefValue>);
8063 assert(It != VL.end() && "Expected at least one non-undef value.");
8064 // Add broadcast for non-identity shuffle only.
8065 bool NeedShuffle =
8066 count(Range&: VL, Element: *It) > 1 &&
8067 (VL.front() != *It || !all_of(Range: VL.drop_front(), P: IsaPred<UndefValue>));
8068 if (!NeedShuffle)
8069 return TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy,
8070 CostKind, Index: std::distance(first: VL.begin(), last: It),
8071 Op0: PoisonValue::get(T: VecTy), Op1: *It);
8072
8073 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8074 transform(Range&: VL, d_first: ShuffleMask.begin(), F: [](Value *V) {
8075 return isa<PoisonValue>(Val: V) ? PoisonMaskElem : 0;
8076 });
8077 InstructionCost InsertCost = TTI.getVectorInstrCost(
8078 Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: 0,
8079 Op0: PoisonValue::get(T: VecTy), Op1: *It);
8080 return InsertCost +
8081 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, Tp: VecTy,
8082 Mask: ShuffleMask, CostKind, /*Index=*/0,
8083 /*SubTp=*/nullptr, /*Args=*/*It);
8084 }
8085 return GatherCost +
8086 (all_of(Range&: Gathers, P: IsaPred<UndefValue>)
8087 ? TTI::TCC_Free
8088 : R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root && VL.equals(RHS: Gathers)));
8089 };
8090
8091 /// Compute the cost of creating a vector containing the extracted values from
8092 /// \p VL.
8093 InstructionCost
8094 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8095 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8096 unsigned NumParts) {
8097 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8098 unsigned NumElts =
8099 std::accumulate(first: VL.begin(), last: VL.end(), init: 0, binary_op: [](unsigned Sz, Value *V) {
8100 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
8101 if (!EE)
8102 return Sz;
8103 auto *VecTy = dyn_cast<FixedVectorType>(Val: EE->getVectorOperandType());
8104 if (!VecTy)
8105 return Sz;
8106 return std::max(a: Sz, b: VecTy->getNumElements());
8107 });
8108 unsigned NumSrcRegs = TTI.getNumberOfParts(
8109 Tp: FixedVectorType::get(ElementType: VL.front()->getType(), NumElts));
8110 if (NumSrcRegs == 0)
8111 NumSrcRegs = 1;
8112 // FIXME: this must be moved to TTI for better estimation.
8113 unsigned EltsPerVector = PowerOf2Ceil(A: std::max(
8114 a: divideCeil(Numerator: VL.size(), Denominator: NumParts), b: divideCeil(Numerator: NumElts, Denominator: NumSrcRegs)));
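    // Illustrative arithmetic (hypothetical sizes): with VL.size() == 8,
    // NumParts == 2, NumElts == 8 and NumSrcRegs == 2, EltsPerVector is
    // PowerOf2Ceil(max(8 / 2, 8 / 2)) == 4, so the extracts below are analyzed
    // in 4-element blocks.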
8115 auto CheckPerRegistersShuffle =
8116 [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
8117 DenseSet<int> RegIndices;
8118      // Check whether we are permuting the same one or two input vectors.
8119 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8120 int FirstRegId = -1;
8121 for (int &I : Mask) {
8122 if (I == PoisonMaskElem)
8123 continue;
8124 int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
8125 if (FirstRegId < 0)
8126 FirstRegId = RegId;
8127 RegIndices.insert(V: RegId);
8128 if (RegIndices.size() > 2)
8129 return std::nullopt;
8130 if (RegIndices.size() == 2)
8131 ShuffleKind = TTI::SK_PermuteTwoSrc;
8132 I = (I % NumElts) % EltsPerVector +
8133 (RegId == FirstRegId ? 0 : EltsPerVector);
8134 }
8135 return ShuffleKind;
8136 };
8137 InstructionCost Cost = 0;
8138
8139 // Process extracts in blocks of EltsPerVector to check if the source vector
8140 // operand can be re-used directly. If not, add the cost of creating a
8141 // shuffle to extract the values into a vector register.
8142 for (unsigned Part = 0; Part < NumParts; ++Part) {
8143 if (!ShuffleKinds[Part])
8144 continue;
8145 ArrayRef<int> MaskSlice =
8146 Mask.slice(N: Part * EltsPerVector,
8147 M: (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
8148 ? Mask.size() % EltsPerVector
8149 : EltsPerVector);
8150 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8151 copy(Range&: MaskSlice, Out: SubMask.begin());
8152 std::optional<TTI::ShuffleKind> RegShuffleKind =
8153 CheckPerRegistersShuffle(SubMask);
8154 if (!RegShuffleKind) {
8155 Cost += ::getShuffleCost(
8156 TTI, Kind: *ShuffleKinds[Part],
8157 Tp: FixedVectorType::get(ElementType: VL.front()->getType(), NumElts), Mask: MaskSlice);
8158 continue;
8159 }
8160 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8161 !ShuffleVectorInst::isIdentityMask(Mask: SubMask, NumSrcElts: EltsPerVector)) {
8162 Cost += ::getShuffleCost(
8163 TTI, Kind: *RegShuffleKind,
8164 Tp: FixedVectorType::get(ElementType: VL.front()->getType(), NumElts: EltsPerVector),
8165 Mask: SubMask);
8166 }
8167 }
8168 return Cost;
8169 }
8170 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
8171 /// shuffle emission.
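  /// For example (illustrative): with \p CommonMask == <poison, poison, poison,
  /// poison> and \p Mask == <2, poison, 1, poison>, lanes 0 and 2 were covered
  /// by the just-emitted shuffle, so \p CommonMask becomes
  /// <0, poison, 2, poison>.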
8172 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8173 ArrayRef<int> Mask) {
8174 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8175 if (Mask[Idx] != PoisonMaskElem)
8176 CommonMask[Idx] = Idx;
8177 }
8178 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
8179 /// mask \p Mask, register number \p Part, that includes \p SliceSize
8180 /// elements.
8181 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8182 ArrayRef<int> Mask, unsigned Part,
8183 unsigned SliceSize) {
8184 if (SameNodesEstimated) {
8185      // Delay the cost estimation if the same nodes are being reshuffled.
8186      // If we have already requested the cost of reshuffling E1 and E2 before,
8187      // there is no need to estimate another cost with the sub-Mask; instead,
8188      // include this sub-Mask into the CommonMask to estimate it later and
8189      // avoid double cost estimation.
8190 if ((InVectors.size() == 2 &&
8191 InVectors.front().get<const TreeEntry *>() == &E1 &&
8192 InVectors.back().get<const TreeEntry *>() == E2) ||
8193 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8194 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
8195 [](int Idx) { return Idx == PoisonMaskElem; }) &&
8196 "Expected all poisoned elements.");
8197 ArrayRef<int> SubMask =
8198 ArrayRef(Mask).slice(N: Part * SliceSize, M: SliceSize);
8199 copy(Range&: SubMask, Out: std::next(x: CommonMask.begin(), n: SliceSize * Part));
8200 return;
8201 }
8202 // Found non-matching nodes - need to estimate the cost for the matched
8203 // and transform mask.
8204 Cost += createShuffle(P1: InVectors.front(),
8205 P2: InVectors.size() == 1 ? nullptr : InVectors.back(),
8206 Mask: CommonMask);
8207 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
8208 }
8209 SameNodesEstimated = false;
8210 if (!E2 && InVectors.size() == 1) {
8211 unsigned VF = E1.getVectorFactor();
8212 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8213 VF = std::max(a: VF,
8214 b: cast<FixedVectorType>(Val: V1->getType())->getNumElements());
8215 } else {
8216 const auto *E = InVectors.front().get<const TreeEntry *>();
8217 VF = std::max(a: VF, b: E->getVectorFactor());
8218 }
8219 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8220 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8221 CommonMask[Idx] = Mask[Idx] + VF;
8222 Cost += createShuffle(P1: InVectors.front(), P2: &E1, Mask: CommonMask);
8223 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
8224 } else {
8225 Cost += createShuffle(P1: &E1, P2: E2, Mask);
8226 transformMaskAfterShuffle(CommonMask, Mask);
8227 }
8228 }
8229
8230 class ShuffleCostBuilder {
8231 const TargetTransformInfo &TTI;
8232
8233 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8234 int Index = -1;
8235 return Mask.empty() ||
8236 (VF == Mask.size() &&
8237 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) ||
8238 (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
8239 Index == 0);
8240 }
8241
8242 public:
8243 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8244 ~ShuffleCostBuilder() = default;
8245 InstructionCost createShuffleVector(Value *V1, Value *,
8246 ArrayRef<int> Mask) const {
8247 // Empty mask or identity mask are free.
8248 unsigned VF =
8249 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
8250 if (isEmptyOrIdentity(Mask, VF))
8251 return TTI::TCC_Free;
8252 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
8253 Tp: cast<VectorType>(Val: V1->getType()), Mask);
8254 }
8255 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8256      // An empty mask or an identity mask is free.
8257 unsigned VF =
8258 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
8259 if (isEmptyOrIdentity(Mask, VF))
8260 return TTI::TCC_Free;
8261 return TTI.getShuffleCost(Kind: TTI::SK_PermuteSingleSrc,
8262 Tp: cast<VectorType>(Val: V1->getType()), Mask);
8263 }
8264 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8265 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8266 return TTI::TCC_Free;
8267 }
8268 void resizeToMatch(Value *&, Value *&) const {}
8269 };
8270
8271  /// Smart shuffle instruction emission: walks through the shuffle trees and
8272  /// tries to find the best matching vector for the actual shuffle
8273  /// instruction.
8274 InstructionCost
8275 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8276 const PointerUnion<Value *, const TreeEntry *> &P2,
8277 ArrayRef<int> Mask) {
8278 ShuffleCostBuilder Builder(TTI);
8279 SmallVector<int> CommonMask(Mask.begin(), Mask.end());
8280 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8281 unsigned CommonVF = Mask.size();
8282 if (!V1 && !V2 && !P2.isNull()) {
8283 // Shuffle 2 entry nodes.
8284 const TreeEntry *E = P1.get<const TreeEntry *>();
8285 unsigned VF = E->getVectorFactor();
8286 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8287 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
8288 assert(all_of(Mask,
8289 [=](int Idx) {
8290 return Idx < 2 * static_cast<int>(CommonVF);
8291 }) &&
8292 "All elements in mask must be less than 2 * CommonVF.");
8293 if (E->Scalars.size() == E2->Scalars.size()) {
8294 SmallVector<int> EMask = E->getCommonMask();
8295 SmallVector<int> E2Mask = E2->getCommonMask();
8296 if (!EMask.empty() || !E2Mask.empty()) {
8297 for (int &Idx : CommonMask) {
8298 if (Idx == PoisonMaskElem)
8299 continue;
8300 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8301 Idx = EMask[Idx];
8302 else if (Idx >= static_cast<int>(CommonVF))
8303 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8304 E->Scalars.size();
8305 }
8306 }
8307 CommonVF = E->Scalars.size();
8308 }
8309 V1 = Constant::getNullValue(
8310 Ty: FixedVectorType::get(ElementType: E->Scalars.front()->getType(), NumElts: CommonVF));
8311 V2 = getAllOnesValue(
8312 DL: *R.DL, Ty: FixedVectorType::get(ElementType: E->Scalars.front()->getType(), NumElts: CommonVF));
8313 } else if (!V1 && P2.isNull()) {
8314 // Shuffle single entry node.
8315 const TreeEntry *E = P1.get<const TreeEntry *>();
8316 unsigned VF = E->getVectorFactor();
8317 CommonVF = VF;
8318 assert(
8319 all_of(Mask,
8320 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8321 "All elements in mask must be less than CommonVF.");
8322 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8323 SmallVector<int> EMask = E->getCommonMask();
8324 assert(!EMask.empty() && "Expected non-empty common mask.");
8325 for (int &Idx : CommonMask) {
8326 if (Idx != PoisonMaskElem)
8327 Idx = EMask[Idx];
8328 }
8329 CommonVF = E->Scalars.size();
8330 }
8331 V1 = Constant::getNullValue(
8332 Ty: FixedVectorType::get(ElementType: E->Scalars.front()->getType(), NumElts: CommonVF));
8333 // Not identity/broadcast? Try to see if the original vector is better.
8334 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8335 CommonVF == CommonMask.size() &&
8336 any_of(Range: enumerate(First&: CommonMask),
8337 P: [](const auto &&P) {
8338 return P.value() != PoisonMaskElem &&
8339 static_cast<unsigned>(P.value()) != P.index();
8340 }) &&
8341 any_of(Range&: CommonMask,
8342 P: [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8343 SmallVector<int> ReorderMask;
8344 inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask);
8345 ::addMask(Mask&: CommonMask, SubMask: ReorderMask);
8346 }
8347 } else if (V1 && P2.isNull()) {
8348 // Shuffle single vector.
8349 CommonVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
8350 assert(
8351 all_of(Mask,
8352 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8353 "All elements in mask must be less than CommonVF.");
8354 } else if (V1 && !V2) {
8355 // Shuffle vector and tree node.
8356 unsigned VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
8357 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8358 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
8359 assert(all_of(Mask,
8360 [=](int Idx) {
8361 return Idx < 2 * static_cast<int>(CommonVF);
8362 }) &&
8363 "All elements in mask must be less than 2 * CommonVF.");
8364 if (E2->Scalars.size() == VF && VF != CommonVF) {
8365 SmallVector<int> E2Mask = E2->getCommonMask();
8366 assert(!E2Mask.empty() && "Expected non-empty common mask.");
8367 for (int &Idx : CommonMask) {
8368 if (Idx == PoisonMaskElem)
8369 continue;
8370 if (Idx >= static_cast<int>(CommonVF))
8371 Idx = E2Mask[Idx - CommonVF] + VF;
8372 }
8373 CommonVF = VF;
8374 }
8375 V1 = Constant::getNullValue(
8376 Ty: FixedVectorType::get(ElementType: E2->Scalars.front()->getType(), NumElts: CommonVF));
8377 V2 = getAllOnesValue(
8378 DL: *R.DL,
8379 Ty: FixedVectorType::get(ElementType: E2->Scalars.front()->getType(), NumElts: CommonVF));
8380 } else if (!V1 && V2) {
8381 // Shuffle vector and tree node.
8382 unsigned VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements();
8383 const TreeEntry *E1 = P1.get<const TreeEntry *>();
8384 CommonVF = std::max(a: VF, b: E1->getVectorFactor());
8385 assert(all_of(Mask,
8386 [=](int Idx) {
8387 return Idx < 2 * static_cast<int>(CommonVF);
8388 }) &&
8389 "All elements in mask must be less than 2 * CommonVF.");
8390 if (E1->Scalars.size() == VF && VF != CommonVF) {
8391 SmallVector<int> E1Mask = E1->getCommonMask();
8392 assert(!E1Mask.empty() && "Expected non-empty common mask.");
8393 for (int &Idx : CommonMask) {
8394 if (Idx == PoisonMaskElem)
8395 continue;
8396 if (Idx >= static_cast<int>(CommonVF))
8397 Idx = E1Mask[Idx - CommonVF] + VF;
8398 else
8399 Idx = E1Mask[Idx];
8400 }
8401 CommonVF = VF;
8402 }
8403 V1 = Constant::getNullValue(
8404 Ty: FixedVectorType::get(ElementType: E1->Scalars.front()->getType(), NumElts: CommonVF));
8405 V2 = getAllOnesValue(
8406 DL: *R.DL,
8407 Ty: FixedVectorType::get(ElementType: E1->Scalars.front()->getType(), NumElts: CommonVF));
8408 } else {
8409 assert(V1 && V2 && "Expected both vectors.");
8410 unsigned VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
8411 CommonVF =
8412 std::max(a: VF, b: cast<FixedVectorType>(Val: V2->getType())->getNumElements());
8413 assert(all_of(Mask,
8414 [=](int Idx) {
8415 return Idx < 2 * static_cast<int>(CommonVF);
8416 }) &&
8417 "All elements in mask must be less than 2 * CommonVF.");
8418 if (V1->getType() != V2->getType()) {
8419 V1 = Constant::getNullValue(Ty: FixedVectorType::get(
8420 ElementType: cast<FixedVectorType>(Val: V1->getType())->getElementType(), NumElts: CommonVF));
8421 V2 = getAllOnesValue(
8422 DL: *R.DL, Ty: FixedVectorType::get(
8423 ElementType: cast<FixedVectorType>(Val: V1->getType())->getElementType(),
8424 NumElts: CommonVF));
8425 }
8426 }
8427 InVectors.front() = Constant::getNullValue(Ty: FixedVectorType::get(
8428 ElementType: cast<FixedVectorType>(Val: V1->getType())->getElementType(),
8429 NumElts: CommonMask.size()));
8430 if (InVectors.size() == 2)
8431 InVectors.pop_back();
8432 return BaseShuffleAnalysis::createShuffle<InstructionCost>(
8433 V1, V2, Mask: CommonMask, Builder);
8434 }
8435
8436public:
8437 ShuffleCostEstimator(TargetTransformInfo &TTI,
8438 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8439 SmallPtrSetImpl<Value *> &CheckedExtracts)
8440 : TTI(TTI), VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()),
8441 R(R), CheckedExtracts(CheckedExtracts) {}
8442 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8443 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8444 unsigned NumParts, bool &UseVecBaseAsInput) {
8445 UseVecBaseAsInput = false;
8446 if (Mask.empty())
8447 return nullptr;
8448 Value *VecBase = nullptr;
8449 ArrayRef<Value *> VL = E->Scalars;
8450 // If the resulting type is scalarized, do not adjust the cost.
8451 if (NumParts == VL.size())
8452 return nullptr;
8453    // Check if the extracts can be considered reused, i.e. whether the same
8454    // extractelements were already vectorized in a previous node.
8455 bool PrevNodeFound = any_of(
8456 Range: ArrayRef(R.VectorizableTree).take_front(N: E->Idx),
8457 P: [&](const std::unique_ptr<TreeEntry> &TE) {
8458 return ((!TE->isAltShuffle() &&
8459 TE->getOpcode() == Instruction::ExtractElement) ||
8460 TE->State == TreeEntry::NeedToGather) &&
8461 all_of(Range: enumerate(First&: TE->Scalars), P: [&](auto &&Data) {
8462 return VL.size() > Data.index() &&
8463 (Mask[Data.index()] == PoisonMaskElem ||
8464 isa<UndefValue>(VL[Data.index()]) ||
8465 Data.value() == VL[Data.index()]);
8466 });
8467 });
8468 SmallPtrSet<Value *, 4> UniqueBases;
8469 unsigned SliceSize = VL.size() / NumParts;
8470 for (unsigned Part = 0; Part < NumParts; ++Part) {
8471 ArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: SliceSize);
8472 for (auto [I, V] : enumerate(First: VL.slice(N: Part * SliceSize, M: SliceSize))) {
8473 // Ignore non-extractelement scalars.
8474 if (isa<UndefValue>(Val: V) ||
8475 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8476 continue;
8477        // If all users of the instruction are going to be vectorized and the
8478        // instruction itself is not going to be vectorized, consider the
8479        // instruction dead and remove its cost from the final cost of the
8480        // vectorized tree.
8481        // Also, avoid adjusting the cost for extractelements with multiple uses
8482        // in different graph entries.
8483 auto *EE = cast<ExtractElementInst>(Val: V);
8484 VecBase = EE->getVectorOperand();
8485 UniqueBases.insert(Ptr: VecBase);
8486 const TreeEntry *VE = R.getTreeEntry(V);
8487 if (!CheckedExtracts.insert(Ptr: V).second ||
8488 !R.areAllUsersVectorized(I: cast<Instruction>(Val: V), VectorizedVals: &VectorizedVals) ||
8489 (VE && VE != E))
8490 continue;
8491 std::optional<unsigned> EEIdx = getExtractIndex(E: EE);
8492 if (!EEIdx)
8493 continue;
8494 unsigned Idx = *EEIdx;
8495 // Take credit for instruction that will become dead.
8496 if (EE->hasOneUse() || !PrevNodeFound) {
8497 Instruction *Ext = EE->user_back();
8498 if (isa<SExtInst, ZExtInst>(Val: Ext) &&
8499 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
8500 // Use getExtractWithExtendCost() to calculate the cost of
8501 // extractelement/ext pair.
8502 Cost -=
8503 TTI.getExtractWithExtendCost(Opcode: Ext->getOpcode(), Dst: Ext->getType(),
8504 VecTy: EE->getVectorOperandType(), Index: Idx);
8505 // Add back the cost of s|zext which is subtracted separately.
8506 Cost += TTI.getCastInstrCost(
8507 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: EE->getType(),
8508 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
8509 continue;
8510 }
8511 }
8512 Cost -= TTI.getVectorInstrCost(I: *EE, Val: EE->getVectorOperandType(),
8513 CostKind, Index: Idx);
8514 }
8515 }
8516    // Check that the gather of extractelements can be represented as just a
8517    // shuffle of one or two vectors from which the scalars are extracted.
8518    // We found a bunch of extractelement instructions that must be gathered
8519    // into a vector and can be represented as a permutation of elements from
8520    // a single input vector or from 2 input vectors.
8521    // The cost is skipped if the same extractelements were vectorized already.
8522 if (!PrevNodeFound)
8523 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8524 InVectors.assign(NumElts: 1, Elt: E);
8525 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
8526 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
8527 SameNodesEstimated = false;
8528 if (NumParts != 1 && UniqueBases.size() != 1) {
8529 UseVecBaseAsInput = true;
8530 VecBase = Constant::getNullValue(
8531 Ty: FixedVectorType::get(ElementType: VL.front()->getType(), NumElts: CommonMask.size()));
8532 }
8533 return VecBase;
8534 }
8535 /// Checks if the specified entry \p E needs to be delayed because of its
8536 /// dependency nodes.
8537 std::optional<InstructionCost>
8538 needToDelay(const TreeEntry *,
8539 ArrayRef<SmallVector<const TreeEntry *>>) const {
8540 // No need to delay the cost estimation during analysis.
8541 return std::nullopt;
8542 }
8543 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
8544 if (&E1 == &E2) {
8545 assert(all_of(Mask,
8546 [&](int Idx) {
8547 return Idx < static_cast<int>(E1.getVectorFactor());
8548 }) &&
8549 "Expected single vector shuffle mask.");
8550 add(E1, Mask);
8551 return;
8552 }
8553 if (InVectors.empty()) {
8554 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
8555 InVectors.assign(IL: {&E1, &E2});
8556 return;
8557 }
8558 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8559 auto *MaskVecTy =
8560 FixedVectorType::get(ElementType: E1.Scalars.front()->getType(), NumElts: Mask.size());
8561 unsigned NumParts = TTI.getNumberOfParts(Tp: MaskVecTy);
8562 if (NumParts == 0 || NumParts >= Mask.size())
8563 NumParts = 1;
8564 unsigned SliceSize = Mask.size() / NumParts;
8565 const auto *It =
8566 find_if(Range&: Mask, P: [](int Idx) { return Idx != PoisonMaskElem; });
8567 unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
8568 estimateNodesPermuteCost(E1, E2: &E2, Mask, Part, SliceSize);
8569 }
8570 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
8571 if (InVectors.empty()) {
8572 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
8573 InVectors.assign(NumElts: 1, Elt: &E1);
8574 return;
8575 }
8576 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8577 auto *MaskVecTy =
8578 FixedVectorType::get(ElementType: E1.Scalars.front()->getType(), NumElts: Mask.size());
8579 unsigned NumParts = TTI.getNumberOfParts(Tp: MaskVecTy);
8580 if (NumParts == 0 || NumParts >= Mask.size())
8581 NumParts = 1;
8582 unsigned SliceSize = Mask.size() / NumParts;
8583 const auto *It =
8584 find_if(Range&: Mask, P: [](int Idx) { return Idx != PoisonMaskElem; });
8585 unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
8586 estimateNodesPermuteCost(E1, E2: nullptr, Mask, Part, SliceSize);
8587 if (!SameNodesEstimated && InVectors.size() == 1)
8588 InVectors.emplace_back(Args: &E1);
8589 }
8590 /// Adds 2 input vectors and the mask for their shuffling.
8591 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
8592    // This can only be reached when shuffling 2 vectors with extractelements,
8593    // which is already handled in adjustExtracts.
8594 assert(InVectors.size() == 1 &&
8595 all_of(enumerate(CommonMask),
8596 [&](auto P) {
8597 if (P.value() == PoisonMaskElem)
8598 return Mask[P.index()] == PoisonMaskElem;
8599 auto *EI =
8600 cast<ExtractElementInst>(InVectors.front()
8601 .get<const TreeEntry *>()
8602 ->Scalars[P.index()]);
8603 return EI->getVectorOperand() == V1 ||
8604 EI->getVectorOperand() == V2;
8605 }) &&
8606 "Expected extractelement vectors.");
8607 }
8608 /// Adds another one input vector and the mask for the shuffling.
8609 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
8610 if (InVectors.empty()) {
8611 assert(CommonMask.empty() && !ForExtracts &&
8612 "Expected empty input mask/vectors.");
8613 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
8614 InVectors.assign(NumElts: 1, Elt: V1);
8615 return;
8616 }
8617 if (ForExtracts) {
8618      // No need to add the vectors here; adjustExtracts already handled them.
8619 assert(InVectors.size() == 1 &&
8620 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
8621 all_of(enumerate(CommonMask),
8622 [&](auto P) {
8623 Value *Scalar = InVectors.front()
8624 .get<const TreeEntry *>()
8625 ->Scalars[P.index()];
8626 if (P.value() == PoisonMaskElem)
8627 return P.value() == Mask[P.index()] ||
8628 isa<UndefValue>(Scalar);
8629 if (isa<Constant>(V1))
8630 return true;
8631 auto *EI = cast<ExtractElementInst>(Scalar);
8632 return EI->getVectorOperand() == V1;
8633 }) &&
8634 "Expected only tree entry for extractelement vectors.");
8635 return;
8636 }
8637 assert(!InVectors.empty() && !CommonMask.empty() &&
8638 "Expected only tree entries from extracts/reused buildvectors.");
8639 unsigned VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
8640 if (InVectors.size() == 2) {
8641 Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
8642 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
8643 VF = std::max<unsigned>(a: VF, b: CommonMask.size());
8644 } else if (const auto *InTE =
8645 InVectors.front().dyn_cast<const TreeEntry *>()) {
8646 VF = std::max(a: VF, b: InTE->getVectorFactor());
8647 } else {
8648 VF = std::max(
8649 a: VF, b: cast<FixedVectorType>(Val: InVectors.front().get<Value *>()->getType())
8650 ->getNumElements());
8651 }
8652 InVectors.push_back(Elt: V1);
8653 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8654 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8655 CommonMask[Idx] = Mask[Idx] + VF;
8656 }
8657 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
8658 Value *Root = nullptr) {
8659 Cost += getBuildVectorCost(VL, Root);
8660 if (!Root) {
8661 // FIXME: Need to find a way to avoid use of getNullValue here.
8662 SmallVector<Constant *> Vals;
8663 unsigned VF = VL.size();
8664 if (MaskVF != 0)
8665 VF = std::min(a: VF, b: MaskVF);
8666 for (Value *V : VL.take_front(N: VF)) {
8667 if (isa<UndefValue>(Val: V)) {
8668 Vals.push_back(Elt: cast<Constant>(Val: V));
8669 continue;
8670 }
8671 Vals.push_back(Elt: Constant::getNullValue(Ty: V->getType()));
8672 }
8673 return ConstantVector::get(V: Vals);
8674 }
8675 return ConstantVector::getSplat(
8676 EC: ElementCount::getFixed(
8677 MinVal: cast<FixedVectorType>(Val: Root->getType())->getNumElements()),
8678 Elt: getAllOnesValue(DL: *R.DL, Ty: VL.front()->getType()));
8679 }
8680 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
8681 /// Finalize emission of the shuffles.
8682 InstructionCost
8683 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
8684 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
8685 IsFinalized = true;
8686 if (Action) {
8687 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
8688 if (InVectors.size() == 2)
8689 Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
8690 else
8691 Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
8692 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8693 if (CommonMask[Idx] != PoisonMaskElem)
8694 CommonMask[Idx] = Idx;
8695 assert(VF > 0 &&
8696 "Expected vector length for the final value before action.");
8697 Value *V = Vec.get<Value *>();
8698 Action(V, CommonMask);
8699 InVectors.front() = V;
8700 }
8701 ::addMask(Mask&: CommonMask, SubMask: ExtMask, /*ExtendingManyInputs=*/true);
8702 if (CommonMask.empty()) {
8703 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
8704 return Cost;
8705 }
8706 return Cost +
8707 createShuffle(P1: InVectors.front(),
8708 P2: InVectors.size() == 2 ? InVectors.back() : nullptr,
8709 Mask: CommonMask);
8710 }
8711
8712 ~ShuffleCostEstimator() {
8713 assert((IsFinalized || CommonMask.empty()) &&
8714 "Shuffle construction must be finalized.");
8715 }
8716};
8717
8718const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
8719 unsigned Idx) const {
8720 Value *Op = E->getOperand(OpIdx: Idx).front();
8721 if (const TreeEntry *TE = getTreeEntry(V: Op)) {
8722 if (find_if(Range: TE->UserTreeIndices, P: [&](const EdgeInfo &EI) {
8723 return EI.EdgeIdx == Idx && EI.UserTE == E;
8724 }) != TE->UserTreeIndices.end())
8725 return TE;
8726 auto MIt = MultiNodeScalars.find(Val: Op);
8727 if (MIt != MultiNodeScalars.end()) {
8728 for (const TreeEntry *TE : MIt->second) {
8729 if (find_if(Range: TE->UserTreeIndices, P: [&](const EdgeInfo &EI) {
8730 return EI.EdgeIdx == Idx && EI.UserTE == E;
8731 }) != TE->UserTreeIndices.end())
8732 return TE;
8733 }
8734 }
8735 }
8736 const auto *It =
8737 find_if(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
8738 return TE->State == TreeEntry::NeedToGather &&
8739 find_if(Range&: TE->UserTreeIndices, P: [&](const EdgeInfo &EI) {
8740 return EI.EdgeIdx == Idx && EI.UserTE == E;
8741 }) != TE->UserTreeIndices.end();
8742 });
8743 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
8744 return It->get();
8745}
8746
8747TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
8748 if (TE.State == TreeEntry::ScatterVectorize ||
8749 TE.State == TreeEntry::StridedVectorize)
8750 return TTI::CastContextHint::GatherScatter;
8751 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
8752 !TE.isAltShuffle()) {
8753 if (TE.ReorderIndices.empty())
8754 return TTI::CastContextHint::Normal;
8755 SmallVector<int> Mask;
8756 inversePermutation(Indices: TE.ReorderIndices, Mask);
8757 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts: Mask.size()))
8758 return TTI::CastContextHint::Reversed;
8759 }
8760 return TTI::CastContextHint::None;
8761}
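// Illustrative example: a vectorized load node with ReorderIndices {3, 2, 1, 0}
// yields the inverse mask {3, 2, 1, 0}, which isReverseMask recognizes, so its
// user cast is costed with the Reversed hint; strided/scatter nodes always get
// the GatherScatter hint.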
8762
8763/// Builds the vector of argument types for the given call instruction with the
8764/// given intrinsic \p ID and the specified vector factor \p VF.
8765static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
8766 const Intrinsic::ID ID,
8767 const unsigned VF,
8768 unsigned MinBW) {
8769 SmallVector<Type *> ArgTys;
8770 for (auto [Idx, Arg] : enumerate(First: CI->args())) {
8771 if (ID != Intrinsic::not_intrinsic) {
8772 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx)) {
8773 ArgTys.push_back(Elt: Arg->getType());
8774 continue;
8775 }
8776 if (MinBW > 0) {
8777 ArgTys.push_back(Elt: FixedVectorType::get(
8778 ElementType: IntegerType::get(C&: CI->getContext(), NumBits: MinBW), NumElts: VF));
8779 continue;
8780 }
8781 }
8782 ArgTys.push_back(Elt: FixedVectorType::get(ElementType: Arg->getType(), NumElts: VF));
8783 }
8784 return ArgTys;
8785}
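// Illustrative example: for a vectorized llvm.ctlz call with VF == 4 and
// MinBW == 8, the data operand type becomes <4 x i8> while the i1
// is-zero-poison flag is kept scalar (it is reported as a scalar operand by
// isVectorIntrinsicWithScalarOpAtArg); with MinBW == 0 the original element
// type is simply widened to a <4 x Ty> vector.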
8786
8787InstructionCost
8788BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
8789 SmallPtrSetImpl<Value *> &CheckedExtracts) {
8790 ArrayRef<Value *> VL = E->Scalars;
8791
8792 Type *ScalarTy = VL[0]->getType();
8793 if (E->State != TreeEntry::NeedToGather) {
8794 if (auto *SI = dyn_cast<StoreInst>(Val: VL[0]))
8795 ScalarTy = SI->getValueOperand()->getType();
8796 else if (auto *CI = dyn_cast<CmpInst>(Val: VL[0]))
8797 ScalarTy = CI->getOperand(i_nocapture: 0)->getType();
8798 else if (auto *IE = dyn_cast<InsertElementInst>(Val: VL[0]))
8799 ScalarTy = IE->getOperand(i_nocapture: 1)->getType();
8800 }
8801 if (!isValidElementType(Ty: ScalarTy))
8802 return InstructionCost::getInvalid();
8803 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VL.size());
8804 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8805
8806 // If we have computed a smaller type for the expression, update VecTy so
8807 // that the costs will be accurate.
8808 auto It = MinBWs.find(Val: E);
8809 Type *OrigScalarTy = ScalarTy;
8810 if (It != MinBWs.end()) {
8811 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
8812 VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VL.size());
8813 }
8814 unsigned EntryVF = E->getVectorFactor();
8815 auto *FinalVecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: EntryVF);
8816
8817 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
8818 if (E->State == TreeEntry::NeedToGather) {
8819 if (allConstant(VL))
8820 return 0;
8821 if (isa<InsertElementInst>(Val: VL[0]))
8822 return InstructionCost::getInvalid();
8823 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
8824 E, Params&: *TTI, Params&: VectorizedVals, Params&: *this, Params&: CheckedExtracts);
8825 }
8826 InstructionCost CommonCost = 0;
8827 SmallVector<int> Mask;
8828 bool IsReverseOrder = isReverseOrder(Order: E->ReorderIndices);
8829 if (!E->ReorderIndices.empty() &&
8830 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
8831 SmallVector<int> NewMask;
8832 if (E->getOpcode() == Instruction::Store) {
8833 // For stores the order is actually a mask.
8834 NewMask.resize(N: E->ReorderIndices.size());
8835 copy(Range: E->ReorderIndices, Out: NewMask.begin());
8836 } else {
8837 inversePermutation(Indices: E->ReorderIndices, Mask&: NewMask);
8838 }
8839 ::addMask(Mask, SubMask: NewMask);
8840 }
8841 if (NeedToShuffleReuses)
8842 ::addMask(Mask, SubMask: E->ReuseShuffleIndices);
8843 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
8844 CommonCost =
8845 TTI->getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask);
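  // For example, reorder indices {1, 0, 3, 2} invert to the mask {1, 0, 3, 2},
  // which is not an identity, so a single-source permute of FinalVecTy is
  // charged here as the common cost.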
8846 assert((E->State == TreeEntry::Vectorize ||
8847 E->State == TreeEntry::ScatterVectorize ||
8848 E->State == TreeEntry::StridedVectorize) &&
8849 "Unhandled state");
8850 assert(E->getOpcode() &&
8851 ((allSameType(VL) && allSameBlock(VL)) ||
8852 (E->getOpcode() == Instruction::GetElementPtr &&
8853 E->getMainOp()->getType()->isPointerTy())) &&
8854 "Invalid VL");
8855 Instruction *VL0 = E->getMainOp();
8856 unsigned ShuffleOrOp =
8857 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
8858 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
8859 const unsigned Sz = UniqueValues.size();
8860 SmallBitVector UsedScalars(Sz, false);
8861 for (unsigned I = 0; I < Sz; ++I) {
8862 if (getTreeEntry(V: UniqueValues[I]) == E)
8863 continue;
8864 UsedScalars.set(I);
8865 }
8866 auto GetCastContextHint = [&](Value *V) {
8867 if (const TreeEntry *OpTE = getTreeEntry(V))
8868 return getCastContextHint(TE: *OpTE);
8869 InstructionsState SrcState = getSameOpcode(VL: E->getOperand(OpIdx: 0), TLI: *TLI);
8870 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
8871 return TTI::CastContextHint::GatherScatter;
8872 return TTI::CastContextHint::None;
8873 };
8874 auto GetCostDiff =
8875 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
8876 function_ref<InstructionCost(InstructionCost)> VectorCost) {
8877 // Calculate the cost of this instruction.
8878 InstructionCost ScalarCost = 0;
8879 if (isa<CastInst, CmpInst, SelectInst, CallInst>(Val: VL0)) {
8880          // For some instructions there is no need to calculate the cost of
8881          // each particular one; we can use the cost of a single instruction
8882          // multiplied by the total number of scalar instructions.
8883 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
8884 } else {
8885 for (unsigned I = 0; I < Sz; ++I) {
8886 if (UsedScalars.test(Idx: I))
8887 continue;
8888 ScalarCost += ScalarEltCost(I);
8889 }
8890 }
8891
8892 InstructionCost VecCost = VectorCost(CommonCost);
8893        // Check if the current node must be resized when the parent node is
8894        // not resized.
8895 if (!UnaryInstruction::isCast(Opcode: E->getOpcode()) && E->Idx != 0) {
8896 const EdgeInfo &EI = E->UserTreeIndices.front();
8897 if ((EI.UserTE->getOpcode() != Instruction::Select ||
8898 EI.EdgeIdx != 0) &&
8899 It != MinBWs.end()) {
8900 auto UserBWIt = MinBWs.find(Val: EI.UserTE);
8901 Type *UserScalarTy =
8902 EI.UserTE->getOperand(OpIdx: EI.EdgeIdx).front()->getType();
8903 if (UserBWIt != MinBWs.end())
8904 UserScalarTy = IntegerType::get(C&: ScalarTy->getContext(),
8905 NumBits: UserBWIt->second.first);
8906 if (ScalarTy != UserScalarTy) {
8907 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
8908 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: UserScalarTy);
8909 unsigned VecOpcode;
8910 auto *UserVecTy =
8911 FixedVectorType::get(ElementType: UserScalarTy, NumElts: E->getVectorFactor());
8912 if (BWSz > SrcBWSz)
8913 VecOpcode = Instruction::Trunc;
8914 else
8915 VecOpcode =
8916 It->second.second ? Instruction::SExt : Instruction::ZExt;
8917 TTI::CastContextHint CCH = GetCastContextHint(VL0);
8918 VecCost += TTI->getCastInstrCost(Opcode: VecOpcode, Dst: UserVecTy, Src: VecTy, CCH,
8919 CostKind);
8920 }
8921 }
8922 }
8923 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
8924 ScalarCost, "Calculated costs for Tree"));
8925 return VecCost - ScalarCost;
8926 };
8927 // Calculate cost difference from vectorizing set of GEPs.
8928 // Negative value means vectorizing is profitable.
8929 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
8930 assert((E->State == TreeEntry::Vectorize ||
8931 E->State == TreeEntry::StridedVectorize) &&
8932 "Entry state expected to be Vectorize or StridedVectorize here.");
8933 InstructionCost ScalarCost = 0;
8934 InstructionCost VecCost = 0;
8935 std::tie(args&: ScalarCost, args&: VecCost) = getGEPCosts(
8936 TTI: *TTI, Ptrs, BasePtr, Opcode: E->getOpcode(), CostKind, ScalarTy: OrigScalarTy, VecTy);
8937 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
8938 "Calculated GEPs cost for Tree"));
8939
8940 return VecCost - ScalarCost;
8941 };
8942
8943 switch (ShuffleOrOp) {
8944 case Instruction::PHI: {
8945 // Count reused scalars.
8946 InstructionCost ScalarCost = 0;
8947 SmallPtrSet<const TreeEntry *, 4> CountedOps;
8948 for (Value *V : UniqueValues) {
8949 auto *PHI = dyn_cast<PHINode>(Val: V);
8950 if (!PHI)
8951 continue;
8952
8953 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
8954 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
8955 Value *Op = PHI->getIncomingValue(i: I);
8956 Operands[I] = Op;
8957 }
8958 if (const TreeEntry *OpTE = getTreeEntry(V: Operands.front()))
8959 if (OpTE->isSame(VL: Operands) && CountedOps.insert(Ptr: OpTE).second)
8960 if (!OpTE->ReuseShuffleIndices.empty())
8961 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
8962 OpTE->Scalars.size());
8963 }
8964
8965 return CommonCost - ScalarCost;
8966 }
8967 case Instruction::ExtractValue:
8968 case Instruction::ExtractElement: {
8969 auto GetScalarCost = [&](unsigned Idx) {
8970 auto *I = cast<Instruction>(Val: UniqueValues[Idx]);
8971 VectorType *SrcVecTy;
8972 if (ShuffleOrOp == Instruction::ExtractElement) {
8973 auto *EE = cast<ExtractElementInst>(Val: I);
8974 SrcVecTy = EE->getVectorOperandType();
8975 } else {
8976 auto *EV = cast<ExtractValueInst>(Val: I);
8977 Type *AggregateTy = EV->getAggregateOperand()->getType();
8978 unsigned NumElts;
8979 if (auto *ATy = dyn_cast<ArrayType>(Val: AggregateTy))
8980 NumElts = ATy->getNumElements();
8981 else
8982 NumElts = AggregateTy->getStructNumElements();
8983 SrcVecTy = FixedVectorType::get(ElementType: OrigScalarTy, NumElts);
8984 }
8985 if (I->hasOneUse()) {
8986 Instruction *Ext = I->user_back();
8987 if ((isa<SExtInst>(Val: Ext) || isa<ZExtInst>(Val: Ext)) &&
8988 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
8989 // Use getExtractWithExtendCost() to calculate the cost of
8990 // extractelement/ext pair.
8991 InstructionCost Cost = TTI->getExtractWithExtendCost(
8992 Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: SrcVecTy, Index: *getExtractIndex(E: I));
8993 // Subtract the cost of s|zext which is subtracted separately.
8994 Cost -= TTI->getCastInstrCost(
8995 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: I->getType(),
8996 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
8997 return Cost;
8998 }
8999 }
9000 return TTI->getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: SrcVecTy,
9001 CostKind, Index: *getExtractIndex(E: I));
9002 };
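    // Illustrative example: for an extractelement whose only user is a zext
    // feeding GEPs, the extract+ext pair is costed in one query via
    // getExtractWithExtendCost, and the ext cost, which is accounted for
    // separately, is then subtracted.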
9003 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9004 return GetCostDiff(GetScalarCost, GetVectorCost);
9005 }
9006 case Instruction::InsertElement: {
9007 assert(E->ReuseShuffleIndices.empty() &&
9008 "Unique insertelements only are expected.");
9009 auto *SrcVecTy = cast<FixedVectorType>(Val: VL0->getType());
9010 unsigned const NumElts = SrcVecTy->getNumElements();
9011 unsigned const NumScalars = VL.size();
9012
9013 unsigned NumOfParts = TTI->getNumberOfParts(Tp: SrcVecTy);
9014
9015 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9016 unsigned OffsetBeg = *getInsertIndex(InsertInst: VL.front());
9017 unsigned OffsetEnd = OffsetBeg;
9018 InsertMask[OffsetBeg] = 0;
9019 for (auto [I, V] : enumerate(First: VL.drop_front())) {
9020 unsigned Idx = *getInsertIndex(InsertInst: V);
9021 if (OffsetBeg > Idx)
9022 OffsetBeg = Idx;
9023 else if (OffsetEnd < Idx)
9024 OffsetEnd = Idx;
9025 InsertMask[Idx] = I + 1;
9026 }
9027 unsigned VecScalarsSz = PowerOf2Ceil(A: NumElts);
9028 if (NumOfParts > 0)
9029 VecScalarsSz = PowerOf2Ceil(A: (NumElts + NumOfParts - 1) / NumOfParts);
9030 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9031 VecScalarsSz;
9032 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9033 unsigned InsertVecSz = std::min<unsigned>(
9034 a: PowerOf2Ceil(A: OffsetEnd - OffsetBeg + 1),
9035 b: ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9036 bool IsWholeSubvector =
9037 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
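    // Illustrative arithmetic: with NumElts == 8 and NumOfParts == 2,
    // VecScalarsSz == 4; inserts covering indices 2..5 give OffsetBeg == 2,
    // OffsetEnd == 5, VecSz == 8, Offset == 0, InsertVecSz == 4 and
    // IsWholeSubvector == false, and since 2 + 4 <= 8 no realignment of
    // OffsetBeg is required below.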
9038 // Check if we can safely insert a subvector. If it is not possible, just
9039 // generate a whole-sized vector and shuffle the source vector and the new
9040 // subvector.
9041 if (OffsetBeg + InsertVecSz > VecSz) {
9042 // Align OffsetBeg to generate correct mask.
9043 OffsetBeg = alignDown(Value: OffsetBeg, Align: VecSz, Skew: Offset);
9044 InsertVecSz = VecSz;
9045 }
9046
9047 APInt DemandedElts = APInt::getZero(numBits: NumElts);
9048 // TODO: Add support for Instruction::InsertValue.
9049 SmallVector<int> Mask;
9050 if (!E->ReorderIndices.empty()) {
9051 inversePermutation(Indices: E->ReorderIndices, Mask);
9052 Mask.append(NumInputs: InsertVecSz - Mask.size(), Elt: PoisonMaskElem);
9053 } else {
9054 Mask.assign(NumElts: VecSz, Elt: PoisonMaskElem);
9055 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: InsertVecSz), value: 0);
9056 }
9057 bool IsIdentity = true;
9058 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9059 Mask.swap(RHS&: PrevMask);
9060 for (unsigned I = 0; I < NumScalars; ++I) {
9061 unsigned InsertIdx = *getInsertIndex(InsertInst: VL[PrevMask[I]]);
9062 DemandedElts.setBit(InsertIdx);
9063 IsIdentity &= InsertIdx - OffsetBeg == I;
9064 Mask[InsertIdx - OffsetBeg] = I;
9065 }
9066 assert(Offset < NumElts && "Failed to find vector index offset");
9067
9068 InstructionCost Cost = 0;
9069 Cost -= TTI->getScalarizationOverhead(Ty: SrcVecTy, DemandedElts,
9070 /*Insert*/ true, /*Extract*/ false,
9071 CostKind);
9072
9073    // First cost - resize to the actual vector size if this is not an identity
9074    // shuffle or the vector needs to be shifted.
9075    // Do not calculate the cost if the actual size is the register size and
9076    // we can merge this shuffle with the following SK_Select.
9077 auto *InsertVecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: InsertVecSz);
9078 if (!IsIdentity)
9079 Cost += TTI->getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc,
9080 Tp: InsertVecTy, Mask);
9081 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: [E](Value *V) {
9082 return !is_contained(Range: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
9083 }));
9084    // Second cost - permutation with a subvector, if some elements come from
9085    // the initial vector, or the cost of inserting a subvector.
9086 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9087 // subvector of ActualVecTy.
9088 SmallBitVector InMask =
9089 isUndefVector(V: FirstInsert->getOperand(i: 0),
9090 UseMask: buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask));
9091 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9092 if (InsertVecSz != VecSz) {
9093 auto *ActualVecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VecSz);
9094 Cost += TTI->getShuffleCost(Kind: TTI::SK_InsertSubvector, Tp: ActualVecTy,
9095 Mask: std::nullopt, CostKind, Index: OffsetBeg - Offset,
9096 SubTp: InsertVecTy);
9097 } else {
9098 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9099 Mask[I] = InMask.test(Idx: I) ? PoisonMaskElem : I;
9100 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9101 I <= End; ++I)
9102 if (Mask[I] != PoisonMaskElem)
9103 Mask[I] = I + VecSz;
9104 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9105 Mask[I] =
9106 ((I >= InMask.size()) || InMask.test(Idx: I)) ? PoisonMaskElem : I;
9107 Cost +=
9108 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: InsertVecTy, Mask);
9109 }
9110 }
9111 return Cost;
9112 }
9113 case Instruction::ZExt:
9114 case Instruction::SExt:
9115 case Instruction::FPToUI:
9116 case Instruction::FPToSI:
9117 case Instruction::FPExt:
9118 case Instruction::PtrToInt:
9119 case Instruction::IntToPtr:
9120 case Instruction::SIToFP:
9121 case Instruction::UIToFP:
9122 case Instruction::Trunc:
9123 case Instruction::FPTrunc:
9124 case Instruction::BitCast: {
9125 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
9126 Type *SrcScalarTy = VL0->getOperand(i: 0)->getType();
9127 auto *SrcVecTy = FixedVectorType::get(ElementType: SrcScalarTy, NumElts: VL.size());
9128 unsigned Opcode = ShuffleOrOp;
9129 unsigned VecOpcode = Opcode;
9130 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9131 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9132 // Check if the values are candidates to demote.
9133 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy);
9134 if (SrcIt != MinBWs.end()) {
9135 SrcBWSz = SrcIt->second.first;
9136 SrcScalarTy = IntegerType::get(C&: F->getContext(), NumBits: SrcBWSz);
9137 SrcVecTy = FixedVectorType::get(ElementType: SrcScalarTy, NumElts: VL.size());
9138 }
9139 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
9140 if (BWSz == SrcBWSz) {
9141 VecOpcode = Instruction::BitCast;
9142 } else if (BWSz < SrcBWSz) {
9143 VecOpcode = Instruction::Trunc;
9144 } else if (It != MinBWs.end()) {
9145 assert(BWSz > SrcBWSz && "Invalid cast!");
9146 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9147 } else if (SrcIt != MinBWs.end()) {
9148 assert(BWSz > SrcBWSz && "Invalid cast!");
9149 VecOpcode =
9150 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9151 }
9152 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9153 !SrcIt->second.second) {
9154 VecOpcode = Instruction::UIToFP;
9155 }
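    // For example, if MinBWs demoted this node to i8 while its source operands
    // were demoted to i16, then BWSz(8) < SrcBWSz(16), so the vectorized cast
    // becomes a Trunc regardless of the scalar opcode; equal widths turn into
    // a BitCast, which is treated as a no-op below.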
9156 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9157 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9158 return TTI->getCastInstrCost(Opcode, Dst: VL0->getType(),
9159 Src: VL0->getOperand(i: 0)->getType(),
9160 CCH: TTI::getCastContextHint(I: VI), CostKind, I: VI);
9161 };
9162 auto GetVectorCost = [=](InstructionCost CommonCost) {
9163      // Do not count the cost here if minimum bitwidth is in effect and the
9164      // cast is just a bitcast (in that case it is a no-op).
9165 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9166 return CommonCost;
9167 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9168 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(i: 0));
9169 return CommonCost +
9170 TTI->getCastInstrCost(Opcode: VecOpcode, Dst: VecTy, Src: SrcVecTy, CCH, CostKind,
9171 I: VecOpcode == Opcode ? VI : nullptr);
9172 };
9173 return GetCostDiff(GetScalarCost, GetVectorCost);
9174 }
9175 case Instruction::FCmp:
9176 case Instruction::ICmp:
9177 case Instruction::Select: {
9178 CmpInst::Predicate VecPred, SwappedVecPred;
9179 auto MatchCmp = m_Cmp(Pred&: VecPred, L: m_Value(), R: m_Value());
9180 if (match(V: VL0, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) ||
9181 match(V: VL0, P: MatchCmp))
9182 SwappedVecPred = CmpInst::getSwappedPredicate(pred: VecPred);
9183 else
9184 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9185 ? CmpInst::BAD_FCMP_PREDICATE
9186 : CmpInst::BAD_ICMP_PREDICATE;
9187 auto GetScalarCost = [&](unsigned Idx) {
9188 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9189 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9190 ? CmpInst::BAD_FCMP_PREDICATE
9191 : CmpInst::BAD_ICMP_PREDICATE;
9192 auto MatchCmp = m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value());
9193 if ((!match(V: VI, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) &&
9194 !match(V: VI, P: MatchCmp)) ||
9195 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9196 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9197 ? CmpInst::BAD_FCMP_PREDICATE
9198 : CmpInst::BAD_ICMP_PREDICATE;
9199
9200 return TTI->getCmpSelInstrCost(Opcode: E->getOpcode(), ValTy: OrigScalarTy,
9201 CondTy: Builder.getInt1Ty(), VecPred: CurrentPred, CostKind,
9202 I: VI);
9203 };
9204 auto GetVectorCost = [&](InstructionCost CommonCost) {
9205 auto *MaskTy = FixedVectorType::get(ElementType: Builder.getInt1Ty(), NumElts: VL.size());
9206
9207 InstructionCost VecCost = TTI->getCmpSelInstrCost(
9208 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred, CostKind, I: VL0);
9209 // Check if it is possible and profitable to use min/max for selects
9210 // in VL.
9211 //
9212 auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
9213 if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
9214 IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
9215 {VecTy, VecTy});
9216 InstructionCost IntrinsicCost =
9217 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
9218        // If the selects are the only users of the compares, the compares will
9219        // be dead and we can subtract their cost.
9220 if (IntrinsicAndUse.second)
9221 IntrinsicCost -= TTI->getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: VecTy,
9222 CondTy: MaskTy, VecPred, CostKind);
9223 VecCost = std::min(a: VecCost, b: IntrinsicCost);
9224 }
9225 return VecCost + CommonCost;
9226 };
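    // For example, a bundle of `select (icmp slt a, b), a, b` scalars may be
    // reported as an smin by canConvertToMinOrMaxIntrinsic; the vector cost is
    // then the cheaper of the cmp+select form and the intrinsic form, with the
    // compare cost credited back when the selects are its only users.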
9227 return GetCostDiff(GetScalarCost, GetVectorCost);
9228 }
9229 case Instruction::FNeg:
9230 case Instruction::Add:
9231 case Instruction::FAdd:
9232 case Instruction::Sub:
9233 case Instruction::FSub:
9234 case Instruction::Mul:
9235 case Instruction::FMul:
9236 case Instruction::UDiv:
9237 case Instruction::SDiv:
9238 case Instruction::FDiv:
9239 case Instruction::URem:
9240 case Instruction::SRem:
9241 case Instruction::FRem:
9242 case Instruction::Shl:
9243 case Instruction::LShr:
9244 case Instruction::AShr:
9245 case Instruction::And:
9246 case Instruction::Or:
9247 case Instruction::Xor: {
9248 auto GetScalarCost = [&](unsigned Idx) {
9249 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9250 unsigned OpIdx = isa<UnaryOperator>(Val: VI) ? 0 : 1;
9251 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(V: VI->getOperand(i: 0));
9252 TTI::OperandValueInfo Op2Info =
9253 TTI::getOperandInfo(V: VI->getOperand(i: OpIdx));
9254 SmallVector<const Value *> Operands(VI->operand_values());
9255 return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: OrigScalarTy, CostKind,
9256 Opd1Info: Op1Info, Opd2Info: Op2Info, Args: Operands, CxtI: VI);
9257 };
9258 auto GetVectorCost = [=](InstructionCost CommonCost) {
9259 unsigned OpIdx = isa<UnaryOperator>(Val: VL0) ? 0 : 1;
9260 TTI::OperandValueInfo Op1Info = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
9261 TTI::OperandValueInfo Op2Info = getOperandInfo(Ops: E->getOperand(OpIdx));
9262 return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: VecTy, CostKind, Opd1Info: Op1Info,
9263 Opd2Info: Op2Info, Args: std::nullopt, CxtI: nullptr, TLibInfo: TLI) +
9264 CommonCost;
9265 };
9266 return GetCostDiff(GetScalarCost, GetVectorCost);
9267 }
9268 case Instruction::GetElementPtr: {
9269 return CommonCost + GetGEPCostDiff(VL, VL0);
9270 }
9271 case Instruction::Load: {
9272 auto GetScalarCost = [&](unsigned Idx) {
9273 auto *VI = cast<LoadInst>(Val: UniqueValues[Idx]);
9274 return TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: OrigScalarTy,
9275 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
9276 CostKind, OpdInfo: TTI::OperandValueInfo(), I: VI);
9277 };
9278 auto *LI0 = cast<LoadInst>(Val: VL0);
9279 auto GetVectorCost = [&](InstructionCost CommonCost) {
9280 InstructionCost VecLdCost;
9281 if (E->State == TreeEntry::Vectorize) {
9282 VecLdCost = TTI->getMemoryOpCost(
9283 Opcode: Instruction::Load, Src: VecTy, Alignment: LI0->getAlign(),
9284 AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
9285 } else if (E->State == TreeEntry::StridedVectorize) {
9286 Align CommonAlignment =
9287 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
9288 VecLdCost = TTI->getStridedMemoryOpCost(
9289 Opcode: Instruction::Load, DataTy: VecTy, Ptr: LI0->getPointerOperand(),
9290 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
9291 } else {
9292 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9293 Align CommonAlignment =
9294 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
9295 VecLdCost = TTI->getGatherScatterOpCost(
9296 Opcode: Instruction::Load, DataTy: VecTy, Ptr: LI0->getPointerOperand(),
9297 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
9298 }
9299 return VecLdCost + CommonCost;
9300 };
9301
9302 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9303    // If this node generates a masked gather load, it is not a terminal node;
9304    // hence the cost of the address operand is estimated separately.
9305 if (E->State == TreeEntry::ScatterVectorize)
9306 return Cost;
9307
9308    // Estimate the cost of the GEPs, since this tree node is a terminal node.
9309 SmallVector<Value *> PointerOps(VL.size());
9310 for (auto [I, V] : enumerate(First&: VL))
9311 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
9312 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9313 }
9314 case Instruction::Store: {
9315 bool IsReorder = !E->ReorderIndices.empty();
9316 auto GetScalarCost = [=](unsigned Idx) {
9317 auto *VI = cast<StoreInst>(Val: VL[Idx]);
9318 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: VI->getValueOperand());
9319 return TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: OrigScalarTy,
9320 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
9321 CostKind, OpdInfo: OpInfo, I: VI);
9322 };
9323 auto *BaseSI =
9324 cast<StoreInst>(Val: IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9325 auto GetVectorCost = [=](InstructionCost CommonCost) {
9326 // We know that we can merge the stores. Calculate the cost.
9327 TTI::OperandValueInfo OpInfo = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
9328 return TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
9329 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind,
9330 OpdInfo: OpInfo) +
9331 CommonCost;
9332 };
9333 SmallVector<Value *> PointerOps(VL.size());
9334 for (auto [I, V] : enumerate(First&: VL)) {
9335 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9336 PointerOps[Idx] = cast<StoreInst>(Val: V)->getPointerOperand();
9337 }
9338
9339 return GetCostDiff(GetScalarCost, GetVectorCost) +
9340 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9341 }
9342 case Instruction::Call: {
9343 auto GetScalarCost = [&](unsigned Idx) {
9344 auto *CI = cast<CallInst>(Val: UniqueValues[Idx]);
9345 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9346 if (ID != Intrinsic::not_intrinsic) {
9347 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9348 return TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
9349 }
9350 return TTI->getCallInstrCost(F: CI->getCalledFunction(),
9351 RetTy: CI->getFunctionType()->getReturnType(),
9352 Tys: CI->getFunctionType()->params(), CostKind);
9353 };
9354 auto GetVectorCost = [=](InstructionCost CommonCost) {
9355 auto *CI = cast<CallInst>(Val: VL0);
9356 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9357 SmallVector<Type *> ArgTys =
9358 buildIntrinsicArgTypes(CI, ID, VF: VecTy->getNumElements(),
9359 MinBW: It != MinBWs.end() ? It->second.first : 0);
9360 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9361 return std::min(a: VecCallCosts.first, b: VecCallCosts.second) + CommonCost;
9362 };
9363 return GetCostDiff(GetScalarCost, GetVectorCost);
9364 }
9365 case Instruction::ShuffleVector: {
9366 assert(E->isAltShuffle() &&
9367 ((Instruction::isBinaryOp(E->getOpcode()) &&
9368 Instruction::isBinaryOp(E->getAltOpcode())) ||
9369 (Instruction::isCast(E->getOpcode()) &&
9370 Instruction::isCast(E->getAltOpcode())) ||
9371 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9372 "Invalid Shuffle Vector Operand");
9373 // Try to find the previous shuffle node with the same operands and same
9374 // main/alternate ops.
9375 auto TryFindNodeWithEqualOperands = [=]() {
9376 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9377 if (TE.get() == E)
9378 break;
9379 if (TE->isAltShuffle() &&
9380 ((TE->getOpcode() == E->getOpcode() &&
9381 TE->getAltOpcode() == E->getAltOpcode()) ||
9382 (TE->getOpcode() == E->getAltOpcode() &&
9383 TE->getAltOpcode() == E->getOpcode())) &&
9384 TE->hasEqualOperands(TE: *E))
9385 return true;
9386 }
9387 return false;
9388 };
9389 auto GetScalarCost = [&](unsigned Idx) {
9390 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9391 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9392 (void)E;
9393 return TTI->getInstructionCost(U: VI, CostKind);
9394 };
9395    // Need to clear CommonCost since the final shuffle cost is included in the
9396    // vector cost.
9397 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
9398 // VecCost is equal to sum of the cost of creating 2 vectors
9399 // and the cost of creating shuffle.
9400 InstructionCost VecCost = 0;
9401 if (TryFindNodeWithEqualOperands()) {
9402 LLVM_DEBUG({
9403 dbgs() << "SLP: diamond match for alternate node found.\n";
9404 E->dump();
9405 });
9406        // No need to add new vector costs here since we are going to reuse the
9407        // same main/alternate vector ops and just do different shuffling.
9408 } else if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
9409 VecCost =
9410 TTIRef.getArithmeticInstrCost(Opcode: E->getOpcode(), Ty: VecTy, CostKind);
9411 VecCost +=
9412 TTIRef.getArithmeticInstrCost(Opcode: E->getAltOpcode(), Ty: VecTy, CostKind);
9413 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
9414 auto *MaskTy = FixedVectorType::get(ElementType: Builder.getInt1Ty(), NumElts: VL.size());
9415 VecCost = TTIRef.getCmpSelInstrCost(Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy,
9416 VecPred: CI0->getPredicate(), CostKind, I: VL0);
9417 VecCost += TTIRef.getCmpSelInstrCost(
9418 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy,
9419 VecPred: cast<CmpInst>(Val: E->getAltOp())->getPredicate(), CostKind,
9420 I: E->getAltOp());
9421 } else {
9422 Type *SrcSclTy = E->getMainOp()->getOperand(i: 0)->getType();
9423 auto *SrcTy = FixedVectorType::get(ElementType: SrcSclTy, NumElts: VL.size());
9424 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
9425 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
9426 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
9427 unsigned SrcBWSz =
9428 DL->getTypeSizeInBits(Ty: E->getMainOp()->getOperand(i: 0)->getType());
9429 if (SrcIt != MinBWs.end()) {
9430 SrcBWSz = SrcIt->second.first;
9431 SrcSclTy = IntegerType::get(C&: SrcSclTy->getContext(), NumBits: SrcBWSz);
9432 SrcTy = FixedVectorType::get(ElementType: SrcSclTy, NumElts: VL.size());
9433 }
9434 if (BWSz <= SrcBWSz) {
9435 if (BWSz < SrcBWSz)
9436 VecCost =
9437 TTIRef.getCastInstrCost(Opcode: Instruction::Trunc, Dst: VecTy, Src: SrcTy,
9438 CCH: TTI::CastContextHint::None, CostKind);
9439 LLVM_DEBUG({
9440 dbgs()
9441 << "SLP: alternate extension, which should be truncated.\n";
9442 E->dump();
9443 });
9444 return VecCost;
9445 }
9446 }
9447 VecCost = TTIRef.getCastInstrCost(Opcode: E->getOpcode(), Dst: VecTy, Src: SrcTy,
9448 CCH: TTI::CastContextHint::None, CostKind);
9449 VecCost +=
9450 TTIRef.getCastInstrCost(Opcode: E->getAltOpcode(), Dst: VecTy, Src: SrcTy,
9451 CCH: TTI::CastContextHint::None, CostKind);
9452 }
9453 SmallVector<int> Mask;
9454 E->buildAltOpShuffleMask(
9455 IsAltOp: [E](Instruction *I) {
9456 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9457 return I->getOpcode() == E->getAltOpcode();
9458 },
9459 Mask);
9460 VecCost += ::getShuffleCost(TTI: TTIRef, Kind: TargetTransformInfo::SK_PermuteTwoSrc,
9461 Tp: FinalVecTy, Mask);
9462      // Patterns like [fadd,fsub] can be combined into a single instruction
9463      // on x86. Reordering them into [fsub,fadd] blocks this pattern, so we
9464      // need to take their order into account when looking for the most used
9465      // order.
9466 unsigned Opcode0 = E->getOpcode();
9467 unsigned Opcode1 = E->getAltOpcode();
9468 // The opcode mask selects between the two opcodes.
9469 SmallBitVector OpcodeMask(E->Scalars.size(), false);
9470 for (unsigned Lane : seq<unsigned>(Begin: 0, End: E->Scalars.size()))
9471 if (cast<Instruction>(Val: E->Scalars[Lane])->getOpcode() == Opcode1)
9472 OpcodeMask.set(Lane);
9473 // If this pattern is supported by the target then we consider the
9474 // order.
9475 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9476 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9477 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9478 return AltVecCost < VecCost ? AltVecCost : VecCost;
9479 }
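      // For example, for an alternating [fadd,fsub] bundle the opcode mask has
      // the fsub lanes set; if the target reports the pattern as legal (e.g.
      // via an addsub-style instruction), the alternate-instruction cost above
      // may be cheaper than costing both opcodes plus the blending shuffle.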
9480 // TODO: Check the reverse order too.
9481 return VecCost;
9482 };
9483 return GetCostDiff(GetScalarCost, GetVectorCost);
9484 }
9485 default:
9486 llvm_unreachable("Unknown instruction");
9487 }
9488}
9489
9490bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9491 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
9492                    << VectorizableTree.size() << " is fully vectorizable.\n");
9493
9494 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
9495 SmallVector<int> Mask;
9496 return TE->State == TreeEntry::NeedToGather &&
9497 !any_of(Range: TE->Scalars,
9498 P: [this](Value *V) { return EphValues.contains(Ptr: V); }) &&
9499 (allConstant(VL: TE->Scalars) || isSplat(VL: TE->Scalars) ||
9500 TE->Scalars.size() < Limit ||
9501 ((TE->getOpcode() == Instruction::ExtractElement ||
9502 all_of(Range: TE->Scalars, P: IsaPred<ExtractElementInst, UndefValue>)) &&
9503 isFixedVectorShuffle(VL: TE->Scalars, Mask)) ||
9504 (TE->State == TreeEntry::NeedToGather &&
9505 TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
9506 };
9507
9508 // We only handle trees of heights 1 and 2.
9509 if (VectorizableTree.size() == 1 &&
9510 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9511 (ForReduction &&
9512 AreVectorizableGathers(VectorizableTree[0].get(),
9513 VectorizableTree[0]->Scalars.size()) &&
9514 VectorizableTree[0]->getVectorFactor() > 2)))
9515 return true;
9516
9517 if (VectorizableTree.size() != 2)
9518 return false;
9519
9520  // Handle splat and all-constant stores. Also try to vectorize tiny trees
9521  // whose second node is a gather with fewer scalar operands than the initial
9522  // tree entry (it may be profitable to shuffle the second gather), or whose
9523  // scalars are extractelements that form a shuffle.
9524 SmallVector<int> Mask;
9525 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9526 AreVectorizableGathers(VectorizableTree[1].get(),
9527 VectorizableTree[0]->Scalars.size()))
9528 return true;
9529
9530 // Gathering cost would be too much for tiny trees.
9531 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9532 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9533 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9534 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9535 return false;
9536
9537 return true;
9538}
9539
9540static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
9541 TargetTransformInfo *TTI,
9542 bool MustMatchOrInst) {
9543 // Look past the root to find a source value. Arbitrarily follow the
9544 // path through operand 0 of any 'or'. Also, peek through optional
9545 // shift-left-by-multiple-of-8-bits.
9546 Value *ZextLoad = Root;
9547 const APInt *ShAmtC;
9548 bool FoundOr = false;
9549 while (!isa<ConstantExpr>(Val: ZextLoad) &&
9550 (match(V: ZextLoad, P: m_Or(L: m_Value(), R: m_Value())) ||
9551 (match(V: ZextLoad, P: m_Shl(L: m_Value(), R: m_APInt(Res&: ShAmtC))) &&
9552 ShAmtC->urem(RHS: 8) == 0))) {
9553 auto *BinOp = cast<BinaryOperator>(Val: ZextLoad);
9554 ZextLoad = BinOp->getOperand(i_nocapture: 0);
9555 if (BinOp->getOpcode() == Instruction::Or)
9556 FoundOr = true;
9557 }
9558 // Check if the input is an extended load of the required or/shift expression.
9559 Value *Load;
9560 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9561 !match(V: ZextLoad, P: m_ZExt(Op: m_Value(V&: Load))) || !isa<LoadInst>(Val: Load))
9562 return false;
9563
9564 // Require that the total load bit width is a legal integer type.
9565 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
9566 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
9567 Type *SrcTy = Load->getType();
9568 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
9569 if (!TTI->isTypeLegal(Ty: IntegerType::get(C&: Root->getContext(), NumBits: LoadBitWidth)))
9570 return false;
9571
9572 // Everything matched - assume that we can fold the whole sequence using
9573 // load combining.
9574 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
9575 << *(cast<Instruction>(Root)) << "\n");
9576
9577 return true;
9578}
9579
9580bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
9581 if (RdxKind != RecurKind::Or)
9582 return false;
9583
9584 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9585 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9586 return isLoadCombineCandidateImpl(Root: FirstReduced, NumElts, TTI,
9587 /* MatchOr */ MustMatchOrInst: false);
9588}
9589
9590bool BoUpSLP::isLoadCombineCandidate() const {
9591 // Peek through a final sequence of stores and check if all operations are
9592 // likely to be load-combined.
9593 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9594 for (Value *Scalar : VectorizableTree[0]->Scalars) {
9595 Value *X;
9596 if (!match(V: Scalar, P: m_Store(ValueOp: m_Value(V&: X), PointerOp: m_Value())) ||
9597 !isLoadCombineCandidateImpl(Root: X, NumElts, TTI, /* MatchOr */ MustMatchOrInst: true))
9598 return false;
9599 }
9600 return true;
9601}
9602
9603bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
9604 // No need to vectorize inserts of gathered values.
9605 if (VectorizableTree.size() == 2 &&
9606 isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) &&
9607 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9608 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9609 !(isSplat(VL: VectorizableTree[1]->Scalars) ||
9610 allConstant(VL: VectorizableTree[1]->Scalars))))
9611 return true;
9612
9613  // If the graph includes only PHI nodes and gathers, it is definitely not
9614  // profitable for vectorization; we can skip it if the cost threshold is the
9615  // default. The cost of vectorized PHI nodes is almost always 0 plus the cost
9616  // of the gathers/buildvectors.
9617 constexpr int Limit = 4;
9618 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
9619 !VectorizableTree.empty() &&
9620 all_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
9621 return (TE->State == TreeEntry::NeedToGather &&
9622 TE->getOpcode() != Instruction::ExtractElement &&
9623 count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <= Limit) ||
9624 TE->getOpcode() == Instruction::PHI;
9625 }))
9626 return true;
9627
9628 // We can vectorize the tree if its size is greater than or equal to the
9629 // minimum size specified by the MinTreeSize command line option.
9630 if (VectorizableTree.size() >= MinTreeSize)
9631 return false;
9632
9633 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
9634 // can vectorize it if we can prove it fully vectorizable.
9635 if (isFullyVectorizableTinyTree(ForReduction))
9636 return false;
9637
9638  // Check if any of the gather nodes forms an insertelement buildvector
9639 // somewhere.
9640 bool IsAllowedSingleBVNode =
9641 VectorizableTree.size() > 1 ||
9642 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9643 !VectorizableTree.front()->isAltShuffle() &&
9644 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9645 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9646 allSameBlock(VL: VectorizableTree.front()->Scalars));
9647 if (any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
9648 return TE->State == TreeEntry::NeedToGather &&
9649 all_of(Range&: TE->Scalars, P: [&](Value *V) {
9650 return isa<ExtractElementInst, UndefValue>(Val: V) ||
9651 (IsAllowedSingleBVNode &&
9652 !V->hasNUsesOrMore(N: UsesLimit) &&
9653 any_of(Range: V->users(), P: IsaPred<InsertElementInst>));
9654 });
9655 }))
9656 return false;
9657
9658 assert(VectorizableTree.empty()
9659 ? ExternalUses.empty()
9660 : true && "We shouldn't have any external users");
9661
9662 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
9663 // vectorizable.
9664 return true;
9665}
9666
9667InstructionCost BoUpSLP::getSpillCost() const {
9668 // Walk from the bottom of the tree to the top, tracking which values are
9669 // live. When we see a call instruction that is not part of our tree,
9670 // query TTI to see if there is a cost to keeping values live over it
9671 // (for example, if spills and fills are required).
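  // For example, a call may force values that are live across it to be spilled
  // to the stack and reloaded afterwards; TTI::getCostOfKeepingLiveOverCall
  // models that penalty.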
9672 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9673 InstructionCost Cost = 0;
9674
9675 SmallPtrSet<Instruction *, 4> LiveValues;
9676 Instruction *PrevInst = nullptr;
9677
9678 // The entries in VectorizableTree are not necessarily ordered by their
9679 // position in basic blocks. Collect them and order them by dominance so later
9680 // instructions are guaranteed to be visited first. For instructions in
9681 // different basic blocks, we only scan to the beginning of the block, so
9682 // their order does not matter, as long as all instructions in a basic block
9683 // are grouped together. Using dominance ensures a deterministic order.
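  // For example, if block A dominates block B, A's DFS-in number is smaller, so
  // sorting by descending DFS-in numbers (and by comesBefore within a block)
  // visits instructions bottom-up, which is what the liveness walk below needs.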
9684 SmallVector<Instruction *, 16> OrderedScalars;
9685 for (const auto &TEPtr : VectorizableTree) {
9686 if (TEPtr->State != TreeEntry::Vectorize)
9687 continue;
9688 Instruction *Inst = dyn_cast<Instruction>(Val: TEPtr->Scalars[0]);
9689 if (!Inst)
9690 continue;
9691 OrderedScalars.push_back(Elt: Inst);
9692 }
9693 llvm::sort(C&: OrderedScalars, Comp: [&](Instruction *A, Instruction *B) {
9694 auto *NodeA = DT->getNode(BB: A->getParent());
9695 auto *NodeB = DT->getNode(BB: B->getParent());
9696 assert(NodeA && "Should only process reachable instructions");
9697 assert(NodeB && "Should only process reachable instructions");
9698 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9699 "Different nodes should have different DFS numbers");
9700 if (NodeA != NodeB)
9701 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9702 return B->comesBefore(Other: A);
9703 });
9704
9705 for (Instruction *Inst : OrderedScalars) {
9706 if (!PrevInst) {
9707 PrevInst = Inst;
9708 continue;
9709 }
9710
9711 // Update LiveValues.
9712 LiveValues.erase(Ptr: PrevInst);
9713 for (auto &J : PrevInst->operands()) {
9714 if (isa<Instruction>(Val: &*J) && getTreeEntry(V: &*J))
9715 LiveValues.insert(Ptr: cast<Instruction>(Val: &*J));
9716 }
9717
9718 LLVM_DEBUG({
9719 dbgs() << "SLP: #LV: " << LiveValues.size();
9720 for (auto *X : LiveValues)
9721 dbgs() << " " << X->getName();
9722 dbgs() << ", Looking at ";
9723 Inst->dump();
9724 });
9725
9726 // Now find the sequence of instructions between PrevInst and Inst.
9727 unsigned NumCalls = 0;
9728 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
9729 PrevInstIt =
9730 PrevInst->getIterator().getReverse();
9731 while (InstIt != PrevInstIt) {
9732 if (PrevInstIt == PrevInst->getParent()->rend()) {
9733 PrevInstIt = Inst->getParent()->rbegin();
9734 continue;
9735 }
9736
9737 auto NoCallIntrinsic = [this](Instruction *I) {
9738 if (auto *II = dyn_cast<IntrinsicInst>(Val: I)) {
9739 if (II->isAssumeLikeIntrinsic())
9740 return true;
9741 FastMathFlags FMF;
9742 SmallVector<Type *, 4> Tys;
9743 for (auto &ArgOp : II->args())
9744 Tys.push_back(Elt: ArgOp->getType());
9745 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: II))
9746 FMF = FPMO->getFastMathFlags();
9747 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
9748 FMF);
9749 InstructionCost IntrCost =
9750 TTI->getIntrinsicInstrCost(ICA, CostKind: TTI::TCK_RecipThroughput);
9751 InstructionCost CallCost = TTI->getCallInstrCost(
9752 F: nullptr, RetTy: II->getType(), Tys, CostKind: TTI::TCK_RecipThroughput);
9753 if (IntrCost < CallCost)
9754 return true;
9755 }
9756 return false;
9757 };
9758
9759 // Debug information does not impact spill cost.
9760 if (isa<CallBase>(Val: &*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
9761 &*PrevInstIt != PrevInst)
9762 NumCalls++;
9763
9764 ++PrevInstIt;
9765 }
9766
9767 if (NumCalls) {
9768 SmallVector<Type *, 4> V;
9769 for (auto *II : LiveValues) {
9770 auto *ScalarTy = II->getType();
9771 if (auto *VectorTy = dyn_cast<FixedVectorType>(Val: ScalarTy))
9772 ScalarTy = VectorTy->getElementType();
9773 V.push_back(Elt: FixedVectorType::get(ElementType: ScalarTy, NumElts: BundleWidth));
9774 }
9775 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(Tys: V);
9776 }
9777
9778 PrevInst = Inst;
9779 }
9780
9781 return Cost;
9782}
9783
9784/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
9785/// the buildvector sequence.
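/// For example, in a buildvector chain such as
///   %i0 = insertelement <2 x i32> poison, i32 %a, i32 0
///   %i1 = insertelement <2 x i32> %i0, i32 %b, i32 1
/// %i0 is considered first with respect to %i1, since %i1 is built on top of
/// %i0 through its vector operand.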
9786static bool isFirstInsertElement(const InsertElementInst *IE1,
9787 const InsertElementInst *IE2) {
9788 if (IE1 == IE2)
9789 return false;
9790 const auto *I1 = IE1;
9791 const auto *I2 = IE2;
9792 const InsertElementInst *PrevI1;
9793 const InsertElementInst *PrevI2;
9794 unsigned Idx1 = *getInsertIndex(InsertInst: IE1);
9795 unsigned Idx2 = *getInsertIndex(InsertInst: IE2);
9796 do {
9797 if (I2 == IE1)
9798 return true;
9799 if (I1 == IE2)
9800 return false;
9801 PrevI1 = I1;
9802 PrevI2 = I2;
9803 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
9804 getInsertIndex(InsertInst: I1).value_or(u&: Idx2) != Idx2)
9805 I1 = dyn_cast<InsertElementInst>(Val: I1->getOperand(i_nocapture: 0));
9806 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
9807 getInsertIndex(InsertInst: I2).value_or(u&: Idx1) != Idx1)
9808 I2 = dyn_cast<InsertElementInst>(Val: I2->getOperand(i_nocapture: 0));
9809 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
9810 llvm_unreachable("Two different buildvectors not expected.");
9811}
9812
9813namespace {
9814/// Returns the incoming Value * if the requested type is Value * too, or a
9815/// default-constructed value otherwise.
9816struct ValueSelect {
9817 template <typename U>
9818 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
9819 return V;
9820 }
9821 template <typename U>
9822 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
9823 return U();
9824 }
9825};
9826} // namespace
9827
9828/// Does the analysis of the provided shuffle masks and performs the requested
9829/// actions on the vectors with the given shuffle masks. It tries to do it in
9830/// several steps.
9831/// 1. If the Base vector is not an undef vector, resize the very first mask to
9832/// have a common VF and perform the action for 2 input vectors (including the
9833/// non-undef Base). Other shuffle masks are combined with the result of the
9834/// first stage and processed as a shuffle of 2 vectors.
9835/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
9836/// the action only for 1 vector with the given mask, if it is not the identity
9837/// mask.
9838/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
9839/// vectors, combining the masks properly between the steps.
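/// As a rough sketch of step 1: with VF = 4, a fully defined Base and a first
/// mask <0, poison, 2, poison>, the mask is rewritten to <4, 1, 6, 3>, i.e.
/// lanes 0 and 2 are taken from the (resized) first input vector, which is
/// placed at offset VF, while lanes 1 and 3 keep the corresponding Base
/// elements.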
9840template <typename T>
9841static T *performExtractsShuffleAction(
9842 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
9843 function_ref<unsigned(T *)> GetVF,
9844 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
9845 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
9846 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
9847 SmallVector<int> Mask(ShuffleMask.begin()->second);
9848 auto VMIt = std::next(ShuffleMask.begin());
9849 T *Prev = nullptr;
9850 SmallBitVector UseMask =
9851 buildUseMask(VF: Mask.size(), Mask, MaskArg: UseMask::UndefsAsMask);
9852 SmallBitVector IsBaseUndef = isUndefVector(V: Base, UseMask);
9853 if (!IsBaseUndef.all()) {
9854 // Base is not undef, need to combine it with the next subvectors.
9855 std::pair<T *, bool> Res =
9856 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
9857 SmallBitVector IsBasePoison = isUndefVector<true>(V: Base, UseMask);
9858 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
9859 if (Mask[Idx] == PoisonMaskElem)
9860 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
9861 else
9862 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
9863 }
9864 auto *V = ValueSelect::get<T *>(Base);
9865 (void)V;
9866 assert((!V || GetVF(V) == Mask.size()) &&
9867 "Expected base vector of VF number of elements.");
9868 Prev = Action(Mask, {nullptr, Res.first});
9869 } else if (ShuffleMask.size() == 1) {
9870    // Base is undef and only 1 vector is shuffled - perform the action only
9871    // for a single vector, if the mask is not the identity mask.
9872 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
9873 /*ForSingleMask=*/true);
9874 if (Res.second)
9875 // Identity mask is found.
9876 Prev = Res.first;
9877 else
9878 Prev = Action(Mask, {ShuffleMask.begin()->first});
9879 } else {
9880 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
9881 // shuffles step by step, combining shuffle between the steps.
9882 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
9883 unsigned Vec2VF = GetVF(VMIt->first);
9884 if (Vec1VF == Vec2VF) {
9885      // No need to resize the input vectors since they are of the same size;
9886      // we can shuffle them directly.
9887 ArrayRef<int> SecMask = VMIt->second;
9888 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9889 if (SecMask[I] != PoisonMaskElem) {
9890 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9891 Mask[I] = SecMask[I] + Vec1VF;
9892 }
9893 }
9894 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
9895 } else {
9896 // Vectors of different sizes - resize and reshuffle.
9897 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
9898 /*ForSingleMask=*/false);
9899 std::pair<T *, bool> Res2 =
9900 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
9901 ArrayRef<int> SecMask = VMIt->second;
9902 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9903 if (Mask[I] != PoisonMaskElem) {
9904 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9905 if (Res1.second)
9906 Mask[I] = I;
9907 } else if (SecMask[I] != PoisonMaskElem) {
9908 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
9909 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
9910 }
9911 }
9912 Prev = Action(Mask, {Res1.first, Res2.first});
9913 }
9914 VMIt = std::next(VMIt);
9915 }
9916 bool IsBaseNotUndef = !IsBaseUndef.all();
9917 (void)IsBaseNotUndef;
9918 // Perform requested actions for the remaining masks/vectors.
9919 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
9920 // Shuffle other input vectors, if any.
9921 std::pair<T *, bool> Res =
9922 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
9923 ArrayRef<int> SecMask = VMIt->second;
9924 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
9925 if (SecMask[I] != PoisonMaskElem) {
9926 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
9927 "Multiple uses of scalars.");
9928 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
9929 } else if (Mask[I] != PoisonMaskElem) {
9930 Mask[I] = I;
9931 }
9932 }
9933 Prev = Action(Mask, {Prev, Res.first});
9934 }
9935 return Prev;
9936}
9937
9938InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
9939 InstructionCost Cost = 0;
9940 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
9941 << VectorizableTree.size() << ".\n");
9942
9943 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
9944
9945 SmallPtrSet<Value *, 4> CheckedExtracts;
9946 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
9947 TreeEntry &TE = *VectorizableTree[I];
9948 if (TE.State == TreeEntry::NeedToGather) {
9949 if (const TreeEntry *E = getTreeEntry(V: TE.getMainOp());
9950 E && E->getVectorFactor() == TE.getVectorFactor() &&
9951 E->isSame(VL: TE.Scalars)) {
9952        // Some gather nodes might be absolutely the same as some vectorizable
9953        // nodes after reordering; we need to handle that.
9954 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
9955 << shortBundleName(TE.Scalars) << ".\n"
9956 << "SLP: Current total cost = " << Cost << "\n");
9957 continue;
9958 }
9959 }
9960
9961 InstructionCost C = getEntryCost(E: &TE, VectorizedVals, CheckedExtracts);
9962 Cost += C;
9963 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
9964 << shortBundleName(TE.Scalars) << ".\n"
9965 << "SLP: Current total cost = " << Cost << "\n");
9966 }
9967
9968 SmallPtrSet<Value *, 16> ExtractCostCalculated;
9969 InstructionCost ExtractCost = 0;
9970 SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
9971 SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
9972 SmallVector<APInt> DemandedElts;
9973 SmallDenseSet<Value *, 4> UsedInserts;
9974 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
9975 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
9976 for (ExternalUser &EU : ExternalUses) {
9977 // We only add extract cost once for the same scalar.
9978 if (!isa_and_nonnull<InsertElementInst>(Val: EU.User) &&
9979 !ExtractCostCalculated.insert(Ptr: EU.Scalar).second)
9980 continue;
9981
9982 // Uses by ephemeral values are free (because the ephemeral value will be
9983 // removed prior to code generation, and so the extraction will be
9984 // removed as well).
9985 if (EphValues.count(Ptr: EU.User))
9986 continue;
9987
9988    // No extract cost for a vector "scalar".
9989 if (isa<FixedVectorType>(Val: EU.Scalar->getType()))
9990 continue;
9991
9992 // If found user is an insertelement, do not calculate extract cost but try
9993 // to detect it as a final shuffled/identity match.
9994 if (auto *VU = dyn_cast_or_null<InsertElementInst>(Val: EU.User)) {
9995 if (auto *FTy = dyn_cast<FixedVectorType>(Val: VU->getType())) {
9996 if (!UsedInserts.insert(V: VU).second)
9997 continue;
9998 std::optional<unsigned> InsertIdx = getInsertIndex(InsertInst: VU);
9999 if (InsertIdx) {
10000 const TreeEntry *ScalarTE = getTreeEntry(V: EU.Scalar);
10001 auto *It = find_if(
10002 Range&: FirstUsers,
10003 P: [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10004 return areTwoInsertFromSameBuildVector(
10005 VU, V: cast<InsertElementInst>(Val: Pair.first),
10006 GetBaseOperand: [this](InsertElementInst *II) -> Value * {
10007 Value *Op0 = II->getOperand(i_nocapture: 0);
10008 if (getTreeEntry(V: II) && !getTreeEntry(V: Op0))
10009 return nullptr;
10010 return Op0;
10011 });
10012 });
10013 int VecId = -1;
10014 if (It == FirstUsers.end()) {
10015 (void)ShuffleMasks.emplace_back();
10016 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10017 if (Mask.empty())
10018 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
10019 // Find the insertvector, vectorized in tree, if any.
10020 Value *Base = VU;
10021 while (auto *IEBase = dyn_cast<InsertElementInst>(Val: Base)) {
10022 if (IEBase != EU.User &&
10023 (!IEBase->hasOneUse() ||
10024 getInsertIndex(InsertInst: IEBase).value_or(u&: *InsertIdx) == *InsertIdx))
10025 break;
10026 // Build the mask for the vectorized insertelement instructions.
10027 if (const TreeEntry *E = getTreeEntry(V: IEBase)) {
10028 VU = IEBase;
10029 do {
10030 IEBase = cast<InsertElementInst>(Val: Base);
10031 int Idx = *getInsertIndex(InsertInst: IEBase);
10032 assert(Mask[Idx] == PoisonMaskElem &&
10033 "InsertElementInstruction used already.");
10034 Mask[Idx] = Idx;
10035 Base = IEBase->getOperand(i_nocapture: 0);
10036 } while (E == getTreeEntry(V: Base));
10037 break;
10038 }
10039 Base = cast<InsertElementInst>(Val: Base)->getOperand(i_nocapture: 0);
10040 }
10041 FirstUsers.emplace_back(Args&: VU, Args&: ScalarTE);
10042 DemandedElts.push_back(Elt: APInt::getZero(numBits: FTy->getNumElements()));
10043 VecId = FirstUsers.size() - 1;
10044 auto It = MinBWs.find(Val: ScalarTE);
10045 if (It != MinBWs.end() &&
10046 VectorCasts
10047 .insert(V: std::make_pair(x&: ScalarTE, y: FTy->getElementType()))
10048 .second) {
10049 unsigned BWSz = It->second.first;
10050 unsigned DstBWSz = DL->getTypeSizeInBits(Ty: FTy->getElementType());
10051 unsigned VecOpcode;
10052 if (DstBWSz < BWSz)
10053 VecOpcode = Instruction::Trunc;
10054 else
10055 VecOpcode =
10056 It->second.second ? Instruction::SExt : Instruction::ZExt;
10057 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10058 InstructionCost C = TTI->getCastInstrCost(
10059 Opcode: VecOpcode, Dst: FTy,
10060 Src: FixedVectorType::get(
10061 ElementType: IntegerType::get(C&: FTy->getContext(), NumBits: BWSz),
10062 NumElts: FTy->getNumElements()),
10063 CCH: TTI::CastContextHint::None, CostKind);
10064 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10065 << " for extending externally used vector with "
10066 "non-equal minimum bitwidth.\n");
10067 Cost += C;
10068 }
10069 } else {
10070 if (isFirstInsertElement(IE1: VU, IE2: cast<InsertElementInst>(Val: It->first)))
10071 It->first = VU;
10072 VecId = std::distance(first: FirstUsers.begin(), last: It);
10073 }
10074 int InIdx = *InsertIdx;
10075 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10076 if (Mask.empty())
10077 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
10078 Mask[InIdx] = EU.Lane;
10079 DemandedElts[VecId].setBit(InIdx);
10080 continue;
10081 }
10082 }
10083 }
10084    // Leave the GEPs as is; they are free in most cases and it is better to
10085    // keep them as GEPs.
10086 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10087 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: EU.Scalar)) {
10088 if (!ValueToExtUses) {
10089 ValueToExtUses.emplace();
10090 for_each(Range: enumerate(First&: ExternalUses), F: [&](const auto &P) {
10091 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10092 });
10093 }
10094 // Can use original GEP, if no operands vectorized or they are marked as
10095 // externally used already.
10096 bool CanBeUsedAsGEP = all_of(Range: GEP->operands(), P: [&](Value *V) {
10097 if (!getTreeEntry(V))
10098 return true;
10099 auto It = ValueToExtUses->find(Val: V);
10100 if (It != ValueToExtUses->end()) {
10101 // Replace all uses to avoid compiler crash.
10102 ExternalUses[It->second].User = nullptr;
10103 return true;
10104 }
10105 return false;
10106 });
10107 if (CanBeUsedAsGEP) {
10108 ExtractCost += TTI->getInstructionCost(U: GEP, CostKind);
10109 ExternalUsesAsGEPs.insert(Ptr: EU.Scalar);
10110 continue;
10111 }
10112 }
10113
10114 // If we plan to rewrite the tree in a smaller type, we will need to sign
10115 // extend the extracted value back to the original type. Here, we account
10116 // for the extract and the added cost of the sign extend if needed.
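    // For example, if the bundle was narrowed to i8 while the external user
    // expects i32, the cost below is queried via getExtractWithExtendCost
    // rather than as a plain extractelement.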
10117 auto *VecTy = FixedVectorType::get(ElementType: EU.Scalar->getType(), NumElts: BundleWidth);
10118 auto It = MinBWs.find(Val: getTreeEntry(V: EU.Scalar));
10119 if (It != MinBWs.end()) {
10120 auto *MinTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
10121 unsigned Extend =
10122 It->second.second ? Instruction::SExt : Instruction::ZExt;
10123 VecTy = FixedVectorType::get(ElementType: MinTy, NumElts: BundleWidth);
10124 ExtractCost += TTI->getExtractWithExtendCost(Opcode: Extend, Dst: EU.Scalar->getType(),
10125 VecTy, Index: EU.Lane);
10126 } else {
10127 ExtractCost += TTI->getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
10128 CostKind, Index: EU.Lane);
10129 }
10130 }
10131 // Add reduced value cost, if resized.
10132 if (!VectorizedVals.empty()) {
10133 const TreeEntry &Root = *VectorizableTree.front().get();
10134 auto BWIt = MinBWs.find(Val: &Root);
10135 if (BWIt != MinBWs.end()) {
10136 Type *DstTy = Root.Scalars.front()->getType();
10137 unsigned OriginalSz = DL->getTypeSizeInBits(Ty: DstTy);
10138 unsigned SrcSz =
10139 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10140 if (OriginalSz != SrcSz) {
10141 unsigned Opcode = Instruction::Trunc;
10142 if (OriginalSz > SrcSz)
10143 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10144 Type *SrcTy = IntegerType::get(C&: DstTy->getContext(), NumBits: SrcSz);
10145 Cost += TTI->getCastInstrCost(Opcode, Dst: DstTy, Src: SrcTy,
10146 CCH: TTI::CastContextHint::None,
10147 CostKind: TTI::TCK_RecipThroughput);
10148 }
10149 }
10150 }
10151
10152 InstructionCost SpillCost = getSpillCost();
10153 Cost += SpillCost + ExtractCost;
10154 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10155 bool) {
10156 InstructionCost C = 0;
10157 unsigned VF = Mask.size();
10158 unsigned VecVF = TE->getVectorFactor();
10159 if (VF != VecVF &&
10160 (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10161 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF))) {
10162 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10163 std::copy(first: Mask.begin(), last: std::next(x: Mask.begin(), n: std::min(a: VF, b: VecVF)),
10164 result: OrigMask.begin());
10165 C = TTI->getShuffleCost(
10166 Kind: TTI::SK_PermuteSingleSrc,
10167 Tp: FixedVectorType::get(ElementType: TE->getMainOp()->getType(), NumElts: VecVF), Mask: OrigMask);
10168 LLVM_DEBUG(
10169 dbgs() << "SLP: Adding cost " << C
10170 << " for final shuffle of insertelement external users.\n";
10171 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10172 Cost += C;
10173 return std::make_pair(x&: TE, y: true);
10174 }
10175 return std::make_pair(x&: TE, y: false);
10176 };
10177 // Calculate the cost of the reshuffled vectors, if any.
10178 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10179 Value *Base = cast<Instruction>(Val: FirstUsers[I].first)->getOperand(i: 0);
10180 auto Vector = ShuffleMasks[I].takeVector();
10181 unsigned VF = 0;
10182 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10183 ArrayRef<const TreeEntry *> TEs) {
10184 assert((TEs.size() == 1 || TEs.size() == 2) &&
10185 "Expected exactly 1 or 2 tree entries.");
10186 if (TEs.size() == 1) {
10187 if (VF == 0)
10188 VF = TEs.front()->getVectorFactor();
10189 auto *FTy =
10190 FixedVectorType::get(ElementType: TEs.back()->Scalars.front()->getType(), NumElts: VF);
10191 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF) &&
10192 !all_of(Range: enumerate(First&: Mask), P: [=](const auto &Data) {
10193 return Data.value() == PoisonMaskElem ||
10194 (Data.index() < VF &&
10195 static_cast<int>(Data.index()) == Data.value());
10196 })) {
10197 InstructionCost C =
10198 TTI->getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: FTy, Mask);
10199 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10200 << " for final shuffle of insertelement "
10201 "external users.\n";
10202 TEs.front()->dump();
10203 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10204 Cost += C;
10205 }
10206 } else {
10207 if (VF == 0) {
10208 if (TEs.front() &&
10209 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10210 VF = TEs.front()->getVectorFactor();
10211 else
10212 VF = Mask.size();
10213 }
10214 auto *FTy =
10215 FixedVectorType::get(ElementType: TEs.back()->Scalars.front()->getType(), NumElts: VF);
10216 InstructionCost C =
10217 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: FTy, Mask);
10218 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10219 << " for final shuffle of vector node and external "
10220 "insertelement users.\n";
10221 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10222 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10223 Cost += C;
10224 }
10225 VF = Mask.size();
10226 return TEs.back();
10227 };
10228 (void)performExtractsShuffleAction<const TreeEntry>(
10229 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()), Base,
10230 GetVF: [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeAction: ResizeToVF,
10231 Action: EstimateShufflesCost);
10232 InstructionCost InsertCost = TTI->getScalarizationOverhead(
10233 Ty: cast<FixedVectorType>(Val: FirstUsers[I].first->getType()), DemandedElts: DemandedElts[I],
10234 /*Insert*/ true, /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
10235 Cost -= InsertCost;
10236 }
10237
10238 // Add the cost for reduced value resize (if required).
10239 if (ReductionBitWidth != 0) {
10240 assert(UserIgnoreList && "Expected reduction tree.");
10241 const TreeEntry &E = *VectorizableTree.front().get();
10242 auto It = MinBWs.find(Val: &E);
10243 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10244 unsigned SrcSize = It->second.first;
10245 unsigned DstSize = ReductionBitWidth;
10246 unsigned Opcode = Instruction::Trunc;
10247 if (SrcSize < DstSize)
10248 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10249 auto *SrcVecTy =
10250 FixedVectorType::get(ElementType: Builder.getIntNTy(N: SrcSize), NumElts: E.getVectorFactor());
10251 auto *DstVecTy =
10252 FixedVectorType::get(ElementType: Builder.getIntNTy(N: DstSize), NumElts: E.getVectorFactor());
10253 TTI::CastContextHint CCH = getCastContextHint(TE: E);
10254 InstructionCost CastCost;
10255 switch (E.getOpcode()) {
10256 case Instruction::SExt:
10257 case Instruction::ZExt:
10258 case Instruction::Trunc: {
10259 const TreeEntry *OpTE = getOperandEntry(E: &E, Idx: 0);
10260 CCH = getCastContextHint(TE: *OpTE);
10261 break;
10262 }
10263 default:
10264 break;
10265 }
10266 CastCost += TTI->getCastInstrCost(Opcode, Dst: DstVecTy, Src: SrcVecTy, CCH,
10267 CostKind: TTI::TCK_RecipThroughput);
10268 Cost += CastCost;
10269 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10270 << " for final resize for reduction from " << SrcVecTy
10271 << " to " << DstVecTy << "\n";
10272 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10273 }
10274 }
10275
10276#ifndef NDEBUG
10277 SmallString<256> Str;
10278 {
10279 raw_svector_ostream OS(Str);
10280 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10281 << "SLP: Extract Cost = " << ExtractCost << ".\n"
10282 << "SLP: Total Cost = " << Cost << ".\n";
10283 }
10284 LLVM_DEBUG(dbgs() << Str);
10285 if (ViewSLPTree)
10286 ViewGraph(G: this, Name: "SLP" + F->getName(), ShortNames: false, Title: Str);
10287#endif
10288
10289 return Cost;
10290}
10291
10292/// Tries to find extractelement instructions with constant indices from a
10293/// fixed vector type and gather such instructions into a bunch, which will
10294/// highly likely be detected as a shuffle of 1 or 2 input vectors. If the
10295/// attempt is successful, the matched scalars are replaced by poison values
10296/// in \p VL for future analysis.
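/// For example, gathering
///   { extractelement <4 x i32> %v, i32 0, extractelement <4 x i32> %v, i32 2 }
/// can be treated as a single-source shuffle of %v with mask <0, 2>, so both
/// scalars are replaced by poison in \p VL and the mask records the lanes.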
10297std::optional<TTI::ShuffleKind>
10298BoUpSLP::tryToGatherSingleRegisterExtractElements(
10299 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10300 // Scan list of gathered scalars for extractelements that can be represented
10301 // as shuffles.
10302 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
10303 SmallVector<int> UndefVectorExtracts;
10304 for (int I = 0, E = VL.size(); I < E; ++I) {
10305 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
10306 if (!EI) {
10307 if (isa<UndefValue>(Val: VL[I]))
10308 UndefVectorExtracts.push_back(Elt: I);
10309 continue;
10310 }
10311 auto *VecTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
10312 if (!VecTy || !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()))
10313 continue;
10314 std::optional<unsigned> Idx = getExtractIndex(E: EI);
10315 // Undefined index.
10316 if (!Idx) {
10317 UndefVectorExtracts.push_back(Elt: I);
10318 continue;
10319 }
10320 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10321 ExtractMask.reset(Idx: *Idx);
10322 if (isUndefVector(V: EI->getVectorOperand(), UseMask: ExtractMask).all()) {
10323 UndefVectorExtracts.push_back(Elt: I);
10324 continue;
10325 }
10326 VectorOpToIdx[EI->getVectorOperand()].push_back(Elt: I);
10327 }
10328 // Sort the vector operands by the maximum number of uses in extractelements.
10329 MapVector<unsigned, SmallVector<Value *>> VFToVector;
10330 for (const auto &Data : VectorOpToIdx)
10331 VFToVector[cast<FixedVectorType>(Val: Data.first->getType())->getNumElements()]
10332 .push_back(Elt: Data.first);
10333 for (auto &Data : VFToVector) {
10334 stable_sort(Range&: Data.second, C: [&VectorOpToIdx](Value *V1, Value *V2) {
10335 return VectorOpToIdx.find(Key: V1)->second.size() >
10336 VectorOpToIdx.find(Key: V2)->second.size();
10337 });
10338 }
10339 // Find the best pair of the vectors with the same number of elements or a
10340 // single vector.
10341 const int UndefSz = UndefVectorExtracts.size();
10342 unsigned SingleMax = 0;
10343 Value *SingleVec = nullptr;
10344 unsigned PairMax = 0;
10345 std::pair<Value *, Value *> PairVec(nullptr, nullptr);
10346 for (auto &Data : VFToVector) {
10347 Value *V1 = Data.second.front();
10348 if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
10349 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10350 SingleVec = V1;
10351 }
10352 Value *V2 = nullptr;
10353 if (Data.second.size() > 1)
10354 V2 = *std::next(x: Data.second.begin());
10355 if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
10356 UndefSz) {
10357 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
10358 PairVec = std::make_pair(x&: V1, y&: V2);
10359 }
10360 }
10361 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10362 return std::nullopt;
10363  // Check if it is better to perform a shuffle of 2 vectors or just of a
10364  // single vector.
10365 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10366 SmallVector<Value *> GatheredExtracts(
10367 VL.size(), PoisonValue::get(T: VL.front()->getType()));
10368 if (SingleMax >= PairMax && SingleMax) {
10369 for (int Idx : VectorOpToIdx[SingleVec])
10370 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
10371 } else {
10372 for (Value *V : {PairVec.first, PairVec.second})
10373 for (int Idx : VectorOpToIdx[V])
10374 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
10375 }
10376 // Add extracts from undefs too.
10377 for (int Idx : UndefVectorExtracts)
10378 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
10379  // Check that the gather of extractelements can be represented as just a
10380  // shuffle of one or two vectors from which the scalars are extracted.
10381 std::optional<TTI::ShuffleKind> Res =
10382 isFixedVectorShuffle(VL: GatheredExtracts, Mask);
10383 if (!Res) {
10384 // TODO: try to check other subsets if possible.
10385 // Restore the original VL if attempt was not successful.
10386 copy(Range&: SavedVL, Out: VL.begin());
10387 return std::nullopt;
10388 }
10389 // Restore unused scalars from mask, if some of the extractelements were not
10390 // selected for shuffle.
10391 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10392 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(Val: GatheredExtracts[I]) &&
10393 isa<UndefValue>(Val: GatheredExtracts[I])) {
10394 std::swap(a&: VL[I], b&: GatheredExtracts[I]);
10395 continue;
10396 }
10397 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
10398 if (!EI || !isa<FixedVectorType>(Val: EI->getVectorOperandType()) ||
10399 !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()) ||
10400 is_contained(Range&: UndefVectorExtracts, Element: I))
10401 continue;
10402 }
10403 return Res;
10404}
10405
10406/// Tries to find extractelement instructions with constant indices from a
10407/// fixed vector type and gather such instructions into a bunch, which will
10408/// highly likely be detected as a shuffle of 1 or 2 input vectors. If the
10409/// attempt is successful, the matched scalars are replaced by poison values
10410/// in \p VL for future analysis.
10411SmallVector<std::optional<TTI::ShuffleKind>>
10412BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10413 SmallVectorImpl<int> &Mask,
10414 unsigned NumParts) const {
10415  assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
10416 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10417 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
10418 unsigned SliceSize = VL.size() / NumParts;
10419 for (unsigned Part = 0; Part < NumParts; ++Part) {
10420 // Scan list of gathered scalars for extractelements that can be represented
10421 // as shuffles.
10422 MutableArrayRef<Value *> SubVL =
10423 MutableArrayRef(VL).slice(N: Part * SliceSize, M: SliceSize);
10424 SmallVector<int> SubMask;
10425 std::optional<TTI::ShuffleKind> Res =
10426 tryToGatherSingleRegisterExtractElements(VL: SubVL, Mask&: SubMask);
10427 ShufflesRes[Part] = Res;
10428 copy(Range&: SubMask, Out: std::next(x: Mask.begin(), n: Part * SliceSize));
10429 }
10430 if (none_of(Range&: ShufflesRes, P: [](const std::optional<TTI::ShuffleKind> &Res) {
10431 return Res.has_value();
10432 }))
10433 ShufflesRes.clear();
10434 return ShufflesRes;
10435}
10436
10437std::optional<TargetTransformInfo::ShuffleKind>
10438BoUpSLP::isGatherShuffledSingleRegisterEntry(
10439 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10440 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10441 Entries.clear();
10442 // TODO: currently checking only for Scalars in the tree entry, need to count
10443 // reused elements too for better cost estimation.
10444 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10445 const Instruction *TEInsertPt = &getLastInstructionInBundle(E: TEUseEI.UserTE);
10446 const BasicBlock *TEInsertBlock = nullptr;
10447 // Main node of PHI entries keeps the correct order of operands/incoming
10448 // blocks.
10449 if (auto *PHI = dyn_cast<PHINode>(Val: TEUseEI.UserTE->getMainOp())) {
10450 TEInsertBlock = PHI->getIncomingBlock(i: TEUseEI.EdgeIdx);
10451 TEInsertPt = TEInsertBlock->getTerminator();
10452 } else {
10453 TEInsertBlock = TEInsertPt->getParent();
10454 }
10455 if (!DT->isReachableFromEntry(A: TEInsertBlock))
10456 return std::nullopt;
10457 auto *NodeUI = DT->getNode(BB: TEInsertBlock);
10458 assert(NodeUI && "Should only process reachable instructions");
10459 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10460 auto CheckOrdering = [&](const Instruction *InsertPt) {
10461 // Argument InsertPt is an instruction where vector code for some other
10462 // tree entry (one that shares one or more scalars with TE) is going to be
10463 // generated. This lambda returns true if insertion point of vector code
10464 // for the TE dominates that point (otherwise dependency is the other way
10465 // around). The other node is not limited to be of a gather kind. Gather
10466 // nodes are not scheduled and their vector code is inserted before their
10467 // first user. If user is PHI, that is supposed to be at the end of a
10468 // predecessor block. Otherwise it is the last instruction among scalars of
10469 // the user node. So, instead of checking dependency between instructions
10470 // themselves, we check dependency between their insertion points for vector
10471 // code (since each scalar instruction ends up as a lane of a vector
10472 // instruction).
10473 const BasicBlock *InsertBlock = InsertPt->getParent();
10474 auto *NodeEUI = DT->getNode(BB: InsertBlock);
10475 if (!NodeEUI)
10476 return false;
10477 assert((NodeUI == NodeEUI) ==
10478 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10479 "Different nodes should have different DFS numbers");
10480 // Check the order of the gather nodes users.
10481 if (TEInsertPt->getParent() != InsertBlock &&
10482 (DT->dominates(A: NodeUI, B: NodeEUI) || !DT->dominates(A: NodeEUI, B: NodeUI)))
10483 return false;
10484 if (TEInsertPt->getParent() == InsertBlock &&
10485 TEInsertPt->comesBefore(Other: InsertPt))
10486 return false;
10487 return true;
10488 };
10489 // Find all tree entries used by the gathered values. If no common entries
10490 // found - not a shuffle.
10491  // Here we build a set of tree nodes for each gathered value and try to
10492  // find the intersection between these sets. If we have at least one common
10493  // tree node for each gathered value, we have just a permutation of a
10494  // single vector. If we have 2 different sets, we are in a situation where
10495  // we have a permutation of 2 input vectors.
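  // For example, if every gathered value comes from vectorized entry E1, the
  // gather is just a permutation of E1's vector; if the values split between
  // E1 and E2, it becomes a two-source shuffle of E1 and E2.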
10496 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10497 DenseMap<Value *, int> UsedValuesEntry;
10498 for (Value *V : VL) {
10499 if (isConstant(V))
10500 continue;
10501 // Build a list of tree entries where V is used.
10502 SmallPtrSet<const TreeEntry *, 4> VToTEs;
10503 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(Val: V)->second) {
10504 if (TEPtr == TE)
10505 continue;
10506 assert(any_of(TEPtr->Scalars,
10507 [&](Value *V) { return GatheredScalars.contains(V); }) &&
10508 "Must contain at least single gathered value.");
10509 assert(TEPtr->UserTreeIndices.size() == 1 &&
10510 "Expected only single user of a gather node.");
10511 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10512
10513 PHINode *UserPHI = dyn_cast<PHINode>(Val: UseEI.UserTE->getMainOp());
10514 const Instruction *InsertPt =
10515 UserPHI ? UserPHI->getIncomingBlock(i: UseEI.EdgeIdx)->getTerminator()
10516 : &getLastInstructionInBundle(E: UseEI.UserTE);
10517 if (TEInsertPt == InsertPt) {
10518        // If 2 gathers are operands of the same entry (regardless of whether
10519        // the user is a PHI or not), compare operand indices and use the
10520        // earlier one as the base.
10521 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10522 continue;
10523 // If the user instruction is used for some reason in different
10524 // vectorized nodes - make it depend on index.
10525 if (TEUseEI.UserTE != UseEI.UserTE &&
10526 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10527 continue;
10528 }
10529
10530      // Check if the user node of the TE comes after the user node of TEPtr;
10531      // otherwise TEPtr depends on TE.
10532 if ((TEInsertBlock != InsertPt->getParent() ||
10533 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10534 !CheckOrdering(InsertPt))
10535 continue;
10536 VToTEs.insert(Ptr: TEPtr);
10537 }
10538 if (const TreeEntry *VTE = getTreeEntry(V)) {
10539 if (ForOrder) {
10540 if (VTE->State != TreeEntry::Vectorize) {
10541 auto It = MultiNodeScalars.find(Val: V);
10542 if (It == MultiNodeScalars.end())
10543 continue;
10544 VTE = *It->getSecond().begin();
10545 // Iterate through all vectorized nodes.
10546 auto *MIt = find_if(Range&: It->getSecond(), P: [](const TreeEntry *MTE) {
10547 return MTE->State == TreeEntry::Vectorize;
10548 });
10549 if (MIt == It->getSecond().end())
10550 continue;
10551 VTE = *MIt;
10552 }
10553 }
10554 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
10555 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10556 continue;
10557 VToTEs.insert(Ptr: VTE);
10558 }
10559 if (VToTEs.empty())
10560 continue;
10561 if (UsedTEs.empty()) {
10562 // The first iteration, just insert the list of nodes to vector.
10563 UsedTEs.push_back(Elt: VToTEs);
10564 UsedValuesEntry.try_emplace(Key: V, Args: 0);
10565 } else {
10566      // Need to check if there are any previously used tree nodes which use V.
10567      // If there are no such nodes, consider that we have one more input
10568      // vector.
10569 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
10570 unsigned Idx = 0;
10571 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
10572 // Do we have a non-empty intersection of previously listed tree entries
10573 // and tree entries using current V?
10574 set_intersect(S1&: VToTEs, S2: Set);
10575 if (!VToTEs.empty()) {
10576 // Yes, write the new subset and continue analysis for the next
10577 // scalar.
10578 Set.swap(RHS&: VToTEs);
10579 break;
10580 }
10581 VToTEs = SavedVToTEs;
10582 ++Idx;
10583 }
10584 // No non-empty intersection found - need to add a second set of possible
10585 // source vectors.
10586 if (Idx == UsedTEs.size()) {
10587 // If the number of input vectors is greater than 2 - not a permutation,
10588 // fallback to the regular gather.
10589 // TODO: support multiple reshuffled nodes.
10590 if (UsedTEs.size() == 2)
10591 continue;
10592 UsedTEs.push_back(Elt: SavedVToTEs);
10593 Idx = UsedTEs.size() - 1;
10594 }
10595 UsedValuesEntry.try_emplace(Key: V, Args&: Idx);
10596 }
10597 }
10598
10599 if (UsedTEs.empty()) {
10600 Entries.clear();
10601 return std::nullopt;
10602 }
10603
10604 unsigned VF = 0;
10605 if (UsedTEs.size() == 1) {
10606 // Keep the order to avoid non-determinism.
10607 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
10608 UsedTEs.front().end());
10609 sort(C&: FirstEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
10610 return TE1->Idx < TE2->Idx;
10611 });
10612 // Try to find the perfect match in another gather node at first.
10613 auto *It = find_if(Range&: FirstEntries, P: [=](const TreeEntry *EntryPtr) {
10614 return EntryPtr->isSame(VL) || EntryPtr->isSame(VL: TE->Scalars);
10615 });
10616 if (It != FirstEntries.end() &&
10617 ((*It)->getVectorFactor() == VL.size() ||
10618 ((*It)->getVectorFactor() == TE->Scalars.size() &&
10619 TE->ReuseShuffleIndices.size() == VL.size() &&
10620 (*It)->isSame(VL: TE->Scalars)))) {
10621 Entries.push_back(Elt: *It);
10622 if ((*It)->getVectorFactor() == VL.size()) {
10623 std::iota(first: std::next(x: Mask.begin(), n: Part * VL.size()),
10624 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: 0);
10625 } else {
10626 SmallVector<int> CommonMask = TE->getCommonMask();
10627 copy(Range&: CommonMask, Out: Mask.begin());
10628 }
10629 // Clear undef scalars.
10630 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10631 if (isa<PoisonValue>(Val: VL[I]))
10632 Mask[I] = PoisonMaskElem;
10633 return TargetTransformInfo::SK_PermuteSingleSrc;
10634 }
10635 // No perfect match, just shuffle, so choose the first tree node from the
10636 // tree.
10637 Entries.push_back(Elt: FirstEntries.front());
10638 } else {
10639 // Try to find nodes with the same vector factor.
10640 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
10641 // Keep the order of tree nodes to avoid non-determinism.
10642 DenseMap<int, const TreeEntry *> VFToTE;
10643 for (const TreeEntry *TE : UsedTEs.front()) {
10644 unsigned VF = TE->getVectorFactor();
10645 auto It = VFToTE.find(Val: VF);
10646 if (It != VFToTE.end()) {
10647 if (It->second->Idx > TE->Idx)
10648 It->getSecond() = TE;
10649 continue;
10650 }
10651 VFToTE.try_emplace(Key: VF, Args&: TE);
10652 }
10653 // Same, keep the order to avoid non-determinism.
10654 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
10655 UsedTEs.back().end());
10656 sort(C&: SecondEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
10657 return TE1->Idx < TE2->Idx;
10658 });
10659 for (const TreeEntry *TE : SecondEntries) {
10660 auto It = VFToTE.find(Val: TE->getVectorFactor());
10661 if (It != VFToTE.end()) {
10662 VF = It->first;
10663 Entries.push_back(Elt: It->second);
10664 Entries.push_back(Elt: TE);
10665 break;
10666 }
10667 }
10668 // No 2 source vectors with the same vector factor - just choose 2 with max
10669 // index.
10670 if (Entries.empty()) {
10671 Entries.push_back(Elt: *llvm::max_element(
10672 Range&: UsedTEs.front(), C: [](const TreeEntry *TE1, const TreeEntry *TE2) {
10673 return TE1->Idx < TE2->Idx;
10674 }));
10675 Entries.push_back(Elt: SecondEntries.front());
10676 VF = std::max(a: Entries.front()->getVectorFactor(),
10677 b: Entries.back()->getVectorFactor());
10678 }
10679 }
10680
10681 bool IsSplatOrUndefs = isSplat(VL) || all_of(Range&: VL, P: IsaPred<UndefValue>);
10682  // Checks if the 2 PHIs are compatible in terms of a high possibility of
10683  // being vectorized.
10684 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
10685 auto *PHI = cast<PHINode>(Val: V);
10686 auto *PHI1 = cast<PHINode>(Val: V1);
10687    // Check that all incoming values are compatible/from the same parent (if
10688    // they are instructions).
10689    // The incoming values are compatible if they all are constants, or
10690    // instructions with the same/alternate opcodes from the same basic block.
10691 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
10692 Value *In = PHI->getIncomingValue(i: I);
10693 Value *In1 = PHI1->getIncomingValue(i: I);
10694 if (isConstant(V: In) && isConstant(V: In1))
10695 continue;
10696 if (!getSameOpcode(VL: {In, In1}, TLI: *TLI).getOpcode())
10697 return false;
10698 if (cast<Instruction>(Val: In)->getParent() !=
10699 cast<Instruction>(Val: In1)->getParent())
10700 return false;
10701 }
10702 return true;
10703 };
10704  // Check if the value can be ignored during analysis for shuffled gathers.
10705  // We suppose it is better to ignore instructions that do not form splats,
10706  // are not vectorized or not extractelements (these instructions will be
10707  // handled by extractelement processing) or may form a vector node in future.
10708 auto MightBeIgnored = [=](Value *V) {
10709 auto *I = dyn_cast<Instruction>(Val: V);
10710 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(Val: I) &&
10711 !isVectorLikeInstWithConstOps(V: I) &&
10712 !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList) && isSimple(I);
10713 };
10714  // Check that the neighbor instruction may form a full vector node with the
10715  // current instruction V. It is possible if they have the same/alternate
10716  // opcode and the same parent basic block.
10717 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
10718 Value *V1 = VL[Idx];
10719 bool UsedInSameVTE = false;
10720 auto It = UsedValuesEntry.find(Val: V1);
10721 if (It != UsedValuesEntry.end())
10722 UsedInSameVTE = It->second == UsedValuesEntry.find(Val: V)->second;
10723 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
10724 getSameOpcode(VL: {V, V1}, TLI: *TLI).getOpcode() &&
10725 cast<Instruction>(Val: V)->getParent() ==
10726 cast<Instruction>(Val: V1)->getParent() &&
10727 (!isa<PHINode>(Val: V1) || AreCompatiblePHIs(V, V1));
10728 };
10729 // Build a shuffle mask for better cost estimation and vector emission.
10730 SmallBitVector UsedIdxs(Entries.size());
10731 SmallVector<std::pair<unsigned, int>> EntryLanes;
10732 for (int I = 0, E = VL.size(); I < E; ++I) {
10733 Value *V = VL[I];
10734 auto It = UsedValuesEntry.find(Val: V);
10735 if (It == UsedValuesEntry.end())
10736 continue;
10737 // Do not try to shuffle scalars, if they are constants, or instructions
10738 // that can be vectorized as a result of the following vector build
10739 // vectorization.
10740 if (isConstant(V) || (MightBeIgnored(V) &&
10741 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
10742 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
10743 continue;
10744 unsigned Idx = It->second;
10745 EntryLanes.emplace_back(Args&: Idx, Args&: I);
10746 UsedIdxs.set(Idx);
10747 }
10748 // Iterate through all shuffled scalars and select entries, which can be used
10749 // for final shuffle.
10750 SmallVector<const TreeEntry *> TempEntries;
10751 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
10752 if (!UsedIdxs.test(Idx: I))
10753 continue;
10754    // Fix the entry number for the given scalar. If it is the first entry, set
10755    // Pair.first to 0, otherwise to 1 (currently we select at most 2 nodes).
10756    // These indices are used as the vector offset when calculating the final
10757    // shuffle mask.
10758 for (std::pair<unsigned, int> &Pair : EntryLanes)
10759 if (Pair.first == I)
10760 Pair.first = TempEntries.size();
10761 TempEntries.push_back(Elt: Entries[I]);
10762 }
10763 Entries.swap(RHS&: TempEntries);
10764 if (EntryLanes.size() == Entries.size() &&
10765 !VL.equals(RHS: ArrayRef(TE->Scalars)
10766 .slice(N: Part * VL.size(),
10767 M: std::min<int>(a: VL.size(), b: TE->Scalars.size())))) {
10768    // We may have here 1 or 2 entries only. If the number of scalars is equal
10769    // to the number of entries, there is no need to do the analysis, it is not
10770    // very profitable. Since VL is not the same as TE->Scalars, it means we
10771    // already have some shuffles before. Cut off the unprofitable case.
10772 Entries.clear();
10773 return std::nullopt;
10774 }
10775 // Build the final mask, check for the identity shuffle, if possible.
10776 bool IsIdentity = Entries.size() == 1;
10777 // Pair.first is the offset to the vector, while Pair.second is the index of
10778 // scalar in the list.
10779 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
10780 unsigned Idx = Part * VL.size() + Pair.second;
10781 Mask[Idx] =
10782 Pair.first * VF +
10783 (ForOrder ? std::distance(
10784 first: Entries[Pair.first]->Scalars.begin(),
10785 last: find(Range: Entries[Pair.first]->Scalars, Val: VL[Pair.second]))
10786 : Entries[Pair.first]->findLaneForValue(V: VL[Pair.second]));
10787 IsIdentity &= Mask[Idx] == Pair.second;
10788 }
10789 switch (Entries.size()) {
10790 case 1:
10791 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
10792 return TargetTransformInfo::SK_PermuteSingleSrc;
10793 break;
10794 case 2:
10795 if (EntryLanes.size() > 2 || VL.size() <= 2)
10796 return TargetTransformInfo::SK_PermuteTwoSrc;
10797 break;
10798 default:
10799 break;
10800 }
10801 Entries.clear();
10802 // Clear the corresponding mask elements.
10803 std::fill(first: std::next(x: Mask.begin(), n: Part * VL.size()),
10804 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: PoisonMaskElem);
10805 return std::nullopt;
10806}
10807
10808SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
10809BoUpSLP::isGatherShuffledEntry(
10810 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
10811 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
10812 bool ForOrder) {
10813 assert(NumParts > 0 && NumParts < VL.size() &&
10814 "Expected positive number of registers.");
10815 Entries.clear();
10816 // No need to check for the topmost gather node.
10817 if (TE == VectorizableTree.front().get())
10818 return {};
10819 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
10820 if (TE->isNonPowOf2Vec())
10821 return {};
10822 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
10823 assert(TE->UserTreeIndices.size() == 1 &&
10824 "Expected only single user of the gather node.");
10825 assert(VL.size() % NumParts == 0 &&
10826 "Number of scalars must be divisible by NumParts.");
10827 unsigned SliceSize = VL.size() / NumParts;
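  // For example, 8 scalars split into NumParts = 2 gives SliceSize = 4; each
  // 4-wide slice below is analyzed as an independent single-register shuffle.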
10828 SmallVector<std::optional<TTI::ShuffleKind>> Res;
10829 for (unsigned Part = 0; Part < NumParts; ++Part) {
10830 ArrayRef<Value *> SubVL = VL.slice(N: Part * SliceSize, M: SliceSize);
10831 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
10832 std::optional<TTI::ShuffleKind> SubRes =
10833 isGatherShuffledSingleRegisterEntry(TE, VL: SubVL, Mask, Entries&: SubEntries, Part,
10834 ForOrder);
10835 if (!SubRes)
10836 SubEntries.clear();
10837 Res.push_back(Elt: SubRes);
10838 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
10839 SubEntries.front()->getVectorFactor() == VL.size() &&
10840 (SubEntries.front()->isSame(VL: TE->Scalars) ||
10841 SubEntries.front()->isSame(VL))) {
10842 SmallVector<const TreeEntry *> LocalSubEntries;
10843 LocalSubEntries.swap(RHS&: SubEntries);
10844 Entries.clear();
10845 Res.clear();
10846 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
10847 // Clear undef scalars.
10848 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10849 if (isa<PoisonValue>(Val: VL[I]))
10850 Mask[I] = PoisonMaskElem;
10851 Entries.emplace_back(Args: 1, Args&: LocalSubEntries.front());
10852 Res.push_back(Elt: TargetTransformInfo::SK_PermuteSingleSrc);
10853 return Res;
10854 }
10855 }
10856 if (all_of(Range&: Res,
10857 P: [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
10858 Entries.clear();
10859 return {};
10860 }
10861 return Res;
10862}
10863
10864InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL,
10865 bool ForPoisonSrc) const {
10866 // Find the type of the operands in VL.
10867 Type *ScalarTy = VL[0]->getType();
10868 if (StoreInst *SI = dyn_cast<StoreInst>(Val: VL[0]))
10869 ScalarTy = SI->getValueOperand()->getType();
10870 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VL.size());
10871 bool DuplicateNonConst = false;
10872 // Find the cost of inserting/extracting values from the vector.
10873 // Check if the same elements are inserted several times and count them as
10874 // shuffle candidates.
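  // For example, gathering {%a, %b, %a, %c} costs inserts only for the first
  // occurrences of %a, %b and %c; the duplicated lane is then covered by the
  // final single-source shuffle with mask <0, 1, 0, 3>.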
10875 APInt ShuffledElements = APInt::getZero(numBits: VL.size());
10876 DenseMap<Value *, unsigned> UniqueElements;
10877 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10878 InstructionCost Cost;
10879 auto EstimateInsertCost = [&](unsigned I, Value *V) {
10880 if (!ForPoisonSrc)
10881 Cost +=
10882 TTI->getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind,
10883 Index: I, Op0: Constant::getNullValue(Ty: VecTy), Op1: V);
10884 };
10885 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10886 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
10887 Value *V = VL[I];
10888 // No need to shuffle duplicates for constants.
10889 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(Val: V)) {
10890 ShuffledElements.setBit(I);
10891 ShuffleMask[I] = isa<PoisonValue>(Val: V) ? PoisonMaskElem : I;
10892 continue;
10893 }
10894
10895 auto Res = UniqueElements.try_emplace(Key: V, Args&: I);
10896 if (Res.second) {
10897 EstimateInsertCost(I, V);
10898 ShuffleMask[I] = I;
10899 continue;
10900 }
10901
10902 DuplicateNonConst = true;
10903 ShuffledElements.setBit(I);
10904 ShuffleMask[I] = Res.first->second;
10905 }
10906 if (ForPoisonSrc)
10907 Cost =
10908 TTI->getScalarizationOverhead(Ty: VecTy, DemandedElts: ~ShuffledElements, /*Insert*/ true,
10909 /*Extract*/ false, CostKind);
10910 if (DuplicateNonConst)
10911 Cost += TTI->getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc,
10912 Tp: VecTy, Mask: ShuffleMask);
10913 return Cost;
10914}
10915
10916// Perform operand reordering on the instructions in VL and return the reordered
10917// operands in Left and Right.
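// For illustration (the operands below are hypothetical): given the bundle
// {%x + %a, %b + %x}, commutativity allows the reordering to produce
// Left = {%x, %x} and Right = {%a, %b}, so the splatted %x ends up in a single
// operand vector and the other operand becomes a plain gather.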
10918void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
10919 SmallVectorImpl<Value *> &Left,
10920 SmallVectorImpl<Value *> &Right,
10921 const BoUpSLP &R) {
10922 if (VL.empty())
10923 return;
10924 VLOperands Ops(VL, R);
10925 // Reorder the operands in place.
10926 Ops.reorder();
10927 Left = Ops.getVL(OpIdx: 0);
10928 Right = Ops.getVL(OpIdx: 1);
10929}
10930
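// Note (simplified, illustrative): for a scheduled bundle the instruction
// returned below is the bundle member that comes last in program order, found
// via the scheduler data; entries that do not need scheduling may instead use
// the first instruction, and a brute-force scan of the block is the fallback.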
10931Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
10932 auto &Res = EntryToLastInstruction.FindAndConstruct(Key: E);
10933 if (Res.second)
10934 return *Res.second;
10935 // Get the basic block this bundle is in. All instructions in the bundle
10936 // should be in this block (except for extractelement-like instructions with
10937 // constant indices).
10938 auto *Front = E->getMainOp();
10939 auto *BB = Front->getParent();
10940 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
10941 if (E->getOpcode() == Instruction::GetElementPtr &&
10942 !isa<GetElementPtrInst>(V))
10943 return true;
10944 auto *I = cast<Instruction>(V);
10945 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
10946 isVectorLikeInstWithConstOps(I);
10947 }));
10948
10949 auto FindLastInst = [&]() {
10950 Instruction *LastInst = Front;
10951 for (Value *V : E->Scalars) {
10952 auto *I = dyn_cast<Instruction>(Val: V);
10953 if (!I)
10954 continue;
10955 if (LastInst->getParent() == I->getParent()) {
10956 if (LastInst->comesBefore(Other: I))
10957 LastInst = I;
10958 continue;
10959 }
10960 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10961 !isa<GetElementPtrInst>(I)) ||
10962 (isVectorLikeInstWithConstOps(LastInst) &&
10963 isVectorLikeInstWithConstOps(I))) &&
10964 "Expected vector-like or non-GEP in GEP node insts only.");
10965 if (!DT->isReachableFromEntry(A: LastInst->getParent())) {
10966 LastInst = I;
10967 continue;
10968 }
10969 if (!DT->isReachableFromEntry(A: I->getParent()))
10970 continue;
10971 auto *NodeA = DT->getNode(BB: LastInst->getParent());
10972 auto *NodeB = DT->getNode(BB: I->getParent());
10973 assert(NodeA && "Should only process reachable instructions");
10974 assert(NodeB && "Should only process reachable instructions");
10975 assert((NodeA == NodeB) ==
10976 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10977 "Different nodes should have different DFS numbers");
10978 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
10979 LastInst = I;
10980 }
10981 BB = LastInst->getParent();
10982 return LastInst;
10983 };
10984
10985 auto FindFirstInst = [&]() {
10986 Instruction *FirstInst = Front;
10987 for (Value *V : E->Scalars) {
10988 auto *I = dyn_cast<Instruction>(Val: V);
10989 if (!I)
10990 continue;
10991 if (FirstInst->getParent() == I->getParent()) {
10992 if (I->comesBefore(Other: FirstInst))
10993 FirstInst = I;
10994 continue;
10995 }
10996 assert(((E->getOpcode() == Instruction::GetElementPtr &&
10997 !isa<GetElementPtrInst>(I)) ||
10998 (isVectorLikeInstWithConstOps(FirstInst) &&
10999 isVectorLikeInstWithConstOps(I))) &&
11000 "Expected vector-like or non-GEP in GEP node insts only.");
11001 if (!DT->isReachableFromEntry(A: FirstInst->getParent())) {
11002 FirstInst = I;
11003 continue;
11004 }
11005 if (!DT->isReachableFromEntry(A: I->getParent()))
11006 continue;
11007 auto *NodeA = DT->getNode(BB: FirstInst->getParent());
11008 auto *NodeB = DT->getNode(BB: I->getParent());
11009 assert(NodeA && "Should only process reachable instructions");
11010 assert(NodeB && "Should only process reachable instructions");
11011 assert((NodeA == NodeB) ==
11012 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11013 "Different nodes should have different DFS numbers");
11014 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11015 FirstInst = I;
11016 }
11017 return FirstInst;
11018 };
11019
11020 // Set the insert point to the beginning of the basic block if the entry
11021 // should not be scheduled.
11022 if (doesNotNeedToSchedule(VL: E->Scalars) ||
11023 (E->State != TreeEntry::NeedToGather &&
11024 all_of(Range: E->Scalars, P: isVectorLikeInstWithConstOps))) {
11025 if ((E->getOpcode() == Instruction::GetElementPtr &&
11026 any_of(Range: E->Scalars,
11027 P: [](Value *V) {
11028 return !isa<GetElementPtrInst>(Val: V) && isa<Instruction>(Val: V);
11029 })) ||
11030 all_of(Range: E->Scalars,
11031 P: [](Value *V) {
11032 return !isVectorLikeInstWithConstOps(V) &&
11033 isUsedOutsideBlock(V);
11034 }) ||
11035 (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
11036 all_of(Range: E->Scalars, P: [](Value *V) {
11037 return isa<ExtractElementInst, UndefValue>(Val: V) ||
11038 areAllOperandsNonInsts(V);
11039 })))
11040 Res.second = FindLastInst();
11041 else
11042 Res.second = FindFirstInst();
11043 return *Res.second;
11044 }
11045
11046 // Find the last instruction. The common case should be that BB has been
11047 // scheduled, and the last instruction is VL.back(). So we start with
11048 // VL.back() and iterate over schedule data until we reach the end of the
11049 // bundle. The end of the bundle is marked by null ScheduleData.
11050 if (BlocksSchedules.count(Key: BB)) {
11051 Value *V = E->isOneOf(Op: E->Scalars.back());
11052 if (doesNotNeedToBeScheduled(V))
11053 V = *find_if_not(Range: E->Scalars, P: doesNotNeedToBeScheduled);
11054 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11055 if (Bundle && Bundle->isPartOfBundle())
11056 for (; Bundle; Bundle = Bundle->NextInBundle)
11057 if (Bundle->OpValue == Bundle->Inst)
11058 Res.second = Bundle->Inst;
11059 }
11060
11061 // LastInst can still be null at this point if there's either not an entry
11062 // for BB in BlocksSchedules or there's no ScheduleData available for
11063 // VL.back(). This can be the case if buildTree_rec aborts for various
11064 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11065 // size is reached, etc.). ScheduleData is initialized in the scheduling
11066 // "dry-run".
11067 //
11068 // If this happens, we can still find the last instruction by brute force. We
11069 // iterate forwards from Front (inclusive) until we either see all
11070 // instructions in the bundle or reach the end of the block. If Front is the
11071 // last instruction in program order, LastInst will be set to Front, and we
11072 // will visit all the remaining instructions in the block.
11073 //
11074 // One of the reasons we exit early from buildTree_rec is to place an upper
11075 // bound on compile-time. Thus, taking an additional compile-time hit here is
11076 // not ideal. However, this should be exceedingly rare since it requires that
11077 // we both exit early from buildTree_rec and that the bundle be out-of-order
11078 // (causing us to iterate all the way to the end of the block).
11079 if (!Res.second)
11080 Res.second = FindLastInst();
11081 assert(Res.second && "Failed to find last instruction in bundle");
11082 return *Res.second;
11083}
11084
11085void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11086 auto *Front = E->getMainOp();
11087 Instruction *LastInst = &getLastInstructionInBundle(E);
11088 assert(LastInst && "Failed to find last instruction in bundle");
11089 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11090 // If the instruction is a PHI, set the insert point after all the PHIs.
11091 bool IsPHI = isa<PHINode>(Val: LastInst);
11092 if (IsPHI)
11093 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11094 if (IsPHI || (E->State != TreeEntry::NeedToGather &&
11095 doesNotNeedToSchedule(VL: E->Scalars))) {
11096 Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: LastInstIt);
11097 } else {
11098 // Set the insertion point after the last instruction in the bundle. Set the
11099 // debug location to Front.
11100 Builder.SetInsertPoint(
11101 TheBB: LastInst->getParent(),
11102 IP: LastInst->getNextNonDebugInstruction()->getIterator());
11103 }
11104 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11105}
11106
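// Builds the gather sequence for VL by inserting the scalars one by one into a
// poison (or Root) vector: constants are inserted first, then the remaining
// unique scalars, and instructions that live in the insertion block or inside
// the current loop are postponed to the very end, so the loop-invariant prefix
// of the sequence has a better chance of being hoisted later.
// Illustrative example (values are made up, %ld assumed to be defined outside
// the loop): gathering {%ld, 7, %loop_phi} emits the insert of 7 first, then
// %ld, and finally the postponed %loop_phi.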
11107Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root) {
11108 // List of instructions/lanes from the current block and/or the blocks that
11109 // are part of the current loop. These instructions will be inserted at the
11110 // end, to make it possible to optimize loops and hoist invariant
11111 // instructions out of the loop's body with better chances for success.
11112 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
11113 SmallSet<int, 4> PostponedIndices;
11114 Loop *L = LI->getLoopFor(BB: Builder.GetInsertBlock());
11115 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11116 SmallPtrSet<BasicBlock *, 4> Visited;
11117 while (InsertBB && InsertBB != InstBB && Visited.insert(Ptr: InsertBB).second)
11118 InsertBB = InsertBB->getSinglePredecessor();
11119 return InsertBB && InsertBB == InstBB;
11120 };
11121 for (int I = 0, E = VL.size(); I < E; ++I) {
11122 if (auto *Inst = dyn_cast<Instruction>(Val: VL[I]))
11123 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11124 getTreeEntry(V: Inst) ||
11125 (L && (!Root || L->isLoopInvariant(V: Root)) && L->contains(Inst))) &&
11126 PostponedIndices.insert(V: I).second)
11127 PostponedInsts.emplace_back(Args&: Inst, Args&: I);
11128 }
11129
11130 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11131 Type *Ty) {
11132 Value *Scalar = V;
11133 if (cast<VectorType>(Val: Vec->getType())->getElementType() != Ty) {
11134 assert(V->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11135 "Expected integer types only.");
11136 Vec = Builder.CreateIntCast(
11137 V: Vec,
11138 DestTy: VectorType::get(ElementType: Ty,
11139 EC: cast<VectorType>(Val: Vec->getType())->getElementCount()),
11140 isSigned: !isKnownNonNegative(V: Vec, SQ: SimplifyQuery(*DL)));
11141 }
11142
11143 Vec = Builder.CreateInsertElement(Vec, NewElt: Scalar, Idx: Builder.getInt32(C: Pos));
11144 auto *InsElt = dyn_cast<InsertElementInst>(Val: Vec);
11145 if (!InsElt)
11146 return Vec;
11147 GatherShuffleExtractSeq.insert(X: InsElt);
11148 CSEBlocks.insert(V: InsElt->getParent());
11149 // Add to our 'need-to-extract' list.
11150 if (isa<Instruction>(Val: V)) {
11151 if (TreeEntry *Entry = getTreeEntry(V)) {
11152 // Find which lane we need to extract.
11153 User *UserOp = nullptr;
11154 if (Scalar != V) {
11155 if (auto *SI = dyn_cast<Instruction>(Val: Scalar))
11156 UserOp = SI;
11157 } else {
11158 UserOp = InsElt;
11159 }
11160 if (UserOp) {
11161 unsigned FoundLane = Entry->findLaneForValue(V);
11162 ExternalUses.emplace_back(Args&: V, Args&: UserOp, Args&: FoundLane);
11163 }
11164 }
11165 }
11166 return Vec;
11167 };
11168 Value *Val0 =
11169 isa<StoreInst>(Val: VL[0]) ? cast<StoreInst>(Val: VL[0])->getValueOperand() : VL[0];
11170 Type *ScalarTy = Val0->getType();
11171 FixedVectorType *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VL.size());
11172 Value *Vec = Root ? Root : PoisonValue::get(T: VecTy);
11173 SmallVector<int> NonConsts;
11174 // Insert the constant values first.
11175 for (int I = 0, E = VL.size(); I < E; ++I) {
11176 if (PostponedIndices.contains(V: I))
11177 continue;
11178 if (!isConstant(V: VL[I])) {
11179 NonConsts.push_back(Elt: I);
11180 continue;
11181 }
11182 if (Root) {
11183 if (!isa<UndefValue>(Val: VL[I])) {
11184 NonConsts.push_back(Elt: I);
11185 continue;
11186 }
11187 if (isa<PoisonValue>(Val: VL[I]))
11188 continue;
11189 if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Root)) {
11190 if (SV->getMaskValue(Elt: I) == PoisonMaskElem)
11191 continue;
11192 }
11193 }
11194 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11195 }
11196 // Insert non-constant values.
11197 for (int I : NonConsts)
11198 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11199 // Append the instructions that are (or may be) part of the loop at the end,
11200 // to make it possible to hoist the non-loop-based instructions.
11201 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11202 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11203
11204 return Vec;
11205}
11206
11207 /// Merges shuffle masks and emits the final shuffle instruction, if required.
11208 /// It supports shuffling of 2 input vectors. It implements lazy shuffle
11209 /// emission: the actual shuffle instruction is generated only if it is
11210 /// actually required. Otherwise, the shuffle instruction emission is delayed
11211 /// until the end of the process, to reduce the number of emitted instructions
11212 /// and to simplify further analysis/transformations.
11213 /// The class will also look through the previously emitted shuffle
11214 /// instructions and properly mark indices in the mask as undef.
11215/// For example, given the code
11216/// \code
11217/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11218/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11219/// \endcode
11220 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
11221 /// will look through %s1 and %s2 and emit
11222/// \code
11223/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11224/// \endcode
11225/// instead.
11226 /// If the 2 operands are of different sizes, the smaller one will be resized
11227 /// and the mask recalculated properly.
11228/// For example, given the code
11229/// \code
11230/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11231/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11232/// \endcode
11233 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
11234 /// will look through %s1 and %s2 and emit
11235/// \code
11236/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11237/// \endcode
11238/// instead.
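/// A typical (illustrative) use of the builder, with hypothetical values and
/// masks, mirrors the pattern used elsewhere in this file:
/// \code
///   ShuffleInstructionBuilder ShuffleBuilder(Builder, R);
///   ShuffleBuilder.add(V, Mask);            // no shuffle emitted yet
///   Value *Res = ShuffleBuilder.finalize(ReuseShuffleIndices);
/// \endcode
/// The actual shufflevector instructions are only created inside finalize()
/// (or earlier, when a third input forces the first two to be combined).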
11239class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11240 bool IsFinalized = false;
11241 /// Combined mask for all applied operands and masks. It is built during
11242 /// analysis and actual emission of shuffle vector instructions.
11243 SmallVector<int> CommonMask;
11244 /// List of operands for the shuffle vector instruction. It holds at most 2
11245 /// operands. If a 3rd one is going to be added, the first 2 are combined into
11246 /// a shuffle with the \p CommonMask mask, the first operand is set to be the
11247 /// resulting shuffle and the second operand is set to be the newly added
11248 /// operand. The \p CommonMask is transformed accordingly after that.
11249 SmallVector<Value *, 2> InVectors;
11250 IRBuilderBase &Builder;
11251 BoUpSLP &R;
11252
11253 class ShuffleIRBuilder {
11254 IRBuilderBase &Builder;
11255 /// Holds all of the instructions that we gathered.
11256 SetVector<Instruction *> &GatherShuffleExtractSeq;
11257 /// A list of blocks that we are going to CSE.
11258 DenseSet<BasicBlock *> &CSEBlocks;
11259 /// Data layout.
11260 const DataLayout &DL;
11261
11262 public:
11263 ShuffleIRBuilder(IRBuilderBase &Builder,
11264 SetVector<Instruction *> &GatherShuffleExtractSeq,
11265 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11266 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11267 CSEBlocks(CSEBlocks), DL(DL) {}
11268 ~ShuffleIRBuilder() = default;
11269 /// Creates shufflevector for the 2 operands with the given mask.
11270 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11271 if (V1->getType() != V2->getType()) {
11272 assert(V1->getType()->isIntOrIntVectorTy() &&
11273 V2->getType()->isIntOrIntVectorTy() &&
11274 "Expected integer vector types only.");
11275 if (V1->getType() != V2->getType()) {
11276 if (cast<VectorType>(Val: V2->getType())
11277 ->getElementType()
11278 ->getIntegerBitWidth() < cast<VectorType>(Val: V1->getType())
11279 ->getElementType()
11280 ->getIntegerBitWidth())
11281 V2 = Builder.CreateIntCast(
11282 V: V2, DestTy: V1->getType(), isSigned: !isKnownNonNegative(V: V2, SQ: SimplifyQuery(DL)));
11283 else
11284 V1 = Builder.CreateIntCast(
11285 V: V1, DestTy: V2->getType(), isSigned: !isKnownNonNegative(V: V1, SQ: SimplifyQuery(DL)));
11286 }
11287 }
11288 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11289 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
11290 GatherShuffleExtractSeq.insert(X: I);
11291 CSEBlocks.insert(V: I->getParent());
11292 }
11293 return Vec;
11294 }
11295 /// Creates a permutation of the single vector operand with the given mask,
11296 /// if it is not an identity mask.
11297 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11298 if (Mask.empty())
11299 return V1;
11300 unsigned VF = Mask.size();
11301 unsigned LocalVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
11302 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF))
11303 return V1;
11304 Value *Vec = Builder.CreateShuffleVector(V: V1, Mask);
11305 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
11306 GatherShuffleExtractSeq.insert(X: I);
11307 CSEBlocks.insert(V: I->getParent());
11308 }
11309 return Vec;
11310 }
11311 Value *createIdentity(Value *V) { return V; }
11312 Value *createPoison(Type *Ty, unsigned VF) {
11313 return PoisonValue::get(T: FixedVectorType::get(ElementType: Ty, NumElts: VF));
11314 }
11315 /// Resizes the 2 input vectors to match their sizes, if they are not equal
11316 /// yet. The smaller vector is resized to the size of the larger vector.
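/// For example (illustrative), pairing a <2 x i32> with a <4 x i32> widens the
/// former via a shufflevector with mask <0, 1, poison, poison>, so that both
/// operands end up as <4 x i32>.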
11317 void resizeToMatch(Value *&V1, Value *&V2) {
11318 if (V1->getType() == V2->getType())
11319 return;
11320 int V1VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
11321 int V2VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements();
11322 int VF = std::max(a: V1VF, b: V2VF);
11323 int MinVF = std::min(a: V1VF, b: V2VF);
11324 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11325 std::iota(first: IdentityMask.begin(), last: std::next(x: IdentityMask.begin(), n: MinVF),
11326 value: 0);
11327 Value *&Op = MinVF == V1VF ? V1 : V2;
11328 Op = Builder.CreateShuffleVector(V: Op, Mask: IdentityMask);
11329 if (auto *I = dyn_cast<Instruction>(Val: Op)) {
11330 GatherShuffleExtractSeq.insert(X: I);
11331 CSEBlocks.insert(V: I->getParent());
11332 }
11333 if (MinVF == V1VF)
11334 V1 = Op;
11335 else
11336 V2 = Op;
11337 }
11338 };
11339
11340 /// Smart shuffle instruction emission, walks through shuffle trees and
11341 /// tries to find the best matching vector for the actual shuffle
11342 /// instruction.
11343 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11344 assert(V1 && "Expected at least one vector value.");
11345 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11346 R.CSEBlocks, *R.DL);
11347 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11348 Builder&: ShuffleBuilder);
11349 }
11350
11351 /// Transforms the \p CommonMask mask according to the given \p Mask, so that
11352 /// it is correct after the shuffle has been emitted.
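/// For example (illustrative), once the shuffle for CommonMask
/// <1, poison, 3, 2> has been emitted, the mask is rewritten to
/// <0, poison, 2, 3>: every used element is now provided by the freshly
/// created vector at its own position.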
11353 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11354 ArrayRef<int> Mask) {
11355 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11356 if (Mask[Idx] != PoisonMaskElem)
11357 CommonMask[Idx] = Idx;
11358 }
11359
11360public:
11361 ShuffleInstructionBuilder(IRBuilderBase &Builder, BoUpSLP &R)
11362 : Builder(Builder), R(R) {}
11363
11364 /// Adjusts extractelements after reusing them.
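/// Illustrative sketch: if the gathered bundle consists of extractelements
/// taken from two different <4 x float> sources and NumParts == 2, each
/// 4-element part is shuffled from its own base vector(s) and the parts are
/// then joined into one wide vector; \p Mask is rewritten accordingly and
/// \p UseVecBaseAsInput tells the caller to use the returned vector as the
/// single shuffle input. The <4 x float> type here is only an example.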
11365 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11366 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11367 unsigned NumParts, bool &UseVecBaseAsInput) {
11368 UseVecBaseAsInput = false;
11369 SmallPtrSet<Value *, 4> UniqueBases;
11370 Value *VecBase = nullptr;
11371 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11372 int Idx = Mask[I];
11373 if (Idx == PoisonMaskElem)
11374 continue;
11375 auto *EI = cast<ExtractElementInst>(Val: E->Scalars[I]);
11376 VecBase = EI->getVectorOperand();
11377 if (const TreeEntry *TE = R.getTreeEntry(V: VecBase))
11378 VecBase = TE->VectorizedValue;
11379 assert(VecBase && "Expected vectorized value.");
11380 UniqueBases.insert(Ptr: VecBase);
11381 // If the only use is vectorized, we can delete the extractelement
11382 // itself.
11383 if (!EI->hasOneUse() || (NumParts != 1 && count(Range: E->Scalars, Element: EI) > 1) ||
11384 any_of(Range: EI->users(), P: [&](User *U) {
11385 const TreeEntry *UTE = R.getTreeEntry(V: U);
11386 return !UTE || R.MultiNodeScalars.contains(Val: U) ||
11387 count_if(Range&: R.VectorizableTree,
11388 P: [&](const std::unique_ptr<TreeEntry> &TE) {
11389 return any_of(Range&: TE->UserTreeIndices,
11390 P: [&](const EdgeInfo &Edge) {
11391 return Edge.UserTE == UTE;
11392 }) &&
11393 is_contained(Range&: TE->Scalars, Element: EI);
11394 }) != 1;
11395 }))
11396 continue;
11397 R.eraseInstruction(I: EI);
11398 }
11399 if (NumParts == 1 || UniqueBases.size() == 1)
11400 return VecBase;
11401 UseVecBaseAsInput = true;
11402 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11403 for (auto [I, Idx] : enumerate(First&: Mask))
11404 if (Idx != PoisonMaskElem)
11405 Idx = I;
11406 };
11407 // Perform a multi-register vector shuffle, joining the parts into a single
11408 // virtual long vector.
11409 // Each part needs to be shuffled independently and then inserted into the
11410 // long virtual vector register, forming the original vector.
11411 Value *Vec = nullptr;
11412 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11413 unsigned SliceSize = E->Scalars.size() / NumParts;
11414 for (unsigned Part = 0; Part < NumParts; ++Part) {
11415 ArrayRef<Value *> VL =
11416 ArrayRef(E->Scalars).slice(N: Part * SliceSize, M: SliceSize);
11417 MutableArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: SliceSize);
11418 constexpr int MaxBases = 2;
11419 SmallVector<Value *, MaxBases> Bases(MaxBases);
11420#ifndef NDEBUG
11421 int PrevSize = 0;
11422#endif // NDEBUG
11423 for (const auto [I, V] : enumerate(First&: VL)) {
11424 if (SubMask[I] == PoisonMaskElem)
11425 continue;
11426 Value *VecOp = cast<ExtractElementInst>(Val: V)->getVectorOperand();
11427 if (const TreeEntry *TE = R.getTreeEntry(V: VecOp))
11428 VecOp = TE->VectorizedValue;
11429 assert(VecOp && "Expected vectorized value.");
11430 const int Size =
11431 cast<FixedVectorType>(Val: VecOp->getType())->getNumElements();
11432#ifndef NDEBUG
11433 assert((PrevSize == Size || PrevSize == 0) &&
11434 "Expected vectors of the same size.");
11435 PrevSize = Size;
11436#endif // NDEBUG
11437 Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
11438 }
11439 if (!Bases.front())
11440 continue;
11441 Value *SubVec;
11442 if (Bases.back()) {
11443 SubVec = createShuffle(V1: Bases.front(), V2: Bases.back(), Mask: SubMask);
11444 TransformToIdentity(SubMask);
11445 } else {
11446 SubVec = Bases.front();
11447 }
11448 if (!Vec) {
11449 Vec = SubVec;
11450 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11451 [&](unsigned P) {
11452 ArrayRef<int> SubMask =
11453 Mask.slice(P * SliceSize, SliceSize);
11454 return all_of(SubMask, [](int Idx) {
11455 return Idx == PoisonMaskElem;
11456 });
11457 })) &&
11458 "Expected first part or all previous parts masked.");
11459 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
11460 } else {
11461 unsigned VF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
11462 if (Vec->getType() != SubVec->getType()) {
11463 unsigned SubVecVF =
11464 cast<FixedVectorType>(Val: SubVec->getType())->getNumElements();
11465 VF = std::max(a: VF, b: SubVecVF);
11466 }
11467 // Adjust SubMask.
11468 for (int &Idx : SubMask)
11469 if (Idx != PoisonMaskElem)
11470 Idx += VF;
11471 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
11472 Vec = createShuffle(V1: Vec, V2: SubVec, Mask: VecMask);
11473 TransformToIdentity(VecMask);
11474 }
11475 }
11476 copy(Range&: VecMask, Out: Mask.begin());
11477 return Vec;
11478 }
11479 /// Checks if the specified entry \p E needs to be delayed because of its
11480 /// dependency nodes.
11481 std::optional<Value *>
11482 needToDelay(const TreeEntry *E,
11483 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
11484 // No need to delay emission if all deps are ready.
11485 if (all_of(Range&: Deps, P: [](ArrayRef<const TreeEntry *> TEs) {
11486 return all_of(
11487 Range&: TEs, P: [](const TreeEntry *TE) { return TE->VectorizedValue; });
11488 }))
11489 return std::nullopt;
11490 // Postpone the gather emission; it will be emitted after the end of the
11491 // process to keep the correct order.
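// The value created below is only a type-correct placeholder (an aligned
// load from a poison pointer); it is expected to be replaced once the
// postponed gather is actually emitted.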
11492 auto *VecTy = FixedVectorType::get(ElementType: E->Scalars.front()->getType(),
11493 NumElts: E->getVectorFactor());
11494 return Builder.CreateAlignedLoad(
11495 Ty: VecTy, Ptr: PoisonValue::get(T: PointerType::getUnqual(C&: VecTy->getContext())),
11496 Align: MaybeAlign());
11497 }
11498 /// Adds 2 input vectors (in form of tree entries) and the mask for their
11499 /// shuffling.
11500 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
11501 add(V1: E1.VectorizedValue, V2: E2.VectorizedValue, Mask);
11502 }
11503 /// Adds single input vector (in form of tree entry) and the mask for its
11504 /// shuffling.
11505 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
11506 add(V1: E1.VectorizedValue, Mask);
11507 }
11508 /// Adds 2 input vectors and the mask for their shuffling.
11509 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
11510 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
11511 if (InVectors.empty()) {
11512 InVectors.push_back(Elt: V1);
11513 InVectors.push_back(Elt: V2);
11514 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
11515 return;
11516 }
11517 Value *Vec = InVectors.front();
11518 if (InVectors.size() == 2) {
11519 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
11520 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
11521 } else if (cast<FixedVectorType>(Val: Vec->getType())->getNumElements() !=
11522 Mask.size()) {
11523 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
11524 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
11525 }
11526 V1 = createShuffle(V1, V2, Mask);
11527 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11528 if (Mask[Idx] != PoisonMaskElem)
11529 CommonMask[Idx] = Idx + Sz;
11530 InVectors.front() = Vec;
11531 if (InVectors.size() == 2)
11532 InVectors.back() = V1;
11533 else
11534 InVectors.push_back(Elt: V1);
11535 }
11536 /// Adds one more input vector and the mask for the shuffling.
11537 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
11538 if (InVectors.empty()) {
11539 if (!isa<FixedVectorType>(Val: V1->getType())) {
11540 V1 = createShuffle(V1, V2: nullptr, Mask: CommonMask);
11541 CommonMask.assign(NumElts: Mask.size(), Elt: PoisonMaskElem);
11542 transformMaskAfterShuffle(CommonMask, Mask);
11543 }
11544 InVectors.push_back(Elt: V1);
11545 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
11546 return;
11547 }
11548 const auto *It = find(Range&: InVectors, Val: V1);
11549 if (It == InVectors.end()) {
11550 if (InVectors.size() == 2 ||
11551 InVectors.front()->getType() != V1->getType() ||
11552 !isa<FixedVectorType>(Val: V1->getType())) {
11553 Value *V = InVectors.front();
11554 if (InVectors.size() == 2) {
11555 V = createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
11556 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
11557 } else if (cast<FixedVectorType>(Val: V->getType())->getNumElements() !=
11558 CommonMask.size()) {
11559 V = createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
11560 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
11561 }
11562 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11563 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
11564 CommonMask[Idx] =
11565 V->getType() != V1->getType()
11566 ? Idx + Sz
11567 : Mask[Idx] + cast<FixedVectorType>(Val: V1->getType())
11568 ->getNumElements();
11569 if (V->getType() != V1->getType())
11570 V1 = createShuffle(V1, V2: nullptr, Mask);
11571 InVectors.front() = V;
11572 if (InVectors.size() == 2)
11573 InVectors.back() = V1;
11574 else
11575 InVectors.push_back(Elt: V1);
11576 return;
11577 }
11578 // Check if the second vector is really required, i.e. if it provides
11579 // elements that are not already covered by the first one.
11580 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11581 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
11582 InVectors.push_back(Elt: V1);
11583 break;
11584 }
11585 }
11586 int VF = CommonMask.size();
11587 if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType()))
11588 VF = FTy->getNumElements();
11589 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11590 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
11591 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
11592 }
11593 /// Adds one more input vector and the mask for the shuffling.
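/// For example (illustrative), Order = {2, 0, 1} is inverted into the mask
/// <1, 2, 0> before being passed to add().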
11594 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
11595 SmallVector<int> NewMask;
11596 inversePermutation(Indices: Order, Mask&: NewMask);
11597 add(V1, Mask: NewMask);
11598 }
11599 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
11600 Value *Root = nullptr) {
11601 return R.gather(VL, Root);
11602 }
11603 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
11604 /// Finalize emission of the shuffles.
11605 /// \param Action the action (if any) to be performed before the final
11606 /// application of the \p ExtMask mask.
11607 Value *
11608 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
11609 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
11610 IsFinalized = true;
11611 if (Action) {
11612 Value *Vec = InVectors.front();
11613 if (InVectors.size() == 2) {
11614 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
11615 InVectors.pop_back();
11616 } else {
11617 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
11618 }
11619 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11620 if (CommonMask[Idx] != PoisonMaskElem)
11621 CommonMask[Idx] = Idx;
11622 assert(VF > 0 &&
11623 "Expected vector length for the final value before action.");
11624 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
11625 if (VecVF < VF) {
11626 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
11627 std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
11628 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: ResizeMask);
11629 }
11630 Action(Vec, CommonMask);
11631 InVectors.front() = Vec;
11632 }
11633 if (!ExtMask.empty()) {
11634 if (CommonMask.empty()) {
11635 CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
11636 } else {
11637 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
11638 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
11639 if (ExtMask[I] == PoisonMaskElem)
11640 continue;
11641 NewMask[I] = CommonMask[ExtMask[I]];
11642 }
11643 CommonMask.swap(RHS&: NewMask);
11644 }
11645 }
11646 if (CommonMask.empty()) {
11647 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
11648 return InVectors.front();
11649 }
11650 if (InVectors.size() == 2)
11651 return createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
11652 return createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
11653 }
11654
11655 ~ShuffleInstructionBuilder() {
11656 assert((IsFinalized || CommonMask.empty()) &&
11657 "Shuffle construction must be finalized.");
11658 }
11659};
11660
11661Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
11662 bool PostponedPHIs) {
11663 ValueList &VL = E->getOperand(OpIdx: NodeIdx);
11664 const unsigned VF = VL.size();
11665 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
11666 // Special processing for a GEP bundle, which may include non-GEP values.
11667 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
11668 const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>);
11669 if (It != VL.end())
11670 S = getSameOpcode(VL: *It, TLI: *TLI);
11671 }
11672 if (S.getOpcode()) {
11673 auto CheckSameVE = [&](const TreeEntry *VE) {
11674 return VE->isSame(VL) &&
11675 (any_of(Range: VE->UserTreeIndices,
11676 P: [E, NodeIdx](const EdgeInfo &EI) {
11677 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11678 }) ||
11679 any_of(Range&: VectorizableTree,
11680 P: [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
11681 return TE->isOperandGatherNode(UserEI: {E, NodeIdx}) &&
11682 VE->isSame(VL: TE->Scalars);
11683 }));
11684 };
11685 TreeEntry *VE = getTreeEntry(V: S.OpValue);
11686 bool IsSameVE = VE && CheckSameVE(VE);
11687 if (!IsSameVE) {
11688 auto It = MultiNodeScalars.find(Val: S.OpValue);
11689 if (It != MultiNodeScalars.end()) {
11690 auto *I = find_if(Range&: It->getSecond(), P: [&](const TreeEntry *TE) {
11691 return TE != VE && CheckSameVE(TE);
11692 });
11693 if (I != It->getSecond().end()) {
11694 VE = *I;
11695 IsSameVE = true;
11696 }
11697 }
11698 }
11699 if (IsSameVE) {
11700 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
11701 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
11702 ShuffleBuilder.add(V1: V, Mask);
11703 return ShuffleBuilder.finalize(ExtMask: std::nullopt);
11704 };
11705 Value *V = vectorizeTree(E: VE, PostponedPHIs);
11706 if (VF != cast<FixedVectorType>(Val: V->getType())->getNumElements()) {
11707 if (!VE->ReuseShuffleIndices.empty()) {
11708 // Reshuffle to get only unique values.
11709 // If some of the scalars are duplicated in the vectorization
11710 // tree entry, we do not vectorize them but instead generate a
11711 // mask for the reuses. But if there are several users of the
11712 // same entry, they may have different vectorization factors.
11713 // This is especially important for PHI nodes. In this case, we
11714 // need to adapt the resulting instruction for the user
11715 // vectorization factor and have to reshuffle it again to take
11716 // only unique elements of the vector. Without this code the
11717 // function would incorrectly return a reduced vector instruction with
11718 // repeated elements rather than with the unique ones.
11719
11720 // block:
11721 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
11722 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
11723 // ... (use %2)
11724 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
11725 // br %block
11726 SmallVector<int> Mask(VF, PoisonMaskElem);
11727 for (auto [I, V] : enumerate(First&: VL)) {
11728 if (isa<PoisonValue>(Val: V))
11729 continue;
11730 Mask[I] = VE->findLaneForValue(V);
11731 }
11732 V = FinalShuffle(V, Mask);
11733 } else {
11734 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
11735 "Expected vectorization factor less "
11736 "than original vector size.");
11737 SmallVector<int> UniformMask(VF, 0);
11738 std::iota(first: UniformMask.begin(), last: UniformMask.end(), value: 0);
11739 V = FinalShuffle(V, UniformMask);
11740 }
11741 }
11742 // Need to update the operand gather node if the operand is actually not a
11743 // vectorized node but a buildvector/gather node that matches one of the
11744 // vectorized nodes.
11745 if (find_if(Range&: VE->UserTreeIndices, P: [&](const EdgeInfo &EI) {
11746 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
11747 }) == VE->UserTreeIndices.end()) {
11748 auto *It = find_if(
11749 Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
11750 return TE->State == TreeEntry::NeedToGather &&
11751 TE->UserTreeIndices.front().UserTE == E &&
11752 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
11753 });
11754 assert(It != VectorizableTree.end() && "Expected gather node operand.");
11755 (*It)->VectorizedValue = V;
11756 }
11757 return V;
11758 }
11759 }
11760
11761 // Find the corresponding gather entry and vectorize it.
11762 // This allows us to be more accurate with tree/graph transformations and
11763 // checks the correctness of the transformations in many cases.
11764 auto *I = find_if(Range&: VectorizableTree,
11765 P: [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
11766 return TE->isOperandGatherNode(UserEI: {E, NodeIdx});
11767 });
11768 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
11769 assert(I->get()->UserTreeIndices.size() == 1 &&
11770 "Expected only single user for the gather node.");
11771 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
11772 return vectorizeTree(E: I->get(), PostponedPHIs);
11773}
11774
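// Templated driver for emitting a gather/buildvector node. Roughly (a
// simplified summary of the logic below): first try to reuse the vectors the
// scalars are extracted from, then shuffles of already vectorized tree
// entries, and only then insert the remaining scalars as constants plus a
// gather of the non-constant part; if undef elements had to be replaced by a
// possibly-poisonous broadcast, the final value is frozen.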
11775template <typename BVTy, typename ResTy, typename... Args>
11776ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
11777 assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
11778 unsigned VF = E->getVectorFactor();
11779
11780 bool NeedFreeze = false;
11781 SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
11782 E->ReuseShuffleIndices.end());
11783 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
11784 // Build a mask out of the reorder indices and reorder scalars per this
11785 // mask.
11786 SmallVector<int> ReorderMask;
11787 inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask);
11788 if (!ReorderMask.empty())
11789 reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
11790 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
11791 unsigned I, unsigned SliceSize) {
11792 if (!isSplat(VL: E->Scalars) || none_of(E->Scalars, [](Value *V) {
11793 return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
11794 }))
11795 return false;
11796 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
11797 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
11798 if (UserTE->getNumOperands() != 2)
11799 return false;
11800 auto *It =
11801 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
11802 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
11803 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
11804 }) != TE->UserTreeIndices.end();
11805 });
11806 if (It == VectorizableTree.end())
11807 return false;
11808 int Idx;
11809 if ((Mask.size() < InputVF &&
11810 ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: InputVF, Index&: Idx) &&
11811 Idx == 0) ||
11812 (Mask.size() == InputVF &&
11813 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))) {
11814 std::iota(first: std::next(x: Mask.begin(), n: I * SliceSize),
11815 last: std::next(x: Mask.begin(), n: (I + 1) * SliceSize), value: 0);
11816 } else {
11817 unsigned IVal =
11818 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
11819 std::fill(first: std::next(x: Mask.begin(), n: I * SliceSize),
11820 last: std::next(x: Mask.begin(), n: (I + 1) * SliceSize), value: IVal);
11821 }
11822 return true;
11823 };
11824 BVTy ShuffleBuilder(Params...);
11825 ResTy Res = ResTy();
11826 SmallVector<int> Mask;
11827 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
11828 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
11829 Value *ExtractVecBase = nullptr;
11830 bool UseVecBaseAsInput = false;
11831 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
11832 SmallVector<SmallVector<const TreeEntry *>> Entries;
11833 Type *ScalarTy = GatheredScalars.front()->getType();
11834 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: GatheredScalars.size());
11835 unsigned NumParts = TTI->getNumberOfParts(Tp: VecTy);
11836 if (NumParts == 0 || NumParts >= GatheredScalars.size())
11837 NumParts = 1;
11838 if (!all_of(Range&: GatheredScalars, P: IsaPred<UndefValue>)) {
11839 // Check for gathered extracts.
11840 bool Resized = false;
11841 ExtractShuffles =
11842 tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
11843 if (!ExtractShuffles.empty()) {
11844 SmallVector<const TreeEntry *> ExtractEntries;
11845 for (auto [Idx, I] : enumerate(First&: ExtractMask)) {
11846 if (I == PoisonMaskElem)
11847 continue;
11848 if (const auto *TE = getTreeEntry(
11849 V: cast<ExtractElementInst>(Val: E->Scalars[Idx])->getVectorOperand()))
11850 ExtractEntries.push_back(Elt: TE);
11851 }
11852 if (std::optional<ResTy> Delayed =
11853 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
11854 // Delay emission of gathers which are not ready yet.
11855 PostponedGathers.insert(X: E);
11856 // Postpone the gather emission; it will be emitted after the end of the
11857 // process to keep the correct order.
11858 return *Delayed;
11859 }
11860 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
11861 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
11862 ExtractVecBase = VecBase;
11863 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(Val: VecBase->getType()))
11864 if (VF == VecBaseTy->getNumElements() &&
11865 GatheredScalars.size() != VF) {
11866 Resized = true;
11867 GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
11868 Elt: PoisonValue::get(T: ScalarTy));
11869 }
11870 }
11871 }
11872 // Gather extracts only after we have checked for fully matched gathers.
11873 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
11874 E->isAltShuffle() ||
11875 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
11876 isSplat(VL: E->Scalars) ||
11877 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
11878 GatherShuffles =
11879 isGatherShuffledEntry(TE: E, VL: GatheredScalars, Mask, Entries, NumParts);
11880 }
11881 if (!GatherShuffles.empty()) {
11882 if (std::optional<ResTy> Delayed =
11883 ShuffleBuilder.needToDelay(E, Entries)) {
11884 // Delay emission of gathers which are not ready yet.
11885 PostponedGathers.insert(X: E);
11886 // Postpone the gather emission; it will be emitted after the end of the
11887 // process to keep the correct order.
11888 return *Delayed;
11889 }
11890 if (GatherShuffles.size() == 1 &&
11891 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
11892 Entries.front().front()->isSame(VL: E->Scalars)) {
11893 // Perfect match in the graph, will reuse the previously vectorized
11894 // node. Cost is 0.
11895 LLVM_DEBUG(
11896 dbgs()
11897 << "SLP: perfect diamond match for gather bundle "
11898 << shortBundleName(E->Scalars) << ".\n");
11899 // Restore the mask for previous partially matched values.
11900 Mask.resize(N: E->Scalars.size());
11901 const TreeEntry *FrontTE = Entries.front().front();
11902 if (FrontTE->ReorderIndices.empty() &&
11903 ((FrontTE->ReuseShuffleIndices.empty() &&
11904 E->Scalars.size() == FrontTE->Scalars.size()) ||
11905 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
11906 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
11907 } else {
11908 for (auto [I, V] : enumerate(First: E->Scalars)) {
11909 if (isa<PoisonValue>(Val: V)) {
11910 Mask[I] = PoisonMaskElem;
11911 continue;
11912 }
11913 Mask[I] = FrontTE->findLaneForValue(V);
11914 }
11915 }
11916 ShuffleBuilder.add(*FrontTE, Mask);
11917 Res = ShuffleBuilder.finalize(E->getCommonMask());
11918 return Res;
11919 }
11920 if (!Resized) {
11921 if (GatheredScalars.size() != VF &&
11922 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
11923 return any_of(TEs, [&](const TreeEntry *TE) {
11924 return TE->getVectorFactor() == VF;
11925 });
11926 }))
11927 GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
11928 Elt: PoisonValue::get(T: ScalarTy));
11929 }
11930 // Remove shuffled elements from list of gathers.
11931 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11932 if (Mask[I] != PoisonMaskElem)
11933 GatheredScalars[I] = PoisonValue::get(T: ScalarTy);
11934 }
11935 }
11936 }
11937 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
11938 SmallVectorImpl<int> &ReuseMask,
11939 bool IsRootPoison) {
11940 // For splats we can emit broadcasts instead of gathers, so try to find
11941 // such sequences.
11942 bool IsSplat = IsRootPoison && isSplat(VL: Scalars) &&
11943 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
11944 Scalars.append(NumInputs: VF - Scalars.size(), Elt: PoisonValue::get(T: ScalarTy));
11945 SmallVector<int> UndefPos;
11946 DenseMap<Value *, unsigned> UniquePositions;
11947 // Gather unique non-const values and all constant values.
11948 // For repeated values, just shuffle them.
11949 int NumNonConsts = 0;
11950 int SinglePos = 0;
11951 for (auto [I, V] : enumerate(First&: Scalars)) {
11952 if (isa<UndefValue>(Val: V)) {
11953 if (!isa<PoisonValue>(Val: V)) {
11954 ReuseMask[I] = I;
11955 UndefPos.push_back(Elt: I);
11956 }
11957 continue;
11958 }
11959 if (isConstant(V)) {
11960 ReuseMask[I] = I;
11961 continue;
11962 }
11963 ++NumNonConsts;
11964 SinglePos = I;
11965 Value *OrigV = V;
11966 Scalars[I] = PoisonValue::get(T: ScalarTy);
11967 if (IsSplat) {
11968 Scalars.front() = OrigV;
11969 ReuseMask[I] = 0;
11970 } else {
11971 const auto Res = UniquePositions.try_emplace(Key: OrigV, Args&: I);
11972 Scalars[Res.first->second] = OrigV;
11973 ReuseMask[I] = Res.first->second;
11974 }
11975 }
11976 if (NumNonConsts == 1) {
11977 // Restore single insert element.
11978 if (IsSplat) {
11979 ReuseMask.assign(NumElts: VF, Elt: PoisonMaskElem);
11980 std::swap(a&: Scalars.front(), b&: Scalars[SinglePos]);
11981 if (!UndefPos.empty() && UndefPos.front() == 0)
11982 Scalars.front() = UndefValue::get(T: ScalarTy);
11983 }
11984 ReuseMask[SinglePos] = SinglePos;
11985 } else if (!UndefPos.empty() && IsSplat) {
11986 // For undef values, try to replace them with a simple broadcast.
11987 // We can do it if the broadcasted value is guaranteed to be
11988 // non-poisonous, or by freezing the incoming scalar value first.
11989 auto *It = find_if(Scalars, [this, E](Value *V) {
11990 return !isa<UndefValue>(Val: V) &&
11991 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
11992 (E->UserTreeIndices.size() == 1 &&
11993 any_of(V->uses(), [E](const Use &U) {
11994 // Check if the value is already used in the same operation in
11995 // one of the nodes.
11996 return E->UserTreeIndices.front().EdgeIdx !=
11997 U.getOperandNo() &&
11998 is_contained(
11999 Range&: E->UserTreeIndices.front().UserTE->Scalars,
12000 Element: U.getUser());
12001 })));
12002 });
12003 if (It != Scalars.end()) {
12004 // Replace undefs by the non-poisoned scalars and emit broadcast.
12005 int Pos = std::distance(Scalars.begin(), It);
12006 for (int I : UndefPos) {
12007 // Set the undef position to the non-poisoned scalar.
12008 ReuseMask[I] = Pos;
12009 // Replace the undef with poison; in the mask it is already replaced
12010 // by the non-poisoned scalar.
12011 if (I != Pos)
12012 Scalars[I] = PoisonValue::get(T: ScalarTy);
12013 }
12014 } else {
12015 // Replace undefs by the poisons, emit broadcast and then emit
12016 // freeze.
12017 for (int I : UndefPos) {
12018 ReuseMask[I] = PoisonMaskElem;
12019 if (isa<UndefValue>(Val: Scalars[I]))
12020 Scalars[I] = PoisonValue::get(T: ScalarTy);
12021 }
12022 NeedFreeze = true;
12023 }
12024 }
12025 };
12026 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12027 bool IsNonPoisoned = true;
12028 bool IsUsedInExpr = true;
12029 Value *Vec1 = nullptr;
12030 if (!ExtractShuffles.empty()) {
12031 // A gather of extractelements can be represented as just a shuffle of
12032 // one or two vectors that the scalars are extracted from.
12033 // Find the input vectors.
12034 Value *Vec2 = nullptr;
12035 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12036 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12037 ExtractMask[I] = PoisonMaskElem;
12038 }
12039 if (UseVecBaseAsInput) {
12040 Vec1 = ExtractVecBase;
12041 } else {
12042 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12043 if (ExtractMask[I] == PoisonMaskElem)
12044 continue;
12045 if (isa<UndefValue>(Val: E->Scalars[I]))
12046 continue;
12047 auto *EI = cast<ExtractElementInst>(Val: E->Scalars[I]);
12048 Value *VecOp = EI->getVectorOperand();
12049 if (const auto *TE = getTreeEntry(V: VecOp))
12050 if (TE->VectorizedValue)
12051 VecOp = TE->VectorizedValue;
12052 if (!Vec1) {
12053 Vec1 = VecOp;
12054 } else if (Vec1 != VecOp) {
12055 assert((!Vec2 || Vec2 == VecOp) &&
12056 "Expected only 1 or 2 vectors shuffle.");
12057 Vec2 = VecOp;
12058 }
12059 }
12060 }
12061 if (Vec2) {
12062 IsUsedInExpr = false;
12063 IsNonPoisoned &=
12064 isGuaranteedNotToBePoison(V: Vec1) && isGuaranteedNotToBePoison(V: Vec2);
12065 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12066 } else if (Vec1) {
12067 IsUsedInExpr &= FindReusedSplat(
12068 ExtractMask,
12069 cast<FixedVectorType>(Val: Vec1->getType())->getNumElements(), 0,
12070 ExtractMask.size());
12071 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12072 IsNonPoisoned &= isGuaranteedNotToBePoison(V: Vec1);
12073 } else {
12074 IsUsedInExpr = false;
12075 ShuffleBuilder.add(PoisonValue::get(T: FixedVectorType::get(
12076 ElementType: ScalarTy, NumElts: GatheredScalars.size())),
12077 ExtractMask, /*ForExtracts=*/true);
12078 }
12079 }
12080 if (!GatherShuffles.empty()) {
12081 unsigned SliceSize = E->Scalars.size() / NumParts;
12082 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12083 for (const auto [I, TEs] : enumerate(First&: Entries)) {
12084 if (TEs.empty()) {
12085 assert(!GatherShuffles[I] &&
12086 "No shuffles with empty entries list expected.");
12087 continue;
12088 }
12089 assert((TEs.size() == 1 || TEs.size() == 2) &&
12090 "Expected shuffle of 1 or 2 entries.");
12091 auto SubMask = ArrayRef(Mask).slice(N: I * SliceSize, M: SliceSize);
12092 VecMask.assign(NumElts: VecMask.size(), Elt: PoisonMaskElem);
12093 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: I * SliceSize));
12094 if (TEs.size() == 1) {
12095 IsUsedInExpr &= FindReusedSplat(
12096 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12097 ShuffleBuilder.add(*TEs.front(), VecMask);
12098 if (TEs.front()->VectorizedValue)
12099 IsNonPoisoned &=
12100 isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue);
12101 } else {
12102 IsUsedInExpr = false;
12103 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12104 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12105 IsNonPoisoned &=
12106 isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue) &&
12107 isGuaranteedNotToBePoison(V: TEs.back()->VectorizedValue);
12108 }
12109 }
12110 }
12111 // Try to figure out the best way to combine the values: build a shuffle
12112 // and insert elements, or just build several shuffles.
12113 // Insert non-constant scalars.
12114 SmallVector<Value *> NonConstants(GatheredScalars);
12115 int EMSz = ExtractMask.size();
12116 int MSz = Mask.size();
12117 // Try to build a constant vector and shuffle with it only if currently we
12118 // have a single permutation and more than 1 scalar constant.
12119 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12120 bool IsIdentityShuffle =
12121 ((UseVecBaseAsInput ||
12122 all_of(ExtractShuffles,
12123 [](const std::optional<TTI::ShuffleKind> &SK) {
12124 return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
12125 TTI::SK_PermuteSingleSrc;
12126 })) &&
12127 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12128 ShuffleVectorInst::isIdentityMask(Mask: ExtractMask, NumSrcElts: EMSz)) ||
12129 (!GatherShuffles.empty() &&
12130 all_of(GatherShuffles,
12131 [](const std::optional<TTI::ShuffleKind> &SK) {
12132 return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
12133 TTI::SK_PermuteSingleSrc;
12134 }) &&
12135 none_of(Mask, [&](int I) { return I >= MSz; }) &&
12136 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: MSz));
12137 bool EnoughConstsForShuffle =
12138 IsSingleShuffle &&
12139 (none_of(GatheredScalars,
12140 [](Value *V) {
12141 return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
12142 }) ||
12143 any_of(GatheredScalars,
12144 [](Value *V) {
12145 return isa<Constant>(Val: V) && !isa<UndefValue>(Val: V);
12146 })) &&
12147 (!IsIdentityShuffle ||
12148 (GatheredScalars.size() == 2 &&
12149 any_of(GatheredScalars,
12150 [](Value *V) { return !isa<UndefValue>(Val: V); })) ||
12151 count_if(GatheredScalars, [](Value *V) {
12152 return isa<Constant>(Val: V) && !isa<PoisonValue>(Val: V);
12153 }) > 1);
12154 // The NonConstants array contains just the non-constant values, while
12155 // GatheredScalars contains only the constants for the final vector and shuffle.
12156 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12157 if (EnoughConstsForShuffle && isa<Constant>(Val: GatheredScalars[I]))
12158 NonConstants[I] = PoisonValue::get(T: ScalarTy);
12159 else
12160 GatheredScalars[I] = PoisonValue::get(T: ScalarTy);
12161 }
12162 // Generate constants for final shuffle and build a mask for them.
12163 if (!all_of(Range&: GatheredScalars, P: IsaPred<PoisonValue>)) {
12164 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12165 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12166 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12167 ShuffleBuilder.add(BV, BVMask);
12168 }
12169 if (all_of(NonConstants, [=](Value *V) {
12170 return isa<PoisonValue>(Val: V) ||
12171 (IsSingleShuffle &&
12172 ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(Val: V));
12173 }))
12174 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12175 else
12176 Res = ShuffleBuilder.finalize(
12177 E->ReuseShuffleIndices, E->Scalars.size(),
12178 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12179 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12180 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12181 });
12182 } else if (!allConstant(VL: GatheredScalars)) {
12183 // Gather unique scalars and all constants.
12184 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12185 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12186 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12187 ShuffleBuilder.add(BV, ReuseMask);
12188 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12189 } else {
12190 // Gather all constants.
12191 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12192 for (auto [I, V] : enumerate(First: E->Scalars)) {
12193 if (!isa<PoisonValue>(Val: V))
12194 Mask[I] = I;
12195 }
12196 Value *BV = ShuffleBuilder.gather(E->Scalars);
12197 ShuffleBuilder.add(BV, Mask);
12198 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12199 }
12200
12201 if (NeedFreeze)
12202 Res = ShuffleBuilder.createFreeze(Res);
12203 return Res;
12204}
12205
12206Value *BoUpSLP::createBuildVector(const TreeEntry *E) {
12207 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, Params&: Builder,
12208 Params&: *this);
12209}
12210
12211Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12212 IRBuilderBase::InsertPointGuard Guard(Builder);
12213
12214 if (E->VectorizedValue &&
12215 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12216 E->isAltShuffle())) {
12217 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12218 return E->VectorizedValue;
12219 }
12220
12221 if (E->State == TreeEntry::NeedToGather) {
12222 // Set insert point for non-reduction initial nodes.
12223 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12224 setInsertPointAfterBundle(E);
12225 Value *Vec = createBuildVector(E);
12226 E->VectorizedValue = Vec;
12227 return Vec;
12228 }
12229
12230 bool IsReverseOrder = isReverseOrder(Order: E->ReorderIndices);
12231 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12232 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
12233 if (E->getOpcode() == Instruction::Store) {
12234 ArrayRef<int> Mask =
12235 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12236 E->ReorderIndices.size());
12237 ShuffleBuilder.add(V1: V, Mask);
12238 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12239 ShuffleBuilder.addOrdered(V1: V, Order: std::nullopt);
12240 } else {
12241 ShuffleBuilder.addOrdered(V1: V, Order: E->ReorderIndices);
12242 }
12243 return ShuffleBuilder.finalize(ExtMask: E->ReuseShuffleIndices);
12244 };
12245
12246 assert((E->State == TreeEntry::Vectorize ||
12247 E->State == TreeEntry::ScatterVectorize ||
12248 E->State == TreeEntry::StridedVectorize) &&
12249 "Unhandled state");
12250 unsigned ShuffleOrOp =
12251 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12252 Instruction *VL0 = E->getMainOp();
12253 Type *ScalarTy = VL0->getType();
12254 if (auto *Store = dyn_cast<StoreInst>(Val: VL0))
12255 ScalarTy = Store->getValueOperand()->getType();
12256 else if (auto *IE = dyn_cast<InsertElementInst>(Val: VL0))
12257 ScalarTy = IE->getOperand(i_nocapture: 1)->getType();
12258 auto It = MinBWs.find(Val: E);
12259 if (It != MinBWs.end())
12260 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
12261 auto GetOperandSignedness = [&](unsigned Idx) {
12262 const TreeEntry *OpE = getOperandEntry(E, Idx);
12263 bool IsSigned = false;
12264 auto It = MinBWs.find(Val: OpE);
12265 if (It != MinBWs.end())
12266 IsSigned = It->second.second;
12267 else
12268 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
12269 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
12270 });
12271 return IsSigned;
12272 };
12273 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: E->Scalars.size());
12274 switch (ShuffleOrOp) {
12275 case Instruction::PHI: {
12276 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12277 E != VectorizableTree.front().get() ||
12278 !E->UserTreeIndices.empty()) &&
12279 "PHI reordering is free.");
12280 if (PostponedPHIs && E->VectorizedValue)
12281 return E->VectorizedValue;
12282 auto *PH = cast<PHINode>(Val: VL0);
12283 Builder.SetInsertPoint(TheBB: PH->getParent(),
12284 IP: PH->getParent()->getFirstNonPHIIt());
12285 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12286 if (PostponedPHIs || !E->VectorizedValue) {
12287 PHINode *NewPhi = Builder.CreatePHI(Ty: VecTy, NumReservedValues: PH->getNumIncomingValues());
12288 E->PHI = NewPhi;
12289 Value *V = NewPhi;
12290
12291 // Adjust insertion point once all PHI's have been generated.
12292 Builder.SetInsertPoint(TheBB: PH->getParent(),
12293 IP: PH->getParent()->getFirstInsertionPt());
12294 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12295
12296 V = FinalShuffle(V, E, VecTy);
12297
12298 E->VectorizedValue = V;
12299 if (PostponedPHIs)
12300 return V;
12301 }
12302 PHINode *NewPhi = cast<PHINode>(Val: E->PHI);
    // If the phi node is fully emitted, exit.
12304 if (NewPhi->getNumIncomingValues() != 0)
12305 return NewPhi;
12306
12307 // PHINodes may have multiple entries from the same block. We want to
12308 // visit every block once.
12309 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12310
12311 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) {
12312 ValueList Operands;
12313 BasicBlock *IBB = PH->getIncomingBlock(i: I);
12314
12315 // Stop emission if all incoming values are generated.
12316 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12317 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12318 return NewPhi;
12319 }
12320
12321 if (!VisitedBBs.insert(Ptr: IBB).second) {
12322 NewPhi->addIncoming(V: NewPhi->getIncomingValueForBlock(BB: IBB), BB: IBB);
12323 continue;
12324 }
12325
12326 Builder.SetInsertPoint(IBB->getTerminator());
12327 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12328 Value *Vec = vectorizeOperand(E, NodeIdx: I, /*PostponedPHIs=*/true);
12329 if (VecTy != Vec->getType()) {
12330 assert((It != MinBWs.end() ||
12331 getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
12332 MinBWs.contains(getOperandEntry(E, I))) &&
12333 "Expected item in MinBWs.");
12334 Vec = Builder.CreateIntCast(V: Vec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
12335 }
12336 NewPhi->addIncoming(V: Vec, BB: IBB);
12337 }
12338
12339 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12340 "Invalid number of incoming values");
12341 return NewPhi;
12342 }
12343
12344 case Instruction::ExtractElement: {
12345 Value *V = E->getSingleOperand(OpIdx: 0);
12346 if (const TreeEntry *TE = getTreeEntry(V))
12347 V = TE->VectorizedValue;
12348 setInsertPointAfterBundle(E);
12349 V = FinalShuffle(V, E, VecTy);
12350 E->VectorizedValue = V;
12351 return V;
12352 }
12353 case Instruction::ExtractValue: {
12354 auto *LI = cast<LoadInst>(Val: E->getSingleOperand(OpIdx: 0));
12355 Builder.SetInsertPoint(LI);
12356 Value *Ptr = LI->getPointerOperand();
12357 LoadInst *V = Builder.CreateAlignedLoad(Ty: VecTy, Ptr, Align: LI->getAlign());
12358 Value *NewV = propagateMetadata(I: V, VL: E->Scalars);
12359 NewV = FinalShuffle(NewV, E, VecTy);
12360 E->VectorizedValue = NewV;
12361 return NewV;
12362 }
12363 case Instruction::InsertElement: {
12364 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12365 Builder.SetInsertPoint(cast<Instruction>(Val: E->Scalars.back()));
12366 Value *V = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
12367 ArrayRef<Value *> Op = E->getOperand(OpIdx: 1);
12368 Type *ScalarTy = Op.front()->getType();
12369 if (cast<VectorType>(Val: V->getType())->getElementType() != ScalarTy) {
12370 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12371 std::pair<unsigned, bool> Res = MinBWs.lookup(Val: getOperandEntry(E, Idx: 1));
12372 assert(Res.first > 0 && "Expected item in MinBWs.");
12373 V = Builder.CreateIntCast(
12374 V,
12375 DestTy: FixedVectorType::get(
12376 ElementType: ScalarTy,
12377 NumElts: cast<FixedVectorType>(Val: V->getType())->getNumElements()),
12378 isSigned: Res.second);
12379 }
12380
12381 // Create InsertVector shuffle if necessary
12382 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range&: E->Scalars, P: [E](Value *V) {
12383 return !is_contained(Range&: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
12384 }));
12385 const unsigned NumElts =
12386 cast<FixedVectorType>(Val: FirstInsert->getType())->getNumElements();
12387 const unsigned NumScalars = E->Scalars.size();
12388
12389 unsigned Offset = *getInsertIndex(InsertInst: VL0);
12390 assert(Offset < NumElts && "Failed to find vector index offset");
12391
12392 // Create shuffle to resize vector
12393 SmallVector<int> Mask;
12394 if (!E->ReorderIndices.empty()) {
12395 inversePermutation(Indices: E->ReorderIndices, Mask);
12396 Mask.append(NumInputs: NumElts - NumScalars, Elt: PoisonMaskElem);
12397 } else {
12398 Mask.assign(NumElts, Elt: PoisonMaskElem);
12399 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: NumScalars), value: 0);
12400 }
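    // Illustrative state at this point: with NumScalars = 4, NumElts = 8 and
    // no reordering, Mask is {0, 1, 2, 3, P, P, P, P} (P = poison), i.e. the
    // scalars occupy the low lanes of the wide destination vector.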
12401 // Create InsertVector shuffle if necessary
12402 bool IsIdentity = true;
12403 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12404 Mask.swap(RHS&: PrevMask);
12405 for (unsigned I = 0; I < NumScalars; ++I) {
12406 Value *Scalar = E->Scalars[PrevMask[I]];
12407 unsigned InsertIdx = *getInsertIndex(InsertInst: Scalar);
12408 IsIdentity &= InsertIdx - Offset == I;
12409 Mask[InsertIdx - Offset] = I;
12410 }
12411 if (!IsIdentity || NumElts != NumScalars) {
12412 Value *V2 = nullptr;
12413 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12414 SmallVector<int> InsertMask(Mask);
12415 if (NumElts != NumScalars && Offset == 0) {
12416 // Follow all insert element instructions from the current buildvector
12417 // sequence.
12418 InsertElementInst *Ins = cast<InsertElementInst>(Val: VL0);
12419 do {
12420 std::optional<unsigned> InsertIdx = getInsertIndex(InsertInst: Ins);
12421 if (!InsertIdx)
12422 break;
12423 if (InsertMask[*InsertIdx] == PoisonMaskElem)
12424 InsertMask[*InsertIdx] = *InsertIdx;
12425 if (!Ins->hasOneUse())
12426 break;
12427 Ins = dyn_cast_or_null<InsertElementInst>(
12428 Val: Ins->getUniqueUndroppableUser());
12429 } while (Ins);
12430 SmallBitVector UseMask =
12431 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
12432 SmallBitVector IsFirstPoison =
12433 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
12434 SmallBitVector IsFirstUndef =
12435 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
12436 if (!IsFirstPoison.all()) {
12437 unsigned Idx = 0;
12438 for (unsigned I = 0; I < NumElts; I++) {
12439 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I) &&
12440 IsFirstUndef.test(Idx: I)) {
12441 if (IsVNonPoisonous) {
12442 InsertMask[I] = I < NumScalars ? I : 0;
12443 continue;
12444 }
12445 if (!V2)
12446 V2 = UndefValue::get(T: V->getType());
12447 if (Idx >= NumScalars)
12448 Idx = NumScalars - 1;
12449 InsertMask[I] = NumScalars + Idx;
12450 ++Idx;
12451 } else if (InsertMask[I] != PoisonMaskElem &&
12452 Mask[I] == PoisonMaskElem) {
12453 InsertMask[I] = PoisonMaskElem;
12454 }
12455 }
12456 } else {
12457 InsertMask = Mask;
12458 }
12459 }
12460 if (!V2)
12461 V2 = PoisonValue::get(T: V->getType());
12462 V = Builder.CreateShuffleVector(V1: V, V2, Mask: InsertMask);
12463 if (auto *I = dyn_cast<Instruction>(Val: V)) {
12464 GatherShuffleExtractSeq.insert(X: I);
12465 CSEBlocks.insert(V: I->getParent());
12466 }
12467 }
12468
12469 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
12470 for (unsigned I = 0; I < NumElts; I++) {
12471 if (Mask[I] != PoisonMaskElem)
12472 InsertMask[Offset + I] = I;
12473 }
12474 SmallBitVector UseMask =
12475 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
12476 SmallBitVector IsFirstUndef =
12477 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
12478 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
12479 NumElts != NumScalars) {
12480 if (IsFirstUndef.all()) {
12481 if (!ShuffleVectorInst::isIdentityMask(Mask: InsertMask, NumSrcElts: NumElts)) {
12482 SmallBitVector IsFirstPoison =
12483 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
12484 if (!IsFirstPoison.all()) {
12485 for (unsigned I = 0; I < NumElts; I++) {
12486 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I))
12487 InsertMask[I] = I + NumElts;
12488 }
12489 }
12490 V = Builder.CreateShuffleVector(
12491 V1: V,
12492 V2: IsFirstPoison.all() ? PoisonValue::get(T: V->getType())
12493 : FirstInsert->getOperand(i: 0),
12494 Mask: InsertMask, Name: cast<Instruction>(Val: E->Scalars.back())->getName());
12495 if (auto *I = dyn_cast<Instruction>(Val: V)) {
12496 GatherShuffleExtractSeq.insert(X: I);
12497 CSEBlocks.insert(V: I->getParent());
12498 }
12499 }
12500 } else {
12501 SmallBitVector IsFirstPoison =
12502 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
12503 for (unsigned I = 0; I < NumElts; I++) {
12504 if (InsertMask[I] == PoisonMaskElem)
12505 InsertMask[I] = IsFirstPoison.test(Idx: I) ? PoisonMaskElem : I;
12506 else
12507 InsertMask[I] += NumElts;
12508 }
12509 V = Builder.CreateShuffleVector(
12510 V1: FirstInsert->getOperand(i: 0), V2: V, Mask: InsertMask,
12511 Name: cast<Instruction>(Val: E->Scalars.back())->getName());
12512 if (auto *I = dyn_cast<Instruction>(Val: V)) {
12513 GatherShuffleExtractSeq.insert(X: I);
12514 CSEBlocks.insert(V: I->getParent());
12515 }
12516 }
12517 }
12518
12519 ++NumVectorInstructions;
12520 E->VectorizedValue = V;
12521 return V;
12522 }
12523 case Instruction::ZExt:
12524 case Instruction::SExt:
12525 case Instruction::FPToUI:
12526 case Instruction::FPToSI:
12527 case Instruction::FPExt:
12528 case Instruction::PtrToInt:
12529 case Instruction::IntToPtr:
12530 case Instruction::SIToFP:
12531 case Instruction::UIToFP:
12532 case Instruction::Trunc:
12533 case Instruction::FPTrunc:
12534 case Instruction::BitCast: {
12535 setInsertPointAfterBundle(E);
12536
12537 Value *InVec = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12538 if (E->VectorizedValue) {
12539 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12540 return E->VectorizedValue;
12541 }
12542
12543 auto *CI = cast<CastInst>(Val: VL0);
12544 Instruction::CastOps VecOpcode = CI->getOpcode();
12545 Type *SrcScalarTy = cast<VectorType>(Val: InVec->getType())->getElementType();
12546 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
12547 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
12548 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
12549 SrcScalarTy != CI->getOperand(i_nocapture: 0)->getType())) {
12550 // Check if the values are candidates to demote.
12551 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy);
12552 if (SrcIt != MinBWs.end())
12553 SrcBWSz = SrcIt->second.first;
12554 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
12555 if (BWSz == SrcBWSz) {
12556 VecOpcode = Instruction::BitCast;
12557 } else if (BWSz < SrcBWSz) {
12558 VecOpcode = Instruction::Trunc;
12559 } else if (It != MinBWs.end()) {
12560 assert(BWSz > SrcBWSz && "Invalid cast!");
12561 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12562 } else if (SrcIt != MinBWs.end()) {
12563 assert(BWSz > SrcBWSz && "Invalid cast!");
12564 VecOpcode =
12565 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12566 }
12567 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
12568 !SrcIt->second.second) {
12569 VecOpcode = Instruction::UIToFP;
12570 }
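    // For example, if a scalar zext from i8 to i32 had its result demoted via
    // MinBWs to i8, the source and destination widths now match and the cast
    // degenerates into a bitcast (no cast instruction is emitted below); if
    // the result was demoted to i16 instead, an extension to the narrower
    // vector type is emitted, with signedness taken from MinBWs.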
12571 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12572 ? InVec
12573 : Builder.CreateCast(Op: VecOpcode, V: InVec, DestTy: VecTy);
12574 V = FinalShuffle(V, E, VecTy);
12575
12576 E->VectorizedValue = V;
12577 ++NumVectorInstructions;
12578 return V;
12579 }
12580 case Instruction::FCmp:
12581 case Instruction::ICmp: {
12582 setInsertPointAfterBundle(E);
12583
12584 Value *L = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12585 if (E->VectorizedValue) {
12586 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12587 return E->VectorizedValue;
12588 }
12589 Value *R = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
12590 if (E->VectorizedValue) {
12591 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12592 return E->VectorizedValue;
12593 }
12594 if (L->getType() != R->getType()) {
12595 assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12596 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12597 MinBWs.contains(getOperandEntry(E, 0)) ||
12598 MinBWs.contains(getOperandEntry(E, 1))) &&
12599 "Expected item in MinBWs.");
12600 if (cast<VectorType>(Val: L->getType())
12601 ->getElementType()
12602 ->getIntegerBitWidth() < cast<VectorType>(Val: R->getType())
12603 ->getElementType()
12604 ->getIntegerBitWidth()) {
12605 Type *CastTy = R->getType();
12606 L = Builder.CreateIntCast(V: L, DestTy: CastTy, isSigned: GetOperandSignedness(0));
12607 } else {
12608 Type *CastTy = L->getType();
12609 R = Builder.CreateIntCast(V: R, DestTy: CastTy, isSigned: GetOperandSignedness(1));
12610 }
12611 }
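    // For example, if one compare operand was demoted to <4 x i8> and the
    // other to <4 x i16>, the narrower side is int-cast to <4 x i16> so both
    // operands of the vector compare have the same type.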
12612
12613 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
12614 Value *V = Builder.CreateCmp(Pred: P0, LHS: L, RHS: R);
12615 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
12616 // Do not cast for cmps.
12617 VecTy = cast<FixedVectorType>(Val: V->getType());
12618 V = FinalShuffle(V, E, VecTy);
12619
12620 E->VectorizedValue = V;
12621 ++NumVectorInstructions;
12622 return V;
12623 }
12624 case Instruction::Select: {
12625 setInsertPointAfterBundle(E);
12626
12627 Value *Cond = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12628 if (E->VectorizedValue) {
12629 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12630 return E->VectorizedValue;
12631 }
12632 Value *True = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
12633 if (E->VectorizedValue) {
12634 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12635 return E->VectorizedValue;
12636 }
12637 Value *False = vectorizeOperand(E, NodeIdx: 2, PostponedPHIs);
12638 if (E->VectorizedValue) {
12639 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12640 return E->VectorizedValue;
12641 }
12642 if (True->getType() != VecTy || False->getType() != VecTy) {
12643 assert((It != MinBWs.end() ||
12644 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12645 getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
12646 MinBWs.contains(getOperandEntry(E, 1)) ||
12647 MinBWs.contains(getOperandEntry(E, 2))) &&
12648 "Expected item in MinBWs.");
12649 if (True->getType() != VecTy)
12650 True = Builder.CreateIntCast(V: True, DestTy: VecTy, isSigned: GetOperandSignedness(1));
12651 if (False->getType() != VecTy)
12652 False = Builder.CreateIntCast(V: False, DestTy: VecTy, isSigned: GetOperandSignedness(2));
12653 }
12654
12655 Value *V = Builder.CreateSelect(C: Cond, True, False);
12656 V = FinalShuffle(V, E, VecTy);
12657
12658 E->VectorizedValue = V;
12659 ++NumVectorInstructions;
12660 return V;
12661 }
12662 case Instruction::FNeg: {
12663 setInsertPointAfterBundle(E);
12664
12665 Value *Op = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12666
12667 if (E->VectorizedValue) {
12668 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12669 return E->VectorizedValue;
12670 }
12671
12672 Value *V = Builder.CreateUnOp(
12673 Opc: static_cast<Instruction::UnaryOps>(E->getOpcode()), V: Op);
12674 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
12675 if (auto *I = dyn_cast<Instruction>(Val: V))
12676 V = propagateMetadata(I, VL: E->Scalars);
12677
12678 V = FinalShuffle(V, E, VecTy);
12679
12680 E->VectorizedValue = V;
12681 ++NumVectorInstructions;
12682
12683 return V;
12684 }
12685 case Instruction::Add:
12686 case Instruction::FAdd:
12687 case Instruction::Sub:
12688 case Instruction::FSub:
12689 case Instruction::Mul:
12690 case Instruction::FMul:
12691 case Instruction::UDiv:
12692 case Instruction::SDiv:
12693 case Instruction::FDiv:
12694 case Instruction::URem:
12695 case Instruction::SRem:
12696 case Instruction::FRem:
12697 case Instruction::Shl:
12698 case Instruction::LShr:
12699 case Instruction::AShr:
12700 case Instruction::And:
12701 case Instruction::Or:
12702 case Instruction::Xor: {
12703 setInsertPointAfterBundle(E);
12704
12705 Value *LHS = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12706 if (E->VectorizedValue) {
12707 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12708 return E->VectorizedValue;
12709 }
12710 Value *RHS = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
12711 if (E->VectorizedValue) {
12712 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12713 return E->VectorizedValue;
12714 }
12715 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
12716 assert((It != MinBWs.end() ||
12717 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12718 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12719 MinBWs.contains(getOperandEntry(E, 0)) ||
12720 MinBWs.contains(getOperandEntry(E, 1))) &&
12721 "Expected item in MinBWs.");
12722 if (LHS->getType() != VecTy)
12723 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: GetOperandSignedness(0));
12724 if (RHS->getType() != VecTy)
12725 RHS = Builder.CreateIntCast(V: RHS, DestTy: VecTy, isSigned: GetOperandSignedness(1));
12726 }
12727
12728 Value *V = Builder.CreateBinOp(
12729 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
12730 RHS);
12731 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0, IncludeWrapFlags: It == MinBWs.end());
12732 if (auto *I = dyn_cast<Instruction>(Val: V)) {
12733 V = propagateMetadata(I, VL: E->Scalars);
12734 // Drop nuw flags for abs(sub(commutative), true).
12735 if (!MinBWs.contains(Val: E) && ShuffleOrOp == Instruction::Sub &&
12736 any_of(Range&: E->Scalars, P: [](Value *V) {
12737 return isCommutative(I: cast<Instruction>(Val: V));
12738 }))
12739 I->setHasNoUnsignedWrap(/*b=*/false);
12740 }
12741
12742 V = FinalShuffle(V, E, VecTy);
12743
12744 E->VectorizedValue = V;
12745 ++NumVectorInstructions;
12746
12747 return V;
12748 }
12749 case Instruction::Load: {
12750 // Loads are inserted at the head of the tree because we don't want to
12751 // sink them all the way down past store instructions.
12752 setInsertPointAfterBundle(E);
12753
12754 LoadInst *LI = cast<LoadInst>(Val: VL0);
12755 Instruction *NewLI;
12756 Value *PO = LI->getPointerOperand();
12757 if (E->State == TreeEntry::Vectorize) {
12758 NewLI = Builder.CreateAlignedLoad(Ty: VecTy, Ptr: PO, Align: LI->getAlign());
12759 } else if (E->State == TreeEntry::StridedVectorize) {
12760 Value *Ptr0 = cast<LoadInst>(Val: E->Scalars.front())->getPointerOperand();
12761 Value *PtrN = cast<LoadInst>(Val: E->Scalars.back())->getPointerOperand();
12762 PO = IsReverseOrder ? PtrN : Ptr0;
12763 std::optional<int> Diff = getPointersDiff(
12764 ElemTyA: VL0->getType(), PtrA: Ptr0, ElemTyB: VL0->getType(), PtrB: PtrN, DL: *DL, SE&: *SE);
12765 Type *StrideTy = DL->getIndexType(PtrTy: PO->getType());
12766 Value *StrideVal;
12767 if (Diff) {
12768 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
12769 StrideVal =
12770 ConstantInt::get(Ty: StrideTy, V: (IsReverseOrder ? -1 : 1) * Stride *
12771 DL->getTypeAllocSize(Ty: ScalarTy));
12772 } else {
12773 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
12774 transform(Range&: E->Scalars, d_first: PointerOps.begin(), F: [](Value *V) {
12775 return cast<LoadInst>(Val: V)->getPointerOperand();
12776 });
12777 OrdersType Order;
12778 std::optional<Value *> Stride =
12779 calculateRtStride(PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order,
12780 Inst: &*Builder.GetInsertPoint());
12781 Value *NewStride =
12782 Builder.CreateIntCast(V: *Stride, DestTy: StrideTy, /*isSigned=*/true);
12783 StrideVal = Builder.CreateMul(
12784 LHS: NewStride,
12785 RHS: ConstantInt::get(
12786 Ty: StrideTy,
12787 V: (IsReverseOrder ? -1 : 1) *
12788 static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))));
12789 }
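      // The stride is in bytes and is negated for reversed orders. For
      // example, four consecutive i32 loads vectorized in reverse order start
      // from the last pointer with a stride of -4, producing roughly:
      //   llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr %pN, i64 -4,
      //                                                  <all-true mask>, i32 4)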
12790 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
12791 auto *Inst = Builder.CreateIntrinsic(
12792 Intrinsic::experimental_vp_strided_load,
12793 {VecTy, PO->getType(), StrideTy},
12794 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
12795 Builder.getInt32(E->Scalars.size())});
12796 Inst->addParamAttr(
12797 /*ArgNo=*/0,
12798 Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
12799 NewLI = Inst;
12800 } else {
12801 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
12802 Value *VecPtr = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12803 if (E->VectorizedValue) {
12804 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12805 return E->VectorizedValue;
12806 }
12807 // Use the minimum alignment of the gathered loads.
12808 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
12809 NewLI = Builder.CreateMaskedGather(Ty: VecTy, Ptrs: VecPtr, Alignment: CommonAlignment);
12810 }
12811 Value *V = propagateMetadata(I: NewLI, VL: E->Scalars);
12812
12813 V = FinalShuffle(V, E, VecTy);
12814 E->VectorizedValue = V;
12815 ++NumVectorInstructions;
12816 return V;
12817 }
12818 case Instruction::Store: {
12819 auto *SI = cast<StoreInst>(Val: VL0);
12820
12821 setInsertPointAfterBundle(E);
12822
12823 Value *VecValue = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12824 if (VecValue->getType() != VecTy)
12825 VecValue =
12826 Builder.CreateIntCast(V: VecValue, DestTy: VecTy, isSigned: GetOperandSignedness(0));
12827 VecValue = FinalShuffle(VecValue, E, VecTy);
12828
12829 Value *Ptr = SI->getPointerOperand();
12830 StoreInst *ST =
12831 Builder.CreateAlignedStore(Val: VecValue, Ptr, Align: SI->getAlign());
12832
12833 Value *V = propagateMetadata(I: ST, VL: E->Scalars);
12834
12835 E->VectorizedValue = V;
12836 ++NumVectorInstructions;
12837 return V;
12838 }
12839 case Instruction::GetElementPtr: {
12840 auto *GEP0 = cast<GetElementPtrInst>(Val: VL0);
12841 setInsertPointAfterBundle(E);
12842
12843 Value *Op0 = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12844 if (E->VectorizedValue) {
12845 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12846 return E->VectorizedValue;
12847 }
12848
12849 SmallVector<Value *> OpVecs;
12850 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
12851 Value *OpVec = vectorizeOperand(E, NodeIdx: J, PostponedPHIs);
12852 if (E->VectorizedValue) {
12853 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12854 return E->VectorizedValue;
12855 }
12856 OpVecs.push_back(Elt: OpVec);
12857 }
12858
12859 Value *V = Builder.CreateGEP(Ty: GEP0->getSourceElementType(), Ptr: Op0, IdxList: OpVecs);
12860 if (Instruction *I = dyn_cast<GetElementPtrInst>(Val: V)) {
12861 SmallVector<Value *> GEPs;
12862 for (Value *V : E->Scalars) {
12863 if (isa<GetElementPtrInst>(Val: V))
12864 GEPs.push_back(Elt: V);
12865 }
12866 V = propagateMetadata(I, VL: GEPs);
12867 }
12868
12869 V = FinalShuffle(V, E, VecTy);
12870
12871 E->VectorizedValue = V;
12872 ++NumVectorInstructions;
12873
12874 return V;
12875 }
12876 case Instruction::Call: {
12877 CallInst *CI = cast<CallInst>(Val: VL0);
12878 setInsertPointAfterBundle(E);
12879
12880 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12881
12882 SmallVector<Type *> ArgTys =
12883 buildIntrinsicArgTypes(CI, ID, VF: VecTy->getNumElements(),
12884 MinBW: It != MinBWs.end() ? It->second.first : 0);
12885 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
12886 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
12887 VecCallCosts.first <= VecCallCosts.second;
12888
12889 Value *ScalarArg = nullptr;
12890 SmallVector<Value *> OpVecs;
12891 SmallVector<Type *, 2> TysForDecl;
12892 // Add return type if intrinsic is overloaded on it.
12893 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: -1))
12894 TysForDecl.push_back(Elt: VecTy);
12895 auto *CEI = cast<CallInst>(Val: VL0);
12896 for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) {
12897 ValueList OpVL;
      // Some intrinsics have scalar arguments. Such an argument should not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
        ScalarArg = CEI->getArgOperand(I);
        // If we decided to reduce the bitwidth of the abs intrinsic, its second
        // argument must be set to false (do not return poison if the value is
        // the signed minimum).
12904 if (ID == Intrinsic::abs && It != MinBWs.end() &&
12905 It->second.first < DL->getTypeSizeInBits(Ty: CEI->getType()))
12906 ScalarArg = Builder.getFalse();
12907 OpVecs.push_back(Elt: ScalarArg);
12908 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I))
12909 TysForDecl.push_back(Elt: ScalarArg->getType());
12910 continue;
12911 }
12912
12913 Value *OpVec = vectorizeOperand(E, NodeIdx: I, PostponedPHIs);
12914 if (E->VectorizedValue) {
12915 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12916 return E->VectorizedValue;
12917 }
12918 ScalarArg = CEI->getArgOperand(i: I);
12919 if (cast<VectorType>(Val: OpVec->getType())->getElementType() !=
12920 ScalarArg->getType() &&
12921 It == MinBWs.end()) {
12922 auto *CastTy = FixedVectorType::get(ElementType: ScalarArg->getType(),
12923 NumElts: VecTy->getNumElements());
12924 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: CastTy, isSigned: GetOperandSignedness(I));
12925 } else if (It != MinBWs.end()) {
12926 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
12927 }
12928 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
12929 OpVecs.push_back(Elt: OpVec);
12930 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I))
12931 TysForDecl.push_back(Elt: OpVec->getType());
12932 }
12933
12934 Function *CF;
12935 if (!UseIntrinsic) {
12936 VFShape Shape =
12937 VFShape::get(FTy: CI->getFunctionType(),
12938 EC: ElementCount::getFixed(
12939 MinVal: static_cast<unsigned>(VecTy->getNumElements())),
12940 HasGlobalPred: false /*HasGlobalPred*/);
12941 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
12942 } else {
12943 CF = Intrinsic::getDeclaration(M: F->getParent(), id: ID, Tys: TysForDecl);
12944 }
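    // CF is either the target's vector library function for this call (via
    // the VFDatabase/TLI mappings) or the declaration of the matching vector
    // intrinsic, depending on which variant was costed as profitable above.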
12945
12946 SmallVector<OperandBundleDef, 1> OpBundles;
12947 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
12948 Value *V = Builder.CreateCall(Callee: CF, Args: OpVecs, OpBundles);
12949
12950 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
12951 V = FinalShuffle(V, E, VecTy);
12952
12953 E->VectorizedValue = V;
12954 ++NumVectorInstructions;
12955 return V;
12956 }
12957 case Instruction::ShuffleVector: {
12958 assert(E->isAltShuffle() &&
12959 ((Instruction::isBinaryOp(E->getOpcode()) &&
12960 Instruction::isBinaryOp(E->getAltOpcode())) ||
12961 (Instruction::isCast(E->getOpcode()) &&
12962 Instruction::isCast(E->getAltOpcode())) ||
12963 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
12964 "Invalid Shuffle Vector Operand");
12965
12966 Value *LHS = nullptr, *RHS = nullptr;
12967 if (Instruction::isBinaryOp(Opcode: E->getOpcode()) || isa<CmpInst>(Val: VL0)) {
12968 setInsertPointAfterBundle(E);
12969 LHS = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12970 if (E->VectorizedValue) {
12971 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12972 return E->VectorizedValue;
12973 }
12974 RHS = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
12975 } else {
12976 setInsertPointAfterBundle(E);
12977 LHS = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
12978 }
12979 if (E->VectorizedValue) {
12980 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12981 return E->VectorizedValue;
12982 }
12983 if (LHS && RHS &&
12984 ((Instruction::isBinaryOp(Opcode: E->getOpcode()) &&
12985 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
12986 (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()))) {
12987 assert((It != MinBWs.end() ||
12988 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12989 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12990 MinBWs.contains(getOperandEntry(E, 0)) ||
12991 MinBWs.contains(getOperandEntry(E, 1))) &&
12992 "Expected item in MinBWs.");
12993 Type *CastTy = VecTy;
12994 if (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()) {
12995 if (cast<VectorType>(Val: LHS->getType())
12996 ->getElementType()
12997 ->getIntegerBitWidth() < cast<VectorType>(Val: RHS->getType())
12998 ->getElementType()
12999 ->getIntegerBitWidth())
13000 CastTy = RHS->getType();
13001 else
13002 CastTy = LHS->getType();
13003 }
13004 if (LHS->getType() != CastTy)
13005 LHS = Builder.CreateIntCast(V: LHS, DestTy: CastTy, isSigned: GetOperandSignedness(0));
13006 if (RHS->getType() != CastTy)
13007 RHS = Builder.CreateIntCast(V: RHS, DestTy: CastTy, isSigned: GetOperandSignedness(1));
13008 }
13009
13010 Value *V0, *V1;
13011 if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
13012 V0 = Builder.CreateBinOp(
13013 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13014 V1 = Builder.CreateBinOp(
13015 Opc: static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13016 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
13017 V0 = Builder.CreateCmp(Pred: CI0->getPredicate(), LHS, RHS);
13018 auto *AltCI = cast<CmpInst>(Val: E->getAltOp());
13019 CmpInst::Predicate AltPred = AltCI->getPredicate();
13020 V1 = Builder.CreateCmp(Pred: AltPred, LHS, RHS);
13021 } else {
13022 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13023 unsigned SrcBWSz = DL->getTypeSizeInBits(
13024 Ty: cast<VectorType>(Val: LHS->getType())->getElementType());
13025 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
13026 if (BWSz <= SrcBWSz) {
13027 if (BWSz < SrcBWSz)
13028 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: It->second.first);
13029 assert(LHS->getType() == VecTy && "Expected same type as operand.");
13030 if (auto *I = dyn_cast<Instruction>(Val: LHS))
13031 LHS = propagateMetadata(I, VL: E->Scalars);
13032 E->VectorizedValue = LHS;
13033 ++NumVectorInstructions;
13034 return LHS;
13035 }
13036 }
13037 V0 = Builder.CreateCast(
13038 Op: static_cast<Instruction::CastOps>(E->getOpcode()), V: LHS, DestTy: VecTy);
13039 V1 = Builder.CreateCast(
13040 Op: static_cast<Instruction::CastOps>(E->getAltOpcode()), V: LHS, DestTy: VecTy);
13041 }
    // Add V0 and V1 to later analysis to try to find and remove matching
    // instructions, if any.
13044 for (Value *V : {V0, V1}) {
13045 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13046 GatherShuffleExtractSeq.insert(X: I);
13047 CSEBlocks.insert(V: I->getParent());
13048 }
13049 }
13050
13051 // Create shuffle to take alternate operations from the vector.
13052 // Also, gather up main and alt scalar ops to propagate IR flags to
13053 // each vector operation.
13054 ValueList OpScalars, AltScalars;
13055 SmallVector<int> Mask;
13056 E->buildAltOpShuffleMask(
13057 IsAltOp: [E, this](Instruction *I) {
13058 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13059 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
13060 TLI: *TLI);
13061 },
13062 Mask, OpScalars: &OpScalars, AltScalars: &AltScalars);
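    // Illustrative example: for scalars {add, sub, add, sub}, V0 is the vector
    // add, V1 is the vector sub, and Mask becomes {0, 5, 2, 7}, so each lane
    // of the final shuffle takes its result from the matching operation.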
13063
13064 propagateIRFlags(I: V0, VL: OpScalars, OpValue: E->getMainOp(), IncludeWrapFlags: It == MinBWs.end());
13065 propagateIRFlags(I: V1, VL: AltScalars, OpValue: E->getAltOp(), IncludeWrapFlags: It == MinBWs.end());
13066 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13067 // Drop nuw flags for abs(sub(commutative), true).
13068 if (auto *I = dyn_cast<Instruction>(Val: Vec);
13069 I && Opcode == Instruction::Sub && !MinBWs.contains(Val: E) &&
13070 any_of(Range&: E->Scalars, P: [](Value *V) {
13071 auto *IV = cast<Instruction>(Val: V);
13072 return IV->getOpcode() == Instruction::Sub &&
13073 isCommutative(I: cast<Instruction>(Val: IV));
13074 }))
13075 I->setHasNoUnsignedWrap(/*b=*/false);
13076 };
13077 DropNuwFlag(V0, E->getOpcode());
13078 DropNuwFlag(V1, E->getAltOpcode());
13079
13080 Value *V = Builder.CreateShuffleVector(V1: V0, V2: V1, Mask);
13081 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13082 V = propagateMetadata(I, VL: E->Scalars);
13083 GatherShuffleExtractSeq.insert(X: I);
13084 CSEBlocks.insert(V: I->getParent());
13085 }
13086
13087 E->VectorizedValue = V;
13088 ++NumVectorInstructions;
13089
13090 return V;
13091 }
13092 default:
13093 llvm_unreachable("unknown inst");
13094 }
13095 return nullptr;
13096}
13097
13098Value *BoUpSLP::vectorizeTree() {
13099 ExtraValueToDebugLocsMap ExternallyUsedValues;
13100 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13101 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13102}
13103
13104namespace {
13105/// Data type for handling buildvector sequences with the reused scalars from
13106/// other tree entries.
13107struct ShuffledInsertData {
13108 /// List of insertelements to be replaced by shuffles.
13109 SmallVector<InsertElementInst *> InsertElements;
13110 /// The parent vectors and shuffle mask for the given list of inserts.
13111 MapVector<Value *, SmallVector<int>> ValueMasks;
13112};
13113} // namespace
13114
13115Value *BoUpSLP::vectorizeTree(
13116 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13117 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13118 Instruction *ReductionRoot) {
13119 // All blocks must be scheduled before any instructions are inserted.
13120 for (auto &BSIter : BlocksSchedules) {
13121 scheduleBlock(BS: BSIter.second.get());
13122 }
  // Clear the Entry-to-LastInstruction table. It can be invalidated by
  // scheduling, so it needs to be rebuilt.
13125 EntryToLastInstruction.clear();
13126
13127 if (ReductionRoot)
13128 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
13129 IP: ReductionRoot->getIterator());
13130 else
13131 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
13132
13133 // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
13134 (void)vectorizeTree(E: VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13135 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13136 if (TE->State == TreeEntry::Vectorize &&
13137 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13138 TE->VectorizedValue)
13139 (void)vectorizeTree(E: TE.get(), /*PostponedPHIs=*/false);
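  // PHIs are emitted in two passes: the first pass creates empty vector PHIs
  // so that cyclic uses (e.g. a loop-carried value feeding its own update) can
  // refer to them, and the pass above then fills in the incoming values.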
  // Run through the list of postponed gathers and emit them, replacing the
  // temporarily emitted allocas with actual vector instructions.
13142 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13143 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13144 for (const TreeEntry *E : PostponedNodes) {
13145 auto *TE = const_cast<TreeEntry *>(E);
13146 if (auto *VecTE = getTreeEntry(V: TE->Scalars.front()))
13147 if (VecTE->isSame(VL: TE->UserTreeIndices.front().UserTE->getOperand(
13148 OpIdx: TE->UserTreeIndices.front().EdgeIdx)))
        // Found a gather node which is exactly the same as one of the
        // vectorized nodes. This may happen after reordering.
13151 continue;
13152 auto *PrevVec = cast<Instruction>(Val&: TE->VectorizedValue);
13153 TE->VectorizedValue = nullptr;
13154 auto *UserI =
13155 cast<Instruction>(Val&: TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were some
    // unresolved dependencies at the moment the stub instruction was emitted.
    // If any of these dependencies turns out to be an operand of another PHI
    // coming from this same block, the position of the stub instruction
    // becomes invalid, because the source vector that is supposed to feed this
    // gather node was inserted at the end of the block [after the stub
    // instruction]. So we need to adjust the insertion point again to the end
    // of the block.
13164 if (isa<PHINode>(Val: UserI)) {
13165 // Insert before all users.
13166 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13167 for (User *U : PrevVec->users()) {
13168 if (U == UserI)
13169 continue;
13170 auto *UI = dyn_cast<Instruction>(Val: U);
13171 if (!UI || isa<PHINode>(Val: UI) || UI->getParent() != InsertPt->getParent())
13172 continue;
13173 if (UI->comesBefore(Other: InsertPt))
13174 InsertPt = UI;
13175 }
13176 Builder.SetInsertPoint(InsertPt);
13177 } else {
13178 Builder.SetInsertPoint(PrevVec);
13179 }
13180 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13181 Value *Vec = vectorizeTree(E: TE, /*PostponedPHIs=*/false);
13182 if (Vec->getType() != PrevVec->getType()) {
13183 assert(Vec->getType()->isIntOrIntVectorTy() &&
13184 PrevVec->getType()->isIntOrIntVectorTy() &&
13185 "Expected integer vector types only.");
13186 std::optional<bool> IsSigned;
13187 for (Value *V : TE->Scalars) {
13188 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13189 auto It = MinBWs.find(Val: BaseTE);
13190 if (It != MinBWs.end()) {
13191 IsSigned = IsSigned.value_or(u: false) || It->second.second;
13192 if (*IsSigned)
13193 break;
13194 }
13195 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(Val: V)) {
13196 auto It = MinBWs.find(Val: MNTE);
13197 if (It != MinBWs.end()) {
13198 IsSigned = IsSigned.value_or(u: false) || It->second.second;
13199 if (*IsSigned)
13200 break;
13201 }
13202 }
13203 if (IsSigned.value_or(u: false))
13204 break;
13205 // Scan through gather nodes.
13206 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) {
13207 auto It = MinBWs.find(Val: BVE);
13208 if (It != MinBWs.end()) {
13209 IsSigned = IsSigned.value_or(u: false) || It->second.second;
13210 if (*IsSigned)
13211 break;
13212 }
13213 }
13214 if (IsSigned.value_or(u: false))
13215 break;
13216 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V)) {
13217 IsSigned =
13218 IsSigned.value_or(u: false) ||
13219 !isKnownNonNegative(V: EE->getVectorOperand(), SQ: SimplifyQuery(*DL));
13220 continue;
13221 }
13222 if (IsSigned.value_or(u: false))
13223 break;
13224 }
13225 }
13226 if (IsSigned.value_or(u: false)) {
13227 // Final attempt - check user node.
13228 auto It = MinBWs.find(Val: TE->UserTreeIndices.front().UserTE);
13229 if (It != MinBWs.end())
13230 IsSigned = It->second.second;
13231 }
13232 assert(IsSigned &&
13233 "Expected user node or perfect diamond match in MinBWs.");
13234 Vec = Builder.CreateIntCast(V: Vec, DestTy: PrevVec->getType(), isSigned: *IsSigned);
13235 }
13236 PrevVec->replaceAllUsesWith(V: Vec);
13237 PostponedValues.try_emplace(Key: Vec).first->second.push_back(Elt: TE);
    // Replace the stub vector node if it was already used for one of the
    // buildvector nodes.
13240 auto It = PostponedValues.find(Val: PrevVec);
13241 if (It != PostponedValues.end()) {
13242 for (TreeEntry *VTE : It->getSecond())
13243 VTE->VectorizedValue = Vec;
13244 }
13245 eraseInstruction(I: PrevVec);
13246 }
13247
  LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
                    << " values.\n");
13250
13251 SmallVector<ShuffledInsertData> ShuffledInserts;
13252 // Maps vector instruction to original insertelement instruction
13253 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13254 // Maps extract Scalar to the corresponding extractelement instruction in the
13255 // basic block. Only one extractelement per block should be emitted.
13256 DenseMap<Value *,
13257 DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13258 ScalarToEEs;
13259 SmallDenseSet<Value *, 4> UsedInserts;
13260 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13261 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13262 // Extract all of the elements with the external uses.
13263 for (const auto &ExternalUse : ExternalUses) {
13264 Value *Scalar = ExternalUse.Scalar;
13265 llvm::User *User = ExternalUse.User;
13266
    // Skip users that we have already RAUWed. This happens when one
    // instruction has multiple uses of the same value.
13269 if (User && !is_contained(Range: Scalar->users(), Element: User))
13270 continue;
13271 TreeEntry *E = getTreeEntry(V: Scalar);
13272 assert(E && "Invalid scalar");
13273 assert(E->State != TreeEntry::NeedToGather &&
13274 "Extracting from a gather list");
13275 // Non-instruction pointers are not deleted, just skip them.
13276 if (E->getOpcode() == Instruction::GetElementPtr &&
13277 !isa<GetElementPtrInst>(Val: Scalar))
13278 continue;
13279
13280 Value *Vec = E->VectorizedValue;
13281 assert(Vec && "Can't find vectorizable value");
13282
13283 Value *Lane = Builder.getInt32(C: ExternalUse.Lane);
13284 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13285 if (Scalar->getType() != Vec->getType()) {
13286 Value *Ex = nullptr;
13287 Value *ExV = nullptr;
13288 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Scalar);
13289 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(Ptr: GEP);
13290 auto It = ScalarToEEs.find(Val: Scalar);
13291 if (It != ScalarToEEs.end()) {
13292 // No need to emit many extracts, just move the only one in the
13293 // current block.
13294 auto EEIt = It->second.find(Val: Builder.GetInsertBlock());
13295 if (EEIt != It->second.end()) {
13296 Instruction *I = EEIt->second.first;
13297 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13298 Builder.GetInsertPoint()->comesBefore(Other: I)) {
13299 I->moveBefore(BB&: *Builder.GetInsertPoint()->getParent(),
13300 I: Builder.GetInsertPoint());
13301 if (auto *CI = EEIt->second.second)
13302 CI->moveAfter(MovePos: I);
13303 }
13304 Ex = I;
13305 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13306 }
13307 }
13308 if (!Ex) {
13309 // "Reuse" the existing extract to improve final codegen.
13310 if (auto *ES = dyn_cast<ExtractElementInst>(Val: Scalar)) {
13311 Value *V = ES->getVectorOperand();
13312 if (const TreeEntry *ETE = getTreeEntry(V))
13313 V = ETE->VectorizedValue;
13314 Ex = Builder.CreateExtractElement(Vec: V, Idx: ES->getIndexOperand());
13315 } else if (ReplaceGEP) {
13316 // Leave the GEPs as is, they are free in most cases and better to
13317 // keep them as GEPs.
13318 auto *CloneGEP = GEP->clone();
13319 CloneGEP->insertBefore(BB&: *Builder.GetInsertBlock(),
13320 InsertPos: Builder.GetInsertPoint());
13321 if (GEP->hasName())
13322 CloneGEP->takeName(V: GEP);
13323 Ex = CloneGEP;
13324 } else {
13325 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
13326 }
13327 // If necessary, sign-extend or zero-extend ScalarRoot
13328 // to the larger type.
13329 ExV = Ex;
13330 if (Scalar->getType() != Ex->getType())
13331 ExV = Builder.CreateIntCast(V: Ex, DestTy: Scalar->getType(),
13332 isSigned: MinBWs.find(Val: E)->second.second);
13333 if (auto *I = dyn_cast<Instruction>(Val: Ex))
13334 ScalarToEEs[Scalar].try_emplace(
13335 Key: Builder.GetInsertBlock(),
13336 Args: std::make_pair(x&: I, y: cast<Instruction>(Val: ExV)));
13337 }
        // The 'then' branch of the previous 'if' may produce constants, since
        // operand 0 might be a constant.
13340 if (auto *ExI = dyn_cast<Instruction>(Val: Ex)) {
13341 GatherShuffleExtractSeq.insert(X: ExI);
13342 CSEBlocks.insert(V: ExI->getParent());
13343 }
13344 return ExV;
13345 }
13346 assert(isa<FixedVectorType>(Scalar->getType()) &&
13347 isa<InsertElementInst>(Scalar) &&
13348 "In-tree scalar of vector type is not insertelement?");
13349 auto *IE = cast<InsertElementInst>(Val: Scalar);
13350 VectorToInsertElement.try_emplace(Key: Vec, Args&: IE);
13351 return Vec;
13352 };
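    // ExtractAndExtendIfNeeded materializes the scalar from the vectorized
    // value: it reuses an existing extractelement in the current block when
    // possible, otherwise emits a new one, and int-casts the result back to
    // the original scalar type if the node was demoted via MinBWs.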
    // If User == nullptr, the Scalar remains as a scalar in the vectorized
    // instructions or is used as an extra argument. Generate an ExtractElement
    // instruction and update the record for this scalar in
    // ExternallyUsedValues.
13356 if (!User) {
13357 if (!ScalarsWithNullptrUser.insert(V: Scalar).second)
13358 continue;
13359 assert((ExternallyUsedValues.count(Scalar) ||
13360 any_of(Scalar->users(),
13361 [&](llvm::User *U) {
13362 if (ExternalUsesAsGEPs.contains(U))
13363 return true;
13364 TreeEntry *UseEntry = getTreeEntry(U);
13365 return UseEntry &&
13366 (UseEntry->State == TreeEntry::Vectorize ||
13367 UseEntry->State ==
13368 TreeEntry::StridedVectorize) &&
13369 (E->State == TreeEntry::Vectorize ||
13370 E->State == TreeEntry::StridedVectorize) &&
13371 doesInTreeUserNeedToExtract(
13372 Scalar,
13373 cast<Instruction>(UseEntry->Scalars.front()),
13374 TLI);
13375 })) &&
13376 "Scalar with nullptr User must be registered in "
13377 "ExternallyUsedValues map or remain as scalar in vectorized "
13378 "instructions");
13379 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
13380 if (auto *PHI = dyn_cast<PHINode>(Val: VecI))
13381 Builder.SetInsertPoint(TheBB: PHI->getParent(),
13382 IP: PHI->getParent()->getFirstNonPHIIt());
13383 else
13384 Builder.SetInsertPoint(TheBB: VecI->getParent(),
13385 IP: std::next(x: VecI->getIterator()));
13386 } else {
13387 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
13388 }
13389 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13390 // Required to update internally referenced instructions.
13391 Scalar->replaceAllUsesWith(V: NewInst);
13392 ReplacedExternals.emplace_back(Args&: Scalar, Args&: NewInst);
13393 continue;
13394 }
13395
13396 if (auto *VU = dyn_cast<InsertElementInst>(Val: User)) {
13397 // Skip if the scalar is another vector op or Vec is not an instruction.
13398 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Val: Vec)) {
13399 if (auto *FTy = dyn_cast<FixedVectorType>(Val: User->getType())) {
13400 if (!UsedInserts.insert(V: VU).second)
13401 continue;
13402 // Need to use original vector, if the root is truncated.
13403 auto BWIt = MinBWs.find(Val: E);
13404 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13405 auto *ScalarTy = FTy->getElementType();
13406 auto Key = std::make_pair(x&: Vec, y&: ScalarTy);
13407 auto VecIt = VectorCasts.find(Val: Key);
13408 if (VecIt == VectorCasts.end()) {
13409 IRBuilderBase::InsertPointGuard Guard(Builder);
13410 if (auto *IVec = dyn_cast<Instruction>(Val: Vec))
13411 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
13412 Vec = Builder.CreateIntCast(
13413 V: Vec,
13414 DestTy: FixedVectorType::get(
13415 ElementType: ScalarTy,
13416 NumElts: cast<FixedVectorType>(Val: Vec->getType())->getNumElements()),
13417 isSigned: BWIt->second.second);
13418 VectorCasts.try_emplace(Key, Args&: Vec);
13419 } else {
13420 Vec = VecIt->second;
13421 }
13422 }
13423
13424 std::optional<unsigned> InsertIdx = getInsertIndex(InsertInst: VU);
13425 if (InsertIdx) {
13426 auto *It =
13427 find_if(Range&: ShuffledInserts, P: [VU](const ShuffledInsertData &Data) {
13428 // Checks if 2 insertelements are from the same buildvector.
13429 InsertElementInst *VecInsert = Data.InsertElements.front();
13430 return areTwoInsertFromSameBuildVector(
13431 VU, V: VecInsert,
13432 GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); });
13433 });
13434 unsigned Idx = *InsertIdx;
13435 if (It == ShuffledInserts.end()) {
13436 (void)ShuffledInserts.emplace_back();
13437 It = std::next(x: ShuffledInserts.begin(),
13438 n: ShuffledInserts.size() - 1);
13439 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13440 if (Mask.empty())
13441 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
13442 // Find the insertvector, vectorized in tree, if any.
13443 Value *Base = VU;
13444 while (auto *IEBase = dyn_cast<InsertElementInst>(Val: Base)) {
13445 if (IEBase != User &&
13446 (!IEBase->hasOneUse() ||
13447 getInsertIndex(InsertInst: IEBase).value_or(u&: Idx) == Idx))
13448 break;
13449 // Build the mask for the vectorized insertelement instructions.
13450 if (const TreeEntry *E = getTreeEntry(V: IEBase)) {
13451 do {
13452 IEBase = cast<InsertElementInst>(Val: Base);
13453 int IEIdx = *getInsertIndex(InsertInst: IEBase);
13454 assert(Mask[IEIdx] == PoisonMaskElem &&
13455 "InsertElementInstruction used already.");
13456 Mask[IEIdx] = IEIdx;
13457 Base = IEBase->getOperand(i_nocapture: 0);
13458 } while (E == getTreeEntry(V: Base));
13459 break;
13460 }
13461 Base = cast<InsertElementInst>(Val: Base)->getOperand(i_nocapture: 0);
              // After vectorization the def-use chain has changed, so we need
              // to look through the original insertelement instructions if
              // they were replaced by vector instructions.
13465 auto It = VectorToInsertElement.find(Val: Base);
13466 if (It != VectorToInsertElement.end())
13467 Base = It->second;
13468 }
13469 }
13470 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13471 if (Mask.empty())
13472 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
13473 Mask[Idx] = ExternalUse.Lane;
13474 It->InsertElements.push_back(Elt: cast<InsertElementInst>(Val: User));
13475 continue;
13476 }
13477 }
13478 }
13479 }
13480
13481 // Generate extracts for out-of-tree users.
13482 // Find the insertion point for the extractelement lane.
13483 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
13484 if (PHINode *PH = dyn_cast<PHINode>(Val: User)) {
13485 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) {
13486 if (PH->getIncomingValue(i: I) == Scalar) {
13487 Instruction *IncomingTerminator =
13488 PH->getIncomingBlock(i: I)->getTerminator();
13489 if (isa<CatchSwitchInst>(Val: IncomingTerminator)) {
13490 Builder.SetInsertPoint(TheBB: VecI->getParent(),
13491 IP: std::next(x: VecI->getIterator()));
13492 } else {
13493 Builder.SetInsertPoint(PH->getIncomingBlock(i: I)->getTerminator());
13494 }
13495 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13496 PH->setOperand(i_nocapture: I, Val_nocapture: NewInst);
13497 }
13498 }
13499 } else {
13500 Builder.SetInsertPoint(cast<Instruction>(Val: User));
13501 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13502 User->replaceUsesOfWith(From: Scalar, To: NewInst);
13503 }
13504 } else {
13505 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
13506 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13507 User->replaceUsesOfWith(From: Scalar, To: NewInst);
13508 }
13509
13510 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
13511 }
13512
13513 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
13514 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
13515 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
13516 int VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
13517 for (int I = 0, E = Mask.size(); I < E; ++I) {
13518 if (Mask[I] < VF)
13519 CombinedMask1[I] = Mask[I];
13520 else
13521 CombinedMask2[I] = Mask[I] - VF;
13522 }
13523 ShuffleInstructionBuilder ShuffleBuilder(Builder, *this);
13524 ShuffleBuilder.add(V1, Mask: CombinedMask1);
13525 if (V2)
13526 ShuffleBuilder.add(V1: V2, Mask: CombinedMask2);
13527 return ShuffleBuilder.finalize(ExtMask: std::nullopt);
13528 };
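  // Illustrative example: a combined mask {0, 5, 2, 7} over two 4-lane inputs
  // is split by CreateShuffle into {0, P, 2, P} for V1 and {P, 1, P, 3} for V2
  // (P = poison) before being handed to the shuffle builder.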
13529
13530 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
13531 bool ForSingleMask) {
13532 unsigned VF = Mask.size();
13533 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
13534 if (VF != VecVF) {
13535 if (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
13536 Vec = CreateShuffle(Vec, nullptr, Mask);
13537 return std::make_pair(x&: Vec, y: true);
13538 }
13539 if (!ForSingleMask) {
13540 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
13541 for (unsigned I = 0; I < VF; ++I) {
13542 if (Mask[I] != PoisonMaskElem)
13543 ResizeMask[Mask[I]] = Mask[I];
13544 }
13545 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
13546 }
13547 }
13548
13549 return std::make_pair(x&: Vec, y: false);
13550 };
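  // ResizeToVF adjusts an already vectorized value to the vector factor of
  // the mask, emitting an extra resizing shuffle when the sizes do not match.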
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
13553 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
13554 // Find the first and the last instruction in the list of insertelements.
13555 sort(C&: ShuffledInserts[I].InsertElements, Comp: isFirstInsertElement);
13556 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
13557 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
13558 Builder.SetInsertPoint(LastInsert);
13559 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
13560 Value *NewInst = performExtractsShuffleAction<Value>(
13561 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()),
13562 Base: FirstInsert->getOperand(i_nocapture: 0),
13563 GetVF: [](Value *Vec) {
13564 return cast<VectorType>(Val: Vec->getType())
13565 ->getElementCount()
13566 .getKnownMinValue();
13567 },
13568 ResizeAction: ResizeToVF,
13569 Action: [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
13570 ArrayRef<Value *> Vals) {
13571 assert((Vals.size() == 1 || Vals.size() == 2) &&
13572 "Expected exactly 1 or 2 input values.");
13573 if (Vals.size() == 1) {
13574 // Do not create shuffle if the mask is a simple identity
13575 // non-resizing mask.
13576 if (Mask.size() != cast<FixedVectorType>(Val: Vals.front()->getType())
13577 ->getNumElements() ||
13578 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
13579 return CreateShuffle(Vals.front(), nullptr, Mask);
13580 return Vals.front();
13581 }
13582 return CreateShuffle(Vals.front() ? Vals.front()
13583 : FirstInsert->getOperand(i_nocapture: 0),
13584 Vals.back(), Mask);
13585 });
13586 auto It = ShuffledInserts[I].InsertElements.rbegin();
13587 // Rebuild buildvector chain.
13588 InsertElementInst *II = nullptr;
13589 if (It != ShuffledInserts[I].InsertElements.rend())
13590 II = *It;
13591 SmallVector<Instruction *> Inserts;
13592 while (It != ShuffledInserts[I].InsertElements.rend()) {
13593 assert(II && "Must be an insertelement instruction.");
13594 if (*It == II)
13595 ++It;
13596 else
13597 Inserts.push_back(Elt: cast<Instruction>(Val: II));
13598 II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
13599 }
13600 for (Instruction *II : reverse(C&: Inserts)) {
13601 II->replaceUsesOfWith(From: II->getOperand(i: 0), To: NewInst);
13602 if (auto *NewI = dyn_cast<Instruction>(Val: NewInst))
13603 if (II->getParent() == NewI->getParent() && II->comesBefore(Other: NewI))
13604 II->moveAfter(MovePos: NewI);
13605 NewInst = II;
13606 }
13607 LastInsert->replaceAllUsesWith(V: NewInst);
13608 for (InsertElementInst *IE : reverse(C&: ShuffledInserts[I].InsertElements)) {
13609 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 0),
13610 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 0)->getType()));
13611 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 1),
13612 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 1)->getType()));
13613 eraseInstruction(I: IE);
13614 }
13615 CSEBlocks.insert(V: LastInsert->getParent());
13616 }
13617
13618 SmallVector<Instruction *> RemovedInsts;
13619 // For each vectorized value:
13620 for (auto &TEPtr : VectorizableTree) {
13621 TreeEntry *Entry = TEPtr.get();
13622
13623 // No need to handle users of gathered values.
13624 if (Entry->State == TreeEntry::NeedToGather)
13625 continue;
13626
13627 assert(Entry->VectorizedValue && "Can't find vectorizable value");
13628
13629 // For each lane:
13630 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
13631 Value *Scalar = Entry->Scalars[Lane];
13632
13633 if (Entry->getOpcode() == Instruction::GetElementPtr &&
13634 !isa<GetElementPtrInst>(Val: Scalar))
13635 continue;
13636#ifndef NDEBUG
13637 Type *Ty = Scalar->getType();
13638 if (!Ty->isVoidTy()) {
13639 for (User *U : Scalar->users()) {
13640 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
13641
13642 // It is legal to delete users in the ignorelist.
13643 assert((getTreeEntry(U) ||
13644 (UserIgnoreList && UserIgnoreList->contains(U)) ||
13645 (isa_and_nonnull<Instruction>(U) &&
13646 isDeleted(cast<Instruction>(U)))) &&
13647 "Deleting out-of-tree value");
13648 }
13649 }
13650#endif
13651 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
13652 eraseInstruction(I: cast<Instruction>(Val: Scalar));
13653 // Retain to-be-deleted instructions for some debug-info
13654 // bookkeeping. NOTE: eraseInstruction only marks the instruction for
13655 // deletion - instructions are not deleted until later.
13656 RemovedInsts.push_back(Elt: cast<Instruction>(Val: Scalar));
13657 }
13658 }
13659
13660 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
13661 // new vector instruction.
13662 if (auto *V = dyn_cast<Instruction>(Val&: VectorizableTree[0]->VectorizedValue))
13663 V->mergeDIAssignID(SourceInstructions: RemovedInsts);
13664
13665 Builder.ClearInsertionPoint();
13666 InstrElementSize.clear();
13667
13668 const TreeEntry &RootTE = *VectorizableTree.front().get();
13669 Value *Vec = RootTE.VectorizedValue;
13670 if (auto It = MinBWs.find(Val: &RootTE); ReductionBitWidth != 0 &&
13671 It != MinBWs.end() &&
13672 ReductionBitWidth != It->second.first) {
13673 IRBuilder<>::InsertPointGuard Guard(Builder);
13674 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
13675 IP: ReductionRoot->getIterator());
13676 Vec = Builder.CreateIntCast(
13677 V: Vec,
13678 DestTy: VectorType::get(ElementType: Builder.getIntNTy(N: ReductionBitWidth),
13679 EC: cast<VectorType>(Val: Vec->getType())->getElementCount()),
13680 isSigned: It->second.second);
13681 }
13682 return Vec;
13683}
13684
13685void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequence instructions.\n");
13688 // LICM InsertElementInst sequences.
13689 for (Instruction *I : GatherShuffleExtractSeq) {
13690 if (isDeleted(I))
13691 continue;
13692
13693 // Check if this block is inside a loop.
13694 Loop *L = LI->getLoopFor(BB: I->getParent());
13695 if (!L)
13696 continue;
13697
13698 // Check if it has a preheader.
13699 BasicBlock *PreHeader = L->getLoopPreheader();
13700 if (!PreHeader)
13701 continue;
13702
    // If the vector or the element that we insert into it are
    // instructions that are defined inside the loop, then we can't
    // hoist this instruction out of the loop.
13706 if (any_of(Range: I->operands(), P: [L](Value *V) {
13707 auto *OpI = dyn_cast<Instruction>(Val: V);
13708 return OpI && L->contains(Inst: OpI);
13709 }))
13710 continue;
13711
13712 // We can hoist this instruction. Move it to the pre-header.
13713 I->moveBefore(MovePos: PreHeader->getTerminator());
13714 CSEBlocks.insert(V: PreHeader);
13715 }
13716
13717 // Make a list of all reachable blocks in our CSE queue.
13718 SmallVector<const DomTreeNode *, 8> CSEWorkList;
13719 CSEWorkList.reserve(N: CSEBlocks.size());
13720 for (BasicBlock *BB : CSEBlocks)
13721 if (DomTreeNode *N = DT->getNode(BB)) {
13722 assert(DT->isReachableFromEntry(N));
13723 CSEWorkList.push_back(Elt: N);
13724 }
13725
13726 // Sort blocks by domination. This ensures we visit a block after all blocks
13727 // dominating it are visited.
13728 llvm::sort(C&: CSEWorkList, Comp: [](const DomTreeNode *A, const DomTreeNode *B) {
13729 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
13730 "Different nodes should have different DFS numbers");
13731 return A->getDFSNumIn() < B->getDFSNumIn();
13732 });
13733
  // Less defined shuffles can be replaced by more defined copies.
  // Given two shuffles, one is less defined than the other if it has the same
  // vector operands and each of its mask indices is either undef or matches
  // the corresponding index of the other one. E.g. shuffle %0, poison,
  // <0, 0, 0, undef> is less defined than shuffle %0, poison, <0, 0, 0, 0>.
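  // For illustration only (hypothetical shuffles with the same vector
  // operands, not taken from a real test):
  //   SI1: shufflevector <4 x i32> %0, <4 x i32> poison, <0, 1, undef, undef>
  //   SI2: shufflevector <4 x i32> %0, <4 x i32> poison, <0, 1, 2, undef>
  // SI1 is less defined than SI2, so uses of SI1 can be redirected to SI2
  // (with the masks merged), provided the trailing undefs do not change the
  // number of vector registers actually used.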
13739 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
13740 SmallVectorImpl<int> &NewMask) {
13741 if (I1->getType() != I2->getType())
13742 return false;
13743 auto *SI1 = dyn_cast<ShuffleVectorInst>(Val: I1);
13744 auto *SI2 = dyn_cast<ShuffleVectorInst>(Val: I2);
13745 if (!SI1 || !SI2)
13746 return I1->isIdenticalTo(I: I2);
13747 if (SI1->isIdenticalTo(I: SI2))
13748 return true;
13749 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
13750 if (SI1->getOperand(i_nocapture: I) != SI2->getOperand(i_nocapture: I))
13751 return false;
13752 // Check if the second instruction is more defined than the first one.
13753 NewMask.assign(in_start: SI2->getShuffleMask().begin(), in_end: SI2->getShuffleMask().end());
13754 ArrayRef<int> SM1 = SI1->getShuffleMask();
13755 // Count trailing undefs in the mask to check the final number of used
13756 // registers.
13757 unsigned LastUndefsCnt = 0;
13758 for (int I = 0, E = NewMask.size(); I < E; ++I) {
13759 if (SM1[I] == PoisonMaskElem)
13760 ++LastUndefsCnt;
13761 else
13762 LastUndefsCnt = 0;
13763 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
13764 NewMask[I] != SM1[I])
13765 return false;
13766 if (NewMask[I] == PoisonMaskElem)
13767 NewMask[I] = SM1[I];
13768 }
13769 // Check if the last undefs actually change the final number of used vector
13770 // registers.
13771 return SM1.size() - LastUndefsCnt > 1 &&
13772 TTI->getNumberOfParts(Tp: SI1->getType()) ==
13773 TTI->getNumberOfParts(
13774 Tp: FixedVectorType::get(ElementType: SI1->getType()->getElementType(),
13775 NumElts: SM1.size() - LastUndefsCnt));
13776 };
13777 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
13778 // instructions. TODO: We can further optimize this scan if we split the
13779 // instructions into different buckets based on the insert lane.
13780 SmallVector<Instruction *, 16> Visited;
13781 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
13782 assert(*I &&
13783 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
13784 "Worklist not sorted properly!");
13785 BasicBlock *BB = (*I)->getBlock();
13786 // For all instructions in blocks containing gather sequences:
13787 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
13788 if (isDeleted(I: &In))
13789 continue;
13790 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(Val: &In) &&
13791 !GatherShuffleExtractSeq.contains(key: &In))
13792 continue;
13793
13794 // Check if we can replace this instruction with any of the
13795 // visited instructions.
13796 bool Replaced = false;
13797 for (Instruction *&V : Visited) {
13798 SmallVector<int> NewMask;
13799 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
13800 DT->dominates(A: V->getParent(), B: In.getParent())) {
13801 In.replaceAllUsesWith(V);
13802 eraseInstruction(I: &In);
13803 if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: V))
13804 if (!NewMask.empty())
13805 SI->setShuffleMask(NewMask);
13806 Replaced = true;
13807 break;
13808 }
13809 if (isa<ShuffleVectorInst>(Val: In) && isa<ShuffleVectorInst>(Val: V) &&
13810 GatherShuffleExtractSeq.contains(key: V) &&
13811 IsIdenticalOrLessDefined(V, &In, NewMask) &&
13812 DT->dominates(A: In.getParent(), B: V->getParent())) {
13813 In.moveAfter(MovePos: V);
13814 V->replaceAllUsesWith(V: &In);
13815 eraseInstruction(I: V);
13816 if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: &In))
13817 if (!NewMask.empty())
13818 SI->setShuffleMask(NewMask);
13819 V = &In;
13820 Replaced = true;
13821 break;
13822 }
13823 }
13824 if (!Replaced) {
13825 assert(!is_contained(Visited, &In));
13826 Visited.push_back(Elt: &In);
13827 }
13828 }
13829 }
13830 CSEBlocks.clear();
13831 GatherShuffleExtractSeq.clear();
13832}
13833
13834BoUpSLP::ScheduleData *
13835BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
13836 ScheduleData *Bundle = nullptr;
13837 ScheduleData *PrevInBundle = nullptr;
13838 for (Value *V : VL) {
13839 if (doesNotNeedToBeScheduled(V))
13840 continue;
13841 ScheduleData *BundleMember = getScheduleData(V);
13842 assert(BundleMember &&
13843 "no ScheduleData for bundle member "
13844 "(maybe not in same basic block)");
13845 assert(BundleMember->isSchedulingEntity() &&
13846 "bundle member already part of other bundle");
13847 if (PrevInBundle) {
13848 PrevInBundle->NextInBundle = BundleMember;
13849 } else {
13850 Bundle = BundleMember;
13851 }
13852
13853 // Group the instructions to a bundle.
13854 BundleMember->FirstInBundle = Bundle;
13855 PrevInBundle = BundleMember;
13856 }
13857 assert(Bundle && "Failed to find schedule bundle");
13858 return Bundle;
13859}
13860
// Groups the instructions into a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
13863std::optional<BoUpSLP::ScheduleData *>
13864BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
13865 const InstructionsState &S) {
13866 // No need to schedule PHIs, insertelement, extractelement and extractvalue
13867 // instructions.
13868 if (isa<PHINode>(Val: S.OpValue) || isVectorLikeInstWithConstOps(V: S.OpValue) ||
13869 doesNotNeedToSchedule(VL))
13870 return nullptr;
13871
13872 // Initialize the instruction bundle.
13873 Instruction *OldScheduleEnd = ScheduleEnd;
13874 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
13875
13876 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
13877 ScheduleData *Bundle) {
    // If the scheduling region got new instructions at the lower end (or it is
    // a new region for the first bundle), all dependencies need to be
    // recalculated. This seldom has to be done a second time after adding the
    // initial bundle to the region.
13883 if (ScheduleEnd != OldScheduleEnd) {
13884 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
13885 doForAllOpcodes(V: I, Action: [](ScheduleData *SD) { SD->clearDependencies(); });
13886 ReSchedule = true;
13887 }
13888 if (Bundle) {
13889 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
13890 << " in block " << BB->getName() << "\n");
13891 calculateDependencies(SD: Bundle, /*InsertInReadyList=*/true, SLP);
13892 }
13893
13894 if (ReSchedule) {
13895 resetSchedule();
13896 initialFillReadyList(ReadyList&: ReadyInsts);
13897 }
13898
    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready", there are no cyclic
    // dependencies and we can schedule it. Note that it's important that we
    // don't "schedule" the bundle yet (see cancelScheduling).
13903 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
13904 !ReadyInsts.empty()) {
13905 ScheduleData *Picked = ReadyInsts.pop_back_val();
13906 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
13907 "must be ready to schedule");
13908 schedule(SD: Picked, ReadyList&: ReadyInsts);
13909 }
13910 };
13911
13912 // Make sure that the scheduling region contains all
13913 // instructions of the bundle.
13914 for (Value *V : VL) {
13915 if (doesNotNeedToBeScheduled(V))
13916 continue;
13917 if (!extendSchedulingRegion(V, S)) {
      // The scheduling region got new instructions at the lower end (or it is
      // a new region for the first bundle), which makes it necessary to
      // recalculate all dependencies.
      // Otherwise the compiler may crash trying to incorrectly calculate
      // dependencies and emit instructions in the wrong order during the
      // actual scheduling.
13924 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
13925 return std::nullopt;
13926 }
13927 }
13928
13929 bool ReSchedule = false;
13930 for (Value *V : VL) {
13931 if (doesNotNeedToBeScheduled(V))
13932 continue;
13933 ScheduleData *BundleMember = getScheduleData(V);
13934 assert(BundleMember &&
13935 "no ScheduleData for bundle member (maybe not in same basic block)");
13936
    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
13939 ReadyInsts.remove(X: BundleMember);
13940
13941 if (!BundleMember->IsScheduled)
13942 continue;
13943 // A bundle member was scheduled as single instruction before and now
13944 // needs to be scheduled as part of the bundle. We just get rid of the
13945 // existing schedule.
13946 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
13947 << " was already scheduled\n");
13948 ReSchedule = true;
13949 }
13950
13951 auto *Bundle = buildBundle(VL);
13952 TryScheduleBundleImpl(ReSchedule, Bundle);
13953 if (!Bundle->isReady()) {
13954 cancelScheduling(VL, OpValue: S.OpValue);
13955 return std::nullopt;
13956 }
13957 return Bundle;
13958}
13959
13960void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
13961 Value *OpValue) {
13962 if (isa<PHINode>(Val: OpValue) || isVectorLikeInstWithConstOps(V: OpValue) ||
13963 doesNotNeedToSchedule(VL))
13964 return;
13965
13966 if (doesNotNeedToBeScheduled(V: OpValue))
13967 OpValue = *find_if_not(Range&: VL, P: doesNotNeedToBeScheduled);
13968 ScheduleData *Bundle = getScheduleData(V: OpValue);
13969 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
13970 assert(!Bundle->IsScheduled &&
13971 "Can't cancel bundle which is already scheduled");
13972 assert(Bundle->isSchedulingEntity() &&
13973 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
13974 "tried to unbundle something which is not a bundle");
13975
13976 // Remove the bundle from the ready list.
13977 if (Bundle->isReady())
13978 ReadyInsts.remove(X: Bundle);
13979
13980 // Un-bundle: make single instructions out of the bundle.
13981 ScheduleData *BundleMember = Bundle;
13982 while (BundleMember) {
13983 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
13984 BundleMember->FirstInBundle = BundleMember;
13985 ScheduleData *Next = BundleMember->NextInBundle;
13986 BundleMember->NextInBundle = nullptr;
13987 BundleMember->TE = nullptr;
13988 if (BundleMember->unscheduledDepsInBundle() == 0) {
13989 ReadyInsts.insert(X: BundleMember);
13990 }
13991 BundleMember = Next;
13992 }
13993}
13994
13995BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
13996 // Allocate a new ScheduleData for the instruction.
13997 if (ChunkPos >= ChunkSize) {
13998 ScheduleDataChunks.push_back(Elt: std::make_unique<ScheduleData[]>(num: ChunkSize));
13999 ChunkPos = 0;
14000 }
14001 return &(ScheduleDataChunks.back()[ChunkPos++]);
14002}
14003
14004bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
14005 const InstructionsState &S) {
14006 if (getScheduleData(V, Key: isOneOf(S, Op: V)))
14007 return true;
14008 Instruction *I = dyn_cast<Instruction>(Val: V);
14009 assert(I && "bundle member must be an instruction");
14010 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
14011 !doesNotNeedToBeScheduled(I) &&
14012 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14013 "be scheduled");
14014 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
14015 ScheduleData *ISD = getScheduleData(I);
14016 if (!ISD)
14017 return false;
14018 assert(isInSchedulingRegion(ISD) &&
14019 "ScheduleData not in scheduling region");
14020 ScheduleData *SD = allocateScheduleDataChunks();
14021 SD->Inst = I;
14022 SD->init(BlockSchedulingRegionID: SchedulingRegionID, OpVal: S.OpValue);
14023 ExtraScheduleDataMap[I][S.OpValue] = SD;
14024 return true;
14025 };
14026 if (CheckScheduleForI(I))
14027 return true;
14028 if (!ScheduleStart) {
14029 // It's the first instruction in the new region.
14030 initScheduleData(FromI: I, ToI: I->getNextNode(), PrevLoadStore: nullptr, NextLoadStore: nullptr);
14031 ScheduleStart = I;
14032 ScheduleEnd = I->getNextNode();
14033 if (isOneOf(S, Op: I) != I)
14034 CheckScheduleForI(I);
14035 assert(ScheduleEnd && "tried to vectorize a terminator?");
14036 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
14037 return true;
14038 }
14039 // Search up and down at the same time, because we don't know if the new
14040 // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that it's not
  // counted against the budget. Otherwise debug info could affect codegen.
14043 BasicBlock::reverse_iterator UpIter =
14044 ++ScheduleStart->getIterator().getReverse();
14045 BasicBlock::reverse_iterator UpperEnd = BB->rend();
14046 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
14047 BasicBlock::iterator LowerEnd = BB->end();
14048 auto IsAssumeLikeIntr = [](const Instruction &I) {
14049 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
14050 return II->isAssumeLikeIntrinsic();
14051 return false;
14052 };
14053 UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
14054 DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
14055 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
14056 &*DownIter != I) {
14057 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14058 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
14059 return false;
14060 }
14061
14062 ++UpIter;
14063 ++DownIter;
14064
14065 UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
14066 DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
14067 }
14068 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
14069 assert(I->getParent() == ScheduleStart->getParent() &&
14070 "Instruction is in wrong basic block.");
14071 initScheduleData(FromI: I, ToI: ScheduleStart, PrevLoadStore: nullptr, NextLoadStore: FirstLoadStoreInRegion);
14072 ScheduleStart = I;
14073 if (isOneOf(S, Op: I) != I)
14074 CheckScheduleForI(I);
14075 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
14076 << "\n");
14077 return true;
14078 }
14079 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
14080 "Expected to reach top of the basic block or instruction down the "
14081 "lower end.");
14082 assert(I->getParent() == ScheduleEnd->getParent() &&
14083 "Instruction is in wrong basic block.");
14084 initScheduleData(FromI: ScheduleEnd, ToI: I->getNextNode(), PrevLoadStore: LastLoadStoreInRegion,
14085 NextLoadStore: nullptr);
14086 ScheduleEnd = I->getNextNode();
14087 if (isOneOf(S, Op: I) != I)
14088 CheckScheduleForI(I);
14089 assert(ScheduleEnd && "tried to vectorize a terminator?");
14090 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
14091 return true;
14092}
14093
14094void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14095 Instruction *ToI,
14096 ScheduleData *PrevLoadStore,
14097 ScheduleData *NextLoadStore) {
14098 ScheduleData *CurrentLoadStore = PrevLoadStore;
14099 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14100 // No need to allocate data for non-schedulable instructions.
14101 if (doesNotNeedToBeScheduled(V: I))
14102 continue;
14103 ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
14104 if (!SD) {
14105 SD = allocateScheduleDataChunks();
14106 ScheduleDataMap[I] = SD;
14107 SD->Inst = I;
14108 }
14109 assert(!isInSchedulingRegion(SD) &&
14110 "new ScheduleData already in scheduling region");
14111 SD->init(BlockSchedulingRegionID: SchedulingRegionID, OpVal: I);
14112
14113 if (I->mayReadOrWriteMemory() &&
14114 (!isa<IntrinsicInst>(Val: I) ||
14115 (cast<IntrinsicInst>(Val: I)->getIntrinsicID() != Intrinsic::sideeffect &&
14116 cast<IntrinsicInst>(Val: I)->getIntrinsicID() !=
14117 Intrinsic::pseudoprobe))) {
14118 // Update the linked list of memory accessing instructions.
14119 if (CurrentLoadStore) {
14120 CurrentLoadStore->NextLoadStore = SD;
14121 } else {
14122 FirstLoadStoreInRegion = SD;
14123 }
14124 CurrentLoadStore = SD;
14125 }
14126
14127 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14128 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14129 RegionHasStackSave = true;
14130 }
14131 if (NextLoadStore) {
14132 if (CurrentLoadStore)
14133 CurrentLoadStore->NextLoadStore = NextLoadStore;
14134 } else {
14135 LastLoadStoreInRegion = CurrentLoadStore;
14136 }
14137}
14138
14139void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14140 bool InsertInReadyList,
14141 BoUpSLP *SLP) {
14142 assert(SD->isSchedulingEntity());
14143
14144 SmallVector<ScheduleData *, 10> WorkList;
14145 WorkList.push_back(Elt: SD);
14146
14147 while (!WorkList.empty()) {
14148 ScheduleData *SD = WorkList.pop_back_val();
14149 for (ScheduleData *BundleMember = SD; BundleMember;
14150 BundleMember = BundleMember->NextInBundle) {
14151 assert(isInSchedulingRegion(BundleMember));
14152 if (BundleMember->hasValidDependencies())
14153 continue;
14154
14155 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
14156 << "\n");
14157 BundleMember->Dependencies = 0;
14158 BundleMember->resetUnscheduledDeps();
14159
14160 // Handle def-use chain dependencies.
14161 if (BundleMember->OpValue != BundleMember->Inst) {
14162 if (ScheduleData *UseSD = getScheduleData(I: BundleMember->Inst)) {
14163 BundleMember->Dependencies++;
14164 ScheduleData *DestBundle = UseSD->FirstInBundle;
14165 if (!DestBundle->IsScheduled)
14166 BundleMember->incrementUnscheduledDeps(Incr: 1);
14167 if (!DestBundle->hasValidDependencies())
14168 WorkList.push_back(Elt: DestBundle);
14169 }
14170 } else {
14171 for (User *U : BundleMember->Inst->users()) {
14172 if (ScheduleData *UseSD = getScheduleData(I: cast<Instruction>(Val: U))) {
14173 BundleMember->Dependencies++;
14174 ScheduleData *DestBundle = UseSD->FirstInBundle;
14175 if (!DestBundle->IsScheduled)
14176 BundleMember->incrementUnscheduledDeps(Incr: 1);
14177 if (!DestBundle->hasValidDependencies())
14178 WorkList.push_back(Elt: DestBundle);
14179 }
14180 }
14181 }
14182
14183 auto MakeControlDependent = [&](Instruction *I) {
14184 auto *DepDest = getScheduleData(I);
14185 assert(DepDest && "must be in schedule window");
14186 DepDest->ControlDependencies.push_back(Elt: BundleMember);
14187 BundleMember->Dependencies++;
14188 ScheduleData *DestBundle = DepDest->FirstInBundle;
14189 if (!DestBundle->IsScheduled)
14190 BundleMember->incrementUnscheduledDeps(Incr: 1);
14191 if (!DestBundle->hasValidDependencies())
14192 WorkList.push_back(Elt: DestBundle);
14193 };
14194
      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
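      // For illustration only (hypothetical IR): given
      //   call void @may_not_return()
      //   store i32 %v, ptr %p
      // the store is not safe to speculate and must not be scheduled above
      // the call, because the call might never transfer execution to it.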
14198 if (!isGuaranteedToTransferExecutionToSuccessor(I: BundleMember->Inst)) {
14199 for (Instruction *I = BundleMember->Inst->getNextNode();
14200 I != ScheduleEnd; I = I->getNextNode()) {
14201 if (isSafeToSpeculativelyExecute(I, CtxI: &*BB->begin(), AC: SLP->AC))
14202 continue;
14203
14204 // Add the dependency
14205 MakeControlDependent(I);
14206
14207 if (!isGuaranteedToTransferExecutionToSuccessor(I))
14208 // Everything past here must be control dependent on I.
14209 break;
14210 }
14211 }
14212
14213 if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave. We also need to prevent any alloca
        // from being reordered above a preceding stackrestore.
14217 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14218 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14219 for (Instruction *I = BundleMember->Inst->getNextNode();
14220 I != ScheduleEnd; I = I->getNextNode()) {
14221 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14222 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14223 // Any allocas past here must be control dependent on I, and I
              // must be memory dependent on BundleMember->Inst.
14225 break;
14226
14227 if (!isa<AllocaInst>(Val: I))
14228 continue;
14229
14230 // Add the dependency
14231 MakeControlDependent(I);
14232 }
14233 }
14234
        // In addition to the cases handled just above, we need to prevent
        // allocas and loads/stores from moving below a stacksave or a
        // stackrestore. Avoiding moving allocas below a stackrestore is
        // currently thought to be merely conservative, but moving loads/stores
        // below a stackrestore can lead to incorrect code.
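        // For illustration only (hypothetical IR): given
        //   %ss = call ptr @llvm.stacksave()
        //   %a  = alloca i32
        //   store i32 0, ptr %a
        //   call void @llvm.stackrestore(ptr %ss)
        // the store must not sink below the stackrestore, since the restore
        // may deallocate the stack memory that %a occupies.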
14240 if (isa<AllocaInst>(Val: BundleMember->Inst) ||
14241 BundleMember->Inst->mayReadOrWriteMemory()) {
14242 for (Instruction *I = BundleMember->Inst->getNextNode();
14243 I != ScheduleEnd; I = I->getNextNode()) {
14244 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
14245 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14246 continue;
14247
14248 // Add the dependency
14249 MakeControlDependent(I);
14250 break;
14251 }
14252 }
14253 }
14254
14255 // Handle the memory dependencies (if any).
14256 ScheduleData *DepDest = BundleMember->NextLoadStore;
14257 if (!DepDest)
14258 continue;
14259 Instruction *SrcInst = BundleMember->Inst;
      assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non-memory-affecting bundle?");
14262 MemoryLocation SrcLoc = getLocation(I: SrcInst);
14263 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14264 unsigned NumAliased = 0;
14265 unsigned DistToSrc = 1;
14266
14267 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14268 assert(isInSchedulingRegion(DepDest));
14269
14270 // We have two limits to reduce the complexity:
14271 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14272 // SLP->isAliased (which is the expensive part in this loop).
14273 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14274 // the whole loop (even if the loop is fast, it's quadratic).
14275 // It's important for the loop break condition (see below) to
14276 // check this limit even between two read-only instructions.
14277 if (DistToSrc >= MaxMemDepDistance ||
14278 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14279 (NumAliased >= AliasedCheckLimit ||
14280 SLP->isAliased(Loc1: SrcLoc, Inst1: SrcInst, Inst2: DepDest->Inst)))) {
14281
14282 // We increment the counter only if the locations are aliased
14283 // (instead of counting all alias checks). This gives a better
14284 // balance between reduced runtime and accurate dependencies.
14285 NumAliased++;
14286
14287 DepDest->MemoryDependencies.push_back(Elt: BundleMember);
14288 BundleMember->Dependencies++;
14289 ScheduleData *DestBundle = DepDest->FirstInBundle;
14290 if (!DestBundle->IsScheduled) {
14291 BundleMember->incrementUnscheduledDeps(Incr: 1);
14292 }
14293 if (!DestBundle->hasValidDependencies()) {
14294 WorkList.push_back(Elt: DestBundle);
14295 }
14296 }
14297
14298 // Example, explaining the loop break condition: Let's assume our
14299 // starting instruction is i0 and MaxMemDepDistance = 3.
14300 //
14301 // +--------v--v--v
14302 // i0,i1,i2,i3,i4,i5,i6,i7,i8
14303 // +--------^--^--^
14304 //
14305 // MaxMemDepDistance let us stop alias-checking at i3 and we add
14306 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
14307 // Previously we already added dependencies from i3 to i6,i7,i8
14308 // (because of MaxMemDepDistance). As we added a dependency from
14309 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
14310 // and we can abort this loop at i6.
14311 if (DistToSrc >= 2 * MaxMemDepDistance)
14312 break;
14313 DistToSrc++;
14314 }
14315 }
14316 if (InsertInReadyList && SD->isReady()) {
14317 ReadyInsts.insert(X: SD);
14318 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
14319 << "\n");
14320 }
14321 }
14322}
14323
14324void BoUpSLP::BlockScheduling::resetSchedule() {
14325 assert(ScheduleStart &&
14326 "tried to reset schedule on block which has not been scheduled");
14327 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14328 doForAllOpcodes(V: I, Action: [&](ScheduleData *SD) {
14329 assert(isInSchedulingRegion(SD) &&
14330 "ScheduleData not in scheduling region");
14331 SD->IsScheduled = false;
14332 SD->resetUnscheduledDeps();
14333 });
14334 }
14335 ReadyInsts.clear();
14336}
14337
14338void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14339 if (!BS->ScheduleStart)
14340 return;
14341
14342 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
14343
14344 // A key point - if we got here, pre-scheduling was able to find a valid
14345 // scheduling of the sub-graph of the scheduling window which consists
14346 // of all vector bundles and their transitive users. As such, we do not
14347 // need to reschedule anything *outside of* that subgraph.
14348
14349 BS->resetSchedule();
14350
14351 // For the real scheduling we use a more sophisticated ready-list: it is
14352 // sorted by the original instruction location. This lets the final schedule
14353 // be as close as possible to the original instruction order.
14354 // WARNING: If changing this order causes a correctness issue, that means
14355 // there is some missing dependence edge in the schedule data graph.
14356 struct ScheduleDataCompare {
14357 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
14358 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14359 }
14360 };
14361 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14362
14363 // Ensure that all dependency data is updated (for nodes in the sub-graph)
14364 // and fill the ready-list with initial instructions.
14365 int Idx = 0;
14366 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
14367 I = I->getNextNode()) {
14368 BS->doForAllOpcodes(V: I, Action: [this, &Idx, BS](ScheduleData *SD) {
14369 TreeEntry *SDTE = getTreeEntry(V: SD->Inst);
14370 (void)SDTE;
14371 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
14372 SD->isPartOfBundle() ==
14373 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
14374 "scheduler and vectorizer bundle mismatch");
14375 SD->FirstInBundle->SchedulingPriority = Idx++;
14376
14377 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14378 BS->calculateDependencies(SD, InsertInReadyList: false, SLP: this);
14379 });
14380 }
14381 BS->initialFillReadyList(ReadyList&: ReadyInsts);
14382
14383 Instruction *LastScheduledInst = BS->ScheduleEnd;
14384
14385 // Do the "real" scheduling.
14386 while (!ReadyInsts.empty()) {
14387 ScheduleData *Picked = *ReadyInsts.begin();
14388 ReadyInsts.erase(position: ReadyInsts.begin());
14389
14390 // Move the scheduled instruction(s) to their dedicated places, if not
14391 // there yet.
14392 for (ScheduleData *BundleMember = Picked; BundleMember;
14393 BundleMember = BundleMember->NextInBundle) {
14394 Instruction *PickedInst = BundleMember->Inst;
14395 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
14396 PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
14397 LastScheduledInst = PickedInst;
14398 }
14399
14400 BS->schedule(SD: Picked, ReadyList&: ReadyInsts);
14401 }
14402
14403 // Check that we didn't break any of our invariants.
14404#ifdef EXPENSIVE_CHECKS
14405 BS->verify();
14406#endif
14407
14408#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
14409 // Check that all schedulable entities got scheduled
14410 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
14411 BS->doForAllOpcodes(V: I, Action: [&](ScheduleData *SD) {
14412 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
14413 assert(SD->IsScheduled && "must be scheduled at this point");
14414 }
14415 });
14416 }
14417#endif
14418
14419 // Avoid duplicate scheduling of the block.
14420 BS->ScheduleStart = nullptr;
14421}
14422
14423unsigned BoUpSLP::getVectorElementSize(Value *V) {
14424 // If V is a store, just return the width of the stored value (or value
14425 // truncated just before storing) without traversing the expression tree.
14426 // This is the common case.
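  // For illustration only (hypothetical IR): for
  //   %t = trunc i32 %x to i8
  //   store i8 %t, ptr %p
  // the returned element size is 8, the width of the stored (truncated)
  // value, not the 32 bits of %x.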
14427 if (auto *Store = dyn_cast<StoreInst>(Val: V))
14428 return DL->getTypeSizeInBits(Ty: Store->getValueOperand()->getType());
14429
14430 if (auto *IEI = dyn_cast<InsertElementInst>(Val: V))
14431 return getVectorElementSize(V: IEI->getOperand(i_nocapture: 1));
14432
14433 auto E = InstrElementSize.find(Val: V);
14434 if (E != InstrElementSize.end())
14435 return E->second;
14436
14437 // If V is not a store, we can traverse the expression tree to find loads
14438 // that feed it. The type of the loaded value may indicate a more suitable
14439 // width than V's type. We want to base the vector element size on the width
14440 // of memory operations where possible.
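  // For illustration only (hypothetical IR): for
  //   %a = load i16, ptr %p
  //   %b = sext i16 %a to i32
  //   %c = add i32 %b, %d
  // the walk from %c reaches the i16 load, so 16 is preferred over the
  // 32-bit width of %c itself.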
14441 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
14442 SmallPtrSet<Instruction *, 16> Visited;
14443 if (auto *I = dyn_cast<Instruction>(Val: V)) {
14444 Worklist.emplace_back(Args&: I, Args: I->getParent(), Args: 0);
14445 Visited.insert(Ptr: I);
14446 }
14447
14448 // Traverse the expression tree in bottom-up order looking for loads. If we
14449 // encounter an instruction we don't yet handle, we give up.
14450 auto Width = 0u;
14451 Value *FirstNonBool = nullptr;
14452 while (!Worklist.empty()) {
14453 auto [I, Parent, Level] = Worklist.pop_back_val();
14454
14455 // We should only be looking at scalar instructions here. If the current
14456 // instruction has a vector type, skip.
14457 auto *Ty = I->getType();
14458 if (isa<VectorType>(Val: Ty))
14459 continue;
14460 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
14461 FirstNonBool = I;
14462 if (Level > RecursionMaxDepth)
14463 continue;
14464
    // If the current instruction is a load, update Width to reflect the
    // width of the loaded value.
14467 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: I))
14468 Width = std::max<unsigned>(a: Width, b: DL->getTypeSizeInBits(Ty));
14469
    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and is either from the same basic
    // block as the user or the user is a PHI node, we add it to the worklist.
14474 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
14475 BinaryOperator, UnaryOperator>(Val: I)) {
14476 for (Use &U : I->operands()) {
14477 if (auto *J = dyn_cast<Instruction>(Val: U.get()))
14478 if (Visited.insert(Ptr: J).second &&
14479 (isa<PHINode>(Val: I) || J->getParent() == Parent)) {
14480 Worklist.emplace_back(Args&: J, Args: J->getParent(), Args: Level + 1);
14481 continue;
14482 }
14483 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
14484 FirstNonBool = U.get();
14485 }
14486 } else {
14487 break;
14488 }
14489 }
14490
14491 // If we didn't encounter a memory access in the expression tree, or if we
14492 // gave up for some reason, just return the width of V. Otherwise, return the
14493 // maximum width we found.
14494 if (!Width) {
14495 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
14496 V = FirstNonBool;
14497 Width = DL->getTypeSizeInBits(Ty: V->getType());
14498 }
14499
14500 for (Instruction *I : Visited)
14501 InstrElementSize[I] = Width;
14502
14503 return Width;
14504}
14505
14506bool BoUpSLP::collectValuesToDemote(
14507 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
14508 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
14509 unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
14510 bool IsTruncRoot) const {
14511 // We can always demote constants.
14512 if (all_of(Range: E.Scalars, P: IsaPred<Constant>))
14513 return true;
14514
14515 unsigned OrigBitWidth = DL->getTypeSizeInBits(Ty: E.Scalars.front()->getType());
14516 if (OrigBitWidth == BitWidth) {
14517 MaxDepthLevel = 1;
14518 return true;
14519 }
14520
14521 // If the value is not a vectorized instruction in the expression and not used
14522 // by the insertelement instruction and not used in multiple vector nodes, it
14523 // cannot be demoted.
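  // For illustration only (hypothetical IR): in
  //   %x = zext i8 %a to i32
  //   %y = zext i8 %b to i32
  //   %s = add i32 %x, %y
  //   %t = trunc i32 %s to i8
  // only the low 8 bits of %s are used, so the add (and its operands) can be
  // demoted to i8.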
14524 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
14525 if (MultiNodeScalars.contains(Val: V))
14526 return false;
14527 if (OrigBitWidth > BitWidth) {
14528 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
14529 if (MaskedValueIsZero(V, Mask, DL: SimplifyQuery(*DL)))
14530 return true;
14531 }
14532 auto NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
14533 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
14534 bool IsSigned = !isKnownNonNegative(V, SQ: SimplifyQuery(*DL));
14535 if (IsSigned)
14536 ++BitWidth1;
14537 if (auto *I = dyn_cast<Instruction>(Val: V)) {
14538 APInt Mask = DB->getDemandedBits(I);
14539 unsigned BitWidth2 =
14540 std::max<unsigned>(a: 1, b: Mask.getBitWidth() - Mask.countl_zero());
14541 while (!IsSigned && BitWidth2 < OrigBitWidth) {
14542 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth2 - 1);
14543 if (MaskedValueIsZero(V, Mask, DL: SimplifyQuery(*DL)))
14544 break;
14545 BitWidth2 *= 2;
14546 }
14547 BitWidth1 = std::min(a: BitWidth1, b: BitWidth2);
14548 }
14549 BitWidth = std::max(a: BitWidth, b: BitWidth1);
14550 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
14551 };
14552 using namespace std::placeholders;
14553 auto FinalAnalysis = [&]() {
14554 if (!IsProfitableToDemote)
14555 return false;
14556 bool Res = all_of(
14557 Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1, args: std::ref(t&: BitWidth)));
14558 // Gather demoted constant operands.
14559 if (Res && E.State == TreeEntry::NeedToGather &&
14560 all_of(Range: E.Scalars, P: IsaPred<Constant>))
14561 ToDemote.push_back(Elt: E.Idx);
14562 return Res;
14563 };
14564 // TODO: improve handling of gathered values and others.
14565 if (E.State == TreeEntry::NeedToGather || !Visited.insert(V: &E).second ||
14566 any_of(Range: E.Scalars, P: [&](Value *V) {
14567 return all_of(Range: V->users(), P: [&](User *U) {
14568 return isa<InsertElementInst>(Val: U) && !getTreeEntry(V: U);
14569 });
14570 }))
14571 return FinalAnalysis();
14572
14573 if (any_of(Range: E.Scalars, P: [&](Value *V) {
14574 return !all_of(Range: V->users(), P: [=](User *U) {
14575 return getTreeEntry(V: U) ||
14576 (UserIgnoreList && UserIgnoreList->contains(V: U)) ||
14577 (!isa<CmpInst>(Val: U) && U->getType()->isSized() &&
14578 !U->getType()->isScalableTy() &&
14579 DL->getTypeSizeInBits(Ty: U->getType()) <= BitWidth);
14580 }) && !IsPotentiallyTruncated(V, BitWidth);
14581 }))
14582 return false;
14583
14584 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
14585 bool &NeedToExit) {
14586 NeedToExit = false;
14587 unsigned InitLevel = MaxDepthLevel;
14588 for (const TreeEntry *Op : Operands) {
14589 unsigned Level = InitLevel;
14590 if (!collectValuesToDemote(E: *Op, IsProfitableToDemoteRoot, BitWidth,
14591 ToDemote, Visited, MaxDepthLevel&: Level, IsProfitableToDemote,
14592 IsTruncRoot)) {
14593 if (!IsProfitableToDemote)
14594 return false;
14595 NeedToExit = true;
14596 if (!FinalAnalysis())
14597 return false;
14598 continue;
14599 }
14600 MaxDepthLevel = std::max(a: MaxDepthLevel, b: Level);
14601 }
14602 return true;
14603 };
14604 auto AttemptCheckBitwidth =
14605 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
14607 NeedToExit = false;
14608 unsigned BestFailBitwidth = 0;
14609 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
14610 if (Checker(BitWidth, OrigBitWidth))
14611 return true;
14612 if (BestFailBitwidth == 0 && FinalAnalysis())
14613 BestFailBitwidth = BitWidth;
14614 }
14615 if (BitWidth >= OrigBitWidth) {
14616 if (BestFailBitwidth == 0) {
14617 BitWidth = OrigBitWidth;
14618 return false;
14619 }
14620 MaxDepthLevel = 1;
14621 BitWidth = BestFailBitwidth;
14622 NeedToExit = true;
14623 return true;
14624 }
14625 return false;
14626 };
14627 auto TryProcessInstruction =
14628 [&](unsigned &BitWidth,
14629 ArrayRef<const TreeEntry *> Operands = std::nullopt,
14630 function_ref<bool(unsigned, unsigned)> Checker = {}) {
14631 if (Operands.empty()) {
14632 if (!IsTruncRoot)
14633 MaxDepthLevel = 1;
14634 (void)for_each(Range: E.Scalars, F: std::bind(f&: IsPotentiallyTruncated, args: _1,
14635 args: std::ref(t&: BitWidth)));
14636 } else {
          // If the entry has several vectorized uses, check if we can
          // truncate it; otherwise exit.
14639 if (E.UserTreeIndices.size() > 1 &&
14640 !all_of(Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1,
14641 args: std::ref(t&: BitWidth))))
14642 return false;
14643 bool NeedToExit = false;
14644 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
14645 return false;
14646 if (NeedToExit)
14647 return true;
14648 if (!ProcessOperands(Operands, NeedToExit))
14649 return false;
14650 if (NeedToExit)
14651 return true;
14652 }
14653
14654 ++MaxDepthLevel;
14655 // Record the entry that we can demote.
14656 ToDemote.push_back(Elt: E.Idx);
14657 return IsProfitableToDemote;
14658 };
14659 switch (E.getOpcode()) {
14660
14661 // We can always demote truncations and extensions. Since truncations can
14662 // seed additional demotion, we save the truncated value.
14663 case Instruction::Trunc:
14664 if (IsProfitableToDemoteRoot)
14665 IsProfitableToDemote = true;
14666 return TryProcessInstruction(BitWidth);
14667 case Instruction::ZExt:
14668 case Instruction::SExt:
14669 IsProfitableToDemote = true;
14670 return TryProcessInstruction(BitWidth);
14671
14672 // We can demote certain binary operations if we can demote both of their
14673 // operands.
14674 case Instruction::Add:
14675 case Instruction::Sub:
14676 case Instruction::Mul:
14677 case Instruction::And:
14678 case Instruction::Or:
14679 case Instruction::Xor: {
14680 return TryProcessInstruction(
14681 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)});
14682 }
14683 case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // in-range amount, we can always perform the SHL in a smaller type.
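    // For illustration only (hypothetical values): truncating the result of
    //   %r = shl i32 %x, 3
    // to i16 is fine because the shift amount 3 is known to be less than 16,
    // whereas a shift by 20 could not be performed in i16.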
14686 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
14687 return all_of(Range: E.Scalars, P: [&](Value *V) {
14688 auto *I = cast<Instruction>(Val: V);
14689 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
14690 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth);
14691 });
14692 };
14693 return TryProcessInstruction(
14694 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, ShlChecker);
14695 }
14696 case Instruction::LShr: {
14697 // If this is a truncate of a logical shr, we can truncate it to a smaller
14698 // lshr iff we know that the bits we would otherwise be shifting in are
14699 // already zeros.
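    // For illustration only (hypothetical values): truncating
    //   %r = lshr i32 %x, 4
    // to i16 requires (per the check below) that bits 16..31 of %x are known
    // to be zero, so the bits shifted into the low 16 bits are already zeros.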
14700 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14701 return all_of(Range: E.Scalars, P: [&](Value *V) {
14702 auto *I = cast<Instruction>(Val: V);
14703 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
14704 APInt ShiftedBits = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
14705 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
14706 MaskedValueIsZero(V: I->getOperand(i: 0), Mask: ShiftedBits,
14707 DL: SimplifyQuery(*DL));
14708 });
14709 };
14710 return TryProcessInstruction(
14711 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
14712 LShrChecker);
14713 }
14714 case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits between the sign bit of the
    // original type and the sign bit of the truncated type are the same.
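    // For illustration only (hypothetical values): truncating
    //   %r = ashr i32 %x, 2
    // to i16 is possible if %x has more than 16 sign bits, i.e. bits 15..31
    // are all copies of the sign bit, so the i16 ashr produces the same low
    // 16 bits.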
14718 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14719 return all_of(Range: E.Scalars, P: [&](Value *V) {
14720 auto *I = cast<Instruction>(Val: V);
14721 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
14722 unsigned ShiftedBits = OrigBitWidth - BitWidth;
14723 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
14724 ShiftedBits < ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, Depth: 0, AC,
14725 CxtI: nullptr, DT);
14726 });
14727 };
14728 return TryProcessInstruction(
14729 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
14730 AShrChecker);
14731 }
14732 case Instruction::UDiv:
14733 case Instruction::URem: {
14734 // UDiv and URem can be truncated if all the truncated bits are zero.
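    // For illustration only (hypothetical values): a udiv of two i32 operands
    // whose upper 16 bits are all known to be zero yields the same result
    // when performed in i16, so it can be demoted together with its operands.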
14735 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14736 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
14737 return all_of(Range: E.Scalars, P: [&](Value *V) {
14738 auto *I = cast<Instruction>(Val: V);
14739 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
14740 return MaskedValueIsZero(V: I->getOperand(i: 0), Mask, DL: SimplifyQuery(*DL)) &&
14741 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, DL: SimplifyQuery(*DL));
14742 });
14743 };
14744 return TryProcessInstruction(
14745 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, Checker);
14746 }
14747
14748 // We can demote selects if we can demote their true and false values.
14749 case Instruction::Select: {
14750 return TryProcessInstruction(
14751 BitWidth, {getOperandEntry(E: &E, Idx: 1), getOperandEntry(E: &E, Idx: 2)});
14752 }
14753
14754 // We can demote phis if we can demote all their incoming operands. Note that
14755 // we don't need to worry about cycles since we ensure single use above.
14756 case Instruction::PHI: {
14757 const unsigned NumOps = E.getNumOperands();
14758 SmallVector<const TreeEntry *> Ops(NumOps);
14759 transform(Range: seq<unsigned>(Begin: 0, End: NumOps), d_first: Ops.begin(),
14760 F: std::bind(f: &BoUpSLP::getOperandEntry, args: this, args: &E, args: _1));
14761
14762 return TryProcessInstruction(BitWidth, Ops);
14763 }
14764
14765 case Instruction::Call: {
14766 auto *IC = dyn_cast<IntrinsicInst>(Val: E.getMainOp());
14767 if (!IC)
14768 break;
14769 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: IC, TLI);
14770 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
14771 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
14772 break;
14773 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(E: &E, Idx: 0));
14774 function_ref<bool(unsigned, unsigned)> CallChecker;
14775 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14776 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
14777 return all_of(Range: E.Scalars, P: [&](Value *V) {
14778 auto *I = cast<Instruction>(Val: V);
14779 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
14780 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
14781 return MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
14782 DL: SimplifyQuery(*DL)) &&
14783 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, DL: SimplifyQuery(*DL));
14784 }
14785 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
14786 "Expected min/max intrinsics only.");
14787 unsigned SignBits = OrigBitWidth - BitWidth;
14788 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
14789 return SignBits <= ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, Depth: 0, AC,
14790 CxtI: nullptr, DT) &&
14791 (!isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL)) ||
14792 MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
14793 DL: SimplifyQuery(*DL))) &&
14794 SignBits <= ComputeNumSignBits(Op: I->getOperand(i: 1), DL: *DL, Depth: 0, AC,
14795 CxtI: nullptr, DT) &&
14796 (!isKnownNonNegative(V: I->getOperand(i: 1), SQ: SimplifyQuery(*DL)) ||
14797 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, DL: SimplifyQuery(*DL)));
14798 });
14799 };
14800 if (ID != Intrinsic::abs) {
14801 Operands.push_back(Elt: getOperandEntry(E: &E, Idx: 1));
14802 CallChecker = CompChecker;
14803 }
14804 InstructionCost BestCost =
14805 std::numeric_limits<InstructionCost::CostType>::max();
14806 unsigned BestBitWidth = BitWidth;
14807 unsigned VF = E.Scalars.size();
14808 // Choose the best bitwidth based on cost estimations.
14809 auto Checker = [&](unsigned BitWidth, unsigned) {
14810 unsigned MinBW = PowerOf2Ceil(A: BitWidth);
14811 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(CI: IC, ID, VF, MinBW);
14812 auto VecCallCosts = getVectorCallCosts(
14813 CI: IC,
14814 VecTy: FixedVectorType::get(ElementType: IntegerType::get(C&: IC->getContext(), NumBits: MinBW), NumElts: VF),
14815 TTI, TLI, ArgTys);
14816 InstructionCost Cost = std::min(a: VecCallCosts.first, b: VecCallCosts.second);
14817 if (Cost < BestCost) {
14818 BestCost = Cost;
14819 BestBitWidth = BitWidth;
14820 }
14821 return false;
14822 };
14823 [[maybe_unused]] bool NeedToExit;
14824 (void)AttemptCheckBitwidth(Checker, NeedToExit);
14825 BitWidth = BestBitWidth;
14826 return TryProcessInstruction(BitWidth, Operands, CallChecker);
14827 }
14828
14829 // Otherwise, conservatively give up.
14830 default:
14831 break;
14832 }
14833 MaxDepthLevel = 1;
14834 return FinalAnalysis();
14835}
14836
14837static RecurKind getRdxKind(Value *V);
14838
14839void BoUpSLP::computeMinimumValueSizes() {
14840 // We only attempt to truncate integer expressions.
14841 bool IsStoreOrInsertElt =
14842 VectorizableTree.front()->getOpcode() == Instruction::Store ||
14843 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
14844 if ((IsStoreOrInsertElt || UserIgnoreList) &&
14845 ExtraBitWidthNodes.size() <= 1 &&
14846 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
14847 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
14848 return;
14849
14850 unsigned NodeIdx = 0;
14851 if (IsStoreOrInsertElt &&
14852 VectorizableTree.front()->State != TreeEntry::NeedToGather)
14853 NodeIdx = 1;
14854
14855 // Ensure the roots of the vectorizable tree don't form a cycle.
14856 if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
14857 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
14858 (NodeIdx != 0 && any_of(Range&: VectorizableTree[NodeIdx]->UserTreeIndices,
14859 P: [NodeIdx](const EdgeInfo &EI) {
14860 return EI.UserTE->Idx >
14861 static_cast<int>(NodeIdx);
14862 })))
14863 return;
14864
  // If the first value node for a store/insertelement is a sext/zext/trunc,
  // skip it and resize to the final type.
14867 bool IsTruncRoot = false;
14868 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
14869 SmallVector<unsigned> RootDemotes;
14870 if (NodeIdx != 0 &&
14871 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
14872 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
14873 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
14874 IsTruncRoot = true;
14875 RootDemotes.push_back(Elt: NodeIdx);
14876 IsProfitableToDemoteRoot = true;
14877 ++NodeIdx;
14878 }
14879
  // If the reduction was already analyzed and is not profitable, exit.
14881 if (AnalyzedMinBWVals.contains(V: VectorizableTree[NodeIdx]->Scalars.front()))
14882 return;
14883
14884 SmallVector<unsigned> ToDemote;
14885 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
14886 bool IsProfitableToDemoteRoot, unsigned Opcode,
14887 unsigned Limit, bool IsTruncRoot,
14888 bool IsSignedCmp) {
14889 ToDemote.clear();
14890 unsigned VF = E.getVectorFactor();
14891 auto *TreeRootIT = dyn_cast<IntegerType>(Val: E.Scalars.front()->getType());
14892 if (!TreeRootIT || !Opcode)
14893 return 0u;
14894
14895 if (any_of(Range: E.Scalars,
14896 P: [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
14897 return 0u;
14898
14899 unsigned NumParts =
14900 TTI->getNumberOfParts(Tp: FixedVectorType::get(ElementType: TreeRootIT, NumElts: VF));
14901
14902 // The maximum bit width required to represent all the values that can be
14903 // demoted without loss of precision. It would be safe to truncate the roots
14904 // of the expression to this width.
14905 unsigned MaxBitWidth = 1u;
14906
    // True if the roots can be zero-extended back to their original type
    // rather than sign-extended. If the leading bits are not demanded, we can
    // safely zero-extend, so IsKnownPositive is true only if the sign bit of
    // every root is known to be zero (and this is not a signed compare).
14913 bool IsKnownPositive = !IsSignedCmp && all_of(Range: E.Scalars, P: [&](Value *R) {
14914 KnownBits Known = computeKnownBits(V: R, DL: *DL);
14915 return Known.isNonNegative();
14916 });
14917
14918 // We first check if all the bits of the roots are demanded. If they're not,
14919 // we can truncate the roots to this narrower type.
14920 for (Value *Root : E.Scalars) {
14921 unsigned NumSignBits = ComputeNumSignBits(Op: Root, DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
14922 TypeSize NumTypeBits = DL->getTypeSizeInBits(Ty: Root->getType());
14923 unsigned BitWidth1 = NumTypeBits - NumSignBits;
14924 // If we can't prove that the sign bit is zero, we must add one to the
14925 // maximum bit width to account for the unknown sign bit. This preserves
14926 // the existing sign bit so we can safely sign-extend the root back to the
14927 // original type. Otherwise, if we know the sign bit is zero, we will
14928 // zero-extend the root instead.
14929 //
14930 // FIXME: This is somewhat suboptimal, as there will be cases where adding
14931 // one to the maximum bit width will yield a larger-than-necessary
14932 // type. In general, we need to add an extra bit only if we can't
14933 // prove that the upper bit of the original type is equal to the
14934 // upper bit of the proposed smaller type. If these two bits are
14935 // the same (either zero or one) we know that sign-extending from
14936 // the smaller type will result in the same value. Here, since we
14937 // can't yet prove this, we are just making the proposed smaller
14938 // type larger to ensure correctness.
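      // For illustration only (hypothetical numbers): a root of type i32 with
      // 25 known sign bits gives BitWidth1 = 32 - 25 = 7; if the sign bit is
      // not known to be zero, one extra bit is added, giving 8.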
14939 if (!IsKnownPositive)
14940 ++BitWidth1;
14941
14942 APInt Mask = DB->getDemandedBits(I: cast<Instruction>(Val: Root));
14943 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
14944 MaxBitWidth =
14945 std::max<unsigned>(a: std::min(a: BitWidth1, b: BitWidth2), b: MaxBitWidth);
14946 }
14947
14948 if (MaxBitWidth < 8 && MaxBitWidth > 1)
14949 MaxBitWidth = 8;
14950
    // If the original type is large but the reduced type does not decrease
    // the number of vector registers used, ignore it.
14953 if (NumParts > 1 &&
14954 NumParts ==
14955 TTI->getNumberOfParts(Tp: FixedVectorType::get(
14956 ElementType: IntegerType::get(C&: F->getContext(), NumBits: bit_ceil(Value: MaxBitWidth)), NumElts: VF)))
14957 return 0u;
14958
14959 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
14960 Opcode == Instruction::SExt ||
14961 Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the entries that can be demoted in ToDemote.
14965 DenseSet<const TreeEntry *> Visited;
14966 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
14967 bool NeedToDemote = IsProfitableToDemote;
14968
14969 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, BitWidth&: MaxBitWidth,
14970 ToDemote, Visited, MaxDepthLevel, IsProfitableToDemote&: NeedToDemote,
14971 IsTruncRoot) ||
14972 (MaxDepthLevel <= Limit &&
14973 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
14974 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
14975 DL->getTypeSizeInBits(Ty: TreeRootIT) /
14976 DL->getTypeSizeInBits(Ty: cast<Instruction>(Val: E.Scalars.front())
14977 ->getOperand(i: 0)
14978 ->getType()) >
14979 2)))))
14980 return 0u;
14981 // Round MaxBitWidth up to the next power-of-two.
14982 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
14983
14984 return MaxBitWidth;
14985 };
14986
14987 // If we can truncate the root, we must collect additional values that might
14988 // be demoted as a result. That is, those seeded by truncations we will
14989 // modify.
14990 // Add reduction ops sizes, if any.
14991 if (UserIgnoreList &&
14992 isa<IntegerType>(Val: VectorizableTree.front()->Scalars.front()->getType())) {
14993 for (Value *V : *UserIgnoreList) {
14994 auto NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
14995 auto NumTypeBits = DL->getTypeSizeInBits(Ty: V->getType());
14996 unsigned BitWidth1 = NumTypeBits - NumSignBits;
14997 if (!isKnownNonNegative(V, SQ: SimplifyQuery(*DL)))
14998 ++BitWidth1;
14999 unsigned BitWidth2 = BitWidth1;
15000 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: ::getRdxKind(V))) {
15001 auto Mask = DB->getDemandedBits(I: cast<Instruction>(Val: V));
15002 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15003 }
15004 ReductionBitWidth =
15005 std::max(a: std::min(a: BitWidth1, b: BitWidth2), b: ReductionBitWidth);
15006 }
15007 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15008 ReductionBitWidth = 8;
15009
15010 ReductionBitWidth = bit_ceil(Value: ReductionBitWidth);
15011 }
15012 bool IsTopRoot = NodeIdx == 0;
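// Walk past any leading vectorizable Trunc nodes: record them as demotion
// root candidates in RootDemotes and remember that the remaining chain is
// rooted at a truncation.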
15013 while (NodeIdx < VectorizableTree.size() &&
15014 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15015 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15016 RootDemotes.push_back(Elt: NodeIdx);
15017 ++NodeIdx;
15018 IsTruncRoot = true;
15019 }
15020 bool IsSignedCmp = false;
15021 while (NodeIdx < VectorizableTree.size()) {
15022 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
15023 unsigned Limit = 2;
15024 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15025 if (IsTopRoot &&
15026 ReductionBitWidth ==
15027 DL->getTypeSizeInBits(
15028 Ty: VectorizableTree.front()->Scalars.front()->getType()))
15029 Limit = 3;
15030 unsigned MaxBitWidth = ComputeMaxBitWidth(
15031 *VectorizableTree[NodeIdx].get(), IsTopRoot, IsProfitableToDemoteRoot,
15032 Opcode, Limit, IsTruncRoot, IsSignedCmp);
15033 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
15034 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15035 ReductionBitWidth = bit_ceil(Value: MaxBitWidth);
15036 else if (MaxBitWidth == 0)
15037 ReductionBitWidth = 0;
15038 }
15039
15040 for (unsigned Idx : RootDemotes) {
15041 if (all_of(Range&: VectorizableTree[Idx]->Scalars, P: [&](Value *V) {
15042 uint32_t OrigBitWidth = DL->getTypeSizeInBits(Ty: V->getType());
15043 if (OrigBitWidth > MaxBitWidth) {
15044 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: MaxBitWidth);
15045 return MaskedValueIsZero(V, Mask, DL: SimplifyQuery(*DL));
15046 }
15047 return false;
15048 }))
15049 ToDemote.push_back(Elt: Idx);
15050 }
15051 RootDemotes.clear();
15052 IsTopRoot = false;
15053 IsProfitableToDemoteRoot = true;
15054
15055 if (ExtraBitWidthNodes.empty()) {
15056 NodeIdx = VectorizableTree.size();
15057 } else {
15058 unsigned NewIdx = 0;
15059 do {
15060 NewIdx = *ExtraBitWidthNodes.begin();
15061 ExtraBitWidthNodes.erase(I: ExtraBitWidthNodes.begin());
15062 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
15063 NodeIdx = NewIdx;
15064 IsTruncRoot =
15065 NodeIdx < VectorizableTree.size() &&
15066 any_of(Range&: VectorizableTree[NodeIdx]->UserTreeIndices,
15067 P: [](const EdgeInfo &EI) {
15068 return EI.EdgeIdx == 0 &&
15069 EI.UserTE->getOpcode() == Instruction::Trunc &&
15070 !EI.UserTE->isAltShuffle();
15071 });
15072 IsSignedCmp =
15073 NodeIdx < VectorizableTree.size() &&
15074 any_of(Range&: VectorizableTree[NodeIdx]->UserTreeIndices,
15075 P: [](const EdgeInfo &EI) {
15076 return EI.UserTE->getOpcode() == Instruction::ICmp &&
15077 any_of(Range&: EI.UserTE->Scalars, P: [](Value *V) {
15078 auto *IC = dyn_cast<ICmpInst>(Val: V);
15079 return IC && IC->isSigned();
15080 });
15081 });
15082 }
15083
15084 // If the maximum bit width we compute is less than the width of the roots'
15085 // type, we can proceed with the narrowing. Otherwise, do nothing.
15086 if (MaxBitWidth == 0 ||
15087 MaxBitWidth >=
15088 cast<IntegerType>(Val: TreeRoot.front()->getType())->getBitWidth()) {
15089 if (UserIgnoreList)
15090 AnalyzedMinBWVals.insert(I: TreeRoot.begin(), E: TreeRoot.end());
15091 continue;
15092 }
15093
15094 // Finally, map the values we can demote to the maximum bit width we
15095 // computed.
15096 for (unsigned Idx : ToDemote) {
15097 TreeEntry *TE = VectorizableTree[Idx].get();
15098 if (MinBWs.contains(Val: TE))
15099 continue;
15100 bool IsSigned = TE->getOpcode() == Instruction::SExt ||
15101 any_of(Range&: TE->Scalars, P: [&](Value *R) {
15102 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
15103 });
15104 MinBWs.try_emplace(Key: TE, Args&: MaxBitWidth, Args&: IsSigned);
15105 }
15106 }
15107}
15108
15109PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
15110 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
15111 auto *TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
15112 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(IR&: F);
15113 auto *AA = &AM.getResult<AAManager>(IR&: F);
15114 auto *LI = &AM.getResult<LoopAnalysis>(IR&: F);
15115 auto *DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
15116 auto *AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
15117 auto *DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
15118 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
15119
15120 bool Changed = runImpl(F, SE_: SE, TTI_: TTI, TLI_: TLI, AA_: AA, LI_: LI, DT_: DT, AC_: AC, DB_: DB, ORE_: ORE);
15121 if (!Changed)
15122 return PreservedAnalyses::all();
15123
15124 PreservedAnalyses PA;
15125 PA.preserveSet<CFGAnalyses>();
15126 return PA;
15127}
15128
15129bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
15130 TargetTransformInfo *TTI_,
15131 TargetLibraryInfo *TLI_, AAResults *AA_,
15132 LoopInfo *LI_, DominatorTree *DT_,
15133 AssumptionCache *AC_, DemandedBits *DB_,
15134 OptimizationRemarkEmitter *ORE_) {
15135 if (!RunSLPVectorization)
15136 return false;
15137 SE = SE_;
15138 TTI = TTI_;
15139 TLI = TLI_;
15140 AA = AA_;
15141 LI = LI_;
15142 DT = DT_;
15143 AC = AC_;
15144 DB = DB_;
15145 DL = &F.getParent()->getDataLayout();
15146
15147 Stores.clear();
15148 GEPs.clear();
15149 bool Changed = false;
15150
15151 // If the target claims to have no vector registers, don't attempt
15152 // vectorization.
15153 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true))) {
15154 LLVM_DEBUG(
15155 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
15156 return false;
15157 }
15158
15159 // Don't vectorize when the attribute NoImplicitFloat is used.
15160 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
15161 return false;
15162
15163 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
15164
15165 // Use the bottom-up SLP vectorizer to construct chains that start with
15166 // store instructions.
15167 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
15168
15169 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
15170 // delete instructions.
15171
15172 // Update DFS numbers now so that we can use them for ordering.
15173 DT->updateDFSNumbers();
15174
15175 // Scan the blocks in the function in post order.
15176 for (auto *BB : post_order(G: &F.getEntryBlock())) {
15177 // Start new block - clear the list of reduction roots.
15178 R.clearReductionData();
15179 collectSeedInstructions(BB);
15180
15181 // Vectorize trees that end at stores.
15182 if (!Stores.empty()) {
15183 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
15184 << " underlying objects.\n");
15185 Changed |= vectorizeStoreChains(R);
15186 }
15187
15188 // Vectorize trees that end at reductions.
15189 Changed |= vectorizeChainsInBlock(BB, R);
15190
15191 // Vectorize the index computations of getelementptr instructions. This
15192 // is primarily intended to catch gather-like idioms ending at
15193 // non-consecutive loads.
15194 if (!GEPs.empty()) {
15195 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
15196 << " underlying objects.\n");
15197 Changed |= vectorizeGEPIndices(BB, R);
15198 }
15199 }
15200
15201 if (Changed) {
15202 R.optimizeGatherSequence();
15203 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
15204 }
15205 return Changed;
15206}
15207
15208bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15209 unsigned Idx, unsigned MinVF) {
15210 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
15211 << "\n");
15212 const unsigned Sz = R.getVectorElementSize(V: Chain[0]);
15213 unsigned VF = Chain.size();
15214
15215 if (!isPowerOf2_32(Value: Sz) || !isPowerOf2_32(Value: VF) || VF < 2 || VF < MinVF) {
15216 // Check if vectorizing with a non-power-of-2 VF should be considered. At
15217 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15218 // all vector lanes are used.
15219 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15220 return false;
15221 }
15222
15223 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15224 << "\n");
15225
15226 R.buildTree(Roots: Chain);
15227 if (R.isTreeTinyAndNotFullyVectorizable())
15228 return false;
15229 if (R.isLoadCombineCandidate())
15230 return false;
15231 R.reorderTopToBottom();
15232 R.reorderBottomToTop();
15233 R.buildExternalUses();
15234
15235 R.computeMinimumValueSizes();
15236 R.transformNodes();
15237
15238 InstructionCost Cost = R.getTreeCost();
15239
15240 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
15241 if (Cost < -SLPCostThreshold) {
15242 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
15243
15244 using namespace ore;
15245
15246 R.getORE()->emit(OptDiag&: OptimizationRemark(SV_NAME, "StoresVectorized",
15247 cast<StoreInst>(Val: Chain[0]))
15248 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
15249 << " and with tree size "
15250 << NV("TreeSize", R.getTreeSize()));
15251
15252 R.vectorizeTree();
15253 return true;
15254 }
15255
15256 return false;
15257}
15258
15259bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
15260 BoUpSLP &R) {
15261 // We may run into multiple chains that merge into a single chain. We mark the
15262 // stores that we vectorized so that we don't visit the same store twice.
15263 BoUpSLP::ValueSet VectorizedStores;
15264 bool Changed = false;
15265
15266 // Stores the pairs of stores (first_store, last_store) in a range that we
15267 // have already tried to vectorize. This allows skipping store ranges whose
15268 // vectorization attempts were unsuccessful.
15269 DenseSet<std::pair<Value *, Value *>> TriedSequences;
15270 struct StoreDistCompare {
15271 bool operator()(const std::pair<unsigned, int> &Op1,
15272 const std::pair<unsigned, int> &Op2) const {
15273 return Op1.second < Op2.second;
15274 }
15275 };
15276 // A set of pairs (index of the store in the Stores array ref, distance of the
15277 // store address relative to the base store address, in units).
15278 using StoreIndexToDistSet =
15279 std::set<std::pair<unsigned, int>, StoreDistCompare>;
15280 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
15281 int PrevDist = -1;
15282 BoUpSLP::ValueList Operands;
15283 // Collect the chain into a list.
15284 for (auto [Idx, Data] : enumerate(First: Set)) {
15285 if (Operands.empty() || Data.second - PrevDist == 1) {
15286 Operands.push_back(Elt: Stores[Data.first]);
15287 PrevDist = Data.second;
15288 if (Idx != Set.size() - 1)
15289 continue;
15290 }
15291 auto E = make_scope_exit(F: [&, &DataVar = Data]() {
15292 Operands.clear();
15293 Operands.push_back(Elt: Stores[DataVar.first]);
15294 PrevDist = DataVar.second;
15295 });
15296
15297 if (Operands.size() <= 1)
15298 continue;
15299
15300 unsigned MaxVecRegSize = R.getMaxVecRegSize();
15301 unsigned EltSize = R.getVectorElementSize(V: Operands[0]);
15302 unsigned MaxElts = llvm::bit_floor(Value: MaxVecRegSize / EltSize);
15303
15304 unsigned MaxVF =
15305 std::min(a: R.getMaximumVF(ElemWidth: EltSize, Opcode: Instruction::Store), b: MaxElts);
15306 auto *Store = cast<StoreInst>(Val: Operands[0]);
15307 Type *StoreTy = Store->getValueOperand()->getType();
15308 Type *ValueTy = StoreTy;
15309 if (auto *Trunc = dyn_cast<TruncInst>(Val: Store->getValueOperand()))
15310 ValueTy = Trunc->getSrcTy();
15311 unsigned MinVF = PowerOf2Ceil(A: TTI->getStoreMinimumVF(
15312 VF: R.getMinVF(Sz: DL->getTypeStoreSizeInBits(Ty: StoreTy)), ScalarMemTy: StoreTy, ScalarValTy: ValueTy));
15313
15314 if (MaxVF < MinVF) {
15315 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
15316 << ") < "
15317 << "MinVF (" << MinVF << ")\n");
15318 continue;
15319 }
15320
15321 unsigned NonPowerOf2VF = 0;
15322 if (VectorizeNonPowerOf2) {
15323 // First try vectorizing with a non-power-of-2 VF. At the moment, only
15324 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
15325 // lanes are used.
15326 unsigned CandVF = Operands.size();
15327 if (isPowerOf2_32(Value: CandVF + 1) && CandVF <= MaxVF)
15328 NonPowerOf2VF = CandVF;
15329 }
15330
15331 unsigned Sz = 1 + Log2_32(Value: MaxVF) - Log2_32(Value: MinVF);
15332 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
15333 unsigned Size = MinVF;
15334 for_each(Range: reverse(C&: CandidateVFs), F: [&](unsigned &VF) {
15335 VF = Size > MaxVF ? NonPowerOf2VF : Size;
15336 Size *= 2;
15337 });
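// For illustration only (hypothetical values): with MinVF = 2, MaxVF = 16 and
// NonPowerOf2VF = 7, CandidateVFs ends up as {7, 16, 8, 4, 2}, i.e. the
// optional non-power-of-2 factor first, followed by the powers of two from
// MaxVF down to MinVF, which is the order the loop below tries them in.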
15338 unsigned StartIdx = 0;
15339 for (unsigned Size : CandidateVFs) {
15340 for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
15341 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(N: Cnt, M: Size);
15342 assert(
15343 all_of(
15344 Slice,
15345 [&](Value *V) {
15346 return cast<StoreInst>(V)->getValueOperand()->getType() ==
15347 cast<StoreInst>(Slice.front())
15348 ->getValueOperand()
15349 ->getType();
15350 }) &&
15351 "Expected all operands of same type.");
15352 if (!VectorizedStores.count(Ptr: Slice.front()) &&
15353 !VectorizedStores.count(Ptr: Slice.back()) &&
15354 TriedSequences.insert(V: std::make_pair(x: Slice.front(), y: Slice.back()))
15355 .second &&
15356 vectorizeStoreChain(Chain: Slice, R, Idx: Cnt, MinVF)) {
15357 // Mark the vectorized stores so that we don't vectorize them again.
15358 VectorizedStores.insert(I: Slice.begin(), E: Slice.end());
15359 Changed = true;
15360 // If we vectorized the initial block, there is no need to try to
15361 // vectorize it again.
15362 if (Cnt == StartIdx)
15363 StartIdx += Size;
15364 Cnt += Size;
15365 continue;
15366 }
15367 ++Cnt;
15368 }
15369 // Check if the whole array was vectorized already - exit.
15370 if (StartIdx >= Operands.size())
15371 break;
15372 }
15373 }
15374 };
15375
15376 // Stores pairs (first: index of the store in the Stores array ref whose
15377 // address is taken as the base, second: sorted set of {index, dist} pairs,
15378 // which are indices of stores in the set and their store location distances
15379 // relative to the base address).
15380
15381 // Need to store the index of the very first store separately, since the set
15382 // may be reordered after the insertion and the first store may be moved. This
15383 // container reduces the number of calls to the getPointersDiff() function.
15384 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
15385 // Inserts the specified store SI with the given index Idx into the set of
15386 // stores. If a store with the same distance has already been found, stop the
15387 // insertion and try to vectorize the stores found so far. If some stores from
15388 // this sequence were not vectorized, try to vectorize them together with the
15389 // new store later. This logic is applied only to the stores that come before
15390 // the previous store with the same distance.
15391 // Example:
15392 // 1. store x, %p
15393 // 2. store y, %p+1
15394 // 3. store z, %p+2
15395 // 4. store a, %p
15396 // 5. store b, %p+3
15397 // - Scan this from the last to first store. The very first bunch of stores is
15398 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
15399 // vector).
15400 // - The next store in the list - #1 - has the same distance from store #5 as
15401 // the store #4.
15402 // - Try to vectorize sequence of stores 4,2,3,5.
15403 // - If all these stores are vectorized - just drop them.
15404 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
15405 // - Start new stores sequence.
15406 // The new bunch of stores is {1, {1, 0}}.
15407 // - Add the stores from the previous sequence that were not vectorized.
15408 // Here we consider the stores in reverse order, rather than the order in
15409 // which they appear in the IR (Stores are reversed already, see vectorizeStoreChains()).
15410 // Store #3 can be added -> comes after store #4 with the same distance as
15411 // store #1.
15412 // Store #5 cannot be added - comes before store #4.
15413 // This logic improves compile time: we assume that the stores coming after a
15414 // previous store with the same distance most likely have memory dependencies,
15415 // so there is no need to waste compile time trying to vectorize them.
15416 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
15417 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
15418 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
15419 std::optional<int> Diff = getPointersDiff(
15420 ElemTyA: Stores[Set.first]->getValueOperand()->getType(),
15421 PtrA: Stores[Set.first]->getPointerOperand(),
15422 ElemTyB: SI->getValueOperand()->getType(), PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE,
15423 /*StrictCheck=*/true);
15424 if (!Diff)
15425 continue;
15426 auto It = Set.second.find(x: std::make_pair(x&: Idx, y&: *Diff));
15427 if (It == Set.second.end()) {
15428 Set.second.emplace(args&: Idx, args&: *Diff);
15429 return;
15430 }
15431 // Try to vectorize the first found set to avoid duplicate analysis.
15432 TryToVectorize(Set.second);
15433 StoreIndexToDistSet PrevSet;
15434 PrevSet.swap(x&: Set.second);
15435 Set.first = Idx;
15436 Set.second.emplace(args&: Idx, args: 0);
15437 // Insert stores that followed previous match to try to vectorize them
15438 // with this store.
15439 unsigned StartIdx = It->first + 1;
15440 SmallBitVector UsedStores(Idx - StartIdx);
15441 // Distances to previously found dup store (or this store, since they
15442 // store to the same addresses).
15443 SmallVector<int> Dists(Idx - StartIdx, 0);
15444 for (const std::pair<unsigned, int> &Pair : reverse(C&: PrevSet)) {
15445 // Do not try to vectorize sequences we have already tried.
15446 if (Pair.first <= It->first ||
15447 VectorizedStores.contains(Ptr: Stores[Pair.first]))
15448 break;
15449 unsigned BI = Pair.first - StartIdx;
15450 UsedStores.set(BI);
15451 Dists[BI] = Pair.second - It->second;
15452 }
15453 for (unsigned I = StartIdx; I < Idx; ++I) {
15454 unsigned BI = I - StartIdx;
15455 if (UsedStores.test(Idx: BI))
15456 Set.second.emplace(args&: I, args&: Dists[BI]);
15457 }
15458 return;
15459 }
15460 auto &Res = SortedStores.emplace_back();
15461 Res.first = Idx;
15462 Res.second.emplace(args&: Idx, args: 0);
15463 };
15464 StoreInst *PrevStore = Stores.front();
15465 for (auto [I, SI] : enumerate(First&: Stores)) {
15466 // Check that we do not try to vectorize stores of different types.
15467 if (PrevStore->getValueOperand()->getType() !=
15468 SI->getValueOperand()->getType()) {
15469 for (auto &Set : SortedStores)
15470 TryToVectorize(Set.second);
15471 SortedStores.clear();
15472 PrevStore = SI;
15473 }
15474 FillStoresSet(I, SI);
15475 }
15476
15477 // Final vectorization attempt.
15478 for (auto &Set : SortedStores)
15479 TryToVectorize(Set.second);
15480
15481 return Changed;
15482}
15483
15484void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
15485 // Initialize the collections. We will make a single pass over the block.
15486 Stores.clear();
15487 GEPs.clear();
15488
15489 // Visit the store and getelementptr instructions in BB and organize them in
15490 // Stores and GEPs according to the underlying objects of their pointer
15491 // operands.
15492 for (Instruction &I : *BB) {
15493 // Ignore store instructions that are volatile or have a pointer operand
15494 // that doesn't point to a scalar type.
15495 if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
15496 if (!SI->isSimple())
15497 continue;
15498 if (!isValidElementType(Ty: SI->getValueOperand()->getType()))
15499 continue;
15500 Stores[getUnderlyingObject(V: SI->getPointerOperand())].push_back(Elt: SI);
15501 }
15502
15503 // Ignore getelementptr instructions that have more than one index, a
15504 // constant index, or a pointer operand that doesn't point to a scalar
15505 // type.
15506 else if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) {
15507 if (GEP->getNumIndices() != 1)
15508 continue;
15509 Value *Idx = GEP->idx_begin()->get();
15510 if (isa<Constant>(Val: Idx))
15511 continue;
15512 if (!isValidElementType(Ty: Idx->getType()))
15513 continue;
15514 if (GEP->getType()->isVectorTy())
15515 continue;
15516 GEPs[GEP->getPointerOperand()].push_back(Elt: GEP);
15517 }
15518 }
15519}
15520
15521bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
15522 bool MaxVFOnly) {
15523 if (VL.size() < 2)
15524 return false;
15525
15526 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
15527 << VL.size() << ".\n");
15528
15529 // Check that all of the parts are instructions of the same type;
15530 // we permit an alternate opcode via InstructionsState.
15531 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
15532 if (!S.getOpcode())
15533 return false;
15534
15535 Instruction *I0 = cast<Instruction>(Val: S.OpValue);
15536 // Make sure invalid types (including vector types) are rejected before
15537 // determining the vectorization factor for scalar instructions.
15538 for (Value *V : VL) {
15539 Type *Ty = V->getType();
15540 if (!isa<InsertElementInst>(Val: V) && !isValidElementType(Ty)) {
15541 // NOTE: the following will give the user an internal LLVM type name, which
15542 // may not be useful.
15543 R.getORE()->emit(RemarkBuilder: [&]() {
15544 std::string TypeStr;
15545 llvm::raw_string_ostream rso(TypeStr);
15546 Ty->print(O&: rso);
15547 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
15548 << "Cannot SLP vectorize list: type "
15549 << rso.str() + " is unsupported by vectorizer";
15550 });
15551 return false;
15552 }
15553 }
15554
15555 unsigned Sz = R.getVectorElementSize(V: I0);
15556 unsigned MinVF = R.getMinVF(Sz);
15557 unsigned MaxVF = std::max<unsigned>(a: llvm::bit_floor(Value: VL.size()), b: MinVF);
15558 MaxVF = std::min(a: R.getMaximumVF(ElemWidth: Sz, Opcode: S.getOpcode()), b: MaxVF);
15559 if (MaxVF < 2) {
15560 R.getORE()->emit(RemarkBuilder: [&]() {
15561 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
15562 << "Cannot SLP vectorize list: vectorization factor "
15563 << "less than 2 is not supported";
15564 });
15565 return false;
15566 }
15567
15568 bool Changed = false;
15569 bool CandidateFound = false;
15570 InstructionCost MinCost = SLPCostThreshold.getValue();
15571 Type *ScalarTy = VL[0]->getType();
15572 if (auto *IE = dyn_cast<InsertElementInst>(Val: VL[0]))
15573 ScalarTy = IE->getOperand(i_nocapture: 1)->getType();
15574
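// For illustration only (hypothetical values): with 7 scalars, MinVF = 2 and
// MaxVF = 4, the loops below try VF = 4 and then VF = 2, sliding a window of
// ActualVF operations over VL and vectorizing each profitable slice.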
15575 unsigned NextInst = 0, MaxInst = VL.size();
15576 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
15577 // No actual vectorization should happen if the number of parts is the same
15578 // as the provided vectorization factor (i.e. the scalar type is used for the
15579 // vector code during codegen).
15580 auto *VecTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: VF);
15581 if (TTI->getNumberOfParts(Tp: VecTy) == VF)
15582 continue;
15583 for (unsigned I = NextInst; I < MaxInst; ++I) {
15584 unsigned ActualVF = std::min(a: MaxInst - I, b: VF);
15585
15586 if (!isPowerOf2_32(Value: ActualVF))
15587 continue;
15588
15589 if (MaxVFOnly && ActualVF < MaxVF)
15590 break;
15591 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
15592 break;
15593
15594 ArrayRef<Value *> Ops = VL.slice(N: I, M: ActualVF);
15595 // Check that a previous iteration of this loop did not delete the Value.
15596 if (llvm::any_of(Range&: Ops, P: [&R](Value *V) {
15597 auto *I = dyn_cast<Instruction>(Val: V);
15598 return I && R.isDeleted(I);
15599 }))
15600 continue;
15601
15602 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
15603 << "\n");
15604
15605 R.buildTree(Roots: Ops);
15606 if (R.isTreeTinyAndNotFullyVectorizable())
15607 continue;
15608 R.reorderTopToBottom();
15609 R.reorderBottomToTop(
15610 /*IgnoreReorder=*/!isa<InsertElementInst>(Val: Ops.front()) &&
15611 !R.doesRootHaveInTreeUses());
15612 R.buildExternalUses();
15613
15614 R.computeMinimumValueSizes();
15615 R.transformNodes();
15616 InstructionCost Cost = R.getTreeCost();
15617 CandidateFound = true;
15618 MinCost = std::min(a: MinCost, b: Cost);
15619
15620 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
15621 << " for VF=" << ActualVF << "\n");
15622 if (Cost < -SLPCostThreshold) {
15623 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
15624 R.getORE()->emit(OptDiag&: OptimizationRemark(SV_NAME, "VectorizedList",
15625 cast<Instruction>(Val: Ops[0]))
15626 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
15627 << " and with tree size "
15628 << ore::NV("TreeSize", R.getTreeSize()));
15629
15630 R.vectorizeTree();
15631 // Move to the next bundle.
15632 I += VF - 1;
15633 NextInst = I + 1;
15634 Changed = true;
15635 }
15636 }
15637 }
15638
15639 if (!Changed && CandidateFound) {
15640 R.getORE()->emit(RemarkBuilder: [&]() {
15641 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
15642 << "List vectorization was possible but not beneficial with cost "
15643 << ore::NV("Cost", MinCost) << " >= "
15644 << ore::NV("Treshold", -SLPCostThreshold);
15645 });
15646 } else if (!Changed) {
15647 R.getORE()->emit(RemarkBuilder: [&]() {
15648 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
15649 << "Cannot SLP vectorize list: vectorization was impossible"
15650 << " with available vectorization factors";
15651 });
15652 }
15653 return Changed;
15654}
15655
15656bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
15657 if (!I)
15658 return false;
15659
15660 if (!isa<BinaryOperator, CmpInst>(Val: I) || isa<VectorType>(Val: I->getType()))
15661 return false;
15662
15663 Value *P = I->getParent();
15664
15665 // Vectorize in current basic block only.
15666 auto *Op0 = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
15667 auto *Op1 = dyn_cast<Instruction>(Val: I->getOperand(i: 1));
15668 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
15669 return false;
15670
15671 // First collect all possible candidates
15672 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
15673 Candidates.emplace_back(Args&: Op0, Args&: Op1);
15674
15675 auto *A = dyn_cast<BinaryOperator>(Val: Op0);
15676 auto *B = dyn_cast<BinaryOperator>(Val: Op1);
15677 // Try to skip B.
15678 if (A && B && B->hasOneUse()) {
15679 auto *B0 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 0));
15680 auto *B1 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 1));
15681 if (B0 && B0->getParent() == P)
15682 Candidates.emplace_back(Args&: A, Args&: B0);
15683 if (B1 && B1->getParent() == P)
15684 Candidates.emplace_back(Args&: A, Args&: B1);
15685 }
15686 // Try to skip A.
15687 if (B && A && A->hasOneUse()) {
15688 auto *A0 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 0));
15689 auto *A1 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 1));
15690 if (A0 && A0->getParent() == P)
15691 Candidates.emplace_back(Args&: A0, Args&: B);
15692 if (A1 && A1->getParent() == P)
15693 Candidates.emplace_back(Args&: A1, Args&: B);
15694 }
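// For illustration: if I is add(A, B), where both A and B are binary operators
// in this block and B = add(B0, B1) has a single use (with B0/B1 also binary
// operators in this block), the candidate pairs are (A, B), (A, B0) and
// (A, B1); the symmetric case tries to skip A instead.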
15695
15696 if (Candidates.size() == 1)
15697 return tryToVectorizeList(VL: {Op0, Op1}, R);
15698
15699 // We have multiple options. Try to pick the single best.
15700 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
15701 if (!BestCandidate)
15702 return false;
15703 return tryToVectorizeList(
15704 VL: {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
15705}
15706
15707namespace {
15708
15709/// Model horizontal reductions.
15710///
15711/// A horizontal reduction is a tree of reduction instructions that has values
15712/// that can be put into a vector as its leaves. For example:
15713///
15714/// mul mul mul mul
15715/// \ / \ /
15716/// + +
15717/// \ /
15718/// +
15719/// This tree has "mul" as its leaf values and "+" as its reduction
15720/// instructions. A reduction can feed into a store or a binary operation
15721/// feeding a phi.
15722/// ...
15723/// \ /
15724/// +
15725/// |
15726/// phi +=
15727///
15728/// Or:
15729/// ...
15730/// \ /
15731/// +
15732/// |
15733/// *p =
15734///
15735class HorizontalReduction {
15736 using ReductionOpsType = SmallVector<Value *, 16>;
15737 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
15738 ReductionOpsListType ReductionOps;
15739 /// List of possibly reduced values.
15740 SmallVector<SmallVector<Value *>> ReducedVals;
15741 /// Maps reduced value to the corresponding reduction operation.
15742 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
15743 // Use a MapVector to make the output stable.
15744 MapVector<Instruction *, Value *> ExtraArgs;
15745 WeakTrackingVH ReductionRoot;
15746 /// The type of reduction operation.
15747 RecurKind RdxKind;
15748 /// Checks if the optimization of original scalar identity operations on
15749 /// matched horizontal reductions is enabled and allowed.
15750 bool IsSupportedHorRdxIdentityOp = false;
15751
15752 static bool isCmpSelMinMax(Instruction *I) {
15753 return match(V: I, P: m_Select(C: m_Cmp(), L: m_Value(), R: m_Value())) &&
15754 RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: getRdxKind(V: I));
15755 }
15756
15757 // And/or are potentially poison-safe logical patterns like:
15758 // select x, y, false
15759 // select x, true, y
15760 static bool isBoolLogicOp(Instruction *I) {
15761 return isa<SelectInst>(Val: I) &&
15762 (match(V: I, P: m_LogicalAnd()) || match(V: I, P: m_LogicalOr()));
15763 }
15764
15765 /// Checks if instruction is associative and can be vectorized.
15766 static bool isVectorizable(RecurKind Kind, Instruction *I) {
15767 if (Kind == RecurKind::None)
15768 return false;
15769
15770 // Integer ops that map to select instructions or intrinsics are fine.
15771 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
15772 isBoolLogicOp(I))
15773 return true;
15774
15775 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
15776 // FP min/max are associative except for NaN and -0.0. We do not
15777 // have to rule out -0.0 here because the intrinsic semantics do not
15778 // specify a fixed result for it.
15779 return I->getFastMathFlags().noNaNs();
15780 }
15781
15782 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
15783 return true;
15784
15785 return I->isAssociative();
15786 }
15787
15788 static Value *getRdxOperand(Instruction *I, unsigned Index) {
15789 // Poison-safe 'or' takes the form: select X, true, Y
15790 // To make that work with the normal operand processing, we skip the
15791 // true value operand.
15792 // TODO: Change the code and data structures to handle this without a hack.
15793 if (getRdxKind(V: I) == RecurKind::Or && isa<SelectInst>(Val: I) && Index == 1)
15794 return I->getOperand(i: 2);
15795 return I->getOperand(i: Index);
15796 }
15797
15798 /// Creates reduction operation with the current opcode.
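/// For illustration, for RecurKind::SMax with UseSelect the emitted IR is
/// roughly:
///   %cmp = icmp sgt %lhs, %rhs
///   %res = select %cmp, %lhs, %rhs
/// and without UseSelect a single call to the llvm.smax intrinsic is created.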
15799 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
15800 Value *RHS, const Twine &Name, bool UseSelect) {
15801 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
15802 switch (Kind) {
15803 case RecurKind::Or:
15804 if (UseSelect &&
15805 LHS->getType() == CmpInst::makeCmpResultType(opnd_type: LHS->getType()))
15806 return Builder.CreateSelect(C: LHS, True: Builder.getTrue(), False: RHS, Name);
15807 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
15808 Name);
15809 case RecurKind::And:
15810 if (UseSelect &&
15811 LHS->getType() == CmpInst::makeCmpResultType(opnd_type: LHS->getType()))
15812 return Builder.CreateSelect(C: LHS, True: RHS, False: Builder.getFalse(), Name);
15813 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
15814 Name);
15815 case RecurKind::Add:
15816 case RecurKind::Mul:
15817 case RecurKind::Xor:
15818 case RecurKind::FAdd:
15819 case RecurKind::FMul:
15820 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
15821 Name);
15822 case RecurKind::FMax:
15823 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: maxnum, LHS, RHS);
15824 case RecurKind::FMin:
15825 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: minnum, LHS, RHS);
15826 case RecurKind::FMaximum:
15827 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: maximum, LHS, RHS);
15828 case RecurKind::FMinimum:
15829 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: minimum, LHS, RHS);
15830 case RecurKind::SMax:
15831 if (UseSelect) {
15832 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
15833 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
15834 }
15835 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: smax, LHS, RHS);
15836 case RecurKind::SMin:
15837 if (UseSelect) {
15838 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
15839 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
15840 }
15841 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: smin, LHS, RHS);
15842 case RecurKind::UMax:
15843 if (UseSelect) {
15844 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
15845 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
15846 }
15847 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: umax, LHS, RHS);
15848 case RecurKind::UMin:
15849 if (UseSelect) {
15850 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
15851 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
15852 }
15853 return Builder.CreateBinaryIntrinsic(Intrinsic::ID: umin, LHS, RHS);
15854 default:
15855 llvm_unreachable("Unknown reduction operation.");
15856 }
15857 }
15858
15859 /// Creates reduction operation with the current opcode with the IR flags
15860 /// from \p ReductionOps, dropping nuw/nsw flags.
15861 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
15862 Value *RHS, const Twine &Name,
15863 const ReductionOpsListType &ReductionOps) {
15864 bool UseSelect = ReductionOps.size() == 2 ||
15865 // Logical or/and.
15866 (ReductionOps.size() == 1 &&
15867 any_of(Range: ReductionOps.front(), P: IsaPred<SelectInst>));
15868 assert((!UseSelect || ReductionOps.size() != 2 ||
15869 isa<SelectInst>(ReductionOps[1][0])) &&
15870 "Expected cmp + select pairs for reduction");
15871 Value *Op = createOp(Builder, Kind: RdxKind, LHS, RHS, Name, UseSelect);
15872 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: RdxKind)) {
15873 if (auto *Sel = dyn_cast<SelectInst>(Val: Op)) {
15874 propagateIRFlags(I: Sel->getCondition(), VL: ReductionOps[0], OpValue: nullptr,
15875 /*IncludeWrapFlags=*/false);
15876 propagateIRFlags(I: Op, VL: ReductionOps[1], OpValue: nullptr,
15877 /*IncludeWrapFlags=*/false);
15878 return Op;
15879 }
15880 }
15881 propagateIRFlags(I: Op, VL: ReductionOps[0], OpValue: nullptr, /*IncludeWrapFlags=*/false);
15882 return Op;
15883 }
15884
15885public:
15886 static RecurKind getRdxKind(Value *V) {
15887 auto *I = dyn_cast<Instruction>(Val: V);
15888 if (!I)
15889 return RecurKind::None;
15890 if (match(V: I, P: m_Add(L: m_Value(), R: m_Value())))
15891 return RecurKind::Add;
15892 if (match(V: I, P: m_Mul(L: m_Value(), R: m_Value())))
15893 return RecurKind::Mul;
15894 if (match(V: I, P: m_And(L: m_Value(), R: m_Value())) ||
15895 match(V: I, P: m_LogicalAnd(L: m_Value(), R: m_Value())))
15896 return RecurKind::And;
15897 if (match(V: I, P: m_Or(L: m_Value(), R: m_Value())) ||
15898 match(V: I, P: m_LogicalOr(L: m_Value(), R: m_Value())))
15899 return RecurKind::Or;
15900 if (match(V: I, P: m_Xor(L: m_Value(), R: m_Value())))
15901 return RecurKind::Xor;
15902 if (match(V: I, P: m_FAdd(L: m_Value(), R: m_Value())))
15903 return RecurKind::FAdd;
15904 if (match(V: I, P: m_FMul(L: m_Value(), R: m_Value())))
15905 return RecurKind::FMul;
15906
15907 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
15908 return RecurKind::FMax;
15909 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
15910 return RecurKind::FMin;
15911
15912 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
15913 return RecurKind::FMaximum;
15914 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
15915 return RecurKind::FMinimum;
15916 // This matches either cmp+select or intrinsics. SLP is expected to handle
15917 // either form.
15918 // TODO: If we are canonicalizing to intrinsics, we can remove several
15919 // special-case paths that deal with selects.
15920 if (match(V: I, P: m_SMax(L: m_Value(), R: m_Value())))
15921 return RecurKind::SMax;
15922 if (match(V: I, P: m_SMin(L: m_Value(), R: m_Value())))
15923 return RecurKind::SMin;
15924 if (match(V: I, P: m_UMax(L: m_Value(), R: m_Value())))
15925 return RecurKind::UMax;
15926 if (match(V: I, P: m_UMin(L: m_Value(), R: m_Value())))
15927 return RecurKind::UMin;
15928
15929 if (auto *Select = dyn_cast<SelectInst>(Val: I)) {
15930 // Try harder: look for a min/max pattern based on instructions producing
15931 // the same values, such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
15932 // During the intermediate stages of SLP, it's very common to have a
15933 // pattern like this (since optimizeGatherSequence is run only once
15934 // at the end):
15935 // %1 = extractelement <2 x i32> %a, i32 0
15936 // %2 = extractelement <2 x i32> %a, i32 1
15937 // %cond = icmp sgt i32 %1, %2
15938 // %3 = extractelement <2 x i32> %a, i32 0
15939 // %4 = extractelement <2 x i32> %a, i32 1
15940 // %select = select i1 %cond, i32 %3, i32 %4
15941 CmpInst::Predicate Pred;
15942 Instruction *L1;
15943 Instruction *L2;
15944
15945 Value *LHS = Select->getTrueValue();
15946 Value *RHS = Select->getFalseValue();
15947 Value *Cond = Select->getCondition();
15948
15949 // TODO: Support inverse predicates.
15950 if (match(V: Cond, P: m_Cmp(Pred, L: m_Specific(V: LHS), R: m_Instruction(I&: L2)))) {
15951 if (!isa<ExtractElementInst>(Val: RHS) ||
15952 !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
15953 return RecurKind::None;
15954 } else if (match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Specific(V: RHS)))) {
15955 if (!isa<ExtractElementInst>(Val: LHS) ||
15956 !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)))
15957 return RecurKind::None;
15958 } else {
15959 if (!isa<ExtractElementInst>(Val: LHS) || !isa<ExtractElementInst>(Val: RHS))
15960 return RecurKind::None;
15961 if (!match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Instruction(I&: L2))) ||
15962 !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)) ||
15963 !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
15964 return RecurKind::None;
15965 }
15966
15967 switch (Pred) {
15968 default:
15969 return RecurKind::None;
15970 case CmpInst::ICMP_SGT:
15971 case CmpInst::ICMP_SGE:
15972 return RecurKind::SMax;
15973 case CmpInst::ICMP_SLT:
15974 case CmpInst::ICMP_SLE:
15975 return RecurKind::SMin;
15976 case CmpInst::ICMP_UGT:
15977 case CmpInst::ICMP_UGE:
15978 return RecurKind::UMax;
15979 case CmpInst::ICMP_ULT:
15980 case CmpInst::ICMP_ULE:
15981 return RecurKind::UMin;
15982 }
15983 }
15984 return RecurKind::None;
15985 }
15986
15987 /// Get the index of the first operand.
15988 static unsigned getFirstOperandIndex(Instruction *I) {
15989 return isCmpSelMinMax(I) ? 1 : 0;
15990 }
15991
15992private:
15993 /// Total number of operands in the reduction operation.
15994 static unsigned getNumberOfOperands(Instruction *I) {
15995 return isCmpSelMinMax(I) ? 3 : 2;
15996 }
15997
15998 /// Checks if the instruction is in basic block \p BB.
15999 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
16000 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
16001 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
16002 auto *Sel = cast<SelectInst>(Val: I);
16003 auto *Cmp = dyn_cast<Instruction>(Val: Sel->getCondition());
16004 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
16005 }
16006 return I->getParent() == BB;
16007 }
16008
16009 /// Expected number of uses for reduction operations/reduced values.
16010 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
16011 if (IsCmpSelMinMax) {
16012 // The SelectInst must be used twice, while the condition op must have a
16013 // single use only.
16014 if (auto *Sel = dyn_cast<SelectInst>(Val: I))
16015 return Sel->hasNUses(N: 2) && Sel->getCondition()->hasOneUse();
16016 return I->hasNUses(N: 2);
16017 }
16018
16019 // Arithmetic reduction operation must be used once only.
16020 return I->hasOneUse();
16021 }
16022
16023 /// Initializes the list of reduction operations.
16024 void initReductionOps(Instruction *I) {
16025 if (isCmpSelMinMax(I))
16026 ReductionOps.assign(NumElts: 2, Elt: ReductionOpsType());
16027 else
16028 ReductionOps.assign(NumElts: 1, Elt: ReductionOpsType());
16029 }
16030
16031 /// Add all reduction operations for the reduction instruction \p I.
16032 void addReductionOps(Instruction *I) {
16033 if (isCmpSelMinMax(I)) {
16034 ReductionOps[0].emplace_back(Args: cast<SelectInst>(Val: I)->getCondition());
16035 ReductionOps[1].emplace_back(Args&: I);
16036 } else {
16037 ReductionOps[0].emplace_back(Args&: I);
16038 }
16039 }
16040
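/// Checks whether the group of reduced values \p Data looks profitable to
/// reduce: more than one value, a constant, or a single non-load instruction
/// whose opcode is valid for alternation.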
16041 static bool isGoodForReduction(ArrayRef<Value *> Data) {
16042 int Sz = Data.size();
16043 auto *I = dyn_cast<Instruction>(Val: Data.front());
16044 return Sz > 1 || isConstant(V: Data.front()) ||
16045 (I && !isa<LoadInst>(Val: I) && isValidForAlternation(Opcode: I->getOpcode()));
16046 }
16047
16048public:
16049 HorizontalReduction() = default;
16050
16051 /// Try to find a reduction tree.
16052 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
16053 ScalarEvolution &SE, const DataLayout &DL,
16054 const TargetLibraryInfo &TLI) {
16055 RdxKind = HorizontalReduction::getRdxKind(V: Root);
16056 if (!isVectorizable(Kind: RdxKind, I: Root))
16057 return false;
16058
16059 // Analyze "regular" integer/FP types for reductions - no target-specific
16060 // types or pointers.
16061 Type *Ty = Root->getType();
16062 if (!isValidElementType(Ty) || Ty->isPointerTy())
16063 return false;
16064
16065 // Though the ultimate reduction may have multiple uses, its condition must
16066 // have only a single use.
16067 if (auto *Sel = dyn_cast<SelectInst>(Val: Root))
16068 if (!Sel->getCondition()->hasOneUse())
16069 return false;
16070
16071 ReductionRoot = Root;
16072
16073 // Iterate through all the operands of the possible reduction tree and
16074 // gather all the reduced values, sorting them by their value id.
16075 BasicBlock *BB = Root->getParent();
16076 bool IsCmpSelMinMax = isCmpSelMinMax(I: Root);
16077 SmallVector<Instruction *> Worklist(1, Root);
16078 // Checks if the operands of the \p TreeN instruction are also reduction
16079 // operations or should be treated as reduced values or an extra argument,
16080 // which is not part of the reduction.
16081 auto CheckOperands = [&](Instruction *TreeN,
16082 SmallVectorImpl<Value *> &ExtraArgs,
16083 SmallVectorImpl<Value *> &PossibleReducedVals,
16084 SmallVectorImpl<Instruction *> &ReductionOps) {
16085 for (int I = getFirstOperandIndex(I: TreeN),
16086 End = getNumberOfOperands(I: TreeN);
16087 I < End; ++I) {
16088 Value *EdgeVal = getRdxOperand(I: TreeN, Index: I);
16089 ReducedValsToOps[EdgeVal].push_back(Elt: TreeN);
16090 auto *EdgeInst = dyn_cast<Instruction>(Val: EdgeVal);
16091 // Edge has wrong parent - mark as an extra argument.
16092 if (EdgeInst && !isVectorLikeInstWithConstOps(V: EdgeInst) &&
16093 !hasSameParent(I: EdgeInst, BB)) {
16094 ExtraArgs.push_back(Elt: EdgeVal);
16095 continue;
16096 }
16097 // If the edge is not an instruction, differs from the main reduction
16098 // opcode, or has too many uses, treat it as a possible reduced value.
16099 // Also, do not try to reduce constant values if the operation is not
16100 // foldable.
16101 if (!EdgeInst || getRdxKind(V: EdgeInst) != RdxKind ||
16102 IsCmpSelMinMax != isCmpSelMinMax(I: EdgeInst) ||
16103 !hasRequiredNumberOfUses(IsCmpSelMinMax, I: EdgeInst) ||
16104 !isVectorizable(Kind: RdxKind, I: EdgeInst) ||
16105 (R.isAnalyzedReductionRoot(I: EdgeInst) &&
16106 all_of(Range: EdgeInst->operands(), P: IsaPred<Constant>))) {
16107 PossibleReducedVals.push_back(Elt: EdgeVal);
16108 continue;
16109 }
16110 ReductionOps.push_back(Elt: EdgeInst);
16111 }
16112 };
16113 // Try to regroup the reduced values so that reducing them becomes more
16114 // profitable. Values are grouped by their value ids, instructions by their
16115 // instruction op id and/or alternate op id, plus extra analysis is done for
16116 // loads (grouping them by the distance between pointers) and cmp
16117 // instructions (grouping them by the predicate).
16118 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
16119 PossibleReducedVals;
16120 initReductionOps(I: Root);
16121 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
16122 SmallSet<size_t, 2> LoadKeyUsed;
16123 SmallPtrSet<Value *, 4> DoNotReverseVals;
16124
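// Computes a hash subkey for a load so that loads which are likely to form a
// consecutive or otherwise pointer-compatible sequence (based on the pointer
// difference or pointer compatibility with previously seen loads) receive the
// same subkey when grouping the possible reduced values.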
16125 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
16126 Value *Ptr = getUnderlyingObject(V: LI->getPointerOperand());
16127 if (LoadKeyUsed.contains(V: Key)) {
16128 auto LIt = LoadsMap.find(Val: Ptr);
16129 if (LIt != LoadsMap.end()) {
16130 for (LoadInst *RLI : LIt->second) {
16131 if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
16132 ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL, SE,
16133 /*StrictCheck=*/true))
16134 return hash_value(ptr: RLI->getPointerOperand());
16135 }
16136 for (LoadInst *RLI : LIt->second) {
16137 if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
16138 Ptr2: LI->getPointerOperand(), TLI)) {
16139 hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
16140 DoNotReverseVals.insert(Ptr: RLI);
16141 return SubKey;
16142 }
16143 }
16144 if (LIt->second.size() > 2) {
16145 hash_code SubKey =
16146 hash_value(ptr: LIt->second.back()->getPointerOperand());
16147 DoNotReverseVals.insert(Ptr: LIt->second.back());
16148 return SubKey;
16149 }
16150 }
16151 }
16152 LoadKeyUsed.insert(V: Key);
16153 LoadsMap.try_emplace(Key: Ptr).first->second.push_back(Elt: LI);
16154 return hash_value(ptr: LI->getPointerOperand());
16155 };
16156
16157 while (!Worklist.empty()) {
16158 Instruction *TreeN = Worklist.pop_back_val();
16159 SmallVector<Value *> Args;
16160 SmallVector<Value *> PossibleRedVals;
16161 SmallVector<Instruction *> PossibleReductionOps;
16162 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
16163 // If there are too many extra arguments, mark the instruction itself as a
16164 // reduction value, not a reduction operation.
16165 if (Args.size() < 2) {
16166 addReductionOps(I: TreeN);
16167 // Add extra args.
16168 if (!Args.empty()) {
16169 assert(Args.size() == 1 && "Expected only single argument.");
16170 ExtraArgs[TreeN] = Args.front();
16171 }
16172 // Add reduction values. The values are sorted for better vectorization
16173 // results.
16174 for (Value *V : PossibleRedVals) {
16175 size_t Key, Idx;
16176 std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
16177 /*AllowAlternate=*/false);
16178 ++PossibleReducedVals[Key][Idx]
16179 .insert(KV: std::make_pair(x&: V, y: 0))
16180 .first->second;
16181 }
16182 Worklist.append(in_start: PossibleReductionOps.rbegin(),
16183 in_end: PossibleReductionOps.rend());
16184 } else {
16185 size_t Key, Idx;
16186 std::tie(args&: Key, args&: Idx) = generateKeySubkey(V: TreeN, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
16187 /*AllowAlternate=*/false);
16188 ++PossibleReducedVals[Key][Idx]
16189 .insert(KV: std::make_pair(x&: TreeN, y: 0))
16190 .first->second;
16191 }
16192 }
16193 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
16194 // Sort the values by the total number of value kinds so that the reduction
16195 // starts from the longest possible sequences of reduced values.
16196 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
16197 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
16198 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
16199 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
16200 It != E; ++It) {
16201 PossibleRedValsVect.emplace_back();
16202 auto RedValsVect = It->second.takeVector();
16203 stable_sort(Range&: RedValsVect, C: llvm::less_second());
16204 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
16205 PossibleRedValsVect.back().append(NumInputs: Data.second, Elt: Data.first);
16206 }
16207 stable_sort(Range&: PossibleRedValsVect, C: [](const auto &P1, const auto &P2) {
16208 return P1.size() > P2.size();
16209 });
16210 int NewIdx = -1;
16211 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
16212 if (isGoodForReduction(Data) ||
16213 (isa<LoadInst>(Val: Data.front()) && NewIdx >= 0 &&
16214 isa<LoadInst>(Val: ReducedVals[NewIdx].front()) &&
16215 getUnderlyingObject(
16216 V: cast<LoadInst>(Val: Data.front())->getPointerOperand()) ==
16217 getUnderlyingObject(V: cast<LoadInst>(Val: ReducedVals[NewIdx].front())
16218 ->getPointerOperand()))) {
16219 if (NewIdx < 0) {
16220 NewIdx = ReducedVals.size();
16221 ReducedVals.emplace_back();
16222 }
16223 if (DoNotReverseVals.contains(Ptr: Data.front()))
16224 ReducedVals[NewIdx].append(in_start: Data.begin(), in_end: Data.end());
16225 else
16226 ReducedVals[NewIdx].append(in_start: Data.rbegin(), in_end: Data.rend());
16227 } else {
16228 ReducedVals.emplace_back().append(in_start: Data.rbegin(), in_end: Data.rend());
16229 }
16230 }
16231 }
16232 // Sort the groups of reduced values by the number of values with the same or
16233 // alternate opcode and/or the same pointer operand.
16234 stable_sort(Range&: ReducedVals, C: [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
16235 return P1.size() > P2.size();
16236 });
16237 return true;
16238 }
16239
16240 /// Attempt to vectorize the tree found by matchAssociativeReduction.
16241 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
16242 const TargetLibraryInfo &TLI) {
16243 constexpr int ReductionLimit = 4;
16244 constexpr unsigned RegMaxNumber = 4;
16245 constexpr unsigned RedValsMaxNumber = 128;
16246 // If there are a sufficient number of reduction values, reduce
16247 // to a nearby power-of-2. We can safely generate oversized
16248 // vectors and rely on the backend to split them to legal sizes.
16249 unsigned NumReducedVals =
16250 std::accumulate(first: ReducedVals.begin(), last: ReducedVals.end(), init: 0,
16251 binary_op: [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
16252 if (!isGoodForReduction(Data: Vals))
16253 return Num;
16254 return Num + Vals.size();
16255 });
16256 if (NumReducedVals < ReductionLimit &&
16257 (!AllowHorRdxIdenityOptimization ||
16258 all_of(Range&: ReducedVals, P: [](ArrayRef<Value *> RedV) {
16259 return RedV.size() < 2 || !allConstant(VL: RedV) || !isSplat(VL: RedV);
16260 }))) {
16261 for (ReductionOpsType &RdxOps : ReductionOps)
16262 for (Value *RdxOp : RdxOps)
16263 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
16264 return nullptr;
16265 }
16266
16267 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
16268 TargetFolder(DL));
16269 Builder.SetInsertPoint(cast<Instruction>(Val&: ReductionRoot));
16270
16271 // Track the reduced values in case they are replaced by extractelement
16272 // instructions because of the vectorization.
16273 DenseMap<Value *, WeakTrackingVH> TrackedVals(
16274 ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
16275 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
16276 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
16277 ExternallyUsedValues.reserve(NumEntries: ExtraArgs.size() + 1);
16278 // The same extra argument may be used several times, so log each attempt
16279 // to use it.
16280 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
16281 assert(Pair.first && "DebugLoc must be set.");
16282 ExternallyUsedValues[Pair.second].push_back(Elt: Pair.first);
16283 TrackedVals.try_emplace(Key: Pair.second, Args: Pair.second);
16284 }
16285
16286 // The compare instruction of a min/max is the insertion point for new
16287 // instructions and may be replaced with a new compare instruction.
16288 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
16289 assert(isa<SelectInst>(RdxRootInst) &&
16290 "Expected min/max reduction to have select root instruction");
16291 Value *ScalarCond = cast<SelectInst>(Val: RdxRootInst)->getCondition();
16292 assert(isa<Instruction>(ScalarCond) &&
16293 "Expected min/max reduction to have compare condition");
16294 return cast<Instruction>(Val: ScalarCond);
16295 };
16296
16297 // Return new VectorizedTree, based on previous value.
16298 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
16299 if (VectorizedTree) {
16300 // Update the final value in the reduction.
16301 Builder.SetCurrentDebugLocation(
16302 cast<Instruction>(Val: ReductionOps.front().front())->getDebugLoc());
16303 if ((isa<PoisonValue>(Val: VectorizedTree) && !isa<PoisonValue>(Val: Res)) ||
16304 (isGuaranteedNotToBePoison(V: Res) &&
16305 !isGuaranteedNotToBePoison(V: VectorizedTree))) {
16306 auto It = ReducedValsToOps.find(Val: Res);
16307 if (It != ReducedValsToOps.end() &&
16308 any_of(Range&: It->getSecond(),
16309 P: [](Instruction *I) { return isBoolLogicOp(I); }))
16310 std::swap(a&: VectorizedTree, b&: Res);
16311 }
16312
16313 return createOp(Builder, RdxKind, LHS: VectorizedTree, RHS: Res, Name: "op.rdx",
16314 ReductionOps);
16315 }
16316 // Initialize the final value in the reduction.
16317 return Res;
16318 };
16319 bool AnyBoolLogicOp =
16320 any_of(Range&: ReductionOps.back(), P: [](Value *V) {
16321 return isBoolLogicOp(I: cast<Instruction>(Val: V));
16322 });
16323 // The reduction root is used as the insertion point for new instructions,
16324 // so set it as externally used to prevent it from being deleted.
16325 ExternallyUsedValues[ReductionRoot];
16326 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
16327 ReductionOps.front().size());
16328 for (ReductionOpsType &RdxOps : ReductionOps)
16329 for (Value *RdxOp : RdxOps) {
16330 if (!RdxOp)
16331 continue;
16332 IgnoreList.insert(V: RdxOp);
16333 }
16334 // Intersect the fast-math-flags from all reduction operations.
16335 FastMathFlags RdxFMF;
16336 RdxFMF.set();
16337 for (Value *U : IgnoreList)
16338 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: U))
16339 RdxFMF &= FPMO->getFastMathFlags();
16340 bool IsCmpSelMinMax = isCmpSelMinMax(I: cast<Instruction>(Val&: ReductionRoot));
16341
16342 // Need to track the reduced values, as they may be changed during
16343 // vectorization of subvectors.
16344 for (ArrayRef<Value *> Candidates : ReducedVals)
16345 for (Value *V : Candidates)
16346 TrackedVals.try_emplace(Key: V, Args&: V);
16347
16348 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
16349 // List of the values that were reduced in other trees as part of gather
16350 // nodes and thus require an extract if fully vectorized in other trees.
16351 SmallPtrSet<Value *, 4> RequiredExtract;
16352 Value *VectorizedTree = nullptr;
16353 bool CheckForReusedReductionOps = false;
16354 // Try to vectorize elements based on their type.
16355 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
16356 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
16357 InstructionsState S = getSameOpcode(VL: OrigReducedVals, TLI);
16358 SmallVector<Value *> Candidates;
16359 Candidates.reserve(N: 2 * OrigReducedVals.size());
16360 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
16361 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
16362 Value *RdxVal = TrackedVals.find(Val: OrigReducedVals[Cnt])->second;
16363 // Check if the reduction value was not overridden by the extractelement
16364 // instruction because of the vectorization, and exclude it if it is not
16365 // compatible with other values.
16366 // Also check if the instruction was folded to a constant/other value.
16367 auto *Inst = dyn_cast<Instruction>(Val: RdxVal);
16368 if ((Inst && isVectorLikeInstWithConstOps(V: Inst) &&
16369 (!S.getOpcode() || !S.isOpcodeOrAlt(I: Inst))) ||
16370 (S.getOpcode() && !Inst))
16371 continue;
16372 Candidates.push_back(Elt: RdxVal);
16373 TrackedToOrig.try_emplace(Key: RdxVal, Args: OrigReducedVals[Cnt]);
16374 }
16375 bool ShuffledExtracts = false;
16376 // Try to handle shuffled extractelements.
16377 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
16378 I + 1 < E) {
16379 InstructionsState NextS = getSameOpcode(VL: ReducedVals[I + 1], TLI);
16380 if (NextS.getOpcode() == Instruction::ExtractElement &&
16381 !NextS.isAltShuffle()) {
16382 SmallVector<Value *> CommonCandidates(Candidates);
16383 for (Value *RV : ReducedVals[I + 1]) {
16384 Value *RdxVal = TrackedVals.find(Val: RV)->second;
          // Check whether the reduction value was overridden by an
          // extractelement instruction because of earlier vectorization and
          // exclude it if it is not compatible with the other values.
16388 if (auto *Inst = dyn_cast<Instruction>(Val: RdxVal))
16389 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(I: Inst))
16390 continue;
16391 CommonCandidates.push_back(Elt: RdxVal);
16392 TrackedToOrig.try_emplace(Key: RdxVal, Args&: RV);
16393 }
16394 SmallVector<int> Mask;
16395 if (isFixedVectorShuffle(VL: CommonCandidates, Mask)) {
16396 ++I;
16397 Candidates.swap(RHS&: CommonCandidates);
16398 ShuffledExtracts = true;
16399 }
16400 }
16401 }
16402
16403 // Emit code for constant values.
16404 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
16405 allConstant(VL: Candidates)) {
16406 Value *Res = Candidates.front();
16407 ++VectorizedVals.try_emplace(Key: Candidates.front(), Args: 0).first->getSecond();
16408 for (Value *VC : ArrayRef(Candidates).drop_front()) {
16409 Res = createOp(Builder, RdxKind, LHS: Res, RHS: VC, Name: "const.rdx", ReductionOps);
16410 ++VectorizedVals.try_emplace(Key: VC, Args: 0).first->getSecond();
16411 if (auto *ResI = dyn_cast<Instruction>(Val: Res))
16412 V.analyzedReductionRoot(I: ResI);
16413 }
16414 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
16415 continue;
16416 }
16417
16418 unsigned NumReducedVals = Candidates.size();
16419 if (NumReducedVals < ReductionLimit &&
16420 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
16421 !isSplat(VL: Candidates)))
16422 continue;
16423
16424 // Check if we support repeated scalar values processing (optimization of
16425 // original scalar identity operations on matched horizontal reductions).
16426 IsSupportedHorRdxIdentityOp =
16427 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
16428 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
16429 // Gather same values.
16430 MapVector<Value *, unsigned> SameValuesCounter;
16431 if (IsSupportedHorRdxIdentityOp)
16432 for (Value *V : Candidates)
16433 ++SameValuesCounter.insert(KV: std::make_pair(x&: V, y: 0)).first->second;
      // Used to check whether the reduced values are used the same number of
      // times. In that case the compiler may produce better code. E.g. if the
      // reduced values are aabbccdd (8 x values), then the first node of the
      // tree will have a node for 4 x abcd + shuffle <4 x abcd>,
      // <0, 0, 1, 1, 2, 2, 3, 3>, and the final reduction will be performed
      // on <8 x aabbccdd>. Instead, the compiler may build the <4 x abcd> tree
      // immediately and compute reduction(4 x abcd) * 2.
      // Currently this only handles add/fadd/xor; and/or/min/max do not
      // require this analysis, and other operations may require an extra
      // estimation of the profitability.
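      // Illustrative sketch (hypothetical IR) of that optimization for an add
      // reduction of aabbccdd with a common scale factor of 2:
      //   %v   = <4 x i32> <a, b, c, d>
      //   %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
      //   %res = mul i32 %red, 2
      // instead of reducing the full <8 x i32> <a, a, b, b, c, c, d, d>.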
16444 bool SameScaleFactor = false;
16445 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
16446 SameValuesCounter.size() != Candidates.size();
16447 if (OptReusedScalars) {
16448 SameScaleFactor =
16449 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
16450 RdxKind == RecurKind::Xor) &&
16451 all_of(Range: drop_begin(RangeOrContainer&: SameValuesCounter),
16452 P: [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
16453 return P.second == SameValuesCounter.front().second;
16454 });
16455 Candidates.resize(N: SameValuesCounter.size());
16456 transform(Range&: SameValuesCounter, d_first: Candidates.begin(),
16457 F: [](const auto &P) { return P.first; });
16458 NumReducedVals = Candidates.size();
16459 // Have a reduction of the same element.
16460 if (NumReducedVals == 1) {
16461 Value *OrigV = TrackedToOrig.find(Val: Candidates.front())->second;
16462 unsigned Cnt = SameValuesCounter.lookup(Key: OrigV);
16463 Value *RedVal =
16464 emitScaleForReusedOps(VectorizedValue: Candidates.front(), Builder, Cnt);
16465 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
16466 VectorizedVals.try_emplace(Key: OrigV, Args&: Cnt);
16467 continue;
16468 }
16469 }
16470
16471 unsigned MaxVecRegSize = V.getMaxVecRegSize();
16472 unsigned EltSize = V.getVectorElementSize(V: Candidates[0]);
16473 unsigned MaxElts =
16474 RegMaxNumber * llvm::bit_floor(Value: MaxVecRegSize / EltSize);
16475
16476 unsigned ReduxWidth = std::min<unsigned>(
16477 a: llvm::bit_floor(Value: NumReducedVals),
16478 b: std::clamp<unsigned>(val: MaxElts, lo: RedValsMaxNumber,
16479 hi: RegMaxNumber * RedValsMaxNumber));
16480 unsigned Start = 0;
16481 unsigned Pos = Start;
      // Restarts the vectorization attempt with a lower vectorization factor.
16483 unsigned PrevReduxWidth = ReduxWidth;
16484 bool CheckForReusedReductionOpsLocal = false;
16485 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
16486 &CheckForReusedReductionOpsLocal,
16487 &PrevReduxWidth, &V,
16488 &IgnoreList](bool IgnoreVL = false) {
16489 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(Vals: IgnoreList);
16490 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, it is worth
          // trying again with a smaller number of reduction ops.
16493 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
16494 }
16495 ++Pos;
16496 if (Pos < NumReducedVals - ReduxWidth + 1)
16497 return IsAnyRedOpGathered;
16498 Pos = Start;
16499 ReduxWidth /= 2;
16500 return IsAnyRedOpGathered;
16501 };
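      // E.g. (hypothetical), with NumReducedVals == 7 and ReduxWidth == 4, the
      // candidate windows start at Pos == 0, 1, 2, 3; once those are
      // exhausted, Pos resets to Start and ReduxWidth is halved to 2 for
      // another round of attempts.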
16502 bool AnyVectorized = false;
16503 while (Pos < NumReducedVals - ReduxWidth + 1 &&
16504 ReduxWidth >= ReductionLimit) {
        // Dependency in the tree of the reduction ops - drop this attempt and
        // try later.
16507 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
16508 Start == 0) {
16509 CheckForReusedReductionOps = true;
16510 break;
16511 }
16512 PrevReduxWidth = ReduxWidth;
16513 ArrayRef<Value *> VL(std::next(x: Candidates.begin(), n: Pos), ReduxWidth);
        // Already being analyzed - skip.
16515 if (V.areAnalyzedReductionVals(VL)) {
16516 (void)AdjustReducedVals(/*IgnoreVL=*/true);
16517 continue;
16518 }
16519 // Early exit if any of the reduction values were deleted during
16520 // previous vectorization attempts.
16521 if (any_of(Range&: VL, P: [&V](Value *RedVal) {
16522 auto *RedValI = dyn_cast<Instruction>(Val: RedVal);
16523 if (!RedValI)
16524 return false;
16525 return V.isDeleted(I: RedValI);
16526 }))
16527 break;
16528 V.buildTree(Roots: VL, UserIgnoreLst: IgnoreList);
16529 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
16530 if (!AdjustReducedVals())
16531 V.analyzedReductionVals(VL);
16532 continue;
16533 }
16534 if (V.isLoadCombineReductionCandidate(RdxKind)) {
16535 if (!AdjustReducedVals())
16536 V.analyzedReductionVals(VL);
16537 continue;
16538 }
16539 V.reorderTopToBottom();
16540 // No need to reorder the root node at all.
16541 V.reorderBottomToTop(/*IgnoreReorder=*/true);
        // Keep the other extracted reduction values if they are used in the
        // vectorization trees.
16544 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
16545 ExternallyUsedValues);
16546 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
16547 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
16548 continue;
16549 for (Value *V : ReducedVals[Cnt])
16550 if (isa<Instruction>(Val: V))
16551 LocalExternallyUsedValues[TrackedVals[V]];
16552 }
16553 if (!IsSupportedHorRdxIdentityOp) {
16554 // Number of uses of the candidates in the vector of values.
16555 assert(SameValuesCounter.empty() &&
16556 "Reused values counter map is not empty");
16557 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16558 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16559 continue;
16560 Value *V = Candidates[Cnt];
16561 Value *OrigV = TrackedToOrig.find(Val: V)->second;
16562 ++SameValuesCounter[OrigV];
16563 }
16564 }
16565 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
16566 // Gather externally used values.
16567 SmallPtrSet<Value *, 4> Visited;
16568 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
16569 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
16570 continue;
16571 Value *RdxVal = Candidates[Cnt];
16572 if (!Visited.insert(Ptr: RdxVal).second)
16573 continue;
16574 // Check if the scalar was vectorized as part of the vectorization
16575 // tree but not the top node.
16576 if (!VLScalars.contains(Ptr: RdxVal) && V.isVectorized(V: RdxVal)) {
16577 LocalExternallyUsedValues[RdxVal];
16578 continue;
16579 }
16580 Value *OrigV = TrackedToOrig.find(Val: RdxVal)->second;
16581 unsigned NumOps =
16582 VectorizedVals.lookup(Val: RdxVal) + SameValuesCounter[OrigV];
16583 if (NumOps != ReducedValsToOps.find(Val: OrigV)->second.size())
16584 LocalExternallyUsedValues[RdxVal];
16585 }
16586 // Do not need the list of reused scalars in regular mode anymore.
16587 if (!IsSupportedHorRdxIdentityOp)
16588 SameValuesCounter.clear();
16589 for (Value *RdxVal : VL)
16590 if (RequiredExtract.contains(Ptr: RdxVal))
16591 LocalExternallyUsedValues[RdxVal];
        // Update LocalExternallyUsedValues for the scalars replaced by
        // extractelement instructions.
16594 DenseMap<Value *, Value *> ReplacementToExternal;
16595 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
16596 ReplacementToExternal.try_emplace(Key: Pair.second, Args: Pair.first);
16597 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
16598 Value *Ext = Pair.first;
16599 auto RIt = ReplacementToExternal.find(Val: Ext);
16600 while (RIt != ReplacementToExternal.end()) {
16601 Ext = RIt->second;
16602 RIt = ReplacementToExternal.find(Val: Ext);
16603 }
16604 auto *It = ExternallyUsedValues.find(Key: Ext);
16605 if (It == ExternallyUsedValues.end())
16606 continue;
16607 LocalExternallyUsedValues[Pair.second].append(RHS: It->second);
16608 }
16609 V.buildExternalUses(ExternallyUsedValues: LocalExternallyUsedValues);
16610
16611 V.computeMinimumValueSizes();
16612 V.transformNodes();
16613
16614 // Estimate cost.
16615 InstructionCost TreeCost = V.getTreeCost(VectorizedVals: VL);
16616 InstructionCost ReductionCost =
16617 getReductionCost(TTI, ReducedVals: VL, IsCmpSelMinMax, ReduxWidth, FMF: RdxFMF);
16618 InstructionCost Cost = TreeCost + ReductionCost;
16619 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16620 << " for reduction\n");
16621 if (!Cost.isValid())
16622 break;
16623 if (Cost >= -SLPCostThreshold) {
16624 V.getORE()->emit(RemarkBuilder: [&]() {
16625 return OptimizationRemarkMissed(
16626 SV_NAME, "HorSLPNotBeneficial",
16627 ReducedValsToOps.find(Val: VL[0])->second.front())
16628 << "Vectorizing horizontal reduction is possible "
16629 << "but not beneficial with cost " << ore::NV("Cost", Cost)
16630 << " and threshold "
16631 << ore::NV("Threshold", -SLPCostThreshold);
16632 });
16633 if (!AdjustReducedVals())
16634 V.analyzedReductionVals(VL);
16635 continue;
16636 }
16637
16638 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
16639 << Cost << ". (HorRdx)\n");
16640 V.getORE()->emit(RemarkBuilder: [&]() {
16641 return OptimizationRemark(
16642 SV_NAME, "VectorizedHorizontalReduction",
16643 ReducedValsToOps.find(Val: VL[0])->second.front())
16644 << "Vectorized horizontal reduction with cost "
16645 << ore::NV("Cost", Cost) << " and with tree size "
16646 << ore::NV("TreeSize", V.getTreeSize());
16647 });
16648
16649 Builder.setFastMathFlags(RdxFMF);
16650
16651 // Emit a reduction. If the root is a select (min/max idiom), the insert
16652 // point is the compare condition of that select.
16653 Instruction *RdxRootInst = cast<Instruction>(Val&: ReductionRoot);
16654 Instruction *InsertPt = RdxRootInst;
16655 if (IsCmpSelMinMax)
16656 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
16657
16658 // Vectorize a tree.
16659 Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues: LocalExternallyUsedValues,
16660 ReplacedExternals, ReductionRoot: InsertPt);
16661
16662 Builder.SetInsertPoint(InsertPt);
16663
16664 // To prevent poison from leaking across what used to be sequential,
16665 // safe, scalar boolean logic operations, the reduction operand must be
16666 // frozen.
16667 if ((isBoolLogicOp(I: RdxRootInst) ||
16668 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
16669 !isGuaranteedNotToBePoison(V: VectorizedRoot))
16670 VectorizedRoot = Builder.CreateFreeze(V: VectorizedRoot);
16671
16672 // Emit code to correctly handle reused reduced values, if required.
16673 if (OptReusedScalars && !SameScaleFactor) {
16674 VectorizedRoot =
16675 emitReusedOps(VectorizedValue: VectorizedRoot, Builder, VL: V.getRootNodeScalars(),
16676 SameValuesCounter, TrackedToOrig);
16677 }
16678
16679 Value *ReducedSubTree =
16680 emitReduction(VectorizedValue: VectorizedRoot, Builder, ReduxWidth, TTI);
16681 if (ReducedSubTree->getType() != VL.front()->getType()) {
16682 ReducedSubTree = Builder.CreateIntCast(
16683 V: ReducedSubTree, DestTy: VL.front()->getType(), isSigned: any_of(Range&: VL, P: [&](Value *R) {
16684 KnownBits Known = computeKnownBits(
16685 V: R, DL: cast<Instruction>(Val: ReductionOps.front().front())
16686 ->getModule()
16687 ->getDataLayout());
16688 return !Known.isNonNegative();
16689 }));
16690 }
16691
        // Improved analysis for add/fadd/xor reductions with the same scale
        // factor for all operands of the reduction. We can emit scalar ops for
        // them instead.
16695 if (OptReusedScalars && SameScaleFactor)
16696 ReducedSubTree = emitScaleForReusedOps(
16697 VectorizedValue: ReducedSubTree, Builder, Cnt: SameValuesCounter.front().second);
16698
16699 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
16700 // Count vectorized reduced values to exclude them from final reduction.
16701 for (Value *RdxVal : VL) {
16702 Value *OrigV = TrackedToOrig.find(Val: RdxVal)->second;
16703 if (IsSupportedHorRdxIdentityOp) {
16704 VectorizedVals.try_emplace(Key: OrigV, Args&: SameValuesCounter[RdxVal]);
16705 continue;
16706 }
16707 ++VectorizedVals.try_emplace(Key: OrigV, Args: 0).first->getSecond();
16708 if (!V.isVectorized(V: RdxVal))
16709 RequiredExtract.insert(Ptr: RdxVal);
16710 }
16711 Pos += ReduxWidth;
16712 Start = Pos;
16713 ReduxWidth = llvm::bit_floor(Value: NumReducedVals - Pos);
16714 AnyVectorized = true;
16715 }
16716 if (OptReusedScalars && !AnyVectorized) {
16717 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
16718 Value *RedVal = emitScaleForReusedOps(VectorizedValue: P.first, Builder, Cnt: P.second);
16719 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
16720 Value *OrigV = TrackedToOrig.find(Val: P.first)->second;
16721 VectorizedVals.try_emplace(Key: OrigV, Args: P.second);
16722 }
16723 continue;
16724 }
16725 }
16726 if (VectorizedTree) {
16727 // Reorder operands of bool logical op in the natural order to avoid
16728 // possible problem with poison propagation. If not possible to reorder
16729 // (both operands are originally RHS), emit an extra freeze instruction
16730 // for the LHS operand.
16731 // I.e., if we have original code like this:
16732 // RedOp1 = select i1 ?, i1 LHS, i1 false
16733 // RedOp2 = select i1 RHS, i1 ?, i1 false
16734
16735 // Then, we swap LHS/RHS to create a new op that matches the poison
16736 // semantics of the original code.
16737
16738 // If we have original code like this and both values could be poison:
16739 // RedOp1 = select i1 ?, i1 LHS, i1 false
16740 // RedOp2 = select i1 ?, i1 RHS, i1 false
16741
16742 // Then, we must freeze LHS in the new op.
16743 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
16744 Instruction *RedOp1,
16745 Instruction *RedOp2,
16746 bool InitStep) {
16747 if (!AnyBoolLogicOp)
16748 return;
16749 if (isBoolLogicOp(I: RedOp1) &&
16750 ((!InitStep && LHS == VectorizedTree) ||
16751 getRdxOperand(I: RedOp1, Index: 0) == LHS || isGuaranteedNotToBePoison(V: LHS)))
16752 return;
16753 if (isBoolLogicOp(I: RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
16754 getRdxOperand(I: RedOp2, Index: 0) == RHS ||
16755 isGuaranteedNotToBePoison(V: RHS))) {
16756 std::swap(a&: LHS, b&: RHS);
16757 return;
16758 }
16759 if (LHS != VectorizedTree)
16760 LHS = Builder.CreateFreeze(V: LHS);
16761 };
      // Finish the reduction.
      // Need to add the extra arguments and the possibly not-vectorized
      // reduction values.
      // Try to avoid dependencies between the scalar remainders after
      // reductions.
16767 auto FinalGen =
16768 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
16769 bool InitStep) {
16770 unsigned Sz = InstVals.size();
16771 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
16772 Sz % 2);
16773 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
16774 Instruction *RedOp = InstVals[I + 1].first;
16775 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
16776 Value *RdxVal1 = InstVals[I].second;
16777 Value *StableRdxVal1 = RdxVal1;
16778 auto It1 = TrackedVals.find(Val: RdxVal1);
16779 if (It1 != TrackedVals.end())
16780 StableRdxVal1 = It1->second;
16781 Value *RdxVal2 = InstVals[I + 1].second;
16782 Value *StableRdxVal2 = RdxVal2;
16783 auto It2 = TrackedVals.find(Val: RdxVal2);
16784 if (It2 != TrackedVals.end())
16785 StableRdxVal2 = It2->second;
16786 // To prevent poison from leaking across what used to be
16787 // sequential, safe, scalar boolean logic operations, the
16788 // reduction operand must be frozen.
16789 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
16790 RedOp, InitStep);
16791 Value *ExtraRed = createOp(Builder, RdxKind, LHS: StableRdxVal1,
16792 RHS: StableRdxVal2, Name: "op.rdx", ReductionOps);
16793 ExtraReds[I / 2] = std::make_pair(x: InstVals[I].first, y&: ExtraRed);
16794 }
16795 if (Sz % 2 == 1)
16796 ExtraReds[Sz / 2] = InstVals.back();
16797 return ExtraReds;
16798 };
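      // E.g. (hypothetical), for leftover scalars [r0, r1, r2, r3, r4] one
      // FinalGen pass emits r0+r1 and r2+r3 and keeps r4, producing
      // [r0+r1, r2+r3, r4]; the loop below repeats this until a single value
      // remains, keeping the dependency chains between the scalar remainders
      // roughly logarithmic instead of linear.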
16799 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
16800 ExtraReductions.emplace_back(Args: cast<Instruction>(Val&: ReductionRoot),
16801 Args&: VectorizedTree);
16802 SmallPtrSet<Value *, 8> Visited;
16803 for (ArrayRef<Value *> Candidates : ReducedVals) {
16804 for (Value *RdxVal : Candidates) {
16805 if (!Visited.insert(Ptr: RdxVal).second)
16806 continue;
16807 unsigned NumOps = VectorizedVals.lookup(Val: RdxVal);
16808 for (Instruction *RedOp :
16809 ArrayRef(ReducedValsToOps.find(Val: RdxVal)->second)
16810 .drop_back(N: NumOps))
16811 ExtraReductions.emplace_back(Args&: RedOp, Args&: RdxVal);
16812 }
16813 }
16814 for (auto &Pair : ExternallyUsedValues) {
16815 // Add each externally used value to the final reduction.
16816 for (auto *I : Pair.second)
16817 ExtraReductions.emplace_back(Args&: I, Args&: Pair.first);
16818 }
      // Iterate through all non-vectorized reduction values/extra arguments.
16820 bool InitStep = true;
16821 while (ExtraReductions.size() > 1) {
16822 VectorizedTree = ExtraReductions.front().second;
16823 SmallVector<std::pair<Instruction *, Value *>> NewReds =
16824 FinalGen(ExtraReductions, InitStep);
16825 ExtraReductions.swap(RHS&: NewReds);
16826 InitStep = false;
16827 }
16828 VectorizedTree = ExtraReductions.front().second;
16829
16830 ReductionRoot->replaceAllUsesWith(V: VectorizedTree);
16831
16832 // The original scalar reduction is expected to have no remaining
16833 // uses outside the reduction tree itself. Assert that we got this
16834 // correct, replace internal uses with undef, and mark for eventual
16835 // deletion.
16836#ifndef NDEBUG
16837 SmallSet<Value *, 4> IgnoreSet;
16838 for (ArrayRef<Value *> RdxOps : ReductionOps)
16839 IgnoreSet.insert(I: RdxOps.begin(), E: RdxOps.end());
16840#endif
16841 for (ArrayRef<Value *> RdxOps : ReductionOps) {
16842 for (Value *Ignore : RdxOps) {
16843 if (!Ignore)
16844 continue;
16845#ifndef NDEBUG
16846 for (auto *U : Ignore->users()) {
16847 assert(IgnoreSet.count(U) &&
                 "All users must be in the reduction ops list.");
16849 }
16850#endif
16851 if (!Ignore->use_empty()) {
16852 Value *Undef = UndefValue::get(T: Ignore->getType());
16853 Ignore->replaceAllUsesWith(V: Undef);
16854 }
16855 V.eraseInstruction(I: cast<Instruction>(Val: Ignore));
16856 }
16857 }
16858 } else if (!CheckForReusedReductionOps) {
16859 for (ReductionOpsType &RdxOps : ReductionOps)
16860 for (Value *RdxOp : RdxOps)
16861 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
16862 }
16863 return VectorizedTree;
16864 }
16865
16866private:
16867 /// Calculate the cost of a reduction.
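  /// \returns the estimated vector cost minus the estimated scalar cost, so a
  /// negative result means the vectorized reduction is expected to be cheaper
  /// than the scalar code it replaces.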
16868 InstructionCost getReductionCost(TargetTransformInfo *TTI,
16869 ArrayRef<Value *> ReducedVals,
16870 bool IsCmpSelMinMax, unsigned ReduxWidth,
16871 FastMathFlags FMF) {
16872 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
16873 Type *ScalarTy = ReducedVals.front()->getType();
16874 FixedVectorType *VectorTy = FixedVectorType::get(ElementType: ScalarTy, NumElts: ReduxWidth);
16875 InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at compile time.
16878 bool AllConsts = allConstant(VL: ReducedVals);
16879 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
16880 InstructionCost Cost = 0;
16881 // Scalar cost is repeated for N-1 elements.
16882 int Cnt = ReducedVals.size();
16883 for (Value *RdxVal : ReducedVals) {
16884 if (Cnt == 1)
16885 break;
16886 --Cnt;
16887 if (RdxVal->hasNUsesOrMore(N: IsCmpSelMinMax ? 3 : 2)) {
16888 Cost += GenCostFn();
16889 continue;
16890 }
16891 InstructionCost ScalarCost = 0;
16892 for (User *U : RdxVal->users()) {
16893 auto *RdxOp = cast<Instruction>(Val: U);
16894 if (hasRequiredNumberOfUses(IsCmpSelMinMax, I: RdxOp)) {
16895 ScalarCost += TTI->getInstructionCost(U: RdxOp, CostKind);
16896 continue;
16897 }
16898 ScalarCost = InstructionCost::getInvalid();
16899 break;
16900 }
16901 if (ScalarCost.isValid())
16902 Cost += ScalarCost;
16903 else
16904 Cost += GenCostFn();
16905 }
16906 return Cost;
16907 };
16908 switch (RdxKind) {
16909 case RecurKind::Add:
16910 case RecurKind::Mul:
16911 case RecurKind::Or:
16912 case RecurKind::And:
16913 case RecurKind::Xor:
16914 case RecurKind::FAdd:
16915 case RecurKind::FMul: {
16916 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
16917 if (!AllConsts)
16918 VectorCost =
16919 TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VectorTy, FMF, CostKind);
16920 ScalarCost = EvaluateScalarCost([&]() {
16921 return TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: ScalarTy, CostKind);
16922 });
16923 break;
16924 }
16925 case RecurKind::FMax:
16926 case RecurKind::FMin:
16927 case RecurKind::FMaximum:
16928 case RecurKind::FMinimum:
16929 case RecurKind::SMax:
16930 case RecurKind::SMin:
16931 case RecurKind::UMax:
16932 case RecurKind::UMin: {
16933 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
16934 if (!AllConsts)
16935 VectorCost = TTI->getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF, CostKind);
16936 ScalarCost = EvaluateScalarCost([&]() {
16937 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
16938 return TTI->getIntrinsicInstrCost(ICA, CostKind);
16939 });
16940 break;
16941 }
16942 default:
16943 llvm_unreachable("Expected arithmetic or min/max reduction operation");
16944 }
16945
16946 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
16947 << " for reduction of " << shortBundleName(ReducedVals)
16948 << " (It is a splitting reduction)\n");
16949 return VectorCost - ScalarCost;
16950 }
16951
16952 /// Emit a horizontal reduction of the vectorized value.
16953 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
16954 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
16955 assert(VectorizedValue && "Need to have a vectorized tree node");
16956 assert(isPowerOf2_32(ReduxWidth) &&
16957 "We only handle power-of-two reductions for now");
16958 assert(RdxKind != RecurKind::FMulAdd &&
16959 "A call to the llvm.fmuladd intrinsic is not handled yet");
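    // For example, an integer add reduction of <8 x i32> %v is typically
    // lowered through the generic reduction intrinsic, roughly:
    //   %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v)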
16960
16961 ++NumVectorInstructions;
16962 return createSimpleTargetReduction(B&: Builder, Src: VectorizedValue, RdxKind);
16963 }
16964
16965 /// Emits optimized code for unique scalar value reused \p Cnt times.
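  /// For example (hypothetical values), for RecurKind::Add with Cnt == 3 and
  /// a scalar value %x this emits 'mul %x, 3', for RecurKind::FAdd it emits
  /// 'fmul %x, 3.0', and for RecurKind::Xor the result is %x for an odd Cnt
  /// and zero for an even Cnt.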
16966 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
16967 unsigned Cnt) {
16968 assert(IsSupportedHorRdxIdentityOp &&
16969 "The optimization of matched scalar identity horizontal reductions "
16970 "must be supported.");
16971 switch (RdxKind) {
16972 case RecurKind::Add: {
16973 // res = mul vv, n
16974 Value *Scale = ConstantInt::get(Ty: VectorizedValue->getType(), V: Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
16976 << VectorizedValue << ". (HorRdx)\n");
16977 return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
16978 }
16979 case RecurKind::Xor: {
16980 // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
16982 << ". (HorRdx)\n");
16983 if (Cnt % 2 == 0)
16984 return Constant::getNullValue(Ty: VectorizedValue->getType());
16985 return VectorizedValue;
16986 }
16987 case RecurKind::FAdd: {
16988 // res = fmul v, n
16989 Value *Scale = ConstantFP::get(Ty: VectorizedValue->getType(), V: Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
16991 << VectorizedValue << ". (HorRdx)\n");
16992 return Builder.CreateFMul(L: VectorizedValue, R: Scale);
16993 }
16994 case RecurKind::And:
16995 case RecurKind::Or:
16996 case RecurKind::SMax:
16997 case RecurKind::SMin:
16998 case RecurKind::UMax:
16999 case RecurKind::UMin:
17000 case RecurKind::FMax:
17001 case RecurKind::FMin:
17002 case RecurKind::FMaximum:
17003 case RecurKind::FMinimum:
17004 // res = vv
17005 return VectorizedValue;
17006 case RecurKind::Mul:
17007 case RecurKind::FMul:
17008 case RecurKind::FMulAdd:
17009 case RecurKind::IAnyOf:
17010 case RecurKind::FAnyOf:
17011 case RecurKind::None:
17012 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
17013 }
17014 return nullptr;
17015 }
17016
17017 /// Emits actual operation for the scalar identity values, found during
17018 /// horizontal reduction analysis.
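  /// For example (hypothetical values), for an add reduction whose root
  /// scalars are <a, b, c, d> with repeat counts {a: 2, b: 1, c: 1, d: 1}, the
  /// vectorized value is multiplied element-wise by <2, 1, 1, 1> before the
  /// final reduction is emitted.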
17019 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17020 ArrayRef<Value *> VL,
17021 const MapVector<Value *, unsigned> &SameValuesCounter,
17022 const DenseMap<Value *, Value *> &TrackedToOrig) {
17023 assert(IsSupportedHorRdxIdentityOp &&
17024 "The optimization of matched scalar identity horizontal reductions "
17025 "must be supported.");
17026 auto *VTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
17027 if (VTy->getElementType() != VL.front()->getType()) {
17028 VectorizedValue = Builder.CreateIntCast(
17029 V: VectorizedValue,
17030 DestTy: FixedVectorType::get(ElementType: VL.front()->getType(), NumElts: VTy->getNumElements()),
17031 isSigned: any_of(Range&: VL, P: [&](Value *R) {
17032 KnownBits Known = computeKnownBits(
17033 V: R, DL: cast<Instruction>(Val: ReductionOps.front().front())
17034 ->getModule()
17035 ->getDataLayout());
17036 return !Known.isNonNegative();
17037 }));
17038 }
17039 switch (RdxKind) {
17040 case RecurKind::Add: {
17041 // root = mul prev_root, <1, 1, n, 1>
17042 SmallVector<Constant *> Vals;
17043 for (Value *V : VL) {
17044 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.find(Val: V)->second);
17045 Vals.push_back(Elt: ConstantInt::get(Ty: V->getType(), V: Cnt, /*IsSigned=*/false));
17046 }
17047 auto *Scale = ConstantVector::get(V: Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
17049 << VectorizedValue << ". (HorRdx)\n");
17050 return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
17051 }
17052 case RecurKind::And:
17053 case RecurKind::Or:
17054 // No need for multiple or/and(s).
17055 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
17056 << ". (HorRdx)\n");
17057 return VectorizedValue;
17058 case RecurKind::SMax:
17059 case RecurKind::SMin:
17060 case RecurKind::UMax:
17061 case RecurKind::UMin:
17062 case RecurKind::FMax:
17063 case RecurKind::FMin:
17064 case RecurKind::FMaximum:
17065 case RecurKind::FMinimum:
17066 // No need for multiple min/max(s) of the same value.
17067 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
17068 << ". (HorRdx)\n");
17069 return VectorizedValue;
17070 case RecurKind::Xor: {
      // Replace values with an even number of repeats with 0, since
      // x xor x = 0.
      // E.g. root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf,
      // 6, 7>, if the 4th and 6th elements have an even number of repeats.
17075 SmallVector<int> Mask(
17076 cast<FixedVectorType>(Val: VectorizedValue->getType())->getNumElements(),
17077 PoisonMaskElem);
17078 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
17079 bool NeedShuffle = false;
17080 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
17081 Value *V = VL[I];
17082 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.find(Val: V)->second);
17083 if (Cnt % 2 == 0) {
17084 Mask[I] = VF;
17085 NeedShuffle = true;
17086 }
17087 }
17088 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
17089 : Mask) dbgs()
17090 << I << " ";
17091 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
17092 if (NeedShuffle)
17093 VectorizedValue = Builder.CreateShuffleVector(
17094 V1: VectorizedValue,
17095 V2: ConstantVector::getNullValue(Ty: VectorizedValue->getType()), Mask);
17096 return VectorizedValue;
17097 }
17098 case RecurKind::FAdd: {
17099 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
17100 SmallVector<Constant *> Vals;
17101 for (Value *V : VL) {
17102 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.find(Val: V)->second);
17103 Vals.push_back(Elt: ConstantFP::get(Ty: V->getType(), V: Cnt));
17104 }
17105 auto *Scale = ConstantVector::get(V: Vals);
17106 return Builder.CreateFMul(L: VectorizedValue, R: Scale);
17107 }
17108 case RecurKind::Mul:
17109 case RecurKind::FMul:
17110 case RecurKind::FMulAdd:
17111 case RecurKind::IAnyOf:
17112 case RecurKind::FAnyOf:
17113 case RecurKind::None:
17114 llvm_unreachable("Unexpected reduction kind for reused scalars.");
17115 }
17116 return nullptr;
17117 }
17118};
17119} // end anonymous namespace
17120
17121/// Gets recurrence kind from the specified value.
17122static RecurKind getRdxKind(Value *V) {
17123 return HorizontalReduction::getRdxKind(V);
17124}
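/// \returns the total number of scalar elements in the homogeneous aggregate
/// built by \p InsertInst, or std::nullopt if the aggregate is not
/// homogeneous. For example, an insertvalue chain building
/// {<2 x float>, <2 x float>} yields 4, while a struct that mixes element
/// types is rejected.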
17125static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
17126 if (auto *IE = dyn_cast<InsertElementInst>(Val: InsertInst))
17127 return cast<FixedVectorType>(Val: IE->getType())->getNumElements();
17128
17129 unsigned AggregateSize = 1;
17130 auto *IV = cast<InsertValueInst>(Val: InsertInst);
17131 Type *CurrentType = IV->getType();
17132 do {
17133 if (auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
17134 for (auto *Elt : ST->elements())
17135 if (Elt != ST->getElementType(N: 0)) // check homogeneity
17136 return std::nullopt;
17137 AggregateSize *= ST->getNumElements();
17138 CurrentType = ST->getElementType(N: 0);
17139 } else if (auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
17140 AggregateSize *= AT->getNumElements();
17141 CurrentType = AT->getElementType();
17142 } else if (auto *VT = dyn_cast<FixedVectorType>(Val: CurrentType)) {
17143 AggregateSize *= VT->getNumElements();
17144 return AggregateSize;
17145 } else if (CurrentType->isSingleValueType()) {
17146 return AggregateSize;
17147 } else {
17148 return std::nullopt;
17149 }
17150 } while (true);
17151}
17152
17153static void findBuildAggregate_rec(Instruction *LastInsertInst,
17154 TargetTransformInfo *TTI,
17155 SmallVectorImpl<Value *> &BuildVectorOpds,
17156 SmallVectorImpl<Value *> &InsertElts,
17157 unsigned OperandOffset) {
17158 do {
17159 Value *InsertedOperand = LastInsertInst->getOperand(i: 1);
17160 std::optional<unsigned> OperandIndex =
17161 getInsertIndex(InsertInst: LastInsertInst, Offset: OperandOffset);
17162 if (!OperandIndex)
17163 return;
17164 if (isa<InsertElementInst, InsertValueInst>(Val: InsertedOperand)) {
17165 findBuildAggregate_rec(LastInsertInst: cast<Instruction>(Val: InsertedOperand), TTI,
17166 BuildVectorOpds, InsertElts, OperandOffset: *OperandIndex);
17167
17168 } else {
17169 BuildVectorOpds[*OperandIndex] = InsertedOperand;
17170 InsertElts[*OperandIndex] = LastInsertInst;
17171 }
17172 LastInsertInst = dyn_cast<Instruction>(Val: LastInsertInst->getOperand(i: 0));
17173 } while (LastInsertInst != nullptr &&
17174 isa<InsertValueInst, InsertElementInst>(Val: LastInsertInst) &&
17175 LastInsertInst->hasOneUse());
17176}
17177
17178/// Recognize construction of vectors like
17179/// %ra = insertelement <4 x float> poison, float %s0, i32 0
17180/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
17181/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
17182/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
17183/// starting from the last insertelement or insertvalue instruction.
17184///
17185/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
17186/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
17187/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
17188///
17189/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
17190///
17191/// \return true if it matches.
17192static bool findBuildAggregate(Instruction *LastInsertInst,
17193 TargetTransformInfo *TTI,
17194 SmallVectorImpl<Value *> &BuildVectorOpds,
17195 SmallVectorImpl<Value *> &InsertElts) {
17196
17197 assert((isa<InsertElementInst>(LastInsertInst) ||
17198 isa<InsertValueInst>(LastInsertInst)) &&
17199 "Expected insertelement or insertvalue instruction!");
17200
17201 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
17202 "Expected empty result vectors!");
17203
17204 std::optional<unsigned> AggregateSize = getAggregateSize(InsertInst: LastInsertInst);
17205 if (!AggregateSize)
17206 return false;
17207 BuildVectorOpds.resize(N: *AggregateSize);
17208 InsertElts.resize(N: *AggregateSize);
17209
17210 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, OperandOffset: 0);
17211 llvm::erase(C&: BuildVectorOpds, V: nullptr);
17212 llvm::erase(C&: InsertElts, V: nullptr);
17213 if (BuildVectorOpds.size() >= 2)
17214 return true;
17215
17216 return false;
17217}
17218
17219/// Try and get a reduction instruction from a phi node.
17220///
17221/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
17222/// if they come from either \p ParentBB or a containing loop latch.
17223///
17224/// \returns A candidate reduction value if possible, or \code nullptr \endcode
17225/// if not possible.
17226static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
17227 BasicBlock *ParentBB, LoopInfo *LI) {
17228 // There are situations where the reduction value is not dominated by the
17229 // reduction phi. Vectorizing such cases has been reported to cause
17230 // miscompiles. See PR25787.
17231 auto DominatedReduxValue = [&](Value *R) {
17232 return isa<Instruction>(Val: R) &&
17233 DT->dominates(A: P->getParent(), B: cast<Instruction>(Val: R)->getParent());
17234 };
17235
17236 Instruction *Rdx = nullptr;
17237
17238 // Return the incoming value if it comes from the same BB as the phi node.
17239 if (P->getIncomingBlock(i: 0) == ParentBB) {
17240 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
17241 } else if (P->getIncomingBlock(i: 1) == ParentBB) {
17242 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
17243 }
17244
17245 if (Rdx && DominatedReduxValue(Rdx))
17246 return Rdx;
17247
17248 // Otherwise, check whether we have a loop latch to look at.
17249 Loop *BBL = LI->getLoopFor(BB: ParentBB);
17250 if (!BBL)
17251 return nullptr;
17252 BasicBlock *BBLatch = BBL->getLoopLatch();
17253 if (!BBLatch)
17254 return nullptr;
17255
17256 // There is a loop latch, return the incoming value if it comes from
17257 // that. This reduction pattern occasionally turns up.
17258 if (P->getIncomingBlock(i: 0) == BBLatch) {
17259 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
17260 } else if (P->getIncomingBlock(i: 1) == BBLatch) {
17261 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
17262 }
17263
17264 if (Rdx && DominatedReduxValue(Rdx))
17265 return Rdx;
17266
17267 return nullptr;
17268}
17269
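/// Matches a reduction binary operation: either a plain binary operator or
/// one of the supported min/max intrinsics, extracting the two operands into
/// \p V0 and \p V1. E.g. '%m = call i32 @llvm.smax.i32(i32 %a, i32 %b)'
/// yields V0 == %a and V1 == %b.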
17270static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
17271 if (match(V: I, P: m_BinOp(L: m_Value(V&: V0), R: m_Value(V&: V1))))
17272 return true;
17273 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V&: V0), m_Value(V&: V1))))
17274 return true;
17275 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V&: V0), m_Value(V&: V1))))
17276 return true;
17277 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V&: V0), m_Value(V&: V1))))
17278 return true;
17279 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
17280 return true;
17281 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
17282 return true;
17283 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
17284 return true;
17285 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
17286 return true;
17287 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
17288 return true;
17289 return false;
17290}
17291
17292/// We could have an initial reduction that is not an add.
17293/// r *= v1 + v2 + v3 + v4
17294/// In such a case start looking for a tree rooted in the first '+'.
/// \returns the new root if found, which may be nullptr if not an instruction.
17296static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
17297 Instruction *Root) {
17298 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
17299 isa<IntrinsicInst>(Root)) &&
17300 "Expected binop, select, or intrinsic for reduction matching");
17301 Value *LHS =
17302 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root));
17303 Value *RHS =
17304 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root) + 1);
17305 if (LHS == Phi)
17306 return dyn_cast<Instruction>(Val: RHS);
17307 if (RHS == Phi)
17308 return dyn_cast<Instruction>(Val: LHS);
17309 return nullptr;
17310}
17311
/// \returns the first operand of \p I that does not match \p Phi. If the
/// operand is not an instruction, returns nullptr.
17314static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
17315 Value *Op0 = nullptr;
17316 Value *Op1 = nullptr;
17317 if (!matchRdxBop(I, V0&: Op0, V1&: Op1))
17318 return nullptr;
17319 return dyn_cast<Instruction>(Val: Op0 == Phi ? Op1 : Op0);
17320}
17321
/// \returns true if \p I is a candidate instruction for reduction vectorization.
17323static bool isReductionCandidate(Instruction *I) {
17324 bool IsSelect = match(V: I, P: m_Select(C: m_Value(), L: m_Value(), R: m_Value()));
17325 Value *B0 = nullptr, *B1 = nullptr;
17326 bool IsBinop = matchRdxBop(I, V0&: B0, V1&: B1);
17327 return IsBinop || IsSelect;
17328}
17329
17330bool SLPVectorizerPass::vectorizeHorReduction(
17331 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
17332 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
17333 if (!ShouldVectorizeHor)
17334 return false;
17335 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Val: Root);
17336
17337 if (Root->getParent() != BB || isa<PHINode>(Val: Root))
17338 return false;
17339
17340 // If we can find a secondary reduction root, use that instead.
17341 auto SelectRoot = [&]() {
17342 if (TryOperandsAsNewSeeds && isReductionCandidate(I: Root) &&
17343 HorizontalReduction::getRdxKind(V: Root) != RecurKind::None)
17344 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(Phi: P, Root))
17345 return NewRoot;
17346 return Root;
17347 };
17348
17349 // Start analysis starting from Root instruction. If horizontal reduction is
17350 // found, try to vectorize it. If it is not a horizontal reduction or
17351 // vectorization is not possible or not effective, and currently analyzed
17352 // instruction is a binary operation, try to vectorize the operands, using
17353 // pre-order DFS traversal order. If the operands were not vectorized, repeat
17354 // the same procedure considering each operand as a possible root of the
17355 // horizontal reduction.
  // Interrupt the process if the Root instruction itself was vectorized or all
  // sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
  // If a horizontal reduction was not matched or vectorized, we collect
  // instructions for possible later vectorization attempts.
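  // E.g. (hypothetical), for a root 'add (add %a, %b), (add %c, %d)' the whole
  // a+b+c+d reduction is tried first; if it is not vectorized, the root may be
  // recorded as a postponed seed and the inner adds are pushed onto the work
  // queue below to be retried as reduction roots.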
17360 std::queue<std::pair<Instruction *, unsigned>> Stack;
17361 Stack.emplace(args: SelectRoot(), args: 0);
17362 SmallPtrSet<Value *, 8> VisitedInstrs;
17363 bool Res = false;
17364 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
17365 if (R.isAnalyzedReductionRoot(I: Inst))
17366 return nullptr;
17367 if (!isReductionCandidate(I: Inst))
17368 return nullptr;
17369 HorizontalReduction HorRdx;
17370 if (!HorRdx.matchAssociativeReduction(R, Root: Inst, SE&: *SE, DL: *DL, TLI: *TLI))
17371 return nullptr;
17372 return HorRdx.tryToReduce(V&: R, DL: *DL, TTI, TLI: *TLI);
17373 };
17374 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
17375 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
17376 FutureSeed = getNonPhiOperand(I: Root, Phi: P);
17377 if (!FutureSeed)
17378 return false;
17379 }
17380 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
17381 // analysis is done separately.
17382 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Val: FutureSeed))
17383 PostponedInsts.push_back(Elt: FutureSeed);
17384 return true;
17385 };
17386
17387 while (!Stack.empty()) {
17388 Instruction *Inst;
17389 unsigned Level;
17390 std::tie(args&: Inst, args&: Level) = Stack.front();
17391 Stack.pop();
    // Do not try to analyze an instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while the stack was populated before that happened.
17395 if (R.isDeleted(I: Inst))
17396 continue;
17397 if (Value *VectorizedV = TryToReduce(Inst)) {
17398 Res = true;
17399 if (auto *I = dyn_cast<Instruction>(Val: VectorizedV)) {
17400 // Try to find another reduction.
17401 Stack.emplace(args&: I, args&: Level);
17402 continue;
17403 }
17404 } else {
17405 // We could not vectorize `Inst` so try to use it as a future seed.
17406 if (!TryAppendToPostponedInsts(Inst)) {
17407 assert(Stack.empty() && "Expected empty stack");
17408 break;
17409 }
17410 }
17411
17412 // Try to vectorize operands.
17413 // Continue analysis for the instruction from the same basic block only to
17414 // save compile time.
17415 if (++Level < RecursionMaxDepth)
17416 for (auto *Op : Inst->operand_values())
17417 if (VisitedInstrs.insert(Ptr: Op).second)
17418 if (auto *I = dyn_cast<Instruction>(Val: Op))
17419 // Do not try to vectorize CmpInst operands, this is done
17420 // separately.
17421 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(Val: I) &&
17422 !R.isDeleted(I) && I->getParent() == BB)
17423 Stack.emplace(args&: I, args&: Level);
17424 }
17425 return Res;
17426}
17427
17428bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
17429 BasicBlock *BB, BoUpSLP &R,
17430 TargetTransformInfo *TTI) {
17431 SmallVector<WeakTrackingVH> PostponedInsts;
17432 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
17433 Res |= tryToVectorize(Insts: PostponedInsts, R);
17434 return Res;
17435}
17436
17437bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
17438 BoUpSLP &R) {
17439 bool Res = false;
17440 for (Value *V : Insts)
17441 if (auto *Inst = dyn_cast<Instruction>(Val: V); Inst && !R.isDeleted(I: Inst))
17442 Res |= tryToVectorize(I: Inst, R);
17443 return Res;
17444}
17445
17446bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
17447 BasicBlock *BB, BoUpSLP &R) {
17448 if (!R.canMapToVector(T: IVI->getType()))
17449 return false;
17450
17451 SmallVector<Value *, 16> BuildVectorOpds;
17452 SmallVector<Value *, 16> BuildVectorInsts;
17453 if (!findBuildAggregate(LastInsertInst: IVI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts))
17454 return false;
17455
17456 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // An aggregate value is unlikely to be processed in a vector register.
17458 return tryToVectorizeList(VL: BuildVectorOpds, R);
17459}
17460
17461bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
17462 BasicBlock *BB, BoUpSLP &R) {
17463 SmallVector<Value *, 16> BuildVectorInsts;
17464 SmallVector<Value *, 16> BuildVectorOpds;
17465 SmallVector<int> Mask;
17466 if (!findBuildAggregate(LastInsertInst: IEI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts) ||
17467 (llvm::all_of(Range&: BuildVectorOpds, P: IsaPred<ExtractElementInst, UndefValue>) &&
17468 isFixedVectorShuffle(VL: BuildVectorOpds, Mask)))
17469 return false;
17470
17471 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
17472 return tryToVectorizeList(VL: BuildVectorInsts, R);
17473}
17474
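/// Sorts the instructions in \p Incoming with \p Comparator, groups adjacent
/// compatible instructions (as decided by \p AreCompatible), and tries to
/// vectorize each group via \p TryToVectorizeHelper; small leftover groups
/// are collected and retried together, possibly with smaller vector factors.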
17475template <typename T>
17476static bool tryToVectorizeSequence(
17477 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
17478 function_ref<bool(T *, T *)> AreCompatible,
17479 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
17480 bool MaxVFOnly, BoUpSLP &R) {
17481 bool Changed = false;
17482 // Sort by type, parent, operands.
17483 stable_sort(Incoming, Comparator);
17484
  // Try to vectorize elements based on their type.
17486 SmallVector<T *> Candidates;
17487 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
17488 // Look for the next elements with the same type, parent and operand
17489 // kinds.
17490 auto *SameTypeIt = IncIt;
17491 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
17492 ++SameTypeIt;
17493
17494 // Try to vectorize them.
17495 unsigned NumElts = (SameTypeIt - IncIt);
17496 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
17497 << NumElts << ")\n");
    // The vectorization is a 3-stage attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes at the
    // maximal register size first.
    // 2. Try to vectorize the remaining instructions with the same type, if
    // possible. This may produce better results than vectorizing only
    // instructions with the same/alternate opcodes.
    // 3. As a final attempt, try to vectorize all instructions with the
    // same/alternate ops only; this may result in some extra final
    // vectorization.
17507 if (NumElts > 1 &&
17508 TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
      // Success; start over because instructions might have been changed.
17510 Changed = true;
17511 } else {
      /// \returns the minimum number of elements that we will attempt to
      /// vectorize.
17514 auto GetMinNumElements = [&R](Value *V) {
17515 unsigned EltSize = R.getVectorElementSize(V);
17516 return std::max(a: 2U, b: R.getMaxVecRegSize() / EltSize);
17517 };
17518 if (NumElts < GetMinNumElements(*IncIt) &&
17519 (Candidates.empty() ||
17520 Candidates.front()->getType() == (*IncIt)->getType())) {
17521 Candidates.append(IncIt, std::next(IncIt, NumElts));
17522 }
17523 }
17524 // Final attempt to vectorize instructions with the same types.
17525 if (Candidates.size() > 1 &&
17526 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
17527 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success; start over because instructions might have been changed.
17529 Changed = true;
17530 } else if (MaxVFOnly) {
17531 // Try to vectorize using small vectors.
17532 for (auto *It = Candidates.begin(), *End = Candidates.end();
17533 It != End;) {
17534 auto *SameTypeIt = It;
17535 while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
17536 ++SameTypeIt;
17537 unsigned NumElts = (SameTypeIt - It);
17538 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
17539 /*MaxVFOnly=*/false))
17540 Changed = true;
17541 It = SameTypeIt;
17542 }
17543 }
17544 Candidates.clear();
17545 }
17546
17547 // Start over at the next instruction of a different type (or the end).
17548 IncIt = SameTypeIt;
17549 }
17550 return Changed;
17551}
17552
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the two cmps have the same/swapped predicates and
/// compatible corresponding operands. If IsCompatibility is false, the
/// function implements a strict weak ordering relation between the two cmp
/// instructions, returning true if the first instruction is "less" than the
/// second, i.e. its predicate is less than the predicate of the second or the
/// operand IDs are less than the operand IDs of the second cmp instruction.
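/// For example (hypothetical), 'icmp slt i32 %a, %b' and 'icmp sgt i32 %b, %a'
/// share the same base predicate after swapping, so compareCmp<true> treats
/// them as compatible.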
17560template <bool IsCompatibility>
17561static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
17562 const DominatorTree &DT) {
17563 assert(isValidElementType(V->getType()) &&
17564 isValidElementType(V2->getType()) &&
17565 "Expected valid element types only.");
17566 if (V == V2)
17567 return IsCompatibility;
17568 auto *CI1 = cast<CmpInst>(Val: V);
17569 auto *CI2 = cast<CmpInst>(Val: V2);
17570 if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() <
17571 CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
17572 return !IsCompatibility;
17573 if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() >
17574 CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
17575 return false;
17576 CmpInst::Predicate Pred1 = CI1->getPredicate();
17577 CmpInst::Predicate Pred2 = CI2->getPredicate();
17578 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(pred: Pred1);
17579 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(pred: Pred2);
17580 CmpInst::Predicate BasePred1 = std::min(a: Pred1, b: SwapPred1);
17581 CmpInst::Predicate BasePred2 = std::min(a: Pred2, b: SwapPred2);
17582 if (BasePred1 < BasePred2)
17583 return !IsCompatibility;
17584 if (BasePred1 > BasePred2)
17585 return false;
17586 // Compare operands.
17587 bool CI1Preds = Pred1 == BasePred1;
17588 bool CI2Preds = Pred2 == BasePred1;
17589 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
17590 auto *Op1 = CI1->getOperand(i_nocapture: CI1Preds ? I : E - I - 1);
17591 auto *Op2 = CI2->getOperand(i_nocapture: CI2Preds ? I : E - I - 1);
17592 if (Op1 == Op2)
17593 continue;
17594 if (Op1->getValueID() < Op2->getValueID())
17595 return !IsCompatibility;
17596 if (Op1->getValueID() > Op2->getValueID())
17597 return false;
17598 if (auto *I1 = dyn_cast<Instruction>(Val: Op1))
17599 if (auto *I2 = dyn_cast<Instruction>(Val: Op2)) {
17600 if (IsCompatibility) {
17601 if (I1->getParent() != I2->getParent())
17602 return false;
17603 } else {
17604 // Try to compare nodes with same parent.
17605 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(BB: I1->getParent());
17606 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(BB: I2->getParent());
17607 if (!NodeI1)
17608 return NodeI2 != nullptr;
17609 if (!NodeI2)
17610 return false;
17611 assert((NodeI1 == NodeI2) ==
17612 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
17613 "Different nodes should have different DFS numbers");
17614 if (NodeI1 != NodeI2)
17615 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
17616 }
17617 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI);
17618 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
17619 continue;
17620 if (IsCompatibility)
17621 return false;
17622 if (I1->getOpcode() != I2->getOpcode())
17623 return I1->getOpcode() < I2->getOpcode();
17624 }
17625 }
17626 return IsCompatibility;
17627}
17628
17629template <typename ItT>
17630bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
17631 BasicBlock *BB, BoUpSLP &R) {
17632 bool Changed = false;
17633 // Try to find reductions first.
17634 for (CmpInst *I : CmpInsts) {
17635 if (R.isDeleted(I))
17636 continue;
17637 for (Value *Op : I->operands())
17638 if (auto *RootOp = dyn_cast<Instruction>(Val: Op))
17639 Changed |= vectorizeRootInstruction(P: nullptr, Root: RootOp, BB, R, TTI);
17640 }
17641 // Try to vectorize operands as vector bundles.
17642 for (CmpInst *I : CmpInsts) {
17643 if (R.isDeleted(I))
17644 continue;
17645 Changed |= tryToVectorize(I, R);
17646 }
17647 // Try to vectorize list of compares.
17648 // Sort by type, compare predicate, etc.
17649 auto CompareSorter = [&](Value *V, Value *V2) {
17650 if (V == V2)
17651 return false;
17652 return compareCmp<false>(V, V2, TLI&: *TLI, DT: *DT);
17653 };
17654
17655 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
17656 if (V1 == V2)
17657 return true;
17658 return compareCmp<true>(V: V1, V2, TLI&: *TLI, DT: *DT);
17659 };
17660
17661 SmallVector<Value *> Vals;
17662 for (Instruction *V : CmpInsts)
17663 if (!R.isDeleted(I: V) && isValidElementType(Ty: V->getType()))
17664 Vals.push_back(Elt: V);
17665 if (Vals.size() <= 1)
17666 return Changed;
17667 Changed |= tryToVectorizeSequence<Value>(
17668 Vals, CompareSorter, AreCompatibleCompares,
17669 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
17670 // Exclude possible reductions from other blocks.
17671 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
17672 return any_of(V->users(), [V](User *U) {
17673 auto *Select = dyn_cast<SelectInst>(Val: U);
17674 return Select &&
17675 Select->getParent() != cast<Instruction>(Val: V)->getParent();
17676 });
17677 });
17678 if (ArePossiblyReducedInOtherBlock)
17679 return false;
17680 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
17681 },
17682 /*MaxVFOnly=*/true, R);
17683 return Changed;
17684}
17685
17686bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
17687 BasicBlock *BB, BoUpSLP &R) {
17688 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
17689 "This function only accepts Insert instructions");
17690 bool OpsChanged = false;
17691 SmallVector<WeakTrackingVH> PostponedInsts;
17692 // pass1 - try to vectorize reductions only
17693 for (auto *I : reverse(C&: Instructions)) {
17694 if (R.isDeleted(I))
17695 continue;
17696 OpsChanged |= vectorizeHorReduction(P: nullptr, Root: I, BB, R, TTI, PostponedInsts);
17697 }
17698 // pass2 - try to match and vectorize a buildvector sequence.
17699 for (auto *I : reverse(C&: Instructions)) {
17700 if (R.isDeleted(I) || isa<CmpInst>(Val: I))
17701 continue;
17702 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
17703 OpsChanged |= vectorizeInsertValueInst(IVI: LastInsertValue, BB, R);
17704 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
17705 OpsChanged |= vectorizeInsertElementInst(IEI: LastInsertElem, BB, R);
17706 }
17707 }
17708 // Now try to vectorize postponed instructions.
17709 OpsChanged |= tryToVectorize(Insts: PostponedInsts, R);
17710
17711 Instructions.clear();
17712 return OpsChanged;
17713}
17714
17715bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
17716 bool Changed = false;
17717 SmallVector<Value *, 4> Incoming;
17718 SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. This helps to better identify the chains that can be profitably
  // vectorized.
17722 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
17723 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
17724 assert(isValidElementType(V1->getType()) &&
17725 isValidElementType(V2->getType()) &&
17726 "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers; we don't care about other types.
17729 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
17730 return true;
17731 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
17732 return false;
17733 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
17734 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
17735 if (Opcodes1.size() < Opcodes2.size())
17736 return true;
17737 if (Opcodes1.size() > Opcodes2.size())
17738 return false;
17739 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
17740 {
17741 // Instructions come first.
17742 auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]);
17743 auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I]);
17744 if (I1 && I2) {
17745 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent());
17746 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent());
17747 if (!NodeI1)
17748 return NodeI2 != nullptr;
17749 if (!NodeI2)
17750 return false;
17751 assert((NodeI1 == NodeI2) ==
17752 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
17753 "Different nodes should have different DFS numbers");
17754 if (NodeI1 != NodeI2)
17755 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
17756 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
17757 if (S.getOpcode() && !S.isAltShuffle())
17758 continue;
17759 return I1->getOpcode() < I2->getOpcode();
17760 }
17761 if (I1)
17762 return true;
17763 if (I2)
17764 return false;
17765 }
17766 {
17767 // Non-undef constants come next.
17768 bool C1 = isa<Constant>(Val: Opcodes1[I]) && !isa<UndefValue>(Val: Opcodes1[I]);
17769 bool C2 = isa<Constant>(Val: Opcodes2[I]) && !isa<UndefValue>(Val: Opcodes2[I]);
17770 if (C1 && C2)
17771 continue;
17772 if (C1)
17773 return true;
17774 if (C2)
17775 return false;
17776 }
17777 bool U1 = isa<UndefValue>(Val: Opcodes1[I]);
17778 bool U2 = isa<UndefValue>(Val: Opcodes2[I]);
17779 {
17780 // Non-constant non-instructions come next.
17781 if (!U1 && !U2) {
17782 auto ValID1 = Opcodes1[I]->getValueID();
17783 auto ValID2 = Opcodes2[I]->getValueID();
17784 if (ValID1 == ValID2)
17785 continue;
17786 if (ValID1 < ValID2)
17787 return true;
17788 if (ValID1 > ValID2)
17789 return false;
17790 }
17791 if (!U1)
17792 return true;
17793 if (!U2)
17794 return false;
17795 }
17796 // Undefs come last.
17797 assert(U1 && U2 && "The only thing left should be undef & undef.");
17798 continue;
17799 }
17800 return false;
17801 };
  auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
          if (I1->getParent() != I2->getParent())
            return false;
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
          if (S.getOpcode())
            continue;
          return false;
        }
      if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      PHINode *P = dyn_cast<PHINode>(&I);
      if (!P)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
          isValidElementType(P->getType()))
        Incoming.push_back(P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
        if (!Visited.insert(PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>((V))) {
            Nodes.push_back(PHI1);
            continue;
          }
          Opcodes.emplace_back(V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, PHICompare, AreCompatiblePHIs,
        [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    VisitedInstrs.insert(Incoming.begin(), Incoming.end());
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is
  // true, also vectorizes `PostProcessCmps`.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(I))
      return PostProcessCmps.contains(Cmp);
    return isa<InsertElementInst, InsertValueInst>(I) &&
           PostProcessInserts.contains(I);
  };
  // Returns true if `I` is an instruction whose result is unused, such as a
  // terminator, a store, or a call with an ignored return value. Only CallInst
  // and InvokeInst may have a non-void type here; any other instruction must
  // be void-typed to qualify.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable vector type. The number of elements is
    // unknown at compile time for such types.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    if (isa<DbgInfoIntrinsic>(It))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
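        // For example, a chain of fast-math fadds feeding the PHI across the
        // loop back-edge:
        //   %sum = phi float [ 0.0, %entry ], [ %sum.3, %loop ]
        //   %sum.0 = fadd fast float %sum, %a0
        //   %sum.1 = fadd fast float %sum.0, %a1
        //   %sum.2 = fadd fast float %sum.1, %a2
        //   %sum.3 = fadd fast float %sum.2, %a3
        // is the kind of horizontal reduction matched here, rooted at %sum.3.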
        Instruction *Root = getReductionInstr(DT, P, BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        // TODO: Collect the skipped incoming values and try to vectorize them
        // after processing BB.
        if (BB == P->getIncomingBlock(I) ||
            !DT->isReachableFromEntry(P->getIncomingBlock(I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
            PI && !IsInPostProcessInstrs(PI))
          Changed |= vectorizeRootInstruction(nullptr, PI,
                                              P->getIncomingBlock(I), R, TTI);
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize the chain in the store, if this is the only store
        // to the address in the block.
        // TODO: This is just a temporary solution to save compile time. Need
        // to investigate if we can safely turn on slp-vectorize-hor-store
        // instead to allow lookup for reduction chains in all non-vectorized
        // stores (need to check side effects and compile time).
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
        }
      }
      // Start vectorization of the post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    if (isa<InsertElementInst, InsertValueInst>(It))
      PostProcessInserts.insert(&*It);
    else if (isa<CmpInst>(It))
      PostProcessCmps.insert(cast<CmpInst>(&*It));
  }

  return Changed;
}

bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done. We
    // are trying to vectorize the index computations, so the maximum number of
    // elements is based on the size of the index expression, rather than the
    // size of the GEP itself (the target's pointer size).
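    // For example, with a 128-bit vector register and i64 index expressions,
    // MaxElts = 128 / 64 = 2, so the GEP list is processed two at a time.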
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them (they are then marked as deleted), or their
      // index may have been folded to a constant. Remove such getelementptrs
      // from the set of candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
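      // For example, for
      //   %g0 = getelementptr inbounds i32, ptr %p, i64 %i
      //   %g1 = getelementptr inbounds i32, ptr %p, i64 %i.next
      // where %i.next = add i64 %i, 1, the SCEV difference of the two
      // addresses is the constant 4, so both getelementptrs are removed from
      // the candidate set.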
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}

bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointer and value operand. Value operands must be
  // compatible (have the same opcode and the same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
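  // For example, two stores whose stored values are 'add i32' instructions
  // from the same block compare as compatible and end up grouped next to each
  // other after sorting.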
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    // UndefValues are compatible with all other values.
    if (isa<UndefValue>(V->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return false;
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        if (S.getOpcode())
          return false;
        return I1->getOpcode() < I2->getOpcode();
      }
    if (isa<Constant>(V->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        InstructionsState S = getSameOpcode({I1, I2}, *TLI);
        return S.getOpcode() > 0;
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse the stores to perform a bottom-to-top analysis. This matters
    // when the same address is stored to several times; in that case we must
    // follow the order of the stores (reversed, to respect the memory
    // dependencies).
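    // For example, if a block contains
    //   store i32 %a, ptr %p
    //   store i32 %b, ptr %p
    // the reversed list visits the store of %b first.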
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [this, &R](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}