1 | //===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "AArch64TargetTransformInfo.h" |
10 | #include "AArch64ExpandImm.h" |
11 | #include "AArch64PerfectShuffle.h" |
12 | #include "MCTargetDesc/AArch64AddressingModes.h" |
13 | #include "llvm/Analysis/IVDescriptors.h" |
14 | #include "llvm/Analysis/LoopInfo.h" |
15 | #include "llvm/Analysis/TargetTransformInfo.h" |
16 | #include "llvm/CodeGen/BasicTTIImpl.h" |
17 | #include "llvm/CodeGen/CostTable.h" |
18 | #include "llvm/CodeGen/TargetLowering.h" |
19 | #include "llvm/IR/IntrinsicInst.h" |
20 | #include "llvm/IR/Intrinsics.h" |
21 | #include "llvm/IR/IntrinsicsAArch64.h" |
22 | #include "llvm/IR/PatternMatch.h" |
23 | #include "llvm/Support/Debug.h" |
24 | #include "llvm/Transforms/InstCombine/InstCombiner.h" |
25 | #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h" |
26 | #include <algorithm> |
27 | #include <optional> |
28 | using namespace llvm; |
29 | using namespace llvm::PatternMatch; |
30 | |
31 | #define DEBUG_TYPE "aarch64tti" |
32 | |
33 | static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix" , |
34 | cl::init(Val: true), cl::Hidden); |
35 | |
36 | static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead" , cl::init(Val: 10), |
37 | cl::Hidden); |
38 | |
39 | static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead" , |
40 | cl::init(Val: 10), cl::Hidden); |
41 | |
42 | static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold" , |
43 | cl::init(Val: 15), cl::Hidden); |
44 | |
45 | static cl::opt<unsigned> |
46 | NeonNonConstStrideOverhead("neon-nonconst-stride-overhead" , cl::init(Val: 10), |
47 | cl::Hidden); |
48 | |
49 | static cl::opt<unsigned> CallPenaltyChangeSM( |
50 | "call-penalty-sm-change" , cl::init(Val: 5), cl::Hidden, |
51 | cl::desc( |
52 | "Penalty of calling a function that requires a change to PSTATE.SM" )); |
53 | |
54 | static cl::opt<unsigned> InlineCallPenaltyChangeSM( |
55 | "inline-call-penalty-sm-change" , cl::init(Val: 10), cl::Hidden, |
56 | cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM" )); |
57 | |
58 | static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select" , |
59 | cl::init(Val: true), cl::Hidden); |
60 | |
61 | namespace { |
62 | class TailFoldingOption { |
  // These bitfields will only ever be set to something non-zero in operator=,
  // when setting the -sve-tail-folding option. This option should always be of
  // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where
  // InitialBits is one of (disabled|all|simple). EnableBits represents
  // additional flags we're enabling, and DisableBits those we're disabling.
  // The default flag is tracked in the variable NeedsDefault, since at the
  // time of setting the option we may not know what the default value for
  // the CPU is.
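  // For example, "-sve-tail-folding=all+noreverse" sets InitialBits to
  // TailFoldingOpts::All and records TailFoldingOpts::Reverse in DisableBits.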
71 | TailFoldingOpts InitialBits = TailFoldingOpts::Disabled; |
72 | TailFoldingOpts EnableBits = TailFoldingOpts::Disabled; |
73 | TailFoldingOpts DisableBits = TailFoldingOpts::Disabled; |
74 | |
75 | // This value needs to be initialised to true in case the user does not |
76 | // explicitly set the -sve-tail-folding option. |
77 | bool NeedsDefault = true; |
78 | |
79 | void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; } |
80 | |
81 | void setNeedsDefault(bool V) { NeedsDefault = V; } |
82 | |
83 | void setEnableBit(TailFoldingOpts Bit) { |
84 | EnableBits |= Bit; |
85 | DisableBits &= ~Bit; |
86 | } |
87 | |
88 | void setDisableBit(TailFoldingOpts Bit) { |
89 | EnableBits &= ~Bit; |
90 | DisableBits |= Bit; |
91 | } |
92 | |
93 | TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const { |
94 | TailFoldingOpts Bits = TailFoldingOpts::Disabled; |
95 | |
96 | assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) && |
97 | "Initial bits should only include one of " |
98 | "(disabled|all|simple|default)" ); |
99 | Bits = NeedsDefault ? DefaultBits : InitialBits; |
100 | Bits |= EnableBits; |
101 | Bits &= ~DisableBits; |
102 | |
103 | return Bits; |
104 | } |
105 | |
106 | void reportError(std::string Opt) { |
107 | errs() << "invalid argument '" << Opt |
108 | << "' to -sve-tail-folding=; the option should be of the form\n" |
109 | " (disabled|all|default|simple)[+(reductions|recurrences" |
110 | "|reverse|noreductions|norecurrences|noreverse)]\n" ; |
111 | report_fatal_error(reason: "Unrecognised tail-folding option" ); |
112 | } |
113 | |
114 | public: |
115 | |
116 | void operator=(const std::string &Val) { |
117 | // If the user explicitly sets -sve-tail-folding= then treat as an error. |
118 | if (Val.empty()) { |
119 | reportError(Opt: "" ); |
120 | return; |
121 | } |
122 | |
123 | // Since the user is explicitly setting the option we don't automatically |
124 | // need the default unless they require it. |
125 | setNeedsDefault(false); |
126 | |
127 | SmallVector<StringRef, 4> TailFoldTypes; |
128 | StringRef(Val).split(A&: TailFoldTypes, Separator: '+', MaxSplit: -1, KeepEmpty: false); |
129 | |
130 | unsigned StartIdx = 1; |
131 | if (TailFoldTypes[0] == "disabled" ) |
132 | setInitialBits(TailFoldingOpts::Disabled); |
133 | else if (TailFoldTypes[0] == "all" ) |
134 | setInitialBits(TailFoldingOpts::All); |
135 | else if (TailFoldTypes[0] == "default" ) |
136 | setNeedsDefault(true); |
137 | else if (TailFoldTypes[0] == "simple" ) |
138 | setInitialBits(TailFoldingOpts::Simple); |
139 | else { |
140 | StartIdx = 0; |
141 | setInitialBits(TailFoldingOpts::Disabled); |
142 | } |
143 | |
144 | for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) { |
145 | if (TailFoldTypes[I] == "reductions" ) |
146 | setEnableBit(TailFoldingOpts::Reductions); |
147 | else if (TailFoldTypes[I] == "recurrences" ) |
148 | setEnableBit(TailFoldingOpts::Recurrences); |
149 | else if (TailFoldTypes[I] == "reverse" ) |
150 | setEnableBit(TailFoldingOpts::Reverse); |
151 | else if (TailFoldTypes[I] == "noreductions" ) |
152 | setDisableBit(TailFoldingOpts::Reductions); |
153 | else if (TailFoldTypes[I] == "norecurrences" ) |
154 | setDisableBit(TailFoldingOpts::Recurrences); |
155 | else if (TailFoldTypes[I] == "noreverse" ) |
156 | setDisableBit(TailFoldingOpts::Reverse); |
157 | else |
158 | reportError(Opt: Val); |
159 | } |
160 | } |
161 | |
162 | bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const { |
163 | return (getBits(DefaultBits) & Required) == Required; |
164 | } |
165 | }; |
166 | } // namespace |
167 | |
168 | TailFoldingOption TailFoldingOptionLoc; |
169 | |
170 | cl::opt<TailFoldingOption, true, cl::parser<std::string>> SVETailFolding( |
171 | "sve-tail-folding" , |
172 | cl::desc( |
173 | "Control the use of vectorisation using tail-folding for SVE where the" |
174 | " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" |
175 | "\ndisabled (Initial) No loop types will vectorize using " |
176 | "tail-folding" |
177 | "\ndefault (Initial) Uses the default tail-folding settings for " |
178 | "the target CPU" |
179 | "\nall (Initial) All legal loop types will vectorize using " |
180 | "tail-folding" |
181 | "\nsimple (Initial) Use tail-folding for simple loops (not " |
182 | "reductions or recurrences)" |
183 | "\nreductions Use tail-folding for loops containing reductions" |
184 | "\nnoreductions Inverse of above" |
185 | "\nrecurrences Use tail-folding for loops containing fixed order " |
186 | "recurrences" |
187 | "\nnorecurrences Inverse of above" |
188 | "\nreverse Use tail-folding for loops requiring reversed " |
189 | "predicates" |
190 | "\nnoreverse Inverse of above" ), |
191 | cl::location(L&: TailFoldingOptionLoc)); |
192 | |
193 | // Experimental option that will only be fully functional when the |
194 | // code-generator is changed to use SVE instead of NEON for all fixed-width |
195 | // operations. |
196 | static cl::opt<bool> EnableFixedwidthAutovecInStreamingMode( |
197 | "enable-fixedwidth-autovec-in-streaming-mode" , cl::init(Val: false), cl::Hidden); |
198 | |
199 | // Experimental option that will only be fully functional when the cost-model |
200 | // and code-generator have been changed to avoid using scalable vector |
201 | // instructions that are not legal in streaming SVE mode. |
202 | static cl::opt<bool> EnableScalableAutovecInStreamingMode( |
203 | "enable-scalable-autovec-in-streaming-mode" , cl::init(Val: false), cl::Hidden); |
204 | |
205 | static bool isSMEABIRoutineCall(const CallInst &CI) { |
206 | const auto *F = CI.getCalledFunction(); |
207 | return F && StringSwitch<bool>(F->getName()) |
208 | .Case(S: "__arm_sme_state" , Value: true) |
209 | .Case(S: "__arm_tpidr2_save" , Value: true) |
210 | .Case(S: "__arm_tpidr2_restore" , Value: true) |
211 | .Case(S: "__arm_za_disable" , Value: true) |
212 | .Default(Value: false); |
213 | } |
214 | |
215 | /// Returns true if the function has explicit operations that can only be |
216 | /// lowered using incompatible instructions for the selected mode. This also |
217 | /// returns true if the function F may use or modify ZA state. |
218 | static bool hasPossibleIncompatibleOps(const Function *F) { |
219 | for (const BasicBlock &BB : *F) { |
220 | for (const Instruction &I : BB) { |
      // Be conservative for now and assume that any call to inline asm or to
      // intrinsics could result in non-streaming ops (e.g. calls to
      // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
      // all native LLVM instructions can be lowered to compatible instructions.
225 | if (isa<CallInst>(Val: I) && !I.isDebugOrPseudoInst() && |
226 | (cast<CallInst>(Val: I).isInlineAsm() || isa<IntrinsicInst>(Val: I) || |
227 | isSMEABIRoutineCall(CI: cast<CallInst>(Val: I)))) |
228 | return true; |
229 | } |
230 | } |
231 | return false; |
232 | } |
233 | |
234 | bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, |
235 | const Function *Callee) const { |
236 | SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee); |
237 | |
238 | // When inlining, we should consider the body of the function, not the |
239 | // interface. |
240 | if (CalleeAttrs.hasStreamingBody()) { |
241 | CalleeAttrs.set(M: SMEAttrs::SM_Compatible, Enable: false); |
242 | CalleeAttrs.set(M: SMEAttrs::SM_Enabled, Enable: true); |
243 | } |
244 | |
245 | if (CalleeAttrs.isNewZA()) |
246 | return false; |
247 | |
248 | if (CallerAttrs.requiresLazySave(Callee: CalleeAttrs) || |
249 | CallerAttrs.requiresSMChange(Callee: CalleeAttrs)) { |
250 | if (hasPossibleIncompatibleOps(F: Callee)) |
251 | return false; |
252 | } |
253 | |
254 | const TargetMachine &TM = getTLI()->getTargetMachine(); |
255 | |
256 | const FeatureBitset &CallerBits = |
257 | TM.getSubtargetImpl(*Caller)->getFeatureBits(); |
258 | const FeatureBitset &CalleeBits = |
259 | TM.getSubtargetImpl(*Callee)->getFeatureBits(); |
260 | |
  // Inline a callee if its target-features are a subset of the caller's
  // target-features.
263 | return (CallerBits & CalleeBits) == CalleeBits; |
264 | } |
265 | |
266 | bool AArch64TTIImpl::areTypesABICompatible( |
267 | const Function *Caller, const Function *Callee, |
268 | const ArrayRef<Type *> &Types) const { |
269 | if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) |
270 | return false; |
271 | |
272 | // We need to ensure that argument promotion does not attempt to promote |
273 | // pointers to fixed-length vector types larger than 128 bits like |
274 | // <8 x float> (and pointers to aggregate types which have such fixed-length |
275 | // vector type members) into the values of the pointees. Such vector types |
276 | // are used for SVE VLS but there is no ABI for SVE VLS arguments and the |
277 | // backend cannot lower such value arguments. The 128-bit fixed-length SVE |
278 | // types can be safely treated as 128-bit NEON types and they cannot be |
279 | // distinguished in IR. |
280 | if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Range: Types, P: [](Type *Ty) { |
281 | auto FVTy = dyn_cast<FixedVectorType>(Val: Ty); |
282 | return FVTy && |
283 | FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128; |
284 | })) |
285 | return false; |
286 | |
287 | return true; |
288 | } |
289 | |
290 | unsigned |
291 | AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call, |
292 | unsigned DefaultCallPenalty) const { |
293 | // This function calculates a penalty for executing Call in F. |
294 | // |
295 | // There are two ways this function can be called: |
296 | // (1) F: |
297 | // call from F -> G (the call here is Call) |
298 | // |
299 | // For (1), Call.getCaller() == F, so it will always return a high cost if |
300 | // a streaming-mode change is required (thus promoting the need to inline the |
301 | // function) |
302 | // |
303 | // (2) F: |
304 | // call from F -> G (the call here is not Call) |
305 | // G: |
306 | // call from G -> H (the call here is Call) |
307 | // |
308 | // For (2), if after inlining the body of G into F the call to H requires a |
309 | // streaming-mode change, and the call to G from F would also require a |
310 | // streaming-mode change, then there is benefit to do the streaming-mode |
311 | // change only once and avoid inlining of G into F. |
312 | SMEAttrs FAttrs(*F); |
313 | SMEAttrs CalleeAttrs(Call); |
314 | if (FAttrs.requiresSMChange(Callee: CalleeAttrs)) { |
315 | if (F == Call.getCaller()) // (1) |
316 | return CallPenaltyChangeSM * DefaultCallPenalty; |
317 | if (FAttrs.requiresSMChange(Callee: SMEAttrs(*Call.getCaller()))) // (2) |
318 | return InlineCallPenaltyChangeSM * DefaultCallPenalty; |
319 | } |
320 | |
321 | return DefaultCallPenalty; |
322 | } |
323 | |
324 | bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( |
325 | TargetTransformInfo::RegisterKind K) const { |
326 | assert(K != TargetTransformInfo::RGK_Scalar); |
327 | return (K == TargetTransformInfo::RGK_FixedWidthVector && |
328 | ST->isNeonAvailable()); |
329 | } |
330 | |
331 | /// Calculate the cost of materializing a 64-bit value. This helper |
332 | /// method might only calculate a fraction of a larger immediate. Therefore it |
333 | /// is valid to return a cost of ZERO. |
334 | InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) { |
335 | // Check if the immediate can be encoded within an instruction. |
336 | if (Val == 0 || AArch64_AM::isLogicalImmediate(imm: Val, regSize: 64)) |
337 | return 0; |
338 | |
339 | if (Val < 0) |
340 | Val = ~Val; |
341 | |
342 | // Calculate how many moves we will need to materialize this constant. |
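  // For example, 0x1234567890ABCDEF has four distinct non-zero 16-bit chunks
  // and expands to one MOVZ plus three MOVKs, giving a cost of 4.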
343 | SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn; |
344 | AArch64_IMM::expandMOVImm(Imm: Val, BitSize: 64, Insn); |
345 | return Insn.size(); |
346 | } |
347 | |
348 | /// Calculate the cost of materializing the given constant. |
349 | InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, |
350 | TTI::TargetCostKind CostKind) { |
351 | assert(Ty->isIntegerTy()); |
352 | |
353 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
354 | if (BitSize == 0) |
355 | return ~0U; |
356 | |
357 | // Sign-extend all constants to a multiple of 64-bit. |
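  // The expression (BitSize + 63) & ~0x3fU rounds the width up to the next
  // multiple of 64, e.g. an i37 constant widens to i64 and an i65 to i128.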
358 | APInt ImmVal = Imm; |
359 | if (BitSize & 0x3f) |
360 | ImmVal = Imm.sext(width: (BitSize + 63) & ~0x3fU); |
361 | |
362 | // Split the constant into 64-bit chunks and calculate the cost for each |
363 | // chunk. |
364 | InstructionCost Cost = 0; |
365 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { |
366 | APInt Tmp = ImmVal.ashr(ShiftAmt: ShiftVal).sextOrTrunc(width: 64); |
367 | int64_t Val = Tmp.getSExtValue(); |
368 | Cost += getIntImmCost(Val); |
369 | } |
  // We need at least one instruction to materialize the constant.
371 | return std::max<InstructionCost>(a: 1, b: Cost); |
372 | } |
373 | |
374 | InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, |
375 | const APInt &Imm, Type *Ty, |
376 | TTI::TargetCostKind CostKind, |
377 | Instruction *Inst) { |
378 | assert(Ty->isIntegerTy()); |
379 | |
380 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
381 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
382 | // here, so that constant hoisting will ignore this constant. |
383 | if (BitSize == 0) |
384 | return TTI::TCC_Free; |
385 | |
386 | unsigned ImmIdx = ~0U; |
387 | switch (Opcode) { |
388 | default: |
389 | return TTI::TCC_Free; |
390 | case Instruction::GetElementPtr: |
391 | // Always hoist the base address of a GetElementPtr. |
392 | if (Idx == 0) |
393 | return 2 * TTI::TCC_Basic; |
394 | return TTI::TCC_Free; |
395 | case Instruction::Store: |
396 | ImmIdx = 0; |
397 | break; |
398 | case Instruction::Add: |
399 | case Instruction::Sub: |
400 | case Instruction::Mul: |
401 | case Instruction::UDiv: |
402 | case Instruction::SDiv: |
403 | case Instruction::URem: |
404 | case Instruction::SRem: |
405 | case Instruction::And: |
406 | case Instruction::Or: |
407 | case Instruction::Xor: |
408 | case Instruction::ICmp: |
409 | ImmIdx = 1; |
410 | break; |
411 | // Always return TCC_Free for the shift value of a shift instruction. |
412 | case Instruction::Shl: |
413 | case Instruction::LShr: |
414 | case Instruction::AShr: |
415 | if (Idx == 1) |
416 | return TTI::TCC_Free; |
417 | break; |
418 | case Instruction::Trunc: |
419 | case Instruction::ZExt: |
420 | case Instruction::SExt: |
421 | case Instruction::IntToPtr: |
422 | case Instruction::PtrToInt: |
423 | case Instruction::BitCast: |
424 | case Instruction::PHI: |
425 | case Instruction::Call: |
426 | case Instruction::Select: |
427 | case Instruction::Ret: |
428 | case Instruction::Load: |
429 | break; |
430 | } |
431 | |
432 | if (Idx == ImmIdx) { |
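    // NumConstants is the number of 64-bit chunks needed to hold the
    // immediate. If materialising it costs no more than that many basic
    // instructions, treat it as free so that constant hoisting ignores it.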
433 | int NumConstants = (BitSize + 63) / 64; |
434 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
435 | return (Cost <= NumConstants * TTI::TCC_Basic) |
436 | ? static_cast<int>(TTI::TCC_Free) |
437 | : Cost; |
438 | } |
439 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
440 | } |
441 | |
442 | InstructionCost |
443 | AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
444 | const APInt &Imm, Type *Ty, |
445 | TTI::TargetCostKind CostKind) { |
446 | assert(Ty->isIntegerTy()); |
447 | |
448 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
449 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
450 | // here, so that constant hoisting will ignore this constant. |
451 | if (BitSize == 0) |
452 | return TTI::TCC_Free; |
453 | |
454 | // Most (all?) AArch64 intrinsics do not support folding immediates into the |
455 | // selected instruction, so we compute the materialization cost for the |
456 | // immediate directly. |
457 | if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) |
458 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
459 | |
460 | switch (IID) { |
461 | default: |
462 | return TTI::TCC_Free; |
463 | case Intrinsic::sadd_with_overflow: |
464 | case Intrinsic::uadd_with_overflow: |
465 | case Intrinsic::ssub_with_overflow: |
466 | case Intrinsic::usub_with_overflow: |
467 | case Intrinsic::smul_with_overflow: |
468 | case Intrinsic::umul_with_overflow: |
469 | if (Idx == 1) { |
470 | int NumConstants = (BitSize + 63) / 64; |
471 | InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
472 | return (Cost <= NumConstants * TTI::TCC_Basic) |
473 | ? static_cast<int>(TTI::TCC_Free) |
474 | : Cost; |
475 | } |
476 | break; |
477 | case Intrinsic::experimental_stackmap: |
478 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
479 | return TTI::TCC_Free; |
480 | break; |
481 | case Intrinsic::experimental_patchpoint_void: |
482 | case Intrinsic::experimental_patchpoint: |
483 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
484 | return TTI::TCC_Free; |
485 | break; |
486 | case Intrinsic::experimental_gc_statepoint: |
487 | if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(x: Imm.getSExtValue()))) |
488 | return TTI::TCC_Free; |
489 | break; |
490 | } |
491 | return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
492 | } |
493 | |
494 | TargetTransformInfo::PopcntSupportKind |
495 | AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { |
496 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2" ); |
497 | if (TyWidth == 32 || TyWidth == 64) |
498 | return TTI::PSK_FastHardware; |
  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128-bit popcount.
500 | return TTI::PSK_Software; |
501 | } |
502 | |
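// An unpacked vector only partially fills each 128-bit SVE register block,
// e.g. nxv2f32 has a known minimum size of 64 bits, whereas nxv4f32 fills
// the whole block.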
503 | static bool isUnpackedVectorVT(EVT VecVT) { |
504 | return VecVT.isScalableVector() && |
505 | VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock; |
506 | } |
507 | |
508 | InstructionCost |
509 | AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
510 | TTI::TargetCostKind CostKind) { |
511 | auto *RetTy = ICA.getReturnType(); |
512 | switch (ICA.getID()) { |
513 | case Intrinsic::umin: |
514 | case Intrinsic::umax: |
515 | case Intrinsic::smin: |
516 | case Intrinsic::smax: { |
517 | static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
518 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
519 | MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, |
520 | MVT::nxv2i64}; |
521 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
522 | // v2i64 types get converted to cmp+bif hence the cost of 2 |
523 | if (LT.second == MVT::v2i64) |
524 | return LT.first * 2; |
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
526 | return LT.first; |
527 | break; |
528 | } |
529 | case Intrinsic::sadd_sat: |
530 | case Intrinsic::ssub_sat: |
531 | case Intrinsic::uadd_sat: |
532 | case Intrinsic::usub_sat: { |
533 | static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
534 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
535 | MVT::v2i64}; |
536 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
    // This is a base cost of 1 for the vadd, plus 3 extra shifts if we
    // need to extend the type, as it uses shr(qadd(shl, shl)).
539 | unsigned Instrs = |
540 | LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4; |
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
542 | return LT.first * Instrs; |
543 | break; |
544 | } |
545 | case Intrinsic::abs: { |
546 | static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16, |
547 | MVT::v8i16, MVT::v2i32, MVT::v4i32, |
548 | MVT::v2i64}; |
549 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
551 | return LT.first; |
552 | break; |
553 | } |
554 | case Intrinsic::bswap: { |
555 | static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32, |
556 | MVT::v4i32, MVT::v2i64}; |
557 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
559 | LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits()) |
560 | return LT.first; |
561 | break; |
562 | } |
563 | case Intrinsic::experimental_stepvector: { |
564 | InstructionCost Cost = 1; // Cost of the `index' instruction |
565 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
566 | // Legalisation of illegal vectors involves an `index' instruction plus |
567 | // (LT.first - 1) vector adds. |
568 | if (LT.first > 1) { |
569 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: RetTy->getContext()); |
570 | InstructionCost AddCost = |
571 | getArithmeticInstrCost(Opcode: Instruction::Add, Ty: LegalVTy, CostKind); |
572 | Cost += AddCost * (LT.first - 1); |
573 | } |
574 | return Cost; |
575 | } |
576 | case Intrinsic::vector_extract: |
577 | case Intrinsic::vector_insert: { |
578 | // If both the vector and subvector types are legal types and the index |
579 | // is 0, then this should be a no-op or simple operation; return a |
580 | // relatively low cost. |
581 | |
582 | // If arguments aren't actually supplied, then we cannot determine the |
583 | // value of the index. We also want to skip predicate types. |
584 | if (ICA.getArgs().size() != ICA.getArgTypes().size() || |
585 | ICA.getReturnType()->getScalarType()->isIntegerTy(Bitwidth: 1)) |
586 | break; |
587 | |
588 | LLVMContext &C = RetTy->getContext(); |
589 | EVT VecVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]); |
    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
591 | EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, Ty: RetTy) |
592 | : getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[1]); |
593 | // Skip this if either the vector or subvector types are unpacked |
594 | // SVE types; they may get lowered to stack stores and loads. |
595 | if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(VecVT: SubVecVT)) |
596 | break; |
597 | |
598 | TargetLoweringBase::LegalizeKind SubVecLK = |
599 | getTLI()->getTypeConversion(Context&: C, VT: SubVecVT); |
600 | TargetLoweringBase::LegalizeKind VecLK = |
601 | getTLI()->getTypeConversion(Context&: C, VT: VecVT); |
602 | const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2]; |
603 | const ConstantInt *CIdx = cast<ConstantInt>(Val: Idx); |
604 | if (SubVecLK.first == TargetLoweringBase::TypeLegal && |
605 | VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero()) |
606 | return TTI::TCC_Free; |
607 | break; |
608 | } |
609 | case Intrinsic::bitreverse: { |
610 | static const CostTblEntry BitreverseTbl[] = { |
611 | {Intrinsic::bitreverse, MVT::i32, 1}, |
612 | {Intrinsic::bitreverse, MVT::i64, 1}, |
613 | {Intrinsic::bitreverse, MVT::v8i8, 1}, |
614 | {Intrinsic::bitreverse, MVT::v16i8, 1}, |
615 | {Intrinsic::bitreverse, MVT::v4i16, 2}, |
616 | {Intrinsic::bitreverse, MVT::v8i16, 2}, |
617 | {Intrinsic::bitreverse, MVT::v2i32, 2}, |
618 | {Intrinsic::bitreverse, MVT::v4i32, 2}, |
619 | {Intrinsic::bitreverse, MVT::v1i64, 2}, |
620 | {Intrinsic::bitreverse, MVT::v2i64, 2}, |
621 | }; |
622 | const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy); |
623 | const auto *Entry = |
624 | CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second); |
625 | if (Entry) { |
      // The cost model uses the legal type (i32) that i8 and i16 are promoted
      // to, plus 1 so that we match the actual lowering cost.
628 | if (TLI->getValueType(DL, RetTy, true) == MVT::i8 || |
629 | TLI->getValueType(DL, RetTy, true) == MVT::i16) |
630 | return LegalisationCost.first * Entry->Cost + 1; |
631 | |
632 | return LegalisationCost.first * Entry->Cost; |
633 | } |
634 | break; |
635 | } |
636 | case Intrinsic::ctpop: { |
637 | if (!ST->hasNEON()) { |
638 | // 32-bit or 64-bit ctpop without NEON is 12 instructions. |
639 | return getTypeLegalizationCost(Ty: RetTy).first * 12; |
640 | } |
641 | static const CostTblEntry CtpopCostTbl[] = { |
642 | {ISD::CTPOP, MVT::v2i64, 4}, |
643 | {ISD::CTPOP, MVT::v4i32, 3}, |
644 | {ISD::CTPOP, MVT::v8i16, 2}, |
645 | {ISD::CTPOP, MVT::v16i8, 1}, |
646 | {ISD::CTPOP, MVT::i64, 4}, |
647 | {ISD::CTPOP, MVT::v2i32, 3}, |
648 | {ISD::CTPOP, MVT::v4i16, 2}, |
649 | {ISD::CTPOP, MVT::v8i8, 1}, |
650 | {ISD::CTPOP, MVT::i32, 5}, |
651 | }; |
652 | auto LT = getTypeLegalizationCost(Ty: RetTy); |
653 | MVT MTy = LT.second; |
654 | if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) { |
655 | // Extra cost of +1 when illegal vector types are legalized by promoting |
656 | // the integer type. |
      int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
                                            RetTy->getScalarSizeInBits()
                          ? 1
                          : 0;
661 | return LT.first * Entry->Cost + ExtraCost; |
662 | } |
663 | break; |
664 | } |
665 | case Intrinsic::sadd_with_overflow: |
666 | case Intrinsic::uadd_with_overflow: |
667 | case Intrinsic::ssub_with_overflow: |
668 | case Intrinsic::usub_with_overflow: |
669 | case Intrinsic::smul_with_overflow: |
670 | case Intrinsic::umul_with_overflow: { |
671 | static const CostTblEntry WithOverflowCostTbl[] = { |
672 | {Intrinsic::sadd_with_overflow, MVT::i8, 3}, |
673 | {Intrinsic::uadd_with_overflow, MVT::i8, 3}, |
674 | {Intrinsic::sadd_with_overflow, MVT::i16, 3}, |
675 | {Intrinsic::uadd_with_overflow, MVT::i16, 3}, |
676 | {Intrinsic::sadd_with_overflow, MVT::i32, 1}, |
677 | {Intrinsic::uadd_with_overflow, MVT::i32, 1}, |
678 | {Intrinsic::sadd_with_overflow, MVT::i64, 1}, |
679 | {Intrinsic::uadd_with_overflow, MVT::i64, 1}, |
680 | {Intrinsic::ssub_with_overflow, MVT::i8, 3}, |
681 | {Intrinsic::usub_with_overflow, MVT::i8, 3}, |
682 | {Intrinsic::ssub_with_overflow, MVT::i16, 3}, |
683 | {Intrinsic::usub_with_overflow, MVT::i16, 3}, |
684 | {Intrinsic::ssub_with_overflow, MVT::i32, 1}, |
685 | {Intrinsic::usub_with_overflow, MVT::i32, 1}, |
686 | {Intrinsic::ssub_with_overflow, MVT::i64, 1}, |
687 | {Intrinsic::usub_with_overflow, MVT::i64, 1}, |
688 | {Intrinsic::smul_with_overflow, MVT::i8, 5}, |
689 | {Intrinsic::umul_with_overflow, MVT::i8, 4}, |
690 | {Intrinsic::smul_with_overflow, MVT::i16, 5}, |
691 | {Intrinsic::umul_with_overflow, MVT::i16, 4}, |
692 | {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst |
693 | {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw |
694 | {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp |
695 | {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr |
696 | }; |
697 | EVT MTy = TLI->getValueType(DL, Ty: RetTy->getContainedType(i: 0), AllowUnknown: true); |
698 | if (MTy.isSimple()) |
699 | if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(), |
700 | MTy.getSimpleVT())) |
701 | return Entry->Cost; |
702 | break; |
703 | } |
704 | case Intrinsic::fptosi_sat: |
705 | case Intrinsic::fptoui_sat: { |
706 | if (ICA.getArgTypes().empty()) |
707 | break; |
708 | bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat; |
709 | auto LT = getTypeLegalizationCost(Ty: ICA.getArgTypes()[0]); |
710 | EVT MTy = TLI->getValueType(DL, Ty: RetTy); |
711 | // Check for the legal types, which are where the size of the input and the |
712 | // output are the same, or we are using cvt f64->i32 or f32->i64. |
713 | if ((LT.second == MVT::f32 || LT.second == MVT::f64 || |
714 | LT.second == MVT::v2f32 || LT.second == MVT::v4f32 || |
715 | LT.second == MVT::v2f64) && |
716 | (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() || |
717 | (LT.second == MVT::f64 && MTy == MVT::i32) || |
718 | (LT.second == MVT::f32 && MTy == MVT::i64))) |
719 | return LT.first; |
720 | // Similarly for fp16 sizes |
721 | if (ST->hasFullFP16() && |
722 | ((LT.second == MVT::f16 && MTy == MVT::i32) || |
723 | ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) && |
724 | (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))) |
725 | return LT.first; |
726 | |
727 | // Otherwise we use a legal convert followed by a min+max |
728 | if ((LT.second.getScalarType() == MVT::f32 || |
729 | LT.second.getScalarType() == MVT::f64 || |
730 | (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) && |
731 | LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) { |
732 | Type *LegalTy = |
733 | Type::getIntNTy(C&: RetTy->getContext(), N: LT.second.getScalarSizeInBits()); |
734 | if (LT.second.isVector()) |
735 | LegalTy = VectorType::get(ElementType: LegalTy, EC: LT.second.getVectorElementCount()); |
736 | InstructionCost Cost = 1; |
737 | IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin, |
738 | LegalTy, {LegalTy, LegalTy}); |
739 | Cost += getIntrinsicInstrCost(ICA: Attrs1, CostKind); |
740 | IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax, |
741 | LegalTy, {LegalTy, LegalTy}); |
742 | Cost += getIntrinsicInstrCost(ICA: Attrs2, CostKind); |
743 | return LT.first * Cost; |
744 | } |
745 | break; |
746 | } |
747 | case Intrinsic::fshl: |
748 | case Intrinsic::fshr: { |
749 | if (ICA.getArgs().empty()) |
750 | break; |
751 | |
752 | // TODO: Add handling for fshl where third argument is not a constant. |
753 | const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(V: ICA.getArgs()[2]); |
754 | if (!OpInfoZ.isConstant()) |
755 | break; |
756 | |
757 | const auto LegalisationCost = getTypeLegalizationCost(Ty: RetTy); |
758 | if (OpInfoZ.isUniform()) { |
759 | // FIXME: The costs could be lower if the codegen is better. |
760 | static const CostTblEntry FshlTbl[] = { |
761 | {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr |
762 | {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4}, |
763 | {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3}, |
764 | {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}}; |
765 | // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl |
766 | // to avoid having to duplicate the costs. |
767 | const auto *Entry = |
768 | CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second); |
769 | if (Entry) |
770 | return LegalisationCost.first * Entry->Cost; |
771 | } |
772 | |
773 | auto TyL = getTypeLegalizationCost(Ty: RetTy); |
774 | if (!RetTy->isIntegerTy()) |
775 | break; |
776 | |
777 | // Estimate cost manually, as types like i8 and i16 will get promoted to |
778 | // i32 and CostTableLookup will ignore the extra conversion cost. |
779 | bool HigherCost = (RetTy->getScalarSizeInBits() != 32 && |
780 | RetTy->getScalarSizeInBits() < 64) || |
781 | (RetTy->getScalarSizeInBits() % 64 != 0); |
    unsigned ExtraCost = HigherCost ? 1 : 0;
783 | if (RetTy->getScalarSizeInBits() == 32 || |
784 | RetTy->getScalarSizeInBits() == 64) |
      ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
                     // extr instruction.
787 | else if (HigherCost) |
788 | ExtraCost = 1; |
789 | else |
790 | break; |
791 | return TyL.first + ExtraCost; |
792 | } |
793 | case Intrinsic::get_active_lane_mask: { |
794 | auto *RetTy = dyn_cast<FixedVectorType>(Val: ICA.getReturnType()); |
795 | if (RetTy) { |
796 | EVT RetVT = getTLI()->getValueType(DL, Ty: RetTy); |
797 | EVT OpVT = getTLI()->getValueType(DL, Ty: ICA.getArgTypes()[0]); |
798 | if (!getTLI()->shouldExpandGetActiveLaneMask(VT: RetVT, OpVT) && |
799 | !getTLI()->isTypeLegal(VT: RetVT)) { |
800 | // We don't have enough context at this point to determine if the mask |
801 | // is going to be kept live after the block, which will force the vXi1 |
802 | // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32. |
803 | // For now, we just assume the vectorizer created this intrinsic and |
804 | // the result will be the input for a PHI. In this case the cost will |
805 | // be extremely high for fixed-width vectors. |
806 | // NOTE: getScalarizationOverhead returns a cost that's far too |
807 | // pessimistic for the actual generated codegen. In reality there are |
808 | // two instructions generated per lane. |
809 | return RetTy->getNumElements() * 2; |
810 | } |
811 | } |
812 | break; |
813 | } |
814 | default: |
815 | break; |
816 | } |
817 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
818 | } |
819 | |
/// The function removes redundant reinterpret casts in the presence of
/// control flow, i.e. when the cast's operand is a PHI node.
822 | static std::optional<Instruction *> processPhiNode(InstCombiner &IC, |
823 | IntrinsicInst &II) { |
824 | SmallVector<Instruction *, 32> Worklist; |
825 | auto RequiredType = II.getType(); |
826 | |
827 | auto *PN = dyn_cast<PHINode>(Val: II.getArgOperand(i: 0)); |
828 | assert(PN && "Expected Phi Node!" ); |
829 | |
830 | // Don't create a new Phi unless we can remove the old one. |
831 | if (!PN->hasOneUse()) |
832 | return std::nullopt; |
833 | |
834 | for (Value *IncValPhi : PN->incoming_values()) { |
835 | auto *Reinterpret = dyn_cast<IntrinsicInst>(Val: IncValPhi); |
836 | if (!Reinterpret || |
837 | Reinterpret->getIntrinsicID() != |
838 | Intrinsic::aarch64_sve_convert_to_svbool || |
839 | RequiredType != Reinterpret->getArgOperand(0)->getType()) |
840 | return std::nullopt; |
841 | } |
842 | |
843 | // Create the new Phi |
844 | IC.Builder.SetInsertPoint(PN); |
845 | PHINode *NPN = IC.Builder.CreatePHI(Ty: RequiredType, NumReservedValues: PN->getNumIncomingValues()); |
846 | Worklist.push_back(Elt: PN); |
847 | |
848 | for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) { |
849 | auto *Reinterpret = cast<Instruction>(Val: PN->getIncomingValue(i: I)); |
850 | NPN->addIncoming(V: Reinterpret->getOperand(i: 0), BB: PN->getIncomingBlock(i: I)); |
851 | Worklist.push_back(Elt: Reinterpret); |
852 | } |
853 | |
854 | // Cleanup Phi Node and reinterprets |
855 | return IC.replaceInstUsesWith(I&: II, V: NPN); |
856 | } |
857 | |
// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
859 | // => (binop (pred) (from_svbool _) (from_svbool _)) |
860 | // |
861 | // The above transformation eliminates a `to_svbool` in the predicate |
862 | // operand of bitwise operation `binop` by narrowing the vector width of |
863 | // the operation. For example, it would convert a `<vscale x 16 x i1> |
864 | // and` into a `<vscale x 4 x i1> and`. This is profitable because |
865 | // to_svbool must zero the new lanes during widening, whereas |
866 | // from_svbool is free. |
867 | static std::optional<Instruction *> |
868 | tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) { |
869 | auto BinOp = dyn_cast<IntrinsicInst>(Val: II.getOperand(i_nocapture: 0)); |
870 | if (!BinOp) |
871 | return std::nullopt; |
872 | |
873 | auto IntrinsicID = BinOp->getIntrinsicID(); |
874 | switch (IntrinsicID) { |
875 | case Intrinsic::aarch64_sve_and_z: |
876 | case Intrinsic::aarch64_sve_bic_z: |
877 | case Intrinsic::aarch64_sve_eor_z: |
878 | case Intrinsic::aarch64_sve_nand_z: |
879 | case Intrinsic::aarch64_sve_nor_z: |
880 | case Intrinsic::aarch64_sve_orn_z: |
881 | case Intrinsic::aarch64_sve_orr_z: |
882 | break; |
883 | default: |
884 | return std::nullopt; |
885 | } |
886 | |
887 | auto BinOpPred = BinOp->getOperand(i_nocapture: 0); |
888 | auto BinOpOp1 = BinOp->getOperand(i_nocapture: 1); |
889 | auto BinOpOp2 = BinOp->getOperand(i_nocapture: 2); |
890 | |
891 | auto PredIntr = dyn_cast<IntrinsicInst>(Val: BinOpPred); |
892 | if (!PredIntr || |
893 | PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool) |
894 | return std::nullopt; |
895 | |
896 | auto PredOp = PredIntr->getOperand(i_nocapture: 0); |
897 | auto PredOpTy = cast<VectorType>(Val: PredOp->getType()); |
898 | if (PredOpTy != II.getType()) |
899 | return std::nullopt; |
900 | |
901 | SmallVector<Value *> NarrowedBinOpArgs = {PredOp}; |
902 | auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic( |
903 | Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1}); |
904 | NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1); |
905 | if (BinOpOp1 == BinOpOp2) |
906 | NarrowedBinOpArgs.push_back(Elt: NarrowBinOpOp1); |
907 | else |
908 | NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic( |
909 | Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2})); |
910 | |
911 | auto NarrowedBinOp = |
912 | IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {PredOpTy}, Args: NarrowedBinOpArgs); |
913 | return IC.replaceInstUsesWith(I&: II, V: NarrowedBinOp); |
914 | } |
915 | |
916 | static std::optional<Instruction *> |
917 | instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) { |
918 | // If the reinterpret instruction operand is a PHI Node |
919 | if (isa<PHINode>(Val: II.getArgOperand(i: 0))) |
920 | return processPhiNode(IC, II); |
921 | |
922 | if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II)) |
923 | return BinOpCombine; |
924 | |
925 | // Ignore converts to/from svcount_t. |
926 | if (isa<TargetExtType>(Val: II.getArgOperand(i: 0)->getType()) || |
927 | isa<TargetExtType>(Val: II.getType())) |
928 | return std::nullopt; |
929 | |
930 | SmallVector<Instruction *, 32> CandidatesForRemoval; |
931 | Value *Cursor = II.getOperand(i_nocapture: 0), *EarliestReplacement = nullptr; |
932 | |
933 | const auto *IVTy = cast<VectorType>(Val: II.getType()); |
934 | |
935 | // Walk the chain of conversions. |
936 | while (Cursor) { |
937 | // If the type of the cursor has fewer lanes than the final result, zeroing |
938 | // must take place, which breaks the equivalence chain. |
939 | const auto *CursorVTy = cast<VectorType>(Val: Cursor->getType()); |
940 | if (CursorVTy->getElementCount().getKnownMinValue() < |
941 | IVTy->getElementCount().getKnownMinValue()) |
942 | break; |
943 | |
944 | // If the cursor has the same type as I, it is a viable replacement. |
945 | if (Cursor->getType() == IVTy) |
946 | EarliestReplacement = Cursor; |
947 | |
948 | auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Val: Cursor); |
949 | |
950 | // If this is not an SVE conversion intrinsic, this is the end of the chain. |
951 | if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() == |
952 | Intrinsic::aarch64_sve_convert_to_svbool || |
953 | IntrinsicCursor->getIntrinsicID() == |
954 | Intrinsic::aarch64_sve_convert_from_svbool)) |
955 | break; |
956 | |
957 | CandidatesForRemoval.insert(I: CandidatesForRemoval.begin(), Elt: IntrinsicCursor); |
958 | Cursor = IntrinsicCursor->getOperand(i_nocapture: 0); |
959 | } |
960 | |
961 | // If no viable replacement in the conversion chain was found, there is |
962 | // nothing to do. |
963 | if (!EarliestReplacement) |
964 | return std::nullopt; |
965 | |
966 | return IC.replaceInstUsesWith(I&: II, V: EarliestReplacement); |
967 | } |
968 | |
969 | static bool isAllActivePredicate(Value *Pred) { |
  // Look through convert.from.svbool(convert.to.svbool(...)) chain.
971 | Value *UncastedPred; |
972 | if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>( |
973 | m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>( |
974 | m_Value(UncastedPred))))) |
    // If the predicate has the same number of lanes as the uncasted
    // predicate, or fewer, then we know the casting has no effect.
977 | if (cast<ScalableVectorType>(Val: Pred->getType())->getMinNumElements() <= |
978 | cast<ScalableVectorType>(Val: UncastedPred->getType())->getMinNumElements()) |
979 | Pred = UncastedPred; |
980 | |
981 | return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( |
982 | m_ConstantInt<AArch64SVEPredPattern::all>())); |
983 | } |
984 | |
985 | static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC, |
986 | IntrinsicInst &II) { |
987 | // svsel(ptrue, x, y) => x |
988 | auto *OpPredicate = II.getOperand(i_nocapture: 0); |
989 | if (isAllActivePredicate(Pred: OpPredicate)) |
990 | return IC.replaceInstUsesWith(I&: II, V: II.getOperand(i_nocapture: 1)); |
991 | |
992 | auto Select = |
993 | IC.Builder.CreateSelect(C: OpPredicate, True: II.getOperand(i_nocapture: 1), False: II.getOperand(i_nocapture: 2)); |
994 | return IC.replaceInstUsesWith(I&: II, V: Select); |
995 | } |
996 | |
997 | static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC, |
998 | IntrinsicInst &II) { |
999 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1)); |
1000 | if (!Pg) |
1001 | return std::nullopt; |
1002 | |
1003 | if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) |
1004 | return std::nullopt; |
1005 | |
1006 | const auto PTruePattern = |
1007 | cast<ConstantInt>(Val: Pg->getOperand(i_nocapture: 0))->getZExtValue(); |
1008 | if (PTruePattern != AArch64SVEPredPattern::vl1) |
1009 | return std::nullopt; |
1010 | |
1011 | // The intrinsic is inserting into lane zero so use an insert instead. |
1012 | auto *IdxTy = Type::getInt64Ty(C&: II.getContext()); |
1013 | auto *Insert = InsertElementInst::Create( |
1014 | Vec: II.getArgOperand(i: 0), NewElt: II.getArgOperand(i: 2), Idx: ConstantInt::get(Ty: IdxTy, V: 0)); |
1015 | Insert->insertBefore(InsertPos: &II); |
1016 | Insert->takeName(V: &II); |
1017 | |
1018 | return IC.replaceInstUsesWith(I&: II, V: Insert); |
1019 | } |
1020 | |
1021 | static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC, |
1022 | IntrinsicInst &II) { |
1023 | // Replace DupX with a regular IR splat. |
1024 | auto *RetTy = cast<ScalableVectorType>(Val: II.getType()); |
1025 | Value *Splat = IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), |
1026 | V: II.getArgOperand(i: 0)); |
1027 | Splat->takeName(V: &II); |
1028 | return IC.replaceInstUsesWith(I&: II, V: Splat); |
1029 | } |
1030 | |
1031 | static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC, |
1032 | IntrinsicInst &II) { |
1033 | LLVMContext &Ctx = II.getContext(); |
1034 | |
1035 | // Check that the predicate is all active |
1036 | auto *Pg = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 0)); |
1037 | if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) |
1038 | return std::nullopt; |
1039 | |
1040 | const auto PTruePattern = |
1041 | cast<ConstantInt>(Val: Pg->getOperand(i_nocapture: 0))->getZExtValue(); |
1042 | if (PTruePattern != AArch64SVEPredPattern::all) |
1043 | return std::nullopt; |
1044 | |
1045 | // Check that we have a compare of zero.. |
1046 | auto *SplatValue = |
1047 | dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: II.getArgOperand(i: 2))); |
1048 | if (!SplatValue || !SplatValue->isZero()) |
1049 | return std::nullopt; |
1050 | |
1051 | // ..against a dupq |
1052 | auto *DupQLane = dyn_cast<IntrinsicInst>(Val: II.getArgOperand(i: 1)); |
1053 | if (!DupQLane || |
1054 | DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane) |
1055 | return std::nullopt; |
1056 | |
1057 | // Where the dupq is a lane 0 replicate of a vector insert |
1058 | if (!cast<ConstantInt>(Val: DupQLane->getArgOperand(i: 1))->isZero()) |
1059 | return std::nullopt; |
1060 | |
1061 | auto *VecIns = dyn_cast<IntrinsicInst>(Val: DupQLane->getArgOperand(i: 0)); |
1062 | if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert) |
1063 | return std::nullopt; |
1064 | |
1065 | // Where the vector insert is a fixed constant vector insert into undef at |
1066 | // index zero |
1067 | if (!isa<UndefValue>(Val: VecIns->getArgOperand(i: 0))) |
1068 | return std::nullopt; |
1069 | |
1070 | if (!cast<ConstantInt>(Val: VecIns->getArgOperand(i: 2))->isZero()) |
1071 | return std::nullopt; |
1072 | |
1073 | auto *ConstVec = dyn_cast<Constant>(Val: VecIns->getArgOperand(i: 1)); |
1074 | if (!ConstVec) |
1075 | return std::nullopt; |
1076 | |
1077 | auto *VecTy = dyn_cast<FixedVectorType>(Val: ConstVec->getType()); |
1078 | auto *OutTy = dyn_cast<ScalableVectorType>(Val: II.getType()); |
1079 | if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements()) |
1080 | return std::nullopt; |
1081 | |
1082 | unsigned NumElts = VecTy->getNumElements(); |
1083 | unsigned PredicateBits = 0; |
1084 | |
1085 | // Expand intrinsic operands to a 16-bit byte level predicate |
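  // For example, a <4 x i32> constant maps each non-zero element to bit
  // positions 0, 4, 8 and 12 (one bit per 32-bit, i.e. 4-byte, granule).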
1086 | for (unsigned I = 0; I < NumElts; ++I) { |
1087 | auto *Arg = dyn_cast<ConstantInt>(Val: ConstVec->getAggregateElement(Elt: I)); |
1088 | if (!Arg) |
1089 | return std::nullopt; |
1090 | if (!Arg->isZero()) |
1091 | PredicateBits |= 1 << (I * (16 / NumElts)); |
1092 | } |
1093 | |
1094 | // If all bits are zero bail early with an empty predicate |
1095 | if (PredicateBits == 0) { |
1096 | auto *PFalse = Constant::getNullValue(Ty: II.getType()); |
1097 | PFalse->takeName(V: &II); |
1098 | return IC.replaceInstUsesWith(I&: II, V: PFalse); |
1099 | } |
1100 | |
1101 | // Calculate largest predicate type used (where byte predicate is largest) |
1102 | unsigned Mask = 8; |
1103 | for (unsigned I = 0; I < 16; ++I) |
1104 | if ((PredicateBits & (1 << I)) != 0) |
1105 | Mask |= (I % 8); |
1106 | |
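  // Mask & -Mask isolates the lowest set bit of Mask, giving the smallest
  // element size in bytes that can represent the predicate.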
1107 | unsigned PredSize = Mask & -Mask; |
1108 | auto *PredType = ScalableVectorType::get( |
1109 | ElementType: Type::getInt1Ty(C&: Ctx), MinNumElts: AArch64::SVEBitsPerBlock / (PredSize * 8)); |
1110 | |
1111 | // Ensure all relevant bits are set |
1112 | for (unsigned I = 0; I < 16; I += PredSize) |
1113 | if ((PredicateBits & (1 << I)) == 0) |
1114 | return std::nullopt; |
1115 | |
1116 | auto *PTruePat = |
1117 | ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); |
1118 | auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, |
1119 | {PredType}, {PTruePat}); |
1120 | auto *ConvertToSVBool = IC.Builder.CreateIntrinsic( |
1121 | Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue}); |
1122 | auto *ConvertFromSVBool = |
1123 | IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, |
1124 | {II.getType()}, {ConvertToSVBool}); |
1125 | |
1126 | ConvertFromSVBool->takeName(&II); |
1127 | return IC.replaceInstUsesWith(I&: II, V: ConvertFromSVBool); |
1128 | } |
1129 | |
1130 | static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC, |
1131 | IntrinsicInst &II) { |
1132 | Value *Pg = II.getArgOperand(i: 0); |
1133 | Value *Vec = II.getArgOperand(i: 1); |
1134 | auto IntrinsicID = II.getIntrinsicID(); |
1135 | bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta; |
1136 | |
1137 | // lastX(splat(X)) --> X |
1138 | if (auto *SplatVal = getSplatValue(V: Vec)) |
1139 | return IC.replaceInstUsesWith(I&: II, V: SplatVal); |
1140 | |
1141 | // If x and/or y is a splat value then: |
1142 | // lastX (binop (x, y)) --> binop(lastX(x), lastX(y)) |
1143 | Value *LHS, *RHS; |
1144 | if (match(V: Vec, P: m_OneUse(SubPattern: m_BinOp(L: m_Value(V&: LHS), R: m_Value(V&: RHS))))) { |
1145 | if (isSplatValue(V: LHS) || isSplatValue(V: RHS)) { |
1146 | auto *OldBinOp = cast<BinaryOperator>(Val: Vec); |
1147 | auto OpC = OldBinOp->getOpcode(); |
1148 | auto *NewLHS = |
1149 | IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, LHS}); |
1150 | auto *NewRHS = |
1151 | IC.Builder.CreateIntrinsic(ID: IntrinsicID, Types: {Vec->getType()}, Args: {Pg, RHS}); |
1152 | auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags( |
1153 | Opc: OpC, V1: NewLHS, V2: NewRHS, CopyO: OldBinOp, Name: OldBinOp->getName(), InsertBefore: II.getIterator()); |
1154 | return IC.replaceInstUsesWith(I&: II, V: NewBinOp); |
1155 | } |
1156 | } |
1157 | |
1158 | auto *C = dyn_cast<Constant>(Val: Pg); |
1159 | if (IsAfter && C && C->isNullValue()) { |
1160 | // The intrinsic is extracting lane 0 so use an extract instead. |
1161 | auto *IdxTy = Type::getInt64Ty(C&: II.getContext()); |
    auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1163 | Extract->insertBefore(InsertPos: &II); |
1164 | Extract->takeName(V: &II); |
1165 | return IC.replaceInstUsesWith(I&: II, V: Extract); |
1166 | } |
1167 | |
1168 | auto *IntrPG = dyn_cast<IntrinsicInst>(Val: Pg); |
1169 | if (!IntrPG) |
1170 | return std::nullopt; |
1171 | |
1172 | if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue) |
1173 | return std::nullopt; |
1174 | |
1175 | const auto PTruePattern = |
1176 | cast<ConstantInt>(Val: IntrPG->getOperand(i_nocapture: 0))->getZExtValue(); |
1177 | |
1178 | // Can the intrinsic's predicate be converted to a known constant index? |
1179 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern: PTruePattern); |
1180 | if (!MinNumElts) |
1181 | return std::nullopt; |
1182 | |
1183 | unsigned Idx = MinNumElts - 1; |
1184 | // Increment the index if extracting the element after the last active |
1185 | // predicate element. |
1186 | if (IsAfter) |
1187 | ++Idx; |
1188 | |
1189 | // Ignore extracts whose index is larger than the known minimum vector |
1190 | // length. NOTE: This is an artificial constraint where we prefer to |
1191 | // maintain what the user asked for until an alternative is proven faster. |
1192 | auto *PgVTy = cast<ScalableVectorType>(Val: Pg->getType()); |
1193 | if (Idx >= PgVTy->getMinNumElements()) |
1194 | return std::nullopt; |
1195 | |
1196 | // The intrinsic is extracting a fixed lane so use an extract instead. |
1197 | auto *IdxTy = Type::getInt64Ty(C&: II.getContext()); |
  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1199 | Extract->insertBefore(InsertPos: &II); |
1200 | Extract->takeName(V: &II); |
1201 | return IC.replaceInstUsesWith(I&: II, V: Extract); |
1202 | } |
1203 | |
1204 | static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC, |
1205 | IntrinsicInst &II) { |
1206 | // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar |
1207 | // integer variant across a variety of micro-architectures. Replace scalar |
1208 | // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple |
1209 | // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more |
1210 | // depending on the micro-architecture, but has been observed as generally |
1211 | // being faster, particularly when the CLAST[AB] op is a loop-carried |
1212 | // dependency. |
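  // For example, an i32 clasta(pg, fallback, vec) becomes
  //   bitcast(f32 clasta(pg, bitcast(fallback), bitcast(vec))).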
1213 | Value *Pg = II.getArgOperand(i: 0); |
1214 | Value *Fallback = II.getArgOperand(i: 1); |
1215 | Value *Vec = II.getArgOperand(i: 2); |
1216 | Type *Ty = II.getType(); |
1217 | |
1218 | if (!Ty->isIntegerTy()) |
1219 | return std::nullopt; |
1220 | |
1221 | Type *FPTy; |
1222 | switch (cast<IntegerType>(Val: Ty)->getBitWidth()) { |
1223 | default: |
1224 | return std::nullopt; |
1225 | case 16: |
1226 | FPTy = IC.Builder.getHalfTy(); |
1227 | break; |
1228 | case 32: |
1229 | FPTy = IC.Builder.getFloatTy(); |
1230 | break; |
1231 | case 64: |
1232 | FPTy = IC.Builder.getDoubleTy(); |
1233 | break; |
1234 | } |
1235 | |
1236 | Value *FPFallBack = IC.Builder.CreateBitCast(V: Fallback, DestTy: FPTy); |
1237 | auto *FPVTy = VectorType::get( |
1238 | ElementType: FPTy, EC: cast<VectorType>(Val: Vec->getType())->getElementCount()); |
1239 | Value *FPVec = IC.Builder.CreateBitCast(V: Vec, DestTy: FPVTy); |
1240 | auto *FPII = IC.Builder.CreateIntrinsic( |
1241 | ID: II.getIntrinsicID(), Types: {FPVec->getType()}, Args: {Pg, FPFallBack, FPVec}); |
1242 | Value *FPIItoInt = IC.Builder.CreateBitCast(V: FPII, DestTy: II.getType()); |
1243 | return IC.replaceInstUsesWith(I&: II, V: FPIItoInt); |
1244 | } |
1245 | |
1246 | static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC, |
1247 | IntrinsicInst &II) { |
1248 | LLVMContext &Ctx = II.getContext(); |
1249 | // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr |
1250 | // can work with RDFFR_PP for ptest elimination. |
1251 | auto *AllPat = |
1252 | ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all); |
1253 | auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, |
1254 | {II.getType()}, {AllPat}); |
1255 | auto *RDFFR = |
1256 | IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue}); |
1257 | RDFFR->takeName(&II); |
1258 | return IC.replaceInstUsesWith(I&: II, V: RDFFR); |
1259 | } |
1260 | |
1261 | static std::optional<Instruction *> |
1262 | instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { |
1263 | const auto Pattern = cast<ConstantInt>(Val: II.getArgOperand(i: 0))->getZExtValue(); |
1264 | |
1265 | if (Pattern == AArch64SVEPredPattern::all) { |
1266 | Constant *StepVal = ConstantInt::get(Ty: II.getType(), V: NumElts); |
1267 | auto *VScale = IC.Builder.CreateVScale(Scaling: StepVal); |
1268 | VScale->takeName(V: &II); |
1269 | return IC.replaceInstUsesWith(I&: II, V: VScale); |
1270 | } |
1271 | |
1272 | unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern); |
1273 | |
1274 | return MinNumElts && NumElts >= MinNumElts |
1275 | ? std::optional<Instruction *>(IC.replaceInstUsesWith( |
1276 | I&: II, V: ConstantInt::get(Ty: II.getType(), V: MinNumElts))) |
1277 | : std::nullopt; |
1278 | } |
1279 | |
1280 | static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC, |
1281 | IntrinsicInst &II) { |
1282 | Value *PgVal = II.getArgOperand(i: 0); |
1283 | Value *OpVal = II.getArgOperand(i: 1); |
1284 | |
1285 | // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X). |
1286 | // Later optimizations prefer this form. |
1287 | if (PgVal == OpVal && |
1288 | (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first || |
1289 | II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) { |
1290 | Value *Ops[] = {PgVal, OpVal}; |
1291 | Type *Tys[] = {PgVal->getType()}; |
1292 | |
1293 | auto *PTest = |
1294 | IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops); |
1295 | PTest->takeName(&II); |
1296 | |
1297 | return IC.replaceInstUsesWith(I&: II, V: PTest); |
1298 | } |
1299 | |
1300 | IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(Val: PgVal); |
1301 | IntrinsicInst *Op = dyn_cast<IntrinsicInst>(Val: OpVal); |
1302 | |
1303 | if (!Pg || !Op) |
1304 | return std::nullopt; |
1305 | |
1306 | Intrinsic::ID OpIID = Op->getIntrinsicID(); |
1307 | |
1308 | if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool && |
1309 | OpIID == Intrinsic::aarch64_sve_convert_to_svbool && |
1310 | Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) { |
1311 | Value *Ops[] = {Pg->getArgOperand(i: 0), Op->getArgOperand(i: 0)}; |
1312 | Type *Tys[] = {Pg->getArgOperand(i: 0)->getType()}; |
1313 | |
1314 | auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops); |
1315 | |
1316 | PTest->takeName(V: &II); |
1317 | return IC.replaceInstUsesWith(I&: II, V: PTest); |
1318 | } |
1319 | |
1320 | // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X). |
1321 | // Later optimizations may rewrite the sequence to use the flag-setting |
1322 | // variant of instruction X to remove the PTEST. |
1323 | if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) && |
1324 | ((OpIID == Intrinsic::aarch64_sve_brka_z) || |
1325 | (OpIID == Intrinsic::aarch64_sve_brkb_z) || |
1326 | (OpIID == Intrinsic::aarch64_sve_brkpa_z) || |
1327 | (OpIID == Intrinsic::aarch64_sve_brkpb_z) || |
1328 | (OpIID == Intrinsic::aarch64_sve_rdffr_z) || |
1329 | (OpIID == Intrinsic::aarch64_sve_and_z) || |
1330 | (OpIID == Intrinsic::aarch64_sve_bic_z) || |
1331 | (OpIID == Intrinsic::aarch64_sve_eor_z) || |
1332 | (OpIID == Intrinsic::aarch64_sve_nand_z) || |
1333 | (OpIID == Intrinsic::aarch64_sve_nor_z) || |
1334 | (OpIID == Intrinsic::aarch64_sve_orn_z) || |
1335 | (OpIID == Intrinsic::aarch64_sve_orr_z))) { |
1336 | Value *Ops[] = {Pg->getArgOperand(i: 0), Pg}; |
1337 | Type *Tys[] = {Pg->getType()}; |
1338 | |
1339 | auto *PTest = IC.Builder.CreateIntrinsic(ID: II.getIntrinsicID(), Types: Tys, Args: Ops); |
1340 | PTest->takeName(V: &II); |
1341 | |
1342 | return IC.replaceInstUsesWith(I&: II, V: PTest); |
1343 | } |
1344 | |
1345 | return std::nullopt; |
1346 | } |
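// Two illustrative ptest folds from above (names hypothetical):
//   ptest_any(to_svbool(%p), to_svbool(%x)) --> ptest_any(%p, %x)
//   %x = brka_z(%pg, %a); ptest_any(%x, %x) --> ptest_any(%pg, %x)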
1347 | |
1348 | template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc> |
1349 | static std::optional<Instruction *> |
1350 | instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, |
1351 | bool MergeIntoAddendOp) { |
1352 | Value *P = II.getOperand(i_nocapture: 0); |
1353 | Value *MulOp0, *MulOp1, *AddendOp, *Mul; |
1354 | if (MergeIntoAddendOp) { |
1355 | AddendOp = II.getOperand(i_nocapture: 1); |
1356 | Mul = II.getOperand(i_nocapture: 2); |
1357 | } else { |
1358 | AddendOp = II.getOperand(i_nocapture: 2); |
1359 | Mul = II.getOperand(i_nocapture: 1); |
1360 | } |
1361 | |
1362 | if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(V: P), m_Value(V&: MulOp0), |
1363 | m_Value(V&: MulOp1)))) |
1364 | return std::nullopt; |
1365 | |
1366 | if (!Mul->hasOneUse()) |
1367 | return std::nullopt; |
1368 | |
1369 | Instruction *FMFSource = nullptr; |
1370 | if (II.getType()->isFPOrFPVectorTy()) { |
1371 | llvm::FastMathFlags FAddFlags = II.getFastMathFlags(); |
1372 | // Stop the combine when the flags on the inputs differ, in case dropping |
1373 | // flags would cause us to miss more beneficial optimizations. |
1374 | if (FAddFlags != cast<CallInst>(Val: Mul)->getFastMathFlags()) |
1375 | return std::nullopt; |
1376 | if (!FAddFlags.allowContract()) |
1377 | return std::nullopt; |
1378 | FMFSource = &II; |
1379 | } |
1380 | |
1381 | CallInst *Res; |
1382 | if (MergeIntoAddendOp) |
1383 | Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()}, |
1384 | Args: {P, AddendOp, MulOp0, MulOp1}, FMFSource); |
1385 | else |
1386 | Res = IC.Builder.CreateIntrinsic(ID: FuseOpc, Types: {II.getType()}, |
1387 | Args: {P, MulOp0, MulOp1, AddendOp}, FMFSource); |
1388 | |
1389 | return IC.replaceInstUsesWith(I&: II, V: Res); |
1390 | } |
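// A minimal sketch of the fusion above for the fadd/fmul pair (IR names
// hypothetical; both calls must share fast-math flags, including
// 'contract', and the mul must have a single use):
//   %m = call ... @llvm.aarch64.sve.fmul(%pg, %b, %c)
//   %r = call ... @llvm.aarch64.sve.fadd(%pg, %a, %m)
// becomes
//   %r = call ... @llvm.aarch64.sve.fmla(%pg, %a, %b, %c)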
1391 | |
1392 | static std::optional<Instruction *> |
1393 | instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { |
1394 | Value *Pred = II.getOperand(i_nocapture: 0); |
1395 | Value *PtrOp = II.getOperand(i_nocapture: 1); |
1396 | Type *VecTy = II.getType(); |
1397 | |
1398 | if (isAllActivePredicate(Pred)) { |
1399 | LoadInst *Load = IC.Builder.CreateLoad(Ty: VecTy, Ptr: PtrOp); |
1400 | Load->copyMetadata(SrcInst: II); |
1401 | return IC.replaceInstUsesWith(I&: II, V: Load); |
1402 | } |
1403 | |
1404 | CallInst *MaskedLoad = |
1405 | IC.Builder.CreateMaskedLoad(Ty: VecTy, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), |
1406 | Mask: Pred, PassThru: ConstantAggregateZero::get(Ty: VecTy)); |
1407 | MaskedLoad->copyMetadata(SrcInst: II); |
1408 | return IC.replaceInstUsesWith(I&: II, V: MaskedLoad); |
1409 | } |
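// E.g. an ld1 whose governing predicate is all-active becomes a plain
// load (illustrative; any other predicate becomes a llvm.masked.load
// with a zeroinitializer passthru):
//   call <vscale x 4 x i32> @llvm.aarch64.sve.ld1.nxv4i32(<vscale x 4 x i1> %all, ptr %p)
//     --> load <vscale x 4 x i32>, ptr %p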
1410 | |
1411 | static std::optional<Instruction *> |
1412 | instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) { |
1413 | Value *VecOp = II.getOperand(i_nocapture: 0); |
1414 | Value *Pred = II.getOperand(i_nocapture: 1); |
1415 | Value *PtrOp = II.getOperand(i_nocapture: 2); |
1416 | |
1417 | if (isAllActivePredicate(Pred)) { |
1418 | StoreInst *Store = IC.Builder.CreateStore(Val: VecOp, Ptr: PtrOp); |
1419 | Store->copyMetadata(SrcInst: II); |
1420 | return IC.eraseInstFromFunction(I&: II); |
1421 | } |
1422 | |
1423 | CallInst *MaskedStore = IC.Builder.CreateMaskedStore( |
1424 | Val: VecOp, Ptr: PtrOp, Alignment: PtrOp->getPointerAlignment(DL), Mask: Pred); |
1425 | MaskedStore->copyMetadata(SrcInst: II); |
1426 | return IC.eraseInstFromFunction(I&: II); |
1427 | } |
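// Symmetrically to ld1 above: an all-active st1 becomes a plain store,
// and any other predicate becomes a llvm.masked.store with the alignment
// inferred from the pointer operand.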
1428 | |
1429 | static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) { |
1430 | switch (Intrinsic) { |
1431 | case Intrinsic::aarch64_sve_fmul_u: |
1432 | return Instruction::BinaryOps::FMul; |
1433 | case Intrinsic::aarch64_sve_fadd_u: |
1434 | return Instruction::BinaryOps::FAdd; |
1435 | case Intrinsic::aarch64_sve_fsub_u: |
1436 | return Instruction::BinaryOps::FSub; |
1437 | default: |
1438 | return Instruction::BinaryOpsEnd; |
1439 | } |
1440 | } |
1441 | |
1442 | static std::optional<Instruction *> |
1443 | instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) { |
1444 | // Bail due to missing support for ISD::STRICT_ scalable vector operations. |
1445 | if (II.isStrictFP()) |
1446 | return std::nullopt; |
1447 | |
1448 | auto *OpPredicate = II.getOperand(i_nocapture: 0); |
1449 | auto BinOpCode = intrinsicIDToBinOpCode(Intrinsic: II.getIntrinsicID()); |
1450 | if (BinOpCode == Instruction::BinaryOpsEnd || |
1451 | !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( |
1452 | m_ConstantInt<AArch64SVEPredPattern::all>()))) |
1453 | return std::nullopt; |
1454 | IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder); |
1455 | IC.Builder.setFastMathFlags(II.getFastMathFlags()); |
1456 | auto BinOp = |
1457 | IC.Builder.CreateBinOp(Opc: BinOpCode, LHS: II.getOperand(i_nocapture: 1), RHS: II.getOperand(i_nocapture: 2)); |
1458 | return IC.replaceInstUsesWith(I&: II, V: BinOp); |
1459 | } |
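// For example (illustrative), with an all-active ptrue predicate:
//   call ... @llvm.aarch64.sve.fmul.u(%all, %a, %b)
//     --> fmul %a, %b   ; the intrinsic's fast-math flags are preserved
// This only fires for the _u opcodes mapped by intrinsicIDToBinOpCode.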
1460 | |
1461 | // Canonicalise operations that take an all-active predicate (e.g. sve.add -> |
1462 | // sve.add_u). |
1463 | static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II, |
1464 | Intrinsic::ID IID) { |
1465 | auto *OpPredicate = II.getOperand(i_nocapture: 0); |
1466 | if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>( |
1467 | m_ConstantInt<AArch64SVEPredPattern::all>()))) |
1468 | return std::nullopt; |
1469 | |
1470 | auto *Mod = II.getModule(); |
1471 | auto *NewDecl = Intrinsic::getDeclaration(M: Mod, id: IID, Tys: {II.getType()}); |
1472 | II.setCalledFunction(NewDecl); |
1473 | |
1474 | return &II; |
1475 | } |
1476 | |
1477 | // Simplify operations where the predicate has all inactive lanes, or try to |
1478 | // replace with the _u form when all lanes are active. |
1479 | static std::optional<Instruction *> |
1480 | instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, |
1481 | Intrinsic::ID IID) { |
1482 | if (match(V: II.getOperand(i_nocapture: 0), P: m_ZeroInt())) { |
1483 | // llvm_ir, pred(0), op1, op2 - The spec says the merging form sv[func]_m |
1484 | // returns op1 when all lanes are inactive. |
1485 | return IC.replaceInstUsesWith(I&: II, V: II.getOperand(i_nocapture: 1)); |
1486 | } |
1487 | return instCombineSVEAllActive(II, IID); |
1488 | } |
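// Two illustrative outcomes of the helper above (names hypothetical):
//   sve.add(zeroinitializer, %a, %b) --> %a   ; merging form, no active lanes
//   sve.add(%ptrue_all, %a, %b)      --> sve.add_u(%ptrue_all, %a, %b)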
1489 | |
1490 | static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC, |
1491 | IntrinsicInst &II) { |
1492 | if (auto II_U = |
1493 | instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u)) |
1494 | return II_U; |
1495 | if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
1496 | Intrinsic::aarch64_sve_mla>( |
1497 | IC, II, true)) |
1498 | return MLA; |
1499 | if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
1500 | Intrinsic::aarch64_sve_mad>( |
1501 | IC, II, false)) |
1502 | return MAD; |
1503 | return std::nullopt; |
1504 | } |
1505 | |
1506 | static std::optional<Instruction *> |
1507 | instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) { |
1508 | if (auto II_U = |
1509 | instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u)) |
1510 | return II_U; |
1511 | if (auto FMLA = |
1512 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1513 | Intrinsic::aarch64_sve_fmla>(IC, II, |
1514 | true)) |
1515 | return FMLA; |
1516 | if (auto FMAD = |
1517 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1518 | Intrinsic::aarch64_sve_fmad>(IC, II, |
1519 | false)) |
1520 | return FMAD; |
1521 | if (auto FMLA = |
1522 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
1523 | Intrinsic::aarch64_sve_fmla>(IC, II, |
1524 | true)) |
1525 | return FMLA; |
1526 | return std::nullopt; |
1527 | } |
1528 | |
1529 | static std::optional<Instruction *> |
1530 | instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) { |
1531 | if (auto FMLA = |
1532 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1533 | Intrinsic::aarch64_sve_fmla>(IC, II, |
1534 | true)) |
1535 | return FMLA; |
1536 | if (auto FMAD = |
1537 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1538 | Intrinsic::aarch64_sve_fmad>(IC, II, |
1539 | false)) |
1540 | return FMAD; |
1541 | if (auto FMLA_U = |
1542 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
1543 | Intrinsic::aarch64_sve_fmla_u>( |
1544 | IC, II, true)) |
1545 | return FMLA_U; |
1546 | return instCombineSVEVectorBinOp(IC, II); |
1547 | } |
1548 | |
1549 | static std::optional<Instruction *> |
1550 | instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) { |
1551 | if (auto II_U = |
1552 | instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u)) |
1553 | return II_U; |
1554 | if (auto FMLS = |
1555 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1556 | Intrinsic::aarch64_sve_fmls>(IC, II, |
1557 | true)) |
1558 | return FMLS; |
1559 | if (auto FMSB = |
1560 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1561 | Intrinsic::aarch64_sve_fnmsb>( |
1562 | IC, II, false)) |
1563 | return FMSB; |
1564 | if (auto FMLS = |
1565 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
1566 | Intrinsic::aarch64_sve_fmls>(IC, II, |
1567 | true)) |
1568 | return FMLS; |
1569 | return std::nullopt; |
1570 | } |
1571 | |
1572 | static std::optional<Instruction *> |
1573 | instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) { |
1574 | if (auto FMLS = |
1575 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1576 | Intrinsic::aarch64_sve_fmls>(IC, II, |
1577 | true)) |
1578 | return FMLS; |
1579 | if (auto FMSB = |
1580 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul, |
1581 | Intrinsic::aarch64_sve_fnmsb>( |
1582 | IC, II, false)) |
1583 | return FMSB; |
1584 | if (auto FMLS_U = |
1585 | instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u, |
1586 | Intrinsic::aarch64_sve_fmls_u>( |
1587 | IC, II, true)) |
1588 | return FMLS_U; |
1589 | return instCombineSVEVectorBinOp(IC, II); |
1590 | } |
1591 | |
1592 | static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC, |
1593 | IntrinsicInst &II) { |
1594 | if (auto II_U = |
1595 | instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u)) |
1596 | return II_U; |
1597 | if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul, |
1598 | Intrinsic::aarch64_sve_mls>( |
1599 | IC, II, true)) |
1600 | return MLS; |
1601 | return std::nullopt; |
1602 | } |
1603 | |
1604 | static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC, |
1605 | IntrinsicInst &II, |
1606 | Intrinsic::ID IID) { |
1607 | auto *OpPredicate = II.getOperand(i_nocapture: 0); |
1608 | auto *OpMultiplicand = II.getOperand(i_nocapture: 1); |
1609 | auto *OpMultiplier = II.getOperand(i_nocapture: 2); |
1610 | |
1611 | // Return true if a given instruction is a unit splat value, false otherwise. |
1612 | auto IsUnitSplat = [](auto *I) { |
1613 | auto *SplatValue = getSplatValue(I); |
1614 | if (!SplatValue) |
1615 | return false; |
1616 | return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); |
1617 | }; |
1618 | |
1619 | // Return true if a given instruction is an aarch64_sve_dup intrinsic call |
1620 | // with a unit splat value, false otherwise. |
1621 | auto IsUnitDup = [](auto *I) { |
1622 | auto *IntrI = dyn_cast<IntrinsicInst>(I); |
1623 | if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup) |
1624 | return false; |
1625 | |
1626 | auto *SplatValue = IntrI->getOperand(2); |
1627 | return match(SplatValue, m_FPOne()) || match(SplatValue, m_One()); |
1628 | }; |
1629 | |
1630 | if (IsUnitSplat(OpMultiplier)) { |
1631 | // [f]mul pg %n, (dupx 1) => %n |
1632 | OpMultiplicand->takeName(V: &II); |
1633 | return IC.replaceInstUsesWith(I&: II, V: OpMultiplicand); |
1634 | } else if (IsUnitDup(OpMultiplier)) { |
1635 | // [f]mul pg %n, (dup pg 1) => %n |
1636 | auto *DupInst = cast<IntrinsicInst>(Val: OpMultiplier); |
1637 | auto *DupPg = DupInst->getOperand(i_nocapture: 1); |
1638 | // TODO: this is naive. The optimization is still valid if DupPg |
1639 | // 'encompasses' OpPredicate, not only if they're the same predicate. |
1640 | if (OpPredicate == DupPg) { |
1641 | OpMultiplicand->takeName(V: &II); |
1642 | return IC.replaceInstUsesWith(I&: II, V: OpMultiplicand); |
1643 | } |
1644 | } |
1645 | |
1646 | return instCombineSVEVectorBinOp(IC, II); |
1647 | } |
1648 | |
1649 | static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC, |
1650 | IntrinsicInst &II) { |
1651 | Value *UnpackArg = II.getArgOperand(i: 0); |
1652 | auto *RetTy = cast<ScalableVectorType>(Val: II.getType()); |
1653 | bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi || |
1654 | II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo; |
1655 | |
1656 | // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X)) |
1657 | // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X)) |
1658 | if (auto *ScalarArg = getSplatValue(V: UnpackArg)) { |
1659 | ScalarArg = |
1660 | IC.Builder.CreateIntCast(V: ScalarArg, DestTy: RetTy->getScalarType(), isSigned: IsSigned); |
1661 | Value *NewVal = |
1662 | IC.Builder.CreateVectorSplat(EC: RetTy->getElementCount(), V: ScalarArg); |
1663 | NewVal->takeName(V: &II); |
1664 | return IC.replaceInstUsesWith(I&: II, V: NewVal); |
1665 | } |
1666 | |
1667 | return std::nullopt; |
1668 | } |
1669 | static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC, |
1670 | IntrinsicInst &II) { |
1671 | auto *OpVal = II.getOperand(i_nocapture: 0); |
1672 | auto *OpIndices = II.getOperand(i_nocapture: 1); |
1673 | VectorType *VTy = cast<VectorType>(Val: II.getType()); |
1674 | |
1675 | // Check whether OpIndices is a constant splat value < minimal element count |
1676 | // of result. |
1677 | auto *SplatValue = dyn_cast_or_null<ConstantInt>(Val: getSplatValue(V: OpIndices)); |
1678 | if (!SplatValue || |
1679 | SplatValue->getValue().uge(RHS: VTy->getElementCount().getKnownMinValue())) |
1680 | return std::nullopt; |
1681 | |
1682 | // Convert sve_tbl(OpVal, sve_dup_x(SplatValue)) to |
1683 | // splat_vector(extractelement(OpVal, SplatValue)) for further optimization. |
1684 | auto *Extract = IC.Builder.CreateExtractElement(Vec: OpVal, Idx: SplatValue); |
1685 | auto *VectorSplat = |
1686 | IC.Builder.CreateVectorSplat(EC: VTy->getElementCount(), V: Extract); |
1687 | |
1688 | VectorSplat->takeName(V: &II); |
1689 | return IC.replaceInstUsesWith(I&: II, V: VectorSplat); |
1690 | } |
1691 | |
1692 | static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC, |
1693 | IntrinsicInst &II) { |
1694 | Value *A, *B; |
1695 | Type *RetTy = II.getType(); |
1696 | constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool; |
1697 | constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool; |
1698 | |
1699 | // uzp1(to_svbool(A), to_svbool(B)) --> <A, B> |
1700 | // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B> |
1701 | if ((match(II.getArgOperand(i: 0), |
1702 | m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(V&: A)))) && |
1703 | match(II.getArgOperand(i: 1), |
1704 | m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(V&: B))))) || |
1705 | (match(II.getArgOperand(i: 0), m_Intrinsic<ToSVB>(m_Value(V&: A))) && |
1706 | match(II.getArgOperand(i: 1), m_Intrinsic<ToSVB>(m_Value(V&: B))))) { |
1707 | auto *TyA = cast<ScalableVectorType>(Val: A->getType()); |
1708 | if (TyA == B->getType() && |
1709 | RetTy == ScalableVectorType::getDoubleElementsVectorType(VTy: TyA)) { |
1710 | auto *SubVec = IC.Builder.CreateInsertVector( |
1711 | DstType: RetTy, SrcVec: PoisonValue::get(T: RetTy), SubVec: A, Idx: IC.Builder.getInt64(C: 0)); |
1712 | auto *ConcatVec = IC.Builder.CreateInsertVector( |
1713 | DstType: RetTy, SrcVec: SubVec, SubVec: B, Idx: IC.Builder.getInt64(C: TyA->getMinNumElements())); |
1714 | ConcatVec->takeName(V: &II); |
1715 | return IC.replaceInstUsesWith(I&: II, V: ConcatVec); |
1716 | } |
1717 | } |
1718 | |
1719 | return std::nullopt; |
1720 | } |
1721 | |
1722 | static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC, |
1723 | IntrinsicInst &II) { |
1724 | // zip1(uzp1(A, B), uzp2(A, B)) --> A |
1725 | // zip2(uzp1(A, B), uzp2(A, B)) --> B |
1726 | Value *A, *B; |
1727 | if (match(II.getArgOperand(0), |
1728 | m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) && |
1729 | match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>( |
1730 | m_Specific(A), m_Specific(B)))) |
1731 | return IC.replaceInstUsesWith( |
1732 | II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B)); |
1733 | |
1734 | return std::nullopt; |
1735 | } |
1736 | |
1737 | static std::optional<Instruction *> |
1738 | instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) { |
1739 | Value *Mask = II.getOperand(i_nocapture: 0); |
1740 | Value *BasePtr = II.getOperand(i_nocapture: 1); |
1741 | Value *Index = II.getOperand(i_nocapture: 2); |
1742 | Type *Ty = II.getType(); |
1743 | Value *PassThru = ConstantAggregateZero::get(Ty); |
1744 | |
1745 | // Contiguous gather => masked load. |
1746 | // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1)) |
1747 | // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer) |
1748 | Value *IndexBase; |
1749 | if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( |
1750 | m_Value(IndexBase), m_SpecificInt(1)))) { |
1751 | Align Alignment = |
1752 | BasePtr->getPointerAlignment(DL: II.getModule()->getDataLayout()); |
1753 | |
1754 | Type *VecPtrTy = PointerType::getUnqual(ElementType: Ty); |
1755 | Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(), |
1756 | Ptr: BasePtr, IdxList: IndexBase); |
1757 | Ptr = IC.Builder.CreateBitCast(V: Ptr, DestTy: VecPtrTy); |
1758 | CallInst *MaskedLoad = |
1759 | IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru); |
1760 | MaskedLoad->takeName(V: &II); |
1761 | return IC.replaceInstUsesWith(I&: II, V: MaskedLoad); |
1762 | } |
1763 | |
1764 | return std::nullopt; |
1765 | } |
1766 | |
1767 | static std::optional<Instruction *> |
1768 | instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) { |
1769 | Value *Val = II.getOperand(i_nocapture: 0); |
1770 | Value *Mask = II.getOperand(i_nocapture: 1); |
1771 | Value *BasePtr = II.getOperand(i_nocapture: 2); |
1772 | Value *Index = II.getOperand(i_nocapture: 3); |
1773 | Type *Ty = Val->getType(); |
1774 | |
1775 | // Contiguous scatter => masked store. |
1776 | // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1)) |
1777 | // => (masked.store Value (gep BasePtr IndexBase) Align Mask) |
1778 | Value *IndexBase; |
1779 | if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>( |
1780 | m_Value(IndexBase), m_SpecificInt(1)))) { |
1781 | Align Alignment = |
1782 | BasePtr->getPointerAlignment(DL: II.getModule()->getDataLayout()); |
1783 | |
1784 | Value *Ptr = IC.Builder.CreateGEP(Ty: cast<VectorType>(Val: Ty)->getElementType(), |
1785 | Ptr: BasePtr, IdxList: IndexBase); |
1786 | Type *VecPtrTy = PointerType::getUnqual(ElementType: Ty); |
1787 | Ptr = IC.Builder.CreateBitCast(V: Ptr, DestTy: VecPtrTy); |
1788 | |
1789 | (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask); |
1790 | |
1791 | return IC.eraseInstFromFunction(I&: II); |
1792 | } |
1793 | |
1794 | return std::nullopt; |
1795 | } |
1796 | |
1797 | static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC, |
1798 | IntrinsicInst &II) { |
1799 | Type *Int32Ty = IC.Builder.getInt32Ty(); |
1800 | Value *Pred = II.getOperand(i_nocapture: 0); |
1801 | Value *Vec = II.getOperand(i_nocapture: 1); |
1802 | Value *DivVec = II.getOperand(i_nocapture: 2); |
1803 | |
1804 | Value *SplatValue = getSplatValue(V: DivVec); |
1805 | ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(Val: SplatValue); |
1806 | if (!SplatConstantInt) |
1807 | return std::nullopt; |
1808 | APInt Divisor = SplatConstantInt->getValue(); |
1809 | |
1810 | if (Divisor.isPowerOf2()) { |
1811 | Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2()); |
1812 | auto ASRD = IC.Builder.CreateIntrinsic( |
1813 | Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); |
1814 | return IC.replaceInstUsesWith(I&: II, V: ASRD); |
1815 | } |
1816 | if (Divisor.isNegatedPowerOf2()) { |
1817 | Divisor.negate(); |
1818 | Constant *DivisorLog2 = ConstantInt::get(Ty: Int32Ty, V: Divisor.logBase2()); |
1819 | auto ASRD = IC.Builder.CreateIntrinsic( |
1820 | Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2}); |
1821 | auto NEG = IC.Builder.CreateIntrinsic( |
1822 | Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD}); |
1823 | return IC.replaceInstUsesWith(I&: II, V: NEG); |
1824 | } |
1825 | |
1826 | return std::nullopt; |
1827 | } |
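// Worked example of the power-of-two paths above (IR names hypothetical):
//   sdiv(%pg, %x, splat(8))  --> asrd(%pg, %x, 3)   ; 8 == 1 << 3
//   sdiv(%pg, %x, splat(-8)) --> neg(%pg, asrd(%pg, %x, 3))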
1828 | |
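// Recursively checks whether the second half of Vec repeats the first half
// (treating nullptr entries as poison when AllowPoison is set) and, if so,
// shrinks Vec to the repeating prefix, e.g. (a, b, a, b) -> (a, b).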
1829 | static bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) { |
1830 | size_t VecSize = Vec.size(); |
1831 | if (VecSize == 1) |
1832 | return true; |
1833 | if (!isPowerOf2_64(Value: VecSize)) |
1834 | return false; |
1835 | size_t HalfVecSize = VecSize / 2; |
1836 | |
1837 | for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize; |
1838 | RHS != Vec.end(); LHS++, RHS++) { |
1839 | if (*LHS != nullptr && *RHS != nullptr) { |
1840 | if (*LHS == *RHS) |
1841 | continue; |
1842 | else |
1843 | return false; |
1844 | } |
1845 | if (!AllowPoison) |
1846 | return false; |
1847 | if (*LHS == nullptr && *RHS != nullptr) |
1848 | *LHS = *RHS; |
1849 | } |
1850 | |
1851 | Vec.resize(N: HalfVecSize); |
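// The kept half is already a valid pattern for the original vector, so the
// further simplification attempt is best-effort and its result is ignored.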
1852 | SimplifyValuePattern(Vec, AllowPoison); |
1853 | return true; |
1854 | } |
1855 | |
1856 | // Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B) |
1857 | // to dupqlane(f64(C)) where C is A concatenated with B |
1858 | static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC, |
1859 | IntrinsicInst &II) { |
1860 | Value *CurrentInsertElt = nullptr, *Default = nullptr; |
1861 | if (!match(II.getOperand(0), |
1862 | m_Intrinsic<Intrinsic::vector_insert>( |
1863 | m_Value(Default), m_Value(CurrentInsertElt), m_Value())) || |
1864 | !isa<FixedVectorType>(CurrentInsertElt->getType())) |
1865 | return std::nullopt; |
1866 | auto IIScalableTy = cast<ScalableVectorType>(Val: II.getType()); |
1867 | |
1868 | // Insert the scalars into a container ordered by InsertElement index |
1869 | SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr); |
1870 | while (auto InsertElt = dyn_cast<InsertElementInst>(Val: CurrentInsertElt)) { |
1871 | auto Idx = cast<ConstantInt>(Val: InsertElt->getOperand(i_nocapture: 2)); |
1872 | Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(i_nocapture: 1); |
1873 | CurrentInsertElt = InsertElt->getOperand(i_nocapture: 0); |
1874 | } |
1875 | |
1876 | bool AllowPoison = |
1877 | isa<PoisonValue>(Val: CurrentInsertElt) && isa<PoisonValue>(Val: Default); |
1878 | if (!SimplifyValuePattern(Vec&: Elts, AllowPoison)) |
1879 | return std::nullopt; |
1880 | |
1881 | // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b) |
1882 | Value *InsertEltChain = PoisonValue::get(T: CurrentInsertElt->getType()); |
1883 | for (size_t I = 0; I < Elts.size(); I++) { |
1884 | if (Elts[I] == nullptr) |
1885 | continue; |
1886 | InsertEltChain = IC.Builder.CreateInsertElement(Vec: InsertEltChain, NewElt: Elts[I], |
1887 | Idx: IC.Builder.getInt64(C: I)); |
1888 | } |
1889 | if (InsertEltChain == nullptr) |
1890 | return std::nullopt; |
1891 | |
1892 | // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64 |
1893 | // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector |
1894 | // be bitcast to a type wide enough to fit the sequence, be splatted, and then |
1895 | // be narrowed back to the original type. |
1896 | unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size(); |
1897 | unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() * |
1898 | IIScalableTy->getMinNumElements() / |
1899 | PatternWidth; |
1900 | |
1901 | IntegerType *WideTy = IC.Builder.getIntNTy(N: PatternWidth); |
1902 | auto *WideScalableTy = ScalableVectorType::get(ElementType: WideTy, MinNumElts: PatternElementCount); |
1903 | auto *WideShuffleMaskTy = |
1904 | ScalableVectorType::get(ElementType: IC.Builder.getInt32Ty(), MinNumElts: PatternElementCount); |
1905 | |
1906 | auto ZeroIdx = ConstantInt::get(Ty: IC.Builder.getInt64Ty(), V: APInt(64, 0)); |
1907 | auto InsertSubvector = IC.Builder.CreateInsertVector( |
1908 | DstType: II.getType(), SrcVec: PoisonValue::get(T: II.getType()), SubVec: InsertEltChain, Idx: ZeroIdx); |
1909 | auto WideBitcast = |
1910 | IC.Builder.CreateBitOrPointerCast(V: InsertSubvector, DestTy: WideScalableTy); |
1911 | auto WideShuffleMask = ConstantAggregateZero::get(Ty: WideShuffleMaskTy); |
1912 | auto WideShuffle = IC.Builder.CreateShuffleVector( |
1913 | V1: WideBitcast, V2: PoisonValue::get(T: WideScalableTy), Mask: WideShuffleMask); |
1914 | auto NarrowBitcast = |
1915 | IC.Builder.CreateBitOrPointerCast(V: WideShuffle, DestTy: II.getType()); |
1916 | |
1917 | return IC.replaceInstUsesWith(I&: II, V: NarrowBitcast); |
1918 | } |
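// Worked example of the widening arithmetic above: for a dupq of
// <vscale x 4 x float> simplified to the two-element pattern (a, b),
// PatternWidth = 32 * 2 = 64 and PatternElementCount = 32 * 4 / 64 = 2,
// so (a, b) is bitcast to a single i64 lane, splatted as
// <vscale x 2 x i64>, and bitcast back to <vscale x 4 x float>.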
1919 | |
1920 | static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC, |
1921 | IntrinsicInst &II) { |
1922 | Value *A = II.getArgOperand(i: 0); |
1923 | Value *B = II.getArgOperand(i: 1); |
1924 | if (A == B) |
1925 | return IC.replaceInstUsesWith(I&: II, V: A); |
1926 | |
1927 | return std::nullopt; |
1928 | } |
1929 | |
1930 | static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC, |
1931 | IntrinsicInst &II) { |
1932 | Value *Pred = II.getOperand(i_nocapture: 0); |
1933 | Value *Vec = II.getOperand(i_nocapture: 1); |
1934 | Value *Shift = II.getOperand(i_nocapture: 2); |
1935 | |
1936 | // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic. |
1937 | Value *AbsPred, *MergedValue; |
1938 | if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>( |
1939 | m_Value(MergedValue), m_Value(AbsPred), m_Value())) && |
1940 | !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>( |
1941 | m_Value(MergedValue), m_Value(AbsPred), m_Value()))) |
1942 | |
1943 | return std::nullopt; |
1944 | |
1945 | // Transform is valid if any of the following are true: |
1946 | // * The ABS merge value is an undef or non-negative |
1947 | // * The ABS predicate is all active |
1948 | // * The ABS predicate and the SRSHL predicates are the same |
1949 | if (!isa<UndefValue>(Val: MergedValue) && !match(V: MergedValue, P: m_NonNegative()) && |
1950 | AbsPred != Pred && !isAllActivePredicate(Pred: AbsPred)) |
1951 | return std::nullopt; |
1952 | |
1953 | // Only valid when the shift amount is non-negative, otherwise the rounding |
1954 | // behaviour of SRSHL cannot be ignored. |
1955 | if (!match(V: Shift, P: m_NonNegative())) |
1956 | return std::nullopt; |
1957 | |
1958 | auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, |
1959 | {II.getType()}, {Pred, Vec, Shift}); |
1960 | |
1961 | return IC.replaceInstUsesWith(I&: II, V: LSL); |
1962 | } |
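// Illustrative shape of the rewrite above (names hypothetical):
//   %v = abs(%pg, ...)                       ; input known non-negative
//   srshl(%pg, %v, splat(2)) --> lsl(%pg, %v, splat(2))
// The non-negative shift amount rules out SRSHL's rounding behaviour.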
1963 | |
1964 | std::optional<Instruction *> |
1965 | AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, |
1966 | IntrinsicInst &II) const { |
1967 | Intrinsic::ID IID = II.getIntrinsicID(); |
1968 | switch (IID) { |
1969 | default: |
1970 | break; |
1971 | case Intrinsic::aarch64_neon_fmaxnm: |
1972 | case Intrinsic::aarch64_neon_fminnm: |
1973 | return instCombineMaxMinNM(IC, II); |
1974 | case Intrinsic::aarch64_sve_convert_from_svbool: |
1975 | return instCombineConvertFromSVBool(IC, II); |
1976 | case Intrinsic::aarch64_sve_dup: |
1977 | return instCombineSVEDup(IC, II); |
1978 | case Intrinsic::aarch64_sve_dup_x: |
1979 | return instCombineSVEDupX(IC, II); |
1980 | case Intrinsic::aarch64_sve_cmpne: |
1981 | case Intrinsic::aarch64_sve_cmpne_wide: |
1982 | return instCombineSVECmpNE(IC, II); |
1983 | case Intrinsic::aarch64_sve_rdffr: |
1984 | return instCombineRDFFR(IC, II); |
1985 | case Intrinsic::aarch64_sve_lasta: |
1986 | case Intrinsic::aarch64_sve_lastb: |
1987 | return instCombineSVELast(IC, II); |
1988 | case Intrinsic::aarch64_sve_clasta_n: |
1989 | case Intrinsic::aarch64_sve_clastb_n: |
1990 | return instCombineSVECondLast(IC, II); |
1991 | case Intrinsic::aarch64_sve_cntd: |
1992 | return instCombineSVECntElts(IC, II, NumElts: 2); |
1993 | case Intrinsic::aarch64_sve_cntw: |
1994 | return instCombineSVECntElts(IC, II, NumElts: 4); |
1995 | case Intrinsic::aarch64_sve_cnth: |
1996 | return instCombineSVECntElts(IC, II, NumElts: 8); |
1997 | case Intrinsic::aarch64_sve_cntb: |
1998 | return instCombineSVECntElts(IC, II, NumElts: 16); |
1999 | case Intrinsic::aarch64_sve_ptest_any: |
2000 | case Intrinsic::aarch64_sve_ptest_first: |
2001 | case Intrinsic::aarch64_sve_ptest_last: |
2002 | return instCombineSVEPTest(IC, II); |
2003 | case Intrinsic::aarch64_sve_fabd: |
2004 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u); |
2005 | case Intrinsic::aarch64_sve_fadd: |
2006 | return instCombineSVEVectorFAdd(IC, II); |
2007 | case Intrinsic::aarch64_sve_fadd_u: |
2008 | return instCombineSVEVectorFAddU(IC, II); |
2009 | case Intrinsic::aarch64_sve_fdiv: |
2010 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u); |
2011 | case Intrinsic::aarch64_sve_fmax: |
2012 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u); |
2013 | case Intrinsic::aarch64_sve_fmaxnm: |
2014 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u); |
2015 | case Intrinsic::aarch64_sve_fmin: |
2016 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u); |
2017 | case Intrinsic::aarch64_sve_fminnm: |
2018 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u); |
2019 | case Intrinsic::aarch64_sve_fmla: |
2020 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u); |
2021 | case Intrinsic::aarch64_sve_fmls: |
2022 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u); |
2023 | case Intrinsic::aarch64_sve_fmul: |
2024 | if (auto II_U = |
2025 | instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u)) |
2026 | return II_U; |
2027 | return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u); |
2028 | case Intrinsic::aarch64_sve_fmul_u: |
2029 | return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u); |
2030 | case Intrinsic::aarch64_sve_fmulx: |
2031 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u); |
2032 | case Intrinsic::aarch64_sve_fnmla: |
2033 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u); |
2034 | case Intrinsic::aarch64_sve_fnmls: |
2035 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u); |
2036 | case Intrinsic::aarch64_sve_fsub: |
2037 | return instCombineSVEVectorFSub(IC, II); |
2038 | case Intrinsic::aarch64_sve_fsub_u: |
2039 | return instCombineSVEVectorFSubU(IC, II); |
2040 | case Intrinsic::aarch64_sve_add: |
2041 | return instCombineSVEVectorAdd(IC, II); |
2042 | case Intrinsic::aarch64_sve_add_u: |
2043 | return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, |
2044 | Intrinsic::aarch64_sve_mla_u>( |
2045 | IC, II, true); |
2046 | case Intrinsic::aarch64_sve_mla: |
2047 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u); |
2048 | case Intrinsic::aarch64_sve_mls: |
2049 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u); |
2050 | case Intrinsic::aarch64_sve_mul: |
2051 | if (auto II_U = |
2052 | instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u)) |
2053 | return II_U; |
2054 | return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u); |
2055 | case Intrinsic::aarch64_sve_mul_u: |
2056 | return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u); |
2057 | case Intrinsic::aarch64_sve_sabd: |
2058 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u); |
2059 | case Intrinsic::aarch64_sve_smax: |
2060 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u); |
2061 | case Intrinsic::aarch64_sve_smin: |
2062 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u); |
2063 | case Intrinsic::aarch64_sve_smulh: |
2064 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u); |
2065 | case Intrinsic::aarch64_sve_sub: |
2066 | return instCombineSVEVectorSub(IC, II); |
2067 | case Intrinsic::aarch64_sve_sub_u: |
2068 | return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u, |
2069 | Intrinsic::aarch64_sve_mls_u>( |
2070 | IC, II, true); |
2071 | case Intrinsic::aarch64_sve_uabd: |
2072 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u); |
2073 | case Intrinsic::aarch64_sve_umax: |
2074 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u); |
2075 | case Intrinsic::aarch64_sve_umin: |
2076 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u); |
2077 | case Intrinsic::aarch64_sve_umulh: |
2078 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u); |
2079 | case Intrinsic::aarch64_sve_asr: |
2080 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u); |
2081 | case Intrinsic::aarch64_sve_lsl: |
2082 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u); |
2083 | case Intrinsic::aarch64_sve_lsr: |
2084 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u); |
2085 | case Intrinsic::aarch64_sve_and: |
2086 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u); |
2087 | case Intrinsic::aarch64_sve_bic: |
2088 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u); |
2089 | case Intrinsic::aarch64_sve_eor: |
2090 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u); |
2091 | case Intrinsic::aarch64_sve_orr: |
2092 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u); |
2093 | case Intrinsic::aarch64_sve_sqsub: |
2094 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u); |
2095 | case Intrinsic::aarch64_sve_uqsub: |
2096 | return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u); |
2097 | case Intrinsic::aarch64_sve_tbl: |
2098 | return instCombineSVETBL(IC, II); |
2099 | case Intrinsic::aarch64_sve_uunpkhi: |
2100 | case Intrinsic::aarch64_sve_uunpklo: |
2101 | case Intrinsic::aarch64_sve_sunpkhi: |
2102 | case Intrinsic::aarch64_sve_sunpklo: |
2103 | return instCombineSVEUnpack(IC, II); |
2104 | case Intrinsic::aarch64_sve_uzp1: |
2105 | return instCombineSVEUzp1(IC, II); |
2106 | case Intrinsic::aarch64_sve_zip1: |
2107 | case Intrinsic::aarch64_sve_zip2: |
2108 | return instCombineSVEZip(IC, II); |
2109 | case Intrinsic::aarch64_sve_ld1_gather_index: |
2110 | return instCombineLD1GatherIndex(IC, II); |
2111 | case Intrinsic::aarch64_sve_st1_scatter_index: |
2112 | return instCombineST1ScatterIndex(IC, II); |
2113 | case Intrinsic::aarch64_sve_ld1: |
2114 | return instCombineSVELD1(IC, II, DL); |
2115 | case Intrinsic::aarch64_sve_st1: |
2116 | return instCombineSVEST1(IC, II, DL); |
2117 | case Intrinsic::aarch64_sve_sdiv: |
2118 | return instCombineSVESDIV(IC, II); |
2119 | case Intrinsic::aarch64_sve_sel: |
2120 | return instCombineSVESel(IC, II); |
2121 | case Intrinsic::aarch64_sve_srshl: |
2122 | return instCombineSVESrshl(IC, II); |
2123 | case Intrinsic::aarch64_sve_dupq_lane: |
2124 | return instCombineSVEDupqLane(IC, II); |
2125 | } |
2126 | |
2127 | return std::nullopt; |
2128 | } |
2129 | |
2130 | std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic( |
2131 | InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts, |
2132 | APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, |
2133 | std::function<void(Instruction *, unsigned, APInt, APInt &)> |
2134 | SimplifyAndSetOp) const { |
2135 | switch (II.getIntrinsicID()) { |
2136 | default: |
2137 | break; |
2138 | case Intrinsic::aarch64_neon_fcvtxn: |
2139 | case Intrinsic::aarch64_neon_rshrn: |
2140 | case Intrinsic::aarch64_neon_sqrshrn: |
2141 | case Intrinsic::aarch64_neon_sqrshrun: |
2142 | case Intrinsic::aarch64_neon_sqshrn: |
2143 | case Intrinsic::aarch64_neon_sqshrun: |
2144 | case Intrinsic::aarch64_neon_sqxtn: |
2145 | case Intrinsic::aarch64_neon_sqxtun: |
2146 | case Intrinsic::aarch64_neon_uqrshrn: |
2147 | case Intrinsic::aarch64_neon_uqshrn: |
2148 | case Intrinsic::aarch64_neon_uqxtn: |
2149 | SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts); |
2150 | break; |
2151 | } |
2152 | |
2153 | return std::nullopt; |
2154 | } |
2155 | |
2156 | TypeSize |
2157 | AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
2158 | switch (K) { |
2159 | case TargetTransformInfo::RGK_Scalar: |
2160 | return TypeSize::getFixed(ExactSize: 64); |
2161 | case TargetTransformInfo::RGK_FixedWidthVector: |
2162 | if (!ST->isNeonAvailable() && !EnableFixedwidthAutovecInStreamingMode) |
2163 | return TypeSize::getFixed(ExactSize: 0); |
2164 | |
2165 | if (ST->hasSVE()) |
2166 | return TypeSize::getFixed( |
2167 | ExactSize: std::max(a: ST->getMinSVEVectorSizeInBits(), b: 128u)); |
2168 | |
2169 | return TypeSize::getFixed(ExactSize: ST->hasNEON() ? 128 : 0); |
2170 | case TargetTransformInfo::RGK_ScalableVector: |
2171 | if (!ST->isSVEAvailable() && !EnableScalableAutovecInStreamingMode) |
2172 | return TypeSize::getScalable(MinimumSize: 0); |
2173 | |
2174 | return TypeSize::getScalable(MinimumSize: ST->hasSVE() ? 128 : 0); |
2175 | } |
2176 | llvm_unreachable("Unsupported register kind" ); |
2177 | } |
2178 | |
2179 | bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, |
2180 | ArrayRef<const Value *> Args, |
2181 | Type *SrcOverrideTy) { |
2182 | // A helper that returns a vector type from the given type. The number of |
2183 | // elements in type Ty determines the vector width. |
2184 | auto toVectorTy = [&](Type *ArgTy) { |
2185 | return VectorType::get(ElementType: ArgTy->getScalarType(), |
2186 | EC: cast<VectorType>(Val: DstTy)->getElementCount()); |
2187 | }; |
2188 | |
2189 | // Exit early if DstTy is not a vector type whose elements are one of [i16, |
2190 | // i32, i64]. SVE doesn't generally have the same set of instructions to |
2191 | // perform an extend with the add/sub/mul. There are SMULLB style |
2192 | // instructions, but they operate on top/bottom, requiring some sort of lane |
2193 | // interleaving to be used with zext/sext. |
2194 | unsigned DstEltSize = DstTy->getScalarSizeInBits(); |
2195 | if (!useNeonVector(Ty: DstTy) || Args.size() != 2 || |
2196 | (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) |
2197 | return false; |
2198 | |
2199 | // Determine if the operation has a widening variant. We consider both the |
2200 | // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the |
2201 | // instructions. |
2202 | // |
2203 | // TODO: Add additional widening operations (e.g., shl, etc.) once we |
2204 | // verify that their extending operands are eliminated during code |
2205 | // generation. |
2206 | Type *SrcTy = SrcOverrideTy; |
2207 | switch (Opcode) { |
2208 | case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). |
2209 | case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). |
2210 | // The second operand needs to be an extend |
2211 | if (isa<SExtInst>(Val: Args[1]) || isa<ZExtInst>(Val: Args[1])) { |
2212 | if (!SrcTy) |
2213 | SrcTy = |
2214 | toVectorTy(cast<Instruction>(Val: Args[1])->getOperand(i: 0)->getType()); |
2215 | } else |
2216 | return false; |
2217 | break; |
2218 | case Instruction::Mul: { // SMULL(2), UMULL(2) |
2219 | // Both operands need to be extends of the same type. |
2220 | if ((isa<SExtInst>(Val: Args[0]) && isa<SExtInst>(Val: Args[1])) || |
2221 | (isa<ZExtInst>(Val: Args[0]) && isa<ZExtInst>(Val: Args[1]))) { |
2222 | if (!SrcTy) |
2223 | SrcTy = |
2224 | toVectorTy(cast<Instruction>(Val: Args[0])->getOperand(i: 0)->getType()); |
2225 | } else if (isa<ZExtInst>(Val: Args[0]) || isa<ZExtInst>(Val: Args[1])) { |
2226 | // If one of the operands is a Zext and the other has enough zero bits to |
2227 | // be treated as unsigned, we can still generate a umull, meaning the zext |
2228 | // is free. |
2229 | KnownBits Known = |
2230 | computeKnownBits(V: isa<ZExtInst>(Val: Args[0]) ? Args[1] : Args[0], DL); |
2231 | if (Args[0]->getType()->getScalarSizeInBits() - |
2232 | Known.Zero.countLeadingOnes() > |
2233 | DstTy->getScalarSizeInBits() / 2) |
2234 | return false; |
2235 | if (!SrcTy) |
2236 | SrcTy = toVectorTy(Type::getIntNTy(C&: DstTy->getContext(), |
2237 | N: DstTy->getScalarSizeInBits() / 2)); |
2238 | } else |
2239 | return false; |
2240 | break; |
2241 | } |
2242 | default: |
2243 | return false; |
2244 | } |
2245 | |
2246 | // Legalize the destination type and ensure it can be used in a widening |
2247 | // operation. |
2248 | auto DstTyL = getTypeLegalizationCost(Ty: DstTy); |
2249 | if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits()) |
2250 | return false; |
2251 | |
2252 | // Legalize the source type and ensure it can be used in a widening |
2253 | // operation. |
2254 | assert(SrcTy && "Expected some SrcTy" ); |
2255 | auto SrcTyL = getTypeLegalizationCost(Ty: SrcTy); |
2256 | unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); |
2257 | if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) |
2258 | return false; |
2259 | |
2260 | // Get the total number of vector elements in the legalized types. |
2261 | InstructionCost NumDstEls = |
2262 | DstTyL.first * DstTyL.second.getVectorMinNumElements(); |
2263 | InstructionCost NumSrcEls = |
2264 | SrcTyL.first * SrcTyL.second.getVectorMinNumElements(); |
2265 | |
2266 | // Return true if the legalized types have the same number of vector elements |
2267 | // and the destination element type size is twice that of the source type. |
2268 | return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; |
2269 | } |
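// For instance, with DstTy == <8 x i16> an add whose second operand is
// (zext <8 x i8> %x to <8 x i16>) maps onto uaddw, so the zext is treated
// as free when getCastInstrCost below queries this helper.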
2270 | |
2271 | // s/urhadd instructions implement the following pattern, making the |
2272 | // extends free: |
2273 | // %x = add ((zext i8 -> i16), 1) |
2274 | // %y = (zext i8 -> i16) |
2275 | // trunc i16 (lshr (add %x, %y), 1) -> i8 |
2276 | // |
2277 | bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, |
2278 | Type *Src) { |
2279 | // The source should be a legal vector type. |
2280 | if (!Src->isVectorTy() || !TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: Src)) || |
2281 | (Src->isScalableTy() && !ST->hasSVE2())) |
2282 | return false; |
2283 | |
2284 | if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse()) |
2285 | return false; |
2286 | |
2287 | // Look for trunc/shl/add before trying to match the pattern. |
2288 | const Instruction *Add = ExtUser; |
2289 | auto *AddUser = |
2290 | dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser()); |
2291 | if (AddUser && AddUser->getOpcode() == Instruction::Add) |
2292 | Add = AddUser; |
2293 | |
2294 | auto *Shr = dyn_cast_or_null<Instruction>(Val: Add->getUniqueUndroppableUser()); |
2295 | if (!Shr || Shr->getOpcode() != Instruction::LShr) |
2296 | return false; |
2297 | |
2298 | auto *Trunc = dyn_cast_or_null<Instruction>(Val: Shr->getUniqueUndroppableUser()); |
2299 | if (!Trunc || Trunc->getOpcode() != Instruction::Trunc || |
2300 | Src->getScalarSizeInBits() != |
2301 | cast<CastInst>(Val: Trunc)->getDestTy()->getScalarSizeInBits()) |
2302 | return false; |
2303 | |
2304 | // Try to match the whole pattern. Ext could be either the first or second |
2305 | // m_ZExtOrSExt matched. |
2306 | Instruction *Ex1, *Ex2; |
2307 | if (!(match(V: Add, P: m_c_Add(L: m_Instruction(I&: Ex1), |
2308 | R: m_c_Add(L: m_Instruction(I&: Ex2), R: m_SpecificInt(V: 1)))))) |
2309 | return false; |
2310 | |
2311 | // Ensure both extends are of the same type |
2312 | if (match(V: Ex1, P: m_ZExtOrSExt(Op: m_Value())) && |
2313 | Ex1->getOpcode() == Ex2->getOpcode()) |
2314 | return true; |
2315 | |
2316 | return false; |
2317 | } |
2318 | |
2319 | InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, |
2320 | Type *Src, |
2321 | TTI::CastContextHint CCH, |
2322 | TTI::TargetCostKind CostKind, |
2323 | const Instruction *I) { |
2324 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
2325 | assert(ISD && "Invalid opcode" ); |
2326 | // If the cast is observable, and it is used by a widening instruction (e.g., |
2327 | // uaddl, saddw, etc.), it may be free. |
2328 | if (I && I->hasOneUser()) { |
2329 | auto *SingleUser = cast<Instruction>(Val: *I->user_begin()); |
2330 | SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); |
2331 | if (isWideningInstruction(DstTy: Dst, Opcode: SingleUser->getOpcode(), Args: Operands, SrcOverrideTy: Src)) { |
2332 | // For adds, only count the second operand as free if both operands are |
2333 | // extends but not the same operation (i.e. both operands are not free in |
2334 | // add(sext, zext)). |
2335 | if (SingleUser->getOpcode() == Instruction::Add) { |
2336 | if (I == SingleUser->getOperand(i: 1) || |
2337 | (isa<CastInst>(Val: SingleUser->getOperand(i: 1)) && |
2338 | cast<CastInst>(Val: SingleUser->getOperand(i: 1))->getOpcode() == Opcode)) |
2339 | return 0; |
2340 | } else // Others are free so long as isWideningInstruction returned true. |
2341 | return 0; |
2342 | } |
2343 | |
2344 | // The cast will be free for the s/urhadd instructions |
2345 | if ((isa<ZExtInst>(Val: I) || isa<SExtInst>(Val: I)) && |
2346 | isExtPartOfAvgExpr(ExtUser: SingleUser, Dst, Src)) |
2347 | return 0; |
2348 | } |
2349 | |
2350 | // TODO: Allow non-throughput costs that aren't binary. |
2351 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { |
2352 | if (CostKind != TTI::TCK_RecipThroughput) |
2353 | return Cost == 0 ? 0 : 1; |
2354 | return Cost; |
2355 | }; |
2356 | |
2357 | EVT SrcTy = TLI->getValueType(DL, Ty: Src); |
2358 | EVT DstTy = TLI->getValueType(DL, Ty: Dst); |
2359 | |
2360 | if (!SrcTy.isSimple() || !DstTy.isSimple()) |
2361 | return AdjustCost( |
2362 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); |
2363 | |
2364 | static const TypeConversionCostTblEntry |
2365 | ConversionTbl[] = { |
2366 | { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn |
2367 | { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn |
2368 | { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn |
2369 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn |
2370 | { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1 |
2371 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn |
2372 | { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn |
2373 | { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1 |
2374 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn |
2375 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn |
2376 | { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn |
2377 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1 |
2378 | { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1 |
2379 | { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1 |
2380 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1 |
2381 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1 |
2382 | { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1 |
2383 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1 |
2384 | { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1 |
2385 | { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1 |
2386 | |
2387 | // Truncations on nxvmiN |
2388 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 }, |
2389 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 }, |
2390 | { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 }, |
2391 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 }, |
2392 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 }, |
2393 | { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 }, |
2394 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 }, |
2395 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 }, |
2396 | { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 }, |
2397 | { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 }, |
2398 | { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 }, |
2399 | { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 }, |
2400 | { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 }, |
2401 | { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 }, |
2402 | { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 }, |
2403 | { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 }, |
2404 | |
2405 | // The number of shll instructions for the extension. |
2406 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, |
2407 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 }, |
2408 | { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, |
2409 | { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 }, |
2410 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, |
2411 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 }, |
2412 | { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, |
2413 | { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 }, |
2414 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, |
2415 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 }, |
2416 | { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, |
2417 | { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 }, |
2418 | { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, |
2419 | { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 }, |
2420 | { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, |
2421 | { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 }, |
2422 | |
2423 | // LowerVectorINT_TO_FP: |
2424 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, |
2425 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, |
2426 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, |
2427 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 }, |
2428 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 }, |
2429 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 }, |
2430 | |
2431 | // Complex: to v2f32 |
2432 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, |
2433 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, |
2434 | { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, |
2435 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 }, |
2436 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 }, |
2437 | { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 }, |
2438 | |
2439 | // Complex: to v4f32 |
2440 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 }, |
2441 | { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, |
2442 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 }, |
2443 | { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 }, |
2444 | |
2445 | // Complex: to v8f32 |
2446 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, |
2447 | { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, |
2448 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 }, |
2449 | { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 }, |
2450 | |
2451 | // Complex: to v16f32 |
2452 | { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, |
2453 | { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 }, |
2454 | |
2455 | // Complex: to v2f64 |
2456 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, |
2457 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, |
2458 | { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, |
2459 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 }, |
2460 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 }, |
2461 | { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 }, |
2462 | |
2463 | // Complex: to v4f64 |
2464 | { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 }, |
2465 | { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 }, |
2466 | |
2467 | // LowerVectorFP_TO_INT |
2468 | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 }, |
2469 | { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 }, |
2470 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 }, |
2471 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, |
2472 | { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, |
2473 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 }, |
2474 | |
2475 | // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext). |
2476 | { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 }, |
2477 | { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 }, |
2478 | { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 }, |
2479 | { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 }, |
2480 | { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 }, |
2481 | { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 }, |
2482 | |
2483 | // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2 |
2484 | { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 }, |
2485 | { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 }, |
2486 | { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 }, |
2487 | { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 }, |
2488 | |
2489 | // Complex, from nxv2f32. |
2490 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, |
2491 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, |
2492 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, |
2493 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, |
2494 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 }, |
2495 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 }, |
2496 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 }, |
2497 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 }, |
2498 | |
2499 | // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2. |
2500 | { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 }, |
2501 | { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 }, |
2502 | { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 }, |
2503 | { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 }, |
2504 | { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 }, |
2505 | { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 }, |
2506 | |
2507 | // Complex, from nxv2f64. |
2508 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, |
2509 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, |
2510 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, |
2511 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, |
2512 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 }, |
2513 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 }, |
2514 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 }, |
2515 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 }, |
2516 | |
2517 | // Complex, from nxv4f32. |
2518 | { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, |
2519 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, |
2520 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, |
2521 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, |
2522 | { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 }, |
2523 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 }, |
2524 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 }, |
2525 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 }, |
2526 | |
2527 | // Complex, from nxv8f64. Illegal -> illegal conversions not required. |
2528 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, |
2529 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, |
2530 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 }, |
2531 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 }, |
2532 | |
2533 | // Complex, from nxv4f64. Illegal -> illegal conversions not required. |
2534 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, |
2535 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, |
2536 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, |
2537 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 }, |
2538 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 }, |
2539 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 }, |
2540 | |
2541 | // Complex, from nxv8f32. Illegal -> illegal conversions not required. |
2542 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, |
2543 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, |
2544 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 }, |
2545 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 }, |
2546 | |
2547 | // Complex, from nxv8f16. |
2548 | { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, |
2549 | { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, |
2550 | { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, |
2551 | { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, |
2552 | { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 }, |
2553 | { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 }, |
2554 | { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 }, |
2555 | { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 }, |
2556 | |
2557 | // Complex, from nxv4f16. |
2558 | { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, |
2559 | { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, |
2560 | { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, |
2561 | { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, |
2562 | { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 }, |
2563 | { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 }, |
2564 | { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 }, |
2565 | { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 }, |
2566 | |
2567 | // Complex, from nxv2f16. |
2568 | { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, |
2569 | { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, |
2570 | { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, |
2571 | { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, |
2572 | { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 }, |
2573 | { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 }, |
2574 | { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 }, |
2575 | { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 }, |
2576 | |
2577 | // Truncate from nxvmf32 to nxvmf16. |
2578 | { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 }, |
2579 | { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 }, |
2580 | { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 }, |
2581 | |
2582 | // Truncate from nxvmf64 to nxvmf16. |
2583 | { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 }, |
2584 | { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 }, |
2585 | { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 }, |
2586 | |
2587 | // Truncate from nxvmf64 to nxvmf32. |
2588 | { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 }, |
2589 | { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 }, |
2590 | { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 }, |
2591 | |
2592 | // Extend from nxvmf16 to nxvmf32. |
2593 | { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1}, |
2594 | { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1}, |
2595 | { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2}, |
2596 | |
2597 | // Extend from nxvmf16 to nxvmf64. |
2598 | { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1}, |
2599 | { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2}, |
2600 | { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4}, |
2601 | |
2602 | // Extend from nxvmf32 to nxvmf64. |
2603 | { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1}, |
2604 | { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2}, |
2605 | { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6}, |
2606 | |
2607 | // Bitcasts from float to integer |
2608 | { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 }, |
2609 | { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 }, |
2610 | { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 }, |
2611 | |
2612 | // Bitcasts from integer to float |
2613 | { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 }, |
2614 | { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 }, |
2615 | { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 }, |
2616 | |
2617 | // Add cost for extending to illegal -too wide- scalable vectors. |
2618 | // zero/sign extend are implemented by multiple unpack operations, |
2619 | // where each operation has a cost of 1. |
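// For example, a zero extend from nxv16i8 to nxv16i16 is lowered to a
// uunpklo + uunpkhi pair, hence the cost of 2 in the first entry below.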
2620 | { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2}, |
2621 | { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6}, |
2622 | { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14}, |
2623 | { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2}, |
2624 | { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6}, |
2625 | { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2}, |
2626 | |
2627 | { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2}, |
2628 | { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6}, |
2629 | { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14}, |
2630 | { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2}, |
2631 | { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6}, |
2632 | { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2}, |
2633 | }; |
2634 | |
// We have to estimate the cost of a fixed-length operation on SVE
// registers by scaling it with the number of SVE registers required to
// represent the fixed-length type.
2638 | EVT WiderTy = SrcTy.bitsGT(VT: DstTy) ? SrcTy : DstTy; |
2639 | if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() && |
2640 | SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() && |
2641 | ST->useSVEForFixedLengthVectors(VT: WiderTy)) { |
2642 | std::pair<InstructionCost, MVT> LT = |
2643 | getTypeLegalizationCost(Ty: WiderTy.getTypeForEVT(Context&: Dst->getContext())); |
2644 | unsigned NumElements = AArch64::SVEBitsPerBlock / |
2645 | LT.second.getVectorElementType().getSizeInBits(); |
2646 | return AdjustCost( |
2647 | LT.first * |
2648 | getCastInstrCost( |
2649 | Opcode, Dst: ScalableVectorType::get(ElementType: Dst->getScalarType(), MinNumElts: NumElements), |
2650 | Src: ScalableVectorType::get(ElementType: Src->getScalarType(), MinNumElts: NumElements), CCH, |
2651 | CostKind, I)); |
2652 | } |
2653 | |
2654 | if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, |
2655 | DstTy.getSimpleVT(), |
2656 | SrcTy.getSimpleVT())) |
2657 | return AdjustCost(Entry->Cost); |
2658 | |
2659 | static const TypeConversionCostTblEntry FP16Tbl[] = { |
2660 | {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs |
2661 | {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1}, |
2662 | {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs |
2663 | {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1}, |
2664 | {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs |
2665 | {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2}, |
2666 | {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn |
2667 | {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2}, |
2668 | {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs |
2669 | {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1}, |
2670 | {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs |
2671 | {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4}, |
2672 | {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn |
2673 | {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3}, |
2674 | {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs |
2675 | {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2}, |
2676 | {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs |
2677 | {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8}, |
2678 | {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf |
2679 | {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf |
2680 | {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf |
2681 | {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf |
2682 | }; |
2683 | |
2684 | if (ST->hasFullFP16()) |
2685 | if (const auto *Entry = ConvertCostTableLookup( |
2686 | FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) |
2687 | return AdjustCost(Entry->Cost); |
2688 | |
2689 | if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && |
2690 | CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() && |
2691 | TLI->getTypeAction(Context&: Src->getContext(), VT: SrcTy) == |
2692 | TargetLowering::TypePromoteInteger && |
2693 | TLI->getTypeAction(Context&: Dst->getContext(), VT: DstTy) == |
2694 | TargetLowering::TypeSplitVector) { |
2695 | // The standard behaviour in the backend for these cases is to split the |
2696 | // extend up into two parts: |
2697 | // 1. Perform an extending load or masked load up to the legal type. |
2698 | // 2. Extend the loaded data to the final type. |
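// For example, a masked zero extend from nxv8i8 to nxv8i64 would be costed
// as a masked extending load up to the promoted legal type (here nxv8i16),
// plus the cost of extending nxv8i16 to nxv8i64.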
2699 | std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: Src); |
2700 | Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Context&: Src->getContext()); |
2701 | InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost( |
2702 | Opcode, Dst: LegalTy, Src, CCH, CostKind, I); |
2703 | InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost( |
2704 | Opcode, Dst, Src: LegalTy, CCH: TTI::CastContextHint::None, CostKind, I); |
2705 | return Part1 + Part2; |
2706 | } |
2707 | |
2708 | // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal, |
2709 | // but we also want to include the TTI::CastContextHint::Masked case too. |
2710 | if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) && |
2711 | CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() && |
2712 | TLI->isTypeLegal(VT: DstTy)) |
2713 | CCH = TTI::CastContextHint::Normal; |
2714 | |
2715 | return AdjustCost( |
2716 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); |
2717 | } |
2718 | |
InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
                                                         Type *Dst,
                                                         VectorType *VecTy,
                                                         unsigned Index) {
2723 | |
2724 | // Make sure we were given a valid extend opcode. |
2725 | assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && |
2726 | "Invalid opcode" ); |
2727 | |
2728 | // We are extending an element we extract from a vector, so the source type |
2729 | // of the extend is the element type of the vector. |
2730 | auto *Src = VecTy->getElementType(); |
2731 | |
2732 | // Sign- and zero-extends are for integer types only. |
2733 | assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type" ); |
2734 | |
2735 | // Get the cost for the extract. We compute the cost (if any) for the extend |
2736 | // below. |
2737 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
2738 | InstructionCost Cost = getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, |
2739 | CostKind, Index, Op0: nullptr, Op1: nullptr); |
2740 | |
2741 | // Legalize the types. |
2742 | auto VecLT = getTypeLegalizationCost(Ty: VecTy); |
2743 | auto DstVT = TLI->getValueType(DL, Ty: Dst); |
2744 | auto SrcVT = TLI->getValueType(DL, Ty: Src); |
2745 | |
2746 | // If the resulting type is still a vector and the destination type is legal, |
2747 | // we may get the extension for free. If not, get the default cost for the |
2748 | // extend. |
2749 | if (!VecLT.second.isVector() || !TLI->isTypeLegal(VT: DstVT)) |
2750 | return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None, |
2751 | CostKind); |
2752 | |
2753 | // The destination type should be larger than the element type. If not, get |
2754 | // the default cost for the extend. |
2755 | if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits()) |
2756 | return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None, |
2757 | CostKind); |
2758 | |
2759 | switch (Opcode) { |
2760 | default: |
2761 | llvm_unreachable("Opcode should be either SExt or ZExt" ); |
2762 | |
2763 | // For sign-extends, we only need a smov, which performs the extension |
2764 | // automatically. |
2765 | case Instruction::SExt: |
2766 | return Cost; |
2767 | |
2768 | // For zero-extends, the extend is performed automatically by a umov unless |
2769 | // the destination type is i64 and the element type is i8 or i16. |
2770 | case Instruction::ZExt: |
2771 | if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u) |
2772 | return Cost; |
2773 | } |
2774 | |
2775 | // If we are unable to perform the extend for free, get the default cost. |
2776 | return Cost + getCastInstrCost(Opcode, Dst, Src, CCH: TTI::CastContextHint::None, |
2777 | CostKind); |
2778 | } |
2779 | |
2780 | InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode, |
2781 | TTI::TargetCostKind CostKind, |
2782 | const Instruction *I) { |
2783 | if (CostKind != TTI::TCK_RecipThroughput) |
2784 | return Opcode == Instruction::PHI ? 0 : 1; |
2785 | assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind" ); |
2786 | // Branches are assumed to be predicted. |
2787 | return 0; |
2788 | } |
2789 | |
2790 | InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I, |
2791 | Type *Val, |
2792 | unsigned Index, |
2793 | bool HasRealUse) { |
2794 | assert(Val->isVectorTy() && "This must be a vector type" ); |
2795 | |
2796 | if (Index != -1U) { |
2797 | // Legalize the type. |
2798 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val); |
2799 | |
2800 | // This type is legalized to a scalar type. |
2801 | if (!LT.second.isVector()) |
2802 | return 0; |
2803 | |
2804 | // The type may be split. For fixed-width vectors we can normalize the |
2805 | // index to the new type. |
2806 | if (LT.second.isFixedLengthVector()) { |
2807 | unsigned Width = LT.second.getVectorNumElements(); |
2808 | Index = Index % Width; |
2809 | } |
2810 | |
2811 | // The element at index zero is already inside the vector. |
2812 | // - For a physical (HasRealUse==true) insert-element or extract-element |
2813 | // instruction that extracts integers, an explicit FPR -> GPR move is |
2814 | // needed. So it has non-zero cost. |
2815 | // - For the rest of cases (virtual instruction or element type is float), |
2816 | // consider the instruction free. |
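// For example, extracting lane 0 of a float vector is free because the
// scalar already lives in the low bits of the vector register, whereas
// extracting an integer lane 0 for real use still needs an FPR -> GPR
// move such as "fmov w0, s0".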
2817 | if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy())) |
2818 | return 0; |
2819 | |
// This is recognising an LD1 (load one single-element structure to one
// lane of one register) instruction. I.e., if this is an `insertelement`
// instruction and its second operand is a load, then we will generate an
// LD1, which is a relatively expensive instruction.
if (I && isa<LoadInst>(Val: I->getOperand(i: 1)))
return ST->getVectorInsertExtractBaseCost() + 1;
2826 | |
// i1 inserts and extracts will include an extra cset or cmp of the vector
// value. Increase the cost by 1 to account for this.
2829 | if (Val->getScalarSizeInBits() == 1) |
2830 | return ST->getVectorInsertExtractBaseCost() + 1; |
2831 | |
2832 | // FIXME: |
2833 | // If the extract-element and insert-element instructions could be |
2834 | // simplified away (e.g., could be combined into users by looking at use-def |
2835 | // context), they have no cost. This is not done in the first place for |
2836 | // compile-time considerations. |
2837 | } |
2838 | |
2839 | // All other insert/extracts cost this much. |
2840 | return ST->getVectorInsertExtractBaseCost(); |
2841 | } |
2842 | |
2843 | InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, |
2844 | TTI::TargetCostKind CostKind, |
2845 | unsigned Index, Value *Op0, |
2846 | Value *Op1) { |
2847 | bool HasRealUse = |
2848 | Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Val: Op0); |
2849 | return getVectorInstrCostHelper(I: nullptr, Val, Index, HasRealUse); |
2850 | } |
2851 | |
2852 | InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, |
2853 | Type *Val, |
2854 | TTI::TargetCostKind CostKind, |
2855 | unsigned Index) { |
2856 | return getVectorInstrCostHelper(I: &I, Val, Index, HasRealUse: true /* HasRealUse */); |
2857 | } |
2858 | |
2859 | InstructionCost AArch64TTIImpl::getScalarizationOverhead( |
VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
2861 | TTI::TargetCostKind CostKind) { |
2862 | if (isa<ScalableVectorType>(Val: Ty)) |
2863 | return InstructionCost::getInvalid(); |
2864 | if (Ty->getElementType()->isFloatingPointTy()) |
2865 | return BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, Extract, |
2866 | CostKind); |
2867 | return DemandedElts.popcount() * (Insert + Extract) * |
2868 | ST->getVectorInsertExtractBaseCost(); |
2869 | } |
2870 | |
2871 | InstructionCost AArch64TTIImpl::getArithmeticInstrCost( |
2872 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
2873 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
2874 | ArrayRef<const Value *> Args, |
2875 | const Instruction *CxtI) { |
2876 | |
2877 | // TODO: Handle more cost kinds. |
2878 | if (CostKind != TTI::TCK_RecipThroughput) |
2879 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
2880 | Opd2Info: Op2Info, Args, CxtI); |
2881 | |
2882 | // Legalize the type. |
2883 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
2884 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
2885 | |
2886 | switch (ISD) { |
2887 | default: |
2888 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
2889 | Opd2Info: Op2Info); |
2890 | case ISD::SDIV: |
2891 | if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) { |
// On AArch64, scalar signed division by a power-of-two constant is
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
// The OperandValue properties may not be the same as those of the
// previous operation; conservatively assume OP_None.
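// For example, "sdiv i32 %x, 4" is typically lowered to a sequence
// roughly like:
//   add  w8, w0, #3
//   cmp  w0, #0
//   csel w8, w8, w0, lt
//   asr  w0, w8, #2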
2896 | InstructionCost Cost = getArithmeticInstrCost( |
2897 | Opcode: Instruction::Add, Ty, CostKind, |
2898 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
2899 | Cost += getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind, |
2900 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
2901 | Cost += getArithmeticInstrCost( |
2902 | Opcode: Instruction::Select, Ty, CostKind, |
2903 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
2904 | Cost += getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind, |
2905 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
2906 | return Cost; |
2907 | } |
2908 | [[fallthrough]]; |
2909 | case ISD::UDIV: { |
2910 | if (Op2Info.isConstant() && Op2Info.isUniform()) { |
2911 | auto VT = TLI->getValueType(DL, Ty); |
2912 | if (TLI->isOperationLegalOrCustom(Op: ISD::MULHU, VT)) { |
// Vector signed division by a constant is expanded to the
// sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
// to MULHU + SUB + SRL + ADD + SRL.
2916 | InstructionCost MulCost = getArithmeticInstrCost( |
2917 | Opcode: Instruction::Mul, Ty, CostKind, Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
2918 | InstructionCost AddCost = getArithmeticInstrCost( |
2919 | Opcode: Instruction::Add, Ty, CostKind, Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
2920 | InstructionCost ShrCost = getArithmeticInstrCost( |
2921 | Opcode: Instruction::AShr, Ty, CostKind, Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
2922 | return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; |
2923 | } |
2924 | } |
2925 | |
2926 | InstructionCost Cost = BaseT::getArithmeticInstrCost( |
2927 | Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info); |
2928 | if (Ty->isVectorTy()) { |
2929 | if (TLI->isOperationLegalOrCustom(Op: ISD, VT: LT.second) && ST->hasSVE()) { |
// If the SDIV/UDIV operation is lowered using SVE, the cost can be
// lower.
2932 | if (isa<FixedVectorType>(Val: Ty) && cast<FixedVectorType>(Val: Ty) |
2933 | ->getPrimitiveSizeInBits() |
2934 | .getFixedValue() < 128) { |
2935 | EVT VT = TLI->getValueType(DL, Ty); |
2936 | static const CostTblEntry DivTbl[]{ |
2937 | {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8}, |
2938 | {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5}, |
2939 | {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1}, |
2940 | {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8}, |
2941 | {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5}, |
2942 | {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}}; |
2943 | |
2944 | const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT()); |
2945 | if (nullptr != Entry) |
2946 | return Entry->Cost; |
2947 | } |
2948 | // For 8/16-bit elements, the cost is higher because the type |
2949 | // requires promotion and possibly splitting: |
2950 | if (LT.second.getScalarType() == MVT::i8) |
2951 | Cost *= 8; |
2952 | else if (LT.second.getScalarType() == MVT::i16) |
2953 | Cost *= 4; |
2954 | return Cost; |
2955 | } else { |
// If one of the operands is a uniform constant then the cost for each
// element is the cost of insertion, extraction and the division
// itself: insertion cost = 2, extraction cost = 2, and the division
// costs the same as the operation on the scalar type.
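// For example, a v4i32 division by a uniform constant with a scalar
// division cost of 1 would be costed as (4 + 1) * 4 = 20.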
2960 | if ((Op1Info.isConstant() && Op1Info.isUniform()) || |
2961 | (Op2Info.isConstant() && Op2Info.isUniform())) { |
2962 | if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) { |
2963 | InstructionCost DivCost = BaseT::getArithmeticInstrCost( |
2964 | Opcode, Ty: Ty->getScalarType(), CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info); |
2965 | return (4 + DivCost) * VTy->getNumElements(); |
2966 | } |
2967 | } |
2968 | // On AArch64, without SVE, vector divisions are expanded |
2969 | // into scalar divisions of each pair of elements. |
2970 | Cost += getArithmeticInstrCost(Opcode: Instruction::ExtractElement, Ty, |
2971 | CostKind, Op1Info, Op2Info); |
2972 | Cost += getArithmeticInstrCost(Opcode: Instruction::InsertElement, Ty, CostKind, |
2973 | Op1Info, Op2Info); |
2974 | } |
2975 | |
2976 | // TODO: if one of the arguments is scalar, then it's not necessary to |
2977 | // double the cost of handling the vector elements. |
2978 | Cost += Cost; |
2979 | } |
2980 | return Cost; |
2981 | } |
2982 | case ISD::MUL: |
2983 | // When SVE is available, then we can lower the v2i64 operation using |
2984 | // the SVE mul instruction, which has a lower cost. |
2985 | if (LT.second == MVT::v2i64 && ST->hasSVE()) |
2986 | return LT.first; |
2987 | |
// When SVE is not available, there is no MUL.2d instruction,
// which means mul <2 x i64> is expensive as elements are extracted
// from the vectors and the muls scalarized.
// As getScalarizationOverhead is a bit too pessimistic, we
// estimate the cost for a i64 vector directly here, which is:
// - four 2-cost i64 extracts,
// - two 2-cost i64 inserts, and
// - two 1-cost muls,
// i.e. 4*2 + 2*2 + 2*1 = 14 per legalized v2i64.
// So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
// LT.first = 2 the cost is 28. If both operands are extensions it will not
// need to scalarize, so the cost can be cheaper (smull or umull).
3000 | if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) |
3001 | return LT.first; |
3002 | return LT.first * 14; |
3003 | case ISD::ADD: |
3004 | case ISD::XOR: |
3005 | case ISD::OR: |
3006 | case ISD::AND: |
3007 | case ISD::SRL: |
3008 | case ISD::SRA: |
3009 | case ISD::SHL: |
3010 | // These nodes are marked as 'custom' for combining purposes only. |
3011 | // We know that they are legal. See LowerAdd in ISelLowering. |
3012 | return LT.first; |
3013 | |
3014 | case ISD::FNEG: |
3015 | case ISD::FADD: |
3016 | case ISD::FSUB: |
3017 | // Increase the cost for half and bfloat types if not architecturally |
3018 | // supported. |
3019 | if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) || |
3020 | (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16())) |
3021 | return 2 * LT.first; |
3022 | if (!Ty->getScalarType()->isFP128Ty()) |
3023 | return LT.first; |
3024 | [[fallthrough]]; |
3025 | case ISD::FMUL: |
3026 | case ISD::FDIV: |
3027 | // These nodes are marked as 'custom' just to lower them to SVE. |
3028 | // We know said lowering will incur no additional cost. |
3029 | if (!Ty->getScalarType()->isFP128Ty()) |
3030 | return 2 * LT.first; |
3031 | |
3032 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
3033 | Opd2Info: Op2Info); |
3034 | case ISD::FREM: |
3035 | // Pass nullptr as fmod/fmodf calls are emitted by the backend even when |
3036 | // those functions are not declared in the module. |
3037 | if (!Ty->isVectorTy()) |
3038 | return getCallInstrCost(/*Function*/ F: nullptr, RetTy: Ty, Tys: {Ty, Ty}, CostKind); |
3039 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
3040 | Opd2Info: Op2Info); |
3041 | } |
3042 | } |
3043 | |
3044 | InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty, |
3045 | ScalarEvolution *SE, |
3046 | const SCEV *Ptr) { |
3047 | // Address computations in vectorized code with non-consecutive addresses will |
3048 | // likely result in more instructions compared to scalar code where the |
3049 | // computation can more often be merged into the index mode. The resulting |
3050 | // extra micro-ops can significantly decrease throughput. |
3051 | unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead; |
3052 | int MaxMergeDistance = 64; |
3053 | |
3054 | if (Ty->isVectorTy() && SE && |
3055 | !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MergeDistance: MaxMergeDistance + 1)) |
3056 | return NumVectorInstToHideOverhead; |
3057 | |
3058 | // In many cases the address computation is not merged into the instruction |
3059 | // addressing mode. |
3060 | return 1; |
3061 | } |
3062 | |
3063 | InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, |
3064 | Type *CondTy, |
3065 | CmpInst::Predicate VecPred, |
3066 | TTI::TargetCostKind CostKind, |
3067 | const Instruction *I) { |
3068 | // TODO: Handle other cost kinds. |
3069 | if (CostKind != TTI::TCK_RecipThroughput) |
3070 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
3071 | I); |
3072 | |
3073 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
// Some vector selects that are wider than the register width are not
// lowered well.
3076 | if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SELECT) { |
3077 | // We would need this many instructions to hide the scalarization happening. |
3078 | const int AmortizationCost = 20; |
3079 | |
3080 | // If VecPred is not set, check if we can get a predicate from the context |
3081 | // instruction, if its type matches the requested ValTy. |
3082 | if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) { |
3083 | CmpInst::Predicate CurrentPred; |
3084 | if (match(V: I, P: m_Select(C: m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value()), L: m_Value(), |
3085 | R: m_Value()))) |
3086 | VecPred = CurrentPred; |
3087 | } |
3088 | // Check if we have a compare/select chain that can be lowered using |
3089 | // a (F)CMxx & BFI pair. |
3090 | if (CmpInst::isIntPredicate(P: VecPred) || VecPred == CmpInst::FCMP_OLE || |
3091 | VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT || |
3092 | VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ || |
3093 | VecPred == CmpInst::FCMP_UNE) { |
3094 | static const auto ValidMinMaxTys = { |
3095 | MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32, |
3096 | MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64}; |
3097 | static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16}; |
3098 | |
3099 | auto LT = getTypeLegalizationCost(Ty: ValTy); |
if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
(ST->hasFullFP16() &&
any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3103 | return LT.first; |
3104 | } |
3105 | |
3106 | static const TypeConversionCostTblEntry |
3107 | VectorSelectTbl[] = { |
3108 | { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 }, |
3109 | { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 }, |
3110 | { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 }, |
3111 | { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 }, |
3112 | { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 }, |
3113 | { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 }, |
3114 | { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 }, |
3115 | { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 }, |
3116 | { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost }, |
3117 | { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost }, |
3118 | { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost } |
3119 | }; |
3120 | |
3121 | EVT SelCondTy = TLI->getValueType(DL, Ty: CondTy); |
3122 | EVT SelValTy = TLI->getValueType(DL, Ty: ValTy); |
3123 | if (SelCondTy.isSimple() && SelValTy.isSimple()) { |
3124 | if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD, |
3125 | SelCondTy.getSimpleVT(), |
3126 | SelValTy.getSimpleVT())) |
3127 | return Entry->Cost; |
3128 | } |
3129 | } |
3130 | |
3131 | if (isa<FixedVectorType>(Val: ValTy) && ISD == ISD::SETCC) { |
3132 | auto LT = getTypeLegalizationCost(Ty: ValTy); |
3133 | // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back. |
3134 | if (LT.second == MVT::v4f16 && !ST->hasFullFP16()) |
3135 | return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn |
3136 | } |
3137 | |
3138 | // Treat the icmp in icmp(and, 0) as free, as we can make use of ands. |
3139 | // FIXME: This can apply to more conditions and add/sub if it can be shown to |
3140 | // be profitable. |
3141 | if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I && |
3142 | ICmpInst::isEquality(P: VecPred) && |
3143 | TLI->isTypeLegal(VT: TLI->getValueType(DL, Ty: ValTy)) && |
3144 | match(V: I->getOperand(i: 1), P: m_Zero()) && |
3145 | match(V: I->getOperand(i: 0), P: m_And(L: m_Value(), R: m_Value()))) |
3146 | return 0; |
3147 | |
3148 | // The base case handles scalable vectors fine for now, since it treats the |
3149 | // cost as 1 * legalization cost. |
3150 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); |
3151 | } |
3152 | |
3153 | AArch64TTIImpl::TTI::MemCmpExpansionOptions |
3154 | AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { |
3155 | TTI::MemCmpExpansionOptions Options; |
3156 | if (ST->requiresStrictAlign()) { |
3157 | // TODO: Add cost modeling for strict align. Misaligned loads expand to |
3158 | // a bunch of instructions when strict align is enabled. |
3159 | return Options; |
3160 | } |
3161 | Options.AllowOverlappingLoads = true; |
3162 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); |
3163 | Options.NumLoadsPerBlock = Options.MaxNumLoads; |
3164 | // TODO: Though vector loads usually perform well on AArch64, in some targets |
3165 | // they may wake up the FP unit, which raises the power consumption. Perhaps |
3166 | // they could be used with no holds barred (-O3). |
3167 | Options.LoadSizes = {8, 4, 2, 1}; |
3168 | Options.AllowedTailExpansions = {3, 5, 6}; |
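// With these load sizes a 15-byte memcmp expands into 8+4+2+1 byte loads,
// and overlapping loads allow e.g. a 7-byte compare to use two overlapping
// 4-byte loads.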
3169 | return Options; |
3170 | } |
3171 | |
3172 | bool AArch64TTIImpl::prefersVectorizedAddressing() const { |
3173 | return ST->hasSVE(); |
3174 | } |
3175 | |
3176 | InstructionCost |
3177 | AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, |
3178 | Align Alignment, unsigned AddressSpace, |
3179 | TTI::TargetCostKind CostKind) { |
3180 | if (useNeonVector(Ty: Src)) |
3181 | return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace, |
3182 | CostKind); |
3183 | auto LT = getTypeLegalizationCost(Ty: Src); |
3184 | if (!LT.first.isValid()) |
3185 | return InstructionCost::getInvalid(); |
3186 | |
3187 | // The code-generator is currently not able to handle scalable vectors |
3188 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
3189 | // it. This change will be removed when code-generation for these types is |
3190 | // sufficiently reliable. |
3191 | if (cast<VectorType>(Val: Src)->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
3192 | return InstructionCost::getInvalid(); |
3193 | |
3194 | return LT.first; |
3195 | } |
3196 | |
3197 | static unsigned getSVEGatherScatterOverhead(unsigned Opcode) { |
3198 | return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead; |
3199 | } |
3200 | |
3201 | InstructionCost AArch64TTIImpl::getGatherScatterOpCost( |
3202 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, |
3203 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { |
3204 | if (useNeonVector(Ty: DataTy) || !isLegalMaskedGatherScatter(DataType: DataTy)) |
3205 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, |
3206 | Alignment, CostKind, I); |
3207 | auto *VT = cast<VectorType>(Val: DataTy); |
3208 | auto LT = getTypeLegalizationCost(Ty: DataTy); |
3209 | if (!LT.first.isValid()) |
3210 | return InstructionCost::getInvalid(); |
3211 | |
3212 | if (!LT.second.isVector() || |
3213 | !isElementTypeLegalForScalableVector(Ty: VT->getElementType())) |
3214 | return InstructionCost::getInvalid(); |
3215 | |
3216 | // The code-generator is currently not able to handle scalable vectors |
3217 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
3218 | // it. This change will be removed when code-generation for these types is |
3219 | // sufficiently reliable. |
3220 | if (cast<VectorType>(Val: DataTy)->getElementCount() == |
3221 | ElementCount::getScalable(MinVal: 1)) |
3222 | return InstructionCost::getInvalid(); |
3223 | |
3224 | ElementCount LegalVF = LT.second.getVectorElementCount(); |
3225 | InstructionCost MemOpCost = |
3226 | getMemoryOpCost(Opcode, Src: VT->getElementType(), Alignment, AddressSpace: 0, CostKind, |
3227 | OpInfo: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I); |
3228 | // Add on an overhead cost for using gathers/scatters. |
3229 | // TODO: At the moment this is applied unilaterally for all CPUs, but at some |
3230 | // point we may want a per-CPU overhead. |
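// For example, with the default sve-gather-overhead of 10, a gather is
// costed at roughly 10x the equivalent scalar element load before scaling
// by the number of elements.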
3231 | MemOpCost *= getSVEGatherScatterOverhead(Opcode); |
3232 | return LT.first * MemOpCost * getMaxNumElements(VF: LegalVF); |
3233 | } |
3234 | |
3235 | bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { |
3236 | return isa<FixedVectorType>(Val: Ty) && !ST->useSVEForFixedLengthVectors(); |
3237 | } |
3238 | |
3239 | InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty, |
3240 | MaybeAlign Alignment, |
3241 | unsigned AddressSpace, |
3242 | TTI::TargetCostKind CostKind, |
3243 | TTI::OperandValueInfo OpInfo, |
3244 | const Instruction *I) { |
3245 | EVT VT = TLI->getValueType(DL, Ty, AllowUnknown: true); |
3246 | // Type legalization can't handle structs |
3247 | if (VT == MVT::Other) |
3248 | return BaseT::getMemoryOpCost(Opcode, Src: Ty, Alignment, AddressSpace, |
3249 | CostKind); |
3250 | |
3251 | auto LT = getTypeLegalizationCost(Ty); |
3252 | if (!LT.first.isValid()) |
3253 | return InstructionCost::getInvalid(); |
3254 | |
3255 | // The code-generator is currently not able to handle scalable vectors |
3256 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
3257 | // it. This change will be removed when code-generation for these types is |
3258 | // sufficiently reliable. |
3259 | if (auto *VTy = dyn_cast<ScalableVectorType>(Val: Ty)) |
3260 | if (VTy->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
3261 | return InstructionCost::getInvalid(); |
3262 | |
3263 | // TODO: consider latency as well for TCK_SizeAndLatency. |
3264 | if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) |
3265 | return LT.first; |
3266 | |
3267 | if (CostKind != TTI::TCK_RecipThroughput) |
3268 | return 1; |
3269 | |
3270 | if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store && |
3271 | LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) { |
// Unaligned stores are extremely inefficient. We don't split all
// unaligned 128-bit stores because of the negative impact that has been
// shown in practice on inlined block copy code.
3275 | // We make such stores expensive so that we will only vectorize if there |
3276 | // are 6 other instructions getting vectorized. |
3277 | const int AmortizationCost = 6; |
3278 | |
3279 | return LT.first * 2 * AmortizationCost; |
3280 | } |
3281 | |
3282 | // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs. |
3283 | if (Ty->isPtrOrPtrVectorTy()) |
3284 | return LT.first; |
3285 | |
3286 | if (useNeonVector(Ty)) { |
3287 | // Check truncating stores and extending loads. |
3288 | if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) { |
// v4i8 types are lowered to a scalar load/store and sshll/xtn.
3290 | if (VT == MVT::v4i8) |
3291 | return 2; |
3292 | // Otherwise we need to scalarize. |
3293 | return cast<FixedVectorType>(Val: Ty)->getNumElements() * 2; |
3294 | } |
3295 | EVT EltVT = VT.getVectorElementType(); |
3296 | unsigned EltSize = EltVT.getScalarSizeInBits(); |
3297 | if (!isPowerOf2_32(Value: EltSize) || EltSize < 8 || EltSize > 64 || |
3298 | VT.getVectorNumElements() >= (128 / EltSize) || !Alignment || |
3299 | *Alignment != Align(1)) |
3300 | return LT.first; |
3301 | // FIXME: v3i8 lowering currently is very inefficient, due to automatic |
3302 | // widening to v4i8, which produces suboptimal results. |
3303 | if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8) |
3304 | return LT.first; |
3305 | |
3306 | // Check non-power-of-2 loads/stores for legal vector element types with |
3307 | // NEON. Non-power-of-2 memory ops will get broken down to a set of |
3308 | // operations on smaller power-of-2 ops, including ld1/st1. |
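// For example, a v6i32 access splits into v4i32 + v2i32 (cost 2), and a
// v7i8 access into v4i8 + v2i8 + v1i8 (cost 3).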
3309 | LLVMContext &C = Ty->getContext(); |
3310 | InstructionCost Cost(0); |
3311 | SmallVector<EVT> TypeWorklist; |
3312 | TypeWorklist.push_back(Elt: VT); |
3313 | while (!TypeWorklist.empty()) { |
3314 | EVT CurrVT = TypeWorklist.pop_back_val(); |
3315 | unsigned CurrNumElements = CurrVT.getVectorNumElements(); |
3316 | if (isPowerOf2_32(Value: CurrNumElements)) { |
3317 | Cost += 1; |
3318 | continue; |
3319 | } |
3320 | |
3321 | unsigned PrevPow2 = NextPowerOf2(A: CurrNumElements) / 2; |
3322 | TypeWorklist.push_back(Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: PrevPow2)); |
3323 | TypeWorklist.push_back( |
3324 | Elt: EVT::getVectorVT(Context&: C, VT: EltVT, NumElements: CurrNumElements - PrevPow2)); |
3325 | } |
3326 | return Cost; |
3327 | } |
3328 | |
3329 | return LT.first; |
3330 | } |
3331 | |
3332 | InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( |
3333 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
3334 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
3335 | bool UseMaskForCond, bool UseMaskForGaps) { |
3336 | assert(Factor >= 2 && "Invalid interleave factor" ); |
3337 | auto *VecVTy = cast<VectorType>(Val: VecTy); |
3338 | |
3339 | if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2)) |
3340 | return InstructionCost::getInvalid(); |
3341 | |
3342 | // Vectorization for masked interleaved accesses is only enabled for scalable |
3343 | // VF. |
3344 | if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps)) |
3345 | return InstructionCost::getInvalid(); |
3346 | |
3347 | if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { |
3348 | unsigned MinElts = VecVTy->getElementCount().getKnownMinValue(); |
3349 | auto *SubVecTy = |
3350 | VectorType::get(ElementType: VecVTy->getElementType(), |
3351 | EC: VecVTy->getElementCount().divideCoefficientBy(RHS: Factor)); |
3352 | |
3353 | // ldN/stN only support legal vector types of size 64 or 128 in bits. |
3354 | // Accesses having vector types that are a multiple of 128 bits can be |
3355 | // matched to more than one ldN/stN instruction. |
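// For example, a factor-2 interleaved load of <16 x i32> uses subvectors
// of type <8 x i32> (256 bits), which map to two ld2 instructions, giving
// a cost of 2 * 2 = 4.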
3356 | bool UseScalable; |
3357 | if (MinElts % Factor == 0 && |
3358 | TLI->isLegalInterleavedAccessType(VecTy: SubVecTy, DL, UseScalable)) |
3359 | return Factor * TLI->getNumInterleavedAccesses(VecTy: SubVecTy, DL, UseScalable); |
3360 | } |
3361 | |
3362 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
3363 | Alignment, AddressSpace, CostKind, |
3364 | UseMaskForCond, UseMaskForGaps); |
3365 | } |
3366 | |
3367 | InstructionCost |
3368 | AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) { |
3369 | InstructionCost Cost = 0; |
3370 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
3371 | for (auto *I : Tys) { |
3372 | if (!I->isVectorTy()) |
3373 | continue; |
3374 | if (I->getScalarSizeInBits() * cast<FixedVectorType>(Val: I)->getNumElements() == |
3375 | 128) |
3376 | Cost += getMemoryOpCost(Opcode: Instruction::Store, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind) + |
3377 | getMemoryOpCost(Opcode: Instruction::Load, Ty: I, Alignment: Align(128), AddressSpace: 0, CostKind); |
3378 | } |
3379 | return Cost; |
3380 | } |
3381 | |
3382 | unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) { |
3383 | return ST->getMaxInterleaveFactor(); |
3384 | } |
3385 | |
3386 | // For Falkor, we want to avoid having too many strided loads in a loop since |
3387 | // that can exhaust the HW prefetcher resources. We adjust the unroller |
3388 | // MaxCount preference below to attempt to ensure unrolling doesn't create too |
3389 | // many strided loads. |
3390 | static void |
3391 | getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
3392 | TargetTransformInfo::UnrollingPreferences &UP) { |
3393 | enum { MaxStridedLoads = 7 }; |
3394 | auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) { |
3395 | int StridedLoads = 0; |
3396 | // FIXME? We could make this more precise by looking at the CFG and |
3397 | // e.g. not counting loads in each side of an if-then-else diamond. |
3398 | for (const auto BB : L->blocks()) { |
3399 | for (auto &I : *BB) { |
3400 | LoadInst *LMemI = dyn_cast<LoadInst>(Val: &I); |
3401 | if (!LMemI) |
3402 | continue; |
3403 | |
3404 | Value *PtrValue = LMemI->getPointerOperand(); |
3405 | if (L->isLoopInvariant(V: PtrValue)) |
3406 | continue; |
3407 | |
3408 | const SCEV *LSCEV = SE.getSCEV(V: PtrValue); |
3409 | const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(Val: LSCEV); |
3410 | if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) |
3411 | continue; |
3412 | |
3413 | // FIXME? We could take pairing of unrolled load copies into account |
3414 | // by looking at the AddRec, but we would probably have to limit this |
3415 | // to loops with no stores or other memory optimization barriers. |
3416 | ++StridedLoads; |
3417 | // We've seen enough strided loads that seeing more won't make a |
3418 | // difference. |
3419 | if (StridedLoads > MaxStridedLoads / 2) |
3420 | return StridedLoads; |
3421 | } |
3422 | } |
3423 | return StridedLoads; |
3424 | }; |
3425 | |
3426 | int StridedLoads = countStridedLoads(L, SE); |
3427 | LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads |
3428 | << " strided loads\n" ); |
3429 | // Pick the largest power of 2 unroll count that won't result in too many |
3430 | // strided loads. |
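// For example, with 3 strided loads detected, MaxCount becomes
// 1 << Log2_32(7 / 3) = 2, keeping the unrolled loop at no more than 6
// strided loads.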
3431 | if (StridedLoads) { |
3432 | UP.MaxCount = 1 << Log2_32(Value: MaxStridedLoads / StridedLoads); |
3433 | LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to " |
3434 | << UP.MaxCount << '\n'); |
3435 | } |
3436 | } |
3437 | |
void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3439 | TTI::UnrollingPreferences &UP, |
3440 | OptimizationRemarkEmitter *ORE) { |
3441 | // Enable partial unrolling and runtime unrolling. |
3442 | BaseT::getUnrollingPreferences(L, SE, UP, ORE); |
3443 | |
3444 | UP.UpperBound = true; |
3445 | |
// An inner loop is more likely to be hot, and the runtime check can be
// hoisted out by the LICM pass, so the overhead is lower; try a larger
// threshold to unroll more loops.
3449 | if (L->getLoopDepth() > 1) |
3450 | UP.PartialThreshold *= 2; |
3451 | |
3452 | // Disable partial & runtime unrolling on -Os. |
3453 | UP.PartialOptSizeThreshold = 0; |
3454 | |
3455 | if (ST->getProcFamily() == AArch64Subtarget::Falkor && |
3456 | EnableFalkorHWPFUnrollFix) |
3457 | getFalkorUnrollingPreferences(L, SE, UP); |
3458 | |
3459 | // Scan the loop: don't unroll loops with calls as this could prevent |
3460 | // inlining. Don't unroll vector loops either, as they don't benefit much from |
3461 | // unrolling. |
3462 | for (auto *BB : L->getBlocks()) { |
3463 | for (auto &I : *BB) { |
3464 | // Don't unroll vectorised loop. |
3465 | if (I.getType()->isVectorTy()) |
3466 | return; |
3467 | |
3468 | if (isa<CallInst>(Val: I) || isa<InvokeInst>(Val: I)) { |
3469 | if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) { |
3470 | if (!isLoweredToCall(F)) |
3471 | continue; |
3472 | } |
3473 | return; |
3474 | } |
3475 | } |
3476 | } |
3477 | |
// Enable runtime unrolling for in-order models.
// If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so
// by checking for that case we can ensure that the default behaviour is
// unchanged.
3482 | if (ST->getProcFamily() != AArch64Subtarget::Others && |
3483 | !ST->getSchedModel().isOutOfOrder()) { |
3484 | UP.Runtime = true; |
3485 | UP.Partial = true; |
3486 | UP.UnrollRemainder = true; |
3487 | UP.DefaultUnrollRuntimeCount = 4; |
3488 | |
3489 | UP.UnrollAndJam = true; |
3490 | UP.UnrollAndJamInnerLoopThreshold = 60; |
3491 | } |
3492 | } |
3493 | |
3494 | void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
3495 | TTI::PeelingPreferences &PP) { |
3496 | BaseT::getPeelingPreferences(L, SE, PP); |
3497 | } |
3498 | |
3499 | Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, |
3500 | Type *ExpectedType) { |
3501 | switch (Inst->getIntrinsicID()) { |
3502 | default: |
3503 | return nullptr; |
3504 | case Intrinsic::aarch64_neon_st2: |
3505 | case Intrinsic::aarch64_neon_st3: |
3506 | case Intrinsic::aarch64_neon_st4: { |
3507 | // Create a struct type |
3508 | StructType *ST = dyn_cast<StructType>(Val: ExpectedType); |
3509 | if (!ST) |
3510 | return nullptr; |
3511 | unsigned NumElts = Inst->arg_size() - 1; |
3512 | if (ST->getNumElements() != NumElts) |
3513 | return nullptr; |
3514 | for (unsigned i = 0, e = NumElts; i != e; ++i) { |
3515 | if (Inst->getArgOperand(i)->getType() != ST->getElementType(N: i)) |
3516 | return nullptr; |
3517 | } |
3518 | Value *Res = PoisonValue::get(T: ExpectedType); |
3519 | IRBuilder<> Builder(Inst); |
3520 | for (unsigned i = 0, e = NumElts; i != e; ++i) { |
3521 | Value *L = Inst->getArgOperand(i); |
3522 | Res = Builder.CreateInsertValue(Agg: Res, Val: L, Idxs: i); |
3523 | } |
3524 | return Res; |
3525 | } |
3526 | case Intrinsic::aarch64_neon_ld2: |
3527 | case Intrinsic::aarch64_neon_ld3: |
3528 | case Intrinsic::aarch64_neon_ld4: |
3529 | if (Inst->getType() == ExpectedType) |
3530 | return Inst; |
3531 | return nullptr; |
3532 | } |
3533 | } |
3534 | |
3535 | bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, |
3536 | MemIntrinsicInfo &Info) { |
3537 | switch (Inst->getIntrinsicID()) { |
3538 | default: |
3539 | break; |
3540 | case Intrinsic::aarch64_neon_ld2: |
3541 | case Intrinsic::aarch64_neon_ld3: |
3542 | case Intrinsic::aarch64_neon_ld4: |
3543 | Info.ReadMem = true; |
3544 | Info.WriteMem = false; |
3545 | Info.PtrVal = Inst->getArgOperand(i: 0); |
3546 | break; |
3547 | case Intrinsic::aarch64_neon_st2: |
3548 | case Intrinsic::aarch64_neon_st3: |
3549 | case Intrinsic::aarch64_neon_st4: |
3550 | Info.ReadMem = false; |
3551 | Info.WriteMem = true; |
3552 | Info.PtrVal = Inst->getArgOperand(i: Inst->arg_size() - 1); |
3553 | break; |
3554 | } |
3555 | |
3556 | switch (Inst->getIntrinsicID()) { |
3557 | default: |
3558 | return false; |
3559 | case Intrinsic::aarch64_neon_ld2: |
3560 | case Intrinsic::aarch64_neon_st2: |
3561 | Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS; |
3562 | break; |
3563 | case Intrinsic::aarch64_neon_ld3: |
3564 | case Intrinsic::aarch64_neon_st3: |
3565 | Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS; |
3566 | break; |
3567 | case Intrinsic::aarch64_neon_ld4: |
3568 | case Intrinsic::aarch64_neon_st4: |
3569 | Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS; |
3570 | break; |
3571 | } |
3572 | return true; |
3573 | } |
3574 | |
3575 | /// See if \p I should be considered for address type promotion. We check if \p |
/// I is a sext with the right type and used in memory accesses. If it is used in a
3577 | /// "complex" getelementptr, we allow it to be promoted without finding other |
3578 | /// sext instructions that sign extended the same initial value. A getelementptr |
3579 | /// is considered as "complex" if it has more than 2 operands. |
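/// For example, for "%idx = sext i32 %i to i64" feeding
/// "getelementptr inbounds [64 x i32], ptr %p, i64 0, i64 %idx", the GEP has
/// more than 2 operands, so the sext may be promoted without a common header.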
3580 | bool AArch64TTIImpl::shouldConsiderAddressTypePromotion( |
const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
3582 | bool Considerable = false; |
3583 | AllowPromotionWithoutCommonHeader = false; |
3584 | if (!isa<SExtInst>(Val: &I)) |
3585 | return false; |
3586 | Type *ConsideredSExtType = |
3587 | Type::getInt64Ty(C&: I.getParent()->getParent()->getContext()); |
3588 | if (I.getType() != ConsideredSExtType) |
3589 | return false; |
3590 | // See if the sext is the one with the right type and used in at least one |
3591 | // GetElementPtrInst. |
3592 | for (const User *U : I.users()) { |
3593 | if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(Val: U)) { |
3594 | Considerable = true; |
3595 | // A getelementptr is considered as "complex" if it has more than 2 |
3596 | // operands. We will promote a SExt used in such complex GEP as we |
3597 | // expect some computation to be merged if they are done on 64 bits. |
3598 | if (GEPInst->getNumOperands() > 2) { |
3599 | AllowPromotionWithoutCommonHeader = true; |
3600 | break; |
3601 | } |
3602 | } |
3603 | } |
3604 | return Considerable; |
3605 | } |
3606 | |
3607 | bool AArch64TTIImpl::isLegalToVectorizeReduction( |
3608 | const RecurrenceDescriptor &RdxDesc, ElementCount VF) const { |
3609 | if (!VF.isScalable()) |
3610 | return true; |
3611 | |
3612 | Type *Ty = RdxDesc.getRecurrenceType(); |
3613 | if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty)) |
3614 | return false; |
3615 | |
3616 | switch (RdxDesc.getRecurrenceKind()) { |
3617 | case RecurKind::Add: |
3618 | case RecurKind::FAdd: |
3619 | case RecurKind::And: |
3620 | case RecurKind::Or: |
3621 | case RecurKind::Xor: |
3622 | case RecurKind::SMin: |
3623 | case RecurKind::SMax: |
3624 | case RecurKind::UMin: |
3625 | case RecurKind::UMax: |
3626 | case RecurKind::FMin: |
3627 | case RecurKind::FMax: |
3628 | case RecurKind::FMulAdd: |
3629 | case RecurKind::IAnyOf: |
3630 | case RecurKind::FAnyOf: |
3631 | return true; |
3632 | default: |
3633 | return false; |
3634 | } |
3635 | } |
3636 | |
3637 | InstructionCost |
3638 | AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, |
3639 | FastMathFlags FMF, |
3640 | TTI::TargetCostKind CostKind) { |
3641 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
3642 | |
3643 | if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16()) |
3644 | return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); |
3645 | |
3646 | InstructionCost LegalizationCost = 0; |
3647 | if (LT.first > 1) { |
3648 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Ty->getContext()); |
3649 | IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF); |
3650 | LegalizationCost = getIntrinsicInstrCost(ICA: Attrs, CostKind) * (LT.first - 1); |
3651 | } |
3652 | |
3653 | return LegalizationCost + /*Cost of horizontal reduction*/ 2; |
3654 | } |
3655 | |
3656 | InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE( |
3657 | unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) { |
3658 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
3659 | InstructionCost LegalizationCost = 0; |
3660 | if (LT.first > 1) { |
3661 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: ValTy->getContext()); |
3662 | LegalizationCost = getArithmeticInstrCost(Opcode, Ty: LegalVTy, CostKind); |
3663 | LegalizationCost *= LT.first - 1; |
3664 | } |
3665 | |
3666 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
3667 | assert(ISD && "Invalid opcode" ); |
3668 | // Add the final reduction cost for the legal horizontal reduction |
3669 | switch (ISD) { |
3670 | case ISD::ADD: |
3671 | case ISD::AND: |
3672 | case ISD::OR: |
3673 | case ISD::XOR: |
3674 | case ISD::FADD: |
3675 | return LegalizationCost + 2; |
3676 | default: |
3677 | return InstructionCost::getInvalid(); |
3678 | } |
3679 | } |
3680 | |
3681 | InstructionCost |
3682 | AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, |
3683 | std::optional<FastMathFlags> FMF, |
3684 | TTI::TargetCostKind CostKind) { |
3685 | if (TTI::requiresOrderedReduction(FMF)) { |
3686 | if (auto *FixedVTy = dyn_cast<FixedVectorType>(Val: ValTy)) { |
3687 | InstructionCost BaseCost = |
3688 | BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind); |
3689 | // Add on extra cost to reflect the extra overhead on some CPUs. We still |
3690 | // end up vectorizing for more computationally intensive loops. |
3691 | return BaseCost + FixedVTy->getNumElements(); |
3692 | } |
3693 | |
3694 | if (Opcode != Instruction::FAdd) |
3695 | return InstructionCost::getInvalid(); |
3696 | |
3697 | auto *VTy = cast<ScalableVectorType>(Val: ValTy); |
3698 | InstructionCost Cost = |
3699 | getArithmeticInstrCost(Opcode, Ty: VTy->getScalarType(), CostKind); |
3700 | Cost *= getMaxNumElements(VF: VTy->getElementCount()); |
3701 | return Cost; |
3702 | } |
3703 | |
3704 | if (isa<ScalableVectorType>(Val: ValTy)) |
3705 | return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind); |
3706 | |
3707 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
3708 | MVT MTy = LT.second; |
3709 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
3710 | assert(ISD && "Invalid opcode" ); |
3711 | |
3712 | // Horizontal adds can use the 'addv' instruction. We model the cost of these |
3713 | // instructions as twice a normal vector add, plus 1 for each legalization |
3714 | // step (LT.first). This is the only arithmetic vector reduction operation for |
3715 | // which we have an instruction. |
3716 | // OR, XOR and AND costs should match the codegen from: |
3717 | // OR: llvm/test/CodeGen/AArch64/reduce-or.ll |
3718 | // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll |
3719 | // AND: llvm/test/CodeGen/AArch64/reduce-and.ll |
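// For example, vecreduce_add on v4i32 lowers to a single "addv s0, v0.4s",
// which the table below models with a cost of 2.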
3720 | static const CostTblEntry CostTblNoPairwise[]{ |
3721 | {ISD::ADD, MVT::v8i8, 2}, |
3722 | {ISD::ADD, MVT::v16i8, 2}, |
3723 | {ISD::ADD, MVT::v4i16, 2}, |
3724 | {ISD::ADD, MVT::v8i16, 2}, |
3725 | {ISD::ADD, MVT::v4i32, 2}, |
3726 | {ISD::ADD, MVT::v2i64, 2}, |
3727 | {ISD::OR, MVT::v8i8, 15}, |
3728 | {ISD::OR, MVT::v16i8, 17}, |
3729 | {ISD::OR, MVT::v4i16, 7}, |
3730 | {ISD::OR, MVT::v8i16, 9}, |
3731 | {ISD::OR, MVT::v2i32, 3}, |
3732 | {ISD::OR, MVT::v4i32, 5}, |
3733 | {ISD::OR, MVT::v2i64, 3}, |
3734 | {ISD::XOR, MVT::v8i8, 15}, |
3735 | {ISD::XOR, MVT::v16i8, 17}, |
3736 | {ISD::XOR, MVT::v4i16, 7}, |
3737 | {ISD::XOR, MVT::v8i16, 9}, |
3738 | {ISD::XOR, MVT::v2i32, 3}, |
3739 | {ISD::XOR, MVT::v4i32, 5}, |
3740 | {ISD::XOR, MVT::v2i64, 3}, |
3741 | {ISD::AND, MVT::v8i8, 15}, |
3742 | {ISD::AND, MVT::v16i8, 17}, |
3743 | {ISD::AND, MVT::v4i16, 7}, |
3744 | {ISD::AND, MVT::v8i16, 9}, |
3745 | {ISD::AND, MVT::v2i32, 3}, |
3746 | {ISD::AND, MVT::v4i32, 5}, |
3747 | {ISD::AND, MVT::v2i64, 3}, |
3748 | }; |
3749 | switch (ISD) { |
3750 | default: |
3751 | break; |
3752 | case ISD::ADD: |
3753 | if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) |
3754 | return (LT.first - 1) + Entry->Cost; |
3755 | break; |
3756 | case ISD::XOR: |
3757 | case ISD::AND: |
3758 | case ISD::OR: |
3759 | const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy); |
3760 | if (!Entry) |
3761 | break; |
3762 | auto *ValVTy = cast<FixedVectorType>(Val: ValTy); |
3763 | if (MTy.getVectorNumElements() <= ValVTy->getNumElements() && |
3764 | isPowerOf2_32(Value: ValVTy->getNumElements())) { |
      InstructionCost ExtraCost = 0;
3766 | if (LT.first != 1) { |
3767 | // Type needs to be split, so there is an extra cost of LT.first - 1 |
3768 | // arithmetic ops. |
3769 | auto *Ty = FixedVectorType::get(ElementType: ValTy->getElementType(), |
3770 | NumElts: MTy.getVectorNumElements()); |
3771 | ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind); |
3772 | ExtraCost *= LT.first - 1; |
3773 | } |
3774 | // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov |
3775 | auto Cost = ValVTy->getElementType()->isIntegerTy(Bitwidth: 1) ? 2 : Entry->Cost; |
3776 | return Cost + ExtraCost; |
3777 | } |
3778 | break; |
3779 | } |
3780 | return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind); |
3781 | } |
3782 | |
3783 | InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) { |
3784 | static const CostTblEntry ShuffleTbl[] = { |
3785 | { TTI::SK_Splice, MVT::nxv16i8, 1 }, |
3786 | { TTI::SK_Splice, MVT::nxv8i16, 1 }, |
3787 | { TTI::SK_Splice, MVT::nxv4i32, 1 }, |
3788 | { TTI::SK_Splice, MVT::nxv2i64, 1 }, |
3789 | { TTI::SK_Splice, MVT::nxv2f16, 1 }, |
3790 | { TTI::SK_Splice, MVT::nxv4f16, 1 }, |
3791 | { TTI::SK_Splice, MVT::nxv8f16, 1 }, |
3792 | { TTI::SK_Splice, MVT::nxv2bf16, 1 }, |
3793 | { TTI::SK_Splice, MVT::nxv4bf16, 1 }, |
3794 | { TTI::SK_Splice, MVT::nxv8bf16, 1 }, |
3795 | { TTI::SK_Splice, MVT::nxv2f32, 1 }, |
3796 | { TTI::SK_Splice, MVT::nxv4f32, 1 }, |
3797 | { TTI::SK_Splice, MVT::nxv2f64, 1 }, |
3798 | }; |
3799 | |
3800 | // The code-generator is currently not able to handle scalable vectors |
3801 | // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting |
3802 | // it. This change will be removed when code-generation for these types is |
3803 | // sufficiently reliable. |
3804 | if (Tp->getElementCount() == ElementCount::getScalable(MinVal: 1)) |
3805 | return InstructionCost::getInvalid(); |
3806 | |
3807 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp); |
3808 | Type *LegalVTy = EVT(LT.second).getTypeForEVT(Context&: Tp->getContext()); |
3809 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
3810 | EVT PromotedVT = LT.second.getScalarType() == MVT::i1 |
3811 | ? TLI->getPromotedVTForPredicate(EVT(LT.second)) |
3812 | : LT.second; |
3813 | Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Context&: Tp->getContext()); |
3814 | InstructionCost LegalizationCost = 0; |
3815 | if (Index < 0) { |
3816 | LegalizationCost = |
3817 | getCmpSelInstrCost(Opcode: Instruction::ICmp, ValTy: PromotedVTy, CondTy: PromotedVTy, |
3818 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind) + |
3819 | getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: PromotedVTy, CondTy: LegalVTy, |
3820 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind); |
3821 | } |
3822 | |
  // Predicated splices are promoted when lowering; see AArch64ISelLowering.cpp.
  // The cost is computed on the promoted type.
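  // For example, a splice of two nxv16i1 predicates is costed as a zero-extend
  // to nxv16i8, the nxv16i8 splice itself, and a truncate back to nxv16i1.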
3825 | if (LT.second.getScalarType() == MVT::i1) { |
3826 | LegalizationCost += |
3827 | getCastInstrCost(Opcode: Instruction::ZExt, Dst: PromotedVTy, Src: LegalVTy, |
3828 | CCH: TTI::CastContextHint::None, CostKind) + |
3829 | getCastInstrCost(Opcode: Instruction::Trunc, Dst: LegalVTy, Src: PromotedVTy, |
3830 | CCH: TTI::CastContextHint::None, CostKind); |
3831 | } |
3832 | const auto *Entry = |
3833 | CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT()); |
3834 | assert(Entry && "Illegal Type for Splice" ); |
3835 | LegalizationCost += Entry->Cost; |
3836 | return LegalizationCost * LT.first; |
3837 | } |
3838 | |
3839 | InstructionCost AArch64TTIImpl::getShuffleCost( |
3840 | TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef<int> Mask, |
3841 | TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, |
3842 | ArrayRef<const Value *> Args, const Instruction *CxtI) { |
3843 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Tp); |
3844 | |
  // If we have a Mask and legalization splits the type into smaller vectors,
  // split the Mask into matching chunks and sum the cost of each sub-shuffle.
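  // For example, a 16-element mask on a <16 x i32> shuffle (legalized to four
  // v4i32 operations) is split into four 4-element sub-masks, each costed
  // separately below.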
3847 | if (!Mask.empty() && isa<FixedVectorType>(Val: Tp) && LT.second.isVector() && |
3848 | Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() && |
3849 | Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) { |
3850 | |
    // Check for LD3/LD4 instructions, which are represented in llvm IR as
    // deinterleaving-shuffle(load). The shuffle cost could potentially be
    // free, but we model it with a small non-zero cost so that LD3/LD4 have a
    // higher cost than just the load.
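    // For example, the mask <0, 3, 6, 9, 12, 15, 18, 21> applied to a loaded
    // <24 x i32> selects one lane of a factor-3 deinterleave, i.e. an LD3.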
3855 | if (Args.size() >= 1 && isa<LoadInst>(Val: Args[0]) && |
3856 | (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 3) || |
3857 | ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor: 4))) |
3858 | return std::max<InstructionCost>(a: 1, b: LT.first / 4); |
3859 | |
3860 | // Check for ST3/ST4 instructions, which are represented in llvm IR as |
3861 | // store(interleaving-shuffle). The shuffle cost could potentially be free, |
3862 | // but we model it with a cost of LT.first so that ST3/ST4 have a higher |
3863 | // cost than just the store. |
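    // For example, interleaving four 4-element sub-vectors produces the mask
    // <0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15>, which
    // corresponds to an ST4 when its only user is the store.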
3864 | if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(Val: *CxtI->user_begin()) && |
3865 | (ShuffleVectorInst::isInterleaveMask( |
3866 | Mask, Factor: 4, NumInputElts: Tp->getElementCount().getKnownMinValue() * 2) || |
3867 | ShuffleVectorInst::isInterleaveMask( |
3868 | Mask, Factor: 3, NumInputElts: Tp->getElementCount().getKnownMinValue() * 2))) |
3869 | return LT.first; |
3870 | |
3871 | unsigned TpNumElts = Mask.size(); |
3872 | unsigned LTNumElts = LT.second.getVectorNumElements(); |
3873 | unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts; |
3874 | VectorType *NTp = |
3875 | VectorType::get(ElementType: Tp->getScalarType(), EC: LT.second.getVectorElementCount()); |
3876 | InstructionCost Cost; |
3877 | for (unsigned N = 0; N < NumVecs; N++) { |
3878 | SmallVector<int> NMask; |
3879 | // Split the existing mask into chunks of size LTNumElts. Track the source |
3880 | // sub-vectors to ensure the result has at most 2 inputs. |
3881 | unsigned Source1, Source2; |
3882 | unsigned NumSources = 0; |
3883 | for (unsigned E = 0; E < LTNumElts; E++) { |
3884 | int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E] |
3885 | : PoisonMaskElem; |
3886 | if (MaskElt < 0) { |
3887 | NMask.push_back(Elt: PoisonMaskElem); |
3888 | continue; |
3889 | } |
3890 | |
3891 | // Calculate which source from the input this comes from and whether it |
3892 | // is new to us. |
3893 | unsigned Source = MaskElt / LTNumElts; |
3894 | if (NumSources == 0) { |
3895 | Source1 = Source; |
3896 | NumSources = 1; |
3897 | } else if (NumSources == 1 && Source != Source1) { |
3898 | Source2 = Source; |
3899 | NumSources = 2; |
3900 | } else if (NumSources >= 2 && Source != Source1 && Source != Source2) { |
3901 | NumSources++; |
3902 | } |
3903 | |
      // Add to the new mask. For the NumSources > 2 case these values are not
      // correct, but they are only used for the modular lane number.
3906 | if (Source == Source1) |
3907 | NMask.push_back(Elt: MaskElt % LTNumElts); |
3908 | else if (Source == Source2) |
3909 | NMask.push_back(Elt: MaskElt % LTNumElts + LTNumElts); |
3910 | else |
3911 | NMask.push_back(Elt: MaskElt % LTNumElts); |
3912 | } |
3913 | // If the sub-mask has at most 2 input sub-vectors then re-cost it using |
3914 | // getShuffleCost. If not then cost it using the worst case. |
3915 | if (NumSources <= 2) |
3916 | Cost += getShuffleCost(Kind: NumSources <= 1 ? TTI::SK_PermuteSingleSrc |
3917 | : TTI::SK_PermuteTwoSrc, |
3918 | Tp: NTp, Mask: NMask, CostKind, Index: 0, SubTp: nullptr, Args, CxtI); |
3919 | else if (any_of(Range: enumerate(First&: NMask), P: [&](const auto &ME) { |
3920 | return ME.value() % LTNumElts == ME.index(); |
3921 | })) |
3922 | Cost += LTNumElts - 1; |
3923 | else |
3924 | Cost += LTNumElts; |
3925 | } |
3926 | return Cost; |
3927 | } |
3928 | |
3929 | Kind = improveShuffleKindFromMask(Kind, Mask, Ty: Tp, Index, SubTy&: SubTp); |
  // Treat extractsubvector as a single-op permutation.
  bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
3932 | if (IsExtractSubvector && LT.second.isFixedLengthVector()) |
3933 | Kind = TTI::SK_PermuteSingleSrc; |
3934 | |
3935 | // Check for broadcast loads, which are supported by the LD1R instruction. |
3936 | // In terms of code-size, the shuffle vector is free when a load + dup get |
3937 | // folded into a LD1R. That's what we check and return here. For performance |
3938 | // and reciprocal throughput, a LD1R is not completely free. In this case, we |
3939 | // return the cost for the broadcast below (i.e. 1 for most/all types), so |
3940 | // that we model the load + dup sequence slightly higher because LD1R is a |
3941 | // high latency instruction. |
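  // For example, an i32 load splatted to all four lanes becomes a single
  //   ld1r { v0.4s }, [x0]
  // rather than a separate ldr + dup pair.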
3942 | if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) { |
3943 | bool IsLoad = !Args.empty() && isa<LoadInst>(Val: Args[0]); |
3944 | if (IsLoad && LT.second.isVector() && |
3945 | isLegalBroadcastLoad(ElementTy: Tp->getElementType(), |
3946 | NumElements: LT.second.getVectorElementCount())) |
3947 | return 0; |
3948 | } |
3949 | |
3950 | // If we have 4 elements for the shuffle and a Mask, get the cost straight |
3951 | // from the perfect shuffle tables. |
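  // For example, the v4i32 mask <1, 0, 3, 2> corresponds to a single REV64
  // and is costed as one instruction by the tables.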
3952 | if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(MinVal: 4) && |
3953 | (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) && |
3954 | all_of(Range&: Mask, P: [](int E) { return E < 8; })) |
3955 | return getPerfectShuffleCost(M: Mask); |
3956 | |
3957 | // Check for identity masks, which we can treat as free. |
3958 | if (!Mask.empty() && LT.second.isFixedLengthVector() && |
3959 | (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && |
3960 | all_of(Range: enumerate(First&: Mask), P: [](const auto &M) { |
3961 | return M.value() < 0 || M.value() == (int)M.index(); |
3962 | })) |
3963 | return 0; |
3964 | |
3965 | // Check for other shuffles that are not SK_ kinds but we have native |
3966 | // instructions for, for example ZIP and UZP. |
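  // For example, for v4i32 the two-source mask <0, 4, 1, 5> is a zip1 and
  // <0, 2, 4, 6> is a uzp1; each is a single instruction.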
3967 | unsigned Unused; |
3968 | if (LT.second.isFixedLengthVector() && |
3969 | LT.second.getVectorNumElements() == Mask.size() && |
3970 | (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) && |
3971 | (isZIPMask(M: Mask, VT: LT.second, WhichResultOut&: Unused) || |
3972 | isUZPMask(M: Mask, VT: LT.second, WhichResultOut&: Unused) || |
3973 | // Check for non-zero lane splats |
3974 | all_of(Range: drop_begin(RangeOrContainer&: Mask), |
3975 | P: [&Mask](int M) { return M < 0 || M == Mask[0]; }))) |
3976 | return 1; |
3977 | |
3978 | if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose || |
3979 | Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc || |
3980 | Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) { |
3981 | static const CostTblEntry ShuffleTbl[] = { |
3982 | // Broadcast shuffle kinds can be performed with 'dup'. |
3983 | {TTI::SK_Broadcast, MVT::v8i8, 1}, |
3984 | {TTI::SK_Broadcast, MVT::v16i8, 1}, |
3985 | {TTI::SK_Broadcast, MVT::v4i16, 1}, |
3986 | {TTI::SK_Broadcast, MVT::v8i16, 1}, |
3987 | {TTI::SK_Broadcast, MVT::v2i32, 1}, |
3988 | {TTI::SK_Broadcast, MVT::v4i32, 1}, |
3989 | {TTI::SK_Broadcast, MVT::v2i64, 1}, |
3990 | {TTI::SK_Broadcast, MVT::v4f16, 1}, |
3991 | {TTI::SK_Broadcast, MVT::v8f16, 1}, |
3992 | {TTI::SK_Broadcast, MVT::v2f32, 1}, |
3993 | {TTI::SK_Broadcast, MVT::v4f32, 1}, |
3994 | {TTI::SK_Broadcast, MVT::v2f64, 1}, |
3995 | // Transpose shuffle kinds can be performed with 'trn1/trn2' and |
3996 | // 'zip1/zip2' instructions. |
3997 | {TTI::SK_Transpose, MVT::v8i8, 1}, |
3998 | {TTI::SK_Transpose, MVT::v16i8, 1}, |
3999 | {TTI::SK_Transpose, MVT::v4i16, 1}, |
4000 | {TTI::SK_Transpose, MVT::v8i16, 1}, |
4001 | {TTI::SK_Transpose, MVT::v2i32, 1}, |
4002 | {TTI::SK_Transpose, MVT::v4i32, 1}, |
4003 | {TTI::SK_Transpose, MVT::v2i64, 1}, |
4004 | {TTI::SK_Transpose, MVT::v4f16, 1}, |
4005 | {TTI::SK_Transpose, MVT::v8f16, 1}, |
4006 | {TTI::SK_Transpose, MVT::v2f32, 1}, |
4007 | {TTI::SK_Transpose, MVT::v4f32, 1}, |
4008 | {TTI::SK_Transpose, MVT::v2f64, 1}, |
4009 | // Select shuffle kinds. |
4010 | // TODO: handle vXi8/vXi16. |
4011 | {TTI::SK_Select, MVT::v2i32, 1}, // mov. |
4012 | {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar). |
4013 | {TTI::SK_Select, MVT::v2i64, 1}, // mov. |
4014 | {TTI::SK_Select, MVT::v2f32, 1}, // mov. |
4015 | {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar). |
4016 | {TTI::SK_Select, MVT::v2f64, 1}, // mov. |
4017 | // PermuteSingleSrc shuffle kinds. |
4018 | {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov. |
4019 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case. |
4020 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov. |
4021 | {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov. |
4022 | {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case. |
4023 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov. |
4024 | {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case. |
4025 | {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case. |
4026 | {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same |
4027 | {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl |
4028 | {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl |
4029 | {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl |
4030 | {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl |
4031 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl |
4032 | // Reverse can be lowered with `rev`. |
4033 | {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64 |
4034 | {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT |
4035 | {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT |
4036 | {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64 |
4037 | {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT |
4038 | {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT |
4039 | {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT |
4040 | {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT |
4041 | {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT |
4042 | {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64 |
4043 | {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64 |
4044 | {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64 |
4045 | // Splice can all be lowered as `ext`. |
4046 | {TTI::SK_Splice, MVT::v2i32, 1}, |
4047 | {TTI::SK_Splice, MVT::v4i32, 1}, |
4048 | {TTI::SK_Splice, MVT::v2i64, 1}, |
4049 | {TTI::SK_Splice, MVT::v2f32, 1}, |
4050 | {TTI::SK_Splice, MVT::v4f32, 1}, |
4051 | {TTI::SK_Splice, MVT::v2f64, 1}, |
4052 | {TTI::SK_Splice, MVT::v8f16, 1}, |
4053 | {TTI::SK_Splice, MVT::v8bf16, 1}, |
4054 | {TTI::SK_Splice, MVT::v8i16, 1}, |
4055 | {TTI::SK_Splice, MVT::v16i8, 1}, |
4056 | {TTI::SK_Splice, MVT::v4bf16, 1}, |
4057 | {TTI::SK_Splice, MVT::v4f16, 1}, |
4058 | {TTI::SK_Splice, MVT::v4i16, 1}, |
4059 | {TTI::SK_Splice, MVT::v8i8, 1}, |
4060 | // Broadcast shuffle kinds for scalable vectors |
4061 | {TTI::SK_Broadcast, MVT::nxv16i8, 1}, |
4062 | {TTI::SK_Broadcast, MVT::nxv8i16, 1}, |
4063 | {TTI::SK_Broadcast, MVT::nxv4i32, 1}, |
4064 | {TTI::SK_Broadcast, MVT::nxv2i64, 1}, |
4065 | {TTI::SK_Broadcast, MVT::nxv2f16, 1}, |
4066 | {TTI::SK_Broadcast, MVT::nxv4f16, 1}, |
4067 | {TTI::SK_Broadcast, MVT::nxv8f16, 1}, |
4068 | {TTI::SK_Broadcast, MVT::nxv2bf16, 1}, |
4069 | {TTI::SK_Broadcast, MVT::nxv4bf16, 1}, |
4070 | {TTI::SK_Broadcast, MVT::nxv8bf16, 1}, |
4071 | {TTI::SK_Broadcast, MVT::nxv2f32, 1}, |
4072 | {TTI::SK_Broadcast, MVT::nxv4f32, 1}, |
4073 | {TTI::SK_Broadcast, MVT::nxv2f64, 1}, |
4074 | {TTI::SK_Broadcast, MVT::nxv16i1, 1}, |
4075 | {TTI::SK_Broadcast, MVT::nxv8i1, 1}, |
4076 | {TTI::SK_Broadcast, MVT::nxv4i1, 1}, |
4077 | {TTI::SK_Broadcast, MVT::nxv2i1, 1}, |
4078 | // Handle the cases for vector.reverse with scalable vectors |
4079 | {TTI::SK_Reverse, MVT::nxv16i8, 1}, |
4080 | {TTI::SK_Reverse, MVT::nxv8i16, 1}, |
4081 | {TTI::SK_Reverse, MVT::nxv4i32, 1}, |
4082 | {TTI::SK_Reverse, MVT::nxv2i64, 1}, |
4083 | {TTI::SK_Reverse, MVT::nxv2f16, 1}, |
4084 | {TTI::SK_Reverse, MVT::nxv4f16, 1}, |
4085 | {TTI::SK_Reverse, MVT::nxv8f16, 1}, |
4086 | {TTI::SK_Reverse, MVT::nxv2bf16, 1}, |
4087 | {TTI::SK_Reverse, MVT::nxv4bf16, 1}, |
4088 | {TTI::SK_Reverse, MVT::nxv8bf16, 1}, |
4089 | {TTI::SK_Reverse, MVT::nxv2f32, 1}, |
4090 | {TTI::SK_Reverse, MVT::nxv4f32, 1}, |
4091 | {TTI::SK_Reverse, MVT::nxv2f64, 1}, |
4092 | {TTI::SK_Reverse, MVT::nxv16i1, 1}, |
4093 | {TTI::SK_Reverse, MVT::nxv8i1, 1}, |
4094 | {TTI::SK_Reverse, MVT::nxv4i1, 1}, |
4095 | {TTI::SK_Reverse, MVT::nxv2i1, 1}, |
4096 | }; |
4097 | if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second)) |
4098 | return LT.first * Entry->Cost; |
4099 | } |
4100 | |
4101 | if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Val: Tp)) |
4102 | return getSpliceCost(Tp, Index); |
4103 | |
4104 | // Inserting a subvector can often be done with either a D, S or H register |
4105 | // move, so long as the inserted vector is "aligned". |
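  // For example, inserting a <2 x float> into either half of a <4 x float>
  // is a single 64-bit register move.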
4106 | if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() && |
4107 | LT.second.getSizeInBits() <= 128 && SubTp) { |
4108 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp); |
4109 | if (SubLT.second.isVector()) { |
4110 | int NumElts = LT.second.getVectorNumElements(); |
4111 | int NumSubElts = SubLT.second.getVectorNumElements(); |
4112 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) |
4113 | return SubLT.first; |
4114 | } |
4115 | } |
4116 | |
4117 | // Restore optimal kind. |
4118 | if (IsExtractSubvector) |
4119 | Kind = TTI::SK_ExtractSubvector; |
4120 | return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args, |
4121 | CxtI); |
4122 | } |
4123 | |
4124 | static bool containsDecreasingPointers(Loop *TheLoop, |
4125 | PredicatedScalarEvolution *PSE) { |
4126 | const auto &Strides = DenseMap<Value *, const SCEV *>(); |
4127 | for (BasicBlock *BB : TheLoop->blocks()) { |
4128 | // Scan the instructions in the block and look for addresses that are |
4129 | // consecutive and decreasing. |
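    // For example, a loop such as "for (i = N; i > 0; i--) Sum += P[i];"
    // contains a load whose pointer stride is negative.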
4130 | for (Instruction &I : *BB) { |
4131 | if (isa<LoadInst>(Val: &I) || isa<StoreInst>(Val: &I)) { |
4132 | Value *Ptr = getLoadStorePointerOperand(V: &I); |
4133 | Type *AccessTy = getLoadStoreType(I: &I); |
4134 | if (getPtrStride(PSE&: *PSE, AccessTy, Ptr, Lp: TheLoop, StridesMap: Strides, /*Assume=*/true, |
4135 | /*ShouldCheckWrap=*/false) |
4136 | .value_or(u: 0) < 0) |
4137 | return true; |
4138 | } |
4139 | } |
4140 | } |
4141 | return false; |
4142 | } |
4143 | |
4144 | bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) { |
4145 | if (!ST->hasSVE()) |
4146 | return false; |
4147 | |
4148 | // We don't currently support vectorisation with interleaving for SVE - with |
4149 | // such loops we're better off not using tail-folding. This gives us a chance |
4150 | // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc. |
4151 | if (TFI->IAI->hasGroups()) |
4152 | return false; |
4153 | |
4154 | TailFoldingOpts Required = TailFoldingOpts::Disabled; |
4155 | if (TFI->LVL->getReductionVars().size()) |
4156 | Required |= TailFoldingOpts::Reductions; |
4157 | if (TFI->LVL->getFixedOrderRecurrences().size()) |
4158 | Required |= TailFoldingOpts::Recurrences; |
4159 | |
4160 | // We call this to discover whether any load/store pointers in the loop have |
4161 | // negative strides. This will require extra work to reverse the loop |
4162 | // predicate, which may be expensive. |
4163 | if (containsDecreasingPointers(TheLoop: TFI->LVL->getLoop(), |
4164 | PSE: TFI->LVL->getPredicatedScalarEvolution())) |
4165 | Required |= TailFoldingOpts::Reverse; |
4166 | if (Required == TailFoldingOpts::Disabled) |
4167 | Required |= TailFoldingOpts::Simple; |
4168 | |
4169 | if (!TailFoldingOptionLoc.satisfies(DefaultBits: ST->getSVETailFoldingDefaultOpts(), |
4170 | Required)) |
4171 | return false; |
4172 | |
4173 | // Don't tail-fold for tight loops where we would be better off interleaving |
4174 | // with an unpredicated loop. |
4175 | unsigned NumInsns = 0; |
4176 | for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) { |
4177 | NumInsns += BB->sizeWithoutDebug(); |
4178 | } |
4179 | |
  // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
4181 | return NumInsns >= SVETailFoldInsnThreshold; |
4182 | } |
4183 | |
4184 | InstructionCost |
4185 | AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
4186 | int64_t BaseOffset, bool HasBaseReg, |
4187 | int64_t Scale, unsigned AddrSpace) const { |
4188 | // Scaling factors are not free at all. |
4189 | // Operands | Rt Latency |
4190 | // ------------------------------------------- |
4191 | // Rt, [Xn, Xm] | 4 |
4192 | // ------------------------------------------- |
4193 | // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 |
4194 | // Rt, [Xn, Wm, <extend> #imm] | |
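  // For example, "ldr x0, [x1, x2, lsl #3]" is costed one unit above the
  // unscaled "ldr x0, [x1, x2]".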
4195 | TargetLoweringBase::AddrMode AM; |
4196 | AM.BaseGV = BaseGV; |
4197 | AM.BaseOffs = BaseOffset; |
4198 | AM.HasBaseReg = HasBaseReg; |
4199 | AM.Scale = Scale; |
4200 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) |
    // Scale represents reg2 * scale, so charge a cost of 1 when the scale is
    // neither 0 nor 1.
4203 | return AM.Scale != 0 && AM.Scale != 1; |
4204 | return -1; |
4205 | } |
4206 | |
4207 | bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) { |
4208 | // For the binary operators (e.g. or) we need to be more careful than |
4209 | // selects, here we only transform them if they are already at a natural |
4210 | // break point in the code - the end of a block with an unconditional |
4211 | // terminator. |
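  // For example:
  //   %or = or i1 %a, %b
  //   br label %next
  // is treated like a select, whereas an 'or' followed by anything else in
  // the block is not.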
4212 | if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or && |
4213 | isa<BranchInst>(Val: I->getNextNode()) && |
4214 | cast<BranchInst>(Val: I->getNextNode())->isUnconditional()) |
4215 | return true; |
4216 | return BaseT::shouldTreatInstructionLikeSelect(I); |
4217 | } |
4218 | |