1 | //===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// This file implements a TargetTransformInfo analysis pass specific to the |
10 | /// X86 target machine. It uses the target's detailed information to provide |
11 | /// more precise answers to certain TTI queries, while letting the target |
12 | /// independent and default TTI implementations handle the rest. |
13 | /// |
14 | //===----------------------------------------------------------------------===// |
15 | /// About Cost Model numbers used below it's necessary to say the following: |
16 | /// the numbers correspond to some "generic" X86 CPU instead of usage of a |
17 | /// specific CPU model. Usually the numbers correspond to the CPU where the |
18 | /// feature first appeared. For example, if we do Subtarget.hasSSE42() in |
19 | /// the lookups below the cost is based on Nehalem as that was the first CPU |
20 | /// to support that feature level and thus has most likely the worst case cost, |
21 | /// although we may discard an outlying worst cost from one CPU (e.g. Atom). |
22 | /// |
23 | /// Some examples of other technologies/CPUs: |
24 | /// SSE 3 - Pentium4 / Athlon64 |
25 | /// SSE 4.1 - Penryn |
26 | /// SSE 4.2 - Nehalem / Silvermont |
27 | /// AVX - Sandy Bridge / Jaguar / Bulldozer |
28 | /// AVX2 - Haswell / Ryzen |
29 | /// AVX-512 - Xeon Phi / Skylake |
30 | /// |
31 | /// And some examples of instruction target dependent costs (latency) |
32 | /// divss sqrtss rsqrtss |
33 | /// AMD K7 11-16 19 3 |
34 | /// Piledriver 9-24 13-15 5 |
35 | /// Jaguar 14 16 2 |
36 | /// Pentium II,III 18 30 2 |
37 | /// Nehalem 7-14 7-18 3 |
38 | /// Haswell 10-13 11 5 |
39 | /// |
40 | /// Interpreting the 4 TargetCostKind types: |
41 | /// TCK_RecipThroughput and TCK_Latency should try to match the worst case |
42 | /// values reported by the CPU scheduler models (and llvm-mca). |
43 | /// TCK_CodeSize should match the instruction count (e.g. divss = 1), NOT the |
44 | /// actual encoding size of the instruction. |
/// TCK_SizeAndLatency should match the worst case micro-op counts reported
/// by the CPU scheduler models (and llvm-mca), to ensure that they are
/// compatible with the MicroOpBufferSize and LoopMicroOpBufferSize values
/// which are often used as the cost thresholds where TCK_SizeAndLatency is
/// requested.
49 | //===----------------------------------------------------------------------===// |
50 | |
51 | #include "X86TargetTransformInfo.h" |
52 | #include "llvm/Analysis/TargetTransformInfo.h" |
53 | #include "llvm/CodeGen/BasicTTIImpl.h" |
54 | #include "llvm/CodeGen/CostTable.h" |
55 | #include "llvm/CodeGen/TargetLowering.h" |
56 | #include "llvm/IR/InstIterator.h" |
57 | #include "llvm/IR/IntrinsicInst.h" |
58 | #include "llvm/Support/Debug.h" |
59 | #include <optional> |
60 | |
61 | using namespace llvm; |
62 | |
63 | #define DEBUG_TYPE "x86tti" |
64 | |
65 | //===----------------------------------------------------------------------===// |
66 | // |
67 | // X86 cost model. |
68 | // |
69 | //===----------------------------------------------------------------------===// |
70 | |
71 | // Helper struct to store/access costs for each cost kind. |
72 | // TODO: Move this to allow other targets to use it? |
73 | struct CostKindCosts { |
74 | unsigned RecipThroughputCost = ~0U; |
75 | unsigned LatencyCost = ~0U; |
76 | unsigned CodeSizeCost = ~0U; |
77 | unsigned SizeAndLatencyCost = ~0U; |
78 | |
79 | std::optional<unsigned> |
80 | operator[](TargetTransformInfo::TargetCostKind Kind) const { |
81 | unsigned Cost = ~0U; |
82 | switch (Kind) { |
83 | case TargetTransformInfo::TCK_RecipThroughput: |
84 | Cost = RecipThroughputCost; |
85 | break; |
86 | case TargetTransformInfo::TCK_Latency: |
87 | Cost = LatencyCost; |
88 | break; |
89 | case TargetTransformInfo::TCK_CodeSize: |
90 | Cost = CodeSizeCost; |
91 | break; |
92 | case TargetTransformInfo::TCK_SizeAndLatency: |
93 | Cost = SizeAndLatencyCost; |
94 | break; |
95 | } |
96 | if (Cost == ~0U) |
97 | return std::nullopt; |
98 | return Cost; |
99 | } |
100 | }; |
101 | using CostKindTblEntry = CostTblEntryT<CostKindCosts>; |
102 | |
103 | TargetTransformInfo::PopcntSupportKind |
104 | X86TTIImpl::getPopcntSupport(unsigned TyWidth) { |
105 | assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2" ); |
106 | // TODO: Currently the __builtin_popcount() implementation using SSE3 |
107 | // instructions is inefficient. Once the problem is fixed, we should |
108 | // call ST->hasSSE3() instead of ST->hasPOPCNT(). |
109 | return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software; |
110 | } |
111 | |
112 | std::optional<unsigned> X86TTIImpl::getCacheSize( |
113 | TargetTransformInfo::CacheLevel Level) const { |
114 | switch (Level) { |
115 | case TargetTransformInfo::CacheLevel::L1D: |
116 | // - Penryn |
117 | // - Nehalem |
118 | // - Westmere |
119 | // - Sandy Bridge |
120 | // - Ivy Bridge |
121 | // - Haswell |
122 | // - Broadwell |
123 | // - Skylake |
124 | // - Kabylake |
125 | return 32 * 1024; // 32 KByte |
126 | case TargetTransformInfo::CacheLevel::L2D: |
127 | // - Penryn |
128 | // - Nehalem |
129 | // - Westmere |
130 | // - Sandy Bridge |
131 | // - Ivy Bridge |
132 | // - Haswell |
133 | // - Broadwell |
134 | // - Skylake |
135 | // - Kabylake |
136 | return 256 * 1024; // 256 KByte |
137 | } |
138 | |
139 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel" ); |
140 | } |
141 | |
142 | std::optional<unsigned> X86TTIImpl::getCacheAssociativity( |
143 | TargetTransformInfo::CacheLevel Level) const { |
144 | // - Penryn |
145 | // - Nehalem |
146 | // - Westmere |
147 | // - Sandy Bridge |
148 | // - Ivy Bridge |
149 | // - Haswell |
150 | // - Broadwell |
151 | // - Skylake |
152 | // - Kabylake |
153 | switch (Level) { |
154 | case TargetTransformInfo::CacheLevel::L1D: |
155 | [[fallthrough]]; |
156 | case TargetTransformInfo::CacheLevel::L2D: |
157 | return 8; |
158 | } |
159 | |
160 | llvm_unreachable("Unknown TargetTransformInfo::CacheLevel" ); |
161 | } |
162 | |
163 | unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const { |
164 | bool Vector = (ClassID == 1); |
165 | if (Vector && !ST->hasSSE1()) |
166 | return 0; |
167 | |
168 | if (ST->is64Bit()) { |
169 | if (Vector && ST->hasAVX512()) |
170 | return 32; |
171 | return 16; |
172 | } |
173 | return 8; |
174 | } |
175 | |
176 | TypeSize |
177 | X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
178 | unsigned PreferVectorWidth = ST->getPreferVectorWidth(); |
179 | switch (K) { |
180 | case TargetTransformInfo::RGK_Scalar: |
181 | return TypeSize::getFixed(ExactSize: ST->is64Bit() ? 64 : 32); |
182 | case TargetTransformInfo::RGK_FixedWidthVector: |
183 | if (ST->hasAVX512() && ST->hasEVEX512() && PreferVectorWidth >= 512) |
184 | return TypeSize::getFixed(ExactSize: 512); |
185 | if (ST->hasAVX() && PreferVectorWidth >= 256) |
186 | return TypeSize::getFixed(ExactSize: 256); |
187 | if (ST->hasSSE1() && PreferVectorWidth >= 128) |
188 | return TypeSize::getFixed(ExactSize: 128); |
189 | return TypeSize::getFixed(ExactSize: 0); |
190 | case TargetTransformInfo::RGK_ScalableVector: |
191 | return TypeSize::getScalable(MinimumSize: 0); |
192 | } |
193 | |
194 | llvm_unreachable("Unsupported register kind" ); |
195 | } |
196 | |
197 | unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { |
198 | return getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) |
199 | .getFixedValue(); |
200 | } |
201 | |
202 | unsigned X86TTIImpl::getMaxInterleaveFactor(ElementCount VF) { |
203 | // If the loop will not be vectorized, don't interleave the loop. |
204 | // Let regular unroll to unroll the loop, which saves the overflow |
205 | // check and memory check cost. |
206 | if (VF.isScalar()) |
207 | return 1; |
208 | |
209 | if (ST->isAtom()) |
210 | return 1; |
211 | |
212 | // Sandybridge and Haswell have multiple execution ports and pipelined |
213 | // vector units. |
214 | if (ST->hasAVX()) |
215 | return 4; |
216 | |
217 | return 2; |
218 | } |
219 | |
220 | InstructionCost X86TTIImpl::getArithmeticInstrCost( |
221 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
222 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
223 | ArrayRef<const Value *> Args, |
224 | const Instruction *CxtI) { |
225 | |
226 | // vXi8 multiplications are always promoted to vXi16. |
227 | // Sub-128-bit types can be extended/packed more efficiently. |
228 | if (Opcode == Instruction::Mul && Ty->isVectorTy() && |
229 | Ty->getPrimitiveSizeInBits() <= 64 && Ty->getScalarSizeInBits() == 8) { |
230 | Type *WideVecTy = |
231 | VectorType::getExtendedElementVectorType(VTy: cast<VectorType>(Val: Ty)); |
232 | return getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideVecTy, Src: Ty, |
233 | CCH: TargetTransformInfo::CastContextHint::None, |
234 | CostKind) + |
235 | getCastInstrCost(Opcode: Instruction::Trunc, Dst: Ty, Src: WideVecTy, |
236 | CCH: TargetTransformInfo::CastContextHint::None, |
237 | CostKind) + |
238 | getArithmeticInstrCost(Opcode, Ty: WideVecTy, CostKind, Op1Info, Op2Info); |
239 | } |
240 | |
241 | // Legalize the type. |
242 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
243 | |
244 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
245 | assert(ISD && "Invalid opcode" ); |
246 | |
247 | if (ISD == ISD::MUL && Args.size() == 2 && LT.second.isVector() && |
248 | (LT.second.getScalarType() == MVT::i32 || |
249 | LT.second.getScalarType() == MVT::i64)) { |
250 | // Check if the operands can be represented as a smaller datatype. |
251 | bool Op1Signed = false, Op2Signed = false; |
252 | unsigned Op1MinSize = BaseT::minRequiredElementSize(Val: Args[0], isSigned&: Op1Signed); |
253 | unsigned Op2MinSize = BaseT::minRequiredElementSize(Val: Args[1], isSigned&: Op2Signed); |
254 | unsigned OpMinSize = std::max(a: Op1MinSize, b: Op2MinSize); |
255 | bool SignedMode = Op1Signed || Op2Signed; |
256 | |
257 | // If both vXi32 are representable as i15 and at least one is constant, |
258 | // zero-extended, or sign-extended from vXi16 (or less pre-SSE41) then we |
259 | // can treat this as PMADDWD which has the same costs as a vXi16 multiply. |
260 | if (OpMinSize <= 15 && !ST->isPMADDWDSlow() && |
261 | LT.second.getScalarType() == MVT::i32) { |
262 | bool Op1Constant = |
263 | isa<ConstantDataVector>(Val: Args[0]) || isa<ConstantVector>(Val: Args[0]); |
264 | bool Op2Constant = |
265 | isa<ConstantDataVector>(Val: Args[1]) || isa<ConstantVector>(Val: Args[1]); |
266 | bool Op1Sext = isa<SExtInst>(Val: Args[0]) && |
267 | (Op1MinSize == 15 || (Op1MinSize < 15 && !ST->hasSSE41())); |
268 | bool Op2Sext = isa<SExtInst>(Val: Args[1]) && |
269 | (Op2MinSize == 15 || (Op2MinSize < 15 && !ST->hasSSE41())); |
270 | |
271 | bool IsZeroExtended = !Op1Signed || !Op2Signed; |
272 | bool IsConstant = Op1Constant || Op2Constant; |
273 | bool IsSext = Op1Sext || Op2Sext; |
274 | if (IsConstant || IsZeroExtended || IsSext) |
275 | LT.second = |
276 | MVT::getVectorVT(MVT::i16, 2 * LT.second.getVectorNumElements()); |
277 | } |
278 | |
279 | // Check if the vXi32 operands can be shrunk into a smaller datatype. |
280 | // This should match the codegen from reduceVMULWidth. |
281 | // TODO: Make this generic (!ST->SSE41 || ST->isPMULLDSlow()). |
282 | if (ST->useSLMArithCosts() && LT.second == MVT::v4i32) { |
283 | if (OpMinSize <= 7) |
284 | return LT.first * 3; // pmullw/sext |
285 | if (!SignedMode && OpMinSize <= 8) |
286 | return LT.first * 3; // pmullw/zext |
287 | if (OpMinSize <= 15) |
288 | return LT.first * 5; // pmullw/pmulhw/pshuf |
289 | if (!SignedMode && OpMinSize <= 16) |
290 | return LT.first * 5; // pmullw/pmulhw/pshuf |
291 | } |
292 | |
293 | // If both vXi64 are representable as (unsigned) i32, then we can perform |
294 | // the multiple with a single PMULUDQ instruction. |
295 | // TODO: Add (SSE41+) PMULDQ handling for signed extensions. |
296 | if (!SignedMode && OpMinSize <= 32 && LT.second.getScalarType() == MVT::i64) |
297 | ISD = X86ISD::PMULUDQ; |
298 | } |
299 | |
300 | // Vector multiply by pow2 will be simplified to shifts. |
301 | // Vector multiply by -pow2 will be simplified to shifts/negates. |
302 | if (ISD == ISD::MUL && Op2Info.isConstant() && |
303 | (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2())) { |
304 | InstructionCost Cost = |
305 | getArithmeticInstrCost(Opcode: Instruction::Shl, Ty, CostKind, |
306 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
307 | if (Op2Info.isNegatedPowerOf2()) |
308 | Cost += getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind); |
309 | return Cost; |
310 | } |
311 | |
312 | // On X86, vector signed division by constants power-of-two are |
313 | // normally expanded to the sequence SRA + SRL + ADD + SRA. |
314 | // The OperandValue properties may not be the same as that of the previous |
315 | // operation; conservatively assume OP_None. |
316 | if ((ISD == ISD::SDIV || ISD == ISD::SREM) && |
317 | Op2Info.isConstant() && Op2Info.isPowerOf2()) { |
318 | InstructionCost Cost = |
319 | 2 * getArithmeticInstrCost(Opcode: Instruction::AShr, Ty, CostKind, |
320 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
321 | Cost += getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind, |
322 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
323 | Cost += getArithmeticInstrCost(Opcode: Instruction::Add, Ty, CostKind, |
324 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
325 | |
326 | if (ISD == ISD::SREM) { |
327 | // For SREM: (X % C) is the equivalent of (X - (X/C)*C) |
328 | Cost += getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, Op1Info: Op1Info.getNoProps(), |
329 | Op2Info: Op2Info.getNoProps()); |
330 | Cost += getArithmeticInstrCost(Opcode: Instruction::Sub, Ty, CostKind, Op1Info: Op1Info.getNoProps(), |
331 | Op2Info: Op2Info.getNoProps()); |
332 | } |
333 | |
334 | return Cost; |
335 | } |
336 | |
337 | // Vector unsigned division/remainder will be simplified to shifts/masks. |
338 | if ((ISD == ISD::UDIV || ISD == ISD::UREM) && |
339 | Op2Info.isConstant() && Op2Info.isPowerOf2()) { |
340 | if (ISD == ISD::UDIV) |
341 | return getArithmeticInstrCost(Opcode: Instruction::LShr, Ty, CostKind, |
342 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
343 | // UREM |
344 | return getArithmeticInstrCost(Opcode: Instruction::And, Ty, CostKind, |
345 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
346 | } |
347 | |
348 | static const CostKindTblEntry AVX512BWUniformConstCostTable[] = { |
349 | { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand. |
350 | { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand. |
351 | { ISD::SRA, MVT::v16i8, { 1, 8, 4, 5 } }, // psrlw, pand, pxor, psubb. |
352 | { ISD::SHL, MVT::v32i8, { 1, 8, 2, 3 } }, // psllw + pand. |
353 | { ISD::SRL, MVT::v32i8, { 1, 8, 2, 3 } }, // psrlw + pand. |
354 | { ISD::SRA, MVT::v32i8, { 1, 9, 4, 5 } }, // psrlw, pand, pxor, psubb. |
355 | { ISD::SHL, MVT::v64i8, { 1, 8, 2, 3 } }, // psllw + pand. |
356 | { ISD::SRL, MVT::v64i8, { 1, 8, 2, 3 } }, // psrlw + pand. |
357 | { ISD::SRA, MVT::v64i8, { 1, 9, 4, 6 } }, // psrlw, pand, pxor, psubb. |
358 | |
359 | { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // psllw |
360 | { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw |
361 | { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // psrlw |
362 | { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // psllw |
363 | { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw |
364 | { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // psrlw |
365 | }; |
366 | |
367 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasBWI()) |
368 | if (const auto *Entry = |
369 | CostTableLookup(AVX512BWUniformConstCostTable, ISD, LT.second)) |
370 | if (auto KindCost = Entry->Cost[CostKind]) |
371 | return LT.first * *KindCost; |
372 | |
373 | static const CostKindTblEntry AVX512UniformConstCostTable[] = { |
374 | { ISD::SHL, MVT::v64i8, { 2, 12, 5, 6 } }, // psllw + pand. |
375 | { ISD::SRL, MVT::v64i8, { 2, 12, 5, 6 } }, // psrlw + pand. |
376 | { ISD::SRA, MVT::v64i8, { 3, 10, 12, 12 } }, // psrlw, pand, pxor, psubb. |
377 | |
378 | { ISD::SHL, MVT::v16i16, { 2, 7, 4, 4 } }, // psllw + split. |
379 | { ISD::SRL, MVT::v16i16, { 2, 7, 4, 4 } }, // psrlw + split. |
380 | { ISD::SRA, MVT::v16i16, { 2, 7, 4, 4 } }, // psraw + split. |
381 | |
382 | { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, // pslld |
383 | { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, // psrld |
384 | { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, // psrad |
385 | { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, // pslld |
386 | { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, // psrld |
387 | { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, // psrad |
388 | |
389 | { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, // psraq |
390 | { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, // psllq |
391 | { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, // psrlq |
392 | { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, // psraq |
393 | { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, // psllq |
394 | { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, // psrlq |
395 | { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, // psraq |
396 | |
397 | { ISD::SDIV, MVT::v16i32, { 6 } }, // pmuludq sequence |
398 | { ISD::SREM, MVT::v16i32, { 8 } }, // pmuludq+mul+sub sequence |
399 | { ISD::UDIV, MVT::v16i32, { 5 } }, // pmuludq sequence |
400 | { ISD::UREM, MVT::v16i32, { 7 } }, // pmuludq+mul+sub sequence |
401 | }; |
402 | |
403 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX512()) |
404 | if (const auto *Entry = |
405 | CostTableLookup(AVX512UniformConstCostTable, ISD, LT.second)) |
406 | if (auto KindCost = Entry->Cost[CostKind]) |
407 | return LT.first * *KindCost; |
408 | |
409 | static const CostKindTblEntry AVX2UniformConstCostTable[] = { |
410 | { ISD::SHL, MVT::v16i8, { 1, 8, 2, 3 } }, // psllw + pand. |
411 | { ISD::SRL, MVT::v16i8, { 1, 8, 2, 3 } }, // psrlw + pand. |
412 | { ISD::SRA, MVT::v16i8, { 2, 10, 5, 6 } }, // psrlw, pand, pxor, psubb. |
413 | { ISD::SHL, MVT::v32i8, { 2, 8, 2, 4 } }, // psllw + pand. |
414 | { ISD::SRL, MVT::v32i8, { 2, 8, 2, 4 } }, // psrlw + pand. |
415 | { ISD::SRA, MVT::v32i8, { 3, 10, 5, 9 } }, // psrlw, pand, pxor, psubb. |
416 | |
417 | { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw |
418 | { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw |
419 | { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw |
420 | { ISD::SHL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psllw |
421 | { ISD::SRL, MVT::v16i16,{ 2, 2, 1, 2 } }, // psrlw |
422 | { ISD::SRA, MVT::v16i16,{ 2, 2, 1, 2 } }, // psraw |
423 | |
424 | { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld |
425 | { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld |
426 | { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad |
427 | { ISD::SHL, MVT::v8i32, { 2, 2, 1, 2 } }, // pslld |
428 | { ISD::SRL, MVT::v8i32, { 2, 2, 1, 2 } }, // psrld |
429 | { ISD::SRA, MVT::v8i32, { 2, 2, 1, 2 } }, // psrad |
430 | |
431 | { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq |
432 | { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq |
433 | { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle. |
434 | { ISD::SHL, MVT::v4i64, { 2, 2, 1, 2 } }, // psllq |
435 | { ISD::SRL, MVT::v4i64, { 2, 2, 1, 2 } }, // psrlq |
436 | { ISD::SRA, MVT::v4i64, { 4, 4, 3, 6 } }, // psrad + shuffle + split. |
437 | |
438 | { ISD::SDIV, MVT::v8i32, { 6 } }, // pmuludq sequence |
439 | { ISD::SREM, MVT::v8i32, { 8 } }, // pmuludq+mul+sub sequence |
440 | { ISD::UDIV, MVT::v8i32, { 5 } }, // pmuludq sequence |
441 | { ISD::UREM, MVT::v8i32, { 7 } }, // pmuludq+mul+sub sequence |
442 | }; |
443 | |
444 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX2()) |
445 | if (const auto *Entry = |
446 | CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second)) |
447 | if (auto KindCost = Entry->Cost[CostKind]) |
448 | return LT.first * *KindCost; |
449 | |
450 | static const CostKindTblEntry AVXUniformConstCostTable[] = { |
451 | { ISD::SHL, MVT::v16i8, { 2, 7, 2, 3 } }, // psllw + pand. |
452 | { ISD::SRL, MVT::v16i8, { 2, 7, 2, 3 } }, // psrlw + pand. |
453 | { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb. |
454 | { ISD::SHL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psllw + pand) + split. |
455 | { ISD::SRL, MVT::v32i8, { 4, 7, 7, 8 } }, // 2*(psrlw + pand) + split. |
456 | { ISD::SRA, MVT::v32i8, { 7, 7, 12, 13 } }, // 2*(psrlw, pand, pxor, psubb) + split. |
457 | |
458 | { ISD::SHL, MVT::v8i16, { 1, 2, 1, 1 } }, // psllw. |
459 | { ISD::SRL, MVT::v8i16, { 1, 2, 1, 1 } }, // psrlw. |
460 | { ISD::SRA, MVT::v8i16, { 1, 2, 1, 1 } }, // psraw. |
461 | { ISD::SHL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psllw + split. |
462 | { ISD::SRL, MVT::v16i16,{ 3, 6, 4, 5 } }, // psrlw + split. |
463 | { ISD::SRA, MVT::v16i16,{ 3, 6, 4, 5 } }, // psraw + split. |
464 | |
465 | { ISD::SHL, MVT::v4i32, { 1, 2, 1, 1 } }, // pslld. |
466 | { ISD::SRL, MVT::v4i32, { 1, 2, 1, 1 } }, // psrld. |
467 | { ISD::SRA, MVT::v4i32, { 1, 2, 1, 1 } }, // psrad. |
468 | { ISD::SHL, MVT::v8i32, { 3, 6, 4, 5 } }, // pslld + split. |
469 | { ISD::SRL, MVT::v8i32, { 3, 6, 4, 5 } }, // psrld + split. |
470 | { ISD::SRA, MVT::v8i32, { 3, 6, 4, 5 } }, // psrad + split. |
471 | |
472 | { ISD::SHL, MVT::v2i64, { 1, 2, 1, 1 } }, // psllq. |
473 | { ISD::SRL, MVT::v2i64, { 1, 2, 1, 1 } }, // psrlq. |
474 | { ISD::SRA, MVT::v2i64, { 2, 3, 3, 3 } }, // psrad + shuffle. |
475 | { ISD::SHL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split. |
476 | { ISD::SRL, MVT::v4i64, { 3, 6, 4, 5 } }, // 2 x psllq + split. |
477 | { ISD::SRA, MVT::v4i64, { 5, 7, 8, 9 } }, // 2 x psrad + shuffle + split. |
478 | |
479 | { ISD::SDIV, MVT::v8i32, { 14 } }, // 2*pmuludq sequence + split. |
480 | { ISD::SREM, MVT::v8i32, { 18 } }, // 2*pmuludq+mul+sub sequence + split. |
481 | { ISD::UDIV, MVT::v8i32, { 12 } }, // 2*pmuludq sequence + split. |
482 | { ISD::UREM, MVT::v8i32, { 16 } }, // 2*pmuludq+mul+sub sequence + split. |
483 | }; |
484 | |
485 | // XOP has faster vXi8 shifts. |
486 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasAVX() && |
487 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
488 | if (const auto *Entry = |
489 | CostTableLookup(AVXUniformConstCostTable, ISD, LT.second)) |
490 | if (auto KindCost = Entry->Cost[CostKind]) |
491 | return LT.first * *KindCost; |
492 | |
493 | static const CostKindTblEntry SSE2UniformConstCostTable[] = { |
494 | { ISD::SHL, MVT::v16i8, { 1, 7, 2, 3 } }, // psllw + pand. |
495 | { ISD::SRL, MVT::v16i8, { 1, 7, 2, 3 } }, // psrlw + pand. |
496 | { ISD::SRA, MVT::v16i8, { 3, 9, 5, 6 } }, // psrlw, pand, pxor, psubb. |
497 | |
498 | { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // psllw. |
499 | { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // psrlw. |
500 | { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // psraw. |
501 | |
502 | { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, // pslld |
503 | { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, // psrld. |
504 | { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, // psrad. |
505 | |
506 | { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, // psllq. |
507 | { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, // psrlq. |
508 | { ISD::SRA, MVT::v2i64, { 3, 5, 6, 6 } }, // 2 x psrad + shuffle. |
509 | |
510 | { ISD::SDIV, MVT::v4i32, { 6 } }, // pmuludq sequence |
511 | { ISD::SREM, MVT::v4i32, { 8 } }, // pmuludq+mul+sub sequence |
512 | { ISD::UDIV, MVT::v4i32, { 5 } }, // pmuludq sequence |
513 | { ISD::UREM, MVT::v4i32, { 7 } }, // pmuludq+mul+sub sequence |
514 | }; |
515 | |
516 | // XOP has faster vXi8 shifts. |
517 | if (Op2Info.isUniform() && Op2Info.isConstant() && ST->hasSSE2() && |
518 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
519 | if (const auto *Entry = |
520 | CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) |
521 | if (auto KindCost = Entry->Cost[CostKind]) |
522 | return LT.first * *KindCost; |
523 | |
524 | static const CostKindTblEntry AVX512BWConstCostTable[] = { |
525 | { ISD::SDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence |
526 | { ISD::SREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
527 | { ISD::UDIV, MVT::v64i8, { 14 } }, // 2*ext+2*pmulhw sequence |
528 | { ISD::UREM, MVT::v64i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
529 | |
530 | { ISD::SDIV, MVT::v32i16, { 6 } }, // vpmulhw sequence |
531 | { ISD::SREM, MVT::v32i16, { 8 } }, // vpmulhw+mul+sub sequence |
532 | { ISD::UDIV, MVT::v32i16, { 6 } }, // vpmulhuw sequence |
533 | { ISD::UREM, MVT::v32i16, { 8 } }, // vpmulhuw+mul+sub sequence |
534 | }; |
535 | |
536 | if (Op2Info.isConstant() && ST->hasBWI()) |
537 | if (const auto *Entry = |
538 | CostTableLookup(AVX512BWConstCostTable, ISD, LT.second)) |
539 | if (auto KindCost = Entry->Cost[CostKind]) |
540 | return LT.first * *KindCost; |
541 | |
542 | static const CostKindTblEntry AVX512ConstCostTable[] = { |
543 | { ISD::SDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence |
544 | { ISD::SREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence |
545 | { ISD::UDIV, MVT::v64i8, { 28 } }, // 4*ext+4*pmulhw sequence |
546 | { ISD::UREM, MVT::v64i8, { 32 } }, // 4*ext+4*pmulhw+mul+sub sequence |
547 | |
548 | { ISD::SDIV, MVT::v32i16, { 12 } }, // 2*vpmulhw sequence |
549 | { ISD::SREM, MVT::v32i16, { 16 } }, // 2*vpmulhw+mul+sub sequence |
550 | { ISD::UDIV, MVT::v32i16, { 12 } }, // 2*vpmulhuw sequence |
551 | { ISD::UREM, MVT::v32i16, { 16 } }, // 2*vpmulhuw+mul+sub sequence |
552 | |
553 | { ISD::SDIV, MVT::v16i32, { 15 } }, // vpmuldq sequence |
554 | { ISD::SREM, MVT::v16i32, { 17 } }, // vpmuldq+mul+sub sequence |
555 | { ISD::UDIV, MVT::v16i32, { 15 } }, // vpmuludq sequence |
556 | { ISD::UREM, MVT::v16i32, { 17 } }, // vpmuludq+mul+sub sequence |
557 | }; |
558 | |
559 | if (Op2Info.isConstant() && ST->hasAVX512()) |
560 | if (const auto *Entry = |
561 | CostTableLookup(AVX512ConstCostTable, ISD, LT.second)) |
562 | if (auto KindCost = Entry->Cost[CostKind]) |
563 | return LT.first * *KindCost; |
564 | |
565 | static const CostKindTblEntry AVX2ConstCostTable[] = { |
566 | { ISD::SDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence |
567 | { ISD::SREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
568 | { ISD::UDIV, MVT::v32i8, { 14 } }, // 2*ext+2*pmulhw sequence |
569 | { ISD::UREM, MVT::v32i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
570 | |
571 | { ISD::SDIV, MVT::v16i16, { 6 } }, // vpmulhw sequence |
572 | { ISD::SREM, MVT::v16i16, { 8 } }, // vpmulhw+mul+sub sequence |
573 | { ISD::UDIV, MVT::v16i16, { 6 } }, // vpmulhuw sequence |
574 | { ISD::UREM, MVT::v16i16, { 8 } }, // vpmulhuw+mul+sub sequence |
575 | |
576 | { ISD::SDIV, MVT::v8i32, { 15 } }, // vpmuldq sequence |
577 | { ISD::SREM, MVT::v8i32, { 19 } }, // vpmuldq+mul+sub sequence |
578 | { ISD::UDIV, MVT::v8i32, { 15 } }, // vpmuludq sequence |
579 | { ISD::UREM, MVT::v8i32, { 19 } }, // vpmuludq+mul+sub sequence |
580 | }; |
581 | |
582 | if (Op2Info.isConstant() && ST->hasAVX2()) |
583 | if (const auto *Entry = CostTableLookup(AVX2ConstCostTable, ISD, LT.second)) |
584 | if (auto KindCost = Entry->Cost[CostKind]) |
585 | return LT.first * *KindCost; |
586 | |
587 | static const CostKindTblEntry AVXConstCostTable[] = { |
588 | { ISD::SDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split. |
589 | { ISD::SREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. |
590 | { ISD::UDIV, MVT::v32i8, { 30 } }, // 4*ext+4*pmulhw sequence + split. |
591 | { ISD::UREM, MVT::v32i8, { 34 } }, // 4*ext+4*pmulhw+mul+sub sequence + split. |
592 | |
593 | { ISD::SDIV, MVT::v16i16, { 14 } }, // 2*pmulhw sequence + split. |
594 | { ISD::SREM, MVT::v16i16, { 18 } }, // 2*pmulhw+mul+sub sequence + split. |
595 | { ISD::UDIV, MVT::v16i16, { 14 } }, // 2*pmulhuw sequence + split. |
596 | { ISD::UREM, MVT::v16i16, { 18 } }, // 2*pmulhuw+mul+sub sequence + split. |
597 | |
598 | { ISD::SDIV, MVT::v8i32, { 32 } }, // vpmuludq sequence |
599 | { ISD::SREM, MVT::v8i32, { 38 } }, // vpmuludq+mul+sub sequence |
600 | { ISD::UDIV, MVT::v8i32, { 32 } }, // 2*pmuludq sequence + split. |
601 | { ISD::UREM, MVT::v8i32, { 42 } }, // 2*pmuludq+mul+sub sequence + split. |
602 | }; |
603 | |
604 | if (Op2Info.isConstant() && ST->hasAVX()) |
605 | if (const auto *Entry = CostTableLookup(AVXConstCostTable, ISD, LT.second)) |
606 | if (auto KindCost = Entry->Cost[CostKind]) |
607 | return LT.first * *KindCost; |
608 | |
609 | static const CostKindTblEntry SSE41ConstCostTable[] = { |
610 | { ISD::SDIV, MVT::v4i32, { 15 } }, // vpmuludq sequence |
611 | { ISD::SREM, MVT::v4i32, { 20 } }, // vpmuludq+mul+sub sequence |
612 | }; |
613 | |
614 | if (Op2Info.isConstant() && ST->hasSSE41()) |
615 | if (const auto *Entry = |
616 | CostTableLookup(SSE41ConstCostTable, ISD, LT.second)) |
617 | if (auto KindCost = Entry->Cost[CostKind]) |
618 | return LT.first * *KindCost; |
619 | |
620 | static const CostKindTblEntry SSE2ConstCostTable[] = { |
621 | { ISD::SDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence |
622 | { ISD::SREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
623 | { ISD::UDIV, MVT::v16i8, { 14 } }, // 2*ext+2*pmulhw sequence |
624 | { ISD::UREM, MVT::v16i8, { 16 } }, // 2*ext+2*pmulhw+mul+sub sequence |
625 | |
626 | { ISD::SDIV, MVT::v8i16, { 6 } }, // pmulhw sequence |
627 | { ISD::SREM, MVT::v8i16, { 8 } }, // pmulhw+mul+sub sequence |
628 | { ISD::UDIV, MVT::v8i16, { 6 } }, // pmulhuw sequence |
629 | { ISD::UREM, MVT::v8i16, { 8 } }, // pmulhuw+mul+sub sequence |
630 | |
631 | { ISD::SDIV, MVT::v4i32, { 19 } }, // pmuludq sequence |
632 | { ISD::SREM, MVT::v4i32, { 24 } }, // pmuludq+mul+sub sequence |
633 | { ISD::UDIV, MVT::v4i32, { 15 } }, // pmuludq sequence |
634 | { ISD::UREM, MVT::v4i32, { 20 } }, // pmuludq+mul+sub sequence |
635 | }; |
636 | |
637 | if (Op2Info.isConstant() && ST->hasSSE2()) |
638 | if (const auto *Entry = CostTableLookup(SSE2ConstCostTable, ISD, LT.second)) |
639 | if (auto KindCost = Entry->Cost[CostKind]) |
640 | return LT.first * *KindCost; |
641 | |
642 | static const CostKindTblEntry AVX512BWUniformCostTable[] = { |
643 | { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand. |
644 | { ISD::SRL, MVT::v16i8, { 3,10, 5, 8 } }, // psrlw + pand. |
645 | { ISD::SRA, MVT::v16i8, { 4,12, 8,12 } }, // psrlw, pand, pxor, psubb. |
646 | { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand. |
647 | { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand. |
648 | { ISD::SRA, MVT::v32i8, { 5,10,10,13 } }, // psrlw, pand, pxor, psubb. |
649 | { ISD::SHL, MVT::v64i8, { 4, 7, 6, 8 } }, // psllw + pand. |
650 | { ISD::SRL, MVT::v64i8, { 4, 8, 7,10 } }, // psrlw + pand. |
651 | { ISD::SRA, MVT::v64i8, { 5,10,10,15 } }, // psrlw, pand, pxor, psubb. |
652 | |
653 | { ISD::SHL, MVT::v32i16, { 2, 4, 2, 3 } }, // psllw |
654 | { ISD::SRL, MVT::v32i16, { 2, 4, 2, 3 } }, // psrlw |
655 | { ISD::SRA, MVT::v32i16, { 2, 4, 2, 3 } }, // psrqw |
656 | }; |
657 | |
658 | if (ST->hasBWI() && Op2Info.isUniform()) |
659 | if (const auto *Entry = |
660 | CostTableLookup(AVX512BWUniformCostTable, ISD, LT.second)) |
661 | if (auto KindCost = Entry->Cost[CostKind]) |
662 | return LT.first * *KindCost; |
663 | |
664 | static const CostKindTblEntry AVX512UniformCostTable[] = { |
665 | { ISD::SHL, MVT::v32i16, { 5,10, 5, 7 } }, // psllw + split. |
666 | { ISD::SRL, MVT::v32i16, { 5,10, 5, 7 } }, // psrlw + split. |
667 | { ISD::SRA, MVT::v32i16, { 5,10, 5, 7 } }, // psraw + split. |
668 | |
669 | { ISD::SHL, MVT::v16i32, { 2, 4, 2, 3 } }, // pslld |
670 | { ISD::SRL, MVT::v16i32, { 2, 4, 2, 3 } }, // psrld |
671 | { ISD::SRA, MVT::v16i32, { 2, 4, 2, 3 } }, // psrad |
672 | |
673 | { ISD::SRA, MVT::v2i64, { 1, 2, 1, 2 } }, // psraq |
674 | { ISD::SHL, MVT::v4i64, { 1, 4, 1, 2 } }, // psllq |
675 | { ISD::SRL, MVT::v4i64, { 1, 4, 1, 2 } }, // psrlq |
676 | { ISD::SRA, MVT::v4i64, { 1, 4, 1, 2 } }, // psraq |
677 | { ISD::SHL, MVT::v8i64, { 1, 4, 1, 2 } }, // psllq |
678 | { ISD::SRL, MVT::v8i64, { 1, 4, 1, 2 } }, // psrlq |
679 | { ISD::SRA, MVT::v8i64, { 1, 4, 1, 2 } }, // psraq |
680 | }; |
681 | |
682 | if (ST->hasAVX512() && Op2Info.isUniform()) |
683 | if (const auto *Entry = |
684 | CostTableLookup(AVX512UniformCostTable, ISD, LT.second)) |
685 | if (auto KindCost = Entry->Cost[CostKind]) |
686 | return LT.first * *KindCost; |
687 | |
688 | static const CostKindTblEntry AVX2UniformCostTable[] = { |
689 | // Uniform splats are cheaper for the following instructions. |
690 | { ISD::SHL, MVT::v16i8, { 3, 5, 5, 7 } }, // psllw + pand. |
691 | { ISD::SRL, MVT::v16i8, { 3, 9, 5, 8 } }, // psrlw + pand. |
692 | { ISD::SRA, MVT::v16i8, { 4, 5, 9,13 } }, // psrlw, pand, pxor, psubb. |
693 | { ISD::SHL, MVT::v32i8, { 4, 7, 6, 8 } }, // psllw + pand. |
694 | { ISD::SRL, MVT::v32i8, { 4, 8, 7, 9 } }, // psrlw + pand. |
695 | { ISD::SRA, MVT::v32i8, { 6, 9,11,16 } }, // psrlw, pand, pxor, psubb. |
696 | |
697 | { ISD::SHL, MVT::v8i16, { 1, 2, 1, 2 } }, // psllw. |
698 | { ISD::SRL, MVT::v8i16, { 1, 2, 1, 2 } }, // psrlw. |
699 | { ISD::SRA, MVT::v8i16, { 1, 2, 1, 2 } }, // psraw. |
700 | { ISD::SHL, MVT::v16i16, { 2, 4, 2, 3 } }, // psllw. |
701 | { ISD::SRL, MVT::v16i16, { 2, 4, 2, 3 } }, // psrlw. |
702 | { ISD::SRA, MVT::v16i16, { 2, 4, 2, 3 } }, // psraw. |
703 | |
704 | { ISD::SHL, MVT::v4i32, { 1, 2, 1, 2 } }, // pslld |
705 | { ISD::SRL, MVT::v4i32, { 1, 2, 1, 2 } }, // psrld |
706 | { ISD::SRA, MVT::v4i32, { 1, 2, 1, 2 } }, // psrad |
707 | { ISD::SHL, MVT::v8i32, { 2, 4, 2, 3 } }, // pslld |
708 | { ISD::SRL, MVT::v8i32, { 2, 4, 2, 3 } }, // psrld |
709 | { ISD::SRA, MVT::v8i32, { 2, 4, 2, 3 } }, // psrad |
710 | |
711 | { ISD::SHL, MVT::v2i64, { 1, 2, 1, 2 } }, // psllq |
712 | { ISD::SRL, MVT::v2i64, { 1, 2, 1, 2 } }, // psrlq |
713 | { ISD::SRA, MVT::v2i64, { 2, 4, 5, 7 } }, // 2 x psrad + shuffle. |
714 | { ISD::SHL, MVT::v4i64, { 2, 4, 1, 2 } }, // psllq |
715 | { ISD::SRL, MVT::v4i64, { 2, 4, 1, 2 } }, // psrlq |
716 | { ISD::SRA, MVT::v4i64, { 4, 6, 5, 9 } }, // 2 x psrad + shuffle. |
717 | }; |
718 | |
719 | if (ST->hasAVX2() && Op2Info.isUniform()) |
720 | if (const auto *Entry = |
721 | CostTableLookup(AVX2UniformCostTable, ISD, LT.second)) |
722 | if (auto KindCost = Entry->Cost[CostKind]) |
723 | return LT.first * *KindCost; |
724 | |
725 | static const CostKindTblEntry AVXUniformCostTable[] = { |
726 | { ISD::SHL, MVT::v16i8, { 4, 4, 6, 8 } }, // psllw + pand. |
727 | { ISD::SRL, MVT::v16i8, { 4, 8, 5, 8 } }, // psrlw + pand. |
728 | { ISD::SRA, MVT::v16i8, { 6, 6, 9,13 } }, // psrlw, pand, pxor, psubb. |
729 | { ISD::SHL, MVT::v32i8, { 7, 8,11,14 } }, // psllw + pand + split. |
730 | { ISD::SRL, MVT::v32i8, { 7, 9,10,14 } }, // psrlw + pand + split. |
731 | { ISD::SRA, MVT::v32i8, { 10,11,16,21 } }, // psrlw, pand, pxor, psubb + split. |
732 | |
733 | { ISD::SHL, MVT::v8i16, { 1, 3, 1, 2 } }, // psllw. |
734 | { ISD::SRL, MVT::v8i16, { 1, 3, 1, 2 } }, // psrlw. |
735 | { ISD::SRA, MVT::v8i16, { 1, 3, 1, 2 } }, // psraw. |
736 | { ISD::SHL, MVT::v16i16, { 3, 7, 5, 7 } }, // psllw + split. |
737 | { ISD::SRL, MVT::v16i16, { 3, 7, 5, 7 } }, // psrlw + split. |
738 | { ISD::SRA, MVT::v16i16, { 3, 7, 5, 7 } }, // psraw + split. |
739 | |
740 | { ISD::SHL, MVT::v4i32, { 1, 3, 1, 2 } }, // pslld. |
741 | { ISD::SRL, MVT::v4i32, { 1, 3, 1, 2 } }, // psrld. |
742 | { ISD::SRA, MVT::v4i32, { 1, 3, 1, 2 } }, // psrad. |
743 | { ISD::SHL, MVT::v8i32, { 3, 7, 5, 7 } }, // pslld + split. |
744 | { ISD::SRL, MVT::v8i32, { 3, 7, 5, 7 } }, // psrld + split. |
745 | { ISD::SRA, MVT::v8i32, { 3, 7, 5, 7 } }, // psrad + split. |
746 | |
747 | { ISD::SHL, MVT::v2i64, { 1, 3, 1, 2 } }, // psllq. |
748 | { ISD::SRL, MVT::v2i64, { 1, 3, 1, 2 } }, // psrlq. |
749 | { ISD::SRA, MVT::v2i64, { 3, 4, 5, 7 } }, // 2 x psrad + shuffle. |
750 | { ISD::SHL, MVT::v4i64, { 3, 7, 4, 6 } }, // psllq + split. |
751 | { ISD::SRL, MVT::v4i64, { 3, 7, 4, 6 } }, // psrlq + split. |
752 | { ISD::SRA, MVT::v4i64, { 6, 7,10,13 } }, // 2 x (2 x psrad + shuffle) + split. |
753 | }; |
754 | |
755 | // XOP has faster vXi8 shifts. |
756 | if (ST->hasAVX() && Op2Info.isUniform() && |
757 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
758 | if (const auto *Entry = |
759 | CostTableLookup(AVXUniformCostTable, ISD, LT.second)) |
760 | if (auto KindCost = Entry->Cost[CostKind]) |
761 | return LT.first * *KindCost; |
762 | |
763 | static const CostKindTblEntry SSE2UniformCostTable[] = { |
764 | // Uniform splats are cheaper for the following instructions. |
765 | { ISD::SHL, MVT::v16i8, { 9, 10, 6, 9 } }, // psllw + pand. |
766 | { ISD::SRL, MVT::v16i8, { 9, 13, 5, 9 } }, // psrlw + pand. |
767 | { ISD::SRA, MVT::v16i8, { 11, 15, 9,13 } }, // pcmpgtb sequence. |
768 | |
769 | { ISD::SHL, MVT::v8i16, { 2, 2, 1, 2 } }, // psllw. |
770 | { ISD::SRL, MVT::v8i16, { 2, 2, 1, 2 } }, // psrlw. |
771 | { ISD::SRA, MVT::v8i16, { 2, 2, 1, 2 } }, // psraw. |
772 | |
773 | { ISD::SHL, MVT::v4i32, { 2, 2, 1, 2 } }, // pslld |
774 | { ISD::SRL, MVT::v4i32, { 2, 2, 1, 2 } }, // psrld. |
775 | { ISD::SRA, MVT::v4i32, { 2, 2, 1, 2 } }, // psrad. |
776 | |
777 | { ISD::SHL, MVT::v2i64, { 2, 2, 1, 2 } }, // psllq. |
778 | { ISD::SRL, MVT::v2i64, { 2, 2, 1, 2 } }, // psrlq. |
779 | { ISD::SRA, MVT::v2i64, { 5, 9, 5, 7 } }, // 2*psrlq + xor + sub. |
780 | }; |
781 | |
782 | if (ST->hasSSE2() && Op2Info.isUniform() && |
783 | (!ST->hasXOP() || LT.second.getScalarSizeInBits() != 8)) |
784 | if (const auto *Entry = |
785 | CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) |
786 | if (auto KindCost = Entry->Cost[CostKind]) |
787 | return LT.first * *KindCost; |
788 | |
789 | static const CostKindTblEntry AVX512DQCostTable[] = { |
790 | { ISD::MUL, MVT::v2i64, { 2, 15, 1, 3 } }, // pmullq |
791 | { ISD::MUL, MVT::v4i64, { 2, 15, 1, 3 } }, // pmullq |
792 | { ISD::MUL, MVT::v8i64, { 3, 15, 1, 3 } } // pmullq |
793 | }; |
794 | |
795 | // Look for AVX512DQ lowering tricks for custom cases. |
796 | if (ST->hasDQI()) |
797 | if (const auto *Entry = CostTableLookup(AVX512DQCostTable, ISD, LT.second)) |
798 | if (auto KindCost = Entry->Cost[CostKind]) |
799 | return LT.first * *KindCost; |
800 | |
801 | static const CostKindTblEntry AVX512BWCostTable[] = { |
802 | { ISD::SHL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsllvw/pack sequence. |
803 | { ISD::SRL, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsrlvw/pack sequence. |
804 | { ISD::SRA, MVT::v16i8, { 4, 8, 4, 5 } }, // extend/vpsravw/pack sequence. |
805 | { ISD::SHL, MVT::v32i8, { 4, 23,11,16 } }, // extend/vpsllvw/pack sequence. |
806 | { ISD::SRL, MVT::v32i8, { 4, 30,12,18 } }, // extend/vpsrlvw/pack sequence. |
807 | { ISD::SRA, MVT::v32i8, { 6, 13,24,30 } }, // extend/vpsravw/pack sequence. |
808 | { ISD::SHL, MVT::v64i8, { 6, 19,13,15 } }, // extend/vpsllvw/pack sequence. |
809 | { ISD::SRL, MVT::v64i8, { 7, 27,15,18 } }, // extend/vpsrlvw/pack sequence. |
810 | { ISD::SRA, MVT::v64i8, { 15, 15,30,30 } }, // extend/vpsravw/pack sequence. |
811 | |
812 | { ISD::SHL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsllvw |
813 | { ISD::SRL, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsrlvw |
814 | { ISD::SRA, MVT::v8i16, { 1, 1, 1, 1 } }, // vpsravw |
815 | { ISD::SHL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsllvw |
816 | { ISD::SRL, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsrlvw |
817 | { ISD::SRA, MVT::v16i16, { 1, 1, 1, 1 } }, // vpsravw |
818 | { ISD::SHL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsllvw |
819 | { ISD::SRL, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsrlvw |
820 | { ISD::SRA, MVT::v32i16, { 1, 1, 1, 1 } }, // vpsravw |
821 | |
822 | { ISD::ADD, MVT::v64i8, { 1, 1, 1, 1 } }, // paddb |
823 | { ISD::ADD, MVT::v32i16, { 1, 1, 1, 1 } }, // paddw |
824 | |
825 | { ISD::ADD, MVT::v32i8, { 1, 1, 1, 1 } }, // paddb |
826 | { ISD::ADD, MVT::v16i16, { 1, 1, 1, 1 } }, // paddw |
827 | { ISD::ADD, MVT::v8i32, { 1, 1, 1, 1 } }, // paddd |
828 | { ISD::ADD, MVT::v4i64, { 1, 1, 1, 1 } }, // paddq |
829 | |
830 | { ISD::SUB, MVT::v64i8, { 1, 1, 1, 1 } }, // psubb |
831 | { ISD::SUB, MVT::v32i16, { 1, 1, 1, 1 } }, // psubw |
832 | |
833 | { ISD::MUL, MVT::v64i8, { 5, 10,10,11 } }, |
834 | { ISD::MUL, MVT::v32i16, { 1, 5, 1, 1 } }, // pmullw |
835 | |
836 | { ISD::SUB, MVT::v32i8, { 1, 1, 1, 1 } }, // psubb |
837 | { ISD::SUB, MVT::v16i16, { 1, 1, 1, 1 } }, // psubw |
838 | { ISD::SUB, MVT::v8i32, { 1, 1, 1, 1 } }, // psubd |
839 | { ISD::SUB, MVT::v4i64, { 1, 1, 1, 1 } }, // psubq |
840 | }; |
841 | |
842 | // Look for AVX512BW lowering tricks for custom cases. |
843 | if (ST->hasBWI()) |
844 | if (const auto *Entry = CostTableLookup(AVX512BWCostTable, ISD, LT.second)) |
845 | if (auto KindCost = Entry->Cost[CostKind]) |
846 | return LT.first * *KindCost; |
847 | |
848 | static const CostKindTblEntry AVX512CostTable[] = { |
849 | { ISD::SHL, MVT::v64i8, { 15, 19,27,33 } }, // vpblendv+split sequence. |
850 | { ISD::SRL, MVT::v64i8, { 15, 19,30,36 } }, // vpblendv+split sequence. |
851 | { ISD::SRA, MVT::v64i8, { 37, 37,51,63 } }, // vpblendv+split sequence. |
852 | |
853 | { ISD::SHL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence. |
854 | { ISD::SRL, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsrlvd/pack sequence. |
855 | { ISD::SRA, MVT::v32i16, { 11, 16,11,15 } }, // 2*extend/vpsravd/pack sequence. |
856 | |
857 | { ISD::SHL, MVT::v4i32, { 1, 1, 1, 1 } }, |
858 | { ISD::SRL, MVT::v4i32, { 1, 1, 1, 1 } }, |
859 | { ISD::SRA, MVT::v4i32, { 1, 1, 1, 1 } }, |
860 | { ISD::SHL, MVT::v8i32, { 1, 1, 1, 1 } }, |
861 | { ISD::SRL, MVT::v8i32, { 1, 1, 1, 1 } }, |
862 | { ISD::SRA, MVT::v8i32, { 1, 1, 1, 1 } }, |
863 | { ISD::SHL, MVT::v16i32, { 1, 1, 1, 1 } }, |
864 | { ISD::SRL, MVT::v16i32, { 1, 1, 1, 1 } }, |
865 | { ISD::SRA, MVT::v16i32, { 1, 1, 1, 1 } }, |
866 | |
867 | { ISD::SHL, MVT::v2i64, { 1, 1, 1, 1 } }, |
868 | { ISD::SRL, MVT::v2i64, { 1, 1, 1, 1 } }, |
869 | { ISD::SRA, MVT::v2i64, { 1, 1, 1, 1 } }, |
870 | { ISD::SHL, MVT::v4i64, { 1, 1, 1, 1 } }, |
871 | { ISD::SRL, MVT::v4i64, { 1, 1, 1, 1 } }, |
872 | { ISD::SRA, MVT::v4i64, { 1, 1, 1, 1 } }, |
873 | { ISD::SHL, MVT::v8i64, { 1, 1, 1, 1 } }, |
874 | { ISD::SRL, MVT::v8i64, { 1, 1, 1, 1 } }, |
875 | { ISD::SRA, MVT::v8i64, { 1, 1, 1, 1 } }, |
876 | |
877 | { ISD::ADD, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*paddb + split |
878 | { ISD::ADD, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*paddw + split |
879 | |
880 | { ISD::SUB, MVT::v64i8, { 3, 7, 5, 5 } }, // 2*psubb + split |
881 | { ISD::SUB, MVT::v32i16, { 3, 7, 5, 5 } }, // 2*psubw + split |
882 | |
883 | { ISD::AND, MVT::v32i8, { 1, 1, 1, 1 } }, |
884 | { ISD::AND, MVT::v16i16, { 1, 1, 1, 1 } }, |
885 | { ISD::AND, MVT::v8i32, { 1, 1, 1, 1 } }, |
886 | { ISD::AND, MVT::v4i64, { 1, 1, 1, 1 } }, |
887 | |
888 | { ISD::OR, MVT::v32i8, { 1, 1, 1, 1 } }, |
889 | { ISD::OR, MVT::v16i16, { 1, 1, 1, 1 } }, |
890 | { ISD::OR, MVT::v8i32, { 1, 1, 1, 1 } }, |
891 | { ISD::OR, MVT::v4i64, { 1, 1, 1, 1 } }, |
892 | |
893 | { ISD::XOR, MVT::v32i8, { 1, 1, 1, 1 } }, |
894 | { ISD::XOR, MVT::v16i16, { 1, 1, 1, 1 } }, |
895 | { ISD::XOR, MVT::v8i32, { 1, 1, 1, 1 } }, |
896 | { ISD::XOR, MVT::v4i64, { 1, 1, 1, 1 } }, |
897 | |
898 | { ISD::MUL, MVT::v16i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) |
899 | { ISD::MUL, MVT::v8i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) |
900 | { ISD::MUL, MVT::v4i32, { 1, 10, 1, 2 } }, // pmulld (Skylake from agner.org) |
901 | { ISD::MUL, MVT::v8i64, { 6, 9, 8, 8 } }, // 3*pmuludq/3*shift/2*add |
902 | { ISD::MUL, MVT::i64, { 1 } }, // Skylake from http://www.agner.org/ |
903 | |
904 | { X86ISD::PMULUDQ, MVT::v8i64, { 1, 5, 1, 1 } }, |
905 | |
906 | { ISD::FNEG, MVT::v8f64, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/ |
907 | { ISD::FADD, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
908 | { ISD::FADD, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
909 | { ISD::FSUB, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
910 | { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
911 | { ISD::FMUL, MVT::v8f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
912 | { ISD::FMUL, MVT::v4f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
913 | { ISD::FMUL, MVT::v2f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
914 | { ISD::FMUL, MVT::f64, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
915 | |
916 | { ISD::FDIV, MVT::f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/ |
917 | { ISD::FDIV, MVT::v2f64, { 4, 14, 1, 1 } }, // Skylake from http://www.agner.org/ |
918 | { ISD::FDIV, MVT::v4f64, { 8, 14, 1, 1 } }, // Skylake from http://www.agner.org/ |
919 | { ISD::FDIV, MVT::v8f64, { 16, 23, 1, 3 } }, // Skylake from http://www.agner.org/ |
920 | |
921 | { ISD::FNEG, MVT::v16f32, { 1, 1, 1, 2 } }, // Skylake from http://www.agner.org/ |
922 | { ISD::FADD, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
923 | { ISD::FADD, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
924 | { ISD::FSUB, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
925 | { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
926 | { ISD::FMUL, MVT::v16f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
927 | { ISD::FMUL, MVT::v8f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
928 | { ISD::FMUL, MVT::v4f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
929 | { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // Skylake from http://www.agner.org/ |
930 | |
931 | { ISD::FDIV, MVT::f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/ |
932 | { ISD::FDIV, MVT::v4f32, { 3, 11, 1, 1 } }, // Skylake from http://www.agner.org/ |
933 | { ISD::FDIV, MVT::v8f32, { 5, 11, 1, 1 } }, // Skylake from http://www.agner.org/ |
934 | { ISD::FDIV, MVT::v16f32, { 10, 18, 1, 3 } }, // Skylake from http://www.agner.org/ |
935 | }; |
936 | |
937 | if (ST->hasAVX512()) |
938 | if (const auto *Entry = CostTableLookup(AVX512CostTable, ISD, LT.second)) |
939 | if (auto KindCost = Entry->Cost[CostKind]) |
940 | return LT.first * *KindCost; |
941 | |
942 | static const CostKindTblEntry AVX2ShiftCostTable[] = { |
    // Shifts on vXi64/vXi32 on AVX2 are legal even though we declare them as
    // Custom, so we can detect the cases where the shift amount is a scalar one.
945 | { ISD::SHL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsllvd (Haswell from agner.org) |
946 | { ISD::SRL, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsrlvd (Haswell from agner.org) |
947 | { ISD::SRA, MVT::v4i32, { 2, 3, 1, 3 } }, // vpsravd (Haswell from agner.org) |
948 | { ISD::SHL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsllvd (Haswell from agner.org) |
949 | { ISD::SRL, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsrlvd (Haswell from agner.org) |
950 | { ISD::SRA, MVT::v8i32, { 4, 4, 1, 3 } }, // vpsravd (Haswell from agner.org) |
951 | { ISD::SHL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsllvq (Haswell from agner.org) |
952 | { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, // vpsrlvq (Haswell from agner.org) |
953 | { ISD::SHL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsllvq (Haswell from agner.org) |
954 | { ISD::SRL, MVT::v4i64, { 4, 4, 1, 2 } }, // vpsrlvq (Haswell from agner.org) |
955 | }; |
956 | |
957 | if (ST->hasAVX512()) { |
958 | if (ISD == ISD::SHL && LT.second == MVT::v32i16 && Op2Info.isConstant()) |
959 | // On AVX512, a packed v32i16 shift left by a constant build_vector |
960 | // is lowered into a vector multiply (vpmullw). |
961 | return getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, |
962 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
963 | } |
964 | |
965 | // Look for AVX2 lowering tricks (XOP is always better at v4i32 shifts). |
966 | if (ST->hasAVX2() && !(ST->hasXOP() && LT.second == MVT::v4i32)) { |
967 | if (ISD == ISD::SHL && LT.second == MVT::v16i16 && |
968 | Op2Info.isConstant()) |
969 | // On AVX2, a packed v16i16 shift left by a constant build_vector |
970 | // is lowered into a vector multiply (vpmullw). |
971 | return getArithmeticInstrCost(Opcode: Instruction::Mul, Ty, CostKind, |
972 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
973 | |
974 | if (const auto *Entry = CostTableLookup(AVX2ShiftCostTable, ISD, LT.second)) |
975 | if (auto KindCost = Entry->Cost[CostKind]) |
976 | return LT.first * *KindCost; |
977 | } |
978 | |
979 | static const CostKindTblEntry XOPShiftCostTable[] = { |
980 | // 128bit shifts take 1cy, but right shifts require negation beforehand. |
981 | { ISD::SHL, MVT::v16i8, { 1, 3, 1, 1 } }, |
982 | { ISD::SRL, MVT::v16i8, { 2, 3, 1, 1 } }, |
983 | { ISD::SRA, MVT::v16i8, { 2, 3, 1, 1 } }, |
984 | { ISD::SHL, MVT::v8i16, { 1, 3, 1, 1 } }, |
985 | { ISD::SRL, MVT::v8i16, { 2, 3, 1, 1 } }, |
986 | { ISD::SRA, MVT::v8i16, { 2, 3, 1, 1 } }, |
987 | { ISD::SHL, MVT::v4i32, { 1, 3, 1, 1 } }, |
988 | { ISD::SRL, MVT::v4i32, { 2, 3, 1, 1 } }, |
989 | { ISD::SRA, MVT::v4i32, { 2, 3, 1, 1 } }, |
990 | { ISD::SHL, MVT::v2i64, { 1, 3, 1, 1 } }, |
991 | { ISD::SRL, MVT::v2i64, { 2, 3, 1, 1 } }, |
992 | { ISD::SRA, MVT::v2i64, { 2, 3, 1, 1 } }, |
993 | // 256bit shifts require splitting if AVX2 didn't catch them above. |
994 | { ISD::SHL, MVT::v32i8, { 4, 7, 5, 6 } }, |
995 | { ISD::SRL, MVT::v32i8, { 6, 7, 5, 6 } }, |
996 | { ISD::SRA, MVT::v32i8, { 6, 7, 5, 6 } }, |
997 | { ISD::SHL, MVT::v16i16, { 4, 7, 5, 6 } }, |
998 | { ISD::SRL, MVT::v16i16, { 6, 7, 5, 6 } }, |
999 | { ISD::SRA, MVT::v16i16, { 6, 7, 5, 6 } }, |
1000 | { ISD::SHL, MVT::v8i32, { 4, 7, 5, 6 } }, |
1001 | { ISD::SRL, MVT::v8i32, { 6, 7, 5, 6 } }, |
1002 | { ISD::SRA, MVT::v8i32, { 6, 7, 5, 6 } }, |
1003 | { ISD::SHL, MVT::v4i64, { 4, 7, 5, 6 } }, |
1004 | { ISD::SRL, MVT::v4i64, { 6, 7, 5, 6 } }, |
1005 | { ISD::SRA, MVT::v4i64, { 6, 7, 5, 6 } }, |
1006 | }; |
1007 | |
1008 | // Look for XOP lowering tricks. |
1009 | if (ST->hasXOP()) { |
1010 | // If the right shift is constant then we'll fold the negation so |
1011 | // it's as cheap as a left shift. |
1012 | int ShiftISD = ISD; |
1013 | if ((ShiftISD == ISD::SRL || ShiftISD == ISD::SRA) && Op2Info.isConstant()) |
1014 | ShiftISD = ISD::SHL; |
1015 | if (const auto *Entry = |
1016 | CostTableLookup(XOPShiftCostTable, ShiftISD, LT.second)) |
1017 | if (auto KindCost = Entry->Cost[CostKind]) |
1018 | return LT.first * *KindCost; |
1019 | } |
1020 | |
1021 | if (ISD == ISD::SHL && !Op2Info.isUniform() && Op2Info.isConstant()) { |
1022 | MVT VT = LT.second; |
1023 | // Vector shift left by non uniform constant can be lowered |
1024 | // into vector multiply. |
1025 | if (((VT == MVT::v8i16 || VT == MVT::v4i32) && ST->hasSSE2()) || |
1026 | ((VT == MVT::v16i16 || VT == MVT::v8i32) && ST->hasAVX())) |
1027 | ISD = ISD::MUL; |
1028 | } |
1029 | |
1030 | static const CostKindTblEntry GLMCostTable[] = { |
1031 | { ISD::FDIV, MVT::f32, { 18, 19, 1, 1 } }, // divss |
1032 | { ISD::FDIV, MVT::v4f32, { 35, 36, 1, 1 } }, // divps |
1033 | { ISD::FDIV, MVT::f64, { 33, 34, 1, 1 } }, // divsd |
1034 | { ISD::FDIV, MVT::v2f64, { 65, 66, 1, 1 } }, // divpd |
1035 | }; |
1036 | |
1037 | if (ST->useGLMDivSqrtCosts()) |
1038 | if (const auto *Entry = CostTableLookup(GLMCostTable, ISD, LT.second)) |
1039 | if (auto KindCost = Entry->Cost[CostKind]) |
1040 | return LT.first * *KindCost; |
1041 | |
1042 | static const CostKindTblEntry SLMCostTable[] = { |
1043 | { ISD::MUL, MVT::v4i32, { 11, 11, 1, 7 } }, // pmulld |
1044 | { ISD::MUL, MVT::v8i16, { 2, 5, 1, 1 } }, // pmullw |
1045 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // mulsd |
1046 | { ISD::FMUL, MVT::f32, { 1, 4, 1, 1 } }, // mulss |
1047 | { ISD::FMUL, MVT::v2f64, { 4, 7, 1, 1 } }, // mulpd |
1048 | { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // mulps |
1049 | { ISD::FDIV, MVT::f32, { 17, 19, 1, 1 } }, // divss |
1050 | { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 6 } }, // divps |
1051 | { ISD::FDIV, MVT::f64, { 32, 34, 1, 1 } }, // divsd |
1052 | { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 6 } }, // divpd |
1053 | { ISD::FADD, MVT::v2f64, { 2, 4, 1, 1 } }, // addpd |
1054 | { ISD::FSUB, MVT::v2f64, { 2, 4, 1, 1 } }, // subpd |
1055 | // v2i64/v4i64 mul is custom lowered as a series of long: |
1056 | // multiplies(3), shifts(3) and adds(2) |
1057 | // slm muldq version throughput is 2 and addq throughput 4 |
1058 | // thus: 3X2 (muldq throughput) + 3X1 (shift throughput) + |
1059 | // 3X4 (addq throughput) = 17 |
1060 | { ISD::MUL, MVT::v2i64, { 17, 22, 9, 9 } }, |
    // slm addq/subq throughput is 4
1062 | { ISD::ADD, MVT::v2i64, { 4, 2, 1, 2 } }, |
1063 | { ISD::SUB, MVT::v2i64, { 4, 2, 1, 2 } }, |
1064 | }; |
1065 | |
1066 | if (ST->useSLMArithCosts()) |
1067 | if (const auto *Entry = CostTableLookup(SLMCostTable, ISD, LT.second)) |
1068 | if (auto KindCost = Entry->Cost[CostKind]) |
1069 | return LT.first * *KindCost; |
1070 | |
1071 | static const CostKindTblEntry AVX2CostTable[] = { |
1072 | { ISD::SHL, MVT::v16i8, { 6, 21,11,16 } }, // vpblendvb sequence. |
1073 | { ISD::SHL, MVT::v32i8, { 6, 23,11,22 } }, // vpblendvb sequence. |
1074 | { ISD::SHL, MVT::v8i16, { 5, 18, 5,10 } }, // extend/vpsrlvd/pack sequence. |
1075 | { ISD::SHL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence. |
1076 | |
1077 | { ISD::SRL, MVT::v16i8, { 6, 27,12,18 } }, // vpblendvb sequence. |
1078 | { ISD::SRL, MVT::v32i8, { 8, 30,12,24 } }, // vpblendvb sequence. |
1079 | { ISD::SRL, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsrlvd/pack sequence. |
1080 | { ISD::SRL, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsrlvd/pack sequence. |
1081 | |
1082 | { ISD::SRA, MVT::v16i8, { 17, 17,24,30 } }, // vpblendvb sequence. |
1083 | { ISD::SRA, MVT::v32i8, { 18, 20,24,43 } }, // vpblendvb sequence. |
1084 | { ISD::SRA, MVT::v8i16, { 5, 11, 5,10 } }, // extend/vpsravd/pack sequence. |
1085 | { ISD::SRA, MVT::v16i16, { 8, 10,10,14 } }, // extend/vpsravd/pack sequence. |
1086 | { ISD::SRA, MVT::v2i64, { 4, 5, 5, 5 } }, // srl/xor/sub sequence. |
1087 | { ISD::SRA, MVT::v4i64, { 8, 8, 5, 9 } }, // srl/xor/sub sequence. |
1088 | |
1089 | { ISD::SUB, MVT::v32i8, { 1, 1, 1, 2 } }, // psubb |
1090 | { ISD::ADD, MVT::v32i8, { 1, 1, 1, 2 } }, // paddb |
1091 | { ISD::SUB, MVT::v16i16, { 1, 1, 1, 2 } }, // psubw |
1092 | { ISD::ADD, MVT::v16i16, { 1, 1, 1, 2 } }, // paddw |
1093 | { ISD::SUB, MVT::v8i32, { 1, 1, 1, 2 } }, // psubd |
1094 | { ISD::ADD, MVT::v8i32, { 1, 1, 1, 2 } }, // paddd |
1095 | { ISD::SUB, MVT::v4i64, { 1, 1, 1, 2 } }, // psubq |
1096 | { ISD::ADD, MVT::v4i64, { 1, 1, 1, 2 } }, // paddq |
1097 | |
1098 | { ISD::MUL, MVT::v16i8, { 5, 18, 6,12 } }, // extend/pmullw/pack |
1099 | { ISD::MUL, MVT::v32i8, { 6, 11,10,19 } }, // unpack/pmullw |
1100 | { ISD::MUL, MVT::v16i16, { 2, 5, 1, 2 } }, // pmullw |
1101 | { ISD::MUL, MVT::v8i32, { 4, 10, 1, 2 } }, // pmulld |
1102 | { ISD::MUL, MVT::v4i32, { 2, 10, 1, 2 } }, // pmulld |
1103 | { ISD::MUL, MVT::v4i64, { 6, 10, 8,13 } }, // 3*pmuludq/3*shift/2*add |
1104 | { ISD::MUL, MVT::v2i64, { 6, 10, 8, 8 } }, // 3*pmuludq/3*shift/2*add |
1105 | |
1106 | { X86ISD::PMULUDQ, MVT::v4i64, { 1, 5, 1, 1 } }, |
1107 | |
1108 | { ISD::FNEG, MVT::v4f64, { 1, 1, 1, 2 } }, // vxorpd |
1109 | { ISD::FNEG, MVT::v8f32, { 1, 1, 1, 2 } }, // vxorps |
1110 | |
1111 | { ISD::FADD, MVT::f64, { 1, 4, 1, 1 } }, // vaddsd |
1112 | { ISD::FADD, MVT::f32, { 1, 4, 1, 1 } }, // vaddss |
1113 | { ISD::FADD, MVT::v2f64, { 1, 4, 1, 1 } }, // vaddpd |
1114 | { ISD::FADD, MVT::v4f32, { 1, 4, 1, 1 } }, // vaddps |
1115 | { ISD::FADD, MVT::v4f64, { 1, 4, 1, 2 } }, // vaddpd |
1116 | { ISD::FADD, MVT::v8f32, { 1, 4, 1, 2 } }, // vaddps |
1117 | |
1118 | { ISD::FSUB, MVT::f64, { 1, 4, 1, 1 } }, // vsubsd |
1119 | { ISD::FSUB, MVT::f32, { 1, 4, 1, 1 } }, // vsubss |
1120 | { ISD::FSUB, MVT::v2f64, { 1, 4, 1, 1 } }, // vsubpd |
1121 | { ISD::FSUB, MVT::v4f32, { 1, 4, 1, 1 } }, // vsubps |
1122 | { ISD::FSUB, MVT::v4f64, { 1, 4, 1, 2 } }, // vsubpd |
1123 | { ISD::FSUB, MVT::v8f32, { 1, 4, 1, 2 } }, // vsubps |
1124 | |
1125 | { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // vmulsd |
1126 | { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // vmulss |
1127 | { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // vmulpd |
1128 | { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // vmulps |
1129 | { ISD::FMUL, MVT::v4f64, { 1, 5, 1, 2 } }, // vmulpd |
1130 | { ISD::FMUL, MVT::v8f32, { 1, 5, 1, 2 } }, // vmulps |
1131 | |
1132 | { ISD::FDIV, MVT::f32, { 7, 13, 1, 1 } }, // vdivss |
1133 | { ISD::FDIV, MVT::v4f32, { 7, 13, 1, 1 } }, // vdivps |
1134 | { ISD::FDIV, MVT::v8f32, { 14, 21, 1, 3 } }, // vdivps |
1135 | { ISD::FDIV, MVT::f64, { 14, 20, 1, 1 } }, // vdivsd |
1136 | { ISD::FDIV, MVT::v2f64, { 14, 20, 1, 1 } }, // vdivpd |
1137 | { ISD::FDIV, MVT::v4f64, { 28, 35, 1, 3 } }, // vdivpd |
1138 | }; |
1139 | |
1140 | // Look for AVX2 lowering tricks for custom cases. |
1141 | if (ST->hasAVX2()) |
1142 | if (const auto *Entry = CostTableLookup(AVX2CostTable, ISD, LT.second)) |
1143 | if (auto KindCost = Entry->Cost[CostKind]) |
1144 | return LT.first * *KindCost; |
1145 | |
1146 | static const CostKindTblEntry AVX1CostTable[] = { |
1147 | // We don't have to scalarize unsupported ops. We can issue two half-sized |
1148 | // operations and we only need to extract the upper YMM half. |
1149 | // Two ops + 1 extract + 1 insert = 4. |
1150 | { ISD::MUL, MVT::v32i8, { 12, 13, 22, 23 } }, // unpack/pmullw + split |
1151 | { ISD::MUL, MVT::v16i16, { 4, 8, 5, 6 } }, // pmullw + split |
1152 | { ISD::MUL, MVT::v8i32, { 5, 8, 5, 10 } }, // pmulld + split |
1153 | { ISD::MUL, MVT::v4i32, { 2, 5, 1, 3 } }, // pmulld |
1154 | { ISD::MUL, MVT::v4i64, { 12, 15, 19, 20 } }, |
1155 | |
1156 | { ISD::AND, MVT::v32i8, { 1, 1, 1, 2 } }, // vandps |
1157 | { ISD::AND, MVT::v16i16, { 1, 1, 1, 2 } }, // vandps |
1158 | { ISD::AND, MVT::v8i32, { 1, 1, 1, 2 } }, // vandps |
1159 | { ISD::AND, MVT::v4i64, { 1, 1, 1, 2 } }, // vandps |
1160 | |
1161 | { ISD::OR, MVT::v32i8, { 1, 1, 1, 2 } }, // vorps |
1162 | { ISD::OR, MVT::v16i16, { 1, 1, 1, 2 } }, // vorps |
1163 | { ISD::OR, MVT::v8i32, { 1, 1, 1, 2 } }, // vorps |
1164 | { ISD::OR, MVT::v4i64, { 1, 1, 1, 2 } }, // vorps |
1165 | |
1166 | { ISD::XOR, MVT::v32i8, { 1, 1, 1, 2 } }, // vxorps |
1167 | { ISD::XOR, MVT::v16i16, { 1, 1, 1, 2 } }, // vxorps |
1168 | { ISD::XOR, MVT::v8i32, { 1, 1, 1, 2 } }, // vxorps |
1169 | { ISD::XOR, MVT::v4i64, { 1, 1, 1, 2 } }, // vxorps |
1170 | |
1171 | { ISD::SUB, MVT::v32i8, { 4, 2, 5, 6 } }, // psubb + split |
1172 | { ISD::ADD, MVT::v32i8, { 4, 2, 5, 6 } }, // paddb + split |
1173 | { ISD::SUB, MVT::v16i16, { 4, 2, 5, 6 } }, // psubw + split |
1174 | { ISD::ADD, MVT::v16i16, { 4, 2, 5, 6 } }, // paddw + split |
1175 | { ISD::SUB, MVT::v8i32, { 4, 2, 5, 6 } }, // psubd + split |
1176 | { ISD::ADD, MVT::v8i32, { 4, 2, 5, 6 } }, // paddd + split |
1177 | { ISD::SUB, MVT::v4i64, { 4, 2, 5, 6 } }, // psubq + split |
1178 | { ISD::ADD, MVT::v4i64, { 4, 2, 5, 6 } }, // paddq + split |
1179 | { ISD::SUB, MVT::v2i64, { 1, 1, 1, 1 } }, // psubq |
1180 | { ISD::ADD, MVT::v2i64, { 1, 1, 1, 1 } }, // paddq |
1181 | |
1182 | { ISD::SHL, MVT::v16i8, { 10, 21,11,17 } }, // pblendvb sequence. |
1183 | { ISD::SHL, MVT::v32i8, { 22, 22,27,40 } }, // pblendvb sequence + split. |
1184 | { ISD::SHL, MVT::v8i16, { 6, 9,11,11 } }, // pblendvb sequence. |
1185 | { ISD::SHL, MVT::v16i16, { 13, 16,24,25 } }, // pblendvb sequence + split. |
1186 | { ISD::SHL, MVT::v4i32, { 3, 11, 4, 6 } }, // pslld/paddd/cvttps2dq/pmulld |
1187 | { ISD::SHL, MVT::v8i32, { 9, 11,12,17 } }, // pslld/paddd/cvttps2dq/pmulld + split |
1188 | { ISD::SHL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend. |
1189 | { ISD::SHL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split. |
1190 | |
1191 | { ISD::SRL, MVT::v16i8, { 11, 27,12,18 } }, // pblendvb sequence. |
1192 | { ISD::SRL, MVT::v32i8, { 23, 23,30,43 } }, // pblendvb sequence + split. |
1193 | { ISD::SRL, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence. |
1194 | { ISD::SRL, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split. |
1195 | { ISD::SRL, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend. |
1196 | { ISD::SRL, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split. |
1197 | { ISD::SRL, MVT::v2i64, { 2, 4, 4, 6 } }, // Shift each lane + blend. |
1198 | { ISD::SRL, MVT::v4i64, { 6, 7,11,15 } }, // Shift each lane + blend + split. |
1199 | |
1200 | { ISD::SRA, MVT::v16i8, { 21, 22,24,36 } }, // pblendvb sequence. |
1201 | { ISD::SRA, MVT::v32i8, { 44, 45,51,76 } }, // pblendvb sequence + split. |
1202 | { ISD::SRA, MVT::v8i16, { 13, 16,14,22 } }, // pblendvb sequence. |
1203 | { ISD::SRA, MVT::v16i16, { 28, 30,31,48 } }, // pblendvb sequence + split. |
1204 | { ISD::SRA, MVT::v4i32, { 6, 7,12,16 } }, // Shift each lane + blend. |
1205 | { ISD::SRA, MVT::v8i32, { 14, 14,26,34 } }, // Shift each lane + blend + split. |
1206 | { ISD::SRA, MVT::v2i64, { 5, 6,10,14 } }, // Shift each lane + blend. |
1207 | { ISD::SRA, MVT::v4i64, { 12, 12,22,30 } }, // Shift each lane + blend + split. |
1208 | |
1209 | { ISD::FNEG, MVT::v4f64, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/ |
1210 | { ISD::FNEG, MVT::v8f32, { 2, 2, 1, 2 } }, // BTVER2 from http://www.agner.org/ |
1211 | |
1212 | { ISD::FADD, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ |
1213 | { ISD::FADD, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ |
1214 | { ISD::FADD, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ |
1215 | { ISD::FADD, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ |
1216 | { ISD::FADD, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ |
1217 | { ISD::FADD, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ |
1218 | |
1219 | { ISD::FSUB, MVT::f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ |
1220 | { ISD::FSUB, MVT::f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ |
1221 | { ISD::FSUB, MVT::v2f64, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ |
1222 | { ISD::FSUB, MVT::v4f32, { 1, 5, 1, 1 } }, // BDVER2 from http://www.agner.org/ |
1223 | { ISD::FSUB, MVT::v4f64, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ |
1224 | { ISD::FSUB, MVT::v8f32, { 2, 5, 1, 2 } }, // BDVER2 from http://www.agner.org/ |
1225 | |
1226 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ |
1227 | { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ |
1228 | { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ |
1229 | { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // BTVER2 from http://www.agner.org/ |
1230 | { ISD::FMUL, MVT::v4f64, { 4, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/ |
1231 | { ISD::FMUL, MVT::v8f32, { 2, 5, 1, 2 } }, // BTVER2 from http://www.agner.org/ |
1232 | |
1233 | { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/ |
1234 | { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // SNB from http://www.agner.org/ |
1235 | { ISD::FDIV, MVT::v8f32, { 28, 29, 1, 3 } }, // SNB from http://www.agner.org/ |
1236 | { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/ |
1237 | { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // SNB from http://www.agner.org/ |
1238 | { ISD::FDIV, MVT::v4f64, { 44, 45, 1, 3 } }, // SNB from http://www.agner.org/ |
1239 | }; |
1240 | |
1241 | if (ST->hasAVX()) |
1242 | if (const auto *Entry = CostTableLookup(AVX1CostTable, ISD, LT.second)) |
1243 | if (auto KindCost = Entry->Cost[CostKind]) |
1244 | return LT.first * *KindCost; |
1245 | |
1246 | static const CostKindTblEntry SSE42CostTable[] = { |
1247 | { ISD::FADD, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1248 | { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1249 | { ISD::FADD, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1250 | { ISD::FADD, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1251 | |
1252 | { ISD::FSUB, MVT::f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1253 | { ISD::FSUB, MVT::f32 , { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1254 | { ISD::FSUB, MVT::v2f64, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1255 | { ISD::FSUB, MVT::v4f32, { 1, 3, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1256 | |
1257 | { ISD::FMUL, MVT::f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1258 | { ISD::FMUL, MVT::f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1259 | { ISD::FMUL, MVT::v2f64, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1260 | { ISD::FMUL, MVT::v4f32, { 1, 5, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1261 | |
1262 | { ISD::FDIV, MVT::f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1263 | { ISD::FDIV, MVT::v4f32, { 14, 14, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1264 | { ISD::FDIV, MVT::f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1265 | { ISD::FDIV, MVT::v2f64, { 22, 22, 1, 1 } }, // Nehalem from http://www.agner.org/ |
1266 | |
1267 | { ISD::MUL, MVT::v2i64, { 6, 10,10,10 } } // 3*pmuludq/3*shift/2*add |
1268 | }; |
1269 | |
1270 | if (ST->hasSSE42()) |
1271 | if (const auto *Entry = CostTableLookup(SSE42CostTable, ISD, LT.second)) |
1272 | if (auto KindCost = Entry->Cost[CostKind]) |
1273 | return LT.first * *KindCost; |
1274 | |
1275 | static const CostKindTblEntry SSE41CostTable[] = { |
1276 | { ISD::SHL, MVT::v16i8, { 15, 24,17,22 } }, // pblendvb sequence. |
1277 | { ISD::SHL, MVT::v8i16, { 11, 14,11,11 } }, // pblendvb sequence. |
1278 | { ISD::SHL, MVT::v4i32, { 14, 20, 4,10 } }, // pslld/paddd/cvttps2dq/pmulld |
1279 | |
1280 | { ISD::SRL, MVT::v16i8, { 16, 27,18,24 } }, // pblendvb sequence. |
1281 | { ISD::SRL, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence. |
1282 | { ISD::SRL, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend. |
1283 | { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. |
1284 | |
1285 | { ISD::SRA, MVT::v16i8, { 38, 41,30,36 } }, // pblendvb sequence. |
1286 | { ISD::SRA, MVT::v8i16, { 22, 26,23,27 } }, // pblendvb sequence. |
1287 | { ISD::SRA, MVT::v4i32, { 16, 17,15,19 } }, // Shift each lane + blend. |
1288 | { ISD::SRA, MVT::v2i64, { 8, 17, 5, 7 } }, // splat+shuffle sequence. |
1289 | |
1290 | { ISD::MUL, MVT::v16i8, { 5, 18,10,12 } }, // 2*unpack/2*pmullw/2*and/pack |
1291 | { ISD::MUL, MVT::v4i32, { 2, 11, 1, 1 } } // pmulld (Nehalem from agner.org) |
1292 | }; |
1293 | |
1294 | if (ST->hasSSE41()) |
1295 | if (const auto *Entry = CostTableLookup(SSE41CostTable, ISD, LT.second)) |
1296 | if (auto KindCost = Entry->Cost[CostKind]) |
1297 | return LT.first * *KindCost; |
1298 | |
1299 | static const CostKindTblEntry SSE2CostTable[] = { |
1300 | // We don't correctly identify costs of casts because they are marked as |
1301 | // custom. |
1302 | { ISD::SHL, MVT::v16i8, { 13, 21,26,28 } }, // cmpgtb sequence. |
1303 | { ISD::SHL, MVT::v8i16, { 24, 27,16,20 } }, // cmpgtw sequence. |
1304 | { ISD::SHL, MVT::v4i32, { 17, 19,10,12 } }, // pslld/paddd/cvttps2dq/pmuludq. |
1305 | { ISD::SHL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. |
1306 | |
1307 | { ISD::SRL, MVT::v16i8, { 14, 28,27,30 } }, // cmpgtb sequence. |
1308 | { ISD::SRL, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence. |
1309 | { ISD::SRL, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend. |
1310 | { ISD::SRL, MVT::v2i64, { 4, 6, 5, 7 } }, // splat+shuffle sequence. |
1311 | |
1312 | { ISD::SRA, MVT::v16i8, { 27, 30,54,54 } }, // unpacked cmpgtb sequence. |
1313 | { ISD::SRA, MVT::v8i16, { 16, 19,31,31 } }, // cmpgtw sequence. |
1314 | { ISD::SRA, MVT::v4i32, { 12, 12,15,19 } }, // Shift each lane + blend. |
1315 | { ISD::SRA, MVT::v2i64, { 8, 11,12,16 } }, // srl/xor/sub splat+shuffle sequence. |
1316 | |
1317 | { ISD::AND, MVT::v16i8, { 1, 1, 1, 1 } }, // pand |
1318 | { ISD::AND, MVT::v8i16, { 1, 1, 1, 1 } }, // pand |
1319 | { ISD::AND, MVT::v4i32, { 1, 1, 1, 1 } }, // pand |
1320 | { ISD::AND, MVT::v2i64, { 1, 1, 1, 1 } }, // pand |
1321 | |
1322 | { ISD::OR, MVT::v16i8, { 1, 1, 1, 1 } }, // por |
1323 | { ISD::OR, MVT::v8i16, { 1, 1, 1, 1 } }, // por |
1324 | { ISD::OR, MVT::v4i32, { 1, 1, 1, 1 } }, // por |
1325 | { ISD::OR, MVT::v2i64, { 1, 1, 1, 1 } }, // por |
1326 | |
1327 | { ISD::XOR, MVT::v16i8, { 1, 1, 1, 1 } }, // pxor |
1328 | { ISD::XOR, MVT::v8i16, { 1, 1, 1, 1 } }, // pxor |
1329 | { ISD::XOR, MVT::v4i32, { 1, 1, 1, 1 } }, // pxor |
1330 | { ISD::XOR, MVT::v2i64, { 1, 1, 1, 1 } }, // pxor |
1331 | |
1332 | { ISD::ADD, MVT::v2i64, { 1, 2, 1, 2 } }, // paddq |
1333 | { ISD::SUB, MVT::v2i64, { 1, 2, 1, 2 } }, // psubq |
1334 | |
1335 | { ISD::MUL, MVT::v16i8, { 5, 18,12,12 } }, // 2*unpack/2*pmullw/2*and/pack |
1336 | { ISD::MUL, MVT::v8i16, { 1, 5, 1, 1 } }, // pmullw |
1337 | { ISD::MUL, MVT::v4i32, { 6, 8, 7, 7 } }, // 3*pmuludq/4*shuffle |
1338 | { ISD::MUL, MVT::v2i64, { 7, 10,10,10 } }, // 3*pmuludq/3*shift/2*add |
1339 | |
1340 | { X86ISD::PMULUDQ, MVT::v2i64, { 1, 5, 1, 1 } }, |
1341 | |
1342 | { ISD::FDIV, MVT::f32, { 23, 23, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1343 | { ISD::FDIV, MVT::v4f32, { 39, 39, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1344 | { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1345 | { ISD::FDIV, MVT::v2f64, { 69, 69, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1346 | |
1347 | { ISD::FNEG, MVT::f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1348 | { ISD::FNEG, MVT::f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1349 | { ISD::FNEG, MVT::v4f32, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1350 | { ISD::FNEG, MVT::v2f64, { 1, 1, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1351 | |
1352 | { ISD::FADD, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1353 | { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1354 | { ISD::FADD, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1355 | |
1356 | { ISD::FSUB, MVT::f32, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1357 | { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1358 | { ISD::FSUB, MVT::v2f64, { 2, 3, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1359 | |
1360 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1361 | { ISD::FMUL, MVT::v2f64, { 2, 5, 1, 1 } }, // Pentium IV from http://www.agner.org/ |
1362 | }; |
1363 | |
1364 | if (ST->hasSSE2()) |
1365 | if (const auto *Entry = CostTableLookup(SSE2CostTable, ISD, LT.second)) |
1366 | if (auto KindCost = Entry->Cost[CostKind]) |
1367 | return LT.first * *KindCost; |
1368 | |
1369 | static const CostKindTblEntry SSE1CostTable[] = { |
1370 | { ISD::FDIV, MVT::f32, { 17, 18, 1, 1 } }, // Pentium III from http://www.agner.org/ |
1371 | { ISD::FDIV, MVT::v4f32, { 34, 48, 1, 1 } }, // Pentium III from http://www.agner.org/ |
1372 | |
1373 | { ISD::FNEG, MVT::f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/ |
1374 | { ISD::FNEG, MVT::v4f32, { 2, 2, 1, 2 } }, // Pentium III from http://www.agner.org/ |
1375 | |
1376 | { ISD::FADD, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ |
1377 | { ISD::FADD, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ |
1378 | |
1379 | { ISD::FSUB, MVT::f32, { 1, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ |
1380 | { ISD::FSUB, MVT::v4f32, { 2, 3, 1, 1 } }, // Pentium III from http://www.agner.org/ |
1381 | |
1382 | { ISD::FMUL, MVT::f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/ |
1383 | { ISD::FMUL, MVT::v4f32, { 2, 5, 1, 1 } }, // Pentium III from http://www.agner.org/ |
1384 | }; |
1385 | |
1386 | if (ST->hasSSE1()) |
1387 | if (const auto *Entry = CostTableLookup(SSE1CostTable, ISD, LT.second)) |
1388 | if (auto KindCost = Entry->Cost[CostKind]) |
1389 | return LT.first * *KindCost; |
1390 | |
1391 | static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets |
1392 | { ISD::ADD, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/ |
1393 | { ISD::SUB, MVT::i64, { 1 } }, // Core (Merom) from http://www.agner.org/ |
1394 | { ISD::MUL, MVT::i64, { 2, 6, 1, 2 } }, |
1395 | }; |
1396 | |
1397 | if (ST->is64Bit()) |
1398 | if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, LT.second)) |
1399 | if (auto KindCost = Entry->Cost[CostKind]) |
1400 | return LT.first * *KindCost; |
1401 | |
1402 | static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets |
1403 | { ISD::ADD, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/ |
1404 | { ISD::ADD, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/ |
1405 | { ISD::ADD, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/ |
1406 | |
1407 | { ISD::SUB, MVT::i8, { 1 } }, // Pentium III from http://www.agner.org/ |
1408 | { ISD::SUB, MVT::i16, { 1 } }, // Pentium III from http://www.agner.org/ |
1409 | { ISD::SUB, MVT::i32, { 1 } }, // Pentium III from http://www.agner.org/ |
1410 | |
1411 | { ISD::MUL, MVT::i8, { 3, 4, 1, 1 } }, |
1412 | { ISD::MUL, MVT::i16, { 2, 4, 1, 1 } }, |
1413 | { ISD::MUL, MVT::i32, { 1, 4, 1, 1 } }, |
1414 | |
1415 | { ISD::FNEG, MVT::f64, { 2, 2, 1, 3 } }, // (x87) |
1416 | { ISD::FADD, MVT::f64, { 2, 3, 1, 1 } }, // (x87) |
1417 | { ISD::FSUB, MVT::f64, { 2, 3, 1, 1 } }, // (x87) |
1418 | { ISD::FMUL, MVT::f64, { 2, 5, 1, 1 } }, // (x87) |
1419 | { ISD::FDIV, MVT::f64, { 38, 38, 1, 1 } }, // (x87) |
1420 | }; |
1421 | |
1422 | if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, LT.second)) |
1423 | if (auto KindCost = Entry->Cost[CostKind]) |
1424 | return LT.first * *KindCost; |
1425 | |
1426 | // It is not a good idea to vectorize division. We have to scalarize it and |
1427 | // in the process we will often end up having to spilling regular |
1428 | // registers. The overhead of division is going to dominate most kernels |
1429 | // anyways so try hard to prevent vectorization of division - it is |
1430 | // generally a bad idea. Assume somewhat arbitrarily that we have to be able |
1431 | // to hide "20 cycles" for each lane. |
1432 | if (CostKind == TTI::TCK_RecipThroughput && LT.second.isVector() && |
1433 | (ISD == ISD::SDIV || ISD == ISD::SREM || ISD == ISD::UDIV || |
1434 | ISD == ISD::UREM)) { |
1435 | InstructionCost ScalarCost = |
1436 | getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind, |
1437 | Op1Info: Op1Info.getNoProps(), Op2Info: Op2Info.getNoProps()); |
1438 | return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost; |
1439 | } |
1440 | |
1441 | // Handle some basic single instruction code size cases. |
1442 | if (CostKind == TTI::TCK_CodeSize) { |
1443 | switch (ISD) { |
1444 | case ISD::FADD: |
1445 | case ISD::FSUB: |
1446 | case ISD::FMUL: |
1447 | case ISD::FDIV: |
1448 | case ISD::FNEG: |
1449 | case ISD::AND: |
1450 | case ISD::OR: |
1451 | case ISD::XOR: |
1452 | return LT.first; |
1453 | break; |
1454 | } |
1455 | } |
1456 | |
1457 | // Fallback to the default implementation. |
1458 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
1459 | Args, CxtI); |
1460 | } |
1461 | |
1462 | InstructionCost |
1463 | X86TTIImpl::getAltInstrCost(VectorType *VecTy, unsigned Opcode0, |
1464 | unsigned Opcode1, const SmallBitVector &OpcodeMask, |
1465 | TTI::TargetCostKind CostKind) const { |
1466 | if (isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) |
1467 | return TTI::TCC_Basic; |
1468 | return InstructionCost::getInvalid(); |
1469 | } |
1470 | |
1471 | InstructionCost X86TTIImpl::getShuffleCost( |
1472 | TTI::ShuffleKind Kind, VectorType *BaseTp, ArrayRef<int> Mask, |
1473 | TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, |
1474 | ArrayRef<const Value *> Args, const Instruction *CxtI) { |
1475 | // 64-bit packed float vectors (v2f32) are widened to type v4f32. |
1476 | // 64-bit packed integer vectors (v2i32) are widened to type v4i32. |
1477 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: BaseTp); |
1478 | |
1479 | Kind = improveShuffleKindFromMask(Kind, Mask, Ty: BaseTp, Index, SubTy&: SubTp); |
1480 | |
1481 | // Recognize a basic concat_vector shuffle. |
1482 | if (Kind == TTI::SK_PermuteTwoSrc && |
1483 | Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) && |
1484 | ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size())) |
1485 | return getShuffleCost(Kind: TTI::SK_InsertSubvector, |
1486 | BaseTp: VectorType::getDoubleElementsVectorType(VTy: BaseTp), Mask, |
1487 | CostKind, Index: Mask.size() / 2, SubTp: BaseTp); |
1488 | |
1489 | // Treat Transpose as 2-op shuffles - there's no difference in lowering. |
1490 | if (Kind == TTI::SK_Transpose) |
1491 | Kind = TTI::SK_PermuteTwoSrc; |
1492 | |
1493 | if (Kind == TTI::SK_Broadcast) { |
1494 | // For Broadcasts we are splatting the first element from the first input |
1495 | // register, so only need to reference that input and all the output |
1496 | // registers are the same. |
1497 | LT.first = 1; |
1498 | |
1499 | // If we're broadcasting a load then AVX/AVX2 can do this for free. |
1500 | using namespace PatternMatch; |
1501 | if (!Args.empty() && match(V: Args[0], P: m_OneUse(SubPattern: m_Load(Op: m_Value()))) && |
1502 | (ST->hasAVX2() || |
1503 | (ST->hasAVX() && LT.second.getScalarSizeInBits() >= 32))) |
1504 | return TTI::TCC_Free; |
1505 | } |
1506 | |
1507 | // Treat <X x bfloat> shuffles as <X x half>. |
1508 | if (LT.second.isVector() && LT.second.getScalarType() == MVT::bf16) |
1509 | LT.second = LT.second.changeVectorElementType(MVT::f16); |
1510 | |
1511 | // Subvector extractions are free if they start at the beginning of a |
1512 | // vector and cheap if the subvectors are aligned. |
1513 | if (Kind == TTI::SK_ExtractSubvector && LT.second.isVector()) { |
1514 | int NumElts = LT.second.getVectorNumElements(); |
1515 | if ((Index % NumElts) == 0) |
1516 | return 0; |
1517 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp); |
1518 | if (SubLT.second.isVector()) { |
1519 | int NumSubElts = SubLT.second.getVectorNumElements(); |
1520 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) |
1521 | return SubLT.first; |
1522 | // Handle some cases for widening legalization. For now we only handle |
1523 | // cases where the original subvector was naturally aligned and evenly |
1524 | // fit in its legalized subvector type. |
1525 | // FIXME: Remove some of the alignment restrictions. |
1526 | // FIXME: We can use permq for 64-bit or larger extracts from 256-bit |
1527 | // vectors. |
1528 | int OrigSubElts = cast<FixedVectorType>(Val: SubTp)->getNumElements(); |
1529 | if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 && |
1530 | (NumSubElts % OrigSubElts) == 0 && |
1531 | LT.second.getVectorElementType() == |
1532 | SubLT.second.getVectorElementType() && |
1533 | LT.second.getVectorElementType().getSizeInBits() == |
1534 | BaseTp->getElementType()->getPrimitiveSizeInBits()) { |
1535 | assert(NumElts >= NumSubElts && NumElts > OrigSubElts && |
1536 | "Unexpected number of elements!" ); |
1537 | auto *VecTy = FixedVectorType::get(ElementType: BaseTp->getElementType(), |
1538 | NumElts: LT.second.getVectorNumElements()); |
1539 | auto *SubTy = FixedVectorType::get(ElementType: BaseTp->getElementType(), |
1540 | NumElts: SubLT.second.getVectorNumElements()); |
1541 | int = alignDown(Value: (Index % NumElts), Align: NumSubElts); |
1542 | InstructionCost = |
1543 | getShuffleCost(Kind: TTI::SK_ExtractSubvector, BaseTp: VecTy, Mask: std::nullopt, |
1544 | CostKind, Index: ExtractIndex, SubTp: SubTy); |
1545 | |
1546 | // If the original size is 32-bits or more, we can use pshufd. Otherwise |
1547 | // if we have SSSE3 we can use pshufb. |
1548 | if (SubTp->getPrimitiveSizeInBits() >= 32 || ST->hasSSSE3()) |
1549 | return ExtractCost + 1; // pshufd or pshufb |
1550 | |
1551 | assert(SubTp->getPrimitiveSizeInBits() == 16 && |
1552 | "Unexpected vector size" ); |
1553 | |
1554 | return ExtractCost + 2; // worst case pshufhw + pshufd |
1555 | } |
1556 | } |
1557 | // If the extract subvector is not optimal, treat it as single op shuffle. |
1558 | Kind = TTI::SK_PermuteSingleSrc; |
1559 | } |
1560 | |
1561 | // Subvector insertions are cheap if the subvectors are aligned. |
1562 | // Note that in general, the insertion starting at the beginning of a vector |
1563 | // isn't free, because we need to preserve the rest of the wide vector. |
1564 | if (Kind == TTI::SK_InsertSubvector && LT.second.isVector()) { |
1565 | int NumElts = LT.second.getVectorNumElements(); |
1566 | std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(Ty: SubTp); |
1567 | if (SubLT.second.isVector()) { |
1568 | int NumSubElts = SubLT.second.getVectorNumElements(); |
1569 | if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0) |
1570 | return SubLT.first; |
1571 | } |
1572 | |
1573 | // If the insertion isn't aligned, treat it like a 2-op shuffle. |
1574 | Kind = TTI::SK_PermuteTwoSrc; |
1575 | } |
1576 | |
1577 | // Handle some common (illegal) sub-vector types as they are often very cheap |
1578 | // to shuffle even on targets without PSHUFB. |
1579 | EVT VT = TLI->getValueType(DL, Ty: BaseTp); |
1580 | if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 && |
1581 | !ST->hasSSSE3()) { |
1582 | static const CostTblEntry SSE2SubVectorShuffleTbl[] = { |
1583 | {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw |
1584 | {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw |
1585 | {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw |
1586 | {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw |
1587 | {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck |
1588 | |
1589 | {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw |
1590 | {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw |
1591 | {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus |
1592 | {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck |
1593 | |
1594 | {TTI::SK_Splice, MVT::v4i16, 2}, // punpck+psrldq |
1595 | {TTI::SK_Splice, MVT::v2i16, 2}, // punpck+psrldq |
1596 | {TTI::SK_Splice, MVT::v4i8, 2}, // punpck+psrldq |
1597 | {TTI::SK_Splice, MVT::v2i8, 2}, // punpck+psrldq |
1598 | |
1599 | {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw |
1600 | {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw |
1601 | {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw |
1602 | {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw |
1603 | {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck |
1604 | |
1605 | {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw |
1606 | {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw |
1607 | {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw |
1608 | {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw |
1609 | {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck |
1610 | }; |
1611 | |
1612 | if (ST->hasSSE2()) |
1613 | if (const auto *Entry = |
1614 | CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT())) |
1615 | return Entry->Cost; |
1616 | } |
1617 | |
1618 | // We are going to permute multiple sources and the result will be in multiple |
1619 | // destinations. Providing an accurate cost only for splits where the element |
1620 | // type remains the same. |
1621 | if (Kind == TTI::SK_PermuteSingleSrc && LT.first != 1) { |
1622 | MVT LegalVT = LT.second; |
1623 | if (LegalVT.isVector() && |
1624 | LegalVT.getVectorElementType().getSizeInBits() == |
1625 | BaseTp->getElementType()->getPrimitiveSizeInBits() && |
1626 | LegalVT.getVectorNumElements() < |
1627 | cast<FixedVectorType>(Val: BaseTp)->getNumElements()) { |
1628 | unsigned VecTySize = DL.getTypeStoreSize(Ty: BaseTp); |
1629 | unsigned LegalVTSize = LegalVT.getStoreSize(); |
1630 | // Number of source vectors after legalization: |
1631 | unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; |
1632 | // Number of destination vectors after legalization: |
1633 | InstructionCost NumOfDests = LT.first; |
1634 | |
1635 | auto *SingleOpTy = FixedVectorType::get(ElementType: BaseTp->getElementType(), |
1636 | NumElts: LegalVT.getVectorNumElements()); |
1637 | |
1638 | if (!Mask.empty() && NumOfDests.isValid()) { |
1639 | // Try to perform better estimation of the permutation. |
1640 | // 1. Split the source/destination vectors into real registers. |
1641 | // 2. Do the mask analysis to identify which real registers are |
1642 | // permuted. If more than 1 source registers are used for the |
1643 | // destination register building, the cost for this destination register |
1644 | // is (Number_of_source_register - 1) * Cost_PermuteTwoSrc. If only one |
1645 | // source register is used, build mask and calculate the cost as a cost |
1646 | // of PermuteSingleSrc. |
1647 | // Also, for the single register permute we try to identify if the |
1648 | // destination register is just a copy of the source register or the |
1649 | // copy of the previous destination register (the cost is |
1650 | // TTI::TCC_Basic). If the source register is just reused, the cost for |
1651 | // this operation is 0. |
1652 | NumOfDests = |
1653 | getTypeLegalizationCost( |
1654 | Ty: FixedVectorType::get(ElementType: BaseTp->getElementType(), NumElts: Mask.size())) |
1655 | .first; |
1656 | unsigned E = *NumOfDests.getValue(); |
1657 | unsigned NormalizedVF = |
1658 | LegalVT.getVectorNumElements() * std::max(a: NumOfSrcs, b: E); |
1659 | unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements(); |
1660 | unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements(); |
1661 | SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem); |
1662 | copy(Range&: Mask, Out: NormalizedMask.begin()); |
1663 | unsigned PrevSrcReg = 0; |
1664 | ArrayRef<int> PrevRegMask; |
1665 | InstructionCost Cost = 0; |
1666 | processShuffleMasks( |
1667 | Mask: NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfUsedRegs: NumOfDestRegs, NoInputAction: []() {}, |
1668 | SingleInputAction: [this, SingleOpTy, CostKind, &PrevSrcReg, &PrevRegMask, |
1669 | &Cost](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) { |
1670 | if (!ShuffleVectorInst::isIdentityMask(Mask: RegMask, NumSrcElts: RegMask.size())) { |
1671 | // Check if the previous register can be just copied to the next |
1672 | // one. |
1673 | if (PrevRegMask.empty() || PrevSrcReg != SrcReg || |
1674 | PrevRegMask != RegMask) |
1675 | Cost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, BaseTp: SingleOpTy, |
1676 | Mask: RegMask, CostKind, Index: 0, SubTp: nullptr); |
1677 | else |
1678 | // Just a copy of previous destination register. |
1679 | Cost += TTI::TCC_Basic; |
1680 | return; |
1681 | } |
1682 | if (SrcReg != DestReg && |
1683 | any_of(Range&: RegMask, P: [](int I) { return I != PoisonMaskElem; })) { |
1684 | // Just a copy of the source register. |
1685 | Cost += TTI::TCC_Basic; |
1686 | } |
1687 | PrevSrcReg = SrcReg; |
1688 | PrevRegMask = RegMask; |
1689 | }, |
1690 | ManyInputsAction: [this, SingleOpTy, CostKind, &Cost](ArrayRef<int> RegMask, |
1691 | unsigned /*Unused*/, |
1692 | unsigned /*Unused*/) { |
1693 | Cost += getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, BaseTp: SingleOpTy, Mask: RegMask, |
1694 | CostKind, Index: 0, SubTp: nullptr); |
1695 | }); |
1696 | return Cost; |
1697 | } |
1698 | |
1699 | InstructionCost NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; |
1700 | return NumOfShuffles * getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, BaseTp: SingleOpTy, |
1701 | Mask: std::nullopt, CostKind, Index: 0, SubTp: nullptr); |
1702 | } |
1703 | |
1704 | return BaseT::getShuffleCost(Kind, Tp: BaseTp, Mask, CostKind, Index, SubTp); |
1705 | } |
1706 | |
1707 | // For 2-input shuffles, we must account for splitting the 2 inputs into many. |
1708 | if (Kind == TTI::SK_PermuteTwoSrc && LT.first != 1) { |
1709 | // We assume that source and destination have the same vector type. |
1710 | InstructionCost NumOfDests = LT.first; |
1711 | InstructionCost NumOfShufflesPerDest = LT.first * 2 - 1; |
1712 | LT.first = NumOfDests * NumOfShufflesPerDest; |
1713 | } |
1714 | |
1715 | static const CostTblEntry AVX512VBMIShuffleTbl[] = { |
1716 | {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb |
1717 | {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb |
1718 | |
1719 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb |
1720 | {TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb |
1721 | |
1722 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b |
1723 | {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b |
1724 | {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b |
1725 | }; |
1726 | |
1727 | if (ST->hasVBMI()) |
1728 | if (const auto *Entry = |
1729 | CostTableLookup(AVX512VBMIShuffleTbl, Kind, LT.second)) |
1730 | return LT.first * Entry->Cost; |
1731 | |
1732 | static const CostTblEntry AVX512BWShuffleTbl[] = { |
1733 | {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw |
1734 | {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw |
1735 | {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb |
1736 | |
1737 | {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw |
1738 | {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw |
1739 | {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw |
1740 | {TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2 |
1741 | |
1742 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw |
1743 | {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw |
1744 | {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw |
1745 | {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw |
1746 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16 |
1747 | |
1748 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w |
1749 | {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w |
1750 | {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w |
1751 | {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w |
1752 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1 |
1753 | |
1754 | {TTI::SK_Select, MVT::v32i16, 1}, // vblendmw |
1755 | {TTI::SK_Select, MVT::v64i8, 1}, // vblendmb |
1756 | |
1757 | {TTI::SK_Splice, MVT::v32i16, 2}, // vshufi64x2 + palignr |
1758 | {TTI::SK_Splice, MVT::v32f16, 2}, // vshufi64x2 + palignr |
1759 | {TTI::SK_Splice, MVT::v64i8, 2}, // vshufi64x2 + palignr |
1760 | }; |
1761 | |
1762 | if (ST->hasBWI()) |
1763 | if (const auto *Entry = |
1764 | CostTableLookup(AVX512BWShuffleTbl, Kind, LT.second)) |
1765 | return LT.first * Entry->Cost; |
1766 | |
1767 | static const CostKindTblEntry AVX512ShuffleTbl[] = { |
1768 | {TTI::SK_Broadcast, MVT::v8f64, { 1, 1, 1, 1 } }, // vbroadcastsd |
1769 | {TTI::SK_Broadcast, MVT::v16f32, { 1, 1, 1, 1 } }, // vbroadcastss |
1770 | {TTI::SK_Broadcast, MVT::v8i64, { 1, 1, 1, 1 } }, // vpbroadcastq |
1771 | {TTI::SK_Broadcast, MVT::v16i32, { 1, 1, 1, 1 } }, // vpbroadcastd |
1772 | {TTI::SK_Broadcast, MVT::v32i16, { 1, 1, 1, 1 } }, // vpbroadcastw |
1773 | {TTI::SK_Broadcast, MVT::v32f16, { 1, 1, 1, 1 } }, // vpbroadcastw |
1774 | {TTI::SK_Broadcast, MVT::v64i8, { 1, 1, 1, 1 } }, // vpbroadcastb |
1775 | |
1776 | {TTI::SK_Reverse, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd |
1777 | {TTI::SK_Reverse, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps |
1778 | {TTI::SK_Reverse, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq |
1779 | {TTI::SK_Reverse, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd |
1780 | {TTI::SK_Reverse, MVT::v32i16, { 7, 7, 7, 7 } }, // per mca |
1781 | {TTI::SK_Reverse, MVT::v32f16, { 7, 7, 7, 7 } }, // per mca |
1782 | {TTI::SK_Reverse, MVT::v64i8, { 7, 7, 7, 7 } }, // per mca |
1783 | |
1784 | {TTI::SK_Splice, MVT::v8f64, { 1, 1, 1, 1 } }, // vpalignd |
1785 | {TTI::SK_Splice, MVT::v4f64, { 1, 1, 1, 1 } }, // vpalignd |
1786 | {TTI::SK_Splice, MVT::v16f32, { 1, 1, 1, 1 } }, // vpalignd |
1787 | {TTI::SK_Splice, MVT::v8f32, { 1, 1, 1, 1 } }, // vpalignd |
1788 | {TTI::SK_Splice, MVT::v8i64, { 1, 1, 1, 1 } }, // vpalignd |
1789 | {TTI::SK_Splice, MVT::v4i64, { 1, 1, 1, 1 } }, // vpalignd |
1790 | {TTI::SK_Splice, MVT::v16i32, { 1, 1, 1, 1 } }, // vpalignd |
1791 | {TTI::SK_Splice, MVT::v8i32, { 1, 1, 1, 1 } }, // vpalignd |
1792 | {TTI::SK_Splice, MVT::v32i16, { 4, 4, 4, 4 } }, // split + palignr |
1793 | {TTI::SK_Splice, MVT::v32f16, { 4, 4, 4, 4 } }, // split + palignr |
1794 | {TTI::SK_Splice, MVT::v64i8, { 4, 4, 4, 4 } }, // split + palignr |
1795 | |
1796 | {TTI::SK_PermuteSingleSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermpd |
1797 | {TTI::SK_PermuteSingleSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermpd |
1798 | {TTI::SK_PermuteSingleSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermpd |
1799 | {TTI::SK_PermuteSingleSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermps |
1800 | {TTI::SK_PermuteSingleSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermps |
1801 | {TTI::SK_PermuteSingleSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermps |
1802 | {TTI::SK_PermuteSingleSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermq |
1803 | {TTI::SK_PermuteSingleSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermq |
1804 | {TTI::SK_PermuteSingleSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermq |
1805 | {TTI::SK_PermuteSingleSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermd |
1806 | {TTI::SK_PermuteSingleSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermd |
1807 | {TTI::SK_PermuteSingleSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermd |
1808 | {TTI::SK_PermuteSingleSrc, MVT::v16i8, { 1, 3, 1, 1 } }, // pshufb |
1809 | |
1810 | {TTI::SK_PermuteTwoSrc, MVT::v8f64, { 1, 3, 1, 1 } }, // vpermt2pd |
1811 | {TTI::SK_PermuteTwoSrc, MVT::v16f32, { 1, 3, 1, 1 } }, // vpermt2ps |
1812 | {TTI::SK_PermuteTwoSrc, MVT::v8i64, { 1, 3, 1, 1 } }, // vpermt2q |
1813 | {TTI::SK_PermuteTwoSrc, MVT::v16i32, { 1, 3, 1, 1 } }, // vpermt2d |
1814 | {TTI::SK_PermuteTwoSrc, MVT::v4f64, { 1, 3, 1, 1 } }, // vpermt2pd |
1815 | {TTI::SK_PermuteTwoSrc, MVT::v8f32, { 1, 3, 1, 1 } }, // vpermt2ps |
1816 | {TTI::SK_PermuteTwoSrc, MVT::v4i64, { 1, 3, 1, 1 } }, // vpermt2q |
1817 | {TTI::SK_PermuteTwoSrc, MVT::v8i32, { 1, 3, 1, 1 } }, // vpermt2d |
1818 | {TTI::SK_PermuteTwoSrc, MVT::v2f64, { 1, 3, 1, 1 } }, // vpermt2pd |
1819 | {TTI::SK_PermuteTwoSrc, MVT::v4f32, { 1, 3, 1, 1 } }, // vpermt2ps |
1820 | {TTI::SK_PermuteTwoSrc, MVT::v2i64, { 1, 3, 1, 1 } }, // vpermt2q |
1821 | {TTI::SK_PermuteTwoSrc, MVT::v4i32, { 1, 3, 1, 1 } }, // vpermt2d |
1822 | |
1823 | // FIXME: This just applies the type legalization cost rules above |
1824 | // assuming these completely split. |
1825 | {TTI::SK_PermuteSingleSrc, MVT::v32i16, { 14, 14, 14, 14 } }, |
1826 | {TTI::SK_PermuteSingleSrc, MVT::v32f16, { 14, 14, 14, 14 } }, |
1827 | {TTI::SK_PermuteSingleSrc, MVT::v64i8, { 14, 14, 14, 14 } }, |
1828 | {TTI::SK_PermuteTwoSrc, MVT::v32i16, { 42, 42, 42, 42 } }, |
1829 | {TTI::SK_PermuteTwoSrc, MVT::v32f16, { 42, 42, 42, 42 } }, |
1830 | {TTI::SK_PermuteTwoSrc, MVT::v64i8, { 42, 42, 42, 42 } }, |
1831 | |
1832 | {TTI::SK_Select, MVT::v32i16, { 1, 1, 1, 1 } }, // vpternlogq |
1833 | {TTI::SK_Select, MVT::v32f16, { 1, 1, 1, 1 } }, // vpternlogq |
1834 | {TTI::SK_Select, MVT::v64i8, { 1, 1, 1, 1 } }, // vpternlogq |
1835 | {TTI::SK_Select, MVT::v8f64, { 1, 1, 1, 1 } }, // vblendmpd |
1836 | {TTI::SK_Select, MVT::v16f32, { 1, 1, 1, 1 } }, // vblendmps |
1837 | {TTI::SK_Select, MVT::v8i64, { 1, 1, 1, 1 } }, // vblendmq |
1838 | {TTI::SK_Select, MVT::v16i32, { 1, 1, 1, 1 } }, // vblendmd |
1839 | }; |
1840 | |
1841 | if (ST->hasAVX512()) |
1842 | if (const auto *Entry = CostTableLookup(AVX512ShuffleTbl, Kind, LT.second)) |
1843 | if (auto KindCost = Entry->Cost[CostKind]) |
1844 | return LT.first * *KindCost; |
1845 | |
// Shuffle costs for AVX2 targets (checked after AVX512 above). Each entry
// maps a shuffle kind on a legalized type to an instruction count; the
// trailing comments show the expected instruction sequence.
static const CostTblEntry AVX2ShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v4f64, 1}, // vbroadcastpd
{TTI::SK_Broadcast, MVT::v8f32, 1}, // vbroadcastps
{TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
{TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
{TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb

{TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
{TTI::SK_Reverse, MVT::v8f32, 1}, // vpermps
{TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
{TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
{TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
{TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
{TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb

{TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
{TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
{TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb

{TTI::SK_Splice, MVT::v8i32, 2}, // vperm2i128 + vpalignr
{TTI::SK_Splice, MVT::v8f32, 2}, // vperm2i128 + vpalignr
{TTI::SK_Splice, MVT::v16i16, 2}, // vperm2i128 + vpalignr
{TTI::SK_Splice, MVT::v16f16, 2}, // vperm2i128 + vpalignr
{TTI::SK_Splice, MVT::v32i8, 2}, // vperm2i128 + vpalignr

{TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
{TTI::SK_PermuteSingleSrc, MVT::v8f32, 1}, // vpermps
{TTI::SK_PermuteSingleSrc, MVT::v4i64, 1}, // vpermq
{TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
// + vpblendvb
{TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
// + vpblendvb
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
// + vpblendvb

{TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vpermpd + vblendpd
{TTI::SK_PermuteTwoSrc, MVT::v8f32, 3}, // 2*vpermps + vblendps
{TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vpermq + vpblendd
{TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
// + vpblendvb
{TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
// + vpblendvb
{TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
// + vpblendvb
};
1895 | |
1896 | if (ST->hasAVX2()) |
1897 | if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) |
1898 | return LT.first * Entry->Cost; |
1899 | |
// Shuffle costs for XOP targets: vpperm (full byte permute of two sources)
// and vpermil2ps/pd make several permutes cheaper than plain AVX1.
static const CostTblEntry XOPShuffleTbl[] = {
{TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vpermil2pd
{TTI::SK_PermuteSingleSrc, MVT::v8f32, 2}, // vperm2f128 + vpermil2ps
{TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vpermil2pd
{TTI::SK_PermuteSingleSrc, MVT::v8i32, 2}, // vperm2f128 + vpermil2ps
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vextractf128 + 2*vpperm
// + vinsertf128
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vextractf128 + 2*vpperm
// + vinsertf128

{TTI::SK_PermuteTwoSrc, MVT::v16i16, 9}, // 2*vextractf128 + 6*vpperm
// + vinsertf128
{TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpperm
{TTI::SK_PermuteTwoSrc, MVT::v32i8, 9}, // 2*vextractf128 + 6*vpperm
// + vinsertf128
{TTI::SK_PermuteTwoSrc, MVT::v16i8, 1}, // vpperm
};
1917 | |
1918 | if (ST->hasXOP()) |
1919 | if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) |
1920 | return LT.first * Entry->Cost; |
1921 | |
// Shuffle costs for AVX1-only targets (no AVX2), where 256-bit shuffles
// are mostly built from 128-bit lane operations around vperm2f128 /
// vextractf128 / vinsertf128.
static const CostTblEntry AVX1ShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
{TTI::SK_Broadcast, MVT::v8f32, 2}, // vperm2f128 + vpermilps
{TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
{TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
{TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
{TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
{TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128

{TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
{TTI::SK_Reverse, MVT::v8f32, 2}, // vperm2f128 + vpermilps
{TTI::SK_Reverse, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
{TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
{TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
// + vinsertf128
{TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
// + vinsertf128
{TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
// + vinsertf128

{TTI::SK_Select, MVT::v4i64, 1}, // vblendpd
{TTI::SK_Select, MVT::v4f64, 1}, // vblendpd
{TTI::SK_Select, MVT::v8i32, 1}, // vblendps
{TTI::SK_Select, MVT::v8f32, 1}, // vblendps
{TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
{TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
{TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor

{TTI::SK_Splice, MVT::v4i64, 2}, // vperm2f128 + shufpd
{TTI::SK_Splice, MVT::v4f64, 2}, // vperm2f128 + shufpd
{TTI::SK_Splice, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_Splice, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_Splice, MVT::v16i16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
{TTI::SK_Splice, MVT::v16f16, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128
{TTI::SK_Splice, MVT::v32i8, 5}, // 2*vperm2f128 + 2*vpalignr + vinsertf128

{TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
{TTI::SK_PermuteSingleSrc, MVT::v4i64, 2}, // vperm2f128 + vshufpd
{TTI::SK_PermuteSingleSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
{TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128

{TTI::SK_PermuteTwoSrc, MVT::v4f64, 3}, // 2*vperm2f128 + vshufpd
{TTI::SK_PermuteTwoSrc, MVT::v4i64, 3}, // 2*vperm2f128 + vshufpd
{TTI::SK_PermuteTwoSrc, MVT::v8f32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
// + 4*por + vinsertf128
{TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
// + 4*por + vinsertf128
{TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
// + 4*por + vinsertf128
};
1980 | |
1981 | if (ST->hasAVX()) |
1982 | if (const auto *Entry = CostTableLookup(AVX1ShuffleTbl, Kind, LT.second)) |
1983 | return LT.first * Entry->Cost; |
1984 | |
// SSE4.1 adds single-instruction blends, so only SK_Select improves here;
// the other shuffle kinds fall through to the SSSE3/SSE2 tables below.
static const CostTblEntry SSE41ShuffleTbl[] = {
{TTI::SK_Select, MVT::v2i64, 1}, // pblendw
{TTI::SK_Select, MVT::v2f64, 1}, // movsd
{TTI::SK_Select, MVT::v4i32, 1}, // pblendw
{TTI::SK_Select, MVT::v4f32, 1}, // blendps
{TTI::SK_Select, MVT::v8i16, 1}, // pblendw
{TTI::SK_Select, MVT::v8f16, 1}, // pblendw
{TTI::SK_Select, MVT::v16i8, 1} // pblendvb
};
1994 | |
1995 | if (ST->hasSSE41()) |
1996 | if (const auto *Entry = CostTableLookup(SSE41ShuffleTbl, Kind, LT.second)) |
1997 | return LT.first * Entry->Cost; |
1998 | |
// SSSE3's pshufb (arbitrary byte permute) and palignr make byte/word
// shuffles much cheaper than the SSE2 fallbacks below.
static const CostTblEntry SSSE3ShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
{TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
{TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb

{TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
{TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
{TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb

{TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
{TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
{TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por

{TTI::SK_Splice, MVT::v4i32, 1}, // palignr
{TTI::SK_Splice, MVT::v4f32, 1}, // palignr
{TTI::SK_Splice, MVT::v8i16, 1}, // palignr
{TTI::SK_Splice, MVT::v8f16, 1}, // palignr
{TTI::SK_Splice, MVT::v16i8, 1}, // palignr

{TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
{TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
{TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb

{TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
{TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
{TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
};
2026 | |
2027 | if (ST->hasSSSE3()) |
2028 | if (const auto *Entry = CostTableLookup(SSSE3ShuffleTbl, Kind, LT.second)) |
2029 | return LT.first * Entry->Cost; |
2030 | |
// Baseline SSE2 shuffle costs. Without pshufb, sub-dword shuffles need
// long pshuflw/pshufhw/pshufd/unpck sequences.
static const CostTblEntry SSE2ShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v2f64, 1}, // shufpd
{TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
{TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
{TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
{TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
{TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd

{TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
{TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
{TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
{TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
{TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
{TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + packus

{TTI::SK_Select, MVT::v2i64, 1}, // movsd
{TTI::SK_Select, MVT::v2f64, 1}, // movsd
{TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
{TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
{TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
{TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por

{TTI::SK_Splice, MVT::v2i64, 1}, // shufpd
{TTI::SK_Splice, MVT::v2f64, 1}, // shufpd
{TTI::SK_Splice, MVT::v4i32, 2}, // 2*{unpck,movsd,pshufd}
{TTI::SK_Splice, MVT::v8i16, 3}, // psrldq + pslldq + por
{TTI::SK_Splice, MVT::v8f16, 3}, // psrldq + pslldq + por
{TTI::SK_Splice, MVT::v16i8, 3}, // psrldq + pslldq + por

{TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
{TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // pshufd
{TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
{TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
// + pshufd/unpck
{TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
// + pshufd/unpck
{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + 2*packus

{ TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
{ TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
{ TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
{ TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
{ TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
{ TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
};
2078 | |
// Broadcast of a *loaded* f64 is free on SSE3: movddup folds the load.
// Only consulted when one of the shuffle operands is a LoadInst.
static const CostTblEntry SSE3BroadcastLoadTbl[] = {
{TTI::SK_Broadcast, MVT::v2f64, 0}, // broadcast handled by movddup
};
2082 | |
2083 | if (ST->hasSSE2()) { |
2084 | bool IsLoad = |
2085 | llvm::any_of(Range&: Args, P: [](const auto &V) { return isa<LoadInst>(V); }); |
2086 | if (ST->hasSSE3() && IsLoad) |
2087 | if (const auto *Entry = |
2088 | CostTableLookup(SSE3BroadcastLoadTbl, Kind, LT.second)) { |
2089 | assert(isLegalBroadcastLoad(BaseTp->getElementType(), |
2090 | LT.second.getVectorElementCount()) && |
2091 | "Table entry missing from isLegalBroadcastLoad()" ); |
2092 | return LT.first * Entry->Cost; |
2093 | } |
2094 | |
2095 | if (const auto *Entry = CostTableLookup(SSE2ShuffleTbl, Kind, LT.second)) |
2096 | return LT.first * Entry->Cost; |
2097 | } |
2098 | |
// SSE1 only provides shufps, so only v4f32 shuffles have table entries;
// anything else falls through to the base implementation.
static const CostTblEntry SSE1ShuffleTbl[] = {
{ TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
{ TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
{ TTI::SK_Select, MVT::v4f32, 2 }, // 2*shufps
{ TTI::SK_Splice, MVT::v4f32, 2 }, // 2*shufps
{ TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
{ TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
};
2107 | |
2108 | if (ST->hasSSE1()) |
2109 | if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second)) |
2110 | return LT.first * Entry->Cost; |
2111 | |
2112 | return BaseT::getShuffleCost(Kind, Tp: BaseTp, Mask, CostKind, Index, SubTp); |
2113 | } |
2114 | |
2115 | InstructionCost X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, |
2116 | Type *Src, |
2117 | TTI::CastContextHint CCH, |
2118 | TTI::TargetCostKind CostKind, |
2119 | const Instruction *I) { |
2120 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
2121 | assert(ISD && "Invalid opcode" ); |
2122 | |
2123 | // TODO: Allow non-throughput costs that aren't binary. |
2124 | auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost { |
2125 | if (CostKind != TTI::TCK_RecipThroughput) |
2126 | return Cost == 0 ? 0 : 1; |
2127 | return Cost; |
2128 | }; |
2129 | |
2130 | // The cost tables include both specific, custom (non-legal) src/dst type |
2131 | // conversions and generic, legalized types. We test for customs first, before |
2132 | // falling back to legalization. |
2133 | // FIXME: Need a better design of the cost table to handle non-simple types of |
2134 | // potential massive combinations (elem_num x src_type x dst_type). |
// Conversion costs for AVX512BW: 512-bit byte/word extends/truncates and
// the vXi1 mask-register conversions (kmov-based sext/zext/trunc).
static const TypeConversionCostTblEntry AVX512BWConversionTbl[] {
{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },

// Mask sign extend has an instruction.
{ ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v64i1, 1 },

// Mask zero extend is a sext + shift.
{ ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v64i1, 2 },

// Truncation to a mask is a compare-style op plus a kmov.
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
{ ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
{ ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
{ ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
{ ISD::TRUNCATE, MVT::v64i1, MVT::v32i16, 2 },

{ ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // vpmovwb
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // vpmovwb
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // vpmovwb
};
2201 | |
// Conversion costs for AVX512DQ: mask <-> dword/qword conversions and the
// direct i64 <-> f32/f64 conversion instructions DQ introduces.
static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
// Mask sign extend has an instruction.
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 },

// Mask zero extend is a sext + shift.
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },

{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v8i64, 2 },

// AVX512DQ adds single-instruction i64 <-> fp conversions.
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },

{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },

{ ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
{ ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },

{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
};
2244 | |
2245 | // TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and |
2246 | // 256-bit wide vectors. |
2247 | |
// Conversion costs for baseline AVX512F (512-bit fp/int conversions,
// vpmov* truncates, and mask conversions emulated without DQ/BW).
static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
{ ISD::FP_EXTEND, MVT::v16f64, MVT::v16f32, 4 }, // 2*vcvtps2pd+vextractf64x4
{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },

{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 2 }, // vpmovdb
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 2 }, // vpmovdb
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 }, // vpmovdb
{ ISD::TRUNCATE, MVT::v32i8, MVT::v16i32, 2 }, // vpmovdb
{ ISD::TRUNCATE, MVT::v64i8, MVT::v16i32, 2 }, // vpmovdb
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 }, // vpmovdw
{ ISD::TRUNCATE, MVT::v32i16, MVT::v16i32, 2 }, // vpmovdw
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 2 }, // vpmovqb
{ ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1 }, // vpshufb
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 }, // vpmovqb
{ ISD::TRUNCATE, MVT::v16i8, MVT::v8i64, 2 }, // vpmovqb
{ ISD::TRUNCATE, MVT::v32i8, MVT::v8i64, 2 }, // vpmovqb
{ ISD::TRUNCATE, MVT::v64i8, MVT::v8i64, 2 }, // vpmovqb
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 }, // vpmovqw
{ ISD::TRUNCATE, MVT::v16i16, MVT::v8i64, 2 }, // vpmovqw
{ ISD::TRUNCATE, MVT::v32i16, MVT::v8i64, 2 }, // vpmovqw
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 }, // vpmovqd
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb

{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
{ ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
{ ISD::TRUNCATE, MVT::v64i8, MVT::v32i16, 8 },

// Sign extend is zmm vpternlogd+vptruncdb.
// Zero extend is zmm broadcast load+vptruncdw.
{ ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },

// Sign extend is zmm vpternlogd+vptruncdw.
// Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
{ ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },

{ ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
{ ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq

{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq

{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },

{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right

{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },

{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v16i8, 2 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },

{ ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
{ ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f64, 7 },
{ ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f64,15 },
{ ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f32,11 },
{ ISD::FP_TO_SINT, MVT::v64i8, MVT::v64f64,31 },
{ ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
{ ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f64, 7 },
{ ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f32, 5 },
{ ISD::FP_TO_SINT, MVT::v32i16, MVT::v32f64,15 },
{ ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 1 },
{ ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f64, 3 },

{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
{ ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
{ ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
{ ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
{ ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
};
2384 | |
// Conversion costs for AVX512BW + AVX512VL: the same mask byte/word
// conversions as the BW table, but on 128/256-bit (xmm/ymm) types.
static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
// Mask sign extend has an instruction.
{ ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v32i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v32i8, MVT::v64i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v64i1, 1 },

// Mask zero extend is a sext + shift.
{ ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i8, MVT::v64i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v64i1, 2 },

{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v8i16, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v8i16, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 },
{ ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 },
{ ISD::TRUNCATE, MVT::v32i1, MVT::v16i16, 2 },
{ ISD::TRUNCATE, MVT::v64i1, MVT::v32i8, 2 },
{ ISD::TRUNCATE, MVT::v64i1, MVT::v16i16, 2 },

{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
};
2444 | |
// Conversion costs for subtargets with AVX-512DQ on 128/256-bit vectors.
// Entry layout: { ISD opcode, destination MVT, source MVT, cost }; the
// table is matched via ConvertCostTableLookup when ST->hasDQI() (see the
// lookup code further down in this function).
static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
// Mask sign extend has an instruction.
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v2i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 },

// Mask zero extend is a sext + shift.
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v2i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 },

// Truncation of i32/i64 vectors to a vXi1 mask, modelled as 2 ops
// (presumably a shift plus a move-to-mask; confirm against codegen).
{ ISD::TRUNCATE, MVT::v16i1, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 },
{ ISD::TRUNCATE, MVT::v2i1, MVT::v4i32, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 },
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },

// i64 <-> floating-point conversions are all modelled as cost 1 here.
{ ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },

{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },

{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
{ ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },

{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
{ ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
};
2495 | |
// Conversion costs for any AVX-512 subtarget operating on 128/256-bit
// vectors. Entry layout: { ISD opcode, destination MVT, source MVT,
// cost }; matched via ConvertCostTableLookup when ST->hasAVX512() (see
// the lookup code further down in this function).
static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
// Truncation to a vXi1 mask value.
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v16i1, MVT::v8i32, 2 }, // vpslld+vptestmd
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
// Integer down-conversions using the AVX-512 vpmov* narrowing moves.
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 }, // vpmovqb
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 }, // vpmovqw
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 }, // vpmovwb

// sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
// zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
{ ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
{ ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
{ ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
{ ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
{ ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
{ ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },

// sign extend is vpcmpeq+maskedmove+vpmovdw
// zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
{ ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },

{ ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
{ ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i1, 1 }, // vpternlogd
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i1, 2 }, // vpternlogd+psrld

{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq

// Extensions between plain integer vector types, all single ops here.
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },

// Integer <-> floating-point conversions.
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },

{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 1 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },

{ ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
{ ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 2 },
{ ISD::FP_TO_SINT, MVT::v32i8, MVT::v32f32, 5 },

{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
};
2596 | |
// Conversion costs for AVX2-capable subtargets. Entry layout:
// { ISD opcode, destination MVT, source MVT, cost }; matched via
// ConvertCostTableLookup when ST->hasAVX2() (see the lookup code
// further down in this function).
static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
// Extensions from a vXi1 source (no mask registers pre-AVX-512, so
// these are modelled as multi-op sequences except the i16 case).
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },

// Extensions between plain integer vector types.
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 2 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 2 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 2 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },

// Truncation to a vXi1 mask value.
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },

// Integer truncations.
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 4 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 4 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 1 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 1 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 1 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 4 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 4 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 1 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 1 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 5 },
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },

// FP widening/narrowing (f32 <-> f64).
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },

{ ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 1 },
{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 1 },
{ ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 1 },
{ ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 3 },

{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 3 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 3 },
{ ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 4 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 3 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 4 },

{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 3 },

{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 2 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 2 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 2 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
};
2671 | |
// Conversion costs for AVX1-only subtargets (256-bit vectors but no
// AVX2 integer ops, so 256-bit integer conversions typically split into
// two 128-bit halves — hence the higher counts than AVX2ConversionTbl).
// Entry layout: { ISD opcode, destination MVT, source MVT, cost };
// matched via ConvertCostTableLookup when ST->hasAVX().
static const TypeConversionCostTblEntry AVXConversionTbl[] = {
// Extensions from a vXi1 source.
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },

// Extensions between plain integer vector types.
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v16i8, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v16i8, 3 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 3 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v8i16, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 3 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },

// Truncation to a vXi1 mask value.
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
{ ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },

// Integer truncations.
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // and+extract+packuswb
{ ISD::TRUNCATE, MVT::v16i8, MVT::v8i32, 5 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v4i64, 5 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v4i64, 3 }, // and+extract+2*packusdw
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },

{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i1, 3 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i1, 8 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i32, 4 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 5 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 8 },

// Unsigned int -> fp has no direct instruction pre-AVX-512, so these
// sequences are noticeably more expensive than the signed variants.
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 7 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i1, 7 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i1, 6 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v16i8, 4 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v16i8, 2 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v8i16, 2 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 4 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 4 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 6 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 10 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 10 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 18 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 10 },

{ ISD::FP_TO_SINT, MVT::v16i8, MVT::v8f32, 2 },
{ ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f64, 2 },
{ ISD::FP_TO_SINT, MVT::v32i8, MVT::v8f32, 2 },
{ ISD::FP_TO_SINT, MVT::v32i8, MVT::v4f64, 2 },
{ ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 2 },
{ ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f64, 2 },
{ ISD::FP_TO_SINT, MVT::v16i16, MVT::v8f32, 2 },
{ ISD::FP_TO_SINT, MVT::v16i16, MVT::v4f64, 2 },
{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f64, 2 },
{ ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f32, 2 },
{ ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f64, 5 },

{ ISD::FP_TO_UINT, MVT::v16i8, MVT::v8f32, 2 },
{ ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f64, 2 },
{ ISD::FP_TO_UINT, MVT::v32i8, MVT::v8f32, 2 },
{ ISD::FP_TO_UINT, MVT::v32i8, MVT::v4f64, 2 },
{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 2 },
{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f64, 2 },
{ ISD::FP_TO_UINT, MVT::v16i16, MVT::v8f32, 2 },
{ ISD::FP_TO_UINT, MVT::v16i16, MVT::v4f64, 2 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 3 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 6 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 7 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v4f64, 7 },

{ ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 1 },
{ ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 1 },
};
2768 | |
// Conversion costs for SSE4.1-capable subtargets (128-bit vectors).
// Entry layout: { ISD opcode, destination MVT, source MVT, cost };
// matched via ConvertCostTableLookup when ST->hasSSE41().
static const TypeConversionCostTblEntry SSE41ConversionTbl[] = {
// Integer extensions — single ops with the SSE4.1 pmovsx/pmovzx family
// (presumably; confirm against codegen).
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 1 },

// These truncates end up widening elements.
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
{ ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 2 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 2 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 2 },

{ ISD::SINT_TO_FP, MVT::f32, MVT::i32, 1 },
{ ISD::SINT_TO_FP, MVT::f64, MVT::i32, 1 },
{ ISD::SINT_TO_FP, MVT::f32, MVT::i64, 1 },
{ ISD::SINT_TO_FP, MVT::f64, MVT::i64, 1 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 1 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 2 },

{ ISD::UINT_TO_FP, MVT::f32, MVT::i32, 1 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i32, 1 },
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 1 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 1 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 3 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 2 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 12 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 22 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 4 },

{ ISD::FP_TO_SINT, MVT::i32, MVT::f32, 1 },
{ ISD::FP_TO_SINT, MVT::i64, MVT::f32, 1 },
{ ISD::FP_TO_SINT, MVT::i32, MVT::f64, 1 },
{ ISD::FP_TO_SINT, MVT::i64, MVT::f64, 1 },
{ ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 2 },
{ ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 2 },
{ ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 1 },
{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 1 },

{ ISD::FP_TO_UINT, MVT::i32, MVT::f32, 1 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
{ ISD::FP_TO_UINT, MVT::i32, MVT::f64, 1 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
{ ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 2 },
{ ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 2 },
{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 4 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 4 },
};
2841 | |
// Baseline conversion costs for SSE2-capable subtargets; this is the
// last table consulted, so it is the fallback for all x86-64 targets.
// Entry layout: { ISD opcode, destination MVT, source MVT, cost };
// matched via ConvertCostTableLookup when ST->hasSSE2().
static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
// These are somewhat magic numbers justified by comparing the
// output of llvm-mca for our various supported scheduler models
// and basing it off the worst case scenario.
{ ISD::SINT_TO_FP, MVT::f32, MVT::i32, 3 },
{ ISD::SINT_TO_FP, MVT::f64, MVT::i32, 3 },
{ ISD::SINT_TO_FP, MVT::f32, MVT::i64, 3 },
{ ISD::SINT_TO_FP, MVT::f64, MVT::i64, 3 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v16i8, 3 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v8i16, 3 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 3 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v4i32, 4 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v2i64, 8 },
{ ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 8 },

// Unsigned conversions need extra fixup sequences, hence the higher
// costs (especially for i64 sources).
{ ISD::UINT_TO_FP, MVT::f32, MVT::i32, 3 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i32, 3 },
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 8 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 9 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v16i8, 4 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v16i8, 4 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v8i16, 4 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v8i16, 4 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 7 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v4i32, 7 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 5 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 15 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 18 },

{ ISD::FP_TO_SINT, MVT::i32, MVT::f32, 4 },
{ ISD::FP_TO_SINT, MVT::i64, MVT::f32, 4 },
{ ISD::FP_TO_SINT, MVT::i32, MVT::f64, 4 },
{ ISD::FP_TO_SINT, MVT::i64, MVT::f64, 4 },
{ ISD::FP_TO_SINT, MVT::v16i8, MVT::v4f32, 6 },
{ ISD::FP_TO_SINT, MVT::v16i8, MVT::v2f64, 6 },
{ ISD::FP_TO_SINT, MVT::v8i16, MVT::v4f32, 5 },
{ ISD::FP_TO_SINT, MVT::v8i16, MVT::v2f64, 5 },
{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 4 },
{ ISD::FP_TO_SINT, MVT::v4i32, MVT::v2f64, 4 },

{ ISD::FP_TO_UINT, MVT::i32, MVT::f32, 4 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
{ ISD::FP_TO_UINT, MVT::i32, MVT::f64, 4 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 15 },
{ ISD::FP_TO_UINT, MVT::v16i8, MVT::v4f32, 6 },
{ ISD::FP_TO_UINT, MVT::v16i8, MVT::v2f64, 6 },
{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v4f32, 5 },
{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v2f64, 5 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 8 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v2f64, 8 },

// Integer extensions — no pmovsx/pmovzx before SSE4.1, so zext is
// unpack-based and sext costs one more op than the matching zext.
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v16i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v16i8, 2 },
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v16i8, 3 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v16i8, 2 },
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v8i16, 2 },
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v8i16, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i32, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i32, MVT::v8i16, 2 },
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v4i32, 1 },
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v4i32, 2 },

// These truncates are really widening elements.
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
{ ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
{ ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
{ ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW

{ ISD::TRUNCATE, MVT::v16i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v4i32, 3 }, // PAND+2*PACKUSWB
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7 },
{ ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v4i32, 3 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32,10 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v2i64, 4 }, // PAND+3*PACKUSWB
{ ISD::TRUNCATE, MVT::v8i16, MVT::v2i64, 2 }, // PSHUFD+PSHUFLW
{ ISD::TRUNCATE, MVT::v4i32, MVT::v2i64, 1 }, // PSHUFD
};
2928 | |
2929 | // Attempt to map directly to (simple) MVT types to let us match custom entries. |
2930 | EVT SrcTy = TLI->getValueType(DL, Ty: Src); |
2931 | EVT DstTy = TLI->getValueType(DL, Ty: Dst); |
2932 | |
2933 | // The function getSimpleVT only handles simple value types. |
2934 | if (SrcTy.isSimple() && DstTy.isSimple()) { |
2935 | MVT SimpleSrcTy = SrcTy.getSimpleVT(); |
2936 | MVT SimpleDstTy = DstTy.getSimpleVT(); |
2937 | |
2938 | if (ST->useAVX512Regs()) { |
2939 | if (ST->hasBWI()) |
2940 | if (const auto *Entry = ConvertCostTableLookup( |
2941 | AVX512BWConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) |
2942 | return AdjustCost(Entry->Cost); |
2943 | |
2944 | if (ST->hasDQI()) |
2945 | if (const auto *Entry = ConvertCostTableLookup( |
2946 | AVX512DQConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) |
2947 | return AdjustCost(Entry->Cost); |
2948 | |
2949 | if (ST->hasAVX512()) |
2950 | if (const auto *Entry = ConvertCostTableLookup( |
2951 | AVX512FConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) |
2952 | return AdjustCost(Entry->Cost); |
2953 | } |
2954 | |
2955 | if (ST->hasBWI()) |
2956 | if (const auto *Entry = ConvertCostTableLookup( |
2957 | AVX512BWVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) |
2958 | return AdjustCost(Entry->Cost); |
2959 | |
2960 | if (ST->hasDQI()) |
2961 | if (const auto *Entry = ConvertCostTableLookup( |
2962 | AVX512DQVLConversionTbl, ISD, SimpleDstTy, SimpleSrcTy)) |
2963 | return AdjustCost(Entry->Cost); |
2964 | |
2965 | if (ST->hasAVX512()) |
2966 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, |
2967 | SimpleDstTy, SimpleSrcTy)) |
2968 | return AdjustCost(Entry->Cost); |
2969 | |
2970 | if (ST->hasAVX2()) { |
2971 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, |
2972 | SimpleDstTy, SimpleSrcTy)) |
2973 | return AdjustCost(Entry->Cost); |
2974 | } |
2975 | |
2976 | if (ST->hasAVX()) { |
2977 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, |
2978 | SimpleDstTy, SimpleSrcTy)) |
2979 | return AdjustCost(Entry->Cost); |
2980 | } |
2981 | |
2982 | if (ST->hasSSE41()) { |
2983 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, |
2984 | SimpleDstTy, SimpleSrcTy)) |
2985 | return AdjustCost(Entry->Cost); |
2986 | } |
2987 | |
2988 | if (ST->hasSSE2()) { |
2989 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, |
2990 | SimpleDstTy, SimpleSrcTy)) |
2991 | return AdjustCost(Entry->Cost); |
2992 | } |
2993 | } |
2994 | |
2995 | // Fall back to legalized types. |
2996 | std::pair<InstructionCost, MVT> LTSrc = getTypeLegalizationCost(Ty: Src); |
2997 | std::pair<InstructionCost, MVT> LTDest = getTypeLegalizationCost(Ty: Dst); |
2998 | |
2999 | // If we're truncating to the same legalized type - just assume its free. |
3000 | if (ISD == ISD::TRUNCATE && LTSrc.second == LTDest.second) |
3001 | return TTI::TCC_Free; |
3002 | |
3003 | if (ST->useAVX512Regs()) { |
3004 | if (ST->hasBWI()) |
3005 | if (const auto *Entry = ConvertCostTableLookup( |
3006 | AVX512BWConversionTbl, ISD, LTDest.second, LTSrc.second)) |
3007 | return AdjustCost(std::max(a: LTSrc.first, b: LTDest.first) * Entry->Cost); |
3008 | |
3009 | if (ST->hasDQI()) |
3010 | if (const auto *Entry = ConvertCostTableLookup( |
3011 | AVX512DQConversionTbl, ISD, LTDest.second, LTSrc.second)) |
3012 | return AdjustCost(std::max(a: LTSrc.first, b: LTDest.first) * Entry->Cost); |
3013 | |
3014 | if (ST->hasAVX512()) |
3015 | if (const auto *Entry = ConvertCostTableLookup( |
3016 | AVX512FConversionTbl, ISD, LTDest.second, LTSrc.second)) |
3017 | return AdjustCost(std::max(a: LTSrc.first, b: LTDest.first) * Entry->Cost); |
3018 | } |
3019 | |
3020 | if (ST->hasBWI()) |
3021 | if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD, |
3022 | LTDest.second, LTSrc.second)) |
3023 | return AdjustCost(std::max(a: LTSrc.first, b: LTDest.first) * Entry->Cost); |
3024 | |
3025 | if (ST->hasDQI()) |
3026 | if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD, |
3027 | LTDest.second, LTSrc.second)) |
3028 | return AdjustCost(std::max(a: LTSrc.first, b: LTDest.first) * Entry->Cost); |
3029 | |
3030 | if (ST->hasAVX512()) |
3031 | if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD, |
3032 | LTDest.second, LTSrc.second)) |
3033 | return AdjustCost(std::max(a: LTSrc.first, b: LTDest.first) * Entry->Cost); |
3034 | |
3035 | if (ST->hasAVX2()) |
3036 | if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD, |
3037 | LTDest.second, LTSrc.second)) |
3038 | return AdjustCost(std::max(a: LTSrc.first, b: LTDest.first) * Entry->Cost); |
3039 | |
3040 | if (ST->hasAVX()) |
3041 | if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD, |
3042 | LTDest.second, LTSrc.second)) |
3043 | return AdjustCost(std::max(a: LTSrc.first, b: LTDest.first) * Entry->Cost); |
3044 | |
3045 | if (ST->hasSSE41()) |
3046 | if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD, |
3047 | LTDest.second, LTSrc.second)) |
3048 | return AdjustCost(std::max(a: LTSrc.first, b: LTDest.first) * Entry->Cost); |
3049 | |
3050 | if (ST->hasSSE2()) |
3051 | if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD, |
3052 | LTDest.second, LTSrc.second)) |
3053 | return AdjustCost(std::max(a: LTSrc.first, b: LTDest.first) * Entry->Cost); |
3054 | |
3055 | // Fallback, for i8/i16 sitofp/uitofp cases we need to extend to i32 for |
3056 | // sitofp. |
3057 | if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) && |
3058 | 1 < Src->getScalarSizeInBits() && Src->getScalarSizeInBits() < 32) { |
3059 | Type *ExtSrc = Src->getWithNewBitWidth(NewBitWidth: 32); |
3060 | unsigned ExtOpc = |
3061 | (ISD == ISD::SINT_TO_FP) ? Instruction::SExt : Instruction::ZExt; |
3062 | |
3063 | // For scalar loads the extend would be free. |
3064 | InstructionCost ExtCost = 0; |
3065 | if (!(Src->isIntegerTy() && I && isa<LoadInst>(Val: I->getOperand(i: 0)))) |
3066 | ExtCost = getCastInstrCost(Opcode: ExtOpc, Dst: ExtSrc, Src, CCH, CostKind); |
3067 | |
3068 | return ExtCost + getCastInstrCost(Opcode: Instruction::SIToFP, Dst, Src: ExtSrc, |
3069 | CCH: TTI::CastContextHint::None, CostKind); |
3070 | } |
3071 | |
3072 | // Fallback for fptosi/fptoui i8/i16 cases we need to truncate from fptosi |
3073 | // i32. |
3074 | if ((ISD == ISD::FP_TO_SINT || ISD == ISD::FP_TO_UINT) && |
3075 | 1 < Dst->getScalarSizeInBits() && Dst->getScalarSizeInBits() < 32) { |
3076 | Type *TruncDst = Dst->getWithNewBitWidth(NewBitWidth: 32); |
3077 | return getCastInstrCost(Opcode: Instruction::FPToSI, Dst: TruncDst, Src, CCH, CostKind) + |
3078 | getCastInstrCost(Opcode: Instruction::Trunc, Dst, Src: TruncDst, |
3079 | CCH: TTI::CastContextHint::None, CostKind); |
3080 | } |
3081 | |
3082 | return AdjustCost( |
3083 | BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I)); |
3084 | } |
3085 | |
3086 | InstructionCost X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, |
3087 | Type *CondTy, |
3088 | CmpInst::Predicate VecPred, |
3089 | TTI::TargetCostKind CostKind, |
3090 | const Instruction *I) { |
3091 | // Early out if this type isn't scalar/vector integer/float. |
3092 | if (!(ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) |
3093 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, |
3094 | I); |
3095 | |
3096 | // Legalize the type. |
3097 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
3098 | |
3099 | MVT MTy = LT.second; |
3100 | |
3101 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
3102 | assert(ISD && "Invalid opcode" ); |
3103 | |
3104 | InstructionCost = 0; |
3105 | if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) { |
3106 | // Some vector comparison predicates cost extra instructions. |
3107 | // TODO: Adjust ExtraCost based on CostKind? |
3108 | // TODO: Should we invert this and assume worst case cmp costs |
3109 | // and reduce for particular predicates? |
3110 | if (MTy.isVector() && |
3111 | !((ST->hasXOP() && (!ST->hasAVX2() || MTy.is128BitVector())) || |
3112 | (ST->hasAVX512() && 32 <= MTy.getScalarSizeInBits()) || |
3113 | ST->hasBWI())) { |
3114 | // Fallback to I if a specific predicate wasn't specified. |
3115 | CmpInst::Predicate Pred = VecPred; |
3116 | if (I && (Pred == CmpInst::BAD_ICMP_PREDICATE || |
3117 | Pred == CmpInst::BAD_FCMP_PREDICATE)) |
3118 | Pred = cast<CmpInst>(Val: I)->getPredicate(); |
3119 | |
3120 | bool CmpWithConstant = false; |
3121 | if (auto *CmpInstr = dyn_cast_or_null<CmpInst>(Val: I)) |
3122 | CmpWithConstant = isa<Constant>(Val: CmpInstr->getOperand(i_nocapture: 1)); |
3123 | |
3124 | switch (Pred) { |
3125 | case CmpInst::Predicate::ICMP_NE: |
3126 | // xor(cmpeq(x,y),-1) |
3127 | ExtraCost = CmpWithConstant ? 0 : 1; |
3128 | break; |
3129 | case CmpInst::Predicate::ICMP_SGE: |
3130 | case CmpInst::Predicate::ICMP_SLE: |
3131 | // xor(cmpgt(x,y),-1) |
3132 | ExtraCost = CmpWithConstant ? 0 : 1; |
3133 | break; |
3134 | case CmpInst::Predicate::ICMP_ULT: |
3135 | case CmpInst::Predicate::ICMP_UGT: |
3136 | // cmpgt(xor(x,signbit),xor(y,signbit)) |
3137 | // xor(cmpeq(pmaxu(x,y),x),-1) |
3138 | ExtraCost = CmpWithConstant ? 1 : 2; |
3139 | break; |
3140 | case CmpInst::Predicate::ICMP_ULE: |
3141 | case CmpInst::Predicate::ICMP_UGE: |
3142 | if ((ST->hasSSE41() && MTy.getScalarSizeInBits() == 32) || |
3143 | (ST->hasSSE2() && MTy.getScalarSizeInBits() < 32)) { |
3144 | // cmpeq(psubus(x,y),0) |
3145 | // cmpeq(pminu(x,y),x) |
3146 | ExtraCost = 1; |
3147 | } else { |
3148 | // xor(cmpgt(xor(x,signbit),xor(y,signbit)),-1) |
3149 | ExtraCost = CmpWithConstant ? 2 : 3; |
3150 | } |
3151 | break; |
3152 | case CmpInst::Predicate::FCMP_ONE: |
3153 | case CmpInst::Predicate::FCMP_UEQ: |
3154 | // Without AVX we need to expand FCMP_ONE/FCMP_UEQ cases. |
3155 | // Use FCMP_UEQ expansion - FCMP_ONE should be the same. |
3156 | if (CondTy && !ST->hasAVX()) |
3157 | return getCmpSelInstrCost(Opcode, ValTy, CondTy, |
3158 | VecPred: CmpInst::Predicate::FCMP_UNO, CostKind) + |
3159 | getCmpSelInstrCost(Opcode, ValTy, CondTy, |
3160 | VecPred: CmpInst::Predicate::FCMP_OEQ, CostKind) + |
3161 | getArithmeticInstrCost(Opcode: Instruction::Or, Ty: CondTy, CostKind); |
3162 | |
3163 | break; |
3164 | case CmpInst::Predicate::BAD_ICMP_PREDICATE: |
3165 | case CmpInst::Predicate::BAD_FCMP_PREDICATE: |
3166 | // Assume worst case scenario and add the maximum extra cost. |
3167 | ExtraCost = 3; |
3168 | break; |
3169 | default: |
3170 | break; |
3171 | } |
3172 | } |
3173 | } |
3174 | |
3175 | static const CostKindTblEntry SLMCostTbl[] = { |
3176 | // slm pcmpeq/pcmpgt throughput is 2 |
3177 | { ISD::SETCC, MVT::v2i64, { 2, 5, 1, 2 } }, |
3178 | // slm pblendvb/blendvpd/blendvps throughput is 4 |
3179 | { ISD::SELECT, MVT::v2f64, { 4, 4, 1, 3 } }, // vblendvpd |
3180 | { ISD::SELECT, MVT::v4f32, { 4, 4, 1, 3 } }, // vblendvps |
3181 | { ISD::SELECT, MVT::v2i64, { 4, 4, 1, 3 } }, // pblendvb |
3182 | { ISD::SELECT, MVT::v8i32, { 4, 4, 1, 3 } }, // pblendvb |
3183 | { ISD::SELECT, MVT::v8i16, { 4, 4, 1, 3 } }, // pblendvb |
3184 | { ISD::SELECT, MVT::v16i8, { 4, 4, 1, 3 } }, // pblendvb |
3185 | }; |
3186 | |
3187 | static const CostKindTblEntry AVX512BWCostTbl[] = { |
3188 | { ISD::SETCC, MVT::v32i16, { 1, 1, 1, 1 } }, |
3189 | { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 1 } }, |
3190 | { ISD::SETCC, MVT::v64i8, { 1, 1, 1, 1 } }, |
3191 | { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 1 } }, |
3192 | |
3193 | { ISD::SELECT, MVT::v32i16, { 1, 1, 1, 1 } }, |
3194 | { ISD::SELECT, MVT::v64i8, { 1, 1, 1, 1 } }, |
3195 | }; |
3196 | |
3197 | static const CostKindTblEntry AVX512CostTbl[] = { |
3198 | { ISD::SETCC, MVT::v8f64, { 1, 4, 1, 1 } }, |
3199 | { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 1 } }, |
3200 | { ISD::SETCC, MVT::v16f32, { 1, 4, 1, 1 } }, |
3201 | { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 1 } }, |
3202 | |
3203 | { ISD::SETCC, MVT::v8i64, { 1, 1, 1, 1 } }, |
3204 | { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 1 } }, |
3205 | { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } }, |
3206 | { ISD::SETCC, MVT::v16i32, { 1, 1, 1, 1 } }, |
3207 | { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 1 } }, |
3208 | { ISD::SETCC, MVT::v32i16, { 3, 7, 5, 5 } }, |
3209 | { ISD::SETCC, MVT::v64i8, { 3, 7, 5, 5 } }, |
3210 | |
3211 | { ISD::SELECT, MVT::v8i64, { 1, 1, 1, 1 } }, |
3212 | { ISD::SELECT, MVT::v4i64, { 1, 1, 1, 1 } }, |
3213 | { ISD::SELECT, MVT::v2i64, { 1, 1, 1, 1 } }, |
3214 | { ISD::SELECT, MVT::v16i32, { 1, 1, 1, 1 } }, |
3215 | { ISD::SELECT, MVT::v8i32, { 1, 1, 1, 1 } }, |
3216 | { ISD::SELECT, MVT::v4i32, { 1, 1, 1, 1 } }, |
3217 | { ISD::SELECT, MVT::v8f64, { 1, 1, 1, 1 } }, |
3218 | { ISD::SELECT, MVT::v4f64, { 1, 1, 1, 1 } }, |
3219 | { ISD::SELECT, MVT::v2f64, { 1, 1, 1, 1 } }, |
3220 | { ISD::SELECT, MVT::f64, { 1, 1, 1, 1 } }, |
3221 | { ISD::SELECT, MVT::v16f32, { 1, 1, 1, 1 } }, |
3222 | { ISD::SELECT, MVT::v8f32 , { 1, 1, 1, 1 } }, |
3223 | { ISD::SELECT, MVT::v4f32, { 1, 1, 1, 1 } }, |
3224 | { ISD::SELECT, MVT::f32 , { 1, 1, 1, 1 } }, |
3225 | |
3226 | { ISD::SELECT, MVT::v32i16, { 2, 2, 4, 4 } }, |
3227 | { ISD::SELECT, MVT::v16i16, { 1, 1, 1, 1 } }, |
3228 | { ISD::SELECT, MVT::v8i16, { 1, 1, 1, 1 } }, |
3229 | { ISD::SELECT, MVT::v64i8, { 2, 2, 4, 4 } }, |
3230 | { ISD::SELECT, MVT::v32i8, { 1, 1, 1, 1 } }, |
3231 | { ISD::SELECT, MVT::v16i8, { 1, 1, 1, 1 } }, |
3232 | }; |
3233 | |
3234 | static const CostKindTblEntry AVX2CostTbl[] = { |
3235 | { ISD::SETCC, MVT::v4f64, { 1, 4, 1, 2 } }, |
3236 | { ISD::SETCC, MVT::v2f64, { 1, 4, 1, 1 } }, |
3237 | { ISD::SETCC, MVT::f64, { 1, 4, 1, 1 } }, |
3238 | { ISD::SETCC, MVT::v8f32, { 1, 4, 1, 2 } }, |
3239 | { ISD::SETCC, MVT::v4f32, { 1, 4, 1, 1 } }, |
3240 | { ISD::SETCC, MVT::f32, { 1, 4, 1, 1 } }, |
3241 | |
3242 | { ISD::SETCC, MVT::v4i64, { 1, 1, 1, 2 } }, |
3243 | { ISD::SETCC, MVT::v8i32, { 1, 1, 1, 2 } }, |
3244 | { ISD::SETCC, MVT::v16i16, { 1, 1, 1, 2 } }, |
3245 | { ISD::SETCC, MVT::v32i8, { 1, 1, 1, 2 } }, |
3246 | |
3247 | { ISD::SELECT, MVT::v4f64, { 2, 2, 1, 2 } }, // vblendvpd |
3248 | { ISD::SELECT, MVT::v8f32, { 2, 2, 1, 2 } }, // vblendvps |
3249 | { ISD::SELECT, MVT::v4i64, { 2, 2, 1, 2 } }, // pblendvb |
3250 | { ISD::SELECT, MVT::v8i32, { 2, 2, 1, 2 } }, // pblendvb |
3251 | { ISD::SELECT, MVT::v16i16, { 2, 2, 1, 2 } }, // pblendvb |
3252 | { ISD::SELECT, MVT::v32i8, { 2, 2, 1, 2 } }, // pblendvb |
3253 | }; |
3254 | |
3255 | static const CostKindTblEntry XOPCostTbl[] = { |
3256 | { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } }, |
3257 | { ISD::SETCC, MVT::v2i64, { 1, 1, 1, 1 } }, |
3258 | }; |
3259 | |
3260 | static const CostKindTblEntry AVX1CostTbl[] = { |
3261 | { ISD::SETCC, MVT::v4f64, { 2, 3, 1, 2 } }, |
3262 | { ISD::SETCC, MVT::v2f64, { 1, 3, 1, 1 } }, |
3263 | { ISD::SETCC, MVT::f64, { 1, 3, 1, 1 } }, |
3264 | { ISD::SETCC, MVT::v8f32, { 2, 3, 1, 2 } }, |
3265 | { ISD::SETCC, MVT::v4f32, { 1, 3, 1, 1 } }, |
3266 | { ISD::SETCC, MVT::f32, { 1, 3, 1, 1 } }, |
3267 | |
3268 | // AVX1 does not support 8-wide integer compare. |
3269 | { ISD::SETCC, MVT::v4i64, { 4, 2, 5, 6 } }, |
3270 | { ISD::SETCC, MVT::v8i32, { 4, 2, 5, 6 } }, |
3271 | { ISD::SETCC, MVT::v16i16, { 4, 2, 5, 6 } }, |
3272 | { ISD::SETCC, MVT::v32i8, { 4, 2, 5, 6 } }, |
3273 | |
3274 | { ISD::SELECT, MVT::v4f64, { 3, 3, 1, 2 } }, // vblendvpd |
3275 | { ISD::SELECT, MVT::v8f32, { 3, 3, 1, 2 } }, // vblendvps |
3276 | { ISD::SELECT, MVT::v4i64, { 3, 3, 1, 2 } }, // vblendvpd |
3277 | { ISD::SELECT, MVT::v8i32, { 3, 3, 1, 2 } }, // vblendvps |
3278 | { ISD::SELECT, MVT::v16i16, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps |
3279 | { ISD::SELECT, MVT::v32i8, { 3, 3, 3, 3 } }, // vandps + vandnps + vorps |
3280 | }; |
3281 | |
3282 | static const CostKindTblEntry SSE42CostTbl[] = { |
3283 | { ISD::SETCC, MVT::v2i64, { 1, 2, 1, 2 } }, |
3284 | }; |
3285 | |
3286 | static const CostKindTblEntry SSE41CostTbl[] = { |
3287 | { ISD::SETCC, MVT::v2f64, { 1, 5, 1, 1 } }, |
3288 | { ISD::SETCC, MVT::v4f32, { 1, 5, 1, 1 } }, |
3289 | |
3290 | { ISD::SELECT, MVT::v2f64, { 2, 2, 1, 2 } }, // blendvpd |
3291 | { ISD::SELECT, MVT::f64, { 2, 2, 1, 2 } }, // blendvpd |
3292 | { ISD::SELECT, MVT::v4f32, { 2, 2, 1, 2 } }, // blendvps |
3293 | { ISD::SELECT, MVT::f32 , { 2, 2, 1, 2 } }, // blendvps |
3294 | { ISD::SELECT, MVT::v2i64, { 2, 2, 1, 2 } }, // pblendvb |
3295 | { ISD::SELECT, MVT::v4i32, { 2, 2, 1, 2 } }, // pblendvb |
3296 | { ISD::SELECT, MVT::v8i16, { 2, 2, 1, 2 } }, // pblendvb |
3297 | { ISD::SELECT, MVT::v16i8, { 2, 2, 1, 2 } }, // pblendvb |
3298 | }; |
3299 | |
3300 | static const CostKindTblEntry SSE2CostTbl[] = { |
3301 | { ISD::SETCC, MVT::v2f64, { 2, 5, 1, 1 } }, |
3302 | { ISD::SETCC, MVT::f64, { 1, 5, 1, 1 } }, |
3303 | |
3304 | { ISD::SETCC, MVT::v2i64, { 5, 4, 5, 5 } }, // pcmpeqd/pcmpgtd expansion |
3305 | { ISD::SETCC, MVT::v4i32, { 1, 1, 1, 1 } }, |
3306 | { ISD::SETCC, MVT::v8i16, { 1, 1, 1, 1 } }, |
3307 | { ISD::SETCC, MVT::v16i8, { 1, 1, 1, 1 } }, |
3308 | |
3309 | { ISD::SELECT, MVT::v2f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd |
3310 | { ISD::SELECT, MVT::f64, { 2, 2, 3, 3 } }, // andpd + andnpd + orpd |
3311 | { ISD::SELECT, MVT::v2i64, { 2, 2, 3, 3 } }, // pand + pandn + por |
3312 | { ISD::SELECT, MVT::v4i32, { 2, 2, 3, 3 } }, // pand + pandn + por |
3313 | { ISD::SELECT, MVT::v8i16, { 2, 2, 3, 3 } }, // pand + pandn + por |
3314 | { ISD::SELECT, MVT::v16i8, { 2, 2, 3, 3 } }, // pand + pandn + por |
3315 | }; |
3316 | |
3317 | static const CostKindTblEntry SSE1CostTbl[] = { |
3318 | { ISD::SETCC, MVT::v4f32, { 2, 5, 1, 1 } }, |
3319 | { ISD::SETCC, MVT::f32, { 1, 5, 1, 1 } }, |
3320 | |
3321 | { ISD::SELECT, MVT::v4f32, { 2, 2, 3, 3 } }, // andps + andnps + orps |
3322 | { ISD::SELECT, MVT::f32, { 2, 2, 3, 3 } }, // andps + andnps + orps |
3323 | }; |
3324 | |
3325 | if (ST->useSLMArithCosts()) |
3326 | if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy)) |
3327 | if (auto KindCost = Entry->Cost[CostKind]) |
3328 | return LT.first * (ExtraCost + *KindCost); |
3329 | |
3330 | if (ST->hasBWI()) |
3331 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) |
3332 | if (auto KindCost = Entry->Cost[CostKind]) |
3333 | return LT.first * (ExtraCost + *KindCost); |
3334 | |
3335 | if (ST->hasAVX512()) |
3336 | if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy)) |
3337 | if (auto KindCost = Entry->Cost[CostKind]) |
3338 | return LT.first * (ExtraCost + *KindCost); |
3339 | |
3340 | if (ST->hasAVX2()) |
3341 | if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy)) |
3342 | if (auto KindCost = Entry->Cost[CostKind]) |
3343 | return LT.first * (ExtraCost + *KindCost); |
3344 | |
3345 | if (ST->hasXOP()) |
3346 | if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy)) |
3347 | if (auto KindCost = Entry->Cost[CostKind]) |
3348 | return LT.first * (ExtraCost + *KindCost); |
3349 | |
3350 | if (ST->hasAVX()) |
3351 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) |
3352 | if (auto KindCost = Entry->Cost[CostKind]) |
3353 | return LT.first * (ExtraCost + *KindCost); |
3354 | |
3355 | if (ST->hasSSE42()) |
3356 | if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy)) |
3357 | if (auto KindCost = Entry->Cost[CostKind]) |
3358 | return LT.first * (ExtraCost + *KindCost); |
3359 | |
3360 | if (ST->hasSSE41()) |
3361 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) |
3362 | if (auto KindCost = Entry->Cost[CostKind]) |
3363 | return LT.first * (ExtraCost + *KindCost); |
3364 | |
3365 | if (ST->hasSSE2()) |
3366 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) |
3367 | if (auto KindCost = Entry->Cost[CostKind]) |
3368 | return LT.first * (ExtraCost + *KindCost); |
3369 | |
3370 | if (ST->hasSSE1()) |
3371 | if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy)) |
3372 | if (auto KindCost = Entry->Cost[CostKind]) |
3373 | return LT.first * (ExtraCost + *KindCost); |
3374 | |
3375 | // Assume a 3cy latency for fp select ops. |
3376 | if (CostKind == TTI::TCK_Latency && Opcode == Instruction::Select) |
3377 | if (ValTy->getScalarType()->isFloatingPointTy()) |
3378 | return 3; |
3379 | |
3380 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); |
3381 | } |
3382 | |
3383 | unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; } |
3384 | |
3385 | InstructionCost |
3386 | X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
3387 | TTI::TargetCostKind CostKind) { |
3388 | // Costs should match the codegen from: |
3389 | // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll |
3390 | // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll |
3391 | // CTLZ: llvm\test\CodeGen\X86\vector-lzcnt-*.ll |
3392 | // CTPOP: llvm\test\CodeGen\X86\vector-popcnt-*.ll |
3393 | // CTTZ: llvm\test\CodeGen\X86\vector-tzcnt-*.ll |
3394 | |
3395 | // TODO: Overflow intrinsics (*ADDO, *SUBO, *MULO) with vector types are not |
3396 | // specialized in these tables yet. |
3397 | static const CostKindTblEntry AVX512VBMI2CostTbl[] = { |
3398 | { ISD::FSHL, MVT::v8i64, { 1, 1, 1, 1 } }, |
3399 | { ISD::FSHL, MVT::v4i64, { 1, 1, 1, 1 } }, |
3400 | { ISD::FSHL, MVT::v2i64, { 1, 1, 1, 1 } }, |
3401 | { ISD::FSHL, MVT::v16i32, { 1, 1, 1, 1 } }, |
3402 | { ISD::FSHL, MVT::v8i32, { 1, 1, 1, 1 } }, |
3403 | { ISD::FSHL, MVT::v4i32, { 1, 1, 1, 1 } }, |
3404 | { ISD::FSHL, MVT::v32i16, { 1, 1, 1, 1 } }, |
3405 | { ISD::FSHL, MVT::v16i16, { 1, 1, 1, 1 } }, |
3406 | { ISD::FSHL, MVT::v8i16, { 1, 1, 1, 1 } }, |
3407 | { ISD::ROTL, MVT::v32i16, { 1, 1, 1, 1 } }, |
3408 | { ISD::ROTL, MVT::v16i16, { 1, 1, 1, 1 } }, |
3409 | { ISD::ROTL, MVT::v8i16, { 1, 1, 1, 1 } }, |
3410 | { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } }, |
3411 | { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } }, |
3412 | { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } }, |
3413 | { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } }, |
3414 | { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } }, |
3415 | { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } }, |
3416 | }; |
3417 | static const CostKindTblEntry AVX512BITALGCostTbl[] = { |
3418 | { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } }, |
3419 | { ISD::CTPOP, MVT::v64i8, { 1, 1, 1, 1 } }, |
3420 | { ISD::CTPOP, MVT::v16i16, { 1, 1, 1, 1 } }, |
3421 | { ISD::CTPOP, MVT::v32i8, { 1, 1, 1, 1 } }, |
3422 | { ISD::CTPOP, MVT::v8i16, { 1, 1, 1, 1 } }, |
3423 | { ISD::CTPOP, MVT::v16i8, { 1, 1, 1, 1 } }, |
3424 | }; |
3425 | static const CostKindTblEntry AVX512VPOPCNTDQCostTbl[] = { |
3426 | { ISD::CTPOP, MVT::v8i64, { 1, 1, 1, 1 } }, |
3427 | { ISD::CTPOP, MVT::v16i32, { 1, 1, 1, 1 } }, |
3428 | { ISD::CTPOP, MVT::v4i64, { 1, 1, 1, 1 } }, |
3429 | { ISD::CTPOP, MVT::v8i32, { 1, 1, 1, 1 } }, |
3430 | { ISD::CTPOP, MVT::v2i64, { 1, 1, 1, 1 } }, |
3431 | { ISD::CTPOP, MVT::v4i32, { 1, 1, 1, 1 } }, |
3432 | }; |
3433 | static const CostKindTblEntry AVX512CDCostTbl[] = { |
3434 | { ISD::CTLZ, MVT::v8i64, { 1, 5, 1, 1 } }, |
3435 | { ISD::CTLZ, MVT::v16i32, { 1, 5, 1, 1 } }, |
3436 | { ISD::CTLZ, MVT::v32i16, { 18, 27, 23, 27 } }, |
3437 | { ISD::CTLZ, MVT::v64i8, { 3, 16, 9, 11 } }, |
3438 | { ISD::CTLZ, MVT::v4i64, { 1, 5, 1, 1 } }, |
3439 | { ISD::CTLZ, MVT::v8i32, { 1, 5, 1, 1 } }, |
3440 | { ISD::CTLZ, MVT::v16i16, { 8, 19, 11, 13 } }, |
3441 | { ISD::CTLZ, MVT::v32i8, { 2, 11, 9, 10 } }, |
3442 | { ISD::CTLZ, MVT::v2i64, { 1, 5, 1, 1 } }, |
3443 | { ISD::CTLZ, MVT::v4i32, { 1, 5, 1, 1 } }, |
3444 | { ISD::CTLZ, MVT::v8i16, { 3, 15, 4, 6 } }, |
3445 | { ISD::CTLZ, MVT::v16i8, { 2, 10, 9, 10 } }, |
3446 | |
3447 | { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } }, |
3448 | { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } }, |
3449 | { ISD::CTTZ, MVT::v4i64, { 1, 8, 6, 6 } }, |
3450 | { ISD::CTTZ, MVT::v8i32, { 1, 8, 6, 6 } }, |
3451 | { ISD::CTTZ, MVT::v2i64, { 1, 8, 6, 6 } }, |
3452 | { ISD::CTTZ, MVT::v4i32, { 1, 8, 6, 6 } }, |
3453 | }; |
3454 | static const CostKindTblEntry AVX512BWCostTbl[] = { |
3455 | { ISD::ABS, MVT::v32i16, { 1, 1, 1, 1 } }, |
3456 | { ISD::ABS, MVT::v64i8, { 1, 1, 1, 1 } }, |
3457 | { ISD::BITREVERSE, MVT::v2i64, { 3, 10, 10, 11 } }, |
3458 | { ISD::BITREVERSE, MVT::v4i64, { 3, 11, 10, 11 } }, |
3459 | { ISD::BITREVERSE, MVT::v8i64, { 3, 12, 10, 14 } }, |
3460 | { ISD::BITREVERSE, MVT::v4i32, { 3, 10, 10, 11 } }, |
3461 | { ISD::BITREVERSE, MVT::v8i32, { 3, 11, 10, 11 } }, |
3462 | { ISD::BITREVERSE, MVT::v16i32, { 3, 12, 10, 14 } }, |
3463 | { ISD::BITREVERSE, MVT::v8i16, { 3, 10, 10, 11 } }, |
3464 | { ISD::BITREVERSE, MVT::v16i16, { 3, 11, 10, 11 } }, |
3465 | { ISD::BITREVERSE, MVT::v32i16, { 3, 12, 10, 14 } }, |
3466 | { ISD::BITREVERSE, MVT::v16i8, { 2, 5, 9, 9 } }, |
3467 | { ISD::BITREVERSE, MVT::v32i8, { 2, 5, 9, 9 } }, |
3468 | { ISD::BITREVERSE, MVT::v64i8, { 2, 5, 9, 12 } }, |
3469 | { ISD::BSWAP, MVT::v2i64, { 1, 1, 1, 2 } }, |
3470 | { ISD::BSWAP, MVT::v4i64, { 1, 1, 1, 2 } }, |
3471 | { ISD::BSWAP, MVT::v8i64, { 1, 1, 1, 2 } }, |
3472 | { ISD::BSWAP, MVT::v4i32, { 1, 1, 1, 2 } }, |
3473 | { ISD::BSWAP, MVT::v8i32, { 1, 1, 1, 2 } }, |
3474 | { ISD::BSWAP, MVT::v16i32, { 1, 1, 1, 2 } }, |
3475 | { ISD::BSWAP, MVT::v8i16, { 1, 1, 1, 2 } }, |
3476 | { ISD::BSWAP, MVT::v16i16, { 1, 1, 1, 2 } }, |
3477 | { ISD::BSWAP, MVT::v32i16, { 1, 1, 1, 2 } }, |
3478 | { ISD::CTLZ, MVT::v8i64, { 8, 22, 23, 23 } }, |
3479 | { ISD::CTLZ, MVT::v16i32, { 8, 23, 25, 25 } }, |
3480 | { ISD::CTLZ, MVT::v32i16, { 4, 15, 15, 16 } }, |
3481 | { ISD::CTLZ, MVT::v64i8, { 3, 12, 10, 9 } }, |
3482 | { ISD::CTPOP, MVT::v2i64, { 3, 7, 10, 10 } }, |
3483 | { ISD::CTPOP, MVT::v4i64, { 3, 7, 10, 10 } }, |
3484 | { ISD::CTPOP, MVT::v8i64, { 3, 8, 10, 12 } }, |
3485 | { ISD::CTPOP, MVT::v4i32, { 7, 11, 14, 14 } }, |
3486 | { ISD::CTPOP, MVT::v8i32, { 7, 11, 14, 14 } }, |
3487 | { ISD::CTPOP, MVT::v16i32, { 7, 12, 14, 16 } }, |
3488 | { ISD::CTPOP, MVT::v8i16, { 2, 7, 11, 11 } }, |
3489 | { ISD::CTPOP, MVT::v16i16, { 2, 7, 11, 11 } }, |
3490 | { ISD::CTPOP, MVT::v32i16, { 3, 7, 11, 13 } }, |
3491 | { ISD::CTPOP, MVT::v16i8, { 2, 4, 8, 8 } }, |
3492 | { ISD::CTPOP, MVT::v32i8, { 2, 4, 8, 8 } }, |
3493 | { ISD::CTPOP, MVT::v64i8, { 2, 5, 8, 10 } }, |
3494 | { ISD::CTTZ, MVT::v8i16, { 3, 9, 14, 14 } }, |
3495 | { ISD::CTTZ, MVT::v16i16, { 3, 9, 14, 14 } }, |
3496 | { ISD::CTTZ, MVT::v32i16, { 3, 10, 14, 16 } }, |
3497 | { ISD::CTTZ, MVT::v16i8, { 2, 6, 11, 11 } }, |
3498 | { ISD::CTTZ, MVT::v32i8, { 2, 6, 11, 11 } }, |
3499 | { ISD::CTTZ, MVT::v64i8, { 3, 7, 11, 13 } }, |
3500 | { ISD::ROTL, MVT::v32i16, { 2, 8, 6, 8 } }, |
3501 | { ISD::ROTL, MVT::v16i16, { 2, 8, 6, 7 } }, |
3502 | { ISD::ROTL, MVT::v8i16, { 2, 7, 6, 7 } }, |
3503 | { ISD::ROTL, MVT::v64i8, { 5, 6, 11, 12 } }, |
3504 | { ISD::ROTL, MVT::v32i8, { 5, 15, 7, 10 } }, |
3505 | { ISD::ROTL, MVT::v16i8, { 5, 15, 7, 10 } }, |
3506 | { ISD::ROTR, MVT::v32i16, { 2, 8, 6, 8 } }, |
3507 | { ISD::ROTR, MVT::v16i16, { 2, 8, 6, 7 } }, |
3508 | { ISD::ROTR, MVT::v8i16, { 2, 7, 6, 7 } }, |
3509 | { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } }, |
3510 | { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } }, |
3511 | { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } }, |
3512 | { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } }, |
3513 | { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } }, |
3514 | { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } }, |
3515 | { X86ISD::VROTLI, MVT::v64i8, { 2, 9, 3, 4 } }, |
3516 | { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } }, |
3517 | { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } }, |
3518 | { ISD::SADDSAT, MVT::v32i16, { 1 } }, |
3519 | { ISD::SADDSAT, MVT::v64i8, { 1 } }, |
3520 | { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } }, |
3521 | { ISD::SMAX, MVT::v64i8, { 1, 1, 1, 1 } }, |
3522 | { ISD::SMIN, MVT::v32i16, { 1, 1, 1, 1 } }, |
3523 | { ISD::SMIN, MVT::v64i8, { 1, 1, 1, 1 } }, |
3524 | { ISD::SSUBSAT, MVT::v32i16, { 1 } }, |
3525 | { ISD::SSUBSAT, MVT::v64i8, { 1 } }, |
3526 | { ISD::UADDSAT, MVT::v32i16, { 1 } }, |
3527 | { ISD::UADDSAT, MVT::v64i8, { 1 } }, |
3528 | { ISD::UMAX, MVT::v32i16, { 1, 1, 1, 1 } }, |
3529 | { ISD::UMAX, MVT::v64i8, { 1, 1, 1, 1 } }, |
3530 | { ISD::UMIN, MVT::v32i16, { 1, 1, 1, 1 } }, |
3531 | { ISD::UMIN, MVT::v64i8, { 1, 1, 1, 1 } }, |
3532 | { ISD::USUBSAT, MVT::v32i16, { 1 } }, |
3533 | { ISD::USUBSAT, MVT::v64i8, { 1 } }, |
3534 | }; |
3535 | static const CostKindTblEntry AVX512CostTbl[] = { |
3536 | { ISD::ABS, MVT::v8i64, { 1, 1, 1, 1 } }, |
3537 | { ISD::ABS, MVT::v4i64, { 1, 1, 1, 1 } }, |
3538 | { ISD::ABS, MVT::v2i64, { 1, 1, 1, 1 } }, |
3539 | { ISD::ABS, MVT::v16i32, { 1, 1, 1, 1 } }, |
3540 | { ISD::ABS, MVT::v8i32, { 1, 1, 1, 1 } }, |
3541 | { ISD::ABS, MVT::v32i16, { 2, 7, 4, 4 } }, |
3542 | { ISD::ABS, MVT::v16i16, { 1, 1, 1, 1 } }, |
3543 | { ISD::ABS, MVT::v64i8, { 2, 7, 4, 4 } }, |
3544 | { ISD::ABS, MVT::v32i8, { 1, 1, 1, 1 } }, |
3545 | { ISD::BITREVERSE, MVT::v8i64, { 9, 13, 20, 20 } }, |
3546 | { ISD::BITREVERSE, MVT::v16i32, { 9, 13, 20, 20 } }, |
3547 | { ISD::BITREVERSE, MVT::v32i16, { 9, 13, 20, 20 } }, |
3548 | { ISD::BITREVERSE, MVT::v64i8, { 6, 11, 17, 17 } }, |
3549 | { ISD::BSWAP, MVT::v8i64, { 4, 7, 5, 5 } }, |
3550 | { ISD::BSWAP, MVT::v16i32, { 4, 7, 5, 5 } }, |
3551 | { ISD::BSWAP, MVT::v32i16, { 4, 7, 5, 5 } }, |
3552 | { ISD::CTLZ, MVT::v8i64, { 10, 28, 32, 32 } }, |
3553 | { ISD::CTLZ, MVT::v16i32, { 12, 30, 38, 38 } }, |
3554 | { ISD::CTLZ, MVT::v32i16, { 8, 15, 29, 29 } }, |
3555 | { ISD::CTLZ, MVT::v64i8, { 6, 11, 19, 19 } }, |
3556 | { ISD::CTPOP, MVT::v8i64, { 16, 16, 19, 19 } }, |
3557 | { ISD::CTPOP, MVT::v16i32, { 24, 19, 27, 27 } }, |
3558 | { ISD::CTPOP, MVT::v32i16, { 18, 15, 22, 22 } }, |
3559 | { ISD::CTPOP, MVT::v64i8, { 12, 11, 16, 16 } }, |
3560 | { ISD::CTTZ, MVT::v8i64, { 2, 8, 6, 7 } }, |
3561 | { ISD::CTTZ, MVT::v16i32, { 2, 8, 6, 7 } }, |
3562 | { ISD::CTTZ, MVT::v32i16, { 7, 17, 27, 27 } }, |
3563 | { ISD::CTTZ, MVT::v64i8, { 6, 13, 21, 21 } }, |
3564 | { ISD::ROTL, MVT::v8i64, { 1, 1, 1, 1 } }, |
3565 | { ISD::ROTL, MVT::v4i64, { 1, 1, 1, 1 } }, |
3566 | { ISD::ROTL, MVT::v2i64, { 1, 1, 1, 1 } }, |
3567 | { ISD::ROTL, MVT::v16i32, { 1, 1, 1, 1 } }, |
3568 | { ISD::ROTL, MVT::v8i32, { 1, 1, 1, 1 } }, |
3569 | { ISD::ROTL, MVT::v4i32, { 1, 1, 1, 1 } }, |
3570 | { ISD::ROTR, MVT::v8i64, { 1, 1, 1, 1 } }, |
3571 | { ISD::ROTR, MVT::v4i64, { 1, 1, 1, 1 } }, |
3572 | { ISD::ROTR, MVT::v2i64, { 1, 1, 1, 1 } }, |
3573 | { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } }, |
3574 | { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } }, |
3575 | { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } }, |
3576 | { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } }, |
3577 | { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } }, |
3578 | { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } }, |
3579 | { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } }, |
3580 | { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } }, |
3581 | { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } }, |
3582 | { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } }, |
3583 | { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } }, |
3584 | { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } }, |
3585 | { ISD::SMAX, MVT::v64i8, { 3, 7, 5, 5 } }, |
3586 | { ISD::SMAX, MVT::v4i64, { 1, 3, 1, 1 } }, |
3587 | { ISD::SMAX, MVT::v2i64, { 1, 3, 1, 1 } }, |
3588 | { ISD::SMIN, MVT::v8i64, { 1, 3, 1, 1 } }, |
3589 | { ISD::SMIN, MVT::v16i32, { 1, 1, 1, 1 } }, |
3590 | { ISD::SMIN, MVT::v32i16, { 3, 7, 5, 5 } }, |
3591 | { ISD::SMIN, MVT::v64i8, { 3, 7, 5, 5 } }, |
3592 | { ISD::SMIN, MVT::v4i64, { 1, 3, 1, 1 } }, |
3593 | { ISD::SMIN, MVT::v2i64, { 1, 3, 1, 1 } }, |
3594 | { ISD::UMAX, MVT::v8i64, { 1, 3, 1, 1 } }, |
3595 | { ISD::UMAX, MVT::v16i32, { 1, 1, 1, 1 } }, |
3596 | { ISD::UMAX, MVT::v32i16, { 3, 7, 5, 5 } }, |
3597 | { ISD::UMAX, MVT::v64i8, { 3, 7, 5, 5 } }, |
3598 | { ISD::UMAX, MVT::v4i64, { 1, 3, 1, 1 } }, |
3599 | { ISD::UMAX, MVT::v2i64, { 1, 3, 1, 1 } }, |
3600 | { ISD::UMIN, MVT::v8i64, { 1, 3, 1, 1 } }, |
3601 | { ISD::UMIN, MVT::v16i32, { 1, 1, 1, 1 } }, |
3602 | { ISD::UMIN, MVT::v32i16, { 3, 7, 5, 5 } }, |
3603 | { ISD::UMIN, MVT::v64i8, { 3, 7, 5, 5 } }, |
3604 | { ISD::UMIN, MVT::v4i64, { 1, 3, 1, 1 } }, |
3605 | { ISD::UMIN, MVT::v2i64, { 1, 3, 1, 1 } }, |
3606 | { ISD::USUBSAT, MVT::v16i32, { 2 } }, // pmaxud + psubd |
3607 | { ISD::USUBSAT, MVT::v2i64, { 2 } }, // pmaxuq + psubq |
3608 | { ISD::USUBSAT, MVT::v4i64, { 2 } }, // pmaxuq + psubq |
3609 | { ISD::USUBSAT, MVT::v8i64, { 2 } }, // pmaxuq + psubq |
3610 | { ISD::UADDSAT, MVT::v16i32, { 3 } }, // not + pminud + paddd |
3611 | { ISD::UADDSAT, MVT::v2i64, { 3 } }, // not + pminuq + paddq |
3612 | { ISD::UADDSAT, MVT::v4i64, { 3 } }, // not + pminuq + paddq |
3613 | { ISD::UADDSAT, MVT::v8i64, { 3 } }, // not + pminuq + paddq |
3614 | { ISD::SADDSAT, MVT::v32i16, { 2 } }, |
3615 | { ISD::SADDSAT, MVT::v64i8, { 2 } }, |
3616 | { ISD::SSUBSAT, MVT::v32i16, { 2 } }, |
3617 | { ISD::SSUBSAT, MVT::v64i8, { 2 } }, |
3618 | { ISD::UADDSAT, MVT::v32i16, { 2 } }, |
3619 | { ISD::UADDSAT, MVT::v64i8, { 2 } }, |
3620 | { ISD::USUBSAT, MVT::v32i16, { 2 } }, |
3621 | { ISD::USUBSAT, MVT::v64i8, { 2 } }, |
3622 | { ISD::FMAXNUM, MVT::f32, { 2, 2, 3, 3 } }, |
3623 | { ISD::FMAXNUM, MVT::v4f32, { 1, 1, 3, 3 } }, |
3624 | { ISD::FMAXNUM, MVT::v8f32, { 2, 2, 3, 3 } }, |
3625 | { ISD::FMAXNUM, MVT::v16f32, { 4, 4, 3, 3 } }, |
3626 | { ISD::FMAXNUM, MVT::f64, { 2, 2, 3, 3 } }, |
3627 | { ISD::FMAXNUM, MVT::v2f64, { 1, 1, 3, 3 } }, |
3628 | { ISD::FMAXNUM, MVT::v4f64, { 2, 2, 3, 3 } }, |
3629 | { ISD::FMAXNUM, MVT::v8f64, { 3, 3, 3, 3 } }, |
3630 | { ISD::FSQRT, MVT::f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/ |
3631 | { ISD::FSQRT, MVT::v4f32, { 3, 12, 1, 1 } }, // Skylake from http://www.agner.org/ |
3632 | { ISD::FSQRT, MVT::v8f32, { 6, 12, 1, 1 } }, // Skylake from http://www.agner.org/ |
3633 | { ISD::FSQRT, MVT::v16f32, { 12, 20, 1, 3 } }, // Skylake from http://www.agner.org/ |
3634 | { ISD::FSQRT, MVT::f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/ |
3635 | { ISD::FSQRT, MVT::v2f64, { 6, 18, 1, 1 } }, // Skylake from http://www.agner.org/ |
3636 | { ISD::FSQRT, MVT::v4f64, { 12, 18, 1, 1 } }, // Skylake from http://www.agner.org/ |
3637 | { ISD::FSQRT, MVT::v8f64, { 24, 32, 1, 3 } }, // Skylake from http://www.agner.org/ |
3638 | }; |
3639 | static const CostKindTblEntry XOPCostTbl[] = { |
3640 | { ISD::BITREVERSE, MVT::v4i64, { 3, 6, 5, 6 } }, |
3641 | { ISD::BITREVERSE, MVT::v8i32, { 3, 6, 5, 6 } }, |
3642 | { ISD::BITREVERSE, MVT::v16i16, { 3, 6, 5, 6 } }, |
3643 | { ISD::BITREVERSE, MVT::v32i8, { 3, 6, 5, 6 } }, |
3644 | { ISD::BITREVERSE, MVT::v2i64, { 2, 7, 1, 1 } }, |
3645 | { ISD::BITREVERSE, MVT::v4i32, { 2, 7, 1, 1 } }, |
3646 | { ISD::BITREVERSE, MVT::v8i16, { 2, 7, 1, 1 } }, |
3647 | { ISD::BITREVERSE, MVT::v16i8, { 2, 7, 1, 1 } }, |
3648 | { ISD::BITREVERSE, MVT::i64, { 2, 2, 3, 4 } }, |
3649 | { ISD::BITREVERSE, MVT::i32, { 2, 2, 3, 4 } }, |
3650 | { ISD::BITREVERSE, MVT::i16, { 2, 2, 3, 4 } }, |
3651 | { ISD::BITREVERSE, MVT::i8, { 2, 2, 3, 4 } }, |
3652 | // XOP: ROTL = VPROT(X,Y), ROTR = VPROT(X,SUB(0,Y)) |
3653 | { ISD::ROTL, MVT::v4i64, { 4, 7, 5, 6 } }, |
3654 | { ISD::ROTL, MVT::v8i32, { 4, 7, 5, 6 } }, |
3655 | { ISD::ROTL, MVT::v16i16, { 4, 7, 5, 6 } }, |
3656 | { ISD::ROTL, MVT::v32i8, { 4, 7, 5, 6 } }, |
3657 | { ISD::ROTL, MVT::v2i64, { 1, 3, 1, 1 } }, |
3658 | { ISD::ROTL, MVT::v4i32, { 1, 3, 1, 1 } }, |
3659 | { ISD::ROTL, MVT::v8i16, { 1, 3, 1, 1 } }, |
3660 | { ISD::ROTL, MVT::v16i8, { 1, 3, 1, 1 } }, |
3661 | { ISD::ROTR, MVT::v4i64, { 4, 7, 8, 9 } }, |
3662 | { ISD::ROTR, MVT::v8i32, { 4, 7, 8, 9 } }, |
3663 | { ISD::ROTR, MVT::v16i16, { 4, 7, 8, 9 } }, |
3664 | { ISD::ROTR, MVT::v32i8, { 4, 7, 8, 9 } }, |
3665 | { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } }, |
3666 | { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } }, |
3667 | { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } }, |
3668 | { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } }, |
3669 | { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } }, |
3670 | { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } }, |
3671 | { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } }, |
3672 | { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } }, |
3673 | { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } }, |
3674 | { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } }, |
3675 | { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } }, |
3676 | { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } }, |
3677 | }; |
3678 | static const CostKindTblEntry AVX2CostTbl[] = { |
3679 | { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) |
3680 | { ISD::ABS, MVT::v4i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) |
3681 | { ISD::ABS, MVT::v4i32, { 1, 1, 1, 1 } }, |
3682 | { ISD::ABS, MVT::v8i32, { 1, 1, 1, 2 } }, |
3683 | { ISD::ABS, MVT::v8i16, { 1, 1, 1, 1 } }, |
3684 | { ISD::ABS, MVT::v16i16, { 1, 1, 1, 2 } }, |
3685 | { ISD::ABS, MVT::v16i8, { 1, 1, 1, 1 } }, |
3686 | { ISD::ABS, MVT::v32i8, { 1, 1, 1, 2 } }, |
3687 | { ISD::BITREVERSE, MVT::v2i64, { 3, 11, 10, 11 } }, |
3688 | { ISD::BITREVERSE, MVT::v4i64, { 5, 11, 10, 17 } }, |
3689 | { ISD::BITREVERSE, MVT::v4i32, { 3, 11, 10, 11 } }, |
3690 | { ISD::BITREVERSE, MVT::v8i32, { 5, 11, 10, 17 } }, |
3691 | { ISD::BITREVERSE, MVT::v8i16, { 3, 11, 10, 11 } }, |
3692 | { ISD::BITREVERSE, MVT::v16i16, { 5, 11, 10, 17 } }, |
3693 | { ISD::BITREVERSE, MVT::v16i8, { 3, 6, 9, 9 } }, |
3694 | { ISD::BITREVERSE, MVT::v32i8, { 4, 5, 9, 15 } }, |
3695 | { ISD::BSWAP, MVT::v2i64, { 1, 2, 1, 2 } }, |
3696 | { ISD::BSWAP, MVT::v4i64, { 1, 3, 1, 2 } }, |
3697 | { ISD::BSWAP, MVT::v4i32, { 1, 2, 1, 2 } }, |
3698 | { ISD::BSWAP, MVT::v8i32, { 1, 3, 1, 2 } }, |
3699 | { ISD::BSWAP, MVT::v8i16, { 1, 2, 1, 2 } }, |
3700 | { ISD::BSWAP, MVT::v16i16, { 1, 3, 1, 2 } }, |
3701 | { ISD::CTLZ, MVT::v2i64, { 7, 18, 24, 25 } }, |
3702 | { ISD::CTLZ, MVT::v4i64, { 14, 18, 24, 44 } }, |
3703 | { ISD::CTLZ, MVT::v4i32, { 5, 16, 19, 20 } }, |
3704 | { ISD::CTLZ, MVT::v8i32, { 10, 16, 19, 34 } }, |
3705 | { ISD::CTLZ, MVT::v8i16, { 4, 13, 14, 15 } }, |
3706 | { ISD::CTLZ, MVT::v16i16, { 6, 14, 14, 24 } }, |
3707 | { ISD::CTLZ, MVT::v16i8, { 3, 12, 9, 10 } }, |
3708 | { ISD::CTLZ, MVT::v32i8, { 4, 12, 9, 14 } }, |
3709 | { ISD::CTPOP, MVT::v2i64, { 3, 9, 10, 10 } }, |
3710 | { ISD::CTPOP, MVT::v4i64, { 4, 9, 10, 14 } }, |
3711 | { ISD::CTPOP, MVT::v4i32, { 7, 12, 14, 14 } }, |
3712 | { ISD::CTPOP, MVT::v8i32, { 7, 12, 14, 18 } }, |
3713 | { ISD::CTPOP, MVT::v8i16, { 3, 7, 11, 11 } }, |
3714 | { ISD::CTPOP, MVT::v16i16, { 6, 8, 11, 18 } }, |
3715 | { ISD::CTPOP, MVT::v16i8, { 2, 5, 8, 8 } }, |
3716 | { ISD::CTPOP, MVT::v32i8, { 3, 5, 8, 12 } }, |
3717 | { ISD::CTTZ, MVT::v2i64, { 4, 11, 13, 13 } }, |
3718 | { ISD::CTTZ, MVT::v4i64, { 5, 11, 13, 20 } }, |
3719 | { ISD::CTTZ, MVT::v4i32, { 7, 14, 17, 17 } }, |
3720 | { ISD::CTTZ, MVT::v8i32, { 7, 15, 17, 24 } }, |
3721 | { ISD::CTTZ, MVT::v8i16, { 4, 9, 14, 14 } }, |
3722 | { ISD::CTTZ, MVT::v16i16, { 6, 9, 14, 24 } }, |
3723 | { ISD::CTTZ, MVT::v16i8, { 3, 7, 11, 11 } }, |
3724 | { ISD::CTTZ, MVT::v32i8, { 5, 7, 11, 18 } }, |
3725 | { ISD::SADDSAT, MVT::v16i16, { 1 } }, |
3726 | { ISD::SADDSAT, MVT::v32i8, { 1 } }, |
3727 | { ISD::SMAX, MVT::v2i64, { 2, 7, 2, 3 } }, |
3728 | { ISD::SMAX, MVT::v4i64, { 2, 7, 2, 3 } }, |
3729 | { ISD::SMAX, MVT::v8i32, { 1, 1, 1, 2 } }, |
3730 | { ISD::SMAX, MVT::v16i16, { 1, 1, 1, 2 } }, |
3731 | { ISD::SMAX, MVT::v32i8, { 1, 1, 1, 2 } }, |
3732 | { ISD::SMIN, MVT::v2i64, { 2, 7, 2, 3 } }, |
3733 | { ISD::SMIN, MVT::v4i64, { 2, 7, 2, 3 } }, |
3734 | { ISD::SMIN, MVT::v8i32, { 1, 1, 1, 2 } }, |
3735 | { ISD::SMIN, MVT::v16i16, { 1, 1, 1, 2 } }, |
3736 | { ISD::SMIN, MVT::v32i8, { 1, 1, 1, 2 } }, |
3737 | { ISD::SSUBSAT, MVT::v16i16, { 1 } }, |
3738 | { ISD::SSUBSAT, MVT::v32i8, { 1 } }, |
3739 | { ISD::UADDSAT, MVT::v16i16, { 1 } }, |
3740 | { ISD::UADDSAT, MVT::v32i8, { 1 } }, |
3741 | { ISD::UADDSAT, MVT::v8i32, { 3 } }, // not + pminud + paddd |
3742 | { ISD::UMAX, MVT::v2i64, { 2, 8, 5, 6 } }, |
3743 | { ISD::UMAX, MVT::v4i64, { 2, 8, 5, 8 } }, |
3744 | { ISD::UMAX, MVT::v8i32, { 1, 1, 1, 2 } }, |
3745 | { ISD::UMAX, MVT::v16i16, { 1, 1, 1, 2 } }, |
3746 | { ISD::UMAX, MVT::v32i8, { 1, 1, 1, 2 } }, |
3747 | { ISD::UMIN, MVT::v2i64, { 2, 8, 5, 6 } }, |
3748 | { ISD::UMIN, MVT::v4i64, { 2, 8, 5, 8 } }, |
3749 | { ISD::UMIN, MVT::v8i32, { 1, 1, 1, 2 } }, |
3750 | { ISD::UMIN, MVT::v16i16, { 1, 1, 1, 2 } }, |
3751 | { ISD::UMIN, MVT::v32i8, { 1, 1, 1, 2 } }, |
3752 | { ISD::USUBSAT, MVT::v16i16, { 1 } }, |
3753 | { ISD::USUBSAT, MVT::v32i8, { 1 } }, |
3754 | { ISD::USUBSAT, MVT::v8i32, { 2 } }, // pmaxud + psubd |
3755 | { ISD::FMAXNUM, MVT::f32, { 2, 7, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS |
3756 | { ISD::FMAXNUM, MVT::v4f32, { 2, 7, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
3757 | { ISD::FMAXNUM, MVT::v8f32, { 3, 7, 3, 6 } }, // MAXPS + CMPUNORDPS + BLENDVPS |
3758 | { ISD::FMAXNUM, MVT::f64, { 2, 7, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD |
3759 | { ISD::FMAXNUM, MVT::v2f64, { 2, 7, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
3760 | { ISD::FMAXNUM, MVT::v4f64, { 3, 7, 3, 6 } }, // MAXPD + CMPUNORDPD + BLENDVPD |
3761 | { ISD::FSQRT, MVT::f32, { 7, 15, 1, 1 } }, // vsqrtss |
3762 | { ISD::FSQRT, MVT::v4f32, { 7, 15, 1, 1 } }, // vsqrtps |
3763 | { ISD::FSQRT, MVT::v8f32, { 14, 21, 1, 3 } }, // vsqrtps |
3764 | { ISD::FSQRT, MVT::f64, { 14, 21, 1, 1 } }, // vsqrtsd |
3765 | { ISD::FSQRT, MVT::v2f64, { 14, 21, 1, 1 } }, // vsqrtpd |
3766 | { ISD::FSQRT, MVT::v4f64, { 28, 35, 1, 3 } }, // vsqrtpd |
3767 | }; |
  // AVX1 cost overrides. AVX1 has no 256-bit integer ALU ops, so most 256-bit
  // integer entries are modeled as 2 x 128-bit ops plus a lane extract/insert
  // (see per-entry notes). Each entry's cost array is indexed by
  // TTI::TargetCostKind; single-value initializers leave the remaining cost
  // kinds unset, in which case the lookup falls through to later tables.
  static const CostKindTblEntry AVX1CostTbl[] = {
    { ISD::ABS, MVT::v4i64, { 6, 8, 6, 12 } }, // VBLENDVPD(X,VPSUBQ(0,X),X)
    { ISD::ABS, MVT::v8i32, { 3, 6, 4, 5 } },
    { ISD::ABS, MVT::v16i16, { 3, 6, 4, 5 } },
    { ISD::ABS, MVT::v32i8, { 3, 6, 4, 5 } },
    { ISD::BITREVERSE, MVT::v4i64, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v2i64, { 8, 13, 10, 16 } },
    { ISD::BITREVERSE, MVT::v8i32, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v4i32, { 8, 13, 10, 16 } },
    { ISD::BITREVERSE, MVT::v16i16, { 17, 20, 20, 33 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v8i16, { 8, 13, 10, 16 } },
    { ISD::BITREVERSE, MVT::v32i8, { 13, 15, 17, 26 } }, // 2 x 128-bit Op + extract/insert
    { ISD::BITREVERSE, MVT::v16i8, { 7, 7, 9, 13 } },
    { ISD::BSWAP, MVT::v4i64, { 5, 6, 5, 10 } },
    { ISD::BSWAP, MVT::v2i64, { 2, 2, 1, 3 } },
    { ISD::BSWAP, MVT::v8i32, { 5, 6, 5, 10 } },
    { ISD::BSWAP, MVT::v4i32, { 2, 2, 1, 3 } },
    { ISD::BSWAP, MVT::v16i16, { 5, 6, 5, 10 } },
    { ISD::BSWAP, MVT::v8i16, { 2, 2, 1, 3 } },
    { ISD::CTLZ, MVT::v4i64, { 29, 33, 49, 58 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ, MVT::v2i64, { 14, 24, 24, 28 } },
    { ISD::CTLZ, MVT::v8i32, { 24, 28, 39, 48 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ, MVT::v4i32, { 12, 20, 19, 23 } },
    { ISD::CTLZ, MVT::v16i16, { 19, 22, 29, 38 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ, MVT::v8i16, { 9, 16, 14, 18 } },
    { ISD::CTLZ, MVT::v32i8, { 14, 15, 19, 28 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTLZ, MVT::v16i8, { 7, 12, 9, 13 } },
    { ISD::CTPOP, MVT::v4i64, { 14, 18, 19, 28 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP, MVT::v2i64, { 7, 14, 10, 14 } },
    { ISD::CTPOP, MVT::v8i32, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP, MVT::v4i32, { 9, 20, 14, 18 } },
    { ISD::CTPOP, MVT::v16i16, { 16, 21, 22, 31 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP, MVT::v8i16, { 8, 18, 11, 15 } },
    { ISD::CTPOP, MVT::v32i8, { 13, 15, 16, 25 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTPOP, MVT::v16i8, { 6, 12, 8, 12 } },
    { ISD::CTTZ, MVT::v4i64, { 17, 22, 24, 33 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ, MVT::v2i64, { 9, 19, 13, 17 } },
    { ISD::CTTZ, MVT::v8i32, { 21, 27, 32, 41 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ, MVT::v4i32, { 11, 24, 17, 21 } },
    { ISD::CTTZ, MVT::v16i16, { 18, 24, 27, 36 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ, MVT::v8i16, { 9, 21, 14, 18 } },
    { ISD::CTTZ, MVT::v32i8, { 15, 18, 21, 30 } }, // 2 x 128-bit Op + extract/insert
    { ISD::CTTZ, MVT::v16i8, { 8, 16, 11, 15 } },
    { ISD::SADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 4 } },
    { ISD::SMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN, MVT::v4i64, { 6, 9, 6, 12 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
    { ISD::SMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SSUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::SSUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UADDSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UADDSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UADDSAT, MVT::v8i32, { 8 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX, MVT::v2i64, { 4, 8, 5, 7 } },
    { ISD::UMAX, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMAX, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN, MVT::v4i64, { 9, 10, 11, 17 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN, MVT::v2i64, { 4, 8, 5, 7 } },
    { ISD::UMIN, MVT::v8i32, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN, MVT::v16i16, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::UMIN, MVT::v32i8, { 4, 6, 5, 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::USUBSAT, MVT::v16i16, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::USUBSAT, MVT::v32i8, { 4 } }, // 2 x 128-bit Op + extract/insert
    { ISD::USUBSAT, MVT::v8i32, { 6 } }, // 2 x 128-bit Op + extract/insert
    { ISD::FMAXNUM, MVT::f32, { 3, 6, 3, 5 } }, // MAXSS + CMPUNORDSS + BLENDVPS
    { ISD::FMAXNUM, MVT::v4f32, { 3, 6, 3, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
    { ISD::FMAXNUM, MVT::v8f32, { 5, 7, 3, 10 } }, // MAXPS + CMPUNORDPS + BLENDVPS
    { ISD::FMAXNUM, MVT::f64, { 3, 6, 3, 5 } }, // MAXSD + CMPUNORDSD + BLENDVPD
    { ISD::FMAXNUM, MVT::v2f64, { 3, 6, 3, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
    { ISD::FMAXNUM, MVT::v4f64, { 5, 7, 3, 10 } }, // MAXPD + CMPUNORDPD + BLENDVPD
    { ISD::FSQRT, MVT::f32, { 21, 21, 1, 1 } }, // vsqrtss
    { ISD::FSQRT, MVT::v4f32, { 21, 21, 1, 1 } }, // vsqrtps
    { ISD::FSQRT, MVT::v8f32, { 42, 42, 1, 3 } }, // vsqrtps
    { ISD::FSQRT, MVT::f64, { 27, 27, 1, 1 } }, // vsqrtsd
    { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd
    { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd
  };
  // GFNI: BITREVERSE can be done with gf2p8affineqb. The scalar (iN) entries
  // carry higher costs — presumably for the GPR<->XMM transfers around the
  // vector instruction (TODO confirm).
  static const CostKindTblEntry GFNICostTbl[] = {
    { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
    { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb
  };
  // Goldmont-specific sqrt costs; consulted only when
  // Subtarget.useGLMDivSqrtCosts() is set, ahead of the generic tables.
  static const CostKindTblEntry GLMCostTbl[] = {
    { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss
    { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps
    { ISD::FSQRT, MVT::f64, { 34, 35, 1, 1 } }, // sqrtsd
    { ISD::FSQRT, MVT::v2f64, { 67, 71, 1, 5 } }, // sqrtpd
  };
  // Silvermont-specific overrides; consulted only when
  // Subtarget.useSLMArithCosts() is set, ahead of the generic tables.
  static const CostKindTblEntry SLMCostTbl[] = {
    { ISD::BSWAP, MVT::v2i64, { 5, 5, 1, 5 } },
    { ISD::BSWAP, MVT::v4i32, { 5, 5, 1, 5 } },
    { ISD::BSWAP, MVT::v8i16, { 5, 5, 1, 5 } },
    { ISD::FSQRT, MVT::f32, { 20, 20, 1, 1 } }, // sqrtss
    { ISD::FSQRT, MVT::v4f32, { 40, 41, 1, 5 } }, // sqrtps
    { ISD::FSQRT, MVT::f64, { 35, 35, 1, 1 } }, // sqrtsd
    { ISD::FSQRT, MVT::v2f64, { 70, 71, 1, 5 } }, // sqrtpd
  };
  // SSE4.2-level costs (roughly Nehalem-era; see file header note).
  static const CostKindTblEntry SSE42CostTbl[] = {
    { ISD::USUBSAT, MVT::v4i32, { 2 } }, // pmaxud + psubd
    { ISD::UADDSAT, MVT::v4i32, { 3 } }, // not + pminud + paddd
    { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } }, // MAXSS + CMPUNORDSS + BLENDVPS
    { ISD::FMAXNUM, MVT::v4f32, { 4, 4, 4, 5 } }, // MAXPS + CMPUNORDPS + BLENDVPS
    { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } }, // MAXSD + CMPUNORDSD + BLENDVPD
    { ISD::FMAXNUM, MVT::v2f64, { 4, 4, 4, 5 } }, // MAXPD + CMPUNORDPD + BLENDVPD
    { ISD::FSQRT, MVT::f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSQRT, MVT::v4f32, { 18, 18, 1, 1 } }, // Nehalem from http://www.agner.org/
  };
  // SSE4.1 adds the missing packed min/max variants (pmaxsd/pminsd,
  // pmaxuw/pminuw, pmaxsb/pminsb, etc.), so these drop to cost 1;
  // v2i64 still has no native min/max and is emulated.
  static const CostKindTblEntry SSE41CostTbl[] = {
    { ISD::ABS, MVT::v2i64, { 3, 4, 3, 5 } }, // BLENDVPD(X,PSUBQ(0,X),X)
    { ISD::SMAX, MVT::v2i64, { 3, 7, 2, 3 } },
    { ISD::SMAX, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SMAX, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::SMIN, MVT::v2i64, { 3, 7, 2, 3 } },
    { ISD::SMIN, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::SMIN, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::UMAX, MVT::v2i64, { 2, 11, 6, 7 } },
    { ISD::UMAX, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::UMAX, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::UMIN, MVT::v2i64, { 2, 11, 6, 7 } },
    { ISD::UMIN, MVT::v4i32, { 1, 1, 1, 1 } },
    { ISD::UMIN, MVT::v8i16, { 1, 1, 1, 1 } },
  };
  // SSSE3: pabs* makes ABS a single op, and pshufb enables the LUT-based
  // bit-manipulation sequences (cheaper than the SSE2 fallbacks below).
  static const CostKindTblEntry SSSE3CostTbl[] = {
    { ISD::ABS, MVT::v4i32, { 1, 2, 1, 1 } },
    { ISD::ABS, MVT::v8i16, { 1, 2, 1, 1 } },
    { ISD::ABS, MVT::v16i8, { 1, 2, 1, 1 } },
    { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 11, 21 } },
    { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 11, 21 } },
    { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 11, 21 } },
    { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 10, 16 } },
    { ISD::BSWAP, MVT::v2i64, { 2, 3, 1, 5 } },
    { ISD::BSWAP, MVT::v4i32, { 2, 3, 1, 5 } },
    { ISD::BSWAP, MVT::v8i16, { 2, 3, 1, 5 } },
    { ISD::CTLZ, MVT::v2i64, { 18, 28, 28, 35 } },
    { ISD::CTLZ, MVT::v4i32, { 15, 20, 22, 28 } },
    { ISD::CTLZ, MVT::v8i16, { 13, 17, 16, 22 } },
    { ISD::CTLZ, MVT::v16i8, { 11, 15, 10, 16 } },
    { ISD::CTPOP, MVT::v2i64, { 13, 19, 12, 18 } },
    { ISD::CTPOP, MVT::v4i32, { 18, 24, 16, 22 } },
    { ISD::CTPOP, MVT::v8i16, { 13, 18, 14, 20 } },
    { ISD::CTPOP, MVT::v16i8, { 11, 12, 10, 16 } },
    { ISD::CTTZ, MVT::v2i64, { 13, 25, 15, 22 } },
    { ISD::CTTZ, MVT::v4i32, { 18, 26, 19, 25 } },
    { ISD::CTTZ, MVT::v8i16, { 13, 20, 17, 23 } },
    { ISD::CTTZ, MVT::v16i8, { 11, 16, 13, 19 } }
  };
  // SSE2 baseline (worst-case x86-64 vector costs); everything here is
  // emulated with shifts/compares/blends since SSE2 lacks pabs/pshufb and
  // most packed min/max variants.
  static const CostKindTblEntry SSE2CostTbl[] = {
    { ISD::ABS, MVT::v2i64, { 3, 6, 5, 5 } },
    { ISD::ABS, MVT::v4i32, { 1, 4, 4, 4 } },
    { ISD::ABS, MVT::v8i16, { 1, 2, 3, 3 } },
    { ISD::ABS, MVT::v16i8, { 1, 2, 3, 3 } },
    { ISD::BITREVERSE, MVT::v2i64, { 16, 20, 32, 32 } },
    { ISD::BITREVERSE, MVT::v4i32, { 16, 20, 30, 30 } },
    { ISD::BITREVERSE, MVT::v8i16, { 16, 20, 25, 25 } },
    { ISD::BITREVERSE, MVT::v16i8, { 11, 12, 21, 21 } },
    { ISD::BSWAP, MVT::v2i64, { 5, 6, 11, 11 } },
    { ISD::BSWAP, MVT::v4i32, { 5, 5, 9, 9 } },
    { ISD::BSWAP, MVT::v8i16, { 5, 5, 4, 5 } },
    { ISD::CTLZ, MVT::v2i64, { 10, 45, 36, 38 } },
    { ISD::CTLZ, MVT::v4i32, { 10, 45, 38, 40 } },
    { ISD::CTLZ, MVT::v8i16, { 9, 38, 32, 34 } },
    { ISD::CTLZ, MVT::v16i8, { 8, 39, 29, 32 } },
    { ISD::CTPOP, MVT::v2i64, { 12, 26, 16, 18 } },
    { ISD::CTPOP, MVT::v4i32, { 15, 29, 21, 23 } },
    { ISD::CTPOP, MVT::v8i16, { 13, 25, 18, 20 } },
    { ISD::CTPOP, MVT::v16i8, { 10, 21, 14, 16 } },
    { ISD::CTTZ, MVT::v2i64, { 14, 28, 19, 21 } },
    { ISD::CTTZ, MVT::v4i32, { 18, 31, 24, 26 } },
    { ISD::CTTZ, MVT::v8i16, { 16, 27, 21, 23 } },
    { ISD::CTTZ, MVT::v16i8, { 13, 23, 17, 19 } },
    { ISD::SADDSAT, MVT::v8i16, { 1 } },
    { ISD::SADDSAT, MVT::v16i8, { 1 } },
    { ISD::SMAX, MVT::v2i64, { 4, 8, 15, 15 } },
    { ISD::SMAX, MVT::v4i32, { 2, 4, 5, 5 } },
    { ISD::SMAX, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::SMAX, MVT::v16i8, { 2, 4, 5, 5 } },
    { ISD::SMIN, MVT::v2i64, { 4, 8, 15, 15 } },
    { ISD::SMIN, MVT::v4i32, { 2, 4, 5, 5 } },
    { ISD::SMIN, MVT::v8i16, { 1, 1, 1, 1 } },
    { ISD::SMIN, MVT::v16i8, { 2, 4, 5, 5 } },
    { ISD::SSUBSAT, MVT::v8i16, { 1 } },
    { ISD::SSUBSAT, MVT::v16i8, { 1 } },
    { ISD::UADDSAT, MVT::v8i16, { 1 } },
    { ISD::UADDSAT, MVT::v16i8, { 1 } },
    { ISD::UMAX, MVT::v2i64, { 4, 8, 15, 15 } },
    { ISD::UMAX, MVT::v4i32, { 2, 5, 8, 8 } },
    { ISD::UMAX, MVT::v8i16, { 1, 3, 3, 3 } },
    { ISD::UMAX, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::UMIN, MVT::v2i64, { 4, 8, 15, 15 } },
    { ISD::UMIN, MVT::v4i32, { 2, 5, 8, 8 } },
    { ISD::UMIN, MVT::v8i16, { 1, 3, 3, 3 } },
    { ISD::UMIN, MVT::v16i8, { 1, 1, 1, 1 } },
    { ISD::USUBSAT, MVT::v8i16, { 1 } },
    { ISD::USUBSAT, MVT::v16i8, { 1 } },
    { ISD::FMAXNUM, MVT::f64, { 5, 5, 7, 7 } },
    { ISD::FMAXNUM, MVT::v2f64, { 4, 6, 6, 6 } },
    { ISD::FSQRT, MVT::f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
    { ISD::FSQRT, MVT::v2f64, { 32, 32, 1, 1 } }, // Nehalem from http://www.agner.org/
  };
  // SSE1: f32-only costs (Pentium III era, per the references).
  static const CostKindTblEntry SSE1CostTbl[] = {
    { ISD::FMAXNUM, MVT::f32, { 5, 5, 7, 7 } },
    { ISD::FMAXNUM, MVT::v4f32, { 4, 6, 6, 6 } },
    { ISD::FSQRT, MVT::f32, { 28, 30, 1, 2 } }, // Pentium III from http://www.agner.org/
    { ISD::FSQRT, MVT::v4f32, { 56, 56, 1, 2 } }, // Pentium III from http://www.agner.org/
  };
  static const CostKindTblEntry BMI64CostTbl[] = { // 64-bit targets
    { ISD::CTTZ, MVT::i64, { 1 } }, // tzcnt (defined on zero input)
  };
  static const CostKindTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
    { ISD::CTTZ, MVT::i32, { 1 } }, // tzcnt (defined on zero input)
    { ISD::CTTZ, MVT::i16, { 1 } },
    { ISD::CTTZ, MVT::i8, { 1 } },
  };
  static const CostKindTblEntry LZCNT64CostTbl[] = { // 64-bit targets
    { ISD::CTLZ, MVT::i64, { 1 } }, // lzcnt (defined on zero input)
  };
  static const CostKindTblEntry LZCNT32CostTbl[] = { // 32 or 64-bit targets
    { ISD::CTLZ, MVT::i32, { 1 } }, // lzcnt (defined on zero input)
    { ISD::CTLZ, MVT::i16, { 2 } }, // presumably lzcnt + width adjustment — confirm
    { ISD::CTLZ, MVT::i8, { 2 } },
  };
  static const CostKindTblEntry POPCNT64CostTbl[] = { // 64-bit targets
    { ISD::CTPOP, MVT::i64, { 1, 1, 1, 1 } }, // popcnt
  };
  static const CostKindTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
    { ISD::CTPOP, MVT::i32, { 1, 1, 1, 1 } }, // popcnt
    { ISD::CTPOP, MVT::i16, { 1, 1, 2, 2 } }, // popcnt(zext())
    { ISD::CTPOP, MVT::i8, { 1, 1, 2, 2 } }, // popcnt(zext())
  };
  // Generic scalar costs for i64 ops, available only on 64-bit targets.
  static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::ABS, MVT::i64, { 1, 2, 3, 4 } }, // SUB+CMOV
    { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
    { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
    { ISD::CTLZ, MVT::i64, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
    { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSR+XOR
    { ISD::CTTZ, MVT::i64, { 3 } }, // TEST+BSF+CMOV/BRANCH
    { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 1, 1, 1 } }, // BSF
    { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
    { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
    { ISD::ROTR, MVT::i64, { 2, 3, 1, 3 } },
    { X86ISD::VROTLI, MVT::i64, { 1, 1, 1, 1 } },
    { ISD::FSHL, MVT::i64, { 4, 4, 1, 4 } },
    { ISD::SMAX, MVT::i64, { 1, 3, 2, 3 } },
    { ISD::SMIN, MVT::i64, { 1, 3, 2, 3 } },
    { ISD::UMAX, MVT::i64, { 1, 3, 2, 3 } },
    { ISD::UMIN, MVT::i64, { 1, 3, 2, 3 } },
    { ISD::SADDO, MVT::i64, { 1 } },
    { ISD::UADDO, MVT::i64, { 1 } },
    { ISD::UMULO, MVT::i64, { 2 } }, // mulq + seto
  };
  // Generic scalar baseline for all x86 targets; this is the table of last
  // resort, probed after every feature-specific table above.
  static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
    { ISD::ABS, MVT::i32, { 1, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
    { ISD::ABS, MVT::i16, { 2, 2, 3, 4 } }, // SUB+XOR+SRA or SUB+CMOV
    { ISD::ABS, MVT::i8, { 2, 4, 4, 4 } }, // SUB+XOR+SRA
    { ISD::BITREVERSE, MVT::i32, { 9, 12, 17, 19 } },
    { ISD::BITREVERSE, MVT::i16, { 9, 12, 17, 19 } },
    { ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
    { ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
    { ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
    { ISD::CTLZ, MVT::i32, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
    { ISD::CTLZ, MVT::i16, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
    { ISD::CTLZ, MVT::i8, { 4 } }, // BSR+XOR or BSR+XOR+CMOV
    { ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSR+XOR
    { ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 3, 3 } }, // BSR+XOR
    { ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR
    { ISD::CTTZ, MVT::i32, { 3 } }, // TEST+BSF+CMOV/BRANCH
    { ISD::CTTZ, MVT::i16, { 3 } }, // TEST+BSF+CMOV/BRANCH
    { ISD::CTTZ, MVT::i8, { 3 } }, // TEST+BSF+CMOV/BRANCH
    { ISD::CTTZ_ZERO_UNDEF, MVT::i32,{ 1, 1, 1, 1 } }, // BSF
    { ISD::CTTZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 1, 1 } }, // BSF
    { ISD::CTTZ_ZERO_UNDEF, MVT::i8, { 2, 2, 1, 1 } }, // BSF
    { ISD::CTPOP, MVT::i32, { 8, 7, 15, 15 } },
    { ISD::CTPOP, MVT::i16, { 9, 8, 17, 17 } },
    { ISD::CTPOP, MVT::i8, { 7, 6, 6, 6 } },
    { ISD::ROTL, MVT::i32, { 2, 3, 1, 3 } },
    { ISD::ROTL, MVT::i16, { 2, 3, 1, 3 } },
    { ISD::ROTL, MVT::i8, { 2, 3, 1, 3 } },
    { ISD::ROTR, MVT::i32, { 2, 3, 1, 3 } },
    { ISD::ROTR, MVT::i16, { 2, 3, 1, 3 } },
    { ISD::ROTR, MVT::i8, { 2, 3, 1, 3 } },
    { X86ISD::VROTLI, MVT::i32, { 1, 1, 1, 1 } },
    { X86ISD::VROTLI, MVT::i16, { 1, 1, 1, 1 } },
    { X86ISD::VROTLI, MVT::i8, { 1, 1, 1, 1 } },
    { ISD::FSHL, MVT::i32, { 4, 4, 1, 4 } },
    { ISD::FSHL, MVT::i16, { 4, 4, 2, 5 } },
    { ISD::FSHL, MVT::i8, { 4, 4, 2, 5 } },
    { ISD::SMAX, MVT::i32, { 1, 2, 2, 3 } },
    { ISD::SMAX, MVT::i16, { 1, 4, 2, 4 } },
    { ISD::SMAX, MVT::i8, { 1, 4, 2, 4 } },
    { ISD::SMIN, MVT::i32, { 1, 2, 2, 3 } },
    { ISD::SMIN, MVT::i16, { 1, 4, 2, 4 } },
    { ISD::SMIN, MVT::i8, { 1, 4, 2, 4 } },
    { ISD::UMAX, MVT::i32, { 1, 2, 2, 3 } },
    { ISD::UMAX, MVT::i16, { 1, 4, 2, 4 } },
    { ISD::UMAX, MVT::i8, { 1, 4, 2, 4 } },
    { ISD::UMIN, MVT::i32, { 1, 2, 2, 3 } },
    { ISD::UMIN, MVT::i16, { 1, 4, 2, 4 } },
    { ISD::UMIN, MVT::i8, { 1, 4, 2, 4 } },
    { ISD::SADDO, MVT::i32, { 1 } },
    { ISD::SADDO, MVT::i16, { 1 } },
    { ISD::SADDO, MVT::i8, { 1 } },
    { ISD::UADDO, MVT::i32, { 1 } },
    { ISD::UADDO, MVT::i16, { 1 } },
    { ISD::UADDO, MVT::i8, { 1 } },
    { ISD::UMULO, MVT::i32, { 2 } }, // mul + seto
    { ISD::UMULO, MVT::i16, { 2 } },
    { ISD::UMULO, MVT::i8, { 2 } },
  };
4098 | |
  // Map the intrinsic onto the ISD opcode used as the key into the cost
  // tables above. ISD remains ISD::DELETED_NODE for intrinsics with no
  // table-driven cost, in which case the table lookups are skipped.
  Type *RetTy = ICA.getReturnType();
  // OpTy is the type the cost is keyed on; it is normally the return type,
  // but the *_with_overflow intrinsics override it below with the scalar
  // result element of their {result, overflow-bit} aggregate.
  Type *OpTy = RetTy;
  Intrinsic::ID IID = ICA.getID();
  unsigned ISD = ISD::DELETED_NODE;
  switch (IID) {
  default:
    break;
  case Intrinsic::abs:
    ISD = ISD::ABS;
    break;
  case Intrinsic::bitreverse:
    ISD = ISD::BITREVERSE;
    break;
  case Intrinsic::bswap:
    ISD = ISD::BSWAP;
    break;
  case Intrinsic::ctlz:
    ISD = ISD::CTLZ;
    break;
  case Intrinsic::ctpop:
    ISD = ISD::CTPOP;
    break;
  case Intrinsic::cttz:
    ISD = ISD::CTTZ;
    break;
  case Intrinsic::fshl:
    ISD = ISD::FSHL;
    if (!ICA.isTypeBasedOnly()) {
      const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
      // fshl(X, X, Z) is a rotate-left of X by Z.
      if (Args[0] == Args[1]) {
        ISD = ISD::ROTL;
        // Handle uniform constant rotation amounts.
        // TODO: Handle funnel-shift cases.
        const APInt *Amt;
        if (Args[2] &&
            PatternMatch::match(V: Args[2], P: PatternMatch::m_APIntAllowPoison(Res&: Amt))
          ISD = X86ISD::VROTLI;
      }
    }
    break;
  case Intrinsic::fshr:
    // FSHR has same costs so don't duplicate.
    ISD = ISD::FSHL;
    if (!ICA.isTypeBasedOnly()) {
      const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
      // fshr(X, X, Z) is a rotate-right of X by Z.
      if (Args[0] == Args[1]) {
        ISD = ISD::ROTR;
        // Handle uniform constant rotation amount.
        // TODO: Handle funnel-shift cases.
        const APInt *Amt;
        if (Args[2] &&
            PatternMatch::match(V: Args[2], P: PatternMatch::m_APIntAllowPoison(Res&: Amt))
          ISD = X86ISD::VROTLI;
      }
    }
    break;
  case Intrinsic::maxnum:
  case Intrinsic::minnum:
    // FMINNUM has same costs so don't duplicate.
    ISD = ISD::FMAXNUM;
    break;
  case Intrinsic::sadd_sat:
    ISD = ISD::SADDSAT;
    break;
  case Intrinsic::smax:
    ISD = ISD::SMAX;
    break;
  case Intrinsic::smin:
    ISD = ISD::SMIN;
    break;
  case Intrinsic::ssub_sat:
    ISD = ISD::SSUBSAT;
    break;
  case Intrinsic::uadd_sat:
    ISD = ISD::UADDSAT;
    break;
  case Intrinsic::umax:
    ISD = ISD::UMAX;
    break;
  case Intrinsic::umin:
    ISD = ISD::UMIN;
    break;
  case Intrinsic::usub_sat:
    ISD = ISD::USUBSAT;
    break;
  case Intrinsic::sqrt:
    ISD = ISD::FSQRT;
    break;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
    // SSUBO has same costs so don't duplicate.
    ISD = ISD::SADDO;
    OpTy = RetTy->getContainedType(i: 0);
    break;
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::usub_with_overflow:
    // USUBO has same costs so don't duplicate.
    ISD = ISD::UADDO;
    OpTy = RetTy->getContainedType(i: 0);
    break;
  case Intrinsic::umul_with_overflow:
  case Intrinsic::smul_with_overflow:
    // SMULO has same costs so don't duplicate.
    ISD = ISD::UMULO;
    OpTy = RetTy->getContainedType(i: 0);
    break;
  }
4206 | |
  // With an ISD opcode selected, probe the cost tables from the most
  // specialized ISA feature down to the generic x86 baseline. The first
  // table entry that defines a cost for the requested CostKind wins
  // (Entry->Cost[CostKind] is optional-like; unset kinds fall through).
  if (ISD != ISD::DELETED_NODE) {
    // Legalize the type.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: OpTy);
    MVT MTy = LT.second;

    // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost.
    // cttz/ctlz take a second i1 argument; all-ones means "zero input is
    // poison", which lowers to a bare BSF/BSR and is costed separately.
    if (((ISD == ISD::CTTZ && !ST->hasBMI()) ||
         (ISD == ISD::CTLZ && !ST->hasLZCNT())) &&
        !MTy.isVector() && !ICA.isTypeBasedOnly()) {
      const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
      if (auto *Cst = dyn_cast<ConstantInt>(Val: Args[1]))
        if (Cst->isAllOnesValue())
          ISD = ISD == ISD::CTTZ ? ISD::CTTZ_ZERO_UNDEF : ISD::CTLZ_ZERO_UNDEF;
    }

    // FSQRT is a single instruction.
    if (ISD == ISD::FSQRT && CostKind == TTI::TCK_CodeSize)
      return LT.first;

    // Scale the table cost by the number of legalized parts (LT.first).
    auto adjustTableCost = [](int ISD, unsigned Cost,
                              InstructionCost LegalizationCost,
                              FastMathFlags FMF) {
      // If there are no NANs to deal with, then these are reduced to a
      // single MIN** or MAX** instruction instead of the MIN/CMP/SELECT that we
      // assume is used in the non-fast case.
      if (ISD == ISD::FMAXNUM || ISD == ISD::FMINNUM) {
        if (FMF.noNaNs())
          return LegalizationCost * 1;
      }
      return LegalizationCost * (int)Cost;
    };

    // CPU-model-specific overrides come first.
    if (ST->useGLMDivSqrtCosts())
      if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->useSLMArithCosts())
      if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    // Then feature tables, newest/most specific ISA extension first.
    if (ST->hasVBMI2())
      if (const auto *Entry = CostTableLookup(AVX512VBMI2CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->hasBITALG())
      if (const auto *Entry = CostTableLookup(AVX512BITALGCostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->hasVPOPCNTDQ())
      if (const auto *Entry = CostTableLookup(AVX512VPOPCNTDQCostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->hasGFNI())
      if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->hasCDI())
      if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->hasBWI())
      if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->hasAVX512())
      if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->hasXOP())
      if (const auto *Entry = CostTableLookup(XOPCostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->hasSSE42())
      if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->hasSSE41())
      if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->hasSSSE3())
      if (const auto *Entry = CostTableLookup(SSSE3CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (ST->hasSSE1())
      if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    // Scalar bit-manipulation extensions.
    if (ST->hasBMI()) {
      if (ST->is64Bit())
        if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
          if (auto KindCost = Entry->Cost[CostKind])
            return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                   ICA.getFlags());

      if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());
    }

    if (ST->hasLZCNT()) {
      if (ST->is64Bit())
        if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
          if (auto KindCost = Entry->Cost[CostKind])
            return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                   ICA.getFlags());

      if (const auto *Entry = CostTableLookup(LZCNT32CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());
    }

    if (ST->hasPOPCNT()) {
      if (ST->is64Bit())
        if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
          if (auto KindCost = Entry->Cost[CostKind])
            return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                   ICA.getFlags());

      if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());
    }

    // A bswap that feeds a single store, or consumes a single-use load, is
    // treated as free — presumably folded into MOVBE on fast-MOVBE CPUs.
    if (ISD == ISD::BSWAP && ST->hasMOVBE() && ST->hasFastMOVBE()) {
      if (const Instruction *II = ICA.getInst()) {
        if (II->hasOneUse() && isa<StoreInst>(Val: II->user_back()))
          return TTI::TCC_Free;
        if (auto *LI = dyn_cast<LoadInst>(Val: II->getOperand(i: 0))) {
          if (LI->hasOneUse())
            return TTI::TCC_Free;
        }
      }
    }

    // Finally, the generic scalar x86 baseline tables.
    if (ST->is64Bit())
      if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
        if (auto KindCost = Entry->Cost[CostKind])
          return adjustTableCost(Entry->ISD, *KindCost, LT.first,
                                 ICA.getFlags());

    if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
      if (auto KindCost = Entry->Cost[CostKind])
        return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags());
4400 | } |
4401 | |
4402 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
4403 | } |
4404 | |
// Compute the cost of a single extractelement/insertelement.
//
// Three regimes are modelled below:
//  * Index == -1U (non-constant index): lowered as a stack round-trip of
//    aliased loads/stores, so the cost is built from getMemoryOpCost calls.
//  * Constant index: account for cross-128-bit-lane subvector extract/insert
//    moves (RegisterFileMoveCost), cheap pinsr/pextr/insertps forms, and a
//    final shuffle for insertions.
//  * Anything else falls through to the target-independent BaseT cost.
InstructionCost X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                               TTI::TargetCostKind CostKind,
                                               unsigned Index, Value *Op0,
                                               Value *Op1) {
  // Silvermont-specific overrides: XMM <-> GPR element moves are slower there.
  static const CostTblEntry SLMCostTbl[] = {
    { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
    { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
    { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
    { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
  };

  assert(Val->isVectorTy() && "This must be a vector type" );
  Type *ScalarType = Val->getScalarType();
  // Accumulates the cost of moving 128-bit subvectors in/out of >128-bit
  // vectors; added to every constant-index result returned below.
  InstructionCost RegisterFileMoveCost = 0;

  // Non-immediate extraction/insertion can be handled as a sequence of
  // aliased loads+stores via the stack.
  if (Index == -1U && (Opcode == Instruction::ExtractElement ||
                       Opcode == Instruction::InsertElement)) {
    // TODO: On some SSE41+ targets, we expand to cmp+splat+select patterns:
    // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.

    // TODO: Move this to BasicTTIImpl.h? We'd need better gep + index handling.
    assert(isa<FixedVectorType>(Val) && "Fixed vector type expected" );
    Align VecAlign = DL.getPrefTypeAlign(Ty: Val);
    Align SclAlign = DL.getPrefTypeAlign(Ty: ScalarType);

    // Extract - store vector to stack, load scalar.
    if (Opcode == Instruction::ExtractElement) {
      return getMemoryOpCost(Opcode: Instruction::Store, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind) +
             getMemoryOpCost(Opcode: Instruction::Load, Src: ScalarType, Alignment: SclAlign, AddressSpace: 0,
                             CostKind);
    }
    // Insert - store vector to stack, store scalar, load vector.
    if (Opcode == Instruction::InsertElement) {
      return getMemoryOpCost(Opcode: Instruction::Store, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind) +
             getMemoryOpCost(Opcode: Instruction::Store, Src: ScalarType, Alignment: SclAlign, AddressSpace: 0,
                             CostKind) +
             getMemoryOpCost(Opcode: Instruction::Load, Src: Val, Alignment: VecAlign, AddressSpace: 0, CostKind);
    }
  }

  if (Index != -1U && (Opcode == Instruction::ExtractElement ||
                       Opcode == Instruction::InsertElement)) {
    // Extraction of vXi1 elements are now efficiently handled by MOVMSK.
    if (Opcode == Instruction::ExtractElement &&
        ScalarType->getScalarSizeInBits() == 1 &&
        cast<FixedVectorType>(Val)->getNumElements() > 1)
      return 1;

    // Legalize the type.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Val);

    // This type is legalized to a scalar type.
    if (!LT.second.isVector())
      return 0;

    // The type may be split. Normalize the index to the new type.
    unsigned SizeInBits = LT.second.getSizeInBits();
    unsigned NumElts = LT.second.getVectorNumElements();
    unsigned SubNumElts = NumElts;
    Index = Index % NumElts;

    // For >128-bit vectors, we need to extract higher 128-bit subvectors.
    // For inserts, we also need to insert the subvector back.
    if (SizeInBits > 128) {
      assert((SizeInBits % 128) == 0 && "Illegal vector" );
      unsigned NumSubVecs = SizeInBits / 128;
      SubNumElts = NumElts / NumSubVecs;
      if (SubNumElts <= Index) {
        // Inserts cost an extract + a reinsert of the 128-bit lane; extracts
        // only need the lane extraction.
        RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
        Index %= SubNumElts;
      }
    }

    MVT MScalarTy = LT.second.getScalarType();
    auto IsCheapPInsrPExtrInsertPS = [&]() {
      // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
      // Also, assume insertps is relatively cheap on all >= SSE41 targets.
      return (MScalarTy == MVT::i16 && ST->hasSSE2()) ||
             (MScalarTy.isInteger() && ST->hasSSE41()) ||
             (MScalarTy == MVT::f32 && ST->hasSSE41() &&
              Opcode == Instruction::InsertElement);
    };

    if (Index == 0) {
      // Floating point scalars are already located in index #0.
      // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
      // true for all.
      if (ScalarType->isFloatingPointTy() &&
          (Opcode != Instruction::InsertElement || !Op0 ||
           isa<UndefValue>(Val: Op0)))
        return RegisterFileMoveCost;

      if (Opcode == Instruction::InsertElement &&
          isa_and_nonnull<UndefValue>(Val: Op0)) {
        // Consider the gather cost to be cheap.
        if (isa_and_nonnull<LoadInst>(Val: Op1))
          return RegisterFileMoveCost;
        if (!IsCheapPInsrPExtrInsertPS()) {
          // mov constant-to-GPR + movd/movq GPR -> XMM.
          if (isa_and_nonnull<Constant>(Val: Op1) && Op1->getType()->isIntegerTy())
            return 2 + RegisterFileMoveCost;
          // Assume movd/movq GPR -> XMM is relatively cheap on all targets.
          return 1 + RegisterFileMoveCost;
        }
      }

      // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
      if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
        return 1 + RegisterFileMoveCost;
    }

    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Unexpected vector opcode" );
    if (ST->useSLMArithCosts())
      if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
        return Entry->Cost + RegisterFileMoveCost;

    // Consider cheap cases.
    if (IsCheapPInsrPExtrInsertPS())
      return 1 + RegisterFileMoveCost;

    // For extractions we just need to shuffle the element to index 0, which
    // should be very cheap (assume cost = 1). For insertions we need to shuffle
    // the elements to its destination. In both cases we must handle the
    // subvector move(s).
    // If the vector type is already less than 128-bits then don't reduce it.
    // TODO: Under what circumstances should we shuffle using the full width?
    InstructionCost ShuffleCost = 1;
    if (Opcode == Instruction::InsertElement) {
      auto *SubTy = cast<VectorType>(Val);
      EVT VT = TLI->getValueType(DL, Ty: Val);
      if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
        SubTy = FixedVectorType::get(ElementType: ScalarType, NumElts: SubNumElts);
      ShuffleCost = getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, BaseTp: SubTy, Mask: std::nullopt,
                                   CostKind, Index: 0, SubTp: SubTy);
    }
    // Extracting an integer element still needs a movd/movq to a GPR; fp
    // results can stay in the XMM register, hence the 0/1 adjustment.
    int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
    return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
  }

  return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1) +
         RegisterFileMoveCost;
}
4550 | |
4551 | InstructionCost |
4552 | X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, |
4553 | bool Insert, bool , |
4554 | TTI::TargetCostKind CostKind) { |
4555 | assert(DemandedElts.getBitWidth() == |
4556 | cast<FixedVectorType>(Ty)->getNumElements() && |
4557 | "Vector size mismatch" ); |
4558 | |
4559 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
4560 | MVT MScalarTy = LT.second.getScalarType(); |
4561 | unsigned LegalVectorBitWidth = LT.second.getSizeInBits(); |
4562 | InstructionCost Cost = 0; |
4563 | |
4564 | constexpr unsigned LaneBitWidth = 128; |
4565 | assert((LegalVectorBitWidth < LaneBitWidth || |
4566 | (LegalVectorBitWidth % LaneBitWidth) == 0) && |
4567 | "Illegal vector" ); |
4568 | |
4569 | const int NumLegalVectors = *LT.first.getValue(); |
4570 | assert(NumLegalVectors >= 0 && "Negative cost!" ); |
4571 | |
4572 | // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much |
4573 | // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT. |
4574 | if (Insert) { |
4575 | if ((MScalarTy == MVT::i16 && ST->hasSSE2()) || |
4576 | (MScalarTy.isInteger() && ST->hasSSE41()) || |
4577 | (MScalarTy == MVT::f32 && ST->hasSSE41())) { |
4578 | // For types we can insert directly, insertion into 128-bit sub vectors is |
4579 | // cheap, followed by a cheap chain of concatenations. |
4580 | if (LegalVectorBitWidth <= LaneBitWidth) { |
4581 | Cost += BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, |
4582 | /*Extract*/ false, CostKind); |
4583 | } else { |
4584 | // In each 128-lane, if at least one index is demanded but not all |
4585 | // indices are demanded and this 128-lane is not the first 128-lane of |
4586 | // the legalized-vector, then this 128-lane needs a extracti128; If in |
4587 | // each 128-lane, there is at least one demanded index, this 128-lane |
4588 | // needs a inserti128. |
4589 | |
4590 | // The following cases will help you build a better understanding: |
4591 | // Assume we insert several elements into a v8i32 vector in avx2, |
4592 | // Case#1: inserting into 1th index needs vpinsrd + inserti128. |
4593 | // Case#2: inserting into 5th index needs extracti128 + vpinsrd + |
4594 | // inserti128. |
4595 | // Case#3: inserting into 4,5,6,7 index needs 4*vpinsrd + inserti128. |
4596 | assert((LegalVectorBitWidth % LaneBitWidth) == 0 && "Illegal vector" ); |
4597 | unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; |
4598 | unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; |
4599 | unsigned NumLegalElts = |
4600 | LT.second.getVectorNumElements() * NumLegalVectors; |
4601 | assert(NumLegalElts >= DemandedElts.getBitWidth() && |
4602 | "Vector has been legalized to smaller element count" ); |
4603 | assert((NumLegalElts % NumLanesTotal) == 0 && |
4604 | "Unexpected elts per lane" ); |
4605 | unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; |
4606 | |
4607 | APInt WidenedDemandedElts = DemandedElts.zext(width: NumLegalElts); |
4608 | auto *LaneTy = |
4609 | FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumEltsPerLane); |
4610 | |
4611 | for (unsigned I = 0; I != NumLanesTotal; ++I) { |
4612 | APInt LaneEltMask = WidenedDemandedElts.extractBits( |
4613 | numBits: NumEltsPerLane, bitPosition: NumEltsPerLane * I); |
4614 | if (LaneEltMask.isZero()) |
4615 | continue; |
4616 | // FIXME: we don't need to extract if all non-demanded elements |
4617 | // are legalization-inserted padding. |
4618 | if (!LaneEltMask.isAllOnes()) |
4619 | Cost += getShuffleCost(Kind: TTI::SK_ExtractSubvector, BaseTp: Ty, Mask: std::nullopt, |
4620 | CostKind, Index: I * NumEltsPerLane, SubTp: LaneTy); |
4621 | Cost += BaseT::getScalarizationOverhead(InTy: LaneTy, DemandedElts: LaneEltMask, Insert, |
4622 | /*Extract*/ false, CostKind); |
4623 | } |
4624 | |
4625 | APInt AffectedLanes = |
4626 | APIntOps::ScaleBitMask(A: WidenedDemandedElts, NewBitWidth: NumLanesTotal); |
4627 | APInt FullyAffectedLegalVectors = APIntOps::ScaleBitMask( |
4628 | A: AffectedLanes, NewBitWidth: NumLegalVectors, /*MatchAllBits=*/true); |
4629 | for (int LegalVec = 0; LegalVec != NumLegalVectors; ++LegalVec) { |
4630 | for (unsigned Lane = 0; Lane != NumLegalLanes; ++Lane) { |
4631 | unsigned I = NumLegalLanes * LegalVec + Lane; |
4632 | // No need to insert unaffected lane; or lane 0 of each legal vector |
4633 | // iff ALL lanes of that vector were affected and will be inserted. |
4634 | if (!AffectedLanes[I] || |
4635 | (Lane == 0 && FullyAffectedLegalVectors[LegalVec])) |
4636 | continue; |
4637 | Cost += getShuffleCost(Kind: TTI::SK_InsertSubvector, BaseTp: Ty, Mask: std::nullopt, |
4638 | CostKind, Index: I * NumEltsPerLane, SubTp: LaneTy); |
4639 | } |
4640 | } |
4641 | } |
4642 | } else if (LT.second.isVector()) { |
4643 | // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded |
4644 | // integer element as a SCALAR_TO_VECTOR, then we build the vector as a |
4645 | // series of UNPCK followed by CONCAT_VECTORS - all of these can be |
4646 | // considered cheap. |
4647 | if (Ty->isIntOrIntVectorTy()) |
4648 | Cost += DemandedElts.popcount(); |
4649 | |
4650 | // Get the smaller of the legalized or original pow2-extended number of |
4651 | // vector elements, which represents the number of unpacks we'll end up |
4652 | // performing. |
4653 | unsigned NumElts = LT.second.getVectorNumElements(); |
4654 | unsigned Pow2Elts = |
4655 | PowerOf2Ceil(A: cast<FixedVectorType>(Val: Ty)->getNumElements()); |
4656 | Cost += (std::min<unsigned>(a: NumElts, b: Pow2Elts) - 1) * LT.first; |
4657 | } |
4658 | } |
4659 | |
4660 | if (Extract) { |
4661 | // vXi1 can be efficiently extracted with MOVMSK. |
4662 | // TODO: AVX512 predicate mask handling. |
4663 | // NOTE: This doesn't work well for roundtrip scalarization. |
4664 | if (!Insert && Ty->getScalarSizeInBits() == 1 && !ST->hasAVX512()) { |
4665 | unsigned NumElts = cast<FixedVectorType>(Val: Ty)->getNumElements(); |
4666 | unsigned MaxElts = ST->hasAVX2() ? 32 : 16; |
4667 | unsigned MOVMSKCost = (NumElts + MaxElts - 1) / MaxElts; |
4668 | return MOVMSKCost; |
4669 | } |
4670 | |
4671 | if (LT.second.isVector()) { |
4672 | unsigned NumLegalElts = |
4673 | LT.second.getVectorNumElements() * NumLegalVectors; |
4674 | assert(NumLegalElts >= DemandedElts.getBitWidth() && |
4675 | "Vector has been legalized to smaller element count" ); |
4676 | |
4677 | // If we're extracting elements from a 128-bit subvector lane, |
4678 | // we only need to extract each lane once, not for every element. |
4679 | if (LegalVectorBitWidth > LaneBitWidth) { |
4680 | unsigned NumLegalLanes = LegalVectorBitWidth / LaneBitWidth; |
4681 | unsigned NumLanesTotal = NumLegalLanes * NumLegalVectors; |
4682 | assert((NumLegalElts % NumLanesTotal) == 0 && |
4683 | "Unexpected elts per lane" ); |
4684 | unsigned NumEltsPerLane = NumLegalElts / NumLanesTotal; |
4685 | |
4686 | // Add cost for each demanded 128-bit subvector extraction. |
4687 | // Luckily this is a lot easier than for insertion. |
4688 | APInt WidenedDemandedElts = DemandedElts.zext(width: NumLegalElts); |
4689 | auto *LaneTy = |
4690 | FixedVectorType::get(ElementType: Ty->getElementType(), NumElts: NumEltsPerLane); |
4691 | |
4692 | for (unsigned I = 0; I != NumLanesTotal; ++I) { |
4693 | APInt LaneEltMask = WidenedDemandedElts.extractBits( |
4694 | numBits: NumEltsPerLane, bitPosition: I * NumEltsPerLane); |
4695 | if (LaneEltMask.isZero()) |
4696 | continue; |
4697 | Cost += getShuffleCost(Kind: TTI::SK_ExtractSubvector, BaseTp: Ty, Mask: std::nullopt, |
4698 | CostKind, Index: I * NumEltsPerLane, SubTp: LaneTy); |
4699 | Cost += BaseT::getScalarizationOverhead( |
4700 | InTy: LaneTy, DemandedElts: LaneEltMask, /*Insert*/ false, Extract, CostKind); |
4701 | } |
4702 | |
4703 | return Cost; |
4704 | } |
4705 | } |
4706 | |
4707 | // Fallback to default extraction. |
4708 | Cost += BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, /*Insert*/ false, |
4709 | Extract, CostKind); |
4710 | } |
4711 | |
4712 | return Cost; |
4713 | } |
4714 | |
// Cost of replicating each of \p VF source elements \p ReplicationFactor
// times (an interleave-style shuffle). Only AVX512 targets are modelled
// here; element types without a native AVX512 shuffle are promoted to a
// wider integer type (costing an anyext + trunc pair via a recursive call),
// and anything else bails out to the base implementation.
InstructionCost
X86TTIImpl::getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                      int VF, const APInt &DemandedDstElts,
                                      TTI::TargetCostKind CostKind) {
  const unsigned EltTyBits = DL.getTypeSizeInBits(Ty: EltTy);
  // We don't differentiate element types here, only element bit width.
  EltTy = IntegerType::getIntNTy(C&: EltTy->getContext(), N: EltTyBits);

  auto bailout = [&]() {
    return BaseT::getReplicationShuffleCost(EltTy, ReplicationFactor, VF,
                                            DemandedDstElts, CostKind);
  };

  // For now, only deal with AVX512 cases.
  if (!ST->hasAVX512())
    return bailout();

  // Do we have a native shuffle for this element type, or should we promote?
  unsigned PromEltTyBits = EltTyBits;
  switch (EltTyBits) {
  case 32:
  case 64:
    break; // AVX512F.
  case 16:
    if (!ST->hasBWI())
      PromEltTyBits = 32; // promote to i32, AVX512F.
    break; // AVX512BW
  case 8:
    if (!ST->hasVBMI())
      PromEltTyBits = 32; // promote to i32, AVX512F.
    break; // AVX512VBMI
  case 1:
    // There is no support for shuffling i1 elements. We *must* promote.
    if (ST->hasBWI()) {
      if (ST->hasVBMI())
        PromEltTyBits = 8; // promote to i8, AVX512VBMI.
      else
        PromEltTyBits = 16; // promote to i16, AVX512BW.
      break;
    }
    PromEltTyBits = 32; // promote to i32, AVX512F.
    break;
  default:
    return bailout();
  }
  auto *PromEltTy = IntegerType::getIntNTy(C&: EltTy->getContext(), N: PromEltTyBits);

  auto *SrcVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: VF);
  auto *PromSrcVecTy = FixedVectorType::get(ElementType: PromEltTy, NumElts: VF);

  int NumDstElements = VF * ReplicationFactor;
  auto *PromDstVecTy = FixedVectorType::get(ElementType: PromEltTy, NumElts: NumDstElements);
  auto *DstVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: NumDstElements);

  // Legalize the types.
  MVT LegalSrcVecTy = getTypeLegalizationCost(Ty: SrcVecTy).second;
  MVT LegalPromSrcVecTy = getTypeLegalizationCost(Ty: PromSrcVecTy).second;
  MVT LegalPromDstVecTy = getTypeLegalizationCost(Ty: PromDstVecTy).second;
  MVT LegalDstVecTy = getTypeLegalizationCost(Ty: DstVecTy).second;
  // They should have legalized into vector types.
  if (!LegalSrcVecTy.isVector() || !LegalPromSrcVecTy.isVector() ||
      !LegalPromDstVecTy.isVector() || !LegalDstVecTy.isVector())
    return bailout();

  if (PromEltTyBits != EltTyBits) {
    // If we have to perform the shuffle with wider elt type than our data type,
    // then we will first need to anyext (we don't care about the new bits)
    // the source elements, and then truncate Dst elements.
    InstructionCost PromotionCost;
    PromotionCost += getCastInstrCost(
        Opcode: Instruction::SExt, /*Dst=*/PromSrcVecTy, /*Src=*/SrcVecTy,
        CCH: TargetTransformInfo::CastContextHint::None, CostKind);
    PromotionCost +=
        getCastInstrCost(Opcode: Instruction::Trunc, /*Dst=*/DstVecTy,
                         /*Src=*/PromDstVecTy,
                         CCH: TargetTransformInfo::CastContextHint::None, CostKind);
    // Recurse with the promoted element type; same factor/VF/demanded mask.
    return PromotionCost + getReplicationShuffleCost(EltTy: PromEltTy,
                                                     ReplicationFactor, VF,
                                                     DemandedDstElts, CostKind);
  }

  assert(LegalSrcVecTy.getScalarSizeInBits() == EltTyBits &&
         LegalSrcVecTy.getScalarType() == LegalDstVecTy.getScalarType() &&
         "We expect that the legalization doesn't affect the element width, "
         "doesn't coalesce/split elements." );

  unsigned NumEltsPerDstVec = LegalDstVecTy.getVectorNumElements();
  unsigned NumDstVectors =
      divideCeil(Numerator: DstVecTy->getNumElements(), Denominator: NumEltsPerDstVec);

  auto *SingleDstVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: NumEltsPerDstVec);

  // Not all the produced Dst elements may be demanded. In our case,
  // given that a single Dst vector is formed by a single shuffle,
  // if all elements that will form a single Dst vector aren't demanded,
  // then we won't need to do that shuffle, so adjust the cost accordingly.
  APInt DemandedDstVectors = APIntOps::ScaleBitMask(
      A: DemandedDstElts.zext(width: NumDstVectors * NumEltsPerDstVec), NewBitWidth: NumDstVectors);
  unsigned NumDstVectorsDemanded = DemandedDstVectors.popcount();

  InstructionCost SingleShuffleCost = getShuffleCost(
      Kind: TTI::SK_PermuteSingleSrc, BaseTp: SingleDstVecTy, /*Mask=*/std::nullopt, CostKind,
      /*Index=*/0, /*SubTp=*/nullptr);
  return NumDstVectorsDemanded * SingleShuffleCost;
}
4820 | |
// Cost of a plain (unmasked) load/store.
//
// For non-throughput cost kinds this is basically TCC_Basic (doubled for a
// store whose address GEP has non-constant indices). For throughput, scalars
// cost LT.first; vectors are decomposed greedily into progressively smaller
// power-of-two-sized operations (ZMM -> YMM -> XMM -> 64/32/16/8-bit pieces),
// charging subvector insert/extract shuffles and scalarization for the
// sub-XMM pieces along the way.
InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            MaybeAlign Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) {
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput) {
    if (auto *SI = dyn_cast_or_null<StoreInst>(Val: I)) {
      // Store instruction with index and scale costs 2 Uops.
      // Check the preceding GEP to identify non-const indices.
      if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: SI->getPointerOperand())) {
        if (!all_of(Range: GEP->indices(), P: [](Value *V) { return isa<Constant>(Val: V); }))
          return TTI::TCC_Basic * 2;
      }
    }
    return TTI::TCC_Basic;
  }

  assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
         "Invalid Opcode" );
  // Type legalization can't handle structs
  if (TLI->getValueType(DL, Src, true) == MVT::Other)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: Src);

  auto *VTy = dyn_cast<FixedVectorType>(Val: Src);

  InstructionCost Cost = 0;

  // Add a cost for constant load to vector.
  if (Opcode == Instruction::Store && OpInfo.isConstant())
    Cost += getMemoryOpCost(Opcode: Instruction::Load, Src, Alignment: DL.getABITypeAlign(Ty: Src),
                            /*AddressSpace=*/0, CostKind);

  // Handle the simple case of non-vectors.
  // NOTE: this assumes that legalization never creates vector from scalars!
  if (!VTy || !LT.second.isVector()) {
    // Each load/store unit costs 1.
    return (LT.second.isFloatingPoint() ? Cost : 0) + LT.first * 1;
  }

  bool IsLoad = Opcode == Instruction::Load;

  Type *EltTy = VTy->getElementType();

  const int EltTyBits = DL.getTypeSizeInBits(Ty: EltTy);

  // Source of truth: how many elements were there in the original IR vector?
  const unsigned SrcNumElt = VTy->getNumElements();

  // How far have we gotten?
  int NumEltRemaining = SrcNumElt;
  // Note that we intentionally capture by-reference, NumEltRemaining changes.
  auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };

  const int MaxLegalOpSizeBytes = divideCeil(Numerator: LT.second.getSizeInBits(), Denominator: 8);

  // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
  const unsigned XMMBits = 128;
  if (XMMBits % EltTyBits != 0)
    // Vector size must be a multiple of the element size. I.e. no padding.
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind);
  const int NumEltPerXMM = XMMBits / EltTyBits;

  auto *XMMVecTy = FixedVectorType::get(ElementType: EltTy, NumElts: NumEltPerXMM);

  // Outer loop: halve the operation size each iteration; inner loop: emit as
  // many ops of the current size as the remaining elements (and alignment)
  // permit.
  for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
       NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
    // How many elements would a single op deal with at once?
    if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
      // Vector size must be a multiple of the element size. I.e. no padding.
      return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                    CostKind);
    int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;

    assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?" );
    assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
            (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
           "Unless we haven't halved the op size yet, "
           "we have less than two op's sized units of work left." );

    // Sub-XMM ops still conceptually work within a full XMM register.
    auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
                          ? FixedVectorType::get(ElementType: EltTy, NumElts: CurrNumEltPerOp)
                          : XMMVecTy;

    assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
           "After halving sizes, the vector elt count is no longer a multiple "
           "of number of elements per operation?" );
    // View the vector as wider integer elements so one op moves one
    // "coalesced" element (used for insert/extract cost of sub-XMM pieces).
    auto *CoalescedVecTy =
        CurrNumEltPerOp == 1
            ? CurrVecTy
            : FixedVectorType::get(
                  ElementType: IntegerType::get(C&: Src->getContext(),
                                   NumBits: EltTyBits * CurrNumEltPerOp),
                  NumElts: CurrVecTy->getNumElements() / CurrNumEltPerOp);
    assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
               DL.getTypeSizeInBits(CurrVecTy) &&
           "coalesciing elements doesn't change vector width." );

    while (NumEltRemaining > 0) {
      assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?" );

      // Can we use this vector size, as per the remaining element count?
      // Iff the vector is naturally aligned, we can do a wide load regardless.
      if (NumEltRemaining < CurrNumEltPerOp &&
          (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
          CurrOpSizeBytes != 1)
        break; // Try smalled vector size.

      bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;

      // If we have fully processed the previous reg, we need to replenish it.
      if (SubVecEltsLeft == 0) {
        SubVecEltsLeft += CurrVecTy->getNumElements();
        // And that's free only for the 0'th subvector of a legalized vector.
        if (!Is0thSubVec)
          Cost += getShuffleCost(Kind: IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
                                       : TTI::ShuffleKind::SK_ExtractSubvector,
                                 BaseTp: VTy, Mask: std::nullopt, CostKind, Index: NumEltDone(),
                                 SubTp: CurrVecTy);
      }

      // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
      // for smaller widths (32/16/8) we have to insert/extract them separately.
      // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
      // but let's pretend that it is also true for 16/8 bit wide ops...)
      if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
        int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
        assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "" );
        int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
        APInt DemandedElts =
            APInt::getBitsSet(numBits: CoalescedVecTy->getNumElements(),
                              loBit: CoalescedVecEltIdx, hiBit: CoalescedVecEltIdx + 1);
        assert(DemandedElts.popcount() == 1 && "Inserting single value" );
        Cost += getScalarizationOverhead(Ty: CoalescedVecTy, DemandedElts, Insert: IsLoad,
                                         Extract: !IsLoad, CostKind);
      }

      // This isn't exactly right. We're using slow unaligned 32-byte accesses
      // as a proxy for a double-pumped AVX memory interface such as on
      // Sandybridge.
      // Sub-32-bit loads/stores will be slower either with PINSR*/PEXTR* or
      // will be scalarized.
      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
        Cost += 2;
      else if (CurrOpSizeBytes < 4)
        Cost += 2;
      else
        Cost += 1;

      SubVecEltsLeft -= CurrNumEltPerOp;
      NumEltRemaining -= CurrNumEltPerOp;
      // Later (smaller) chunks start at an offset of CurrOpSizeBytes, so only
      // the alignment common to that offset survives.
      Alignment = commonAlignment(A: Alignment.valueOrOne(), Offset: CurrOpSizeBytes);
    }
  }

  assert(NumEltRemaining <= 0 && "Should have processed all the elements." );

  return Cost;
}
4986 | |
/// Compute the cost of a masked vector load or store of \p SrcTy.
///
/// Scalar types fall back to the regular (unmasked) memory-op cost. Vector
/// types for which the target reports no legal masked op are costed as a full
/// scalarization: extract each mask byte, compare+branch per lane, and issue
/// one scalar memory op per lane. Otherwise the cost is derived from the
/// legalized type, plus fix-up shuffles when legalization promotes the
/// element type or widens the vector.
InstructionCost
X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
                                  unsigned AddressSpace,
                                  TTI::TargetCostKind CostKind) {
  bool IsLoad = (Instruction::Load == Opcode);
  bool IsStore = (Instruction::Store == Opcode);

  auto *SrcVTy = dyn_cast<FixedVectorType>(Val: SrcTy);
  if (!SrcVTy)
    // To calculate scalar take the regular cost, without mask
    return getMemoryOpCost(Opcode, Src: SrcTy, Alignment, AddressSpace, CostKind);

  unsigned NumElem = SrcVTy->getNumElements();
  // Model the mask as one i8 per lane for scalarization/shuffle costing.
  auto *MaskTy =
      FixedVectorType::get(ElementType: Type::getInt8Ty(C&: SrcVTy->getContext()), NumElts: NumElem);
  if ((IsLoad && !isLegalMaskedLoad(DataType: SrcVTy, Alignment)) ||
      (IsStore && !isLegalMaskedStore(DataType: SrcVTy, Alignment))) {
    // Scalarization
    APInt DemandedElts = APInt::getAllOnes(numBits: NumElem);
    // Extract every mask lane so it can be tested individually.
    InstructionCost MaskSplitCost = getScalarizationOverhead(
        Ty: MaskTy, DemandedElts, /*Insert*/ false, /*Extract*/ true, CostKind);
    // Per-lane predicate test (compare + conditional branch).
    InstructionCost ScalarCompareCost = getCmpSelInstrCost(
        Opcode: Instruction::ICmp, ValTy: Type::getInt8Ty(C&: SrcVTy->getContext()), CondTy: nullptr,
        VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
    InstructionCost BranchCost = getCFInstrCost(Opcode: Instruction::Br, CostKind);
    InstructionCost MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
    // Loads must insert each scalar result; stores must extract each lane.
    InstructionCost ValueSplitCost = getScalarizationOverhead(
        Ty: SrcVTy, DemandedElts, Insert: IsLoad, Extract: IsStore, CostKind);
    // One scalar memory op per lane.
    InstructionCost MemopCost =
        NumElem * BaseT::getMemoryOpCost(Opcode, Src: SrcVTy->getScalarType(),
                                         Alignment, AddressSpace, CostKind);
    return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
  }

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: SrcVTy);
  auto VT = TLI->getValueType(DL, Ty: SrcVTy);
  InstructionCost Cost = 0;
  // Same lane count but a different legal MVT means the element type was
  // promoted during legalization.
  if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
      LT.second.getVectorNumElements() == NumElem)
    // Promotion requires extend/truncate for data and a shuffle for mask.
    Cost += getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, BaseTp: SrcVTy, Mask: std::nullopt,
                           CostKind, Index: 0, SubTp: nullptr) +
            getShuffleCost(Kind: TTI::SK_PermuteTwoSrc, BaseTp: MaskTy, Mask: std::nullopt,
                           CostKind, Index: 0, SubTp: nullptr);

  else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
    // Legalization widened the vector: the mask must be placed into a wider
    // vector whose extra lanes are zero.
    auto *NewMaskTy = FixedVectorType::get(ElementType: MaskTy->getElementType(),
                                           NumElts: LT.second.getVectorNumElements());
    // Expanding requires fill mask with zeroes
    Cost += getShuffleCost(Kind: TTI::SK_InsertSubvector, BaseTp: NewMaskTy, Mask: std::nullopt,
                           CostKind, Index: 0, SubTp: MaskTy);
  }

  // Pre-AVX512 - each maskmov load costs 2 + store costs ~8.
  if (!ST->hasAVX512())
    return Cost + LT.first * (IsLoad ? 2 : 8);

  // AVX-512 masked load/store is cheaper
  return Cost + LT.first;
}
5048 | |
5049 | InstructionCost |
5050 | X86TTIImpl::getPointersChainCost(ArrayRef<const Value *> Ptrs, |
5051 | const Value *Base, |
5052 | const TTI::PointersChainInfo &Info, |
5053 | Type *AccessTy, TTI::TargetCostKind CostKind) { |
5054 | if (Info.isSameBase() && Info.isKnownStride()) { |
5055 | // If all the pointers have known stride all the differences are translated |
5056 | // into constants. X86 memory addressing allows encoding it into |
5057 | // displacement. So we just need to take the base GEP cost. |
5058 | if (const auto *BaseGEP = dyn_cast<GetElementPtrInst>(Val: Base)) { |
5059 | SmallVector<const Value *> Indices(BaseGEP->indices()); |
5060 | return getGEPCost(PointeeType: BaseGEP->getSourceElementType(), |
5061 | Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: nullptr, |
5062 | CostKind); |
5063 | } |
5064 | return TTI::TCC_Free; |
5065 | } |
5066 | return BaseT::getPointersChainCost(Ptrs, Base, Info, AccessTy, CostKind); |
5067 | } |
5068 | |
5069 | InstructionCost X86TTIImpl::getAddressComputationCost(Type *Ty, |
5070 | ScalarEvolution *SE, |
5071 | const SCEV *Ptr) { |
5072 | // Address computations in vectorized code with non-consecutive addresses will |
5073 | // likely result in more instructions compared to scalar code where the |
5074 | // computation can more often be merged into the index mode. The resulting |
5075 | // extra micro-ops can significantly decrease throughput. |
5076 | const unsigned NumVectorInstToHideOverhead = 10; |
5077 | |
5078 | // Cost modeling of Strided Access Computation is hidden by the indexing |
5079 | // modes of X86 regardless of the stride value. We dont believe that there |
5080 | // is a difference between constant strided access in gerenal and constant |
5081 | // strided value which is less than or equal to 64. |
5082 | // Even in the case of (loop invariant) stride whose value is not known at |
5083 | // compile time, the address computation will not incur more than one extra |
5084 | // ADD instruction. |
5085 | if (Ty->isVectorTy() && SE && !ST->hasAVX2()) { |
5086 | // TODO: AVX2 is the current cut-off because we don't have correct |
5087 | // interleaving costs for prior ISA's. |
5088 | if (!BaseT::isStridedAccess(Ptr)) |
5089 | return NumVectorInstToHideOverhead; |
5090 | if (!BaseT::getConstantStrideStep(SE, Ptr)) |
5091 | return 1; |
5092 | } |
5093 | |
5094 | return BaseT::getAddressComputationCost(Ty, SE, Ptr); |
5095 | } |
5096 | |
/// Cost of an unordered vector arithmetic reduction (vector.reduce.add/fadd,
/// mul, and/or for bool vectors, ...).
///
/// Ordered FP reductions are delegated to the base implementation. Otherwise
/// the cost comes from IACA-measured throughput tables where an entry exists
/// (checked both before and after type legalization), or is computed as a
/// log2(N) cascade of extract-subvector/shuffle/shift steps plus one
/// arithmetic op per level, finished with an element extract.
InstructionCost
X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                       std::optional<FastMathFlags> FMF,
                                       TTI::TargetCostKind CostKind) {
  if (TTI::requiresOrderedReduction(FMF))
    return BaseT::getArithmeticReductionCost(Opcode, Ty: ValTy, FMF, CostKind);

  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
  // and make it as the cost.

  static const CostTblEntry SLMCostTbl[] = {
    { ISD::FADD, MVT::v2f64, 3 },
    { ISD::ADD, MVT::v2i64, 5 },
  };

  static const CostTblEntry SSE2CostTbl[] = {
    { ISD::FADD, MVT::v2f64, 2 },
    { ISD::FADD, MVT::v2f32, 2 },
    { ISD::FADD, MVT::v4f32, 4 },
    { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
    { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32
    { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.3".
    { ISD::ADD, MVT::v2i16, 2 }, // The data reported by the IACA tool is "4.3".
    { ISD::ADD, MVT::v4i16, 3 }, // The data reported by the IACA tool is "4.3".
    { ISD::ADD, MVT::v8i16, 4 }, // The data reported by the IACA tool is "4.3".
    { ISD::ADD, MVT::v2i8, 2 },
    { ISD::ADD, MVT::v4i8, 2 },
    { ISD::ADD, MVT::v8i8, 2 },
    { ISD::ADD, MVT::v16i8, 3 },
  };

  static const CostTblEntry AVX1CostTbl[] = {
    { ISD::FADD, MVT::v4f64, 3 },
    { ISD::FADD, MVT::v4f32, 3 },
    { ISD::FADD, MVT::v8f32, 4 },
    { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
    { ISD::ADD, MVT::v4i64, 3 },
    { ISD::ADD, MVT::v8i32, 5 },
    { ISD::ADD, MVT::v16i16, 5 },
    { ISD::ADD, MVT::v32i8, 4 },
  };

  int ISD = TLI->InstructionOpcodeToISD(Opcode);
  assert(ISD && "Invalid opcode" );

  // Before legalizing the type, give a chance to look up illegal narrow types
  // in the table.
  // FIXME: Is there a better way to do this?
  EVT VT = TLI->getValueType(DL, Ty: ValTy);
  if (VT.isSimple()) {
    MVT MTy = VT.getSimpleVT();
    if (ST->useSLMArithCosts())
      if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
        return Entry->Cost;

    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
        return Entry->Cost;

    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
        return Entry->Cost;
  }

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy);

  MVT MTy = LT.second;

  auto *ValVTy = cast<FixedVectorType>(Val: ValTy);

  // Special case: vXi8 mul reductions are performed as vXi16.
  if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
    auto *WideSclTy = IntegerType::get(C&: ValVTy->getContext(), NumBits: 16);
    auto *WideVecTy = FixedVectorType::get(ElementType: WideSclTy, NumElts: ValVTy->getNumElements());
    return getCastInstrCost(Opcode: Instruction::ZExt, Dst: WideVecTy, Src: ValTy,
                            CCH: TargetTransformInfo::CastContextHint::None,
                            CostKind) +
           getArithmeticReductionCost(Opcode, ValTy: WideVecTy, FMF, CostKind);
  }

  // If legalization splits the vector, pre-pay the LT.first - 1 arithmetic
  // ops that combine the split halves before the in-register reduction.
  InstructionCost ArithmeticCost = 0;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 arithmetic ops.
    auto *SingleOpTy = FixedVectorType::get(ElementType: ValVTy->getElementType(),
                                            NumElts: MTy.getVectorNumElements());
    ArithmeticCost = getArithmeticInstrCost(Opcode, Ty: SingleOpTy, CostKind);
    ArithmeticCost *= LT.first - 1;
  }

  // Retry the table lookups against the legalized type.
  if (ST->useSLMArithCosts())
    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  if (ST->hasAVX())
    if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  if (ST->hasSSE2())
    if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
      return ArithmeticCost + Entry->Cost;

  // FIXME: These assume a naive kshift+binop lowering, which is probably
  // conservative in most cases.
  static const CostTblEntry AVX512BoolReduction[] = {
    { ISD::AND, MVT::v2i1, 3 },
    { ISD::AND, MVT::v4i1, 5 },
    { ISD::AND, MVT::v8i1, 7 },
    { ISD::AND, MVT::v16i1, 9 },
    { ISD::AND, MVT::v32i1, 11 },
    { ISD::AND, MVT::v64i1, 13 },
    { ISD::OR, MVT::v2i1, 3 },
    { ISD::OR, MVT::v4i1, 5 },
    { ISD::OR, MVT::v8i1, 7 },
    { ISD::OR, MVT::v16i1, 9 },
    { ISD::OR, MVT::v32i1, 11 },
    { ISD::OR, MVT::v64i1, 13 },
  };

  static const CostTblEntry AVX2BoolReduction[] = {
    { ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
    { ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
    { ISD::OR, MVT::v16i16, 2 }, // vpmovmskb + cmp
    { ISD::OR, MVT::v32i8, 2 }, // vpmovmskb + cmp
  };

  static const CostTblEntry AVX1BoolReduction[] = {
    { ISD::AND, MVT::v4i64, 2 }, // vmovmskpd + cmp
    { ISD::AND, MVT::v8i32, 2 }, // vmovmskps + cmp
    { ISD::AND, MVT::v16i16, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
    { ISD::AND, MVT::v32i8, 4 }, // vextractf128 + vpand + vpmovmskb + cmp
    { ISD::OR, MVT::v4i64, 2 }, // vmovmskpd + cmp
    { ISD::OR, MVT::v8i32, 2 }, // vmovmskps + cmp
    { ISD::OR, MVT::v16i16, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
    { ISD::OR, MVT::v32i8, 4 }, // vextractf128 + vpor + vpmovmskb + cmp
  };

  static const CostTblEntry SSE2BoolReduction[] = {
    { ISD::AND, MVT::v2i64, 2 }, // movmskpd + cmp
    { ISD::AND, MVT::v4i32, 2 }, // movmskps + cmp
    { ISD::AND, MVT::v8i16, 2 }, // pmovmskb + cmp
    { ISD::AND, MVT::v16i8, 2 }, // pmovmskb + cmp
    { ISD::OR, MVT::v2i64, 2 }, // movmskpd + cmp
    { ISD::OR, MVT::v4i32, 2 }, // movmskps + cmp
    { ISD::OR, MVT::v8i16, 2 }, // pmovmskb + cmp
    { ISD::OR, MVT::v16i8, 2 }, // pmovmskb + cmp
  };

  // Handle bool allof/anyof patterns.
  if (ValVTy->getElementType()->isIntegerTy(Bitwidth: 1)) {
    InstructionCost ArithmeticCost = 0;
    if (LT.first != 1 && MTy.isVector() &&
        MTy.getVectorNumElements() < ValVTy->getNumElements()) {
      // Type needs to be split. We need LT.first - 1 arithmetic ops.
      auto *SingleOpTy = FixedVectorType::get(ElementType: ValVTy->getElementType(),
                                              NumElts: MTy.getVectorNumElements());
      ArithmeticCost = getArithmeticInstrCost(Opcode, Ty: SingleOpTy, CostKind);
      ArithmeticCost *= LT.first - 1;
    }

    if (ST->hasAVX512())
      if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasAVX2())
      if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasAVX())
      if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;
    if (ST->hasSSE2())
      if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
        return ArithmeticCost + Entry->Cost;

    return BaseT::getArithmeticReductionCost(Opcode, Ty: ValVTy, FMF, CostKind);
  }

  unsigned NumVecElts = ValVTy->getNumElements();
  unsigned ScalarSize = ValVTy->getScalarSizeInBits();

  // Special case power of 2 reductions where the scalar type isn't changed
  // by type legalization.
  if (!isPowerOf2_32(Value: NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
    return BaseT::getArithmeticReductionCost(Opcode, Ty: ValVTy, FMF, CostKind);

  InstructionCost ReductionCost = 0;

  auto *Ty = ValVTy;
  if (LT.first != 1 && MTy.isVector() &&
      MTy.getVectorNumElements() < ValVTy->getNumElements()) {
    // Type needs to be split. We need LT.first - 1 arithmetic ops.
    Ty = FixedVectorType::get(ElementType: ValVTy->getElementType(),
                              NumElts: MTy.getVectorNumElements());
    ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
    ReductionCost *= LT.first - 1;
    NumVecElts = MTy.getVectorNumElements();
  }

  // Now handle reduction with the legal type, taking into account size changes
  // at each level.
  while (NumVecElts > 1) {
    // Determine the size of the remaining vector we need to reduce.
    unsigned Size = NumVecElts * ScalarSize;
    NumVecElts /= 2;
    // If we're reducing from 256/512 bits, use an extract_subvector.
    if (Size > 128) {
      auto *SubTy = FixedVectorType::get(ElementType: ValVTy->getElementType(), NumElts: NumVecElts);
      ReductionCost +=
          getShuffleCost(Kind: TTI::SK_ExtractSubvector, BaseTp: Ty, Mask: std::nullopt, CostKind,
                         Index: NumVecElts, SubTp: SubTy);
      Ty = SubTy;
    } else if (Size == 128) {
      // Reducing from 128 bits is a permute of v2f64/v2i64.
      // NOTE(review): ValVTy is a vector type, so isFloatingPointTy() is
      // always false and the integer ShufTy branch is always taken, even for
      // FP reductions; this likely meant the element type (isFPOrFPVectorTy)
      // — TODO confirm (shuffle costs for v2i64/v2f64 appear to match).
      FixedVectorType *ShufTy;
      if (ValVTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(ElementType: Type::getDoubleTy(C&: ValVTy->getContext()), NumElts: 2);
      else
        ShufTy =
            FixedVectorType::get(ElementType: Type::getInt64Ty(C&: ValVTy->getContext()), NumElts: 2);
      ReductionCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, BaseTp: ShufTy,
                                      Mask: std::nullopt, CostKind, Index: 0, SubTp: nullptr);
    } else if (Size == 64) {
      // Reducing from 64 bits is a shuffle of v4f32/v4i32.
      FixedVectorType *ShufTy;
      if (ValVTy->isFloatingPointTy())
        ShufTy =
            FixedVectorType::get(ElementType: Type::getFloatTy(C&: ValVTy->getContext()), NumElts: 4);
      else
        ShufTy =
            FixedVectorType::get(ElementType: Type::getInt32Ty(C&: ValVTy->getContext()), NumElts: 4);
      ReductionCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, BaseTp: ShufTy,
                                      Mask: std::nullopt, CostKind, Index: 0, SubTp: nullptr);
    } else {
      // Reducing from smaller size is a shift by immediate.
      auto *ShiftTy = FixedVectorType::get(
        ElementType: Type::getIntNTy(C&: ValVTy->getContext(), N: Size), NumElts: 128 / Size);
      ReductionCost += getArithmeticInstrCost(
        Opcode: Instruction::LShr, Ty: ShiftTy, CostKind,
        Op1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Op2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue, .Properties: TargetTransformInfo::OP_None});
    }

    // Add the arithmetic op for this level.
    ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
  }

  // Add the final extract element to the cost.
  return ReductionCost + getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty,
                                            CostKind, Index: 0, Op0: nullptr, Op1: nullptr);
}
5347 | |
5348 | InstructionCost X86TTIImpl::getMinMaxCost(Intrinsic::ID IID, Type *Ty, |
5349 | TTI::TargetCostKind CostKind, |
5350 | FastMathFlags FMF) { |
5351 | IntrinsicCostAttributes ICA(IID, Ty, {Ty, Ty}, FMF); |
5352 | return getIntrinsicInstrCost(ICA, CostKind); |
5353 | } |
5354 | |
5355 | InstructionCost |
5356 | X86TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *ValTy, |
5357 | FastMathFlags FMF, |
5358 | TTI::TargetCostKind CostKind) { |
5359 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
5360 | |
5361 | MVT MTy = LT.second; |
5362 | |
5363 | int ISD; |
5364 | if (ValTy->isIntOrIntVectorTy()) { |
5365 | ISD = (IID == Intrinsic::umin || IID == Intrinsic::umax) ? ISD::UMIN |
5366 | : ISD::SMIN; |
5367 | } else { |
5368 | assert(ValTy->isFPOrFPVectorTy() && |
5369 | "Expected float point or integer vector type." ); |
5370 | ISD = (IID == Intrinsic::minnum || IID == Intrinsic::maxnum) |
5371 | ? ISD::FMINNUM |
5372 | : ISD::FMINIMUM; |
5373 | } |
5374 | |
5375 | // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput |
5376 | // and make it as the cost. |
5377 | |
5378 | static const CostTblEntry SSE2CostTbl[] = { |
5379 | {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw |
5380 | {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw |
5381 | {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw |
5382 | }; |
5383 | |
5384 | static const CostTblEntry SSE41CostTbl[] = { |
5385 | {ISD::SMIN, MVT::v2i16, 3}, // same as sse2 |
5386 | {ISD::SMIN, MVT::v4i16, 5}, // same as sse2 |
5387 | {ISD::UMIN, MVT::v2i16, 5}, // same as sse2 |
5388 | {ISD::UMIN, MVT::v4i16, 7}, // same as sse2 |
5389 | {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor |
5390 | {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax |
5391 | {ISD::SMIN, MVT::v2i8, 3}, // pminsb |
5392 | {ISD::SMIN, MVT::v4i8, 5}, // pminsb |
5393 | {ISD::SMIN, MVT::v8i8, 7}, // pminsb |
5394 | {ISD::SMIN, MVT::v16i8, 6}, |
5395 | {ISD::UMIN, MVT::v2i8, 3}, // same as sse2 |
5396 | {ISD::UMIN, MVT::v4i8, 5}, // same as sse2 |
5397 | {ISD::UMIN, MVT::v8i8, 7}, // same as sse2 |
5398 | {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax |
5399 | }; |
5400 | |
5401 | static const CostTblEntry AVX1CostTbl[] = { |
5402 | {ISD::SMIN, MVT::v16i16, 6}, |
5403 | {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax |
5404 | {ISD::SMIN, MVT::v32i8, 8}, |
5405 | {ISD::UMIN, MVT::v32i8, 8}, |
5406 | }; |
5407 | |
5408 | static const CostTblEntry AVX512BWCostTbl[] = { |
5409 | {ISD::SMIN, MVT::v32i16, 8}, |
5410 | {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax |
5411 | {ISD::SMIN, MVT::v64i8, 10}, |
5412 | {ISD::UMIN, MVT::v64i8, 10}, |
5413 | }; |
5414 | |
5415 | // Before legalizing the type, give a chance to look up illegal narrow types |
5416 | // in the table. |
5417 | // FIXME: Is there a better way to do this? |
5418 | EVT VT = TLI->getValueType(DL, Ty: ValTy); |
5419 | if (VT.isSimple()) { |
5420 | MVT MTy = VT.getSimpleVT(); |
5421 | if (ST->hasBWI()) |
5422 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) |
5423 | return Entry->Cost; |
5424 | |
5425 | if (ST->hasAVX()) |
5426 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) |
5427 | return Entry->Cost; |
5428 | |
5429 | if (ST->hasSSE41()) |
5430 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) |
5431 | return Entry->Cost; |
5432 | |
5433 | if (ST->hasSSE2()) |
5434 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) |
5435 | return Entry->Cost; |
5436 | } |
5437 | |
5438 | auto *ValVTy = cast<FixedVectorType>(Val: ValTy); |
5439 | unsigned NumVecElts = ValVTy->getNumElements(); |
5440 | |
5441 | auto *Ty = ValVTy; |
5442 | InstructionCost MinMaxCost = 0; |
5443 | if (LT.first != 1 && MTy.isVector() && |
5444 | MTy.getVectorNumElements() < ValVTy->getNumElements()) { |
5445 | // Type needs to be split. We need LT.first - 1 operations ops. |
5446 | Ty = FixedVectorType::get(ElementType: ValVTy->getElementType(), |
5447 | NumElts: MTy.getVectorNumElements()); |
5448 | MinMaxCost = getMinMaxCost(IID, Ty, CostKind, FMF); |
5449 | MinMaxCost *= LT.first - 1; |
5450 | NumVecElts = MTy.getVectorNumElements(); |
5451 | } |
5452 | |
5453 | if (ST->hasBWI()) |
5454 | if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy)) |
5455 | return MinMaxCost + Entry->Cost; |
5456 | |
5457 | if (ST->hasAVX()) |
5458 | if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy)) |
5459 | return MinMaxCost + Entry->Cost; |
5460 | |
5461 | if (ST->hasSSE41()) |
5462 | if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy)) |
5463 | return MinMaxCost + Entry->Cost; |
5464 | |
5465 | if (ST->hasSSE2()) |
5466 | if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy)) |
5467 | return MinMaxCost + Entry->Cost; |
5468 | |
5469 | unsigned ScalarSize = ValTy->getScalarSizeInBits(); |
5470 | |
5471 | // Special case power of 2 reductions where the scalar type isn't changed |
5472 | // by type legalization. |
5473 | if (!isPowerOf2_32(Value: ValVTy->getNumElements()) || |
5474 | ScalarSize != MTy.getScalarSizeInBits()) |
5475 | return BaseT::getMinMaxReductionCost(IID, Ty: ValTy, FMF, CostKind); |
5476 | |
5477 | // Now handle reduction with the legal type, taking into account size changes |
5478 | // at each level. |
5479 | while (NumVecElts > 1) { |
5480 | // Determine the size of the remaining vector we need to reduce. |
5481 | unsigned Size = NumVecElts * ScalarSize; |
5482 | NumVecElts /= 2; |
5483 | // If we're reducing from 256/512 bits, use an extract_subvector. |
5484 | if (Size > 128) { |
5485 | auto *SubTy = FixedVectorType::get(ElementType: ValVTy->getElementType(), NumElts: NumVecElts); |
5486 | MinMaxCost += getShuffleCost(Kind: TTI::SK_ExtractSubvector, BaseTp: Ty, Mask: std::nullopt, |
5487 | CostKind, Index: NumVecElts, SubTp: SubTy); |
5488 | Ty = SubTy; |
5489 | } else if (Size == 128) { |
5490 | // Reducing from 128 bits is a permute of v2f64/v2i64. |
5491 | VectorType *ShufTy; |
5492 | if (ValTy->isFloatingPointTy()) |
5493 | ShufTy = |
5494 | FixedVectorType::get(ElementType: Type::getDoubleTy(C&: ValTy->getContext()), NumElts: 2); |
5495 | else |
5496 | ShufTy = FixedVectorType::get(ElementType: Type::getInt64Ty(C&: ValTy->getContext()), NumElts: 2); |
5497 | MinMaxCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, BaseTp: ShufTy, |
5498 | Mask: std::nullopt, CostKind, Index: 0, SubTp: nullptr); |
5499 | } else if (Size == 64) { |
5500 | // Reducing from 64 bits is a shuffle of v4f32/v4i32. |
5501 | FixedVectorType *ShufTy; |
5502 | if (ValTy->isFloatingPointTy()) |
5503 | ShufTy = FixedVectorType::get(ElementType: Type::getFloatTy(C&: ValTy->getContext()), NumElts: 4); |
5504 | else |
5505 | ShufTy = FixedVectorType::get(ElementType: Type::getInt32Ty(C&: ValTy->getContext()), NumElts: 4); |
5506 | MinMaxCost += getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, BaseTp: ShufTy, |
5507 | Mask: std::nullopt, CostKind, Index: 0, SubTp: nullptr); |
5508 | } else { |
5509 | // Reducing from smaller size is a shift by immediate. |
5510 | auto *ShiftTy = FixedVectorType::get( |
5511 | ElementType: Type::getIntNTy(C&: ValTy->getContext(), N: Size), NumElts: 128 / Size); |
5512 | MinMaxCost += getArithmeticInstrCost( |
5513 | Opcode: Instruction::LShr, Ty: ShiftTy, CostKind: TTI::TCK_RecipThroughput, |
5514 | Op1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None}, |
5515 | Op2Info: {.Kind: TargetTransformInfo::OK_UniformConstantValue, .Properties: TargetTransformInfo::OP_None}); |
5516 | } |
5517 | |
5518 | // Add the arithmetic op for this level. |
5519 | MinMaxCost += getMinMaxCost(IID, Ty, CostKind, FMF); |
5520 | } |
5521 | |
5522 | // Add the final extract element to the cost. |
5523 | return MinMaxCost + getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: Ty, |
5524 | CostKind, Index: 0, Op0: nullptr, Op1: nullptr); |
5525 | } |
5526 | |
5527 | /// Calculate the cost of materializing a 64-bit value. This helper |
5528 | /// method might only calculate a fraction of a larger immediate. Therefore it |
5529 | /// is valid to return a cost of ZERO. |
5530 | InstructionCost X86TTIImpl::getIntImmCost(int64_t Val) { |
5531 | if (Val == 0) |
5532 | return TTI::TCC_Free; |
5533 | |
5534 | if (isInt<32>(x: Val)) |
5535 | return TTI::TCC_Basic; |
5536 | |
5537 | return 2 * TTI::TCC_Basic; |
5538 | } |
5539 | |
5540 | InstructionCost X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty, |
5541 | TTI::TargetCostKind CostKind) { |
5542 | assert(Ty->isIntegerTy()); |
5543 | |
5544 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
5545 | if (BitSize == 0) |
5546 | return ~0U; |
5547 | |
5548 | // Never hoist constants larger than 128bit, because this might lead to |
5549 | // incorrect code generation or assertions in codegen. |
5550 | // Fixme: Create a cost model for types larger than i128 once the codegen |
5551 | // issues have been fixed. |
5552 | if (BitSize > 128) |
5553 | return TTI::TCC_Free; |
5554 | |
5555 | if (Imm == 0) |
5556 | return TTI::TCC_Free; |
5557 | |
5558 | // Sign-extend all constants to a multiple of 64-bit. |
5559 | APInt ImmVal = Imm; |
5560 | if (BitSize % 64 != 0) |
5561 | ImmVal = Imm.sext(width: alignTo(Value: BitSize, Align: 64)); |
5562 | |
5563 | // Split the constant into 64-bit chunks and calculate the cost for each |
5564 | // chunk. |
5565 | InstructionCost Cost = 0; |
5566 | for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) { |
5567 | APInt Tmp = ImmVal.ashr(ShiftAmt: ShiftVal).sextOrTrunc(width: 64); |
5568 | int64_t Val = Tmp.getSExtValue(); |
5569 | Cost += getIntImmCost(Val); |
5570 | } |
5571 | // We need at least one instruction to materialize the constant. |
5572 | return std::max<InstructionCost>(a: 1, b: Cost); |
5573 | } |
5574 | |
/// Cost of immediate \p Imm appearing as operand \p Idx of an instruction
/// with the given \p Opcode. Used by ConstantHoisting: TCC_Free means the
/// immediate can be encoded directly in the instruction (or the backend will
/// transform the constant anyway), so hoisting it would not pay off.
InstructionCost X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
                                              const APInt &Imm, Type *Ty,
                                              TTI::TargetCostKind CostKind,
                                              Instruction *Inst) {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
  // here, so that constant hoisting will ignore this constant.
  if (BitSize == 0)
    return TTI::TCC_Free;

  // ImmIdx records which operand index may carry an encodable immediate for
  // this opcode; ~0U means no operand position is immediate-encodable.
  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  default:
    return TTI::TCC_Free;
  case Instruction::GetElementPtr:
    // Always hoist the base address of a GetElementPtr. This prevents the
    // creation of new constants for every base constant that gets constant
    // folded with the offset.
    if (Idx == 0)
      return 2 * TTI::TCC_Basic;
    return TTI::TCC_Free;
  case Instruction::Store:
    // The stored value (operand 0) may be an immediate.
    ImmIdx = 0;
    break;
  case Instruction::ICmp:
    // This is an imperfect hack to prevent constant hoisting of
    // compares that might be trying to check if a 64-bit value fits in
    // 32-bits. The backend can optimize these cases using a right shift by 32.
    // Ideally we would check the compare predicate here. There also other
    // similar immediates the backend can use shifts for.
    if (Idx == 1 && Imm.getBitWidth() == 64) {
      uint64_t ImmVal = Imm.getZExtValue();
      if (ImmVal == 0x100000000ULL || ImmVal == 0xffffffff)
        return TTI::TCC_Free;
    }
    ImmIdx = 1;
    break;
  case Instruction::And:
    // We support 64-bit ANDs with immediates with 32-bits of leading zeroes
    // by using a 32-bit operation with implicit zero extension. Detect such
    // immediates here as the normal path expects bit 31 to be sign extended.
    if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.isIntN(N: 32))
      return TTI::TCC_Free;
    ImmIdx = 1;
    break;
  case Instruction::Add:
  case Instruction::Sub:
    // For add/sub, we can use the opposite instruction for INT32_MIN.
    if (Idx == 1 && Imm.getBitWidth() == 64 && Imm.getZExtValue() == 0x80000000)
      return TTI::TCC_Free;
    ImmIdx = 1;
    break;
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // Division by constant is typically expanded later into a different
    // instruction sequence. This completely changes the constants.
    // Report them as "free" to stop ConstantHoist from marking them as opaque.
    return TTI::TCC_Free;
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Xor:
    ImmIdx = 1;
    break;
  // Always return TCC_Free for the shift value of a shift instruction.
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
    if (Idx == 1)
      return TTI::TCC_Free;
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:
    // These never encode an immediate operand directly; fall through to the
    // generic materialization cost below.
    break;
  }

  if (Idx == ImmIdx) {
    // The immediate sits in an encodable operand slot: it is free unless it
    // is more expensive than materializing its 64-bit chunks one-by-one.
    uint64_t NumConstants = divideCeil(Numerator: BitSize, Denominator: 64);
    InstructionCost Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
    return (Cost <= NumConstants * TTI::TCC_Basic)
               ? static_cast<int>(TTI::TCC_Free)
               : Cost;
  }

  return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
5673 | |
5674 | InstructionCost X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, |
5675 | const APInt &Imm, Type *Ty, |
5676 | TTI::TargetCostKind CostKind) { |
5677 | assert(Ty->isIntegerTy()); |
5678 | |
5679 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); |
5680 | // There is no cost model for constants with a bit size of 0. Return TCC_Free |
5681 | // here, so that constant hoisting will ignore this constant. |
5682 | if (BitSize == 0) |
5683 | return TTI::TCC_Free; |
5684 | |
5685 | switch (IID) { |
5686 | default: |
5687 | return TTI::TCC_Free; |
5688 | case Intrinsic::sadd_with_overflow: |
5689 | case Intrinsic::uadd_with_overflow: |
5690 | case Intrinsic::ssub_with_overflow: |
5691 | case Intrinsic::usub_with_overflow: |
5692 | case Intrinsic::smul_with_overflow: |
5693 | case Intrinsic::umul_with_overflow: |
5694 | if ((Idx == 1) && Imm.getBitWidth() <= 64 && Imm.isSignedIntN(N: 32)) |
5695 | return TTI::TCC_Free; |
5696 | break; |
5697 | case Intrinsic::experimental_stackmap: |
5698 | if ((Idx < 2) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(N: 64))) |
5699 | return TTI::TCC_Free; |
5700 | break; |
5701 | case Intrinsic::experimental_patchpoint_void: |
5702 | case Intrinsic::experimental_patchpoint: |
5703 | if ((Idx < 4) || (Imm.getBitWidth() <= 64 && Imm.isSignedIntN(N: 64))) |
5704 | return TTI::TCC_Free; |
5705 | break; |
5706 | } |
5707 | return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind); |
5708 | } |
5709 | |
5710 | InstructionCost X86TTIImpl::getCFInstrCost(unsigned Opcode, |
5711 | TTI::TargetCostKind CostKind, |
5712 | const Instruction *I) { |
5713 | if (CostKind != TTI::TCK_RecipThroughput) |
5714 | return Opcode == Instruction::PHI ? 0 : 1; |
5715 | // Branches are assumed to be predicted. |
5716 | return 0; |
5717 | } |
5718 | |
5719 | int X86TTIImpl::getGatherOverhead() const { |
5720 | // Some CPUs have more overhead for gather. The specified overhead is relative |
5721 | // to the Load operation. "2" is the number provided by Intel architects. This |
5722 | // parameter is used for cost estimation of Gather Op and comparison with |
5723 | // other alternatives. |
5724 | // TODO: Remove the explicit hasAVX512()?, That would mean we would only |
5725 | // enable gather with a -march. |
5726 | if (ST->hasAVX512() || (ST->hasAVX2() && ST->hasFastGather())) |
5727 | return 2; |
5728 | |
5729 | return 1024; |
5730 | } |
5731 | |
5732 | int X86TTIImpl::getScatterOverhead() const { |
5733 | if (ST->hasAVX512()) |
5734 | return 2; |
5735 | |
5736 | return 1024; |
5737 | } |
5738 | |
5739 | // Return an average cost of Gather / Scatter instruction, maybe improved later. |
5740 | // FIXME: Add TargetCostKind support. |
InstructionCost X86TTIImpl::getGSVectorCost(unsigned Opcode,
                                            TTI::TargetCostKind CostKind,
                                            Type *SrcVTy, const Value *Ptr,
                                            Align Alignment,
                                            unsigned AddressSpace) {

  assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost" );
  unsigned VF = cast<FixedVectorType>(Val: SrcVTy)->getNumElements();

  // Try to reduce index size from 64 bit (default for GEP)
  // to 32. It is essential for VF 16. If the index can't be reduced to 32, the
  // operation will use 16 x 64 indices which do not fit in a zmm and needs
  // to split. Also check that the base pointer is the same for all lanes,
  // and that there's at most one variable index.
  auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
    unsigned IndexSize = DL.getPointerSizeInBits();
    const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr);
    // Without a GEP (or if pointers are already narrow) keep the default.
    if (IndexSize < 64 || !GEP)
      return IndexSize;

    unsigned NumOfVarIndices = 0;
    const Value *Ptrs = GEP->getPointerOperand();
    // A vector of different base pointers cannot use a narrowed index.
    if (Ptrs->getType()->isVectorTy() && !getSplatValue(V: Ptrs))
      return IndexSize;
    for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I) {
      // Constant indices never force a wide index type.
      if (isa<Constant>(Val: GEP->getOperand(i_nocapture: I)))
        continue;
      Type *IndxTy = GEP->getOperand(i_nocapture: I)->getType();
      if (auto *IndexVTy = dyn_cast<VectorType>(Val: IndxTy))
        IndxTy = IndexVTy->getElementType();
      // A genuinely 64-bit (non-sign-extended) index, or more than one
      // variable index, keeps the full pointer-sized index.
      if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
           !isa<SExtInst>(Val: GEP->getOperand(i_nocapture: I))) ||
          ++NumOfVarIndices > 1)
        return IndexSize; // 64
    }
    // All indices are constant or sign-extended narrow values: 32 bits do.
    return (unsigned)32;
  };

  // Trying to reduce IndexSize to 32 bits for vector 16.
  // By default the IndexSize is equal to pointer size.
  unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
                           ? getIndexSizeInBits(Ptr, DL)
                           : DL.getPointerSizeInBits();

  auto *IndexVTy = FixedVectorType::get(
      ElementType: IntegerType::get(C&: SrcVTy->getContext(), NumBits: IndexSize), NumElts: VF);
  std::pair<InstructionCost, MVT> IdxsLT = getTypeLegalizationCost(Ty: IndexVTy);
  std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: SrcVTy);
  // Whichever of the index or data vector legalizes into more pieces
  // determines how many gathers/scatters are actually emitted.
  InstructionCost::CostType SplitFactor =
      *std::max(a: IdxsLT.first, b: SrcLT.first).getValue();
  if (SplitFactor > 1) {
    // Handle splitting of vector of pointers
    // Recurse on the narrower per-piece type and scale by the split count.
    auto *SplitSrcTy =
        FixedVectorType::get(ElementType: SrcVTy->getScalarType(), NumElts: VF / SplitFactor);
    return SplitFactor * getGSVectorCost(Opcode, CostKind, SrcVTy: SplitSrcTy, Ptr,
                                         Alignment, AddressSpace);
  }

  // The gather / scatter cost is given by Intel architects. It is a rough
  // number since we are looking at one instruction in a time.
  const int GSOverhead = (Opcode == Instruction::Load)
                             ? getGatherOverhead()
                             : getScatterOverhead();
  return GSOverhead + VF * getMemoryOpCost(Opcode, Src: SrcVTy->getScalarType(),
                                           Alignment: MaybeAlign(Alignment), AddressSpace,
                                           CostKind: TTI::TCK_RecipThroughput);
}
5808 | |
5809 | /// Return the cost of full scalarization of gather / scatter operation. |
5810 | /// |
5811 | /// Opcode - Load or Store instruction. |
5812 | /// SrcVTy - The type of the data vector that should be gathered or scattered. |
5813 | /// VariableMask - The mask is non-constant at compile time. |
5814 | /// Alignment - Alignment for one element. |
5815 | /// AddressSpace - pointer[s] address space. |
5816 | /// TODO: Remove this and use getCommonMaskedMemoryOpCost directly. |
5817 | InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode, |
5818 | TTI::TargetCostKind CostKind, |
5819 | Type *SrcVTy, bool VariableMask, |
5820 | Align Alignment, |
5821 | unsigned AddressSpace) { |
5822 | Type *ScalarTy = SrcVTy->getScalarType(); |
5823 | unsigned VF = cast<FixedVectorType>(Val: SrcVTy)->getNumElements(); |
5824 | APInt DemandedElts = APInt::getAllOnes(numBits: VF); |
5825 | |
5826 | InstructionCost MaskUnpackCost = 0; |
5827 | if (VariableMask) { |
5828 | auto *MaskTy = |
5829 | FixedVectorType::get(ElementType: Type::getInt1Ty(C&: SrcVTy->getContext()), NumElts: VF); |
5830 | MaskUnpackCost = getScalarizationOverhead( |
5831 | Ty: MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind); |
5832 | InstructionCost ScalarCompareCost = getCmpSelInstrCost( |
5833 | Opcode: Instruction::ICmp, ValTy: Type::getInt1Ty(C&: SrcVTy->getContext()), CondTy: nullptr, |
5834 | VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind); |
5835 | InstructionCost BranchCost = getCFInstrCost(Opcode: Instruction::Br, CostKind); |
5836 | MaskUnpackCost += VF * (BranchCost + ScalarCompareCost); |
5837 | } |
5838 | |
5839 | InstructionCost AddressUnpackCost = getScalarizationOverhead( |
5840 | Ty: FixedVectorType::get(ElementType: PointerType::getUnqual(C&: ScalarTy->getContext()), NumElts: VF), |
5841 | DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind); |
5842 | |
5843 | // The cost of the scalar loads/stores. |
5844 | InstructionCost MemoryOpCost = |
5845 | VF * getMemoryOpCost(Opcode, Src: ScalarTy, Alignment: MaybeAlign(Alignment), |
5846 | AddressSpace, CostKind); |
5847 | |
5848 | // The cost of forming the vector from loaded scalars/ |
5849 | // scalarizing the vector to perform scalar stores. |
5850 | InstructionCost = getScalarizationOverhead( |
5851 | Ty: cast<FixedVectorType>(Val: SrcVTy), DemandedElts, |
5852 | /*Insert=*/Opcode == Instruction::Load, |
5853 | /*Extract=*/Opcode == Instruction::Store, CostKind); |
5854 | |
5855 | return AddressUnpackCost + MemoryOpCost + MaskUnpackCost + InsertExtractCost; |
5856 | } |
5857 | |
5858 | /// Calculate the cost of Gather / Scatter operation |
5859 | InstructionCost X86TTIImpl::getGatherScatterOpCost( |
5860 | unsigned Opcode, Type *SrcVTy, const Value *Ptr, bool VariableMask, |
5861 | Align Alignment, TTI::TargetCostKind CostKind, |
5862 | const Instruction *I = nullptr) { |
5863 | if (CostKind != TTI::TCK_RecipThroughput) { |
5864 | if ((Opcode == Instruction::Load && |
5865 | isLegalMaskedGather(DataType: SrcVTy, Alignment: Align(Alignment)) && |
5866 | !forceScalarizeMaskedGather(VTy: cast<VectorType>(Val: SrcVTy), |
5867 | Alignment: Align(Alignment))) || |
5868 | (Opcode == Instruction::Store && |
5869 | isLegalMaskedScatter(DataType: SrcVTy, Alignment: Align(Alignment)) && |
5870 | !forceScalarizeMaskedScatter(VTy: cast<VectorType>(Val: SrcVTy), |
5871 | Alignment: Align(Alignment)))) |
5872 | return 1; |
5873 | return BaseT::getGatherScatterOpCost(Opcode, DataTy: SrcVTy, Ptr, VariableMask, |
5874 | Alignment, CostKind, I); |
5875 | } |
5876 | |
5877 | assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter" ); |
5878 | PointerType *PtrTy = dyn_cast<PointerType>(Val: Ptr->getType()); |
5879 | if (!PtrTy && Ptr->getType()->isVectorTy()) |
5880 | PtrTy = dyn_cast<PointerType>( |
5881 | Val: cast<VectorType>(Val: Ptr->getType())->getElementType()); |
5882 | assert(PtrTy && "Unexpected type for Ptr argument" ); |
5883 | unsigned AddressSpace = PtrTy->getAddressSpace(); |
5884 | |
5885 | if ((Opcode == Instruction::Load && |
5886 | (!isLegalMaskedGather(DataType: SrcVTy, Alignment: Align(Alignment)) || |
5887 | forceScalarizeMaskedGather(VTy: cast<VectorType>(Val: SrcVTy), |
5888 | Alignment: Align(Alignment)))) || |
5889 | (Opcode == Instruction::Store && |
5890 | (!isLegalMaskedScatter(DataType: SrcVTy, Alignment: Align(Alignment)) || |
5891 | forceScalarizeMaskedScatter(VTy: cast<VectorType>(Val: SrcVTy), |
5892 | Alignment: Align(Alignment))))) |
5893 | return getGSScalarCost(Opcode, CostKind, SrcVTy, VariableMask, Alignment, |
5894 | AddressSpace); |
5895 | |
5896 | return getGSVectorCost(Opcode, CostKind, SrcVTy, Ptr, Alignment, |
5897 | AddressSpace); |
5898 | } |
5899 | |
5900 | bool X86TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, |
5901 | const TargetTransformInfo::LSRCost &C2) { |
5902 | // X86 specific here are "instruction number 1st priority". |
5903 | return std::tie(args: C1.Insns, args: C1.NumRegs, args: C1.AddRecCost, |
5904 | args: C1.NumIVMuls, args: C1.NumBaseAdds, |
5905 | args: C1.ScaleCost, args: C1.ImmCost, args: C1.SetupCost) < |
5906 | std::tie(args: C2.Insns, args: C2.NumRegs, args: C2.AddRecCost, |
5907 | args: C2.NumIVMuls, args: C2.NumBaseAdds, |
5908 | args: C2.ScaleCost, args: C2.ImmCost, args: C2.SetupCost); |
5909 | } |
5910 | |
5911 | bool X86TTIImpl::canMacroFuseCmp() { |
5912 | return ST->hasMacroFusion() || ST->hasBranchFusion(); |
5913 | } |
5914 | |
5915 | bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) { |
5916 | if (!ST->hasAVX()) |
5917 | return false; |
5918 | |
5919 | // The backend can't handle a single element vector. |
5920 | if (isa<VectorType>(Val: DataTy) && |
5921 | cast<FixedVectorType>(Val: DataTy)->getNumElements() == 1) |
5922 | return false; |
5923 | Type *ScalarTy = DataTy->getScalarType(); |
5924 | |
5925 | if (ScalarTy->isPointerTy()) |
5926 | return true; |
5927 | |
5928 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) |
5929 | return true; |
5930 | |
5931 | if (ScalarTy->isHalfTy() && ST->hasBWI()) |
5932 | return true; |
5933 | |
5934 | if (ScalarTy->isBFloatTy() && ST->hasBF16()) |
5935 | return true; |
5936 | |
5937 | if (!ScalarTy->isIntegerTy()) |
5938 | return false; |
5939 | |
5940 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); |
5941 | return IntWidth == 32 || IntWidth == 64 || |
5942 | ((IntWidth == 8 || IntWidth == 16) && ST->hasBWI()); |
5943 | } |
5944 | |
5945 | bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) { |
5946 | return isLegalMaskedLoad(DataTy: DataType, Alignment); |
5947 | } |
5948 | |
5949 | bool X86TTIImpl::isLegalNTLoad(Type *DataType, Align Alignment) { |
5950 | unsigned DataSize = DL.getTypeStoreSize(Ty: DataType); |
5951 | // The only supported nontemporal loads are for aligned vectors of 16 or 32 |
5952 | // bytes. Note that 32-byte nontemporal vector loads are supported by AVX2 |
5953 | // (the equivalent stores only require AVX). |
5954 | if (Alignment >= DataSize && (DataSize == 16 || DataSize == 32)) |
5955 | return DataSize == 16 ? ST->hasSSE1() : ST->hasAVX2(); |
5956 | |
5957 | return false; |
5958 | } |
5959 | |
5960 | bool X86TTIImpl::isLegalNTStore(Type *DataType, Align Alignment) { |
5961 | unsigned DataSize = DL.getTypeStoreSize(Ty: DataType); |
5962 | |
5963 | // SSE4A supports nontemporal stores of float and double at arbitrary |
5964 | // alignment. |
5965 | if (ST->hasSSE4A() && (DataType->isFloatTy() || DataType->isDoubleTy())) |
5966 | return true; |
5967 | |
5968 | // Besides the SSE4A subtarget exception above, only aligned stores are |
5969 | // available nontemporaly on any other subtarget. And only stores with a size |
5970 | // of 4..32 bytes (powers of 2, only) are permitted. |
5971 | if (Alignment < DataSize || DataSize < 4 || DataSize > 32 || |
5972 | !isPowerOf2_32(Value: DataSize)) |
5973 | return false; |
5974 | |
5975 | // 32-byte vector nontemporal stores are supported by AVX (the equivalent |
5976 | // loads require AVX2). |
5977 | if (DataSize == 32) |
5978 | return ST->hasAVX(); |
5979 | if (DataSize == 16) |
5980 | return ST->hasSSE1(); |
5981 | return true; |
5982 | } |
5983 | |
5984 | bool X86TTIImpl::isLegalBroadcastLoad(Type *ElementTy, |
5985 | ElementCount NumElements) const { |
5986 | // movddup |
5987 | return ST->hasSSE3() && !NumElements.isScalable() && |
5988 | NumElements.getFixedValue() == 2 && |
5989 | ElementTy == Type::getDoubleTy(C&: ElementTy->getContext()); |
5990 | } |
5991 | |
5992 | bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) { |
5993 | if (!isa<VectorType>(Val: DataTy)) |
5994 | return false; |
5995 | |
5996 | if (!ST->hasAVX512()) |
5997 | return false; |
5998 | |
5999 | // The backend can't handle a single element vector. |
6000 | if (cast<FixedVectorType>(Val: DataTy)->getNumElements() == 1) |
6001 | return false; |
6002 | |
6003 | Type *ScalarTy = cast<VectorType>(Val: DataTy)->getElementType(); |
6004 | |
6005 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) |
6006 | return true; |
6007 | |
6008 | if (!ScalarTy->isIntegerTy()) |
6009 | return false; |
6010 | |
6011 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); |
6012 | return IntWidth == 32 || IntWidth == 64 || |
6013 | ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2()); |
6014 | } |
6015 | |
6016 | bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy, Align Alignment) { |
6017 | return isLegalMaskedExpandLoad(DataTy, Alignment); |
6018 | } |
6019 | |
6020 | bool X86TTIImpl::supportsGather() const { |
6021 | // Some CPUs have better gather performance than others. |
6022 | // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only |
6023 | // enable gather with a -march. |
6024 | return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()); |
6025 | } |
6026 | |
6027 | bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) { |
6028 | // Gather / Scatter for vector 2 is not profitable on KNL / SKX |
6029 | // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend |
6030 | // it to 8 elements, but zeroing upper bits of the mask vector will add more |
6031 | // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO: |
6032 | // Check, maybe the gather/scatter instruction is better in the VariableMask |
6033 | // case. |
6034 | unsigned NumElts = cast<FixedVectorType>(Val: VTy)->getNumElements(); |
6035 | return NumElts == 1 || |
6036 | (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX()))); |
6037 | } |
6038 | |
6039 | bool X86TTIImpl::isLegalMaskedGatherScatter(Type *DataTy, Align Alignment) { |
6040 | Type *ScalarTy = DataTy->getScalarType(); |
6041 | if (ScalarTy->isPointerTy()) |
6042 | return true; |
6043 | |
6044 | if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) |
6045 | return true; |
6046 | |
6047 | if (!ScalarTy->isIntegerTy()) |
6048 | return false; |
6049 | |
6050 | unsigned IntWidth = ScalarTy->getIntegerBitWidth(); |
6051 | return IntWidth == 32 || IntWidth == 64; |
6052 | } |
6053 | |
6054 | bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) { |
6055 | if (!supportsGather() || !ST->preferGather()) |
6056 | return false; |
6057 | return isLegalMaskedGatherScatter(DataTy, Alignment); |
6058 | } |
6059 | |
6060 | bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, |
6061 | unsigned Opcode1, |
6062 | const SmallBitVector &OpcodeMask) const { |
6063 | // ADDSUBPS 4xf32 SSE3 |
6064 | // VADDSUBPS 4xf32 AVX |
6065 | // VADDSUBPS 8xf32 AVX2 |
6066 | // ADDSUBPD 2xf64 SSE3 |
6067 | // VADDSUBPD 2xf64 AVX |
6068 | // VADDSUBPD 4xf64 AVX2 |
6069 | |
6070 | unsigned NumElements = cast<FixedVectorType>(Val: VecTy)->getNumElements(); |
6071 | assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible" ); |
6072 | if (!isPowerOf2_32(Value: NumElements)) |
6073 | return false; |
6074 | // Check the opcode pattern. We apply the mask on the opcode arguments and |
6075 | // then check if it is what we expect. |
6076 | for (int Lane : seq<int>(Begin: 0, End: NumElements)) { |
6077 | unsigned Opc = OpcodeMask.test(Idx: Lane) ? Opcode1 : Opcode0; |
6078 | // We expect FSub for even lanes and FAdd for odd lanes. |
6079 | if (Lane % 2 == 0 && Opc != Instruction::FSub) |
6080 | return false; |
6081 | if (Lane % 2 == 1 && Opc != Instruction::FAdd) |
6082 | return false; |
6083 | } |
6084 | // Now check that the pattern is supported by the target ISA. |
6085 | Type *ElemTy = cast<VectorType>(Val: VecTy)->getElementType(); |
6086 | if (ElemTy->isFloatTy()) |
6087 | return ST->hasSSE3() && NumElements % 4 == 0; |
6088 | if (ElemTy->isDoubleTy()) |
6089 | return ST->hasSSE3() && NumElements % 2 == 0; |
6090 | return false; |
6091 | } |
6092 | |
6093 | bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) { |
6094 | // AVX2 doesn't support scatter |
6095 | if (!ST->hasAVX512() || !ST->preferScatter()) |
6096 | return false; |
6097 | return isLegalMaskedGatherScatter(DataTy: DataType, Alignment); |
6098 | } |
6099 | |
6100 | bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { |
6101 | EVT VT = TLI->getValueType(DL, Ty: DataType); |
6102 | return TLI->isOperationLegal(Op: IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); |
6103 | } |
6104 | |
6105 | bool X86TTIImpl::isExpensiveToSpeculativelyExecute(const Instruction* I) { |
6106 | // FDIV is always expensive, even if it has a very low uop count. |
6107 | // TODO: Still necessary for recent CPUs with low latency/throughput fdiv? |
6108 | if (I->getOpcode() == Instruction::FDiv) |
6109 | return true; |
6110 | |
6111 | return BaseT::isExpensiveToSpeculativelyExecute(I); |
6112 | } |
6113 | |
bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
  // Unconditionally false: x86 does not model an ordered self-compare
  // (fcmp ord x, x) as cheaper than a compare against zero for any type.
  return false;
}
6117 | |
bool X86TTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();

  // Work this as a subsetting of subtarget features.
  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  // Check whether features are the same (apart from the ignore list).
  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
  if (RealCallerBits == RealCalleeBits)
    return true;

  // If the features are a subset, we need to additionally check for calls
  // that may become ABI-incompatible as a result of inlining.
  // Callee requiring features the caller lacks is always incompatible.
  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
    return false;

  // Callee's features are a strict subset of the caller's; scan its calls
  // for argument/return types whose passing convention could change.
  for (const Instruction &I : instructions(F: Callee)) {
    if (const auto *CB = dyn_cast<CallBase>(Val: &I)) {
      // Having more target features is fine for inline ASM.
      if (CB->isInlineAsm())
        continue;

      // Collect all argument types plus the (non-void) return type.
      SmallVector<Type *, 8> Types;
      for (Value *Arg : CB->args())
        Types.push_back(Elt: Arg->getType());
      if (!CB->getType()->isVoidTy())
        Types.push_back(Elt: CB->getType());

      // Simple types are always ABI compatible.
      auto IsSimpleTy = [](Type *Ty) {
        return !Ty->isVectorTy() && !Ty->isAggregateType();
      };
      if (all_of(Range&: Types, P: IsSimpleTy))
        continue;

      if (Function *NestedCallee = CB->getCalledFunction()) {
        // Assume that intrinsics are always ABI compatible.
        if (NestedCallee->isIntrinsic())
          continue;

        // Do a precise compatibility check.
        if (!areTypesABICompatible(Caller, Callee: NestedCallee, Type: Types))
          return false;
      } else {
        // We don't know the target features of the callee,
        // assume it is incompatible.
        return false;
      }
    }
  }
  return true;
}
6175 | |
bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
                                       const Function *Callee,
                                       const ArrayRef<Type *> &Types) const {
  // First apply the target-independent compatibility rules.
  if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
    return false;

  // If we get here, we know the target features match. If one function
  // considers 512-bit vectors legal and the other does not, consider them
  // incompatible.
  const TargetMachine &TM = getTLI()->getTargetMachine();

  // Same AVX-512 register availability on both sides: fully compatible.
  if (TM.getSubtarget<X86Subtarget>(F: *Caller).useAVX512Regs() ==
      TM.getSubtarget<X86Subtarget>(F: *Callee).useAVX512Regs())
    return true;

  // Consider the arguments compatible if they aren't vectors or aggregates.
  // FIXME: Look at the size of vectors.
  // FIXME: Look at the element types of aggregates to see if there are vectors.
  return llvm::none_of(Range: Types,
      P: [](Type *T) { return T->isVectorTy() || T->isAggregateType(); });
}
6197 | |
6198 | X86TTIImpl::TTI::MemCmpExpansionOptions |
6199 | X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { |
6200 | TTI::MemCmpExpansionOptions Options; |
6201 | Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); |
6202 | Options.NumLoadsPerBlock = 2; |
6203 | // All GPR and vector loads can be unaligned. |
6204 | Options.AllowOverlappingLoads = true; |
6205 | if (IsZeroCmp) { |
6206 | // Only enable vector loads for equality comparison. Right now the vector |
6207 | // version is not as fast for three way compare (see #33329). |
6208 | const unsigned PreferredWidth = ST->getPreferVectorWidth(); |
6209 | if (PreferredWidth >= 512 && ST->hasAVX512() && ST->hasEVEX512()) |
6210 | Options.LoadSizes.push_back(Elt: 64); |
6211 | if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(Elt: 32); |
6212 | if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(Elt: 16); |
6213 | } |
6214 | if (ST->is64Bit()) { |
6215 | Options.LoadSizes.push_back(Elt: 8); |
6216 | } |
6217 | Options.LoadSizes.push_back(Elt: 4); |
6218 | Options.LoadSizes.push_back(Elt: 2); |
6219 | Options.LoadSizes.push_back(Elt: 1); |
6220 | return Options; |
6221 | } |
6222 | |
bool X86TTIImpl::prefersVectorizedAddressing() const {
  // Vectorized addressing is only preferred when the subtarget supports
  // (fast) gathers; otherwise scalar addressing is the better default.
  return supportsGather();
}
6226 | |
bool X86TTIImpl::supportsEfficientVectorElementLoadStore() const {
  // Unconditionally false: the target does not claim efficient per-element
  // vector loads/stores, keeping the conservative default cost behavior.
  return false;
}
6230 | |
6231 | bool X86TTIImpl::enableInterleavedAccessVectorization() { |
6232 | // TODO: We expect this to be beneficial regardless of arch, |
6233 | // but there are currently some unexplained performance artifacts on Atom. |
6234 | // As a temporary solution, disable on Atom. |
6235 | return !(ST->isAtom()); |
6236 | } |
6237 | |
6238 | // Get estimation for interleaved load/store operations and strided load. |
6239 | // \p Indices contains indices for strided load. |
6240 | // \p Factor - the factor of interleaving. |
6241 | // AVX-512 provides 3-src shuffles that significantly reduces the cost. |
InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512(
    unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
    ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
    TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
  // VecTy for interleave memop is <VF*Factor x Elt>.
  // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
  // VecTy = <12 x i32>.

  // Calculate the number of memory operations (NumOfMemOps), required
  // for load/store the VecTy.
  MVT LegalVT = getTypeLegalizationCost(Ty: VecTy).second;
  unsigned VecTySize = DL.getTypeStoreSize(Ty: VecTy);
  unsigned LegalVTSize = LegalVT.getStoreSize();
  // Ceiling division: how many legal-sized memops cover the whole vector.
  unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;

  // Get the cost of one memory operation.
  auto *SingleMemOpTy = FixedVectorType::get(ElementType: VecTy->getElementType(),
                                             NumElts: LegalVT.getVectorNumElements());
  InstructionCost MemOpCost;
  bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps;
  // A masked memop is costed differently from a plain one.
  if (UseMaskedMemOp)
    MemOpCost = getMaskedMemoryOpCost(Opcode, SrcTy: SingleMemOpTy, Alignment,
                                      AddressSpace, CostKind);
  else
    MemOpCost = getMemoryOpCost(Opcode, Src: SingleMemOpTy, Alignment: MaybeAlign(Alignment),
                                AddressSpace, CostKind);

  // VF is the number of elements per interleave member.
  unsigned VF = VecTy->getNumElements() / Factor;
  MVT VT = MVT::getVectorVT(VT: MVT::getVT(Ty: VecTy->getScalarType()), NumElements: VF);

  // Cost of materializing/replicating the i1 mask, if a mask is used.
  InstructionCost MaskCost;
  if (UseMaskedMemOp) {
    // Mark exactly the lanes belonging to the accessed interleave members.
    APInt DemandedLoadStoreElts = APInt::getZero(numBits: VecTy->getNumElements());
    for (unsigned Index : Indices) {
      assert(Index < Factor && "Invalid index for interleaved memory op" );
      for (unsigned Elm = 0; Elm < VF; Elm++)
        DemandedLoadStoreElts.setBit(Index + Elm * Factor);
    }

    Type *I1Type = Type::getInt1Ty(C&: VecTy->getContext());

    MaskCost = getReplicationShuffleCost(
        EltTy: I1Type, ReplicationFactor: Factor, VF,
        DemandedDstElts: UseMaskForGaps ? DemandedLoadStoreElts
                       : APInt::getAllOnes(numBits: VecTy->getNumElements()),
        CostKind);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps) {
      auto *MaskVT = FixedVectorType::get(ElementType: I1Type, NumElts: VecTy->getNumElements());
      MaskCost += getArithmeticInstrCost(Opcode: BinaryOperator::And, Ty: MaskVT, CostKind);
    }
  }

  if (Opcode == Instruction::Load) {
    // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
    // contain the cost of the optimized shuffle sequence that the
    // X86InterleavedAccess pass will generate.
    // The cost of loads and stores are computed separately from the table.

    // X86InterleavedAccess support only the following interleaved-access group.
    static const CostTblEntry AVX512InterleavedLoadTbl[] = {
        {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
        {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
        {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8
    };

    if (const auto *Entry =
            CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
      return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
    //If an entry does not exist, fallback to the default implementation.

    // Kind of shuffle depends on number of loaded values.
    // If we load the entire data in one register, we can use a 1-src shuffle.
    // Otherwise, we'll merge 2 sources in each operation.
    TTI::ShuffleKind ShuffleKind =
        (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc;

    InstructionCost ShuffleCost = getShuffleCost(
        Kind: ShuffleKind, BaseTp: SingleMemOpTy, Mask: std::nullopt, CostKind, Index: 0, SubTp: nullptr);

    // If Indices is empty, all interleave members are loaded.
    unsigned NumOfLoadsInInterleaveGrp =
        Indices.size() ? Indices.size() : Factor;
    auto *ResultTy = FixedVectorType::get(ElementType: VecTy->getElementType(),
                                          NumElts: VecTy->getNumElements() / Factor);
    // Total legalized result registers across all loaded members.
    InstructionCost NumOfResults =
        getTypeLegalizationCost(Ty: ResultTy).first * NumOfLoadsInInterleaveGrp;

    // About a half of the loads may be folded in shuffles when we have only
    // one result. If we have more than one result, or the loads are masked,
    // we do not fold loads at all.
    unsigned NumOfUnfoldedLoads =
        UseMaskedMemOp || NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2;

    // Get a number of shuffle operations per result.
    unsigned NumOfShufflesPerResult =
        std::max(a: (unsigned)1, b: (unsigned)(NumOfMemOps - 1));

    // The SK_MergeTwoSrc shuffle clobbers one of src operands.
    // When we have more than one destination, we need additional instructions
    // to keep sources.
    InstructionCost NumOfMoves = 0;
    if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
      NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;

    InstructionCost Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
                           MaskCost + NumOfUnfoldedLoads * MemOpCost +
                           NumOfMoves;

    return Cost;
  }

  // Store.
  assert(Opcode == Instruction::Store &&
         "Expected Store Instruction at this point" );
  // X86InterleavedAccess support only the following interleaved-access group.
  static const CostTblEntry AVX512InterleavedStoreTbl[] = {
      {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
      {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
      {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store)

      {4, MVT::v8i8, 10},  // interleave 4 x 8i8 into 32i8 (and store)
      {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
      {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
      {4, MVT::v64i8, 24}  // interleave 4 x 32i8 into 256i8 (and store)
  };

  if (const auto *Entry =
          CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
    return MaskCost + NumOfMemOps * MemOpCost + Entry->Cost;
  //If an entry does not exist, fallback to the default implementation.

  // There is no strided stores meanwhile. And store can't be folded in
  // shuffle.
  unsigned NumOfSources = Factor; // The number of values to be merged.
  InstructionCost ShuffleCost = getShuffleCost(
      Kind: TTI::SK_PermuteTwoSrc, BaseTp: SingleMemOpTy, Mask: std::nullopt, CostKind, Index: 0, SubTp: nullptr);
  unsigned NumOfShufflesPerStore = NumOfSources - 1;

  // The SK_MergeTwoSrc shuffle clobbers one of src operands.
  // We need additional instructions to keep sources.
  unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
  InstructionCost Cost =
      MaskCost +
      NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
      NumOfMoves;
  return Cost;
}
6394 | |
6395 | InstructionCost X86TTIImpl::getInterleavedMemoryOpCost( |
6396 | unsigned Opcode, Type *BaseTy, unsigned Factor, ArrayRef<unsigned> Indices, |
6397 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
6398 | bool UseMaskForCond, bool UseMaskForGaps) { |
6399 | auto *VecTy = cast<FixedVectorType>(Val: BaseTy); |
6400 | |
6401 | auto isSupportedOnAVX512 = [&](Type *VecTy) { |
6402 | Type *EltTy = cast<VectorType>(Val: VecTy)->getElementType(); |
6403 | if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(Bitwidth: 64) || |
6404 | EltTy->isIntegerTy(Bitwidth: 32) || EltTy->isPointerTy()) |
6405 | return true; |
6406 | if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy()) |
6407 | return ST->hasBWI(); |
6408 | if (EltTy->isBFloatTy()) |
6409 | return ST->hasBF16(); |
6410 | return false; |
6411 | }; |
6412 | if (ST->hasAVX512() && isSupportedOnAVX512(VecTy)) |
6413 | return getInterleavedMemoryOpCostAVX512( |
6414 | Opcode, VecTy, Factor, Indices, Alignment, |
6415 | AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps); |
6416 | |
6417 | if (UseMaskForCond || UseMaskForGaps) |
6418 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
6419 | Alignment, AddressSpace, CostKind, |
6420 | UseMaskForCond, UseMaskForGaps); |
6421 | |
6422 | // Get estimation for interleaved load/store operations for SSE-AVX2. |
6423 | // As opposed to AVX-512, SSE-AVX2 do not have generic shuffles that allow |
6424 | // computing the cost using a generic formula as a function of generic |
6425 | // shuffles. We therefore use a lookup table instead, filled according to |
6426 | // the instruction sequences that codegen currently generates. |
6427 | |
6428 | // VecTy for interleave memop is <VF*Factor x Elt>. |
6429 | // So, for VF=4, Interleave Factor = 3, Element type = i32 we have |
6430 | // VecTy = <12 x i32>. |
6431 | MVT LegalVT = getTypeLegalizationCost(Ty: VecTy).second; |
6432 | |
6433 | // This function can be called with VecTy=<6xi128>, Factor=3, in which case |
6434 | // the VF=2, while v2i128 is an unsupported MVT vector type |
6435 | // (see MachineValueType.h::getVectorVT()). |
6436 | if (!LegalVT.isVector()) |
6437 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
6438 | Alignment, AddressSpace, CostKind); |
6439 | |
6440 | unsigned VF = VecTy->getNumElements() / Factor; |
6441 | Type *ScalarTy = VecTy->getElementType(); |
6442 | // Deduplicate entries, model floats/pointers as appropriately-sized integers. |
6443 | if (!ScalarTy->isIntegerTy()) |
6444 | ScalarTy = |
6445 | Type::getIntNTy(C&: ScalarTy->getContext(), N: DL.getTypeSizeInBits(Ty: ScalarTy)); |
6446 | |
6447 | // Get the cost of all the memory operations. |
6448 | // FIXME: discount dead loads. |
6449 | InstructionCost MemOpCosts = getMemoryOpCost( |
6450 | Opcode, Src: VecTy, Alignment: MaybeAlign(Alignment), AddressSpace, CostKind); |
6451 | |
6452 | auto *VT = FixedVectorType::get(ElementType: ScalarTy, NumElts: VF); |
6453 | EVT ETy = TLI->getValueType(DL, Ty: VT); |
6454 | if (!ETy.isSimple()) |
6455 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
6456 | Alignment, AddressSpace, CostKind); |
6457 | |
6458 | // TODO: Complete for other data-types and strides. |
6459 | // Each combination of Stride, element bit width and VF results in a different |
6460 | // sequence; The cost tables are therefore accessed with: |
6461 | // Factor (stride) and VectorType=VFxiN. |
6462 | // The Cost accounts only for the shuffle sequence; |
6463 | // The cost of the loads/stores is accounted for separately. |
6464 | // |
6465 | static const CostTblEntry AVX2InterleavedLoadTbl[] = { |
6466 | {2, MVT::v2i8, 2}, // (load 4i8 and) deinterleave into 2 x 2i8 |
6467 | {2, MVT::v4i8, 2}, // (load 8i8 and) deinterleave into 2 x 4i8 |
6468 | {2, MVT::v8i8, 2}, // (load 16i8 and) deinterleave into 2 x 8i8 |
6469 | {2, MVT::v16i8, 4}, // (load 32i8 and) deinterleave into 2 x 16i8 |
6470 | {2, MVT::v32i8, 6}, // (load 64i8 and) deinterleave into 2 x 32i8 |
6471 | |
6472 | {2, MVT::v8i16, 6}, // (load 16i16 and) deinterleave into 2 x 8i16 |
6473 | {2, MVT::v16i16, 9}, // (load 32i16 and) deinterleave into 2 x 16i16 |
6474 | {2, MVT::v32i16, 18}, // (load 64i16 and) deinterleave into 2 x 32i16 |
6475 | |
6476 | {2, MVT::v8i32, 4}, // (load 16i32 and) deinterleave into 2 x 8i32 |
6477 | {2, MVT::v16i32, 8}, // (load 32i32 and) deinterleave into 2 x 16i32 |
6478 | {2, MVT::v32i32, 16}, // (load 64i32 and) deinterleave into 2 x 32i32 |
6479 | |
6480 | {2, MVT::v4i64, 4}, // (load 8i64 and) deinterleave into 2 x 4i64 |
6481 | {2, MVT::v8i64, 8}, // (load 16i64 and) deinterleave into 2 x 8i64 |
6482 | {2, MVT::v16i64, 16}, // (load 32i64 and) deinterleave into 2 x 16i64 |
6483 | {2, MVT::v32i64, 32}, // (load 64i64 and) deinterleave into 2 x 32i64 |
6484 | |
6485 | {3, MVT::v2i8, 3}, // (load 6i8 and) deinterleave into 3 x 2i8 |
6486 | {3, MVT::v4i8, 3}, // (load 12i8 and) deinterleave into 3 x 4i8 |
6487 | {3, MVT::v8i8, 6}, // (load 24i8 and) deinterleave into 3 x 8i8 |
6488 | {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8 |
6489 | {3, MVT::v32i8, 14}, // (load 96i8 and) deinterleave into 3 x 32i8 |
6490 | |
6491 | {3, MVT::v2i16, 5}, // (load 6i16 and) deinterleave into 3 x 2i16 |
6492 | {3, MVT::v4i16, 7}, // (load 12i16 and) deinterleave into 3 x 4i16 |
6493 | {3, MVT::v8i16, 9}, // (load 24i16 and) deinterleave into 3 x 8i16 |
6494 | {3, MVT::v16i16, 28}, // (load 48i16 and) deinterleave into 3 x 16i16 |
6495 | {3, MVT::v32i16, 56}, // (load 96i16 and) deinterleave into 3 x 32i16 |
6496 | |
6497 | {3, MVT::v2i32, 3}, // (load 6i32 and) deinterleave into 3 x 2i32 |
6498 | {3, MVT::v4i32, 3}, // (load 12i32 and) deinterleave into 3 x 4i32 |
6499 | {3, MVT::v8i32, 7}, // (load 24i32 and) deinterleave into 3 x 8i32 |
6500 | {3, MVT::v16i32, 14}, // (load 48i32 and) deinterleave into 3 x 16i32 |
6501 | {3, MVT::v32i32, 32}, // (load 96i32 and) deinterleave into 3 x 32i32 |
6502 | |
6503 | {3, MVT::v2i64, 1}, // (load 6i64 and) deinterleave into 3 x 2i64 |
6504 | {3, MVT::v4i64, 5}, // (load 12i64 and) deinterleave into 3 x 4i64 |
6505 | {3, MVT::v8i64, 10}, // (load 24i64 and) deinterleave into 3 x 8i64 |
6506 | {3, MVT::v16i64, 20}, // (load 48i64 and) deinterleave into 3 x 16i64 |
6507 | |
6508 | {4, MVT::v2i8, 4}, // (load 8i8 and) deinterleave into 4 x 2i8 |
6509 | {4, MVT::v4i8, 4}, // (load 16i8 and) deinterleave into 4 x 4i8 |
6510 | {4, MVT::v8i8, 12}, // (load 32i8 and) deinterleave into 4 x 8i8 |
6511 | {4, MVT::v16i8, 24}, // (load 64i8 and) deinterleave into 4 x 16i8 |
6512 | {4, MVT::v32i8, 56}, // (load 128i8 and) deinterleave into 4 x 32i8 |
6513 | |
6514 | {4, MVT::v2i16, 6}, // (load 8i16 and) deinterleave into 4 x 2i16 |
6515 | {4, MVT::v4i16, 17}, // (load 16i16 and) deinterleave into 4 x 4i16 |
6516 | {4, MVT::v8i16, 33}, // (load 32i16 and) deinterleave into 4 x 8i16 |
6517 | {4, MVT::v16i16, 75}, // (load 64i16 and) deinterleave into 4 x 16i16 |
6518 | {4, MVT::v32i16, 150}, // (load 128i16 and) deinterleave into 4 x 32i16 |
6519 | |
6520 | {4, MVT::v2i32, 4}, // (load 8i32 and) deinterleave into 4 x 2i32 |
6521 | {4, MVT::v4i32, 8}, // (load 16i32 and) deinterleave into 4 x 4i32 |
6522 | {4, MVT::v8i32, 16}, // (load 32i32 and) deinterleave into 4 x 8i32 |
6523 | {4, MVT::v16i32, 32}, // (load 64i32 and) deinterleave into 4 x 16i32 |
6524 | {4, MVT::v32i32, 68}, // (load 128i32 and) deinterleave into 4 x 32i32 |
6525 | |
6526 | {4, MVT::v2i64, 6}, // (load 8i64 and) deinterleave into 4 x 2i64 |
6527 | {4, MVT::v4i64, 8}, // (load 16i64 and) deinterleave into 4 x 4i64 |
6528 | {4, MVT::v8i64, 20}, // (load 32i64 and) deinterleave into 4 x 8i64 |
6529 | {4, MVT::v16i64, 40}, // (load 64i64 and) deinterleave into 4 x 16i64 |
6530 | |
6531 | {6, MVT::v2i8, 6}, // (load 12i8 and) deinterleave into 6 x 2i8 |
6532 | {6, MVT::v4i8, 14}, // (load 24i8 and) deinterleave into 6 x 4i8 |
6533 | {6, MVT::v8i8, 18}, // (load 48i8 and) deinterleave into 6 x 8i8 |
6534 | {6, MVT::v16i8, 43}, // (load 96i8 and) deinterleave into 6 x 16i8 |
6535 | {6, MVT::v32i8, 82}, // (load 192i8 and) deinterleave into 6 x 32i8 |
6536 | |
6537 | {6, MVT::v2i16, 13}, // (load 12i16 and) deinterleave into 6 x 2i16 |
6538 | {6, MVT::v4i16, 9}, // (load 24i16 and) deinterleave into 6 x 4i16 |
6539 | {6, MVT::v8i16, 39}, // (load 48i16 and) deinterleave into 6 x 8i16 |
6540 | {6, MVT::v16i16, 106}, // (load 96i16 and) deinterleave into 6 x 16i16 |
6541 | {6, MVT::v32i16, 212}, // (load 192i16 and) deinterleave into 6 x 32i16 |
6542 | |
6543 | {6, MVT::v2i32, 6}, // (load 12i32 and) deinterleave into 6 x 2i32 |
6544 | {6, MVT::v4i32, 15}, // (load 24i32 and) deinterleave into 6 x 4i32 |
6545 | {6, MVT::v8i32, 31}, // (load 48i32 and) deinterleave into 6 x 8i32 |
6546 | {6, MVT::v16i32, 64}, // (load 96i32 and) deinterleave into 6 x 16i32 |
6547 | |
6548 | {6, MVT::v2i64, 6}, // (load 12i64 and) deinterleave into 6 x 2i64 |
6549 | {6, MVT::v4i64, 18}, // (load 24i64 and) deinterleave into 6 x 4i64 |
6550 | {6, MVT::v8i64, 36}, // (load 48i64 and) deinterleave into 6 x 8i64 |
6551 | |
6552 | {8, MVT::v8i32, 40} // (load 64i32 and) deinterleave into 8 x 8i32 |
6553 | }; |
6554 | |
6555 | static const CostTblEntry SSSE3InterleavedLoadTbl[] = { |
6556 | {2, MVT::v4i16, 2}, // (load 8i16 and) deinterleave into 2 x 4i16 |
6557 | }; |
6558 | |
6559 | static const CostTblEntry SSE2InterleavedLoadTbl[] = { |
6560 | {2, MVT::v2i16, 2}, // (load 4i16 and) deinterleave into 2 x 2i16 |
6561 | {2, MVT::v4i16, 7}, // (load 8i16 and) deinterleave into 2 x 4i16 |
6562 | |
6563 | {2, MVT::v2i32, 2}, // (load 4i32 and) deinterleave into 2 x 2i32 |
6564 | {2, MVT::v4i32, 2}, // (load 8i32 and) deinterleave into 2 x 4i32 |
6565 | |
6566 | {2, MVT::v2i64, 2}, // (load 4i64 and) deinterleave into 2 x 2i64 |
6567 | }; |
6568 | |
6569 | static const CostTblEntry AVX2InterleavedStoreTbl[] = { |
6570 | {2, MVT::v16i8, 3}, // interleave 2 x 16i8 into 32i8 (and store) |
6571 | {2, MVT::v32i8, 4}, // interleave 2 x 32i8 into 64i8 (and store) |
6572 | |
6573 | {2, MVT::v8i16, 3}, // interleave 2 x 8i16 into 16i16 (and store) |
6574 | {2, MVT::v16i16, 4}, // interleave 2 x 16i16 into 32i16 (and store) |
6575 | {2, MVT::v32i16, 8}, // interleave 2 x 32i16 into 64i16 (and store) |
6576 | |
6577 | {2, MVT::v4i32, 2}, // interleave 2 x 4i32 into 8i32 (and store) |
6578 | {2, MVT::v8i32, 4}, // interleave 2 x 8i32 into 16i32 (and store) |
6579 | {2, MVT::v16i32, 8}, // interleave 2 x 16i32 into 32i32 (and store) |
6580 | {2, MVT::v32i32, 16}, // interleave 2 x 32i32 into 64i32 (and store) |
6581 | |
6582 | {2, MVT::v2i64, 2}, // interleave 2 x 2i64 into 4i64 (and store) |
6583 | {2, MVT::v4i64, 4}, // interleave 2 x 4i64 into 8i64 (and store) |
6584 | {2, MVT::v8i64, 8}, // interleave 2 x 8i64 into 16i64 (and store) |
6585 | {2, MVT::v16i64, 16}, // interleave 2 x 16i64 into 32i64 (and store) |
6586 | {2, MVT::v32i64, 32}, // interleave 2 x 32i64 into 64i64 (and store) |
6587 | |
6588 | {3, MVT::v2i8, 4}, // interleave 3 x 2i8 into 6i8 (and store) |
6589 | {3, MVT::v4i8, 4}, // interleave 3 x 4i8 into 12i8 (and store) |
6590 | {3, MVT::v8i8, 6}, // interleave 3 x 8i8 into 24i8 (and store) |
6591 | {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store) |
6592 | {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store) |
6593 | |
6594 | {3, MVT::v2i16, 4}, // interleave 3 x 2i16 into 6i16 (and store) |
6595 | {3, MVT::v4i16, 6}, // interleave 3 x 4i16 into 12i16 (and store) |
6596 | {3, MVT::v8i16, 12}, // interleave 3 x 8i16 into 24i16 (and store) |
6597 | {3, MVT::v16i16, 27}, // interleave 3 x 16i16 into 48i16 (and store) |
6598 | {3, MVT::v32i16, 54}, // interleave 3 x 32i16 into 96i16 (and store) |
6599 | |
6600 | {3, MVT::v2i32, 4}, // interleave 3 x 2i32 into 6i32 (and store) |
6601 | {3, MVT::v4i32, 5}, // interleave 3 x 4i32 into 12i32 (and store) |
6602 | {3, MVT::v8i32, 11}, // interleave 3 x 8i32 into 24i32 (and store) |
6603 | {3, MVT::v16i32, 22}, // interleave 3 x 16i32 into 48i32 (and store) |
6604 | {3, MVT::v32i32, 48}, // interleave 3 x 32i32 into 96i32 (and store) |
6605 | |
6606 | {3, MVT::v2i64, 4}, // interleave 3 x 2i64 into 6i64 (and store) |
6607 | {3, MVT::v4i64, 6}, // interleave 3 x 4i64 into 12i64 (and store) |
6608 | {3, MVT::v8i64, 12}, // interleave 3 x 8i64 into 24i64 (and store) |
6609 | {3, MVT::v16i64, 24}, // interleave 3 x 16i64 into 48i64 (and store) |
6610 | |
6611 | {4, MVT::v2i8, 4}, // interleave 4 x 2i8 into 8i8 (and store) |
6612 | {4, MVT::v4i8, 4}, // interleave 4 x 4i8 into 16i8 (and store) |
6613 | {4, MVT::v8i8, 4}, // interleave 4 x 8i8 into 32i8 (and store) |
6614 | {4, MVT::v16i8, 8}, // interleave 4 x 16i8 into 64i8 (and store) |
6615 | {4, MVT::v32i8, 12}, // interleave 4 x 32i8 into 128i8 (and store) |
6616 | |
6617 | {4, MVT::v2i16, 2}, // interleave 4 x 2i16 into 8i16 (and store) |
6618 | {4, MVT::v4i16, 6}, // interleave 4 x 4i16 into 16i16 (and store) |
6619 | {4, MVT::v8i16, 10}, // interleave 4 x 8i16 into 32i16 (and store) |
6620 | {4, MVT::v16i16, 32}, // interleave 4 x 16i16 into 64i16 (and store) |
6621 | {4, MVT::v32i16, 64}, // interleave 4 x 32i16 into 128i16 (and store) |
6622 | |
6623 | {4, MVT::v2i32, 5}, // interleave 4 x 2i32 into 8i32 (and store) |
6624 | {4, MVT::v4i32, 6}, // interleave 4 x 4i32 into 16i32 (and store) |
6625 | {4, MVT::v8i32, 16}, // interleave 4 x 8i32 into 32i32 (and store) |
6626 | {4, MVT::v16i32, 32}, // interleave 4 x 16i32 into 64i32 (and store) |
6627 | {4, MVT::v32i32, 64}, // interleave 4 x 32i32 into 128i32 (and store) |
6628 | |
6629 | {4, MVT::v2i64, 6}, // interleave 4 x 2i64 into 8i64 (and store) |
6630 | {4, MVT::v4i64, 8}, // interleave 4 x 4i64 into 16i64 (and store) |
6631 | {4, MVT::v8i64, 20}, // interleave 4 x 8i64 into 32i64 (and store) |
6632 | {4, MVT::v16i64, 40}, // interleave 4 x 16i64 into 64i64 (and store) |
6633 | |
6634 | {6, MVT::v2i8, 7}, // interleave 6 x 2i8 into 12i8 (and store) |
6635 | {6, MVT::v4i8, 9}, // interleave 6 x 4i8 into 24i8 (and store) |
6636 | {6, MVT::v8i8, 16}, // interleave 6 x 8i8 into 48i8 (and store) |
6637 | {6, MVT::v16i8, 27}, // interleave 6 x 16i8 into 96i8 (and store) |
6638 | {6, MVT::v32i8, 90}, // interleave 6 x 32i8 into 192i8 (and store) |
6639 | |
6640 | {6, MVT::v2i16, 10}, // interleave 6 x 2i16 into 12i16 (and store) |
6641 | {6, MVT::v4i16, 15}, // interleave 6 x 4i16 into 24i16 (and store) |
6642 | {6, MVT::v8i16, 21}, // interleave 6 x 8i16 into 48i16 (and store) |
6643 | {6, MVT::v16i16, 58}, // interleave 6 x 16i16 into 96i16 (and store) |
6644 | {6, MVT::v32i16, 90}, // interleave 6 x 32i16 into 192i16 (and store) |
6645 | |
6646 | {6, MVT::v2i32, 9}, // interleave 6 x 2i32 into 12i32 (and store) |
6647 | {6, MVT::v4i32, 12}, // interleave 6 x 4i32 into 24i32 (and store) |
6648 | {6, MVT::v8i32, 33}, // interleave 6 x 8i32 into 48i32 (and store) |
6649 | {6, MVT::v16i32, 66}, // interleave 6 x 16i32 into 96i32 (and store) |
6650 | |
6651 | {6, MVT::v2i64, 8}, // interleave 6 x 2i64 into 12i64 (and store) |
6652 | {6, MVT::v4i64, 15}, // interleave 6 x 4i64 into 24i64 (and store) |
6653 | {6, MVT::v8i64, 30}, // interleave 6 x 8i64 into 48i64 (and store) |
6654 | }; |
6655 | |
6656 | static const CostTblEntry SSE2InterleavedStoreTbl[] = { |
6657 | {2, MVT::v2i8, 1}, // interleave 2 x 2i8 into 4i8 (and store) |
6658 | {2, MVT::v4i8, 1}, // interleave 2 x 4i8 into 8i8 (and store) |
6659 | {2, MVT::v8i8, 1}, // interleave 2 x 8i8 into 16i8 (and store) |
6660 | |
6661 | {2, MVT::v2i16, 1}, // interleave 2 x 2i16 into 4i16 (and store) |
6662 | {2, MVT::v4i16, 1}, // interleave 2 x 4i16 into 8i16 (and store) |
6663 | |
6664 | {2, MVT::v2i32, 1}, // interleave 2 x 2i32 into 4i32 (and store) |
6665 | }; |
6666 | |
6667 | if (Opcode == Instruction::Load) { |
6668 | auto GetDiscountedCost = [Factor, NumMembers = Indices.size(), |
6669 | MemOpCosts](const CostTblEntry *Entry) { |
6670 | // NOTE: this is just an approximation! |
6671 | // It can over/under -estimate the cost! |
6672 | return MemOpCosts + divideCeil(Numerator: NumMembers * Entry->Cost, Denominator: Factor); |
6673 | }; |
6674 | |
6675 | if (ST->hasAVX2()) |
6676 | if (const auto *Entry = CostTableLookup(AVX2InterleavedLoadTbl, Factor, |
6677 | ETy.getSimpleVT())) |
6678 | return GetDiscountedCost(Entry); |
6679 | |
6680 | if (ST->hasSSSE3()) |
6681 | if (const auto *Entry = CostTableLookup(SSSE3InterleavedLoadTbl, Factor, |
6682 | ETy.getSimpleVT())) |
6683 | return GetDiscountedCost(Entry); |
6684 | |
6685 | if (ST->hasSSE2()) |
6686 | if (const auto *Entry = CostTableLookup(SSE2InterleavedLoadTbl, Factor, |
6687 | ETy.getSimpleVT())) |
6688 | return GetDiscountedCost(Entry); |
6689 | } else { |
6690 | assert(Opcode == Instruction::Store && |
6691 | "Expected Store Instruction at this point" ); |
6692 | assert((!Indices.size() || Indices.size() == Factor) && |
6693 | "Interleaved store only supports fully-interleaved groups." ); |
6694 | if (ST->hasAVX2()) |
6695 | if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor, |
6696 | ETy.getSimpleVT())) |
6697 | return MemOpCosts + Entry->Cost; |
6698 | |
6699 | if (ST->hasSSE2()) |
6700 | if (const auto *Entry = CostTableLookup(SSE2InterleavedStoreTbl, Factor, |
6701 | ETy.getSimpleVT())) |
6702 | return MemOpCosts + Entry->Cost; |
6703 | } |
6704 | |
6705 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
6706 | Alignment, AddressSpace, CostKind, |
6707 | UseMaskForCond, UseMaskForGaps); |
6708 | } |
6709 | |
6710 | InstructionCost X86TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, |
6711 | int64_t BaseOffset, |
6712 | bool HasBaseReg, int64_t Scale, |
6713 | unsigned AddrSpace) const { |
6714 | // Scaling factors are not free at all. |
6715 | // An indexed folded instruction, i.e., inst (reg1, reg2, scale), |
6716 | // will take 2 allocations in the out of order engine instead of 1 |
6717 | // for plain addressing mode, i.e. inst (reg1). |
6718 | // E.g., |
6719 | // vaddps (%rsi,%rdx), %ymm0, %ymm1 |
6720 | // Requires two allocations (one for the load, one for the computation) |
6721 | // whereas: |
6722 | // vaddps (%rsi), %ymm0, %ymm1 |
6723 | // Requires just 1 allocation, i.e., freeing allocations for other operations |
6724 | // and having less micro operations to execute. |
6725 | // |
6726 | // For some X86 architectures, this is even worse because for instance for |
6727 | // stores, the complex addressing mode forces the instruction to use the |
6728 | // "load" ports instead of the dedicated "store" port. |
6729 | // E.g., on Haswell: |
6730 | // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3. |
6731 | // vmovaps %ymm1, (%r8) can use port 2, 3, or 7. |
6732 | TargetLoweringBase::AddrMode AM; |
6733 | AM.BaseGV = BaseGV; |
6734 | AM.BaseOffs = BaseOffset; |
6735 | AM.HasBaseReg = HasBaseReg; |
6736 | AM.Scale = Scale; |
6737 | if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AS: AddrSpace)) |
6738 | // Scale represents reg2 * scale, thus account for 1 |
6739 | // as soon as we use a second register. |
6740 | return AM.Scale != 0; |
6741 | return -1; |
6742 | } |
6743 | |