1 | //===- HexagonTargetTransformInfo.cpp - Hexagon specific TTI pass ---------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | /// \file |
8 | /// This file implements a TargetTransformInfo analysis pass specific to the |
9 | /// Hexagon target machine. It uses the target's detailed information to provide |
10 | /// more precise answers to certain TTI queries, while letting the target |
11 | /// independent and default TTI implementations handle the rest. |
12 | /// |
13 | //===----------------------------------------------------------------------===// |
14 | |
15 | #include "HexagonTargetTransformInfo.h" |
16 | #include "HexagonSubtarget.h" |
17 | #include "llvm/Analysis/TargetTransformInfo.h" |
18 | #include "llvm/CodeGen/ValueTypes.h" |
19 | #include "llvm/IR/InstrTypes.h" |
20 | #include "llvm/IR/Instructions.h" |
21 | #include "llvm/IR/User.h" |
22 | #include "llvm/Support/Casting.h" |
23 | #include "llvm/Support/CommandLine.h" |
24 | #include "llvm/Transforms/Utils/LoopPeel.h" |
25 | #include "llvm/Transforms/Utils/UnrollLoop.h" |
26 | |
27 | using namespace llvm; |
28 | |
29 | #define DEBUG_TYPE "hexagontti" |
30 | |
31 | static cl::opt<bool> HexagonAutoHVX("hexagon-autohvx" , cl::init(Val: false), |
32 | cl::Hidden, cl::desc("Enable loop vectorizer for HVX" )); |
33 | |
34 | static cl::opt<bool> EnableV68FloatAutoHVX( |
35 | "force-hvx-float" , cl::Hidden, |
36 | cl::desc("Enable auto-vectorization of floatint point types on v68." )); |
37 | |
38 | static cl::opt<bool> EmitLookupTables("hexagon-emit-lookup-tables" , |
39 | cl::init(Val: true), cl::Hidden, |
40 | cl::desc("Control lookup table emission on Hexagon target" )); |
41 | |
42 | static cl::opt<bool> HexagonMaskedVMem("hexagon-masked-vmem" , cl::init(Val: true), |
43 | cl::Hidden, cl::desc("Enable masked loads/stores for HVX" )); |
44 | |
45 | // Constant "cost factor" to make floating point operations more expensive |
46 | // in terms of vectorization cost. This isn't the best way, but it should |
47 | // do. Ultimately, the cost should use cycles. |
48 | static const unsigned FloatFactor = 4; |
49 | |
50 | bool HexagonTTIImpl::useHVX() const { |
51 | return ST.useHVXOps() && HexagonAutoHVX; |
52 | } |
53 | |
54 | bool HexagonTTIImpl::isHVXVectorType(Type *Ty) const { |
55 | auto *VecTy = dyn_cast<VectorType>(Val: Ty); |
56 | if (!VecTy) |
57 | return false; |
58 | if (!ST.isTypeForHVX(VecTy)) |
59 | return false; |
60 | if (ST.useHVXV69Ops() || !VecTy->getElementType()->isFloatingPointTy()) |
61 | return true; |
62 | return ST.useHVXV68Ops() && EnableV68FloatAutoHVX; |
63 | } |
64 | |
65 | unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const { |
66 | if (auto *VTy = dyn_cast<FixedVectorType>(Val: Ty)) |
67 | return VTy->getNumElements(); |
68 | assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) && |
69 | "Expecting scalar type" ); |
70 | return 1; |
71 | } |
72 | |
73 | TargetTransformInfo::PopcntSupportKind |
74 | HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const { |
75 | // Return fast hardware support as every input < 64 bits will be promoted |
76 | // to 64 bits. |
77 | return TargetTransformInfo::PSK_FastHardware; |
78 | } |
79 | |
80 | // The Hexagon target can unroll loops with run-time trip counts. |
81 | void HexagonTTIImpl::(Loop *L, ScalarEvolution &SE, |
82 | TTI::UnrollingPreferences &UP, |
83 | OptimizationRemarkEmitter *ORE) { |
84 | UP.Runtime = UP.Partial = true; |
85 | } |
86 | |
87 | void HexagonTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, |
88 | TTI::PeelingPreferences &PP) { |
89 | BaseT::getPeelingPreferences(L, SE, PP); |
90 | // Only try to peel innermost loops with small runtime trip counts. |
91 | if (L && L->isInnermost() && canPeel(L) && |
92 | SE.getSmallConstantTripCount(L) == 0 && |
93 | SE.getSmallConstantMaxTripCount(L) > 0 && |
94 | SE.getSmallConstantMaxTripCount(L) <= 5) { |
95 | PP.PeelCount = 2; |
96 | } |
97 | } |
98 | |
99 | TTI::AddressingModeKind |
100 | HexagonTTIImpl::getPreferredAddressingMode(const Loop *L, |
101 | ScalarEvolution *SE) const { |
102 | return TTI::AMK_PostIndexed; |
103 | } |
104 | |
105 | /// --- Vector TTI begin --- |
106 | |
107 | unsigned HexagonTTIImpl::getNumberOfRegisters(bool Vector) const { |
108 | if (Vector) |
109 | return useHVX() ? 32 : 0; |
110 | return 32; |
111 | } |
112 | |
113 | unsigned HexagonTTIImpl::getMaxInterleaveFactor(ElementCount VF) { |
114 | return useHVX() ? 2 : 1; |
115 | } |
116 | |
117 | TypeSize |
118 | HexagonTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { |
119 | switch (K) { |
120 | case TargetTransformInfo::RGK_Scalar: |
121 | return TypeSize::getFixed(ExactSize: 32); |
122 | case TargetTransformInfo::RGK_FixedWidthVector: |
123 | return TypeSize::getFixed(ExactSize: getMinVectorRegisterBitWidth()); |
124 | case TargetTransformInfo::RGK_ScalableVector: |
125 | return TypeSize::getScalable(MinimumSize: 0); |
126 | } |
127 | |
128 | llvm_unreachable("Unsupported register kind" ); |
129 | } |
130 | |
131 | unsigned HexagonTTIImpl::getMinVectorRegisterBitWidth() const { |
132 | return useHVX() ? ST.getVectorLength()*8 : 32; |
133 | } |
134 | |
135 | ElementCount HexagonTTIImpl::getMinimumVF(unsigned ElemWidth, |
136 | bool IsScalable) const { |
137 | assert(!IsScalable && "Scalable VFs are not supported for Hexagon" ); |
138 | return ElementCount::getFixed(MinVal: (8 * ST.getVectorLength()) / ElemWidth); |
139 | } |
140 | |
141 | InstructionCost HexagonTTIImpl::getScalarizationOverhead( |
142 | VectorType *Ty, const APInt &DemandedElts, bool Insert, bool , |
143 | TTI::TargetCostKind CostKind) { |
144 | return BaseT::getScalarizationOverhead(InTy: Ty, DemandedElts, Insert, Extract, |
145 | CostKind); |
146 | } |
147 | |
148 | InstructionCost |
149 | HexagonTTIImpl::getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, |
150 | ArrayRef<Type *> Tys, |
151 | TTI::TargetCostKind CostKind) { |
152 | return BaseT::getOperandsScalarizationOverhead(Args, Tys, CostKind); |
153 | } |
154 | |
155 | InstructionCost HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy, |
156 | ArrayRef<Type *> Tys, |
157 | TTI::TargetCostKind CostKind) { |
158 | return BaseT::getCallInstrCost(F, RetTy, Tys, CostKind); |
159 | } |
160 | |
161 | InstructionCost |
162 | HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, |
163 | TTI::TargetCostKind CostKind) { |
164 | if (ICA.getID() == Intrinsic::bswap) { |
165 | std::pair<InstructionCost, MVT> LT = |
166 | getTypeLegalizationCost(Ty: ICA.getReturnType()); |
167 | return LT.first + 2; |
168 | } |
169 | return BaseT::getIntrinsicInstrCost(ICA, CostKind); |
170 | } |
171 | |
172 | InstructionCost HexagonTTIImpl::getAddressComputationCost(Type *Tp, |
173 | ScalarEvolution *SE, |
174 | const SCEV *S) { |
175 | return 0; |
176 | } |
177 | |
/// Cost of a load or store of type \p Src.
/// Stores and non-throughput cost kinds defer to the base implementation or
/// a flat cost of 1; vector loads get the Hexagon-specific model below.
InstructionCost HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                                MaybeAlign Alignment,
                                                unsigned AddressSpace,
                                                TTI::TargetCostKind CostKind,
                                                TTI::OperandValueInfo OpInfo,
                                                const Instruction *I) {
  assert(Opcode == Instruction::Load || Opcode == Instruction::Store);
  // TODO: Handle other cost kinds.
  if (CostKind != TTI::TCK_RecipThroughput)
    return 1;

  // Stores are not specially modeled here; only loads get the vector
  // treatment below.
  if (Opcode == Instruction::Store)
    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                  CostKind, OpInfo, I);

  if (Src->isVectorTy()) {
    VectorType *VecTy = cast<VectorType>(Val: Src);
    unsigned VecWidth = VecTy->getPrimitiveSizeInBits().getFixedValue();
    if (isHVXVectorType(Ty: VecTy)) {
      unsigned RegWidth =
          getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();
      assert(RegWidth && "Non-zero vector register width expected");
      // Cost of HVX loads: one unit per full HVX register the vector spans.
      if (VecWidth % RegWidth == 0)
        return VecWidth / RegWidth;
      // Cost of constructing HVX vector from scalar loads.
      // Clamp the effective alignment to the register width; an unspecified
      // alignment is treated as register-aligned.
      const Align RegAlign(RegWidth / 8);
      if (!Alignment || *Alignment > RegAlign)
        Alignment = RegAlign;
      assert(Alignment);
      unsigned AlignWidth = 8 * Alignment->value();
      // Number of alignment-sized chunks needed to cover the vector.
      unsigned NumLoads = alignTo(Value: VecWidth, Align: AlignWidth) / AlignWidth;
      return 3 * NumLoads;
    }

    // Non-HVX vectors.
    // Add extra cost for floating point types.
    unsigned Cost =
        VecTy->getElementType()->isFloatingPointTy() ? FloatFactor : 1;

    // At this point unspecified alignment is considered as Align(1).
    // Bound the alignment at 8 bytes (the widest scalar access).
    const Align BoundAlignment = std::min(a: Alignment.valueOrOne(), b: Align(8));
    unsigned AlignWidth = 8 * BoundAlignment.value();
    unsigned NumLoads = alignTo(Value: VecWidth, Align: AlignWidth) / AlignWidth;
    if (Alignment == Align(4) || Alignment == Align(8))
      return Cost * NumLoads;
    // Loads of less than 32 bits will need extra inserts to compose a vector.
    assert(BoundAlignment <= Align(8));
    unsigned LogA = Log2(A: BoundAlignment);
    return (3 - LogA) * Cost * NumLoads;
  }

  // Scalar loads: use the default model.
  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
                                OpInfo, I);
}
234 | |
235 | InstructionCost |
236 | HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, |
237 | Align Alignment, unsigned AddressSpace, |
238 | TTI::TargetCostKind CostKind) { |
239 | return BaseT::getMaskedMemoryOpCost(Opcode, DataTy: Src, Alignment, AddressSpace, |
240 | CostKind); |
241 | } |
242 | |
243 | InstructionCost HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, |
244 | ArrayRef<int> Mask, |
245 | TTI::TargetCostKind CostKind, |
246 | int Index, Type *SubTp, |
247 | ArrayRef<const Value *> Args) { |
248 | return 1; |
249 | } |
250 | |
251 | InstructionCost HexagonTTIImpl::getGatherScatterOpCost( |
252 | unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, |
253 | Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { |
254 | return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, |
255 | Alignment, CostKind, I); |
256 | } |
257 | |
258 | InstructionCost HexagonTTIImpl::getInterleavedMemoryOpCost( |
259 | unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, |
260 | Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, |
261 | bool UseMaskForCond, bool UseMaskForGaps) { |
262 | if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps) |
263 | return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, |
264 | Alignment, AddressSpace, |
265 | CostKind, |
266 | UseMaskForCond, UseMaskForGaps); |
267 | return getMemoryOpCost(Opcode, Src: VecTy, Alignment: MaybeAlign(Alignment), AddressSpace, |
268 | CostKind); |
269 | } |
270 | |
271 | InstructionCost HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, |
272 | Type *CondTy, |
273 | CmpInst::Predicate VecPred, |
274 | TTI::TargetCostKind CostKind, |
275 | const Instruction *I) { |
276 | if (ValTy->isVectorTy() && CostKind == TTI::TCK_RecipThroughput) { |
277 | if (!isHVXVectorType(Ty: ValTy) && ValTy->isFPOrFPVectorTy()) |
278 | return InstructionCost::getMax(); |
279 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty: ValTy); |
280 | if (Opcode == Instruction::FCmp) |
281 | return LT.first + FloatFactor * getTypeNumElements(Ty: ValTy); |
282 | } |
283 | return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); |
284 | } |
285 | |
286 | InstructionCost HexagonTTIImpl::getArithmeticInstrCost( |
287 | unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, |
288 | TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, |
289 | ArrayRef<const Value *> Args, |
290 | const Instruction *CxtI) { |
291 | // TODO: Handle more cost kinds. |
292 | if (CostKind != TTI::TCK_RecipThroughput) |
293 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, |
294 | Opd2Info: Op2Info, Args, CxtI); |
295 | |
296 | if (Ty->isVectorTy()) { |
297 | if (!isHVXVectorType(Ty) && Ty->isFPOrFPVectorTy()) |
298 | return InstructionCost::getMax(); |
299 | std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); |
300 | if (LT.second.isFloatingPoint()) |
301 | return LT.first + FloatFactor * getTypeNumElements(Ty); |
302 | } |
303 | return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, |
304 | Args, CxtI); |
305 | } |
306 | |
307 | InstructionCost HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy, |
308 | Type *SrcTy, |
309 | TTI::CastContextHint CCH, |
310 | TTI::TargetCostKind CostKind, |
311 | const Instruction *I) { |
312 | auto isNonHVXFP = [this] (Type *Ty) { |
313 | return Ty->isVectorTy() && !isHVXVectorType(Ty) && Ty->isFPOrFPVectorTy(); |
314 | }; |
315 | if (isNonHVXFP(SrcTy) || isNonHVXFP(DstTy)) |
316 | return InstructionCost::getMax(); |
317 | |
318 | if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) { |
319 | unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(Ty: SrcTy) : 0; |
320 | unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(Ty: DstTy) : 0; |
321 | |
322 | std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Ty: SrcTy); |
323 | std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Ty: DstTy); |
324 | InstructionCost Cost = |
325 | std::max(a: SrcLT.first, b: DstLT.first) + FloatFactor * (SrcN + DstN); |
326 | // TODO: Allow non-throughput costs that aren't binary. |
327 | if (CostKind != TTI::TCK_RecipThroughput) |
328 | return Cost == 0 ? 0 : 1; |
329 | return Cost; |
330 | } |
331 | return 1; |
332 | } |
333 | |
334 | InstructionCost HexagonTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, |
335 | TTI::TargetCostKind CostKind, |
336 | unsigned Index, Value *Op0, |
337 | Value *Op1) { |
338 | Type *ElemTy = Val->isVectorTy() ? cast<VectorType>(Val)->getElementType() |
339 | : Val; |
340 | if (Opcode == Instruction::InsertElement) { |
341 | // Need two rotations for non-zero index. |
342 | unsigned Cost = (Index != 0) ? 2 : 0; |
343 | if (ElemTy->isIntegerTy(Bitwidth: 32)) |
344 | return Cost; |
345 | // If it's not a 32-bit value, there will need to be an extract. |
346 | return Cost + getVectorInstrCost(Opcode: Instruction::ExtractElement, Val, CostKind, |
347 | Index, Op0, Op1); |
348 | } |
349 | |
350 | if (Opcode == Instruction::ExtractElement) |
351 | return 2; |
352 | |
353 | return 1; |
354 | } |
355 | |
356 | bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/) { |
357 | // This function is called from scalarize-masked-mem-intrin, which runs |
358 | // in pre-isel. Use ST directly instead of calling isHVXVectorType. |
359 | return HexagonMaskedVMem && ST.isTypeForHVX(VecTy: DataType); |
360 | } |
361 | |
362 | bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/) { |
363 | // This function is called from scalarize-masked-mem-intrin, which runs |
364 | // in pre-isel. Use ST directly instead of calling isHVXVectorType. |
365 | return HexagonMaskedVMem && ST.isTypeForHVX(VecTy: DataType); |
366 | } |
367 | |
368 | /// --- Vector TTI end --- |
369 | |
370 | unsigned HexagonTTIImpl::getPrefetchDistance() const { |
371 | return ST.getL1PrefetchDistance(); |
372 | } |
373 | |
374 | unsigned HexagonTTIImpl::getCacheLineSize() const { |
375 | return ST.getL1CacheLineSize(); |
376 | } |
377 | |
378 | InstructionCost |
379 | HexagonTTIImpl::getInstructionCost(const User *U, |
380 | ArrayRef<const Value *> Operands, |
381 | TTI::TargetCostKind CostKind) { |
382 | auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool { |
383 | if (!CI->isIntegerCast()) |
384 | return false; |
385 | // Only extensions from an integer type shorter than 32-bit to i32 |
386 | // can be folded into the load. |
387 | const DataLayout &DL = getDataLayout(); |
388 | unsigned SBW = DL.getTypeSizeInBits(Ty: CI->getSrcTy()); |
389 | unsigned DBW = DL.getTypeSizeInBits(Ty: CI->getDestTy()); |
390 | if (DBW != 32 || SBW >= DBW) |
391 | return false; |
392 | |
393 | const LoadInst *LI = dyn_cast<const LoadInst>(Val: CI->getOperand(i_nocapture: 0)); |
394 | // Technically, this code could allow multiple uses of the load, and |
395 | // check if all the uses are the same extension operation, but this |
396 | // should be sufficient for most cases. |
397 | return LI && LI->hasOneUse(); |
398 | }; |
399 | |
400 | if (const CastInst *CI = dyn_cast<const CastInst>(Val: U)) |
401 | if (isCastFoldedIntoLoad(CI)) |
402 | return TargetTransformInfo::TCC_Free; |
403 | return BaseT::getInstructionCost(U, Operands, CostKind); |
404 | } |
405 | |
406 | bool HexagonTTIImpl::shouldBuildLookupTables() const { |
407 | return EmitLookupTables; |
408 | } |
409 | |