SystemZTargetTransformInfo.cpp source code [llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp]

1	//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements a TargetTransformInfo analysis pass specific to the
10	// SystemZ target machine. It uses the target's detailed information to provide
11	// more precise answers to certain TTI queries, while letting the target
12	// independent and default TTI implementations handle the rest.
13	//
14	//===----------------------------------------------------------------------===//
15
16	#include "SystemZTargetTransformInfo.h"
17	#include "llvm/Analysis/TargetTransformInfo.h"
18	#include "llvm/CodeGen/BasicTTIImpl.h"
19	#include "llvm/CodeGen/CostTable.h"
20	#include "llvm/CodeGen/TargetLowering.h"
21	#include "llvm/IR/DerivedTypes.h"
22	#include "llvm/IR/IntrinsicInst.h"
23	#include "llvm/IR/Intrinsics.h"
24	#include "llvm/Support/Debug.h"
25	#include "llvm/Support/MathExtras.h"
26
27	using namespace llvm;
28
29	#define DEBUG_TYPE "systemztti"
30
31	//===----------------------------------------------------------------------===//
32	//
33	// SystemZ cost model.
34	//
35	//===----------------------------------------------------------------------===//
36
37	static bool isUsedAsMemCpySource(const Value V, bool* &OtherUse) {
38	bool UsedAsMemCpySource = false;
39	for (const User *U : V->users())
40	if (const Instruction *User = dyn_cast<Instruction>(Val: U)) {
41	if (isa<BitCastInst>(Val: User) \|\| isa<GetElementPtrInst>(Val: User)) {
42	UsedAsMemCpySource \|= isUsedAsMemCpySource(V: User, OtherUse);
43	continue;
44	}
45	if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(Val: User)) {
46	if (Memcpy->getOperand(i_nocapture: `1`) == V && !Memcpy->isVolatile()) {
47	UsedAsMemCpySource = true;
48	continue;
49	}
50	}
51	OtherUse = true;
52	}
53	return UsedAsMemCpySource;
54	}
55
56	unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase CB) const* {
57	unsigned Bonus = `0`;
58
59	// Increase the threshold if an incoming argument is used only as a memcpy
60	// source.
61	if (Function *Callee = CB->getCalledFunction())
62	for (Argument &Arg : Callee->args()) {
63	bool OtherUse = false;
64	if (isUsedAsMemCpySource(V: &Arg, OtherUse) && !OtherUse)
65	Bonus += `150`;
66	}
67
68	LLVM_DEBUG(if (Bonus)
69	dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
70	return Bonus;
71	}
72
73	InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
74	TTI::TargetCostKind CostKind) {
75	assert(Ty->isIntegerTy());
76
77	unsigned BitSize = Ty->getPrimitiveSizeInBits();
78	// There is no cost model for constants with a bit size of 0. Return TCC_Free
79	// here, so that constant hoisting will ignore this constant.
80	if (BitSize == `0`)
81	return TTI::TCC_Free;
82	// No cost model for operations on integers larger than 128 bit implemented yet.
83	if ((!ST->hasVector() && BitSize > `64`) \|\| BitSize > `128`)
84	return TTI::TCC_Free;
85
86	if (Imm == `0`)
87	return TTI::TCC_Free;
88
89	if (Imm.getBitWidth() <= `64`) {
90	// Constants loaded via lgfi.
91	if (isInt<`32`>(x: Imm.getSExtValue()))
92	return TTI::TCC_Basic;
93	// Constants loaded via llilf.
94	if (isUInt<`32`>(x: Imm.getZExtValue()))
95	return TTI::TCC_Basic;
96	// Constants loaded via llihf:
97	if ((Imm.getZExtValue() & `0xffffffff`) == `0`)
98	return TTI::TCC_Basic;
99
100	return `2` * TTI::TCC_Basic;
101	}
102
103	// i128 immediates loads from Constant Pool
104	return `2` * TTI::TCC_Basic;
105	}
106
107	InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
108	const APInt &Imm, Type *Ty,
109	TTI::TargetCostKind CostKind,
110	Instruction *Inst) {
111	assert(Ty->isIntegerTy());
112
113	unsigned BitSize = Ty->getPrimitiveSizeInBits();
114	// There is no cost model for constants with a bit size of 0. Return TCC_Free
115	// here, so that constant hoisting will ignore this constant.
116	if (BitSize == `0`)
117	return TTI::TCC_Free;
118	// No cost model for operations on integers larger than 64 bit implemented yet.
119	if (BitSize > `64`)
120	return TTI::TCC_Free;
121
122	switch (Opcode) {
123	default:
124	return TTI::TCC_Free;
125	case Instruction::GetElementPtr:
126	// Always hoist the base address of a GetElementPtr. This prevents the
127	// creation of new constants for every base constant that gets constant
128	// folded with the offset.
129	if (Idx == `0`)
130	return `2` * TTI::TCC_Basic;
131	return TTI::TCC_Free;
132	case Instruction::Store:
133	if (Idx == `0` && Imm.getBitWidth() <= `64`) {
134	// Any 8-bit immediate store can by implemented via mvi.
135	if (BitSize == `8`)
136	return TTI::TCC_Free;
137	// 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
138	if (isInt<`16`>(x: Imm.getSExtValue()))
139	return TTI::TCC_Free;
140	}
141	break;
142	case Instruction::ICmp:
143	if (Idx == `1` && Imm.getBitWidth() <= `64`) {
144	// Comparisons against signed 32-bit immediates implemented via cgfi.
145	if (isInt<`32`>(x: Imm.getSExtValue()))
146	return TTI::TCC_Free;
147	// Comparisons against unsigned 32-bit immediates implemented via clgfi.
148	if (isUInt<`32`>(x: Imm.getZExtValue()))
149	return TTI::TCC_Free;
150	}
151	break;
152	case Instruction::Add:
153	case Instruction::Sub:
154	if (Idx == `1` && Imm.getBitWidth() <= `64`) {
155	// We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
156	if (isUInt<`32`>(x: Imm.getZExtValue()))
157	return TTI::TCC_Free;
158	// Or their negation, by swapping addition vs. subtraction.
159	if (isUInt<`32`>(x: -Imm.getSExtValue()))
160	return TTI::TCC_Free;
161	}
162	break;
163	case Instruction::Mul:
164	if (Idx == `1` && Imm.getBitWidth() <= `64`) {
165	// We use msgfi to multiply by 32-bit signed immediates.
166	if (isInt<`32`>(x: Imm.getSExtValue()))
167	return TTI::TCC_Free;
168	}
169	break;
170	case Instruction::Or:
171	case Instruction::Xor:
172	if (Idx == `1` && Imm.getBitWidth() <= `64`) {
173	// Masks supported by oilf/xilf.
174	if (isUInt<`32`>(x: Imm.getZExtValue()))
175	return TTI::TCC_Free;
176	// Masks supported by oihf/xihf.
177	if ((Imm.getZExtValue() & `0xffffffff`) == `0`)
178	return TTI::TCC_Free;
179	}
180	break;
181	case Instruction::And:
182	if (Idx == `1` && Imm.getBitWidth() <= `64`) {
183	// Any 32-bit AND operation can by implemented via nilf.
184	if (BitSize <= `32`)
185	return TTI::TCC_Free;
186	// 64-bit masks supported by nilf.
187	if (isUInt<`32`>(x: ~Imm.getZExtValue()))
188	return TTI::TCC_Free;
189	// 64-bit masks supported by nilh.
190	if ((Imm.getZExtValue() & `0xffffffff`) == `0xffffffff`)
191	return TTI::TCC_Free;
192	// Some 64-bit AND operations can be implemented via risbg.
193	const SystemZInstrInfo *TII = ST->getInstrInfo();
194	unsigned Start, End;
195	if (TII->isRxSBGMask(Mask: Imm.getZExtValue(), BitSize, Start, End))
196	return TTI::TCC_Free;
197	}
198	break;
199	case Instruction::Shl:
200	case Instruction::LShr:
201	case Instruction::AShr:
202	// Always return TCC_Free for the shift value of a shift instruction.
203	if (Idx == `1`)
204	return TTI::TCC_Free;
205	break;
206	case Instruction::UDiv:
207	case Instruction::SDiv:
208	case Instruction::URem:
209	case Instruction::SRem:
210	case Instruction::Trunc:
211	case Instruction::ZExt:
212	case Instruction::SExt:
213	case Instruction::IntToPtr:
214	case Instruction::PtrToInt:
215	case Instruction::BitCast:
216	case Instruction::PHI:
217	case Instruction::Call:
218	case Instruction::Select:
219	case Instruction::Ret:
220	case Instruction::Load:
221	break;
222	}
223
224	return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
225	}
226
227	InstructionCost
228	SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
229	const APInt &Imm, Type *Ty,
230	TTI::TargetCostKind CostKind) {
231	assert(Ty->isIntegerTy());
232
233	unsigned BitSize = Ty->getPrimitiveSizeInBits();
234	// There is no cost model for constants with a bit size of 0. Return TCC_Free
235	// here, so that constant hoisting will ignore this constant.
236	if (BitSize == `0`)
237	return TTI::TCC_Free;
238	// No cost model for operations on integers larger than 64 bit implemented yet.
239	if (BitSize > `64`)
240	return TTI::TCC_Free;
241
242	switch (IID) {
243	default:
244	return TTI::TCC_Free;
245	case Intrinsic::sadd_with_overflow:
246	case Intrinsic::uadd_with_overflow:
247	case Intrinsic::ssub_with_overflow:
248	case Intrinsic::usub_with_overflow:
249	// These get expanded to include a normal addition/subtraction.
250	if (Idx == `1` && Imm.getBitWidth() <= `64`) {
251	if (isUInt<`32`>(x: Imm.getZExtValue()))
252	return TTI::TCC_Free;
253	if (isUInt<`32`>(x: -Imm.getSExtValue()))
254	return TTI::TCC_Free;
255	}
256	break;
257	case Intrinsic::smul_with_overflow:
258	case Intrinsic::umul_with_overflow:
259	// These get expanded to include a normal multiplication.
260	if (Idx == `1` && Imm.getBitWidth() <= `64`) {
261	if (isInt<`32`>(x: Imm.getSExtValue()))
262	return TTI::TCC_Free;
263	}
264	break;
265	case Intrinsic::experimental_stackmap:
266	if ((Idx < `2`) \|\| (Imm.getBitWidth() <= `64` && isInt<`64`>(x: Imm.getSExtValue())))
267	return TTI::TCC_Free;
268	break;
269	case Intrinsic::experimental_patchpoint_void:
270	case Intrinsic::experimental_patchpoint:
271	if ((Idx < `4`) \|\| (Imm.getBitWidth() <= `64` && isInt<`64`>(x: Imm.getSExtValue())))
272	return TTI::TCC_Free;
273	break;
274	}
275	return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
276	}
277
278	TargetTransformInfo::PopcntSupportKind
279	SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
280	assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
281	if (ST->hasPopulationCount() && TyWidth <= `64`)
282	return TTI::PSK_FastHardware;
283	return TTI::PSK_Software;
284	}
285
286	void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
287	TTI::UnrollingPreferences &UP,
288	OptimizationRemarkEmitter *ORE) {
289	// Find out if L contains a call, what the machine instruction count
290	// estimate is, and how many stores there are.
291	bool HasCall = false;
292	InstructionCost NumStores = `0`;
293	for (auto &BB : L->blocks())
294	for (auto &I : *BB) {
295	if (isa<CallInst>(Val: &I) \|\| isa<InvokeInst>(Val: &I)) {
296	if (const Function *F = cast<CallBase>(Val&: I).getCalledFunction()) {
297	if (isLoweredToCall(F))
298	HasCall = true;
299	if (F->getIntrinsicID() == Intrinsic::memcpy \|\|
300	F->getIntrinsicID() == Intrinsic::memset)
301	NumStores ++;
302	} else { // indirect call.
303	HasCall = true;
304	}
305	}
306	if (isa<StoreInst>(Val: &I)) {
307	Type *MemAccessTy = I.getOperand(i: `0`)->getType();
308	NumStores += getMemoryOpCost(Opcode: Instruction::Store, Src: MemAccessTy,
309	Alignment: std::nullopt, AddressSpace: `0`, CostKind: TTI::TCK_RecipThroughput);
310	}
311	}
312
313	// The z13 processor will run out of store tags if too many stores
314	// are fed into it too quickly. Therefore make sure there are not
315	// too many stores in the resulting unrolled loop.
316	unsigned const NumStoresVal = *NumStores.getValue();
317	unsigned const Max = (NumStoresVal ? (`12` / NumStoresVal) : UINT_MAX);
318
319	if (HasCall) {
320	// Only allow full unrolling if loop has any calls.
321	UP.FullUnrollMaxCount = Max;
322	UP.MaxCount = `1`;
323	return;
324	}
325
326	UP.MaxCount = Max;
327	if (UP.MaxCount <= `1`)
328	return;
329
330	// Allow partial and runtime trip count unrolling.
331	UP.Partial = UP.Runtime = true;
332
333	UP.PartialThreshold = `75`;
334	UP.DefaultUnrollRuntimeCount = `4`;
335
336	// Allow expensive instructions in the pre-header of the loop.
337	UP.AllowExpensiveTripCount = true;
338
339	UP.Force = true;
340	}
341
342	void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
343	TTI::PeelingPreferences &PP) {
344	BaseT::getPeelingPreferences(L, SE, PP);
345	}
346
347	bool SystemZTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
348	const TargetTransformInfo::LSRCost &C2) {
349	// SystemZ specific: check instruction count (first), and don't care about
350	// ImmCost, since offsets are checked explicitly.
351	return std::tie(args: C1.Insns, args: C1.NumRegs, args: C1.AddRecCost,
352	args: C1.NumIVMuls, args: C1.NumBaseAdds,
353	args: C1.ScaleCost, args: C1.SetupCost) <
354	std::tie(args: C2.Insns, args: C2.NumRegs, args: C2.AddRecCost,
355	args: C2.NumIVMuls, args: C2.NumBaseAdds,
356	args: C2.ScaleCost, args: C2.SetupCost);
357	}
358
359	unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
360	bool Vector = (ClassID == `1`);
361	if (!Vector)
362	// Discount the stack pointer. Also leave out %r0, since it can't
363	// be used in an address.
364	return `14`;
365	if (ST->hasVector())
366	return `32`;
367	return `0`;
368	}
369
370	TypeSize
371	SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
372	switch (K) {
373	case TargetTransformInfo::RGK_Scalar:
374	return TypeSize::getFixed(ExactSize: `64`);
375	case TargetTransformInfo::RGK_FixedWidthVector:
376	return TypeSize::getFixed(ExactSize: ST->hasVector() ? `128` : `0`);
377	case TargetTransformInfo::RGK_ScalableVector:
378	return TypeSize::getScalable(MinimumSize: `0`);
379	}
380
381	llvm_unreachable("Unsupported register kind");
382	}
383
384	unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
385	unsigned NumStridedMemAccesses,
386	unsigned NumPrefetches,
387	bool HasCall) const {
388	// Don't prefetch a loop with many far apart accesses.
389	if (NumPrefetches > `16`)
390	return UINT_MAX;
391
392	// Emit prefetch instructions for smaller strides in cases where we think
393	// the hardware prefetcher might not be able to keep up.
394	if (NumStridedMemAccesses > `32` && !HasCall &&
395	(NumMemAccesses - NumStridedMemAccesses) * `32` <= NumStridedMemAccesses)
396	return `1`;
397
398	return ST->hasMiscellaneousExtensions3() ? `8192` : `2048`;
399	}
400
401	bool SystemZTTIImpl::hasDivRemOp(Type DataType, bool* IsSigned) {
402	EVT VT = TLI->getValueType(DL, Ty: DataType);
403	return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
404	}
405
406	// Return the bit size for the scalar type or vector element
407	// type. getScalarSizeInBits() returns 0 for a pointer type.
408	static unsigned getScalarSizeInBits(Type *Ty) {
409	unsigned Size =
410	(Ty->isPtrOrPtrVectorTy() ? `64U` : Ty->getScalarSizeInBits());
411	assert(Size > `0` && "Element must have non-zero size.");
412	return Size;
413	}
414
415	// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
416	// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
417	// 3.
418	static unsigned getNumVectorRegs(Type *Ty) {
419	auto *VTy = cast<FixedVectorType>(Val: Ty);
420	unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
421	assert(WideBits > `0` && "Could not compute size of vector");
422	return ((WideBits % `128U`) ? ((WideBits / `128U`) + `1`) : (WideBits / `128U`));
423	}
424
425	InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
426	unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
427	TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
428	ArrayRef<const Value *> Args,
429	const Instruction *CxtI) {
430
431	// TODO: Handle more cost kinds.
432	if (CostKind != TTI::TCK_RecipThroughput)
433	return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info,
434	Opd2Info: Op2Info, Args, CxtI);
435
436	// TODO: return a good value for BB-VECTORIZER that includes the
437	// immediate loads, which we do not want to count for the loop
438	// vectorizer, since they are hopefully hoisted out of the loop. This
439	// would require a new parameter 'InLoop', but not sure if constant
440	// args are common enough to motivate this.
441
442	unsigned ScalarBits = Ty->getScalarSizeInBits();
443
444	// There are thre cases of division and remainder: Dividing with a register
445	// needs a divide instruction. A divisor which is a power of two constant
446	// can be implemented with a sequence of shifts. Any other constant needs a
447	// multiply and shifts.
448	const unsigned DivInstrCost = `20`;
449	const unsigned DivMulSeqCost = `10`;
450	const unsigned SDivPow2Cost = `4`;
451
452	bool SignedDivRem =
453	Opcode == Instruction::SDiv \|\| Opcode == Instruction::SRem;
454	bool UnsignedDivRem =
455	Opcode == Instruction::UDiv \|\| Opcode == Instruction::URem;
456
457	// Check for a constant divisor.
458	bool DivRemConst = false;
459	bool DivRemConstPow2 = false;
460	if ((SignedDivRem \|\| UnsignedDivRem) && Args.size() == `2`) {
461	if (const Constant *C = dyn_cast<Constant>(Val: Args [`1`])) {
462	const ConstantInt *CVal =
463	(C->getType()->isVectorTy()
464	? dyn_cast_or_null<const ConstantInt>(Val: C->getSplatValue())
465	: dyn_cast<const ConstantInt>(Val: C));
466	if (CVal && (CVal->getValue().isPowerOf2() \|\|
467	CVal->getValue().isNegatedPowerOf2()))
468	DivRemConstPow2 = true;
469	else
470	DivRemConst = true;
471	}
472	}
473
474	if (!Ty->isVectorTy()) {
475	// These FP operations are supported with a dedicated instruction for
476	// float, double and fp128 (base implementation assumes float generally
477	// costs 2).
478	if (Opcode == Instruction::FAdd \|\| Opcode == Instruction::FSub \|\|
479	Opcode == Instruction::FMul \|\| Opcode == Instruction::FDiv)
480	return `1`;
481
482	// There is no native support for FRem.
483	if (Opcode == Instruction::FRem)
484	return LIBCALL_COST;
485
486	// Give discount for some combined logical operations if supported.
487	if (Args.size() == `2`) {
488	if (Opcode == Instruction::Xor) {
489	for (const Value *A : Args) {
490	if (const Instruction *I = dyn_cast<Instruction>(Val: A))
491	if (I->hasOneUse() &&
492	(I->getOpcode() == Instruction::Or \|\|
493	I->getOpcode() == Instruction::And \|\|
494	I->getOpcode() == Instruction::Xor))
495	if ((ScalarBits <= `64` && ST->hasMiscellaneousExtensions3()) \|\|
496	(isInt128InVR(Ty) &&
497	(I->getOpcode() == Instruction::Or \|\| ST->hasVectorEnhancements1())))
498	return `0`;
499	}
500	}
501	else if (Opcode == Instruction::And \|\| Opcode == Instruction::Or) {
502	for (const Value *A : Args) {
503	if (const Instruction *I = dyn_cast<Instruction>(Val: A))
504	if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
505	((ScalarBits <= `64` && ST->hasMiscellaneousExtensions3()) \|\|
506	(isInt128InVR(Ty) &&
507	(Opcode == Instruction::And \|\| ST->hasVectorEnhancements1()))))
508	return `0`;
509	}
510	}
511	}
512
513	// Or requires one instruction, although it has custom handling for i64.
514	if (Opcode == Instruction::Or)
515	return `1`;
516
517	if (Opcode == Instruction::Xor && ScalarBits == `1`) {
518	if (ST->hasLoadStoreOnCond2())
519	return `5`; // 2 (li 0; loc 1); xor*
520	return `7`; // 2 ipm sequences ; xor ; shift ; compare*
521	}
522
523	if (DivRemConstPow2)
524	return (SignedDivRem ? SDivPow2Cost : `1`);
525	if (DivRemConst)
526	return DivMulSeqCost;
527	if (SignedDivRem \|\| UnsignedDivRem)
528	return DivInstrCost;
529	}
530	else if (ST->hasVector()) {
531	auto *VTy = cast<FixedVectorType>(Val: Ty);
532	unsigned VF = VTy->getNumElements();
533	unsigned NumVectors = getNumVectorRegs(Ty);
534
535	// These vector operations are custom handled, but are still supported
536	// with one instruction per vector, regardless of element size.
537	if (Opcode == Instruction::Shl \|\| Opcode == Instruction::LShr \|\|
538	Opcode == Instruction::AShr) {
539	return NumVectors;
540	}
541
542	if (DivRemConstPow2)
543	return (NumVectors * (SignedDivRem ? SDivPow2Cost : `1`));
544	if (DivRemConst) {
545	SmallVector<Type *> Tys(Args.size(), Ty);
546	return VF * DivMulSeqCost +
547	getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind);
548	}
549	if ((SignedDivRem \|\| UnsignedDivRem) && VF > `4`)
550	// Temporary hack: disable high vectorization factors with integer
551	// division/remainder, which will get scalarized and handled with
552	// GR128 registers. The mischeduler is not clever enough to avoid
553	// spilling yet.
554	return `1000`;
555
556	// These FP operations are supported with a single vector instruction for
557	// double (base implementation assumes float generally costs 2). For
558	// FP128, the scalar cost is 1, and there is no overhead since the values
559	// are already in scalar registers.
560	if (Opcode == Instruction::FAdd \|\| Opcode == Instruction::FSub \|\|
561	Opcode == Instruction::FMul \|\| Opcode == Instruction::FDiv) {
562	switch (ScalarBits) {
563	case `32`: {
564	// The vector enhancements facility 1 provides v4f32 instructions.
565	if (ST->hasVectorEnhancements1())
566	return NumVectors;
567	// Return the cost of multiple scalar invocation plus the cost of
568	// inserting and extracting the values.
569	InstructionCost ScalarCost =
570	getArithmeticInstrCost(Opcode, Ty: Ty->getScalarType(), CostKind);
571	SmallVector<Type *> Tys(Args.size(), Ty);
572	InstructionCost Cost =
573	(VF * ScalarCost) +
574	getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind);
575	// FIXME: VF 2 for these FP operations are currently just as
576	// expensive as for VF 4.
577	if (VF == `2`)
578	Cost *= `2`;
579	return Cost;
580	}
581	case `64`:
582	case `128`:
583	return NumVectors;
584	default:
585	break;
586	}
587	}
588
589	// There is no native support for FRem.
590	if (Opcode == Instruction::FRem) {
591	SmallVector<Type *> Tys(Args.size(), Ty);
592	InstructionCost Cost = (VF * LIBCALL_COST) +
593	getScalarizationOverhead(RetTy: VTy, Args, Tys, CostKind);
594	// FIXME: VF 2 for float is currently just as expensive as for VF 4.
595	if (VF == `2` && ScalarBits == `32`)
596	Cost *= `2`;
597	return Cost;
598	}
599	}
600
601	// Fallback to the default implementation.
602	return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info,
603	Args, CxtI);
604	}
605
606	InstructionCost SystemZTTIImpl::getShuffleCost(
607	TTI::ShuffleKind Kind, VectorType Tp, ArrayRef<int*> Mask,
608	TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
609	ArrayRef<const Value > Args, const* Instruction *CxtI) {
610	Kind = improveShuffleKindFromMask(Kind, Mask, Ty: Tp, Index, SubTy&: SubTp);
611	if (ST->hasVector()) {
612	unsigned NumVectors = getNumVectorRegs(Ty: Tp);
613
614	// TODO: Since fp32 is expanded, the shuffle cost should always be 0.
615
616	// FP128 values are always in scalar registers, so there is no work
617	// involved with a shuffle, except for broadcast. In that case register
618	// moves are done with a single instruction per element.
619	if (Tp->getScalarType()->isFP128Ty())
620	return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - `1` : `0`);
621
622	switch (Kind) {
623	case TargetTransformInfo::SK_ExtractSubvector:
624	// ExtractSubvector Index indicates start offset.
625
626	// Extracting a subvector from first index is a noop.
627	return (Index == `0` ? `0` : NumVectors);
628
629	case TargetTransformInfo::SK_Broadcast:
630	// Loop vectorizer calls here to figure out the extra cost of
631	// broadcasting a loaded value to all elements of a vector. Since vlrep
632	// loads and replicates with a single instruction, adjust the returned
633	// value.
634	return NumVectors - `1`;
635
636	default:
637
638	// SystemZ supports single instruction permutation / replication.
639	return NumVectors;
640	}
641	}
642
643	return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
644	}
645
646	// Return the log2 difference of the element sizes of the two vector types.
647	static unsigned getElSizeLog2Diff(Type Ty0, Type Ty1) {
648	unsigned Bits0 = Ty0->getScalarSizeInBits();
649	unsigned Bits1 = Ty1->getScalarSizeInBits();
650
651	if (Bits1 > Bits0)
652	return (Log2_32(Value: Bits1) - Log2_32(Value: Bits0));
653
654	return (Log2_32(Value: Bits0) - Log2_32(Value: Bits1));
655	}
656
657	// Return the number of instructions needed to truncate SrcTy to DstTy.
658	unsigned SystemZTTIImpl::
659	getVectorTruncCost(Type SrcTy, Type DstTy) {
660	assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
661	assert(SrcTy->getPrimitiveSizeInBits().getFixedValue() >
662	DstTy->getPrimitiveSizeInBits().getFixedValue() &&
663	"Packing must reduce size of vector type.");
664	assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
665	cast<FixedVectorType>(DstTy)->getNumElements() &&
666	"Packing should not change number of elements.");
667
668	// TODO: Since fp32 is expanded, the extract cost should always be 0.
669
670	unsigned NumParts = getNumVectorRegs(Ty: SrcTy);
671	if (NumParts <= `2`)
672	// Up to 2 vector registers can be truncated efficiently with pack or
673	// permute. The latter requires an immediate mask to be loaded, which
674	// typically gets hoisted out of a loop. TODO: return a good value for
675	// BB-VECTORIZER that includes the immediate loads, which we do not want
676	// to count for the loop vectorizer.
677	return `1`;
678
679	unsigned Cost = `0`;
680	unsigned Log2Diff = getElSizeLog2Diff(Ty0: SrcTy, Ty1: DstTy);
681	unsigned VF = cast<FixedVectorType>(Val: SrcTy)->getNumElements();
682	for (unsigned P = `0`; P < Log2Diff; ++P) {
683	if (NumParts > `1`)
684	NumParts /= `2`;
685	Cost += NumParts;
686	}
687
688	// Currently, a general mix of permutes and pack instructions is output by
689	// isel, which follow the cost computation above except for this case which
690	// is one instruction less:
691	if (VF == `8` && SrcTy->getScalarSizeInBits() == `64` &&
692	DstTy->getScalarSizeInBits() == `8`)
693	Cost--;
694
695	return Cost;
696	}
697
698	// Return the cost of converting a vector bitmask produced by a compare
699	// (SrcTy), to the type of the select or extend instruction (DstTy).
700	unsigned SystemZTTIImpl::
701	getVectorBitmaskConversionCost(Type SrcTy, Type DstTy) {
702	assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
703	"Should only be called with vector types.");
704
705	unsigned PackCost = `0`;
706	unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
707	unsigned DstScalarBits = DstTy->getScalarSizeInBits();
708	unsigned Log2Diff = getElSizeLog2Diff(Ty0: SrcTy, Ty1: DstTy);
709	if (SrcScalarBits > DstScalarBits)
710	// The bitmask will be truncated.
711	PackCost = getVectorTruncCost(SrcTy, DstTy);
712	else if (SrcScalarBits < DstScalarBits) {
713	unsigned DstNumParts = getNumVectorRegs(Ty: DstTy);
714	// Each vector select needs its part of the bitmask unpacked.
715	PackCost = Log2Diff * DstNumParts;
716	// Extra cost for moving part of mask before unpacking.
717	PackCost += DstNumParts - `1`;
718	}
719
720	return PackCost;
721	}
722
723	// Return the type of the compared operands. This is needed to compute the
724	// cost for a Select / ZExt or SExt instruction.
725	static Type getCmpOpsType(const* Instruction I, unsigned* VF = `1`) {
726	Type OpTy = nullptr*;
727	if (CmpInst *CI = dyn_cast<CmpInst>(Val: I->getOperand(i: `0`)))
728	OpTy = CI->getOperand(i_nocapture: `0`)->getType();
729	else if (Instruction *LogicI = dyn_cast<Instruction>(Val: I->getOperand(i: `0`)))
730	if (LogicI->getNumOperands() == `2`)
731	if (CmpInst *CI0 = dyn_cast<CmpInst>(Val: LogicI->getOperand(i: `0`)))
732	if (isa<CmpInst>(Val: LogicI->getOperand(i: `1`)))
733	OpTy = CI0->getOperand(i_nocapture: `0`)->getType();
734
735	if (OpTy != nullptr) {
736	if (VF == `1`) {
737	assert (!OpTy->isVectorTy() && "Expected scalar type");
738	return OpTy;
739	}
740	// Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
741	// be either scalar or already vectorized with a same or lesser VF.
742	Type *ElTy = OpTy->getScalarType();
743	return FixedVectorType::get(ElementType: ElTy, NumElts: VF);
744	}
745
746	return nullptr;
747	}
748
749	// Get the cost of converting a boolean vector to a vector with same width
750	// and element size as Dst, plus the cost of zero extending if needed.
751	unsigned SystemZTTIImpl::
752	getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
753	const Instruction *I) {
754	auto *DstVTy = cast<FixedVectorType>(Val: Dst);
755	unsigned VF = DstVTy->getNumElements();
756	unsigned Cost = `0`;
757	// If we know what the widths of the compared operands, get any cost of
758	// converting it to match Dst. Otherwise assume same widths.
759	Type CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr*);
760	if (CmpOpTy != nullptr)
761	Cost = getVectorBitmaskConversionCost(SrcTy: CmpOpTy, DstTy: Dst);
762	if (Opcode == Instruction::ZExt \|\| Opcode == Instruction::UIToFP)
763	// One 'vn' per dst vector with an immediate mask.
764	Cost += getNumVectorRegs(Ty: Dst);
765	return Cost;
766	}
767
768	InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
769	Type *Src,
770	TTI::CastContextHint CCH,
771	TTI::TargetCostKind CostKind,
772	const Instruction *I) {
773	// FIXME: Can the logic below also be used for these cost kinds?
774	if (CostKind == TTI::TCK_CodeSize \|\| CostKind == TTI::TCK_SizeAndLatency) {
775	auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
776	return BaseCost == `0` ? BaseCost : `1`;
777	}
778
779	unsigned DstScalarBits = Dst->getScalarSizeInBits();
780	unsigned SrcScalarBits = Src->getScalarSizeInBits();
781
782	if (!Src->isVectorTy()) {
783	assert (!Dst->isVectorTy());
784
785	if (Opcode == Instruction::SIToFP \|\| Opcode == Instruction::UIToFP) {
786	if (Src->isIntegerTy(Bitwidth: `128`))
787	return LIBCALL_COST;
788	if (SrcScalarBits >= `32` \|\|
789	(I != nullptr && isa<LoadInst>(Val: I->getOperand(i: `0`))))
790	return `1`;
791	return SrcScalarBits > `1` ? `2` /i8/i16 extend/ : `5` /branch seq./;
792	}
793
794	if ((Opcode == Instruction::FPToSI \|\| Opcode == Instruction::FPToUI) &&
795	Dst->isIntegerTy(Bitwidth: `128`))
796	return LIBCALL_COST;
797
798	if ((Opcode == Instruction::ZExt \|\| Opcode == Instruction::SExt)) {
799	if (Src->isIntegerTy(Bitwidth: `1`)) {
800	if (DstScalarBits == `128`)
801	return `5` /branch seq./;
802
803	if (ST->hasLoadStoreOnCond2())
804	return `2`; // li 0; loc 1
805
806	// This should be extension of a compare i1 result, which is done with
807	// ipm and a varying sequence of instructions.
808	unsigned Cost = `0`;
809	if (Opcode == Instruction::SExt)
810	Cost = (DstScalarBits < `64` ? `3` : `4`);
811	if (Opcode == Instruction::ZExt)
812	Cost = `3`;
813	Type CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr*);
814	if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
815	// If operands of an fp-type was compared, this costs +1.
816	Cost++;
817	return Cost;
818	}
819	else if (isInt128InVR(Ty: Dst)) {
820	// Extensions from GPR to i128 (in VR) typically costs two instructions,
821	// but a zero-extending load would be just one extra instruction.
822	if (Opcode == Instruction::ZExt && I != nullptr)
823	if (LoadInst *Ld = dyn_cast<LoadInst>(Val: I->getOperand(i: `0`)))
824	if (Ld->hasOneUse())
825	return `1`;
826	return `2`;
827	}
828	}
829
830	if (Opcode == Instruction::Trunc && isInt128InVR(Ty: Src) && I != nullptr) {
831	if (LoadInst *Ld = dyn_cast<LoadInst>(Val: I->getOperand(i: `0`)))
832	if (Ld->hasOneUse())
833	return `0`; // Will be converted to GPR load.
834	bool OnlyTruncatingStores = true;
835	for (const User *U : I->users())
836	if (!isa<StoreInst>(Val: U)) {
837	OnlyTruncatingStores = false;
838	break;
839	}
840	if (OnlyTruncatingStores)
841	return `0`;
842	return `2`; // Vector element extraction.
843	}
844	}
845	else if (ST->hasVector()) {
846	// Vector to scalar cast.
847	auto *SrcVecTy = cast<FixedVectorType>(Val: Src);
848	auto *DstVecTy = dyn_cast<FixedVectorType>(Val: Dst);
849	if (!DstVecTy) {
850	// TODO: tune vector-to-scalar cast.
851	return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
852	}
853	unsigned VF = SrcVecTy->getNumElements();
854	unsigned NumDstVectors = getNumVectorRegs(Ty: Dst);
855	unsigned NumSrcVectors = getNumVectorRegs(Ty: Src);
856
857	if (Opcode == Instruction::Trunc) {
858	if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
859	return `0`; // Check for NOOP conversions.
860	return getVectorTruncCost(SrcTy: Src, DstTy: Dst);
861	}
862
863	if (Opcode == Instruction::ZExt \|\| Opcode == Instruction::SExt) {
864	if (SrcScalarBits >= `8`) {
865	// ZExt will use either a single unpack or a vector permute.
866	if (Opcode == Instruction::ZExt)
867	return NumDstVectors;
868
869	// SExt will be handled with one unpack per doubling of width.
870	unsigned NumUnpacks = getElSizeLog2Diff(Ty0: Src, Ty1: Dst);
871
872	// For types that spans multiple vector registers, some additional
873	// instructions are used to setup the unpacking.
874	unsigned NumSrcVectorOps =
875	(NumUnpacks > `1` ? (NumDstVectors - NumSrcVectors)
876	: (NumDstVectors / `2`));
877
878	return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
879	}
880	else if (SrcScalarBits == `1`)
881	return getBoolVecToIntConversionCost(Opcode, Dst, I);
882	}
883
884	if (Opcode == Instruction::SIToFP \|\| Opcode == Instruction::UIToFP \|\|
885	Opcode == Instruction::FPToSI \|\| Opcode == Instruction::FPToUI) {
886	// TODO: Fix base implementation which could simplify things a bit here
887	// (seems to miss on differentiating on scalar/vector types).
888
889	// Only 64 bit vector conversions are natively supported before z15.
890	if (DstScalarBits == `64` \|\| ST->hasVectorEnhancements2()) {
891	if (SrcScalarBits == DstScalarBits)
892	return NumDstVectors;
893
894	if (SrcScalarBits == `1`)
895	return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
896	}
897
898	// Return the cost of multiple scalar invocation plus the cost of
899	// inserting and extracting the values. Base implementation does not
900	// realize float->int gets scalarized.
901	InstructionCost ScalarCost = getCastInstrCost(
902	Opcode, Dst: Dst->getScalarType(), Src: Src->getScalarType(), CCH, CostKind);
903	InstructionCost TotCost = VF * ScalarCost;
904	bool NeedsInserts = true, NeedsExtracts = true;
905	// FP128 registers do not get inserted or extracted.
906	if (DstScalarBits == `128` &&
907	(Opcode == Instruction::SIToFP \|\| Opcode == Instruction::UIToFP))
908	NeedsInserts = false;
909	if (SrcScalarBits == `128` &&
910	(Opcode == Instruction::FPToSI \|\| Opcode == Instruction::FPToUI))
911	NeedsExtracts = false;
912
913	TotCost += getScalarizationOverhead(InTy: SrcVecTy, /Insert/ false,
914	Extract: NeedsExtracts, CostKind);
915	TotCost += getScalarizationOverhead(InTy: DstVecTy, Insert: NeedsInserts,
916	/Extract/ false, CostKind);
917
918	// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
919	if (VF == `2` && SrcScalarBits == `32` && DstScalarBits == `32`)
920	TotCost *= `2`;
921
922	return TotCost;
923	}
924
925	if (Opcode == Instruction::FPTrunc) {
926	if (SrcScalarBits == `128`) // fp128 -> double/float + inserts of elements.
927	return VF /ldxbr/lexbr/ +
928	getScalarizationOverhead(InTy: DstVecTy, /Insert/ true,
929	/Extract/ false, CostKind);
930	else // double -> float
931	return VF / `2` /vledb/ + std::max(a: `1U`, b: VF / `4` /vperm/);
932	}
933
934	if (Opcode == Instruction::FPExt) {
935	if (SrcScalarBits == `32` && DstScalarBits == `64`) {
936	// float -> double is very rare and currently unoptimized. Instead of
937	// using vldeb, which can do two at a time, all conversions are
938	// scalarized.
939	return VF * `2`;
940	}
941	// -> fp128. VF lxdb/lxeb + extraction of elements.*
942	return VF + getScalarizationOverhead(InTy: SrcVecTy, /Insert/ false,
943	/Extract/ true, CostKind);
944	}
945	}
946
947	return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
948	}
949
950	// Scalar i8 / i16 operations will typically be made after first extending
951	// the operands to i32.
952	static unsigned getOperandsExtensionCost(const Instruction *I) {
953	unsigned ExtCost = `0`;
954	for (Value *Op : I->operands())
955	// A load of i8 or i16 sign/zero extends to i32.
956	if (!isa<LoadInst>(Val: Op) && !isa<ConstantInt>(Val: Op))
957	ExtCost++;
958
959	return ExtCost;
960	}
961
962	InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
963	Type *CondTy,
964	CmpInst::Predicate VecPred,
965	TTI::TargetCostKind CostKind,
966	const Instruction *I) {
967	if (CostKind != TTI::TCK_RecipThroughput)
968	return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
969
970	if (!ValTy->isVectorTy()) {
971	switch (Opcode) {
972	case Instruction::ICmp: {
973	// A loaded value compared with 0 with multiple users becomes Load and
974	// Test. The load is then not foldable, so return 0 cost for the ICmp.
975	unsigned ScalarBits = ValTy->getScalarSizeInBits();
976	if (I != nullptr && (ScalarBits == `32` \|\| ScalarBits == `64`))
977	if (LoadInst *Ld = dyn_cast<LoadInst>(Val: I->getOperand(i: `0`)))
978	if (const ConstantInt *C = dyn_cast<ConstantInt>(Val: I->getOperand(i: `1`)))
979	if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
980	C->isZero())
981	return `0`;
982
983	unsigned Cost = `1`;
984	if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= `16`)
985	Cost += (I != nullptr ? getOperandsExtensionCost(I) : `2`);
986	return Cost;
987	}
988	case Instruction::Select:
989	if (ValTy->isFloatingPointTy() \|\| isInt128InVR(Ty: ValTy))
990	return `4`; // No LOC for FP / i128 - costs a conditional jump.
991	return `1`; // Load On Condition / Select Register.
992	}
993	}
994	else if (ST->hasVector()) {
995	unsigned VF = cast<FixedVectorType>(Val: ValTy)->getNumElements();
996
997	// Called with a compare instruction.
998	if (Opcode == Instruction::ICmp \|\| Opcode == Instruction::FCmp) {
999	unsigned PredicateExtraCost = `0`;
1000	if (I != nullptr) {
1001	// Some predicates cost one or two extra instructions.
1002	switch (cast<CmpInst>(Val: I)->getPredicate()) {
1003	case CmpInst::Predicate::ICMP_NE:
1004	case CmpInst::Predicate::ICMP_UGE:
1005	case CmpInst::Predicate::ICMP_ULE:
1006	case CmpInst::Predicate::ICMP_SGE:
1007	case CmpInst::Predicate::ICMP_SLE:
1008	PredicateExtraCost = `1`;
1009	break;
1010	case CmpInst::Predicate::FCMP_ONE:
1011	case CmpInst::Predicate::FCMP_ORD:
1012	case CmpInst::Predicate::FCMP_UEQ:
1013	case CmpInst::Predicate::FCMP_UNO:
1014	PredicateExtraCost = `2`;
1015	break;
1016	default:
1017	break;
1018	}
1019	}
1020
1021	// Float is handled with 2vmr[lh]f + 2vldeb + vfchdb for each pair of
1022	// floats. FIXME: <2 x float> generates same code as <4 x float>.
1023	unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? `10` : `1`);
1024	unsigned NumVecs_cmp = getNumVectorRegs(Ty: ValTy);
1025
1026	unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
1027	return Cost;
1028	}
1029	else { // Called with a select instruction.
1030	assert (Opcode == Instruction::Select);
1031
1032	// We can figure out the extra cost of packing / unpacking if the
1033	// instruction was passed and the compare instruction is found.
1034	unsigned PackCost = `0`;
1035	Type CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr*);
1036	if (CmpOpTy != nullptr)
1037	PackCost =
1038	getVectorBitmaskConversionCost(SrcTy: CmpOpTy, DstTy: ValTy);
1039
1040	return getNumVectorRegs(Ty: ValTy) /vsel/ + PackCost;
1041	}
1042	}
1043
1044	return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind);
1045	}
1046
1047	InstructionCost SystemZTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
1048	TTI::TargetCostKind CostKind,
1049	unsigned Index, Value *Op0,
1050	Value *Op1) {
1051	// vlvgp will insert two grs into a vector register, so only count half the
1052	// number of instructions.
1053	if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(BitWidth: `64`))
1054	return ((Index % `2` == `0`) ? `1` : `0`);
1055
1056	if (Opcode == Instruction::ExtractElement) {
1057	int Cost = ((getScalarSizeInBits(Ty: Val) == `1`) ? `2` /+test-under-mask/ : `1`);
1058
1059	// Give a slight penalty for moving out of vector pipeline to FXU unit.
1060	if (Index == `0` && Val->isIntOrIntVectorTy())
1061	Cost += `1`;
1062
1063	return Cost;
1064	}
1065
1066	return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1067	}
1068
1069	// Check if a load may be folded as a memory operand in its user.
1070	bool SystemZTTIImpl::
1071	isFoldableLoad(const LoadInst Ld, const* Instruction *&FoldedValue) {
1072	if (!Ld->hasOneUse())
1073	return false;
1074	FoldedValue = Ld;
1075	const Instruction UserI = cast<Instruction>(Val: Ld->user_begin());
1076	unsigned LoadedBits = getScalarSizeInBits(Ty: Ld->getType());
1077	unsigned TruncBits = `0`;
1078	unsigned SExtBits = `0`;
1079	unsigned ZExtBits = `0`;
1080	if (UserI->hasOneUse()) {
1081	unsigned UserBits = UserI->getType()->getScalarSizeInBits();
1082	if (isa<TruncInst>(Val: UserI))
1083	TruncBits = UserBits;
1084	else if (isa<SExtInst>(Val: UserI))
1085	SExtBits = UserBits;
1086	else if (isa<ZExtInst>(Val: UserI))
1087	ZExtBits = UserBits;
1088	}
1089	if (TruncBits \|\| SExtBits \|\| ZExtBits) {
1090	FoldedValue = UserI;
1091	UserI = cast<Instruction>(Val: *UserI->user_begin());
1092	// Load (single use) -> trunc/extend (single use) -> UserI
1093	}
1094	if ((UserI->getOpcode() == Instruction::Sub \|\|
1095	UserI->getOpcode() == Instruction::SDiv \|\|
1096	UserI->getOpcode() == Instruction::UDiv) &&
1097	UserI->getOperand(i: `1`) != FoldedValue)
1098	return false; // Not commutative, only RHS foldable.
1099	// LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
1100	// extension was made of the load.
1101	unsigned LoadOrTruncBits =
1102	((SExtBits \|\| ZExtBits) ? `0` : (TruncBits ? TruncBits : LoadedBits));
1103	switch (UserI->getOpcode()) {
1104	case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
1105	case Instruction::Sub:
1106	case Instruction::ICmp:
1107	if (LoadedBits == `32` && ZExtBits == `64`)
1108	return true;
1109	[[fallthrough]];
1110	case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
1111	if (UserI->getOpcode() != Instruction::ICmp) {
1112	if (LoadedBits == `16` &&
1113	(SExtBits == `32` \|\|
1114	(SExtBits == `64` && ST->hasMiscellaneousExtensions2())))
1115	return true;
1116	if (LoadOrTruncBits == `16`)
1117	return true;
1118	}
1119	[[fallthrough]];
1120	case Instruction::SDiv:// SE: 32->64
1121	if (LoadedBits == `32` && SExtBits == `64`)
1122	return true;
1123	[[fallthrough]];
1124	case Instruction::UDiv:
1125	case Instruction::And:
1126	case Instruction::Or:
1127	case Instruction::Xor:
1128	// This also makes sense for float operations, but disabled for now due
1129	// to regressions.
1130	// case Instruction::FCmp:
1131	// case Instruction::FAdd:
1132	// case Instruction::FSub:
1133	// case Instruction::FMul:
1134	// case Instruction::FDiv:
1135
1136	// All possible extensions of memory checked above.
1137
1138	// Comparison between memory and immediate.
1139	if (UserI->getOpcode() == Instruction::ICmp)
1140	if (ConstantInt *CI = dyn_cast<ConstantInt>(Val: UserI->getOperand(i: `1`)))
1141	if (CI->getValue().isIntN(N: `16`))
1142	return true;
1143	return (LoadOrTruncBits == `32` \|\| LoadOrTruncBits == `64`);
1144	break;
1145	}
1146	return false;
1147	}
1148
1149	static bool isBswapIntrinsicCall(const Value *V) {
1150	if (const Instruction *I = dyn_cast<Instruction>(Val: V))
1151	if (auto *CI = dyn_cast<CallInst>(Val: I))
1152	if (auto *F = CI->getCalledFunction())
1153	if (F->getIntrinsicID() == Intrinsic::bswap)
1154	return true;
1155	return false;
1156	}
1157
1158	InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1159	MaybeAlign Alignment,
1160	unsigned AddressSpace,
1161	TTI::TargetCostKind CostKind,
1162	TTI::OperandValueInfo OpInfo,
1163	const Instruction *I) {
1164	assert(!Src->isVoidTy() && "Invalid type");
1165
1166	// TODO: Handle other cost kinds.
1167	if (CostKind != TTI::TCK_RecipThroughput)
1168	return `1`;
1169
1170	if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1171	// Store the load or its truncated or extended value in FoldedValue.
1172	const Instruction FoldedValue = nullptr*;
1173	if (isFoldableLoad(Ld: cast<LoadInst>(Val: I), FoldedValue)) {
1174	const Instruction UserI = cast<Instruction>(Val: FoldedValue->user_begin());
1175	assert (UserI->getNumOperands() == `2` && "Expected a binop.");
1176
1177	// UserI can't fold two loads, so in that case return 0 cost only
1178	// half of the time.
1179	for (unsigned i = `0`; i < `2`; ++i) {
1180	if (UserI->getOperand(i) == FoldedValue)
1181	continue;
1182
1183	if (Instruction *OtherOp = dyn_cast<Instruction>(Val: UserI->getOperand(i))){
1184	LoadInst *OtherLoad = dyn_cast<LoadInst>(Val: OtherOp);
1185	if (!OtherLoad &&
1186	(isa<TruncInst>(Val: OtherOp) \|\| isa<SExtInst>(Val: OtherOp) \|\|
1187	isa<ZExtInst>(Val: OtherOp)))
1188	OtherLoad = dyn_cast<LoadInst>(Val: OtherOp->getOperand(i: `0`));
1189	if (OtherLoad && isFoldableLoad(Ld: OtherLoad, FoldedValue/dummy/))
1190	return i == `0`; // Both operands foldable.
1191	}
1192	}
1193
1194	return `0`; // Only I is foldable in user.
1195	}
1196	}
1197
1198	// Type legalization (via getNumberOfParts) can't handle structs
1199	if (TLI->getValueType(DL, Ty: Src, AllowUnknown: true) == MVT::Other)
1200	return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1201	CostKind);
1202
1203	// FP128 is a legal type but kept in a register pair on older CPUs.
1204	if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
1205	return `2`;
1206
1207	unsigned NumOps =
1208	(Src->isVectorTy() ? getNumVectorRegs(Ty: Src) : getNumberOfParts(Tp: Src));
1209
1210	// Store/Load reversed saves one instruction.
1211	if (((!Src->isVectorTy() && NumOps == `1`) \|\| ST->hasVectorEnhancements2()) &&
1212	I != nullptr) {
1213	if (Opcode == Instruction::Load && I->hasOneUse()) {
1214	const Instruction LdUser = cast<Instruction>(Val: I->user_begin());
1215	// In case of load -> bswap -> store, return normal cost for the load.
1216	if (isBswapIntrinsicCall(V: LdUser) &&
1217	(!LdUser->hasOneUse() \|\| !isa<StoreInst>(Val: *LdUser->user_begin())))
1218	return `0`;
1219	}
1220	else if (const StoreInst *SI = dyn_cast<StoreInst>(Val: I)) {
1221	const Value *StoredVal = SI->getValueOperand();
1222	if (StoredVal->hasOneUse() && isBswapIntrinsicCall(V: StoredVal))
1223	return `0`;
1224	}
1225	}
1226
1227	return NumOps;
1228	}
1229
1230	// The generic implementation of getInterleavedMemoryOpCost() is based on
1231	// adding costs of the memory operations plus all the extracts and inserts
1232	// needed for using / defining the vector operands. The SystemZ version does
1233	// roughly the same but bases the computations on vector permutations
1234	// instead.
1235	InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
1236	unsigned Opcode, Type VecTy, unsigned* Factor, ArrayRef<unsigned> Indices,
1237	Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1238	bool UseMaskForCond, bool UseMaskForGaps) {
1239	if (UseMaskForCond \|\| UseMaskForGaps)
1240	return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1241	Alignment, AddressSpace, CostKind,
1242	UseMaskForCond, UseMaskForGaps);
1243	assert(isa<VectorType>(VecTy) &&
1244	"Expect a vector type for interleaved memory op");
1245
1246	unsigned NumElts = cast<FixedVectorType>(Val: VecTy)->getNumElements();
1247	assert(Factor > `1` && NumElts % Factor == `0` && "Invalid interleave factor");
1248	unsigned VF = NumElts / Factor;
1249	unsigned NumEltsPerVecReg = (`128U` / getScalarSizeInBits(Ty: VecTy));
1250	unsigned NumVectorMemOps = getNumVectorRegs(Ty: VecTy);
1251	unsigned NumPermutes = `0`;
1252
1253	if (Opcode == Instruction::Load) {
1254	// Loading interleave groups may have gaps, which may mean fewer
1255	// loads. Find out how many vectors will be loaded in total, and in how
1256	// many of them each value will be in.
1257	BitVector UsedInsts(NumVectorMemOps, false);
1258	std::vector<BitVector> ValueVecs(Factor, BitVector (NumVectorMemOps, false));
1259	for (unsigned Index : Indices)
1260	for (unsigned Elt = `0`; Elt < VF; ++Elt) {
1261	unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1262	UsedInsts.set(Vec);
1263	ValueVecs [Index].set(Vec);
1264	}
1265	NumVectorMemOps = UsedInsts.count();
1266
1267	for (unsigned Index : Indices) {
1268	// Estimate that each loaded source vector containing this Index
1269	// requires one operation, except that vperm can handle two input
1270	// registers first time for each dst vector.
1271	unsigned NumSrcVecs = ValueVecs [Index].count();
1272	unsigned NumDstVecs = divideCeil(Numerator: VF * getScalarSizeInBits(Ty: VecTy), Denominator: `128U`);
1273	assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
1274	NumPermutes += std::max(a: `1U`, b: NumSrcVecs - NumDstVecs);
1275	}
1276	} else {
1277	// Estimate the permutes for each stored vector as the smaller of the
1278	// number of elements and the number of source vectors. Subtract one per
1279	// dst vector for vperm (S.A.).
1280	unsigned NumSrcVecs = std::min(a: NumEltsPerVecReg, b: Factor);
1281	unsigned NumDstVecs = NumVectorMemOps;
1282	NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
1283	}
1284
1285	// Cost of load/store operations and the permutations needed.
1286	return NumVectorMemOps + NumPermutes;
1287	}
1288
1289	static int
1290	getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1291	const SmallVectorImpl<Type *> &ParamTys) {
1292	if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1293	return getNumVectorRegs(Ty: RetTy); // VPERM
1294
1295	if (ID == Intrinsic::vector_reduce_add) {
1296	// Retrieve number and size of elements for the vector op.
1297	auto *VTy = cast<FixedVectorType>(Val: ParamTys.front());
1298	unsigned ScalarSize = VTy->getScalarSizeInBits();
1299	// For scalar sizes >128 bits, we fall back to the generic cost estimate.
1300	if (ScalarSize > SystemZ::VectorBits)
1301	return -`1`;
1302	// This many vector regs are needed to represent the input elements (V).
1303	unsigned VectorRegsNeeded = getNumVectorRegs(Ty: VTy);
1304	// This many instructions are needed for the final sum of vector elems (S).
1305	unsigned LastVectorHandling = (ScalarSize < `32`) ? `3` : `2`;
1306	// We use vector adds to create a sum vector, which takes
1307	// V/2 + V/4 + ... = V - 1 operations.
1308	// Then, we need S operations to sum up the elements of that sum vector,
1309	// for a total of V + S - 1 operations.
1310	int Cost = VectorRegsNeeded + LastVectorHandling - `1`;
1311	return Cost;
1312	}
1313	return -`1`;
1314	}
1315
1316	InstructionCost
1317	SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1318	TTI::TargetCostKind CostKind) {
1319	InstructionCost Cost = getVectorIntrinsicInstrCost(
1320	ID: ICA.getID(), RetTy: ICA.getReturnType(), ParamTys: ICA.getArgTypes());
1321	if (Cost != -`1`)
1322	return Cost;
1323	return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1324	}
1325
1326	bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst II) const* {
1327	// Always expand on Subtargets without vector instructions.
1328	if (!ST->hasVector())
1329	return true;
1330
1331	// Whether or not to expand is a per-intrinsic decision.
1332	switch (II->getIntrinsicID()) {
1333	default:
1334	return true;
1335	// Do not expand vector.reduce.add...
1336	case Intrinsic::vector_reduce_add:
1337	auto *VType = cast<FixedVectorType>(Val: II->getOperand(i_nocapture: `0`)->getType());
1338	// ...unless the scalar size is i64 or larger,
1339	// or the operand vector is not full, since the
1340	// performance benefit is dubious in those cases.
1341	return VType->getScalarSizeInBits() >= `64` \|\|
1342	VType->getPrimitiveSizeInBits() < SystemZ::VectorBits;
1343	}
1344	}
1345

source code of llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp