AMDGPUImageIntrinsicOptimizer.cpp source code [llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp]

1	//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
10	// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
11	//
12	// - they refer to the same vaddr except for sample_id,
13	// - they use a constant sample_id and they fall into the same group,
14	// - they have the same dmask and the number of intrinsics and the number of
15	// vaddr/vdata dword transfers is reduced by the combine.
16	//
17	// Examples for the tradeoff (all are assuming 2DMsaa for vaddr):
18	//
19	// +----------+-----+-----+-------+---------+------------+---------+----------+
20	// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
21	// |  (dmask) |     |     |       | vdata   |            | vdata   |          |
22	// +----------+-----+-----+-------+---------+------------+---------+----------+
23	// |        1 |   0 |   0 |     4 |  12 / 4 |          1 |   3 / 4 | yes      |
24	// +----------+-----+-----+-------+---------+------------+---------+----------+
25	// |        1 |   0 |   0 |     2 |   6 / 2 |          1 |   3 / 4 | yes?     |
26	// +----------+-----+-----+-------+---------+------------+---------+----------+
27	// |        2 |   0 |   0 |     4 |  12 / 8 |          2 |   6 / 8 | yes      |
28	// +----------+-----+-----+-------+---------+------------+---------+----------+
29	// |        2 |   0 |   0 |     2 |   6 / 4 |          2 |   6 / 8 | no       |
30	// +----------+-----+-----+-------+---------+------------+---------+----------+
31	// |        1 |   0 |   1 |     2 |   6 / 2 |          1 |   3 / 2 | yes      |
32	// +----------+-----+-----+-------+---------+------------+---------+----------+
33	//
34	// Some cases are of questionable benefit, like the one marked with "yes?"
35	// above: fewer intrinsics and fewer vaddr and fewer total transfers between SP
36	// and TX, but higher vdata. We start by erring on the side of converting these
37	// to MSAA_LOAD.
38	//
39	// clang-format off
40	//
41	// This pass will combine intrinsics such as (not necessarily consecutive):
42	// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
43	// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
44	// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
45	// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
46	// ==>
47	// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
48	//
49	// clang-format on
50	//
51	// Future improvements:
52	//
53	// - We may occasionally not want to do the combine if it increases the maximum
54	// register pressure.
55	//
56	// - Ensure clausing when multiple MSAA_LOAD are generated.
57	//
58	// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
59	// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
60	// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
61	// we don't know the format at compile time.
62	//===----------------------------------------------------------------------===//
63
64	#include "AMDGPU.h"
65	#include "AMDGPUInstrInfo.h"
66	#include "AMDGPUTargetMachine.h"
67	#include "llvm/IR/Function.h"
68	#include "llvm/IR/IRBuilder.h"
69	#include "llvm/IR/IntrinsicInst.h"
70	#include "llvm/IR/IntrinsicsAMDGPU.h"
71	#include "llvm/Pass.h"
72	#include "llvm/Support/raw_ostream.h"
73
74	using namespace llvm;
75
76	#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"
77
78	namespace {
79	class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
80	const TargetMachine *TM;
81
82	public:
83	static char ID;
84
85	AMDGPUImageIntrinsicOptimizer(const TargetMachine TM = nullptr*)
86	: FunctionPass (ID), TM(TM) {}
87
88	bool runOnFunction(Function &F) override;
89
90	}; // End of class AMDGPUImageIntrinsicOptimizer
91	} // End anonymous namespace
92
93	INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
94	"AMDGPU Image Intrinsic Optimizer", false, false)
95
96	char AMDGPUImageIntrinsicOptimizer::ID = `0`;
97
98	void addInstToMergeableList(
99	IntrinsicInst *II,
100	SmallVector<SmallVector<IntrinsicInst *, `4`>> &MergeableInsts,
101	const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
102	for (SmallVector<IntrinsicInst *, `4`> &IIList : MergeableInsts) {
103	// Check Dim.
104	if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
105	continue;
106
107	// Check D16.
108	if (IIList.front()->getType() != II->getType())
109	continue;
110
111	// Check all arguments (DMask, VAddr, RSrc etc).
112	bool AllEqual = true;
113	assert(IIList.front()->arg_size() == II->arg_size());
114	for (int I = `1`, E = II->arg_size(); AllEqual && I != E; ++I) {
115	Value *ArgList = IIList.front()->getArgOperand(i: I);
116	Value *Arg = II->getArgOperand(i: I);
117	if (I == ImageDimIntr->VAddrEnd - `1`) {
118	// Check FragId group.
119	auto FragIdList = cast<ConstantInt>(Val: IIList.front()->getArgOperand(i: I));
120	auto FragId = cast<ConstantInt>(Val: II->getArgOperand(i: I));
121	AllEqual = FragIdList->getValue().udiv(RHS: `4`) == FragId->getValue().udiv(RHS: `4`);
122	} else {
123	// Check all arguments except FragId.
124	AllEqual = ArgList == Arg;
125	}
126	}
127	if (!AllEqual)
128	continue;
129
130	// Add to the list.
131	IIList.emplace_back(Args&: II);
132	return;
133	}
134
135	// Similar instruction not found, so add a new list.
136	MergeableInsts.emplace_back(Args: `1`, Args&: II);
137	LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
138	}
139
140	// Collect list of all instructions we know how to merge in a subset of the
141	// block. It returns an iterator to the instruction after the last one analyzed.
142	BasicBlock::iterator collectMergeableInsts(
143	BasicBlock::iterator I, BasicBlock::iterator E,
144	SmallVector<SmallVector<IntrinsicInst *, `4`>> &MergeableInsts) {
145	for (; I != E; ++I) {
146	// Don't combine if there is a store in the middle or if there is a memory
147	// barrier.
148	if (I ->mayHaveSideEffects()) {
149	++I;
150	break;
151	}
152
153	// Ignore non-intrinsics.
154	if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val&: I)) {
155	Intrinsic::ID IntrinID = II->getIntrinsicID();
156
157	// Ignore other intrinsics.
158	if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
159	IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
160	continue;
161
162	// Check for constant FragId.
163	const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinID);
164	const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - `1`;
165	if (!isa<ConstantInt>(Val: II->getArgOperand(i: FragIdIndex)))
166	continue;
167
168	LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
169	addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
170	}
171	}
172
173	return I;
174	}
175
176	bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, `4`>> MergeableInsts) {
177	bool Modified = false;
178
179	SmallVector<Instruction *, `4`> InstrsToErase;
180	for (const auto &IIList : MergeableInsts) {
181	if (IIList.size() <= `1`)
182	continue;
183
184	// Assume the arguments are unchanged and later override them, if needed.
185	SmallVector<Value *, `16`> Args(IIList.front()->args());
186
187	// Validate function argument and return types, extracting overloaded
188	// types along the way.
189	SmallVector<Type *, `6`> OverloadTys;
190	Function *F = IIList.front()->getCalledFunction();
191	if (!Intrinsic::getIntrinsicSignature(F, ArgTys&: OverloadTys))
192	continue;
193
194	Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
195	const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
196	AMDGPU::getImageDimIntrinsicInfo(Intr: IntrinID);
197
198	Type *EltTy = IIList.front()->getType()->getScalarType();
199	Type *NewTy = FixedVectorType::get(ElementType: EltTy, NumElts: `4`);
200	OverloadTys [`0`] = NewTy;
201	bool isD16 = EltTy->isHalfTy();
202
203	ConstantInt *DMask = cast<ConstantInt>(
204	Val: IIList.front()->getArgOperand(i: ImageDimIntr->DMaskIndex));
205	unsigned DMaskVal = DMask->getZExtValue() & `0xf`;
206	unsigned NumElts = popcount(Value: DMaskVal);
207
208	// Number of instructions and the number of vaddr/vdata dword transfers
209	// should be reduced.
210	unsigned NumLoads = IIList.size();
211	unsigned NumMsaas = NumElts;
212	unsigned NumVAddrLoads = `3` * NumLoads;
213	unsigned NumVDataLoads = divideCeil(Numerator: NumElts, Denominator: isD16 ? `2` : `1`) * NumLoads;
214	unsigned NumVAddrMsaas = `3` * NumMsaas;
215	unsigned NumVDataMsaas = divideCeil(Numerator: `4`, Denominator: isD16 ? `2` : `1`) * NumMsaas;
216
217	if (NumLoads < NumMsaas \|\|
218	(NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
219	continue;
220
221	const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - `1`;
222	auto FragId = cast<ConstantInt>(Val: IIList.front()->getArgOperand(i: FragIdIndex));
223	const APInt &NewFragIdVal = FragId->getValue().udiv(RHS: `4`) * `4`;
224
225	// Create the new instructions.
226	IRBuilder<> B(IIList.front());
227
228	// Create the new image_msaa_load intrinsic.
229	SmallVector<Instruction *, `4`> NewCalls;
230	while (DMaskVal != `0`) {
231	unsigned NewMaskVal = `1` << countr_zero(Val: DMaskVal);
232
233	Intrinsic::ID NewIntrinID;
234	if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
235	NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
236	else
237	NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;
238
239	Function *NewIntrin = Intrinsic::getDeclaration(
240	M: IIList.front()->getModule(), id: NewIntrinID, Tys: OverloadTys);
241	Args [ImageDimIntr->DMaskIndex] =
242	ConstantInt::get(Ty: DMask->getType(), V: NewMaskVal);
243	Args [FragIdIndex] = ConstantInt::get(Ty: FragId->getType(), V: NewFragIdVal);
244	CallInst *NewCall = B.CreateCall(Callee: NewIntrin, Args);
245	LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");
246
247	NewCalls.push_back(Elt: NewCall);
248	DMaskVal -= NewMaskVal;
249	}
250
251	// Create the new extractelement instructions.
252	for (auto &II : IIList) {
253	Value VecOp = nullptr*;
254	auto Idx = cast<ConstantInt>(Val: II->getArgOperand(i: FragIdIndex));
255	B.SetCurrentDebugLocation(II->getDebugLoc());
256	if (NumElts == `1`) {
257	VecOp = B.CreateExtractElement(Vec: NewCalls [`0`], Idx: Idx->getValue().urem(RHS: `4`));
258	LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
259	} else {
260	VecOp = UndefValue::get(T: II->getType());
261	for (unsigned I = `0`; I < NumElts; ++I) {
262	VecOp = B.CreateInsertElement(
263	Vec: VecOp,
264	NewElt: B.CreateExtractElement(Vec: NewCalls [I], Idx: Idx->getValue().urem(RHS: `4`)), Idx: I);
265	LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
266	}
267	}
268
269	// Replace the old instruction.
270	II->replaceAllUsesWith(V: VecOp);
271	VecOp->takeName(V: II);
272	InstrsToErase.push_back(Elt: II);
273	}
274
275	Modified = true;
276	}
277
278	for (auto I : InstrsToErase)
279	I->eraseFromParent();
280
281	return Modified;
282	}
283
284	static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
285	if (!TM)
286	return false;
287
288	// This optimization only applies to GFX11 and beyond.
289	const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
290	if (!AMDGPU::isGFX11Plus(ST) \|\| ST.hasMSAALoadDstSelBug())
291	return false;
292
293	Module *M = F.getParent();
294
295	// Early test to determine if the intrinsics are used.
296	if (llvm::none_of(Range&: *M, P: [](Function &F) {
297	return !F.users().empty() &&
298	(F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa \|\|
299	F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
300	}))
301	return false;
302
303	bool Modified = false;
304	for (auto &BB : F) {
305	BasicBlock::iterator SectionEnd;
306	for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
307	I = SectionEnd) {
308	SmallVector<SmallVector<IntrinsicInst *, `4`>> MergeableInsts;
309
310	SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
311	Modified \|= optimizeSection(MergeableInsts);
312	}
313	}
314
315	return Modified;
316	}
317
318	bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
319	if (skipFunction(F))
320	return false;
321
322	return imageIntrinsicOptimizerImpl(F, TM);
323	}
324
325	FunctionPass *
326	llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
327	return new AMDGPUImageIntrinsicOptimizer (TM);
328	}
329
330	PreservedAnalyses
331	AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
332	FunctionAnalysisManager &AM) {
333
334	bool Changed = imageIntrinsicOptimizerImpl(F, TM: &TM);
335	return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
336	}
337

source code of llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp