//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Eliminates allocas by either converting them into vectors or by migrating
// them to local address space.
//
// Two passes are exposed by this file:
//    - "promote-alloca-to-vector", which runs early in the pipeline and only
//      promotes to vector. Promotion to vector is almost always profitable
//      except when the alloca is too big and the promotion would result in
//      very high register pressure.
//    - "promote-alloca", which does both promotion to vector and LDS and runs
//      much later in the pipeline. This runs after SROA because promoting to
//      LDS is of course less profitable than getting rid of the alloca or
//      vectorizing it, thus we only want to do it when the only alternative is
//      lowering the alloca to stack.
//
// Note that both of them exist for the old and new PMs. The new PM passes are
// declared in AMDGPU.h and the legacy PM ones are declared here.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

static cl::opt<bool>
    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
                                 cl::desc("Disable promote alloca to vector"),
                                 cl::init(false));

static cl::opt<bool>
    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
                              cl::desc("Disable promote alloca to LDS"),
                              cl::init(false));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(0));

static cl::opt<unsigned>
    LoopUserWeight("promote-alloca-vector-loop-user-weight",
                   cl::desc("The bonus weight of users of allocas within loop "
                            "when sorting profitable allocas"),
                   cl::init(4));
74
75// Shared implementation which can do both promotion to vector and to LDS.
76class AMDGPUPromoteAllocaImpl {
77private:
78 const TargetMachine &TM;
79 LoopInfo &LI;
80 Module *Mod = nullptr;
81 const DataLayout *DL = nullptr;
82
83 // FIXME: This should be per-kernel.
84 uint32_t LocalMemLimit = 0;
85 uint32_t CurrentLocalMemUsage = 0;
86 unsigned MaxVGPRs;
87
88 bool IsAMDGCN = false;
89 bool IsAMDHSA = false;
90
91 std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
92 Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
93
94 /// BaseAlloca is the alloca root the search started from.
95 /// Val may be that alloca or a recursive user of it.
96 bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
97 std::vector<Value *> &WorkList) const;
98
99 /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
100 /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
101 /// Returns true if both operands are derived from the same alloca. Val should
102 /// be the same value as one of the input operands of UseInst.
103 bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
104 Instruction *UseInst, int OpIdx0,
105 int OpIdx1) const;
106
107 /// Check whether we have enough local memory for promotion.
108 bool hasSufficientLocalMem(const Function &F);
109
110 bool tryPromoteAllocaToVector(AllocaInst &I);
111 bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
112
113 void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
114
115public:
116 AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
117
118 const Triple &TT = TM.getTargetTriple();
119 IsAMDGCN = TT.getArch() == Triple::amdgcn;
120 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
121 }
122
123 bool run(Function &F, bool PromoteToLDS);
124};
125
126// FIXME: This can create globals so should be a module pass.
127class AMDGPUPromoteAlloca : public FunctionPass {
128public:
129 static char ID;
130
131 AMDGPUPromoteAlloca() : FunctionPass(ID) {}
132
133 bool runOnFunction(Function &F) override {
134 if (skipFunction(F))
135 return false;
136 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
137 return AMDGPUPromoteAllocaImpl(
138 TPC->getTM<TargetMachine>(),
139 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
140 .run(F, /*PromoteToLDS*/ true);
141 return false;
142 }
143
144 StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
145
146 void getAnalysisUsage(AnalysisUsage &AU) const override {
147 AU.setPreservesCFG();
148 AU.addRequired<LoopInfoWrapperPass>();
149 FunctionPass::getAnalysisUsage(AU);
150 }
151};
152
153class AMDGPUPromoteAllocaToVector : public FunctionPass {
154public:
155 static char ID;
156
157 AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
158
159 bool runOnFunction(Function &F) override {
160 if (skipFunction(F))
161 return false;
162 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
163 return AMDGPUPromoteAllocaImpl(
164 TPC->getTM<TargetMachine>(),
165 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
166 .run(F, /*PromoteToLDS*/ false);
167 return false;
168 }
169
170 StringRef getPassName() const override {
171 return "AMDGPU Promote Alloca to vector";
172 }
173
174 void getAnalysisUsage(AnalysisUsage &AU) const override {
175 AU.setPreservesCFG();
176 AU.addRequired<LoopInfoWrapperPass>();
177 FunctionPass::getAnalysisUsage(AU);
178 }
179};
180
181unsigned getMaxVGPRs(const TargetMachine &TM, const Function &F) {
182 if (!TM.getTargetTriple().isAMDGCN())
183 return 128;
184
185 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
186 unsigned MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
187
  // A non-entry function has only 32 caller-preserved registers.
  // Do not promote an alloca that would force spilling unless we know the
  // function will be inlined.
191 if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
192 !AMDGPU::isEntryFunctionCC(CC: F.getCallingConv()))
193 MaxVGPRs = std::min(a: MaxVGPRs, b: 32u);
194 return MaxVGPRs;
195}
196
197} // end anonymous namespace
198
199char AMDGPUPromoteAlloca::ID = 0;
200char AMDGPUPromoteAllocaToVector::ID = 0;
201
202INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
203 "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promoting allocas, for an
// accurate estimate of the LDS available.
206INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
207INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
208INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
209 "AMDGPU promote alloca to vector or LDS", false, false)
210
211INITIALIZE_PASS_BEGIN(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
212 "AMDGPU promote alloca to vector", false, false)
213INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
214INITIALIZE_PASS_END(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
215 "AMDGPU promote alloca to vector", false, false)
216
217char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
218char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
219
220PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
221 FunctionAnalysisManager &AM) {
222 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
223 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
224 if (Changed) {
225 PreservedAnalyses PA;
226 PA.preserveSet<CFGAnalyses>();
227 return PA;
228 }
229 return PreservedAnalyses::all();
230}
231
232PreservedAnalyses
233AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
234 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
235 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
236 if (Changed) {
237 PreservedAnalyses PA;
238 PA.preserveSet<CFGAnalyses>();
239 return PA;
240 }
241 return PreservedAnalyses::all();
242}
243
244FunctionPass *llvm::createAMDGPUPromoteAlloca() {
245 return new AMDGPUPromoteAlloca();
246}
247
248FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
249 return new AMDGPUPromoteAllocaToVector();
250}
251
static void collectAllocaUses(AllocaInst &Alloca,
                              SmallVectorImpl<Use *> &Uses) {
  SmallVector<Instruction *, 4> WorkList({&Alloca});
  while (!WorkList.empty()) {
    auto *Cur = WorkList.pop_back_val();
    for (auto &U : Cur->uses()) {
      Uses.push_back(&U);

      if (isa<GetElementPtrInst>(U.getUser()))
        WorkList.push_back(cast<Instruction>(U.getUser()));
    }
  }
}
265
266void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
267 SmallVectorImpl<AllocaInst *> &Allocas) {
268 DenseMap<AllocaInst *, unsigned> Scores;
269
270 for (auto *Alloca : Allocas) {
271 LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n");
272 unsigned &Score = Scores[Alloca];
273 // Increment score by one for each user + a bonus for users within loops.
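    // For example, with the default LoopUserWeight of 4, a user at loop depth
    // 2 contributes 1 + 4 * 2 = 9 to the score.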
274 SmallVector<Use *, 8> Uses;
275 collectAllocaUses(Alloca&: *Alloca, Uses);
276 for (auto *U : Uses) {
277 Instruction *Inst = cast<Instruction>(Val: U->getUser());
278 if (isa<GetElementPtrInst>(Val: Inst))
279 continue;
280 unsigned UserScore =
281 1 + (LoopUserWeight * LI.getLoopDepth(BB: Inst->getParent()));
282 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
283 Score += UserScore;
284 }
285 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
286 }
287
288 stable_sort(Range&: Allocas, C: [&](AllocaInst *A, AllocaInst *B) {
289 return Scores.at(Val: A) > Scores.at(Val: B);
290 });
291
292 // clang-format off
293 LLVM_DEBUG(
294 dbgs() << "Sorted Worklist:\n";
295 for (auto *A: Allocas)
296 dbgs() << " " << *A << "\n";
297 );
298 // clang-format on
299}
300
301bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
302 Mod = F.getParent();
303 DL = &Mod->getDataLayout();
304
305 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
306 if (!ST.isPromoteAllocaEnabled())
307 return false;
308
309 MaxVGPRs = getMaxVGPRs(TM, F);
310
311 bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
312
313 // Use up to 1/4 of available register budget for vectorization.
314 // FIXME: Increase the limit for whole function budgets? Perhaps x2?
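  // For example (illustrative numbers only): with MaxVGPRs = 256 the budget is
  // (256 * 32) / 4 = 2048 bits, so a <16 x i32> alloca (512 bits) would use a
  // quarter of it.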
315 unsigned VectorizationBudget =
316 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
317 : (MaxVGPRs * 32)) /
318 4;
319
320 SmallVector<AllocaInst *, 16> Allocas;
321 for (Instruction &I : F.getEntryBlock()) {
322 if (AllocaInst *AI = dyn_cast<AllocaInst>(Val: &I)) {
323 // Array allocations are probably not worth handling, since an allocation
324 // of the array type is the canonical form.
325 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
326 continue;
327 Allocas.push_back(Elt: AI);
328 }
329 }
330
331 sortAllocasToPromote(Allocas);
332
333 bool Changed = false;
334 for (AllocaInst *AI : Allocas) {
335 const unsigned AllocaCost = DL->getTypeSizeInBits(Ty: AI->getAllocatedType());
336 if (AllocaCost > VectorizationBudget) {
337 LLVM_DEBUG(dbgs() << " Alloca too big for vectorization: " << *AI
338 << "\n");
339 return Changed;
340 }
341
342 if (tryPromoteAllocaToVector(I&: *AI)) {
343 Changed = true;
344 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
345 "Underflow!");
346 VectorizationBudget -= AllocaCost;
347 LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
348 << VectorizationBudget << "\n");
349 if (VectorizationBudget == 0)
350 break;
351 } else if (PromoteToLDS && tryPromoteAllocaToLDS(I&: *AI, SufficientLDS))
352 Changed = true;
353 }
354
355 // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
356 // dangling pointers. If we want to reuse it past this point, the loop above
357 // would need to be updated to remove successfully promoted allocas.
358
359 return Changed;
360}
361
362struct MemTransferInfo {
363 ConstantInt *SrcIndex = nullptr;
364 ConstantInt *DestIndex = nullptr;
365};
366
367// Checks if the instruction I is a memset user of the alloca AI that we can
368// deal with. Currently, only non-volatile memsets that affect the whole alloca
369// are handled.
370static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
371 const DataLayout &DL) {
372 using namespace PatternMatch;
373 // For now we only care about non-volatile memsets that affect the whole type
374 // (start at index 0 and fill the whole alloca).
375 //
376 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
377 // (except maybe volatile ones?) - we just need to use shufflevector if it
378 // only affects a subset of the vector.
379 const unsigned Size = DL.getTypeStoreSize(Ty: AI->getAllocatedType());
380 return I->getOperand(i_nocapture: 0) == AI &&
381 match(V: I->getOperand(i_nocapture: 2), P: m_SpecificInt(V: Size)) && !I->isVolatile();
382}
383
384static Value *
385calculateVectorIndex(Value *Ptr,
386 const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
387 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr->stripPointerCasts());
388 if (!GEP)
389 return ConstantInt::getNullValue(Ty: Type::getInt32Ty(C&: Ptr->getContext()));
390
391 auto I = GEPIdx.find(x: GEP);
392 assert(I != GEPIdx.end() && "Must have entry for GEP!");
393 return I->second;
394}
395
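/// Tries to map \p GEP (a GEP on \p Alloca) to a constant or single-variable
/// index into the vector form with element type \p VecElemTy. For example
/// (sketch only), with i32 elements a constant byte offset of 8 maps to index
/// 2, and a single variable offset scaled by 4 is returned as the index
/// directly. Returns nullptr if no cheap index can be computed.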
396static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
397 Type *VecElemTy, const DataLayout &DL) {
398 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
399 // helper.
400 unsigned BW = DL.getIndexTypeSizeInBits(Ty: GEP->getType());
401 MapVector<Value *, APInt> VarOffsets;
402 APInt ConstOffset(BW, 0);
403 if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
404 !GEP->collectOffset(DL, BitWidth: BW, VariableOffsets&: VarOffsets, ConstantOffset&: ConstOffset))
405 return nullptr;
406
407 unsigned VecElemSize = DL.getTypeAllocSize(Ty: VecElemTy);
408 if (VarOffsets.size() > 1)
409 return nullptr;
410
411 if (VarOffsets.size() == 1) {
412 // Only handle cases where we don't need to insert extra arithmetic
413 // instructions.
414 const auto &VarOffset = VarOffsets.front();
415 if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
416 return nullptr;
417 return VarOffset.first;
418 }
419
420 APInt Quot;
421 uint64_t Rem;
422 APInt::udivrem(LHS: ConstOffset, RHS: VecElemSize, Quotient&: Quot, Remainder&: Rem);
423 if (Rem != 0)
424 return nullptr;
425
426 return ConstantInt::get(Context&: GEP->getContext(), V: Quot);
427}
428
429/// Promotes a single user of the alloca to a vector form.
430///
431/// \param Inst Instruction to be promoted.
432/// \param DL Module Data Layout.
433/// \param VectorTy Vectorized Type.
434/// \param VecStoreSize Size of \p VectorTy in bytes.
435/// \param ElementSize Size of \p VectorTy element type in bytes.
436/// \param TransferInfo MemTransferInst info map.
437/// \param GEPVectorIdx GEP -> VectorIdx cache.
438/// \param CurVal Current value of the vector (e.g. last stored value)
439/// \param[out] DeferredLoads \p Inst is added to this vector if it can't
440/// be promoted now. This happens when promoting requires \p
441/// CurVal, but \p CurVal is nullptr.
442/// \return the stored value if \p Inst would have written to the alloca, or
443/// nullptr otherwise.
444static Value *promoteAllocaUserToVector(
445 Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
446 unsigned VecStoreSize, unsigned ElementSize,
447 DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
448 std::map<GetElementPtrInst *, Value *> &GEPVectorIdx, Value *CurVal,
449 SmallVectorImpl<LoadInst *> &DeferredLoads) {
450 // Note: we use InstSimplifyFolder because it can leverage the DataLayout
451 // to do more folding, especially in the case of vector splats.
452 IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
453 InstSimplifyFolder(DL));
454 Builder.SetInsertPoint(Inst);
455
456 const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
457 if (CurVal)
458 return CurVal;
459
460 // If the current value is not known, insert a dummy load and lower it on
461 // the second pass.
462 LoadInst *Dummy =
463 Builder.CreateLoad(Ty: VectorTy, Ptr: PoisonValue::get(T: Builder.getPtrTy()),
464 Name: "promotealloca.dummyload");
465 DeferredLoads.push_back(Elt: Dummy);
466 return Dummy;
467 };
468
469 const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
470 Type *PtrTy) -> Value * {
471 assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
472 const unsigned Size = DL.getTypeStoreSizeInBits(Ty: PtrTy);
473 if (!PtrTy->isVectorTy())
474 return Builder.CreateBitOrPointerCast(V: Val, DestTy: Builder.getIntNTy(N: Size));
475 const unsigned NumPtrElts = cast<FixedVectorType>(Val: PtrTy)->getNumElements();
    // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to first
    // cast the ptr vector to <2 x i64>.
    assert((Size % NumPtrElts == 0) && "Vector size not divisible");
479 Type *EltTy = Builder.getIntNTy(N: Size / NumPtrElts);
480 return Builder.CreateBitOrPointerCast(
481 V: Val, DestTy: FixedVectorType::get(ElementType: EltTy, NumElts: NumPtrElts));
482 };
483
484 Type *VecEltTy = VectorTy->getElementType();
485
486 switch (Inst->getOpcode()) {
487 case Instruction::Load: {
488 // Loads can only be lowered if the value is known.
489 if (!CurVal) {
490 DeferredLoads.push_back(Elt: cast<LoadInst>(Val: Inst));
491 return nullptr;
492 }
493
494 Value *Index = calculateVectorIndex(
495 Ptr: cast<LoadInst>(Val: Inst)->getPointerOperand(), GEPIdx: GEPVectorIdx);
496
497 // We're loading the full vector.
498 Type *AccessTy = Inst->getType();
499 TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
500 if (Constant *CI = dyn_cast<Constant>(Val: Index)) {
501 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
502 if (AccessTy->isPtrOrPtrVectorTy())
503 CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
504 else if (CurVal->getType()->isPtrOrPtrVectorTy())
505 CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
506 Value *NewVal = Builder.CreateBitOrPointerCast(V: CurVal, DestTy: AccessTy);
507 Inst->replaceAllUsesWith(V: NewVal);
508 return nullptr;
509 }
510 }
511
512 // Loading a subvector.
513 if (isa<FixedVectorType>(Val: AccessTy)) {
514 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
515 const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
516 auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumLoadedElts);
517 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
518
519 Value *SubVec = PoisonValue::get(T: SubVecTy);
520 for (unsigned K = 0; K < NumLoadedElts; ++K) {
521 Value *CurIdx =
522 Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
523 SubVec = Builder.CreateInsertElement(
524 Vec: SubVec, NewElt: Builder.CreateExtractElement(Vec: CurVal, Idx: CurIdx), Idx: K);
525 }
526
527 if (AccessTy->isPtrOrPtrVectorTy())
528 SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
529 else if (SubVecTy->isPtrOrPtrVectorTy())
530 SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
531
532 SubVec = Builder.CreateBitOrPointerCast(V: SubVec, DestTy: AccessTy);
533 Inst->replaceAllUsesWith(V: SubVec);
534 return nullptr;
535 }
536
537 // We're loading one element.
538 Value *ExtractElement = Builder.CreateExtractElement(Vec: CurVal, Idx: Index);
539 if (AccessTy != VecEltTy)
540 ExtractElement = Builder.CreateBitOrPointerCast(V: ExtractElement, DestTy: AccessTy);
541
542 Inst->replaceAllUsesWith(V: ExtractElement);
543 return nullptr;
544 }
545 case Instruction::Store: {
546 // For stores, it's a bit trickier and it depends on whether we're storing
547 // the full vector or not. If we're storing the full vector, we don't need
548 // to know the current value. If this is a store of a single element, we
549 // need to know the value.
550 StoreInst *SI = cast<StoreInst>(Val: Inst);
551 Value *Index = calculateVectorIndex(Ptr: SI->getPointerOperand(), GEPIdx: GEPVectorIdx);
552 Value *Val = SI->getValueOperand();
553
    // If we're storing the full vector, we can handle this without knowing
    // CurVal.
555 Type *AccessTy = Val->getType();
556 TypeSize AccessSize = DL.getTypeStoreSize(Ty: AccessTy);
557 if (Constant *CI = dyn_cast<Constant>(Val: Index)) {
558 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
559 if (AccessTy->isPtrOrPtrVectorTy())
560 Val = CreateTempPtrIntCast(Val, AccessTy);
561 else if (VectorTy->isPtrOrPtrVectorTy())
562 Val = CreateTempPtrIntCast(Val, VectorTy);
563 return Builder.CreateBitOrPointerCast(V: Val, DestTy: VectorTy);
564 }
565 }
566
567 // Storing a subvector.
568 if (isa<FixedVectorType>(Val: AccessTy)) {
569 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
570 const unsigned NumWrittenElts =
571 AccessSize / DL.getTypeStoreSize(Ty: VecEltTy);
572 const unsigned NumVecElts = VectorTy->getNumElements();
573 auto *SubVecTy = FixedVectorType::get(ElementType: VecEltTy, NumElts: NumWrittenElts);
574 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
575
576 if (SubVecTy->isPtrOrPtrVectorTy())
577 Val = CreateTempPtrIntCast(Val, SubVecTy);
578 else if (AccessTy->isPtrOrPtrVectorTy())
579 Val = CreateTempPtrIntCast(Val, AccessTy);
580
581 Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: SubVecTy);
582
583 Value *CurVec = GetOrLoadCurrentVectorValue();
584 for (unsigned K = 0, NumElts = std::min(a: NumWrittenElts, b: NumVecElts);
585 K < NumElts; ++K) {
586 Value *CurIdx =
587 Builder.CreateAdd(LHS: Index, RHS: ConstantInt::get(Ty: Index->getType(), V: K));
588 CurVec = Builder.CreateInsertElement(
589 Vec: CurVec, NewElt: Builder.CreateExtractElement(Vec: Val, Idx: K), Idx: CurIdx);
590 }
591 return CurVec;
592 }
593
594 if (Val->getType() != VecEltTy)
595 Val = Builder.CreateBitOrPointerCast(V: Val, DestTy: VecEltTy);
596 return Builder.CreateInsertElement(Vec: GetOrLoadCurrentVectorValue(), NewElt: Val,
597 Idx: Index);
598 }
599 case Instruction::Call: {
600 if (auto *MTI = dyn_cast<MemTransferInst>(Val: Inst)) {
      // For memcpy, we need to know the current value (CurVal).
602 ConstantInt *Length = cast<ConstantInt>(Val: MTI->getLength());
603 unsigned NumCopied = Length->getZExtValue() / ElementSize;
604 MemTransferInfo *TI = &TransferInfo[MTI];
605 unsigned SrcBegin = TI->SrcIndex->getZExtValue();
606 unsigned DestBegin = TI->DestIndex->getZExtValue();
607
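      // Model the copy as a shuffle of the current vector value. For example
      // (illustrative), copying 2 elements from source index 1 to dest index 3
      // in an 8-element vector uses the mask <0, 1, 2, 1, 2, 5, 6, 7>.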
608 SmallVector<int> Mask;
609 for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
610 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
611 Mask.push_back(Elt: SrcBegin++);
612 } else {
613 Mask.push_back(Elt: Idx);
614 }
615 }
616
617 return Builder.CreateShuffleVector(V: GetOrLoadCurrentVectorValue(), Mask);
618 }
619
620 if (auto *MSI = dyn_cast<MemSetInst>(Val: Inst)) {
621 // For memset, we don't need to know the previous value because we
622 // currently only allow memsets that cover the whole alloca.
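      // For example (illustrative), a memset of the byte 0xAA over an alloca
      // promoted to <4 x i32> becomes a splat of the i32 value 0xAAAAAAAA.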
623 Value *Elt = MSI->getOperand(i_nocapture: 1);
624 const unsigned BytesPerElt = DL.getTypeStoreSize(Ty: VecEltTy);
625 if (BytesPerElt > 1) {
626 Value *EltBytes = Builder.CreateVectorSplat(NumElts: BytesPerElt, V: Elt);
627
628 // If the element type of the vector is a pointer, we need to first cast
629 // to an integer, then use a PtrCast.
630 if (VecEltTy->isPointerTy()) {
631 Type *PtrInt = Builder.getIntNTy(N: BytesPerElt * 8);
632 Elt = Builder.CreateBitCast(V: EltBytes, DestTy: PtrInt);
633 Elt = Builder.CreateIntToPtr(V: Elt, DestTy: VecEltTy);
634 } else
635 Elt = Builder.CreateBitCast(V: EltBytes, DestTy: VecEltTy);
636 }
637
638 return Builder.CreateVectorSplat(EC: VectorTy->getElementCount(), V: Elt);
639 }
640
641 if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
642 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
643 Intr->replaceAllUsesWith(
644 V: Builder.getIntN(N: Intr->getType()->getIntegerBitWidth(),
645 C: DL.getTypeAllocSize(Ty: VectorTy)));
646 return nullptr;
647 }
648 }
649
650 llvm_unreachable("Unsupported call when promoting alloca to vector");
651 }
652
653 default:
654 llvm_unreachable("Inconsistency in instructions promotable to vector");
655 }
656
657 llvm_unreachable("Did not return after promoting instruction!");
658}
659
660static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
661 const DataLayout &DL) {
662 // Access as a vector type can work if the size of the access vector is a
663 // multiple of the size of the alloca's vector element type.
664 //
665 // Examples:
666 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
667 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
668 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
669 // - 3*32 is not a multiple of 64
670 //
671 // We could handle more complicated cases, but it'd make things a lot more
672 // complicated.
673 if (isa<FixedVectorType>(Val: AccessTy)) {
674 TypeSize AccTS = DL.getTypeStoreSize(Ty: AccessTy);
675 TypeSize VecTS = DL.getTypeStoreSize(Ty: VecTy->getElementType());
676 return AccTS.isKnownMultipleOf(RHS: VecTS);
677 }
678
679 return CastInst::isBitOrNoopPointerCastable(SrcTy: VecTy->getElementType(), DestTy: AccessTy,
680 DL);
681}
682
683/// Iterates over an instruction worklist that may contain multiple instructions
684/// from the same basic block, but in a different order.
685template <typename InstContainer>
686static void forEachWorkListItem(const InstContainer &WorkList,
687 std::function<void(Instruction *)> Fn) {
688 // Bucket up uses of the alloca by the block they occur in.
689 // This is important because we have to handle multiple defs/uses in a block
690 // ourselves: SSAUpdater is purely for cross-block references.
691 DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
692 for (Instruction *User : WorkList)
693 UsesByBlock[User->getParent()].insert(V: User);
694
695 for (Instruction *User : WorkList) {
696 BasicBlock *BB = User->getParent();
697 auto &BlockUses = UsesByBlock[BB];
698
699 // Already processed, skip.
700 if (BlockUses.empty())
701 continue;
702
703 // Only user in the block, directly process it.
704 if (BlockUses.size() == 1) {
705 Fn(User);
706 continue;
707 }
708
709 // Multiple users in the block, do a linear scan to see users in order.
710 for (Instruction &Inst : *BB) {
711 if (!BlockUses.contains(V: &Inst))
712 continue;
713
714 Fn(&Inst);
715 }
716
717 // Clear the block so we know it's been processed.
718 BlockUses.clear();
719 }
720}
721
722// FIXME: Should try to pick the most likely to be profitable allocas first.
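/// Tries to rewrite all uses of \p Alloca in terms of SSA vector values. As a
/// rough illustration (simplified IR, not taken from an actual test), a
/// pattern such as
///
///   %stack = alloca [4 x i32], align 4, addrspace(5)
///   %gep   = getelementptr [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %i
///   store i32 %v, ptr addrspace(5) %gep
///   %r     = load i32, ptr addrspace(5) %gep
///
/// becomes, roughly:
///
///   %vec.next = insertelement <4 x i32> undef, i32 %v, i32 %i
///   %r        = extractelement <4 x i32> %vec.next, i32 %i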
723bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
724 LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
725
726 if (DisablePromoteAllocaToVector) {
727 LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n");
728 return false;
729 }
730
731 Type *AllocaTy = Alloca.getAllocatedType();
732 auto *VectorTy = dyn_cast<FixedVectorType>(Val: AllocaTy);
733 if (auto *ArrayTy = dyn_cast<ArrayType>(Val: AllocaTy)) {
734 if (VectorType::isValidElementType(ElemTy: ArrayTy->getElementType()) &&
735 ArrayTy->getNumElements() > 0)
736 VectorTy = FixedVectorType::get(ElementType: ArrayTy->getElementType(),
737 NumElts: ArrayTy->getNumElements());
738 }
739
  // FIXME: There is no reason why we can't support larger arrays; we are just
  // being conservative for now.
  // FIXME: We also reject allocas of the form [2 x [2 x i32]] or equivalent.
  // Potentially these could also be promoted, but we don't currently handle
  // this case.
745 if (!VectorTy) {
746 LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
747 return false;
748 }
749
750 if (VectorTy->getNumElements() > 16 || VectorTy->getNumElements() < 2) {
751 LLVM_DEBUG(dbgs() << " " << *VectorTy
752 << " has an unsupported number of elements\n");
753 return false;
754 }
755
756 std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
757 SmallVector<Instruction *> WorkList;
758 SmallVector<Instruction *> UsersToRemove;
759 SmallVector<Instruction *> DeferredInsts;
760 DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
761
762 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
763 LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
764 << " " << *Inst << "\n");
765 return false;
766 };
767
768 SmallVector<Use *, 8> Uses;
769 collectAllocaUses(Alloca, Uses);
770
771 LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
772
773 Type *VecEltTy = VectorTy->getElementType();
774 unsigned ElementSize = DL->getTypeSizeInBits(Ty: VecEltTy) / 8;
775 for (auto *U : Uses) {
776 Instruction *Inst = cast<Instruction>(Val: U->getUser());
777
778 if (Value *Ptr = getLoadStorePointerOperand(V: Inst)) {
779 // This is a store of the pointer, not to the pointer.
780 if (isa<StoreInst>(Val: Inst) &&
781 U->getOperandNo() != StoreInst::getPointerOperandIndex())
782 return RejectUser(Inst, "pointer is being stored");
783
784 Type *AccessTy = getLoadStoreType(I: Inst);
785 if (AccessTy->isAggregateType())
786 return RejectUser(Inst, "unsupported load/store as aggregate");
787 assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
788
789 // Check that this is a simple access of a vector element.
790 bool IsSimple = isa<LoadInst>(Val: Inst) ? cast<LoadInst>(Val: Inst)->isSimple()
791 : cast<StoreInst>(Val: Inst)->isSimple();
792 if (!IsSimple)
793 return RejectUser(Inst, "not a simple load or store");
794
795 Ptr = Ptr->stripPointerCasts();
796
797 // Alloca already accessed as vector.
798 if (Ptr == &Alloca && DL->getTypeStoreSize(Ty: Alloca.getAllocatedType()) ==
799 DL->getTypeStoreSize(Ty: AccessTy)) {
800 WorkList.push_back(Elt: Inst);
801 continue;
802 }
803
804 if (!isSupportedAccessType(VecTy: VectorTy, AccessTy, DL: *DL))
805 return RejectUser(Inst, "not a supported access type");
806
807 WorkList.push_back(Elt: Inst);
808 continue;
809 }
810
811 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Inst)) {
812 // If we can't compute a vector index from this GEP, then we can't
813 // promote this alloca to vector.
814 Value *Index = GEPToVectorIndex(GEP, Alloca: &Alloca, VecElemTy: VecEltTy, DL: *DL);
815 if (!Index)
816 return RejectUser(Inst, "cannot compute vector index for GEP");
817
818 GEPVectorIdx[GEP] = Index;
819 UsersToRemove.push_back(Elt: Inst);
820 continue;
821 }
822
823 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Val: Inst);
824 MSI && isSupportedMemset(I: MSI, AI: &Alloca, DL: *DL)) {
825 WorkList.push_back(Elt: Inst);
826 continue;
827 }
828
829 if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Val: Inst)) {
830 if (TransferInst->isVolatile())
831 return RejectUser(Inst, "mem transfer inst is volatile");
832
833 ConstantInt *Len = dyn_cast<ConstantInt>(Val: TransferInst->getLength());
834 if (!Len || (Len->getZExtValue() % ElementSize))
835 return RejectUser(Inst, "mem transfer inst length is non-constant or "
836 "not a multiple of the vector element size");
837
838 if (!TransferInfo.count(Val: TransferInst)) {
839 DeferredInsts.push_back(Elt: Inst);
840 WorkList.push_back(Elt: Inst);
841 TransferInfo[TransferInst] = MemTransferInfo();
842 }
843
844 auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
845 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: Ptr);
846 if (Ptr != &Alloca && !GEPVectorIdx.count(x: GEP))
847 return nullptr;
848
849 return dyn_cast<ConstantInt>(Val: calculateVectorIndex(Ptr, GEPIdx: GEPVectorIdx));
850 };
851
852 unsigned OpNum = U->getOperandNo();
853 MemTransferInfo *TI = &TransferInfo[TransferInst];
854 if (OpNum == 0) {
855 Value *Dest = TransferInst->getDest();
856 ConstantInt *Index = getPointerIndexOfAlloca(Dest);
857 if (!Index)
858 return RejectUser(Inst, "could not calculate constant dest index");
859 TI->DestIndex = Index;
860 } else {
861 assert(OpNum == 1);
862 Value *Src = TransferInst->getSource();
863 ConstantInt *Index = getPointerIndexOfAlloca(Src);
864 if (!Index)
865 return RejectUser(Inst, "could not calculate constant src index");
866 TI->SrcIndex = Index;
867 }
868 continue;
869 }
870
871 if (auto *Intr = dyn_cast<IntrinsicInst>(Val: Inst)) {
872 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
873 WorkList.push_back(Elt: Inst);
874 continue;
875 }
876 }
877
878 // Ignore assume-like intrinsics and comparisons used in assumes.
879 if (isAssumeLikeIntrinsic(I: Inst)) {
880 if (!Inst->use_empty())
881 return RejectUser(Inst, "assume-like intrinsic cannot have any users");
882 UsersToRemove.push_back(Elt: Inst);
883 continue;
884 }
885
886 if (isa<ICmpInst>(Val: Inst) && all_of(Range: Inst->users(), P: [](User *U) {
887 return isAssumeLikeIntrinsic(I: cast<Instruction>(Val: U));
888 })) {
889 UsersToRemove.push_back(Elt: Inst);
890 continue;
891 }
892
893 return RejectUser(Inst, "unhandled alloca user");
894 }
895
896 while (!DeferredInsts.empty()) {
897 Instruction *Inst = DeferredInsts.pop_back_val();
898 MemTransferInst *TransferInst = cast<MemTransferInst>(Val: Inst);
    // TODO: Support the case where the pointers are from different allocas or
    // from different address spaces.
901 MemTransferInfo &Info = TransferInfo[TransferInst];
902 if (!Info.SrcIndex || !Info.DestIndex)
903 return RejectUser(
904 Inst, "mem transfer inst is missing constant src and/or dst index");
905 }
906
907 LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
908 << *VectorTy << '\n');
909 const unsigned VecStoreSize = DL->getTypeStoreSize(Ty: VectorTy);
910
911 // Alloca is uninitialized memory. Imitate that by making the first value
912 // undef.
913 SSAUpdater Updater;
914 Updater.Initialize(Ty: VectorTy, Name: "promotealloca");
915 Updater.AddAvailableValue(BB: Alloca.getParent(), V: UndefValue::get(T: VectorTy));
916
917 // First handle the initial worklist.
918 SmallVector<LoadInst *, 4> DeferredLoads;
919 forEachWorkListItem(WorkList, Fn: [&](Instruction *I) {
920 BasicBlock *BB = I->getParent();
921 // On the first pass, we only take values that are trivially known, i.e.
922 // where AddAvailableValue was already called in this block.
923 Value *Result = promoteAllocaUserToVector(
924 Inst: I, DL: *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
925 CurVal: Updater.FindValueForBlock(BB), DeferredLoads);
926 if (Result)
927 Updater.AddAvailableValue(BB, V: Result);
928 });
929
930 // Then handle deferred loads.
931 forEachWorkListItem(WorkList: DeferredLoads, Fn: [&](Instruction *I) {
932 SmallVector<LoadInst *, 0> NewDLs;
933 BasicBlock *BB = I->getParent();
934 // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
935 // get a value, inserting PHIs as needed.
936 Value *Result = promoteAllocaUserToVector(
937 Inst: I, DL: *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
938 CurVal: Updater.GetValueInMiddleOfBlock(BB: I->getParent()), DeferredLoads&: NewDLs);
939 if (Result)
940 Updater.AddAvailableValue(BB, V: Result);
941 assert(NewDLs.empty() && "No more deferred loads should be queued!");
942 });
943
944 // Delete all instructions. On the first pass, new dummy loads may have been
945 // added so we need to collect them too.
946 DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
947 InstsToDelete.insert(I: DeferredLoads.begin(), E: DeferredLoads.end());
948 for (Instruction *I : InstsToDelete) {
949 assert(I->use_empty());
950 I->eraseFromParent();
951 }
952
  // Delete all the users that are known to be removable.
954 for (Instruction *I : reverse(C&: UsersToRemove)) {
955 I->dropDroppableUses();
956 assert(I->use_empty());
957 I->eraseFromParent();
958 }
959
960 // Alloca should now be dead too.
961 assert(Alloca.use_empty());
962 Alloca.eraseFromParent();
963 return true;
964}
965
966std::pair<Value *, Value *>
967AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
968 Function &F = *Builder.GetInsertBlock()->getParent();
969 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
970
971 if (!IsAMDHSA) {
972 Function *LocalSizeYFn =
973 Intrinsic::getDeclaration(M: Mod, Intrinsic::id: r600_read_local_size_y);
974 Function *LocalSizeZFn =
975 Intrinsic::getDeclaration(M: Mod, Intrinsic::id: r600_read_local_size_z);
976
977 CallInst *LocalSizeY = Builder.CreateCall(Callee: LocalSizeYFn, Args: {});
978 CallInst *LocalSizeZ = Builder.CreateCall(Callee: LocalSizeZFn, Args: {});
979
980 ST.makeLIDRangeMetadata(I: LocalSizeY);
981 ST.makeLIDRangeMetadata(I: LocalSizeZ);
982
983 return std::pair(LocalSizeY, LocalSizeZ);
984 }
985
986 // We must read the size out of the dispatch pointer.
987 assert(IsAMDGCN);
988
989 // We are indexing into this struct, and want to extract the workgroup_size_*
990 // fields.
991 //
992 // typedef struct hsa_kernel_dispatch_packet_s {
993 // uint16_t header;
994 // uint16_t setup;
995 // uint16_t workgroup_size_x ;
996 // uint16_t workgroup_size_y;
997 // uint16_t workgroup_size_z;
998 // uint16_t reserved0;
999 // uint32_t grid_size_x ;
1000 // uint32_t grid_size_y ;
1001 // uint32_t grid_size_z;
1002 //
1003 // uint32_t private_segment_size;
1004 // uint32_t group_segment_size;
1005 // uint64_t kernel_object;
1006 //
1007 // #ifdef HSA_LARGE_MODEL
1008 // void *kernarg_address;
1009 // #elif defined HSA_LITTLE_ENDIAN
1010 // void *kernarg_address;
1011 // uint32_t reserved1;
1012 // #else
1013 // uint32_t reserved1;
1014 // void *kernarg_address;
1015 // #endif
1016 // uint64_t reserved2;
1017 // hsa_signal_t completion_signal; // uint64_t wrapper
1018 // } hsa_kernel_dispatch_packet_t
1019 //
1020 Function *DispatchPtrFn =
1021 Intrinsic::getDeclaration(M: Mod, Intrinsic::id: amdgcn_dispatch_ptr);
1022
1023 CallInst *DispatchPtr = Builder.CreateCall(Callee: DispatchPtrFn, Args: {});
1024 DispatchPtr->addRetAttr(Attribute::NoAlias);
1025 DispatchPtr->addRetAttr(Attribute::NonNull);
1026 F.removeFnAttr(Kind: "amdgpu-no-dispatch-ptr");
1027
1028 // Size of the dispatch packet struct.
1029 DispatchPtr->addDereferenceableRetAttr(Bytes: 64);
1030
1031 Type *I32Ty = Type::getInt32Ty(C&: Mod->getContext());
1032 Value *CastDispatchPtr = Builder.CreateBitCast(
1033 V: DispatchPtr, DestTy: PointerType::get(ElementType: I32Ty, AddressSpace: AMDGPUAS::CONSTANT_ADDRESS));
1034
1035 // We could do a single 64-bit load here, but it's likely that the basic
1036 // 32-bit and extract sequence is already present, and it is probably easier
1037 // to CSE this. The loads should be mergeable later anyway.
1038 Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: CastDispatchPtr, Idx0: 1);
1039 LoadInst *LoadXY = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPXY, Align: Align(4));
1040
1041 Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(Ty: I32Ty, Ptr: CastDispatchPtr, Idx0: 2);
1042 LoadInst *LoadZU = Builder.CreateAlignedLoad(Ty: I32Ty, Ptr: GEPZU, Align: Align(4));
1043
1044 MDNode *MD = MDNode::get(Context&: Mod->getContext(), MDs: std::nullopt);
1045 LoadXY->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
1046 LoadZU->setMetadata(KindID: LLVMContext::MD_invariant_load, Node: MD);
1047 ST.makeLIDRangeMetadata(I: LoadZU);
1048
1049 // Extract y component. Upper half of LoadZU should be zero already.
1050 Value *Y = Builder.CreateLShr(LHS: LoadXY, RHS: 16);
1051
1052 return std::pair(Y, LoadZU);
1053}
1054
1055Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1056 unsigned N) {
1057 Function *F = Builder.GetInsertBlock()->getParent();
1058 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: *F);
1059 Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1060 StringRef AttrName;
1061
1062 switch (N) {
1063 case 0:
1064 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1065 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1066 AttrName = "amdgpu-no-workitem-id-x";
1067 break;
1068 case 1:
1069 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1070 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1071 AttrName = "amdgpu-no-workitem-id-y";
1072 break;
1073
1074 case 2:
1075 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1076 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1077 AttrName = "amdgpu-no-workitem-id-z";
1078 break;
1079 default:
1080 llvm_unreachable("invalid dimension");
1081 }
1082
1083 Function *WorkitemIdFn = Intrinsic::getDeclaration(M: Mod, id: IntrID);
1084 CallInst *CI = Builder.CreateCall(Callee: WorkitemIdFn);
1085 ST.makeLIDRangeMetadata(I: CI);
1086 F->removeFnAttr(Kind: AttrName);
1087
1088 return CI;
1089}
1090
1091static bool isCallPromotable(CallInst *CI) {
1092 IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: CI);
1093 if (!II)
1094 return false;
1095
1096 switch (II->getIntrinsicID()) {
1097 case Intrinsic::memcpy:
1098 case Intrinsic::memmove:
1099 case Intrinsic::memset:
1100 case Intrinsic::lifetime_start:
1101 case Intrinsic::lifetime_end:
1102 case Intrinsic::invariant_start:
1103 case Intrinsic::invariant_end:
1104 case Intrinsic::launder_invariant_group:
1105 case Intrinsic::strip_invariant_group:
1106 case Intrinsic::objectsize:
1107 return true;
1108 default:
1109 return false;
1110 }
1111}
1112
1113bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1114 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1115 int OpIdx1) const {
1116 // Figure out which operand is the one we might not be promoting.
1117 Value *OtherOp = Inst->getOperand(i: OpIdx0);
1118 if (Val == OtherOp)
1119 OtherOp = Inst->getOperand(i: OpIdx1);
1120
1121 if (isa<ConstantPointerNull>(Val: OtherOp))
1122 return true;
1123
1124 Value *OtherObj = getUnderlyingObject(V: OtherOp);
1125 if (!isa<AllocaInst>(Val: OtherObj))
1126 return false;
1127
1128 // TODO: We should be able to replace undefs with the right pointer type.
1129
1130 // TODO: If we know the other base object is another promotable
1131 // alloca, not necessarily this alloca, we can do this. The
1132 // important part is both must have the same address space at
1133 // the end.
1134 if (OtherObj != BaseAlloca) {
1135 LLVM_DEBUG(
1136 dbgs() << "Found a binary instruction with another alloca object\n");
1137 return false;
1138 }
1139
1140 return true;
1141}
1142
1143bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
1144 Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
1145
1146 for (User *User : Val->users()) {
1147 if (is_contained(Range&: WorkList, Element: User))
1148 continue;
1149
1150 if (CallInst *CI = dyn_cast<CallInst>(Val: User)) {
1151 if (!isCallPromotable(CI))
1152 return false;
1153
1154 WorkList.push_back(x: User);
1155 continue;
1156 }
1157
1158 Instruction *UseInst = cast<Instruction>(Val: User);
1159 if (UseInst->getOpcode() == Instruction::PtrToInt)
1160 return false;
1161
1162 if (LoadInst *LI = dyn_cast<LoadInst>(Val: UseInst)) {
1163 if (LI->isVolatile())
1164 return false;
1165
1166 continue;
1167 }
1168
1169 if (StoreInst *SI = dyn_cast<StoreInst>(Val: UseInst)) {
1170 if (SI->isVolatile())
1171 return false;
1172
1173 // Reject if the stored value is not the pointer operand.
1174 if (SI->getPointerOperand() != Val)
1175 return false;
1176 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Val: UseInst)) {
1177 if (RMW->isVolatile())
1178 return false;
1179 } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(Val: UseInst)) {
1180 if (CAS->isVolatile())
1181 return false;
1182 }
1183
    // Only promote a comparison if we know that the other operand is derived
    // from another pointer that will also be promoted.
1186 if (ICmpInst *ICmp = dyn_cast<ICmpInst>(Val: UseInst)) {
1187 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: ICmp, OpIdx0: 0, OpIdx1: 1))
1188 return false;
1189
1190 // May need to rewrite constant operands.
1191 WorkList.push_back(x: ICmp);
1192 }
1193
1194 if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
1195 // Give up if the pointer may be captured.
1196 if (PointerMayBeCaptured(V: UseInst, ReturnCaptures: true, StoreCaptures: true))
1197 return false;
1198 // Don't collect the users of this.
1199 WorkList.push_back(x: User);
1200 continue;
1201 }
1202
1203 // Do not promote vector/aggregate type instructions. It is hard to track
1204 // their users.
1205 if (isa<InsertValueInst>(Val: User) || isa<InsertElementInst>(Val: User))
1206 return false;
1207
1208 if (!User->getType()->isPointerTy())
1209 continue;
1210
1211 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: UseInst)) {
1212 // Be conservative if an address could be computed outside the bounds of
1213 // the alloca.
1214 if (!GEP->isInBounds())
1215 return false;
1216 }
1217
1218 // Only promote a select if we know that the other select operand is from
1219 // another pointer that will also be promoted.
1220 if (SelectInst *SI = dyn_cast<SelectInst>(Val: UseInst)) {
1221 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: SI, OpIdx0: 1, OpIdx1: 2))
1222 return false;
1223 }
1224
1225 // Repeat for phis.
1226 if (PHINode *Phi = dyn_cast<PHINode>(Val: UseInst)) {
1227 // TODO: Handle more complex cases. We should be able to replace loops
1228 // over arrays.
1229 switch (Phi->getNumIncomingValues()) {
1230 case 1:
1231 break;
1232 case 2:
1233 if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Inst: Phi, OpIdx0: 0, OpIdx1: 1))
1234 return false;
1235 break;
1236 default:
1237 return false;
1238 }
1239 }
1240
1241 WorkList.push_back(x: User);
1242 if (!collectUsesWithPtrTypes(BaseAlloca, Val: User, WorkList))
1243 return false;
1244 }
1245
1246 return true;
1247}
1248
1249bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1250
1251 FunctionType *FTy = F.getFunctionType();
1252 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1253
1254 // If the function has any arguments in the local address space, then it's
1255 // possible these arguments require the entire local memory space, so
1256 // we cannot use local memory in the pass.
1257 for (Type *ParamTy : FTy->params()) {
1258 PointerType *PtrTy = dyn_cast<PointerType>(Val: ParamTy);
1259 if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1260 LocalMemLimit = 0;
1261 LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1262 "local memory disabled.\n");
1263 return false;
1264 }
1265 }
1266
1267 LocalMemLimit = ST.getAddressableLocalMemorySize();
1268 if (LocalMemLimit == 0)
1269 return false;
1270
1271 SmallVector<const Constant *, 16> Stack;
1272 SmallPtrSet<const Constant *, 8> VisitedConstants;
1273 SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
1274
1275 auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
1276 for (const User *U : Val->users()) {
1277 if (const Instruction *Use = dyn_cast<Instruction>(Val: U)) {
1278 if (Use->getParent()->getParent() == &F)
1279 return true;
1280 } else {
1281 const Constant *C = cast<Constant>(Val: U);
1282 if (VisitedConstants.insert(Ptr: C).second)
1283 Stack.push_back(Elt: C);
1284 }
1285 }
1286
1287 return false;
1288 };
1289
1290 for (GlobalVariable &GV : Mod->globals()) {
1291 if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
1292 continue;
1293
1294 if (visitUsers(&GV, &GV)) {
1295 UsedLDS.insert(Ptr: &GV);
1296 Stack.clear();
1297 continue;
1298 }
1299
1300 // For any ConstantExpr uses, we need to recursively search the users until
1301 // we see a function.
1302 while (!Stack.empty()) {
1303 const Constant *C = Stack.pop_back_val();
1304 if (visitUsers(&GV, C)) {
1305 UsedLDS.insert(Ptr: &GV);
1306 Stack.clear();
1307 break;
1308 }
1309 }
1310 }
1311
1312 const DataLayout &DL = Mod->getDataLayout();
1313 SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
1314 AllocatedSizes.reserve(N: UsedLDS.size());
1315
1316 for (const GlobalVariable *GV : UsedLDS) {
1317 Align Alignment =
1318 DL.getValueOrABITypeAlignment(Alignment: GV->getAlign(), Ty: GV->getValueType());
1319 uint64_t AllocSize = DL.getTypeAllocSize(Ty: GV->getValueType());
1320
1321 // HIP uses an extern unsized array in local address space for dynamically
1322 // allocated shared memory. In that case, we have to disable the promotion.
1323 if (GV->hasExternalLinkage() && AllocSize == 0) {
1324 LocalMemLimit = 0;
1325 LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1326 "local memory. Promoting to local memory "
1327 "disabled.\n");
1328 return false;
1329 }
1330
1331 AllocatedSizes.emplace_back(Args&: AllocSize, Args&: Alignment);
1332 }
1333
1334 // Sort to try to estimate the worst case alignment padding
1335 //
1336 // FIXME: We should really do something to fix the addresses to a more optimal
1337 // value instead
1338 llvm::sort(C&: AllocatedSizes, Comp: llvm::less_second());
1339
1340 // Check how much local memory is being used by global objects
1341 CurrentLocalMemUsage = 0;
1342
1343 // FIXME: Try to account for padding here. The real padding and address is
1344 // currently determined from the inverse order of uses in the function when
1345 // legalizing, which could also potentially change. We try to estimate the
1346 // worst case here, but we probably should fix the addresses earlier.
1347 for (auto Alloc : AllocatedSizes) {
1348 CurrentLocalMemUsage = alignTo(Size: CurrentLocalMemUsage, A: Alloc.second);
1349 CurrentLocalMemUsage += Alloc.first;
1350 }
1351
1352 unsigned MaxOccupancy =
1353 ST.getOccupancyWithLocalMemSize(Bytes: CurrentLocalMemUsage, F);
1354
1355 // Restrict local memory usage so that we don't drastically reduce occupancy,
1356 // unless it is already significantly reduced.
1357
  // TODO: Have some sort of hint or other heuristics to guess occupancy based
  // on other factors.
1360 unsigned OccupancyHint = ST.getWavesPerEU(F).second;
1361 if (OccupancyHint == 0)
1362 OccupancyHint = 7;
1363
1364 // Clamp to max value.
1365 OccupancyHint = std::min(a: OccupancyHint, b: ST.getMaxWavesPerEU());
1366
1367 // Check the hint but ignore it if it's obviously wrong from the existing LDS
1368 // usage.
1369 MaxOccupancy = std::min(a: OccupancyHint, b: MaxOccupancy);
1370
1371 // Round up to the next tier of usage.
1372 unsigned MaxSizeWithWaveCount =
1373 ST.getMaxLocalMemSizeWithWaveCount(WaveCount: MaxOccupancy, F);
1374
1375 // Program is possibly broken by using more local mem than available.
1376 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1377 return false;
1378
1379 LocalMemLimit = MaxSizeWithWaveCount;
1380
1381 LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1382 << " bytes of LDS\n"
1383 << " Rounding size to " << MaxSizeWithWaveCount
1384 << " with a maximum occupancy of " << MaxOccupancy << '\n'
1385 << " and " << (LocalMemLimit - CurrentLocalMemUsage)
1386 << " available for promotion\n");
1387
1388 return true;
1389}
1390
1391// FIXME: Should try to pick the most likely to be profitable allocas first.
1392bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
1393 bool SufficientLDS) {
1394 LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n');
1395
1396 if (DisablePromoteAllocaToLDS) {
1397 LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
1398 return false;
1399 }
1400
1401 const DataLayout &DL = Mod->getDataLayout();
1402 IRBuilder<> Builder(&I);
1403
1404 const Function &ContainingFunction = *I.getParent()->getParent();
1405 CallingConv::ID CC = ContainingFunction.getCallingConv();
1406
1407 // Don't promote the alloca to LDS for shader calling conventions as the work
1408 // item ID intrinsics are not supported for these calling conventions.
1409 // Furthermore not all LDS is available for some of the stages.
1410 switch (CC) {
1411 case CallingConv::AMDGPU_KERNEL:
1412 case CallingConv::SPIR_KERNEL:
1413 break;
1414 default:
1415 LLVM_DEBUG(
1416 dbgs()
1417 << " promote alloca to LDS not supported with calling convention.\n");
1418 return false;
1419 }
1420
1421 // Not likely to have sufficient local memory for promotion.
1422 if (!SufficientLDS)
1423 return false;
1424
1425 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F: ContainingFunction);
1426 unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F: ContainingFunction).second;
1427
1428 Align Alignment =
1429 DL.getValueOrABITypeAlignment(Alignment: I.getAlign(), Ty: I.getAllocatedType());
1430
1431 // FIXME: This computed padding is likely wrong since it depends on inverse
1432 // usage order.
1433 //
  // FIXME: It is also possible that, if we're allowed to use all of the
  // memory, we could end up using more than the maximum due to alignment
  // padding.
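  //
  // For example (illustrative numbers only): promoting a 64-byte alloca with a
  // maximum flat workgroup size of 256 reserves 256 * 64 = 16384 bytes of LDS
  // on top of the (aligned) CurrentLocalMemUsage.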
1436
1437 uint32_t NewSize = alignTo(Size: CurrentLocalMemUsage, A: Alignment);
1438 uint32_t AllocSize =
1439 WorkGroupSize * DL.getTypeAllocSize(Ty: I.getAllocatedType());
1440 NewSize += AllocSize;
1441
1442 if (NewSize > LocalMemLimit) {
1443 LLVM_DEBUG(dbgs() << " " << AllocSize
1444 << " bytes of local memory not available to promote\n");
1445 return false;
1446 }
1447
1448 CurrentLocalMemUsage = NewSize;
1449
1450 std::vector<Value *> WorkList;
1451
1452 if (!collectUsesWithPtrTypes(BaseAlloca: &I, Val: &I, WorkList)) {
1453 LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
1454 return false;
1455 }
1456
1457 LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1458
1459 Function *F = I.getParent()->getParent();
1460
1461 Type *GVTy = ArrayType::get(ElementType: I.getAllocatedType(), NumElements: WorkGroupSize);
1462 GlobalVariable *GV = new GlobalVariable(
1463 *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(T: GVTy),
1464 Twine(F->getName()) + Twine('.') + I.getName(), nullptr,
1465 GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
1466 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1467 GV->setAlignment(I.getAlign());
1468
1469 Value *TCntY, *TCntZ;
1470
1471 std::tie(args&: TCntY, args&: TCntZ) = getLocalSizeYZ(Builder);
1472 Value *TIdX = getWorkitemID(Builder, N: 0);
1473 Value *TIdY = getWorkitemID(Builder, N: 1);
1474 Value *TIdZ = getWorkitemID(Builder, N: 2);
1475
1476 Value *Tmp0 = Builder.CreateMul(LHS: TCntY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true);
1477 Tmp0 = Builder.CreateMul(LHS: Tmp0, RHS: TIdX);
1478 Value *Tmp1 = Builder.CreateMul(LHS: TIdY, RHS: TCntZ, Name: "", HasNUW: true, HasNSW: true);
1479 Value *TID = Builder.CreateAdd(LHS: Tmp0, RHS: Tmp1);
1480 TID = Builder.CreateAdd(LHS: TID, RHS: TIdZ);
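  // TID is now the linearized workitem id within the workgroup,
  // (TCntY * TCntZ) * TIdX + TCntZ * TIdY + TIdZ, and selects this thread's
  // private slot in the per-workgroup LDS array.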
1481
1482 LLVMContext &Context = Mod->getContext();
1483 Value *Indices[] = {Constant::getNullValue(Ty: Type::getInt32Ty(C&: Context)), TID};
1484
1485 Value *Offset = Builder.CreateInBoundsGEP(Ty: GVTy, Ptr: GV, IdxList: Indices);
1486 I.mutateType(Ty: Offset->getType());
1487 I.replaceAllUsesWith(V: Offset);
1488 I.eraseFromParent();
1489
1490 SmallVector<IntrinsicInst *> DeferredIntrs;
1491
1492 for (Value *V : WorkList) {
1493 CallInst *Call = dyn_cast<CallInst>(Val: V);
1494 if (!Call) {
1495 if (ICmpInst *CI = dyn_cast<ICmpInst>(Val: V)) {
1496 PointerType *NewTy = PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
1497
1498 if (isa<ConstantPointerNull>(Val: CI->getOperand(i_nocapture: 0)))
1499 CI->setOperand(i_nocapture: 0, Val_nocapture: ConstantPointerNull::get(T: NewTy));
1500
1501 if (isa<ConstantPointerNull>(Val: CI->getOperand(i_nocapture: 1)))
1502 CI->setOperand(i_nocapture: 1, Val_nocapture: ConstantPointerNull::get(T: NewTy));
1503
1504 continue;
1505 }
1506
1507 // The operand's value should be corrected on its own and we don't want to
1508 // touch the users.
1509 if (isa<AddrSpaceCastInst>(Val: V))
1510 continue;
1511
1512 PointerType *NewTy = PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS);
1513
1514 // FIXME: It doesn't really make sense to try to do this for all
1515 // instructions.
1516 V->mutateType(Ty: NewTy);
1517
1518 // Adjust the types of any constant operands.
1519 if (SelectInst *SI = dyn_cast<SelectInst>(Val: V)) {
1520 if (isa<ConstantPointerNull>(Val: SI->getOperand(i_nocapture: 1)))
1521 SI->setOperand(i_nocapture: 1, Val_nocapture: ConstantPointerNull::get(T: NewTy));
1522
1523 if (isa<ConstantPointerNull>(Val: SI->getOperand(i_nocapture: 2)))
1524 SI->setOperand(i_nocapture: 2, Val_nocapture: ConstantPointerNull::get(T: NewTy));
1525 } else if (PHINode *Phi = dyn_cast<PHINode>(Val: V)) {
1526 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1527 if (isa<ConstantPointerNull>(Val: Phi->getIncomingValue(i: I)))
1528 Phi->setIncomingValue(i: I, V: ConstantPointerNull::get(T: NewTy));
1529 }
1530 }
1531
1532 continue;
1533 }
1534
1535 IntrinsicInst *Intr = cast<IntrinsicInst>(Val: Call);
1536 Builder.SetInsertPoint(Intr);
1537 switch (Intr->getIntrinsicID()) {
1538 case Intrinsic::lifetime_start:
1539 case Intrinsic::lifetime_end:
1540 // These intrinsics are for address space 0 only
1541 Intr->eraseFromParent();
1542 continue;
1543 case Intrinsic::memcpy:
1544 case Intrinsic::memmove:
      // These have 2 pointer operands. If the second pointer also needs to be
      // replaced, we defer processing of these intrinsics until all other
      // values are processed.
1548 DeferredIntrs.push_back(Elt: Intr);
1549 continue;
1550 case Intrinsic::memset: {
1551 MemSetInst *MemSet = cast<MemSetInst>(Val: Intr);
1552 Builder.CreateMemSet(Ptr: MemSet->getRawDest(), Val: MemSet->getValue(),
1553 Size: MemSet->getLength(), Align: MemSet->getDestAlign(),
1554 isVolatile: MemSet->isVolatile());
1555 Intr->eraseFromParent();
1556 continue;
1557 }
1558 case Intrinsic::invariant_start:
1559 case Intrinsic::invariant_end:
1560 case Intrinsic::launder_invariant_group:
1561 case Intrinsic::strip_invariant_group:
1562 Intr->eraseFromParent();
1563 // FIXME: I think the invariant marker should still theoretically apply,
1564 // but the intrinsics need to be changed to accept pointers with any
1565 // address space.
1566 continue;
1567 case Intrinsic::objectsize: {
1568 Value *Src = Intr->getOperand(i_nocapture: 0);
1569 Function *ObjectSize = Intrinsic::getDeclaration(
1570 M: Mod, Intrinsic::id: objectsize,
1571 Tys: {Intr->getType(),
1572 PointerType::get(C&: Context, AddressSpace: AMDGPUAS::LOCAL_ADDRESS)});
1573
1574 CallInst *NewCall = Builder.CreateCall(
1575 Callee: ObjectSize,
1576 Args: {Src, Intr->getOperand(i_nocapture: 1), Intr->getOperand(i_nocapture: 2), Intr->getOperand(i_nocapture: 3)});
1577 Intr->replaceAllUsesWith(V: NewCall);
1578 Intr->eraseFromParent();
1579 continue;
1580 }
1581 default:
1582 Intr->print(O&: errs());
1583 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1584 }
1585 }
1586
1587 for (IntrinsicInst *Intr : DeferredIntrs) {
1588 Builder.SetInsertPoint(Intr);
1589 Intrinsic::ID ID = Intr->getIntrinsicID();
1590 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1591
1592 MemTransferInst *MI = cast<MemTransferInst>(Val: Intr);
1593 auto *B = Builder.CreateMemTransferInst(
1594 IntrID: ID, Dst: MI->getRawDest(), DstAlign: MI->getDestAlign(), Src: MI->getRawSource(),
1595 SrcAlign: MI->getSourceAlign(), Size: MI->getLength(), isVolatile: MI->isVolatile());
1596
1597 for (unsigned I = 0; I != 2; ++I) {
1598 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(i: I)) {
1599 B->addDereferenceableParamAttr(i: I, Bytes);
1600 }
1601 }
1602
1603 Intr->eraseFromParent();
1604 }
1605
1606 return true;
1607}