//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass extends the
// widening to handle cases where scalar sub-dword loads are only naturally
// aligned but not dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  UniformityInfo *UA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
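    // Two or more known-zero low bits mean the pointer is a multiple of 4.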
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
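  // Subtargets with native scalar sub-word loads don't need this widening.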
  if (ST.hasScalarSubwordLoads())
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  bool Changed = false;
  for (auto &BB : F)
    for (Instruction &I : llvm::make_early_inc_range(BB))
      Changed |= visit(I);

  return Changed;
}

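// Check whether LI is a candidate for widening: a simple, uniform (hence
// scalar) sub-DWORD load from the constant address space with at least
// natural alignment.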
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return UA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned; that case is handled
  // in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

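  // Byte offset of the load within its containing DWORD.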
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment can simply be promoted to
    // DWORD.
    LI.setAlignment(Align(4));
    return true;
  }

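  // Otherwise, widen the access: load the full DWORD containing the value,
  // then shift and truncate to recover the original bits. Illustrative IR for
  // an i16 load two bytes past a DWORD-aligned base (value names made up):
  //   %v = load i16, ptr addrspace(4) %p, align 2
  // becomes
  //   %wide = load i32, ptr addrspace(4) %p.dword, align 4
  //   %hi = lshr i32 %wide, 16
  //   %v = trunc i32 %hi to i16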
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

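  // Width of the loaded value in bits, and an integer type of that width used
  // to truncate the widened DWORD back down to the original size.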
  unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
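  // The widened load reads bytes outside the original access, so any range
  // metadata describing the narrow value no longer applies; drop it.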
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}