//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store-vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass extends the
// widening to handle cases where scalar sub-dword loads are only naturally
// aligned but not dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));

namespace {

class AMDGPULateCodeGenPrepare
    : public FunctionPass,
      public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  AssumptionCache *AC = nullptr;
  UniformityInfo *UA = nullptr;

public:
  static char ID;

  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}

  StringRef getPassName() const override {
    return "AMDGPU IR late optimizations";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<UniformityInfoWrapperPass>();
    AU.setPreservesAll();
  }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  bool visitInstruction(Instruction &) { return false; }

  // Check if the specified value is at least DWORD aligned.
  bool isDWORDAligned(const Value *V) const {
    KnownBits Known = computeKnownBits(V, *DL, 0, AC);
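    // Two or more known-zero low bits mean the pointer is a multiple of 4.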
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
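  // Subtargets with native scalar sub-word loads don't need this widening.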
  if (ST.hasScalarSubwordLoads())
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();

  bool Changed = false;
  for (auto &BB : F)
    for (Instruction &I : llvm::make_early_inc_range(BB))
      Changed |= visit(I);

  return Changed;
}

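// Check whether LI is a candidate for widening: a simple, uniform (hence
// scalar) sub-DWORD load from the constant address space with at least
// natural alignment.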
bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return UA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already at least DWORD aligned; that case is handled
  // in SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

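  // Byte offset of the load within its containing DWORD.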
  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment can simply be promoted to
    // DWORD.
    LI.setAlignment(Align(4));
    return true;
  }

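  // Otherwise, widen the access: load the full DWORD containing the value,
  // then shift and truncate to recover the original bits. Illustrative IR for
  // an i16 load two bytes past a DWORD-aligned base (value names made up):
  //   %v = load i16, ptr addrspace(4) %p, align 2
  // becomes
  //   %wide = load i32, ptr addrspace(4) %p.dword, align 4
  //   %hi = lshr i32 %wide, 16
  //   %v = trunc i32 %hi to i16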
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

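  // Width of the loaded value in bits, and an integer type of that width used
  // to truncate the widened DWORD back down to the original size.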
  unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  auto *NewPtr = IRB.CreateConstGEP1_64(
      IRB.getInt8Ty(),
      IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
      Offset - Adjust);

  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
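  // The widened load reads bytes outside the original access, so any range
  // metadata describing the narrow value no longer applies; drop it.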
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}