1 | //===----- HipStdPar.cpp - HIP C++ Standard Parallelism Support Passes ----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // This file implements two passes that enable HIP C++ Standard Parallelism |
9 | // Support: |
10 | // |
11 | // 1. AcceleratorCodeSelection (required): Given that only algorithms are |
12 | // accelerated, and that the accelerated implementation exists in the form of |
13 | // a compute kernel, we assume that only the kernel, and all functions |
14 | // reachable from it, constitute code that the user expects the accelerator |
15 | // to execute. Thus, we identify the set of all functions reachable from |
16 | // kernels, and then remove all unreachable ones. This last part is necessary |
17 | // because it is possible for code that the user did not expect to execute on |
18 | // an accelerator to contain constructs that cannot be handled by the target |
19 | // BE, which cannot be provably demonstrated to be dead code in general, and |
20 | // thus can lead to mis-compilation. The degenerate case of this is when a |
21 | // Module contains no kernels (the parent TU had no algorithm invocations fit |
22 | // for acceleration), which we handle by completely emptying said module. |
23 | // **NOTE**: The above does not handle indirectly reachable functions i.e. |
24 | // it is possible to obtain a case where the target of an indirect |
25 | // call is otherwise unreachable and thus is removed; this |
26 | // restriction is aligned with the current `-hipstdpar` limitations |
27 | // and will be relaxed in the future. |
28 | // |
29 | // 2. AllocationInterposition (required only when on-demand paging is |
30 | // unsupported): Some accelerators or operating systems might not support |
31 | // transparent on-demand paging. Thus, they would only be able to access |
32 | // memory that is allocated by an accelerator-aware mechanism. For such cases |
33 | // the user can opt into enabling allocation / deallocation interposition, |
34 | // whereby we replace calls to known allocation / deallocation functions with |
35 | // calls to runtime implemented equivalents that forward the requests to |
36 | // accelerator-aware interfaces. We also support freeing system allocated |
37 | // memory that ends up in one of the runtime equivalents, since this can |
38 | // happen if e.g. a library that was compiled without interposition returns |
39 | // an allocation that can be validly passed to `free`. |
40 | //===----------------------------------------------------------------------===// |
41 | |
42 | #include "llvm/Transforms/HipStdPar/HipStdPar.h" |
43 | |
44 | #include "llvm/ADT/SmallPtrSet.h" |
45 | #include "llvm/ADT/SmallVector.h" |
46 | #include "llvm/ADT/STLExtras.h" |
47 | #include "llvm/Analysis/CallGraph.h" |
48 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
49 | #include "llvm/IR/Constants.h" |
50 | #include "llvm/IR/DebugInfoMetadata.h" |
51 | #include "llvm/IR/Function.h" |
52 | #include "llvm/IR/Module.h" |
53 | #include "llvm/Transforms/Utils/ModuleUtils.h" |
54 | |
55 | #include <cassert> |
56 | #include <string> |
57 | #include <utility> |
58 | |
59 | using namespace llvm; |
60 | |
61 | template<typename T> |
62 | static inline void eraseFromModule(T &ToErase) { |
63 | ToErase.replaceAllUsesWith(PoisonValue::get(T: ToErase.getType())); |
64 | ToErase.eraseFromParent(); |
65 | } |
66 | |
67 | static inline bool checkIfSupported(GlobalVariable &G) { |
68 | if (!G.isThreadLocal()) |
69 | return true; |
70 | |
71 | G.dropDroppableUses(); |
72 | |
73 | if (!G.isConstantUsed()) |
74 | return true; |
75 | |
76 | std::string W; |
77 | raw_string_ostream OS(W); |
78 | |
79 | OS << "Accelerator does not support the thread_local variable " |
80 | << G.getName(); |
81 | |
82 | Instruction *I = nullptr; |
83 | SmallVector<User *> Tmp(G.user_begin(), G.user_end()); |
84 | SmallPtrSet<User *, 5> Visited; |
85 | do { |
86 | auto U = std::move(Tmp.back()); |
87 | Tmp.pop_back(); |
88 | |
89 | if (Visited.contains(Ptr: U)) |
90 | continue; |
91 | |
92 | if (isa<Instruction>(Val: U)) |
93 | I = cast<Instruction>(Val: U); |
94 | else |
95 | Tmp.insert(I: Tmp.end(), From: U->user_begin(), To: U->user_end()); |
96 | |
97 | Visited.insert(Ptr: U); |
98 | } while (!I && !Tmp.empty()); |
99 | |
100 | assert(I && "thread_local global should have at least one non-constant use." ); |
101 | |
102 | G.getContext().diagnose( |
103 | DI: DiagnosticInfoUnsupported(*I->getParent()->getParent(), W, |
104 | I->getDebugLoc(), DS_Error)); |
105 | |
106 | return false; |
107 | } |
108 | |
109 | static inline void clearModule(Module &M) { // TODO: simplify. |
110 | while (!M.functions().empty()) |
111 | eraseFromModule(ToErase&: *M.begin()); |
112 | while (!M.globals().empty()) |
113 | eraseFromModule(ToErase&: *M.globals().begin()); |
114 | while (!M.aliases().empty()) |
115 | eraseFromModule(ToErase&: *M.aliases().begin()); |
116 | while (!M.ifuncs().empty()) |
117 | eraseFromModule(ToErase&: *M.ifuncs().begin()); |
118 | } |
119 | |
120 | static inline void maybeHandleGlobals(Module &M) { |
121 | unsigned GlobAS = M.getDataLayout().getDefaultGlobalsAddressSpace(); |
122 | for (auto &&G : M.globals()) { // TODO: should we handle these in the FE? |
123 | if (!checkIfSupported(G)) |
124 | return clearModule(M); |
125 | |
126 | if (G.isThreadLocal()) |
127 | continue; |
128 | if (G.isConstant()) |
129 | continue; |
130 | if (G.getAddressSpace() != GlobAS) |
131 | continue; |
132 | if (G.getLinkage() != GlobalVariable::ExternalLinkage) |
133 | continue; |
134 | |
135 | G.setLinkage(GlobalVariable::ExternalWeakLinkage); |
136 | G.setExternallyInitialized(true); |
137 | } |
138 | } |
139 | |
140 | template<unsigned N> |
141 | static inline void removeUnreachableFunctions( |
142 | const SmallPtrSet<const Function *, N>& Reachable, Module &M) { |
143 | removeFromUsedLists(M, [&](Constant *C) { |
144 | if (auto F = dyn_cast<Function>(Val: C)) |
145 | return !Reachable.contains(F); |
146 | |
147 | return false; |
148 | }); |
149 | |
150 | SmallVector<std::reference_wrapper<Function>> ToRemove; |
151 | copy_if(M, std::back_inserter(x&: ToRemove), [&](auto &&F) { |
152 | return !F.isIntrinsic() && !Reachable.contains(&F); |
153 | }); |
154 | |
155 | for_each(Range&: ToRemove, F: eraseFromModule<Function>); |
156 | } |
157 | |
158 | static inline bool isAcceleratorExecutionRoot(const Function *F) { |
159 | if (!F) |
160 | return false; |
161 | |
162 | return F->getCallingConv() == CallingConv::AMDGPU_KERNEL; |
163 | } |
164 | |
165 | static inline bool checkIfSupported(const Function *F, const CallBase *CB) { |
166 | const auto Dx = F->getName().rfind(Str: "__hipstdpar_unsupported" ); |
167 | |
168 | if (Dx == StringRef::npos) |
169 | return true; |
170 | |
171 | const auto N = F->getName().substr(Start: 0, N: Dx); |
172 | |
173 | std::string W; |
174 | raw_string_ostream OS(W); |
175 | |
176 | if (N == "__ASM" ) |
177 | OS << "Accelerator does not support the ASM block:\n" |
178 | << cast<ConstantDataArray>(Val: CB->getArgOperand(i: 0))->getAsCString(); |
179 | else |
180 | OS << "Accelerator does not support the " << N << " function." ; |
181 | |
182 | auto Caller = CB->getParent()->getParent(); |
183 | |
184 | Caller->getContext().diagnose( |
185 | DI: DiagnosticInfoUnsupported(*Caller, W, CB->getDebugLoc(), DS_Error)); |
186 | |
187 | return false; |
188 | } |
189 | |
190 | PreservedAnalyses |
191 | HipStdParAcceleratorCodeSelectionPass::run(Module &M, |
192 | ModuleAnalysisManager &MAM) { |
193 | auto &CGA = MAM.getResult<CallGraphAnalysis>(IR&: M); |
194 | |
195 | SmallPtrSet<const Function *, 32> Reachable; |
196 | for (auto &&CGN : CGA) { |
197 | if (!isAcceleratorExecutionRoot(F: CGN.first)) |
198 | continue; |
199 | |
200 | Reachable.insert(Ptr: CGN.first); |
201 | |
202 | SmallVector<const Function *> Tmp({CGN.first}); |
203 | do { |
204 | auto F = std::move(Tmp.back()); |
205 | Tmp.pop_back(); |
206 | |
207 | for (auto &&N : *CGA[F]) { |
208 | if (!N.second) |
209 | continue; |
210 | if (!N.second->getFunction()) |
211 | continue; |
212 | if (Reachable.contains(Ptr: N.second->getFunction())) |
213 | continue; |
214 | |
215 | if (!checkIfSupported(F: N.second->getFunction(), |
216 | CB: dyn_cast<CallBase>(Val&: *N.first))) |
217 | return PreservedAnalyses::none(); |
218 | |
219 | Reachable.insert(Ptr: N.second->getFunction()); |
220 | Tmp.push_back(Elt: N.second->getFunction()); |
221 | } |
222 | } while (!std::empty(cont: Tmp)); |
223 | } |
224 | |
225 | if (std::empty(cont: Reachable)) |
226 | clearModule(M); |
227 | else |
228 | removeUnreachableFunctions(Reachable, M); |
229 | |
230 | maybeHandleGlobals(M); |
231 | |
232 | return PreservedAnalyses::none(); |
233 | } |
234 | |
235 | static constexpr std::pair<StringLiteral, StringLiteral> ReplaceMap[]{ |
236 | {"aligned_alloc" , "__hipstdpar_aligned_alloc" }, |
237 | {"calloc" , "__hipstdpar_calloc" }, |
238 | {"free" , "__hipstdpar_free" }, |
239 | {"malloc" , "__hipstdpar_malloc" }, |
240 | {"memalign" , "__hipstdpar_aligned_alloc" }, |
241 | {"posix_memalign" , "__hipstdpar_posix_aligned_alloc" }, |
242 | {"realloc" , "__hipstdpar_realloc" }, |
243 | {"reallocarray" , "__hipstdpar_realloc_array" }, |
244 | {"_ZdaPv" , "__hipstdpar_operator_delete" }, |
245 | {"_ZdaPvm" , "__hipstdpar_operator_delete_sized" }, |
246 | {"_ZdaPvSt11align_val_t" , "__hipstdpar_operator_delete_aligned" }, |
247 | {"_ZdaPvmSt11align_val_t" , "__hipstdpar_operator_delete_aligned_sized" }, |
248 | {"_ZdlPv" , "__hipstdpar_operator_delete" }, |
249 | {"_ZdlPvm" , "__hipstdpar_operator_delete_sized" }, |
250 | {"_ZdlPvSt11align_val_t" , "__hipstdpar_operator_delete_aligned" }, |
251 | {"_ZdlPvmSt11align_val_t" , "__hipstdpar_operator_delete_aligned_sized" }, |
252 | {"_Znam" , "__hipstdpar_operator_new" }, |
253 | {"_ZnamRKSt9nothrow_t" , "__hipstdpar_operator_new_nothrow" }, |
254 | {"_ZnamSt11align_val_t" , "__hipstdpar_operator_new_aligned" }, |
255 | {"_ZnamSt11align_val_tRKSt9nothrow_t" , |
256 | "__hipstdpar_operator_new_aligned_nothrow" }, |
257 | |
258 | {"_Znwm" , "__hipstdpar_operator_new" }, |
259 | {"_ZnwmRKSt9nothrow_t" , "__hipstdpar_operator_new_nothrow" }, |
260 | {"_ZnwmSt11align_val_t" , "__hipstdpar_operator_new_aligned" }, |
261 | {"_ZnwmSt11align_val_tRKSt9nothrow_t" , |
262 | "__hipstdpar_operator_new_aligned_nothrow" }, |
263 | {"__builtin_calloc" , "__hipstdpar_calloc" }, |
264 | {"__builtin_free" , "__hipstdpar_free" }, |
265 | {"__builtin_malloc" , "__hipstdpar_malloc" }, |
266 | {"__builtin_operator_delete" , "__hipstdpar_operator_delete" }, |
267 | {"__builtin_operator_new" , "__hipstdpar_operator_new" }, |
268 | {"__builtin_realloc" , "__hipstdpar_realloc" }, |
269 | {"__libc_calloc" , "__hipstdpar_calloc" }, |
270 | {"__libc_free" , "__hipstdpar_free" }, |
271 | {"__libc_malloc" , "__hipstdpar_malloc" }, |
272 | {"__libc_memalign" , "__hipstdpar_aligned_alloc" }, |
273 | {"__libc_realloc" , "__hipstdpar_realloc" } |
274 | }; |
275 | |
276 | PreservedAnalyses |
277 | HipStdParAllocationInterpositionPass::run(Module &M, ModuleAnalysisManager&) { |
278 | SmallDenseMap<StringRef, StringRef> AllocReplacements(std::cbegin(cont: ReplaceMap), |
279 | std::cend(cont: ReplaceMap)); |
280 | |
281 | for (auto &&F : M) { |
282 | if (!F.hasName()) |
283 | continue; |
284 | if (!AllocReplacements.contains(Val: F.getName())) |
285 | continue; |
286 | |
287 | if (auto R = M.getFunction(Name: AllocReplacements[F.getName()])) { |
288 | F.replaceAllUsesWith(V: R); |
289 | } else { |
290 | std::string W; |
291 | raw_string_ostream OS(W); |
292 | |
293 | OS << "cannot be interposed, missing: " << AllocReplacements[F.getName()] |
294 | << ". Tried to run the allocation interposition pass without the " |
295 | << "replacement functions available." ; |
296 | |
297 | F.getContext().diagnose(DI: DiagnosticInfoUnsupported(F, W, |
298 | F.getSubprogram(), |
299 | DS_Warning)); |
300 | } |
301 | } |
302 | |
303 | if (auto F = M.getFunction(Name: "__hipstdpar_hidden_free" )) { |
304 | auto LibcFree = M.getOrInsertFunction(Name: "__libc_free" , T: F->getFunctionType(), |
305 | AttributeList: F->getAttributes()); |
306 | F->replaceAllUsesWith(V: LibcFree.getCallee()); |
307 | |
308 | eraseFromModule(ToErase&: *F); |
309 | } |
310 | |
311 | return PreservedAnalyses::none(); |
312 | } |
313 | |