1 | //===- LowerGPUToHSACO.cpp - Convert GPU kernel to HSACO blob -------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements a pass that serializes a gpu module into HSAco blob and |
10 | // adds that blob as a string attribute of the module. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "mlir/Config/mlir-config.h" |
15 | #include "mlir/Dialect/GPU/Transforms/Passes.h" |
16 | #include "mlir/IR/Location.h" |
17 | #include "mlir/IR/MLIRContext.h" |
18 | |
19 | #if MLIR_ENABLE_ROCM_CONVERSIONS |
20 | #include "mlir/ExecutionEngine/OptUtils.h" |
21 | #include "mlir/Pass/Pass.h" |
22 | #include "mlir/Support/FileUtilities.h" |
23 | #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h" |
24 | #include "mlir/Target/LLVMIR/Export.h" |
25 | |
26 | #include "llvm/IR/Constants.h" |
27 | #include "llvm/IR/GlobalVariable.h" |
28 | #include "llvm/IR/Module.h" |
29 | #include "llvm/IRReader/IRReader.h" |
30 | #include "llvm/Linker/Linker.h" |
31 | |
32 | #include "llvm/MC/MCAsmBackend.h" |
33 | #include "llvm/MC/MCAsmInfo.h" |
34 | #include "llvm/MC/MCCodeEmitter.h" |
35 | #include "llvm/MC/MCContext.h" |
36 | #include "llvm/MC/MCInstrInfo.h" |
37 | #include "llvm/MC/MCObjectFileInfo.h" |
38 | #include "llvm/MC/MCObjectWriter.h" |
39 | #include "llvm/MC/MCParser/MCTargetAsmParser.h" |
40 | #include "llvm/MC/MCRegisterInfo.h" |
41 | #include "llvm/MC/MCStreamer.h" |
42 | #include "llvm/MC/MCSubtargetInfo.h" |
43 | #include "llvm/MC/TargetRegistry.h" |
44 | |
45 | #include "llvm/Support/CommandLine.h" |
46 | #include "llvm/Support/FileSystem.h" |
47 | #include "llvm/Support/FileUtilities.h" |
48 | #include "llvm/Support/Path.h" |
49 | #include "llvm/Support/Program.h" |
50 | #include "llvm/Support/SourceMgr.h" |
51 | #include "llvm/Support/TargetSelect.h" |
52 | #include "llvm/Support/Threading.h" |
53 | #include "llvm/Support/WithColor.h" |
54 | |
55 | #include "llvm/Target/TargetMachine.h" |
56 | #include "llvm/Target/TargetOptions.h" |
57 | |
58 | #include "llvm/Transforms/IPO/Internalize.h" |
59 | |
#include <cstdlib>
#include <optional>
61 | |
62 | using namespace mlir; |
63 | |
64 | namespace { |
65 | class SerializeToHsacoPass |
66 | : public PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass> { |
67 | static llvm::once_flag initializeBackendOnce; |
68 | |
69 | public: |
70 | MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SerializeToHsacoPass) |
71 | |
72 | SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features, |
73 | int optLevel); |
74 | SerializeToHsacoPass(const SerializeToHsacoPass &other); |
75 | StringRef getArgument() const override { return "gpu-to-hsaco" ; } |
76 | StringRef getDescription() const override { |
77 | return "Lower GPU kernel function to HSACO binary annotations" ; |
78 | } |
79 | |
80 | protected: |
81 | Option<std::string> rocmPath{*this, "rocm-path" , |
82 | llvm::cl::desc("Path to ROCm install" )}; |
83 | |
84 | // Overload to allow linking in device libs |
85 | std::unique_ptr<llvm::Module> |
86 | translateToLLVMIR(llvm::LLVMContext &llvmContext) override; |
87 | |
88 | private: |
89 | // Loads LLVM bitcode libraries |
90 | std::optional<SmallVector<std::unique_ptr<llvm::Module>, 3>> |
91 | loadLibraries(SmallVectorImpl<char> &path, |
92 | SmallVectorImpl<StringRef> &libraries, |
93 | llvm::LLVMContext &context); |
94 | |
95 | // Serializes ROCDL to HSACO. |
96 | std::unique_ptr<std::vector<char>> |
97 | serializeISA(const std::string &isa) override; |
98 | |
99 | LogicalResult assembleIsa(const std::string &isa, |
100 | SmallVectorImpl<char> &result); |
101 | std::unique_ptr<std::vector<char>> createHsaco(ArrayRef<char> isaBinary); |
102 | |
103 | std::string getRocmPath(); |
104 | }; |
105 | } // namespace |
106 | |
107 | SerializeToHsacoPass::SerializeToHsacoPass(const SerializeToHsacoPass &other) |
108 | : PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass>(other) {} |
109 | |
110 | /// Get a user-specified path to ROCm |
111 | // Tries, in order, the --rocm-path option, the ROCM_PATH environment variable |
112 | // and a compile-time default |
113 | std::string SerializeToHsacoPass::getRocmPath() { |
114 | if (rocmPath.getNumOccurrences() > 0) |
115 | return rocmPath.getValue(); |
116 | |
117 | return __DEFAULT_ROCM_PATH__; |
118 | } |
119 | |
120 | // Sets the 'option' to 'value' unless it already has a value. |
121 | static void maybeSetOption(Pass::Option<std::string> &option, |
122 | function_ref<std::string()> getValue) { |
123 | if (!option.hasValue()) |
124 | option = getValue(); |
125 | } |
126 | |
127 | llvm::once_flag SerializeToHsacoPass::initializeBackendOnce; |
128 | |
129 | SerializeToHsacoPass::SerializeToHsacoPass(StringRef triple, StringRef arch, |
130 | StringRef features, int optLevel) { |
131 | // No matter how this pass is constructed, ensure that the AMDGPU backend |
132 | // is initialized exactly once. |
133 | llvm::call_once(flag&: initializeBackendOnce, F: []() { |
134 | // Initialize LLVM AMDGPU backend. |
135 | LLVMInitializeAMDGPUAsmParser(); |
136 | LLVMInitializeAMDGPUAsmPrinter(); |
137 | LLVMInitializeAMDGPUTarget(); |
138 | LLVMInitializeAMDGPUTargetInfo(); |
139 | LLVMInitializeAMDGPUTargetMC(); |
140 | }); |
141 | maybeSetOption(this->triple, [&triple] { return triple.str(); }); |
142 | maybeSetOption(this->chip, [&arch] { return arch.str(); }); |
143 | maybeSetOption(this->features, [&features] { return features.str(); }); |
144 | if (this->optLevel.getNumOccurrences() == 0) |
145 | this->optLevel.setValue(optLevel); |
146 | } |
147 | |
148 | std::optional<SmallVector<std::unique_ptr<llvm::Module>, 3>> |
149 | SerializeToHsacoPass::loadLibraries(SmallVectorImpl<char> &path, |
150 | SmallVectorImpl<StringRef> &libraries, |
151 | llvm::LLVMContext &context) { |
152 | SmallVector<std::unique_ptr<llvm::Module>, 3> ret; |
153 | size_t dirLength = path.size(); |
154 | |
155 | if (!llvm::sys::fs::is_directory(Path: path)) { |
156 | getOperation().emitRemark() << "Bitcode path: " << path |
157 | << " does not exist or is not a directory\n" ; |
158 | return std::nullopt; |
159 | } |
160 | |
161 | for (const StringRef file : libraries) { |
162 | llvm::SMDiagnostic error; |
163 | llvm::sys::path::append(path, a: file); |
164 | llvm::StringRef pathRef(path.data(), path.size()); |
165 | std::unique_ptr<llvm::Module> library = |
166 | llvm::getLazyIRFileModule(Filename: pathRef, Err&: error, Context&: context); |
167 | path.truncate(N: dirLength); |
168 | if (!library) { |
169 | getOperation().emitError() << "Failed to load library " << file |
170 | << " from " << path << error.getMessage(); |
171 | return std::nullopt; |
172 | } |
173 | // Some ROCM builds don't strip this like they should |
174 | if (auto *openclVersion = library->getNamedMetadata(Name: "opencl.ocl.version" )) |
175 | library->eraseNamedMetadata(NMD: openclVersion); |
176 | // Stop spamming us with clang version numbers |
177 | if (auto *ident = library->getNamedMetadata(Name: "llvm.ident" )) |
178 | library->eraseNamedMetadata(NMD: ident); |
179 | ret.push_back(Elt: std::move(library)); |
180 | } |
181 | |
182 | return std::move(ret); |
183 | } |
184 | |
185 | std::unique_ptr<llvm::Module> |
186 | SerializeToHsacoPass::translateToLLVMIR(llvm::LLVMContext &llvmContext) { |
187 | // MLIR -> LLVM translation |
188 | std::unique_ptr<llvm::Module> ret = |
189 | gpu::SerializeToBlobPass::translateToLLVMIR(llvmContext); |
190 | |
191 | if (!ret) { |
192 | getOperation().emitOpError("Module lowering failed" ); |
193 | return ret; |
194 | } |
195 | // Walk the LLVM module in order to determine if we need to link in device |
196 | // libs |
197 | bool needOpenCl = false; |
198 | bool needOckl = false; |
199 | bool needOcml = false; |
200 | for (llvm::Function &f : ret->functions()) { |
201 | if (f.hasExternalLinkage() && f.hasName() && !f.hasExactDefinition()) { |
202 | StringRef funcName = f.getName(); |
203 | if ("printf" == funcName) |
204 | needOpenCl = true; |
205 | if (funcName.starts_with(Prefix: "__ockl_" )) |
206 | needOckl = true; |
207 | if (funcName.starts_with(Prefix: "__ocml_" )) |
208 | needOcml = true; |
209 | } |
210 | } |
211 | |
212 | if (needOpenCl) |
213 | needOcml = needOckl = true; |
214 | |
215 | // No libraries needed (the typical case) |
216 | if (!(needOpenCl || needOcml || needOckl)) |
217 | return ret; |
218 | |
219 | // Define one of the control constants the ROCm device libraries expect to be |
220 | // present These constants can either be defined in the module or can be |
221 | // imported by linking in bitcode that defines the constant. To simplify our |
222 | // logic, we define the constants into the module we are compiling |
223 | auto addControlConstant = [&module = *ret](StringRef name, uint32_t value, |
224 | uint32_t bitwidth) { |
225 | using llvm::GlobalVariable; |
226 | if (module.getNamedGlobal(Name: name)) { |
227 | return; |
228 | } |
229 | llvm::IntegerType *type = |
230 | llvm::IntegerType::getIntNTy(C&: module.getContext(), N: bitwidth); |
231 | auto *initializer = llvm::ConstantInt::get(Ty: type, V: value, /*isSigned=*/IsSigned: false); |
232 | auto *constant = new GlobalVariable( |
233 | module, type, |
234 | /*isConstant=*/true, GlobalVariable::LinkageTypes::LinkOnceODRLinkage, |
235 | initializer, name, |
236 | /*before=*/nullptr, |
237 | /*threadLocalMode=*/GlobalVariable::ThreadLocalMode::NotThreadLocal, |
238 | /*addressSpace=*/4); |
239 | constant->setUnnamedAddr(GlobalVariable::UnnamedAddr::Local); |
240 | constant->setVisibility( |
241 | GlobalVariable::VisibilityTypes::ProtectedVisibility); |
242 | constant->setAlignment(llvm::MaybeAlign(bitwidth / 8)); |
243 | }; |
244 | |
245 | // Set up control variables in the module instead of linking in tiny bitcode |
246 | if (needOcml) { |
247 | // TODO(kdrewnia): Enable math optimizations once we have support for |
248 | // `-ffast-math`-like options |
249 | addControlConstant("__oclc_finite_only_opt" , 0, 8); |
250 | addControlConstant("__oclc_daz_opt" , 0, 8); |
251 | addControlConstant("__oclc_correctly_rounded_sqrt32" , 1, 8); |
252 | addControlConstant("__oclc_unsafe_math_opt" , 0, 8); |
253 | } |
254 | if (needOcml || needOckl) { |
255 | addControlConstant("__oclc_wavefrontsize64" , 1, 8); |
256 | StringRef chipSet = this->chip.getValue(); |
257 | if (chipSet.starts_with(Prefix: "gfx" )) |
258 | chipSet = chipSet.substr(Start: 3); |
259 | uint32_t minor = |
260 | llvm::APInt(32, chipSet.substr(Start: chipSet.size() - 2), 16).getZExtValue(); |
261 | uint32_t major = llvm::APInt(32, chipSet.substr(Start: 0, N: chipSet.size() - 2), 10) |
262 | .getZExtValue(); |
263 | uint32_t isaNumber = minor + 1000 * major; |
264 | addControlConstant("__oclc_ISA_version" , isaNumber, 32); |
265 | |
266 | // This constant must always match the default code object ABI version |
267 | // of the AMDGPU backend. |
268 | addControlConstant("__oclc_ABI_version" , 500, 32); |
269 | } |
270 | |
271 | // Determine libraries we need to link - order matters due to dependencies |
272 | llvm::SmallVector<StringRef, 4> libraries; |
273 | if (needOpenCl) |
274 | libraries.push_back(Elt: "opencl.bc" ); |
275 | if (needOcml) |
276 | libraries.push_back(Elt: "ocml.bc" ); |
277 | if (needOckl) |
278 | libraries.push_back(Elt: "ockl.bc" ); |
279 | |
280 | std::optional<SmallVector<std::unique_ptr<llvm::Module>, 3>> mbModules; |
281 | std::string theRocmPath = getRocmPath(); |
282 | llvm::SmallString<32> bitcodePath(theRocmPath); |
283 | llvm::sys::path::append(path&: bitcodePath, a: "amdgcn" , b: "bitcode" ); |
284 | mbModules = loadLibraries(path&: bitcodePath, libraries, context&: llvmContext); |
285 | |
286 | if (!mbModules) { |
287 | getOperation() |
288 | .emitWarning("Could not load required device libraries" ) |
289 | .attachNote() |
290 | << "This will probably cause link-time or run-time failures" ; |
291 | return ret; // We can still abort here |
292 | } |
293 | |
294 | llvm::Linker linker(*ret); |
295 | for (std::unique_ptr<llvm::Module> &libModule : *mbModules) { |
296 | // This bitcode linking code is substantially similar to what is used in |
297 | // hip-clang It imports the library functions into the module, allowing LLVM |
298 | // optimization passes (which must run after linking) to optimize across the |
299 | // libraries and the module's code. We also only import symbols if they are |
300 | // referenced by the module or a previous library since there will be no |
301 | // other source of references to those symbols in this compilation and since |
302 | // we don't want to bloat the resulting code object. |
303 | bool err = linker.linkInModule( |
304 | Src: std::move(libModule), Flags: llvm::Linker::Flags::LinkOnlyNeeded, |
305 | InternalizeCallback: [](llvm::Module &m, const StringSet<> &gvs) { |
306 | llvm::internalizeModule(TheModule&: m, MustPreserveGV: [&gvs](const llvm::GlobalValue &gv) { |
307 | return !gv.hasName() || (gvs.count(Key: gv.getName()) == 0); |
308 | }); |
309 | }); |
310 | // True is linker failure |
311 | if (err) { |
312 | getOperation().emitError( |
313 | "Unrecoverable failure during device library linking." ); |
314 | // We have no guaranties about the state of `ret`, so bail |
315 | return nullptr; |
316 | } |
317 | } |
318 | |
319 | return ret; |
320 | } |
321 | |
322 | LogicalResult SerializeToHsacoPass::assembleIsa(const std::string &isa, |
323 | SmallVectorImpl<char> &result) { |
324 | auto loc = getOperation().getLoc(); |
325 | |
326 | llvm::raw_svector_ostream os(result); |
327 | |
328 | llvm::Triple triple(llvm::Triple::normalize(this->triple)); |
329 | std::string error; |
330 | const llvm::Target *target = |
331 | llvm::TargetRegistry::lookupTarget(Triple: triple.normalize(), Error&: error); |
332 | if (!target) |
333 | return emitError(loc, Twine("failed to lookup target: " ) + error); |
334 | |
335 | llvm::SourceMgr srcMgr; |
336 | srcMgr.AddNewSourceBuffer(F: llvm::MemoryBuffer::getMemBuffer(InputData: isa), IncludeLoc: SMLoc()); |
337 | |
338 | const llvm::MCTargetOptions mcOptions; |
339 | std::unique_ptr<llvm::MCRegisterInfo> mri( |
340 | target->createMCRegInfo(TT: this->triple)); |
341 | std::unique_ptr<llvm::MCAsmInfo> mai( |
342 | target->createMCAsmInfo(MRI: *mri, TheTriple: this->triple, Options: mcOptions)); |
343 | std::unique_ptr<llvm::MCSubtargetInfo> sti( |
344 | target->createMCSubtargetInfo(TheTriple: this->triple, CPU: this->chip, Features: this->features)); |
345 | |
346 | llvm::MCContext ctx(triple, mai.get(), mri.get(), sti.get(), &srcMgr, |
347 | &mcOptions); |
348 | std::unique_ptr<llvm::MCObjectFileInfo> mofi(target->createMCObjectFileInfo( |
349 | Ctx&: ctx, /*PIC=*/false, /*LargeCodeModel=*/false)); |
350 | ctx.setObjectFileInfo(mofi.get()); |
351 | |
352 | SmallString<128> cwd; |
353 | if (!llvm::sys::fs::current_path(result&: cwd)) |
354 | ctx.setCompilationDir(cwd); |
355 | |
356 | std::unique_ptr<llvm::MCStreamer> mcStreamer; |
357 | std::unique_ptr<llvm::MCInstrInfo> mcii(target->createMCInstrInfo()); |
358 | |
359 | llvm::MCCodeEmitter *ce = target->createMCCodeEmitter(II: *mcii, Ctx&: ctx); |
360 | llvm::MCAsmBackend *mab = target->createMCAsmBackend(STI: *sti, MRI: *mri, Options: mcOptions); |
361 | mcStreamer.reset(p: target->createMCObjectStreamer( |
362 | T: triple, Ctx&: ctx, TAB: std::unique_ptr<llvm::MCAsmBackend>(mab), |
363 | OW: mab->createObjectWriter(OS&: os), Emitter: std::unique_ptr<llvm::MCCodeEmitter>(ce), |
364 | STI: *sti, RelaxAll: mcOptions.MCRelaxAll, IncrementalLinkerCompatible: mcOptions.MCIncrementalLinkerCompatible, |
365 | /*DWARFMustBeAtTheEnd*/ false)); |
366 | mcStreamer->setUseAssemblerInfoForParsing(true); |
367 | |
368 | std::unique_ptr<llvm::MCAsmParser> parser( |
369 | createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai)); |
370 | std::unique_ptr<llvm::MCTargetAsmParser> tap( |
371 | target->createMCAsmParser(STI: *sti, Parser&: *parser, MII: *mcii, Options: mcOptions)); |
372 | |
373 | if (!tap) |
374 | return emitError(loc, "assembler initialization error" ); |
375 | |
376 | parser->setTargetParser(*tap); |
377 | parser->Run(NoInitialTextSection: false); |
378 | |
379 | return success(); |
380 | } |
381 | |
382 | std::unique_ptr<std::vector<char>> |
383 | SerializeToHsacoPass::createHsaco(ArrayRef<char> isaBinary) { |
384 | auto loc = getOperation().getLoc(); |
385 | |
386 | // Save the ISA binary to a temp file. |
387 | int tempIsaBinaryFd = -1; |
388 | SmallString<128> tempIsaBinaryFilename; |
389 | if (llvm::sys::fs::createTemporaryFile(Prefix: "kernel" , Suffix: "o" , ResultFD&: tempIsaBinaryFd, |
390 | ResultPath&: tempIsaBinaryFilename)) { |
391 | emitError(loc, "temporary file for ISA binary creation error" ); |
392 | return {}; |
393 | } |
394 | llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename); |
395 | llvm::raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true); |
396 | tempIsaBinaryOs << StringRef(isaBinary.data(), isaBinary.size()); |
397 | tempIsaBinaryOs.close(); |
398 | |
399 | // Create a temp file for HSA code object. |
400 | SmallString<128> tempHsacoFilename; |
401 | if (llvm::sys::fs::createTemporaryFile(Prefix: "kernel" , Suffix: "hsaco" , |
402 | ResultPath&: tempHsacoFilename)) { |
403 | emitError(loc, "temporary file for HSA code object creation error" ); |
404 | return {}; |
405 | } |
406 | llvm::FileRemover cleanupHsaco(tempHsacoFilename); |
407 | |
408 | std::string theRocmPath = getRocmPath(); |
409 | llvm::SmallString<32> lldPath(theRocmPath); |
410 | llvm::sys::path::append(path&: lldPath, a: "llvm" , b: "bin" , c: "ld.lld" ); |
411 | int lldResult = llvm::sys::ExecuteAndWait( |
412 | Program: lldPath, |
413 | Args: {"ld.lld" , "-shared" , tempIsaBinaryFilename, "-o" , tempHsacoFilename}); |
414 | if (lldResult != 0) { |
415 | emitError(loc, "lld invocation error" ); |
416 | return {}; |
417 | } |
418 | |
419 | // Load the HSA code object. |
420 | auto hsacoFile = |
421 | llvm::MemoryBuffer::getFile(Filename: tempHsacoFilename, /*IsText=*/false); |
422 | if (!hsacoFile) { |
423 | emitError(loc, "read HSA code object from temp file error" ); |
424 | return {}; |
425 | } |
426 | |
427 | StringRef buffer = (*hsacoFile)->getBuffer(); |
428 | return std::make_unique<std::vector<char>>(args: buffer.begin(), args: buffer.end()); |
429 | } |
430 | |
431 | std::unique_ptr<std::vector<char>> |
432 | SerializeToHsacoPass::serializeISA(const std::string &isa) { |
433 | SmallVector<char, 0> isaBinary; |
434 | if (failed(result: assembleIsa(isa, result&: isaBinary))) |
435 | return {}; |
436 | return createHsaco(isaBinary); |
437 | } |
438 | |
439 | // Register pass to serialize GPU kernel functions to a HSACO binary annotation. |
440 | void mlir::registerGpuSerializeToHsacoPass() { |
441 | PassRegistration<SerializeToHsacoPass> registerSerializeToHSACO([] { |
442 | return std::make_unique<SerializeToHsacoPass>(args: "amdgcn-amd-amdhsa" , args: "" , args: "" , |
443 | args: 2); |
444 | }); |
445 | } |
446 | |
447 | /// Create an instance of the GPU kernel function to HSAco binary serialization |
448 | /// pass. |
449 | std::unique_ptr<Pass> mlir::createGpuSerializeToHsacoPass(StringRef triple, |
450 | StringRef arch, |
451 | StringRef features, |
452 | int optLevel) { |
453 | return std::make_unique<SerializeToHsacoPass>(args&: triple, args&: arch, args&: features, |
454 | args&: optLevel); |
455 | } |
456 | |
457 | #else // MLIR_ENABLE_ROCM_CONVERSIONS |
458 | void mlir::registerGpuSerializeToHsacoPass() {} |
459 | #endif // MLIR_ENABLE_ROCM_CONVERSIONS |
460 | |