//===- SerializeToHsaco.cpp - Convert GPU kernel to HSACO blob ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a pass that serializes a gpu module into HSAco blob and
10// adds that blob as a string attribute of the module.
11//
12//===----------------------------------------------------------------------===//
13
14#include "mlir/Config/mlir-config.h"
15#include "mlir/Dialect/GPU/Transforms/Passes.h"
16#include "mlir/IR/Location.h"
17#include "mlir/IR/MLIRContext.h"
18
19#if MLIR_ENABLE_ROCM_CONVERSIONS
20#include "mlir/ExecutionEngine/OptUtils.h"
21#include "mlir/Pass/Pass.h"
22#include "mlir/Support/FileUtilities.h"
23#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
24#include "mlir/Target/LLVMIR/Export.h"
25
26#include "llvm/IR/Constants.h"
27#include "llvm/IR/GlobalVariable.h"
28#include "llvm/IR/Module.h"
29#include "llvm/IRReader/IRReader.h"
30#include "llvm/Linker/Linker.h"
31
32#include "llvm/MC/MCAsmBackend.h"
33#include "llvm/MC/MCAsmInfo.h"
34#include "llvm/MC/MCCodeEmitter.h"
35#include "llvm/MC/MCContext.h"
36#include "llvm/MC/MCInstrInfo.h"
37#include "llvm/MC/MCObjectFileInfo.h"
38#include "llvm/MC/MCObjectWriter.h"
39#include "llvm/MC/MCParser/MCTargetAsmParser.h"
40#include "llvm/MC/MCRegisterInfo.h"
41#include "llvm/MC/MCStreamer.h"
42#include "llvm/MC/MCSubtargetInfo.h"
43#include "llvm/MC/TargetRegistry.h"
44
45#include "llvm/Support/CommandLine.h"
46#include "llvm/Support/FileSystem.h"
47#include "llvm/Support/FileUtilities.h"
48#include "llvm/Support/Path.h"
49#include "llvm/Support/Program.h"
50#include "llvm/Support/SourceMgr.h"
51#include "llvm/Support/TargetSelect.h"
52#include "llvm/Support/Threading.h"
53#include "llvm/Support/WithColor.h"
54
55#include "llvm/Target/TargetMachine.h"
56#include "llvm/Target/TargetOptions.h"
57
58#include "llvm/Transforms/IPO/Internalize.h"
59
60#include <optional>
61
62using namespace mlir;
63
64namespace {
65class SerializeToHsacoPass
66 : public PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass> {
67 static llvm::once_flag initializeBackendOnce;
68
69public:
70 MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(SerializeToHsacoPass)
71
72 SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features,
73 int optLevel);
74 SerializeToHsacoPass(const SerializeToHsacoPass &other);
75 StringRef getArgument() const override { return "gpu-to-hsaco"; }
76 StringRef getDescription() const override {
77 return "Lower GPU kernel function to HSACO binary annotations";
78 }
79
80protected:
81 Option<std::string> rocmPath{*this, "rocm-path",
82 llvm::cl::desc("Path to ROCm install")};
83
84 // Overload to allow linking in device libs
85 std::unique_ptr<llvm::Module>
86 translateToLLVMIR(llvm::LLVMContext &llvmContext) override;
87
88private:
89 // Loads LLVM bitcode libraries
90 std::optional<SmallVector<std::unique_ptr<llvm::Module>, 3>>
91 loadLibraries(SmallVectorImpl<char> &path,
92 SmallVectorImpl<StringRef> &libraries,
93 llvm::LLVMContext &context);
94
95 // Serializes ROCDL to HSACO.
96 std::unique_ptr<std::vector<char>>
97 serializeISA(const std::string &isa) override;
98
99 LogicalResult assembleIsa(const std::string &isa,
100 SmallVectorImpl<char> &result);
101 std::unique_ptr<std::vector<char>> createHsaco(ArrayRef<char> isaBinary);
102
103 std::string getRocmPath();
104};
105} // namespace
106
107SerializeToHsacoPass::SerializeToHsacoPass(const SerializeToHsacoPass &other)
108 : PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass>(other) {}
109
110/// Get a user-specified path to ROCm
111// Tries, in order, the --rocm-path option, the ROCM_PATH environment variable
112// and a compile-time default
113std::string SerializeToHsacoPass::getRocmPath() {
114 if (rocmPath.getNumOccurrences() > 0)
115 return rocmPath.getValue();
116
117 return __DEFAULT_ROCM_PATH__;
118}
119
120// Sets the 'option' to 'value' unless it already has a value.
121static void maybeSetOption(Pass::Option<std::string> &option,
122 function_ref<std::string()> getValue) {
123 if (!option.hasValue())
124 option = getValue();
125}
126
127llvm::once_flag SerializeToHsacoPass::initializeBackendOnce;
128
129SerializeToHsacoPass::SerializeToHsacoPass(StringRef triple, StringRef arch,
130 StringRef features, int optLevel) {
131 // No matter how this pass is constructed, ensure that the AMDGPU backend
132 // is initialized exactly once.
133 llvm::call_once(flag&: initializeBackendOnce, F: []() {
134 // Initialize LLVM AMDGPU backend.
135 LLVMInitializeAMDGPUAsmParser();
136 LLVMInitializeAMDGPUAsmPrinter();
137 LLVMInitializeAMDGPUTarget();
138 LLVMInitializeAMDGPUTargetInfo();
139 LLVMInitializeAMDGPUTargetMC();
140 });
141 maybeSetOption(this->triple, [&triple] { return triple.str(); });
142 maybeSetOption(this->chip, [&arch] { return arch.str(); });
143 maybeSetOption(this->features, [&features] { return features.str(); });
144 if (this->optLevel.getNumOccurrences() == 0)
145 this->optLevel.setValue(optLevel);
146}
147
148std::optional<SmallVector<std::unique_ptr<llvm::Module>, 3>>
149SerializeToHsacoPass::loadLibraries(SmallVectorImpl<char> &path,
150 SmallVectorImpl<StringRef> &libraries,
151 llvm::LLVMContext &context) {
152 SmallVector<std::unique_ptr<llvm::Module>, 3> ret;
153 size_t dirLength = path.size();
154
155 if (!llvm::sys::fs::is_directory(Path: path)) {
156 getOperation().emitRemark() << "Bitcode path: " << path
157 << " does not exist or is not a directory\n";
158 return std::nullopt;
159 }
160
161 for (const StringRef file : libraries) {
162 llvm::SMDiagnostic error;
163 llvm::sys::path::append(path, a: file);
164 llvm::StringRef pathRef(path.data(), path.size());
165 std::unique_ptr<llvm::Module> library =
166 llvm::getLazyIRFileModule(Filename: pathRef, Err&: error, Context&: context);
167 path.truncate(N: dirLength);
168 if (!library) {
169 getOperation().emitError() << "Failed to load library " << file
170 << " from " << path << error.getMessage();
171 return std::nullopt;
172 }
173 // Some ROCM builds don't strip this like they should
174 if (auto *openclVersion = library->getNamedMetadata(Name: "opencl.ocl.version"))
175 library->eraseNamedMetadata(NMD: openclVersion);
176 // Stop spamming us with clang version numbers
177 if (auto *ident = library->getNamedMetadata(Name: "llvm.ident"))
178 library->eraseNamedMetadata(NMD: ident);
179 ret.push_back(Elt: std::move(library));
180 }
181
182 return std::move(ret);
183}
184
185std::unique_ptr<llvm::Module>
186SerializeToHsacoPass::translateToLLVMIR(llvm::LLVMContext &llvmContext) {
187 // MLIR -> LLVM translation
188 std::unique_ptr<llvm::Module> ret =
189 gpu::SerializeToBlobPass::translateToLLVMIR(llvmContext);
190
191 if (!ret) {
192 getOperation().emitOpError("Module lowering failed");
193 return ret;
194 }
195 // Walk the LLVM module in order to determine if we need to link in device
196 // libs
197 bool needOpenCl = false;
198 bool needOckl = false;
199 bool needOcml = false;
200 for (llvm::Function &f : ret->functions()) {
201 if (f.hasExternalLinkage() && f.hasName() && !f.hasExactDefinition()) {
202 StringRef funcName = f.getName();
203 if ("printf" == funcName)
204 needOpenCl = true;
205 if (funcName.starts_with(Prefix: "__ockl_"))
206 needOckl = true;
207 if (funcName.starts_with(Prefix: "__ocml_"))
208 needOcml = true;
209 }
210 }
211
212 if (needOpenCl)
213 needOcml = needOckl = true;
214
215 // No libraries needed (the typical case)
216 if (!(needOpenCl || needOcml || needOckl))
217 return ret;
218
219 // Define one of the control constants the ROCm device libraries expect to be
220 // present These constants can either be defined in the module or can be
221 // imported by linking in bitcode that defines the constant. To simplify our
222 // logic, we define the constants into the module we are compiling
223 auto addControlConstant = [&module = *ret](StringRef name, uint32_t value,
224 uint32_t bitwidth) {
225 using llvm::GlobalVariable;
226 if (module.getNamedGlobal(Name: name)) {
227 return;
228 }
229 llvm::IntegerType *type =
230 llvm::IntegerType::getIntNTy(C&: module.getContext(), N: bitwidth);
231 auto *initializer = llvm::ConstantInt::get(Ty: type, V: value, /*isSigned=*/IsSigned: false);
232 auto *constant = new GlobalVariable(
233 module, type,
234 /*isConstant=*/true, GlobalVariable::LinkageTypes::LinkOnceODRLinkage,
235 initializer, name,
236 /*before=*/nullptr,
237 /*threadLocalMode=*/GlobalVariable::ThreadLocalMode::NotThreadLocal,
238 /*addressSpace=*/4);
239 constant->setUnnamedAddr(GlobalVariable::UnnamedAddr::Local);
240 constant->setVisibility(
241 GlobalVariable::VisibilityTypes::ProtectedVisibility);
242 constant->setAlignment(llvm::MaybeAlign(bitwidth / 8));
243 };
244
245 // Set up control variables in the module instead of linking in tiny bitcode
246 if (needOcml) {
247 // TODO(kdrewnia): Enable math optimizations once we have support for
248 // `-ffast-math`-like options
249 addControlConstant("__oclc_finite_only_opt", 0, 8);
250 addControlConstant("__oclc_daz_opt", 0, 8);
251 addControlConstant("__oclc_correctly_rounded_sqrt32", 1, 8);
252 addControlConstant("__oclc_unsafe_math_opt", 0, 8);
253 }
254 if (needOcml || needOckl) {
255 addControlConstant("__oclc_wavefrontsize64", 1, 8);
256 StringRef chipSet = this->chip.getValue();
257 if (chipSet.starts_with(Prefix: "gfx"))
258 chipSet = chipSet.substr(Start: 3);
259 uint32_t minor =
260 llvm::APInt(32, chipSet.substr(Start: chipSet.size() - 2), 16).getZExtValue();
261 uint32_t major = llvm::APInt(32, chipSet.substr(Start: 0, N: chipSet.size() - 2), 10)
262 .getZExtValue();
263 uint32_t isaNumber = minor + 1000 * major;
264 addControlConstant("__oclc_ISA_version", isaNumber, 32);
265
266 // This constant must always match the default code object ABI version
267 // of the AMDGPU backend.
268 addControlConstant("__oclc_ABI_version", 500, 32);
269 }
270
271 // Determine libraries we need to link - order matters due to dependencies
272 llvm::SmallVector<StringRef, 4> libraries;
273 if (needOpenCl)
274 libraries.push_back(Elt: "opencl.bc");
275 if (needOcml)
276 libraries.push_back(Elt: "ocml.bc");
277 if (needOckl)
278 libraries.push_back(Elt: "ockl.bc");
279
280 std::optional<SmallVector<std::unique_ptr<llvm::Module>, 3>> mbModules;
281 std::string theRocmPath = getRocmPath();
282 llvm::SmallString<32> bitcodePath(theRocmPath);
283 llvm::sys::path::append(path&: bitcodePath, a: "amdgcn", b: "bitcode");
284 mbModules = loadLibraries(path&: bitcodePath, libraries, context&: llvmContext);
285
286 if (!mbModules) {
287 getOperation()
288 .emitWarning("Could not load required device libraries")
289 .attachNote()
290 << "This will probably cause link-time or run-time failures";
291 return ret; // We can still abort here
292 }
293
294 llvm::Linker linker(*ret);
295 for (std::unique_ptr<llvm::Module> &libModule : *mbModules) {
296 // This bitcode linking code is substantially similar to what is used in
297 // hip-clang It imports the library functions into the module, allowing LLVM
298 // optimization passes (which must run after linking) to optimize across the
299 // libraries and the module's code. We also only import symbols if they are
300 // referenced by the module or a previous library since there will be no
301 // other source of references to those symbols in this compilation and since
302 // we don't want to bloat the resulting code object.
303 bool err = linker.linkInModule(
304 Src: std::move(libModule), Flags: llvm::Linker::Flags::LinkOnlyNeeded,
305 InternalizeCallback: [](llvm::Module &m, const StringSet<> &gvs) {
306 llvm::internalizeModule(TheModule&: m, MustPreserveGV: [&gvs](const llvm::GlobalValue &gv) {
307 return !gv.hasName() || (gvs.count(Key: gv.getName()) == 0);
308 });
309 });
310 // True is linker failure
311 if (err) {
312 getOperation().emitError(
313 "Unrecoverable failure during device library linking.");
314 // We have no guaranties about the state of `ret`, so bail
315 return nullptr;
316 }
317 }
318
319 return ret;
320}
321
322LogicalResult SerializeToHsacoPass::assembleIsa(const std::string &isa,
323 SmallVectorImpl<char> &result) {
324 auto loc = getOperation().getLoc();
325
326 llvm::raw_svector_ostream os(result);
327
328 llvm::Triple triple(llvm::Triple::normalize(this->triple));
329 std::string error;
330 const llvm::Target *target =
331 llvm::TargetRegistry::lookupTarget(Triple: triple.normalize(), Error&: error);
332 if (!target)
333 return emitError(loc, Twine("failed to lookup target: ") + error);
334
335 llvm::SourceMgr srcMgr;
336 srcMgr.AddNewSourceBuffer(F: llvm::MemoryBuffer::getMemBuffer(InputData: isa), IncludeLoc: SMLoc());
337
338 const llvm::MCTargetOptions mcOptions;
339 std::unique_ptr<llvm::MCRegisterInfo> mri(
340 target->createMCRegInfo(TT: this->triple));
341 std::unique_ptr<llvm::MCAsmInfo> mai(
342 target->createMCAsmInfo(MRI: *mri, TheTriple: this->triple, Options: mcOptions));
343 std::unique_ptr<llvm::MCSubtargetInfo> sti(
344 target->createMCSubtargetInfo(TheTriple: this->triple, CPU: this->chip, Features: this->features));
345
346 llvm::MCContext ctx(triple, mai.get(), mri.get(), sti.get(), &srcMgr,
347 &mcOptions);
348 std::unique_ptr<llvm::MCObjectFileInfo> mofi(target->createMCObjectFileInfo(
349 Ctx&: ctx, /*PIC=*/false, /*LargeCodeModel=*/false));
350 ctx.setObjectFileInfo(mofi.get());
351
352 SmallString<128> cwd;
353 if (!llvm::sys::fs::current_path(result&: cwd))
354 ctx.setCompilationDir(cwd);
355
356 std::unique_ptr<llvm::MCStreamer> mcStreamer;
357 std::unique_ptr<llvm::MCInstrInfo> mcii(target->createMCInstrInfo());
358
359 llvm::MCCodeEmitter *ce = target->createMCCodeEmitter(II: *mcii, Ctx&: ctx);
360 llvm::MCAsmBackend *mab = target->createMCAsmBackend(STI: *sti, MRI: *mri, Options: mcOptions);
361 mcStreamer.reset(p: target->createMCObjectStreamer(
362 T: triple, Ctx&: ctx, TAB: std::unique_ptr<llvm::MCAsmBackend>(mab),
363 OW: mab->createObjectWriter(OS&: os), Emitter: std::unique_ptr<llvm::MCCodeEmitter>(ce),
364 STI: *sti, RelaxAll: mcOptions.MCRelaxAll, IncrementalLinkerCompatible: mcOptions.MCIncrementalLinkerCompatible,
365 /*DWARFMustBeAtTheEnd*/ false));
366 mcStreamer->setUseAssemblerInfoForParsing(true);
367
368 std::unique_ptr<llvm::MCAsmParser> parser(
369 createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai));
370 std::unique_ptr<llvm::MCTargetAsmParser> tap(
371 target->createMCAsmParser(STI: *sti, Parser&: *parser, MII: *mcii, Options: mcOptions));
372
373 if (!tap)
374 return emitError(loc, "assembler initialization error");
375
376 parser->setTargetParser(*tap);
377 parser->Run(NoInitialTextSection: false);
378
379 return success();
380}
381
382std::unique_ptr<std::vector<char>>
383SerializeToHsacoPass::createHsaco(ArrayRef<char> isaBinary) {
384 auto loc = getOperation().getLoc();
385
386 // Save the ISA binary to a temp file.
387 int tempIsaBinaryFd = -1;
388 SmallString<128> tempIsaBinaryFilename;
389 if (llvm::sys::fs::createTemporaryFile(Prefix: "kernel", Suffix: "o", ResultFD&: tempIsaBinaryFd,
390 ResultPath&: tempIsaBinaryFilename)) {
391 emitError(loc, "temporary file for ISA binary creation error");
392 return {};
393 }
394 llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
395 llvm::raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true);
396 tempIsaBinaryOs << StringRef(isaBinary.data(), isaBinary.size());
397 tempIsaBinaryOs.close();
398
399 // Create a temp file for HSA code object.
400 SmallString<128> tempHsacoFilename;
401 if (llvm::sys::fs::createTemporaryFile(Prefix: "kernel", Suffix: "hsaco",
402 ResultPath&: tempHsacoFilename)) {
403 emitError(loc, "temporary file for HSA code object creation error");
404 return {};
405 }
406 llvm::FileRemover cleanupHsaco(tempHsacoFilename);
407
408 std::string theRocmPath = getRocmPath();
409 llvm::SmallString<32> lldPath(theRocmPath);
410 llvm::sys::path::append(path&: lldPath, a: "llvm", b: "bin", c: "ld.lld");
411 int lldResult = llvm::sys::ExecuteAndWait(
412 Program: lldPath,
413 Args: {"ld.lld", "-shared", tempIsaBinaryFilename, "-o", tempHsacoFilename});
414 if (lldResult != 0) {
415 emitError(loc, "lld invocation error");
416 return {};
417 }
418
419 // Load the HSA code object.
420 auto hsacoFile =
421 llvm::MemoryBuffer::getFile(Filename: tempHsacoFilename, /*IsText=*/false);
422 if (!hsacoFile) {
423 emitError(loc, "read HSA code object from temp file error");
424 return {};
425 }
426
427 StringRef buffer = (*hsacoFile)->getBuffer();
428 return std::make_unique<std::vector<char>>(args: buffer.begin(), args: buffer.end());
429}
430
431std::unique_ptr<std::vector<char>>
432SerializeToHsacoPass::serializeISA(const std::string &isa) {
433 SmallVector<char, 0> isaBinary;
434 if (failed(result: assembleIsa(isa, result&: isaBinary)))
435 return {};
436 return createHsaco(isaBinary);
437}
438
439// Register pass to serialize GPU kernel functions to a HSACO binary annotation.
440void mlir::registerGpuSerializeToHsacoPass() {
441 PassRegistration<SerializeToHsacoPass> registerSerializeToHSACO([] {
442 return std::make_unique<SerializeToHsacoPass>(args: "amdgcn-amd-amdhsa", args: "", args: "",
443 args: 2);
444 });
445}
446
447/// Create an instance of the GPU kernel function to HSAco binary serialization
448/// pass.
449std::unique_ptr<Pass> mlir::createGpuSerializeToHsacoPass(StringRef triple,
450 StringRef arch,
451 StringRef features,
452 int optLevel) {
453 return std::make_unique<SerializeToHsacoPass>(args&: triple, args&: arch, args&: features,
454 args&: optLevel);
455}
456
457#else // MLIR_ENABLE_ROCM_CONVERSIONS
458void mlir::registerGpuSerializeToHsacoPass() {}
459#endif // MLIR_ENABLE_ROCM_CONVERSIONS
460

// Source: mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp