1//===- Target.cpp - MLIR LLVM NVVM target compilation -----------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
// This file defines NVVM target-related functions, including registration
// calls for the `#nvvm.target` compilation attribute.
11//
12//===----------------------------------------------------------------------===//
13
14#include "mlir/Target/LLVM/NVVM/Target.h"
15
16#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
17#include "mlir/Dialect/GPU/IR/GPUDialect.h"
18#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
19#include "mlir/IR/BuiltinAttributeInterfaces.h"
20#include "mlir/IR/BuiltinDialect.h"
21#include "mlir/IR/BuiltinTypes.h"
22#include "mlir/IR/DialectResourceBlobManager.h"
23#include "mlir/Target/LLVM/NVVM/Utils.h"
24#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
25#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
26#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
27#include "mlir/Target/LLVMIR/Export.h"
28
29#include "llvm/ADT/ScopeExit.h"
30#include "llvm/Config/Targets.h"
31#include "llvm/Support/FileSystem.h"
32#include "llvm/Support/FileUtilities.h"
33#include "llvm/Support/FormatVariadic.h"
34#include "llvm/Support/MemoryBuffer.h"
35#include "llvm/Support/Path.h"
36#include "llvm/Support/Process.h"
37#include "llvm/Support/Program.h"
38#include "llvm/Support/TargetSelect.h"
39#include "llvm/Support/Timer.h"
40#include "llvm/Support/raw_ostream.h"
41
42#include <cstdint>
43#include <cstdlib>
44#include <optional>
45
46using namespace mlir;
47using namespace mlir::NVVM;
48
49#ifndef __DEFAULT_CUDATOOLKIT_PATH__
50#define __DEFAULT_CUDATOOLKIT_PATH__ ""
51#endif
52
53extern "C" const unsigned char _mlir_embedded_libdevice[];
54extern "C" const unsigned _mlir_embedded_libdevice_size;
55
56namespace {
57// Implementation of the `TargetAttrInterface` model.
58class NVVMTargetAttrImpl
59 : public gpu::TargetAttrInterface::FallbackModel<NVVMTargetAttrImpl> {
60public:
61 std::optional<SmallVector<char, 0>>
62 serializeToObject(Attribute attribute, Operation *module,
63 const gpu::TargetOptions &options) const;
64
65 Attribute createObject(Attribute attribute, Operation *module,
66 const SmallVector<char, 0> &object,
67 const gpu::TargetOptions &options) const;
68};
69} // namespace
70
71// Register the NVVM dialect, the NVVM translation & the target interface.
72void mlir::NVVM::registerNVVMTargetInterfaceExternalModels(
73 DialectRegistry &registry) {
74 registry.addExtension(extensionFn: +[](MLIRContext *ctx, NVVM::NVVMDialect *dialect) {
75 NVVMTargetAttr::attachInterface<NVVMTargetAttrImpl>(*ctx);
76 });
77}
78
79void mlir::NVVM::registerNVVMTargetInterfaceExternalModels(
80 MLIRContext &context) {
81 DialectRegistry registry;
82 registerNVVMTargetInterfaceExternalModels(registry);
83 context.appendDialectRegistry(registry);
84}
85
86// Search for the CUDA toolkit path.
87StringRef mlir::NVVM::getCUDAToolkitPath() {
88 if (const char *var = std::getenv(name: "CUDA_ROOT"))
89 return var;
90 if (const char *var = std::getenv(name: "CUDA_HOME"))
91 return var;
92 if (const char *var = std::getenv(name: "CUDA_PATH"))
93 return var;
94 return __DEFAULT_CUDATOOLKIT_PATH__;
95}
96
97SerializeGPUModuleBase::SerializeGPUModuleBase(
98 Operation &module, NVVMTargetAttr target,
99 const gpu::TargetOptions &targetOptions)
100 : ModuleToObject(module, target.getTriple(), target.getChip(),
101 target.getFeatures(), target.getO(),
102 targetOptions.getInitialLlvmIRCallback(),
103 targetOptions.getLinkedLlvmIRCallback(),
104 targetOptions.getOptimizedLlvmIRCallback(),
105 targetOptions.getISACallback()),
106 target(target), toolkitPath(targetOptions.getToolkitPath()),
107 librariesToLink(targetOptions.getLibrariesToLink()) {
108
109 // If `targetOptions` have an empty toolkitPath use `getCUDAToolkitPath`
110 if (toolkitPath.empty())
111 toolkitPath = getCUDAToolkitPath();
112
113 // Append the files in the target attribute.
114 if (target.getLink())
115 librariesToLink.append(target.getLink().begin(), target.getLink().end());
116
117 // Append libdevice to the files to be loaded.
118 (void)appendStandardLibs();
119}
120
121void SerializeGPUModuleBase::init() {
122 static llvm::once_flag initializeBackendOnce;
123 llvm::call_once(flag&: initializeBackendOnce, F: []() {
124 // If the `NVPTX` LLVM target was built, initialize it.
125#if LLVM_HAS_NVPTX_TARGET
126 LLVMInitializeNVPTXTarget();
127 LLVMInitializeNVPTXTargetInfo();
128 LLVMInitializeNVPTXTargetMC();
129 LLVMInitializeNVPTXAsmPrinter();
130#endif
131 });
132}
133
134NVVMTargetAttr SerializeGPUModuleBase::getTarget() const { return target; }
135
136StringRef SerializeGPUModuleBase::getToolkitPath() const { return toolkitPath; }
137
138ArrayRef<Attribute> SerializeGPUModuleBase::getLibrariesToLink() const {
139 return librariesToLink;
140}
141
142// Try to append `libdevice` from a CUDA toolkit installation.
143LogicalResult SerializeGPUModuleBase::appendStandardLibs() {
144#if MLIR_NVVM_EMBED_LIBDEVICE
145 // If libdevice is embedded in the binary, we don't look it up on the
146 // filesystem.
147 MLIRContext *ctx = target.getContext();
148 auto type =
149 RankedTensorType::get(ArrayRef<int64_t>{_mlir_embedded_libdevice_size},
150 IntegerType::get(ctx, 8));
151 auto resourceManager = DenseResourceElementsHandle::getManagerInterface(ctx);
152
153 // Lookup if we already loaded the resource, otherwise create it.
154 DialectResourceBlobManager::BlobEntry *blob =
155 resourceManager.getBlobManager().lookup("_mlir_embedded_libdevice");
156 if (blob) {
157 librariesToLink.push_back(DenseResourceElementsAttr::get(
158 type, DenseResourceElementsHandle(
159 blob, ctx->getLoadedDialect<BuiltinDialect>())));
160 return success();
161 }
162
163 // Allocate a resource using one of the UnManagedResourceBlob method to wrap
164 // the embedded data.
165 auto unmanagedBlob = UnmanagedAsmResourceBlob::allocateInferAlign(
166 ArrayRef<char>{(const char *)_mlir_embedded_libdevice,
167 _mlir_embedded_libdevice_size});
168 librariesToLink.push_back(DenseResourceElementsAttr::get(
169 type, resourceManager.insert("_mlir_embedded_libdevice",
170 std::move(unmanagedBlob))));
171#else
172 StringRef pathRef = getToolkitPath();
173 if (!pathRef.empty()) {
174 SmallVector<char, 256> path;
175 path.insert(I: path.begin(), From: pathRef.begin(), To: pathRef.end());
176 pathRef = StringRef(path.data(), path.size());
177 if (!llvm::sys::fs::is_directory(Path: pathRef)) {
178 getOperation().emitError() << "CUDA path: " << pathRef
179 << " does not exist or is not a directory.\n";
180 return failure();
181 }
182 llvm::sys::path::append(path, a: "nvvm", b: "libdevice", c: "libdevice.10.bc");
183 pathRef = StringRef(path.data(), path.size());
184 if (!llvm::sys::fs::is_regular_file(Path: pathRef)) {
185 getOperation().emitError() << "LibDevice path: " << pathRef
186 << " does not exist or is not a file.\n";
187 return failure();
188 }
189 librariesToLink.push_back(StringAttr::get(target.getContext(), pathRef));
190 }
191#endif
192 return success();
193}
194
195std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
196SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
197 SmallVector<std::unique_ptr<llvm::Module>> bcFiles;
198 if (failed(loadBitcodeFilesFromList(module.getContext(), librariesToLink,
199 bcFiles, true)))
200 return std::nullopt;
201 return std::move(bcFiles);
202}
203
204namespace {
205class NVPTXSerializer : public SerializeGPUModuleBase {
206public:
207 NVPTXSerializer(Operation &module, NVVMTargetAttr target,
208 const gpu::TargetOptions &targetOptions);
209
210 /// Returns the GPU module op being serialized.
211 gpu::GPUModuleOp getOperation();
212
213 /// Compiles PTX to cubin using `ptxas`.
214 std::optional<SmallVector<char, 0>>
215 compileToBinary(const std::string &ptxCode);
216
217 /// Compiles PTX to cubin using the `nvptxcompiler` library.
218 std::optional<SmallVector<char, 0>>
219 compileToBinaryNVPTX(const std::string &ptxCode);
220
221 /// Serializes the LLVM module to an object format, depending on the
222 /// compilation target selected in target options.
223 std::optional<SmallVector<char, 0>>
224 moduleToObject(llvm::Module &llvmModule) override;
225
226 /// Get LLVMIR->ISA performance result.
227 /// Return nullopt if moduleToObject has not been called or the target format
228 /// is LLVMIR.
229 std::optional<int64_t> getLLVMIRToISATimeInMs();
230
231 /// Get ISA->Binary performance result.
232 /// Return nullopt if moduleToObject has not been called or the target format
233 /// is LLVMIR or ISA.
234 std::optional<int64_t> getISAToBinaryTimeInMs();
235
236private:
237 using TmpFile = std::pair<llvm::SmallString<128>, llvm::FileRemover>;
238
239 /// Creates a temp file.
240 std::optional<TmpFile> createTemp(StringRef name, StringRef suffix);
241
242 /// Finds the `tool` path, where `tool` is the name of the binary to search,
243 /// i.e. `ptxas` or `fatbinary`. The search order is:
244 /// 1. The toolkit path in `targetOptions`.
245 /// 2. In the system PATH.
246 /// 3. The path from `getCUDAToolkitPath()`.
247 std::optional<std::string> findTool(StringRef tool);
248
249 /// Target options.
250 gpu::TargetOptions targetOptions;
251
252 /// LLVMIR->ISA perf result.
253 std::optional<int64_t> llvmToISATimeInMs;
254
255 /// ISA->Binary perf result.
256 std::optional<int64_t> isaToBinaryTimeInMs;
257};
258} // namespace
259
260NVPTXSerializer::NVPTXSerializer(Operation &module, NVVMTargetAttr target,
261 const gpu::TargetOptions &targetOptions)
262 : SerializeGPUModuleBase(module, target, targetOptions),
263 targetOptions(targetOptions), llvmToISATimeInMs(std::nullopt),
264 isaToBinaryTimeInMs(std::nullopt) {}
265
266std::optional<NVPTXSerializer::TmpFile>
267NVPTXSerializer::createTemp(StringRef name, StringRef suffix) {
268 llvm::SmallString<128> filename;
269 std::error_code ec =
270 llvm::sys::fs::createTemporaryFile(Prefix: name, Suffix: suffix, ResultPath&: filename);
271 if (ec) {
272 getOperation().emitError() << "Couldn't create the temp file: `" << filename
273 << "`, error message: " << ec.message();
274 return std::nullopt;
275 }
276 return TmpFile(filename, llvm::FileRemover(filename.c_str()));
277}
278
279std::optional<int64_t> NVPTXSerializer::getLLVMIRToISATimeInMs() {
280 return llvmToISATimeInMs;
281}
282
283std::optional<int64_t> NVPTXSerializer::getISAToBinaryTimeInMs() {
284 return isaToBinaryTimeInMs;
285}
286
287gpu::GPUModuleOp NVPTXSerializer::getOperation() {
288 return dyn_cast<gpu::GPUModuleOp>(&SerializeGPUModuleBase::getOperation());
289}
290
291std::optional<std::string> NVPTXSerializer::findTool(StringRef tool) {
292 // Find the `tool` path.
293 // 1. Check the toolkit path given in the command line.
294 StringRef pathRef = targetOptions.getToolkitPath();
295 SmallVector<char, 256> path;
296 if (!pathRef.empty()) {
297 path.insert(I: path.begin(), From: pathRef.begin(), To: pathRef.end());
298 llvm::sys::path::append(path, a: "bin", b: tool);
299 if (llvm::sys::fs::can_execute(Path: path))
300 return StringRef(path.data(), path.size()).str();
301 }
302
303 // 2. Check PATH.
304 if (std::optional<std::string> toolPath =
305 llvm::sys::Process::FindInEnvPath(EnvName: "PATH", FileName: tool))
306 return *toolPath;
307
308 // 3. Check `getCUDAToolkitPath()`.
309 pathRef = getCUDAToolkitPath();
310 path.clear();
311 if (!pathRef.empty()) {
312 path.insert(I: path.begin(), From: pathRef.begin(), To: pathRef.end());
313 llvm::sys::path::append(path, a: "bin", b: tool);
314 if (llvm::sys::fs::can_execute(Path: path))
315 return StringRef(path.data(), path.size()).str();
316 }
317 getOperation().emitError()
318 << "Couldn't find the `" << tool
319 << "` binary. Please specify the toolkit "
320 "path, add the compiler to $PATH, or set one of the environment "
321 "variables in `NVVM::getCUDAToolkitPath()`.";
322 return std::nullopt;
323}
324
325/// Adds optional command-line arguments to existing arguments.
326template <typename T>
327static void setOptionalCommandlineArguments(NVVMTargetAttr target,
328 SmallVectorImpl<T> &ptxasArgs) {
329 if (!target.hasCmdOptions())
330 return;
331
332 std::optional<mlir::NamedAttribute> cmdOptions = target.getCmdOptions();
333 for (Attribute attr : cast<ArrayAttr>(cmdOptions->getValue())) {
334 if (auto strAttr = dyn_cast<StringAttr>(attr)) {
335 if constexpr (std::is_same_v<T, StringRef>) {
336 ptxasArgs.push_back(strAttr.getValue());
337 } else if constexpr (std::is_same_v<T, const char *>) {
338 ptxasArgs.push_back(strAttr.getValue().data());
339 }
340 }
341 }
342}
343
344// TODO: clean this method & have a generic tool driver or never emit binaries
345// with this mechanism and let another stage take care of it.
346std::optional<SmallVector<char, 0>>
347NVPTXSerializer::compileToBinary(const std::string &ptxCode) {
348 // Determine if the serializer should create a fatbinary with the PTX embeded
349 // or a simple CUBIN binary.
350 const bool createFatbin =
351 targetOptions.getCompilationTarget() == gpu::CompilationTarget::Fatbin;
352
353 // Find the `ptxas` & `fatbinary` tools.
354 std::optional<std::string> ptxasCompiler = findTool(tool: "ptxas");
355 if (!ptxasCompiler)
356 return std::nullopt;
357 std::optional<std::string> fatbinaryTool;
358 if (createFatbin) {
359 fatbinaryTool = findTool(tool: "fatbinary");
360 if (!fatbinaryTool)
361 return std::nullopt;
362 }
363 Location loc = getOperation().getLoc();
364
365 // Base name for all temp files: mlir-<module name>-<target triple>-<chip>.
366 std::string basename =
367 llvm::formatv("mlir-{0}-{1}-{2}", getOperation().getNameAttr().getValue(),
368 getTarget().getTriple(), getTarget().getChip());
369
370 // Create temp files:
371 std::optional<TmpFile> ptxFile = createTemp(name: basename, suffix: "ptx");
372 if (!ptxFile)
373 return std::nullopt;
374 std::optional<TmpFile> logFile = createTemp(name: basename, suffix: "log");
375 if (!logFile)
376 return std::nullopt;
377 std::optional<TmpFile> binaryFile = createTemp(name: basename, suffix: "bin");
378 if (!binaryFile)
379 return std::nullopt;
380 TmpFile cubinFile;
381 if (createFatbin) {
382 std::string cubinFilename = (ptxFile->first + ".cubin").str();
383 cubinFile = TmpFile(cubinFilename, llvm::FileRemover(cubinFilename));
384 } else {
385 cubinFile.first = binaryFile->first;
386 }
387
388 std::error_code ec;
389 // Dump the PTX to a temp file.
390 {
391 llvm::raw_fd_ostream ptxStream(ptxFile->first, ec);
392 if (ec) {
393 emitError(loc) << "Couldn't open the file: `" << ptxFile->first
394 << "`, error message: " << ec.message();
395 return std::nullopt;
396 }
397 ptxStream << ptxCode;
398 if (ptxStream.has_error()) {
399 emitError(loc) << "An error occurred while writing the PTX to: `"
400 << ptxFile->first << "`.";
401 return std::nullopt;
402 }
403 ptxStream.flush();
404 }
405
406 // Command redirects.
407 std::optional<StringRef> redirects[] = {
408 std::nullopt,
409 logFile->first,
410 logFile->first,
411 };
412
413 // Get any extra args passed in `targetOptions`.
414 std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> cmdOpts =
415 targetOptions.tokenizeCmdOptions();
416
417 // Create ptxas args.
418 std::string optLevel = std::to_string(val: this->optLevel);
419 SmallVector<StringRef, 12> ptxasArgs(
420 {StringRef("ptxas"), StringRef("-arch"), getTarget().getChip(),
421 StringRef(ptxFile->first), StringRef("-o"), StringRef(cubinFile.first),
422 "--opt-level", optLevel});
423
424 bool useFatbin32 = false;
425 for (const auto *cArg : cmdOpts.second) {
426 // All `cmdOpts` are for `ptxas` except `-32` which passes `-32` to
427 // `fatbinary`, indicating a 32-bit target. By default a 64-bit target is
428 // assumed.
429 if (StringRef arg(cArg); arg != "-32")
430 ptxasArgs.push_back(Elt: arg);
431 else
432 useFatbin32 = true;
433 }
434
435 // Set optional command line arguments
436 setOptionalCommandlineArguments(getTarget(), ptxasArgs);
437
438 // Create the `fatbinary` args.
439 StringRef chip = getTarget().getChip();
440 // Remove the arch prefix to obtain the compute capability.
441 chip.consume_front(Prefix: "sm_"), chip.consume_front(Prefix: "compute_");
442 // Embed the cubin object.
443 std::string cubinArg =
444 llvm::formatv(Fmt: "--image3=kind=elf,sm={0},file={1}", Vals&: chip, Vals&: cubinFile.first)
445 .str();
446 // Embed the PTX file so the driver can JIT if needed.
447 std::string ptxArg =
448 llvm::formatv(Fmt: "--image3=kind=ptx,sm={0},file={1}", Vals&: chip, Vals&: ptxFile->first)
449 .str();
450 SmallVector<StringRef, 6> fatbinArgs({StringRef("fatbinary"),
451 useFatbin32 ? "-32" : "-64", cubinArg,
452 ptxArg, "--create", binaryFile->first});
453
454 // Dump tool invocation commands.
455#define DEBUG_TYPE "serialize-to-binary"
456 LLVM_DEBUG({
457 llvm::dbgs() << "Tool invocation for module: "
458 << getOperation().getNameAttr() << "\n";
459 llvm::dbgs() << "ptxas executable:" << ptxasCompiler.value() << "\n";
460 llvm::interleave(ptxasArgs, llvm::dbgs(), " ");
461 llvm::dbgs() << "\n";
462 if (createFatbin) {
463 llvm::interleave(fatbinArgs, llvm::dbgs(), " ");
464 llvm::dbgs() << "\n";
465 }
466 });
467#undef DEBUG_TYPE
468
469 // Helper function for printing tool error logs.
470 std::string message;
471 auto emitLogError =
472 [&](StringRef toolName) -> std::optional<SmallVector<char, 0>> {
473 if (message.empty()) {
474 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> toolStderr =
475 llvm::MemoryBuffer::getFile(Filename: logFile->first);
476 if (toolStderr)
477 emitError(loc) << toolName << " invocation failed. Log:\n"
478 << toolStderr->get()->getBuffer();
479 else
480 emitError(loc) << toolName << " invocation failed.";
481 return std::nullopt;
482 }
483 emitError(loc) << toolName
484 << " invocation failed, error message: " << message;
485 return std::nullopt;
486 };
487
488 // Invoke PTXAS.
489 if (llvm::sys::ExecuteAndWait(Program: ptxasCompiler.value(), Args: ptxasArgs,
490 /*Env=*/std::nullopt,
491 /*Redirects=*/redirects,
492 /*SecondsToWait=*/0,
493 /*MemoryLimit=*/0,
494 /*ErrMsg=*/&message))
495 return emitLogError("`ptxas`");
496#define DEBUG_TYPE "dump-sass"
497 LLVM_DEBUG({
498 std::optional<std::string> nvdisasm = findTool("nvdisasm");
499 SmallVector<StringRef> nvdisasmArgs(
500 {StringRef("nvdisasm"), StringRef(cubinFile.first)});
501 if (llvm::sys::ExecuteAndWait(nvdisasm.value(), nvdisasmArgs,
502 /*Env=*/std::nullopt,
503 /*Redirects=*/redirects,
504 /*SecondsToWait=*/0,
505 /*MemoryLimit=*/0,
506 /*ErrMsg=*/&message))
507 return emitLogError("`nvdisasm`");
508 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> logBuffer =
509 llvm::MemoryBuffer::getFile(logFile->first);
510 if (logBuffer && !(*logBuffer)->getBuffer().empty()) {
511 llvm::dbgs() << "Output:\n" << (*logBuffer)->getBuffer() << "\n";
512 llvm::dbgs().flush();
513 }
514 });
515#undef DEBUG_TYPE
516
517 // Invoke `fatbin`.
518 message.clear();
519 if (createFatbin && llvm::sys::ExecuteAndWait(Program: *fatbinaryTool, Args: fatbinArgs,
520 /*Env=*/std::nullopt,
521 /*Redirects=*/redirects,
522 /*SecondsToWait=*/0,
523 /*MemoryLimit=*/0,
524 /*ErrMsg=*/&message))
525 return emitLogError("`fatbinary`");
526
527// Dump the output of the tools, helpful if the verbose flag was passed.
528#define DEBUG_TYPE "serialize-to-binary"
529 LLVM_DEBUG({
530 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> logBuffer =
531 llvm::MemoryBuffer::getFile(logFile->first);
532 if (logBuffer && !(*logBuffer)->getBuffer().empty()) {
533 llvm::dbgs() << "Output:\n" << (*logBuffer)->getBuffer() << "\n";
534 llvm::dbgs().flush();
535 }
536 });
537#undef DEBUG_TYPE
538
539 // Read the fatbin.
540 llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> binaryBuffer =
541 llvm::MemoryBuffer::getFile(Filename: binaryFile->first);
542 if (!binaryBuffer) {
543 emitError(loc) << "Couldn't open the file: `" << binaryFile->first
544 << "`, error message: " << binaryBuffer.getError().message();
545 return std::nullopt;
546 }
547 StringRef fatbin = (*binaryBuffer)->getBuffer();
548 return SmallVector<char, 0>(fatbin.begin(), fatbin.end());
549}
550
#if MLIR_ENABLE_NVPTXCOMPILER
#include "nvPTXCompiler.h"

// Evaluates `expr` (an nvPTXCompiler call); on a non-zero status emits an error
// at `loc` and returns `std::nullopt` from the enclosing function.
#define RETURN_ON_NVPTXCOMPILER_ERROR(expr)                                    \
  do {                                                                         \
    if (auto status = (expr)) {                                                \
      emitError(loc) << llvm::Twine(#expr).concat(" failed with error code ")  \
                     << status;                                                \
      return std::nullopt;                                                     \
    }                                                                          \
  } while (false)

#include "nvFatbin.h"

// Evaluates `expr` (an nvFatbin call); on anything but `NVFATBIN_SUCCESS` emits
// an error at `loc` and returns `std::nullopt` from the enclosing function.
#define RETURN_ON_NVFATBIN_ERROR(expr)                                         \
  do {                                                                         \
    auto result = (expr);                                                      \
    if (result != nvFatbinResult::NVFATBIN_SUCCESS) {                          \
      emitError(loc) << llvm::Twine(#expr).concat(" failed with error: ")      \
                     << nvFatbinGetErrorString(result);                        \
      return std::nullopt;                                                     \
    }                                                                          \
  } while (false)

/// Compiles `ptxCode` in-process with the `nvptxcompiler` library.
///
/// Returns the compiled program, or, when the compilation target is `Fatbin`,
/// a fatbin built with the nvFatbin library that contains both the compiled
/// program and the PTX. On any library failure an error is emitted and
/// `std::nullopt` is returned. The compiler and fatbin handles are destroyed
/// on every exit path, including the early error returns.
std::optional<SmallVector<char, 0>>
NVPTXSerializer::compileToBinaryNVPTX(const std::string &ptxCode) {
  Location loc = getOperation().getLoc();
  nvPTXCompilerHandle compiler = nullptr;
  nvPTXCompileResult status;
  size_t logSize;

  // Create the options.
  std::string optLevel = std::to_string(this->optLevel);
  std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> cmdOpts =
      targetOptions.tokenizeCmdOptions();
  cmdOpts.second.append(
      {"-arch", getTarget().getChip().data(), "--opt-level", optLevel.c_str()});

  // Set optional command line arguments
  setOptionalCommandlineArguments(getTarget(), cmdOpts.second);
  // Create the compiler handle.
  RETURN_ON_NVPTXCOMPILER_ERROR(
      nvPTXCompilerCreate(&compiler, ptxCode.size(), ptxCode.c_str()));
  // From here on the handle must be released on every exit path. The guard is
  // disarmed right before the explicit, error-checked destroy below.
  auto destroyCompiler = llvm::make_scope_exit(
      [&compiler]() { (void)nvPTXCompilerDestroy(&compiler); });

  // Try to compile the binary.
  status = nvPTXCompilerCompile(compiler, cmdOpts.second.size(),
                                cmdOpts.second.data());

  // Check if compilation failed.
  if (status != NVPTXCOMPILE_SUCCESS) {
    RETURN_ON_NVPTXCOMPILER_ERROR(
        nvPTXCompilerGetErrorLogSize(compiler, &logSize));
    if (logSize != 0) {
      // One extra zeroed byte keeps the log null-terminated for printing.
      SmallVector<char> log(logSize + 1, 0);
      RETURN_ON_NVPTXCOMPILER_ERROR(
          nvPTXCompilerGetErrorLog(compiler, log.data()));
      emitError(loc) << "NVPTX compiler invocation failed, error log: "
                     << log.data();
    } else {
      emitError(loc) << "NVPTX compiler invocation failed with error code: "
                     << status;
    }
    return std::nullopt;
  }

  // Retrieve the binary.
  size_t elfSize;
  RETURN_ON_NVPTXCOMPILER_ERROR(
      nvPTXCompilerGetCompiledProgramSize(compiler, &elfSize));
  SmallVector<char, 0> binary(elfSize, 0);
  RETURN_ON_NVPTXCOMPILER_ERROR(nvPTXCompilerGetCompiledProgram(
      compiler, static_cast<void *>(binary.data())));

// Dump the log of the compiler, helpful if the verbose flag was passed.
#define DEBUG_TYPE "serialize-to-binary"
  LLVM_DEBUG({
    RETURN_ON_NVPTXCOMPILER_ERROR(
        nvPTXCompilerGetInfoLogSize(compiler, &logSize));
    if (logSize != 0) {
      SmallVector<char> log(logSize + 1, 0);
      RETURN_ON_NVPTXCOMPILER_ERROR(
          nvPTXCompilerGetInfoLog(compiler, log.data()));
      llvm::dbgs() << "NVPTX compiler invocation for module: "
                   << getOperation().getNameAttr() << "\n";
      llvm::dbgs() << "Arguments: ";
      llvm::interleave(cmdOpts.second, llvm::dbgs(), " ");
      llvm::dbgs() << "\nOutput\n" << log.data() << "\n";
      llvm::dbgs().flush();
    }
  });
#undef DEBUG_TYPE
  // Disarm the guard so the handle is destroyed exactly once, with its status
  // checked.
  destroyCompiler.release();
  RETURN_ON_NVPTXCOMPILER_ERROR(nvPTXCompilerDestroy(&compiler));

  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Fatbin) {
    bool useFatbin32 = llvm::any_of(cmdOpts.second, [](const char *option) {
      return llvm::StringRef(option) == "-32";
    });

    const char *cubinOpts[1] = {useFatbin32 ? "-32" : "-64"};
    nvFatbinHandle handle;

    auto chip = getTarget().getChip();
    chip.consume_front("sm_");

    RETURN_ON_NVFATBIN_ERROR(nvFatbinCreate(&handle, cubinOpts, 1));
    // Same pattern as for the compiler handle: release on early returns.
    auto destroyFatbin = llvm::make_scope_exit(
        [&handle]() { (void)nvFatbinDestroy(&handle); });
    RETURN_ON_NVFATBIN_ERROR(nvFatbinAddCubin(
        handle, binary.data(), binary.size(), chip.data(), nullptr));
    RETURN_ON_NVFATBIN_ERROR(nvFatbinAddPTX(
        handle, ptxCode.data(), ptxCode.size(), chip.data(), nullptr, nullptr));

    size_t fatbinSize;
    RETURN_ON_NVFATBIN_ERROR(nvFatbinSize(handle, &fatbinSize));
    SmallVector<char, 0> fatbin(fatbinSize, 0);
    RETURN_ON_NVFATBIN_ERROR(
        nvFatbinGet(handle, static_cast<void *>(fatbin.data())));
    destroyFatbin.release();
    RETURN_ON_NVFATBIN_ERROR(nvFatbinDestroy(&handle));
    return fatbin;
  }

  return binary;
}
#endif // MLIR_ENABLE_NVPTXCOMPILER
672
673std::optional<SmallVector<char, 0>>
674NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) {
675 llvm::Timer moduleToObjectTimer(
676 "moduleToObjectTimer",
677 "Timer for perf llvm-ir -> isa and isa -> binary.");
678 auto clear = llvm::make_scope_exit(F: [&]() { moduleToObjectTimer.clear(); });
679 // Return LLVM IR if the compilation target is `offload`.
680#define DEBUG_TYPE "serialize-to-llvm"
681 LLVM_DEBUG({
682 llvm::dbgs() << "LLVM IR for module: " << getOperation().getNameAttr()
683 << "\n";
684 llvm::dbgs() << llvmModule << "\n";
685 llvm::dbgs().flush();
686 });
687#undef DEBUG_TYPE
688 if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
689 return SerializeGPUModuleBase::moduleToObject(llvmModule);
690
691#if !LLVM_HAS_NVPTX_TARGET
692 getOperation()->emitError(
693 "The `NVPTX` target was not built. Please enable it when building LLVM.");
694 return std::nullopt;
695#endif // LLVM_HAS_NVPTX_TARGET
696
697 // Emit PTX code.
698 std::optional<llvm::TargetMachine *> targetMachine =
699 getOrCreateTargetMachine();
700 if (!targetMachine) {
701 getOperation().emitError() << "Target Machine unavailable for triple "
702 << triple << ", can't optimize with LLVM\n";
703 return std::nullopt;
704 }
705 moduleToObjectTimer.startTimer();
706 std::optional<std::string> serializedISA =
707 translateToISA(llvmModule, targetMachine&: **targetMachine);
708 moduleToObjectTimer.stopTimer();
709 llvmToISATimeInMs = moduleToObjectTimer.getTotalTime().getWallTime() * 1000;
710 moduleToObjectTimer.clear();
711 if (!serializedISA) {
712 getOperation().emitError() << "Failed translating the module to ISA.";
713 return std::nullopt;
714 }
715
716 if (isaCallback)
717 isaCallback(serializedISA.value());
718
719#define DEBUG_TYPE "serialize-to-isa"
720 LLVM_DEBUG({
721 llvm::dbgs() << "PTX for module: " << getOperation().getNameAttr() << "\n";
722 llvm::dbgs() << *serializedISA << "\n";
723 llvm::dbgs().flush();
724 });
725#undef DEBUG_TYPE
726
727 // Return PTX if the compilation target is `assembly`.
728 if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Assembly)
729 return SmallVector<char, 0>(serializedISA->begin(), serializedISA->end());
730
731 std::optional<SmallVector<char, 0>> result;
732 moduleToObjectTimer.startTimer();
733 // Compile to binary.
734#if MLIR_ENABLE_NVPTXCOMPILER
735 result = compileToBinaryNVPTX(*serializedISA);
736#else
737 result = compileToBinary(ptxCode: *serializedISA);
738#endif // MLIR_ENABLE_NVPTXCOMPILER
739
740 moduleToObjectTimer.stopTimer();
741 isaToBinaryTimeInMs = moduleToObjectTimer.getTotalTime().getWallTime() * 1000;
742 moduleToObjectTimer.clear();
743 return result;
744}
745
746std::optional<SmallVector<char, 0>>
747NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
748 const gpu::TargetOptions &options) const {
749 Builder builder(attribute.getContext());
750 assert(module && "The module must be non null.");
751 if (!module)
752 return std::nullopt;
753 if (!mlir::isa<gpu::GPUModuleOp>(module)) {
754 module->emitError(message: "Module must be a GPU module.");
755 return std::nullopt;
756 }
757 NVPTXSerializer serializer(*module, cast<NVVMTargetAttr>(attribute), options);
758 serializer.init();
759 std::optional<SmallVector<char, 0>> result = serializer.run();
760 auto llvmToISATimeInMs = serializer.getLLVMIRToISATimeInMs();
761 if (llvmToISATimeInMs.has_value())
762 module->setAttr("LLVMIRToISATimeInMs",
763 builder.getI64IntegerAttr(*llvmToISATimeInMs));
764 auto isaToBinaryTimeInMs = serializer.getISAToBinaryTimeInMs();
765 if (isaToBinaryTimeInMs.has_value())
766 module->setAttr("ISAToBinaryTimeInMs",
767 builder.getI64IntegerAttr(*isaToBinaryTimeInMs));
768 return result;
769}
770
771Attribute
772NVVMTargetAttrImpl::createObject(Attribute attribute, Operation *module,
773 const SmallVector<char, 0> &object,
774 const gpu::TargetOptions &options) const {
775 auto target = cast<NVVMTargetAttr>(attribute);
776 gpu::CompilationTarget format = options.getCompilationTarget();
777 DictionaryAttr objectProps;
778 Builder builder(attribute.getContext());
779 SmallVector<NamedAttribute, 4> properties;
780 if (format == gpu::CompilationTarget::Assembly)
781 properties.push_back(
782 Elt: builder.getNamedAttr(name: "O", val: builder.getI32IntegerAttr(value: target.getO())));
783
784 if (StringRef section = options.getELFSection(); !section.empty())
785 properties.push_back(Elt: builder.getNamedAttr(gpu::elfSectionName,
786 builder.getStringAttr(section)));
787
788 for (const auto *perfName : {"LLVMIRToISATimeInMs", "ISAToBinaryTimeInMs"}) {
789 if (module->hasAttr(name: perfName)) {
790 IntegerAttr attr = llvm::dyn_cast<IntegerAttr>(module->getAttr(perfName));
791 properties.push_back(Elt: builder.getNamedAttr(
792 name: perfName, val: builder.getI64IntegerAttr(value: attr.getInt())));
793 }
794 }
795
796 if (!properties.empty())
797 objectProps = builder.getDictionaryAttr(properties);
798
799 return builder.getAttr<gpu::ObjectAttr>(
800 attribute, format,
801 builder.getStringAttr(StringRef(object.data(), object.size())),
802 objectProps, /*kernels=*/nullptr);
803}
804

source code of mlir/lib/Target/LLVM/NVVM/Target.cpp