//===-- CUFGPUToLLVMConversion.cpp ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Support/Fortran.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"

namespace fir {
#define GEN_PASS_DEF_CUFGPUTOLLVMCONVERSION
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

using namespace fir;
using namespace mlir;
using namespace Fortran::runtime;

namespace {

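/// Pack the kernel launch operands into a stack-allocated struct and build
/// the array of pointers to its members that the CUF runtime launch
/// functions expect as the kernel argument list.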
static mlir::Value createKernelArgArray(mlir::Location loc,
                                        mlir::ValueRange operands,
                                        mlir::PatternRewriter &rewriter) {
  auto *ctx = rewriter.getContext();
  llvm::SmallVector<mlir::Type> structTypes(operands.size(), nullptr);

  for (auto [i, arg] : llvm::enumerate(operands))
    structTypes[i] = arg.getType();

  auto structTy = mlir::LLVM::LLVMStructType::getLiteral(ctx, structTypes);
  auto ptrTy = mlir::LLVM::LLVMPointerType::get(rewriter.getContext());
  mlir::Type i32Ty = rewriter.getI32Type();
  auto zero = rewriter.create<mlir::LLVM::ConstantOp>(
      loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 0));
  auto one = rewriter.create<mlir::LLVM::ConstantOp>(
      loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 1));
  mlir::Value argStruct =
      rewriter.create<mlir::LLVM::AllocaOp>(loc, ptrTy, structTy, one);
  auto size = rewriter.create<mlir::LLVM::ConstantOp>(
      loc, i32Ty, rewriter.getIntegerAttr(i32Ty, structTypes.size()));
  mlir::Value argArray =
      rewriter.create<mlir::LLVM::AllocaOp>(loc, ptrTy, ptrTy, size);

  for (auto [i, arg] : llvm::enumerate(operands)) {
    auto index = rewriter.create<mlir::LLVM::ConstantOp>(
        loc, i32Ty, rewriter.getIntegerAttr(i32Ty, i));
    mlir::Value structMember = rewriter.create<LLVM::GEPOp>(
        loc, ptrTy, structTy, argStruct,
        mlir::ArrayRef<mlir::Value>({zero, index}));
    rewriter.create<LLVM::StoreOp>(loc, arg, structMember);
    mlir::Value arrayMember = rewriter.create<LLVM::GEPOp>(
        loc, ptrTy, ptrTy, argArray, mlir::ArrayRef<mlir::Value>({index}));
    rewriter.create<LLVM::StoreOp>(loc, structMember, arrayMember);
  }
  return argArray;
}

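/// Convert gpu.launch_func operations carrying a CUDA Fortran proc attribute
/// into calls to the CUF runtime launch entry points: CUFLaunchClusterKernel
/// when a cluster size is present, CUFLaunchCooperativeKernel for grid_global
/// kernels, and CUFLaunchKernel otherwise.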
struct GPULaunchKernelConversion
    : public mlir::ConvertOpToLLVMPattern<mlir::gpu::LaunchFuncOp> {
  explicit GPULaunchKernelConversion(
      const fir::LLVMTypeConverter &typeConverter, mlir::PatternBenefit benefit)
      : mlir::ConvertOpToLLVMPattern<mlir::gpu::LaunchFuncOp>(typeConverter,
                                                              benefit) {}

  using OpAdaptor = typename mlir::gpu::LaunchFuncOp::Adaptor;

  mlir::LogicalResult
  matchAndRewrite(mlir::gpu::LaunchFuncOp op, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    // Only convert gpu.launch_func for CUDA Fortran.
    if (!op.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
            cuf::getProcAttrName()))
      return mlir::failure();

    mlir::Location loc = op.getLoc();
    auto *ctx = rewriter.getContext();
    mlir::ModuleOp mod = op->getParentOfType<mlir::ModuleOp>();
    mlir::Value dynamicMemorySize = op.getDynamicSharedMemorySize();
    mlir::Type i32Ty = rewriter.getI32Type();
    if (!dynamicMemorySize)
      dynamicMemorySize = rewriter.create<mlir::LLVM::ConstantOp>(
          loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 0));

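    // Pack the (already converted) kernel operands into the argument array
    // passed to the runtime launch function.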
    mlir::Value kernelArgs =
        createKernelArgArray(loc, adaptor.getKernelOperands(), rewriter);

    auto ptrTy = mlir::LLVM::LLVMPointerType::get(rewriter.getContext());
    auto kernel = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(op.getKernelName());
    mlir::Value kernelPtr;
    if (!kernel) {
      auto funcOp = mod.lookupSymbol<mlir::func::FuncOp>(op.getKernelName());
      if (!funcOp)
        return mlir::failure();
      kernelPtr =
          rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, funcOp.getName());
    } else {
      kernelPtr =
          rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, kernel.getName());
    }

    auto llvmIntPtrType = mlir::IntegerType::get(
        ctx, this->getTypeConverter()->getPointerBitwidth(0));
    auto voidTy = mlir::LLVM::LLVMVoidType::get(ctx);

    mlir::Value nullPtr = rewriter.create<LLVM::ZeroOp>(loc, ptrTy);

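    // A launch with cluster dimensions goes through CUFLaunchClusterKernel,
    // which takes the three cluster sizes ahead of the grid and block sizes.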
    if (op.hasClusterSize()) {
      auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
          RTNAME_STRING(CUFLaunchClusterKernel));
      auto funcTy = mlir::LLVM::LLVMFunctionType::get(
          voidTy,
          {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
           llvmIntPtrType, llvmIntPtrType, ptrTy, i32Ty, ptrTy, ptrTy},
          /*isVarArg=*/false);
      auto cufLaunchClusterKernel = mlir::SymbolRefAttr::get(
          mod.getContext(), RTNAME_STRING(CUFLaunchClusterKernel));
      if (!funcOp) {
        mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
        rewriter.setInsertionPointToStart(mod.getBody());
        auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
            loc, RTNAME_STRING(CUFLaunchClusterKernel), funcTy);
        launchKernelFuncOp.setVisibility(
            mlir::SymbolTable::Visibility::Private);
      }

      mlir::Value stream = nullPtr;
      if (!adaptor.getAsyncDependencies().empty()) {
        if (adaptor.getAsyncDependencies().size() != 1)
          return rewriter.notifyMatchFailure(
              op, "can only convert with exactly one stream dependency");
        stream = adaptor.getAsyncDependencies().front();
      }

      rewriter.create<mlir::LLVM::CallOp>(
          loc, funcTy, cufLaunchClusterKernel,
          mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(),
                           adaptor.getClusterSizeY(), adaptor.getClusterSizeZ(),
                           adaptor.getGridSizeX(), adaptor.getGridSizeY(),
                           adaptor.getGridSizeZ(), adaptor.getBlockSizeX(),
                           adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
                           stream, dynamicMemorySize, kernelArgs, nullPtr});
      rewriter.eraseOp(op);
    } else {
      auto procAttr =
          op->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
      bool isGridGlobal =
          procAttr && procAttr.getValue() == cuf::ProcAttribute::GridGlobal;
      llvm::StringRef fctName = isGridGlobal
                                    ? RTNAME_STRING(CUFLaunchCooperativeKernel)
                                    : RTNAME_STRING(CUFLaunchKernel);
      auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(fctName);
      auto funcTy = mlir::LLVM::LLVMFunctionType::get(
          voidTy,
          {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, ptrTy, i32Ty, ptrTy,
           ptrTy},
          /*isVarArg=*/false);
      auto cufLaunchKernel =
          mlir::SymbolRefAttr::get(mod.getContext(), fctName);
      if (!funcOp) {
        mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
        rewriter.setInsertionPointToStart(mod.getBody());
        auto launchKernelFuncOp =
            rewriter.create<mlir::LLVM::LLVMFuncOp>(loc, fctName, funcTy);
        launchKernelFuncOp.setVisibility(
            mlir::SymbolTable::Visibility::Private);
      }

      mlir::Value stream = nullPtr;
      if (!adaptor.getAsyncDependencies().empty()) {
        if (adaptor.getAsyncDependencies().size() != 1)
          return rewriter.notifyMatchFailure(
              op, "can only convert with exactly one stream dependency");
        stream = adaptor.getAsyncDependencies().front();
      }

      rewriter.create<mlir::LLVM::CallOp>(
          loc, funcTy, cufLaunchKernel,
          mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
                           adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
                           adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
                           adaptor.getBlockSizeZ(), stream, dynamicMemorySize,
                           kernelArgs, nullPtr});
      rewriter.eraseOp(op);
    }

    return mlir::success();
  }
};

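/// Return the name of the function that encloses \p op; it is used to
/// reconstruct the name of the per-function shared memory global.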
static std::string getFuncName(cuf::SharedMemoryOp op) {
  if (auto gpuFuncOp = op->getParentOfType<mlir::gpu::GPUFuncOp>())
    return gpuFuncOp.getName().str();
  if (auto funcOp = op->getParentOfType<mlir::func::FuncOp>())
    return funcOp.getName().str();
  if (auto llvmFuncOp = op->getParentOfType<mlir::LLVM::LLVMFuncOp>())
    return llvmFuncOp.getSymName().str();
  return "";
}

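/// Take the address of the shared memory global, in the NVVM shared memory
/// address space, whether it is still a fir.global or has already been
/// lowered to an llvm.mlir.global.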
static mlir::Value createAddressOfOp(mlir::ConversionPatternRewriter &rewriter,
                                     mlir::Location loc,
                                     gpu::GPUModuleOp gpuMod,
                                     llvm::StringRef sharedGlobalName) {
  auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(
      rewriter.getContext(), mlir::NVVM::NVVMMemorySpace::kSharedMemorySpace);
  if (auto g = gpuMod.lookupSymbol<fir::GlobalOp>(sharedGlobalName))
    return rewriter.create<mlir::LLVM::AddressOfOp>(loc, llvmPtrTy,
                                                    g.getSymName());
  if (auto g = gpuMod.lookupSymbol<mlir::LLVM::GlobalOp>(sharedGlobalName))
    return rewriter.create<mlir::LLVM::AddressOfOp>(loc, llvmPtrTy,
                                                    g.getSymName());
  return {};
}

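/// Lower cuf.shared_memory to an address space cast of the module's shared
/// memory global followed by a byte-offset GEP to the position computed for
/// this variable.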
struct CUFSharedMemoryOpConversion
    : public mlir::ConvertOpToLLVMPattern<cuf::SharedMemoryOp> {
  explicit CUFSharedMemoryOpConversion(
      const fir::LLVMTypeConverter &typeConverter, mlir::PatternBenefit benefit)
      : mlir::ConvertOpToLLVMPattern<cuf::SharedMemoryOp>(typeConverter,
                                                          benefit) {}
  using OpAdaptor = typename cuf::SharedMemoryOp::Adaptor;

  mlir::LogicalResult
  matchAndRewrite(cuf::SharedMemoryOp op, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    mlir::Location loc = op->getLoc();
    if (!op.getOffset())
      return mlir::emitError(
          loc, "cuf.shared_memory must have an offset for code gen");

    auto gpuMod = op->getParentOfType<gpu::GPUModuleOp>();
    std::string sharedGlobalName =
        (getFuncName(op) + llvm::Twine(cudaSharedMemSuffix)).str();
    mlir::Value sharedGlobalAddr =
        createAddressOfOp(rewriter, loc, gpuMod, sharedGlobalName);

    if (!sharedGlobalAddr)
      return mlir::emitError(loc,
                             "could not find the shared memory global operation");

    auto castPtr = rewriter.create<mlir::LLVM::AddrSpaceCastOp>(
        loc, mlir::LLVM::LLVMPointerType::get(rewriter.getContext()),
        sharedGlobalAddr);
    mlir::Type baseType = castPtr->getResultTypes().front();
    llvm::SmallVector<mlir::LLVM::GEPArg> gepArgs = {op.getOffset()};
    mlir::Value shmemPtr = rewriter.create<mlir::LLVM::GEPOp>(
        loc, baseType, rewriter.getI8Type(), castPtr, gepArgs);
    rewriter.replaceOp(op, {shmemPtr});
    return mlir::success();
  }
};

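/// cuf.stream_cast becomes a no-op once gpu.async.token values have been
/// converted to !llvm.ptr: simply forward the converted stream value.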
struct CUFStreamCastConversion
    : public mlir::ConvertOpToLLVMPattern<cuf::StreamCastOp> {
  explicit CUFStreamCastConversion(const fir::LLVMTypeConverter &typeConverter,
                                   mlir::PatternBenefit benefit)
      : mlir::ConvertOpToLLVMPattern<cuf::StreamCastOp>(typeConverter,
                                                        benefit) {}
  using OpAdaptor = typename cuf::StreamCastOp::Adaptor;

  mlir::LogicalResult
  matchAndRewrite(cuf::StreamCastOp op, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    rewriter.replaceOp(op, adaptor.getStream());
    return mlir::success();
  }
};

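/// Pass converting the remaining CUF GPU-side operations and CUF-attributed
/// gpu.launch_func ops to the LLVM dialect.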
class CUFGPUToLLVMConversion
    : public fir::impl::CUFGPUToLLVMConversionBase<CUFGPUToLLVMConversion> {
public:
  void runOnOperation() override {
    auto *ctx = &getContext();
    mlir::RewritePatternSet patterns(ctx);
    mlir::ConversionTarget target(*ctx);

    mlir::Operation *op = getOperation();
    mlir::ModuleOp module = mlir::dyn_cast<mlir::ModuleOp>(op);
    if (!module)
      return signalPassFailure();

    std::optional<mlir::DataLayout> dl = fir::support::getOrSetMLIRDataLayout(
        module, /*allowDefaultLayout=*/false);
    if (!dl) {
      mlir::emitError(module.getLoc(),
                      "module operation must carry a data layout attribute");
      return signalPassFailure();
    }
    fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
                                         /*forceUnifiedTBAATree=*/false, *dl);
    cuf::populateCUFGPUToLLVMConversionPatterns(typeConverter, patterns);

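    // gpu.launch_func is only illegal (and thus rewritten) when it carries a
    // CUDA Fortran proc attribute; other launches are left to the generic
    // GPU-to-LLVM lowering.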
    target.addDynamicallyLegalOp<mlir::gpu::LaunchFuncOp>(
        [&](mlir::gpu::LaunchFuncOp op) {
          return !op.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
              cuf::getProcAttrName());
        });

    target.addIllegalOp<cuf::SharedMemoryOp>();
    target.addLegalDialect<mlir::LLVM::LLVMDialect>();
    if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
                                                  std::move(patterns)))) {
      mlir::emitError(mlir::UnknownLoc::get(ctx),
                      "error in CUF GPU op conversion");
      signalPassFailure();
    }
  }
};
} // namespace

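/// Register the CUF GPU-to-LLVM conversion patterns and the type conversion
/// that maps gpu.async.token values to plain !llvm.ptr stream handles.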
void cuf::populateCUFGPUToLLVMConversionPatterns(
    fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
    mlir::PatternBenefit benefit) {
  converter.addConversion([&converter](mlir::gpu::AsyncTokenType) -> Type {
    return mlir::LLVM::LLVMPointerType::get(&converter.getContext());
  });
  patterns.add<CUFSharedMemoryOpConversion, GPULaunchKernelConversion,
               CUFStreamCastConversion>(converter, benefit);
}