//===-- CUFGPUToLLVMConversion.cpp ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Support/Fortran.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"

namespace fir {
#define GEN_PASS_DEF_CUFGPUTOLLVMCONVERSION
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

using namespace fir;
using namespace mlir;
using namespace Fortran::runtime;

namespace {

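/// Pack the kernel launch operands into a stack-allocated argument array, in
/// the form expected by the CUF runtime launch entry points: the operands are
/// first stored into a literal struct, then the address of each struct member
/// is collected into an array of `!llvm.ptr` that is returned to the caller.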
static mlir::Value createKernelArgArray(mlir::Location loc,
                                        mlir::ValueRange operands,
                                        mlir::PatternRewriter &rewriter) {
  auto *ctx = rewriter.getContext();
  llvm::SmallVector<mlir::Type> structTypes(operands.size(), nullptr);

  for (auto [i, arg] : llvm::enumerate(operands))
    structTypes[i] = arg.getType();

  auto structTy = mlir::LLVM::LLVMStructType::getLiteral(ctx, structTypes);
  auto ptrTy = mlir::LLVM::LLVMPointerType::get(ctx);
  mlir::Type i32Ty = rewriter.getI32Type();
  auto zero = rewriter.create<mlir::LLVM::ConstantOp>(
      loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 0));
  auto one = rewriter.create<mlir::LLVM::ConstantOp>(
      loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 1));
  mlir::Value argStruct =
      rewriter.create<mlir::LLVM::AllocaOp>(loc, ptrTy, structTy, one);
  auto size = rewriter.create<mlir::LLVM::ConstantOp>(
      loc, i32Ty, rewriter.getIntegerAttr(i32Ty, structTypes.size()));
  mlir::Value argArray =
      rewriter.create<mlir::LLVM::AllocaOp>(loc, ptrTy, ptrTy, size);

  for (auto [i, arg] : llvm::enumerate(operands)) {
    auto index = rewriter.create<mlir::LLVM::ConstantOp>(
        loc, i32Ty, rewriter.getIntegerAttr(i32Ty, i));
    mlir::Value structMember = rewriter.create<LLVM::GEPOp>(
        loc, ptrTy, structTy, argStruct,
        mlir::ArrayRef<mlir::Value>({zero, index}));
    rewriter.create<LLVM::StoreOp>(loc, arg, structMember);
    mlir::Value arrayMember = rewriter.create<LLVM::GEPOp>(
        loc, ptrTy, ptrTy, argArray, mlir::ArrayRef<mlir::Value>({index}));
    rewriter.create<LLVM::StoreOp>(loc, structMember, arrayMember);
  }
  return argArray;
}

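/// Rewrite `gpu.launch_func` operations that carry a CUDA Fortran proc
/// attribute into calls to the CUF runtime (`CUFLaunchKernel`,
/// `CUFLaunchCooperativeKernel` for grid_global kernels, or
/// `CUFLaunchClusterKernel` when a cluster size is present), forwarding the
/// grid and block dimensions, the stream, the dynamic shared-memory size, and
/// the packed argument array. As a sketch, a plain launch becomes roughly:
///   llvm.call @_FortranACUFLaunchKernel(%kernel, %gx, %gy, %gz, %bx, %by,
///       %bz, %stream, %dynSharedMem, %args, %null)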
struct GPULaunchKernelConversion
    : public mlir::ConvertOpToLLVMPattern<mlir::gpu::LaunchFuncOp> {
  explicit GPULaunchKernelConversion(
      const fir::LLVMTypeConverter &typeConverter, mlir::PatternBenefit benefit)
      : mlir::ConvertOpToLLVMPattern<mlir::gpu::LaunchFuncOp>(typeConverter,
                                                              benefit) {}

  using OpAdaptor = typename mlir::gpu::LaunchFuncOp::Adaptor;

  mlir::LogicalResult
  matchAndRewrite(mlir::gpu::LaunchFuncOp op, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    // Only convert gpu.launch_func for CUDA Fortran.
    if (!op.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
            cuf::getProcAttrName()))
      return mlir::failure();

    mlir::Location loc = op.getLoc();
    auto *ctx = rewriter.getContext();
    mlir::ModuleOp mod = op->getParentOfType<mlir::ModuleOp>();
    mlir::Value dynamicMemorySize = op.getDynamicSharedMemorySize();
    mlir::Type i32Ty = rewriter.getI32Type();
    if (!dynamicMemorySize)
      dynamicMemorySize = rewriter.create<mlir::LLVM::ConstantOp>(
          loc, i32Ty, rewriter.getIntegerAttr(i32Ty, 0));

    mlir::Value kernelArgs =
        createKernelArgArray(loc, adaptor.getKernelOperands(), rewriter);

    auto ptrTy = mlir::LLVM::LLVMPointerType::get(ctx);
    auto kernel = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(op.getKernelName());
    mlir::Value kernelPtr;
    if (!kernel) {
      auto funcOp = mod.lookupSymbol<mlir::func::FuncOp>(op.getKernelName());
      if (!funcOp)
        return mlir::failure();
      kernelPtr =
          rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, funcOp.getName());
    } else {
      kernelPtr =
          rewriter.create<LLVM::AddressOfOp>(loc, ptrTy, kernel.getName());
    }

    auto llvmIntPtrType = mlir::IntegerType::get(
        ctx, this->getTypeConverter()->getPointerBitwidth(0));
    auto voidTy = mlir::LLVM::LLVMVoidType::get(ctx);

    mlir::Value nullPtr = rewriter.create<LLVM::ZeroOp>(loc, ptrTy);

    // The runtime entry points take at most one stream, so at most one async
    // dependency can be converted here.
    mlir::Value stream = nullPtr;
    if (!adaptor.getAsyncDependencies().empty()) {
      if (adaptor.getAsyncDependencies().size() != 1)
        return rewriter.notifyMatchFailure(
            op, "can only convert with exactly one stream dependency");
      stream = adaptor.getAsyncDependencies().front();
    }

    if (op.hasClusterSize()) {
      auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(
          RTNAME_STRING(CUFLaunchClusterKernel));
      auto funcTy = mlir::LLVM::LLVMFunctionType::get(
          voidTy,
          {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
           llvmIntPtrType, llvmIntPtrType, ptrTy, i32Ty, ptrTy, ptrTy},
          /*isVarArg=*/false);
      auto cufLaunchClusterKernel = mlir::SymbolRefAttr::get(
          mod.getContext(), RTNAME_STRING(CUFLaunchClusterKernel));
      if (!funcOp) {
        mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
        rewriter.setInsertionPointToStart(mod.getBody());
        auto launchKernelFuncOp = rewriter.create<mlir::LLVM::LLVMFuncOp>(
            loc, RTNAME_STRING(CUFLaunchClusterKernel), funcTy);
        launchKernelFuncOp.setVisibility(
            mlir::SymbolTable::Visibility::Private);
      }

      rewriter.create<mlir::LLVM::CallOp>(
          loc, funcTy, cufLaunchClusterKernel,
          mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(),
                           adaptor.getClusterSizeY(), adaptor.getClusterSizeZ(),
                           adaptor.getGridSizeX(), adaptor.getGridSizeY(),
                           adaptor.getGridSizeZ(), adaptor.getBlockSizeX(),
                           adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(),
                           stream, dynamicMemorySize, kernelArgs, nullPtr});
    } else {
      auto procAttr =
          op->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
      bool isGridGlobal =
          procAttr && procAttr.getValue() == cuf::ProcAttribute::GridGlobal;
      llvm::StringRef fctName = isGridGlobal
                                    ? RTNAME_STRING(CUFLaunchCooperativeKernel)
                                    : RTNAME_STRING(CUFLaunchKernel);
      auto funcOp = mod.lookupSymbol<mlir::LLVM::LLVMFuncOp>(fctName);
      auto funcTy = mlir::LLVM::LLVMFunctionType::get(
          voidTy,
          {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, ptrTy, i32Ty, ptrTy,
           ptrTy},
          /*isVarArg=*/false);
      auto cufLaunchKernel =
          mlir::SymbolRefAttr::get(mod.getContext(), fctName);
      if (!funcOp) {
        mlir::OpBuilder::InsertionGuard insertGuard(rewriter);
        rewriter.setInsertionPointToStart(mod.getBody());
        auto launchKernelFuncOp =
            rewriter.create<mlir::LLVM::LLVMFuncOp>(loc, fctName, funcTy);
        launchKernelFuncOp.setVisibility(
            mlir::SymbolTable::Visibility::Private);
      }

      rewriter.create<mlir::LLVM::CallOp>(
          loc, funcTy, cufLaunchKernel,
          mlir::ValueRange{kernelPtr, adaptor.getGridSizeX(),
                           adaptor.getGridSizeY(), adaptor.getGridSizeZ(),
                           adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
                           adaptor.getBlockSizeZ(), stream, dynamicMemorySize,
                           kernelArgs, nullPtr});
    }

    rewriter.eraseOp(op);
    return mlir::success();
  }
};

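/// Return the name of the function-like operation enclosing \p op, or an
/// empty string if the parent is not a gpu.func, func.func, or llvm.func.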
static std::string getFuncName(cuf::SharedMemoryOp op) {
  if (auto gpuFuncOp = op->getParentOfType<mlir::gpu::GPUFuncOp>())
    return gpuFuncOp.getName().str();
  if (auto funcOp = op->getParentOfType<mlir::func::FuncOp>())
    return funcOp.getName().str();
  if (auto llvmFuncOp = op->getParentOfType<mlir::LLVM::LLVMFuncOp>())
    return llvmFuncOp.getSymName().str();
  return "";
}

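/// Take the address of the shared-memory global named \p sharedGlobalName in
/// \p gpuMod, whether it is still a fir.global or has already been lowered to
/// an llvm.mlir.global; return a null value if no such symbol exists.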
static mlir::Value createAddressOfOp(mlir::ConversionPatternRewriter &rewriter,
                                     mlir::Location loc,
                                     gpu::GPUModuleOp gpuMod,
                                     std::string &sharedGlobalName) {
  auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(
      rewriter.getContext(), mlir::NVVM::NVVMMemorySpace::kSharedMemorySpace);
  if (auto g = gpuMod.lookupSymbol<fir::GlobalOp>(sharedGlobalName))
    return rewriter.create<mlir::LLVM::AddressOfOp>(loc, llvmPtrTy,
                                                    g.getSymName());
  if (auto g = gpuMod.lookupSymbol<mlir::LLVM::GlobalOp>(sharedGlobalName))
    return rewriter.create<mlir::LLVM::AddressOfOp>(loc, llvmPtrTy,
                                                    g.getSymName());
  return {};
}

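/// Lower cuf.shared_memory to a byte-offset GEP into the enclosing function's
/// shared-memory global, address-space-cast back to a generic pointer.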
struct CUFSharedMemoryOpConversion
    : public mlir::ConvertOpToLLVMPattern<cuf::SharedMemoryOp> {
  explicit CUFSharedMemoryOpConversion(
      const fir::LLVMTypeConverter &typeConverter, mlir::PatternBenefit benefit)
      : mlir::ConvertOpToLLVMPattern<cuf::SharedMemoryOp>(typeConverter,
                                                          benefit) {}
  using OpAdaptor = typename cuf::SharedMemoryOp::Adaptor;

  mlir::LogicalResult
  matchAndRewrite(cuf::SharedMemoryOp op, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    mlir::Location loc = op->getLoc();
    if (!op.getOffset())
      return mlir::emitError(
          loc, "cuf.shared_memory must have an offset for code gen");

    auto gpuMod = op->getParentOfType<gpu::GPUModuleOp>();
    std::string sharedGlobalName =
        (getFuncName(op) + llvm::Twine(cudaSharedMemSuffix)).str();
    mlir::Value sharedGlobalAddr =
        createAddressOfOp(rewriter, loc, gpuMod, sharedGlobalName);

    if (!sharedGlobalAddr)
      return mlir::emitError(loc, "could not find the shared global operation");

    auto castPtr = rewriter.create<mlir::LLVM::AddrSpaceCastOp>(
        loc, mlir::LLVM::LLVMPointerType::get(rewriter.getContext()),
        sharedGlobalAddr);
    mlir::Type baseType = castPtr->getResultTypes().front();
    llvm::SmallVector<mlir::LLVM::GEPArg> gepArgs = {op.getOffset()};
    mlir::Value shmemPtr = rewriter.create<mlir::LLVM::GEPOp>(
        loc, baseType, rewriter.getI8Type(), castPtr, gepArgs);
    rewriter.replaceOp(op, {shmemPtr});
    return mlir::success();
  }
};

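/// cuf.stream_cast becomes a no-op once gpu.async.token values are lowered to
/// bare pointers; simply forward the converted stream value.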
struct CUFStreamCastConversion
    : public mlir::ConvertOpToLLVMPattern<cuf::StreamCastOp> {
  explicit CUFStreamCastConversion(const fir::LLVMTypeConverter &typeConverter,
                                   mlir::PatternBenefit benefit)
      : mlir::ConvertOpToLLVMPattern<cuf::StreamCastOp>(typeConverter,
                                                        benefit) {}
  using OpAdaptor = typename cuf::StreamCastOp::Adaptor;

  mlir::LogicalResult
  matchAndRewrite(cuf::StreamCastOp op, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    rewriter.replaceOp(op, adaptor.getStream());
    return mlir::success();
  }
};

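/// Pass driver: applies the patterns above with a partial conversion,
/// requiring that CUDA Fortran launches and cuf.shared_memory ops be
/// rewritten while leaving other GPU launches untouched.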
class CUFGPUToLLVMConversion
    : public fir::impl::CUFGPUToLLVMConversionBase<CUFGPUToLLVMConversion> {
public:
  void runOnOperation() override {
    auto *ctx = &getContext();
    mlir::RewritePatternSet patterns(ctx);
    mlir::ConversionTarget target(*ctx);

    mlir::Operation *op = getOperation();
    mlir::ModuleOp module = mlir::dyn_cast<mlir::ModuleOp>(op);
    if (!module)
      return signalPassFailure();

    std::optional<mlir::DataLayout> dl = fir::support::getOrSetMLIRDataLayout(
        module, /*allowDefaultLayout=*/false);
    if (!dl) {
      mlir::emitError(module.getLoc(), "module has no data layout");
      return signalPassFailure();
    }
    fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
                                         /*forceUnifiedTBAATree=*/false, *dl);
    cuf::populateCUFGPUToLLVMConversionPatterns(typeConverter, patterns);

    // gpu.launch_func is only illegal (and thus rewritten) when it carries a
    // CUDA Fortran proc attribute; other launches are left to the standard
    // GPU lowering.
    target.addDynamicallyLegalOp<mlir::gpu::LaunchFuncOp>(
        [&](mlir::gpu::LaunchFuncOp op) {
          return !op.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
              cuf::getProcAttrName());
        });

    target.addIllegalOp<cuf::SharedMemoryOp>();
    target.addLegalDialect<mlir::LLVM::LLVMDialect>();
    if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
                                                  std::move(patterns)))) {
      mlir::emitError(mlir::UnknownLoc::get(ctx),
                      "error in CUF GPU op conversion");
      signalPassFailure();
    }
  }
};
} // namespace

void cuf::populateCUFGPUToLLVMConversionPatterns(
    fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
    mlir::PatternBenefit benefit) {
  // Lower gpu.async.token values to bare pointers (CUDA streams).
  converter.addConversion([&converter](mlir::gpu::AsyncTokenType) -> Type {
    return mlir::LLVM::LLVMPointerType::get(&converter.getContext());
  });
  patterns.add<CUFSharedMemoryOpConversion, GPULaunchKernelConversion,
               CUFStreamCastConversion>(converter, benefit);
}