//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDLIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"

#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"

namespace mlir {
#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;

/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
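/// For example (illustrative argument types), a `memref<16x16xf32>` argument
/// (static shape, identity layout) can be passed as a bare pointer, while a
/// `memref<?xf32>` argument cannot.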
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
  bool canBeBare = true;
  for (Type type : func.getArgumentTypes())
    if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
      canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
  return canBeBare;
}

/// Computes the lane id of the current thread by chaining the
/// `llvm.amdgcn.mbcnt.lo` and `llvm.amdgcn.mbcnt.hi` intrinsics: each counts
/// the set bits of the mask at positions below the current lane, so with an
/// all-ones mask the result is exactly the lane index. `indexBitwidth` is
/// currently unused.
Value getLaneId(ConversionPatternRewriter &rewriter, Location loc,
                const unsigned indexBitwidth) {
  auto int32Type = IntegerType::get(rewriter.getContext(), 32);
  Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
  Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
  Value mbcntLo = rewriter.create<ROCDL::MbcntLoOp>(loc, int32Type,
                                                    ValueRange{minus1, zero});
  Value laneId = rewriter.create<ROCDL::MbcntHiOp>(loc, int32Type,
                                                   ValueRange{minus1, mbcntLo});
  return laneId;
}

/// The AMDGCN target data layout string, applied as the default when the
/// module does not already carry an explicit LLVM data layout attribute.
static constexpr StringLiteral amdgcnDataLayout =
    "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
    "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:"
    "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
    "G1-ni:7:8";

namespace {
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto loc = op->getLoc();
    MLIRContext *context = rewriter.getContext();
    // Convert to: %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
    // followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)

    Type intTy = IntegerType::get(context, 32);
    Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
    Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
    Value mbcntLo = rewriter.create<ROCDL::MbcntLoOp>(
        loc, intTy, ValueRange{minus1, zero});
    Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
        loc, intTy, ValueRange{minus1, mbcntLo});
    // Truncate or extend the result depending on the index bitwidth specified
    // by the LLVMTypeConverter options.
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    if (indexBitwidth > 32) {
      laneId = rewriter.create<LLVM::SExtOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    } else if (indexBitwidth < 32) {
      laneId = rewriter.create<LLVM::TruncOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    }
    rewriter.replaceOp(op, {laneId});
    return success();
  }
};

struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  /// Lowers a shuffle to the corresponding ROCDL ops.
  ///
  /// Uses the `width` argument to check whether the source lane participates;
  /// if it does not, the lane reads back its own value.
  ///
  /// Shuffle with DS Bpermute:
  /// let shflMode = [xor, up, down, idx]
  /// let width = 32 (usually the warp size), step = [1, 2, 4, 8, 16, ..., width]
  /// 1. curLaneId = mbcnt.hi(-1, mbcnt.lo(-1, 0))
  /// 2. widthOrZeroIfOutside = (curLaneId + width) & -width
  /// 3. dstLane = shflMode(curLaneId, step)
  /// 4. isActiveSrcLane = dstLane < widthOrZeroIfOutside
  /// 5. dstLane = isActiveSrcLane ? dstLane : curLaneId
  /// 6. dwordAlignedDstLane = dstLane * 4, i.e. dstLane << 2
  /// 7. bpermute(dwordAlignedDstLane, shfl_value)
  ///
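  /// An illustrative walk-through (example values, not from the source):
  /// with mode = xor, offset = 1, width = 64, executing on lane 5 of a
  /// 64-wide wavefront:
  ///   widthOrZeroIfOutside = (5 + 64) & -64 = 64
  ///   dstLane              = 5 ^ 1          = 4
  ///   isActiveSrcLane      = 4 < 64         = true
  ///   dwordAlignedDstLane  = 4 << 2         = 16
  /// so ds_bpermute reads the shuffled value from lane 4.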
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    // TODO: Add support for non 32-bit shuffle values.
    if (adaptor.getValue().getType().getIntOrFloatBitWidth() != 32)
      return failure();
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);

    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value width = adaptor.getWidth();
    Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
    Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
    Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
    Value widthOrZeroIfOutside =
        rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);
    Value dstLane;
    // TODO: Add support for gpu::ShuffleMode::UP and gpu::ShuffleMode::DOWN.
    // TODO: Use ds_swizzle for XOR when step/offsets are constants for better
    // perf.
    switch (op.getMode()) {
    case gpu::ShuffleMode::XOR:
      dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::IDX:
      dstLane = adaptor.getOffset();
      break;
    default:
      return failure();
    }
    Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(
        loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
    Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,
                                                          dstLane, srcLaneId);
    Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
    Value dwordAlignedDstLane =
        rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);
    Value initShflValue = adaptor.getValue();
    if (adaptor.getValue().getType().isF32()) {
      initShflValue =
          rewriter.create<LLVM::BitcastOp>(loc, int32Type, initShflValue);
    }
    Value shflValue = rewriter.create<ROCDL::DsBpermuteOp>(
        loc, int32Type, dwordAlignedDstLane, initShflValue);
    if (adaptor.getValue().getType().isF32()) {
      shflValue = rewriter.create<LLVM::BitcastOp>(
          loc, adaptor.getValue().getType(), shflValue);
    }
    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};

/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"

// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
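//
// A command-line usage sketch (option names assumed to follow the tablegen'd
// pass definition):
//   mlir-opt --convert-gpu-to-rocdl='chipset=gfx908' kernel.mlir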
struct LowerGpuOpsToROCDLOpsPass
    : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
  LowerGpuOpsToROCDLOpsPass() = default;
  LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
                            bool useBarePtrCallConv,
                            gpu::amd::Runtime runtime) {
    if (this->chipset.getNumOccurrences() == 0)
      this->chipset = chipset;
    if (this->indexBitwidth.getNumOccurrences() == 0)
      this->indexBitwidth = indexBitwidth;
    if (this->useBarePtrCallConv.getNumOccurrences() == 0)
      this->useBarePtrCallConv = useBarePtrCallConv;
    if (this->runtime.getNumOccurrences() == 0)
      this->runtime = runtime;
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();
    MLIRContext *ctx = m.getContext();

    auto llvmDataLayout = m->getAttrOfType<StringAttr>(
        LLVM::LLVMDialect::getDataLayoutAttrName());
    if (!llvmDataLayout) {
      llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
      m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
    }
    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(ctx));
    }

    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
      return signalPassFailure();
    }

    // Customize the bitwidth used for the device side index computations.
    LowerToLLVMOptions options(
        ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);

    if (useBarePtrCallConv) {
      options.useBarePtrCallConv = true;
      WalkResult canUseBarePointers =
          m.walk([](gpu::GPUFuncOp func) -> WalkResult {
            if (canBeCalledWithBarePointers(func))
              return WalkResult::advance();
            return WalkResult::interrupt();
          });
      if (canUseBarePointers.wasInterrupted()) {
        emitError(UnknownLoc::get(ctx),
                  "bare pointer calling convention requires all memrefs to "
                  "have static shape and use the identity map");
        return signalPassFailure();
      }
    }

    // Apply in-dialect lowering. In-dialect lowering will replace
    // ops which need to be lowered further, which is not supported by a
    // single conversion pass.
    {
      RewritePatternSet patterns(ctx);
      populateGpuRewritePatterns(patterns);
      arith::populateExpandBFloat16Patterns(patterns);
      (void)applyPatternsAndFoldGreedily(m, std::move(patterns));
    }

    LLVMTypeConverter converter(ctx, options);
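    // Map the GPU address spaces onto the AMDGPU backend's numeric address
    // spaces: 1 = global, 3 = workgroup (LDS), 5 = private (scratch).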
    populateGpuMemorySpaceAttributeConversions(
        converter, [](gpu::AddressSpace space) {
          switch (space) {
          case gpu::AddressSpace::Global:
            return 1;
          case gpu::AddressSpace::Workgroup:
            return 3;
          case gpu::AddressSpace::Private:
            return 5;
          }
          llvm_unreachable("unknown address space enum value");
          return 0;
        });

    RewritePatternSet llvmPatterns(ctx);

    mlir::arith::populateArithToLLVMConversionPatterns(converter,
                                                       llvmPatterns);
    populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                            *maybeChipset);
    populateVectorToLLVMConversionPatterns(converter, llvmPatterns);
    cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
    populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
    populateFinalizeMemRefToLLVMConversionPatterns(converter, llvmPatterns);
    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);
    LLVMConversionTarget target(getContext());
    configureGpuToROCDLConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();
    auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
    auto reqdWorkGroupSizeAttrHelper =
        rocdlDialect->getReqdWorkGroupSizeAttrHelper();
    auto flatWorkGroupSizeAttrHelper =
        rocdlDialect->getFlatWorkGroupSizeAttrHelper();
    // Manually rewrite known block size attributes so the LLVMIR translation
    // infrastructure can pick them up.
    m.walk([&](LLVM::LLVMFuncOp op) {
      if (auto blockSizes = dyn_cast_or_null<DenseI32ArrayAttr>(
              op->removeAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName()))) {
        reqdWorkGroupSizeAttrHelper.setAttr(op, blockSizes);
        // Also set up the rocdl.flat_work_group_size attribute to prevent
        // conflicting metadata.
        uint32_t flatSize = 1;
        for (uint32_t size : blockSizes.asArrayRef()) {
          flatSize *= size;
        }
        StringAttr flatSizeAttr =
            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
        flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
      }
    });
  }
};

} // namespace

void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<func::FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<ROCDL::ROCDLDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
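  // Mark the LLVM math intrinsic wrappers illegal: on this path, math
  // operations are expected to become ROCm device-library (`__ocml_*`) calls
  // (see populateGpuToROCDLConversionPatterns) rather than LLVM intrinsics.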
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
                      LLVM::FCeilOp, LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp,
                      LLVM::Log10Op, LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp,
                      LLVM::SqrtOp>();

  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
}

/// Registers conversions of `OpTy` to a runtime library call: vector operands
/// are scalarized first, then the scalar op is rewritten into a call to
/// `f32Func` or `f64Func` depending on the element type.
template <typename OpTy>
static void populateOpPatterns(LLVMTypeConverter &converter,
                               RewritePatternSet &patterns, StringRef f32Func,
                               StringRef f64Func) {
  patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func);
}

void mlir::populateGpuToROCDLConversionPatterns(
    LLVMTypeConverter &converter, RewritePatternSet &patterns,
    mlir::gpu::amd::Runtime runtime) {
  using mlir::gpu::amd::Runtime;

  populateWithGenerated(patterns);
  patterns
      .add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                       ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
          converter, gpu::GPUFuncOp::getKnownBlockSizeAttrName());
  patterns.add<GPUIndexIntrinsicOpLowering<
      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
      converter, gpu::GPUFuncOp::getKnownGridSizeAttrName());
  patterns
      .add<GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                       ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
           GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
                                       ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
           GPUReturnOpLowering>(converter);
  patterns.add<GPUFuncOpLowering>(
      converter,
      /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
      /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
      ROCDL::ROCDLDialect::KernelAttrHelper(&converter.getContext()).getName());
  if (Runtime::HIP == runtime) {
    patterns.add<GPUPrintfOpToHIPLowering>(converter);
  } else if (Runtime::OpenCL == runtime) {
    // Use address space = 4 to match the OpenCL definition of printf().
    patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
  }
  // TODO: Add alignment for workgroup memory.
  patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);

  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);

  populateOpPatterns<math::AbsFOp>(converter, patterns, "__ocml_fabs_f32",
                                   "__ocml_fabs_f64");
  populateOpPatterns<math::AtanOp>(converter, patterns, "__ocml_atan_f32",
                                   "__ocml_atan_f64");
  populateOpPatterns<math::Atan2Op>(converter, patterns, "__ocml_atan2_f32",
                                    "__ocml_atan2_f64");
  populateOpPatterns<math::CbrtOp>(converter, patterns, "__ocml_cbrt_f32",
                                   "__ocml_cbrt_f64");
  populateOpPatterns<math::CeilOp>(converter, patterns, "__ocml_ceil_f32",
                                   "__ocml_ceil_f64");
  populateOpPatterns<math::CosOp>(converter, patterns, "__ocml_cos_f32",
                                  "__ocml_cos_f64");
  populateOpPatterns<math::ExpOp>(converter, patterns, "__ocml_exp_f32",
                                  "__ocml_exp_f64");
  populateOpPatterns<math::Exp2Op>(converter, patterns, "__ocml_exp2_f32",
                                   "__ocml_exp2_f64");
  populateOpPatterns<math::ExpM1Op>(converter, patterns, "__ocml_expm1_f32",
                                    "__ocml_expm1_f64");
  populateOpPatterns<math::FloorOp>(converter, patterns, "__ocml_floor_f32",
                                    "__ocml_floor_f64");
  populateOpPatterns<arith::RemFOp>(converter, patterns, "__ocml_fmod_f32",
                                    "__ocml_fmod_f64");
  populateOpPatterns<math::LogOp>(converter, patterns, "__ocml_log_f32",
                                  "__ocml_log_f64");
  populateOpPatterns<math::Log10Op>(converter, patterns, "__ocml_log10_f32",
                                    "__ocml_log10_f64");
  populateOpPatterns<math::Log1pOp>(converter, patterns, "__ocml_log1p_f32",
                                    "__ocml_log1p_f64");
  populateOpPatterns<math::Log2Op>(converter, patterns, "__ocml_log2_f32",
                                   "__ocml_log2_f64");
  populateOpPatterns<math::PowFOp>(converter, patterns, "__ocml_pow_f32",
                                   "__ocml_pow_f64");
  populateOpPatterns<math::RsqrtOp>(converter, patterns, "__ocml_rsqrt_f32",
                                    "__ocml_rsqrt_f64");
  populateOpPatterns<math::SinOp>(converter, patterns, "__ocml_sin_f32",
                                  "__ocml_sin_f64");
  populateOpPatterns<math::SqrtOp>(converter, patterns, "__ocml_sqrt_f32",
                                   "__ocml_sqrt_f64");
  populateOpPatterns<math::TanhOp>(converter, patterns, "__ocml_tanh_f32",
                                   "__ocml_tanh_f64");
  populateOpPatterns<math::TanOp>(converter, patterns, "__ocml_tan_f32",
                                  "__ocml_tan_f64");
  populateOpPatterns<math::ErfOp>(converter, patterns, "__ocml_erf_f32",
                                  "__ocml_erf_f64");
}

std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
                                      unsigned indexBitwidth,
                                      bool useBarePtrCallConv,
                                      gpu::amd::Runtime runtime) {
  return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
      chipset, indexBitwidth, useBarePtrCallConv, runtime);
}
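
// A minimal C++ usage sketch (illustrative; assumes the declaration in
// GPUToROCDLPass.h provides defaults for the remaining options, and that
// `ctx` and `module` already exist):
//   PassManager pm(ctx);
//   pm.addNestedPass<gpu::GPUModuleOp>(
//       createLowerGpuOpsToROCDLOpsPass(/*chipset=*/"gfx908"));
//   if (failed(pm.run(module)))
//     llvm::errs() << "GPU-to-ROCDL lowering failed\n";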