//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDL IR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"

#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
#include "mlir/Conversion/ConvertToLLVM/ToLLVMPass.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
#include "mlir/Conversion/MathToROCDL/MathToROCDL.h"
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"

namespace mlir {
#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;

// Truncate or extend the result depending on the index bitwidth specified
// by the LLVMTypeConverter options.
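// For example (illustrative): with a 64-bit index type an i32 value is
// sign-extended with llvm.sext, with a 16-bit index type it is truncated with
// llvm.trunc, and when the widths already match it is returned unchanged.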
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
                                  Location loc, Value value,
                                  const LLVMTypeConverter &converter) {
  int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
  int64_t indexBitwidth = converter.getIndexTypeBitwidth();
  auto indexBitwidthType =
      IntegerType::get(rewriter.getContext(), indexBitwidth);
  // TODO: use <=> in C++20.
  if (indexBitwidth > intWidth) {
    return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
  }
  if (indexBitwidth < intWidth) {
    return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
  }
  return value;
}

/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
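///
/// For example (illustrative): `memref<16x16xf32>` has a static shape and the
/// identity layout and so converts to a bare pointer, while `memref<?xf32>`
/// or a memref with dynamic strides does not.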
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
  bool canBeBare = true;
  for (Type type : func.getArgumentTypes())
    if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
      canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
  return canBeBare;
}

static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc) {
  auto int32Type = IntegerType::get(rewriter.getContext(), 32);
  Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
  Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
  Value mbcntLo = rewriter.create<ROCDL::MbcntLoOp>(loc, int32Type,
                                                    ValueRange{minus1, zero});
  Value laneId = rewriter.create<ROCDL::MbcntHiOp>(loc, int32Type,
                                                   ValueRange{minus1, mbcntLo});
  return laneId;
}
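
// Data layout string for amdgcn targets; it mirrors the layout used by the
// LLVM AMDGPU backend and is attached below to gpu.module ops that lack one.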
static constexpr StringLiteral amdgcnDataLayout =
    "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
    "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64"
    "-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512"
    "-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";

namespace {
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    // Convert to:
    //   %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
    //   %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)
    Value laneId = getLaneId(rewriter, loc);
    // Truncate or extend the result depending on the index bitwidth specified
    // by the LLVMTypeConverter options.
    laneId = truncOrExtToLLVMType(rewriter, loc, laneId, *getTypeConverter());
    rewriter.replaceOp(op, {laneId});
    return success();
  }
};

struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;

  GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
                           amdgpu::Chipset chipset)
      : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp>(converter),
        chipset(chipset) {}

  LogicalResult
  matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    LLVM::ConstantRangeAttr bounds = nullptr;
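    // Chips before gfx10 only support wave64, so the subgroup size is at
    // least 64 there; gfx10 and later support both wave32 and wave64.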
    bool isBeforeGfx10 = chipset.majorVersion < 10;
    if (auto upperBoundAttr = op.getUpperBoundAttr()) {
      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
          /*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
          /*upper=*/op.getUpperBoundAttr().getInt() + 1);
    }
    Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
        op.getLoc(), rewriter.getI32Type(), bounds);
    wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
                                       *getTypeConverter());
    rewriter.replaceOp(op, {wavefrontOp});
    return success();
  }

  const amdgpu::Chipset chipset;
};

struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  /// Lowers a shuffle to the corresponding ROCDL ops.
  ///
  /// Use the `width` argument to check whether the source lane participates;
  /// if it does not, the destination lane falls back to the current lane.
  ///
  /// Shuffle with DS Bpermute:
  /// let shflMode = [xor, up, down, idx]
  /// let width = 32 (usually the warp size), step = [1, 2, 4, 8, 16, ..., width].
  /// 1. curLaneId = using mbcnt.lo + mbcnt.hi
  /// 2. widthOrZeroIfOutside = (curLaneId + width) & -width
  /// 3. dstLane = shflMode(curLaneId, step)
  /// 4. isActiveSrcLane = dstLane < widthOrZeroIfOutside
  /// 5. dstLane = isActiveSrcLane ? dstLane : curLaneId
  /// 6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
  /// 7. bpermute(dwordAlignedDstLane, shfl_value).
  ///
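  /// For example (illustrative), `%res, %valid = gpu.shuffle xor %val,
  /// %offset, %width : f32` becomes, roughly:
  ///   %lane  = mbcnt.hi(-1, mbcnt.lo(-1, 0))
  ///   %dst   = llvm.xor %lane, %offset
  ///   %valid = llvm.icmp "slt" %dst, ((%lane + %width) & -%width)
  ///   %sel   = llvm.select %valid, %dst, %lane
  ///   %byte  = llvm.shl %sel, 2
  ///   %res   = rocdl.ds_bpermute %byte, %val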
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    Value initShflValue = adaptor.getValue();

    Value srcLaneId = getLaneId(rewriter, loc);

    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value width = adaptor.getWidth();
    Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
    Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
    Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
    Value widthOrZeroIfOutside =
        rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);
    Value dstLane;

    switch (op.getMode()) {
    case gpu::ShuffleMode::UP:
      dstLane = rewriter.create<LLVM::SubOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::DOWN:
      dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::XOR:
      dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::IDX:
      dstLane = adaptor.getOffset();
      break;
    }
    Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(
        loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
    Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,
                                                          dstLane, srcLaneId);
    Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
    Value dwordAlignedDstLane =
        rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);

    SmallVector<Value> decomposed =
        LLVM::decomposeValue(rewriter, loc, initShflValue, int32Type);
    SmallVector<Value> swizzled;
    for (Value v : decomposed) {
      Value res = rewriter.create<ROCDL::DsBpermuteOp>(loc, int32Type,
                                                       dwordAlignedDstLane, v);
      swizzled.emplace_back(res);
    }
    Value shflValue =
        LLVM::composeValue(rewriter, loc, swizzled, initShflValue.getType());
    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};

/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"

// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass final
    : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
  LowerGpuOpsToROCDLOpsPass() = default;
  LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
                            bool useBarePtrCallConv,
                            gpu::amd::Runtime runtime) {
    if (this->chipset.getNumOccurrences() == 0)
      this->chipset = chipset;
    if (this->indexBitwidth.getNumOccurrences() == 0)
      this->indexBitwidth = indexBitwidth;
    if (this->useBarePtrCallConv.getNumOccurrences() == 0)
      this->useBarePtrCallConv = useBarePtrCallConv;
    if (this->runtime.getNumOccurrences() == 0)
      this->runtime = runtime;
  }

  void getDependentDialects(DialectRegistry &registry) const override {
    Base::getDependentDialects(registry);
    registerConvertToLLVMDependentDialectLoading(registry);
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();
    MLIRContext *ctx = m.getContext();

    auto llvmDataLayout = m->getAttrOfType<StringAttr>(
        LLVM::LLVMDialect::getDataLayoutAttrName());
    if (!llvmDataLayout) {
      llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
      m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
    }
    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(ctx));
    }

    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "invalid chipset name: " + chipset);
      return signalPassFailure();
    }

    // Customize the bitwidth used for the device-side index computations.
    LowerToLLVMOptions options(
        ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);

    if (useBarePtrCallConv) {
      options.useBarePtrCallConv = true;
      WalkResult canUseBarePointers =
          m.walk([](gpu::GPUFuncOp func) -> WalkResult {
            if (canBeCalledWithBarePointers(func))
              return WalkResult::advance();
            return WalkResult::interrupt();
          });
      if (canUseBarePointers.wasInterrupted()) {
        emitError(UnknownLoc::get(ctx),
                  "bare pointer calling convention requires all memrefs to "
                  "have static shape and use the identity map");
        return signalPassFailure();
      }
    }

    // Apply in-dialect lowering first. These rewrites expand ops that need
    // multi-step lowering, which a single conversion pass does not support.
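    // (For example, `gpu.all_reduce` is expanded into `gpu.shuffle`-based
    // reductions here before the conversion below runs.)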
    {
      RewritePatternSet patterns(ctx);
      populateGpuRewritePatterns(patterns);
      populateGpuPromoteShuffleToAMDGPUPatterns(patterns);
      (void)applyPatternsGreedily(m, std::move(patterns));
    }

    LLVMTypeConverter converter(ctx, options);
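    // Map GPU memory space attributes to AMDGPU address spaces:
    // global -> 1, workgroup (LDS) -> 3, private (scratch) -> 5.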
    populateGpuMemorySpaceAttributeConversions(
        converter, [](gpu::AddressSpace space) {
          switch (space) {
          case gpu::AddressSpace::Global:
            return 1;
          case gpu::AddressSpace::Workgroup:
            return 3;
          case gpu::AddressSpace::Private:
            return 5;
          }
          llvm_unreachable("unknown address space enum value");
          return 0;
        });

    RewritePatternSet llvmPatterns(ctx);
    LLVMConversionTarget target(getContext());

    llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
                                                      allowedDialects.end());
    for (Dialect *dialect : ctx->getLoadedDialects()) {
      bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
      // Empty `allowedDialectsSet` means all dialects are allowed.
      if (!allowedDialectsSet.empty() && !allowed)
        continue;

      auto iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
      if (!iface) {
        // Error out if the dialect was explicitly specified but does not
        // implement the conversion interface.
        if (allowed) {
          m.emitError()
              << "dialect does not implement ConvertToLLVMPatternInterface: "
              << dialect->getNamespace();
          return signalPassFailure();
        }
        continue;
      }

      iface->populateConvertToLLVMConversionPatterns(target, converter,
                                                     llvmPatterns);
    }

    populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                            *maybeChipset);
    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
                                         *maybeChipset);
    configureGpuToROCDLConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();

    auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
    auto reqdWorkGroupSizeAttrHelper =
        rocdlDialect->getReqdWorkGroupSizeAttrHelper();
    auto flatWorkGroupSizeAttrHelper =
        rocdlDialect->getFlatWorkGroupSizeAttrHelper();
    // Manually rewrite known block size attributes so the LLVMIR translation
    // infrastructure can pick them up.
    m.walk([&](LLVM::LLVMFuncOp op) {
      if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
        auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
        // Also set up the rocdl.flat_work_group_size attribute to prevent
        // conflicting metadata.
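        // For example, reqd_work_group_size = [64, 2, 1] yields
        // flat_work_group_size = "128,128".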
        uint32_t flatSize = 1;
        for (uint32_t size : blockSizes.asArrayRef()) {
          flatSize *= size;
        }
        StringAttr flatSizeAttr =
            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
        flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
      }
    });
  }
};

} // namespace

void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<func::FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<ROCDL::ROCDLDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
                      LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
                      LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
  // These ops are legal for f32 type.
  target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
    return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
  });
  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
}

void mlir::populateGpuToROCDLConversionPatterns(
    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
    mlir::gpu::amd::Runtime runtime, amdgpu::Chipset chipset) {
  using gpu::index_lowering::IndexKind;
  using gpu::index_lowering::IntrType;
  using mlir::gpu::amd::Runtime;
  auto *rocdlDialect =
      converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
  populateWithGenerated(patterns);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                      ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
      converter, IndexKind::Block, IntrType::Id);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
      converter, IndexKind::Grid, IntrType::Id);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                      ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
      converter, IndexKind::Block, IntrType::Dim);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
      converter, IndexKind::Grid, IntrType::Dim);
  patterns.add<GPUReturnOpLowering>(converter);
  patterns.add<GPUFuncOpLowering>(
      converter,
      GPUFuncOpLoweringOptions{
          /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
          /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
          rocdlDialect->getKernelAttrHelper().getName(),
          rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
  if (Runtime::HIP == runtime) {
    patterns.add<GPUPrintfOpToHIPLowering>(converter);
  } else if (Runtime::OpenCL == runtime) {
    // Use address space = 4 to match the OpenCL definition of printf().
    patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
  }
  // TODO: Add alignment for workgroup memory.
  patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);

  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
  patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);

  populateMathToROCDLConversionPatterns(converter, patterns);
}

std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
                                      unsigned indexBitwidth,
                                      bool useBarePtrCallConv,
                                      gpu::amd::Runtime runtime) {
  return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
      chipset, indexBitwidth, useBarePtrCallConv, runtime);
}
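
// Typical usage (illustrative; "gfx90a" is just an example chipset), running
// the pass on every gpu.module nested in the top-level module:
//   pm.addNestedPass<gpu::GPUModuleOp>(createLowerGpuOpsToROCDLOpsPass(
//       "gfx90a", /*indexBitwidth=*/32, /*useBarePtrCallConv=*/false,
//       gpu::amd::Runtime::HIP));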