//===- LowerGpuOpsToROCDLOps.cpp - MLIR GPU to ROCDL lowering passes ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a pass to generate ROCDLIR operations for higher-level
// GPU operations.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Dialect/Arith/Transforms/Passes.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/Passes.h"

#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
#include "mlir/Conversion/ConvertToLLVM/ToLLVMInterface.h"
#include "mlir/Conversion/ConvertToLLVM/ToLLVMPass.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
#include "mlir/Conversion/MathToROCDL/MathToROCDL.h"
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinAttributes.h"
| 41 | #include "mlir/Pass/Pass.h" |
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"

namespace mlir {
#define GEN_PASS_DEF_CONVERTGPUOPSTOROCDLOPS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;

// Truncate or extend the result depending on the index bitwidth specified
// by the LLVMTypeConverter options.
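// For example, an i32 value is sign-extended to i64 when the index bitwidth
// is 64, truncated to i16 when it is 16, and returned unchanged when it is
// 32.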
static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
                                  Location loc, Value value,
                                  const LLVMTypeConverter &converter) {
  int64_t intWidth = cast<IntegerType>(value.getType()).getWidth();
  int64_t indexBitwidth = converter.getIndexTypeBitwidth();
  auto indexBitwidthType =
      IntegerType::get(rewriter.getContext(), indexBitwidth);
  // TODO: use <=> in C++20.
  if (indexBitwidth > intWidth) {
    return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
  }
  if (indexBitwidth < intWidth) {
    return rewriter.create<LLVM::TruncOp>(loc, indexBitwidthType, value);
  }
  return value;
}

/// Returns true if the given `gpu.func` can be safely called using the bare
/// pointer calling convention.
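/// This is the case when every memref argument can itself be converted to a
/// bare pointer, which (roughly, per LLVMTypeConverter::canConvertToBarePtr)
/// requires a static shape and a trivial layout; e.g. memref<16x16xf32>
/// qualifies while memref<?xf32> does not.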
static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
  bool canBeBare = true;
  for (Type type : func.getArgumentTypes())
    if (auto memrefTy = dyn_cast<BaseMemRefType>(type))
      canBeBare &= LLVMTypeConverter::canConvertToBarePtr(memrefTy);
  return canBeBare;
}

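// Compute the lane id with the mbcnt.lo/mbcnt.hi intrinsic pair: given an
// all-ones mask, mbcnt.lo counts how many lanes below the current one fall
// in lanes 0-31, and mbcnt.hi adds the count for lanes 32-63, yielding the
// lane index within the wavefront.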
static Value getLaneId(ConversionPatternRewriter &rewriter, Location loc) {
  auto int32Type = IntegerType::get(rewriter.getContext(), 32);
  Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
  Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
  Value mbcntLo = rewriter.create<ROCDL::MbcntLoOp>(loc, int32Type,
                                                    ValueRange{minus1, zero});
  Value laneId = rewriter.create<ROCDL::MbcntHiOp>(loc, int32Type,
                                                   ValueRange{minus1, mbcntLo});
  return laneId;
}
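
// This string mirrors the data layout LLVM uses for the AMDGCN target; in
// particular, address spaces 1, 3, and 5 (p1/p3/p5) are the global,
// workgroup (LDS), and private (scratch) memory spaces used by the
// memory-space mapping below.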
static constexpr StringLiteral amdgcnDataLayout =
    "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
    "-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:"
    "32-v32:"
    "32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:"
    "64-S32-A5-G1-ni:7:8:9";

namespace {
struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
  using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;

  LogicalResult
  matchAndRewrite(gpu::LaneIdOp op, gpu::LaneIdOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    auto loc = op->getLoc();
    MLIRContext *context = rewriter.getContext();
    // Convert to: %mlo = call @llvm.amdgcn.mbcnt.lo(-1, 0)
    // followed by: %lid = call @llvm.amdgcn.mbcnt.hi(-1, %mlo)

    Type intTy = IntegerType::get(context, 32);
    Value zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
    Value minus1 = rewriter.create<arith::ConstantIntOp>(loc, -1, 32);
    Value mbcntLo =
        rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
    Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
        loc, intTy, ValueRange{minus1, mbcntLo});
    // Truncate or extend the result depending on the index bitwidth specified
    // by the LLVMTypeConverter options.
    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
    if (indexBitwidth > 32) {
      laneId = rewriter.create<LLVM::SExtOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    } else if (indexBitwidth < 32) {
      laneId = rewriter.create<LLVM::TruncOp>(
          loc, IntegerType::get(context, indexBitwidth), laneId);
    }
    rewriter.replaceOp(op, {laneId});
    return success();
  }
};

struct GPUSubgroupSizeOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp> {
  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;

  GPUSubgroupSizeOpToROCDL(const LLVMTypeConverter &converter,
                           amdgpu::Chipset chipset)
      : ConvertOpToLLVMPattern<gpu::SubgroupSizeOp>(converter),
        chipset(chipset) {}

  LogicalResult
  matchAndRewrite(gpu::SubgroupSizeOp op, gpu::SubgroupSizeOp::Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    LLVM::ConstantRangeAttr bounds = nullptr;
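    // Wavefronts are always 64 lanes wide before gfx10; gfx10+ supports both
    // wave32 and wave64, so the lower bound of the known range depends on the
    // chipset.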
    bool isBeforeGfx10 = chipset.majorVersion < 10;
    if (auto upperBoundAttr = op.getUpperBoundAttr()) {
      bounds = rewriter.getAttr<LLVM::ConstantRangeAttr>(
          /*bitWidth=*/32, /*lower=*/isBeforeGfx10 ? 64 : 32,
          /*upper=*/upperBoundAttr.getInt() + 1);
    }
    Value wavefrontOp = rewriter.create<ROCDL::WavefrontSizeOp>(
        op.getLoc(), rewriter.getI32Type(), bounds);
    wavefrontOp = truncOrExtToLLVMType(rewriter, op.getLoc(), wavefrontOp,
                                       *getTypeConverter());
    rewriter.replaceOp(op, {wavefrontOp});
    return success();
  }

  const amdgpu::Chipset chipset;
};

struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
  using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;

  /// Lowers a shuffle to the corresponding ROCDL ops.
  ///
  /// Use the `width` argument to determine whether the source lane
  /// participates; if it does not, the destination lane is the lane itself.
  ///
  /// Shuffle with DS Bpermute:
  /// let shflMode = [xor, up, down, idx]
  /// let width = 32 (usually the warp size),
  ///     step = [1, 2, 4, 8, 16, ..., width].
  /// 1. curLaneId = using mbcnt.lo + mbcnt.hi
  /// 2. widthOrZeroIfOutside = (curLaneId + width) & -width
  /// 3. dstLane = shflMode(curLaneId, step)
  /// 4. isActiveSrcLane = dstLane < widthOrZeroIfOutside
  /// 5. dstLane = isActiveSrcLane ? dstLane : curLaneId
  /// 6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2.
  /// 7. bpermute(dwordAlignedDstLane, shfl_value).
  ///
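  /// As an illustrative sketch (not verified compiler output), an xor-mode
  /// shuffle of %value by %offset lowers roughly to:
  ///   %lane  = mbcnt.hi(-1, mbcnt.lo(-1, 0))
  ///   %dst   = llvm.xor %lane, %offset
  ///   %valid = llvm.icmp "slt" %dst, %widthOrZeroIfOutside
  ///   %sel   = llvm.select %valid, %dst, %lane
  ///   %byte  = llvm.shl %sel, 2   // dword-align the lane index
  ///   %res   = rocdl.ds_bpermute %byte, %value
  ///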
  LogicalResult
  matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    Location loc = op->getLoc();
    Value initShflValue = adaptor.getValue();

    Value srcLaneId = getLaneId(rewriter, loc);

    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
    Value width = adaptor.getWidth();
    Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
    Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
    Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
    Value widthOrZeroIfOutside =
        rewriter.create<LLVM::AndOp>(loc, int32Type, add, negwidth);
    Value dstLane;

    switch (op.getMode()) {
    case gpu::ShuffleMode::UP:
      dstLane = rewriter.create<LLVM::SubOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::DOWN:
      dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::XOR:
      dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
                                             adaptor.getOffset());
      break;
    case gpu::ShuffleMode::IDX:
      dstLane = adaptor.getOffset();
      break;
    }
    Value isActiveSrcLane = rewriter.create<LLVM::ICmpOp>(
        loc, LLVM::ICmpPredicate::slt, dstLane, widthOrZeroIfOutside);
    Value selectDstLane = rewriter.create<LLVM::SelectOp>(loc, isActiveSrcLane,
                                                          dstLane, srcLaneId);
    Value two = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 2);
    Value dwordAlignedDstLane =
        rewriter.create<LLVM::ShlOp>(loc, int32Type, selectDstLane, two);

    SmallVector<Value> decomposed =
        LLVM::decomposeValue(rewriter, loc, initShflValue, int32Type);
    SmallVector<Value> swizzled;
    for (Value v : decomposed) {
      Value res = rewriter.create<ROCDL::DsBpermuteOp>(loc, int32Type,
                                                       dwordAlignedDstLane, v);
      swizzled.emplace_back(res);
    }
    Value shflValue =
        LLVM::composeValue(rewriter, loc, swizzled, initShflValue.getType());
    rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
    return success();
  }
};

/// Import the GPU Ops to ROCDL Patterns.
#include "GPUToROCDL.cpp.inc"

// A pass that replaces all occurrences of GPU device operations with their
// corresponding ROCDL equivalent.
//
// This pass only handles device code and is not meant to be run on GPU host
// code.
struct LowerGpuOpsToROCDLOpsPass final
    : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
  LowerGpuOpsToROCDLOpsPass() = default;
  LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
                            bool useBarePtrCallConv,
                            gpu::amd::Runtime runtime) {
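    // Explicit command-line option occurrences take precedence over the
    // constructor arguments.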
    if (this->chipset.getNumOccurrences() == 0)
      this->chipset = chipset;
    if (this->indexBitwidth.getNumOccurrences() == 0)
      this->indexBitwidth = indexBitwidth;
    if (this->useBarePtrCallConv.getNumOccurrences() == 0)
      this->useBarePtrCallConv = useBarePtrCallConv;
    if (this->runtime.getNumOccurrences() == 0)
      this->runtime = runtime;
  }

  void getDependentDialects(DialectRegistry &registry) const override {
    Base::getDependentDialects(registry);
    registerConvertToLLVMDependentDialectLoading(registry);
  }

  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();
    MLIRContext *ctx = m.getContext();

    auto llvmDataLayout = m->getAttrOfType<StringAttr>(
        LLVM::LLVMDialect::getDataLayoutAttrName());
    if (!llvmDataLayout) {
      llvmDataLayout = StringAttr::get(ctx, amdgcnDataLayout);
      m->setAttr(LLVM::LLVMDialect::getDataLayoutAttrName(), llvmDataLayout);
    }
    // Request C wrapper emission.
    for (auto func : m.getOps<func::FuncOp>()) {
      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
                    UnitAttr::get(ctx));
    }

    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
    if (failed(maybeChipset)) {
      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
      return signalPassFailure();
    }

    // Customize the bitwidth used for the device side index computations.
    LowerToLLVMOptions options(
        ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
    options.dataLayout = llvm::DataLayout(llvmDataLayout.getValue());
    if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
      options.overrideIndexBitwidth(indexBitwidth);

    if (useBarePtrCallConv) {
      options.useBarePtrCallConv = true;
      WalkResult canUseBarePointers =
          m.walk([](gpu::GPUFuncOp func) -> WalkResult {
            if (canBeCalledWithBarePointers(func))
              return WalkResult::advance();
            return WalkResult::interrupt();
          });
      if (canUseBarePointers.wasInterrupted()) {
        emitError(UnknownLoc::get(ctx),
                  "bare pointer calling convention requires all memrefs to "
                  "have static shape and use the identity map");
        return signalPassFailure();
      }
    }

    // Apply in-dialect lowering. In-dialect lowering will replace
    // ops which need to be lowered further, which is not supported by a
    // single conversion pass.
    {
      RewritePatternSet patterns(ctx);
      populateGpuRewritePatterns(patterns);
      populateGpuPromoteShuffleToAMDGPUPatterns(patterns);
      (void)applyPatternsGreedily(m, std::move(patterns));
    }

    LLVMTypeConverter converter(ctx, options);
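    // Map GPU dialect memory spaces to the AMDGPU numeric address spaces
    // (1 = global, 3 = workgroup/LDS, 5 = private/scratch) declared in the
    // data layout above.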
    populateGpuMemorySpaceAttributeConversions(
        converter, [](gpu::AddressSpace space) {
          switch (space) {
          case gpu::AddressSpace::Global:
            return 1;
          case gpu::AddressSpace::Workgroup:
            return 3;
          case gpu::AddressSpace::Private:
            return 5;
          }
          llvm_unreachable("unknown address space enum value");
          return 0;
        });

    RewritePatternSet llvmPatterns(ctx);
    LLVMConversionTarget target(getContext());

    llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
                                                      allowedDialects.end());
    for (Dialect *dialect : ctx->getLoadedDialects()) {
      bool allowed = allowedDialectsSet.contains(dialect->getNamespace());
      // Empty `allowedDialectsSet` means all dialects are allowed.
      if (!allowedDialectsSet.empty() && !allowed)
        continue;

      auto iface = dyn_cast<ConvertToLLVMPatternInterface>(dialect);
      if (!iface) {
        // Error out if the dialect was explicitly specified but does not
        // implement the conversion interface.
        if (allowed) {
          m.emitError()
              << "dialect does not implement ConvertToLLVMPatternInterface: "
              << dialect->getNamespace();
          return signalPassFailure();
        }
        continue;
      }

      iface->populateConvertToLLVMConversionPatterns(target, converter,
                                                     llvmPatterns);
    }

    populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                            *maybeChipset);
    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
                                         *maybeChipset);
    configureGpuToROCDLConversionLegality(target);
    if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
      signalPassFailure();

    auto *rocdlDialect = getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
    auto reqdWorkGroupSizeAttrHelper =
        rocdlDialect->getReqdWorkGroupSizeAttrHelper();
    auto flatWorkGroupSizeAttrHelper =
        rocdlDialect->getFlatWorkGroupSizeAttrHelper();
    // Manually rewrite known block size attributes so the LLVMIR translation
    // infrastructure can pick them up.
    m.walk([&](LLVM::LLVMFuncOp op) {
      if (reqdWorkGroupSizeAttrHelper.isAttrPresent(op)) {
        auto blockSizes = reqdWorkGroupSizeAttrHelper.getAttr(op);
        // Also set up the rocdl.flat_work_group_size attribute to prevent
        // conflicting metadata.
        uint32_t flatSize = 1;
        for (uint32_t size : blockSizes.asArrayRef()) {
          flatSize *= size;
        }
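        // For example, reqd_work_group_size = [128, 2, 1] produces
        // rocdl.flat_work_group_size = "256,256".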
        StringAttr flatSizeAttr =
            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
        flatWorkGroupSizeAttrHelper.setAttr(op, flatSizeAttr);
      }
    });
  }
};

} // namespace

void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
  target.addIllegalOp<func::FuncOp>();
  target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
  target.addLegalDialect<ROCDL::ROCDLDialect>();
  target.addIllegalDialect<gpu::GPUDialect>();
  target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FCeilOp,
                      LLVM::FFloorOp, LLVM::FRemOp, LLVM::LogOp, LLVM::Log10Op,
                      LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
  // These ops are legal for f32 type.
  target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
    return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
  });
  // TODO: Remove once we support replacing non-root ops.
  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
}

void mlir::populateGpuToROCDLConversionPatterns(
    const LLVMTypeConverter &converter, RewritePatternSet &patterns,
    mlir::gpu::amd::Runtime runtime, amdgpu::Chipset chipset) {
  using gpu::index_lowering::IndexKind;
  using gpu::index_lowering::IntrType;
  using mlir::gpu::amd::Runtime;
  auto *rocdlDialect =
      converter.getContext().getLoadedDialect<ROCDL::ROCDLDialect>();
  populateWithGenerated(patterns);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                      ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
      converter, IndexKind::Block, IntrType::Id);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
      converter, IndexKind::Grid, IntrType::Id);
  patterns.add<
      gpu::index_lowering::OpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                      ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>>(
      converter, IndexKind::Block, IntrType::Dim);
  patterns.add<gpu::index_lowering::OpLowering<
      gpu::GridDimOp, ROCDL::GridDimXOp, ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
      converter, IndexKind::Grid, IntrType::Dim);
  patterns.add<GPUReturnOpLowering>(converter);
  patterns.add<GPUFuncOpLowering>(
      converter,
      GPUFuncOpLoweringOptions{
          /*allocaAddrSpace=*/ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace,
          /*workgroupAddrSpace=*/ROCDL::ROCDLDialect::kSharedMemoryAddressSpace,
          rocdlDialect->getKernelAttrHelper().getName(),
          rocdlDialect->getReqdWorkGroupSizeAttrHelper().getName()});
  if (Runtime::HIP == runtime) {
    patterns.add<GPUPrintfOpToHIPLowering>(converter);
  } else if (Runtime::OpenCL == runtime) {
    // Use address space = 4 to match the OpenCL definition of printf().
    patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
  }
  // TODO: Add alignment for workgroup memory.
  patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);

  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
  patterns.add<GPUSubgroupSizeOpToROCDL>(converter, chipset);

  populateMathToROCDLConversionPatterns(converter, patterns);
}

std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
                                      unsigned indexBitwidth,
                                      bool useBarePtrCallConv,
                                      gpu::amd::Runtime runtime) {
  return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
      chipset, indexBitwidth, useBarePtrCallConv, runtime);
}