//===- EmulateAtomics.cpp - Emulate unsupported AMDGPU atomics ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
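//
// This file implements a pass that emulates AMDGPU raw buffer atomic
// read-modify-write operations that a target chipset does not support
// natively by rewriting them into compare-and-swap (CAS) loops built from
// amdgpu.raw_buffer_atomic_cmpswap.
//
//===----------------------------------------------------------------------===//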

#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"

#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

namespace mlir::amdgpu {
#define GEN_PASS_DEF_AMDGPUEMULATEATOMICSPASS
#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
} // namespace mlir::amdgpu

using namespace mlir;
using namespace mlir::amdgpu;

namespace {
struct AmdgpuEmulateAtomicsPass
    : public amdgpu::impl::AmdgpuEmulateAtomicsPassBase<
          AmdgpuEmulateAtomicsPass> {
  using AmdgpuEmulateAtomicsPassBase<
      AmdgpuEmulateAtomicsPass>::AmdgpuEmulateAtomicsPassBase;
  void runOnOperation() override;
};

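/// Rewrites an atomic RMW op of type `AtomicOp` into a loop that loads the
/// current value, combines it with the atomic's data operand using `ArithOp`,
/// and retries via raw_buffer_atomic_cmpswap until the swap succeeds.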
template <typename AtomicOp, typename ArithOp>
struct RawBufferAtomicByCasPattern : public OpConversionPattern<AtomicOp> {
  using OpConversionPattern<AtomicOp>::OpConversionPattern;
  using Adaptor = typename AtomicOp::Adaptor;

  LogicalResult
  matchAndRewrite(AtomicOp atomicOp, Adaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override;
};
} // namespace

namespace {
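/// What to do with the data operand (the first operand) of the original
/// atomic when transplanting its attributes onto a new op: Drop removes it
/// (the plain load takes no data), while Duplicate repeats it (cmpswap takes
/// both a source value and a value to compare against).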
enum class DataArgAction : unsigned char {
  Duplicate,
  Drop,
};
} // namespace

// When we migrate from a general buffer atomic to a load or to a CAS, the
// number of operands, and thus the number of entries needed in
// operandSegmentSizes, changes. We patch the attribute by hand because we'd
// like to preserve unknown attributes on the atomic instead of discarding
// them.
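// For example, with a hypothetical layout of [1, 1, 4, 0] (one data value,
// one memref, four indices, no optional offset), Drop would produce
// [1, 4, 0] for the load, while Duplicate would produce [1, 1, 1, 4, 0] so
// that the cmpswap sees two leading data operands.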
static void patchOperandSegmentSizes(ArrayRef<NamedAttribute> attrs,
                                     SmallVectorImpl<NamedAttribute> &newAttrs,
                                     DataArgAction action) {
  newAttrs.reserve(attrs.size());
  for (NamedAttribute attr : attrs) {
    if (attr.getName().getValue() != "operandSegmentSizes") {
      newAttrs.push_back(attr);
      continue;
    }
    auto segmentAttr = cast<DenseI32ArrayAttr>(attr.getValue());
    MLIRContext *context = segmentAttr.getContext();
    DenseI32ArrayAttr newSegments;
    switch (action) {
    case DataArgAction::Drop:
      newSegments = DenseI32ArrayAttr::get(
          context, segmentAttr.asArrayRef().drop_front());
      break;
    case DataArgAction::Duplicate: {
      SmallVector<int32_t> newVals;
      ArrayRef<int32_t> oldVals = segmentAttr.asArrayRef();
      newVals.push_back(oldVals[0]);
      newVals.append(oldVals.begin(), oldVals.end());
      newSegments = DenseI32ArrayAttr::get(context, newVals);
      break;
    }
    }
    newAttrs.push_back(NamedAttribute(attr.getName(), newSegments));
  }
}

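// The rewrite turns one atomic RMW into a retry loop; sketched in IR, with
// illustrative value names:
//
//   %init = amdgpu.raw_buffer_load ...           (original args, minus data)
//   cf.br ^loop(%init)
// ^loop(%prev):
//   %new = <ArithOp> %data, %prev
//   %seen = amdgpu.raw_buffer_atomic_cmpswap %new, %prev, ...
//   %done = arith.cmpi eq, %seen, %prev          (bitwise, via bitcasts)
//   cf.cond_br %done, ^after, ^loop(%seen)
// ^after: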
template <typename AtomicOp, typename ArithOp>
LogicalResult RawBufferAtomicByCasPattern<AtomicOp, ArithOp>::matchAndRewrite(
    AtomicOp atomicOp, Adaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  Location loc = atomicOp.getLoc();

  ArrayRef<NamedAttribute> origAttrs = atomicOp->getAttrs();
  ValueRange operands = adaptor.getOperands();
  Value data = operands.take_front()[0];
  ValueRange invariantArgs = operands.drop_front();
  Type dataType = data.getType();

  SmallVector<NamedAttribute> loadAttrs;
  patchOperandSegmentSizes(origAttrs, loadAttrs, DataArgAction::Drop);
  Value initialLoad =
      rewriter.create<RawBufferLoadOp>(loc, dataType, invariantArgs, loadAttrs);
  Block *currentBlock = rewriter.getInsertionBlock();
  Block *afterAtomic =
      rewriter.splitBlock(currentBlock, rewriter.getInsertionPoint());
  Block *loopBlock = rewriter.createBlock(afterAtomic, {dataType}, {loc});

  rewriter.setInsertionPointToEnd(currentBlock);
  rewriter.create<cf::BranchOp>(loc, loopBlock, initialLoad);

  rewriter.setInsertionPointToEnd(loopBlock);
  Value prevLoad = loopBlock->getArgument(0);
  Value operated = rewriter.create<ArithOp>(loc, data, prevLoad);

  SmallVector<NamedAttribute> cmpswapAttrs;
  patchOperandSegmentSizes(origAttrs, cmpswapAttrs, DataArgAction::Duplicate);
  SmallVector<Value> cmpswapArgs = {operated, prevLoad};
  cmpswapArgs.append(invariantArgs.begin(), invariantArgs.end());
  Value atomicRes = rewriter.create<RawBufferAtomicCmpswapOp>(
      loc, dataType, cmpswapArgs, cmpswapAttrs);

  // We care about exact bitwise equality here, so do some bitcasts.
  // These will fold away during lowering to the ROCDL dialect, where
  // an int->float bitcast is introduced to account for the fact that cmpswap
  // only takes integer arguments.

  Value prevLoadForCompare = prevLoad;
  Value atomicResForCompare = atomicRes;
  if (auto floatDataTy = dyn_cast<FloatType>(dataType)) {
    Type equivInt = rewriter.getIntegerType(floatDataTy.getWidth());
    prevLoadForCompare =
        rewriter.create<arith::BitcastOp>(loc, equivInt, prevLoad);
    atomicResForCompare =
        rewriter.create<arith::BitcastOp>(loc, equivInt, atomicRes);
  }
  Value canLeave = rewriter.create<arith::CmpIOp>(
      loc, arith::CmpIPredicate::eq, atomicResForCompare, prevLoadForCompare);
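  // If cmpswap returned the value we last observed, no other thread got in
  // between and we can leave the loop; otherwise, retry using the value that
  // was actually in memory as the new "previous" value.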
  rewriter.create<cf::CondBranchOp>(loc, canLeave, afterAtomic, ValueRange{},
                                    loopBlock, atomicRes);
  rewriter.eraseOp(atomicOp);
  return success();
}

void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns(
    ConversionTarget &target, RewritePatternSet &patterns, Chipset chipset) {
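  // Note: the minor version in this Chipset encoding is hexadecimal, so
  // gfx908 is {9, 0x08}, gfx90a is {9, 0x0a}, and gfx941 is {9, 0x41}.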
  // gfx10 has no buffer atomic fadd, and neither does anything older than
  // gfx908.
  if (chipset.majorVersion == 10 || chipset.majorVersion < 9 ||
      (chipset.majorVersion == 9 && chipset.minorVersion < 0x08)) {
    target.addIllegalOp<RawBufferAtomicFaddOp>();
  }
  // gfx9 has no, or at best very limited, support for floating-point min
  // and max.
  if (chipset.majorVersion == 9) {
    if (chipset.minorVersion >= 0x0a && chipset.minorVersion != 0x41) {
      // gfx90a supports f64 max (and min, but we don't have a min wrapper
      // right now) but all other types need to be emulated.
      target.addDynamicallyLegalOp<RawBufferAtomicFmaxOp>(
          [](RawBufferAtomicFmaxOp op) -> bool {
            return op.getValue().getType().isF64();
          });
    } else {
      target.addIllegalOp<RawBufferAtomicFmaxOp>();
    }
    if (chipset.minorVersion == 0x41) {
      // gfx941 requires non-CAS atomics to be implemented with CAS loops.
      // The workaround here mirrors HIP and OpenMP.
      target.addIllegalOp<RawBufferAtomicFaddOp, RawBufferAtomicFmaxOp,
                          RawBufferAtomicSmaxOp, RawBufferAtomicUminOp>();
    }
  }
  patterns.add<
      RawBufferAtomicByCasPattern<RawBufferAtomicFaddOp, arith::AddFOp>,
      RawBufferAtomicByCasPattern<RawBufferAtomicFmaxOp, arith::MaximumFOp>,
      RawBufferAtomicByCasPattern<RawBufferAtomicSmaxOp, arith::MaxSIOp>,
      RawBufferAtomicByCasPattern<RawBufferAtomicUminOp, arith::MinUIOp>>(
      patterns.getContext());
}

void AmdgpuEmulateAtomicsPass::runOnOperation() {
  Operation *op = getOperation();
  FailureOr<Chipset> maybeChipset = Chipset::parse(chipset);
  if (failed(maybeChipset)) {
    emitError(op->getLoc(), "Invalid chipset name: " + chipset);
    return signalPassFailure();
  }

  MLIRContext &ctx = getContext();
  ConversionTarget target(ctx);
  RewritePatternSet patterns(&ctx);
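  // All ops are legal by default; populateAmdgpuEmulateAtomicsPatterns()
  // marks only the unsupported atomics illegal, so the partial conversion
  // rewrites nothing else.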
  target.markUnknownOpDynamicallyLegal(
      [](Operation *op) -> bool { return true; });

  populateAmdgpuEmulateAtomicsPatterns(target, patterns, *maybeChipset);
  if (failed(applyPartialConversion(op, target, std::move(patterns))))
    return signalPassFailure();
}

source code of mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp