| 1 | //===- AMDGPUDialect.cpp - MLIR AMDGPU dialect implementation --------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This file implements the AMDGPU dialect and its operations. |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" |
| 14 | |
| 15 | #include "mlir/Dialect/Arith/IR/Arith.h" |
| 16 | #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
| 17 | #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" |
| 18 | #include "mlir/Dialect/MemRef/Utils/MemRefUtils.h" |
| 19 | #include "mlir/IR/Builders.h" |
| 20 | #include "mlir/IR/BuiltinTypes.h" |
| 21 | #include "mlir/IR/Diagnostics.h" |
| 22 | #include "mlir/IR/DialectImplementation.h" |
| 23 | #include "mlir/IR/Matchers.h" |
| 24 | #include "mlir/IR/OpImplementation.h" |
| 25 | #include "mlir/IR/PatternMatch.h" |
| 26 | #include "mlir/IR/TypeUtilities.h" |
| 27 | #include "llvm/ADT/TypeSwitch.h" |
| 28 | |
| 29 | #include <limits> |
| 30 | #include <optional> |
| 31 | |
| 32 | using namespace mlir; |
| 33 | using namespace mlir::amdgpu; |
| 34 | |
| 35 | #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.cpp.inc" |
| 36 | |
// Registers every operation and attribute of the AMDGPU dialect. The actual
// lists are generated by TableGen from the ODS definitions and pulled in via
// the GET_OP_LIST / GET_ATTRDEF_LIST sections of the .cpp.inc files.
void AMDGPUDialect::initialize() {
  addOperations<
#define GET_OP_LIST
#include "mlir/Dialect/AMDGPU/IR/AMDGPU.cpp.inc"
      >();
  addAttributes<
#define GET_ATTRDEF_LIST
#include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.cpp.inc"
      >();
}
| 47 | |
| 48 | //===----------------------------------------------------------------------===// |
| 49 | // 8-bit float ops |
| 50 | //===----------------------------------------------------------------------===// |
| 51 | LogicalResult PackedTrunc2xFp8Op::verify() { |
| 52 | if (getExisting() && getExisting().getType() != getResult().getType()) |
| 53 | return emitOpError("existing values must have same type as result" ); |
| 54 | return success(); |
| 55 | } |
| 56 | |
| 57 | LogicalResult PackedStochRoundFp8Op::verify() { |
| 58 | if (getExisting() && getExisting().getType() != getResult().getType()) |
| 59 | return emitOpError("existing values must have same type as result" ); |
| 60 | return success(); |
| 61 | } |
| 62 | |
| 63 | //===----------------------------------------------------------------------===// |
| 64 | // FatRawBufferCastOp |
| 65 | //===----------------------------------------------------------------------===// |
| 66 | |
| 67 | /// Convert the type `source` to one with the same sizes and strides - and |
| 68 | /// offset, unless `stripOffset` is true, in which case the offset is reset to |
| 69 | /// 0, if the offset should be reset but the layout of `source` isn't either the |
| 70 | /// identity layout or a strided layout, this function fails. |
| 71 | static FailureOr<MemRefType> getFatRawBufferTypeLike(MemRefType source, |
| 72 | bool resetOffset) { |
| 73 | MLIRContext *ctx = source.getContext(); |
| 74 | MemRefType::Builder mb(source); |
| 75 | mb.setMemorySpace( |
| 76 | amdgpu::AddressSpaceAttr::get(ctx, amdgpu::AddressSpace::FatRawBuffer)); |
| 77 | MemRefLayoutAttrInterface layout = source.getLayout(); |
| 78 | if (resetOffset && !layout.isIdentity()) { |
| 79 | auto stridedLayout = dyn_cast<StridedLayoutAttr>(layout); |
| 80 | if (!stridedLayout) |
| 81 | return failure(); |
| 82 | mb.setLayout(StridedLayoutAttr::get(ctx, 0, stridedLayout.getStrides())); |
| 83 | } |
| 84 | return (MemRefType)(mb); |
| 85 | } |
| 86 | |
| 87 | LogicalResult FatRawBufferCastOp::inferReturnTypes( |
| 88 | MLIRContext *context, std::optional<Location> location, ValueRange operands, |
| 89 | DictionaryAttr attributes, OpaqueProperties properties, RegionRange regions, |
| 90 | SmallVectorImpl<Type> &inferredReturnTypes) { |
| 91 | Adaptor adaptor(operands, attributes, properties, regions); |
| 92 | auto sourceType = |
| 93 | dyn_cast_if_present<MemRefType>(adaptor.getSource().getType()); |
| 94 | if (!sourceType) |
| 95 | return failure(); |
| 96 | FailureOr<MemRefType> resultType = |
| 97 | getFatRawBufferTypeLike(sourceType, adaptor.getResetOffset()); |
| 98 | if (failed(resultType)) |
| 99 | return failure(); |
| 100 | inferredReturnTypes = SmallVector<Type>{*resultType}; |
| 101 | return success(); |
| 102 | } |
| 103 | |
| 104 | LogicalResult FatRawBufferCastOp::verify() { |
| 105 | FailureOr<MemRefType> expectedResultType = |
| 106 | getFatRawBufferTypeLike(getSource().getType(), getResetOffset()); |
| 107 | if (failed(expectedResultType)) |
| 108 | return emitOpError("source type " ) |
| 109 | << getSource().getType() << " can't have its offset reset" ; |
| 110 | if (getResult().getType() != *expectedResultType) |
| 111 | return emitOpError("expected result type to be " ) |
| 112 | << *expectedResultType << " but got " << getResult().getType(); |
| 113 | return success(); |
| 114 | } |
| 115 | |
| 116 | static bool hasGlobalMemorySpace(Attribute memorySpace) { |
| 117 | if (!memorySpace) |
| 118 | return true; |
| 119 | if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace)) |
| 120 | return intMemorySpace.getInt() == 0 || intMemorySpace.getInt() == 1; |
| 121 | if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace)) |
| 122 | return gpuMemorySpace.getValue() == gpu::AddressSpace::Global; |
| 123 | return false; |
| 124 | } |
| 125 | |
| 126 | static bool hasWorkgroupMemorySpace(Attribute memorySpace) { |
| 127 | if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace)) |
| 128 | return intMemorySpace.getInt() == 3; |
| 129 | if (auto gpuMemorySpace = dyn_cast<gpu::AddressSpaceAttr>(memorySpace)) |
| 130 | return gpuMemorySpace.getValue() == gpu::AddressSpace::Workgroup; |
| 131 | return false; |
| 132 | } |
| 133 | |
| 134 | static bool hasFatRawBufferMemorySpace(Attribute memorySpace) { |
| 135 | if (auto intMemorySpace = dyn_cast<IntegerAttr>(memorySpace)) |
| 136 | return intMemorySpace.getInt() == 7; |
| 137 | if (auto gpuMemorySpace = dyn_cast<amdgpu::AddressSpaceAttr>(memorySpace)) |
| 138 | return gpuMemorySpace.getValue() == amdgpu::AddressSpace::FatRawBuffer; |
| 139 | return false; |
| 140 | } |
| 141 | |
| 142 | //===----------------------------------------------------------------------===// |
| 143 | // RawBuffer*Op |
| 144 | //===----------------------------------------------------------------------===// |
| 145 | template <typename T> |
| 146 | static LogicalResult verifyRawBufferOp(T &op) { |
| 147 | MemRefType bufferType = llvm::cast<MemRefType>(op.getMemref().getType()); |
| 148 | bool isGlobal = hasGlobalMemorySpace(bufferType.getMemorySpace()); |
| 149 | |
| 150 | if (!isGlobal) |
| 151 | return op.emitOpError( |
| 152 | "Buffer ops must operate on a memref in global memory" ); |
| 153 | if (!bufferType.hasRank()) |
| 154 | return op.emitOpError( |
| 155 | "Cannot meaningfully buffer_store to an unranked memref" ); |
| 156 | if (static_cast<int64_t>(op.getIndices().size()) != bufferType.getRank()) |
| 157 | return op.emitOpError("Expected " + Twine(bufferType.getRank()) + |
| 158 | " indices to memref" ); |
| 159 | return success(); |
| 160 | } |
| 161 | |
| 162 | LogicalResult RawBufferLoadOp::verify() { return verifyRawBufferOp(*this); } |
| 163 | |
| 164 | LogicalResult RawBufferStoreOp::verify() { return verifyRawBufferOp(*this); } |
| 165 | |
| 166 | LogicalResult RawBufferAtomicFaddOp::verify() { |
| 167 | return verifyRawBufferOp(*this); |
| 168 | } |
| 169 | |
| 170 | LogicalResult RawBufferAtomicFmaxOp::verify() { |
| 171 | return verifyRawBufferOp(*this); |
| 172 | } |
| 173 | |
| 174 | LogicalResult RawBufferAtomicSmaxOp::verify() { |
| 175 | return verifyRawBufferOp(*this); |
| 176 | } |
| 177 | |
| 178 | LogicalResult RawBufferAtomicUminOp::verify() { |
| 179 | return verifyRawBufferOp(*this); |
| 180 | } |
| 181 | |
| 182 | LogicalResult RawBufferAtomicCmpswapOp::verify() { |
| 183 | return verifyRawBufferOp(*this); |
| 184 | } |
| 185 | |
| 186 | static std::optional<uint32_t> getConstantUint32(Value v) { |
| 187 | APInt cst; |
| 188 | if (!v.getType().isInteger(width: 32)) |
| 189 | return std::nullopt; |
| 190 | if (matchPattern(v, m_ConstantInt(&cst))) |
| 191 | return cst.getZExtValue(); |
| 192 | return std::nullopt; |
| 193 | } |
| 194 | |
| 195 | template <typename OpType> |
| 196 | static bool staticallyOutOfBounds(OpType op) { |
| 197 | if (!op.getBoundsCheck()) |
| 198 | return false; |
| 199 | MemRefType bufferType = op.getMemref().getType(); |
| 200 | if (!bufferType.hasStaticShape()) |
| 201 | return false; |
| 202 | int64_t offset; |
| 203 | SmallVector<int64_t> strides; |
| 204 | if (failed(bufferType.getStridesAndOffset(strides, offset))) |
| 205 | return false; |
| 206 | int64_t result = offset + op.getIndexOffset().value_or(0); |
| 207 | if (op.getSgprOffset()) { |
| 208 | std::optional<uint32_t> sgprOffset = getConstantUint32(op.getSgprOffset()); |
| 209 | if (!sgprOffset) |
| 210 | return false; |
| 211 | result += *sgprOffset; |
| 212 | } |
| 213 | if (strides.size() != op.getIndices().size()) |
| 214 | return false; |
| 215 | int64_t indexVal = 0; |
| 216 | for (auto pair : llvm::zip(strides, op.getIndices())) { |
| 217 | int64_t stride = std::get<0>(pair); |
| 218 | Value idx = std::get<1>(pair); |
| 219 | std::optional<uint32_t> idxVal = getConstantUint32(v: idx); |
| 220 | if (!idxVal) |
| 221 | return false; |
| 222 | indexVal += stride * *idxVal; |
| 223 | } |
| 224 | result += indexVal; |
| 225 | if (result > std::numeric_limits<uint32_t>::max()) |
| 226 | // Overflow means don't drop |
| 227 | return false; |
| 228 | return result >= bufferType.getNumElements(); |
| 229 | } |
| 230 | |
| 231 | namespace { |
| 232 | template <typename OpType> |
| 233 | struct RemoveStaticallyOobBufferLoads final : public OpRewritePattern<OpType> { |
| 234 | using OpRewritePattern<OpType>::OpRewritePattern; |
| 235 | |
| 236 | LogicalResult matchAndRewrite(OpType op, PatternRewriter &rw) const override { |
| 237 | if (!staticallyOutOfBounds(op)) |
| 238 | return failure(); |
| 239 | Type loadType = op.getResult().getType(); |
| 240 | rw.replaceOpWithNewOp<arith::ConstantOp>(op, loadType, |
| 241 | rw.getZeroAttr(loadType)); |
| 242 | return success(); |
| 243 | } |
| 244 | }; |
| 245 | |
| 246 | template <typename OpType> |
| 247 | struct RemoveStaticallyOobBufferWrites final : public OpRewritePattern<OpType> { |
| 248 | using OpRewritePattern<OpType>::OpRewritePattern; |
| 249 | |
| 250 | LogicalResult matchAndRewrite(OpType op, PatternRewriter &rw) const override { |
| 251 | if (!staticallyOutOfBounds(op)) |
| 252 | return failure(); |
| 253 | |
| 254 | rw.eraseOp(op); |
| 255 | return success(); |
| 256 | } |
| 257 | }; |
| 258 | } // end namespace |
| 259 | |
| 260 | void RawBufferLoadOp::getCanonicalizationPatterns(RewritePatternSet &results, |
| 261 | MLIRContext *context) { |
| 262 | results.add<RemoveStaticallyOobBufferLoads<RawBufferLoadOp>>(context); |
| 263 | } |
| 264 | |
| 265 | void RawBufferStoreOp::getCanonicalizationPatterns(RewritePatternSet &results, |
| 266 | MLIRContext *context) { |
| 267 | results.add<RemoveStaticallyOobBufferWrites<RawBufferStoreOp>>(context); |
| 268 | } |
| 269 | |
| 270 | void RawBufferAtomicFaddOp::getCanonicalizationPatterns( |
| 271 | RewritePatternSet &results, MLIRContext *context) { |
| 272 | results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicFaddOp>>(context); |
| 273 | } |
| 274 | |
| 275 | void RawBufferAtomicFmaxOp::getCanonicalizationPatterns( |
| 276 | RewritePatternSet &results, MLIRContext *context) { |
| 277 | results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicFmaxOp>>(context); |
| 278 | } |
| 279 | |
| 280 | void RawBufferAtomicSmaxOp::getCanonicalizationPatterns( |
| 281 | RewritePatternSet &results, MLIRContext *context) { |
| 282 | results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicSmaxOp>>(context); |
| 283 | } |
| 284 | |
| 285 | void RawBufferAtomicUminOp::getCanonicalizationPatterns( |
| 286 | RewritePatternSet &results, MLIRContext *context) { |
| 287 | results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicUminOp>>(context); |
| 288 | } |
| 289 | |
| 290 | void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns( |
| 291 | RewritePatternSet &results, MLIRContext *context) { |
| 292 | results.add<RemoveStaticallyOobBufferLoads<RawBufferAtomicCmpswapOp>>( |
| 293 | context); |
| 294 | } |
| 295 | |
| 296 | //===----------------------------------------------------------------------===// |
| 297 | // WMMAOp |
| 298 | //===----------------------------------------------------------------------===// |
| 299 | LogicalResult WMMAOp::verify() { |
| 300 | Type sourceAType = getSourceA().getType(); |
| 301 | Type sourceBType = getSourceB().getType(); |
| 302 | Type destType = getDestC().getType(); |
| 303 | |
| 304 | VectorType sourceVectorAType = dyn_cast<VectorType>(sourceAType); |
| 305 | VectorType sourceVectorBType = dyn_cast<VectorType>(sourceBType); |
| 306 | VectorType destVectorType = dyn_cast<VectorType>(destType); |
| 307 | |
| 308 | Type sourceAElemType = sourceVectorAType.getElementType(); |
| 309 | Type sourceBElemType = sourceVectorBType.getElementType(); |
| 310 | Type destElemType = destVectorType.getElementType(); |
| 311 | |
| 312 | if (sourceVectorAType.getNumElements() != |
| 313 | sourceVectorBType.getNumElements()) { |
| 314 | return emitOpError("source vectors have different lengths: " ) |
| 315 | << sourceVectorAType << " vs. " << sourceVectorBType; |
| 316 | } |
| 317 | |
| 318 | bool isDestFloat = isa<Float32Type, Float16Type, BFloat16Type>(destElemType); |
| 319 | bool isSrcFloat = |
| 320 | isa<Float16Type, BFloat16Type, Float8E4M3FNType, Float8E5M2Type>( |
| 321 | sourceAElemType); |
| 322 | |
| 323 | if (isDestFloat && !isSrcFloat) { |
| 324 | return emitOpError("Expected float sources with float destination" ); |
| 325 | } |
| 326 | |
| 327 | if (!isDestFloat && isSrcFloat) { |
| 328 | return emitOpError("Expected int sources with int destination" ); |
| 329 | } |
| 330 | |
| 331 | if (sourceAElemType != sourceBElemType && |
| 332 | !(isa<Float8E5M2Type, Float8E4M3FNType>(sourceAElemType) && |
| 333 | isa<Float8E5M2Type, Float8E4M3FNType>(sourceBElemType))) { |
| 334 | return emitOpError( |
| 335 | "source element types much match (except for fp8) but have " ) |
| 336 | << sourceAType << " and " << sourceBType; |
| 337 | } |
| 338 | return success(); |
| 339 | } |
| 340 | |
| 341 | //===----------------------------------------------------------------------===// |
| 342 | // MFMAOp |
| 343 | //===----------------------------------------------------------------------===// |
// Verifies an MFMA op: operand element-type compatibility, that the source
// and destination vector lengths match the per-lane element counts implied by
// the (M, N, K, blocks) shape over a 64-lane wave, and that the lane-permute
// and negation modifiers are only used where the hardware supports them.
LogicalResult MFMAOp::verify() {
  constexpr uint32_t waveSize = 64;
  Builder b(getContext());

  Type sourceType = getSourceA().getType();
  Type destType = getDestC().getType();

  // Scalars are treated as length-1 vectors of themselves.
  Type sourceElem = sourceType, destElem = destType;
  uint32_t sourceLen = 1, destLen = 1;
  if (auto sourceVector = llvm::dyn_cast<VectorType>(sourceType)) {
    sourceLen = sourceVector.getNumElements();
    sourceElem = sourceVector.getElementType();
  }
  if (auto destVector = llvm::dyn_cast<VectorType>(destType)) {
    destLen = destVector.getNumElements();
    destElem = destVector.getElementType();
  }

  Type sourceBType = getSourceB().getType();
  // Small-float (fp8/fp6/fp4) sources may mix element types between A and B,
  // but both must be small-float and have equal lengths; all other element
  // types require A and B to match exactly.
  if (sourceElem.isFloat(8) || sourceElem.isFloat(6) || sourceElem.isFloat(4)) {
    int64_t sourceBLen = 1;
    Type sourceBElem = sourceBType;
    if (auto sourceBVector = llvm::dyn_cast<VectorType>(sourceBType)) {
      sourceBLen = sourceBVector.getNumElements();
      sourceBElem = sourceBVector.getElementType();
    }
    if (!sourceBElem.isFloat(8) && !sourceBElem.isFloat(6) &&
        !sourceBElem.isFloat(4))
      return emitOpError("expected both source operands to have small-float "
                         "elements if one does");
    if (sourceLen != sourceBLen)
      return emitOpError(
          "expected both small-float source vectors to have the same length");
  } else {
    if (sourceType != sourceBType)
      return emitOpError("expected both non-small-float source operand types "
                         "to match exactly");
  }
  // Normalize the wider integer types the compiler expects to i8
  // (an iN source counts as N/8 i8 lanes for the length checks below).
  if (sourceElem.isInteger(32)) {
    sourceLen *= 4;
    sourceElem = b.getI8Type();
  }
  if (sourceElem.isInteger(64)) {
    sourceLen *= 8;
    sourceElem = b.getI8Type();
  }

  // Each lane of the wave supplies (M*K*blocks)/64 source elements.
  int64_t numSourceElems = (getM() * getK() * getBlocks()) / waveSize;
  if (sourceLen != numSourceElems)
    return emitOpError("expected " + Twine(numSourceElems) +
                       " source values for this operation but got " +
                       Twine(sourceLen));

  // Each lane of the wave produces (M*N*blocks)/64 result elements.
  int64_t numDestElems = (getM() * getN() * getBlocks()) / waveSize;
  if (destLen != numDestElems)
    return emitOpError("expected " + Twine(numDestElems) +
                       " result values for this operation but got " +
                       Twine(destLen));

  // f64 MFMAs support neither lane permutes (blgp/cbsz) nor anything but the
  // negation flags; everything else is the reverse.
  if (destElem.isF64() && getBlgp() != MFMAPermB::none)
    return emitOpError(
        "double-precision ops do not support permuting lanes of B");
  if (destElem.isF64() && getCbsz() != 0)
    return emitOpError(
        "double-precision ops do not support permuting lanes of A");
  if (getAbid() >= (1u << getCbsz()))
    return emitOpError(
        "block ID for permuting A (abid) must be below 2 ** cbsz");

  if ((getNegateA() || getNegateB() || getNegateC()) && !destElem.isF64())
    return emitOpError(
        "negation flags only available for double-precision operations");

  return success();
}
| 420 | |
| 421 | //===----------------------------------------------------------------------===// |
| 422 | // DPPOp |
| 423 | //===----------------------------------------------------------------------===// |
// Verifies a DPP (data-parallel primitive) op: the source must fit in 64
// bits, and the optional `permArgument` attribute must have the shape each
// permutation kind requires (a 4-element array for quad_perm, an integer in
// [1, 15] for the row shift/rotate kinds, and nothing — or a unit attribute —
// for the remaining kinds).
LogicalResult DPPOp::verify() {
  Type srcType = getSrc().getType();
  if (srcType.getIntOrFloatBitWidth() > 64) {
    return emitOpError("integer and floating point types larger than 64 bits "
                       "are not supported");
  }

  DPPPerm kind = getKind();
  // Absent attribute is normalized to a null Attribute for the checks below.
  Attribute permArgument = getPermArgument().value_or(Attribute{});

  switch (kind) {

  case DPPPerm::quad_perm: {
    // quad_perm takes an array of exactly four lane selectors, each in [0, 3].
    auto quadPermAttr = dyn_cast_or_null<ArrayAttr>(permArgument);
    if (!quadPermAttr || quadPermAttr.size() != 4) {
      return emitOpError("quad_perm attribute must have exactly 4 elements");
    }
    for (auto elem : quadPermAttr.getAsRange<IntegerAttr>()) {
      int32_t num = elem.getInt();
      if (num < 0 || num > 3) {
        return emitOpError(
            "Each element of quad_perm must be in the range [0, 3]");
      }
    }
  } break;

  case DPPPerm::row_shl:
  case DPPPerm::row_shr:
  case DPPPerm::row_ror: {
    // Row shifts/rotates require an integer distance between 1 and 15.
    // NOTE(review): a present-but-non-integer attribute passes silently here —
    // presumably rejected elsewhere (e.g. by the ODS attribute type); confirm.
    if (!permArgument) {
      return emitOpError("Attribute '" + Twine(stringifyDPPPerm(kind)) +
                         "' value not specified");
    }
    if (auto intAttr = dyn_cast<IntegerAttr>(permArgument)) {
      uint32_t attrValue = intAttr.getInt();
      if (attrValue < 1 || attrValue > 15) {
        return emitOpError("Attribute value must be between 1 and 15");
      }
    }
  } break;

  case DPPPerm::wave_shl:
  case DPPPerm::wave_shr:
  case DPPPerm::wave_rol:
  case DPPPerm::wave_ror:
  case DPPPerm::row_mirror:
  case DPPPerm::row_half_mirror:
  case DPPPerm::row_bcast_15:
  case DPPPerm::row_bcast_31: {
    // These kinds are parameterless: only absent or a unit attribute is valid.
    if (permArgument && !isa<UnitAttr>(permArgument)) {
      return emitOpError("Expected unit attribute for permArgument, but found "
                         "non-trivial argument");
    }
    break;
  }
  }
  return success();
}
| 482 | |
| 483 | LogicalResult GatherToLDSOp::verify() { |
| 484 | MemRefType srcType = cast<MemRefType>(getSrc().getType()); |
| 485 | MemRefType dstType = cast<MemRefType>(getDst().getType()); |
| 486 | |
| 487 | if (!dstType.areTrailingDimsContiguous(dstType.getRank())) |
| 488 | return emitOpError("destination types must be contiguous" ); |
| 489 | |
| 490 | auto elemType = srcType.getElementType(); |
| 491 | // Check $src and $dst element types are the same. |
| 492 | if (elemType != dstType.getElementType()) |
| 493 | return emitOpError("source and destination element types must match" ); |
| 494 | |
| 495 | // copy type sizes should be 1, 2, or 4 bytes. |
| 496 | auto transferType = getTransferType(); |
| 497 | size_t transferSize; |
| 498 | if (auto vectorTransfer = dyn_cast<VectorType>(transferType)) { |
| 499 | transferSize = vectorTransfer.getNumElements() * |
| 500 | vectorTransfer.getElementTypeBitWidth(); |
| 501 | } else { |
| 502 | transferSize = transferType.getIntOrFloatBitWidth(); |
| 503 | } |
| 504 | if (transferSize != 8 && transferSize != 16 && transferSize != 32) |
| 505 | return emitOpError("Transfering type size must be 8, 16, or 32 bits" ); |
| 506 | |
| 507 | if (!hasGlobalMemorySpace(srcType.getMemorySpace()) && |
| 508 | !hasFatRawBufferMemorySpace(srcType.getMemorySpace())) |
| 509 | return emitOpError( |
| 510 | "source memory address space must be global or fat raw buffer" ); |
| 511 | |
| 512 | if (!hasWorkgroupMemorySpace(dstType.getMemorySpace())) |
| 513 | return emitOpError("destination memory address space must be Workgroup" ); |
| 514 | |
| 515 | return success(); |
| 516 | } |
| 517 | |
| 518 | #include "mlir/Dialect/AMDGPU/IR/AMDGPUEnums.cpp.inc" |
| 519 | |
| 520 | #define GET_ATTRDEF_CLASSES |
| 521 | #include "mlir/Dialect/AMDGPU/IR/AMDGPUAttributes.cpp.inc" |
| 522 | |
| 523 | #define GET_OP_CLASSES |
| 524 | #include "mlir/Dialect/AMDGPU/IR/AMDGPU.cpp.inc" |
| 525 | |