1//===- Utils.cpp - Utils for GPU transform ops ----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "mlir/Dialect/GPU/TransformOps/Utils.h"
10
11#include "mlir/Dialect/Affine/IR/AffineOps.h"
12#include "mlir/Dialect/Arith/IR/Arith.h"
13#include "mlir/Dialect/GPU/IR/GPUDialect.h"
14#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
15#include "mlir/Dialect/MemRef/IR/MemRef.h"
16#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
17#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
18#include "mlir/Dialect/Utils/IndexingUtils.h"
19#include "mlir/Dialect/Vector/IR/VectorOps.h"
20#include "mlir/IR/AffineExpr.h"
21#include "mlir/IR/Builders.h"
22#include "mlir/IR/BuiltinAttributes.h"
23#include "mlir/IR/MLIRContext.h"
24#include "mlir/IR/OpDefinition.h"
25#include "mlir/IR/Value.h"
26#include "mlir/IR/Visitors.h"
27#include "mlir/Support/LLVM.h"
28#include "llvm/ADT/STLExtras.h"
29#include "llvm/ADT/SmallVector.h"
30#include "llvm/Support/Debug.h"
31#include "llvm/Support/InterleavedRange.h"
32
33using namespace mlir;
34using namespace mlir::gpu;
35using namespace mlir::transform;
36using namespace mlir::transform::gpu;
37
38#define DEBUG_TYPE "gpu-transforms"
39
40#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
41#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
42#define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")
43
44/// Build predicates to filter execution by only the activeIds. Along each
45/// dimension, 3 cases appear:
46/// 1. activeMappingSize > availableMappingSize: this is an unsupported case
47/// as this requires additional looping. An error message is produced to
48/// advise the user to tile more or to use more threads.
49/// 2. activeMappingSize == availableMappingSize: no predication is needed.
50/// 3. activeMappingSize < availableMappingSize: only a subset of threads
51/// should be active and we produce the boolean `id < activeMappingSize`
52/// for further use in building predicated execution.
53static FailureOr<SmallVector<Value>>
54buildPredicates(RewriterBase &rewriter, Location loc, ArrayRef<Value> activeIds,
55 ArrayRef<int64_t> activeMappingSizes,
56 ArrayRef<int64_t> availableMappingSizes,
57 std::string &errorMsg) {
58 // clang-format off
59 LLVM_DEBUG(
60 llvm::interleaveComma(
61 activeMappingSizes, DBGS() << "----activeMappingSizes: ");
62 DBGS() << "\n";
63 llvm::interleaveComma(
64 availableMappingSizes, DBGS() << "----availableMappingSizes: ");
65 DBGS() << "\n";);
66 // clang-format on
67
68 SmallVector<Value> predicateOps;
69 for (auto [activeId, activeMappingSize, availableMappingSize] :
70 llvm::zip_equal(t&: activeIds, u&: activeMappingSizes, args&: availableMappingSizes)) {
71 if (activeMappingSize > availableMappingSize) {
72 errorMsg = "Trying to map to fewer GPU threads than loop iterations but "
73 "overprovisioning is not yet supported. Try additional tiling "
74 "before mapping or map to more threads.";
75 return failure();
76 }
77 if (activeMappingSize == availableMappingSize)
78 continue;
79 Value idx = rewriter.create<arith::ConstantIndexOp>(location: loc, args: activeMappingSize);
80 Value pred = rewriter.create<arith::CmpIOp>(location: loc, args: arith::CmpIPredicate::ult,
81 args: activeId, args&: idx);
82 predicateOps.push_back(Elt: pred);
83 }
84 return predicateOps;
85}
86
87/// Return a flattened thread id for the workgroup with given sizes.
88template <typename ThreadOrBlockIdOp>
89static Value buildLinearId(RewriterBase &rewriter, Location loc,
90 ArrayRef<OpFoldResult> originalBasisOfr) {
91 LLVM_DEBUG(llvm::interleaveComma(
92 originalBasisOfr,
93 DBGS() << "----buildLinearId with originalBasisOfr: ");
94 llvm::dbgs() << "\n");
95 assert(originalBasisOfr.size() == 3 && "expected 3 sizes");
96 IndexType indexType = rewriter.getIndexType();
97 AffineExpr tx, ty, tz, bdx, bdy;
98 bindDims(ctx: rewriter.getContext(), exprs&: tx, exprs&: ty, exprs&: tz);
99 bindSymbols(ctx: rewriter.getContext(), exprs&: bdx, exprs&: bdy);
100 SmallVector<OpFoldResult> vals{
101 rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x)
102 .getResult(),
103 rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y)
104 .getResult(),
105 rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)
106 .getResult(),
107 originalBasisOfr[0], originalBasisOfr[1]};
108 OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
109 b&: rewriter, loc, expr: tx + ty * bdx + tz * bdx * bdy, operands: vals);
110 return getValueOrCreateConstantIndexOp(b&: rewriter, loc, ofr);
111}
112
113/// Create a linear id builder that takes the `originalBasisOfr` and decompose
114/// it in the basis of `forallMappingSizes`. The linear id builder returns an
115/// n-D vector of ids for indexing and 1-D size + id for predicate generation.
116template <typename ThreadOrBlockIdOp>
117static GpuIdBuilderFnType
118commonLinearIdBuilderFn(int64_t multiplicity = 1,
119 DeviceMaskingAttrInterface mask = nullptr) {
120 auto res = [multiplicity, mask](RewriterBase &rewriter, Location loc,
121 ArrayRef<int64_t> forallMappingSizes,
122 ArrayRef<int64_t> originalBasis) {
123 // 0. Early-exit mask case.
124 if (mask) {
125 if (computeProduct(basis: originalBasis) >
126 mask.getMaxNumPhysicalIds() * multiplicity) {
127 return IdBuilderResult{
128 /*errorMsg=*/std::string(
129 "mask representation too short to capture all physical ids: ") +
130 std::to_string(val: mask.getMaxNumPhysicalIds()),
131 /*mappingIdOps=*/{},
132 /*predicateOps=*/{}};
133 }
134 }
135
136 // 1. Compute linearId.
137 SmallVector<OpFoldResult> originalBasisOfr =
138 getAsIndexOpFoldResult(ctx: rewriter.getContext(), values: originalBasis);
139 Value physicalLinearId =
140 buildLinearId<ThreadOrBlockIdOp>(rewriter, loc, originalBasisOfr);
141
142 // 2. Compute scaledLinearId.
143 AffineExpr d0 = getAffineDimExpr(position: 0, context: rewriter.getContext());
144 OpFoldResult scaledLinearIdOfr = affine::makeComposedFoldedAffineApply(
145 b&: rewriter, loc, expr: d0.floorDiv(v: multiplicity), operands: {physicalLinearId});
146
147 // 2.b. Adjust with mask if needed.
148 Value scaledLinearIdI64;
149 Value scaledLinearId =
150 getValueOrCreateConstantIndexOp(b&: rewriter, loc, ofr: scaledLinearIdOfr);
151 if (mask) {
152 scaledLinearId =
153 getValueOrCreateConstantIndexOp(b&: rewriter, loc, ofr: scaledLinearIdOfr);
154 scaledLinearIdI64 = rewriter.create<arith::IndexCastUIOp>(
155 location: loc, args: rewriter.getI64Type(), args&: scaledLinearId);
156 Value logicalLinearIdI64 =
157 mask.createLogicalLinearMappingId(builder&: rewriter, physicalLinearMappingId: scaledLinearIdI64);
158 scaledLinearId = rewriter.create<arith::IndexCastUIOp>(
159 location: loc, args: rewriter.getIndexType(), args&: logicalLinearIdI64);
160 LDBG("------adjusting linearId with mask: " << scaledLinearId);
161 }
162
163 // 3. Compute remapped indices.
164 SmallVector<Value> ids;
165 // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in
166 // "row-major" order.
167 SmallVector<int64_t> reverseBasisSizes(llvm::reverse(C&: forallMappingSizes));
168 SmallVector<int64_t> strides = computeStrides(sizes: reverseBasisSizes);
169 SmallVector<AffineExpr> delinearizingExprs = delinearize(linearIndex: d0, strides);
170 // Reverse back to be in [0 .. n] order.
171 for (AffineExpr e : llvm::reverse(C&: delinearizingExprs)) {
172 ids.push_back(
173 Elt: affine::makeComposedAffineApply(b&: rewriter, loc, e, operands: {scaledLinearId}));
174 }
175
176 std::string errorMsg;
177 SmallVector<Value> predicateOps;
178 // 4. If mask present, it takes precedence to determine predication.
179 if (mask) {
180 Value isActiveIdPredicate =
181 mask.createIsActiveIdPredicate(builder&: rewriter, physicalLinearMappingId: scaledLinearIdI64);
182 LDBG("------adjusting predicate with mask: " << isActiveIdPredicate);
183 predicateOps.push_back(Elt: isActiveIdPredicate);
184 } else {
185 // 4.b. Otherwise, handle predicates using physicalLinearId.
186 FailureOr<SmallVector<Value>> maybePredicateOps =
187 buildPredicates(rewriter, loc, activeIds: physicalLinearId,
188 activeMappingSizes: computeProduct(basis: forallMappingSizes) * multiplicity,
189 availableMappingSizes: computeProduct(basis: originalBasis), errorMsg);
190 if (succeeded(Result: maybePredicateOps))
191 predicateOps = *maybePredicateOps;
192 }
193
194 return IdBuilderResult{/*errorMsg=*/errorMsg,
195 /*mappingIdOps=*/ids,
196 /*predicateOps=*/predicateOps};
197 };
198
199 return res;
200}
201
202/// Create a simple 3-D id builder that takes the `originalBasisOfr`
203/// The 3-D id builder returns a 3-D vector of ids for indexing and 3-D sizes
204/// + ids for predicate generation.
205template <typename ThreadOrBlockIdOp>
206static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity = 1) {
207 auto res = [multiplicity](RewriterBase &rewriter, Location loc,
208 ArrayRef<int64_t> forallMappingSizes,
209 ArrayRef<int64_t> originalBasis) {
210 IndexType indexType = rewriter.getIndexType();
211 SmallVector<Value> ids{
212 rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::x),
213 rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::y),
214 rewriter.create<ThreadOrBlockIdOp>(loc, indexType, Dimension::z)};
215 // In the 3-D mapping case, scale the first dimension by the multiplicity.
216 SmallVector<Value> scaledIds = ids;
217 AffineExpr d0 = getAffineDimExpr(position: 0, context: rewriter.getContext());
218 scaledIds[0] = cast<Value>(Val: affine::makeComposedFoldedAffineApply(
219 b&: rewriter, loc, expr: d0.floorDiv(v: multiplicity), operands: {scaledIds[0]}));
220 // In the 3-D mapping case, unscale the first dimension by the multiplicity.
221 SmallVector<int64_t> forallMappingSizeInOriginalBasis(forallMappingSizes);
222 forallMappingSizeInOriginalBasis[0] *= multiplicity;
223
224 std::string errorMsg;
225 SmallVector<Value> predicateOps;
226 FailureOr<SmallVector<Value>> maybePredicateOps =
227 buildPredicates(rewriter, loc, activeIds: ids, activeMappingSizes: forallMappingSizeInOriginalBasis,
228 availableMappingSizes: originalBasis, errorMsg);
229 if (succeeded(Result: maybePredicateOps))
230 predicateOps = *maybePredicateOps;
231
232 return IdBuilderResult{/*errorMsg=*/errorMsg,
233 /*mappingIdOps=*/scaledIds,
234 /*predicateOps=*/predicateOps};
235 };
236 return res;
237}
238
239/// Create a lane id builder that takes the `originalBasis` and decompose
240/// it in the basis of `forallMappingSizes`. The linear id builder returns an
241/// n-D vector of ids for indexing and 1-D size + id for predicate generation.
242static GpuIdBuilderFnType laneIdBuilderFn(int64_t warpSize) {
243 auto res = [warpSize](RewriterBase &rewriter, Location loc,
244 ArrayRef<int64_t> forallMappingSizes,
245 ArrayRef<int64_t> originalBasis) {
246 // 1. Compute linearId.
247 SmallVector<OpFoldResult> originalBasisOfr =
248 getAsIndexOpFoldResult(ctx: rewriter.getContext(), values: originalBasis);
249 Value physicalLinearId =
250 buildLinearId<ThreadIdOp>(rewriter, loc, originalBasisOfr);
251
252 // 2. Compute laneId.
253 AffineExpr d0 = getAffineDimExpr(position: 0, context: rewriter.getContext());
254 OpFoldResult laneId = affine::makeComposedFoldedAffineApply(
255 b&: rewriter, loc, expr: d0 % warpSize, operands: {physicalLinearId});
256
257 // 3. Compute remapped indices.
258 SmallVector<Value> ids;
259 // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in
260 // "row-major" order.
261 SmallVector<int64_t> reverseBasisSizes(llvm::reverse(C&: forallMappingSizes));
262 SmallVector<int64_t> strides = computeStrides(sizes: reverseBasisSizes);
263 SmallVector<AffineExpr> delinearizingExprs = delinearize(linearIndex: d0, strides);
264 // Reverse back to be in [0 .. n] order.
265 for (AffineExpr e : llvm::reverse(C&: delinearizingExprs)) {
266 ids.push_back(
267 Elt: affine::makeComposedAffineApply(b&: rewriter, loc, e, operands: {laneId}));
268 }
269
270 // 4. Handle predicates using laneId.
271 std::string errorMsg;
272 SmallVector<Value> predicateOps;
273 FailureOr<SmallVector<Value>> maybePredicateOps = buildPredicates(
274 rewriter, loc, activeIds: cast<Value>(Val&: laneId), activeMappingSizes: computeProduct(basis: forallMappingSizes),
275 availableMappingSizes: computeProduct(basis: originalBasis), errorMsg);
276 if (succeeded(Result: maybePredicateOps))
277 predicateOps = *maybePredicateOps;
278
279 return IdBuilderResult{/*errorMsg=*/errorMsg,
280 /*mappingIdOps=*/ids,
281 /*predicateOps=*/predicateOps};
282 };
283
284 return res;
285}
286
287namespace mlir {
288namespace transform {
289namespace gpu {
290
291GpuIdBuilder::GpuIdBuilder(MLIRContext *ctx, bool useLinearMapping,
292 const MappingIdBuilderFnType &fn)
293 : mappingAttributes(), idBuilder() {
294 if (useLinearMapping) {
295 for (uint64_t d = static_cast<uint64_t>(MappingId::LinearDim0),
296 e = getMaxEnumValForMappingId();
297 d <= e; ++d)
298 mappingAttributes.push_back(Elt: fn(ctx, symbolizeMappingId(d).value()));
299 } else {
300 for (uint64_t d = static_cast<uint64_t>(MappingId::DimX),
301 e = static_cast<uint64_t>(MappingId::DimZ);
302 d <= e; ++d)
303 mappingAttributes.push_back(Elt: fn(ctx, symbolizeMappingId(d).value()));
304 }
305}
306
307GpuBlockIdBuilder::GpuBlockIdBuilder(MLIRContext *ctx, bool useLinearMapping,
308 DeviceMaskingAttrInterface mask)
309 : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
310 return GPUBlockMappingAttr::get(context: ctx, block: id);
311 }) {
312 assert((!mask || useLinearMapping) && "mask requires linear mapping");
313 idBuilder = useLinearMapping
314 ? commonLinearIdBuilderFn<BlockIdOp>(/*multiplicity=*/1, mask)
315 : common3DIdBuilderFn<BlockIdOp>(/*multiplicity=*/1);
316}
317
318GpuWarpgroupIdBuilder::GpuWarpgroupIdBuilder(MLIRContext *ctx, int64_t warpSize,
319 bool useLinearMapping,
320 DeviceMaskingAttrInterface mask)
321 : GpuIdBuilder(ctx, useLinearMapping,
322 [](MLIRContext *ctx, MappingId id) {
323 return GPUWarpgroupMappingAttr::get(context: ctx, warpgroup: id);
324 }),
325 warpSize(warpSize) {
326 assert((!mask || useLinearMapping) && "mask requires linear mapping");
327 idBuilder = useLinearMapping
328 ? commonLinearIdBuilderFn<ThreadIdOp>(
329 /*multiplicity=*/kNumWarpsPerGroup * warpSize, mask)
330 : common3DIdBuilderFn<ThreadIdOp>(
331 /*multiplicity=*/kNumWarpsPerGroup * warpSize);
332}
333
334GpuWarpIdBuilder::GpuWarpIdBuilder(MLIRContext *ctx, int64_t warpSize,
335 bool useLinearMapping,
336 DeviceMaskingAttrInterface mask)
337 : GpuIdBuilder(ctx, useLinearMapping,
338 [](MLIRContext *ctx, MappingId id) {
339 return GPUWarpMappingAttr::get(context: ctx, warp: id);
340 }),
341 warpSize(warpSize) {
342 assert((!mask || useLinearMapping) && "mask requires linear mapping");
343 idBuilder = useLinearMapping
344 ? commonLinearIdBuilderFn<ThreadIdOp>(
345 /*multiplicity=*/warpSize, mask)
346 : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/warpSize);
347}
348
349GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping,
350 DeviceMaskingAttrInterface mask)
351 : GpuIdBuilder(ctx, useLinearMapping, [](MLIRContext *ctx, MappingId id) {
352 return GPUThreadMappingAttr::get(context: ctx, thread: id);
353 }) {
354 idBuilder =
355 useLinearMapping
356 ? commonLinearIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1, mask)
357 : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1);
358}
359
360GpuLaneIdBuilder::GpuLaneIdBuilder(MLIRContext *ctx, int64_t warpSize,
361 bool unused, DeviceMaskingAttrInterface mask)
362 : GpuIdBuilder(ctx, /*useLinearMapping=*/true,
363 [](MLIRContext *ctx, MappingId id) {
364 return GPULaneMappingAttr::get(context: ctx, lane: id);
365 }),
366 warpSize(warpSize) {
367 assert(!mask && "mask NYI for lanes, unclear it should be at all");
368 idBuilder = laneIdBuilderFn(/*periodicity=*/warpSize);
369}
370
371DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
372 std::optional<int64_t> gridDimX,
373 std::optional<int64_t> gridDimY,
374 std::optional<int64_t> gridDimZ,
375 std::optional<int64_t> blockDimX,
376 std::optional<int64_t> blockDimY,
377 std::optional<int64_t> blockDimZ) {
378
379 // TODO: pass a configuration object to set the limits properly.
380
381 if ((blockDimX.value_or(u: 1) * blockDimY.value_or(u: 1) * blockDimZ.value_or(u: 1)) >
382 kMaxTotalBlockdim ||
383 (gridDimX.value_or(u: 1) * gridDimY.value_or(u: 1) * gridDimZ.value_or(u: 1)) >
384 kMaxTotalGriddim ||
385 blockDimX.value_or(u: 1) > kMaxBlockdimx ||
386 blockDimY.value_or(u: 1) > kMaxBlockdimy ||
387 blockDimZ.value_or(u: 1) > kMaxBlockdimz ||
388 gridDimY.value_or(u: 1) > kMaxGriddimy ||
389 gridDimZ.value_or(u: 1) > kMaxGriddimz ||
390 gridDimX.value_or(u: 1) > kMaxGriddimx) {
391 return transformOp.emitSilenceableError()
392 << "Trying to launch a GPU kernel with grid_dims = ("
393 << gridDimX.value_or(u: 1) << ", " << gridDimY.value_or(u: 1) << ", "
394 << gridDimZ.value_or(u: 1) << ") block_dims = ("
395 << blockDimX.value_or(u: 1) << ", " << blockDimY.value_or(u: 1) << ", "
396 << blockDimZ.value_or(u: 1) << "). It is larger than the limits.";
397 }
398 return DiagnosedSilenceableFailure::success();
399}
400
401DiagnosedSilenceableFailure createGpuLaunch(
402 RewriterBase &rewriter, Location loc, TransformOpInterface transformOp,
403 LaunchOp &launchOp, std::optional<int64_t> gridDimX,
404 std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
405 std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
406 std::optional<int64_t> blockDimZ) {
407 DiagnosedSilenceableFailure diag =
408 checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
409 blockDimY, blockDimZ);
410 if (!diag.succeeded())
411 return diag;
412
413 auto createConst = [&](int dim) {
414 return rewriter.create<arith::ConstantIndexOp>(location: loc, args&: dim);
415 };
416 OpBuilder::InsertionGuard guard(rewriter);
417 Value one = createConst(1);
418 Value gridSizeX = gridDimX.has_value() ? createConst(gridDimX.value()) : one;
419 Value gridSizeY = gridDimY.has_value() ? createConst(gridDimY.value()) : one;
420 Value gridSizeZ = gridDimZ.has_value() ? createConst(gridDimZ.value()) : one;
421 Value blkSizeX = blockDimX.has_value() ? createConst(blockDimX.value()) : one;
422 Value blkSizeY = blockDimY.has_value() ? createConst(blockDimY.value()) : one;
423 Value blkSizeZ = blockDimZ.has_value() ? createConst(blockDimZ.value()) : one;
424 launchOp = rewriter.create<LaunchOp>(location: loc, args&: gridSizeX, args&: gridSizeY, args&: gridSizeZ,
425 args&: blkSizeX, args&: blkSizeY, args&: blkSizeZ);
426 rewriter.setInsertionPointToEnd(&launchOp.getBody().front());
427 rewriter.create<TerminatorOp>(location: loc);
428 return DiagnosedSilenceableFailure::success();
429}
430
431/// Alter kernel configuration of the given kernel.
432DiagnosedSilenceableFailure alterGpuLaunch(
433 RewriterBase &rewriter, LaunchOp gpuLaunch,
434 TransformOpInterface transformOp, std::optional<int64_t> gridDimX,
435 std::optional<int64_t> gridDimY, std::optional<int64_t> gridDimZ,
436 std::optional<int64_t> blockDimX, std::optional<int64_t> blockDimY,
437 std::optional<int64_t> blockDimZ) {
438 DiagnosedSilenceableFailure diag =
439 checkGpuLimits(transformOp, gridDimX, gridDimY, gridDimZ, blockDimX,
440 blockDimY, blockDimZ);
441 if (!diag.succeeded())
442 return diag;
443
444 KernelDim3 currentBlockdim = gpuLaunch.getBlockSizeOperandValues();
445 OpBuilder::InsertionGuard guard(rewriter);
446 rewriter.setInsertionPointAfterValue(currentBlockdim.x);
447 auto createConstValue = [&](int dim) {
448 return rewriter.create<arith::ConstantIndexOp>(location: currentBlockdim.x.getLoc(),
449 args&: dim);
450 };
451
452 if (gridDimX.has_value())
453 gpuLaunch.getGridSizeXMutable().assign(value: createConstValue(gridDimX.value()));
454 if (gridDimY.has_value())
455 gpuLaunch.getGridSizeYMutable().assign(value: createConstValue(gridDimY.value()));
456 if (gridDimZ.has_value())
457 gpuLaunch.getGridSizeZMutable().assign(value: createConstValue(gridDimZ.value()));
458 if (blockDimX.has_value())
459 gpuLaunch.getBlockSizeXMutable().assign(
460 value: createConstValue(blockDimX.value()));
461 if (blockDimY.has_value())
462 gpuLaunch.getBlockSizeYMutable().assign(
463 value: createConstValue(blockDimY.value()));
464 if (blockDimZ.has_value())
465 gpuLaunch.getBlockSizeZMutable().assign(
466 value: createConstValue(blockDimZ.value()));
467 return DiagnosedSilenceableFailure::success();
468}
469
470} // namespace gpu
471} // namespace transform
472} // namespace mlir
473

// Source file: mlir/lib/Dialect/GPU/TransformOps/Utils.cpp