//===- KernelOutlining.cpp - Implementation of GPU kernel outlining -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the GPU dialect kernel outlining pass.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/Passes.h"

#include "mlir/AsmParser/AsmParser.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/Matchers.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/RegionUtils.h"
#include <limits>

namespace mlir {
#define GEN_PASS_DEF_GPULAUNCHSINKINDEXCOMPUTATIONS
#define GEN_PASS_DEF_GPUKERNELOUTLINING
#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
} // namespace mlir

using namespace mlir;

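/// Creates one op of type `OpTy` for each of the x, y and z dimensions and
/// appends the resulting values to `values`.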
template <typename OpTy>
static void createForAllDimensions(OpBuilder &builder, Location loc,
                                   SmallVectorImpl<Value> &values) {
  for (auto dim : {gpu::Dimension::x, gpu::Dimension::y, gpu::Dimension::z})
    values.push_back(builder.create<OpTy>(loc, builder.getIndexType(), dim));
}

/// Adds operations generating block/thread ids and grid/block dimensions at
/// the beginning of the `launchFuncOpBody` region. Also adds a mapping from
/// each argument of the entry block of `launchOpBody` to the corresponding
/// result value of the added operations.
static void injectGpuIndexOperations(Location loc, Region &launchFuncOpBody,
                                     Region &launchOpBody, IRMapping &map,
                                     bool hasCluster = false) {
  OpBuilder builder(loc->getContext());
  Block &firstBlock = launchOpBody.front();
  builder.setInsertionPointToStart(&launchFuncOpBody.front());
  SmallVector<Value> indexOps;
  // The order is important here, as it must match the order of the arguments.
  createForAllDimensions<gpu::BlockIdOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::ThreadIdOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::GridDimOp>(builder, loc, indexOps);
  createForAllDimensions<gpu::BlockDimOp>(builder, loc, indexOps);
  if (hasCluster) {
    createForAllDimensions<gpu::ClusterIdOp>(builder, loc, indexOps);
    createForAllDimensions<gpu::ClusterDimOp>(builder, loc, indexOps);
  }
  // Map the leading block arguments of the launch body to the newly created
  // index operations; the argument order matches the creation order above.
  for (const auto &indexOp : enumerate(indexOps))
    map.map(firstBlock.getArgument(indexOp.index()), indexOp.value());
}

/// Identifies operations that are beneficial to sink into kernels. Such
/// operations must not have side effects, as otherwise sinking (and hence
/// duplicating) them is not legal.
static bool isLikelyAnIndexComputation(Operation *op) {
  return matchPattern(op, m_Constant()) ||
         isa<memref::DimOp, arith::SelectOp, arith::CmpIOp>(op);
}

/// For a given operation `op`, computes whether it is beneficial to sink the
/// operation into the kernel. An operation can be sunk if doing so does not
/// introduce new kernel arguments. Whether a value is already available in the
/// kernel (and hence does not introduce new arguments) is checked by
/// querying `existingDependencies` and `availableValues`.
/// If an operand is not yet available, we recursively check whether it can be
/// made available by sinking its defining op.
/// Operations that are identified for sinking are added to `beneficiaryOps` in
/// the order they should appear in the kernel. Furthermore, `availableValues`
/// is updated with results that will be available after sinking the identified
/// ops.
static bool extractBeneficiaryOps(
    Operation *op, const SetVector<Value> &existingDependencies,
    SetVector<Operation *> &beneficiaryOps,
    llvm::SmallPtrSetImpl<Value> &availableValues,
    llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) {
  if (beneficiaryOps.count(op))
    return true;

  if (!isSinkingBeneficiary(op))
    return false;

  for (Value operand : op->getOperands()) {
    // It is already visible in the kernel, keep going.
    if (availableValues.count(operand))
      continue;
    // Else check whether it can be made available via sinking or already is a
    // dependency.
    Operation *definingOp = operand.getDefiningOp();
    if ((!definingOp || !extractBeneficiaryOps(definingOp, existingDependencies,
                                               beneficiaryOps, availableValues,
                                               isSinkingBeneficiary)) &&
        !existingDependencies.count(operand))
      return false;
  }
  // We will sink the operation, mark its results as now available.
  beneficiaryOps.insert(op);
  for (Value result : op->getResults())
    availableValues.insert(result);
  return true;
}

LogicalResult mlir::sinkOperationsIntoLaunchOp(
    gpu::LaunchOp launchOp,
    llvm::function_ref<bool(Operation *)> isSinkingBeneficiary) {
  assert(isSinkingBeneficiary);
  Region &launchOpBody = launchOp.getBody();

  // Identify uses from values defined outside of the scope of the launch
  // operation.
  SetVector<Value> sinkCandidates;
  getUsedValuesDefinedAbove(launchOpBody, sinkCandidates);

  SetVector<Operation *> toBeSunk;
  llvm::SmallPtrSet<Value, 4> availableValues;
  for (Value operand : sinkCandidates) {
    Operation *operandOp = operand.getDefiningOp();
    if (!operandOp)
      continue;
    extractBeneficiaryOps(operandOp, sinkCandidates, toBeSunk, availableValues,
                          isSinkingBeneficiary);
  }

  // Insert operations so that the defs get cloned before uses.
  IRMapping map;
  OpBuilder builder(launchOpBody);
  for (Operation *op : toBeSunk) {
    Operation *clonedOp = builder.clone(*op, map);
    // Only replace uses within the launch op.
    for (auto pair : llvm::zip(op->getResults(), clonedOp->getResults()))
      replaceAllUsesInRegionWith(std::get<0>(pair), std::get<1>(pair),
                                 launchOp.getBody());
  }
  return success();
}

/// Return the provided KernelDim3 as an array of i32 constants if possible.
static DenseI32ArrayAttr maybeConstantDimsAttr(gpu::KernelDim3 dims) {
  SmallVector<int32_t, 3> constants;
  MLIRContext *ctx = dims.x.getContext();
  for (Value v : {dims.x, dims.y, dims.z}) {
    APInt constValue;
    if (!matchPattern(v, m_ConstantInt(&constValue)))
      return nullptr;
    // In the event someone called for a too-large block or grid dimension,
    // don't set bounds as it is likely to cause more confusing behavior.
    if (constValue.ugt(std::numeric_limits<uint32_t>::max()))
      return nullptr;
    constants.push_back(
        constValue.getLimitedValue(std::numeric_limits<uint32_t>::max()));
  }
  return DenseI32ArrayAttr::get(ctx, constants);
}

/// Outline the `gpu.launch` operation body into a kernel function. Replace
/// `gpu.terminator` operations by `gpu.return` in the generated function.
/// Set block and grid size bounds if known.
static gpu::GPUFuncOp outlineKernelFuncImpl(gpu::LaunchOp launchOp,
                                            StringRef kernelFnName,
                                            SetVector<Value> &operands) {
  Location loc = launchOp.getLoc();
  // Create a builder with no insertion point, insertion will happen separately
  // due to symbol table manipulation.
  OpBuilder builder(launchOp.getContext());
  Region &launchOpBody = launchOp.getBody();

  // Identify uses from values defined outside of the scope of the launch
  // operation.
  getUsedValuesDefinedAbove(launchOpBody, operands);

  // Create the gpu.func operation.
  SmallVector<Type, 4> kernelOperandTypes;
  kernelOperandTypes.reserve(operands.size());
  for (Value operand : operands) {
    kernelOperandTypes.push_back(operand.getType());
  }
  FunctionType type =
      FunctionType::get(launchOp.getContext(), kernelOperandTypes, {});
  auto outlinedFunc = builder.create<gpu::GPUFuncOp>(
      loc, kernelFnName, type,
      TypeRange(ValueRange(launchOp.getWorkgroupAttributions())),
      TypeRange(ValueRange(launchOp.getPrivateAttributions())));
  outlinedFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                        builder.getUnitAttr());

  // If we can infer bounds on the grid and/or block sizes from the arguments
  // to the launch op, propagate them to the generated kernel. This is safe
  // because multiple launches with the same body are not deduplicated.
  if (auto blockBounds =
          maybeConstantDimsAttr(launchOp.getBlockSizeOperandValues()))
    outlinedFunc->setAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName(),
                          blockBounds);
  if (auto gridBounds =
          maybeConstantDimsAttr(launchOp.getGridSizeOperandValues()))
    outlinedFunc->setAttr(gpu::GPUFuncOp::getKnownGridSizeAttrName(),
                          gridBounds);

  IRMapping map;

  // Map the arguments corresponding to the launch parameters like blockIdx,
  // threadIdx, etc. If a cluster is present, also generate clusterIdx and
  // clusterDim.
  Region &outlinedFuncBody = outlinedFunc.getBody();
  injectGpuIndexOperations(loc, outlinedFuncBody, launchOpBody, map,
                           launchOp.hasClusterSize());

  // Map memory attributions from the LaunchOp to the GPUFuncOp attributions.
  for (const auto &[launchArg, funcArg] :
       llvm::zip(launchOp.getWorkgroupAttributions(),
                 outlinedFunc.getWorkgroupAttributions()))
    map.map(launchArg, funcArg);
  for (const auto &[launchArg, funcArg] :
       llvm::zip(launchOp.getPrivateAttributions(),
                 outlinedFunc.getPrivateAttributions()))
    map.map(launchArg, funcArg);

  // Map arguments from gpu.launch region to the arguments of the gpu.func
  // operation.
  Block &entryBlock = outlinedFuncBody.front();
  for (const auto &operand : enumerate(operands))
    map.map(operand.value(), entryBlock.getArgument(operand.index()));

  // Clone the region of the gpu.launch operation into the gpu.func operation.
  // TODO: If cloneInto can be modified such that if a mapping for
  // a block exists, that block will be used to clone operations into (at the
  // end of the block), instead of creating a new block, this would be much
  // cleaner.
  launchOpBody.cloneInto(&outlinedFuncBody, map);

  // Branch from entry of the gpu.func operation to the block that is cloned
  // from the entry block of the gpu.launch operation.
  Block &launchOpEntry = launchOpBody.front();
  Block *clonedLaunchOpEntry = map.lookup(&launchOpEntry);
  builder.setInsertionPointToEnd(&entryBlock);
  builder.create<cf::BranchOp>(loc, clonedLaunchOpEntry);

  outlinedFunc.walk([](gpu::TerminatorOp op) {
    OpBuilder replacer(op);
    replacer.create<gpu::ReturnOp>(op.getLoc());
    op.erase();
  });
  return outlinedFunc;
}

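/// Outlines the body of `launchOp` into a new kernel function named
/// `kernelFnName`. Values that become kernel arguments beyond those already
/// listed in `operands` are appended to `operands`.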
gpu::GPUFuncOp mlir::outlineKernelFunc(gpu::LaunchOp launchOp,
                                       StringRef kernelFnName,
                                       llvm::SmallVectorImpl<Value> &operands) {
  DenseSet<Value> inputOperandSet;
  inputOperandSet.insert(operands.begin(), operands.end());
  SetVector<Value> operandSet(operands.begin(), operands.end());
  auto funcOp = outlineKernelFuncImpl(launchOp, kernelFnName, operandSet);
  for (auto operand : operandSet) {
    if (!inputOperandSet.count(operand))
      operands.push_back(operand);
  }
  return funcOp;
}

/// Replace `gpu.launch` operations with a `gpu.launch_func` operation
/// launching `kernelFunc`. The kernel func contains the body of the
/// `gpu.launch` with constant region arguments inlined.
static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
                                  gpu::GPUFuncOp kernelFunc,
                                  ValueRange operands) {
  OpBuilder builder(launchOp);
  // The launch op has an optional dynamic shared memory size. If it doesn't
  // exist, we use zero.
  Value asyncToken = launchOp.getAsyncToken();
  std::optional<gpu::KernelDim3> clusterSize =
      launchOp.getClusterSizeOperandValues();
  auto launchFunc = builder.create<gpu::LaunchFuncOp>(
      launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),
      launchOp.getBlockSizeOperandValues(),
      launchOp.getDynamicSharedMemorySize(), operands,
      asyncToken ? asyncToken.getType() : nullptr,
      launchOp.getAsyncDependencies(), clusterSize);
  launchOp.replaceAllUsesWith(launchFunc);
  launchOp.erase();
}

namespace {
/// Pass that moves ops which are likely an index computation into gpu.launch
/// body.
class GpuLaunchSinkIndexComputationsPass
    : public impl::GpuLaunchSinkIndexComputationsBase<
          GpuLaunchSinkIndexComputationsPass> {
public:
  void runOnOperation() override {
    Operation *op = getOperation();
    if (op->walk([](gpu::LaunchOp launch) {
            // Pull in instructions that can be sunk.
            if (failed(sinkOperationsIntoLaunchOp(launch,
                                                  isLikelyAnIndexComputation)))
              return WalkResult::interrupt();

            return WalkResult::advance();
          }).wasInterrupted())
      signalPassFailure();
  }
};

/// Pass that moves the kernel of each LaunchOp into its separate nested module.
///
/// This pass moves the kernel code of each LaunchOp into a function created
/// inside a nested module. It also creates an external function of the same
/// name in the parent module.
///
/// The gpu.modules are intended to be compiled to a cubin blob independently in
/// a separate pass. The external functions can then be annotated with the
/// symbol of the cubin accessor function.
class GpuKernelOutliningPass
    : public impl::GpuKernelOutliningBase<GpuKernelOutliningPass> {
public:
  GpuKernelOutliningPass(StringRef dlStr) {
    if (!dlStr.empty() && !dataLayoutStr.hasValue())
      dataLayoutStr = dlStr.str();
  }

  GpuKernelOutliningPass(const GpuKernelOutliningPass &other)
      : GpuKernelOutliningBase(other), dataLayoutSpec(other.dataLayoutSpec) {
    dataLayoutStr = other.dataLayoutStr.getValue();
  }

  LogicalResult initialize(MLIRContext *context) override {
    // Initialize the data layout specification from the data layout string.
    if (!dataLayoutStr.empty()) {
      Attribute resultAttr = mlir::parseAttribute(dataLayoutStr, context);
      if (!resultAttr)
        return failure();

      dataLayoutSpec = dyn_cast<DataLayoutSpecInterface>(resultAttr);
      if (!dataLayoutSpec)
        return failure();
    }

    return success();
  }

  void runOnOperation() override {
    SymbolTable symbolTable(getOperation());
    bool modified = false;
    for (auto func : getOperation().getOps<SymbolOpInterface>()) {
      // Insert just after the function.
      Block::iterator insertPt(func->getNextNode());
      auto funcWalkResult = func.walk([&](gpu::LaunchOp op) {
        SetVector<Value> operands;
        std::string kernelFnName =
            Twine(op->getParentOfType<SymbolOpInterface>().getName(), "_kernel")
                .str();

        gpu::GPUFuncOp outlinedFunc =
            outlineKernelFuncImpl(op, kernelFnName, operands);

        // Create nested module and insert outlinedFunc. The module will
        // originally get the same name as the function, but may be renamed on
        // insertion into the parent module.
        auto kernelModule = createKernelModule(outlinedFunc, symbolTable);
        symbolTable.insert(kernelModule, insertPt);

        // Potentially changes signature, pulling in constants.
        convertToLaunchFuncOp(op, outlinedFunc, operands.getArrayRef());
        modified = true;
        return WalkResult::advance();
      });
      if (funcWalkResult.wasInterrupted())
        return signalPassFailure();
    }

    // If any new module was inserted in this module, annotate this module as
    // a container module.
    if (modified)
      getOperation()->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
                              UnitAttr::get(&getContext()));
  }

private:
  /// Returns a gpu.module containing kernelFunc and all callees (recursive).
  gpu::GPUModuleOp createKernelModule(gpu::GPUFuncOp kernelFunc,
                                      const SymbolTable &parentSymbolTable) {
    // TODO: This code cannot use an OpBuilder because it must be inserted into
    // a SymbolTable by the caller. SymbolTable needs to be refactored to
    // prevent manual building of Ops with symbols in code using SymbolTables
    // and then this needs to use the OpBuilder.
    auto *context = getOperation().getContext();
    OpBuilder builder(context);
    auto kernelModule = builder.create<gpu::GPUModuleOp>(kernelFunc.getLoc(),
                                                         kernelFunc.getName());

    // If a valid data layout spec was provided, attach it to the kernel
    // module. Otherwise, the default data layout will be used.
    if (dataLayoutSpec)
      kernelModule->setAttr(DLTIDialect::kDataLayoutAttrName, dataLayoutSpec);

    SymbolTable symbolTable(kernelModule);
    symbolTable.insert(kernelFunc);

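    // Walk the symbol uses reachable from the kernel and clone every symbol it
    // transitively references (e.g. called functions) into the new module so
    // that the kernel module is self-contained.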
    SmallVector<Operation *, 8> symbolDefWorklist = {kernelFunc};
    while (!symbolDefWorklist.empty()) {
      if (std::optional<SymbolTable::UseRange> symbolUses =
              SymbolTable::getSymbolUses(symbolDefWorklist.pop_back_val())) {
        for (SymbolTable::SymbolUse symbolUse : *symbolUses) {
          StringRef symbolName =
              cast<FlatSymbolRefAttr>(symbolUse.getSymbolRef()).getValue();
          if (symbolTable.lookup(symbolName))
            continue;

          Operation *symbolDefClone =
              parentSymbolTable.lookup(symbolName)->clone();
          symbolDefWorklist.push_back(symbolDefClone);
          symbolTable.insert(symbolDefClone);
        }
      }
    }

    return kernelModule;
  }

  Option<std::string> dataLayoutStr{
      *this, "data-layout-str",
      llvm::cl::desc("String containing the data layout specification to be "
                     "attached to the GPU kernel module")};

  DataLayoutSpecInterface dataLayoutSpec;
};

} // namespace

std::unique_ptr<Pass> mlir::createGpuLauchSinkIndexComputationsPass() {
  return std::make_unique<GpuLaunchSinkIndexComputationsPass>();
}

std::unique_ptr<OperationPass<ModuleOp>>
mlir::createGpuKernelOutliningPass(StringRef dataLayoutStr) {
  return std::make_unique<GpuKernelOutliningPass>(dataLayoutStr);
}