1//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This header file defines prototypes that expose pass constructors.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
14#define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
15
16#include "Utils.h"
17#include "mlir/Dialect/GPU/IR/GPUDialect.h"
18#include "mlir/IR/PatternMatch.h"
19#include "mlir/Pass/Pass.h"
20#include <optional>
21
22namespace llvm {
23class TargetMachine;
24class LLVMContext;
25class Module;
26} // namespace llvm
27
28namespace mlir {
29class TypeConverter;
30class ConversionTarget;
31namespace func {
32class FuncOp;
33} // namespace func
34
35#define GEN_PASS_DECL
36#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
37
38/// Creates a pass that moves ops which are likely index computations into the
39/// `gpu.launch` body. NOTE(review): "Lauch" in the name looks like a typo.
40std::unique_ptr<Pass> createGpuLauchSinkIndexComputationsPass();
41
42/// Creates a module pass that replaces `gpu.launch` with `gpu.launch_func` by
43/// moving the region into a separate kernel function. `dataLayoutStr` is optional.
44std::unique_ptr<OperationPass<ModuleOp>>
45createGpuKernelOutliningPass(StringRef dataLayoutStr = StringRef());
46
47/// Creates a pass rewriting a function region so GPU ops execute asynchronously.
48std::unique_ptr<OperationPass<func::FuncOp>> createGpuAsyncRegionPass();
49
50/// Creates a pass that maps the parallel loops found in a function to
51/// workgroups. The first loop encountered is mapped to the global workgroup and
52/// the second loop encountered to the local workgroup. Within each mapping, the
53/// first three dimensions are mapped to x/y/z hardware ids and all following
54/// dimensions are mapped to sequential loops.
55std::unique_ptr<OperationPass<func::FuncOp>> createGpuMapParallelLoopsPass();
56
57/// Collects into `patterns` the rewrites for `GlobalIdOp` within the GPU dialect.
58void populateGpuGlobalIdPatterns(RewritePatternSet &patterns);
59
60/// Collects into `patterns` the rewrites for shuffle ops within the GPU dialect.
61void populateGpuShufflePatterns(RewritePatternSet &patterns);
62
63/// Collects into `patterns` the rewrites for all-reduce ops in the GPU dialect.
64void populateGpuAllReducePatterns(RewritePatternSet &patterns);
65
66/// Collects patterns that break down `subgroup_reduce` ops into smaller ones of
67/// `size <= maxShuffleBitwidth` that the target supports, where `size` is the
68/// reduced value's bitwidth. NOTE(review): "Subgrup" looks like a typo.
69void populateGpuBreakDownSubgrupReducePatterns(RewritePatternSet &patterns,
70 unsigned maxShuffleBitwidth = 32,
71 PatternBenefit benefit = 1);
72
73/// Collects patterns that lower `gpu.subgroup_reduce` into `gpu.shuffle` ops
74/// over `shuffleBitwidth` scalar types, assuming the subgroup has `subgroupSize`
75/// lanes. Uses the butterfly shuffle algorithm. NOTE(review): "Pattenrs" typo.
76void populateGpuLowerSubgroupReduceToShufflePattenrs(
77 RewritePatternSet &patterns, unsigned subgroupSize,
78 unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1);
79
80/// Collects the all-reduce, global-id and shuffle rewrite patterns into `patterns`.
81inline void populateGpuRewritePatterns(RewritePatternSet &patterns) {
82 populateGpuAllReducePatterns(patterns);
83 populateGpuGlobalIdPatterns(patterns);
84 populateGpuShufflePatterns(patterns);
85}
86
87namespace gpu {
88/// Searches for all GPU modules in `op` and transforms them into `gpu.binary`
89/// operations whose offloading handler attribute is `handler` (null by
90/// default). `options` defaults to `{}`.
91LogicalResult transformGpuModulesToBinaries(
92 Operation *op, OffloadingLLVMTranslationAttrInterface handler = nullptr,
93 const gpu::TargetOptions &options = {});
94
95/// Base class for `gpu::GPUModuleOp` passes that serialize kernel functions
96/// through LLVM into user-specified IR and add the blob as a module attribute.
97class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
98public:
99 SerializeToBlobPass(TypeID passID);
100 SerializeToBlobPass(const SerializeToBlobPass &other);
101
102 void runOnOperation() final;
103
104protected:
105 /// Hook for applying optimizations to `llvmModule` before codegen.
106 /// By default, does nothing.
107 virtual LogicalResult optimizeLlvm(llvm::Module &llvmModule,
108 llvm::TargetMachine &targetMachine);
109
110 /// Overridable hook: translates the `getOperation()` result to an LLVM module.
111 virtual std::unique_ptr<llvm::Module>
112 translateToLLVMIR(llvm::LLVMContext &llvmContext);
113
114private:
115 /// Creates the LLVM target machine used to generate the ISA.
116 std::unique_ptr<llvm::TargetMachine> createTargetMachine();
117
118 /// Translates `llvmModule` to ISA, returned as an optional string.
119 std::optional<std::string> translateToISA(llvm::Module &llvmModule,
120 llvm::TargetMachine &targetMachine);
121
122 /// Serializes the target ISA to binary form; subclasses must implement this.
123 virtual std::unique_ptr<std::vector<char>>
124 serializeISA(const std::string &isa) = 0;
125
126protected:
127 Option<std::string> triple{*this, "triple",
128 ::llvm::cl::desc("Target triple")};
129 Option<std::string> chip{*this, "chip",
130 ::llvm::cl::desc("Target architecture")};
131 Option<std::string> features{*this, "features",
132 ::llvm::cl::desc("Target features")};
133 Option<int> optLevel{*this, "opt-level",
134 llvm::cl::desc("Optimization level for compilation"),
135 llvm::cl::init(2)};
136 Option<std::string> gpuBinaryAnnotation{
137 *this, "gpu-binary-annotation",
138 llvm::cl::desc("Annotation attribute string for GPU binary"),
139 llvm::cl::init(getDefaultGpuBinaryAnnotation())};
140 Option<bool> dumpPtx{*this, "dump-ptx",
141 ::llvm::cl::desc("Dump generated PTX"),
142 llvm::cl::init(false)};
143};
144} // namespace gpu
145
146//===----------------------------------------------------------------------===//
147// Registration
148//===----------------------------------------------------------------------===//
149
150/// Registers the pass that serializes GPU kernel functions to an HSAco binary
151/// annotation. Deprecated: use Target attributes instead.
152LLVM_DEPRECATED("use Target attributes instead", "")
153void registerGpuSerializeToHsacoPass();
154
155/// Creates an instance of the pass that serializes GPU kernel functions to an
156/// HSAco binary. Deprecated: use Target attributes instead.
157LLVM_DEPRECATED("use Target attributes instead", "")
158std::unique_ptr<Pass> createGpuSerializeToHsacoPass(StringRef triple,
159 StringRef arch,
160 StringRef features,
161 int optLevel);
162
163/// Collects into `patterns` a set of patterns that decompose memref ops.
164void populateGpuDecomposeMemrefsPatterns(RewritePatternSet &patterns);
165
166/// Creates a pass that decomposes memref ops inside the `gpu.launch` body.
167std::unique_ptr<Pass> createGpuDecomposeMemrefsPass();
168
169/// Collects into `patterns` rewrites that erase barriers which do not enforce
170void populateGpuEliminateBarriersPatterns(RewritePatternSet &patterns);
171
172/// Generate the code for registering passes.
173#define GEN_PASS_REGISTRATION
174#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
175
176} // namespace mlir
177
178#endif // MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
179

source code of mlir/include/mlir/Dialect/GPU/Transforms/Passes.h