1 | //===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This header file defines prototypes that expose pass constructors. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_ |
14 | #define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_ |
15 | |
16 | #include "Utils.h" |
17 | #include "mlir/Dialect/GPU/IR/GPUDialect.h" |
18 | #include "mlir/IR/PatternMatch.h" |
19 | #include "mlir/Pass/Pass.h" |
20 | #include <optional> |
21 | |
// Forward declarations of LLVM types used by SerializeToBlobPass below; the
// full definitions are only needed in the implementation files.
namespace llvm {
class TargetMachine;
class LLVMContext;
class Module;
} // namespace llvm

namespace mlir {
class TypeConverter;
class ConversionTarget;
namespace func {
class FuncOp;
} // namespace func

// Generate pass declarations (constructor prototypes and pass-option structs)
// from the tablegen'd pass definitions in Passes.td.
#define GEN_PASS_DECL
#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
37 | |
38 | /// Pass that moves ops which are likely an index computation into gpu.launch |
39 | /// body. |
40 | std::unique_ptr<Pass> createGpuLauchSinkIndexComputationsPass(); |
41 | |
42 | /// Replaces `gpu.launch` with `gpu.launch_func` by moving the region into |
43 | /// a separate kernel function. |
44 | std::unique_ptr<OperationPass<ModuleOp>> |
45 | createGpuKernelOutliningPass(StringRef dataLayoutStr = StringRef()); |
46 | |
/// Rewrites a function region so that GPU ops execute asynchronously.
std::unique_ptr<OperationPass<func::FuncOp>> createGpuAsyncRegionPass();

/// Maps the parallel loops found in the given function to workgroups. The first
/// loop encountered will be mapped to the global workgroup and the second loop
/// encountered to the local workgroup. Within each mapping, the first three
/// dimensions are mapped to x/y/z hardware ids and all following dimensions are
/// mapped to sequential loops.
std::unique_ptr<OperationPass<func::FuncOp>> createGpuMapParallelLoopsPass();

// The three populate* functions below are also aggregated by
// populateGpuRewritePatterns() further down in this header.

/// Collect a set of patterns to rewrite GlobalIdOp op within the GPU dialect.
void populateGpuGlobalIdPatterns(RewritePatternSet &patterns);

/// Collect a set of patterns to rewrite shuffle ops within the GPU dialect.
void populateGpuShufflePatterns(RewritePatternSet &patterns);

/// Collect a set of patterns to rewrite all-reduce ops within the GPU dialect.
void populateGpuAllReducePatterns(RewritePatternSet &patterns);
65 | |
66 | /// Collect a set of patterns to break down subgroup_reduce ops into smaller |
67 | /// ones supported by the target of `size <= maxShuffleBitwidth`, where `size` |
68 | /// is the subgroup_reduce value bitwidth. |
69 | void populateGpuBreakDownSubgrupReducePatterns(RewritePatternSet &patterns, |
70 | unsigned maxShuffleBitwidth = 32, |
71 | PatternBenefit benefit = 1); |
72 | |
73 | /// Collect a set of patterns to lower `gpu.subgroup_reduce` into `gpu.shuffle` |
74 | /// ops over `shuffleBitwidth` scalar types. Assumes that the subgroup has |
75 | /// `subgroupSize` lanes. Uses the butterfly shuffle algorithm. |
76 | void populateGpuLowerSubgroupReduceToShufflePattenrs( |
77 | RewritePatternSet &patterns, unsigned subgroupSize, |
78 | unsigned shuffleBitwidth = 32, PatternBenefit benefit = 1); |
79 | |
80 | /// Collect all patterns to rewrite ops within the GPU dialect. |
/// Collect all patterns to rewrite ops within the GPU dialect.
///
/// Aggregates only the all-reduce, global-id, and shuffle pattern sets; the
/// subgroup_reduce and decompose-memrefs patterns declared elsewhere in this
/// header must be added explicitly by callers that want them.
inline void populateGpuRewritePatterns(RewritePatternSet &patterns) {
  populateGpuAllReducePatterns(patterns);
  populateGpuGlobalIdPatterns(patterns);
  populateGpuShufflePatterns(patterns);
}
86 | |
namespace gpu {
/// Searches for all GPU modules in `op` and transforms them into GPU binary
/// operations. The resulting `gpu.binary` has `handler` as its offloading
/// handler attribute.
///
/// `options` carries target-compilation settings; the default-constructed
/// value is used when none are supplied. NOTE(review): presumably returns
/// failure when any module cannot be serialized — confirm in the
/// implementation.
LogicalResult transformGpuModulesToBinaries(
    Operation *op, OffloadingLLVMTranslationAttrInterface handler = nullptr,
    const gpu::TargetOptions &options = {});
94 | |
95 | /// Base pass class to serialize kernel functions through LLVM into |
96 | /// user-specified IR and add the resulting blob as module attribute. |
/// Base pass class to serialize kernel functions through LLVM into
/// user-specified IR and add the resulting blob as module attribute.
///
/// Subclasses provide the final ISA-to-blob step via `serializeISA` and may
/// customize the LLVM-IR translation and optimization hooks.
class SerializeToBlobPass : public OperationPass<gpu::GPUModuleOp> {
public:
  SerializeToBlobPass(TypeID passID);
  SerializeToBlobPass(const SerializeToBlobPass &other);

  /// Entry point; `final` so subclasses customize behavior only through the
  /// virtual hooks below.
  void runOnOperation() final;

protected:
  /// Hook allowing the application of optimizations before codegen
  /// By default, does nothing
  virtual LogicalResult optimizeLlvm(llvm::Module &llvmModule,
                                     llvm::TargetMachine &targetMachine);

  /// Translates the 'getOperation()' result to an LLVM module.
  virtual std::unique_ptr<llvm::Module>
  translateToLLVMIR(llvm::LLVMContext &llvmContext);

private:
  /// Creates the LLVM target machine to generate the ISA.
  /// Built from the `triple`, `chip`, and `features` options below —
  /// presumably returns null on an unknown triple; confirm in implementation.
  std::unique_ptr<llvm::TargetMachine> createTargetMachine();

  /// Translates the module to ISA; std::nullopt signals failure.
  std::optional<std::string> translateToISA(llvm::Module &llvmModule,
                                            llvm::TargetMachine &targetMachine);

  /// Serializes the target ISA to binary form. Pure virtual: each target
  /// backend (e.g. CUBIN/HSACO) supplies its own implementation.
  virtual std::unique_ptr<std::vector<char>>
  serializeISA(const std::string &isa) = 0;

protected:
  // Pass options, exposed as command-line flags on the pass.
  Option<std::string> triple{*this, "triple",
                             ::llvm::cl::desc("Target triple")};
  Option<std::string> chip{*this, "chip",
                           ::llvm::cl::desc("Target architecture")};
  Option<std::string> features{*this, "features",
                               ::llvm::cl::desc("Target features")};
  // LLVM optimization level (0-3); defaults to -O2.
  Option<int> optLevel{*this, "opt-level",
                       llvm::cl::desc("Optimization level for compilation"),
                       llvm::cl::init(2)};
  // Name of the module attribute that receives the serialized blob.
  Option<std::string> gpuBinaryAnnotation{
      *this, "gpu-binary-annotation",
      llvm::cl::desc("Annotation attribute string for GPU binary"),
      llvm::cl::init(getDefaultGpuBinaryAnnotation())};
  Option<bool> dumpPtx{*this, "dump-ptx",
                       ::llvm::cl::desc("Dump generated PTX"),
                       llvm::cl::init(false)};
};
144 | } // namespace gpu |
145 | |
146 | //===----------------------------------------------------------------------===// |
147 | // Registration |
148 | //===----------------------------------------------------------------------===// |
149 | |
/// Register pass to serialize GPU kernel functions to a HSAco binary
/// annotation.
/// Deprecated: use the GPU target-attribute / `gpu.binary` flow (see
/// `transformGpuModulesToBinaries`) instead.
LLVM_DEPRECATED("use Target attributes instead", "")
void registerGpuSerializeToHsacoPass();

/// Create an instance of the GPU kernel function to HSAco binary serialization
/// pass.
/// \p triple target triple, \p arch target chip, \p features target-feature
/// string, \p optLevel LLVM optimization level used during serialization.
LLVM_DEPRECATED("use Target attributes instead", "")
std::unique_ptr<Pass> createGpuSerializeToHsacoPass(StringRef triple,
                                                    StringRef arch,
                                                    StringRef features,
                                                    int optLevel);

/// Collect a set of patterns to decompose memrefs ops.
void populateGpuDecomposeMemrefsPatterns(RewritePatternSet &patterns);

/// Pass decomposes memref ops inside `gpu.launch` body.
std::unique_ptr<Pass> createGpuDecomposeMemrefsPass();

/// Erase barriers that do not enforce conflicting memory side effects.
void populateGpuEliminateBarriersPatterns(RewritePatternSet &patterns);
171 | |
172 | /// Generate the code for registering passes. |
173 | #define GEN_PASS_REGISTRATION |
174 | #include "mlir/Dialect/GPU/Transforms/Passes.h.inc" |
175 | |
176 | } // namespace mlir |
177 | |
178 | #endif // MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_ |
179 | |