1 | //===-- lib/cuda/kernel.cpp -------------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "flang/Runtime/CUDA/kernel.h" |
10 | #include "flang-rt/runtime/descriptor.h" |
11 | #include "flang-rt/runtime/terminator.h" |
12 | #include "flang/Runtime/CUDA/common.h" |
13 | |
14 | #include "cuda_runtime.h" |
15 | |
16 | extern "C" { |
17 | |
18 | void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY, |
19 | intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, |
20 | int64_t *stream, int32_t smem, void **params, void **extra) { |
21 | dim3 gridDim; |
22 | gridDim.x = gridX; |
23 | gridDim.y = gridY; |
24 | gridDim.z = gridZ; |
25 | dim3 blockDim; |
26 | blockDim.x = blockX; |
27 | blockDim.y = blockY; |
28 | blockDim.z = blockZ; |
29 | unsigned nbNegGridDim{0}; |
30 | if (gridX < 0) { |
31 | ++nbNegGridDim; |
32 | } |
33 | if (gridY < 0) { |
34 | ++nbNegGridDim; |
35 | } |
36 | if (gridZ < 0) { |
37 | ++nbNegGridDim; |
38 | } |
39 | if (nbNegGridDim == 1) { |
40 | int maxBlocks, nbBlocks, dev, multiProcCount; |
41 | cudaError_t err1, err2; |
42 | nbBlocks = blockDim.x * blockDim.y * blockDim.z; |
43 | cudaGetDevice(&dev); |
44 | err1 = cudaDeviceGetAttribute( |
45 | &multiProcCount, cudaDevAttrMultiProcessorCount, dev); |
46 | err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor( |
47 | &maxBlocks, kernel, nbBlocks, smem); |
48 | if (err1 == cudaSuccess && err2 == cudaSuccess) { |
49 | maxBlocks = multiProcCount * maxBlocks; |
50 | } |
51 | if (maxBlocks > 0) { |
52 | if (gridX > 0) { |
53 | maxBlocks = maxBlocks / gridDim.x; |
54 | } |
55 | if (gridY > 0) { |
56 | maxBlocks = maxBlocks / gridDim.y; |
57 | } |
58 | if (gridZ > 0) { |
59 | maxBlocks = maxBlocks / gridDim.z; |
60 | } |
61 | if (maxBlocks < 1) { |
62 | maxBlocks = 1; |
63 | } |
64 | if (gridX < 0) { |
65 | gridDim.x = maxBlocks; |
66 | } |
67 | if (gridY < 0) { |
68 | gridDim.y = maxBlocks; |
69 | } |
70 | if (gridZ < 0) { |
71 | gridDim.z = maxBlocks; |
72 | } |
73 | } |
74 | } else if (nbNegGridDim > 1) { |
75 | Fortran::runtime::Terminator terminator{__FILE__, __LINE__}; |
76 | terminator.Crash("Too many invalid grid dimensions" ); |
77 | } |
78 | cudaStream_t defaultStream = 0; |
79 | CUDA_REPORT_IF_ERROR(cudaLaunchKernel(kernel, gridDim, blockDim, params, smem, |
80 | stream != nullptr ? (cudaStream_t)(*stream) : defaultStream)); |
81 | } |
82 | |
83 | void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX, |
84 | intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY, |
85 | intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, |
86 | int64_t *stream, int32_t smem, void **params, void **extra) { |
87 | cudaLaunchConfig_t config; |
88 | config.gridDim.x = gridX; |
89 | config.gridDim.y = gridY; |
90 | config.gridDim.z = gridZ; |
91 | config.blockDim.x = blockX; |
92 | config.blockDim.y = blockY; |
93 | config.blockDim.z = blockZ; |
94 | unsigned nbNegGridDim{0}; |
95 | if (gridX < 0) { |
96 | ++nbNegGridDim; |
97 | } |
98 | if (gridY < 0) { |
99 | ++nbNegGridDim; |
100 | } |
101 | if (gridZ < 0) { |
102 | ++nbNegGridDim; |
103 | } |
104 | if (nbNegGridDim == 1) { |
105 | int maxBlocks, nbBlocks, dev, multiProcCount; |
106 | cudaError_t err1, err2; |
107 | nbBlocks = config.blockDim.x * config.blockDim.y * config.blockDim.z; |
108 | cudaGetDevice(&dev); |
109 | err1 = cudaDeviceGetAttribute( |
110 | &multiProcCount, cudaDevAttrMultiProcessorCount, dev); |
111 | err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor( |
112 | &maxBlocks, kernel, nbBlocks, smem); |
113 | if (err1 == cudaSuccess && err2 == cudaSuccess) { |
114 | maxBlocks = multiProcCount * maxBlocks; |
115 | } |
116 | if (maxBlocks > 0) { |
117 | if (gridX > 0) { |
118 | maxBlocks = maxBlocks / config.gridDim.x; |
119 | } |
120 | if (gridY > 0) { |
121 | maxBlocks = maxBlocks / config.gridDim.y; |
122 | } |
123 | if (gridZ > 0) { |
124 | maxBlocks = maxBlocks / config.gridDim.z; |
125 | } |
126 | if (maxBlocks < 1) { |
127 | maxBlocks = 1; |
128 | } |
129 | if (gridX < 0) { |
130 | config.gridDim.x = maxBlocks; |
131 | } |
132 | if (gridY < 0) { |
133 | config.gridDim.y = maxBlocks; |
134 | } |
135 | if (gridZ < 0) { |
136 | config.gridDim.z = maxBlocks; |
137 | } |
138 | } |
139 | } else if (nbNegGridDim > 1) { |
140 | Fortran::runtime::Terminator terminator{__FILE__, __LINE__}; |
141 | terminator.Crash("Too many invalid grid dimensions" ); |
142 | } |
143 | config.dynamicSmemBytes = smem; |
144 | if (stream != nullptr) { |
145 | config.stream = (cudaStream_t)(*stream); |
146 | } else { |
147 | config.stream = 0; |
148 | } |
149 | cudaLaunchAttribute launchAttr[1]; |
150 | launchAttr[0].id = cudaLaunchAttributeClusterDimension; |
151 | launchAttr[0].val.clusterDim.x = clusterX; |
152 | launchAttr[0].val.clusterDim.y = clusterY; |
153 | launchAttr[0].val.clusterDim.z = clusterZ; |
154 | config.numAttrs = 1; |
155 | config.attrs = launchAttr; |
156 | CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params)); |
157 | } |
158 | |
159 | void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX, |
160 | intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, |
161 | intptr_t blockZ, int64_t *stream, int32_t smem, void **params, |
162 | void **extra) { |
163 | dim3 gridDim; |
164 | gridDim.x = gridX; |
165 | gridDim.y = gridY; |
166 | gridDim.z = gridZ; |
167 | dim3 blockDim; |
168 | blockDim.x = blockX; |
169 | blockDim.y = blockY; |
170 | blockDim.z = blockZ; |
171 | unsigned nbNegGridDim{0}; |
172 | if (gridX < 0) { |
173 | ++nbNegGridDim; |
174 | } |
175 | if (gridY < 0) { |
176 | ++nbNegGridDim; |
177 | } |
178 | if (gridZ < 0) { |
179 | ++nbNegGridDim; |
180 | } |
181 | if (nbNegGridDim == 1) { |
182 | int maxBlocks, nbBlocks, dev, multiProcCount; |
183 | cudaError_t err1, err2; |
184 | nbBlocks = blockDim.x * blockDim.y * blockDim.z; |
185 | cudaGetDevice(&dev); |
186 | err1 = cudaDeviceGetAttribute( |
187 | &multiProcCount, cudaDevAttrMultiProcessorCount, dev); |
188 | err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor( |
189 | &maxBlocks, kernel, nbBlocks, smem); |
190 | if (err1 == cudaSuccess && err2 == cudaSuccess) { |
191 | maxBlocks = multiProcCount * maxBlocks; |
192 | } |
193 | if (maxBlocks > 0) { |
194 | if (gridX > 0) { |
195 | maxBlocks = maxBlocks / gridDim.x; |
196 | } |
197 | if (gridY > 0) { |
198 | maxBlocks = maxBlocks / gridDim.y; |
199 | } |
200 | if (gridZ > 0) { |
201 | maxBlocks = maxBlocks / gridDim.z; |
202 | } |
203 | if (maxBlocks < 1) { |
204 | maxBlocks = 1; |
205 | } |
206 | if (gridX < 0) { |
207 | gridDim.x = maxBlocks; |
208 | } |
209 | if (gridY < 0) { |
210 | gridDim.y = maxBlocks; |
211 | } |
212 | if (gridZ < 0) { |
213 | gridDim.z = maxBlocks; |
214 | } |
215 | } |
216 | } else if (nbNegGridDim > 1) { |
217 | Fortran::runtime::Terminator terminator{__FILE__, __LINE__}; |
218 | terminator.Crash("Too many invalid grid dimensions" ); |
219 | } |
220 | cudaStream_t defaultStream = 0; |
221 | CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(kernel, gridDim, blockDim, |
222 | params, smem, stream != nullptr ? (cudaStream_t)*stream : defaultStream)); |
223 | } |
224 | |
225 | } // extern "C" |
226 | |