kernel.cpp source code [flang-rt/lib/cuda/kernel.cpp]

1	//===-- lib/cuda/kernel.cpp -------------------------------------- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "flang/Runtime/CUDA/kernel.h"
10	#include "flang-rt/runtime/descriptor.h"
11	#include "flang-rt/runtime/terminator.h"
12	#include "flang/Runtime/CUDA/common.h"
13
14	#include "cuda_runtime.h"
15
16	extern "C" {
17
18	void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
19	intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
20	int64_t stream, int32_t smem, void* *params, void* **extra) {
21	dim3 gridDim;
22	gridDim.x = gridX;
23	gridDim.y = gridY;
24	gridDim.z = gridZ;
25	dim3 blockDim;
26	blockDim.x = blockX;
27	blockDim.y = blockY;
28	blockDim.z = blockZ;
29	unsigned nbNegGridDim{`0`};
30	if (gridX < `0`) {
31	++nbNegGridDim;
32	}
33	if (gridY < `0`) {
34	++nbNegGridDim;
35	}
36	if (gridZ < `0`) {
37	++nbNegGridDim;
38	}
39	if (nbNegGridDim == `1`) {
40	int maxBlocks, nbBlocks, dev, multiProcCount;
41	cudaError_t err1, err2;
42	nbBlocks = blockDim.x * blockDim.y * blockDim.z;
43	cudaGetDevice(&dev);
44	err1 = cudaDeviceGetAttribute(
45	&multiProcCount, cudaDevAttrMultiProcessorCount, dev);
46	err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
47	&maxBlocks, kernel, nbBlocks, smem);
48	if (err1 == cudaSuccess && err2 == cudaSuccess) {
49	maxBlocks = multiProcCount * maxBlocks;
50	}
51	if (maxBlocks > `0`) {
52	if (gridX > `0`) {
53	maxBlocks = maxBlocks / gridDim.x;
54	}
55	if (gridY > `0`) {
56	maxBlocks = maxBlocks / gridDim.y;
57	}
58	if (gridZ > `0`) {
59	maxBlocks = maxBlocks / gridDim.z;
60	}
61	if (maxBlocks < `1`) {
62	maxBlocks = `1`;
63	}
64	if (gridX < `0`) {
65	gridDim.x = maxBlocks;
66	}
67	if (gridY < `0`) {
68	gridDim.y = maxBlocks;
69	}
70	if (gridZ < `0`) {
71	gridDim.z = maxBlocks;
72	}
73	}
74	} else if (nbNegGridDim > `1`) {
75	Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
76	terminator.Crash("Too many invalid grid dimensions");
77	}
78	cudaStream_t defaultStream = `0`;
79	CUDA_REPORT_IF_ERROR(cudaLaunchKernel(kernel, gridDim, blockDim, params, smem,
80	stream != nullptr ? (cudaStream_t)(*stream) : defaultStream));
81	}
82
83	void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
84	intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
85	intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
86	int64_t stream, int32_t smem, void* *params, void* **extra) {
87	cudaLaunchConfig_t config;
88	config.gridDim.x = gridX;
89	config.gridDim.y = gridY;
90	config.gridDim.z = gridZ;
91	config.blockDim.x = blockX;
92	config.blockDim.y = blockY;
93	config.blockDim.z = blockZ;
94	unsigned nbNegGridDim{`0`};
95	if (gridX < `0`) {
96	++nbNegGridDim;
97	}
98	if (gridY < `0`) {
99	++nbNegGridDim;
100	}
101	if (gridZ < `0`) {
102	++nbNegGridDim;
103	}
104	if (nbNegGridDim == `1`) {
105	int maxBlocks, nbBlocks, dev, multiProcCount;
106	cudaError_t err1, err2;
107	nbBlocks = config.blockDim.x * config.blockDim.y * config.blockDim.z;
108	cudaGetDevice(&dev);
109	err1 = cudaDeviceGetAttribute(
110	&multiProcCount, cudaDevAttrMultiProcessorCount, dev);
111	err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
112	&maxBlocks, kernel, nbBlocks, smem);
113	if (err1 == cudaSuccess && err2 == cudaSuccess) {
114	maxBlocks = multiProcCount * maxBlocks;
115	}
116	if (maxBlocks > `0`) {
117	if (gridX > `0`) {
118	maxBlocks = maxBlocks / config.gridDim.x;
119	}
120	if (gridY > `0`) {
121	maxBlocks = maxBlocks / config.gridDim.y;
122	}
123	if (gridZ > `0`) {
124	maxBlocks = maxBlocks / config.gridDim.z;
125	}
126	if (maxBlocks < `1`) {
127	maxBlocks = `1`;
128	}
129	if (gridX < `0`) {
130	config.gridDim.x = maxBlocks;
131	}
132	if (gridY < `0`) {
133	config.gridDim.y = maxBlocks;
134	}
135	if (gridZ < `0`) {
136	config.gridDim.z = maxBlocks;
137	}
138	}
139	} else if (nbNegGridDim > `1`) {
140	Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
141	terminator.Crash("Too many invalid grid dimensions");
142	}
143	config.dynamicSmemBytes = smem;
144	if (stream != nullptr) {
145	config.stream = (cudaStream_t)(*stream);
146	} else {
147	config.stream = `0`;
148	}
149	cudaLaunchAttribute launchAttr[`1`];
150	launchAttr[`0`].id = cudaLaunchAttributeClusterDimension;
151	launchAttr[`0`].val.clusterDim.x = clusterX;
152	launchAttr[`0`].val.clusterDim.y = clusterY;
153	launchAttr[`0`].val.clusterDim.z = clusterZ;
154	config.numAttrs = `1`;
155	config.attrs = launchAttr;
156	CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params));
157	}
158
159	void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
160	intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
161	intptr_t blockZ, int64_t stream, int32_t smem, void* **params,
162	void **extra) {
163	dim3 gridDim;
164	gridDim.x = gridX;
165	gridDim.y = gridY;
166	gridDim.z = gridZ;
167	dim3 blockDim;
168	blockDim.x = blockX;
169	blockDim.y = blockY;
170	blockDim.z = blockZ;
171	unsigned nbNegGridDim{`0`};
172	if (gridX < `0`) {
173	++nbNegGridDim;
174	}
175	if (gridY < `0`) {
176	++nbNegGridDim;
177	}
178	if (gridZ < `0`) {
179	++nbNegGridDim;
180	}
181	if (nbNegGridDim == `1`) {
182	int maxBlocks, nbBlocks, dev, multiProcCount;
183	cudaError_t err1, err2;
184	nbBlocks = blockDim.x * blockDim.y * blockDim.z;
185	cudaGetDevice(&dev);
186	err1 = cudaDeviceGetAttribute(
187	&multiProcCount, cudaDevAttrMultiProcessorCount, dev);
188	err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
189	&maxBlocks, kernel, nbBlocks, smem);
190	if (err1 == cudaSuccess && err2 == cudaSuccess) {
191	maxBlocks = multiProcCount * maxBlocks;
192	}
193	if (maxBlocks > `0`) {
194	if (gridX > `0`) {
195	maxBlocks = maxBlocks / gridDim.x;
196	}
197	if (gridY > `0`) {
198	maxBlocks = maxBlocks / gridDim.y;
199	}
200	if (gridZ > `0`) {
201	maxBlocks = maxBlocks / gridDim.z;
202	}
203	if (maxBlocks < `1`) {
204	maxBlocks = `1`;
205	}
206	if (gridX < `0`) {
207	gridDim.x = maxBlocks;
208	}
209	if (gridY < `0`) {
210	gridDim.y = maxBlocks;
211	}
212	if (gridZ < `0`) {
213	gridDim.z = maxBlocks;
214	}
215	}
216	} else if (nbNegGridDim > `1`) {
217	Fortran::runtime::Terminator terminator{__FILE__, __LINE__};
218	terminator.Crash("Too many invalid grid dimensions");
219	}
220	cudaStream_t defaultStream = `0`;
221	CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(kernel, gridDim, blockDim,
222	params, smem, stream != nullptr ? (cudaStream_t)*stream : defaultStream));
223	}
224
225	} // extern "C"
226

source code of flang-rt/lib/cuda/kernel.cpp