1 | //===-- lib/cuda/allocator.cpp ----------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "flang/Runtime/CUDA/allocator.h" |
10 | #include "flang-rt/runtime/allocator-registry.h" |
11 | #include "flang-rt/runtime/derived.h" |
12 | #include "flang-rt/runtime/descriptor.h" |
13 | #include "flang-rt/runtime/environment.h" |
14 | #include "flang-rt/runtime/lock.h" |
15 | #include "flang-rt/runtime/stat.h" |
16 | #include "flang-rt/runtime/terminator.h" |
17 | #include "flang-rt/runtime/type-info.h" |
18 | #include "flang/Common/ISO_Fortran_binding_wrapper.h" |
19 | #include "flang/Runtime/CUDA/common.h" |
20 | #include "flang/Support/Fortran.h" |
21 | |
22 | #include "cuda_runtime.h" |
23 | |
24 | namespace Fortran::runtime::cuda { |
25 | |
// Bookkeeping record for one stream-ordered device allocation made with
// cudaMallocAsync; the stream must be remembered so the block can later be
// released with cudaFreeAsync on the same stream (see CUFFreeDevice).
struct DeviceAllocation {
  void *ptr;           // device address returned by the allocation call
  std::size_t size;    // allocation size in bytes
  cudaStream_t stream; // stream the allocation was issued on
};
31 | |
32 | // Compare address values. nullptr will be sorted at the end of the array. |
33 | int compareDeviceAlloc(const void *a, const void *b) { |
34 | const DeviceAllocation *deva = (const DeviceAllocation *)a; |
35 | const DeviceAllocation *devb = (const DeviceAllocation *)b; |
36 | if (deva->ptr == nullptr && devb->ptr == nullptr) |
37 | return 0; |
38 | if (deva->ptr == nullptr) |
39 | return 1; |
40 | if (devb->ptr == nullptr) |
41 | return -1; |
42 | return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0); |
43 | } |
44 | |
// Dynamic array for tracking asynchronous allocations, kept sorted by
// address (see compareDeviceAlloc) so findAllocation can binary-search.
static DeviceAllocation *deviceAllocations = nullptr;
// Guards all access to the tracking array and its counters.
// NOTE(review): unlike the other file-scope variables this one is not
// `static`; confirm nothing outside this TU relies on its external linkage.
Lock lock;
static int maxDeviceAllocations{512}; // Initial size
static int numDeviceAllocations{0};
// Sentinel index returned by findAllocation when no entry matches.
static constexpr int allocNotFound{-1};
51 | |
52 | static void initAllocations() { |
53 | if (!deviceAllocations) { |
54 | deviceAllocations = static_cast<DeviceAllocation *>( |
55 | malloc(maxDeviceAllocations * sizeof(DeviceAllocation))); |
56 | if (!deviceAllocations) { |
57 | Terminator terminator{__FILE__, __LINE__}; |
58 | terminator.Crash("Failed to allocate tracking array" ); |
59 | } |
60 | } |
61 | } |
62 | |
63 | static void doubleAllocationArray() { |
64 | unsigned newSize = maxDeviceAllocations * 2; |
65 | DeviceAllocation *newArray = static_cast<DeviceAllocation *>( |
66 | realloc(deviceAllocations, newSize * sizeof(DeviceAllocation))); |
67 | if (!newArray) { |
68 | Terminator terminator{__FILE__, __LINE__}; |
69 | terminator.Crash("Failed to reallocate tracking array" ); |
70 | } |
71 | deviceAllocations = newArray; |
72 | maxDeviceAllocations = newSize; |
73 | } |
74 | |
75 | static unsigned findAllocation(void *ptr) { |
76 | if (numDeviceAllocations == 0) { |
77 | return allocNotFound; |
78 | } |
79 | |
80 | int left{0}; |
81 | int right{numDeviceAllocations - 1}; |
82 | |
83 | if (left == right) { |
84 | return left; |
85 | } |
86 | |
87 | while (left <= right) { |
88 | int mid = left + (right - left) / 2; |
89 | if (deviceAllocations[mid].ptr == ptr) { |
90 | return mid; |
91 | } |
92 | if (deviceAllocations[mid].ptr < ptr) { |
93 | left = mid + 1; |
94 | } else { |
95 | right = mid - 1; |
96 | } |
97 | } |
98 | return allocNotFound; |
99 | } |
100 | |
101 | static void insertAllocation(void *ptr, std::size_t size, cudaStream_t stream) { |
102 | CriticalSection critical{lock}; |
103 | initAllocations(); |
104 | if (numDeviceAllocations >= maxDeviceAllocations) { |
105 | doubleAllocationArray(); |
106 | } |
107 | deviceAllocations[numDeviceAllocations].ptr = ptr; |
108 | deviceAllocations[numDeviceAllocations].size = size; |
109 | deviceAllocations[numDeviceAllocations].stream = stream; |
110 | ++numDeviceAllocations; |
111 | qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation), |
112 | compareDeviceAlloc); |
113 | } |
114 | |
115 | static void eraseAllocation(int pos) { |
116 | deviceAllocations[pos].ptr = nullptr; |
117 | deviceAllocations[pos].size = 0; |
118 | deviceAllocations[pos].stream = (cudaStream_t)0; |
119 | qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation), |
120 | compareDeviceAlloc); |
121 | --numDeviceAllocations; |
122 | } |
123 | |
extern "C" {

// Register the four CUDA allocator/free pairs (pinned, device, managed,
// unified) with the runtime's allocator registry at their fixed positions so
// descriptor-based allocation can dispatch to them.
void RTDEF(CUFRegisterAllocator)() {
  allocatorRegistry.Register(
      kPinnedAllocatorPos, {&CUFAllocPinned, CUFFreePinned});
  allocatorRegistry.Register(
      kDeviceAllocatorPos, {&CUFAllocDevice, CUFFreeDevice});
  allocatorRegistry.Register(
      kManagedAllocatorPos, {&CUFAllocManaged, CUFFreeManaged});
  allocatorRegistry.Register(
      kUnifiedAllocatorPos, {&CUFAllocUnified, CUFFreeUnified});
}
}
137 | |
138 | void *CUFAllocPinned( |
139 | std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) { |
140 | void *p; |
141 | CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes)); |
142 | return p; |
143 | } |
144 | |
145 | void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); } |
146 | |
147 | void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t *asyncObject) { |
148 | void *p; |
149 | if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) { |
150 | CUDA_REPORT_IF_ERROR( |
151 | cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal)); |
152 | } else { |
153 | if (asyncObject == nullptr) { |
154 | CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes)); |
155 | } else { |
156 | CUDA_REPORT_IF_ERROR( |
157 | cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)*asyncObject)); |
158 | insertAllocation(p, sizeInBytes, (cudaStream_t)*asyncObject); |
159 | } |
160 | } |
161 | return p; |
162 | } |
163 | |
164 | void CUFFreeDevice(void *p) { |
165 | CriticalSection critical{lock}; |
166 | int pos = findAllocation(ptr: p); |
167 | if (pos >= 0) { |
168 | cudaStream_t stream = deviceAllocations[pos].stream; |
169 | eraseAllocation(pos); |
170 | CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream)); |
171 | } else { |
172 | CUDA_REPORT_IF_ERROR(cudaFree(p)); |
173 | } |
174 | } |
175 | |
176 | void *CUFAllocManaged( |
177 | std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) { |
178 | void *p; |
179 | CUDA_REPORT_IF_ERROR( |
180 | cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal)); |
181 | return reinterpret_cast<void *>(p); |
182 | } |
183 | |
184 | void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); } |
185 | |
// Allocate "unified" memory. Currently simply delegates to the managed
// allocator; a dedicated implementation may replace this later.
void *CUFAllocUnified(
    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
  // Call alloc managed for the time being.
  return CUFAllocManaged(sizeInBytes, asyncObject);
}
191 | |
// Free "unified" memory. Mirrors CUFAllocUnified by delegating to the
// managed deallocator.
void CUFFreeUnified(void *p) {
  // Call free managed for the time being.
  CUFFreeManaged(p);
}
196 | |
197 | } // namespace Fortran::runtime::cuda |
198 | |