1//===-- lib/cuda/allocator.cpp ----------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#include "flang/Runtime/CUDA/allocator.h"
10#include "flang-rt/runtime/allocator-registry.h"
11#include "flang-rt/runtime/derived.h"
12#include "flang-rt/runtime/descriptor.h"
13#include "flang-rt/runtime/environment.h"
14#include "flang-rt/runtime/lock.h"
15#include "flang-rt/runtime/stat.h"
16#include "flang-rt/runtime/terminator.h"
17#include "flang-rt/runtime/type-info.h"
18#include "flang/Common/ISO_Fortran_binding_wrapper.h"
19#include "flang/Runtime/CUDA/common.h"
20#include "flang/Support/Fortran.h"
21
22#include "cuda_runtime.h"
23
24namespace Fortran::runtime::cuda {
25
26struct DeviceAllocation {
27 void *ptr;
28 std::size_t size;
29 cudaStream_t stream;
30};
31
32// Compare address values. nullptr will be sorted at the end of the array.
33int compareDeviceAlloc(const void *a, const void *b) {
34 const DeviceAllocation *deva = (const DeviceAllocation *)a;
35 const DeviceAllocation *devb = (const DeviceAllocation *)b;
36 if (deva->ptr == nullptr && devb->ptr == nullptr)
37 return 0;
38 if (deva->ptr == nullptr)
39 return 1;
40 if (devb->ptr == nullptr)
41 return -1;
42 return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0);
43}
44
45// Dynamic array for tracking asynchronous allocations.
46static DeviceAllocation *deviceAllocations = nullptr;
47Lock lock;
48static int maxDeviceAllocations{512}; // Initial size
49static int numDeviceAllocations{0};
50static constexpr int allocNotFound{-1};
51
52static void initAllocations() {
53 if (!deviceAllocations) {
54 deviceAllocations = static_cast<DeviceAllocation *>(
55 malloc(maxDeviceAllocations * sizeof(DeviceAllocation)));
56 if (!deviceAllocations) {
57 Terminator terminator{__FILE__, __LINE__};
58 terminator.Crash("Failed to allocate tracking array");
59 }
60 }
61}
62
63static void doubleAllocationArray() {
64 unsigned newSize = maxDeviceAllocations * 2;
65 DeviceAllocation *newArray = static_cast<DeviceAllocation *>(
66 realloc(deviceAllocations, newSize * sizeof(DeviceAllocation)));
67 if (!newArray) {
68 Terminator terminator{__FILE__, __LINE__};
69 terminator.Crash("Failed to reallocate tracking array");
70 }
71 deviceAllocations = newArray;
72 maxDeviceAllocations = newSize;
73}
74
75static unsigned findAllocation(void *ptr) {
76 if (numDeviceAllocations == 0) {
77 return allocNotFound;
78 }
79
80 int left{0};
81 int right{numDeviceAllocations - 1};
82
83 if (left == right) {
84 return left;
85 }
86
87 while (left <= right) {
88 int mid = left + (right - left) / 2;
89 if (deviceAllocations[mid].ptr == ptr) {
90 return mid;
91 }
92 if (deviceAllocations[mid].ptr < ptr) {
93 left = mid + 1;
94 } else {
95 right = mid - 1;
96 }
97 }
98 return allocNotFound;
99}
100
101static void insertAllocation(void *ptr, std::size_t size, cudaStream_t stream) {
102 CriticalSection critical{lock};
103 initAllocations();
104 if (numDeviceAllocations >= maxDeviceAllocations) {
105 doubleAllocationArray();
106 }
107 deviceAllocations[numDeviceAllocations].ptr = ptr;
108 deviceAllocations[numDeviceAllocations].size = size;
109 deviceAllocations[numDeviceAllocations].stream = stream;
110 ++numDeviceAllocations;
111 qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
112 compareDeviceAlloc);
113}
114
115static void eraseAllocation(int pos) {
116 deviceAllocations[pos].ptr = nullptr;
117 deviceAllocations[pos].size = 0;
118 deviceAllocations[pos].stream = (cudaStream_t)0;
119 qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
120 compareDeviceAlloc);
121 --numDeviceAllocations;
122}
123
124extern "C" {
125
126void RTDEF(CUFRegisterAllocator)() {
127 allocatorRegistry.Register(
128 kPinnedAllocatorPos, {&CUFAllocPinned, CUFFreePinned});
129 allocatorRegistry.Register(
130 kDeviceAllocatorPos, {&CUFAllocDevice, CUFFreeDevice});
131 allocatorRegistry.Register(
132 kManagedAllocatorPos, {&CUFAllocManaged, CUFFreeManaged});
133 allocatorRegistry.Register(
134 kUnifiedAllocatorPos, {&CUFAllocUnified, CUFFreeUnified});
135}
136}
137
138void *CUFAllocPinned(
139 std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
140 void *p;
141 CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
142 return p;
143}
144
145void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
146
147void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t *asyncObject) {
148 void *p;
149 if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
150 CUDA_REPORT_IF_ERROR(
151 cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
152 } else {
153 if (asyncObject == nullptr) {
154 CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
155 } else {
156 CUDA_REPORT_IF_ERROR(
157 cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)*asyncObject));
158 insertAllocation(p, sizeInBytes, (cudaStream_t)*asyncObject);
159 }
160 }
161 return p;
162}
163
164void CUFFreeDevice(void *p) {
165 CriticalSection critical{lock};
166 int pos = findAllocation(ptr: p);
167 if (pos >= 0) {
168 cudaStream_t stream = deviceAllocations[pos].stream;
169 eraseAllocation(pos);
170 CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream));
171 } else {
172 CUDA_REPORT_IF_ERROR(cudaFree(p));
173 }
174}
175
176void *CUFAllocManaged(
177 std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
178 void *p;
179 CUDA_REPORT_IF_ERROR(
180 cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
181 return reinterpret_cast<void *>(p);
182}
183
184void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
185
186void *CUFAllocUnified(
187 std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
188 // Call alloc managed for the time being.
189 return CUFAllocManaged(sizeInBytes, asyncObject);
190}
191
192void CUFFreeUnified(void *p) {
193 // Call free managed for the time being.
194 CUFFreeManaged(p);
195}
196
197} // namespace Fortran::runtime::cuda
198

source code of flang-rt/lib/cuda/allocator.cpp