1 | //===-- lib/cuda/allocator.cpp ----------------------------------*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "flang/Runtime/CUDA/allocator.h" |
10 | #include "flang-rt/runtime/allocator-registry.h" |
11 | #include "flang-rt/runtime/derived.h" |
12 | #include "flang-rt/runtime/descriptor.h" |
13 | #include "flang-rt/runtime/environment.h" |
14 | #include "flang-rt/runtime/lock.h" |
15 | #include "flang-rt/runtime/stat.h" |
16 | #include "flang-rt/runtime/terminator.h" |
17 | #include "flang-rt/runtime/type-info.h" |
18 | #include "flang/Common/ISO_Fortran_binding_wrapper.h" |
19 | #include "flang/Runtime/CUDA/common.h" |
20 | #include "flang/Support/Fortran.h" |
21 | |
22 | #include "cuda_runtime.h" |
23 | |
24 | namespace Fortran::runtime::cuda { |
25 | |
// Bookkeeping record for one stream-ordered device allocation made with
// cudaMallocAsync; the stream must be remembered so the block can later be
// released with cudaFreeAsync on the same stream (see CUFFreeDevice).
struct DeviceAllocation {
  void *ptr;           // device address returned by the allocation call
  std::size_t size;    // allocation size in bytes
  cudaStream_t stream; // stream the allocation was issued on
};
31 | |
32 | // Compare address values. nullptr will be sorted at the end of the array. |
33 | int compareDeviceAlloc(const void *a, const void *b) { |
34 | const DeviceAllocation *deva = (const DeviceAllocation *)a; |
35 | const DeviceAllocation *devb = (const DeviceAllocation *)b; |
36 | if (deva->ptr == nullptr && devb->ptr == nullptr) |
37 | return 0; |
38 | if (deva->ptr == nullptr) |
39 | return 1; |
40 | if (devb->ptr == nullptr) |
41 | return -1; |
42 | return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0); |
43 | } |
44 | |
// Dynamic array for tracking asynchronous allocations, kept sorted by
// address (see compareDeviceAlloc) so findAllocation can binary-search.
static DeviceAllocation *deviceAllocations = nullptr;
// Guards all access to the tracking array and its counters.
// NOTE(review): unlike the other file-scope variables this one is not
// `static`; confirm nothing outside this TU relies on its external linkage.
Lock lock;
static int maxDeviceAllocations{512}; // Initial size
static int numDeviceAllocations{0};
// Sentinel index returned by findAllocation when no entry matches.
static constexpr int allocNotFound{-1};
51 | |
52 | static void initAllocations() { |
53 | if (!deviceAllocations) { |
54 | deviceAllocations = static_cast<DeviceAllocation *>( |
55 | malloc(maxDeviceAllocations * sizeof(DeviceAllocation))); |
56 | if (!deviceAllocations) { |
57 | Terminator terminator{__FILE__, __LINE__}; |
58 | terminator.Crash("Failed to allocate tracking array" ); |
59 | } |
60 | } |
61 | } |
62 | |
63 | static void doubleAllocationArray() { |
64 | unsigned newSize = maxDeviceAllocations * 2; |
65 | DeviceAllocation *newArray = static_cast<DeviceAllocation *>( |
66 | realloc(deviceAllocations, newSize * sizeof(DeviceAllocation))); |
67 | if (!newArray) { |
68 | Terminator terminator{__FILE__, __LINE__}; |
69 | terminator.Crash("Failed to reallocate tracking array" ); |
70 | } |
71 | deviceAllocations = newArray; |
72 | maxDeviceAllocations = newSize; |
73 | } |
74 | |
75 | static unsigned findAllocation(void *ptr) { |
76 | if (numDeviceAllocations == 0) { |
77 | return allocNotFound; |
78 | } |
79 | |
80 | int left{0}; |
81 | int right{numDeviceAllocations - 1}; |
82 | |
83 | if (left == right) { |
84 | return left; |
85 | } |
86 | |
87 | while (left <= right) { |
88 | int mid = left + (right - left) / 2; |
89 | if (deviceAllocations[mid].ptr == ptr) { |
90 | return mid; |
91 | } |
92 | if (deviceAllocations[mid].ptr < ptr) { |
93 | left = mid + 1; |
94 | } else { |
95 | right = mid - 1; |
96 | } |
97 | } |
98 | return allocNotFound; |
99 | } |
100 | |
101 | static void insertAllocation(void *ptr, std::size_t size, cudaStream_t stream) { |
102 | CriticalSection critical{lock}; |
103 | initAllocations(); |
104 | if (numDeviceAllocations >= maxDeviceAllocations) { |
105 | doubleAllocationArray(); |
106 | } |
107 | deviceAllocations[numDeviceAllocations].ptr = ptr; |
108 | deviceAllocations[numDeviceAllocations].size = size; |
109 | deviceAllocations[numDeviceAllocations].stream = stream; |
110 | ++numDeviceAllocations; |
111 | qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation), |
112 | compareDeviceAlloc); |
113 | } |
114 | |
115 | static void eraseAllocation(int pos) { |
116 | deviceAllocations[pos].ptr = nullptr; |
117 | deviceAllocations[pos].size = 0; |
118 | deviceAllocations[pos].stream = (cudaStream_t)0; |
119 | qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation), |
120 | compareDeviceAlloc); |
121 | --numDeviceAllocations; |
122 | } |
123 | |
extern "C" {

// Register the four CUDA allocator/free pairs (pinned, device, managed,
// unified) with the runtime's allocator registry at their fixed positions so
// descriptor-based allocation can dispatch to them.
void RTDEF(CUFRegisterAllocator)() {
  allocatorRegistry.Register(
      kPinnedAllocatorPos, {&CUFAllocPinned, CUFFreePinned});
  allocatorRegistry.Register(
      kDeviceAllocatorPos, {&CUFAllocDevice, CUFFreeDevice});
  allocatorRegistry.Register(
      kManagedAllocatorPos, {&CUFAllocManaged, CUFFreeManaged});
  allocatorRegistry.Register(
      kUnifiedAllocatorPos, {&CUFAllocUnified, CUFFreeUnified});
}
}
137 | |
138 | void *CUFAllocPinned( |
139 | std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) { |
140 | void *p; |
141 | CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes)); |
142 | return p; |
143 | } |
144 | |
145 | void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); } |
146 | |
147 | void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t *asyncObject) { |
148 | void *p; |
149 | if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) { |
150 | CUDA_REPORT_IF_ERROR( |
151 | cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal)); |
152 | } else { |
153 | if (asyncObject == nullptr) { |
154 | CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes)); |
155 | } else { |
156 | CUDA_REPORT_IF_ERROR( |
157 | cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)*asyncObject)); |
158 | insertAllocation(p, sizeInBytes, (cudaStream_t)*asyncObject); |
159 | } |
160 | } |
161 | return p; |
162 | } |
163 | |
164 | void CUFFreeDevice(void *p) { |
165 | CriticalSection critical{lock}; |
166 | int pos = findAllocation(ptr: p); |
167 | if (pos >= 0) { |
168 | cudaStream_t stream = deviceAllocations[pos].stream; |
169 | eraseAllocation(pos); |
170 | CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream)); |
171 | } else { |
172 | CUDA_REPORT_IF_ERROR(cudaFree(p)); |
173 | } |
174 | } |
175 | |
176 | void *CUFAllocManaged( |
177 | std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) { |
178 | void *p; |
179 | CUDA_REPORT_IF_ERROR( |
180 | cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal)); |
181 | return reinterpret_cast<void *>(p); |
182 | } |
183 | |
184 | void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); } |
185 | |
// Allocate "unified" memory. Currently simply delegates to the managed
// allocator; a dedicated implementation may replace this later.
void *CUFAllocUnified(
    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
  // Call alloc managed for the time being.
  return CUFAllocManaged(sizeInBytes, asyncObject);
}
191 | |
// Free "unified" memory. Mirrors CUFAllocUnified by delegating to the
// managed deallocator.
void CUFFreeUnified(void *p) {
  // Call free managed for the time being.
  CUFFreeManaged(p);
}
196 | |
197 | } // namespace Fortran::runtime::cuda |
198 | |