/*
 * Copyright 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
23 | |
#include <linux/devcoredump.h>
#include <generated/utsrelease.h>

#include "amdgpu_reset.h"
#include "aldebaran.h"
#include "sienna_cichlid.h"
#include "smu_v13_0_10.h"
31 | |
32 | int amdgpu_reset_init(struct amdgpu_device *adev) |
33 | { |
34 | int ret = 0; |
35 | |
36 | switch (amdgpu_ip_version(adev, ip: MP1_HWIP, inst: 0)) { |
37 | case IP_VERSION(13, 0, 2): |
38 | case IP_VERSION(13, 0, 6): |
39 | ret = aldebaran_reset_init(adev); |
40 | break; |
41 | case IP_VERSION(11, 0, 7): |
42 | ret = sienna_cichlid_reset_init(adev); |
43 | break; |
44 | case IP_VERSION(13, 0, 10): |
45 | ret = smu_v13_0_10_reset_init(adev); |
46 | break; |
47 | default: |
48 | break; |
49 | } |
50 | |
51 | return ret; |
52 | } |
53 | |
54 | int amdgpu_reset_fini(struct amdgpu_device *adev) |
55 | { |
56 | int ret = 0; |
57 | |
58 | switch (amdgpu_ip_version(adev, ip: MP1_HWIP, inst: 0)) { |
59 | case IP_VERSION(13, 0, 2): |
60 | case IP_VERSION(13, 0, 6): |
61 | ret = aldebaran_reset_fini(adev); |
62 | break; |
63 | case IP_VERSION(11, 0, 7): |
64 | ret = sienna_cichlid_reset_fini(adev); |
65 | break; |
66 | case IP_VERSION(13, 0, 10): |
67 | ret = smu_v13_0_10_reset_fini(adev); |
68 | break; |
69 | default: |
70 | break; |
71 | } |
72 | |
73 | return ret; |
74 | } |
75 | |
76 | int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev, |
77 | struct amdgpu_reset_context *reset_context) |
78 | { |
79 | struct amdgpu_reset_handler *reset_handler = NULL; |
80 | |
81 | if (adev->reset_cntl && adev->reset_cntl->get_reset_handler) |
82 | reset_handler = adev->reset_cntl->get_reset_handler( |
83 | adev->reset_cntl, reset_context); |
84 | if (!reset_handler) |
85 | return -EOPNOTSUPP; |
86 | |
87 | return reset_handler->prepare_hwcontext(adev->reset_cntl, |
88 | reset_context); |
89 | } |
90 | |
91 | int amdgpu_reset_perform_reset(struct amdgpu_device *adev, |
92 | struct amdgpu_reset_context *reset_context) |
93 | { |
94 | int ret; |
95 | struct amdgpu_reset_handler *reset_handler = NULL; |
96 | |
97 | if (adev->reset_cntl) |
98 | reset_handler = adev->reset_cntl->get_reset_handler( |
99 | adev->reset_cntl, reset_context); |
100 | if (!reset_handler) |
101 | return -EOPNOTSUPP; |
102 | |
103 | ret = reset_handler->perform_reset(adev->reset_cntl, reset_context); |
104 | if (ret) |
105 | return ret; |
106 | |
107 | return reset_handler->restore_hwcontext(adev->reset_cntl, |
108 | reset_context); |
109 | } |
110 | |
111 | |
112 | void amdgpu_reset_destroy_reset_domain(struct kref *ref) |
113 | { |
114 | struct amdgpu_reset_domain *reset_domain = container_of(ref, |
115 | struct amdgpu_reset_domain, |
116 | refcount); |
117 | if (reset_domain->wq) |
118 | destroy_workqueue(wq: reset_domain->wq); |
119 | |
120 | kvfree(addr: reset_domain); |
121 | } |
122 | |
123 | struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type, |
124 | char *wq_name) |
125 | { |
126 | struct amdgpu_reset_domain *reset_domain; |
127 | |
128 | reset_domain = kvzalloc(size: sizeof(struct amdgpu_reset_domain), GFP_KERNEL); |
129 | if (!reset_domain) { |
130 | DRM_ERROR("Failed to allocate amdgpu_reset_domain!" ); |
131 | return NULL; |
132 | } |
133 | |
134 | reset_domain->type = type; |
135 | kref_init(kref: &reset_domain->refcount); |
136 | |
137 | reset_domain->wq = create_singlethread_workqueue(wq_name); |
138 | if (!reset_domain->wq) { |
139 | DRM_ERROR("Failed to allocate wq for amdgpu_reset_domain!" ); |
140 | amdgpu_reset_put_reset_domain(domain: reset_domain); |
141 | return NULL; |
142 | |
143 | } |
144 | |
145 | atomic_set(v: &reset_domain->in_gpu_reset, i: 0); |
146 | atomic_set(v: &reset_domain->reset_res, i: 0); |
147 | init_rwsem(&reset_domain->sem); |
148 | |
149 | return reset_domain; |
150 | } |
151 | |
152 | void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain) |
153 | { |
154 | atomic_set(v: &reset_domain->in_gpu_reset, i: 1); |
155 | down_write(sem: &reset_domain->sem); |
156 | } |
157 | |
158 | |
159 | void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain) |
160 | { |
161 | atomic_set(v: &reset_domain->in_gpu_reset, i: 0); |
162 | up_write(sem: &reset_domain->sem); |
163 | } |
164 | |
#ifndef CONFIG_DEV_COREDUMP
166 | void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, |
167 | struct amdgpu_reset_context *reset_context) |
168 | { |
169 | } |
#else
171 | static ssize_t |
172 | amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count, |
173 | void *data, size_t datalen) |
174 | { |
175 | struct drm_printer p; |
176 | struct amdgpu_coredump_info *coredump = data; |
177 | struct drm_print_iterator iter; |
178 | int i; |
179 | |
180 | iter.data = buffer; |
181 | iter.offset = 0; |
182 | iter.start = offset; |
183 | iter.remain = count; |
184 | |
185 | p = drm_coredump_printer(iter: &iter); |
186 | |
187 | drm_printf(p: &p, f: "**** AMDGPU Device Coredump ****\n" ); |
188 | drm_printf(p: &p, f: "version: " AMDGPU_COREDUMP_VERSION "\n" ); |
189 | drm_printf(p: &p, f: "kernel: " UTS_RELEASE "\n" ); |
190 | drm_printf(p: &p, f: "module: " KBUILD_MODNAME "\n" ); |
191 | drm_printf(p: &p, f: "time: %lld.%09ld\n" , coredump->reset_time.tv_sec, |
192 | coredump->reset_time.tv_nsec); |
193 | |
194 | if (coredump->reset_task_info.pid) |
195 | drm_printf(p: &p, f: "process_name: %s PID: %d\n" , |
196 | coredump->reset_task_info.process_name, |
197 | coredump->reset_task_info.pid); |
198 | |
199 | if (coredump->ring) { |
200 | drm_printf(p: &p, f: "\nRing timed out details\n" ); |
201 | drm_printf(p: &p, f: "IP Type: %d Ring Name: %s\n" , |
202 | coredump->ring->funcs->type, |
203 | coredump->ring->name); |
204 | } |
205 | |
206 | if (coredump->reset_vram_lost) |
207 | drm_printf(p: &p, f: "VRAM is lost due to GPU reset!\n" ); |
208 | if (coredump->adev->reset_info.num_regs) { |
209 | drm_printf(p: &p, f: "AMDGPU register dumps:\nOffset: Value:\n" ); |
210 | |
211 | for (i = 0; i < coredump->adev->reset_info.num_regs; i++) |
212 | drm_printf(p: &p, f: "0x%08x: 0x%08x\n" , |
213 | coredump->adev->reset_info.reset_dump_reg_list[i], |
214 | coredump->adev->reset_info.reset_dump_reg_value[i]); |
215 | } |
216 | |
217 | return count - iter.remain; |
218 | } |
219 | |
/* devcoredump free callback: releases the amdgpu_coredump_info blob. */
static void amdgpu_devcoredump_free(void *data)
{
	kfree(data);
}
224 | |
225 | void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost, |
226 | struct amdgpu_reset_context *reset_context) |
227 | { |
228 | struct amdgpu_coredump_info *coredump; |
229 | struct drm_device *dev = adev_to_drm(adev); |
230 | struct amdgpu_job *job = reset_context->job; |
231 | struct drm_sched_job *s_job; |
232 | |
233 | coredump = kzalloc(size: sizeof(*coredump), GFP_NOWAIT); |
234 | |
235 | if (!coredump) { |
236 | DRM_ERROR("%s: failed to allocate memory for coredump\n" , __func__); |
237 | return; |
238 | } |
239 | |
240 | coredump->reset_vram_lost = vram_lost; |
241 | |
242 | if (reset_context->job && reset_context->job->vm) { |
243 | struct amdgpu_task_info *ti; |
244 | struct amdgpu_vm *vm = reset_context->job->vm; |
245 | |
246 | ti = amdgpu_vm_get_task_info_vm(vm); |
247 | if (ti) { |
248 | coredump->reset_task_info = *ti; |
249 | amdgpu_vm_put_task_info(task_info: ti); |
250 | } |
251 | } |
252 | |
253 | if (job) { |
254 | s_job = &job->base; |
255 | coredump->ring = to_amdgpu_ring(s_job->sched); |
256 | } |
257 | |
258 | coredump->adev = adev; |
259 | |
260 | ktime_get_ts64(ts: &coredump->reset_time); |
261 | |
262 | dev_coredumpm(dev: dev->dev, THIS_MODULE, data: coredump, datalen: 0, GFP_NOWAIT, |
263 | read: amdgpu_devcoredump_read, free: amdgpu_devcoredump_free); |
264 | } |
#endif /* CONFIG_DEV_COREDUMP */
266 | |