// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2024 Intel Corporation
 */

#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/reboot.h>

#include "vpu_boot_api.h"
#include "ivpu_drv.h"
#include "ivpu_hw.h"
#include "ivpu_fw.h"
#include "ivpu_fw_log.h"
#include "ivpu_ipc.h"
#include "ivpu_job.h"
#include "ivpu_jsm_msg.h"
#include "ivpu_mmu.h"
#include "ivpu_pm.h"

static bool ivpu_disable_recovery;
module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");

static unsigned long ivpu_tdr_timeout_ms;
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");
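
/*
 * Illustrative usage (not part of this file): both parameters can be set at
 * module load time or, thanks to the 0644 permissions, changed at runtime
 * via sysfs. The module name (intel_vpu) is assumed here:
 *
 *   modprobe intel_vpu tdr_timeout_ms=5000
 *   echo 1 > /sys/module/intel_vpu/parameters/disable_recovery
 */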

#define PM_RESCHEDULE_LIMIT 5

static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;

	ivpu_cmdq_reset_all_contexts(vdev);
	ivpu_ipc_reset(vdev);
	ivpu_fw_load(vdev);
	fw->entry_point = fw->cold_boot_entry_point;
}

static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;
	struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem);

	if (!bp->save_restore_ret_address) {
		ivpu_pm_prepare_cold_boot(vdev);
		return;
	}

	ivpu_dbg(vdev, FW_BOOT, "Save/restore entry point %llx", bp->save_restore_ret_address);
	fw->entry_point = bp->save_restore_ret_address;
}
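
/*
 * Note: if the firmware has not published a save/restore return address in
 * the boot parameters (presumably because it never completed a D0i3 entry),
 * a warm boot is impossible and ivpu_pm_prepare_warm_boot() falls back to a
 * full cold boot above.
 */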

static int ivpu_suspend(struct ivpu_device *vdev)
{
	int ret;

	ivpu_prepare_for_reset(vdev);

	ret = ivpu_shutdown(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret);

	return ret;
}

static int ivpu_resume(struct ivpu_device *vdev)
{
	int ret;

retry:
	pci_restore_state(to_pci_dev(vdev->drm.dev));
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);

	ret = ivpu_hw_power_up(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_mmu_enable(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_boot(vdev);
	if (ret)
		goto err_mmu_disable;

	return 0;

err_mmu_disable:
	ivpu_mmu_disable(vdev);
err_power_down:
	ivpu_hw_power_down(vdev);
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);

	if (!ivpu_fw_is_cold_boot(vdev)) {
		ivpu_pm_prepare_cold_boot(vdev);
		goto retry;
	} else {
		ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
	}

	return ret;
}
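
/*
 * Note on the retry above: a failed warm boot is retried as a cold boot.
 * ivpu_pm_prepare_cold_boot() selects the cold-boot entry point, after
 * which ivpu_fw_is_cold_boot() presumably reports true, so a second failure
 * returns the error instead of looping again.
 */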

static void ivpu_pm_recovery_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work);
	struct ivpu_device *vdev = pm->vdev;
	char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};
	int ret;

	ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));

	ret = pm_runtime_resume_and_get(vdev->drm.dev);
	if (ret)
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);

	ivpu_fw_log_dump(vdev);

	atomic_inc(&vdev->pm->reset_counter);
	atomic_set(&vdev->pm->reset_pending, 1);
	down_write(&vdev->pm->reset_lock);

	ivpu_suspend(vdev);
	ivpu_pm_prepare_cold_boot(vdev);
	ivpu_jobs_abort_all(vdev);

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);

	up_write(&vdev->pm->reset_lock);
	atomic_set(&vdev->pm->reset_pending, 0);

	kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_put_autosuspend(vdev->drm.dev);
}
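
/*
 * Illustrative (assumed user-space tooling, not part of this driver): the
 * KOBJ_CHANGE uevent emitted above can be watched from user space, e.g.:
 *
 *   udevadm monitor --property
 *
 * which should report IVPU_PM_EVENT=IVPU_RECOVER on the device's "change"
 * event once recovery has completed.
 */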

void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
{
	ivpu_err(vdev, "Recovery triggered by %s\n", reason);

	if (ivpu_disable_recovery) {
		ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
		return;
	}

	if (ivpu_is_fpga(vdev)) {
		ivpu_err(vdev, "Recovery not available on FPGA\n");
		return;
	}

	/* Trigger recovery if it's not in progress */
	if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
		ivpu_hw_diagnose_failure(vdev);
		ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
		queue_work(system_long_wq, &vdev->pm->recovery_work);
	}
}
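
/*
 * Minimal sketch of the run-once guard used above: atomic_cmpxchg() returns
 * the previous value, so only the caller that flips reset_pending from 0 to
 * 1 queues the work; concurrent triggers become no-ops until the recovery
 * worker clears the flag again. Hypothetical standalone form:
 *
 *   if (atomic_cmpxchg(&pending, 0, 1) == 0)
 *           queue_work(system_long_wq, &recovery_work); // first trigger wins
 */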

static void ivpu_job_timeout_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
	struct ivpu_device *vdev = pm->vdev;

	ivpu_pm_trigger_recovery(vdev, "TDR");
}

void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
{
	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;

	/* No-op if already queued */
	queue_delayed_work(system_wq, &vdev->pm->job_timeout_work, msecs_to_jiffies(timeout_ms));
}

void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
{
	cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
}
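
/*
 * Illustrative pairing (the actual call sites live elsewhere in the driver
 * and are assumed here): the watchdog is armed when work is handed to the
 * NPU and disarmed once the hardware is idle again:
 *
 *   ivpu_start_job_timeout_detection(vdev);  // job submitted
 *   ...
 *   ivpu_stop_job_timeout_detection(vdev);   // last job completed
 *
 * Since queue_delayed_work() is a no-op while the work is still pending,
 * re-arming on every submission is safe.
 */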

int ivpu_pm_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	unsigned long timeout;

	ivpu_dbg(vdev, PM, "Suspend..\n");

	timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
	while (!ivpu_hw_is_idle(vdev)) {
		cond_resched();
		if (time_after_eq(jiffies, timeout)) {
			ivpu_err(vdev, "Failed to enter idle on system suspend\n");
			return -EBUSY;
		}
	}

	ivpu_jsm_pwr_d0i3_enter(vdev);

	ivpu_suspend(vdev);
	ivpu_pm_prepare_warm_boot(vdev);

	ivpu_dbg(vdev, PM, "Suspend done.\n");

	return 0;
}

int ivpu_pm_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	ivpu_dbg(vdev, PM, "Resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to resume: %d\n", ret);

	ivpu_dbg(vdev, PM, "Resume done.\n");

	return ret;
}

int ivpu_pm_runtime_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	bool hw_is_idle = true;
	int ret;

	drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
	drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));

	ivpu_dbg(vdev, PM, "Runtime suspend..\n");

	if (!ivpu_hw_is_idle(vdev) && vdev->pm->suspend_reschedule_counter) {
		ivpu_dbg(vdev, PM, "Failed to enter idle, rescheduling suspend, retries left %d\n",
			 vdev->pm->suspend_reschedule_counter);
		pm_schedule_suspend(dev, vdev->timeout.reschedule_suspend);
		vdev->pm->suspend_reschedule_counter--;
		return -EAGAIN;
	}

	if (!vdev->pm->suspend_reschedule_counter)
		hw_is_idle = false;
	else if (ivpu_jsm_pwr_d0i3_enter(vdev))
		hw_is_idle = false;

	ret = ivpu_suspend(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);

	if (!hw_is_idle) {
		ivpu_err(vdev, "NPU failed to enter idle, force suspended.\n");
		ivpu_fw_log_dump(vdev);
		ivpu_pm_prepare_cold_boot(vdev);
	} else {
		ivpu_pm_prepare_warm_boot(vdev);
	}

	vdev->pm->suspend_reschedule_counter = PM_RESCHEDULE_LIMIT;

	ivpu_dbg(vdev, PM, "Runtime suspend done.\n");

	return 0;
}
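
/*
 * Summary of the paths above: if the retry budget is exhausted, or the
 * firmware rejects the D0i3 entry request, the device is force-suspended
 * and marked for cold boot since its runtime state can no longer be
 * trusted; otherwise the preserved D0i3 context allows a warm boot on the
 * next resume.
 */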

int ivpu_pm_runtime_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	ivpu_dbg(vdev, PM, "Runtime resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);

	ivpu_dbg(vdev, PM, "Runtime resume done.\n");

	return ret;
}

int ivpu_rpm_get(struct ivpu_device *vdev)
{
	int ret;

	ret = pm_runtime_resume_and_get(vdev->drm.dev);
	if (!drm_WARN_ON(&vdev->drm, ret < 0))
		vdev->pm->suspend_reschedule_counter = PM_RESCHEDULE_LIMIT;

	return ret;
}

int ivpu_rpm_get_if_active(struct ivpu_device *vdev)
{
	int ret;

	ret = pm_runtime_get_if_in_use(vdev->drm.dev);
	drm_WARN_ON(&vdev->drm, ret < 0);

	return ret;
}

void ivpu_rpm_put(struct ivpu_device *vdev)
{
	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_put_autosuspend(vdev->drm.dev);
}
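
/*
 * Illustrative call-site pattern for the helpers above (hypothetical
 * function, assumed shape):
 *
 *   int ivpu_do_work(struct ivpu_device *vdev)
 *   {
 *           int ret = ivpu_rpm_get(vdev);   // powers the NPU up if needed
 *
 *           if (ret < 0)
 *                   return ret;
 *           ...                             // talk to the hardware
 *           ivpu_rpm_put(vdev);             // rearm autosuspend
 *           return 0;
 *   }
 */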

void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Pre-reset..\n");
	atomic_inc(&vdev->pm->reset_counter);
	atomic_set(&vdev->pm->reset_pending, 1);

	pm_runtime_get_sync(vdev->drm.dev);
	down_write(&vdev->pm->reset_lock);
	ivpu_prepare_for_reset(vdev);
	ivpu_hw_reset(vdev);
	ivpu_pm_prepare_cold_boot(vdev);
	ivpu_jobs_abort_all(vdev);
	ivpu_dbg(vdev, PM, "Pre-reset done.\n");
}

void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);
	int ret;

	ivpu_dbg(vdev, PM, "Post-reset..\n");
	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);
	up_write(&vdev->pm->reset_lock);
	atomic_set(&vdev->pm->reset_pending, 0);
	ivpu_dbg(vdev, PM, "Post-reset done.\n");

	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_put_autosuspend(vdev->drm.dev);
}
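
/*
 * Minimal sketch of how these callbacks are expected to be wired up (the
 * driver's actual pci_driver definition lives in another file): the PCI
 * core invokes them around a function level reset via
 * struct pci_error_handlers:
 *
 *   static const struct pci_error_handlers ivpu_err_handlers = {
 *           .reset_prepare = ivpu_pm_reset_prepare_cb,
 *           .reset_done    = ivpu_pm_reset_done_cb,
 *   };
 *
 *   static struct pci_driver ivpu_pci_driver = {
 *           ...
 *           .err_handler = &ivpu_err_handlers,
 *   };
 */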

void ivpu_pm_init(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;
	struct ivpu_pm_info *pm = vdev->pm;
	int delay;

	pm->vdev = vdev;
	pm->suspend_reschedule_counter = PM_RESCHEDULE_LIMIT;

	init_rwsem(&pm->reset_lock);
	atomic_set(&pm->reset_pending, 0);
	atomic_set(&pm->reset_counter, 0);

	INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
	INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);

	if (ivpu_disable_recovery)
		delay = -1;
	else
		delay = vdev->timeout.autosuspend;

	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, delay);

	ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
}
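
/*
 * Note: a negative autosuspend delay prevents runtime suspend altogether,
 * which is why disable_recovery maps to delay = -1 above. With a positive
 * delay, user space may still tune it per device (assumed sysfs path, with
 * <bdf> standing in for the device's bus address):
 *
 *   echo 10000 > /sys/bus/pci/devices/<bdf>/power/autosuspend_delay_ms
 */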

void ivpu_pm_cancel_recovery(struct ivpu_device *vdev)
{
	drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
	cancel_work_sync(&vdev->pm->recovery_work);
}

void ivpu_pm_enable(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;

	pm_runtime_set_active(dev);
	pm_runtime_allow(dev);
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put_autosuspend(dev);
}

void ivpu_pm_disable(struct ivpu_device *vdev)
{
	pm_runtime_get_noresume(vdev->drm.dev);
	pm_runtime_forbid(vdev->drm.dev);
}