1 | /* |
2 | * Copyright 2021 Advanced Micro Devices, Inc. |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included in |
12 | * all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | * |
22 | */ |
23 | |
24 | #ifndef __AMDGPU_RESET_H__ |
25 | #define __AMDGPU_RESET_H__ |
26 | |
27 | #include "amdgpu.h" |
28 | |
/*
 * Number of slots in the handler table that
 * amdgpu_reset_control.reset_handlers points to. for_each_handler() uses it
 * as the upper bound of its iteration.
 */
#define AMDGPU_RESET_MAX_HANDLERS 5
30 | |
/*
 * Reset flag identifiers.
 *
 * NOTE(review): the small consecutive values suggest these are bit numbers
 * for amdgpu_reset_context.flags rather than ready-made masks -- confirm
 * against the code that sets and tests them.
 */
enum AMDGPU_RESET_FLAGS {
	AMDGPU_NEED_FULL_RESET		= 0,
	AMDGPU_SKIP_HW_RESET		= 1,
	AMDGPU_RESET_FOR_DEVICE_REMOVE	= 2,
};
37 | |
38 | struct amdgpu_reset_context { |
39 | enum amd_reset_method method; |
40 | struct amdgpu_device *reset_req_dev; |
41 | struct amdgpu_job *job; |
42 | struct amdgpu_hive_info *hive; |
43 | struct list_head *reset_device_list; |
44 | unsigned long flags; |
45 | }; |
46 | |
47 | struct amdgpu_reset_handler { |
48 | enum amd_reset_method reset_method; |
49 | int (*prepare_env)(struct amdgpu_reset_control *reset_ctl, |
50 | struct amdgpu_reset_context *context); |
51 | int (*prepare_hwcontext)(struct amdgpu_reset_control *reset_ctl, |
52 | struct amdgpu_reset_context *context); |
53 | int (*perform_reset)(struct amdgpu_reset_control *reset_ctl, |
54 | struct amdgpu_reset_context *context); |
55 | int (*restore_hwcontext)(struct amdgpu_reset_control *reset_ctl, |
56 | struct amdgpu_reset_context *context); |
57 | int (*restore_env)(struct amdgpu_reset_control *reset_ctl, |
58 | struct amdgpu_reset_context *context); |
59 | |
60 | int (*do_reset)(struct amdgpu_device *adev); |
61 | }; |
62 | |
63 | struct amdgpu_reset_control { |
64 | void *handle; |
65 | struct work_struct reset_work; |
66 | struct mutex reset_lock; |
67 | struct amdgpu_reset_handler *( |
68 | *reset_handlers)[AMDGPU_RESET_MAX_HANDLERS]; |
69 | atomic_t in_reset; |
70 | enum amd_reset_method active_reset; |
71 | struct amdgpu_reset_handler *(*get_reset_handler)( |
72 | struct amdgpu_reset_control *reset_ctl, |
73 | struct amdgpu_reset_context *context); |
74 | void (*async_reset)(struct work_struct *work); |
75 | }; |
76 | |
77 | |
/*
 * Kind of reset domain, stored in amdgpu_reset_domain.type and passed to
 * amdgpu_reset_create_reset_domain().
 *
 * NOTE(review): presumably SINGLE_DEVICE covers a stand-alone GPU and
 * XGMI_HIVE a domain shared by the GPUs of one XGMI hive -- confirm.
 */
enum amdgpu_reset_domain_type {
	SINGLE_DEVICE,	/* first enumerator, value 0 */
	XGMI_HIVE	/* value 1 */
};
82 | |
83 | struct amdgpu_reset_domain { |
84 | struct kref refcount; |
85 | struct workqueue_struct *wq; |
86 | enum amdgpu_reset_domain_type type; |
87 | struct rw_semaphore sem; |
88 | atomic_t in_gpu_reset; |
89 | atomic_t reset_res; |
90 | }; |
91 | |
#ifdef CONFIG_DEV_COREDUMP

/* Version string of the coredump; only defined with CONFIG_DEV_COREDUMP. */
#define AMDGPU_COREDUMP_VERSION "1"

/*
 * struct amdgpu_coredump_info - data recorded for a device coredump
 *
 * Only available when CONFIG_DEV_COREDUMP is enabled.
 *
 * NOTE(review): the member remarks are inferred from names and types -- the
 * code that fills and consumes this structure is not part of this header.
 */
struct amdgpu_coredump_info {
	struct amdgpu_device	*adev;			/* device the dump belongs to */
	struct amdgpu_task_info	 reset_task_info;	/* presumably the task tied to the reset */
	struct timespec64	 reset_time;		/* presumably when the reset happened */
	bool			 reset_vram_lost;	/* presumably mirrors amdgpu_coredump()'s vram_lost */
};

#endif /* CONFIG_DEV_COREDUMP */
103 | |
/* Per-device init/fini pair; both return an int status. */
int amdgpu_reset_init(struct amdgpu_device *adev);
int amdgpu_reset_fini(struct amdgpu_device *adev);

/*
 * Device-level entry points taking a reset request context. Their names match
 * the prepare_hwcontext, perform_reset, prepare_env and restore_env callbacks
 * of struct amdgpu_reset_handler.
 * NOTE(review): presumably each forwards to the callback of the same name --
 * confirm in the implementation.
 */
int amdgpu_reset_prepare_hwcontext(
	struct amdgpu_device *adev,
	struct amdgpu_reset_context *reset_context);
int amdgpu_reset_perform_reset(
	struct amdgpu_device *adev,
	struct amdgpu_reset_context *reset_context);
int amdgpu_reset_prepare_env(
	struct amdgpu_device *adev,
	struct amdgpu_reset_context *reset_context);
int amdgpu_reset_restore_env(
	struct amdgpu_device *adev,
	struct amdgpu_reset_context *reset_context);

/* Creates a reset domain of the given type; @wq_name is a caller-supplied string. */
struct amdgpu_reset_domain *
amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
				 char *wq_name);

/* kref release callback handed to kref_put() by amdgpu_reset_put_reset_domain(). */
void amdgpu_reset_destroy_reset_domain(struct kref *ref);
122 | |
123 | static inline bool amdgpu_reset_get_reset_domain(struct amdgpu_reset_domain *domain) |
124 | { |
125 | return kref_get_unless_zero(kref: &domain->refcount) != 0; |
126 | } |
127 | |
128 | static inline void amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *domain) |
129 | { |
130 | if (domain) |
131 | kref_put(kref: &domain->refcount, release: amdgpu_reset_destroy_reset_domain); |
132 | } |
133 | |
134 | static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *domain, |
135 | struct work_struct *work) |
136 | { |
137 | return queue_work(wq: domain->wq, work); |
138 | } |
139 | |
/*
 * Lock/unlock pair operating on a reset domain.
 * NOTE(review): presumably these take and release amdgpu_reset_domain.sem --
 * confirm in the implementation.
 */
void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);

/*
 * Declared regardless of CONFIG_DEV_COREDUMP, unlike
 * struct amdgpu_coredump_info.
 */
void amdgpu_coredump(
	struct amdgpu_device *adev,
	bool vram_lost,
	struct amdgpu_reset_context *reset_context);
146 | |
/*
 * for_each_handler - walk the handler table of a reset control
 * @i:         integer lvalue used as the loop index
 * @handler:   handler-pointer lvalue that receives the current table entry
 * @reset_ctl: pointer to the reset control whose reset_handlers table is
 *             walked
 *
 * Visits (*reset_ctl->reset_handlers)[0], [1], ... in order and stops at the
 * first NULL entry or after AMDGPU_RESET_MAX_HANDLERS entries, whichever
 * comes first. The bound is tested before the slot is read, so a completely
 * full table is never indexed out of range.
 *
 * Every argument is parenthesized in the expansion so that expressions such
 * as &ctl can be passed. The arguments are evaluated on every iteration and
 * must therefore be free of side effects.
 */
#define for_each_handler(i, handler, reset_ctl)				\
	for ((i) = 0;							\
	     ((i) < AMDGPU_RESET_MAX_HANDLERS) &&			\
	     ((handler) = (*(reset_ctl)->reset_handlers)[(i)]);		\
	     ++(i))
151 | #endif |
152 | |