1 | /* |
2 | * Copyright 2019 Advanced Micro Devices, Inc. |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included in |
12 | * all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | */ |
22 | #include <linux/module.h> |
23 | #include <linux/fdtable.h> |
24 | #include <linux/uaccess.h> |
25 | #include <linux/firmware.h> |
26 | #include "amdgpu.h" |
27 | #include "amdgpu_amdkfd.h" |
28 | #include "amdgpu_amdkfd_arcturus.h" |
29 | #include "amdgpu_reset.h" |
30 | #include "sdma0/sdma0_4_2_2_offset.h" |
31 | #include "sdma0/sdma0_4_2_2_sh_mask.h" |
32 | #include "sdma1/sdma1_4_2_2_offset.h" |
33 | #include "sdma1/sdma1_4_2_2_sh_mask.h" |
34 | #include "sdma2/sdma2_4_2_2_offset.h" |
35 | #include "sdma2/sdma2_4_2_2_sh_mask.h" |
36 | #include "sdma3/sdma3_4_2_2_offset.h" |
37 | #include "sdma3/sdma3_4_2_2_sh_mask.h" |
38 | #include "sdma4/sdma4_4_2_2_offset.h" |
39 | #include "sdma4/sdma4_4_2_2_sh_mask.h" |
40 | #include "sdma5/sdma5_4_2_2_offset.h" |
41 | #include "sdma5/sdma5_4_2_2_sh_mask.h" |
42 | #include "sdma6/sdma6_4_2_2_offset.h" |
43 | #include "sdma6/sdma6_4_2_2_sh_mask.h" |
44 | #include "sdma7/sdma7_4_2_2_offset.h" |
45 | #include "sdma7/sdma7_4_2_2_sh_mask.h" |
46 | #include "v9_structs.h" |
47 | #include "soc15.h" |
48 | #include "soc15d.h" |
49 | #include "amdgpu_amdkfd_gfx_v9.h" |
50 | #include "gfxhub_v1_0.h" |
51 | #include "mmhub_v9_4.h" |
52 | #include "gc/gc_9_0_offset.h" |
53 | #include "gc/gc_9_0_sh_mask.h" |
54 | |
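/*
 * DUMP_REG appends one register to the caller-provided snapshot array:
 * column 0 holds the register's byte address (dword offset << 2) and
 * column 1 the value read back via RREG32(). It relies on a local
 * index 'i' and a 'dump' pointer being in scope at the expansion site,
 * and warns once and stops rather than overrun HQD_N_REGS entries.
 */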
55 | #define HQD_N_REGS 56 |
56 | #define DUMP_REG(addr) do { \ |
57 | if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ |
58 | break; \ |
59 | (*dump)[i][0] = (addr) << 2; \ |
60 | (*dump)[i++][1] = RREG32(addr); \ |
61 | } while (0) |
62 | |
63 | static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) |
64 | { |
65 | return (struct v9_sdma_mqd *)mqd; |
66 | } |
67 | |
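/*
 * Arcturus has eight SDMA engines, each with its own register aperture.
 * Return the dword offset of the register block for a given user queue:
 * the engine's base (its RLC0_RB_CNTL register relocated to the start
 * of the aperture) plus queue_id times the per-queue register stride.
 * For example, queue 1 on engine 0 resolves to sdma_engine_reg_base +
 * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL).
 */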
68 | static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev, |
69 | unsigned int engine_id, |
70 | unsigned int queue_id) |
71 | { |
72 | uint32_t sdma_engine_reg_base = 0; |
73 | uint32_t sdma_rlc_reg_offset; |
74 | |
75 | switch (engine_id) { |
76 | default: |
		dev_warn(adev->dev,
			 "Invalid sdma engine id (%d), using engine id 0\n",
			 engine_id);
80 | fallthrough; |
81 | case 0: |
82 | sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0, |
83 | mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL; |
84 | break; |
85 | case 1: |
86 | sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA1, 0, |
87 | mmSDMA1_RLC0_RB_CNTL) - mmSDMA1_RLC0_RB_CNTL; |
88 | break; |
89 | case 2: |
90 | sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA2, 0, |
91 | mmSDMA2_RLC0_RB_CNTL) - mmSDMA2_RLC0_RB_CNTL; |
92 | break; |
93 | case 3: |
94 | sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA3, 0, |
95 | mmSDMA3_RLC0_RB_CNTL) - mmSDMA3_RLC0_RB_CNTL; |
96 | break; |
97 | case 4: |
98 | sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA4, 0, |
99 | mmSDMA4_RLC0_RB_CNTL) - mmSDMA4_RLC0_RB_CNTL; |
100 | break; |
101 | case 5: |
102 | sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA5, 0, |
103 | mmSDMA5_RLC0_RB_CNTL) - mmSDMA5_RLC0_RB_CNTL; |
104 | break; |
105 | case 6: |
106 | sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA6, 0, |
107 | mmSDMA6_RLC0_RB_CNTL) - mmSDMA6_RLC0_RB_CNTL; |
108 | break; |
109 | case 7: |
110 | sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA7, 0, |
111 | mmSDMA7_RLC0_RB_CNTL) - mmSDMA7_RLC0_RB_CNTL; |
112 | break; |
113 | } |
114 | |
115 | sdma_rlc_reg_offset = sdma_engine_reg_base |
116 | + queue_id * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL); |
117 | |
118 | pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n" , engine_id, |
119 | queue_id, sdma_rlc_reg_offset); |
120 | |
121 | return sdma_rlc_reg_offset; |
122 | } |
123 | |
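/*
 * Load an SDMA queue from its MQD image: disable the ring buffer, wait
 * up to 2s for the engine context to drain to idle, restore the
 * doorbell, read/write pointers and ring base registers, then re-enable
 * the ring. The hardware write pointer is seeded from the user-space
 * wptr location when read_user_wptr() can still copy it; otherwise the
 * saved read pointer is used so the queue restarts empty.
 */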
124 | int kgd_arcturus_hqd_sdma_load(struct amdgpu_device *adev, void *mqd, |
125 | uint32_t __user *wptr, struct mm_struct *mm) |
126 | { |
127 | struct v9_sdma_mqd *m; |
128 | uint32_t sdma_rlc_reg_offset; |
129 | unsigned long end_jiffies; |
130 | uint32_t data; |
131 | uint64_t data64; |
132 | uint64_t __user *wptr64 = (uint64_t __user *)wptr; |
133 | |
134 | m = get_sdma_mqd(mqd); |
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);
137 | |
138 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, |
139 | m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); |
140 | |
	end_jiffies = msecs_to_jiffies(2000) + jiffies;
142 | while (true) { |
143 | data = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS); |
144 | if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) |
145 | break; |
146 | if (time_after(jiffies, end_jiffies)) { |
147 | pr_err("SDMA RLC not idle in %s\n" , __func__); |
148 | return -ETIME; |
149 | } |
150 | usleep_range(min: 500, max: 1000); |
151 | } |
152 | |
153 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL_OFFSET, |
154 | m->sdmax_rlcx_doorbell_offset); |
155 | |
156 | data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, |
157 | ENABLE, 1); |
158 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, data); |
159 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR, |
160 | m->sdmax_rlcx_rb_rptr); |
161 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI, |
162 | m->sdmax_rlcx_rb_rptr_hi); |
163 | |
164 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); |
165 | if (read_user_wptr(mm, wptr64, data64)) { |
166 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR, |
167 | lower_32_bits(data64)); |
168 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI, |
169 | upper_32_bits(data64)); |
170 | } else { |
171 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR, |
172 | m->sdmax_rlcx_rb_rptr); |
173 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI, |
174 | m->sdmax_rlcx_rb_rptr_hi); |
175 | } |
176 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); |
177 | |
178 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); |
179 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE_HI, |
180 | m->sdmax_rlcx_rb_base_hi); |
181 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, |
182 | m->sdmax_rlcx_rb_rptr_addr_lo); |
183 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, |
184 | m->sdmax_rlcx_rb_rptr_addr_hi); |
185 | |
186 | data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, |
187 | RB_ENABLE, 1); |
188 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, data); |
189 | |
190 | return 0; |
191 | } |
192 | |
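/*
 * Snapshot the four contiguous ranges of per-queue SDMA registers
 * (RB_CNTL..DOORBELL, STATUS..CSA_ADDR_HI, IB_SUB_REMAIN..
 * MINOR_PTR_UPDATE and MIDCMD_DATA0..MIDCMD_CNTL) into a freshly
 * allocated (*dump)[HQD_N_REGS][2] array that the caller must kfree().
 */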
193 | int kgd_arcturus_hqd_sdma_dump(struct amdgpu_device *adev, |
194 | uint32_t engine_id, uint32_t queue_id, |
195 | uint32_t (**dump)[2], uint32_t *n_regs) |
196 | { |
197 | uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, |
198 | engine_id, queue_id); |
199 | uint32_t i = 0, reg; |
200 | #undef HQD_N_REGS |
201 | #define HQD_N_REGS (19+6+7+10) |
202 | |
	*dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
204 | if (*dump == NULL) |
205 | return -ENOMEM; |
206 | |
207 | for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) |
208 | DUMP_REG(sdma_rlc_reg_offset + reg); |
209 | for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) |
210 | DUMP_REG(sdma_rlc_reg_offset + reg); |
211 | for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; |
212 | reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) |
213 | DUMP_REG(sdma_rlc_reg_offset + reg); |
214 | for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; |
215 | reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) |
216 | DUMP_REG(sdma_rlc_reg_offset + reg); |
217 | |
218 | WARN_ON_ONCE(i != HQD_N_REGS); |
219 | *n_regs = i; |
220 | |
221 | return 0; |
222 | } |
223 | |
224 | bool kgd_arcturus_hqd_sdma_is_occupied(struct amdgpu_device *adev, |
225 | void *mqd) |
226 | { |
227 | struct v9_sdma_mqd *m; |
228 | uint32_t sdma_rlc_reg_offset; |
229 | uint32_t sdma_rlc_rb_cntl; |
230 | |
231 | m = get_sdma_mqd(mqd); |
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);
234 | |
235 | sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL); |
236 | |
237 | if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) |
238 | return true; |
239 | |
240 | return false; |
241 | } |
242 | |
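/*
 * Tear down an SDMA queue: clear RB_ENABLE, poll CONTEXT_STATUS until
 * the engine reports idle (bounded by utimeout, in ms), quiesce the
 * doorbell and save the current read pointer back into the MQD so a
 * later reload resumes where the queue left off.
 */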
243 | int kgd_arcturus_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd, |
244 | unsigned int utimeout) |
245 | { |
246 | struct v9_sdma_mqd *m; |
247 | uint32_t sdma_rlc_reg_offset; |
248 | uint32_t temp; |
249 | unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; |
250 | |
251 | m = get_sdma_mqd(mqd); |
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
					    m->sdma_queue_id);
254 | |
255 | temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL); |
256 | temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; |
257 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, temp); |
258 | |
259 | while (true) { |
260 | temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS); |
261 | if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) |
262 | break; |
263 | if (time_after(jiffies, end_jiffies)) { |
264 | pr_err("SDMA RLC not idle in %s\n" , __func__); |
265 | return -ETIME; |
266 | } |
267 | usleep_range(min: 500, max: 1000); |
268 | } |
269 | |
270 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, 0); |
271 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, |
272 | RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL) | |
273 | SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); |
274 | |
275 | m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR); |
276 | m->sdmax_rlcx_rb_rptr_hi = |
277 | RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI); |
278 | |
279 | return 0; |
280 | } |
281 | |
282 | /* |
283 | * Helper used to suspend/resume gfx pipe for image post process work to set |
284 | * barrier behaviour. |
285 | */ |
286 | static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool suspend) |
287 | { |
288 | int i, r = 0; |
289 | |
290 | for (i = 0; i < adev->gfx.num_compute_rings; i++) { |
291 | struct amdgpu_ring *ring = &adev->gfx.compute_ring[i]; |
292 | |
293 | if (!(ring && ring->sched.thread)) |
294 | continue; |
295 | |
		/* stop scheduler and drain ring. */
		if (suspend) {
			drm_sched_stop(&ring->sched, NULL);
299 | r = amdgpu_fence_wait_empty(ring); |
300 | if (r) |
301 | goto out; |
302 | } else { |
			drm_sched_start(&ring->sched, false);
304 | } |
305 | } |
306 | |
307 | out: |
308 | /* return on resume or failure to drain rings. */ |
309 | if (!suspend || r) |
310 | return r; |
311 | |
	return amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GFX);
313 | } |
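/*
 * Flip the SQ_CONFIG barrier wait-count behaviour and cache the
 * requested mode in adev->barrier_has_auto_waitcnt. KFD user queues are
 * suspended and the kernel compute schedulers stopped and drained first
 * so no compute work is in flight while the register changes, and
 * everything is resumed afterwards even if the update itself failed.
 * Skipped entirely while a GPU reset holds the reset_domain semaphore.
 */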
314 | |
315 | static void set_barrier_auto_waitcnt(struct amdgpu_device *adev, bool enable_waitcnt) |
316 | { |
317 | uint32_t data; |
318 | |
319 | WRITE_ONCE(adev->barrier_has_auto_waitcnt, enable_waitcnt); |
320 | |
	if (!down_read_trylock(&adev->reset_domain->sem))
		return;

	amdgpu_amdkfd_suspend(adev, false);

	if (suspend_resume_compute_scheduler(adev, true))
		goto out;
328 | |
329 | data = RREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG)); |
330 | data = REG_SET_FIELD(data, SQ_CONFIG, DISABLE_BARRIER_WAITCNT, |
331 | !enable_waitcnt); |
332 | WREG32(SOC15_REG_OFFSET(GC, 0, mmSQ_CONFIG), data); |
333 | |
334 | out: |
	suspend_resume_compute_scheduler(adev, false);

	amdgpu_amdkfd_resume(adev, false);

	up_read(&adev->reset_domain->sem);
340 | } |
341 | |
342 | /* |
343 | * restore_dbg_registers is ignored here but is a general interface requirement |
344 | * for devices that support GFXOFF and where the RLC save/restore list |
345 | * does not support hw registers for debugging i.e. the driver has to manually |
346 | * initialize the debug mode registers after it has disabled GFX off during the |
347 | * debug session. |
348 | */ |
349 | static uint32_t kgd_arcturus_enable_debug_trap(struct amdgpu_device *adev, |
350 | bool restore_dbg_registers, |
351 | uint32_t vmid) |
352 | { |
353 | mutex_lock(&adev->grbm_idx_mutex); |
354 | |
	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);

	set_barrier_auto_waitcnt(adev, true);
358 | |
359 | WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0); |
360 | |
	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);

	mutex_unlock(&adev->grbm_idx_mutex);
364 | |
365 | return 0; |
366 | } |
367 | |
368 | /* |
369 | * keep_trap_enabled is ignored here but is a general interface requirement |
370 | * for devices that support multi-process debugging where the performance |
371 | * overhead from trap temporary setup needs to be bypassed when the debug |
372 | * session has ended. |
373 | */ |
374 | static uint32_t kgd_arcturus_disable_debug_trap(struct amdgpu_device *adev, |
375 | bool keep_trap_enabled, |
376 | uint32_t vmid) |
377 | { |
378 | |
379 | mutex_lock(&adev->grbm_idx_mutex); |
380 | |
	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, true);

	set_barrier_auto_waitcnt(adev, false);
384 | |
385 | WREG32(SOC15_REG_OFFSET(GC, 0, mmSPI_GDBG_TRAP_MASK), 0); |
386 | |
	kgd_gfx_v9_set_wave_launch_stall(adev, vmid, false);

	mutex_unlock(&adev->grbm_idx_mutex);
390 | |
391 | return 0; |
392 | } |
393 | const struct kfd2kgd_calls arcturus_kfd2kgd = { |
394 | .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings, |
395 | .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping, |
396 | .init_interrupts = kgd_gfx_v9_init_interrupts, |
397 | .hqd_load = kgd_gfx_v9_hqd_load, |
398 | .hiq_mqd_load = kgd_gfx_v9_hiq_mqd_load, |
399 | .hqd_sdma_load = kgd_arcturus_hqd_sdma_load, |
400 | .hqd_dump = kgd_gfx_v9_hqd_dump, |
401 | .hqd_sdma_dump = kgd_arcturus_hqd_sdma_dump, |
402 | .hqd_is_occupied = kgd_gfx_v9_hqd_is_occupied, |
403 | .hqd_sdma_is_occupied = kgd_arcturus_hqd_sdma_is_occupied, |
404 | .hqd_destroy = kgd_gfx_v9_hqd_destroy, |
405 | .hqd_sdma_destroy = kgd_arcturus_hqd_sdma_destroy, |
406 | .wave_control_execute = kgd_gfx_v9_wave_control_execute, |
407 | .get_atc_vmid_pasid_mapping_info = |
408 | kgd_gfx_v9_get_atc_vmid_pasid_mapping_info, |
409 | .set_vm_context_page_table_base = |
410 | kgd_gfx_v9_set_vm_context_page_table_base, |
411 | .enable_debug_trap = kgd_arcturus_enable_debug_trap, |
412 | .disable_debug_trap = kgd_arcturus_disable_debug_trap, |
413 | .validate_trap_override_request = kgd_gfx_v9_validate_trap_override_request, |
414 | .set_wave_launch_trap_override = kgd_gfx_v9_set_wave_launch_trap_override, |
415 | .set_wave_launch_mode = kgd_gfx_v9_set_wave_launch_mode, |
416 | .set_address_watch = kgd_gfx_v9_set_address_watch, |
417 | .clear_address_watch = kgd_gfx_v9_clear_address_watch, |
418 | .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times, |
419 | .build_grace_period_packet_info = kgd_gfx_v9_build_grace_period_packet_info, |
420 | .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, |
421 | .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings |
422 | }; |
423 | |