/*
 * Copyright 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include <linux/mmu_context.h>
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
#include "gc/gc_11_0_0_offset.h"
#include "gc/gc_11_0_0_sh_mask.h"
#include "oss/osssys_6_0_0_offset.h"
#include "oss/osssys_6_0_0_sh_mask.h"
#include "soc15_common.h"
#include "soc15d.h"
#include "v11_structs.h"
#include "soc21.h"
#include <uapi/linux/kfd_ioctl.h>

enum hqd_dequeue_request_type {
	NO_ACTION = 0,
	DRAIN_PIPE,
	RESET_WAVES,
	SAVE_WAVES
};

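/*
 * Select the MEC/pipe/queue/VMID in GRBM so that subsequent queue-indexed
 * register accesses hit the intended HQD. Must be paired with unlock_srbm().
 */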
static void lock_srbm(struct amdgpu_device *adev, uint32_t mec, uint32_t pipe,
		      uint32_t queue, uint32_t vmid)
{
	mutex_lock(&adev->srbm_mutex);
	soc21_grbm_select(adev, mec, pipe, queue, vmid);
}

static void unlock_srbm(struct amdgpu_device *adev)
{
	soc21_grbm_select(adev, 0, 0, 0, 0);
	mutex_unlock(&adev->srbm_mutex);
}

static void acquire_queue(struct amdgpu_device *adev, uint32_t pipe_id,
			  uint32_t queue_id)
{
	uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	lock_srbm(adev, mec, pipe, queue_id, 0);
}

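/*
 * Each pipe owns num_queue_per_pipe consecutive bits in the
 * CP_PQ_WPTR_POLL_CNTL1 queue mask; return the bit for this queue.
 */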
static uint64_t get_queue_mask(struct amdgpu_device *adev,
			       uint32_t pipe_id, uint32_t queue_id)
{
	unsigned int bit = pipe_id * adev->gfx.mec.num_queue_per_pipe +
			queue_id;

	return 1ull << bit;
}

static void release_queue(struct amdgpu_device *adev)
{
	unlock_srbm(adev);
}

static void program_sh_mem_settings_v11(struct amdgpu_device *adev, uint32_t vmid,
					uint32_t sh_mem_config,
					uint32_t sh_mem_ape1_base,
					uint32_t sh_mem_ape1_limit,
					uint32_t sh_mem_bases, uint32_t inst)
{
	lock_srbm(adev, 0, 0, 0, vmid);

	WREG32(SOC15_REG_OFFSET(GC, 0, regSH_MEM_CONFIG), sh_mem_config);
	WREG32(SOC15_REG_OFFSET(GC, 0, regSH_MEM_BASES), sh_mem_bases);

	unlock_srbm(adev);
}

static int set_pasid_vmid_mapping_v11(struct amdgpu_device *adev, unsigned int pasid,
				      unsigned int vmid, uint32_t inst)
{
	uint32_t value = pasid << IH_VMID_0_LUT__PASID__SHIFT;

	/* Mapping vmid to pasid also for IH block */
	pr_debug("mapping vmid %d -> pasid %d in IH block for GFX client\n",
		 vmid, pasid);
	WREG32(SOC15_REG_OFFSET(OSSSYS, 0, regIH_VMID_0_LUT) + vmid, value);

	return 0;
}

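/* Enable time-stamp and opcode-error interrupts for the given compute pipe. */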
static int init_interrupts_v11(struct amdgpu_device *adev, uint32_t pipe_id,
			       uint32_t inst)
{
	uint32_t mec;
	uint32_t pipe;

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	lock_srbm(adev, mec, pipe, 0, 0);

	WREG32_SOC15(GC, 0, regCPC_INT_CNTL,
		CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
		CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);

	unlock_srbm(adev);

	return 0;
}

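/*
 * SDMA engine register apertures are not evenly spaced, so derive the
 * per-queue register base from each engine's QUEUE0_RB_CNTL offset.
 */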
static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev,
					unsigned int engine_id,
					unsigned int queue_id)
{
	uint32_t sdma_engine_reg_base = 0;
	uint32_t sdma_rlc_reg_offset;

	switch (engine_id) {
	case 0:
		sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0,
				regSDMA0_QUEUE0_RB_CNTL) - regSDMA0_QUEUE0_RB_CNTL;
		break;
	case 1:
		sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA1, 0,
				regSDMA1_QUEUE0_RB_CNTL) - regSDMA0_QUEUE0_RB_CNTL;
		break;
	default:
		BUG();
	}

	sdma_rlc_reg_offset = sdma_engine_reg_base
		+ queue_id * (regSDMA0_QUEUE1_RB_CNTL - regSDMA0_QUEUE0_RB_CNTL);

	pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
		 queue_id, sdma_rlc_reg_offset);

	return sdma_rlc_reg_offset;
}

static inline struct v11_compute_mqd *get_mqd(void *mqd)
{
	return (struct v11_compute_mqd *)mqd;
}

static inline struct v11_sdma_mqd *get_sdma_mqd(void *mqd)
{
	return (struct v11_sdma_mqd *)mqd;
}

static int hqd_load_v11(struct amdgpu_device *adev, void *mqd, uint32_t pipe_id,
			uint32_t queue_id, uint32_t __user *wptr,
			uint32_t wptr_shift, uint32_t wptr_mask,
			struct mm_struct *mm, uint32_t inst)
{
	struct v11_compute_mqd *m;
	uint32_t *mqd_hqd;
	uint32_t reg, hqd_base, data;

	m = get_mqd(mqd);

	pr_debug("Load hqd of pipe %d queue %d\n", pipe_id, queue_id);
	acquire_queue(adev, pipe_id, queue_id);

	/* HIQ is set during driver init period with vmid set to 0 */
	if (m->cp_hqd_vmid == 0) {
		uint32_t value, mec, pipe;

		mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
		pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

		pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
			 mec, pipe, queue_id);
		value = RREG32(SOC15_REG_OFFSET(GC, 0, regRLC_CP_SCHEDULERS));
		value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1,
			((mec << 5) | (pipe << 3) | queue_id | 0x80));
		WREG32(SOC15_REG_OFFSET(GC, 0, regRLC_CP_SCHEDULERS), value);
	}

	/* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
	mqd_hqd = &m->cp_mqd_base_addr_lo;
	hqd_base = SOC15_REG_OFFSET(GC, 0, regCP_MQD_BASE_ADDR);

	for (reg = hqd_base;
	     reg <= SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_WPTR_HI); reg++)
		WREG32(reg, mqd_hqd[reg - hqd_base]);

	/* Activate doorbell logic before triggering WPTR poll. */
	data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
			     CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_DOORBELL_CONTROL), data);

	if (wptr) {
		/* Don't read wptr with get_user because the user
		 * context may not be accessible (if this function
		 * runs in a work queue). Instead trigger a one-shot
		 * polling read from memory in the CP. This assumes
		 * that wptr is GPU-accessible in the queue's VMID via
		 * ATC or SVM. WPTR==RPTR before starting the poll so
		 * the CP starts fetching new commands from the right
		 * place.
		 *
		 * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
		 * tricky. Assume that the queue didn't overflow. The
		 * number of valid bits in the 32-bit RPTR depends on
		 * the queue size. The remaining bits are taken from
		 * the saved 64-bit WPTR. If the WPTR wrapped, add the
		 * queue size.
		 */
		uint32_t queue_size =
			2 << REG_GET_FIELD(m->cp_hqd_pq_control,
					   CP_HQD_PQ_CONTROL, QUEUE_SIZE);
		uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);

		if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
			guessed_wptr += queue_size;
		guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
		guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;

		WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_WPTR_LO),
		       lower_32_bits(guessed_wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_WPTR_HI),
		       upper_32_bits(guessed_wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_WPTR_POLL_ADDR),
		       lower_32_bits((uint64_t)wptr));
		WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_WPTR_POLL_ADDR_HI),
		       upper_32_bits((uint64_t)wptr));
		pr_debug("%s setting CP_PQ_WPTR_POLL_CNTL1 to %x\n", __func__,
			 (uint32_t)get_queue_mask(adev, pipe_id, queue_id));
		WREG32(SOC15_REG_OFFSET(GC, 0, regCP_PQ_WPTR_POLL_CNTL1),
		       (uint32_t)get_queue_mask(adev, pipe_id, queue_id));
	}

	/* Start the EOP fetcher */
	WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_EOP_RPTR),
	       REG_SET_FIELD(m->cp_hqd_eop_rptr,
			     CP_HQD_EOP_RPTR, INIT_FETCHER, 1));

	data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
	WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_ACTIVE), data);

	release_queue(adev);

	return 0;
}

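/*
 * Map the HIQ through a KIQ MAP_QUEUES packet rather than by programming
 * the HQD registers directly.
 */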
static int hiq_mqd_load_v11(struct amdgpu_device *adev, void *mqd,
			    uint32_t pipe_id, uint32_t queue_id,
			    uint32_t doorbell_off, uint32_t inst)
{
	struct amdgpu_ring *kiq_ring = &adev->gfx.kiq[0].ring;
	struct v11_compute_mqd *m;
	uint32_t mec, pipe;
	int r;

	m = get_mqd(mqd);

	acquire_queue(adev, pipe_id, queue_id);

	mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
	pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);

	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
		 mec, pipe, queue_id);

	spin_lock(&adev->gfx.kiq[0].ring_lock);
	r = amdgpu_ring_alloc(kiq_ring, 7);
	if (r) {
		pr_err("Failed to alloc KIQ (%d).\n", r);
		goto out_unlock;
	}

	amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
	amdgpu_ring_write(kiq_ring,
			  PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
			  PACKET3_MAP_QUEUES_VMID(m->cp_hqd_vmid) | /* VMID */
			  PACKET3_MAP_QUEUES_QUEUE(queue_id) |
			  PACKET3_MAP_QUEUES_PIPE(pipe) |
			  PACKET3_MAP_QUEUES_ME((mec - 1)) |
			  PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /* queue_type: normal compute queue */
			  PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */
			  PACKET3_MAP_QUEUES_ENGINE_SEL(1) | /* engine_sel: hiq */
			  PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */
	amdgpu_ring_write(kiq_ring,
			  PACKET3_MAP_QUEUES_DOORBELL_OFFSET(doorbell_off));
	amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_lo);
	amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_hi);
	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_lo);
	amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
	amdgpu_ring_commit(kiq_ring);

out_unlock:
	spin_unlock(&adev->gfx.kiq[0].ring_lock);
	release_queue(adev);

	return r;
}

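/*
 * Snapshot the CP_MQD_BASE_ADDR..CP_HQD_PQ_WPTR_HI register range into a
 * kmalloc'd (offset, value) array returned to the caller.
 */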
static int hqd_dump_v11(struct amdgpu_device *adev,
			uint32_t pipe_id, uint32_t queue_id,
			uint32_t (**dump)[2], uint32_t *n_regs, uint32_t inst)
{
	uint32_t i = 0, reg;
#define HQD_N_REGS 56
#define DUMP_REG(addr) do {				\
		if (WARN_ON_ONCE(i >= HQD_N_REGS))	\
			break;				\
		(*dump)[i][0] = (addr) << 2;		\
		(*dump)[i++][1] = RREG32(addr);		\
	} while (0)

	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	acquire_queue(adev, pipe_id, queue_id);

	for (reg = SOC15_REG_OFFSET(GC, 0, regCP_MQD_BASE_ADDR);
	     reg <= SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_WPTR_HI); reg++)
		DUMP_REG(reg);

	release_queue(adev);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

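/*
 * Load an SDMA queue: disable the ring buffer, wait for the queue context
 * to go idle, restore doorbell and ring state from the MQD, then re-enable
 * the ring buffer.
 */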
static int hqd_sdma_load_v11(struct amdgpu_device *adev, void *mqd,
			     uint32_t __user *wptr, struct mm_struct *mm)
{
	struct v11_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	unsigned long end_jiffies;
	uint32_t data;
	uint64_t data64;
	uint64_t __user *wptr64 = (uint64_t __user *)wptr;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
						      m->sdma_queue_id);

	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_CNTL,
	       m->sdmax_rlcx_rb_cntl & (~SDMA0_QUEUE0_RB_CNTL__RB_ENABLE_MASK));

	end_jiffies = msecs_to_jiffies(2000) + jiffies;
	while (true) {
		data = RREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_CONTEXT_STATUS);
		if (data & SDMA0_QUEUE0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_DOORBELL_OFFSET,
	       m->sdmax_rlcx_doorbell_offset);

	data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_QUEUE0_DOORBELL,
			     ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_DOORBELL, data);
	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_RPTR,
	       m->sdmax_rlcx_rb_rptr);
	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_RPTR_HI,
	       m->sdmax_rlcx_rb_rptr_hi);

	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_MINOR_PTR_UPDATE, 1);
	if (read_user_wptr(mm, wptr64, data64)) {
		WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_WPTR,
		       lower_32_bits(data64));
		WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_WPTR_HI,
		       upper_32_bits(data64));
	} else {
		WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_WPTR,
		       m->sdmax_rlcx_rb_rptr);
		WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_WPTR_HI,
		       m->sdmax_rlcx_rb_rptr_hi);
	}
	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_MINOR_PTR_UPDATE, 0);

	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_BASE, m->sdmax_rlcx_rb_base);
	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_BASE_HI,
	       m->sdmax_rlcx_rb_base_hi);
	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_RPTR_ADDR_LO,
	       m->sdmax_rlcx_rb_rptr_addr_lo);
	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_RPTR_ADDR_HI,
	       m->sdmax_rlcx_rb_rptr_addr_hi);

	data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_QUEUE0_RB_CNTL,
			     RB_ENABLE, 1);
	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_CNTL, data);

	return 0;
}

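/* Dump the five per-queue SDMA register ranges counted in HQD_N_REGS below. */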
static int hqd_sdma_dump_v11(struct amdgpu_device *adev,
			     uint32_t engine_id, uint32_t queue_id,
			     uint32_t (**dump)[2], uint32_t *n_regs)
{
	uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
			engine_id, queue_id);
	uint32_t i = 0, reg;
#undef HQD_N_REGS
#define HQD_N_REGS (7+11+1+12+12)

	*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
	if (*dump == NULL)
		return -ENOMEM;

	for (reg = regSDMA0_QUEUE0_RB_CNTL;
	     reg <= regSDMA0_QUEUE0_RB_WPTR_HI; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = regSDMA0_QUEUE0_RB_RPTR_ADDR_HI;
	     reg <= regSDMA0_QUEUE0_DOORBELL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = regSDMA0_QUEUE0_DOORBELL_LOG;
	     reg <= regSDMA0_QUEUE0_DOORBELL_LOG; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = regSDMA0_QUEUE0_DOORBELL_OFFSET;
	     reg <= regSDMA0_QUEUE0_RB_PREEMPT; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);
	for (reg = regSDMA0_QUEUE0_MIDCMD_DATA0;
	     reg <= regSDMA0_QUEUE0_MIDCMD_CNTL; reg++)
		DUMP_REG(sdma_rlc_reg_offset + reg);

	WARN_ON_ONCE(i != HQD_N_REGS);
	*n_regs = i;

	return 0;
}

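/*
 * The HQD is considered occupied by this queue if it is active and its
 * ring base matches queue_address (stored in 256-byte units).
 */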
static bool hqd_is_occupied_v11(struct amdgpu_device *adev, uint64_t queue_address,
				uint32_t pipe_id, uint32_t queue_id, uint32_t inst)
{
	uint32_t act;
	bool retval = false;
	uint32_t low, high;

	acquire_queue(adev, pipe_id, queue_id);
	act = RREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_ACTIVE));
	if (act) {
		low = lower_32_bits(queue_address >> 8);
		high = upper_32_bits(queue_address >> 8);

		if (low == RREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_BASE)) &&
		    high == RREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_BASE_HI)))
			retval = true;
	}
	release_queue(adev);
	return retval;
}

static bool hqd_sdma_is_occupied_v11(struct amdgpu_device *adev, void *mqd)
{
	struct v11_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t sdma_rlc_rb_cntl;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
						      m->sdma_queue_id);

	sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_CNTL);

	if (sdma_rlc_rb_cntl & SDMA0_QUEUE0_RB_CNTL__RB_ENABLE_MASK)
		return true;

	return false;
}

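/*
 * Preempt the queue: issue a dequeue request of the requested type, then
 * poll CP_HQD_ACTIVE until the HQD is released or the timeout expires.
 */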
static int hqd_destroy_v11(struct amdgpu_device *adev, void *mqd,
			   enum kfd_preempt_type reset_type,
			   unsigned int utimeout, uint32_t pipe_id,
			   uint32_t queue_id, uint32_t inst)
{
	enum hqd_dequeue_request_type type;
	unsigned long end_jiffies;
	uint32_t temp;
	struct v11_compute_mqd *m = get_mqd(mqd);

	acquire_queue(adev, pipe_id, queue_id);

	if (m->cp_hqd_vmid == 0)
		WREG32_FIELD15_PREREG(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);

	switch (reset_type) {
	case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
		type = DRAIN_PIPE;
		break;
	case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
		type = RESET_WAVES;
		break;
	default:
		type = DRAIN_PIPE;
		break;
	}

	WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_DEQUEUE_REQUEST), type);

	end_jiffies = (utimeout * HZ / 1000) + jiffies;
	while (true) {
		temp = RREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_ACTIVE));
		if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("cp queue pipe %d queue %d preemption failed\n",
			       pipe_id, queue_id);
			release_queue(adev);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	release_queue(adev);
	return 0;
}

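/*
 * Stop an SDMA queue: disable the ring buffer, wait for idle, then save
 * the read pointers back into the MQD so the queue can be restored later.
 */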
static int hqd_sdma_destroy_v11(struct amdgpu_device *adev, void *mqd,
				unsigned int utimeout)
{
	struct v11_sdma_mqd *m;
	uint32_t sdma_rlc_reg_offset;
	uint32_t temp;
	unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;

	m = get_sdma_mqd(mqd);
	sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
						      m->sdma_queue_id);

	temp = RREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_CNTL);
	temp = temp & ~SDMA0_QUEUE0_RB_CNTL__RB_ENABLE_MASK;
	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_CNTL, temp);

	while (true) {
		temp = RREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_CONTEXT_STATUS);
		if (temp & SDMA0_QUEUE0_CONTEXT_STATUS__IDLE_MASK)
			break;
		if (time_after(jiffies, end_jiffies)) {
			pr_err("SDMA RLC not idle in %s\n", __func__);
			return -ETIME;
		}
		usleep_range(500, 1000);
	}

	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_DOORBELL, 0);
	WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_CNTL,
	       RREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_CNTL) |
	       SDMA0_QUEUE0_RB_CNTL__RB_ENABLE_MASK);

	m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_RPTR);
	m->sdmax_rlcx_rb_rptr_hi =
		RREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_RPTR_HI);

	return 0;
}

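/*
 * Issue an SQ_CMD to the waves selected by gfx_index_val, then restore
 * GRBM_GFX_INDEX to broadcast mode.
 */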
static int wave_control_execute_v11(struct amdgpu_device *adev,
				    uint32_t gfx_index_val,
				    uint32_t sq_cmd, uint32_t inst)
{
	uint32_t data = 0;

	mutex_lock(&adev->grbm_idx_mutex);

	WREG32(SOC15_REG_OFFSET(GC, 0, regGRBM_GFX_INDEX), gfx_index_val);
	WREG32(SOC15_REG_OFFSET(GC, 0, regSQ_CMD), sq_cmd);

	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		INSTANCE_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SA_BROADCAST_WRITES, 1);
	data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
		SE_BROADCAST_WRITES, 1);

	WREG32(SOC15_REG_OFFSET(GC, 0, regGRBM_GFX_INDEX), data);
	mutex_unlock(&adev->grbm_idx_mutex);

	return 0;
}

static void set_vm_context_page_table_base_v11(struct amdgpu_device *adev,
		uint32_t vmid, uint64_t page_table_base)
{
	if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
		pr_err("trying to set page table base for wrong VMID %u\n",
		       vmid);
		return;
	}

	/* SDMA is on gfxhub as well for gfx11 adapters */
	adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
}

/*
 * Returns TRAP_EN, EXCP_EN and EXCP_REPLACE.
 *
 * restore_dbg_registers is ignored here but is a general interface requirement
 * for devices that support GFXOFF and where the RLC save/restore list
 * does not support hw registers for debugging i.e. the driver has to manually
 * initialize the debug mode registers after it has disabled GFX off during the
 * debug session.
 */
static uint32_t kgd_gfx_v11_enable_debug_trap(struct amdgpu_device *adev,
					      bool restore_dbg_registers,
					      uint32_t vmid)
{
	uint32_t data = 0;

	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);

	return data;
}

/* Returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
static uint32_t kgd_gfx_v11_disable_debug_trap(struct amdgpu_device *adev,
					       bool keep_trap_enabled,
					       uint32_t vmid)
{
	uint32_t data = 0;

	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, 0);
	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, 0);

	return data;
}

static int kgd_gfx_v11_validate_trap_override_request(struct amdgpu_device *adev,
						      uint32_t trap_override,
						      uint32_t *trap_mask_supported)
{
	*trap_mask_supported &= KFD_DBG_TRAP_MASK_FP_INVALID |
				KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL |
				KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO |
				KFD_DBG_TRAP_MASK_FP_OVERFLOW |
				KFD_DBG_TRAP_MASK_FP_UNDERFLOW |
				KFD_DBG_TRAP_MASK_FP_INEXACT |
				KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO |
				KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH |
				KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION;

	if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 4))
		*trap_mask_supported |= KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_START |
					KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_END;

	if (trap_override != KFD_DBG_TRAP_OVERRIDE_OR &&
	    trap_override != KFD_DBG_TRAP_OVERRIDE_REPLACE)
		return -EPERM;

	return 0;
}

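/*
 * Translate KFD_DBG_TRAP_MASK_* bits into the SPI_GDBG_PER_VMID_CNTL
 * layout (EXCP_EN plus the trap-on-wave-start/end fields).
 */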
static uint32_t trap_mask_map_sw_to_hw(uint32_t mask)
{
	uint32_t trap_on_start = (mask & KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_START) ? 1 : 0;
	uint32_t trap_on_end = (mask & KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_END) ? 1 : 0;
	uint32_t excp_en = mask & (KFD_DBG_TRAP_MASK_FP_INVALID |
			KFD_DBG_TRAP_MASK_FP_INPUT_DENORMAL |
			KFD_DBG_TRAP_MASK_FP_DIVIDE_BY_ZERO |
			KFD_DBG_TRAP_MASK_FP_OVERFLOW |
			KFD_DBG_TRAP_MASK_FP_UNDERFLOW |
			KFD_DBG_TRAP_MASK_FP_INEXACT |
			KFD_DBG_TRAP_MASK_INT_DIVIDE_BY_ZERO |
			KFD_DBG_TRAP_MASK_DBG_ADDRESS_WATCH |
			KFD_DBG_TRAP_MASK_DBG_MEMORY_VIOLATION);
	uint32_t ret;

	ret = REG_SET_FIELD(0, SPI_GDBG_PER_VMID_CNTL, EXCP_EN, excp_en);
	ret = REG_SET_FIELD(ret, SPI_GDBG_PER_VMID_CNTL, TRAP_ON_START, trap_on_start);
	ret = REG_SET_FIELD(ret, SPI_GDBG_PER_VMID_CNTL, TRAP_ON_END, trap_on_end);

	return ret;
}

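/* Inverse of trap_mask_map_sw_to_hw(). */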
static uint32_t trap_mask_map_hw_to_sw(uint32_t mask)
{
	uint32_t ret = REG_GET_FIELD(mask, SPI_GDBG_PER_VMID_CNTL, EXCP_EN);

	if (REG_GET_FIELD(mask, SPI_GDBG_PER_VMID_CNTL, TRAP_ON_START))
		ret |= KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_START;

	if (REG_GET_FIELD(mask, SPI_GDBG_PER_VMID_CNTL, TRAP_ON_END))
		ret |= KFD_DBG_TRAP_MASK_TRAP_ON_WAVE_END;

	return ret;
}

/* Returns TRAP_EN, EXCP_EN and EXCP_REPLACE. */
static uint32_t kgd_gfx_v11_set_wave_launch_trap_override(struct amdgpu_device *adev,
							  uint32_t vmid,
							  uint32_t trap_override,
							  uint32_t trap_mask_bits,
							  uint32_t trap_mask_request,
							  uint32_t *trap_mask_prev,
							  uint32_t kfd_dbg_trap_cntl_prev)
{
	uint32_t data = 0;

	*trap_mask_prev = trap_mask_map_hw_to_sw(kfd_dbg_trap_cntl_prev);

	data = (trap_mask_bits & trap_mask_request) | (*trap_mask_prev & ~trap_mask_request);
	data = trap_mask_map_sw_to_hw(data);

	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, TRAP_EN, 1);
	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, EXCP_REPLACE, trap_override);

	return data;
}

static uint32_t kgd_gfx_v11_set_wave_launch_mode(struct amdgpu_device *adev,
						 uint8_t wave_launch_mode,
						 uint32_t vmid)
{
	uint32_t data = 0;

	data = REG_SET_FIELD(data, SPI_GDBG_PER_VMID_CNTL, LAUNCH_MODE, wave_launch_mode);

	return data;
}

#define TCP_WATCH_STRIDE (regTCP_WATCH1_ADDR_H - regTCP_WATCH0_ADDR_H)
static uint32_t kgd_gfx_v11_set_address_watch(struct amdgpu_device *adev,
					      uint64_t watch_address,
					      uint32_t watch_address_mask,
					      uint32_t watch_id,
					      uint32_t watch_mode,
					      uint32_t debug_vmid,
					      uint32_t inst)
{
	uint32_t watch_address_high;
	uint32_t watch_address_low;
	uint32_t watch_address_cntl;

	watch_address_cntl = 0;
	watch_address_low = lower_32_bits(watch_address);
	watch_address_high = upper_32_bits(watch_address) & 0xffff;

	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
			TCP_WATCH0_CNTL,
			MODE,
			watch_mode);

	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
			TCP_WATCH0_CNTL,
			MASK,
			watch_address_mask >> 7);

	watch_address_cntl = REG_SET_FIELD(watch_address_cntl,
			TCP_WATCH0_CNTL,
			VALID,
			1);

	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_H) +
			(watch_id * TCP_WATCH_STRIDE)),
			watch_address_high);

	WREG32_RLC((SOC15_REG_OFFSET(GC, 0, regTCP_WATCH0_ADDR_L) +
			(watch_id * TCP_WATCH_STRIDE)),
			watch_address_low);

	return watch_address_cntl;
}

static uint32_t kgd_gfx_v11_clear_address_watch(struct amdgpu_device *adev,
						uint32_t watch_id)
{
	return 0;
}

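/* KGD entry points exported to the KFD for GFX v11 devices. */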
const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
	.program_sh_mem_settings = program_sh_mem_settings_v11,
	.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
	.init_interrupts = init_interrupts_v11,
	.hqd_load = hqd_load_v11,
	.hiq_mqd_load = hiq_mqd_load_v11,
	.hqd_sdma_load = hqd_sdma_load_v11,
	.hqd_dump = hqd_dump_v11,
	.hqd_sdma_dump = hqd_sdma_dump_v11,
	.hqd_is_occupied = hqd_is_occupied_v11,
	.hqd_sdma_is_occupied = hqd_sdma_is_occupied_v11,
	.hqd_destroy = hqd_destroy_v11,
	.hqd_sdma_destroy = hqd_sdma_destroy_v11,
	.wave_control_execute = wave_control_execute_v11,
	.get_atc_vmid_pasid_mapping_info = NULL,
	.set_vm_context_page_table_base = set_vm_context_page_table_base_v11,
	.enable_debug_trap = kgd_gfx_v11_enable_debug_trap,
	.disable_debug_trap = kgd_gfx_v11_disable_debug_trap,
	.validate_trap_override_request = kgd_gfx_v11_validate_trap_override_request,
	.set_wave_launch_trap_override = kgd_gfx_v11_set_wave_launch_trap_override,
	.set_wave_launch_mode = kgd_gfx_v11_set_wave_launch_mode,
	.set_address_watch = kgd_gfx_v11_set_address_watch,
	.clear_address_watch = kgd_gfx_v11_clear_address_watch
};