1 | /* |
2 | * Copyright 2014-2018 Advanced Micro Devices, Inc. |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included in |
12 | * all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | */ |
22 | #include "amdgpu.h" |
23 | #include "amdgpu_amdkfd.h" |
24 | #include "gc/gc_9_0_offset.h" |
25 | #include "gc/gc_9_0_sh_mask.h" |
26 | #include "vega10_enum.h" |
27 | #include "sdma0/sdma0_4_0_offset.h" |
28 | #include "sdma0/sdma0_4_0_sh_mask.h" |
29 | #include "sdma1/sdma1_4_0_offset.h" |
30 | #include "sdma1/sdma1_4_0_sh_mask.h" |
31 | #include "athub/athub_1_0_offset.h" |
32 | #include "athub/athub_1_0_sh_mask.h" |
33 | #include "oss/osssys_4_0_offset.h" |
34 | #include "oss/osssys_4_0_sh_mask.h" |
35 | #include "soc15_common.h" |
36 | #include "v9_structs.h" |
37 | #include "soc15.h" |
38 | #include "soc15d.h" |
39 | #include "gfx_v9_0.h" |
40 | #include "amdgpu_amdkfd_gfx_v9.h" |
41 | |
42 | enum hqd_dequeue_request_type { |
43 | NO_ACTION = 0, |
44 | DRAIN_PIPE, |
45 | RESET_WAVES, |
46 | SAVE_WAVES |
47 | }; |
48 | |
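/*
 * lock_srbm()/unlock_srbm() bracket every register access that depends on the
 * currently selected MEC/pipe/queue/VMID: take srbm_mutex, program the GRBM
 * select, and restore the select to 0 before dropping the mutex.
 */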
49 | static void lock_srbm(struct amdgpu_device *adev, uint32_t mec, uint32_t pipe, |
50 | uint32_t queue, uint32_t vmid) |
51 | { |
52 | mutex_lock(&adev->srbm_mutex); |
53 | soc15_grbm_select(adev, mec, pipe, queue, vmid); |
54 | } |
55 | |
56 | static void unlock_srbm(struct amdgpu_device *adev) |
57 | { |
58 | soc15_grbm_select(adev, 0, 0, 0, 0); |
59 | mutex_unlock(&adev->srbm_mutex); |
60 | } |
61 | |
62 | static void acquire_queue(struct amdgpu_device *adev, uint32_t pipe_id, |
63 | uint32_t queue_id) |
64 | { |
65 | uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; |
66 | uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); |
67 | |
68 | lock_srbm(adev, mec, pipe, queue_id, 0); |
69 | } |
70 | |
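/*
 * Each queue owns one bit in the CP_PQ_WPTR_POLL_CNTL1 queue mask. The bit
 * index is the queue's absolute slot within the MEC:
 * pipe_id * num_queue_per_pipe + queue_id.
 */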
71 | static uint64_t get_queue_mask(struct amdgpu_device *adev, |
72 | uint32_t pipe_id, uint32_t queue_id) |
73 | { |
74 | unsigned int bit = pipe_id * adev->gfx.mec.num_queue_per_pipe + |
75 | queue_id; |
76 | |
77 | return 1ull << bit; |
78 | } |
79 | |
80 | static void release_queue(struct amdgpu_device *adev) |
81 | { |
82 | unlock_srbm(adev); |
83 | } |
84 | |
85 | void kgd_gfx_v9_program_sh_mem_settings(struct amdgpu_device *adev, uint32_t vmid, |
86 | uint32_t sh_mem_config, |
87 | uint32_t sh_mem_ape1_base, |
88 | uint32_t sh_mem_ape1_limit, |
89 | uint32_t sh_mem_bases) |
90 | { |
91 | lock_srbm(adev, 0, 0, 0, vmid); |
92 | |
93 | WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_CONFIG), sh_mem_config); |
94 | WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmSH_MEM_BASES), sh_mem_bases); |
95 | /* APE1 no longer exists on GFX9 */ |
96 | |
97 | unlock_srbm(adev); |
98 | } |
99 | |
100 | int kgd_gfx_v9_set_pasid_vmid_mapping(struct amdgpu_device *adev, u32 pasid, |
101 | unsigned int vmid) |
102 | { |
103 | /* |
104 | * We have to assume that there is no outstanding mapping. |
105 | * The ATC_VMID_PASID_MAPPING_UPDATE_STATUS bit could be 0 because |
106 | * a mapping is in progress or because a mapping finished |
107 | * and the SW cleared it. |
108 | * So the protocol is to always wait & clear. |
109 | */ |
110 | uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | |
111 | ATC_VMID0_PASID_MAPPING__VALID_MASK; |
112 | |
	/*
	 * We need to do this twice, once for the GFX hub and once for the MMHUB.
	 * For the ATC, add 16 to the VMID for the MMHUB; the IH uses separate
	 * registers. ATC_VMID0..15 registers are separate from ATC_VMID16..31.
	 */
118 | |
119 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) + vmid, |
120 | pasid_mapping); |
121 | |
122 | while (!(RREG32(SOC15_REG_OFFSET( |
123 | ATHUB, 0, |
124 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & |
125 | (1U << vmid))) |
126 | cpu_relax(); |
127 | |
128 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, |
129 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), |
130 | 1U << vmid); |
131 | |
132 | /* Mapping vmid to pasid also for IH block */ |
133 | WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT) + vmid, |
134 | pasid_mapping); |
135 | |
136 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID16_PASID_MAPPING) + vmid, |
137 | pasid_mapping); |
138 | |
139 | while (!(RREG32(SOC15_REG_OFFSET( |
140 | ATHUB, 0, |
141 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS)) & |
142 | (1U << (vmid + 16)))) |
143 | cpu_relax(); |
144 | |
145 | WREG32(SOC15_REG_OFFSET(ATHUB, 0, |
146 | mmATC_VMID_PASID_MAPPING_UPDATE_STATUS), |
147 | 1U << (vmid + 16)); |
148 | |
149 | /* Mapping vmid to pasid also for IH block */ |
150 | WREG32(SOC15_REG_OFFSET(OSSSYS, 0, mmIH_VMID_0_LUT_MM) + vmid, |
151 | pasid_mapping); |
152 | return 0; |
153 | } |
154 | |
155 | /* TODO - RING0 form of field is obsolete, seems to date back to SI |
156 | * but still works |
157 | */ |
158 | |
159 | int kgd_gfx_v9_init_interrupts(struct amdgpu_device *adev, uint32_t pipe_id) |
160 | { |
161 | uint32_t mec; |
162 | uint32_t pipe; |
163 | |
164 | mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; |
165 | pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); |
166 | |
167 | lock_srbm(adev, mec, pipe, 0, 0); |
168 | |
169 | WREG32_SOC15(GC, 0, mmCPC_INT_CNTL, |
170 | CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | |
171 | CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); |
172 | |
173 | unlock_srbm(adev); |
174 | |
175 | return 0; |
176 | } |
177 | |
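/*
 * Every SDMA engine exposes an identical block of per-queue (RLC) registers.
 * Compute the register base for the requested engine, then step by the
 * per-queue stride (RLC1 - RLC0) to reach the requested queue's registers.
 */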
178 | static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev, |
179 | unsigned int engine_id, |
180 | unsigned int queue_id) |
181 | { |
182 | uint32_t sdma_engine_reg_base = 0; |
183 | uint32_t sdma_rlc_reg_offset; |
184 | |
185 | switch (engine_id) { |
186 | default: |
187 | dev_warn(adev->dev, |
			"Invalid sdma engine id (%d), using engine id 0\n",
189 | engine_id); |
190 | fallthrough; |
191 | case 0: |
192 | sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0, |
193 | mmSDMA0_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL; |
194 | break; |
195 | case 1: |
196 | sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA1, 0, |
197 | mmSDMA1_RLC0_RB_CNTL) - mmSDMA0_RLC0_RB_CNTL; |
198 | break; |
199 | } |
200 | |
201 | sdma_rlc_reg_offset = sdma_engine_reg_base |
202 | + queue_id * (mmSDMA0_RLC1_RB_CNTL - mmSDMA0_RLC0_RB_CNTL); |
203 | |
	pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
205 | queue_id, sdma_rlc_reg_offset); |
206 | |
207 | return sdma_rlc_reg_offset; |
208 | } |
209 | |
210 | static inline struct v9_mqd *get_mqd(void *mqd) |
211 | { |
212 | return (struct v9_mqd *)mqd; |
213 | } |
214 | |
215 | static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd) |
216 | { |
217 | return (struct v9_sdma_mqd *)mqd; |
218 | } |
219 | |
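/*
 * Load an MQD into the HQD selected by pipe_id/queue_id: copy the MQD image
 * into the CP_HQD_* registers, enable the doorbell, optionally arm a one-shot
 * WPTR poll on the user write pointer, start the EOP fetcher and finally mark
 * the HQD active.
 */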
220 | int kgd_gfx_v9_hqd_load(struct amdgpu_device *adev, void *mqd, |
221 | uint32_t pipe_id, uint32_t queue_id, |
222 | uint32_t __user *wptr, uint32_t wptr_shift, |
223 | uint32_t wptr_mask, struct mm_struct *mm) |
224 | { |
225 | struct v9_mqd *m; |
226 | uint32_t *mqd_hqd; |
227 | uint32_t reg, hqd_base, data; |
228 | |
229 | m = get_mqd(mqd); |
230 | |
231 | acquire_queue(adev, pipe_id, queue_id); |
232 | |
233 | /* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */ |
234 | mqd_hqd = &m->cp_mqd_base_addr_lo; |
235 | hqd_base = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); |
236 | |
237 | for (reg = hqd_base; |
238 | reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) |
		WREG32_RLC(reg, mqd_hqd[reg - hqd_base]);

242 | /* Activate doorbell logic before triggering WPTR poll. */ |
243 | data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control, |
244 | CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1); |
245 | WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_DOORBELL_CONTROL), data); |
246 | |
247 | if (wptr) { |
248 | /* Don't read wptr with get_user because the user |
249 | * context may not be accessible (if this function |
250 | * runs in a work queue). Instead trigger a one-shot |
251 | * polling read from memory in the CP. This assumes |
252 | * that wptr is GPU-accessible in the queue's VMID via |
253 | * ATC or SVM. WPTR==RPTR before starting the poll so |
254 | * the CP starts fetching new commands from the right |
255 | * place. |
256 | * |
257 | * Guessing a 64-bit WPTR from a 32-bit RPTR is a bit |
258 | * tricky. Assume that the queue didn't overflow. The |
259 | * number of valid bits in the 32-bit RPTR depends on |
260 | * the queue size. The remaining bits are taken from |
261 | * the saved 64-bit WPTR. If the WPTR wrapped, add the |
262 | * queue size. |
263 | */ |
264 | uint32_t queue_size = |
265 | 2 << REG_GET_FIELD(m->cp_hqd_pq_control, |
266 | CP_HQD_PQ_CONTROL, QUEUE_SIZE); |
267 | uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1); |
268 | |
269 | if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr) |
270 | guessed_wptr += queue_size; |
271 | guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1); |
272 | guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32; |
273 | |
274 | WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_LO), |
275 | lower_32_bits(guessed_wptr)); |
276 | WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI), |
277 | upper_32_bits(guessed_wptr)); |
278 | WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR), |
279 | lower_32_bits((uintptr_t)wptr)); |
280 | WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_POLL_ADDR_HI), |
281 | upper_32_bits((uintptr_t)wptr)); |
282 | WREG32_SOC15(GC, 0, mmCP_PQ_WPTR_POLL_CNTL1, |
283 | (uint32_t)get_queue_mask(adev, pipe_id, queue_id)); |
284 | } |
285 | |
286 | /* Start the EOP fetcher */ |
287 | WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_EOP_RPTR), |
288 | REG_SET_FIELD(m->cp_hqd_eop_rptr, |
289 | CP_HQD_EOP_RPTR, INIT_FETCHER, 1)); |
290 | |
291 | data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1); |
292 | WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE), data); |
293 | |
294 | release_queue(adev); |
295 | |
296 | return 0; |
297 | } |
298 | |
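/*
 * Map the HIQ through the KIQ: build a MAP_QUEUES packet that points the HIQ
 * at the given MQD and doorbell and submit it on the KIQ ring.
 */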
299 | int kgd_gfx_v9_hiq_mqd_load(struct amdgpu_device *adev, void *mqd, |
300 | uint32_t pipe_id, uint32_t queue_id, |
301 | uint32_t doorbell_off) |
302 | { |
303 | struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring; |
304 | struct v9_mqd *m; |
305 | uint32_t mec, pipe; |
306 | int r; |
307 | |
308 | m = get_mqd(mqd); |
309 | |
310 | acquire_queue(adev, pipe_id, queue_id); |
311 | |
312 | mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1; |
313 | pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec); |
314 | |
	pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
316 | mec, pipe, queue_id); |
317 | |
318 | spin_lock(&adev->gfx.kiq.ring_lock); |
319 | r = amdgpu_ring_alloc(kiq_ring, 7); |
320 | if (r) { |
		pr_err("Failed to alloc KIQ (%d).\n", r);
322 | goto out_unlock; |
323 | } |
324 | |
325 | amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5)); |
326 | amdgpu_ring_write(kiq_ring, |
327 | PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */ |
328 | PACKET3_MAP_QUEUES_VMID(m->cp_hqd_vmid) | /* VMID */ |
329 | PACKET3_MAP_QUEUES_QUEUE(queue_id) | |
330 | PACKET3_MAP_QUEUES_PIPE(pipe) | |
331 | PACKET3_MAP_QUEUES_ME((mec - 1)) | |
332 | PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /*queue_type: normal compute queue */ |
333 | PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */ |
334 | PACKET3_MAP_QUEUES_ENGINE_SEL(1) | /* engine_sel: hiq */ |
335 | PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */ |
336 | amdgpu_ring_write(kiq_ring, |
337 | PACKET3_MAP_QUEUES_DOORBELL_OFFSET(doorbell_off)); |
338 | amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_lo); |
339 | amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_hi); |
340 | amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_lo); |
341 | amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi); |
342 | amdgpu_ring_commit(kiq_ring); |
343 | |
344 | out_unlock: |
345 | spin_unlock(&adev->gfx.kiq.ring_lock); |
346 | release_queue(adev); |
347 | |
348 | return r; |
349 | } |
350 | |
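/*
 * Snapshot the HQD register range (CP_MQD_BASE_ADDR..CP_HQD_PQ_WPTR_HI) into a
 * newly allocated array of (register byte offset, value) pairs. The caller is
 * responsible for freeing the returned buffer.
 */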
351 | int kgd_gfx_v9_hqd_dump(struct amdgpu_device *adev, |
352 | uint32_t pipe_id, uint32_t queue_id, |
353 | uint32_t (**dump)[2], uint32_t *n_regs) |
354 | { |
355 | uint32_t i = 0, reg; |
356 | #define HQD_N_REGS 56 |
357 | #define DUMP_REG(addr) do { \ |
358 | if (WARN_ON_ONCE(i >= HQD_N_REGS)) \ |
359 | break; \ |
360 | (*dump)[i][0] = (addr) << 2; \ |
361 | (*dump)[i++][1] = RREG32(addr); \ |
362 | } while (0) |
363 | |
364 | *dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL); |
365 | if (*dump == NULL) |
366 | return -ENOMEM; |
367 | |
368 | acquire_queue(adev, pipe_id, queue_id); |
369 | |
370 | for (reg = SOC15_REG_OFFSET(GC, 0, mmCP_MQD_BASE_ADDR); |
371 | reg <= SOC15_REG_OFFSET(GC, 0, mmCP_HQD_PQ_WPTR_HI); reg++) |
372 | DUMP_REG(reg); |
373 | |
374 | release_queue(adev); |
375 | |
376 | WARN_ON_ONCE(i != HQD_N_REGS); |
377 | *n_regs = i; |
378 | |
379 | return 0; |
380 | } |
381 | |
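/*
 * Load an SDMA RLC queue: disable the ring buffer, wait for the context to go
 * idle, program the doorbell and ring pointers (restoring the user WPTR, or
 * falling back to the saved RPTR if the user pointer cannot be read), then
 * re-enable the ring buffer.
 */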
382 | static int kgd_hqd_sdma_load(struct amdgpu_device *adev, void *mqd, |
383 | uint32_t __user *wptr, struct mm_struct *mm) |
384 | { |
385 | struct v9_sdma_mqd *m; |
386 | uint32_t sdma_rlc_reg_offset; |
387 | unsigned long end_jiffies; |
388 | uint32_t data; |
389 | uint64_t data64; |
390 | uint64_t __user *wptr64 = (uint64_t __user *)wptr; |
391 | |
392 | m = get_sdma_mqd(mqd); |
393 | sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id, |
394 | m->sdma_queue_id); |
395 | |
396 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, |
397 | m->sdmax_rlcx_rb_cntl & (~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK)); |
398 | |
399 | end_jiffies = msecs_to_jiffies(2000) + jiffies; |
400 | while (true) { |
401 | data = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS); |
402 | if (data & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) |
403 | break; |
404 | if (time_after(jiffies, end_jiffies)) { |
			pr_err("SDMA RLC not idle in %s\n", __func__);
406 | return -ETIME; |
407 | } |
408 | usleep_range(500, 1000); |
409 | } |
410 | |
411 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL_OFFSET, |
412 | m->sdmax_rlcx_doorbell_offset); |
413 | |
414 | data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_RLC0_DOORBELL, |
415 | ENABLE, 1); |
416 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, data); |
417 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR, |
418 | m->sdmax_rlcx_rb_rptr); |
419 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI, |
420 | m->sdmax_rlcx_rb_rptr_hi); |
421 | |
422 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 1); |
423 | if (read_user_wptr(mm, wptr64, data64)) { |
424 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR, |
425 | lower_32_bits(data64)); |
426 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI, |
427 | upper_32_bits(data64)); |
428 | } else { |
429 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR, |
430 | m->sdmax_rlcx_rb_rptr); |
431 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_WPTR_HI, |
432 | m->sdmax_rlcx_rb_rptr_hi); |
433 | } |
434 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_MINOR_PTR_UPDATE, 0); |
435 | |
436 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE, m->sdmax_rlcx_rb_base); |
437 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_BASE_HI, |
438 | m->sdmax_rlcx_rb_base_hi); |
439 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_LO, |
440 | m->sdmax_rlcx_rb_rptr_addr_lo); |
441 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_ADDR_HI, |
442 | m->sdmax_rlcx_rb_rptr_addr_hi); |
443 | |
444 | data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_RLC0_RB_CNTL, |
445 | RB_ENABLE, 1); |
446 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, data); |
447 | |
448 | return 0; |
449 | } |
450 | |
451 | static int kgd_hqd_sdma_dump(struct amdgpu_device *adev, |
452 | uint32_t engine_id, uint32_t queue_id, |
453 | uint32_t (**dump)[2], uint32_t *n_regs) |
454 | { |
455 | uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, |
456 | engine_id, queue_id); |
457 | uint32_t i = 0, reg; |
458 | #undef HQD_N_REGS |
459 | #define HQD_N_REGS (19+6+7+10) |
460 | |
461 | *dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL); |
462 | if (*dump == NULL) |
463 | return -ENOMEM; |
464 | |
465 | for (reg = mmSDMA0_RLC0_RB_CNTL; reg <= mmSDMA0_RLC0_DOORBELL; reg++) |
466 | DUMP_REG(sdma_rlc_reg_offset + reg); |
467 | for (reg = mmSDMA0_RLC0_STATUS; reg <= mmSDMA0_RLC0_CSA_ADDR_HI; reg++) |
468 | DUMP_REG(sdma_rlc_reg_offset + reg); |
469 | for (reg = mmSDMA0_RLC0_IB_SUB_REMAIN; |
470 | reg <= mmSDMA0_RLC0_MINOR_PTR_UPDATE; reg++) |
471 | DUMP_REG(sdma_rlc_reg_offset + reg); |
472 | for (reg = mmSDMA0_RLC0_MIDCMD_DATA0; |
473 | reg <= mmSDMA0_RLC0_MIDCMD_CNTL; reg++) |
474 | DUMP_REG(sdma_rlc_reg_offset + reg); |
475 | |
476 | WARN_ON_ONCE(i != HQD_N_REGS); |
477 | *n_regs = i; |
478 | |
479 | return 0; |
480 | } |
481 | |
482 | bool kgd_gfx_v9_hqd_is_occupied(struct amdgpu_device *adev, |
483 | uint64_t queue_address, uint32_t pipe_id, |
484 | uint32_t queue_id) |
485 | { |
486 | uint32_t act; |
487 | bool retval = false; |
488 | uint32_t low, high; |
489 | |
490 | acquire_queue(adev, pipe_id, queue_id); |
491 | act = RREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE); |
492 | if (act) { |
493 | low = lower_32_bits(queue_address >> 8); |
494 | high = upper_32_bits(queue_address >> 8); |
495 | |
496 | if (low == RREG32_SOC15(GC, 0, mmCP_HQD_PQ_BASE) && |
497 | high == RREG32_SOC15(GC, 0, mmCP_HQD_PQ_BASE_HI)) |
498 | retval = true; |
499 | } |
500 | release_queue(adev); |
501 | return retval; |
502 | } |
503 | |
504 | static bool kgd_hqd_sdma_is_occupied(struct amdgpu_device *adev, void *mqd) |
505 | { |
506 | struct v9_sdma_mqd *m; |
507 | uint32_t sdma_rlc_reg_offset; |
508 | uint32_t sdma_rlc_rb_cntl; |
509 | |
510 | m = get_sdma_mqd(mqd); |
511 | sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id, |
512 | m->sdma_queue_id); |
513 | |
514 | sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL); |
515 | |
516 | if (sdma_rlc_rb_cntl & SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK) |
517 | return true; |
518 | |
519 | return false; |
520 | } |
521 | |
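/*
 * Preempt or destroy an HQD: translate the KFD preempt type into a dequeue
 * request, write it to CP_HQD_DEQUEUE_REQUEST and poll CP_HQD_ACTIVE until the
 * queue goes inactive or the caller-supplied timeout (in ms) expires.
 */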
522 | int kgd_gfx_v9_hqd_destroy(struct amdgpu_device *adev, void *mqd, |
523 | enum kfd_preempt_type reset_type, |
524 | unsigned int utimeout, uint32_t pipe_id, |
525 | uint32_t queue_id) |
526 | { |
527 | enum hqd_dequeue_request_type type; |
528 | unsigned long end_jiffies; |
529 | uint32_t temp; |
530 | struct v9_mqd *m = get_mqd(mqd); |
531 | |
532 | if (amdgpu_in_reset(adev)) |
533 | return -EIO; |
534 | |
535 | acquire_queue(adev, pipe_id, queue_id); |
536 | |
537 | if (m->cp_hqd_vmid == 0) |
538 | WREG32_FIELD15_RLC(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0); |
539 | |
540 | switch (reset_type) { |
541 | case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN: |
542 | type = DRAIN_PIPE; |
543 | break; |
544 | case KFD_PREEMPT_TYPE_WAVEFRONT_RESET: |
545 | type = RESET_WAVES; |
546 | break; |
547 | case KFD_PREEMPT_TYPE_WAVEFRONT_SAVE: |
548 | type = SAVE_WAVES; |
549 | break; |
550 | default: |
551 | type = DRAIN_PIPE; |
552 | break; |
553 | } |
554 | |
555 | WREG32_RLC(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_DEQUEUE_REQUEST), type); |
556 | |
557 | end_jiffies = (utimeout * HZ / 1000) + jiffies; |
558 | while (true) { |
559 | temp = RREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE); |
560 | if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK)) |
561 | break; |
562 | if (time_after(jiffies, end_jiffies)) { |
			pr_err("cp queue preemption time out.\n");
564 | release_queue(adev); |
565 | return -ETIME; |
566 | } |
567 | usleep_range(500, 1000); |
568 | } |
569 | |
570 | release_queue(adev); |
571 | return 0; |
572 | } |
573 | |
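/*
 * Stop an SDMA RLC queue and save its current read pointer back into the MQD
 * so the queue can be restored later.
 */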
574 | static int kgd_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd, |
575 | unsigned int utimeout) |
576 | { |
577 | struct v9_sdma_mqd *m; |
578 | uint32_t sdma_rlc_reg_offset; |
579 | uint32_t temp; |
580 | unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies; |
581 | |
582 | m = get_sdma_mqd(mqd); |
583 | sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id, |
584 | m->sdma_queue_id); |
585 | |
586 | temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL); |
587 | temp = temp & ~SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK; |
588 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, temp); |
589 | |
590 | while (true) { |
591 | temp = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_CONTEXT_STATUS); |
592 | if (temp & SDMA0_RLC0_CONTEXT_STATUS__IDLE_MASK) |
593 | break; |
594 | if (time_after(jiffies, end_jiffies)) { |
			pr_err("SDMA RLC not idle in %s\n", __func__);
596 | return -ETIME; |
597 | } |
598 | usleep_range(500, 1000); |
599 | } |
600 | |
601 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_DOORBELL, 0); |
602 | WREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL, |
603 | RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_CNTL) | |
604 | SDMA0_RLC0_RB_CNTL__RB_ENABLE_MASK); |
605 | |
606 | m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR); |
607 | m->sdmax_rlcx_rb_rptr_hi = |
608 | RREG32(sdma_rlc_reg_offset + mmSDMA0_RLC0_RB_RPTR_HI); |
609 | |
610 | return 0; |
611 | } |
612 | |
613 | bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev, |
614 | uint8_t vmid, uint16_t *p_pasid) |
615 | { |
616 | uint32_t value; |
617 | |
618 | value = RREG32(SOC15_REG_OFFSET(ATHUB, 0, mmATC_VMID0_PASID_MAPPING) |
619 | + vmid); |
620 | *p_pasid = value & ATC_VMID0_PASID_MAPPING__PASID_MASK; |
621 | |
622 | return !!(value & ATC_VMID0_PASID_MAPPING__VALID_MASK); |
623 | } |
624 | |
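/*
 * Issue an SQ_CMD wave-control command to the shader engines selected by
 * gfx_index_val, then restore GRBM_GFX_INDEX to broadcast writes.
 */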
625 | int kgd_gfx_v9_wave_control_execute(struct amdgpu_device *adev, |
626 | uint32_t gfx_index_val, |
627 | uint32_t sq_cmd) |
628 | { |
629 | uint32_t data = 0; |
630 | |
631 | mutex_lock(&adev->grbm_idx_mutex); |
632 | |
633 | WREG32_SOC15_RLC_SHADOW(GC, 0, mmGRBM_GFX_INDEX, gfx_index_val); |
634 | WREG32_SOC15(GC, 0, mmSQ_CMD, sq_cmd); |
635 | |
636 | data = REG_SET_FIELD(data, GRBM_GFX_INDEX, |
637 | INSTANCE_BROADCAST_WRITES, 1); |
638 | data = REG_SET_FIELD(data, GRBM_GFX_INDEX, |
639 | SH_BROADCAST_WRITES, 1); |
640 | data = REG_SET_FIELD(data, GRBM_GFX_INDEX, |
641 | SE_BROADCAST_WRITES, 1); |
642 | |
643 | WREG32_SOC15_RLC_SHADOW(GC, 0, mmGRBM_GFX_INDEX, data); |
644 | mutex_unlock(&adev->grbm_idx_mutex); |
645 | |
646 | return 0; |
647 | } |
648 | |
649 | void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev, |
650 | uint32_t vmid, uint64_t page_table_base) |
651 | { |
652 | if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) { |
		pr_err("trying to set page table base for wrong VMID %u\n",
654 | vmid); |
655 | return; |
656 | } |
657 | |
658 | adev->mmhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base); |
659 | |
660 | adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base); |
661 | } |
662 | |
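/*
 * Reading the SPI_CSQ wave counters requires both a GRBM queue select
 * (srbm_mutex) and an SE/SH select (grbm_idx_mutex), so the two locks are
 * always taken and released together.
 */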
663 | static void lock_spi_csq_mutexes(struct amdgpu_device *adev) |
664 | { |
665 | mutex_lock(&adev->srbm_mutex); |
	mutex_lock(&adev->grbm_idx_mutex);
}
669 | |
670 | static void unlock_spi_csq_mutexes(struct amdgpu_device *adev) |
671 | { |
672 | mutex_unlock(&adev->grbm_idx_mutex); |
673 | mutex_unlock(&adev->srbm_mutex); |
674 | } |
675 | |
676 | /** |
677 | * get_wave_count: Read device registers to get number of waves in flight for |
678 | * a particular queue. The method also returns the VMID associated with the |
679 | * queue. |
680 | * |
681 | * @adev: Handle of device whose registers are to be read |
682 | * @queue_idx: Index of queue in the queue-map bit-field |
683 | * @wave_cnt: Output parameter updated with number of waves in flight |
684 | * @vmid: Output parameter updated with VMID of queue whose wave count |
685 | * is being collected |
686 | */ |
687 | static void get_wave_count(struct amdgpu_device *adev, int queue_idx, |
688 | int *wave_cnt, int *vmid) |
689 | { |
690 | int pipe_idx; |
691 | int queue_slot; |
692 | unsigned int reg_val; |
693 | |
694 | /* |
695 | * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID |
696 | * parameters to read out waves in flight. Get VMID if there are |
697 | * non-zero waves in flight. |
698 | */ |
699 | *vmid = 0xFF; |
700 | *wave_cnt = 0; |
701 | pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe; |
702 | queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe; |
703 | soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0); |
704 | reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, 0, mmSPI_CSQ_WF_ACTIVE_COUNT_0) + |
705 | queue_slot); |
706 | *wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK; |
707 | if (*wave_cnt != 0) |
708 | *vmid = (RREG32_SOC15(GC, 0, mmCP_HQD_VMID) & |
709 | CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT; |
710 | } |
711 | |
712 | /** |
713 | * kgd_gfx_v9_get_cu_occupancy: Reads relevant registers associated with each |
714 | * shader engine and aggregates the number of waves that are in flight for the |
715 | * process whose pasid is provided as a parameter. The process could have ZERO |
716 | * or more queues running and submitting waves to compute units. |
717 | * |
718 | * @adev: Handle of device from which to get number of waves in flight |
719 | * @pasid: Identifies the process for which this query call is invoked |
720 | * @pasid_wave_cnt: Output parameter updated with number of waves in flight that |
721 | * belong to process with given pasid |
722 | * @max_waves_per_cu: Output parameter updated with maximum number of waves |
723 | * possible per Compute Unit |
724 | * |
725 | * Note: It's possible that the device has too many queues (oversubscription) |
726 | * in which case a VMID could be remapped to a different PASID. This could lead |
727 | * to an inaccurate wave count. Following is a high-level sequence: |
 *    Time T1: vmid = getVmid(); vmid is associated with Pasid P1
 *    Time T2: pasid = getPasId(vmid); vmid is associated with Pasid P2
 * In the sequence above, the wave count obtained at time T1 will be incorrectly
 * lost from, or added to, the total wave count.
732 | * |
733 | * The registers that provide the waves in flight are: |
734 | * |
735 | * SPI_CSQ_WF_ACTIVE_STATUS - bit-map of queues per pipe. The bit is ON if a |
736 | * queue is slotted, OFF if there is no queue. A process could have ZERO or |
737 | * more queues slotted and submitting waves to be run on compute units. Even |
 * when there is a queue it is possible there are zero wavefronts; this can
 * happen when the queue is waiting on top-of-pipe events, e.g. a waitRegMem
 * command.
741 | * |
742 | * For each bit that is ON from above: |
743 | * |
 * Read the (SPI_CSQ_WF_ACTIVE_COUNT_0 + queue_idx) register. It provides the
 * number of waves that are in flight for the queue at the specified index.
 * The index ranges from 0 to 7.
747 | * |
748 | * If non-zero waves are in flight, read CP_HQD_VMID register to obtain VMID |
749 | * of the wave(s). |
750 | * |
 * Determine whether the VMID from the above step maps to the pasid provided as
 * a parameter. If it matches, aggregate the wave count. A VMID that does not
 * match the pasid is a normal condition, i.e. a device is expected to support
 * multiple queues from multiple processes.
755 | * |
756 | * Reading registers referenced above involves programming GRBM appropriately |
757 | */ |
758 | void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid, |
759 | int *pasid_wave_cnt, int *max_waves_per_cu) |
760 | { |
761 | int qidx; |
762 | int vmid; |
763 | int se_idx; |
764 | int sh_idx; |
765 | int se_cnt; |
766 | int sh_cnt; |
767 | int wave_cnt; |
768 | int queue_map; |
769 | int pasid_tmp; |
770 | int max_queue_cnt; |
771 | int vmid_wave_cnt = 0; |
772 | DECLARE_BITMAP(cp_queue_bitmap, KGD_MAX_QUEUES); |
773 | |
774 | lock_spi_csq_mutexes(adev); |
775 | soc15_grbm_select(adev, 1, 0, 0, 0); |
776 | |
777 | /* |
778 | * Iterate through the shader engines and arrays of the device |
779 | * to get number of waves in flight |
780 | */ |
781 | bitmap_complement(cp_queue_bitmap, adev->gfx.mec.queue_bitmap, |
782 | KGD_MAX_QUEUES); |
783 | max_queue_cnt = adev->gfx.mec.num_pipe_per_mec * |
784 | adev->gfx.mec.num_queue_per_pipe; |
785 | sh_cnt = adev->gfx.config.max_sh_per_se; |
786 | se_cnt = adev->gfx.config.max_shader_engines; |
787 | for (se_idx = 0; se_idx < se_cnt; se_idx++) { |
788 | for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) { |
789 | |
790 | gfx_v9_0_select_se_sh(adev, se_idx, sh_idx, 0xffffffff); |
791 | queue_map = RREG32_SOC15(GC, 0, mmSPI_CSQ_WF_ACTIVE_STATUS); |
792 | |
793 | /* |
			 * Assumption: the queue map encodes the following
			 * schema: four pipes per micro-engine, with each pipe
			 * mapping eight queues. This schema holds for GFX9
			 * devices and must be verified for newer device families.
798 | */ |
799 | for (qidx = 0; qidx < max_queue_cnt; qidx++) { |
800 | |
				/* Skip queues that are not associated with
802 | * compute functions |
803 | */ |
804 | if (!test_bit(qidx, cp_queue_bitmap)) |
805 | continue; |
806 | |
807 | if (!(queue_map & (1 << qidx))) |
808 | continue; |
809 | |
810 | /* Get number of waves in flight and aggregate them */ |
811 | get_wave_count(adev, qidx, &wave_cnt, &vmid); |
812 | if (wave_cnt != 0) { |
813 | pasid_tmp = |
814 | RREG32(SOC15_REG_OFFSET(OSSSYS, 0, |
815 | mmIH_VMID_0_LUT) + vmid); |
816 | if (pasid_tmp == pasid) |
817 | vmid_wave_cnt += wave_cnt; |
818 | } |
819 | } |
820 | } |
821 | } |
822 | |
823 | gfx_v9_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff); |
824 | soc15_grbm_select(adev, 0, 0, 0, 0); |
825 | unlock_spi_csq_mutexes(adev); |
826 | |
827 | /* Update the output parameters and return */ |
828 | *pasid_wave_cnt = vmid_wave_cnt; |
829 | *max_waves_per_cu = adev->gfx.cu_info.simd_per_cu * |
830 | adev->gfx.cu_info.max_waves_per_simd; |
831 | } |
832 | |
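/*
 * Program the per-VMID trap handler base (TBA) and trap memory (TMA)
 * addresses. Both are written shifted right by 8 bits, i.e. as 256-byte
 * aligned addresses.
 */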
833 | void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev, |
834 | uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr) |
835 | { |
836 | lock_srbm(adev, 0, 0, 0, vmid); |
837 | |
838 | /* |
839 | * Program TBA registers |
840 | */ |
841 | WREG32_SOC15(GC, 0, mmSQ_SHADER_TBA_LO, |
842 | lower_32_bits(tba_addr >> 8)); |
843 | WREG32_SOC15(GC, 0, mmSQ_SHADER_TBA_HI, |
844 | upper_32_bits(tba_addr >> 8)); |
845 | |
846 | /* |
847 | * Program TMA registers |
848 | */ |
849 | WREG32_SOC15(GC, 0, mmSQ_SHADER_TMA_LO, |
850 | lower_32_bits(tma_addr >> 8)); |
851 | WREG32_SOC15(GC, 0, mmSQ_SHADER_TMA_HI, |
852 | upper_32_bits(tma_addr >> 8)); |
853 | |
854 | unlock_srbm(adev); |
855 | } |
856 | |
857 | const struct kfd2kgd_calls gfx_v9_kfd2kgd = { |
858 | .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings, |
859 | .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping, |
860 | .init_interrupts = kgd_gfx_v9_init_interrupts, |
861 | .hqd_load = kgd_gfx_v9_hqd_load, |
862 | .hiq_mqd_load = kgd_gfx_v9_hiq_mqd_load, |
863 | .hqd_sdma_load = kgd_hqd_sdma_load, |
864 | .hqd_dump = kgd_gfx_v9_hqd_dump, |
865 | .hqd_sdma_dump = kgd_hqd_sdma_dump, |
866 | .hqd_is_occupied = kgd_gfx_v9_hqd_is_occupied, |
867 | .hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied, |
868 | .hqd_destroy = kgd_gfx_v9_hqd_destroy, |
869 | .hqd_sdma_destroy = kgd_hqd_sdma_destroy, |
870 | .wave_control_execute = kgd_gfx_v9_wave_control_execute, |
871 | .get_atc_vmid_pasid_mapping_info = |
872 | kgd_gfx_v9_get_atc_vmid_pasid_mapping_info, |
873 | .set_vm_context_page_table_base = kgd_gfx_v9_set_vm_context_page_table_base, |
874 | .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, |
875 | .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings, |
876 | }; |
877 | |