1 | /* |
2 | * Copyright 2022 Advanced Micro Devices, Inc. |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included in |
12 | * all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | */ |
22 | |
23 | #include <linux/firmware.h> |
24 | #include <drm/drm_drv.h> |
25 | |
26 | #include "amdgpu.h" |
27 | #include "amdgpu_ucode.h" |
28 | #include "amdgpu_vpe.h" |
29 | #include "amdgpu_smu.h" |
30 | #include "soc15_common.h" |
31 | #include "vpe_v6_1.h" |
32 | |
33 | #define AMDGPU_CSA_VPE_SIZE 64 |
/* VPE CSA resides in the 4th page (page index 3) of the CSA */
35 | #define AMDGPU_CSA_VPE_OFFSET (4096 * 3) |
36 | |
37 | /* 1 second timeout */ |
38 | #define VPE_IDLE_TIMEOUT msecs_to_jiffies(1000) |
39 | |
40 | #define VPE_MAX_DPM_LEVEL 4 |
41 | #define FIXED1_8_BITS_PER_FRACTIONAL_PART 8 |
42 | #define GET_PRATIO_INTEGER_PART(x) ((x) >> FIXED1_8_BITS_PER_FRACTIONAL_PART) |
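
/*
 * Pratios are unsigned 1.8 fixed-point numbers: bit 8 holds the integer part
 * and bits [7:0] the fractional part. For example, a from/to clock ratio of
 * 3/2 encodes as 0x180 (1.5 * 256), for which GET_PRATIO_INTEGER_PART()
 * returns 1.
 */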
43 | |
44 | static void vpe_set_ring_funcs(struct amdgpu_device *adev); |
45 | |
46 | static inline uint16_t div16_u16_rem(uint16_t dividend, uint16_t divisor, uint16_t *remainder) |
47 | { |
48 | *remainder = dividend % divisor; |
49 | return dividend / divisor; |
50 | } |
51 | |
52 | static inline uint16_t complete_integer_division_u16( |
53 | uint16_t dividend, |
54 | uint16_t divisor, |
55 | uint16_t *remainder) |
56 | { |
	return div16_u16_rem(dividend, divisor, remainder);
58 | } |
59 | |
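/*
 * Convert numerator/denominator into an unsigned 1.8 fixed-point value by
 * long division: take the integer part first, then generate the 8 fractional
 * bits one at a time by shift-and-subtract, and finally round to nearest on
 * the LSB using the leftover remainder. Returns 0 if the result overflows.
 */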
60 | static uint16_t vpe_u1_8_from_fraction(uint16_t numerator, uint16_t denominator) |
61 | { |
62 | u16 arg1_value = numerator; |
63 | u16 arg2_value = denominator; |
64 | |
65 | uint16_t remainder; |
66 | |
67 | /* determine integer part */ |
	uint16_t res_value = complete_integer_division_u16(
		arg1_value, arg2_value, &remainder);
70 | |
71 | if (res_value > 127 /* CHAR_MAX */) |
72 | return 0; |
73 | |
74 | /* determine fractional part */ |
75 | { |
76 | unsigned int i = FIXED1_8_BITS_PER_FRACTIONAL_PART; |
77 | |
78 | do { |
79 | remainder <<= 1; |
80 | |
81 | res_value <<= 1; |
82 | |
83 | if (remainder >= arg2_value) { |
84 | res_value |= 1; |
85 | remainder -= arg2_value; |
86 | } |
87 | } while (--i != 0); |
88 | } |
89 | |
90 | /* round up LSB */ |
91 | { |
92 | uint16_t summand = (remainder << 1) >= arg2_value; |
93 | |
94 | if ((res_value + summand) > 32767 /* SHRT_MAX */) |
95 | return 0; |
96 | |
97 | res_value += summand; |
98 | } |
99 | |
100 | return res_value; |
101 | } |
102 | |
103 | static uint16_t vpe_internal_get_pratio(uint16_t from_frequency, uint16_t to_frequency) |
104 | { |
	uint16_t pratio = vpe_u1_8_from_fraction(from_frequency, to_frequency);
106 | |
107 | if (GET_PRATIO_INTEGER_PART(pratio) > 1) |
108 | pratio = 0; |
109 | |
110 | return pratio; |
111 | } |
112 | |
113 | /* |
114 | * VPE has 4 DPM levels from level 0 (lowerest) to 3 (highest), |
115 | * VPE FW will dynamically decide which level should be used according to current loading. |
116 | * |
117 | * Get VPE and SOC clocks from PM, and select the appropriate four clock values, |
118 | * calculate the ratios of adjusting from one clock to another. |
119 | * The VPE FW can then request the appropriate frequency from the PMFW. |
120 | */ |
121 | int amdgpu_vpe_configure_dpm(struct amdgpu_vpe *vpe) |
122 | { |
123 | struct amdgpu_device *adev = vpe->ring.adev; |
124 | uint32_t dpm_ctl; |
125 | |
126 | if (adev->pm.dpm_enabled) { |
127 | struct dpm_clocks clock_table = { 0 }; |
128 | struct dpm_clock *VPEClks; |
129 | struct dpm_clock *SOCClks; |
130 | uint32_t idx; |
131 | uint32_t pratio_vmax_vnorm = 0, pratio_vnorm_vmid = 0, pratio_vmid_vmin = 0; |
132 | uint16_t pratio_vmin_freq = 0, pratio_vmid_freq = 0, pratio_vnorm_freq = 0, pratio_vmax_freq = 0; |
133 | |
134 | dpm_ctl = RREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_enable)); |
135 | dpm_ctl |= 1; /* DPM enablement */ |
136 | WREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_enable), dpm_ctl); |
137 | |
138 | /* Get VPECLK and SOCCLK */ |
		if (amdgpu_dpm_get_dpm_clock_table(adev, &clock_table)) {
			dev_dbg(adev->dev, "%s: get clock failed!\n", __func__);
141 | goto disable_dpm; |
142 | } |
143 | |
144 | SOCClks = clock_table.SocClocks; |
145 | VPEClks = clock_table.VPEClocks; |
146 | |
		/* vpe dpm only cares about 4 levels. */
148 | for (idx = 0; idx < VPE_MAX_DPM_LEVEL; idx++) { |
149 | uint32_t soc_dpm_level; |
150 | uint32_t min_freq; |
151 | |
152 | if (idx == 0) |
153 | soc_dpm_level = 0; |
154 | else |
155 | soc_dpm_level = (idx * 2) + 1; |
156 | |
157 | /* clamp the max level */ |
158 | if (soc_dpm_level > PP_SMU_NUM_VPECLK_DPM_LEVELS - 1) |
159 | soc_dpm_level = PP_SMU_NUM_VPECLK_DPM_LEVELS - 1; |
160 | |
161 | min_freq = (SOCClks[soc_dpm_level].Freq < VPEClks[soc_dpm_level].Freq) ? |
162 | SOCClks[soc_dpm_level].Freq : VPEClks[soc_dpm_level].Freq; |
163 | |
164 | switch (idx) { |
165 | case 0: |
166 | pratio_vmin_freq = min_freq; |
167 | break; |
168 | case 1: |
169 | pratio_vmid_freq = min_freq; |
170 | break; |
171 | case 2: |
172 | pratio_vnorm_freq = min_freq; |
173 | break; |
174 | case 3: |
175 | pratio_vmax_freq = min_freq; |
176 | break; |
177 | default: |
178 | break; |
179 | } |
180 | } |
181 | |
182 | if (pratio_vmin_freq && pratio_vmid_freq && pratio_vnorm_freq && pratio_vmax_freq) { |
183 | uint32_t pratio_ctl; |
184 | |
			pratio_vmax_vnorm = (uint32_t)vpe_internal_get_pratio(pratio_vmax_freq, pratio_vnorm_freq);
			pratio_vnorm_vmid = (uint32_t)vpe_internal_get_pratio(pratio_vnorm_freq, pratio_vmid_freq);
			pratio_vmid_vmin = (uint32_t)vpe_internal_get_pratio(pratio_vmid_freq, pratio_vmin_freq);
188 | |
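			/*
			 * Each pratio is a 9-bit U1.8 value (one integer bit
			 * plus eight fractional bits), so the three ratios
			 * pack into one 32-bit register as bits [8:0],
			 * [17:9] and [26:18].
			 */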
189 | pratio_ctl = pratio_vmax_vnorm | (pratio_vnorm_vmid << 9) | (pratio_vmid_vmin << 18); |
190 | WREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_pratio), pratio_ctl); /* PRatio */ |
191 | WREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_request_interval), 24000); /* 1ms, unit=1/24MHz */ |
192 | WREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_decision_threshold), 1200000); /* 50ms */ |
193 | WREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_busy_clamp_threshold), 1200000);/* 50ms */ |
194 | WREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_idle_clamp_threshold), 1200000);/* 50ms */ |
			dev_dbg(adev->dev, "%s: configure vpe dpm pratio done!\n", __func__);
		} else {
			dev_dbg(adev->dev, "%s: invalid pratio parameters!\n", __func__);
198 | goto disable_dpm; |
199 | } |
200 | } |
201 | return 0; |
202 | |
203 | disable_dpm: |
204 | dpm_ctl = RREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_enable)); |
205 | dpm_ctl &= 0xfffffffe; /* Disable DPM */ |
206 | WREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_enable), dpm_ctl); |
	dev_dbg(adev->dev, "%s: disable vpe dpm\n", __func__);
208 | return 0; |
209 | } |
210 | |
211 | int amdgpu_vpe_psp_update_sram(struct amdgpu_device *adev) |
212 | { |
213 | struct amdgpu_firmware_info ucode = { |
214 | .ucode_id = AMDGPU_UCODE_ID_VPE, |
215 | .mc_addr = adev->vpe.cmdbuf_gpu_addr, |
216 | .ucode_size = 8, |
217 | }; |
218 | |
	return psp_execute_ip_fw_load(&adev->psp, &ucode);
220 | } |
221 | |
222 | int amdgpu_vpe_init_microcode(struct amdgpu_vpe *vpe) |
223 | { |
224 | struct amdgpu_device *adev = vpe->ring.adev; |
225 | const struct vpe_firmware_header_v1_0 *vpe_hdr; |
226 | char fw_prefix[32], fw_name[64]; |
227 | int ret; |
228 | |
	amdgpu_ucode_ip_version_decode(adev, VPE_HWIP, fw_prefix, sizeof(fw_prefix));
	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s.bin", fw_prefix);

	ret = amdgpu_ucode_request(adev, &adev->vpe.fw, fw_name);
233 | if (ret) |
234 | goto out; |
235 | |
236 | vpe_hdr = (const struct vpe_firmware_header_v1_0 *)adev->vpe.fw->data; |
237 | adev->vpe.fw_version = le32_to_cpu(vpe_hdr->header.ucode_version); |
238 | adev->vpe.feature_version = le32_to_cpu(vpe_hdr->ucode_feature_version); |
239 | |
240 | if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) { |
241 | struct amdgpu_firmware_info *info; |
242 | |
243 | info = &adev->firmware.ucode[AMDGPU_UCODE_ID_VPE_CTX]; |
244 | info->ucode_id = AMDGPU_UCODE_ID_VPE_CTX; |
245 | info->fw = adev->vpe.fw; |
246 | adev->firmware.fw_size += |
247 | ALIGN(le32_to_cpu(vpe_hdr->ctx_ucode_size_bytes), PAGE_SIZE); |
248 | |
249 | info = &adev->firmware.ucode[AMDGPU_UCODE_ID_VPE_CTL]; |
250 | info->ucode_id = AMDGPU_UCODE_ID_VPE_CTL; |
251 | info->fw = adev->vpe.fw; |
252 | adev->firmware.fw_size += |
253 | ALIGN(le32_to_cpu(vpe_hdr->ctl_ucode_size_bytes), PAGE_SIZE); |
254 | } |
255 | |
256 | return 0; |
257 | out: |
	dev_err(adev->dev, "failed to initialize vpe microcode\n");
	release_firmware(adev->vpe.fw);
260 | adev->vpe.fw = NULL; |
261 | return ret; |
262 | } |
263 | |
264 | int amdgpu_vpe_ring_init(struct amdgpu_vpe *vpe) |
265 | { |
266 | struct amdgpu_device *adev = container_of(vpe, struct amdgpu_device, vpe); |
267 | struct amdgpu_ring *ring = &vpe->ring; |
268 | int ret; |
269 | |
270 | ring->ring_obj = NULL; |
271 | ring->use_doorbell = true; |
272 | ring->vm_hub = AMDGPU_MMHUB0(0); |
273 | ring->doorbell_index = (adev->doorbell_index.vpe_ring << 1); |
	snprintf(ring->name, 4, "vpe");
275 | |
	ret = amdgpu_ring_init(adev, ring, 1024, &vpe->trap_irq, 0,
			       AMDGPU_RING_PRIO_DEFAULT, NULL);
278 | if (ret) |
279 | return ret; |
280 | |
281 | return 0; |
282 | } |
283 | |
284 | int amdgpu_vpe_ring_fini(struct amdgpu_vpe *vpe) |
285 | { |
	amdgpu_ring_fini(&vpe->ring);
287 | |
288 | return 0; |
289 | } |
290 | |
291 | static int vpe_early_init(void *handle) |
292 | { |
293 | struct amdgpu_device *adev = (struct amdgpu_device *)handle; |
294 | struct amdgpu_vpe *vpe = &adev->vpe; |
295 | |
	switch (amdgpu_ip_version(adev, VPE_HWIP, 0)) {
297 | case IP_VERSION(6, 1, 0): |
298 | vpe_v6_1_set_funcs(vpe); |
299 | break; |
300 | case IP_VERSION(6, 1, 1): |
301 | vpe_v6_1_set_funcs(vpe); |
302 | vpe->collaborate_mode = true; |
303 | break; |
304 | default: |
305 | return -EINVAL; |
306 | } |
307 | |
308 | vpe_set_ring_funcs(adev); |
309 | vpe_set_regs(vpe); |
310 | |
	dev_info(adev->dev, "VPE: collaborate mode %s", vpe->collaborate_mode ? "true" : "false");
312 | |
313 | return 0; |
314 | } |
315 | |
316 | static void vpe_idle_work_handler(struct work_struct *work) |
317 | { |
318 | struct amdgpu_device *adev = |
319 | container_of(work, struct amdgpu_device, vpe.idle_work.work); |
320 | unsigned int fences = 0; |
321 | |
	fences += amdgpu_fence_count_emitted(&adev->vpe.ring);
323 | |
324 | if (fences == 0) |
		amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VPE, AMD_PG_STATE_GATE);
	else
		schedule_delayed_work(&adev->vpe.idle_work, VPE_IDLE_TIMEOUT);
328 | } |
329 | |
330 | static int vpe_common_init(struct amdgpu_vpe *vpe) |
331 | { |
332 | struct amdgpu_device *adev = container_of(vpe, struct amdgpu_device, vpe); |
333 | int r; |
334 | |
	r = amdgpu_bo_create_kernel(adev, PAGE_SIZE, PAGE_SIZE,
				    AMDGPU_GEM_DOMAIN_GTT,
				    &adev->vpe.cmdbuf_obj,
				    &adev->vpe.cmdbuf_gpu_addr,
				    (void **)&adev->vpe.cmdbuf_cpu_addr);
340 | if (r) { |
		dev_err(adev->dev, "VPE: failed to allocate cmdbuf bo %d\n", r);
342 | return r; |
343 | } |
344 | |
345 | vpe->context_started = false; |
346 | INIT_DELAYED_WORK(&adev->vpe.idle_work, vpe_idle_work_handler); |
347 | |
348 | return 0; |
349 | } |
350 | |
351 | static int vpe_sw_init(void *handle) |
352 | { |
353 | struct amdgpu_device *adev = (struct amdgpu_device *)handle; |
354 | struct amdgpu_vpe *vpe = &adev->vpe; |
355 | int ret; |
356 | |
357 | ret = vpe_common_init(vpe); |
358 | if (ret) |
359 | goto out; |
360 | |
361 | ret = vpe_irq_init(vpe); |
362 | if (ret) |
363 | goto out; |
364 | |
365 | ret = vpe_ring_init(vpe); |
366 | if (ret) |
367 | goto out; |
368 | |
369 | ret = vpe_init_microcode(vpe); |
370 | if (ret) |
371 | goto out; |
372 | out: |
373 | return ret; |
374 | } |
375 | |
376 | static int vpe_sw_fini(void *handle) |
377 | { |
378 | struct amdgpu_device *adev = (struct amdgpu_device *)handle; |
379 | struct amdgpu_vpe *vpe = &adev->vpe; |
380 | |
	release_firmware(vpe->fw);
382 | vpe->fw = NULL; |
383 | |
384 | vpe_ring_fini(vpe); |
385 | |
	amdgpu_bo_free_kernel(&adev->vpe.cmdbuf_obj,
			      &adev->vpe.cmdbuf_gpu_addr,
			      (void **)&adev->vpe.cmdbuf_cpu_addr);
389 | |
390 | return 0; |
391 | } |
392 | |
393 | static int vpe_hw_init(void *handle) |
394 | { |
395 | struct amdgpu_device *adev = (struct amdgpu_device *)handle; |
396 | struct amdgpu_vpe *vpe = &adev->vpe; |
397 | int ret; |
398 | |
399 | /* Power on VPE */ |
	ret = amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VPE,
						     AMD_PG_STATE_UNGATE);
402 | if (ret) |
403 | return ret; |
404 | |
405 | ret = vpe_load_microcode(vpe); |
406 | if (ret) |
407 | return ret; |
408 | |
409 | ret = vpe_ring_start(vpe); |
410 | if (ret) |
411 | return ret; |
412 | |
413 | return 0; |
414 | } |
415 | |
416 | static int vpe_hw_fini(void *handle) |
417 | { |
418 | struct amdgpu_device *adev = (struct amdgpu_device *)handle; |
419 | struct amdgpu_vpe *vpe = &adev->vpe; |
420 | |
421 | vpe_ring_stop(vpe); |
422 | |
423 | /* Power off VPE */ |
	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VPE, AMD_PG_STATE_GATE);
425 | |
426 | return 0; |
427 | } |
428 | |
429 | static int vpe_suspend(void *handle) |
430 | { |
431 | struct amdgpu_device *adev = (struct amdgpu_device *)handle; |
432 | |
	cancel_delayed_work_sync(&adev->vpe.idle_work);
434 | |
	return vpe_hw_fini(adev);
436 | } |
437 | |
438 | static int vpe_resume(void *handle) |
439 | { |
440 | struct amdgpu_device *adev = (struct amdgpu_device *)handle; |
441 | |
	return vpe_hw_init(adev);
443 | } |
444 | |
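/*
 * Pad the ring with NOPs; the first NOP packet carries the number of
 * trailing NOPs (count - 1) in its header count field.
 */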
445 | static void vpe_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count) |
446 | { |
447 | int i; |
448 | |
449 | for (i = 0; i < count; i++) |
450 | if (i == 0) |
			amdgpu_ring_write(ring, ring->funcs->nop |
					  VPE_CMD_NOP_HEADER_COUNT(count - 1));
		else
			amdgpu_ring_write(ring, ring->funcs->nop);
455 | } |
456 | |
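/*
 * Return the GPU address of the context save area (CSA) slot used for
 * preemption state. No CSA is used under SR-IOV, for VMID 0, or when
 * mid-command-buffer preemption (MCBP) is disabled.
 */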
457 | static uint64_t vpe_get_csa_mc_addr(struct amdgpu_ring *ring, uint32_t vmid) |
458 | { |
459 | struct amdgpu_device *adev = ring->adev; |
460 | uint32_t index = 0; |
461 | uint64_t csa_mc_addr; |
462 | |
463 | if (amdgpu_sriov_vf(adev) || vmid == 0 || !adev->gfx.mcbp) |
464 | return 0; |
465 | |
466 | csa_mc_addr = amdgpu_csa_vaddr(adev) + AMDGPU_CSA_VPE_OFFSET + |
467 | index * AMDGPU_CSA_VPE_SIZE; |
468 | |
469 | return csa_mc_addr; |
470 | } |
471 | |
472 | static void vpe_ring_emit_pred_exec(struct amdgpu_ring *ring, |
473 | uint32_t device_select, |
474 | uint32_t exec_count) |
475 | { |
476 | if (!ring->adev->vpe.collaborate_mode) |
477 | return; |
478 | |
479 | amdgpu_ring_write(ring, VPE_CMD_HEADER(VPE_CMD_OPCODE_PRED_EXE, 0) | |
480 | (device_select << 16)); |
	amdgpu_ring_write(ring, exec_count & 0x1fff);
482 | } |
483 | |
484 | static void vpe_ring_emit_ib(struct amdgpu_ring *ring, |
485 | struct amdgpu_job *job, |
486 | struct amdgpu_ib *ib, |
487 | uint32_t flags) |
488 | { |
489 | uint32_t vmid = AMDGPU_JOB_GET_VMID(job); |
490 | uint64_t csa_mc_addr = vpe_get_csa_mc_addr(ring, vmid); |
491 | |
492 | amdgpu_ring_write(ring, VPE_CMD_HEADER(VPE_CMD_OPCODE_INDIRECT, 0) | |
493 | VPE_CMD_INDIRECT_HEADER_VMID(vmid & 0xf)); |
494 | |
495 | /* base must be 32 byte aligned */ |
	amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0);
	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
	amdgpu_ring_write(ring, ib->length_dw);
499 | amdgpu_ring_write(ring, lower_32_bits(csa_mc_addr)); |
500 | amdgpu_ring_write(ring, upper_32_bits(csa_mc_addr)); |
501 | } |
502 | |
503 | static void vpe_ring_emit_fence(struct amdgpu_ring *ring, uint64_t addr, |
504 | uint64_t seq, unsigned int flags) |
505 | { |
506 | int i = 0; |
507 | |
508 | do { |
509 | /* write the fence */ |
510 | amdgpu_ring_write(ring, VPE_CMD_HEADER(VPE_CMD_OPCODE_FENCE, 0)); |
511 | /* zero in first two bits */ |
512 | WARN_ON_ONCE(addr & 0x3); |
513 | amdgpu_ring_write(ring, lower_32_bits(addr)); |
514 | amdgpu_ring_write(ring, upper_32_bits(addr)); |
		amdgpu_ring_write(ring, i == 0 ? lower_32_bits(seq) : upper_32_bits(seq));
516 | addr += 4; |
517 | } while ((flags & AMDGPU_FENCE_FLAG_64BIT) && (i++ < 1)); |
518 | |
519 | if (flags & AMDGPU_FENCE_FLAG_INT) { |
520 | /* generate an interrupt */ |
521 | amdgpu_ring_write(ring, VPE_CMD_HEADER(VPE_CMD_OPCODE_TRAP, 0)); |
		amdgpu_ring_write(ring, 0);
	}
}
526 | |
527 | static void vpe_ring_emit_pipeline_sync(struct amdgpu_ring *ring) |
528 | { |
529 | uint32_t seq = ring->fence_drv.sync_seq; |
530 | uint64_t addr = ring->fence_drv.gpu_addr; |
531 | |
	vpe_ring_emit_pred_exec(ring, 0, 6);
533 | |
534 | /* wait for idle */ |
535 | amdgpu_ring_write(ring, VPE_CMD_HEADER(VPE_CMD_OPCODE_POLL_REGMEM, |
536 | VPE_POLL_REGMEM_SUBOP_REGMEM) | |
537 | VPE_CMD_POLL_REGMEM_HEADER_FUNC(3) | /* equal */ |
538 | VPE_CMD_POLL_REGMEM_HEADER_MEM(1)); |
	amdgpu_ring_write(ring, addr & 0xfffffffc);
	amdgpu_ring_write(ring, upper_32_bits(addr));
	amdgpu_ring_write(ring, seq); /* reference */
	amdgpu_ring_write(ring, 0xffffffff); /* mask */
543 | amdgpu_ring_write(ring, VPE_CMD_POLL_REGMEM_DW5_RETRY_COUNT(0xfff) | |
544 | VPE_CMD_POLL_REGMEM_DW5_INTERVAL(4)); |
545 | } |
546 | |
547 | static void vpe_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, uint32_t val) |
548 | { |
	vpe_ring_emit_pred_exec(ring, 0, 3);
550 | |
551 | amdgpu_ring_write(ring, VPE_CMD_HEADER(VPE_CMD_OPCODE_REG_WRITE, 0)); |
	amdgpu_ring_write(ring, reg << 2);
	amdgpu_ring_write(ring, val);
554 | } |
555 | |
556 | static void vpe_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg, |
557 | uint32_t val, uint32_t mask) |
558 | { |
	vpe_ring_emit_pred_exec(ring, 0, 6);
560 | |
561 | amdgpu_ring_write(ring, VPE_CMD_HEADER(VPE_CMD_OPCODE_POLL_REGMEM, |
562 | VPE_POLL_REGMEM_SUBOP_REGMEM) | |
563 | VPE_CMD_POLL_REGMEM_HEADER_FUNC(3) | /* equal */ |
564 | VPE_CMD_POLL_REGMEM_HEADER_MEM(0)); |
	amdgpu_ring_write(ring, reg << 2);
	amdgpu_ring_write(ring, 0);
	amdgpu_ring_write(ring, val); /* reference */
	amdgpu_ring_write(ring, mask); /* mask */
569 | amdgpu_ring_write(ring, VPE_CMD_POLL_REGMEM_DW5_RETRY_COUNT(0xfff) | |
570 | VPE_CMD_POLL_REGMEM_DW5_INTERVAL(10)); |
571 | } |
572 | |
573 | static void vpe_ring_emit_vm_flush(struct amdgpu_ring *ring, unsigned int vmid, |
574 | uint64_t pd_addr) |
575 | { |
576 | amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr); |
577 | } |
578 | |
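/*
 * Emit a COND_EXE packet and return the ring offset of its execute-count
 * dword (written as 0 here); the caller later patches that dword with the
 * actual number of dwords to execute conditionally.
 */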
579 | static unsigned int vpe_ring_init_cond_exec(struct amdgpu_ring *ring, |
580 | uint64_t addr) |
581 | { |
582 | unsigned int ret; |
583 | |
584 | amdgpu_ring_write(ring, VPE_CMD_HEADER(VPE_CMD_OPCODE_COND_EXE, 0)); |
585 | amdgpu_ring_write(ring, lower_32_bits(addr)); |
586 | amdgpu_ring_write(ring, upper_32_bits(addr)); |
	amdgpu_ring_write(ring, 1);
	ret = ring->wptr & ring->buf_mask;
	amdgpu_ring_write(ring, 0);
590 | |
591 | return ret; |
592 | } |
593 | |
594 | static int vpe_ring_preempt_ib(struct amdgpu_ring *ring) |
595 | { |
596 | struct amdgpu_device *adev = ring->adev; |
597 | struct amdgpu_vpe *vpe = &adev->vpe; |
598 | uint32_t preempt_reg = vpe->regs.queue0_preempt; |
599 | int i, r = 0; |
600 | |
601 | /* assert preemption condition */ |
	amdgpu_ring_set_preempt_cond_exec(ring, false);
603 | |
604 | /* emit the trailing fence */ |
605 | ring->trail_seq += 1; |
	amdgpu_ring_alloc(ring, 10);
	vpe_ring_emit_fence(ring, ring->trail_fence_gpu_addr, ring->trail_seq, 0);
608 | amdgpu_ring_commit(ring); |
609 | |
610 | /* assert IB preemption */ |
611 | WREG32(vpe_get_reg_offset(vpe, ring->me, preempt_reg), 1); |
612 | |
613 | /* poll the trailing fence */ |
614 | for (i = 0; i < adev->usec_timeout; i++) { |
615 | if (ring->trail_seq == |
616 | le32_to_cpu(*(ring->trail_fence_cpu_addr))) |
617 | break; |
618 | udelay(1); |
619 | } |
620 | |
621 | if (i >= adev->usec_timeout) { |
622 | r = -EINVAL; |
		dev_err(adev->dev, "ring %d failed to be preempted\n", ring->idx);
624 | } |
625 | |
626 | /* deassert IB preemption */ |
627 | WREG32(vpe_get_reg_offset(vpe, ring->me, preempt_reg), 0); |
628 | |
629 | /* deassert the preemption condition */ |
	amdgpu_ring_set_preempt_cond_exec(ring, true);
631 | |
632 | return r; |
633 | } |
634 | |
635 | static int vpe_set_clockgating_state(void *handle, |
636 | enum amd_clockgating_state state) |
637 | { |
638 | return 0; |
639 | } |
640 | |
641 | static int vpe_set_powergating_state(void *handle, |
642 | enum amd_powergating_state state) |
643 | { |
644 | struct amdgpu_device *adev = (struct amdgpu_device *)handle; |
645 | struct amdgpu_vpe *vpe = &adev->vpe; |
646 | |
647 | if (!adev->pm.dpm_enabled) |
		dev_err(adev->dev, "Without PM, cannot support powergating\n");
649 | |
	dev_dbg(adev->dev, "%s: %s!\n", __func__, (state == AMD_PG_STATE_GATE) ? "GATE" : "UNGATE");
651 | |
652 | if (state == AMD_PG_STATE_GATE) { |
		amdgpu_dpm_enable_vpe(adev, false);
		vpe->context_started = false;
	} else {
		amdgpu_dpm_enable_vpe(adev, true);
657 | } |
658 | |
659 | return 0; |
660 | } |
661 | |
662 | static uint64_t vpe_ring_get_rptr(struct amdgpu_ring *ring) |
663 | { |
664 | struct amdgpu_device *adev = ring->adev; |
665 | struct amdgpu_vpe *vpe = &adev->vpe; |
666 | uint64_t rptr; |
667 | |
668 | if (ring->use_doorbell) { |
		rptr = atomic64_read((atomic64_t *)ring->rptr_cpu_addr);
		dev_dbg(adev->dev, "rptr/doorbell before shift == 0x%016llx\n", rptr);
671 | } else { |
672 | rptr = RREG32(vpe_get_reg_offset(vpe, ring->me, vpe->regs.queue0_rb_rptr_hi)); |
673 | rptr = rptr << 32; |
674 | rptr |= RREG32(vpe_get_reg_offset(vpe, ring->me, vpe->regs.queue0_rb_rptr_lo)); |
		dev_dbg(adev->dev, "rptr before shift [%i] == 0x%016llx\n", ring->me, rptr);
676 | } |
677 | |
678 | return (rptr >> 2); |
679 | } |
680 | |
681 | static uint64_t vpe_ring_get_wptr(struct amdgpu_ring *ring) |
682 | { |
683 | struct amdgpu_device *adev = ring->adev; |
684 | struct amdgpu_vpe *vpe = &adev->vpe; |
685 | uint64_t wptr; |
686 | |
687 | if (ring->use_doorbell) { |
		wptr = atomic64_read((atomic64_t *)ring->wptr_cpu_addr);
		dev_dbg(adev->dev, "wptr/doorbell before shift == 0x%016llx\n", wptr);
690 | } else { |
691 | wptr = RREG32(vpe_get_reg_offset(vpe, ring->me, vpe->regs.queue0_rb_wptr_hi)); |
692 | wptr = wptr << 32; |
693 | wptr |= RREG32(vpe_get_reg_offset(vpe, ring->me, vpe->regs.queue0_rb_wptr_lo)); |
		dev_dbg(adev->dev, "wptr before shift [%i] == 0x%016llx\n", ring->me, wptr);
695 | } |
696 | |
697 | return (wptr >> 2); |
698 | } |
699 | |
700 | static void vpe_ring_set_wptr(struct amdgpu_ring *ring) |
701 | { |
702 | struct amdgpu_device *adev = ring->adev; |
703 | struct amdgpu_vpe *vpe = &adev->vpe; |
704 | |
705 | if (ring->use_doorbell) { |
706 | dev_dbg(adev->dev, "Using doorbell, \ |
707 | wptr_offs == 0x%08x, \ |
708 | lower_32_bits(ring->wptr) << 2 == 0x%08x, \ |
			upper_32_bits(ring->wptr) << 2 == 0x%08x\n",
710 | ring->wptr_offs, |
711 | lower_32_bits(ring->wptr << 2), |
712 | upper_32_bits(ring->wptr << 2)); |
		atomic64_set((atomic64_t *)ring->wptr_cpu_addr, ring->wptr << 2);
714 | WDOORBELL64(ring->doorbell_index, ring->wptr << 2); |
715 | if (vpe->collaborate_mode) |
716 | WDOORBELL64(ring->doorbell_index + 4, ring->wptr << 2); |
717 | } else { |
718 | int i; |
719 | |
720 | for (i = 0; i < vpe->num_instances; i++) { |
721 | dev_dbg(adev->dev, "Not using doorbell, \ |
722 | regVPEC_QUEUE0_RB_WPTR == 0x%08x, \ |
				regVPEC_QUEUE0_RB_WPTR_HI == 0x%08x\n",
724 | lower_32_bits(ring->wptr << 2), |
725 | upper_32_bits(ring->wptr << 2)); |
726 | WREG32(vpe_get_reg_offset(vpe, i, vpe->regs.queue0_rb_wptr_lo), |
727 | lower_32_bits(ring->wptr << 2)); |
728 | WREG32(vpe_get_reg_offset(vpe, i, vpe->regs.queue0_rb_wptr_hi), |
729 | upper_32_bits(ring->wptr << 2)); |
730 | } |
731 | } |
732 | } |
733 | |
734 | static int vpe_ring_test_ring(struct amdgpu_ring *ring) |
735 | { |
736 | struct amdgpu_device *adev = ring->adev; |
737 | const uint32_t test_pattern = 0xdeadbeef; |
738 | uint32_t index, i; |
739 | uint64_t wb_addr; |
740 | int ret; |
741 | |
	ret = amdgpu_device_wb_get(adev, &index);
	if (ret) {
		dev_err(adev->dev, "(%d) failed to allocate wb slot\n", ret);
745 | return ret; |
746 | } |
747 | |
748 | adev->wb.wb[index] = 0; |
749 | wb_addr = adev->wb.gpu_addr + (index * 4); |
750 | |
	ret = amdgpu_ring_alloc(ring, 4);
	if (ret) {
		dev_err(adev->dev, "amdgpu: dma failed to lock ring %d (%d).\n", ring->idx, ret);
754 | goto out; |
755 | } |
756 | |
757 | amdgpu_ring_write(ring, VPE_CMD_HEADER(VPE_CMD_OPCODE_FENCE, 0)); |
758 | amdgpu_ring_write(ring, lower_32_bits(wb_addr)); |
759 | amdgpu_ring_write(ring, upper_32_bits(wb_addr)); |
	amdgpu_ring_write(ring, test_pattern);
761 | amdgpu_ring_commit(ring); |
762 | |
763 | for (i = 0; i < adev->usec_timeout; i++) { |
764 | if (le32_to_cpu(adev->wb.wb[index]) == test_pattern) |
765 | goto out; |
766 | udelay(1); |
767 | } |
768 | |
769 | ret = -ETIMEDOUT; |
770 | out: |
	amdgpu_device_wb_free(adev, index);
772 | |
773 | return ret; |
774 | } |
775 | |
776 | static int vpe_ring_test_ib(struct amdgpu_ring *ring, long timeout) |
777 | { |
778 | struct amdgpu_device *adev = ring->adev; |
779 | const uint32_t test_pattern = 0xdeadbeef; |
780 | struct amdgpu_ib ib = {}; |
781 | struct dma_fence *f = NULL; |
782 | uint32_t index; |
783 | uint64_t wb_addr; |
784 | int ret; |
785 | |
	ret = amdgpu_device_wb_get(adev, &index);
	if (ret) {
		dev_err(adev->dev, "(%d) failed to allocate wb slot\n", ret);
789 | return ret; |
790 | } |
791 | |
792 | adev->wb.wb[index] = 0; |
793 | wb_addr = adev->wb.gpu_addr + (index * 4); |
794 | |
	ret = amdgpu_ib_get(adev, NULL, 256, AMDGPU_IB_POOL_DIRECT, &ib);
796 | if (ret) |
797 | goto err0; |
798 | |
799 | ib.ptr[0] = VPE_CMD_HEADER(VPE_CMD_OPCODE_FENCE, 0); |
800 | ib.ptr[1] = lower_32_bits(wb_addr); |
801 | ib.ptr[2] = upper_32_bits(wb_addr); |
802 | ib.ptr[3] = test_pattern; |
803 | ib.ptr[4] = VPE_CMD_HEADER(VPE_CMD_OPCODE_NOP, 0); |
804 | ib.ptr[5] = VPE_CMD_HEADER(VPE_CMD_OPCODE_NOP, 0); |
805 | ib.ptr[6] = VPE_CMD_HEADER(VPE_CMD_OPCODE_NOP, 0); |
806 | ib.ptr[7] = VPE_CMD_HEADER(VPE_CMD_OPCODE_NOP, 0); |
807 | ib.length_dw = 8; |
808 | |
	ret = amdgpu_ib_schedule(ring, 1, &ib, NULL, &f);
810 | if (ret) |
811 | goto err1; |
812 | |
	ret = dma_fence_wait_timeout(f, false, timeout);
814 | if (ret <= 0) { |
815 | ret = ret ? : -ETIMEDOUT; |
816 | goto err1; |
817 | } |
818 | |
819 | ret = (le32_to_cpu(adev->wb.wb[index]) == test_pattern) ? 0 : -EINVAL; |
820 | |
821 | err1: |
	amdgpu_ib_free(adev, &ib, NULL);
	dma_fence_put(f);
824 | err0: |
	amdgpu_device_wb_free(adev, index);
826 | |
827 | return ret; |
828 | } |
829 | |
830 | static void vpe_ring_begin_use(struct amdgpu_ring *ring) |
831 | { |
832 | struct amdgpu_device *adev = ring->adev; |
833 | struct amdgpu_vpe *vpe = &adev->vpe; |
834 | |
	cancel_delayed_work_sync(&adev->vpe.idle_work);
836 | |
837 | /* Power on VPE and notify VPE of new context */ |
838 | if (!vpe->context_started) { |
839 | uint32_t context_notify; |
840 | |
841 | /* Power on VPE */ |
		amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VPE, AMD_PG_STATE_UNGATE);
843 | |
		/* Toggle the context-indicator bit to tell the VPE FW that a
		 * job from a new context has been submitted.
		 */
845 | context_notify = RREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.context_indicator)); |
846 | if ((context_notify & 0x1) == 0) |
847 | context_notify |= 0x1; |
848 | else |
849 | context_notify &= ~(0x1); |
850 | WREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.context_indicator), context_notify); |
851 | vpe->context_started = true; |
852 | } |
853 | } |
854 | |
855 | static void vpe_ring_end_use(struct amdgpu_ring *ring) |
856 | { |
857 | struct amdgpu_device *adev = ring->adev; |
858 | |
	schedule_delayed_work(&adev->vpe.idle_work, VPE_IDLE_TIMEOUT);
860 | } |
861 | |
862 | static const struct amdgpu_ring_funcs vpe_ring_funcs = { |
863 | .type = AMDGPU_RING_TYPE_VPE, |
864 | .align_mask = 0xf, |
865 | .nop = VPE_CMD_HEADER(VPE_CMD_OPCODE_NOP, 0), |
866 | .support_64bit_ptrs = true, |
867 | .get_rptr = vpe_ring_get_rptr, |
868 | .get_wptr = vpe_ring_get_wptr, |
869 | .set_wptr = vpe_ring_set_wptr, |
870 | .emit_frame_size = |
871 | 5 + /* vpe_ring_init_cond_exec */ |
872 | 6 + /* vpe_ring_emit_pipeline_sync */ |
873 | 10 + 10 + 10 + /* vpe_ring_emit_fence */ |
874 | /* vpe_ring_emit_vm_flush */ |
875 | SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 + |
876 | SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6, |
877 | .emit_ib_size = 7 + 6, |
878 | .emit_ib = vpe_ring_emit_ib, |
879 | .emit_pipeline_sync = vpe_ring_emit_pipeline_sync, |
880 | .emit_fence = vpe_ring_emit_fence, |
881 | .emit_vm_flush = vpe_ring_emit_vm_flush, |
882 | .emit_wreg = vpe_ring_emit_wreg, |
883 | .emit_reg_wait = vpe_ring_emit_reg_wait, |
884 | .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper, |
885 | .insert_nop = vpe_ring_insert_nop, |
886 | .pad_ib = amdgpu_ring_generic_pad_ib, |
887 | .test_ring = vpe_ring_test_ring, |
888 | .test_ib = vpe_ring_test_ib, |
889 | .init_cond_exec = vpe_ring_init_cond_exec, |
890 | .preempt_ib = vpe_ring_preempt_ib, |
891 | .begin_use = vpe_ring_begin_use, |
892 | .end_use = vpe_ring_end_use, |
893 | }; |
894 | |
895 | static void vpe_set_ring_funcs(struct amdgpu_device *adev) |
896 | { |
897 | adev->vpe.ring.funcs = &vpe_ring_funcs; |
898 | } |
899 | |
900 | const struct amd_ip_funcs vpe_ip_funcs = { |
	.name = "vpe_v6_1",
902 | .early_init = vpe_early_init, |
903 | .late_init = NULL, |
904 | .sw_init = vpe_sw_init, |
905 | .sw_fini = vpe_sw_fini, |
906 | .hw_init = vpe_hw_init, |
907 | .hw_fini = vpe_hw_fini, |
908 | .suspend = vpe_suspend, |
909 | .resume = vpe_resume, |
910 | .soft_reset = NULL, |
911 | .set_clockgating_state = vpe_set_clockgating_state, |
912 | .set_powergating_state = vpe_set_powergating_state, |
913 | }; |
914 | |
915 | const struct amdgpu_ip_block_version vpe_v6_1_ip_block = { |
916 | .type = AMD_IP_BLOCK_TYPE_VPE, |
917 | .major = 6, |
918 | .minor = 1, |
919 | .rev = 0, |
920 | .funcs = &vpe_ip_funcs, |
921 | }; |
922 | |