// SPDX-License-Identifier: GPL-2.0+
/* Copyright (C) 2018 Broadcom */

/**
 * DOC: Broadcom V3D scheduling
 *
 * The shared DRM GPU scheduler is used to coordinate submitting jobs
 * to the hardware. Each DRM fd (roughly a client process) gets its
 * own scheduler entity, which will process jobs in order. The GPU
 * scheduler will round-robin between clients to submit the next job.
 *
 * For simplicity, and in order to keep latency low for interactive
 * jobs when bulk background jobs are queued up, we submit a new job
 * to the HW only when it has completed the last one, instead of
 * filling up the CT[01]Q FIFOs with jobs. Similarly, we use
 * drm_sched_job_add_dependency() to manage the dependency between bin and
 * render, instead of having the clients submit jobs using the HW's
 * semaphores to interlock between them.
 */

#include <linux/kthread.h>

#include "v3d_drv.h"
#include "v3d_regs.h"
#include "v3d_trace.h"

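/* Upcast helpers from the scheduler's embedded struct drm_sched_job
 * back to the containing v3d job types.
 */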
static struct v3d_job *
to_v3d_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_job, base);
}

static struct v3d_bin_job *
to_bin_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_bin_job, base.base);
}

static struct v3d_render_job *
to_render_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_render_job, base.base);
}

static struct v3d_tfu_job *
to_tfu_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_tfu_job, base.base);
}

static struct v3d_csd_job *
to_csd_job(struct drm_sched_job *sched_job)
{
	return container_of(sched_job, struct v3d_csd_job, base.base);
}

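/* free_job hook: called once the scheduler is done with a job, to
 * release its resources through the common v3d_job_cleanup() path.
 */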
static void
v3d_sched_job_free(struct drm_sched_job *sched_job)
{
	struct v3d_job *job = to_v3d_job(sched_job);

	v3d_job_cleanup(job);
}

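/* Stop the currently active perfmon if the incoming job uses a
 * different one (capturing its final counter values), then start the
 * job's perfmon if it has one and it isn't already active.
 */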
static void
v3d_switch_perfmon(struct v3d_dev *v3d, struct v3d_job *job)
{
	if (job->perfmon != v3d->active_perfmon)
		v3d_perfmon_stop(v3d, v3d->active_perfmon, true);

	if (job->perfmon && v3d->active_perfmon != job->perfmon)
		v3d_perfmon_start(v3d, job->perfmon);
}

static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_bin_job *job = to_bin_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	struct drm_device *dev = &v3d->drm;
	struct dma_fence *fence;
	unsigned long irqflags;

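	/* Don't touch the HW if the scheduler has already flagged the
	 * job's finished fence with an error (e.g. the job was marked
	 * guilty during timeout recovery).
	 */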
	if (unlikely(job->base.base.s_fence->finished.error))
		return NULL;

	/* Lock required around bin_job update vs
	 * v3d_overflow_mem_work().
	 */
	spin_lock_irqsave(&v3d->job_lock, irqflags);
	v3d->bin_job = job;
	/* Clear out the overflow allocation, so we don't
	 * reuse the overflow attached to a previous job.
	 */
	V3D_CORE_WRITE(0, V3D_PTB_BPOS, 0);
	spin_unlock_irqrestore(&v3d->job_lock, irqflags);

	v3d_invalidate_caches(v3d);

	fence = v3d_fence_create(v3d, V3D_BIN);
	if (IS_ERR(fence))
		return NULL;

	if (job->base.irq_fence)
		dma_fence_put(job->base.irq_fence);
	job->base.irq_fence = dma_fence_get(fence);

	trace_v3d_submit_cl(dev, false, to_v3d_fence(fence)->seqno,
			    job->start, job->end);

	v3d_switch_perfmon(v3d, &job->base);

	/* Set the current and end address of the control list.
	 * Writing the end register is what starts the job.
	 */
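	/* Point the binner at the job's tile allocation memory and tile
	 * state buffer, if the job provided them.
	 */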
	if (job->qma) {
		V3D_CORE_WRITE(0, V3D_CLE_CT0QMA, job->qma);
		V3D_CORE_WRITE(0, V3D_CLE_CT0QMS, job->qms);
	}
	if (job->qts) {
		V3D_CORE_WRITE(0, V3D_CLE_CT0QTS,
			       V3D_CLE_CT0QTS_ENABLE |
			       job->qts);
	}
	V3D_CORE_WRITE(0, V3D_CLE_CT0QBA, job->start);
	V3D_CORE_WRITE(0, V3D_CLE_CT0QEA, job->end);

	return fence;
}

static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_render_job *job = to_render_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	struct drm_device *dev = &v3d->drm;
	struct dma_fence *fence;

	if (unlikely(job->base.base.s_fence->finished.error))
		return NULL;

	v3d->render_job = job;

	/* Can we avoid this flush?  We need to be careful of
	 * scheduling, though -- imagine job0 rendering to texture and
	 * job1 reading, and them being executed as bin0, bin1,
	 * render0, render1, so that render1's flush at bin time
	 * wasn't enough.
	 */
	v3d_invalidate_caches(v3d);

	fence = v3d_fence_create(v3d, V3D_RENDER);
	if (IS_ERR(fence))
		return NULL;

	if (job->base.irq_fence)
		dma_fence_put(job->base.irq_fence);
	job->base.irq_fence = dma_fence_get(fence);

	trace_v3d_submit_cl(dev, true, to_v3d_fence(fence)->seqno,
			    job->start, job->end);

	v3d_switch_perfmon(v3d, &job->base);

	/* XXX: Set the QCFG */

	/* Set the current and end address of the control list.
	 * Writing the end register is what starts the job.
	 */
	V3D_CORE_WRITE(0, V3D_CLE_CT1QBA, job->start);
	V3D_CORE_WRITE(0, V3D_CLE_CT1QEA, job->end);

	return fence;
}

static struct dma_fence *
v3d_tfu_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_tfu_job *job = to_tfu_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	struct drm_device *dev = &v3d->drm;
	struct dma_fence *fence;

	fence = v3d_fence_create(v3d, V3D_TFU);
	if (IS_ERR(fence))
		return NULL;

	v3d->tfu_job = job;
	if (job->base.irq_fence)
		dma_fence_put(job->base.irq_fence);
	job->base.irq_fence = dma_fence_get(fence);

	trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);

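	/* Program the TFU's input and output image descriptors; the
	 * coefficient registers only need updating when COEF0 flags
	 * them as in use.
	 */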
	V3D_WRITE(V3D_TFU_IIA, job->args.iia);
	V3D_WRITE(V3D_TFU_IIS, job->args.iis);
	V3D_WRITE(V3D_TFU_ICA, job->args.ica);
	V3D_WRITE(V3D_TFU_IUA, job->args.iua);
	V3D_WRITE(V3D_TFU_IOA, job->args.ioa);
	V3D_WRITE(V3D_TFU_IOS, job->args.ios);
	V3D_WRITE(V3D_TFU_COEF0, job->args.coef[0]);
	if (job->args.coef[0] & V3D_TFU_COEF0_USECOEF) {
		V3D_WRITE(V3D_TFU_COEF1, job->args.coef[1]);
		V3D_WRITE(V3D_TFU_COEF2, job->args.coef[2]);
		V3D_WRITE(V3D_TFU_COEF3, job->args.coef[3]);
	}
	/* ICFG kicks off the job. */
	V3D_WRITE(V3D_TFU_ICFG, job->args.icfg | V3D_TFU_ICFG_IOC);

	return fence;
}

static struct dma_fence *
v3d_csd_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_csd_job *job = to_csd_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	struct drm_device *dev = &v3d->drm;
	struct dma_fence *fence;
	int i;

	v3d->csd_job = job;

	v3d_invalidate_caches(v3d);

	fence = v3d_fence_create(v3d, V3D_CSD);
	if (IS_ERR(fence))
		return NULL;

	if (job->base.irq_fence)
		dma_fence_put(job->base.irq_fence);
	job->base.irq_fence = dma_fence_get(fence);

	trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);

	v3d_switch_perfmon(v3d, &job->base);

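	/* Write CFG1..CFG6 before CFG0, since the CFG0 write is what
	 * queues the dispatch.
	 */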
	for (i = 1; i <= 6; i++)
		V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0 + 4 * i, job->args.cfg[i]);
	/* CFG0 write kicks off the job. */
	V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0, job->args.cfg[0]);

	return fence;
}

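/* Cache-clean jobs have no HW queue to feed: clean the caches
 * synchronously and return no fence, so the scheduler treats the job
 * as complete as soon as run_job returns.
 */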
static struct dma_fence *
v3d_cache_clean_job_run(struct drm_sched_job *sched_job)
{
	struct v3d_job *job = to_v3d_job(sched_job);
	struct v3d_dev *v3d = job->v3d;

	v3d_clean_caches(v3d);

	return NULL;
}

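/* Common timeout recovery: stop all the queues' schedulers, increase
 * the timed-out job's karma (if any), reset the GPU, then resubmit the
 * outstanding jobs and restart the schedulers.
 */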
static enum drm_gpu_sched_stat
v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
{
	enum v3d_queue q;

	mutex_lock(&v3d->reset_lock);

	/* block scheduler */
	for (q = 0; q < V3D_MAX_QUEUES; q++)
		drm_sched_stop(&v3d->queue[q].sched, sched_job);

	if (sched_job)
		drm_sched_increase_karma(sched_job);

	/* get the GPU back into the init state */
	v3d_reset(v3d);

	for (q = 0; q < V3D_MAX_QUEUES; q++)
		drm_sched_resubmit_jobs(&v3d->queue[q].sched);

	/* Unblock schedulers and restart their jobs. */
	for (q = 0; q < V3D_MAX_QUEUES; q++)
		drm_sched_start(&v3d->queue[q].sched, true);

	mutex_unlock(&v3d->reset_lock);

	return DRM_GPU_SCHED_STAT_NOMINAL;
}

/* If the current address or return address has changed, then the GPU
 * has probably made progress and we should delay the reset.  This
 * could fail if the GPU got in an infinite loop in the CL, but that
 * is pretty unlikely outside of an i-g-t testcase.
 */
static enum drm_gpu_sched_stat
v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q,
		    u32 *timedout_ctca, u32 *timedout_ctra)
{
	struct v3d_job *job = to_v3d_job(sched_job);
	struct v3d_dev *v3d = job->v3d;
	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(q));
	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(q));

	if (*timedout_ctca != ctca || *timedout_ctra != ctra) {
		*timedout_ctca = ctca;
		*timedout_ctra = ctra;
		return DRM_GPU_SCHED_STAT_NOMINAL;
	}

	return v3d_gpu_reset_for_timeout(v3d, sched_job);
}

static enum drm_gpu_sched_stat
v3d_bin_job_timedout(struct drm_sched_job *sched_job)
{
	struct v3d_bin_job *job = to_bin_job(sched_job);

	return v3d_cl_job_timedout(sched_job, V3D_BIN,
				   &job->timedout_ctca, &job->timedout_ctra);
}

static enum drm_gpu_sched_stat
v3d_render_job_timedout(struct drm_sched_job *sched_job)
{
	struct v3d_render_job *job = to_render_job(sched_job);

	return v3d_cl_job_timedout(sched_job, V3D_RENDER,
				   &job->timedout_ctca, &job->timedout_ctra);
}

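/* For queues with no progress counter to check (TFU and cache clean),
 * any timeout goes straight to a GPU reset.
 */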
static enum drm_gpu_sched_stat
v3d_generic_job_timedout(struct drm_sched_job *sched_job)
{
	struct v3d_job *job = to_v3d_job(sched_job);

	return v3d_gpu_reset_for_timeout(job->v3d, sched_job);
}

static enum drm_gpu_sched_stat
v3d_csd_job_timedout(struct drm_sched_job *sched_job)
{
	struct v3d_csd_job *job = to_csd_job(sched_job);
	struct v3d_dev *v3d = job->base.v3d;
	u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4);

	/* If we've made progress, skip reset and let the timer get
	 * rearmed.
	 */
	if (job->timedout_batches != batches) {
		job->timedout_batches = batches;
		return DRM_GPU_SCHED_STAT_NOMINAL;
	}

	return v3d_gpu_reset_for_timeout(v3d, sched_job);
}

static const struct drm_sched_backend_ops v3d_bin_sched_ops = {
	.run_job = v3d_bin_job_run,
	.timedout_job = v3d_bin_job_timedout,
	.free_job = v3d_sched_job_free,
};

static const struct drm_sched_backend_ops v3d_render_sched_ops = {
	.run_job = v3d_render_job_run,
	.timedout_job = v3d_render_job_timedout,
	.free_job = v3d_sched_job_free,
};

static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
	.run_job = v3d_tfu_job_run,
	.timedout_job = v3d_generic_job_timedout,
	.free_job = v3d_sched_job_free,
};

static const struct drm_sched_backend_ops v3d_csd_sched_ops = {
	.run_job = v3d_csd_job_run,
	.timedout_job = v3d_csd_job_timedout,
	.free_job = v3d_sched_job_free,
};

static const struct drm_sched_backend_ops v3d_cache_clean_sched_ops = {
	.run_job = v3d_cache_clean_job_run,
	.timedout_job = v3d_generic_job_timedout,
	.free_job = v3d_sched_job_free,
};

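/* Create one scheduler instance per V3D queue.  hw_jobs_limit is 1
 * because we only feed the HW one job at a time (see the DOC comment
 * at the top of this file), and a job that makes no progress for
 * 500ms is considered hung.  CSD and CACHE_CLEAN are only set up on
 * hardware that has a CSD.
 */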
int
v3d_sched_init(struct v3d_dev *v3d)
{
	int hw_jobs_limit = 1;
	int job_hang_limit = 0;
	int hang_limit_ms = 500;
	int ret;

	ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
			     &v3d_bin_sched_ops,
			     DRM_SCHED_PRIORITY_COUNT,
			     hw_jobs_limit, job_hang_limit,
			     msecs_to_jiffies(hang_limit_ms), NULL,
			     NULL, "v3d_bin", v3d->drm.dev);
	if (ret)
		return ret;

	ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
			     &v3d_render_sched_ops,
			     DRM_SCHED_PRIORITY_COUNT,
			     hw_jobs_limit, job_hang_limit,
			     msecs_to_jiffies(hang_limit_ms), NULL,
			     NULL, "v3d_render", v3d->drm.dev);
	if (ret)
		goto fail;

	ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
			     &v3d_tfu_sched_ops,
			     DRM_SCHED_PRIORITY_COUNT,
			     hw_jobs_limit, job_hang_limit,
			     msecs_to_jiffies(hang_limit_ms), NULL,
			     NULL, "v3d_tfu", v3d->drm.dev);
	if (ret)
		goto fail;

	if (v3d_has_csd(v3d)) {
		ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
				     &v3d_csd_sched_ops,
				     DRM_SCHED_PRIORITY_COUNT,
				     hw_jobs_limit, job_hang_limit,
				     msecs_to_jiffies(hang_limit_ms), NULL,
				     NULL, "v3d_csd", v3d->drm.dev);
		if (ret)
			goto fail;

		ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
				     &v3d_cache_clean_sched_ops,
				     DRM_SCHED_PRIORITY_COUNT,
				     hw_jobs_limit, job_hang_limit,
				     msecs_to_jiffies(hang_limit_ms), NULL,
				     NULL, "v3d_cache_clean", v3d->drm.dev);
		if (ret)
			goto fail;
	}

	return 0;

fail:
	v3d_sched_fini(v3d);
	return ret;
}

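/* Tear down whichever schedulers were brought up; this also serves as
 * the unwind path for v3d_sched_init().
 */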
void
v3d_sched_fini(struct v3d_dev *v3d)
{
	enum v3d_queue q;

	for (q = 0; q < V3D_MAX_QUEUES; q++) {
		if (v3d->queue[q].sched.ready)
			drm_sched_fini(&v3d->queue[q].sched);
	}
}