intel_execlists_submission.c source code [linux/drivers/gpu/drm/i915/gt/intel_execlists_submission.c]

1	// SPDX-License-Identifier: MIT
2	/*
3	* Copyright © 2014 Intel Corporation
4	*/
5
6	/**
7	* DOC: Logical Rings, Logical Ring Contexts and Execlists
8	*
9	* Motivation:
10	* GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
11	* These expanded contexts enable a number of new abilities, especially
12	* "Execlists" (also implemented in this file).
13	*
14	* One of the main differences with the legacy HW contexts is that logical
15	* ring contexts incorporate many more things to the context's state, like
16	* PDPs or ringbuffer control registers:
17	*
18	* The reason why PDPs are included in the context is straightforward: as
19	* PPGTTs (per-process GTTs) are actually per-context, having the PDPs
20	* contained there means you don't need to do a ppgtt->switch_mm yourself,
21	* instead, the GPU will do it for you on the context switch.
22	*
23	* But, what about the ringbuffer control registers (head, tail, etc..)?
24	* shouldn't we just need a set of those per engine command streamer? This is
25	* where the name "Logical Rings" starts to make sense: by virtualizing the
26	* rings, the engine cs shifts to a new "ring buffer" with every context
27	* switch. When you want to submit a workload to the GPU you: A) choose your
28	* context, B) find its appropriate virtualized ring, C) write commands to it
29	* and then, finally, D) tell the GPU to switch to that context.
30	*
31	* Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
32	* to a context is via a context execution list, ergo "Execlists".
33	*
34	* LRC implementation:
35	* Regarding the creation of contexts, we have:
36	*
37	* - One global default context.
38	* - One local default context for each opened fd.
39	* - One local extra context for each context create ioctl call.
40	*
41	* Now that ringbuffers belong per-context (and not per-engine, like before)
42	* and that contexts are uniquely tied to a given engine (and not reusable,
43	* like before) we need:
44	*
45	* - One ringbuffer per-engine inside each context.
46	* - One backing object per-engine inside each context.
47	*
48	* The global default context starts its life with these new objects fully
49	* allocated and populated. The local default context for each opened fd is
50	* more complex, because we don't know at creation time which engine is going
51	* to use them. To handle this, we have implemented a deferred creation of LR
52	* contexts:
53	*
54	* The local context starts its life as a hollow or blank holder, that only
55	* gets populated for a given engine once we receive an execbuffer. If later
56	* on we receive another execbuffer ioctl for the same context but a different
57	* engine, we allocate/populate a new ringbuffer and context backing object and
58	* so on.
59	*
60	* Finally, regarding local contexts created using the ioctl call: as they are
61	* only allowed with the render ring, we can allocate & populate them right
62	* away (no need to defer anything, at least for now).
63	*
64	* Execlists implementation:
65	* Execlists are the new method by which, on gen8+ hardware, workloads are
66	* submitted for execution (as opposed to the legacy, ringbuffer-based, method).
67	* This method works as follows:
68	*
69	* When a request is committed, its commands (the BB start and any leading or
70	* trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
71	* for the appropriate context. The tail pointer in the hardware context is not
72	* updated at this time, but instead, kept by the driver in the ringbuffer
73	* structure. A structure representing this request is added to a request queue
74	* for the appropriate engine: this structure contains a copy of the context's
75	* tail after the request was written to the ring buffer and a pointer to the
76	* context itself.
77	*
78	* If the engine's request queue was empty before the request was added, the
79	* queue is processed immediately. Otherwise the queue will be processed during
80	* a context switch interrupt. In any case, elements on the queue will get sent
81	* (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
82	* globally unique 20-bits submission ID.
83	*
84	* When execution of a request completes, the GPU updates the context status
85	* buffer with a context complete event and generates a context switch interrupt.
86	* During the interrupt handling, the driver examines the events in the buffer:
87	* for each context complete event, if the announced ID matches that on the head
88	* of the request queue, then that request is retired and removed from the queue.
89	*
90	* After processing, if any requests were retired and the queue is not empty
91	* then a new execution list can be submitted. The two requests at the front of
92	* the queue are next to be submitted but since a context may not occur twice in
93	* an execution list, if subsequent requests have the same ID as the first then
94	* the two requests must be combined. This is done simply by discarding requests
95	* at the head of the queue until either only one request is left (in which case
96	* we use a NULL second context) or the first two requests have unique IDs.
97	*
98	* By always executing the first two requests in the queue the driver ensures
99	* that the GPU is kept as busy as possible. In the case where a single context
100	* completes but a second context is still executing, the request for this second
101	* context will be at the head of the queue when we remove the first one. This
102	* request will then be resubmitted along with a new request for a different context,
103	* which will cause the hardware to continue executing the second request and queue
104	* the new request (the GPU detects the condition of a context getting preempted
105	* with the same context and optimizes the context switch flow by not doing
106	* preemption, but just sampling the new tail pointer).
107	*
108	*/
109	#include <linux/interrupt.h>
110	#include <linux/string_helpers.h>
111
112	#include "i915_drv.h"
113	#include "i915_reg.h"
114	#include "i915_trace.h"
115	#include "i915_vgpu.h"
116	#include "gen8_engine_cs.h"
117	#include "intel_breadcrumbs.h"
118	#include "intel_context.h"
119	#include "intel_engine_heartbeat.h"
120	#include "intel_engine_pm.h"
121	#include "intel_engine_regs.h"
122	#include "intel_engine_stats.h"
123	#include "intel_execlists_submission.h"
124	#include "intel_gt.h"
125	#include "intel_gt_irq.h"
126	#include "intel_gt_pm.h"
127	#include "intel_gt_regs.h"
128	#include "intel_gt_requests.h"
129	#include "intel_lrc.h"
130	#include "intel_lrc_reg.h"
131	#include "intel_mocs.h"
132	#include "intel_reset.h"
133	#include "intel_ring.h"
134	#include "intel_workarounds.h"
135	#include "shmem_utils.h"
136
/* RING_EXECLIST_* bit definitions. */
#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

/* GEN8 context status event flags. */
#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

/* Either event flag matches this mask: complete or preempted. */
#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
/* True when the SW context ID field of @csb_dw is not the idle ID. */
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

#define XEHP_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	BIT(1) /* upper csb dword */
#define XEHP_CSB_SW_CTX_ID_MASK			GENMASK(31, 10)
#define XEHP_IDLE_CTX_ID			0xFFFF
/* True when the SW context ID field of @csb_dw is not the idle ID. */
#define XEHP_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(XEHP_CSB_SW_CTX_ID_MASK, csb_dw) != XEHP_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
169
170	struct virtual_engine {
171	struct intel_engine_cs base;
172	struct intel_context context;
173	struct rcu_work rcu;
174
175	/*
176	* We allow only a single request through the virtual engine at a time
177	* (each request in the timeline waits for the completion fence of
178	* the previous before being submitted). By restricting ourselves to
179	* only submitting a single request, each request is placed on to a
180	* physical to maximise load spreading (by virtue of the late greedy
181	* scheduling -- each real engine takes the next available request
182	* upon idling).
183	*/
184	struct i915_request *request;
185
186	/*
187	* We keep a rbtree of available virtual engines inside each physical
188	* engine, sorted by priority. Here we preallocate the nodes we need
189	* for the virtual engine, indexed by physical_engine->id.
190	*/
191	struct ve_node {
192	struct rb_node rb;
193	int prio;
194	} nodes[I915_NUM_ENGINES];
195
196	/ And finally, which physical engines this virtual engine maps onto. /
197	unsigned int num_siblings;
198	struct intel_engine_cs *siblings[];
199	};
200
201	static struct virtual_engine to_virtual_engine(struct* intel_engine_cs *engine)
202	{
203	GEM_BUG_ON(!intel_engine_is_virtual(engine));
204	return container_of(engine, struct virtual_engine, base);
205	}
206
/*
 * Forward declaration: builds a virtual-engine context over @count entries
 * of @siblings. The definition is not visible in this part of the file.
 */
static struct intel_context *
execlists_create_virtual(struct intel_engine_cs **siblings, unsigned int count,
			 unsigned long flags);
210
211	static struct i915_request *
212	__active_request(const struct intel_timeline * const tl,
213	struct i915_request *rq,
214	int error)
215	{
216	struct i915_request *active = rq;
217
218	list_for_each_entry_from_reverse(rq, &tl->requests, link) {
219	if (__i915_request_is_complete(rq))
220	break;
221
222	if (error) {
223	i915_request_set_error_once(rq, error);
224	__i915_request_skip(rq);
225	}
226	active = rq;
227	}
228
229	return active;
230	}
231
/*
 * Find the oldest incomplete request at or before @rq on timeline @tl,
 * without marking any request with an error.
 */
static struct i915_request *
active_request(const struct intel_timeline * const tl, struct i915_request *rq)
{
	return __active_request(tl, rq, 0);
}
237
238	static void ring_set_paused(const struct intel_engine_cs engine, int* state)
239	{
240	/*
241	* We inspect HWS_PREEMPT with a semaphore inside
242	* engine->emit_fini_breadcrumb. If the dword is true,
243	* the ring is paused as the semaphore will busywait
244	* until the dword is false.
245	*/
246	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
247	if (state)
248	wmb();
249	}
250
251	static struct i915_priolist to_priolist(struct* rb_node *rb)
252	{
253	return rb_entry(rb, struct i915_priolist, node);
254	}
255
256	static int rq_prio(const struct i915_request *rq)
257	{
258	return READ_ONCE(rq->sched.attr.priority);
259	}
260
261	static int effective_prio(const struct i915_request *rq)
262	{
263	int prio = rq_prio(rq);
264
265	/*
266	* If this request is special and must not be interrupted at any
267	* cost, so be it. Note we are only checking the most recent request
268	* in the context and so may be masking an earlier vip request. It
269	* is hoped that under the conditions where nopreempt is used, this
270	* will not matter (i.e. all requests to that context will be
271	* nopreempt for as long as desired).
272	*/
273	if (i915_request_has_nopreempt(rq))
274	prio = I915_PRIORITY_UNPREEMPTABLE;
275
276	return prio;
277	}
278
279	static int queue_prio(const struct i915_sched_engine *sched_engine)
280	{
281	struct rb_node *rb;
282
283	rb = rb_first_cached(&sched_engine->queue);
284	if (!rb)
285	return INT_MIN;
286
287	return to_priolist(rb)->priority;
288	}
289
290	static int virtual_prio(const struct intel_engine_execlists *el)
291	{
292	struct rb_node *rb = rb_first_cached(&el->virtual);
293
294	return rb ? rb_entry(rb, struct ve_node, rb)->prio : INT_MIN;
295	}
296
297	static bool need_preempt(const struct intel_engine_cs *engine,
298	const struct i915_request *rq)
299	{
300	int last_prio;
301
302	if (!intel_engine_has_semaphores(engine))
303	return false;
304
305	/*
306	* Check if the current priority hint merits a preemption attempt.
307	*
308	* We record the highest value priority we saw during rescheduling
309	* prior to this dequeue, therefore we know that if it is strictly
310	* less than the current tail of ESLP[0], we do not need to force
311	* a preempt-to-idle cycle.
312	*
313	* However, the priority hint is a mere hint that we may need to
314	* preempt. If that hint is stale or we may be trying to preempt
315	* ourselves, ignore the request.
316	*
317	* More naturally we would write
318	* prio >= max(0, last);
319	* except that we wish to prevent triggering preemption at the same
320	* priority level: the task that is running should remain running
321	* to preserve FIFO ordering of dependencies.
322	*/
323	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - `1`);
324	if (engine->sched_engine->queue_priority_hint <= last_prio)
325	return false;
326
327	/*
328	* Check against the first request in ELSP[1], it will, thanks to the
329	* power of PI, be the highest priority of that context.
330	*/
331	if (!list_is_last(list: &rq->sched.link, head: &engine->sched_engine->requests) &&
332	rq_prio(list_next_entry(rq, sched.link)) > last_prio)
333	return true;
334
335	/*
336	* If the inflight context did not trigger the preemption, then maybe
337	* it was the set of queued requests? Pick the highest priority in
338	* the queue (the first active priolist) and see if it deserves to be
339	* running instead of ELSP[0].
340	*
341	* The highest priority request in the queue can not be either
342	* ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
343	* context, it's priority would not exceed ELSP[0] aka last_prio.
344	*/
345	return max(virtual_prio(&engine->execlists),
346	queue_prio(engine->sched_engine)) > last_prio;
347	}
348
349	__maybe_unused static bool
350	assert_priority_queue(const struct i915_request *prev,
351	const struct i915_request *next)
352	{
353	/*
354	* Without preemption, the prev may refer to the still active element
355	* which we refuse to let go.
356	*
357	* Even with preemption, there are times when we think it is better not
358	* to preempt and leave an ostensibly lower priority request in flight.
359	*/
360	if (i915_request_is_active(rq: prev))
361	return true;
362
363	return rq_prio(rq: prev) >= rq_prio(rq: next);
364	}
365
366	static struct i915_request *
367	__unwind_incomplete_requests(struct intel_engine_cs *engine)
368	{
369	struct i915_request rq, rn, *active = NULL;
370	struct list_head *pl;
371	int prio = I915_PRIORITY_INVALID;
372
373	lockdep_assert_held(&engine->sched_engine->lock);
374
375	list_for_each_entry_safe_reverse(rq, rn,
376	&engine->sched_engine->requests,
377	sched.link) {
378	if (__i915_request_is_complete(rq)) {
379	list_del_init(entry: &rq->sched.link);
380	continue;
381	}
382
383	__i915_request_unsubmit(request: rq);
384
385	GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
386	if (rq_prio(rq) != prio) {
387	prio = rq_prio(rq);
388	pl = i915_sched_lookup_priolist(sched_engine: engine->sched_engine,
389	prio);
390	}
391	GEM_BUG_ON(i915_sched_engine_is_empty(engine->sched_engine));
392
393	list_move(list: &rq->sched.link, head: pl);
394	set_bit(nr: I915_FENCE_FLAG_PQUEUE, addr: &rq->fence.flags);
395
396	/ Check in case we rollback so far we wrap [size/2] /
397	if (intel_ring_direction(ring: rq->ring,
398	next: rq->tail,
399	prev: rq->ring->tail + `8`) > `0`)
400	rq->context->lrc.desc \|= CTX_DESC_FORCE_RESTORE;
401
402	active = rq;
403	}
404
405	return active;
406	}
407
408	struct i915_request *
409	execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
410	{
411	struct intel_engine_cs *engine =
412	container_of(execlists, typeof(*engine), execlists);
413
414	return __unwind_incomplete_requests(engine);
415	}
416
417	static void
418	execlists_context_status_change(struct i915_request rq, unsigned* long status)
419	{
420	/*
421	* Only used when GVT-g is enabled now. When GVT-g is disabled,
422	* The compiler should eliminate this function as dead-code.
423	*/
424	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
425	return;
426
427	atomic_notifier_call_chain(nh: &rq->engine->context_status_notifier,
428	val: status, v: rq);
429	}
430
431	static void reset_active(struct i915_request *rq,
432	struct intel_engine_cs *engine)
433	{
434	struct intel_context * const ce = rq->context;
435	u32 head;
436
437	/*
438	* The executing context has been cancelled. We want to prevent
439	* further execution along this context and propagate the error on
440	* to anything depending on its results.
441	*
442	* In __i915_request_submit(), we apply the -EIO and remove the
443	* requests' payloads for any banned requests. But first, we must
444	* rewind the context back to the start of the incomplete request so
445	* that we do not jump back into the middle of the batch.
446	*
447	* We preserve the breadcrumbs and semaphores of the incomplete
448	* requests so that inter-timeline dependencies (i.e other timelines)
449	* remain correctly ordered. And we defer to __i915_request_submit()
450	* so that all asynchronous waits are correctly handled.
451	*/
452	ENGINE_TRACE(engine, "{ reset rq=%llx:%lld }\n",
453	rq->fence.context, rq->fence.seqno);
454
455	/ On resubmission of the active request, payload will be scrubbed /
456	if (__i915_request_is_complete(rq))
457	head = rq->tail;
458	else
459	head = __active_request(tl: ce->timeline, rq, error: -EIO)->head;
460	head = intel_ring_wrap(ring: ce->ring, pos: head);
461
462	/ Scrub the context image to prevent replaying the previous batch /
463	lrc_init_regs(ce, engine, clear: true);
464
465	/ We've switched away, so this should be a no-op, but intent matters /
466	ce->lrc.lrca = lrc_update_regs(ce, engine, head);
467	}
468
469	static bool bad_request(const struct i915_request *rq)
470	{
471	return rq->fence.error && i915_request_started(rq);
472	}
473
474	static struct intel_engine_cs *
475	__execlists_schedule_in(struct i915_request *rq)
476	{
477	struct intel_engine_cs * const engine = rq->engine;
478	struct intel_context * const ce = rq->context;
479
480	intel_context_get(ce);
481
482	if (unlikely(intel_context_is_closed(ce) &&
483	!intel_engine_has_heartbeat(engine)))
484	intel_context_set_exiting(ce);
485
486	if (unlikely(!intel_context_is_schedulable(ce) \|\| bad_request(rq)))
487	reset_active(rq, engine);
488
489	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
490	lrc_check_regs(ce, engine, when: "before");
491
492	if (ce->tag) {
493	/ Use a fixed tag for OA and friends /
494	GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
495	ce->lrc.ccid = ce->tag;
496	} else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(`12`, `50`)) {
497	/ We don't need a strict matching tag, just different values /
498	unsigned int tag = ffs(READ_ONCE(engine->context_tag));
499
500	GEM_BUG_ON(tag == `0` \|\| tag >= BITS_PER_LONG);
501	clear_bit(nr: tag - `1`, addr: &engine->context_tag);
502	ce->lrc.ccid = tag << (XEHP_SW_CTX_ID_SHIFT - `32`);
503
504	BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
505
506	} else {
507	/ We don't need a strict matching tag, just different values /
508	unsigned int tag = __ffs(engine->context_tag);
509
510	GEM_BUG_ON(tag >= BITS_PER_LONG);
511	__clear_bit(tag, &engine->context_tag);
512	ce->lrc.ccid = (`1` + tag) << (GEN11_SW_CTX_ID_SHIFT - `32`);
513
514	BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
515	}
516
517	ce->lrc.ccid \|= engine->execlists.ccid;
518
519	__intel_gt_pm_get(gt: engine->gt);
520	if (engine->fw_domain && !engine->fw_active++)
521	intel_uncore_forcewake_get(uncore: engine->uncore, domains: engine->fw_domain);
522	execlists_context_status_change(rq, status: INTEL_CONTEXT_SCHEDULE_IN);
523	intel_engine_context_in(engine);
524
525	CE_TRACE(ce, "schedule-in, ccid:%x\n", ce->lrc.ccid);
526
527	return engine;
528	}
529
530	static void execlists_schedule_in(struct i915_request rq, int* idx)
531	{
532	struct intel_context * const ce = rq->context;
533	struct intel_engine_cs *old;
534
535	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
536	trace_i915_request_in(rq, port: idx);
537
538	old = ce->inflight;
539	if (!old)
540	old = __execlists_schedule_in(rq);
541	WRITE_ONCE(ce->inflight, ptr_inc(old));
542
543	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
544	}
545
546	static void
547	resubmit_virtual_request(struct i915_request rq, struct* virtual_engine *ve)
548	{
549	struct intel_engine_cs *engine = rq->engine;
550
551	spin_lock_irq(lock: &engine->sched_engine->lock);
552
553	clear_bit(nr: I915_FENCE_FLAG_PQUEUE, addr: &rq->fence.flags);
554	WRITE_ONCE(rq->engine, &ve->base);
555	ve->base.submit_request(rq);
556
557	spin_unlock_irq(lock: &engine->sched_engine->lock);
558	}
559
560	static void kick_siblings(struct i915_request rq, struct* intel_context *ce)
561	{
562	struct virtual_engine ve = container_of(ce, typeof(ve), context);
563	struct intel_engine_cs *engine = rq->engine;
564
565	/*
566	* After this point, the rq may be transferred to a new sibling, so
567	* before we clear ce->inflight make sure that the context has been
568	* removed from the b->signalers and furthermore we need to make sure
569	* that the concurrent iterator in signal_irq_work is no longer
570	* following ce->signal_link.
571	*/
572	if (!list_empty(head: &ce->signals))
573	intel_context_remove_breadcrumbs(ce, b: engine->breadcrumbs);
574
575	/*
576	* This engine is now too busy to run this virtual request, so
577	* see if we can find an alternative engine for it to execute on.
578	* Once a request has become bonded to this engine, we treat it the
579	* same as other native request.
580	*/
581	if (i915_request_in_priority_queue(rq) &&
582	rq->execution_mask != engine->mask)
583	resubmit_virtual_request(rq, ve);
584
585	if (READ_ONCE(ve->request))
586	tasklet_hi_schedule(t: &ve->base.sched_engine->tasklet);
587	}
588
589	static void __execlists_schedule_out(struct i915_request * const rq,
590	struct intel_context * const ce)
591	{
592	struct intel_engine_cs * const engine = rq->engine;
593	unsigned int ccid;
594
595	/*
596	* NB process_csb() is not under the engine->sched_engine->lock and hence
597	* schedule_out can race with schedule_in meaning that we should
598	* refrain from doing non-trivial work here.
599	*/
600
601	CE_TRACE(ce, "schedule-out, ccid:%x\n", ce->lrc.ccid);
602	GEM_BUG_ON(ce->inflight != engine);
603
604	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
605	lrc_check_regs(ce, engine, when: "after");
606
607	/*
608	* If we have just completed this context, the engine may now be
609	* idle and we want to re-enter powersaving.
610	*/
611	if (intel_timeline_is_last(tl: ce->timeline, rq) &&
612	__i915_request_is_complete(rq))
613	intel_engine_add_retire(engine, tl: ce->timeline);
614
615	ccid = ce->lrc.ccid;
616	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(`12`, `50`)) {
617	ccid >>= XEHP_SW_CTX_ID_SHIFT - `32`;
618	ccid &= XEHP_MAX_CONTEXT_HW_ID;
619	} else {
620	ccid >>= GEN11_SW_CTX_ID_SHIFT - `32`;
621	ccid &= GEN12_MAX_CONTEXT_HW_ID;
622	}
623
624	if (ccid < BITS_PER_LONG) {
625	GEM_BUG_ON(ccid == `0`);
626	GEM_BUG_ON(test_bit(ccid - `1`, &engine->context_tag));
627	__set_bit(ccid - `1`, &engine->context_tag);
628	}
629	intel_engine_context_out(engine);
630	execlists_context_status_change(rq, status: INTEL_CONTEXT_SCHEDULE_OUT);
631	if (engine->fw_domain && !--engine->fw_active)
632	intel_uncore_forcewake_put(uncore: engine->uncore, domains: engine->fw_domain);
633	intel_gt_pm_put_async_untracked(gt: engine->gt);
634
635	/*
636	* If this is part of a virtual engine, its next request may
637	* have been blocked waiting for access to the active context.
638	* We have to kick all the siblings again in case we need to
639	* switch (e.g. the next request is not runnable on this
640	* engine). Hopefully, we will already have submitted the next
641	* request before the tasklet runs and do not need to rebuild
642	* each virtual tree and kick everyone again.
643	*/
644	if (ce->engine != engine)
645	kick_siblings(rq, ce);
646
647	WRITE_ONCE(ce->inflight, NULL);
648	intel_context_put(ce);
649	}
650
651	static inline void execlists_schedule_out(struct i915_request *rq)
652	{
653	struct intel_context * const ce = rq->context;
654
655	trace_i915_request_out(rq);
656
657	GEM_BUG_ON(!ce->inflight);
658	ce->inflight = ptr_dec(ce->inflight);
659	if (!__intel_context_inflight_count(ce->inflight))
660	__execlists_schedule_out(rq, ce);
661
662	i915_request_put(rq);
663	}
664
665	static u32 map_i915_prio_to_lrc_desc_prio(int prio)
666	{
667	if (prio > I915_PRIORITY_NORMAL)
668	return GEN12_CTX_PRIORITY_HIGH;
669	else if (prio < I915_PRIORITY_NORMAL)
670	return GEN12_CTX_PRIORITY_LOW;
671	else
672	return GEN12_CTX_PRIORITY_NORMAL;
673	}
674
675	static u64 execlists_update_context(struct i915_request *rq)
676	{
677	struct intel_context *ce = rq->context;
678	u64 desc;
679	u32 tail, prev;
680
681	desc = ce->lrc.desc;
682	if (rq->engine->flags & I915_ENGINE_HAS_EU_PRIORITY)
683	desc \|= map_i915_prio_to_lrc_desc_prio(prio: rq_prio(rq));
684
685	/*
686	* WaIdleLiteRestore:bdw,skl
687	*
688	* We should never submit the context with the same RING_TAIL twice
689	* just in case we submit an empty ring, which confuses the HW.
690	*
691	* We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
692	* the normal request to be able to always advance the RING_TAIL on
693	* subsequent resubmissions (for lite restore). Should that fail us,
694	* and we try and submit the same tail again, force the context
695	* reload.
696	*
697	* If we need to return to a preempted context, we need to skip the
698	* lite-restore and force it to reload the RING_TAIL. Otherwise, the
699	* HW has a tendency to ignore us rewinding the TAIL to the end of
700	* an earlier request.
701	*/
702	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
703	prev = rq->ring->tail;
704	tail = intel_ring_set_tail(ring: rq->ring, tail: rq->tail);
705	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= `0`))
706	desc \|= CTX_DESC_FORCE_RESTORE;
707	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
708	rq->tail = rq->wa_tail;
709
710	/*
711	* Make sure the context image is complete before we submit it to HW.
712	*
713	* Ostensibly, writes (including the WCB) should be flushed prior to
714	* an uncached write such as our mmio register access, the empirical
715	* evidence (esp. on Braswell) suggests that the WC write into memory
716	* may not be visible to the HW prior to the completion of the UC
717	* register write and that we may begin execution from the context
718	* before its image is complete leading to invalid PD chasing.
719	*/
720	wmb();
721
722	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
723	return desc;
724	}
725
726	static void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
727	{
728	if (execlists->ctrl_reg) {
729	writel(lower_32_bits(desc), addr: execlists->submit_reg + port * `2`);
730	writel(upper_32_bits(desc), addr: execlists->submit_reg + port * `2` + `1`);
731	} else {
732	writel(upper_32_bits(desc), addr: execlists->submit_reg);
733	writel(lower_32_bits(desc), addr: execlists->submit_reg);
734	}
735	}
736
737	static __maybe_unused char *
738	dump_port(char buf, int* buflen, const char prefix, struct* i915_request *rq)
739	{
740	if (!rq)
741	return "";
742
743	snprintf(buf, size: buflen, fmt: "%sccid:%x %llx:%lld%s prio %d",
744	prefix,
745	rq->context->lrc.ccid,
746	rq->fence.context, rq->fence.seqno,
747	__i915_request_is_complete(rq) ? "!" :
748	__i915_request_has_started(rq) ? "*" :
749	"",
750	rq_prio(rq));
751
752	return buf;
753	}
754
755	static __maybe_unused noinline void
756	trace_ports(const struct intel_engine_execlists *execlists,
757	const char *msg,
758	struct i915_request * const *ports)
759	{
760	const struct intel_engine_cs *engine =
761	container_of(execlists, typeof(*engine), execlists);
762	char __maybe_unused p0[`40`], p1[`40`];
763
764	if (!ports[`0`])
765	return;
766
767	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
768	dump_port(p0, sizeof(p0), "", ports[`0`]),
769	dump_port(p1, sizeof(p1), ", ", ports[`1`]));
770	}
771
772	static bool
773	reset_in_progress(const struct intel_engine_cs *engine)
774	{
775	return unlikely(!__tasklet_is_enabled(&engine->sched_engine->tasklet));
776	}
777
778	static __maybe_unused noinline bool
779	assert_pending_valid(const struct intel_engine_execlists *execlists,
780	const char *msg)
781	{
782	struct intel_engine_cs *engine =
783	container_of(execlists, typeof(*engine), execlists);
784	struct i915_request * const port, rq, *prev = NULL;
785	struct intel_context *ce = NULL;
786	u32 ccid = -`1`;
787
788	trace_ports(execlists, msg, ports: execlists->pending);
789
790	/ We may be messing around with the lists during reset, lalala /
791	if (reset_in_progress(engine))
792	return true;
793
794	if (!execlists->pending[`0`]) {
795	GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
796	engine->name);
797	return false;
798	}
799
800	if (execlists->pending[execlists_num_ports(execlists)]) {
801	GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
802	engine->name, execlists_num_ports(execlists));
803	return false;
804	}
805
806	for (port = execlists->pending; (rq = *port); port++) {
807	unsigned long flags;
808	bool ok = true;
809
810	GEM_BUG_ON(!kref_read(&rq->fence.refcount));
811	GEM_BUG_ON(!i915_request_is_active(rq));
812
813	if (ce == rq->context) {
814	GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
815	engine->name,
816	ce->timeline->fence_context,
817	port - execlists->pending);
818	return false;
819	}
820	ce = rq->context;
821
822	if (ccid == ce->lrc.ccid) {
823	GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
824	engine->name,
825	ccid, ce->timeline->fence_context,
826	port - execlists->pending);
827	return false;
828	}
829	ccid = ce->lrc.ccid;
830
831	/*
832	* Sentinels are supposed to be the last request so they flush
833	* the current execution off the HW. Check that they are the only
834	* request in the pending submission.
835	*
836	* NB: Due to the async nature of preempt-to-busy and request
837	* cancellation we need to handle the case where request
838	* becomes a sentinel in parallel to CSB processing.
839	*/
840	if (prev && i915_request_has_sentinel(rq: prev) &&
841	!READ_ONCE(prev->fence.error)) {
842	GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
843	engine->name,
844	ce->timeline->fence_context,
845	port - execlists->pending);
846	return false;
847	}
848	prev = rq;
849
850	/*
851	* We want virtual requests to only be in the first slot so
852	* that they are never stuck behind a hog and can be immediately
853	* transferred onto the next idle engine.
854	*/
855	if (rq->execution_mask != engine->mask &&
856	port != execlists->pending) {
857	GEM_TRACE_ERR("%s: virtual engine:%llx not in prime position[%zd]\n",
858	engine->name,
859	ce->timeline->fence_context,
860	port - execlists->pending);
861	return false;
862	}
863
864	/ Hold tightly onto the lock to prevent concurrent retires! /
865	if (!spin_trylock_irqsave(&rq->lock, flags))
866	continue;
867
868	if (__i915_request_is_complete(rq))
869	goto unlock;
870
871	if (i915_active_is_idle(ref: &ce->active) &&
872	!intel_context_is_barrier(ce)) {
873	GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
874	engine->name,
875	ce->timeline->fence_context,
876	port - execlists->pending);
877	ok = false;
878	goto unlock;
879	}
880
881	if (!i915_vma_is_pinned(vma: ce->state)) {
882	GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
883	engine->name,
884	ce->timeline->fence_context,
885	port - execlists->pending);
886	ok = false;
887	goto unlock;
888	}
889
890	if (!i915_vma_is_pinned(vma: ce->ring->vma)) {
891	GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
892	engine->name,
893	ce->timeline->fence_context,
894	port - execlists->pending);
895	ok = false;
896	goto unlock;
897	}
898
899	unlock:
900	spin_unlock_irqrestore(lock: &rq->lock, flags);
901	if (!ok)
902	return false;
903	}
904
905	return ce;
906	}
907
908	static void execlists_submit_ports(struct intel_engine_cs *engine)
909	{
910	struct intel_engine_execlists *execlists = &engine->execlists;
911	unsigned int n;
912
913	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
914
915	/*
916	* We can skip acquiring intel_runtime_pm_get() here as it was taken
917	* on our behalf by the request (see i915_gem_mark_busy()) and it will
918	* not be relinquished until the device is idle (see
919	* i915_gem_idle_work_handler()). As a precaution, we make sure
920	* that all ELSP are drained i.e. we have processed the CSB,
921	* before allowing ourselves to idle and calling intel_runtime_pm_put().
922	*/
923	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
924
925	/*
926	* ELSQ note: the submit queue is not cleared after being submitted
927	* to the HW so we need to make sure we always clean it up. This is
928	* currently ensured by the fact that we always write the same number
929	* of elsq entries, keep this in mind before changing the loop below.
930	*/
931	for (n = execlists_num_ports(execlists); n--; ) {
932	struct i915_request *rq = execlists->pending[n];
933
934	write_desc(execlists,
935	desc: rq ? execlists_update_context(rq) : `0`,
936	port: n);
937	}
938
939	/ we need to manually load the submit queue /
940	if (execlists->ctrl_reg)
941	writel(EL_CTRL_LOAD, addr: execlists->ctrl_reg);
942	}
943
944	static bool ctx_single_port_submission(const struct intel_context *ce)
945	{
946	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
947	intel_context_force_single_submission(ce));
948	}
949
950	static bool can_merge_ctx(const struct intel_context *prev,
951	const struct intel_context *next)
952	{
953	if (prev != next)
954	return false;
955
956	if (ctx_single_port_submission(ce: prev))
957	return false;
958
959	return true;
960	}
961
962	static unsigned long i915_request_flags(const struct i915_request *rq)
963	{
964	return READ_ONCE(rq->fence.flags);
965	}
966
967	static bool can_merge_rq(const struct i915_request *prev,
968	const struct i915_request *next)
969	{
970	GEM_BUG_ON(prev == next);
971	GEM_BUG_ON(!assert_priority_queue(prev, next));
972
973	/*
974	* We do not submit known completed requests. Therefore if the next
975	* request is already completed, we can pretend to merge it in
976	* with the previous context (and we will skip updating the ELSP
977	* and tracking). Thus hopefully keeping the ELSP full with active
978	* contexts, despite the best efforts of preempt-to-busy to confuse
979	* us.
980	*/
981	if (__i915_request_is_complete(rq: next))
982	return true;
983
984	if (unlikely((i915_request_flags(prev) \| i915_request_flags(next)) &
985	(BIT(I915_FENCE_FLAG_NOPREEMPT) \|
986	BIT(I915_FENCE_FLAG_SENTINEL))))
987	return false;
988
989	if (!can_merge_ctx(prev: prev->context, next: next->context))
990	return false;
991
992	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
993	return true;
994	}
995
996	static bool virtual_matches(const struct virtual_engine *ve,
997	const struct i915_request *rq,
998	const struct intel_engine_cs *engine)
999	{
1000	const struct intel_engine_cs *inflight;
1001
1002	if (!rq)
1003	return false;
1004
1005	if (!(rq->execution_mask & engine->mask)) / We peeked too soon! /
1006	return false;
1007
1008	/*
1009	* We track when the HW has completed saving the context image
1010	* (i.e. when we have seen the final CS event switching out of
1011	* the context) and must not overwrite the context image before
1012	* then. This restricts us to only using the active engine
1013	* while the previous virtualized request is inflight (so
1014	* we reuse the register offsets). This is a very small
1015	* hystersis on the greedy seelction algorithm.
1016	*/
1017	inflight = intel_context_inflight(&ve->context);
1018	if (inflight && inflight != engine)
1019	return false;
1020
1021	return true;
1022	}
1023
1024	static struct virtual_engine *
1025	first_virtual_engine(struct intel_engine_cs *engine)
1026	{
1027	struct intel_engine_execlists *el = &engine->execlists;
1028	struct rb_node *rb = rb_first_cached(&el->virtual);
1029
1030	while (rb) {
1031	struct virtual_engine *ve =
1032	rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1033	struct i915_request *rq = READ_ONCE(ve->request);
1034
1035	/ lazily cleanup after another engine handled rq /
1036	if (!rq \|\| !virtual_matches(ve, rq, engine)) {
1037	rb_erase_cached(node: rb, root: &el->virtual);
1038	RB_CLEAR_NODE(rb);
1039	rb = rb_first_cached(&el->virtual);
1040	continue;
1041	}
1042
1043	return ve;
1044	}
1045
1046	return NULL;
1047	}
1048
1049	static void virtual_xfer_context(struct virtual_engine *ve,
1050	struct intel_engine_cs *engine)
1051	{
1052	unsigned int n;
1053
1054	if (likely(engine == ve->siblings[`0`]))
1055	return;
1056
1057	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1058	if (!intel_engine_has_relative_mmio(engine))
1059	lrc_update_offsets(ce: &ve->context, engine);
1060
1061	/*
1062	* Move the bound engine to the top of the list for
1063	* future execution. We then kick this tasklet first
1064	* before checking others, so that we preferentially
1065	* reuse this set of bound registers.
1066	*/
1067	for (n = `1`; n < ve->num_siblings; n++) {
1068	if (ve->siblings[n] == engine) {
1069	swap(ve->siblings[n], ve->siblings[`0`]);
1070	break;
1071	}
1072	}
1073	}
1074
1075	static void defer_request(struct i915_request rq, struct* list_head * const pl)
1076	{
1077	LIST_HEAD(list);
1078
1079	/*
1080	* We want to move the interrupted request to the back of
1081	* the round-robin list (i.e. its priority level), but
1082	* in doing so, we must then move all requests that were in
1083	* flight and were waiting for the interrupted request to
1084	* be run after it again.
1085	*/
1086	do {
1087	struct i915_dependency *p;
1088
1089	GEM_BUG_ON(i915_request_is_active(rq));
1090	list_move_tail(list: &rq->sched.link, head: pl);
1091
1092	for_each_waiter(p, rq) {
1093	struct i915_request *w =
1094	container_of(p->waiter, typeof(*w), sched);
1095
1096	if (p->flags & I915_DEPENDENCY_WEAK)
1097	continue;
1098
1099	/ Leave semaphores spinning on the other engines /
1100	if (w->engine != rq->engine)
1101	continue;
1102
1103	/ No waiter should start before its signaler /
1104	GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1105	__i915_request_has_started(w) &&
1106	!__i915_request_is_complete(rq));
1107
1108	if (!i915_request_is_ready(rq: w))
1109	continue;
1110
1111	if (rq_prio(rq: w) < rq_prio(rq))
1112	continue;
1113
1114	GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1115	GEM_BUG_ON(i915_request_is_active(w));
1116	list_move_tail(list: &w->sched.link, head: &list);
1117	}
1118
1119	rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1120	} while (rq);
1121	}
1122
1123	static void defer_active(struct intel_engine_cs *engine)
1124	{
1125	struct i915_request *rq;
1126
1127	rq = __unwind_incomplete_requests(engine);
1128	if (!rq)
1129	return;
1130
1131	defer_request(rq, pl: i915_sched_lookup_priolist(sched_engine: engine->sched_engine,
1132	prio: rq_prio(rq)));
1133	}
1134
1135	static bool
1136	timeslice_yield(const struct intel_engine_execlists *el,
1137	const struct i915_request *rq)
1138	{
1139	/*
1140	* Once bitten, forever smitten!
1141	*
1142	* If the active context ever busy-waited on a semaphore,
1143	* it will be treated as a hog until the end of its timeslice (i.e.
1144	* until it is scheduled out and replaced by a new submission,
1145	* possibly even its own lite-restore). The HW only sends an interrupt
1146	* on the first miss, and we do know if that semaphore has been
1147	* signaled, or even if it is now stuck on another semaphore. Play
1148	* safe, yield if it might be stuck -- it will be given a fresh
1149	* timeslice in the near future.
1150	*/
1151	return rq->context->lrc.ccid == READ_ONCE(el->yield);
1152	}
1153
1154	static bool needs_timeslice(const struct intel_engine_cs *engine,
1155	const struct i915_request *rq)
1156	{
1157	if (!intel_engine_has_timeslices(engine))
1158	return false;
1159
1160	/ If not currently active, or about to switch, wait for next event /
1161	if (!rq \|\| __i915_request_is_complete(rq))
1162	return false;
1163
1164	/ We do not need to start the timeslice until after the ACK /
1165	if (READ_ONCE(engine->execlists.pending[`0`]))
1166	return false;
1167
1168	/ If ELSP[1] is occupied, always check to see if worth slicing /
1169	if (!list_is_last_rcu(list: &rq->sched.link,
1170	head: &engine->sched_engine->requests)) {
1171	ENGINE_TRACE(engine, "timeslice required for second inflight context\n");
1172	return true;
1173	}
1174
1175	/ Otherwise, ELSP[0] is by itself, but may be waiting in the queue /
1176	if (!i915_sched_engine_is_empty(sched_engine: engine->sched_engine)) {
1177	ENGINE_TRACE(engine, "timeslice required for queue\n");
1178	return true;
1179	}
1180
1181	if (!RB_EMPTY_ROOT(&engine->execlists.virtual.rb_root)) {
1182	ENGINE_TRACE(engine, "timeslice required for virtual\n");
1183	return true;
1184	}
1185
1186	return false;
1187	}
1188
1189	static bool
1190	timeslice_expired(struct intel_engine_cs engine, const* struct i915_request *rq)
1191	{
1192	const struct intel_engine_execlists *el = &engine->execlists;
1193
1194	if (i915_request_has_nopreempt(rq) && __i915_request_has_started(rq))
1195	return false;
1196
1197	if (!needs_timeslice(engine, rq))
1198	return false;
1199
1200	return timer_expired(t: &el->timer) \|\| timeslice_yield(el, rq);
1201	}
1202
1203	static unsigned long timeslice(const struct intel_engine_cs *engine)
1204	{
1205	return READ_ONCE(engine->props.timeslice_duration_ms);
1206	}
1207
1208	static void start_timeslice(struct intel_engine_cs *engine)
1209	{
1210	struct intel_engine_execlists *el = &engine->execlists;
1211	unsigned long duration;
1212
1213	/ Disable the timer if there is nothing to switch to /
1214	duration = `0`;
1215	if (needs_timeslice(engine, rq: *el->active)) {
1216	/ Avoid continually prolonging an active timeslice /
1217	if (timer_active(t: &el->timer)) {
1218	/*
1219	* If we just submitted a new ELSP after an old
1220	* context, that context may have already consumed
1221	* its timeslice, so recheck.
1222	*/
1223	if (!timer_pending(timer: &el->timer))
1224	tasklet_hi_schedule(t: &engine->sched_engine->tasklet);
1225	return;
1226	}
1227
1228	duration = timeslice(engine);
1229	}
1230
1231	set_timer_ms(t: &el->timer, timeout: duration);
1232	}
1233
/*
 * Bump execlists->preempt_hang.count through I915_SELFTEST_ONLY(); the
 * value of the expression is deliberately discarded.
 * NOTE(review): presumably a no-op when selftests are not built -- confirm
 * against the I915_SELFTEST_ONLY() definition.
 */
static void record_preemption(struct intel_engine_execlists *execlists)
{
	(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
}
1238
1239	static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
1240	const struct i915_request *rq)
1241	{
1242	if (!rq)
1243	return `0`;
1244
1245	/ Only allow ourselves to force reset the currently active context /
1246	engine->execlists.preempt_target = rq;
1247
1248	/ Force a fast reset for terminated contexts (ignoring sysfs!) /
1249	if (unlikely(intel_context_is_banned(rq->context) \|\| bad_request(rq)))
1250	return INTEL_CONTEXT_BANNED_PREEMPT_TIMEOUT_MS;
1251
1252	return READ_ONCE(engine->props.preempt_timeout_ms);
1253	}
1254
1255	static void set_preempt_timeout(struct intel_engine_cs *engine,
1256	const struct i915_request *rq)
1257	{
1258	if (!intel_engine_has_preempt_reset(engine))
1259	return;
1260
1261	set_timer_ms(t: &engine->execlists.preempt,
1262	timeout: active_preempt_timeout(engine, rq));
1263	}
1264
1265	static bool completed(const struct i915_request *rq)
1266	{
1267	if (i915_request_has_sentinel(rq))
1268	return false;
1269
1270	return __i915_request_is_complete(rq);
1271	}
1272
1273	static void execlists_dequeue(struct intel_engine_cs *engine)
1274	{
1275	struct intel_engine_execlists * const execlists = &engine->execlists;
1276	struct i915_sched_engine * const sched_engine = engine->sched_engine;
1277	struct i915_request **port = execlists->pending;
1278	struct i915_request ** const last_port = port + execlists->port_mask;
1279	struct i915_request last, const *active;
1280	struct virtual_engine *ve;
1281	struct rb_node *rb;
1282	bool submit = false;
1283
1284	/*
1285	* Hardware submission is through 2 ports. Conceptually each port
1286	* has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1287	* static for a context, and unique to each, so we only execute
1288	* requests belonging to a single context from each ring. RING_HEAD
1289	* is maintained by the CS in the context image, it marks the place
1290	* where it got up to last time, and through RING_TAIL we tell the CS
1291	* where we want to execute up to this time.
1292	*
1293	* In this list the requests are in order of execution. Consecutive
1294	* requests from the same context are adjacent in the ringbuffer. We
1295	* can combine these requests into a single RING_TAIL update:
1296	*
1297	* RING_HEAD...req1...req2
1298	* ^- RING_TAIL
1299	* since to execute req2 the CS must first execute req1.
1300	*
1301	* Our goal then is to point each port to the end of a consecutive
1302	* sequence of requests as being the most optimal (fewest wake ups
1303	* and context switches) submission.
1304	*/
1305
1306	spin_lock(lock: &sched_engine->lock);
1307
1308	/*
1309	* If the queue is higher priority than the last
1310	* request in the currently active context, submit afresh.
1311	* We will resubmit again afterwards in case we need to split
1312	* the active context to interject the preemption request,
1313	* i.e. we will retrigger preemption following the ack in case
1314	* of trouble.
1315	*
1316	*/
1317	active = execlists->active;
1318	while ((last = *active) && completed(rq: last))
1319	active++;
1320
1321	if (last) {
1322	if (need_preempt(engine, rq: last)) {
1323	ENGINE_TRACE(engine,
1324	"preempting last=%llx:%lld, prio=%d, hint=%d\n",
1325	last->fence.context,
1326	last->fence.seqno,
1327	last->sched.attr.priority,
1328	sched_engine->queue_priority_hint);
1329	record_preemption(execlists);
1330
1331	/*
1332	* Don't let the RING_HEAD advance past the breadcrumb
1333	* as we unwind (and until we resubmit) so that we do
1334	* not accidentally tell it to go backwards.
1335	*/
1336	ring_set_paused(engine, state: `1`);
1337
1338	/*
1339	* Note that we have not stopped the GPU at this point,
1340	* so we are unwinding the incomplete requests as they
1341	* remain inflight and so by the time we do complete
1342	* the preemption, some of the unwound requests may
1343	* complete!
1344	*/
1345	__unwind_incomplete_requests(engine);
1346
1347	last = NULL;
1348	} else if (timeslice_expired(engine, rq: last)) {
1349	ENGINE_TRACE(engine,
1350	"expired:%s last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
1351	str_yes_no(timer_expired(&execlists->timer)),
1352	last->fence.context, last->fence.seqno,
1353	rq_prio(last),
1354	sched_engine->queue_priority_hint,
1355	str_yes_no(timeslice_yield(execlists, last)));
1356
1357	/*
1358	* Consume this timeslice; ensure we start a new one.
1359	*
1360	* The timeslice expired, and we will unwind the
1361	* running contexts and recompute the next ELSP.
1362	* If that submit will be the same pair of contexts
1363	* (due to dependency ordering), we will skip the
1364	* submission. If we don't cancel the timer now,
1365	* we will see that the timer has expired and
1366	* reschedule the tasklet; continually until the
1367	* next context switch or other preemption event.
1368	*
1369	* Since we have decided to reschedule based on
1370	* consumption of this timeslice, if we submit the
1371	* same context again, grant it a full timeslice.
1372	*/
1373	cancel_timer(t: &execlists->timer);
1374	ring_set_paused(engine, state: `1`);
1375	defer_active(engine);
1376
1377	/*
1378	* Unlike for preemption, if we rewind and continue
1379	* executing the same context as previously active,
1380	* the order of execution will remain the same and
1381	* the tail will only advance. We do not need to
1382	* force a full context restore, as a lite-restore
1383	* is sufficient to resample the monotonic TAIL.
1384	*
1385	* If we switch to any other context, similarly we
1386	* will not rewind TAIL of current context, and
1387	* normal save/restore will preserve state and allow
1388	* us to later continue executing the same request.
1389	*/
1390	last = NULL;
1391	} else {
1392	/*
1393	* Otherwise if we already have a request pending
1394	* for execution after the current one, we can
1395	* just wait until the next CS event before
1396	* queuing more. In either case we will force a
1397	* lite-restore preemption event, but if we wait
1398	* we hopefully coalesce several updates into a single
1399	* submission.
1400	*/
1401	if (active[`1`]) {
1402	/*
1403	* Even if ELSP[1] is occupied and not worthy
1404	* of timeslices, our queue might be.
1405	*/
1406	spin_unlock(lock: &sched_engine->lock);
1407	return;
1408	}
1409	}
1410	}
1411
1412	/ XXX virtual is always taking precedence /
1413	while ((ve = first_virtual_engine(engine))) {
1414	struct i915_request *rq;
1415
1416	spin_lock(lock: &ve->base.sched_engine->lock);
1417
1418	rq = ve->request;
1419	if (unlikely(!virtual_matches(ve, rq, engine)))
1420	goto unlock; / lost the race to a sibling /
1421
1422	GEM_BUG_ON(rq->engine != &ve->base);
1423	GEM_BUG_ON(rq->context != &ve->context);
1424
1425	if (unlikely(rq_prio(rq) < queue_prio(sched_engine))) {
1426	spin_unlock(lock: &ve->base.sched_engine->lock);
1427	break;
1428	}
1429
1430	if (last && !can_merge_rq(prev: last, next: rq)) {
1431	spin_unlock(lock: &ve->base.sched_engine->lock);
1432	spin_unlock(lock: &engine->sched_engine->lock);
1433	return; / leave this for another sibling /
1434	}
1435
1436	ENGINE_TRACE(engine,
1437	"virtual rq=%llx:%lld%s, new engine? %s\n",
1438	rq->fence.context,
1439	rq->fence.seqno,
1440	__i915_request_is_complete(rq) ? "!" :
1441	__i915_request_has_started(rq) ? "*" :
1442	"",
1443	str_yes_no(engine != ve->siblings[`0`]));
1444
1445	WRITE_ONCE(ve->request, NULL);
1446	WRITE_ONCE(ve->base.sched_engine->queue_priority_hint, INT_MIN);
1447
1448	rb = &ve->nodes[engine->id].rb;
1449	rb_erase_cached(node: rb, root: &execlists->virtual);
1450	RB_CLEAR_NODE(rb);
1451
1452	GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1453	WRITE_ONCE(rq->engine, engine);
1454
1455	if (__i915_request_submit(request: rq)) {
1456	/*
1457	* Only after we confirm that we will submit
1458	* this request (i.e. it has not already
1459	* completed), do we want to update the context.
1460	*
1461	* This serves two purposes. It avoids
1462	* unnecessary work if we are resubmitting an
1463	* already completed request after timeslicing.
1464	* But more importantly, it prevents us altering
1465	* ve->siblings[] on an idle context, where
1466	* we may be using ve->siblings[] in
1467	* virtual_context_enter / virtual_context_exit.
1468	*/
1469	virtual_xfer_context(ve, engine);
1470	GEM_BUG_ON(ve->siblings[`0`] != engine);
1471
1472	submit = true;
1473	last = rq;
1474	}
1475
1476	i915_request_put(rq);
1477	unlock:
1478	spin_unlock(lock: &ve->base.sched_engine->lock);
1479
1480	/*
1481	* Hmm, we have a bunch of virtual engine requests,
1482	* but the first one was already completed (thanks
1483	* preempt-to-busy!). Keep looking at the veng queue
1484	* until we have no more relevant requests (i.e.
1485	* the normal submit queue has higher priority).
1486	*/
1487	if (submit)
1488	break;
1489	}
1490
1491	while ((rb = rb_first_cached(&sched_engine->queue))) {
1492	struct i915_priolist *p = to_priolist(rb);
1493	struct i915_request rq, rn;
1494
1495	priolist_for_each_request_consume(rq, rn, p) {
1496	bool merge = true;
1497
1498	/*
1499	* Can we combine this request with the current port?
1500	* It has to be the same context/ringbuffer and not
1501	* have any exceptions (e.g. GVT saying never to
1502	* combine contexts).
1503	*
1504	* If we can combine the requests, we can execute both
1505	* by updating the RING_TAIL to point to the end of the
1506	* second request, and so we never need to tell the
1507	* hardware about the first.
1508	*/
1509	if (last && !can_merge_rq(prev: last, next: rq)) {
1510	/*
1511	* If we are on the second port and cannot
1512	* combine this request with the last, then we
1513	* are done.
1514	*/
1515	if (port == last_port)
1516	goto done;
1517
1518	/*
1519	* We must not populate both ELSP[] with the
1520	* same LRCA, i.e. we must submit 2 different
1521	* contexts if we submit 2 ELSP.
1522	*/
1523	if (last->context == rq->context)
1524	goto done;
1525
1526	if (i915_request_has_sentinel(rq: last))
1527	goto done;
1528
1529	/*
1530	* We avoid submitting virtual requests into
1531	* the secondary ports so that we can migrate
1532	* the request immediately to another engine
1533	* rather than wait for the primary request.
1534	*/
1535	if (rq->execution_mask != engine->mask)
1536	goto done;
1537
1538	/*
1539	* If GVT overrides us we only ever submit
1540	* port[0], leaving port[1] empty. Note that we
1541	* also have to be careful that we don't queue
1542	* the same context (even though a different
1543	* request) to the second port.
1544	*/
1545	if (ctx_single_port_submission(ce: last->context) \|\|
1546	ctx_single_port_submission(ce: rq->context))
1547	goto done;
1548
1549	merge = false;
1550	}
1551
1552	if (__i915_request_submit(request: rq)) {
1553	if (!merge) {
1554	*port++ = i915_request_get(rq: last);
1555	last = NULL;
1556	}
1557
1558	GEM_BUG_ON(last &&
1559	!can_merge_ctx(last->context,
1560	rq->context));
1561	GEM_BUG_ON(last &&
1562	i915_seqno_passed(last->fence.seqno,
1563	rq->fence.seqno));
1564
1565	submit = true;
1566	last = rq;
1567	}
1568	}
1569
1570	rb_erase_cached(node: &p->node, root: &sched_engine->queue);
1571	i915_priolist_free(p);
1572	}
1573	done:
1574	*port++ = i915_request_get(rq: last);
1575
1576	/*
1577	* Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
1578	*
1579	* We choose the priority hint such that if we add a request of greater
1580	* priority than this, we kick the submission tasklet to decide on
1581	* the right order of submitting the requests to hardware. We must
1582	* also be prepared to reorder requests as they are in-flight on the
1583	* HW. We derive the priority hint then as the first "hole" in
1584	* the HW submission ports and if there are no available slots,
1585	* the priority of the lowest executing request, i.e. last.
1586	*
1587	* When we do receive a higher priority request ready to run from the
1588	* user, see queue_request(), the priority hint is bumped to that
1589	* request triggering preemption on the next dequeue (or subsequent
1590	* interrupt for secondary ports).
1591	*/
1592	sched_engine->queue_priority_hint = queue_prio(sched_engine);
1593	i915_sched_engine_reset_on_empty(sched_engine);
1594	spin_unlock(lock: &sched_engine->lock);
1595
1596	/*
1597	* We can skip poking the HW if we ended up with exactly the same set
1598	* of requests as currently running, e.g. trying to timeslice a pair
1599	* of ordered contexts.
1600	*/
1601	if (submit &&
1602	memcmp(p: active,
1603	q: execlists->pending,
1604	size: (port - execlists->pending) * sizeof(*port))) {
1605	*port = NULL;
1606	while (port-- != execlists->pending)
1607	execlists_schedule_in(rq: *port, idx: port - execlists->pending);
1608
1609	WRITE_ONCE(execlists->yield, -`1`);
1610	set_preempt_timeout(engine, rq: *active);
1611	execlists_submit_ports(engine);
1612	} else {
1613	ring_set_paused(engine, state: `0`);
1614	while (port-- != execlists->pending)
1615	i915_request_put(rq: *port);
1616	*execlists->pending = NULL;
1617	}
1618	}
1619
/* Run execlists_dequeue() with local interrupts disabled. */
static void execlists_dequeue_irq(struct intel_engine_cs *engine)
{
	local_irq_disable(); /* Suspend interrupts across request submission */
	execlists_dequeue(engine);
	local_irq_enable(); /* flush irq_work (e.g. breadcrumb enabling) */
}
1626
1627	static void clear_ports(struct i915_request *ports, int* count)
1628	{
1629	memset_p(p: (void **)ports, NULL, n: count);
1630	}
1631
/* Copy @count request pointers from @src to @dst, one WRITE_ONCE() each. */
static void
copy_ports(struct i915_request **dst, struct i915_request **src, int count)
{
	/* A memcpy_p() would be very useful here! */
	while (count--)
		WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
}
1639
1640	static struct i915_request **
1641	cancel_port_requests(struct intel_engine_execlists * const execlists,
1642	struct i915_request **inactive)
1643	{
1644	struct i915_request * const *port;
1645
1646	for (port = execlists->pending; *port; port++)
1647	inactive++ = port;
1648	clear_ports(ports: execlists->pending, ARRAY_SIZE(execlists->pending));
1649
1650	/ Mark the end of active before we overwrite active /*
1651	for (port = xchg(&execlists->active, execlists->pending); *port; port++)
1652	inactive++ = port;
1653	clear_ports(ports: execlists->inflight, ARRAY_SIZE(execlists->inflight));
1654
1655	smp_wmb(); / complete the seqlock for execlists_active() /
1656	WRITE_ONCE(execlists->active, execlists->inflight);
1657
1658	/ Having cancelled all outstanding process_csb(), stop their timers /
1659	GEM_BUG_ON(execlists->pending[`0`]);
1660	cancel_timer(t: &execlists->timer);
1661	cancel_timer(t: &execlists->preempt);
1662
1663	return inactive;
1664	}
1665
1666	/*
1667	* Starting with Gen12, the status has a new format:
1668	*
1669	* bit 0: switched to new queue
1670	* bit 1: reserved
1671	* bit 2: semaphore wait mode (poll or signal), only valid when
1672	* switch detail is set to "wait on semaphore"
1673	* bits 3-5: engine class
1674	* bits 6-11: engine instance
1675	* bits 12-14: reserved
1676	* bits 15-25: sw context id of the lrc the GT switched to
1677	* bits 26-31: sw counter of the lrc the GT switched to
1678	* bits 32-35: context switch detail
1679	* - 0: ctx complete
1680	* - 1: wait on sync flip
1681	* - 2: wait on vblank
1682	* - 3: wait on scanline
1683	* - 4: wait on semaphore
1684	* - 5: context preempted (not on SEMAPHORE_WAIT or
1685	* WAIT_FOR_EVENT)
1686	* bit 36: reserved
1687	* bits 37-43: wait detail (for switch detail 1 to 4)
1688	* bits 44-46: reserved
1689	* bits 47-57: sw context id of the lrc the GT switched away from
1690	* bits 58-63: sw counter of the lrc the GT switched away from
1691	*
1692	* Xe_HP csb shuffles things around compared to TGL:
1693	*
1694	* bits 0-3: context switch detail (same possible values as TGL)
1695	* bits 4-9: engine instance
1696	* bits 10-25: sw context id of the lrc the GT switched to
1697	* bits 26-31: sw counter of the lrc the GT switched to
1698	* bit 32: semaphore wait mode (poll or signal), Only valid when
1699	* switch detail is set to "wait on semaphore"
1700	* bit 33: switched to new queue
1701	* bits 34-41: wait detail (for switch detail 1 to 4)
1702	* bits 42-57: sw context id of the lrc the GT switched away from
1703	* bits 58-63: sw counter of the lrc the GT switched away from
1704	*/
1705	static inline bool
1706	__gen12_csb_parse(bool ctx_to_valid, bool ctx_away_valid, bool new_queue,
1707	u8 switch_detail)
1708	{
1709	/*
1710	* The context switch detail is not guaranteed to be 5 when a preemption
1711	* occurs, so we can't just check for that. The check below works for
1712	* all the cases we care about, including preemptions of WAIT
1713	* instructions and lite-restore. Preempt-to-idle via the CTRL register
1714	* would require some extra handling, but we don't support that.
1715	*/
1716	if (!ctx_away_valid \|\| new_queue) {
1717	GEM_BUG_ON(!ctx_to_valid);
1718	return true;
1719	}
1720
1721	/*
1722	* switch detail = 5 is covered by the case above and we do not expect a
1723	* context switch on an unsuccessful wait instruction since we always
1724	* use polling mode.
1725	*/
1726	GEM_BUG_ON(switch_detail);
1727	return false;
1728	}
1729
1730	static bool xehp_csb_parse(const u64 csb)
1731	{
1732	return __gen12_csb_parse(XEHP_CSB_CTX_VALID(lower_32_bits(csb)), / cxt to /
1733	XEHP_CSB_CTX_VALID(upper_32_bits(csb)), / cxt away /
1734	upper_32_bits(csb) & XEHP_CTX_STATUS_SWITCHED_TO_NEW_QUEUE,
1735	GEN12_CTX_SWITCH_DETAIL(lower_32_bits(csb)));
1736	}
1737
1738	static bool gen12_csb_parse(const u64 csb)
1739	{
1740	return __gen12_csb_parse(GEN12_CSB_CTX_VALID(lower_32_bits(csb)), / cxt to /
1741	GEN12_CSB_CTX_VALID(upper_32_bits(csb)), / cxt away /
1742	lower_32_bits(csb) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE,
1743	GEN12_CTX_SWITCH_DETAIL(upper_32_bits(csb)));
1744	}
1745
1746	static bool gen8_csb_parse(const u64 csb)
1747	{
1748	return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE \| GEN8_CTX_STATUS_PREEMPTED);
1749	}
1750
1751	static noinline u64
1752	wa_csb_read(const struct intel_engine_cs engine, u64 const csb)
1753	{
1754	u64 entry;
1755
1756	/*
1757	* Reading from the HWSP has one particular advantage: we can detect
1758	* a stale entry. Since the write into HWSP is broken, we have no reason
1759	* to trust the HW at all, the mmio entry may equally be unordered, so
1760	* we prefer the path that is self-checking and as a last resort,
1761	* return the mmio value.
1762	*
1763	* tgl,dg1:HSDES#22011327657
1764	*/
1765	preempt_disable();
1766	if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -`1`, `10`)) {
1767	int idx = csb - engine->execlists.csb_status;
1768	int status;
1769
1770	status = GEN8_EXECLISTS_STATUS_BUF;
1771	if (idx >= `6`) {
1772	status = GEN11_EXECLISTS_STATUS_BUF2;
1773	idx -= `6`;
1774	}
1775	status += sizeof(u64) * idx;
1776
1777	entry = intel_uncore_read64(uncore: engine->uncore,
1778	_MMIO(engine->mmio_base + status));
1779	}
1780	preempt_enable();
1781
1782	return entry;
1783	}
1784
1785	static u64 csb_read(const struct intel_engine_cs engine, u64 const csb)
1786	{
1787	u64 entry = READ_ONCE(*csb);
1788
1789	/*
1790	* Unfortunately, the GPU does not always serialise its write
1791	* of the CSB entries before its write of the CSB pointer, at least
1792	* from the perspective of the CPU, using what is known as a Global
1793	* Observation Point. We may read a new CSB tail pointer, but then
1794	* read the stale CSB entries, causing us to misinterpret the
1795	* context-switch events, and eventually declare the GPU hung.
1796	*
1797	* icl:HSDES#1806554093
1798	* tgl:HSDES#22011248461
1799	*/
1800	if (unlikely(entry == -`1`))
1801	entry = wa_csb_read(engine, csb);
1802
1803	/ Consume this entry so that we can spot its future reuse. /
1804	WRITE_ONCE(*csb, -`1`);
1805
1806	/ ELSP is an implicit wmb() before the GPU wraps and overwrites csb /
1807	return entry;
1808	}
1809
1810	static void new_timeslice(struct intel_engine_execlists *el)
1811	{
1812	/ By cancelling, we will start afresh in start_timeslice() /
1813	cancel_timer(t: &el->timer);
1814	}
1815
1816	static struct i915_request **
1817	process_csb(struct intel_engine_cs engine, struct* i915_request **inactive)
1818	{
1819	struct intel_engine_execlists * const execlists = &engine->execlists;
1820	u64 * const buf = execlists->csb_status;
1821	const u8 num_entries = execlists->csb_size;
1822	struct i915_request **prev;
1823	u8 head, tail;
1824
1825	/*
1826	* As we modify our execlists state tracking we require exclusive
1827	* access. Either we are inside the tasklet, or the tasklet is disabled
1828	* and we assume that is only inside the reset paths and so serialised.
1829	*/
1830	GEM_BUG_ON(!tasklet_is_locked(&engine->sched_engine->tasklet) &&
1831	!reset_in_progress(engine));
1832
1833	/*
1834	* Note that csb_write, csb_status may be either in HWSP or mmio.
1835	* When reading from the csb_write mmio register, we have to be
1836	* careful to only use the GEN8_CSB_WRITE_PTR portion, which is
1837	* the low 4bits. As it happens we know the next 4bits are always
1838	* zero and so we can simply masked off the low u8 of the register
1839	* and treat it identically to reading from the HWSP (without having
1840	* to use explicit shifting and masking, and probably bifurcating
1841	* the code to handle the legacy mmio read).
1842	*/
1843	head = execlists->csb_head;
1844	tail = READ_ONCE(*execlists->csb_write);
1845	if (unlikely(head == tail))
1846	return inactive;
1847
1848	/*
1849	* We will consume all events from HW, or at least pretend to.
1850	*
1851	* The sequence of events from the HW is deterministic, and derived
1852	* from our writes to the ELSP, with a smidgen of variability for
1853	* the arrival of the asynchronous requests wrt to the inflight
1854	* execution. If the HW sends an event that does not correspond with
1855	* the one we are expecting, we have to abandon all hope as we lose
1856	* all tracking of what the engine is actually executing. We will
1857	* only detect we are out of sequence with the HW when we get an
1858	* 'impossible' event because we have already drained our own
1859	* preemption/promotion queue. If this occurs, we know that we likely
1860	* lost track of execution earlier and must unwind and restart, the
1861	* simplest way is by stop processing the event queue and force the
1862	* engine to reset.
1863	*/
1864	execlists->csb_head = tail;
1865	ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
1866
1867	/*
1868	* Hopefully paired with a wmb() in HW!
1869	*
1870	* We must complete the read of the write pointer before any reads
1871	* from the CSB, so that we do not see stale values. Without an rmb
1872	* (lfence) the HW may speculatively perform the CSB[] reads before
1873	* we perform the READ_ONCE(*csb_write).
1874	*/
1875	rmb();
1876
1877	/ Remember who was last running under the timer /
1878	prev = inactive;
1879	*prev = NULL;
1880
1881	do {
1882	bool promote;
1883	u64 csb;
1884
1885	if (++head == num_entries)
1886	head = `0`;
1887
1888	/*
1889	* We are flying near dragons again.
1890	*
1891	* We hold a reference to the request in execlist_port[]
1892	* but no more than that. We are operating in softirq
1893	* context and so cannot hold any mutex or sleep. That
1894	* prevents us stopping the requests we are processing
1895	* in port[] from being retired simultaneously (the
1896	* breadcrumb will be complete before we see the
1897	* context-switch). As we only hold the reference to the
1898	* request, any pointer chasing underneath the request
1899	* is subject to a potential use-after-free. Thus we
1900	* store all of the bookkeeping within port[] as
1901	* required, and avoid using unguarded pointers beneath
1902	* request itself. The same applies to the atomic
1903	* status notifier.
1904	*/
1905
1906	csb = csb_read(engine, csb: buf + head);
1907	ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
1908	head, upper_32_bits(csb), lower_32_bits(csb));
1909
1910	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(`12`, `50`))
1911	promote = xehp_csb_parse(csb);
1912	else if (GRAPHICS_VER(engine->i915) >= `12`)
1913	promote = gen12_csb_parse(csb);
1914	else
1915	promote = gen8_csb_parse(csb);
1916	if (promote) {
1917	struct i915_request * const *old = execlists->active;
1918
1919	if (GEM_WARN_ON(!*execlists->pending)) {
1920	execlists->error_interrupt \|= ERROR_CSB;
1921	break;
1922	}
1923
1924	ring_set_paused(engine, state: `0`);
1925
1926	/ Point active to the new ELSP; prevent overwriting /
1927	WRITE_ONCE(execlists->active, execlists->pending);
1928	smp_wmb(); / notify execlists_active() /
1929
1930	/ cancel old inflight, prepare for switch /
1931	trace_ports(execlists, msg: "preempted", ports: old);
1932	while (*old)
1933	inactive++ = old++;
1934
1935	/ switch pending to inflight /
1936	GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
1937	copy_ports(dst: execlists->inflight,
1938	src: execlists->pending,
1939	count: execlists_num_ports(execlists));
1940	smp_wmb(); / complete the seqlock /
1941	WRITE_ONCE(execlists->active, execlists->inflight);
1942
1943	/ XXX Magic delay for tgl /
1944	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
1945
1946	WRITE_ONCE(execlists->pending[`0`], NULL);
1947	} else {
1948	if (GEM_WARN_ON(!*execlists->active)) {
1949	execlists->error_interrupt \|= ERROR_CSB;
1950	break;
1951	}
1952
1953	/ port0 completed, advanced to port1 /
1954	trace_ports(execlists, msg: "completed", ports: execlists->active);
1955
1956	/*
1957	* We rely on the hardware being strongly
1958	* ordered, that the breadcrumb write is
1959	* coherent (visible from the CPU) before the
1960	* user interrupt is processed. One might assume
1961	* that the breadcrumb write being before the
1962	* user interrupt and the CS event for the context
1963	* switch would therefore be before the CS event
1964	* itself...
1965	*/
1966	if (GEM_SHOW_DEBUG() &&
1967	!__i915_request_is_complete(rq: *execlists->active)) {
1968	struct i915_request rq = execlists->active;
1969	const u32 *regs __maybe_unused =
1970	rq->context->lrc_reg_state;
1971
1972	ENGINE_TRACE(engine,
1973	"context completed before request!\n");
1974	ENGINE_TRACE(engine,
1975	"ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
1976	ENGINE_READ(engine, RING_START),
1977	ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
1978	ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
1979	ENGINE_READ(engine, RING_CTL),
1980	ENGINE_READ(engine, RING_MI_MODE));
1981	ENGINE_TRACE(engine,
1982	"rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
1983	i915_ggtt_offset(rq->ring->vma),
1984	rq->head, rq->tail,
1985	rq->fence.context,
1986	lower_32_bits(rq->fence.seqno),
1987	hwsp_seqno(rq));
1988	ENGINE_TRACE(engine,
1989	"ctx:{start:%08x, head:%04x, tail:%04x}, ",
1990	regs[CTX_RING_START],
1991	regs[CTX_RING_HEAD],
1992	regs[CTX_RING_TAIL]);
1993	}
1994
1995	inactive++ = execlists->active++;
1996
1997	GEM_BUG_ON(execlists->active - execlists->inflight >
1998	execlists_num_ports(execlists));
1999	}
2000	} while (head != tail);
2001
2002	/*
2003	* Gen11 has proven to fail wrt global observation point between
2004	* entry and tail update, failing on the ordering and thus
2005	* we see an old entry in the context status buffer.
2006	*
2007	* Forcibly evict out entries for the next gpu csb update,
2008	* to increase the odds that we get a fresh entries with non
2009	* working hardware. The cost for doing so comes out mostly with
2010	* the wash as hardware, working or not, will need to do the
2011	* invalidation before.
2012	*/
2013	drm_clflush_virt_range(addr: &buf[`0`], length: num_entries * sizeof(buf[`0`]));
2014
2015	/*
2016	* We assume that any event reflects a change in context flow
2017	* and merits a fresh timeslice. We reinstall the timer after
2018	* inspecting the queue to see if we need to resumbit.
2019	*/
2020	if (prev != execlists->active) { / elide lite-restores /
2021	struct intel_context prev_ce = NULL, active_ce = NULL;
2022
2023	/*
2024	* Note the inherent discrepancy between the HW runtime,
2025	* recorded as part of the context switch, and the CPU
2026	* adjustment for active contexts. We have to hope that
2027	* the delay in processing the CS event is very small
2028	* and consistent. It works to our advantage to have
2029	* the CPU adjustment _undershoot_ (i.e. start later than)
2030	* the CS timestamp so we never overreport the runtime
2031	* and correct overselves later when updating from HW.
2032	*/
2033	if (*prev)
2034	prev_ce = (*prev)->context;
2035	if (*execlists->active)
2036	active_ce = (*execlists->active)->context;
2037	if (prev_ce != active_ce) {
2038	if (prev_ce)
2039	lrc_runtime_stop(ce: prev_ce);
2040	if (active_ce)
2041	lrc_runtime_start(ce: active_ce);
2042	}
2043	new_timeslice(el: execlists);
2044	}
2045
2046	return inactive;
2047	}
2048
/*
 * Call execlists_schedule_out() on every request in the half-open range
 * [@port, @last), in order.
 */
static void post_process_csb(struct i915_request **port,
			     struct i915_request **last)
{
	while (port != last)
		execlists_schedule_out(*port++);
}
2055
2056	static void __execlists_hold(struct i915_request *rq)
2057	{
2058	LIST_HEAD(list);
2059
2060	do {
2061	struct i915_dependency *p;
2062
2063	if (i915_request_is_active(rq))
2064	__i915_request_unsubmit(request: rq);
2065
2066	clear_bit(nr: I915_FENCE_FLAG_PQUEUE, addr: &rq->fence.flags);
2067	list_move_tail(list: &rq->sched.link,
2068	head: &rq->engine->sched_engine->hold);
2069	i915_request_set_hold(rq);
2070	RQ_TRACE(rq, "on hold\n");
2071
2072	for_each_waiter(p, rq) {
2073	struct i915_request *w =
2074	container_of(p->waiter, typeof(*w), sched);
2075
2076	if (p->flags & I915_DEPENDENCY_WEAK)
2077	continue;
2078
2079	/ Leave semaphores spinning on the other engines /
2080	if (w->engine != rq->engine)
2081	continue;
2082
2083	if (!i915_request_is_ready(rq: w))
2084	continue;
2085
2086	if (__i915_request_is_complete(rq: w))
2087	continue;
2088
2089	if (i915_request_on_hold(rq: w))
2090	continue;
2091
2092	list_move_tail(list: &w->sched.link, head: &list);
2093	}
2094
2095	rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2096	} while (rq);
2097	}
2098
2099	static bool execlists_hold(struct intel_engine_cs *engine,
2100	struct i915_request *rq)
2101	{
2102	if (i915_request_on_hold(rq))
2103	return false;
2104
2105	spin_lock_irq(lock: &engine->sched_engine->lock);
2106
2107	if (__i915_request_is_complete(rq)) { / too late! /
2108	rq = NULL;
2109	goto unlock;
2110	}
2111
2112	/*
2113	* Transfer this request onto the hold queue to prevent it
2114	* being resumbitted to HW (and potentially completed) before we have
2115	* released it. Since we may have already submitted following
2116	* requests, we need to remove those as well.
2117	*/
2118	GEM_BUG_ON(i915_request_on_hold(rq));
2119	GEM_BUG_ON(rq->engine != engine);
2120	__execlists_hold(rq);
2121	GEM_BUG_ON(list_empty(&engine->sched_engine->hold));
2122
2123	unlock:
2124	spin_unlock_irq(lock: &engine->sched_engine->lock);
2125	return rq;
2126	}
2127
2128	static bool hold_request(const struct i915_request *rq)
2129	{
2130	struct i915_dependency *p;
2131	bool result = false;
2132
2133	/*
2134	* If one of our ancestors is on hold, we must also be on hold,
2135	* otherwise we will bypass it and execute before it.
2136	*/
2137	rcu_read_lock();
2138	for_each_signaler(p, rq) {
2139	const struct i915_request *s =
2140	container_of(p->signaler, typeof(*s), sched);
2141
2142	if (s->engine != rq->engine)
2143	continue;
2144
2145	result = i915_request_on_hold(rq: s);
2146	if (result)
2147	break;
2148	}
2149	rcu_read_unlock();
2150
2151	return result;
2152	}
2153
2154	static void __execlists_unhold(struct i915_request *rq)
2155	{
2156	LIST_HEAD(list);
2157
2158	do {
2159	struct i915_dependency *p;
2160
2161	RQ_TRACE(rq, "hold release\n");
2162
2163	GEM_BUG_ON(!i915_request_on_hold(rq));
2164	GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2165
2166	i915_request_clear_hold(rq);
2167	list_move_tail(list: &rq->sched.link,
2168	head: i915_sched_lookup_priolist(sched_engine: rq->engine->sched_engine,
2169	prio: rq_prio(rq)));
2170	set_bit(nr: I915_FENCE_FLAG_PQUEUE, addr: &rq->fence.flags);
2171
2172	/ Also release any children on this engine that are ready /
2173	for_each_waiter(p, rq) {
2174	struct i915_request *w =
2175	container_of(p->waiter, typeof(*w), sched);
2176
2177	if (p->flags & I915_DEPENDENCY_WEAK)
2178	continue;
2179
2180	if (w->engine != rq->engine)
2181	continue;
2182
2183	if (!i915_request_on_hold(rq: w))
2184	continue;
2185
2186	/ Check that no other parents are also on hold /
2187	if (hold_request(rq: w))
2188	continue;
2189
2190	list_move_tail(list: &w->sched.link, head: &list);
2191	}
2192
2193	rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2194	} while (rq);
2195	}
2196
2197	static void execlists_unhold(struct intel_engine_cs *engine,
2198	struct i915_request *rq)
2199	{
2200	spin_lock_irq(lock: &engine->sched_engine->lock);
2201
2202	/*
2203	* Move this request back to the priority queue, and all of its
2204	* children and grandchildren that were suspended along with it.
2205	*/
2206	__execlists_unhold(rq);
2207
2208	if (rq_prio(rq) > engine->sched_engine->queue_priority_hint) {
2209	engine->sched_engine->queue_priority_hint = rq_prio(rq);
2210	tasklet_hi_schedule(t: &engine->sched_engine->tasklet);
2211	}
2212
2213	spin_unlock_irq(lock: &engine->sched_engine->lock);
2214	}
2215
/*
 * Bookkeeping for a deferred error capture: allocated by capture_regs(),
 * filled in by execlists_capture() and consumed (then freed) by
 * execlists_capture_work().
 */
struct execlists_capture {
	struct work_struct work;	/* runs execlists_capture_work() */
	struct i915_request *rq;	/* held request; reference is dropped by the worker */
	struct i915_gpu_coredump *error; /* coredump being assembled */
};
2221
2222	static void execlists_capture_work(struct work_struct *work)
2223	{
2224	struct execlists_capture cap = container_of(work, typeof(cap), work);
2225	const gfp_t gfp = __GFP_KSWAPD_RECLAIM \| __GFP_RETRY_MAYFAIL \|
2226	__GFP_NOWARN;
2227	struct intel_engine_cs *engine = cap->rq->engine;
2228	struct intel_gt_coredump *gt = cap->error->gt;
2229	struct intel_engine_capture_vma *vma;
2230
2231	/ Compress all the objects attached to the request, slow! /
2232	vma = intel_engine_coredump_add_request(ee: gt->engine, rq: cap->rq, gfp);
2233	if (vma) {
2234	struct i915_vma_compress *compress =
2235	i915_vma_capture_prepare(gt);
2236
2237	intel_engine_coredump_add_vma(ee: gt->engine, capture: vma, compress);
2238	i915_vma_capture_finish(gt, compress);
2239	}
2240
2241	gt->simulated = gt->engine->simulated;
2242	cap->error->simulated = gt->simulated;
2243
2244	/ Publish the error state, and announce it to the world /
2245	i915_error_state_store(error: cap->error);
2246	i915_gpu_coredump_put(gpu: cap->error);
2247
2248	/ Return this request and all that depend upon it for signaling /
2249	execlists_unhold(engine, rq: cap->rq);
2250	i915_request_put(rq: cap->rq);
2251
2252	kfree(objp: cap);
2253	}
2254
2255	static struct execlists_capture capture_regs(struct* intel_engine_cs *engine)
2256	{
2257	const gfp_t gfp = GFP_ATOMIC \| __GFP_NOWARN;
2258	struct execlists_capture *cap;
2259
2260	cap = kmalloc(size: sizeof(*cap), flags: gfp);
2261	if (!cap)
2262	return NULL;
2263
2264	cap->error = i915_gpu_coredump_alloc(i915: engine->i915, gfp);
2265	if (!cap->error)
2266	goto err_cap;
2267
2268	cap->error->gt = intel_gt_coredump_alloc(gt: engine->gt, gfp, CORE_DUMP_FLAG_NONE);
2269	if (!cap->error->gt)
2270	goto err_gpu;
2271
2272	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp, CORE_DUMP_FLAG_NONE);
2273	if (!cap->error->gt->engine)
2274	goto err_gt;
2275
2276	cap->error->gt->engine->hung = true;
2277
2278	return cap;
2279
2280	err_gt:
2281	kfree(objp: cap->error->gt);
2282	err_gpu:
2283	kfree(objp: cap->error);
2284	err_cap:
2285	kfree(objp: cap);
2286	return NULL;
2287	}
2288
2289	static struct i915_request *
2290	active_context(struct intel_engine_cs *engine, u32 ccid)
2291	{
2292	const struct intel_engine_execlists * const el = &engine->execlists;
2293	struct i915_request * const port, rq;
2294
2295	/*
2296	* Use the most recent result from process_csb(), but just in case
2297	* we trigger an error (via interrupt) before the first CS event has
2298	* been written, peek at the next submission.
2299	*/
2300
2301	for (port = el->active; (rq = *port); port++) {
2302	if (rq->context->lrc.ccid == ccid) {
2303	ENGINE_TRACE(engine,
2304	"ccid:%x found at active:%zd\n",
2305	ccid, port - el->active);
2306	return rq;
2307	}
2308	}
2309
2310	for (port = el->pending; (rq = *port); port++) {
2311	if (rq->context->lrc.ccid == ccid) {
2312	ENGINE_TRACE(engine,
2313	"ccid:%x found at pending:%zd\n",
2314	ccid, port - el->pending);
2315	return rq;
2316	}
2317	}
2318
2319	ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
2320	return NULL;
2321	}
2322
2323	static u32 active_ccid(struct intel_engine_cs *engine)
2324	{
2325	return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
2326	}
2327
2328	static void execlists_capture(struct intel_engine_cs *engine)
2329	{
2330	struct drm_i915_private *i915 = engine->i915;
2331	struct execlists_capture *cap;
2332
2333	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2334	return;
2335
2336	/*
2337	* We need to _quickly_ capture the engine state before we reset.
2338	* We are inside an atomic section (softirq) here and we are delaying
2339	* the forced preemption event.
2340	*/
2341	cap = capture_regs(engine);
2342	if (!cap)
2343	return;
2344
2345	spin_lock_irq(lock: &engine->sched_engine->lock);
2346	cap->rq = active_context(engine, ccid: active_ccid(engine));
2347	if (cap->rq) {
2348	cap->rq = active_request(tl: cap->rq->context->timeline, rq: cap->rq);
2349	cap->rq = i915_request_get_rcu(rq: cap->rq);
2350	}
2351	spin_unlock_irq(lock: &engine->sched_engine->lock);
2352	if (!cap->rq)
2353	goto err_free;
2354
2355	/*
2356	* Remove the request from the execlists queue, and take ownership
2357	* of the request. We pass it to our worker who will _slowly_ compress
2358	* all the pages the _user_ requested for debugging their batch, after
2359	* which we return it to the queue for signaling.
2360	*
2361	* By removing them from the execlists queue, we also remove the
2362	* requests from being processed by __unwind_incomplete_requests()
2363	* during the intel_engine_reset(), and so they will not be replayed
2364	* afterwards.
2365	*
2366	* Note that because we have not yet reset the engine at this point,
2367	* it is possible for the request that we have identified as being
2368	* guilty, did in fact complete and we will then hit an arbitration
2369	* point allowing the outstanding preemption to succeed. The likelihood
2370	* of that is very low (as capturing of the engine registers should be
2371	* fast enough to run inside an irq-off atomic section!), so we will
2372	* simply hold that request accountable for being non-preemptible
2373	* long enough to force the reset.
2374	*/
2375	if (!execlists_hold(engine, rq: cap->rq))
2376	goto err_rq;
2377
2378	INIT_WORK(&cap->work, execlists_capture_work);
2379	queue_work(wq: i915->unordered_wq, work: &cap->work);
2380	return;
2381
2382	err_rq:
2383	i915_request_put(rq: cap->rq);
2384	err_free:
2385	i915_gpu_coredump_put(gpu: cap->error);
2386	kfree(objp: cap);
2387	}
2388
2389	static void execlists_reset(struct intel_engine_cs engine, const* char *msg)
2390	{
2391	const unsigned int bit = I915_RESET_ENGINE + engine->id;
2392	unsigned long *lock = &engine->gt->reset.flags;
2393
2394	if (!intel_has_reset_engine(gt: engine->gt))
2395	return;
2396
2397	if (test_and_set_bit(nr: bit, addr: lock))
2398	return;
2399
2400	ENGINE_TRACE(engine, "reset for %s\n", msg);
2401
2402	/ Mark this tasklet as disabled to avoid waiting for it to complete /
2403	tasklet_disable_nosync(t: &engine->sched_engine->tasklet);
2404
2405	ring_set_paused(engine, state: `1`); / Freeze the current request in place /
2406	execlists_capture(engine);
2407	intel_engine_reset(engine, reason: msg);
2408
2409	tasklet_enable(t: &engine->sched_engine->tasklet);
2410	clear_and_wake_up_bit(bit, word: lock);
2411	}
2412
2413	static bool preempt_timeout(const struct intel_engine_cs *const engine)
2414	{
2415	const struct timer_list *t = &engine->execlists.preempt;
2416
2417	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2418	return false;
2419
2420	if (!timer_expired(t))
2421	return false;
2422
2423	return engine->execlists.pending[`0`];
2424	}
2425
2426	/*
2427	* Check the unread Context Status Buffers and manage the submission of new
2428	* contexts to the ELSP accordingly.
2429	*/
2430	static void execlists_submission_tasklet(struct tasklet_struct *t)
2431	{
2432	struct i915_sched_engine *sched_engine =
2433	from_tasklet(sched_engine, t, tasklet);
2434	struct intel_engine_cs * const engine = sched_engine->private_data;
2435	struct i915_request post[`2` EXECLIST_MAX_PORTS];
2436	struct i915_request **inactive;
2437
2438	rcu_read_lock();
2439	inactive = process_csb(engine, inactive: post);
2440	GEM_BUG_ON(inactive - post > ARRAY_SIZE(post));
2441
2442	if (unlikely(preempt_timeout(engine))) {
2443	const struct i915_request rq = engine->execlists.active;
2444
2445	/*
2446	* If after the preempt-timeout expired, we are still on the
2447	* same active request/context as before we initiated the
2448	* preemption, reset the engine.
2449	*
2450	* However, if we have processed a CS event to switch contexts,
2451	* but not yet processed the CS event for the pending
2452	* preemption, reset the timer allowing the new context to
2453	* gracefully exit.
2454	*/
2455	cancel_timer(t: &engine->execlists.preempt);
2456	if (rq == engine->execlists.preempt_target)
2457	engine->execlists.error_interrupt \|= ERROR_PREEMPT;
2458	else
2459	set_timer_ms(t: &engine->execlists.preempt,
2460	timeout: active_preempt_timeout(engine, rq));
2461	}
2462
2463	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
2464	const char *msg;
2465
2466	/ Generate the error message in priority wrt to the user! /
2467	if (engine->execlists.error_interrupt & GENMASK(`15`, `0`))
2468	msg = "CS error"; / thrown by a user payload /
2469	else if (engine->execlists.error_interrupt & ERROR_CSB)
2470	msg = "invalid CSB event";
2471	else if (engine->execlists.error_interrupt & ERROR_PREEMPT)
2472	msg = "preemption time out";
2473	else
2474	msg = "internal error";
2475
2476	engine->execlists.error_interrupt = `0`;
2477	execlists_reset(engine, msg);
2478	}
2479
2480	if (!engine->execlists.pending[`0`]) {
2481	execlists_dequeue_irq(engine);
2482	start_timeslice(engine);
2483	}
2484
2485	post_process_csb(port: post, last: inactive);
2486	rcu_read_unlock();
2487	}
2488
2489	static void execlists_irq_handler(struct intel_engine_cs *engine, u16 iir)
2490	{
2491	bool tasklet = false;
2492
2493	if (unlikely(iir & GT_CS_MASTER_ERROR_INTERRUPT)) {
2494	u32 eir;
2495
2496	/ Upper 16b are the enabling mask, rsvd for internal errors /
2497	eir = ENGINE_READ(engine, RING_EIR) & GENMASK(`15`, `0`);
2498	ENGINE_TRACE(engine, "CS error: %x\n", eir);
2499
2500	/ Disable the error interrupt until after the reset /
2501	if (likely(eir)) {
2502	ENGINE_WRITE(engine, RING_EMR, ~`0u`);
2503	ENGINE_WRITE(engine, RING_EIR, eir);
2504	WRITE_ONCE(engine->execlists.error_interrupt, eir);
2505	tasklet = true;
2506	}
2507	}
2508
2509	if (iir & GT_WAIT_SEMAPHORE_INTERRUPT) {
2510	WRITE_ONCE(engine->execlists.yield,
2511	ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI));
2512	ENGINE_TRACE(engine, "semaphore yield: %08x\n",
2513	engine->execlists.yield);
2514	if (del_timer(timer: &engine->execlists.timer))
2515	tasklet = true;
2516	}
2517
2518	if (iir & GT_CONTEXT_SWITCH_INTERRUPT)
2519	tasklet = true;
2520
2521	if (iir & GT_RENDER_USER_INTERRUPT)
2522	intel_engine_signal_breadcrumbs(engine);
2523
2524	if (tasklet)
2525	tasklet_hi_schedule(t: &engine->sched_engine->tasklet);
2526	}
2527
2528	static void __execlists_kick(struct intel_engine_execlists *execlists)
2529	{
2530	struct intel_engine_cs *engine =
2531	container_of(execlists, typeof(*engine), execlists);
2532
2533	/ Kick the tasklet for some interrupt coalescing and reset handling /
2534	tasklet_hi_schedule(t: &engine->sched_engine->tasklet);
2535	}
2536
/*
 * Kick the tasklet from a pointer @t to the field named @member inside a
 * struct intel_engine_execlists (used below with the timer fields).
 */
#define execlists_kick(t, member) \
	__execlists_kick(container_of(t, struct intel_engine_execlists, member))
2539
2540	static void execlists_timeslice(struct timer_list *timer)
2541	{
2542	execlists_kick(timer, timer);
2543	}
2544
2545	static void execlists_preempt(struct timer_list *timer)
2546	{
2547	execlists_kick(timer, preempt);
2548	}
2549
2550	static void queue_request(struct intel_engine_cs *engine,
2551	struct i915_request *rq)
2552	{
2553	GEM_BUG_ON(!list_empty(&rq->sched.link));
2554	list_add_tail(new: &rq->sched.link,
2555	head: i915_sched_lookup_priolist(sched_engine: engine->sched_engine,
2556	prio: rq_prio(rq)));
2557	set_bit(nr: I915_FENCE_FLAG_PQUEUE, addr: &rq->fence.flags);
2558	}
2559
2560	static bool submit_queue(struct intel_engine_cs *engine,
2561	const struct i915_request *rq)
2562	{
2563	struct i915_sched_engine *sched_engine = engine->sched_engine;
2564
2565	if (rq_prio(rq) <= sched_engine->queue_priority_hint)
2566	return false;
2567
2568	sched_engine->queue_priority_hint = rq_prio(rq);
2569	return true;
2570	}
2571
2572	static bool ancestor_on_hold(const struct intel_engine_cs *engine,
2573	const struct i915_request *rq)
2574	{
2575	GEM_BUG_ON(i915_request_on_hold(rq));
2576	return !list_empty(head: &engine->sched_engine->hold) && hold_request(rq);
2577	}
2578
2579	static void execlists_submit_request(struct i915_request *request)
2580	{
2581	struct intel_engine_cs *engine = request->engine;
2582	unsigned long flags;
2583
2584	/ Will be called from irq-context when using foreign fences. /
2585	spin_lock_irqsave(&engine->sched_engine->lock, flags);
2586
2587	if (unlikely(ancestor_on_hold(engine, request))) {
2588	RQ_TRACE(request, "ancestor on hold\n");
2589	list_add_tail(new: &request->sched.link,
2590	head: &engine->sched_engine->hold);
2591	i915_request_set_hold(rq: request);
2592	} else {
2593	queue_request(engine, rq: request);
2594
2595	GEM_BUG_ON(i915_sched_engine_is_empty(engine->sched_engine));
2596	GEM_BUG_ON(list_empty(&request->sched.link));
2597
2598	if (submit_queue(engine, rq: request))
2599	__execlists_kick(execlists: &engine->execlists);
2600	}
2601
2602	spin_unlock_irqrestore(lock: &engine->sched_engine->lock, flags);
2603	}
2604
2605	static int
2606	__execlists_context_pre_pin(struct intel_context *ce,
2607	struct intel_engine_cs *engine,
2608	struct i915_gem_ww_ctx ww, void* **vaddr)
2609	{
2610	int err;
2611
2612	err = lrc_pre_pin(ce, engine, ww, vaddr);
2613	if (err)
2614	return err;
2615
2616	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags)) {
2617	lrc_init_state(ce, engine, state: *vaddr);
2618
2619	__i915_gem_object_flush_map(obj: ce->state->obj, offset: `0`, size: engine->context_size);
2620	}
2621
2622	return `0`;
2623	}
2624
2625	static int execlists_context_pre_pin(struct intel_context *ce,
2626	struct i915_gem_ww_ctx *ww,
2627	void **vaddr)
2628	{
2629	return __execlists_context_pre_pin(ce, engine: ce->engine, ww, vaddr);
2630	}
2631
2632	static int execlists_context_pin(struct intel_context ce, void* *vaddr)
2633	{
2634	return lrc_pin(ce, engine: ce->engine, vaddr);
2635	}
2636
2637	static int execlists_context_alloc(struct intel_context *ce)
2638	{
2639	return lrc_alloc(ce, engine: ce->engine);
2640	}
2641
2642	static void execlists_context_cancel_request(struct intel_context *ce,
2643	struct i915_request *rq)
2644	{
2645	struct intel_engine_cs *engine = NULL;
2646
2647	i915_request_active_engine(rq, active: &engine);
2648
2649	if (engine && intel_engine_pulse(engine))
2650	intel_gt_handle_error(gt: engine->gt, engine_mask: engine->mask, flags: `0`,
2651	fmt: "request cancellation by %s",
2652	current->comm);
2653	}
2654
2655	static struct intel_context *
2656	execlists_create_parallel(struct intel_engine_cs **engines,
2657	unsigned int num_siblings,
2658	unsigned int width)
2659	{
2660	struct intel_context parent = NULL, ce, *err;
2661	int i;
2662
2663	GEM_BUG_ON(num_siblings != `1`);
2664
2665	for (i = `0`; i < width; ++i) {
2666	ce = intel_context_create(engine: engines[i]);
2667	if (IS_ERR(ptr: ce)) {
2668	err = ce;
2669	goto unwind;
2670	}
2671
2672	if (i == `0`)
2673	parent = ce;
2674	else
2675	intel_context_bind_parent_child(parent, child: ce);
2676	}
2677
2678	parent->parallel.fence_context = dma_fence_context_alloc(num: `1`);
2679
2680	intel_context_set_nopreempt(ce: parent);
2681	for_each_child(parent, ce)
2682	intel_context_set_nopreempt(ce);
2683
2684	return parent;
2685
2686	unwind:
2687	if (parent)
2688	intel_context_put(ce: parent);
2689	return err;
2690	}
2691
2692	static const struct intel_context_ops execlists_context_ops = {
2693	.flags = COPS_HAS_INFLIGHT \| COPS_RUNTIME_CYCLES,
2694
2695	.alloc = execlists_context_alloc,
2696
2697	.cancel_request = execlists_context_cancel_request,
2698
2699	.pre_pin = execlists_context_pre_pin,
2700	.pin = execlists_context_pin,
2701	.unpin = lrc_unpin,
2702	.post_unpin = lrc_post_unpin,
2703
2704	.enter = intel_context_enter_engine,
2705	.exit = intel_context_exit_engine,
2706
2707	.reset = lrc_reset,
2708	.destroy = lrc_destroy,
2709
2710	.create_parallel = execlists_create_parallel,
2711	.create_virtual = execlists_create_virtual,
2712	};
2713
2714	static int emit_pdps(struct i915_request *rq)
2715	{
2716	const struct intel_engine_cs * const engine = rq->engine;
2717	struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(vm: rq->context->vm);
2718	int err, i;
2719	u32 *cs;
2720
2721	GEM_BUG_ON(intel_vgpu_active(rq->i915));
2722
2723	/*
2724	* Beware ye of the dragons, this sequence is magic!
2725	*
2726	* Small changes to this sequence can cause anything from
2727	* GPU hangs to forcewake errors and machine lockups!
2728	*/
2729
2730	cs = intel_ring_begin(rq, num_dwords: `2`);
2731	if (IS_ERR(ptr: cs))
2732	return PTR_ERR(ptr: cs);
2733
2734	*cs++ = MI_ARB_ON_OFF \| MI_ARB_DISABLE;
2735	*cs++ = MI_NOOP;
2736	intel_ring_advance(rq, cs);
2737
2738	/ Flush any residual operations from the context load /
2739	err = engine->emit_flush(rq, EMIT_FLUSH);
2740	if (err)
2741	return err;
2742
2743	/ Magic required to prevent forcewake errors! /
2744	err = engine->emit_flush(rq, EMIT_INVALIDATE);
2745	if (err)
2746	return err;
2747
2748	cs = intel_ring_begin(rq, num_dwords: `4` * GEN8_3LVL_PDPES + `2`);
2749	if (IS_ERR(ptr: cs))
2750	return PTR_ERR(ptr: cs);
2751
2752	/ Ensure the LRI have landed before we invalidate & continue /
2753	cs++ = MI_LOAD_REGISTER_IMM(`2` GEN8_3LVL_PDPES) \| MI_LRI_FORCE_POSTED;
2754	for (i = GEN8_3LVL_PDPES; i--; ) {
2755	const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, n: i);
2756	u32 base = engine->mmio_base;
2757
2758	*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
2759	*cs++ = upper_32_bits(pd_daddr);
2760	*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
2761	*cs++ = lower_32_bits(pd_daddr);
2762	}
2763	*cs++ = MI_ARB_ON_OFF \| MI_ARB_ENABLE;
2764	intel_ring_advance(rq, cs);
2765
2766	intel_ring_advance(rq, cs);
2767
2768	return `0`;
2769	}
2770
2771	static int execlists_request_alloc(struct i915_request *request)
2772	{
2773	int ret;
2774
2775	GEM_BUG_ON(!intel_context_is_pinned(request->context));
2776
2777	/*
2778	* Flush enough space to reduce the likelihood of waiting after
2779	* we start building the request - in which case we will just
2780	* have to repeat work.
2781	*/
2782	request->reserved_space += EXECLISTS_REQUEST_SIZE;
2783
2784	/*
2785	* Note that after this point, we have committed to using
2786	* this request as it is being used to both track the
2787	* state of engine initialisation and liveness of the
2788	* golden renderstate above. Think twice before you try
2789	* to cancel/unwind this request now.
2790	*/
2791
2792	if (!i915_vm_is_4lvl(vm: request->context->vm)) {
2793	ret = emit_pdps(rq: request);
2794	if (ret)
2795	return ret;
2796	}
2797
2798	/ Unconditionally invalidate GPU caches and TLBs. /
2799	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
2800	if (ret)
2801	return ret;
2802
2803	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
2804	return `0`;
2805	}
2806
2807	static void reset_csb_pointers(struct intel_engine_cs *engine)
2808	{
2809	struct intel_engine_execlists * const execlists = &engine->execlists;
2810	const unsigned int reset_value = execlists->csb_size - `1`;
2811
2812	ring_set_paused(engine, state: `0`);
2813
2814	/*
2815	* Sometimes Icelake forgets to reset its pointers on a GPU reset.
2816	* Bludgeon them with a mmio update to be sure.
2817	*/
2818	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
2819	`0xffff` << `16` \| reset_value << `8` \| reset_value);
2820	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2821
2822	/*
2823	* After a reset, the HW starts writing into CSB entry [0]. We
2824	* therefore have to set our HEAD pointer back one entry so that
2825	* the first entry we check is entry 0. To complicate this further,
2826	* as we don't wait for the first interrupt after reset, we have to
2827	* fake the HW write to point back to the last entry so that our
2828	* inline comparison of our cached head position against the last HW
2829	* write works even before the first interrupt.
2830	*/
2831	execlists->csb_head = reset_value;
2832	WRITE_ONCE(*execlists->csb_write, reset_value);
2833	wmb(); / Make sure this is visible to HW (paranoia?) /
2834
2835	/ Check that the GPU does indeed update the CSB entries! /
2836	memset(execlists->csb_status, -`1`, (reset_value + `1`) * sizeof(u64));
2837	drm_clflush_virt_range(addr: execlists->csb_status,
2838	length: execlists->csb_size *
2839	sizeof(execlists->csb_status));
2840
2841	/ Once more for luck and our trusty paranoia /
2842	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
2843	`0xffff` << `16` \| reset_value << `8` \| reset_value);
2844	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2845
2846	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
2847	}
2848
2849	static void sanitize_hwsp(struct intel_engine_cs *engine)
2850	{
2851	struct intel_timeline *tl;
2852
2853	list_for_each_entry(tl, &engine->status_page.timelines, engine_link)
2854	intel_timeline_reset_seqno(tl);
2855	}
2856
2857	static void execlists_sanitize(struct intel_engine_cs *engine)
2858	{
2859	GEM_BUG_ON(execlists_active(&engine->execlists));
2860
2861	/*
2862	* Poison residual state on resume, in case the suspend didn't!
2863	*
2864	* We have to assume that across suspend/resume (or other loss
2865	* of control) that the contents of our pinned buffers has been
2866	* lost, replaced by garbage. Since this doesn't always happen,
2867	* let's poison such state so that we more quickly spot when
2868	* we falsely assume it has been preserved.
2869	*/
2870	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2871	memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
2872
2873	reset_csb_pointers(engine);
2874
2875	/*
2876	* The kernel_context HWSP is stored in the status_page. As above,
2877	* that may be lost on resume/initialisation, and so we need to
2878	* reset the value in the HWSP.
2879	*/
2880	sanitize_hwsp(engine);
2881
2882	/ And scrub the dirty cachelines for the HWSP /
2883	drm_clflush_virt_range(addr: engine->status_page.addr, PAGE_SIZE);
2884
2885	intel_engine_reset_pinned_contexts(engine);
2886	}
2887
2888	static void enable_error_interrupt(struct intel_engine_cs *engine)
2889	{
2890	u32 status;
2891
2892	engine->execlists.error_interrupt = `0`;
2893	ENGINE_WRITE(engine, RING_EMR, ~`0u`);
2894	ENGINE_WRITE(engine, RING_EIR, ~`0u`); / clear all existing errors /
2895
2896	status = ENGINE_READ(engine, RING_ESR);
2897	if (unlikely(status)) {
2898	drm_err(&engine->i915->drm,
2899	"engine '%s' resumed still in error: %08x\n",
2900	engine->name, status);
2901	__intel_gt_reset(gt: engine->gt, engine_mask: engine->mask);
2902	}
2903
2904	/*
2905	* On current gen8+, we have 2 signals to play with
2906	*
2907	* - I915_ERROR_INSTUCTION (bit 0)
2908	*
2909	* Generate an error if the command parser encounters an invalid
2910	* instruction
2911	*
2912	* This is a fatal error.
2913	*
2914	* - CP_PRIV (bit 2)
2915	*
2916	* Generate an error on privilege violation (where the CP replaces
2917	* the instruction with a no-op). This also fires for writes into
2918	* read-only scratch pages.
2919	*
2920	* This is a non-fatal error, parsing continues.
2921	*
2922	* * there are a few others defined for odd HW that we do not use
2923	*
2924	* Since CP_PRIV fires for cases where we have chosen to ignore the
2925	* error (as the HW is validating and suppressing the mistakes), we
2926	* only unmask the instruction error bit.
2927	*/
2928	ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
2929	}
2930
2931	static void enable_execlists(struct intel_engine_cs *engine)
2932	{
2933	u32 mode;
2934
2935	assert_forcewakes_active(uncore: engine->uncore, fw_domains: FORCEWAKE_ALL);
2936
2937	intel_engine_set_hwsp_writemask(engine, mask: ~`0u`); / HWSTAM /
2938
2939	if (GRAPHICS_VER(engine->i915) >= `11`)
2940	mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
2941	else
2942	mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
2943	ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
2944
2945	ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
2946
2947	ENGINE_WRITE_FW(engine,
2948	RING_HWS_PGA,
2949	i915_ggtt_offset(engine->status_page.vma));
2950	ENGINE_POSTING_READ(engine, RING_HWS_PGA);
2951
2952	enable_error_interrupt(engine);
2953	}
2954
2955	static int execlists_resume(struct intel_engine_cs *engine)
2956	{
2957	intel_mocs_init_engine(engine);
2958	intel_breadcrumbs_reset(b: engine->breadcrumbs);
2959
2960	enable_execlists(engine);
2961
2962	if (engine->flags & I915_ENGINE_FIRST_RENDER_COMPUTE)
2963	xehp_enable_ccs_engines(engine);
2964
2965	return `0`;
2966	}
2967
2968	static void execlists_reset_prepare(struct intel_engine_cs *engine)
2969	{
2970	ENGINE_TRACE(engine, "depth<-%d\n",
2971	atomic_read(&engine->sched_engine->tasklet.count));
2972
2973	/*
2974	* Prevent request submission to the hardware until we have
2975	* completed the reset in i915_gem_reset_finish(). If a request
2976	* is completed by one engine, it may then queue a request
2977	* to a second via its execlists->tasklet just as we are
2978	* calling engine->resume() and also writing the ELSP.
2979	* Turning off the execlists->tasklet until the reset is over
2980	* prevents the race.
2981	*/
2982	__tasklet_disable_sync_once(t: &engine->sched_engine->tasklet);
2983	GEM_BUG_ON(!reset_in_progress(engine));
2984
2985	/*
2986	* We stop engines, otherwise we might get failed reset and a
2987	* dead gpu (on elk). Also as modern gpu as kbl can suffer
2988	* from system hang if batchbuffer is progressing when
2989	* the reset is issued, regardless of READY_TO_RESET ack.
2990	* Thus assume it is best to stop engines on all gens
2991	* where we have a gpu reset.
2992	*
2993	* WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
2994	*
2995	* FIXME: Wa for more modern gens needs to be validated
2996	*/
2997	ring_set_paused(engine, state: `1`);
2998	intel_engine_stop_cs(engine);
2999
3000	/*
3001	* Wa_22011802037: In addition to stopping the cs, we need
3002	* to wait for any pending mi force wakeups
3003	*/
3004	if (intel_engine_reset_needs_wa_22011802037(gt: engine->gt))
3005	intel_engine_wait_for_pending_mi_fw(engine);
3006
3007	engine->execlists.reset_ccid = active_ccid(engine);
3008	}
3009
3010	static struct i915_request **
3011	reset_csb(struct intel_engine_cs engine, struct* i915_request **inactive)
3012	{
3013	struct intel_engine_execlists * const execlists = &engine->execlists;
3014
3015	drm_clflush_virt_range(addr: execlists->csb_write,
3016	length: sizeof(execlists->csb_write[`0`]));
3017
3018	inactive = process_csb(engine, inactive); / drain preemption events /
3019
3020	/ Following the reset, we need to reload the CSB read/write pointers /
3021	reset_csb_pointers(engine);
3022
3023	return inactive;
3024	}
3025
3026	static void
3027	execlists_reset_active(struct intel_engine_cs *engine, bool stalled)
3028	{
3029	struct intel_context *ce;
3030	struct i915_request *rq;
3031	u32 head;
3032
3033	/*
3034	* Save the currently executing context, even if we completed
3035	* its request, it was still running at the time of the
3036	* reset and will have been clobbered.
3037	*/
3038	rq = active_context(engine, ccid: engine->execlists.reset_ccid);
3039	if (!rq)
3040	return;
3041
3042	ce = rq->context;
3043	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3044
3045	if (__i915_request_is_complete(rq)) {
3046	/ Idle context; tidy up the ring so we can restart afresh /
3047	head = intel_ring_wrap(ring: ce->ring, pos: rq->tail);
3048	goto out_replay;
3049	}
3050
3051	/ We still have requests in-flight; the engine should be active /
3052	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3053
3054	/ Context has requests still in-flight; it should not be idle! /
3055	GEM_BUG_ON(i915_active_is_idle(&ce->active));
3056
3057	rq = active_request(tl: ce->timeline, rq);
3058	head = intel_ring_wrap(ring: ce->ring, pos: rq->head);
3059	GEM_BUG_ON(head == ce->ring->tail);
3060
3061	/*
3062	* If this request hasn't started yet, e.g. it is waiting on a
3063	* semaphore, we need to avoid skipping the request or else we
3064	* break the signaling chain. However, if the context is corrupt
3065	* the request will not restart and we will be stuck with a wedged
3066	* device. It is quite often the case that if we issue a reset
3067	* while the GPU is loading the context image, that the context
3068	* image becomes corrupt.
3069	*
3070	* Otherwise, if we have not started yet, the request should replay
3071	* perfectly and we do not need to flag the result as being erroneous.
3072	*/
3073	if (!__i915_request_has_started(rq))
3074	goto out_replay;
3075
3076	/*
3077	* If the request was innocent, we leave the request in the ELSP
3078	* and will try to replay it on restarting. The context image may
3079	* have been corrupted by the reset, in which case we may have
3080	* to service a new GPU hang, but more likely we can continue on
3081	* without impact.
3082	*
3083	* If the request was guilty, we presume the context is corrupt
3084	* and have to at least restore the RING register in the context
3085	* image back to the expected values to skip over the guilty request.
3086	*/
3087	__i915_request_reset(rq, guilty: stalled);
3088
3089	/*
3090	* We want a simple context + ring to execute the breadcrumb update.
3091	* We cannot rely on the context being intact across the GPU hang,
3092	* so clear it and rebuild just what we need for the breadcrumb.
3093	* All pending requests for this context will be zapped, and any
3094	* future request will be after userspace has had the opportunity
3095	* to recreate its own state.
3096	*/
3097	out_replay:
3098	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3099	head, ce->ring->tail);
3100	lrc_reset_regs(ce, engine);
3101	ce->lrc.lrca = lrc_update_regs(ce, engine, head);
3102	}
3103
3104	static void execlists_reset_csb(struct intel_engine_cs *engine, bool stalled)
3105	{
3106	struct intel_engine_execlists * const execlists = &engine->execlists;
3107	struct i915_request post[`2` EXECLIST_MAX_PORTS];
3108	struct i915_request **inactive;
3109
3110	rcu_read_lock();
3111	inactive = reset_csb(engine, inactive: post);
3112
3113	execlists_reset_active(engine, stalled: true);
3114
3115	inactive = cancel_port_requests(execlists, inactive);
3116	post_process_csb(port: post, last: inactive);
3117	rcu_read_unlock();
3118	}
3119
3120	static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3121	{
3122	unsigned long flags;
3123
3124	ENGINE_TRACE(engine, "\n");
3125
3126	/ Process the csb, find the guilty context and throw away /
3127	execlists_reset_csb(engine, stalled);
3128
3129	/ Push back any incomplete requests for replay after the reset. /
3130	rcu_read_lock();
3131	spin_lock_irqsave(&engine->sched_engine->lock, flags);
3132	__unwind_incomplete_requests(engine);
3133	spin_unlock_irqrestore(lock: &engine->sched_engine->lock, flags);
3134	rcu_read_unlock();
3135	}
3136
3137	static void nop_submission_tasklet(struct tasklet_struct *t)
3138	{
3139	struct i915_sched_engine *sched_engine =
3140	from_tasklet(sched_engine, t, tasklet);
3141	struct intel_engine_cs * const engine = sched_engine->private_data;
3142
3143	/ The driver is wedged; don't process any more events. /
3144	WRITE_ONCE(engine->sched_engine->queue_priority_hint, INT_MIN);
3145	}
3146
3147	static void execlists_reset_cancel(struct intel_engine_cs *engine)
3148	{
3149	struct intel_engine_execlists * const execlists = &engine->execlists;
3150	struct i915_sched_engine * const sched_engine = engine->sched_engine;
3151	struct i915_request rq, rn;
3152	struct rb_node *rb;
3153	unsigned long flags;
3154
3155	ENGINE_TRACE(engine, "\n");
3156
3157	/*
3158	* Before we call engine->cancel_requests(), we should have exclusive
3159	* access to the submission state. This is arranged for us by the
3160	* caller disabling the interrupt generation, the tasklet and other
3161	* threads that may then access the same state, giving us a free hand
3162	* to reset state. However, we still need to let lockdep be aware that
3163	* we know this state may be accessed in hardirq context, so we
3164	* disable the irq around this manipulation and we want to keep
3165	* the spinlock focused on its duties and not accidentally conflate
3166	* coverage to the submission's irq state. (Similarly, although we
3167	* shouldn't need to disable irq around the manipulation of the
3168	* submission's irq state, we also wish to remind ourselves that
3169	* it is irq state.)
3170	*/
3171	execlists_reset_csb(engine, stalled: true);
3172
3173	rcu_read_lock();
3174	spin_lock_irqsave(&engine->sched_engine->lock, flags);
3175
3176	/ Mark all executing requests as skipped. /
3177	list_for_each_entry(rq, &engine->sched_engine->requests, sched.link)
3178	i915_request_put(rq: i915_request_mark_eio(rq));
3179	intel_engine_signal_breadcrumbs(engine);
3180
3181	/ Flush the queued requests to the timeline list (for retiring). /
3182	while ((rb = rb_first_cached(&sched_engine->queue))) {
3183	struct i915_priolist *p = to_priolist(rb);
3184
3185	priolist_for_each_request_consume(rq, rn, p) {
3186	if (i915_request_mark_eio(rq)) {
3187	__i915_request_submit(request: rq);
3188	i915_request_put(rq);
3189	}
3190	}
3191
3192	rb_erase_cached(node: &p->node, root: &sched_engine->queue);
3193	i915_priolist_free(p);
3194	}
3195
3196	/ On-hold requests will be flushed to timeline upon their release /
3197	list_for_each_entry(rq, &sched_engine->hold, sched.link)
3198	i915_request_put(rq: i915_request_mark_eio(rq));
3199
3200	/ Cancel all attached virtual engines /
3201	while ((rb = rb_first_cached(&execlists->virtual))) {
3202	struct virtual_engine *ve =
3203	rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3204
3205	rb_erase_cached(node: rb, root: &execlists->virtual);
3206	RB_CLEAR_NODE(rb);
3207
3208	spin_lock(lock: &ve->base.sched_engine->lock);
3209	rq = fetch_and_zero(&ve->request);
3210	if (rq) {
3211	if (i915_request_mark_eio(rq)) {
3212	rq->engine = engine;
3213	__i915_request_submit(request: rq);
3214	i915_request_put(rq);
3215	}
3216	i915_request_put(rq);
3217
3218	ve->base.sched_engine->queue_priority_hint = INT_MIN;
3219	}
3220	spin_unlock(lock: &ve->base.sched_engine->lock);
3221	}
3222
3223	/ Remaining _unready_ requests will be nop'ed when submitted /
3224
3225	sched_engine->queue_priority_hint = INT_MIN;
3226	sched_engine->queue = RB_ROOT_CACHED;
3227
3228	GEM_BUG_ON(__tasklet_is_enabled(&engine->sched_engine->tasklet));
3229	engine->sched_engine->tasklet.callback = nop_submission_tasklet;
3230
3231	spin_unlock_irqrestore(lock: &engine->sched_engine->lock, flags);
3232	rcu_read_unlock();
3233	}
3234
3235	static void execlists_reset_finish(struct intel_engine_cs *engine)
3236	{
3237	struct intel_engine_execlists * const execlists = &engine->execlists;
3238
3239	/*
3240	* After a GPU reset, we may have requests to replay. Do so now while
3241	* we still have the forcewake to be sure that the GPU is not allowed
3242	* to sleep before we restart and reload a context.
3243	*
3244	* If the GPU reset fails, the engine may still be alive with requests
3245	* inflight. We expect those to complete, or for the device to be
3246	* reset as the next level of recovery, and as a final resort we
3247	* will declare the device wedged.
3248	*/
3249	GEM_BUG_ON(!reset_in_progress(engine));
3250
3251	/ And kick in case we missed a new request submission. /
3252	if (__tasklet_enable(t: &engine->sched_engine->tasklet))
3253	__execlists_kick(execlists);
3254
3255	ENGINE_TRACE(engine, "depth->%d\n",
3256	atomic_read(&engine->sched_engine->tasklet.count));
3257	}
3258
3259	static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3260	{
3261	ENGINE_WRITE(engine, RING_IMR,
3262	~(engine->irq_enable_mask \| engine->irq_keep_mask));
3263	ENGINE_POSTING_READ(engine, RING_IMR);
3264	}
3265
3266	static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3267	{
3268	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3269	}
3270
3271	static void execlists_park(struct intel_engine_cs *engine)
3272	{
3273	cancel_timer(t: &engine->execlists.timer);
3274	cancel_timer(t: &engine->execlists.preempt);
3275
3276	/ Reset upon idling, or we may delay the busy wakeup. /
3277	WRITE_ONCE(engine->sched_engine->queue_priority_hint, INT_MIN);
3278	}
3279
3280	static void add_to_engine(struct i915_request *rq)
3281	{
3282	lockdep_assert_held(&rq->engine->sched_engine->lock);
3283	list_move_tail(list: &rq->sched.link, head: &rq->engine->sched_engine->requests);
3284	}
3285
3286	static void remove_from_engine(struct i915_request *rq)
3287	{
3288	struct intel_engine_cs engine, locked;
3289
3290	/*
3291	* Virtual engines complicate acquiring the engine timeline lock,
3292	* as their rq->engine pointer is not stable until under that
3293	* engine lock. The simple ploy we use is to take the lock then
3294	* check that the rq still belongs to the newly locked engine.
3295	*/
3296	locked = READ_ONCE(rq->engine);
3297	spin_lock_irq(lock: &locked->sched_engine->lock);
3298	while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) {
3299	spin_unlock(lock: &locked->sched_engine->lock);
3300	spin_lock(lock: &engine->sched_engine->lock);
3301	locked = engine;
3302	}
3303	list_del_init(entry: &rq->sched.link);
3304
3305	clear_bit(nr: I915_FENCE_FLAG_PQUEUE, addr: &rq->fence.flags);
3306	clear_bit(nr: I915_FENCE_FLAG_HOLD, addr: &rq->fence.flags);
3307
3308	/ Prevent further __await_execution() registering a cb, then flush /
3309	set_bit(nr: I915_FENCE_FLAG_ACTIVE, addr: &rq->fence.flags);
3310
3311	spin_unlock_irq(lock: &locked->sched_engine->lock);
3312
3313	i915_request_notify_execute_cb_imm(rq);
3314	}
3315
3316	static bool can_preempt(struct intel_engine_cs *engine)
3317	{
3318	if (GRAPHICS_VER(engine->i915) > `8`)
3319	return true;
3320
3321	/ GPGPU on bdw requires extra w/a; not implemented /
3322	return engine->class != RENDER_CLASS;
3323	}
3324
3325	static void kick_execlists(const struct i915_request rq, int* prio)
3326	{
3327	struct intel_engine_cs *engine = rq->engine;
3328	struct i915_sched_engine *sched_engine = engine->sched_engine;
3329	const struct i915_request *inflight;
3330
3331	/*
3332	* We only need to kick the tasklet once for the high priority
3333	* new context we add into the queue.
3334	*/
3335	if (prio <= sched_engine->queue_priority_hint)
3336	return;
3337
3338	rcu_read_lock();
3339
3340	/ Nothing currently active? We're overdue for a submission! /
3341	inflight = execlists_active(execlists: &engine->execlists);
3342	if (!inflight)
3343	goto unlock;
3344
3345	/*
3346	* If we are already the currently executing context, don't
3347	* bother evaluating if we should preempt ourselves.
3348	*/
3349	if (inflight->context == rq->context)
3350	goto unlock;
3351
3352	ENGINE_TRACE(engine,
3353	"bumping queue-priority-hint:%d for rq:%llx:%lld, inflight:%llx:%lld prio %d\n",
3354	prio,
3355	rq->fence.context, rq->fence.seqno,
3356	inflight->fence.context, inflight->fence.seqno,
3357	inflight->sched.attr.priority);
3358
3359	sched_engine->queue_priority_hint = prio;
3360
3361	/*
3362	* Allow preemption of low -> normal -> high, but we do
3363	* not allow low priority tasks to preempt other low priority
3364	* tasks under the impression that latency for low priority
3365	* tasks does not matter (as much as background throughput),
3366	* so kiss.
3367	*/
3368	if (prio >= max(I915_PRIORITY_NORMAL, rq_prio(inflight)))
3369	tasklet_hi_schedule(t: &sched_engine->tasklet);
3370
3371	unlock:
3372	rcu_read_unlock();
3373	}
3374
3375	static void execlists_set_default_submission(struct intel_engine_cs *engine)
3376	{
3377	engine->submit_request = execlists_submit_request;
3378	engine->sched_engine->schedule = i915_schedule;
3379	engine->sched_engine->kick_backend = kick_execlists;
3380	engine->sched_engine->tasklet.callback = execlists_submission_tasklet;
3381	}
3382
3383	static void execlists_shutdown(struct intel_engine_cs *engine)
3384	{
3385	/ Synchronise with residual timers and any softirq they raise /
3386	del_timer_sync(timer: &engine->execlists.timer);
3387	del_timer_sync(timer: &engine->execlists.preempt);
3388	tasklet_kill(t: &engine->sched_engine->tasklet);
3389	}
3390
3391	static void execlists_release(struct intel_engine_cs *engine)
3392	{
3393	engine->sanitize = NULL; / no longer in control, nothing to sanitize /
3394
3395	execlists_shutdown(engine);
3396
3397	intel_engine_cleanup_common(engine);
3398	lrc_fini_wa_ctx(engine);
3399	}
3400
3401	static ktime_t __execlists_engine_busyness(struct intel_engine_cs *engine,
3402	ktime_t *now)
3403	{
3404	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
3405	ktime_t total = stats->total;
3406
3407	/*
3408	* If the engine is executing something at the moment
3409	* add it to the total.
3410	*/
3411	*now = ktime_get();
3412	if (READ_ONCE(stats->active))
3413	total = ktime_add(total, ktime_sub(*now, stats->start));
3414
3415	return total;
3416	}
3417
3418	static ktime_t execlists_engine_busyness(struct intel_engine_cs *engine,
3419	ktime_t *now)
3420	{
3421	struct intel_engine_execlists_stats *stats = &engine->stats.execlists;
3422	unsigned int seq;
3423	ktime_t total;
3424
3425	do {
3426	seq = read_seqcount_begin(&stats->lock);
3427	total = __execlists_engine_busyness(engine, now);
3428	} while (read_seqcount_retry(&stats->lock, seq));
3429
3430	return total;
3431	}
3432
3433	static void
3434	logical_ring_default_vfuncs(struct intel_engine_cs *engine)
3435	{
3436	/ Default vfuncs which can be overridden by each engine. /
3437
3438	engine->resume = execlists_resume;
3439
3440	engine->cops = &execlists_context_ops;
3441	engine->request_alloc = execlists_request_alloc;
3442	engine->add_active_request = add_to_engine;
3443	engine->remove_active_request = remove_from_engine;
3444
3445	engine->reset.prepare = execlists_reset_prepare;
3446	engine->reset.rewind = execlists_reset_rewind;
3447	engine->reset.cancel = execlists_reset_cancel;
3448	engine->reset.finish = execlists_reset_finish;
3449
3450	engine->park = execlists_park;
3451	engine->unpark = NULL;
3452
3453	engine->emit_flush = gen8_emit_flush_xcs;
3454	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
3455	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_xcs;
3456	if (GRAPHICS_VER(engine->i915) >= `12`) {
3457	engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_xcs;
3458	engine->emit_flush = gen12_emit_flush_xcs;
3459	}
3460	engine->set_default_submission = execlists_set_default_submission;
3461
3462	if (GRAPHICS_VER(engine->i915) < `11`) {
3463	engine->irq_enable = gen8_logical_ring_enable_irq;
3464	engine->irq_disable = gen8_logical_ring_disable_irq;
3465	} else {
3466	/*
3467	* TODO: On Gen11 interrupt masks need to be clear
3468	* to allow C6 entry. Keep interrupts enabled at
3469	* and take the hit of generating extra interrupts
3470	* until a more refined solution exists.
3471	*/
3472	}
3473	intel_engine_set_irq_handler(engine, fn: execlists_irq_handler);
3474
3475	engine->flags \|= I915_ENGINE_SUPPORTS_STATS;
3476	if (!intel_vgpu_active(i915: engine->i915)) {
3477	engine->flags \|= I915_ENGINE_HAS_SEMAPHORES;
3478	if (can_preempt(engine)) {
3479	engine->flags \|= I915_ENGINE_HAS_PREEMPTION;
3480	if (CONFIG_DRM_I915_TIMESLICE_DURATION)
3481	engine->flags \|= I915_ENGINE_HAS_TIMESLICES;
3482	}
3483	}
3484
3485	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(`12`, `50`)) {
3486	if (intel_engine_has_preemption(engine))
3487	engine->emit_bb_start = xehp_emit_bb_start;
3488	else
3489	engine->emit_bb_start = xehp_emit_bb_start_noarb;
3490	} else {
3491	if (intel_engine_has_preemption(engine))
3492	engine->emit_bb_start = gen8_emit_bb_start;
3493	else
3494	engine->emit_bb_start = gen8_emit_bb_start_noarb;
3495	}
3496
3497	engine->busyness = execlists_engine_busyness;
3498	}
3499
3500	static void logical_ring_default_irqs(struct intel_engine_cs *engine)
3501	{
3502	unsigned int shift = `0`;
3503
3504	if (GRAPHICS_VER(engine->i915) < `11`) {
3505	const u8 irq_shifts[] = {
3506	[RCS0] = GEN8_RCS_IRQ_SHIFT,
3507	[BCS0] = GEN8_BCS_IRQ_SHIFT,
3508	[VCS0] = GEN8_VCS0_IRQ_SHIFT,
3509	[VCS1] = GEN8_VCS1_IRQ_SHIFT,
3510	[VECS0] = GEN8_VECS_IRQ_SHIFT,
3511	};
3512
3513	shift = irq_shifts[engine->id];
3514	}
3515
3516	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
3517	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
3518	engine->irq_keep_mask \|= GT_CS_MASTER_ERROR_INTERRUPT << shift;
3519	engine->irq_keep_mask \|= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
3520	}
3521
3522	static void rcs_submission_override(struct intel_engine_cs *engine)
3523	{
3524	switch (GRAPHICS_VER(engine->i915)) {
3525	case `12`:
3526	engine->emit_flush = gen12_emit_flush_rcs;
3527	engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
3528	break;
3529	case `11`:
3530	engine->emit_flush = gen11_emit_flush_rcs;
3531	engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
3532	break;
3533	default:
3534	engine->emit_flush = gen8_emit_flush_rcs;
3535	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
3536	break;
3537	}
3538	}
3539
3540	int intel_execlists_submission_setup(struct intel_engine_cs *engine)
3541	{
3542	struct intel_engine_execlists * const execlists = &engine->execlists;
3543	struct drm_i915_private *i915 = engine->i915;
3544	struct intel_uncore *uncore = engine->uncore;
3545	u32 base = engine->mmio_base;
3546
3547	tasklet_setup(t: &engine->sched_engine->tasklet, callback: execlists_submission_tasklet);
3548	timer_setup(&engine->execlists.timer, execlists_timeslice, `0`);
3549	timer_setup(&engine->execlists.preempt, execlists_preempt, `0`);
3550
3551	logical_ring_default_vfuncs(engine);
3552	logical_ring_default_irqs(engine);
3553
3554	seqcount_init(&engine->stats.execlists.lock);
3555
3556	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE)
3557	rcs_submission_override(engine);
3558
3559	lrc_init_wa_ctx(engine);
3560
3561	if (HAS_LOGICAL_RING_ELSQ(i915)) {
3562	execlists->submit_reg = intel_uncore_regs(uncore) +
3563	i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
3564	execlists->ctrl_reg = intel_uncore_regs(uncore) +
3565	i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
3566
3567	engine->fw_domain = intel_uncore_forcewake_for_reg(uncore: engine->uncore,
3568	RING_EXECLIST_CONTROL(engine->mmio_base),
3569	FW_REG_WRITE);
3570	} else {
3571	execlists->submit_reg = intel_uncore_regs(uncore) +
3572	i915_mmio_reg_offset(RING_ELSP(base));
3573	}
3574
3575	execlists->csb_status =
3576	(u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
3577
3578	execlists->csb_write =
3579	&engine->status_page.addr[INTEL_HWS_CSB_WRITE_INDEX(i915)];
3580
3581	if (GRAPHICS_VER(i915) < `11`)
3582	execlists->csb_size = GEN8_CSB_ENTRIES;
3583	else
3584	execlists->csb_size = GEN11_CSB_ENTRIES;
3585
3586	engine->context_tag = GENMASK(BITS_PER_LONG - `2`, `0`);
3587	if (GRAPHICS_VER(engine->i915) >= `11` &&
3588	GRAPHICS_VER_FULL(engine->i915) < IP_VER(`12`, `50`)) {
3589	execlists->ccid \|= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - `32`);
3590	execlists->ccid \|= engine->class << (GEN11_ENGINE_CLASS_SHIFT - `32`);
3591	}
3592
3593	/ Finally, take ownership and responsibility for cleanup! /
3594	engine->sanitize = execlists_sanitize;
3595	engine->release = execlists_release;
3596
3597	return `0`;
3598	}
3599
3600	static struct list_head virtual_queue(struct* virtual_engine *ve)
3601	{
3602	return &ve->base.sched_engine->default_priolist.requests;
3603	}
3604
3605	static void rcu_virtual_context_destroy(struct work_struct *wrk)
3606	{
3607	struct virtual_engine *ve =
3608	container_of(wrk, typeof(*ve), rcu.work);
3609	unsigned int n;
3610
3611	GEM_BUG_ON(ve->context.inflight);
3612
3613	/ Preempt-to-busy may leave a stale request behind. /
3614	if (unlikely(ve->request)) {
3615	struct i915_request *old;
3616
3617	spin_lock_irq(lock: &ve->base.sched_engine->lock);
3618
3619	old = fetch_and_zero(&ve->request);
3620	if (old) {
3621	GEM_BUG_ON(!__i915_request_is_complete(old));
3622	__i915_request_submit(request: old);
3623	i915_request_put(rq: old);
3624	}
3625
3626	spin_unlock_irq(lock: &ve->base.sched_engine->lock);
3627	}
3628
3629	/*
3630	* Flush the tasklet in case it is still running on another core.
3631	*
3632	* This needs to be done before we remove ourselves from the siblings'
3633	* rbtrees as in the case it is running in parallel, it may reinsert
3634	* the rb_node into a sibling.
3635	*/
3636	tasklet_kill(t: &ve->base.sched_engine->tasklet);
3637
3638	/ Decouple ourselves from the siblings, no more access allowed. /
3639	for (n = `0`; n < ve->num_siblings; n++) {
3640	struct intel_engine_cs *sibling = ve->siblings[n];
3641	struct rb_node *node = &ve->nodes[sibling->id].rb;
3642
3643	if (RB_EMPTY_NODE(node))
3644	continue;
3645
3646	spin_lock_irq(lock: &sibling->sched_engine->lock);
3647
3648	/ Detachment is lazily performed in the sched_engine->tasklet /
3649	if (!RB_EMPTY_NODE(node))
3650	rb_erase_cached(node, root: &sibling->execlists.virtual);
3651
3652	spin_unlock_irq(lock: &sibling->sched_engine->lock);
3653	}
3654	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.sched_engine->tasklet));
3655	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
3656
3657	lrc_fini(ce: &ve->context);
3658	intel_context_fini(ce: &ve->context);
3659
3660	if (ve->base.breadcrumbs)
3661	intel_breadcrumbs_put(b: ve->base.breadcrumbs);
3662	if (ve->base.sched_engine)
3663	i915_sched_engine_put(sched_engine: ve->base.sched_engine);
3664	intel_engine_free_request_pool(engine: &ve->base);
3665
3666	kfree(objp: ve);
3667	}
3668
3669	static void virtual_context_destroy(struct kref *kref)
3670	{
3671	struct virtual_engine *ve =
3672	container_of(kref, typeof(*ve), context.ref);
3673
3674	GEM_BUG_ON(!list_empty(&ve->context.signals));
3675
3676	/*
3677	* When destroying the virtual engine, we have to be aware that
3678	* it may still be in use from an hardirq/softirq context causing
3679	* the resubmission of a completed request (background completion
3680	* due to preempt-to-busy). Before we can free the engine, we need
3681	* to flush the submission code and tasklets that are still potentially
3682	* accessing the engine. Flushing the tasklets requires process context,
3683	* and since we can guard the resubmit onto the engine with an RCU read
3684	* lock, we can delegate the free of the engine to an RCU worker.
3685	*/
3686	INIT_RCU_WORK(&ve->rcu, rcu_virtual_context_destroy);
3687	queue_rcu_work(wq: ve->context.engine->i915->unordered_wq, rwork: &ve->rcu);
3688	}
3689
3690	static void virtual_engine_initial_hint(struct virtual_engine *ve)
3691	{
3692	int swp;
3693
3694	/*
3695	* Pick a random sibling on starting to help spread the load around.
3696	*
3697	* New contexts are typically created with exactly the same order
3698	* of siblings, and often started in batches. Due to the way we iterate
3699	* the array of sibling when submitting requests, sibling[0] is
3700	* prioritised for dequeuing. If we make sure that sibling[0] is fairly
3701	* randomised across the system, we also help spread the load by the
3702	* first engine we inspect being different each time.
3703	*
3704	* NB This does not force us to execute on this engine, it will just
3705	* typically be the first we inspect for submission.
3706	*/
3707	swp = get_random_u32_below(ceil: ve->num_siblings);
3708	if (swp)
3709	swap(ve->siblings[swp], ve->siblings[`0`]);
3710	}
3711
3712	static int virtual_context_alloc(struct intel_context *ce)
3713	{
3714	struct virtual_engine ve = container_of(ce, typeof(ve), context);
3715
3716	return lrc_alloc(ce, engine: ve->siblings[`0`]);
3717	}
3718
3719	static int virtual_context_pre_pin(struct intel_context *ce,
3720	struct i915_gem_ww_ctx *ww,
3721	void **vaddr)
3722	{
3723	struct virtual_engine ve = container_of(ce, typeof(ve), context);
3724
3725	/ Note: we must use a real engine class for setting up reg state /
3726	return __execlists_context_pre_pin(ce, engine: ve->siblings[`0`], ww, vaddr);
3727	}
3728
3729	static int virtual_context_pin(struct intel_context ce, void* *vaddr)
3730	{
3731	struct virtual_engine ve = container_of(ce, typeof(ve), context);
3732
3733	return lrc_pin(ce, engine: ve->siblings[`0`], vaddr);
3734	}
3735
3736	static void virtual_context_enter(struct intel_context *ce)
3737	{
3738	struct virtual_engine ve = container_of(ce, typeof(ve), context);
3739	unsigned int n;
3740
3741	for (n = `0`; n < ve->num_siblings; n++)
3742	intel_engine_pm_get(engine: ve->siblings[n]);
3743
3744	intel_timeline_enter(tl: ce->timeline);
3745	}
3746
3747	static void virtual_context_exit(struct intel_context *ce)
3748	{
3749	struct virtual_engine ve = container_of(ce, typeof(ve), context);
3750	unsigned int n;
3751
3752	intel_timeline_exit(tl: ce->timeline);
3753
3754	for (n = `0`; n < ve->num_siblings; n++)
3755	intel_engine_pm_put(engine: ve->siblings[n]);
3756	}
3757
3758	static struct intel_engine_cs *
3759	virtual_get_sibling(struct intel_engine_cs engine, unsigned* int sibling)
3760	{
3761	struct virtual_engine *ve = to_virtual_engine(engine);
3762
3763	if (sibling >= ve->num_siblings)
3764	return NULL;
3765
3766	return ve->siblings[sibling];
3767	}
3768
3769	static const struct intel_context_ops virtual_context_ops = {
3770	.flags = COPS_HAS_INFLIGHT \| COPS_RUNTIME_CYCLES,
3771
3772	.alloc = virtual_context_alloc,
3773
3774	.cancel_request = execlists_context_cancel_request,
3775
3776	.pre_pin = virtual_context_pre_pin,
3777	.pin = virtual_context_pin,
3778	.unpin = lrc_unpin,
3779	.post_unpin = lrc_post_unpin,
3780
3781	.enter = virtual_context_enter,
3782	.exit = virtual_context_exit,
3783
3784	.destroy = virtual_context_destroy,
3785
3786	.get_sibling = virtual_get_sibling,
3787	};
3788
3789	static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
3790	{
3791	struct i915_request *rq;
3792	intel_engine_mask_t mask;
3793
3794	rq = READ_ONCE(ve->request);
3795	if (!rq)
3796	return `0`;
3797
3798	/ The rq is ready for submission; rq->execution_mask is now stable. /
3799	mask = rq->execution_mask;
3800	if (unlikely(!mask)) {
3801	/ Invalid selection, submit to a random engine in error /
3802	i915_request_set_error_once(rq, error: -ENODEV);
3803	mask = ve->siblings[`0`]->mask;
3804	}
3805
3806	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
3807	rq->fence.context, rq->fence.seqno,
3808	mask, ve->base.sched_engine->queue_priority_hint);
3809
3810	return mask;
3811	}
3812
3813	static void virtual_submission_tasklet(struct tasklet_struct *t)
3814	{
3815	struct i915_sched_engine *sched_engine =
3816	from_tasklet(sched_engine, t, tasklet);
3817	struct virtual_engine * const ve =
3818	(struct virtual_engine *)sched_engine->private_data;
3819	const int prio = READ_ONCE(sched_engine->queue_priority_hint);
3820	intel_engine_mask_t mask;
3821	unsigned int n;
3822
3823	rcu_read_lock();
3824	mask = virtual_submission_mask(ve);
3825	rcu_read_unlock();
3826	if (unlikely(!mask))
3827	return;
3828
3829	for (n = `0`; n < ve->num_siblings; n++) {
3830	struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
3831	struct ve_node * const node = &ve->nodes[sibling->id];
3832	struct rb_node *parent, rb;
3833	bool first;
3834
3835	if (!READ_ONCE(ve->request))
3836	break; / already handled by a sibling's tasklet /
3837
3838	spin_lock_irq(lock: &sibling->sched_engine->lock);
3839
3840	if (unlikely(!(mask & sibling->mask))) {
3841	if (!RB_EMPTY_NODE(&node->rb)) {
3842	rb_erase_cached(node: &node->rb,
3843	root: &sibling->execlists.virtual);
3844	RB_CLEAR_NODE(&node->rb);
3845	}
3846
3847	goto unlock_engine;
3848	}
3849
3850	if (unlikely(!RB_EMPTY_NODE(&node->rb))) {
3851	/*
3852	* Cheat and avoid rebalancing the tree if we can
3853	* reuse this node in situ.
3854	*/
3855	first = rb_first_cached(&sibling->execlists.virtual) ==
3856	&node->rb;
3857	if (prio == node->prio \|\| (prio > node->prio && first))
3858	goto submit_engine;
3859
3860	rb_erase_cached(node: &node->rb, root: &sibling->execlists.virtual);
3861	}
3862
3863	rb = NULL;
3864	first = true;
3865	parent = &sibling->execlists.virtual.rb_root.rb_node;
3866	while (*parent) {
3867	struct ve_node *other;
3868
3869	rb = *parent;
3870	other = rb_entry(rb, typeof(*other), rb);
3871	if (prio > other->prio) {
3872	parent = &rb->rb_left;
3873	} else {
3874	parent = &rb->rb_right;
3875	first = false;
3876	}
3877	}
3878
3879	rb_link_node(node: &node->rb, parent: rb, rb_link: parent);
3880	rb_insert_color_cached(node: &node->rb,
3881	root: &sibling->execlists.virtual,
3882	leftmost: first);
3883
3884	submit_engine:
3885	GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
3886	node->prio = prio;
3887	if (first && prio > sibling->sched_engine->queue_priority_hint)
3888	tasklet_hi_schedule(t: &sibling->sched_engine->tasklet);
3889
3890	unlock_engine:
3891	spin_unlock_irq(lock: &sibling->sched_engine->lock);
3892
3893	if (intel_context_inflight(&ve->context))
3894	break;
3895	}
3896	}
3897
3898	static void virtual_submit_request(struct i915_request *rq)
3899	{
3900	struct virtual_engine *ve = to_virtual_engine(engine: rq->engine);
3901	unsigned long flags;
3902
3903	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
3904	rq->fence.context,
3905	rq->fence.seqno);
3906
3907	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
3908
3909	spin_lock_irqsave(&ve->base.sched_engine->lock, flags);
3910
3911	/ By the time we resubmit a request, it may be completed /
3912	if (__i915_request_is_complete(rq)) {
3913	__i915_request_submit(request: rq);
3914	goto unlock;
3915	}
3916
3917	if (ve->request) { / background completion from preempt-to-busy /
3918	GEM_BUG_ON(!__i915_request_is_complete(ve->request));
3919	__i915_request_submit(request: ve->request);
3920	i915_request_put(rq: ve->request);
3921	}
3922
3923	ve->base.sched_engine->queue_priority_hint = rq_prio(rq);
3924	ve->request = i915_request_get(rq);
3925
3926	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
3927	list_move_tail(list: &rq->sched.link, head: virtual_queue(ve));
3928
3929	tasklet_hi_schedule(t: &ve->base.sched_engine->tasklet);
3930
3931	unlock:
3932	spin_unlock_irqrestore(lock: &ve->base.sched_engine->lock, flags);
3933	}
3934
3935	static struct intel_context *
3936	execlists_create_virtual(struct intel_engine_cs *siblings, unsigned* int count,
3937	unsigned long flags)
3938	{
3939	struct drm_i915_private *i915 = siblings[`0`]->i915;
3940	struct virtual_engine *ve;
3941	unsigned int n;
3942	int err;
3943
3944	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
3945	if (!ve)
3946	return ERR_PTR(error: -ENOMEM);
3947
3948	ve->base.i915 = i915;
3949	ve->base.gt = siblings[`0`]->gt;
3950	ve->base.uncore = siblings[`0`]->uncore;
3951	ve->base.id = -`1`;
3952
3953	ve->base.class = OTHER_CLASS;
3954	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
3955	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
3956	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
3957
3958	/*
3959	* The decision on whether to submit a request using semaphores
3960	* depends on the saturated state of the engine. We only compute
3961	* this during HW submission of the request, and we need for this
3962	* state to be globally applied to all requests being submitted
3963	* to this engine. Virtual engines encompass more than one physical
3964	* engine and so we cannot accurately tell in advance if one of those
3965	* engines is already saturated and so cannot afford to use a semaphore
3966	* and be pessimized in priority for doing so -- if we are the only
3967	* context using semaphores after all other clients have stopped, we
3968	* will be starved on the saturated system. Such a global switch for
3969	* semaphores is less than ideal, but alas is the current compromise.
3970	*/
3971	ve->base.saturated = ALL_ENGINES;
3972
3973	snprintf(buf: ve->base.name, size: sizeof(ve->base.name), fmt: "virtual");
3974
3975	intel_engine_init_execlists(engine: &ve->base);
3976
3977	ve->base.sched_engine = i915_sched_engine_create(ENGINE_VIRTUAL);
3978	if (!ve->base.sched_engine) {
3979	err = -ENOMEM;
3980	goto err_put;
3981	}
3982	ve->base.sched_engine->private_data = &ve->base;
3983
3984	ve->base.cops = &virtual_context_ops;
3985	ve->base.request_alloc = execlists_request_alloc;
3986
3987	ve->base.sched_engine->schedule = i915_schedule;
3988	ve->base.sched_engine->kick_backend = kick_execlists;
3989	ve->base.submit_request = virtual_submit_request;
3990
3991	INIT_LIST_HEAD(list: virtual_queue(ve));
3992	tasklet_setup(t: &ve->base.sched_engine->tasklet, callback: virtual_submission_tasklet);
3993
3994	intel_context_init(ce: &ve->context, engine: &ve->base);
3995
3996	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
3997	if (!ve->base.breadcrumbs) {
3998	err = -ENOMEM;
3999	goto err_put;
4000	}
4001
4002	for (n = `0`; n < count; n++) {
4003	struct intel_engine_cs *sibling = siblings[n];
4004
4005	GEM_BUG_ON(!is_power_of_2(sibling->mask));
4006	if (sibling->mask & ve->base.mask) {
4007	drm_dbg(&i915->drm,
4008	"duplicate %s entry in load balancer\n",
4009	sibling->name);
4010	err = -EINVAL;
4011	goto err_put;
4012	}
4013
4014	/*
4015	* The virtual engine implementation is tightly coupled to
4016	* the execlists backend -- we push out request directly
4017	* into a tree inside each physical engine. We could support
4018	* layering if we handle cloning of the requests and
4019	* submitting a copy into each backend.
4020	*/
4021	if (sibling->sched_engine->tasklet.callback !=
4022	execlists_submission_tasklet) {
4023	err = -ENODEV;
4024	goto err_put;
4025	}
4026
4027	GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
4028	RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
4029
4030	ve->siblings[ve->num_siblings++] = sibling;
4031	ve->base.mask \|= sibling->mask;
4032	ve->base.logical_mask \|= sibling->logical_mask;
4033
4034	/*
4035	* All physical engines must be compatible for their emission
4036	* functions (as we build the instructions during request
4037	* construction and do not alter them before submission
4038	* on the physical engine). We use the engine class as a guide
4039	* here, although that could be refined.
4040	*/
4041	if (ve->base.class != OTHER_CLASS) {
4042	if (ve->base.class != sibling->class) {
4043	drm_dbg(&i915->drm,
4044	"invalid mixing of engine class, sibling %d, already %d\n",
4045	sibling->class, ve->base.class);
4046	err = -EINVAL;
4047	goto err_put;
4048	}
4049	continue;
4050	}
4051
4052	ve->base.class = sibling->class;
4053	ve->base.uabi_class = sibling->uabi_class;
4054	snprintf(buf: ve->base.name, size: sizeof(ve->base.name),
4055	fmt: "v%dx%d", ve->base.class, count);
4056	ve->base.context_size = sibling->context_size;
4057
4058	ve->base.add_active_request = sibling->add_active_request;
4059	ve->base.remove_active_request = sibling->remove_active_request;
4060	ve->base.emit_bb_start = sibling->emit_bb_start;
4061	ve->base.emit_flush = sibling->emit_flush;
4062	ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
4063	ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
4064	ve->base.emit_fini_breadcrumb_dw =
4065	sibling->emit_fini_breadcrumb_dw;
4066
4067	ve->base.flags = sibling->flags;
4068	}
4069
4070	ve->base.flags \|= I915_ENGINE_IS_VIRTUAL;
4071
4072	virtual_engine_initial_hint(ve);
4073	return &ve->context;
4074
4075	err_put:
4076	intel_context_put(ce: &ve->context);
4077	return ERR_PTR(error: err);
4078	}
4079
4080	void intel_execlists_show_requests(struct intel_engine_cs *engine,
4081	struct drm_printer *m,
4082	void (show_request)(struct* drm_printer *m,
4083	const struct i915_request *rq,
4084	const char *prefix,
4085	int indent),
4086	unsigned int max)
4087	{
4088	const struct intel_engine_execlists *execlists = &engine->execlists;
4089	struct i915_sched_engine *sched_engine = engine->sched_engine;
4090	struct i915_request rq, last;
4091	unsigned long flags;
4092	unsigned int count;
4093	struct rb_node *rb;
4094
4095	spin_lock_irqsave(&sched_engine->lock, flags);
4096
4097	last = NULL;
4098	count = `0`;
4099	list_for_each_entry(rq, &sched_engine->requests, sched.link) {
4100	if (count++ < max - `1`)
4101	show_request(m, rq, "\t\t", `0`);
4102	else
4103	last = rq;
4104	}
4105	if (last) {
4106	if (count > max) {
4107	drm_printf(p: m,
4108	f: "\t\t...skipping %d executing requests...\n",
4109	count - max);
4110	}
4111	show_request(m, last, "\t\t", `0`);
4112	}
4113
4114	if (sched_engine->queue_priority_hint != INT_MIN)
4115	drm_printf(p: m, f: "\t\tQueue priority hint: %d\n",
4116	READ_ONCE(sched_engine->queue_priority_hint));
4117
4118	last = NULL;
4119	count = `0`;
4120	for (rb = rb_first_cached(&sched_engine->queue); rb; rb = rb_next(rb)) {
4121	struct i915_priolist p = rb_entry(rb, typeof(p), node);
4122
4123	priolist_for_each_request(rq, p) {
4124	if (count++ < max - `1`)
4125	show_request(m, rq, "\t\t", `0`);
4126	else
4127	last = rq;
4128	}
4129	}
4130	if (last) {
4131	if (count > max) {
4132	drm_printf(p: m,
4133	f: "\t\t...skipping %d queued requests...\n",
4134	count - max);
4135	}
4136	show_request(m, last, "\t\t", `0`);
4137	}
4138
4139	last = NULL;
4140	count = `0`;
4141	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
4142	struct virtual_engine *ve =
4143	rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4144	struct i915_request *rq = READ_ONCE(ve->request);
4145
4146	if (rq) {
4147	if (count++ < max - `1`)
4148	show_request(m, rq, "\t\t", `0`);
4149	else
4150	last = rq;
4151	}
4152	}
4153	if (last) {
4154	if (count > max) {
4155	drm_printf(p: m,
4156	f: "\t\t...skipping %d virtual requests...\n",
4157	count - max);
4158	}
4159	show_request(m, last, "\t\t", `0`);
4160	}
4161
4162	spin_unlock_irqrestore(lock: &sched_engine->lock, flags);
4163	}
4164
4165	void intel_execlists_dump_active_requests(struct intel_engine_cs *engine,
4166	struct i915_request *hung_rq,
4167	struct drm_printer *m)
4168	{
4169	unsigned long flags;
4170
4171	spin_lock_irqsave(&engine->sched_engine->lock, flags);
4172
4173	intel_engine_dump_active_requests(requests: &engine->sched_engine->requests, hung_rq, m);
4174
4175	drm_printf(p: m, f: "\tOn hold?: %zu\n",
4176	list_count_nodes(head: &engine->sched_engine->hold));
4177
4178	spin_unlock_irqrestore(lock: &engine->sched_engine->lock, flags);
4179	}
4180
4181	#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
4182	#include "selftest_execlists.c"
4183	#endif
4184

source code of linux/drivers/gpu/drm/i915/gt/intel_execlists_submission.c