gen8_engine_cs.c source code [linux/drivers/gpu/drm/i915/gt/gen8_engine_cs.c]

1	// SPDX-License-Identifier: MIT
2	/*
3	* Copyright © 2014 Intel Corporation
4	*/
5
6	#include "gen8_engine_cs.h"
7	#include "intel_engine_regs.h"
8	#include "intel_gpu_commands.h"
9	#include "intel_gt.h"
10	#include "intel_lrc.h"
11	#include "intel_ring.h"
12
13	int gen8_emit_flush_rcs(struct i915_request *rq, u32 mode)
14	{
15	bool vf_flush_wa = false, dc_flush_wa = false;
16	u32 *cs, flags = `0`;
17	int len;
18
19	flags \|= PIPE_CONTROL_CS_STALL;
20
21	if (mode & EMIT_FLUSH) {
22	flags \|= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
23	flags \|= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
24	flags \|= PIPE_CONTROL_DC_FLUSH_ENABLE;
25	flags \|= PIPE_CONTROL_FLUSH_ENABLE;
26	}
27
28	if (mode & EMIT_INVALIDATE) {
29	flags \|= PIPE_CONTROL_TLB_INVALIDATE;
30	flags \|= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
31	flags \|= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
32	flags \|= PIPE_CONTROL_VF_CACHE_INVALIDATE;
33	flags \|= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
34	flags \|= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
35	flags \|= PIPE_CONTROL_QW_WRITE;
36	flags \|= PIPE_CONTROL_STORE_DATA_INDEX;
37
38	/*
39	* On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
40	* pipe control.
41	*/
42	if (GRAPHICS_VER(rq->i915) == `9`)
43	vf_flush_wa = true;
44
45	/ WaForGAMHang:kbl /
46	if (IS_KABYLAKE(rq->i915) && IS_GRAPHICS_STEP(rq->i915, `0`, STEP_C0))
47	dc_flush_wa = true;
48	}
49
50	len = `6`;
51
52	if (vf_flush_wa)
53	len += `6`;
54
55	if (dc_flush_wa)
56	len += `12`;
57
58	cs = intel_ring_begin(rq, num_dwords: len);
59	if (IS_ERR(ptr: cs))
60	return PTR_ERR(ptr: cs);
61
62	if (vf_flush_wa)
63	cs = gen8_emit_pipe_control(batch: cs, bit_group_1: `0`, offset: `0`);
64
65	if (dc_flush_wa)
66	cs = gen8_emit_pipe_control(batch: cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
67	offset: `0`);
68
69	cs = gen8_emit_pipe_control(batch: cs, bit_group_1: flags, LRC_PPHWSP_SCRATCH_ADDR);
70
71	if (dc_flush_wa)
72	cs = gen8_emit_pipe_control(batch: cs, PIPE_CONTROL_CS_STALL, offset: `0`);
73
74	intel_ring_advance(rq, cs);
75
76	return `0`;
77	}
78
79	int gen8_emit_flush_xcs(struct i915_request *rq, u32 mode)
80	{
81	u32 cmd, *cs;
82
83	cs = intel_ring_begin(rq, num_dwords: `4`);
84	if (IS_ERR(ptr: cs))
85	return PTR_ERR(ptr: cs);
86
87	cmd = MI_FLUSH_DW + `1`;
88
89	/*
90	* We always require a command barrier so that subsequent
91	* commands, such as breadcrumb interrupts, are strictly ordered
92	* wrt the contents of the write cache being flushed to memory
93	* (and thus being coherent from the CPU).
94	*/
95	cmd \|= MI_FLUSH_DW_STORE_INDEX \| MI_FLUSH_DW_OP_STOREDW;
96
97	if (mode & EMIT_INVALIDATE) {
98	cmd \|= MI_INVALIDATE_TLB;
99	if (rq->engine->class == VIDEO_DECODE_CLASS)
100	cmd \|= MI_INVALIDATE_BSD;
101	}
102
103	*cs++ = cmd;
104	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
105	cs++ = `0`; /* upper addr /
106	cs++ = `0`; /* value /
107	intel_ring_advance(rq, cs);
108
109	return `0`;
110	}
111
112	int gen11_emit_flush_rcs(struct i915_request *rq, u32 mode)
113	{
114	if (mode & EMIT_FLUSH) {
115	u32 *cs;
116	u32 flags = `0`;
117
118	flags \|= PIPE_CONTROL_CS_STALL;
119
120	flags \|= PIPE_CONTROL_TILE_CACHE_FLUSH;
121	flags \|= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
122	flags \|= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
123	flags \|= PIPE_CONTROL_DC_FLUSH_ENABLE;
124	flags \|= PIPE_CONTROL_FLUSH_ENABLE;
125	flags \|= PIPE_CONTROL_QW_WRITE;
126	flags \|= PIPE_CONTROL_STORE_DATA_INDEX;
127
128	cs = intel_ring_begin(rq, num_dwords: `6`);
129	if (IS_ERR(ptr: cs))
130	return PTR_ERR(ptr: cs);
131
132	cs = gen8_emit_pipe_control(batch: cs, bit_group_1: flags, LRC_PPHWSP_SCRATCH_ADDR);
133	intel_ring_advance(rq, cs);
134	}
135
136	if (mode & EMIT_INVALIDATE) {
137	u32 *cs;
138	u32 flags = `0`;
139
140	flags \|= PIPE_CONTROL_CS_STALL;
141
142	flags \|= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
143	flags \|= PIPE_CONTROL_TLB_INVALIDATE;
144	flags \|= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
145	flags \|= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
146	flags \|= PIPE_CONTROL_VF_CACHE_INVALIDATE;
147	flags \|= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
148	flags \|= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
149	flags \|= PIPE_CONTROL_QW_WRITE;
150	flags \|= PIPE_CONTROL_STORE_DATA_INDEX;
151
152	cs = intel_ring_begin(rq, num_dwords: `6`);
153	if (IS_ERR(ptr: cs))
154	return PTR_ERR(ptr: cs);
155
156	cs = gen8_emit_pipe_control(batch: cs, bit_group_1: flags, LRC_PPHWSP_SCRATCH_ADDR);
157	intel_ring_advance(rq, cs);
158	}
159
160	return `0`;
161	}
162
163	static u32 preparser_disable(bool state)
164	{
165	return MI_ARB_CHECK \| `1` << `8` \| state;
166	}
167
168	static i915_reg_t gen12_get_aux_inv_reg(struct intel_engine_cs *engine)
169	{
170	switch (engine->id) {
171	case RCS0:
172	return GEN12_CCS_AUX_INV;
173	case BCS0:
174	return GEN12_BCS0_AUX_INV;
175	case VCS0:
176	return GEN12_VD0_AUX_INV;
177	case VCS2:
178	return GEN12_VD2_AUX_INV;
179	case VECS0:
180	return GEN12_VE0_AUX_INV;
181	case CCS0:
182	return GEN12_CCS0_AUX_INV;
183	default:
184	return INVALID_MMIO_REG;
185	}
186	}
187
188	static bool gen12_needs_ccs_aux_inv(struct intel_engine_cs *engine)
189	{
190	i915_reg_t reg = gen12_get_aux_inv_reg(engine);
191
192	if (IS_PONTEVECCHIO(engine->i915))
193	return false;
194
195	/*
196	* So far platforms supported by i915 having flat ccs do not require
197	* AUX invalidation. Check also whether the engine requires it.
198	*/
199	return i915_mmio_reg_valid(reg) && !HAS_FLAT_CCS(engine->i915);
200	}
201
202	u32 gen12_emit_aux_table_inv(struct* intel_engine_cs engine, u32 cs)
203	{
204	i915_reg_t inv_reg = gen12_get_aux_inv_reg(engine);
205	u32 gsi_offset = engine->gt->uncore->gsi_offset;
206
207	if (!gen12_needs_ccs_aux_inv(engine))
208	return cs;
209
210	*cs++ = MI_LOAD_REGISTER_IMM(`1`) \| MI_LRI_MMIO_REMAP_EN;
211	*cs++ = i915_mmio_reg_offset(inv_reg) + gsi_offset;
212	*cs++ = AUX_INV;
213
214	*cs++ = MI_SEMAPHORE_WAIT_TOKEN \|
215	MI_SEMAPHORE_REGISTER_POLL \|
216	MI_SEMAPHORE_POLL \|
217	MI_SEMAPHORE_SAD_EQ_SDD;
218	*cs++ = `0`;
219	*cs++ = i915_mmio_reg_offset(inv_reg) + gsi_offset;
220	*cs++ = `0`;
221	*cs++ = `0`;
222
223	return cs;
224	}
225
226	static int mtl_dummy_pipe_control(struct i915_request *rq)
227	{
228	/ Wa_14016712196 /
229	if (IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(`12`, `70`), IP_VER(`12`, `74`)) \|\|
230	IS_DG2(rq->i915)) {
231	u32 *cs;
232
233	/ dummy PIPE_CONTROL + depth flush /
234	cs = intel_ring_begin(rq, num_dwords: `6`);
235	if (IS_ERR(ptr: cs))
236	return PTR_ERR(ptr: cs);
237	cs = gen12_emit_pipe_control(batch: cs,
238	bit_group_0: `0`,
239	PIPE_CONTROL_DEPTH_CACHE_FLUSH,
240	LRC_PPHWSP_SCRATCH_ADDR);
241	intel_ring_advance(rq, cs);
242	}
243
244	return `0`;
245	}
246
247	int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
248	{
249	struct intel_engine_cs *engine = rq->engine;
250
251	/*
252	* On Aux CCS platforms the invalidation of the Aux
253	* table requires quiescing memory traffic beforehand
254	*/
255	if (mode & EMIT_FLUSH \|\| gen12_needs_ccs_aux_inv(engine)) {
256	u32 bit_group_0 = `0`;
257	u32 bit_group_1 = `0`;
258	int err;
259	u32 *cs;
260
261	err = mtl_dummy_pipe_control(rq);
262	if (err)
263	return err;
264
265	bit_group_0 \|= PIPE_CONTROL0_HDC_PIPELINE_FLUSH;
266
267	/*
268	* When required, in MTL and beyond platforms we
269	* need to set the CCS_FLUSH bit in the pipe control
270	*/
271	if (GRAPHICS_VER_FULL(rq->i915) >= IP_VER(`12`, `70`))
272	bit_group_0 \|= PIPE_CONTROL_CCS_FLUSH;
273
274	/*
275	* L3 fabric flush is needed for AUX CCS invalidation
276	* which happens as part of pipe-control so we can
277	* ignore PIPE_CONTROL_FLUSH_L3. Also PIPE_CONTROL_FLUSH_L3
278	* deals with Protected Memory which is not needed for
279	* AUX CCS invalidation and lead to unwanted side effects.
280	*/
281	if ((mode & EMIT_FLUSH) &&
282	GRAPHICS_VER_FULL(rq->i915) < IP_VER(`12`, `70`))
283	bit_group_1 \|= PIPE_CONTROL_FLUSH_L3;
284
285	bit_group_1 \|= PIPE_CONTROL_TILE_CACHE_FLUSH;
286	bit_group_1 \|= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
287	bit_group_1 \|= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
288	/ Wa_1409600907:tgl,adl-p /
289	bit_group_1 \|= PIPE_CONTROL_DEPTH_STALL;
290	bit_group_1 \|= PIPE_CONTROL_DC_FLUSH_ENABLE;
291	bit_group_1 \|= PIPE_CONTROL_FLUSH_ENABLE;
292
293	bit_group_1 \|= PIPE_CONTROL_STORE_DATA_INDEX;
294	bit_group_1 \|= PIPE_CONTROL_QW_WRITE;
295
296	bit_group_1 \|= PIPE_CONTROL_CS_STALL;
297
298	if (!HAS_3D_PIPELINE(engine->i915))
299	bit_group_1 &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
300	else if (engine->class == COMPUTE_CLASS)
301	bit_group_1 &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;
302
303	cs = intel_ring_begin(rq, num_dwords: `6`);
304	if (IS_ERR(ptr: cs))
305	return PTR_ERR(ptr: cs);
306
307	cs = gen12_emit_pipe_control(batch: cs, bit_group_0, bit_group_1,
308	LRC_PPHWSP_SCRATCH_ADDR);
309	intel_ring_advance(rq, cs);
310	}
311
312	if (mode & EMIT_INVALIDATE) {
313	u32 flags = `0`;
314	u32 *cs, count;
315	int err;
316
317	err = mtl_dummy_pipe_control(rq);
318	if (err)
319	return err;
320
321	flags \|= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
322	flags \|= PIPE_CONTROL_TLB_INVALIDATE;
323	flags \|= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
324	flags \|= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
325	flags \|= PIPE_CONTROL_VF_CACHE_INVALIDATE;
326	flags \|= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
327	flags \|= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
328
329	flags \|= PIPE_CONTROL_STORE_DATA_INDEX;
330	flags \|= PIPE_CONTROL_QW_WRITE;
331
332	flags \|= PIPE_CONTROL_CS_STALL;
333
334	if (!HAS_3D_PIPELINE(engine->i915))
335	flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
336	else if (engine->class == COMPUTE_CLASS)
337	flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;
338
339	count = `8`;
340	if (gen12_needs_ccs_aux_inv(engine: rq->engine))
341	count += `8`;
342
343	cs = intel_ring_begin(rq, num_dwords: count);
344	if (IS_ERR(ptr: cs))
345	return PTR_ERR(ptr: cs);
346
347	/*
348	* Prevent the pre-parser from skipping past the TLB
349	* invalidate and loading a stale page for the batch
350	* buffer / request payload.
351	*/
352	*cs++ = preparser_disable(state: true);
353
354	cs = gen8_emit_pipe_control(batch: cs, bit_group_1: flags, LRC_PPHWSP_SCRATCH_ADDR);
355
356	cs = gen12_emit_aux_table_inv(engine, cs);
357
358	*cs++ = preparser_disable(state: false);
359	intel_ring_advance(rq, cs);
360	}
361
362	return `0`;
363	}
364
365	int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
366	{
367	u32 cmd = `4`;
368	u32 *cs;
369
370	if (mode & EMIT_INVALIDATE) {
371	cmd += `2`;
372
373	if (gen12_needs_ccs_aux_inv(engine: rq->engine))
374	cmd += `8`;
375	}
376
377	cs = intel_ring_begin(rq, num_dwords: cmd);
378	if (IS_ERR(ptr: cs))
379	return PTR_ERR(ptr: cs);
380
381	if (mode & EMIT_INVALIDATE)
382	*cs++ = preparser_disable(state: true);
383
384	cmd = MI_FLUSH_DW + `1`;
385
386	/*
387	* We always require a command barrier so that subsequent
388	* commands, such as breadcrumb interrupts, are strictly ordered
389	* wrt the contents of the write cache being flushed to memory
390	* (and thus being coherent from the CPU).
391	*/
392	cmd \|= MI_FLUSH_DW_STORE_INDEX \| MI_FLUSH_DW_OP_STOREDW;
393
394	if (mode & EMIT_INVALIDATE) {
395	cmd \|= MI_INVALIDATE_TLB;
396	if (rq->engine->class == VIDEO_DECODE_CLASS)
397	cmd \|= MI_INVALIDATE_BSD;
398
399	if (gen12_needs_ccs_aux_inv(engine: rq->engine) &&
400	rq->engine->class == COPY_ENGINE_CLASS)
401	cmd \|= MI_FLUSH_DW_CCS;
402	}
403
404	*cs++ = cmd;
405	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
406	cs++ = `0`; /* upper addr /
407	cs++ = `0`; /* value /
408
409	cs = gen12_emit_aux_table_inv(engine: rq->engine, cs);
410
411	if (mode & EMIT_INVALIDATE)
412	*cs++ = preparser_disable(state: false);
413
414	intel_ring_advance(rq, cs);
415
416	return `0`;
417	}
418
419	static u32 preempt_address(struct intel_engine_cs *engine)
420	{
421	return (i915_ggtt_offset(vma: engine->status_page.vma) +
422	I915_GEM_HWS_PREEMPT_ADDR);
423	}
424
425	static u32 hwsp_offset(const struct i915_request *rq)
426	{
427	const struct intel_timeline *tl;
428
429	/ Before the request is executed, the timeline is fixed /
430	tl = rcu_dereference_protected(rq->timeline,
431	!i915_request_signaled(rq));
432
433	/ See the comment in i915_request_active_seqno(). /
434	return page_mask_bits(tl->hwsp_offset) + offset_in_page(rq->hwsp_seqno);
435	}
436
437	int gen8_emit_init_breadcrumb(struct i915_request *rq)
438	{
439	u32 *cs;
440
441	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
442	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
443	return `0`;
444
445	cs = intel_ring_begin(rq, num_dwords: `6`);
446	if (IS_ERR(ptr: cs))
447	return PTR_ERR(ptr: cs);
448
449	*cs++ = MI_STORE_DWORD_IMM_GEN4 \| MI_USE_GGTT;
450	*cs++ = hwsp_offset(rq);
451	*cs++ = `0`;
452	*cs++ = rq->fence.seqno - `1`;
453
454	/*
455	* Check if we have been preempted before we even get started.
456	*
457	* After this point i915_request_started() reports true, even if
458	* we get preempted and so are no longer running.
459	*
460	* i915_request_started() is used during preemption processing
461	* to decide if the request is currently inside the user payload
462	* or spinning on a kernel semaphore (or earlier). For no-preemption
463	* requests, we do allow preemption on the semaphore before the user
464	* payload, but do not allow preemption once the request is started.
465	*
466	* i915_request_started() is similarly used during GPU hangs to
467	* determine if the user's payload was guilty, and if so, the
468	* request is banned. Before the request is started, it is assumed
469	* to be unharmed and an innocent victim of another's hang.
470	*/
471	*cs++ = MI_NOOP;
472	*cs++ = MI_ARB_CHECK;
473
474	intel_ring_advance(rq, cs);
475
476	/ Record the updated position of the request's payload /
477	rq->infix = intel_ring_offset(rq, addr: cs);
478
479	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
480
481	return `0`;
482	}
483
484	static int __xehp_emit_bb_start(struct i915_request *rq,
485	u64 offset, u32 len,
486	const unsigned int flags,
487	u32 arb)
488	{
489	struct intel_context *ce = rq->context;
490	u32 wa_offset = lrc_indirect_bb(ce);
491	u32 *cs;
492
493	GEM_BUG_ON(!ce->wa_bb_page);
494
495	cs = intel_ring_begin(rq, num_dwords: `12`);
496	if (IS_ERR(ptr: cs))
497	return PTR_ERR(ptr: cs);
498
499	*cs++ = MI_ARB_ON_OFF \| arb;
500
501	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 \|
502	MI_SRM_LRM_GLOBAL_GTT \|
503	MI_LRI_LRM_CS_MMIO;
504	*cs++ = i915_mmio_reg_offset(RING_PREDICATE_RESULT(`0`));
505	*cs++ = wa_offset + DG2_PREDICATE_RESULT_WA;
506	*cs++ = `0`;
507
508	*cs++ = MI_BATCH_BUFFER_START_GEN8 \|
509	(flags & I915_DISPATCH_SECURE ? `0` : BIT(`8`));
510	*cs++ = lower_32_bits(offset);
511	*cs++ = upper_32_bits(offset);
512
513	/ Fixup stray MI_SET_PREDICATE as it prevents us executing the ring /
514	*cs++ = MI_BATCH_BUFFER_START_GEN8;
515	*cs++ = wa_offset + DG2_PREDICATE_RESULT_BB;
516	*cs++ = `0`;
517
518	*cs++ = MI_ARB_ON_OFF \| MI_ARB_DISABLE;
519
520	intel_ring_advance(rq, cs);
521
522	return `0`;
523	}
524
525	int xehp_emit_bb_start_noarb(struct i915_request *rq,
526	u64 offset, u32 len,
527	const unsigned int flags)
528	{
529	return __xehp_emit_bb_start(rq, offset, len, flags, MI_ARB_DISABLE);
530	}
531
532	int xehp_emit_bb_start(struct i915_request *rq,
533	u64 offset, u32 len,
534	const unsigned int flags)
535	{
536	return __xehp_emit_bb_start(rq, offset, len, flags, MI_ARB_ENABLE);
537	}
538
539	int gen8_emit_bb_start_noarb(struct i915_request *rq,
540	u64 offset, u32 len,
541	const unsigned int flags)
542	{
543	u32 *cs;
544
545	cs = intel_ring_begin(rq, num_dwords: `4`);
546	if (IS_ERR(ptr: cs))
547	return PTR_ERR(ptr: cs);
548
549	/*
550	* WaDisableCtxRestoreArbitration:bdw,chv
551	*
552	* We don't need to perform MI_ARB_ENABLE as often as we do (in
553	* particular all the gen that do not need the w/a at all!), if we
554	* took care to make sure that on every switch into this context
555	* (both ordinary and for preemption) that arbitrartion was enabled
556	* we would be fine. However, for gen8 there is another w/a that
557	* requires us to not preempt inside GPGPU execution, so we keep
558	* arbitration disabled for gen8 batches. Arbitration will be
559	* re-enabled before we close the request
560	* (engine->emit_fini_breadcrumb).
561	*/
562	*cs++ = MI_ARB_ON_OFF \| MI_ARB_DISABLE;
563
564	/ FIXME(BDW+): Address space and security selectors. /
565	*cs++ = MI_BATCH_BUFFER_START_GEN8 \|
566	(flags & I915_DISPATCH_SECURE ? `0` : BIT(`8`));
567	*cs++ = lower_32_bits(offset);
568	*cs++ = upper_32_bits(offset);
569
570	intel_ring_advance(rq, cs);
571
572	return `0`;
573	}
574
575	int gen8_emit_bb_start(struct i915_request *rq,
576	u64 offset, u32 len,
577	const unsigned int flags)
578	{
579	u32 *cs;
580
581	if (unlikely(i915_request_has_nopreempt(rq)))
582	return gen8_emit_bb_start_noarb(rq, offset, len, flags);
583
584	cs = intel_ring_begin(rq, num_dwords: `6`);
585	if (IS_ERR(ptr: cs))
586	return PTR_ERR(ptr: cs);
587
588	*cs++ = MI_ARB_ON_OFF \| MI_ARB_ENABLE;
589
590	*cs++ = MI_BATCH_BUFFER_START_GEN8 \|
591	(flags & I915_DISPATCH_SECURE ? `0` : BIT(`8`));
592	*cs++ = lower_32_bits(offset);
593	*cs++ = upper_32_bits(offset);
594
595	*cs++ = MI_ARB_ON_OFF \| MI_ARB_DISABLE;
596	*cs++ = MI_NOOP;
597
598	intel_ring_advance(rq, cs);
599
600	return `0`;
601	}
602
603	static void assert_request_valid(struct i915_request *rq)
604	{
605	struct intel_ring *ring __maybe_unused = rq->ring;
606
607	/ Can we unwind this request without appearing to go forwards? /
608	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= `0`);
609	}
610
611	/*
612	* Reserve space for 2 NOOPs at the end of each request to be
613	* used as a workaround for not being allowed to do lite
614	* restore with HEAD==TAIL (WaIdleLiteRestore).
615	*/
616	static u32 gen8_emit_wa_tail(struct* i915_request rq, u32 cs)
617	{
618	/ Ensure there's always at least one preemption point per-request. /
619	*cs++ = MI_ARB_CHECK;
620	*cs++ = MI_NOOP;
621	rq->wa_tail = intel_ring_offset(rq, addr: cs);
622
623	/ Check that entire request is less than half the ring /
624	assert_request_valid(rq);
625
626	return cs;
627	}
628
629	static u32 emit_preempt_busywait(struct* i915_request rq, u32 cs)
630	{
631	cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first /
632	*cs++ = MI_SEMAPHORE_WAIT \|
633	MI_SEMAPHORE_GLOBAL_GTT \|
634	MI_SEMAPHORE_POLL \|
635	MI_SEMAPHORE_SAD_EQ_SDD;
636	*cs++ = `0`;
637	*cs++ = preempt_address(engine: rq->engine);
638	*cs++ = `0`;
639	*cs++ = MI_NOOP;
640
641	return cs;
642	}
643
644	static __always_inline u32*
645	gen8_emit_fini_breadcrumb_tail(struct i915_request rq, u32 cs)
646	{
647	*cs++ = MI_USER_INTERRUPT;
648
649	*cs++ = MI_ARB_ON_OFF \| MI_ARB_ENABLE;
650	if (intel_engine_has_semaphores(engine: rq->engine) &&
651	!intel_uc_uses_guc_submission(uc: &rq->engine->gt->uc))
652	cs = emit_preempt_busywait(rq, cs);
653
654	rq->tail = intel_ring_offset(rq, addr: cs);
655	assert_ring_tail_valid(ring: rq->ring, tail: rq->tail);
656
657	return gen8_emit_wa_tail(rq, cs);
658	}
659
660	static u32 emit_xcs_breadcrumb(struct* i915_request rq, u32 cs)
661	{
662	return gen8_emit_ggtt_write(cs, value: rq->fence.seqno, gtt_offset: hwsp_offset(rq), flags: `0`);
663	}
664
665	u32 gen8_emit_fini_breadcrumb_xcs(struct* i915_request rq, u32 cs)
666	{
667	return gen8_emit_fini_breadcrumb_tail(rq, cs: emit_xcs_breadcrumb(rq, cs));
668	}
669
670	u32 gen8_emit_fini_breadcrumb_rcs(struct* i915_request rq, u32 cs)
671	{
672	cs = gen8_emit_pipe_control(batch: cs,
673	PIPE_CONTROL_CS_STALL \|
674	PIPE_CONTROL_TLB_INVALIDATE \|
675	PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH \|
676	PIPE_CONTROL_DEPTH_CACHE_FLUSH \|
677	PIPE_CONTROL_DC_FLUSH_ENABLE,
678	offset: `0`);
679
680	/ XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl /
681	cs = gen8_emit_ggtt_write_rcs(cs,
682	value: rq->fence.seqno,
683	gtt_offset: hwsp_offset(rq),
684	PIPE_CONTROL_FLUSH_ENABLE \|
685	PIPE_CONTROL_CS_STALL);
686
687	return gen8_emit_fini_breadcrumb_tail(rq, cs);
688	}
689
690	u32 gen11_emit_fini_breadcrumb_rcs(struct* i915_request rq, u32 cs)
691	{
692	cs = gen8_emit_pipe_control(batch: cs,
693	PIPE_CONTROL_CS_STALL \|
694	PIPE_CONTROL_TLB_INVALIDATE \|
695	PIPE_CONTROL_TILE_CACHE_FLUSH \|
696	PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH \|
697	PIPE_CONTROL_DEPTH_CACHE_FLUSH \|
698	PIPE_CONTROL_DC_FLUSH_ENABLE,
699	offset: `0`);
700
701	/XXX: Look at gen8_emit_fini_breadcrumb_rcs /
702	cs = gen8_emit_ggtt_write_rcs(cs,
703	value: rq->fence.seqno,
704	gtt_offset: hwsp_offset(rq),
705	PIPE_CONTROL_FLUSH_ENABLE \|
706	PIPE_CONTROL_CS_STALL);
707
708	return gen8_emit_fini_breadcrumb_tail(rq, cs);
709	}
710
/*
 * Note that the CS instruction pre-parser will not stall on the breadcrumb
 * flush and will continue pre-fetching the instructions after it before the
 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
 * of the next request before the memory has been flushed, we're guaranteed that
 * we won't access the batch itself too early.
 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
 * so, if the current request is modifying an instruction in the next request on
 * the same intel_context, we might pre-fetch and then execute the pre-update
 * instruction. To avoid this, the users of self-modifying code should either
 * disable the parser around the code emitting the memory writes, via a new flag
 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
 * the in-kernel use-cases we've opted to use a separate context, see
 * reloc_gpu() as an example.
 * All the above applies only to the instructions themselves. Non-inline data
 * used by the instructions is not pre-fetched.
 */
729
730	static u32 gen12_emit_preempt_busywait(struct* i915_request rq, u32 cs)
731	{
732	cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first /
733	*cs++ = MI_SEMAPHORE_WAIT_TOKEN \|
734	MI_SEMAPHORE_GLOBAL_GTT \|
735	MI_SEMAPHORE_POLL \|
736	MI_SEMAPHORE_SAD_EQ_SDD;
737	*cs++ = `0`;
738	*cs++ = preempt_address(engine: rq->engine);
739	*cs++ = `0`;
740	*cs++ = `0`;
741
742	return cs;
743	}
744
745	/ Wa_14014475959:dg2 /
746	#define CCS_SEMAPHORE_PPHWSP_OFFSET 0x540
747	static u32 ccs_semaphore_offset(struct i915_request *rq)
748	{
749	return i915_ggtt_offset(vma: rq->context->state) +
750	(LRC_PPHWSP_PN * PAGE_SIZE) + CCS_SEMAPHORE_PPHWSP_OFFSET;
751	}
752
753	/ Wa_14014475959:dg2 /
754	static u32 ccs_emit_wa_busywait(struct* i915_request rq, u32 cs)
755	{
756	int i;
757
758	*cs++ = MI_ATOMIC_INLINE \| MI_ATOMIC_GLOBAL_GTT \| MI_ATOMIC_CS_STALL \|
759	MI_ATOMIC_MOVE;
760	*cs++ = ccs_semaphore_offset(rq);
761	*cs++ = `0`;
762	*cs++ = `1`;
763
764	/*
765	* When MI_ATOMIC_INLINE_DATA set this command must be 11 DW + (1 NOP)
766	* to align. 4 DWs above + 8 filler DWs here.
767	*/
768	for (i = `0`; i < `8`; ++i)
769	*cs++ = `0`;
770
771	*cs++ = MI_SEMAPHORE_WAIT \|
772	MI_SEMAPHORE_GLOBAL_GTT \|
773	MI_SEMAPHORE_POLL \|
774	MI_SEMAPHORE_SAD_EQ_SDD;
775	*cs++ = `0`;
776	*cs++ = ccs_semaphore_offset(rq);
777	*cs++ = `0`;
778
779	return cs;
780	}
781
782	static __always_inline u32*
783	gen12_emit_fini_breadcrumb_tail(struct i915_request rq, u32 cs)
784	{
785	*cs++ = MI_USER_INTERRUPT;
786
787	*cs++ = MI_ARB_ON_OFF \| MI_ARB_ENABLE;
788	if (intel_engine_has_semaphores(engine: rq->engine) &&
789	!intel_uc_uses_guc_submission(uc: &rq->engine->gt->uc))
790	cs = gen12_emit_preempt_busywait(rq, cs);
791
792	/ Wa_14014475959:dg2 /
793	if (intel_engine_uses_wa_hold_ccs_switchout(engine: rq->engine))
794	cs = ccs_emit_wa_busywait(rq, cs);
795
796	rq->tail = intel_ring_offset(rq, addr: cs);
797	assert_ring_tail_valid(ring: rq->ring, tail: rq->tail);
798
799	return gen8_emit_wa_tail(rq, cs);
800	}
801
802	u32 gen12_emit_fini_breadcrumb_xcs(struct* i915_request rq, u32 cs)
803	{
804	/ XXX Stalling flush before seqno write; post-sync not /
805	cs = emit_xcs_breadcrumb(rq, cs: __gen8_emit_flush_dw(cs, value: `0`, gtt_offset: `0`, flags: `0`));
806	return gen12_emit_fini_breadcrumb_tail(rq, cs);
807	}
808
809	u32 gen12_emit_fini_breadcrumb_rcs(struct* i915_request rq, u32 cs)
810	{
811	struct drm_i915_private *i915 = rq->i915;
812	struct intel_gt *gt = rq->engine->gt;
813	u32 flags = (PIPE_CONTROL_CS_STALL \|
814	PIPE_CONTROL_TLB_INVALIDATE \|
815	PIPE_CONTROL_TILE_CACHE_FLUSH \|
816	PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH \|
817	PIPE_CONTROL_DEPTH_CACHE_FLUSH \|
818	PIPE_CONTROL_DC_FLUSH_ENABLE \|
819	PIPE_CONTROL_FLUSH_ENABLE);
820
821	if (GRAPHICS_VER_FULL(rq->i915) < IP_VER(`12`, `70`))
822	flags \|= PIPE_CONTROL_FLUSH_L3;
823
824	/ Wa_14016712196 /
825	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(`12`, `70`), IP_VER(`12`, `74`)) \|\| IS_DG2(i915))
826	/ dummy PIPE_CONTROL + depth flush /
827	cs = gen12_emit_pipe_control(batch: cs, bit_group_0: `0`,
828	PIPE_CONTROL_DEPTH_CACHE_FLUSH, offset: `0`);
829
830	if (GRAPHICS_VER(i915) == `12` && GRAPHICS_VER_FULL(i915) < IP_VER(`12`, `50`))
831	/ Wa_1409600907 /
832	flags \|= PIPE_CONTROL_DEPTH_STALL;
833
834	if (!HAS_3D_PIPELINE(rq->i915))
835	flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
836	else if (rq->engine->class == COMPUTE_CLASS)
837	flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;
838
839	cs = gen12_emit_pipe_control(batch: cs, PIPE_CONTROL0_HDC_PIPELINE_FLUSH, bit_group_1: flags, offset: `0`);
840
841	/XXX: Look at gen8_emit_fini_breadcrumb_rcs /
842	cs = gen12_emit_ggtt_write_rcs(cs,
843	value: rq->fence.seqno,
844	gtt_offset: hwsp_offset(rq),
845	flags0: `0`,
846	PIPE_CONTROL_FLUSH_ENABLE \|
847	PIPE_CONTROL_CS_STALL);
848
849	return gen12_emit_fini_breadcrumb_tail(rq, cs);
850	}
851

source code of linux/drivers/gpu/drm/i915/gt/gen8_engine_cs.c