1 | // SPDX-License-Identifier: MIT |
2 | /* |
3 | * Copyright © 2020 Intel Corporation |
4 | */ |
5 | |
6 | #include "gen6_engine_cs.h" |
7 | #include "intel_engine.h" |
8 | #include "intel_engine_regs.h" |
9 | #include "intel_gpu_commands.h" |
10 | #include "intel_gt.h" |
11 | #include "intel_gt_irq.h" |
12 | #include "intel_gt_pm_irq.h" |
13 | #include "intel_ring.h" |
14 | |
/*
 * Byte offset of the scratch dword inside the HWSP; used as a dummy
 * post-sync write target by mi_flush_dw().
 */
#define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))
16 | |
17 | /* |
18 | * Emits a PIPE_CONTROL with a non-zero post-sync operation, for |
19 | * implementing two workarounds on gen6. From section 1.4.7.1 |
20 | * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1: |
21 | * |
22 | * [DevSNB-C+{W/A}] Before any depth stall flush (including those |
23 | * produced by non-pipelined state commands), software needs to first |
24 | * send a PIPE_CONTROL with no bits set except Post-Sync Operation != |
25 | * 0. |
26 | * |
27 | * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable |
28 | * =1, a PIPE_CONTROL with any non-zero post-sync-op is required. |
29 | * |
30 | * And the workaround for these two requires this workaround first: |
31 | * |
32 | * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent |
33 | * BEFORE the pipe-control with a post-sync op and no write-cache |
34 | * flushes. |
35 | * |
36 | * And this last workaround is tricky because of the requirements on |
37 | * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM |
38 | * volume 2 part 1: |
39 | * |
40 | * "1 of the following must also be set: |
41 | * - Render Target Cache Flush Enable ([12] of DW1) |
42 | * - Depth Cache Flush Enable ([0] of DW1) |
43 | * - Stall at Pixel Scoreboard ([1] of DW1) |
44 | * - Depth Stall ([13] of DW1) |
45 | * - Post-Sync Operation ([13] of DW1) |
46 | * - Notify Enable ([8] of DW1)" |
47 | * |
48 | * The cache flushes require the workaround flush that triggered this |
49 | * one, so we can't use it. Depth stall would trigger the same. |
50 | * Post-sync nonzero is what triggered this second workaround, so we |
51 | * can't use that one either. Notify enable is IRQs, which aren't |
52 | * really our business. That leaves only stall at scoreboard. |
53 | */ |
54 | static int |
55 | gen6_emit_post_sync_nonzero_flush(struct i915_request *rq) |
56 | { |
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);
65 | |
66 | *cs++ = GFX_OP_PIPE_CONTROL(5); |
67 | *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; |
68 | *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; |
69 | *cs++ = 0; /* low dword */ |
70 | *cs++ = 0; /* high dword */ |
71 | *cs++ = MI_NOOP; |
72 | intel_ring_advance(rq, cs); |
73 | |
	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);
77 | |
78 | *cs++ = GFX_OP_PIPE_CONTROL(5); |
79 | *cs++ = PIPE_CONTROL_QW_WRITE; |
80 | *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; |
81 | *cs++ = 0; |
82 | *cs++ = 0; |
83 | *cs++ = MI_NOOP; |
84 | intel_ring_advance(rq, cs); |
85 | |
86 | return 0; |
87 | } |
88 | |
89 | int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode) |
90 | { |
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
94 | u32 *cs, flags = 0; |
95 | int ret; |
96 | |
97 | /* Force SNB workarounds for PIPE_CONTROL flushes */ |
98 | ret = gen6_emit_post_sync_nonzero_flush(rq); |
99 | if (ret) |
100 | return ret; |
101 | |
102 | /* |
103 | * Just flush everything. Experiments have shown that reducing the |
104 | * number of bits based on the write domains has little performance |
105 | * impact. And when rearranging requests, the order of flushes is |
106 | * unknown. |
107 | */ |
108 | if (mode & EMIT_FLUSH) { |
109 | flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; |
110 | flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; |
111 | /* |
112 | * Ensure that any following seqno writes only happen |
113 | * when the render cache is indeed flushed. |
114 | */ |
115 | flags |= PIPE_CONTROL_CS_STALL; |
116 | } |
117 | if (mode & EMIT_INVALIDATE) { |
118 | flags |= PIPE_CONTROL_TLB_INVALIDATE; |
119 | flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; |
120 | flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; |
121 | flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; |
122 | flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; |
123 | flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; |
124 | /* |
125 | * TLB invalidate requires a post-sync write. |
126 | */ |
127 | flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL; |
128 | } |
129 | |
	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);
133 | |
134 | *cs++ = GFX_OP_PIPE_CONTROL(4); |
135 | *cs++ = flags; |
136 | *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; |
137 | *cs++ = 0; |
138 | intel_ring_advance(rq, cs); |
139 | |
140 | return 0; |
141 | } |
142 | |
143 | u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs) |
144 | { |
145 | /* First we do the gen6_emit_post_sync_nonzero_flush w/a */ |
146 | *cs++ = GFX_OP_PIPE_CONTROL(4); |
147 | *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; |
148 | *cs++ = 0; |
149 | *cs++ = 0; |
150 | |
151 | *cs++ = GFX_OP_PIPE_CONTROL(4); |
152 | *cs++ = PIPE_CONTROL_QW_WRITE; |
	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
		PIPE_CONTROL_GLOBAL_GTT;
156 | *cs++ = 0; |
157 | |
158 | /* Finally we can flush and with it emit the breadcrumb */ |
159 | *cs++ = GFX_OP_PIPE_CONTROL(4); |
160 | *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | |
161 | PIPE_CONTROL_DEPTH_CACHE_FLUSH | |
162 | PIPE_CONTROL_DC_FLUSH_ENABLE | |
163 | PIPE_CONTROL_QW_WRITE | |
164 | PIPE_CONTROL_CS_STALL); |
165 | *cs++ = i915_request_active_seqno(rq) | |
166 | PIPE_CONTROL_GLOBAL_GTT; |
167 | *cs++ = rq->fence.seqno; |
168 | |
169 | *cs++ = MI_USER_INTERRUPT; |
170 | *cs++ = MI_NOOP; |
171 | |
	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);
174 | |
175 | return cs; |
176 | } |
177 | |
/*
 * Emit an MI_FLUSH_DW with a post-sync dword store to the HWSP scratch
 * slot, flushing the engine's write caches before any later commands.
 */
static int mi_flush_dw(struct i915_request *rq, u32 flags)
179 | { |
180 | u32 cmd, *cs; |
181 | |
	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);
185 | |
186 | cmd = MI_FLUSH_DW; |
187 | |
188 | /* |
189 | * We always require a command barrier so that subsequent |
190 | * commands, such as breadcrumb interrupts, are strictly ordered |
191 | * wrt the contents of the write cache being flushed to memory |
192 | * (and thus being coherent from the CPU). |
193 | */ |
194 | cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; |
195 | |
196 | /* |
197 | * Bspec vol 1c.3 - blitter engine command streamer: |
198 | * "If ENABLED, all TLBs will be invalidated once the flush |
199 | * operation is complete. This bit is only valid when the |
200 | * Post-Sync Operation field is a value of 1h or 3h." |
201 | */ |
202 | cmd |= flags; |
203 | |
204 | *cs++ = cmd; |
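	/* Post-sync store lands in the scratch dword inside the HWSP (GGTT) */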
205 | *cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; |
206 | *cs++ = 0; |
207 | *cs++ = MI_NOOP; |
208 | |
209 | intel_ring_advance(rq, cs); |
210 | |
211 | return 0; |
212 | } |
213 | |
214 | static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags) |
215 | { |
	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
217 | } |
218 | |
219 | int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode) |
220 | { |
221 | return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB); |
222 | } |
223 | |
224 | int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode) |
225 | { |
226 | return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD); |
227 | } |
228 | |
229 | int gen6_emit_bb_start(struct i915_request *rq, |
230 | u64 offset, u32 len, |
231 | unsigned int dispatch_flags) |
232 | { |
233 | u32 security; |
234 | u32 *cs; |
235 | |
	/*
	 * Userspace batches execute as non-secure (unprivileged); only a
	 * trusted, kernel-built batch may clear the bit and run secure.
	 */
	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;
239 | |
	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
245 | intel_ring_advance(rq, cs); |
246 | |
247 | return 0; |
248 | } |
249 | |
250 | int |
251 | hsw_emit_bb_start(struct i915_request *rq, |
252 | u64 offset, u32 len, |
253 | unsigned int dispatch_flags) |
254 | { |
255 | u32 security; |
256 | u32 *cs; |
257 | |
	/*
	 * By default run the batch unprivileged and from the ppGTT; a
	 * secure dispatch clears both bits and executes from the GGTT.
	 */
	security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;
261 | |
	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
267 | intel_ring_advance(rq, cs); |
268 | |
269 | return 0; |
270 | } |
271 | |
272 | static int gen7_stall_cs(struct i915_request *rq) |
273 | { |
274 | u32 *cs; |
275 | |
	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);
279 | |
280 | *cs++ = GFX_OP_PIPE_CONTROL(4); |
281 | *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; |
282 | *cs++ = 0; |
283 | *cs++ = 0; |
284 | intel_ring_advance(rq, cs); |
285 | |
286 | return 0; |
287 | } |
288 | |
289 | int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode) |
290 | { |
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
294 | u32 *cs, flags = 0; |
295 | |
296 | /* |
297 | * Ensure that any following seqno writes only happen when the render |
298 | * cache is indeed flushed. |
299 | * |
300 | * Workaround: 4th PIPE_CONTROL command (except the ones with only |
301 | * read-cache invalidate bits set) must have the CS_STALL bit set. We |
302 | * don't try to be clever and just set it unconditionally. |
303 | */ |
304 | flags |= PIPE_CONTROL_CS_STALL; |
305 | |
306 | /* |
307 | * CS_STALL suggests at least a post-sync write. |
308 | */ |
309 | flags |= PIPE_CONTROL_QW_WRITE; |
310 | flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; |
311 | |
312 | /* |
313 | * Just flush everything. Experiments have shown that reducing the |
314 | * number of bits based on the write domains has little performance |
315 | * impact. |
316 | */ |
317 | if (mode & EMIT_FLUSH) { |
318 | flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; |
319 | flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; |
320 | flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; |
321 | flags |= PIPE_CONTROL_FLUSH_ENABLE; |
322 | } |
323 | if (mode & EMIT_INVALIDATE) { |
324 | flags |= PIPE_CONTROL_TLB_INVALIDATE; |
325 | flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; |
326 | flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; |
327 | flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; |
328 | flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; |
329 | flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; |
330 | flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR; |
331 | |
332 | /* |
333 | * Workaround: we must issue a pipe_control with CS-stall bit |
334 | * set before a pipe_control command that has the state cache |
335 | * invalidate bit set. |
336 | */ |
337 | gen7_stall_cs(rq); |
338 | } |
339 | |
	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);
343 | |
344 | *cs++ = GFX_OP_PIPE_CONTROL(4); |
345 | *cs++ = flags; |
346 | *cs++ = scratch_addr; |
347 | *cs++ = 0; |
348 | intel_ring_advance(rq, cs); |
349 | |
350 | return 0; |
351 | } |
352 | |
353 | u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs) |
354 | { |
355 | *cs++ = GFX_OP_PIPE_CONTROL(4); |
356 | *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | |
357 | PIPE_CONTROL_DEPTH_CACHE_FLUSH | |
358 | PIPE_CONTROL_DC_FLUSH_ENABLE | |
359 | PIPE_CONTROL_FLUSH_ENABLE | |
360 | PIPE_CONTROL_QW_WRITE | |
361 | PIPE_CONTROL_GLOBAL_GTT_IVB | |
362 | PIPE_CONTROL_CS_STALL); |
363 | *cs++ = i915_request_active_seqno(rq); |
364 | *cs++ = rq->fence.seqno; |
365 | |
366 | *cs++ = MI_USER_INTERRUPT; |
367 | *cs++ = MI_NOOP; |
368 | |
	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);
371 | |
372 | return cs; |
373 | } |
374 | |
375 | u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs) |
376 | { |
377 | GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); |
378 | GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR); |
379 | |
380 | *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; |
381 | *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; |
382 | *cs++ = rq->fence.seqno; |
383 | |
384 | *cs++ = MI_USER_INTERRUPT; |
385 | |
	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);
388 | |
389 | return cs; |
390 | } |
391 | |
392 | #define GEN7_XCS_WA 32 |
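/*
 * Empirically, the gen7 video/blitter rings can deliver MI_USER_INTERRUPT
 * before the seqno store emitted by MI_FLUSH_DW has landed in the HWSP.
 * Padding the breadcrumb with repeated MI_STORE_DWORD_INDEX writes of the
 * same seqno (32 appears to be enough), followed by a final MI_FLUSH_DW,
 * ensures the seqno is visible by the time the interrupt is serviced.
 */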
393 | u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs) |
394 | { |
395 | int i; |
396 | |
397 | GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); |
398 | GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR); |
399 | |
400 | *cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB | |
401 | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; |
402 | *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; |
403 | *cs++ = rq->fence.seqno; |
404 | |
405 | for (i = 0; i < GEN7_XCS_WA; i++) { |
406 | *cs++ = MI_STORE_DWORD_INDEX; |
407 | *cs++ = I915_GEM_HWS_SEQNO_ADDR; |
408 | *cs++ = rq->fence.seqno; |
409 | } |
410 | |
411 | *cs++ = MI_FLUSH_DW; |
412 | *cs++ = 0; |
413 | *cs++ = 0; |
414 | |
415 | *cs++ = MI_USER_INTERRUPT; |
416 | *cs++ = MI_NOOP; |
417 | |
	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);
420 | |
421 | return cs; |
422 | } |
423 | #undef GEN7_XCS_WA |
424 | |
425 | void gen6_irq_enable(struct intel_engine_cs *engine) |
426 | { |
427 | ENGINE_WRITE(engine, RING_IMR, |
428 | ~(engine->irq_enable_mask | engine->irq_keep_mask)); |
429 | |
430 | /* Flush/delay to ensure the RING_IMR is active before the GT IMR */ |
431 | ENGINE_POSTING_READ(engine, RING_IMR); |
432 | |
	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
434 | } |
435 | |
436 | void gen6_irq_disable(struct intel_engine_cs *engine) |
437 | { |
438 | ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); |
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
440 | } |
441 | |
442 | void hsw_irq_enable_vecs(struct intel_engine_cs *engine) |
443 | { |
444 | ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask); |
445 | |
446 | /* Flush/delay to ensure the RING_IMR is active before the GT IMR */ |
447 | ENGINE_POSTING_READ(engine, RING_IMR); |
448 | |
	/* On hsw, the VECS interrupt is routed via the PM interrupt registers */
	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
450 | } |
451 | |
452 | void hsw_irq_disable_vecs(struct intel_engine_cs *engine) |
453 | { |
454 | ENGINE_WRITE(engine, RING_IMR, ~0); |
	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
456 | } |
457 | |