// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gen8_engine_cs.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_lrc.h"
#include "intel_ring.h"

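/*
 * Emit the flush/invalidate sequence for the gen8+ render engine as a
 * single PIPE_CONTROL, applying the gen9 VF-cache and KBL GAM-hang
 * workarounds where needed.
 */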
int gen8_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	bool vf_flush_wa = false, dc_flush_wa = false;
	u32 *cs, flags = 0;
	int len;

	flags |= PIPE_CONTROL_CS_STALL;

	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}

	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_STORE_DATA_INDEX;

		/*
		 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
		 * pipe control.
		 */
		if (GRAPHICS_VER(rq->i915) == 9)
			vf_flush_wa = true;

		/* WaForGAMHang:kbl */
		if (IS_KABYLAKE(rq->i915) && IS_GRAPHICS_STEP(rq->i915, 0, STEP_C0))
			dc_flush_wa = true;
	}

	len = 6;

	if (vf_flush_wa)
		len += 6;

	if (dc_flush_wa)
		len += 12;

	cs = intel_ring_begin(rq, len);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (vf_flush_wa)
		cs = gen8_emit_pipe_control(cs, 0, 0);

	if (dc_flush_wa)
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
					    0);

	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);

	if (dc_flush_wa)
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);

	intel_ring_advance(rq, cs);

	return 0;
}

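/*
 * Flush for the non-render engines: a single MI_FLUSH_DW with a post-sync
 * write into the PPHWSP scratch slot, optionally invalidating the TLBs.
 */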
int gen8_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW + 1;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_INVALIDATE_TLB;
		if (rq->engine->class == VIDEO_DECODE_CLASS)
			cmd |= MI_INVALIDATE_BSD;
	}

	*cs++ = cmd;
	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
	*cs++ = 0; /* upper addr */
	*cs++ = 0; /* value */
	intel_ring_advance(rq, cs);

	return 0;
}

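/*
 * Gen11 render flush: unlike gen8, the flush and the invalidation are
 * emitted as two separate PIPE_CONTROLs, and the tile-cache flush and
 * command-cache invalidate bits are used in addition.
 */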
int gen11_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	if (mode & EMIT_FLUSH) {
		u32 *cs;
		u32 flags = 0;

		flags |= PIPE_CONTROL_CS_STALL;

		flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_STORE_DATA_INDEX;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	if (mode & EMIT_INVALIDATE) {
		u32 *cs;
		u32 flags = 0;

		flags |= PIPE_CONTROL_CS_STALL;

		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_QW_WRITE;
		flags |= PIPE_CONTROL_STORE_DATA_INDEX;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	return 0;
}

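/*
 * On gen12+, MI_ARB_CHECK also carries a pre-fetch disable field: bit 8
 * is the write-enable mask for bit 0, which holds the requested
 * pre-parser state (1 = disabled, 0 = re-enabled).
 */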
static u32 preparser_disable(bool state)
{
	return MI_ARB_CHECK | 1 << 8 | state;
}

static i915_reg_t gen12_get_aux_inv_reg(struct intel_engine_cs *engine)
{
	switch (engine->id) {
	case RCS0:
		return GEN12_CCS_AUX_INV;
	case BCS0:
		return GEN12_BCS0_AUX_INV;
	case VCS0:
		return GEN12_VD0_AUX_INV;
	case VCS2:
		return GEN12_VD2_AUX_INV;
	case VECS0:
		return GEN12_VE0_AUX_INV;
	case CCS0:
		return GEN12_CCS0_AUX_INV;
	default:
		return INVALID_MMIO_REG;
	}
}

static bool gen12_needs_ccs_aux_inv(struct intel_engine_cs *engine)
{
	i915_reg_t reg = gen12_get_aux_inv_reg(engine);

	if (IS_PONTEVECCHIO(engine->i915))
		return false;

	/*
	 * So far, the platforms supported by i915 that have flat CCS do not
	 * require AUX invalidation. Also check whether the engine needs it
	 * at all.
	 */
	return i915_mmio_reg_valid(reg) && !HAS_FLAT_CCS(engine->i915);
}

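/*
 * Trigger an invalidation of the engine's AUX table by writing AUX_INV
 * into the per-engine invalidation register via LRI, then wait for the
 * hardware to acknowledge completion.
 */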
u32 *gen12_emit_aux_table_inv(struct intel_engine_cs *engine, u32 *cs)
{
	i915_reg_t inv_reg = gen12_get_aux_inv_reg(engine);
	u32 gsi_offset = engine->gt->uncore->gsi_offset;

	if (!gen12_needs_ccs_aux_inv(engine))
		return cs;

	*cs++ = MI_LOAD_REGISTER_IMM(1) | MI_LRI_MMIO_REMAP_EN;
	*cs++ = i915_mmio_reg_offset(inv_reg) + gsi_offset;
	*cs++ = AUX_INV;

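	/* Poll the register until the HW clears AUX_INV, i.e. invalidation done */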
	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
		MI_SEMAPHORE_REGISTER_POLL |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0;
	*cs++ = i915_mmio_reg_offset(inv_reg) + gsi_offset;
	*cs++ = 0;
	*cs++ = 0;

	return cs;
}

static int mtl_dummy_pipe_control(struct i915_request *rq)
{
	/* Wa_14016712196 */
	if (IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 74)) ||
	    IS_DG2(rq->i915)) {
		u32 *cs;

		/* dummy PIPE_CONTROL + depth flush */
		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);
		cs = gen12_emit_pipe_control(cs,
					     0,
					     PIPE_CONTROL_DEPTH_CACHE_FLUSH,
					     LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	return 0;
}

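/*
 * Gen12 render/compute flush. Note that a flush-type PIPE_CONTROL is
 * emitted even for pure invalidations on Aux CCS platforms, as the AUX
 * table invalidation requires memory traffic to be quiesced first.
 */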
int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	struct intel_engine_cs *engine = rq->engine;

	/*
	 * On Aux CCS platforms the invalidation of the Aux
	 * table requires quiescing memory traffic beforehand.
	 */
	if (mode & EMIT_FLUSH || gen12_needs_ccs_aux_inv(engine)) {
		u32 bit_group_0 = 0;
		u32 bit_group_1 = 0;
		int err;
		u32 *cs;

		err = mtl_dummy_pipe_control(rq);
		if (err)
			return err;

		bit_group_0 |= PIPE_CONTROL0_HDC_PIPELINE_FLUSH;

		/*
		 * On MTL and beyond, the CCS_FLUSH bit must also be set in
		 * the pipe control when required.
		 */
		if (GRAPHICS_VER_FULL(rq->i915) >= IP_VER(12, 70))
			bit_group_0 |= PIPE_CONTROL_CCS_FLUSH;

		/*
		 * The L3 fabric flush needed for AUX CCS invalidation
		 * already happens as part of the pipe-control, so
		 * PIPE_CONTROL_FLUSH_L3 can be skipped here. In addition,
		 * PIPE_CONTROL_FLUSH_L3 deals with Protected Memory, which
		 * is not needed for AUX CCS invalidation and leads to
		 * unwanted side effects.
		 */
		if ((mode & EMIT_FLUSH) &&
		    GRAPHICS_VER_FULL(rq->i915) < IP_VER(12, 70))
			bit_group_1 |= PIPE_CONTROL_FLUSH_L3;

		bit_group_1 |= PIPE_CONTROL_TILE_CACHE_FLUSH;
		bit_group_1 |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		bit_group_1 |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/* Wa_1409600907:tgl,adl-p */
		bit_group_1 |= PIPE_CONTROL_DEPTH_STALL;
		bit_group_1 |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		bit_group_1 |= PIPE_CONTROL_FLUSH_ENABLE;

		bit_group_1 |= PIPE_CONTROL_STORE_DATA_INDEX;
		bit_group_1 |= PIPE_CONTROL_QW_WRITE;

		bit_group_1 |= PIPE_CONTROL_CS_STALL;

		if (!HAS_3D_PIPELINE(engine->i915))
			bit_group_1 &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
		else if (engine->class == COMPUTE_CLASS)
			bit_group_1 &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

		cs = intel_ring_begin(rq, 6);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		cs = gen12_emit_pipe_control(cs, bit_group_0, bit_group_1,
					     LRC_PPHWSP_SCRATCH_ADDR);
		intel_ring_advance(rq, cs);
	}

	if (mode & EMIT_INVALIDATE) {
		u32 flags = 0;
		u32 *cs, count;
		int err;

		err = mtl_dummy_pipe_control(rq);
		if (err)
			return err;

		flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;

		flags |= PIPE_CONTROL_STORE_DATA_INDEX;
		flags |= PIPE_CONTROL_QW_WRITE;

		flags |= PIPE_CONTROL_CS_STALL;

		if (!HAS_3D_PIPELINE(engine->i915))
			flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
		else if (engine->class == COMPUTE_CLASS)
			flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

		count = 8;
		if (gen12_needs_ccs_aux_inv(rq->engine))
			count += 8;

		cs = intel_ring_begin(rq, count);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		/*
		 * Prevent the pre-parser from skipping past the TLB
		 * invalidate and loading a stale page for the batch
		 * buffer / request payload.
		 */
		*cs++ = preparser_disable(true);

		cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);

		cs = gen12_emit_aux_table_inv(engine, cs);

		*cs++ = preparser_disable(false);
		intel_ring_advance(rq, cs);
	}

	return 0;
}

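/*
 * Gen12 flush for the non-render engines: MI_FLUSH_DW bracketed by
 * pre-parser disable/re-enable when invalidating, plus an AUX table
 * invalidation on the engines that need one.
 */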
int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	u32 cmd = 4;
	u32 *cs;

	if (mode & EMIT_INVALIDATE) {
		cmd += 2;

		if (gen12_needs_ccs_aux_inv(rq->engine))
			cmd += 8;
	}

	cs = intel_ring_begin(rq, cmd);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (mode & EMIT_INVALIDATE)
		*cs++ = preparser_disable(true);

	cmd = MI_FLUSH_DW + 1;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_INVALIDATE_TLB;
		if (rq->engine->class == VIDEO_DECODE_CLASS)
			cmd |= MI_INVALIDATE_BSD;

		if (gen12_needs_ccs_aux_inv(rq->engine) &&
		    rq->engine->class == COPY_ENGINE_CLASS)
			cmd |= MI_FLUSH_DW_CCS;
	}

	*cs++ = cmd;
	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
	*cs++ = 0; /* upper addr */
	*cs++ = 0; /* value */

	cs = gen12_emit_aux_table_inv(rq->engine, cs);

	if (mode & EMIT_INVALIDATE)
		*cs++ = preparser_disable(false);

	intel_ring_advance(rq, cs);

	return 0;
}

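/* GGTT address of the preemption breadcrumb in the engine's HWSP */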
static u32 preempt_address(struct intel_engine_cs *engine)
{
	return (i915_ggtt_offset(engine->status_page.vma) +
		I915_GEM_HWS_PREEMPT_ADDR);
}

static u32 hwsp_offset(const struct i915_request *rq)
{
	const struct intel_timeline *tl;

	/* Before the request is executed, the timeline is fixed */
	tl = rcu_dereference_protected(rq->timeline,
				       !i915_request_signaled(rq));

	/* See the comment in i915_request_active_seqno(). */
	return page_mask_bits(tl->hwsp_offset) + offset_in_page(rq->hwsp_seqno);
}

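/*
 * Emit the initial breadcrumb at the head of the request: write
 * seqno - 1 into the timeline's HWSP slot, so that i915_request_started()
 * can tell whether the payload has begun executing, then insert the
 * first explicit preemption point.
 */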
int gen8_emit_init_breadcrumb(struct i915_request *rq)
{
	u32 *cs;

	GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
		return 0;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = hwsp_offset(rq);
	*cs++ = 0;
	*cs++ = rq->fence.seqno - 1;

	/*
	 * Check if we have been preempted before we even get started.
	 *
	 * After this point i915_request_started() reports true, even if
	 * we get preempted and so are no longer running.
	 *
	 * i915_request_started() is used during preemption processing
	 * to decide if the request is currently inside the user payload
	 * or spinning on a kernel semaphore (or earlier). For no-preemption
	 * requests, we do allow preemption on the semaphore before the user
	 * payload, but do not allow preemption once the request is started.
	 *
	 * i915_request_started() is similarly used during GPU hangs to
	 * determine if the user's payload was guilty, and if so, the
	 * request is banned. Before the request is started, it is assumed
	 * to be unharmed and an innocent victim of another's hang.
	 */
	*cs++ = MI_NOOP;
	*cs++ = MI_ARB_CHECK;

	intel_ring_advance(rq, cs);

	/* Record the updated position of the request's payload */
	rq->infix = intel_ring_offset(rq, cs);

	__set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);

	return 0;
}

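/*
 * Xe_HP batch buffer start: reload RING_PREDICATE_RESULT from the
 * context's workaround batch page before jumping into the batch, then
 * chain through a fixup batch afterwards to clear any MI_SET_PREDICATE
 * left set by the user batch (see DG2_PREDICATE_RESULT_WA/_BB).
 */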
static int __xehp_emit_bb_start(struct i915_request *rq,
				u64 offset, u32 len,
				const unsigned int flags,
				u32 arb)
{
	struct intel_context *ce = rq->context;
	u32 wa_offset = lrc_indirect_bb(ce);
	u32 *cs;

	GEM_BUG_ON(!ce->wa_bb_page);

	cs = intel_ring_begin(rq, 12);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_ARB_ON_OFF | arb;

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(RING_PREDICATE_RESULT(0));
	*cs++ = wa_offset + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;

	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);

	/* Fixup stray MI_SET_PREDICATE as it prevents us executing the ring */
	*cs++ = MI_BATCH_BUFFER_START_GEN8;
	*cs++ = wa_offset + DG2_PREDICATE_RESULT_BB;
	*cs++ = 0;

	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	intel_ring_advance(rq, cs);

	return 0;
}

int xehp_emit_bb_start_noarb(struct i915_request *rq,
			     u64 offset, u32 len,
			     const unsigned int flags)
{
	return __xehp_emit_bb_start(rq, offset, len, flags, MI_ARB_DISABLE);
}

int xehp_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       const unsigned int flags)
{
	return __xehp_emit_bb_start(rq, offset, len, flags, MI_ARB_ENABLE);
}

int gen8_emit_bb_start_noarb(struct i915_request *rq,
			     u64 offset, u32 len,
			     const unsigned int flags)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * WaDisableCtxRestoreArbitration:bdw,chv
	 *
	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
	 * particular all the gen that do not need the w/a at all!); if we
	 * took care to make sure that on every switch into this context
	 * (both ordinary and for preemption) arbitration was enabled, we
	 * would be fine. However, for gen8 there is another w/a that
	 * requires us to not preempt inside GPGPU execution, so we keep
	 * arbitration disabled for gen8 batches. Arbitration will be
	 * re-enabled before we close the request
	 * (engine->emit_fini_breadcrumb).
	 */
	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* FIXME(BDW+): Address space and security selectors. */
	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);

	intel_ring_advance(rq, cs);

	return 0;
}

int gen8_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       const unsigned int flags)
{
	u32 *cs;

	if (unlikely(i915_request_has_nopreempt(rq)))
		return gen8_emit_bb_start_noarb(rq, offset, len, flags);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
	*cs++ = lower_32_bits(offset);
	*cs++ = upper_32_bits(offset);

	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}

static void assert_request_valid(struct i915_request *rq)
{
	struct intel_ring *ring __maybe_unused = rq->ring;

	/* Can we unwind this request without appearing to go forwards? */
	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
}

/*
 * Reserve space for 2 NOOPs at the end of each request to be
 * used as a workaround for not being allowed to do lite
 * restore with HEAD==TAIL (WaIdleLiteRestore).
 */
static u32 *gen8_emit_wa_tail(struct i915_request *rq, u32 *cs)
{
	/* Ensure there's always at least one preemption point per-request. */
	*cs++ = MI_ARB_CHECK;
	*cs++ = MI_NOOP;
	rq->wa_tail = intel_ring_offset(rq, cs);

	/* Check that the entire request is less than half the ring */
	assert_request_valid(rq);

	return cs;
}

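/*
 * Busy-wait on the engine's preemption semaphore after the breadcrumb:
 * while the driver holds the semaphore (see ring_set_paused() in the
 * execlists code), the context spins here instead of running past the
 * end of the request, giving the scheduler a stable preemption point.
 */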
static u32 *emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0;
	*cs++ = preempt_address(rq->engine);
	*cs++ = 0;
	*cs++ = MI_NOOP;

	return cs;
}

static __always_inline u32*
gen8_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_USER_INTERRUPT;

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	if (intel_engine_has_semaphores(rq->engine) &&
	    !intel_uc_uses_guc_submission(&rq->engine->gt->uc))
		cs = emit_preempt_busywait(rq, cs);

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return gen8_emit_wa_tail(rq, cs);
}

static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
{
	return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
}

u32 *gen8_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
}

u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	cs = gen8_emit_pipe_control(cs,
				    PIPE_CONTROL_CS_STALL |
				    PIPE_CONTROL_TLB_INVALIDATE |
				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
				    PIPE_CONTROL_DC_FLUSH_ENABLE,
				    0);

	/* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
	cs = gen8_emit_ggtt_write_rcs(cs,
				      rq->fence.seqno,
				      hwsp_offset(rq),
				      PIPE_CONTROL_FLUSH_ENABLE |
				      PIPE_CONTROL_CS_STALL);

	return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	cs = gen8_emit_pipe_control(cs,
				    PIPE_CONTROL_CS_STALL |
				    PIPE_CONTROL_TLB_INVALIDATE |
				    PIPE_CONTROL_TILE_CACHE_FLUSH |
				    PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
				    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
				    PIPE_CONTROL_DC_FLUSH_ENABLE,
				    0);

	/* XXX: Look at gen8_emit_fini_breadcrumb_rcs */
	cs = gen8_emit_ggtt_write_rcs(cs,
				      rq->fence.seqno,
				      hwsp_offset(rq),
				      PIPE_CONTROL_FLUSH_ENABLE |
				      PIPE_CONTROL_CS_STALL);

	return gen8_emit_fini_breadcrumb_tail(rq, cs);
}

/*
 * Note that the CS instruction pre-parser will not stall on the breadcrumb
 * flush and will continue pre-fetching the instructions after it before the
 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
 * BB_START/END instructions, so, even though we might pre-fetch the preamble
 * of the next request before the memory has been flushed, we're guaranteed
 * that we won't access the batch itself too early.
 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
 * so, if the current request is modifying an instruction in the next request on
 * the same intel_context, we might pre-fetch and then execute the pre-update
 * instruction. To avoid this, the users of self-modifying code should either
 * disable the parser around the code emitting the memory writes, via a new flag
 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
 * the in-kernel use-cases we've opted to use a separate context, see
 * reloc_gpu() as an example.
 * All the above applies only to the instructions themselves. Non-inline data
 * used by the instructions is not pre-fetched.
 */

static u32 *gen12_emit_preempt_busywait(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_ARB_CHECK; /* trigger IDLE->ACTIVE first */
	*cs++ = MI_SEMAPHORE_WAIT_TOKEN |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0;
	*cs++ = preempt_address(rq->engine);
	*cs++ = 0;
	*cs++ = 0;

	return cs;
}

/* Wa_14014475959:dg2 */
#define CCS_SEMAPHORE_PPHWSP_OFFSET	0x540
static u32 ccs_semaphore_offset(struct i915_request *rq)
{
	return i915_ggtt_offset(rq->context->state) +
		(LRC_PPHWSP_PN * PAGE_SIZE) + CCS_SEMAPHORE_PPHWSP_OFFSET;
}

/* Wa_14014475959:dg2 */
static u32 *ccs_emit_wa_busywait(struct i915_request *rq, u32 *cs)
{
	int i;

	*cs++ = MI_ATOMIC_INLINE | MI_ATOMIC_GLOBAL_GTT | MI_ATOMIC_CS_STALL |
		MI_ATOMIC_MOVE;
	*cs++ = ccs_semaphore_offset(rq);
	*cs++ = 0;
	*cs++ = 1;

	/*
	 * When MI_ATOMIC_INLINE_DATA is set, this command must be 11 DWs
	 * (+ 1 NOP) to align: 4 DWs above + 8 filler DWs here.
	 */
	for (i = 0; i < 8; ++i)
		*cs++ = 0;

	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_EQ_SDD;
	*cs++ = 0;
	*cs++ = ccs_semaphore_offset(rq);
	*cs++ = 0;

	return cs;
}

static __always_inline u32*
gen12_emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
{
	*cs++ = MI_USER_INTERRUPT;

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
	if (intel_engine_has_semaphores(rq->engine) &&
	    !intel_uc_uses_guc_submission(&rq->engine->gt->uc))
		cs = gen12_emit_preempt_busywait(rq, cs);

	/* Wa_14014475959:dg2 */
	if (intel_engine_uses_wa_hold_ccs_switchout(rq->engine))
		cs = ccs_emit_wa_busywait(rq, cs);

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return gen8_emit_wa_tail(rq, cs);
}

u32 *gen12_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	/* XXX Stalling flush before seqno write; post-sync not */
	cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
	return gen12_emit_fini_breadcrumb_tail(rq, cs);
}

u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	struct drm_i915_private *i915 = rq->i915;
	struct intel_gt *gt = rq->engine->gt;
	u32 flags = (PIPE_CONTROL_CS_STALL |
		     PIPE_CONTROL_TLB_INVALIDATE |
		     PIPE_CONTROL_TILE_CACHE_FLUSH |
		     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		     PIPE_CONTROL_DC_FLUSH_ENABLE |
		     PIPE_CONTROL_FLUSH_ENABLE);

	if (GRAPHICS_VER_FULL(rq->i915) < IP_VER(12, 70))
		flags |= PIPE_CONTROL_FLUSH_L3;

	/* Wa_14016712196 */
	if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(i915))
		/* dummy PIPE_CONTROL + depth flush */
		cs = gen12_emit_pipe_control(cs, 0,
					     PIPE_CONTROL_DEPTH_CACHE_FLUSH, 0);

	if (GRAPHICS_VER(i915) == 12 && GRAPHICS_VER_FULL(i915) < IP_VER(12, 50))
		/* Wa_1409600907 */
		flags |= PIPE_CONTROL_DEPTH_STALL;

	if (!HAS_3D_PIPELINE(rq->i915))
		flags &= ~PIPE_CONTROL_3D_ARCH_FLAGS;
	else if (rq->engine->class == COMPUTE_CLASS)
		flags &= ~PIPE_CONTROL_3D_ENGINE_FLAGS;

	cs = gen12_emit_pipe_control(cs, PIPE_CONTROL0_HDC_PIPELINE_FLUSH, flags, 0);

	/* XXX: Look at gen8_emit_fini_breadcrumb_rcs */
	cs = gen12_emit_ggtt_write_rcs(cs,
				       rq->fence.seqno,
				       hwsp_offset(rq),
				       0,
				       PIPE_CONTROL_FLUSH_ENABLE |
				       PIPE_CONTROL_CS_STALL);

	return gen12_emit_fini_breadcrumb_tail(rq, cs);
}