1 | // SPDX-License-Identifier: MIT |
2 | /* |
3 | * Copyright © 2020 Intel Corporation |
4 | */ |
5 | |
6 | #include "i915_drv.h" |
7 | #include "intel_context.h" |
8 | #include "intel_gpu_commands.h" |
9 | #include "intel_gt.h" |
10 | #include "intel_gtt.h" |
11 | #include "intel_migrate.h" |
12 | #include "intel_ring.h" |
13 | #include "gem/i915_gem_lmem.h" |
14 | |
15 | struct insert_pte_data { |
16 | u64 offset; |
17 | }; |
18 | |
19 | #define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */ |
20 | |
/*
 * Flat-CCS platforms carry one byte of compression control state for every
 * NUM_BYTES_PER_CCS_BYTE (256) bytes of main memory, see the Flat-CCS DOC
 * section below.
 */
#define GET_CCS_BYTES(i915, size)	(HAS_FLAT_CCS(i915) ? \
					 DIV_ROUND_UP(size, NUM_BYTES_PER_CCS_BYTE) : 0)

23 | static bool engine_supports_migration(struct intel_engine_cs *engine) |
24 | { |
25 | if (!engine) |
26 | return false; |
27 | |
28 | /* |
	 * We need the ability to prevent arbitration (MI_ARB_ON_OFF),
	 * the ability to write PTEs using inline data (MI_STORE_DATA_IMM)
31 | * and of course the ability to do the block transfer (blits). |
32 | */ |
33 | GEM_BUG_ON(engine->class != COPY_ENGINE_CLASS); |
34 | |
35 | return true; |
36 | } |
37 | |
38 | static void xehpsdv_toggle_pdes(struct i915_address_space *vm, |
39 | struct i915_page_table *pt, |
40 | void *data) |
41 | { |
42 | struct insert_pte_data *d = data; |
43 | |
44 | /* |
45 | * Insert a dummy PTE into every PT that will map to LMEM to ensure |
46 | * we have a correctly setup PDE structure for later use. |
47 | */ |
48 | vm->insert_page(vm, 0, d->offset, |
			i915_gem_get_pat_index(vm->i915, I915_CACHE_NONE),
50 | PTE_LM); |
51 | GEM_BUG_ON(!pt->is_compact); |
52 | d->offset += SZ_2M; |
53 | } |
54 | |
55 | static void xehpsdv_insert_pte(struct i915_address_space *vm, |
56 | struct i915_page_table *pt, |
57 | void *data) |
58 | { |
59 | struct insert_pte_data *d = data; |
60 | |
61 | /* |
62 | * We are playing tricks here, since the actual pt, from the hw |
	 * pov, is only 256 bytes with 32 entries, or 4096 bytes with 512
64 | * entries, but we are still guaranteed that the physical |
65 | * alignment is 64K underneath for the pt, and we are careful |
66 | * not to access the space in the void. |
67 | */ |
68 | vm->insert_page(vm, px_dma(pt), d->offset, |
			i915_gem_get_pat_index(vm->i915, I915_CACHE_NONE),
70 | PTE_LM); |
71 | d->offset += SZ_64K; |
72 | } |
73 | |
74 | static void insert_pte(struct i915_address_space *vm, |
75 | struct i915_page_table *pt, |
76 | void *data) |
77 | { |
78 | struct insert_pte_data *d = data; |
79 | |
80 | vm->insert_page(vm, px_dma(pt), d->offset, |
			i915_gem_get_pat_index(vm->i915, I915_CACHE_NONE),
			i915_gem_object_is_lmem(pt->base) ? PTE_LM : 0);
83 | d->offset += PAGE_SIZE; |
84 | } |
85 | |
86 | static struct i915_address_space *migrate_vm(struct intel_gt *gt) |
87 | { |
88 | struct i915_vm_pt_stash stash = {}; |
89 | struct i915_ppgtt *vm; |
90 | int err; |
91 | int i; |
92 | |
93 | /* |
94 | * We construct a very special VM for use by all migration contexts, |
95 | * it is kept pinned so that it can be used at any time. As we need |
96 | * to pre-allocate the page directories for the migration VM, this |
97 | * limits us to only using a small number of prepared vma. |
98 | * |
99 | * To be able to pipeline and reschedule migration operations while |
100 | * avoiding unnecessary contention on the vm itself, the PTE updates |
101 | * are inline with the blits. All the blits use the same fixed |
102 | * addresses, with the backing store redirection being updated on the |
103 | * fly. Only 2 implicit vma are used for all migration operations. |
104 | * |
105 | * We lay the ppGTT out as: |
106 | * |
107 | * [0, CHUNK_SZ) -> first object |
108 | * [CHUNK_SZ, 2 * CHUNK_SZ) -> second object |
	 * [2 * CHUNK_SZ, 2 * CHUNK_SZ + (2 * CHUNK_SZ >> 9)) -> PTE
110 | * |
111 | * By exposing the dma addresses of the page directories themselves |
112 | * within the ppGTT, we are then able to rewrite the PTE prior to use. |
113 | * But the PTE update and subsequent migration operation must be atomic, |
114 | * i.e. within the same non-preemptible window so that we do not switch |
115 | * to another migration context that overwrites the PTE. |
116 | * |
	 * This changes quite a bit on platforms with HAS_64K_PAGES support,
	 * where we instead have three windows, each CHUNK_SZ in size. The
	 * first is reserved for mapping system-memory, and that just uses the
	 * 512 entry layout using 4K GTT pages. The other two windows just map
	 * lmem pages and must use the new compact 32 entry layout using 64K GTT
	 * pages, which ensures we can address any lmem object that the user
	 * throws at us. We then also use xehpsdv_toggle_pdes() as a way of
	 * just toggling the PDE bit (GEN12_PDE_64K) for us, to enable the
	 * compact layout for each of the page tables that fall within the
	 * [CHUNK_SZ, 3 * CHUNK_SZ) range.
127 | * |
128 | * We lay the ppGTT out as: |
129 | * |
130 | * [0, CHUNK_SZ) -> first window/object, maps smem |
131 | * [CHUNK_SZ, 2 * CHUNK_SZ) -> second window/object, maps lmem src |
132 | * [2 * CHUNK_SZ, 3 * CHUNK_SZ) -> third window/object, maps lmem dst |
133 | * |
	 * For the PTE window it's also quite different, since each PTE must
	 * now point to some 64K page, one for each PT (since it's in lmem),
	 * and yet each PT is only <= 4096 bytes; but since the unused space
	 * within that PTE range is never touched, this should be fine.
138 | * |
139 | * So basically each PT now needs 64K of virtual memory, instead of 4K, |
140 | * which looks like: |
141 | * |
	 * [3 * CHUNK_SZ, 3 * CHUNK_SZ + ((3 * CHUNK_SZ / SZ_2M) * SZ_64K)) -> PTE
143 | */ |
144 | |
145 | vm = i915_ppgtt_create(gt, I915_BO_ALLOC_PM_EARLY); |
	if (IS_ERR(vm))
		return ERR_CAST(vm);
148 | |
149 | if (!vm->vm.allocate_va_range || !vm->vm.foreach) { |
150 | err = -ENODEV; |
151 | goto err_vm; |
152 | } |
153 | |
154 | if (HAS_64K_PAGES(gt->i915)) |
155 | stash.pt_sz = I915_GTT_PAGE_SIZE_64K; |
156 | |
157 | /* |
158 | * Each engine instance is assigned its own chunk in the VM, so |
159 | * that we can run multiple instances concurrently |
160 | */ |
161 | for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) { |
162 | struct intel_engine_cs *engine; |
163 | u64 base = (u64)i << 32; |
164 | struct insert_pte_data d = {}; |
165 | struct i915_gem_ww_ctx ww; |
166 | u64 sz; |
167 | |
168 | engine = gt->engine_class[COPY_ENGINE_CLASS][i]; |
169 | if (!engine_supports_migration(engine)) |
170 | continue; |
171 | |
172 | /* |
173 | * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need |
174 | * 4x2 page directories for source/destination. |
175 | */ |
176 | if (HAS_64K_PAGES(gt->i915)) |
177 | sz = 3 * CHUNK_SZ; |
178 | else |
179 | sz = 2 * CHUNK_SZ; |
180 | d.offset = base + sz; |
181 | |
182 | /* |
183 | * We need another page directory setup so that we can write |
184 | * the 8x512 PTE in each chunk. |
185 | */ |
186 | if (HAS_64K_PAGES(gt->i915)) |
187 | sz += (sz / SZ_2M) * SZ_64K; |
188 | else |
189 | sz += (sz >> 12) * sizeof(u64); |
190 | |
		err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
192 | if (err) |
193 | goto err_vm; |
194 | |
195 | for_i915_gem_ww(&ww, err, true) { |
			err = i915_vm_lock_objects(&vm->vm, &ww);
197 | if (err) |
198 | continue; |
			err = i915_vm_map_pt_stash(&vm->vm, &stash);
200 | if (err) |
201 | continue; |
202 | |
203 | vm->vm.allocate_va_range(&vm->vm, &stash, base, sz); |
204 | } |
		i915_vm_free_pt_stash(&vm->vm, &stash);
206 | if (err) |
207 | goto err_vm; |
208 | |
209 | /* Now allow the GPU to rewrite the PTE via its own ppGTT */ |
210 | if (HAS_64K_PAGES(gt->i915)) { |
211 | vm->vm.foreach(&vm->vm, base, d.offset - base, |
212 | xehpsdv_insert_pte, &d); |
213 | d.offset = base + CHUNK_SZ; |
214 | vm->vm.foreach(&vm->vm, |
215 | d.offset, |
216 | 2 * CHUNK_SZ, |
217 | xehpsdv_toggle_pdes, &d); |
218 | } else { |
219 | vm->vm.foreach(&vm->vm, base, d.offset - base, |
220 | insert_pte, &d); |
221 | } |
222 | } |
223 | |
224 | return &vm->vm; |
225 | |
226 | err_vm: |
	i915_vm_put(&vm->vm);
	return ERR_PTR(err);
229 | } |
230 | |
231 | static struct intel_engine_cs *first_copy_engine(struct intel_gt *gt) |
232 | { |
233 | struct intel_engine_cs *engine; |
234 | int i; |
235 | |
236 | for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) { |
237 | engine = gt->engine_class[COPY_ENGINE_CLASS][i]; |
238 | if (engine_supports_migration(engine)) |
239 | return engine; |
240 | } |
241 | |
242 | return NULL; |
243 | } |
244 | |
245 | static struct intel_context *pinned_context(struct intel_gt *gt) |
246 | { |
247 | static struct lock_class_key key; |
248 | struct intel_engine_cs *engine; |
249 | struct i915_address_space *vm; |
250 | struct intel_context *ce; |
251 | |
252 | engine = first_copy_engine(gt); |
253 | if (!engine) |
		return ERR_PTR(-ENODEV);
255 | |
256 | vm = migrate_vm(gt); |
	if (IS_ERR(vm))
		return ERR_CAST(vm);
259 | |
260 | ce = intel_engine_create_pinned_context(engine, vm, SZ_512K, |
261 | I915_GEM_HWS_MIGRATE, |
						&key, "migrate");
263 | i915_vm_put(vm); |
264 | return ce; |
265 | } |
266 | |
267 | int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt) |
268 | { |
269 | struct intel_context *ce; |
270 | |
271 | memset(m, 0, sizeof(*m)); |
272 | |
273 | ce = pinned_context(gt); |
	if (IS_ERR(ce))
		return PTR_ERR(ce);
276 | |
277 | m->context = ce; |
278 | return 0; |
279 | } |
280 | |
/* Return a uniformly distributed index in [0, max) via a 32x32 multiply-high */
static int random_index(unsigned int max)
282 | { |
283 | return upper_32_bits(mul_u32_u32(get_random_u32(), max)); |
284 | } |
285 | |
286 | static struct intel_context *__migrate_engines(struct intel_gt *gt) |
287 | { |
288 | struct intel_engine_cs *engines[MAX_ENGINE_INSTANCE]; |
289 | struct intel_engine_cs *engine; |
290 | unsigned int count, i; |
291 | |
292 | count = 0; |
293 | for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) { |
294 | engine = gt->engine_class[COPY_ENGINE_CLASS][i]; |
295 | if (engine_supports_migration(engine)) |
296 | engines[count++] = engine; |
297 | } |
298 | |
	return intel_context_create(engines[random_index(count)]);
300 | } |
301 | |
302 | struct intel_context *intel_migrate_create_context(struct intel_migrate *m) |
303 | { |
304 | struct intel_context *ce; |
305 | |
306 | /* |
	 * We randomly distribute contexts across the engines upon construction,
308 | * as they all share the same pinned vm, and so in order to allow |
309 | * multiple blits to run in parallel, we must construct each blit |
310 | * to use a different range of the vm for its GTT. This has to be |
311 | * known at construction, so we can not use the late greedy load |
312 | * balancing of the virtual-engine. |
313 | */ |
	ce = __migrate_engines(m->context->engine->gt);
	if (IS_ERR(ce))
316 | return ce; |
317 | |
318 | ce->ring = NULL; |
319 | ce->ring_size = SZ_256K; |
320 | |
	i915_vm_put(ce->vm);
	ce->vm = i915_vm_get(m->context->vm);
323 | |
324 | return ce; |
325 | } |
326 | |
327 | static inline struct sgt_dma sg_sgt(struct scatterlist *sg) |
328 | { |
329 | dma_addr_t addr = sg_dma_address(sg); |
330 | |
331 | return (struct sgt_dma){ sg, addr, addr + sg_dma_len(sg) }; |
332 | } |
333 | |
334 | static int emit_no_arbitration(struct i915_request *rq) |
335 | { |
336 | u32 *cs; |
337 | |
	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);
341 | |
342 | /* Explicitly disable preemption for this request. */ |
343 | *cs++ = MI_ARB_ON_OFF; |
344 | *cs++ = MI_NOOP; |
345 | intel_ring_advance(rq, cs); |
346 | |
347 | return 0; |
348 | } |
349 | |
350 | static int max_pte_pkt_size(struct i915_request *rq, int pkt) |
351 | { |
352 | struct intel_ring *ring = rq->ring; |
353 | |
354 | pkt = min_t(int, pkt, (ring->space - rq->reserved_space) / sizeof(u32) + 5); |
355 | pkt = min_t(int, pkt, (ring->size - ring->emit) / sizeof(u32) + 5); |
356 | |
357 | return pkt; |
358 | } |
359 | |
360 | #define I915_EMIT_PTE_NUM_DWORDS 6 |
361 | |
362 | static int emit_pte(struct i915_request *rq, |
363 | struct sgt_dma *it, |
364 | unsigned int pat_index, |
365 | bool is_lmem, |
366 | u64 offset, |
367 | int length) |
368 | { |
369 | bool has_64K_pages = HAS_64K_PAGES(rq->i915); |
370 | const u64 encode = rq->context->vm->pte_encode(0, pat_index, |
371 | is_lmem ? PTE_LM : 0); |
372 | struct intel_ring *ring = rq->ring; |
373 | int pkt, dword_length; |
374 | u32 total = 0; |
375 | u32 page_size; |
376 | u32 *hdr, *cs; |
377 | |
378 | GEM_BUG_ON(GRAPHICS_VER(rq->i915) < 8); |
379 | |
380 | page_size = I915_GTT_PAGE_SIZE; |
381 | dword_length = 0x400; |
382 | |
383 | /* Compute the page directory offset for the target address range */ |
384 | if (has_64K_pages) { |
385 | GEM_BUG_ON(!IS_ALIGNED(offset, SZ_2M)); |
386 | |
387 | offset /= SZ_2M; |
388 | offset *= SZ_64K; |
389 | offset += 3 * CHUNK_SZ; |
390 | |
391 | if (is_lmem) { |
392 | page_size = I915_GTT_PAGE_SIZE_64K; |
393 | dword_length = 0x40; |
394 | } |
395 | } else { |
396 | offset >>= 12; |
397 | offset *= sizeof(u64); |
398 | offset += 2 * CHUNK_SZ; |
399 | } |
400 | |
401 | offset += (u64)rq->engine->instance << 32; |
402 | |
403 | cs = intel_ring_begin(rq, I915_EMIT_PTE_NUM_DWORDS); |
	if (IS_ERR(cs))
		return PTR_ERR(cs);
406 | |
407 | /* Pack as many PTE updates as possible into a single MI command */ |
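	/*
	 * A rough sketch of what each packet built below looks like (an
	 * illustration of the code, not bspec text): a MI_STORE_DATA_IMM
	 * header in qword mode, the 64-bit GTT offset of the first PTE slot,
	 * then lo/hi dword pairs for each qword PTE, finished with an
	 * MI_NOOP; the header's length field is patched up once the number
	 * of emitted PTEs is known.
	 */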
	pkt = max_pte_pkt_size(rq, dword_length);
409 | |
410 | hdr = cs; |
411 | *cs++ = MI_STORE_DATA_IMM | REG_BIT(21); /* as qword elements */ |
412 | *cs++ = lower_32_bits(offset); |
413 | *cs++ = upper_32_bits(offset); |
414 | |
415 | do { |
416 | if (cs - hdr >= pkt) { |
417 | int dword_rem; |
418 | |
419 | *hdr += cs - hdr - 2; |
420 | *cs++ = MI_NOOP; |
421 | |
422 | ring->emit = (void *)cs - ring->vaddr; |
423 | intel_ring_advance(rq, cs); |
424 | intel_ring_update_space(ring); |
425 | |
426 | cs = intel_ring_begin(rq, I915_EMIT_PTE_NUM_DWORDS); |
			if (IS_ERR(cs))
				return PTR_ERR(cs);
429 | |
430 | dword_rem = dword_length; |
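			/*
			 * On 64K-page platforms each page table occupies a
			 * 64K-aligned slot in the PTE window (one slot per
			 * 2M of mapped range, see the layout comment in
			 * migrate_vm()), so a PTE packet must not run past
			 * a 2M boundary of the range being mapped: either
			 * jump ahead to the next 64K-aligned slot, or cap
			 * the packet at the dwords remaining up to that
			 * boundary.
			 */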
431 | if (has_64K_pages) { |
432 | if (IS_ALIGNED(total, SZ_2M)) { |
433 | offset = round_up(offset, SZ_64K); |
434 | } else { |
435 | dword_rem = SZ_2M - (total & (SZ_2M - 1)); |
436 | dword_rem /= page_size; |
437 | dword_rem *= 2; |
438 | } |
439 | } |
440 | |
			pkt = max_pte_pkt_size(rq, dword_rem);
442 | |
443 | hdr = cs; |
444 | *cs++ = MI_STORE_DATA_IMM | REG_BIT(21); |
445 | *cs++ = lower_32_bits(offset); |
446 | *cs++ = upper_32_bits(offset); |
447 | } |
448 | |
449 | GEM_BUG_ON(!IS_ALIGNED(it->dma, page_size)); |
450 | |
451 | *cs++ = lower_32_bits(encode | it->dma); |
452 | *cs++ = upper_32_bits(encode | it->dma); |
453 | |
454 | offset += 8; |
455 | total += page_size; |
456 | |
457 | it->dma += page_size; |
458 | if (it->dma >= it->max) { |
			it->sg = __sg_next(it->sg);
460 | if (!it->sg || sg_dma_len(it->sg) == 0) |
461 | break; |
462 | |
463 | it->dma = sg_dma_address(it->sg); |
464 | it->max = it->dma + sg_dma_len(it->sg); |
465 | } |
466 | } while (total < length); |
467 | |
468 | *hdr += cs - hdr - 2; |
469 | *cs++ = MI_NOOP; |
470 | |
471 | ring->emit = (void *)cs - ring->vaddr; |
472 | intel_ring_advance(rq, cs); |
473 | intel_ring_update_space(ring); |
474 | |
475 | return total; |
476 | } |
477 | |
/*
 * Wa_1209644611: on graphics version 11 the fast-copy blit must be avoided
 * for certain small copy heights; emit_copy() falls back to XY_SRC_COPY_BLT
 * in that case.
 */
static bool wa_1209644611_applies(int ver, u32 size)
479 | { |
480 | u32 height = size >> PAGE_SHIFT; |
481 | |
482 | if (ver != 11) |
483 | return false; |
484 | |
485 | return height % 4 == 3 && height <= 8; |
486 | } |
487 | |
488 | /** |
489 | * DOC: Flat-CCS - Memory compression for Local memory |
490 | * |
491 | * On Xe-HP and later devices, we use dedicated compression control state (CCS) |
492 | * stored in local memory for each surface, to support the 3D and media |
493 | * compression formats. |
494 | * |
495 | * The memory required for the CCS of the entire local memory is 1/256 of the |
 * local memory size. So before the kernel boots, the required memory is reserved
497 | * for the CCS data and a secure register will be programmed with the CCS base |
498 | * address. |
499 | * |
500 | * Flat CCS data needs to be cleared when a lmem object is allocated. |
501 | * And CCS data can be copied in and out of CCS region through |
502 | * XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly. |
503 | * |
 * I915 supports Flat-CCS only on lmem-only objects. When an object has smem in
 * its preference list, then on memory pressure i915 needs to migrate the lmem
 * content into smem. If the lmem object is Flat-CCS compressed by userspace,
 * then i915 would need to decompress it, but i915 lacks the required
 * information for such decompression. Hence i915 supports Flat-CCS only on
 * lmem-only objects.
509 | * |
510 | * When we exhaust the lmem, Flat-CCS capable objects' lmem backing memory can |
511 | * be temporarily evicted to smem, along with the auxiliary CCS state, where |
512 | * it can be potentially swapped-out at a later point, if required. |
513 | * If userspace later touches the evicted pages, then we always move |
514 | * the backing memory back to lmem, which includes restoring the saved CCS state, |
515 | * and potentially performing any required swap-in. |
516 | * |
 * For the migration of lmem objects with smem in the placement list, such as
 * {lmem, smem}, the objects are treated as non Flat-CCS capable objects.
519 | */ |
520 | |
521 | static inline u32 *i915_flush_dw(u32 *cmd, u32 flags) |
522 | { |
523 | *cmd++ = MI_FLUSH_DW | flags; |
524 | *cmd++ = 0; |
525 | *cmd++ = 0; |
526 | |
527 | return cmd; |
528 | } |
529 | |
530 | static int emit_copy_ccs(struct i915_request *rq, |
531 | u32 dst_offset, u8 dst_access, |
532 | u32 src_offset, u8 src_access, int size) |
533 | { |
534 | struct drm_i915_private *i915 = rq->i915; |
535 | int mocs = rq->engine->gt->mocs.uc_index << 1; |
536 | u32 num_ccs_blks; |
537 | u32 *cs; |
538 | |
	cs = intel_ring_begin(rq, 12);
	if (IS_ERR(cs))
		return PTR_ERR(cs);
542 | |
543 | num_ccs_blks = DIV_ROUND_UP(GET_CCS_BYTES(i915, size), |
544 | NUM_CCS_BYTES_PER_BLOCK); |
545 | GEM_BUG_ON(num_ccs_blks > NUM_CCS_BLKS_PER_XFER); |
	cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);
547 | |
548 | /* |
549 | * The XY_CTRL_SURF_COPY_BLT instruction is used to copy the CCS |
550 | * data in and out of the CCS region. |
551 | * |
552 | * We can copy at most 1024 blocks of 256 bytes using one |
553 | * XY_CTRL_SURF_COPY_BLT instruction. |
554 | * |
555 | * In case we need to copy more than 1024 blocks, we need to add |
556 | * another instruction to the same batch buffer. |
557 | * |
558 | * 1024 blocks of 256 bytes of CCS represent a total 256KB of CCS. |
559 | * |
560 | * 256 KB of CCS represents 256 * 256 KB = 64 MB of LMEM. |
561 | */ |
562 | *cs++ = XY_CTRL_SURF_COPY_BLT | |
563 | src_access << SRC_ACCESS_TYPE_SHIFT | |
564 | dst_access << DST_ACCESS_TYPE_SHIFT | |
565 | ((num_ccs_blks - 1) & CCS_SIZE_MASK) << CCS_SIZE_SHIFT; |
566 | *cs++ = src_offset; |
567 | *cs++ = rq->engine->instance | |
568 | FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs); |
569 | *cs++ = dst_offset; |
570 | *cs++ = rq->engine->instance | |
571 | FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs); |
572 | |
	cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);
574 | *cs++ = MI_NOOP; |
575 | |
576 | intel_ring_advance(rq, cs); |
577 | |
578 | return 0; |
579 | } |
580 | |
581 | static int emit_copy(struct i915_request *rq, |
582 | u32 dst_offset, u32 src_offset, int size) |
583 | { |
584 | const int ver = GRAPHICS_VER(rq->i915); |
585 | u32 instance = rq->engine->instance; |
586 | u32 *cs; |
587 | |
	cs = intel_ring_begin(rq, ver >= 8 ? 10 : 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);
591 | |
592 | if (ver >= 9 && !wa_1209644611_applies(ver, size)) { |
593 | *cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2); |
594 | *cs++ = BLT_DEPTH_32 | PAGE_SIZE; |
595 | *cs++ = 0; |
596 | *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4; |
597 | *cs++ = dst_offset; |
598 | *cs++ = instance; |
599 | *cs++ = 0; |
600 | *cs++ = PAGE_SIZE; |
601 | *cs++ = src_offset; |
602 | *cs++ = instance; |
603 | } else if (ver >= 8) { |
604 | *cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2); |
605 | *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE; |
606 | *cs++ = 0; |
607 | *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4; |
608 | *cs++ = dst_offset; |
609 | *cs++ = instance; |
610 | *cs++ = 0; |
611 | *cs++ = PAGE_SIZE; |
612 | *cs++ = src_offset; |
613 | *cs++ = instance; |
614 | } else { |
615 | GEM_BUG_ON(instance); |
616 | *cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2); |
617 | *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE; |
618 | *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE; |
619 | *cs++ = dst_offset; |
620 | *cs++ = PAGE_SIZE; |
621 | *cs++ = src_offset; |
622 | } |
623 | |
624 | intel_ring_advance(rq, cs); |
625 | return 0; |
626 | } |
627 | |
628 | static u64 scatter_list_length(struct scatterlist *sg) |
629 | { |
630 | u64 len = 0; |
631 | |
632 | while (sg && sg_dma_len(sg)) { |
633 | len += sg_dma_len(sg); |
634 | sg = sg_next(sg); |
635 | } |
636 | |
637 | return len; |
638 | } |
639 | |
640 | static int |
641 | calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem, |
642 | u64 bytes_to_cpy, u64 ccs_bytes_to_cpy) |
643 | { |
644 | if (ccs_bytes_to_cpy && !src_is_lmem) |
645 | /* |
		 * When CHUNK_SZ is passed, all the pages up to CHUNK_SZ
		 * will be taken for the blit. On Flat-CCS capable
		 * platforms the smem object will have more pages than
		 * required for main memory, hence limit it to the
		 * required size for main memory.
651 | */ |
652 | return min_t(u64, bytes_to_cpy, CHUNK_SZ); |
653 | else |
654 | return CHUNK_SZ; |
655 | } |
656 | |
657 | static void get_ccs_sg_sgt(struct sgt_dma *it, u64 bytes_to_cpy) |
658 | { |
659 | u64 len; |
660 | |
661 | do { |
662 | GEM_BUG_ON(!it->sg || !sg_dma_len(it->sg)); |
663 | len = it->max - it->dma; |
664 | if (len > bytes_to_cpy) { |
665 | it->dma += bytes_to_cpy; |
666 | break; |
667 | } |
668 | |
669 | bytes_to_cpy -= len; |
670 | |
		it->sg = __sg_next(it->sg);
672 | it->dma = sg_dma_address(it->sg); |
673 | it->max = it->dma + sg_dma_len(it->sg); |
674 | } while (bytes_to_cpy); |
675 | } |
676 | |
677 | int |
678 | intel_context_migrate_copy(struct intel_context *ce, |
679 | const struct i915_deps *deps, |
680 | struct scatterlist *src, |
681 | unsigned int src_pat_index, |
682 | bool src_is_lmem, |
683 | struct scatterlist *dst, |
684 | unsigned int dst_pat_index, |
685 | bool dst_is_lmem, |
686 | struct i915_request **out) |
687 | { |
	struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst), it_ccs;
689 | struct drm_i915_private *i915 = ce->engine->i915; |
690 | u64 ccs_bytes_to_cpy = 0, bytes_to_cpy; |
691 | unsigned int ccs_pat_index; |
692 | u32 src_offset, dst_offset; |
693 | u8 src_access, dst_access; |
694 | struct i915_request *rq; |
695 | u64 src_sz, dst_sz; |
696 | bool ccs_is_src, overwrite_ccs; |
697 | int err; |
698 | |
699 | GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm); |
700 | GEM_BUG_ON(IS_DGFX(ce->engine->i915) && (!src_is_lmem && !dst_is_lmem)); |
701 | *out = NULL; |
702 | |
703 | GEM_BUG_ON(ce->ring->size < SZ_64K); |
704 | |
	src_sz = scatter_list_length(src);
706 | bytes_to_cpy = src_sz; |
707 | |
708 | if (HAS_FLAT_CCS(i915) && src_is_lmem ^ dst_is_lmem) { |
709 | src_access = !src_is_lmem && dst_is_lmem; |
710 | dst_access = !src_access; |
711 | |
		dst_sz = scatter_list_length(dst);
713 | if (src_is_lmem) { |
714 | it_ccs = it_dst; |
715 | ccs_pat_index = dst_pat_index; |
716 | ccs_is_src = false; |
717 | } else if (dst_is_lmem) { |
718 | bytes_to_cpy = dst_sz; |
719 | it_ccs = it_src; |
720 | ccs_pat_index = src_pat_index; |
721 | ccs_is_src = true; |
722 | } |
723 | |
724 | /* |
		 * When an eviction of the CCS data is needed, smem will have
		 * the extra pages for the CCS data.
727 | * |
728 | * TO-DO: Want to move the size mismatch check to a WARN_ON, |
729 | * but still we have some requests of smem->lmem with same size. |
730 | * Need to fix it. |
731 | */ |
732 | ccs_bytes_to_cpy = src_sz != dst_sz ? GET_CCS_BYTES(i915, bytes_to_cpy) : 0; |
733 | if (ccs_bytes_to_cpy) |
			get_ccs_sg_sgt(&it_ccs, bytes_to_cpy);
735 | } |
736 | |
737 | overwrite_ccs = HAS_FLAT_CCS(i915) && !ccs_bytes_to_cpy && dst_is_lmem; |
738 | |
739 | src_offset = 0; |
740 | dst_offset = CHUNK_SZ; |
741 | if (HAS_64K_PAGES(ce->engine->i915)) { |
742 | src_offset = 0; |
743 | dst_offset = 0; |
744 | if (src_is_lmem) |
745 | src_offset = CHUNK_SZ; |
746 | if (dst_is_lmem) |
747 | dst_offset = 2 * CHUNK_SZ; |
748 | } |
749 | |
750 | do { |
751 | int len; |
752 | |
753 | rq = i915_request_create(ce); |
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
756 | goto out_ce; |
757 | } |
758 | |
759 | if (deps) { |
760 | err = i915_request_await_deps(rq, deps); |
761 | if (err) |
762 | goto out_rq; |
763 | |
764 | if (rq->engine->emit_init_breadcrumb) { |
765 | err = rq->engine->emit_init_breadcrumb(rq); |
766 | if (err) |
767 | goto out_rq; |
768 | } |
769 | |
770 | deps = NULL; |
771 | } |
772 | |
773 | /* The PTE updates + copy must not be interrupted. */ |
774 | err = emit_no_arbitration(rq); |
775 | if (err) |
776 | goto out_rq; |
777 | |
778 | src_sz = calculate_chunk_sz(i915, src_is_lmem, |
779 | bytes_to_cpy, ccs_bytes_to_cpy); |
780 | |
		len = emit_pte(rq, &it_src, src_pat_index, src_is_lmem,
			       src_offset, src_sz);
783 | if (!len) { |
784 | err = -EINVAL; |
785 | goto out_rq; |
786 | } |
787 | if (len < 0) { |
788 | err = len; |
789 | goto out_rq; |
790 | } |
791 | |
		err = emit_pte(rq, &it_dst, dst_pat_index, dst_is_lmem,
			       dst_offset, len);
794 | if (err < 0) |
795 | goto out_rq; |
796 | if (err < len) { |
797 | err = -EINVAL; |
798 | goto out_rq; |
799 | } |
800 | |
801 | err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); |
802 | if (err) |
803 | goto out_rq; |
804 | |
		err = emit_copy(rq, dst_offset, src_offset, len);
806 | if (err) |
807 | goto out_rq; |
808 | |
809 | bytes_to_cpy -= len; |
810 | |
811 | if (ccs_bytes_to_cpy) { |
812 | int ccs_sz; |
813 | |
814 | err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); |
815 | if (err) |
816 | goto out_rq; |
817 | |
818 | ccs_sz = GET_CCS_BYTES(i915, len); |
			err = emit_pte(rq, &it_ccs, ccs_pat_index, false,
				       ccs_is_src ? src_offset : dst_offset,
				       ccs_sz);
822 | if (err < 0) |
823 | goto out_rq; |
824 | if (err < ccs_sz) { |
825 | err = -EINVAL; |
826 | goto out_rq; |
827 | } |
828 | |
829 | err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); |
830 | if (err) |
831 | goto out_rq; |
832 | |
			err = emit_copy_ccs(rq, dst_offset, dst_access,
					    src_offset, src_access, len);
835 | if (err) |
836 | goto out_rq; |
837 | |
838 | err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); |
839 | if (err) |
840 | goto out_rq; |
841 | ccs_bytes_to_cpy -= ccs_sz; |
842 | } else if (overwrite_ccs) { |
843 | err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); |
844 | if (err) |
845 | goto out_rq; |
846 | |
847 | if (src_is_lmem) { |
848 | /* |
849 | * If the src is already in lmem, then we must |
850 | * be doing an lmem -> lmem transfer, and so |
851 | * should be safe to directly copy the CCS |
852 | * state. In this case we have either |
853 | * initialised the CCS aux state when first |
854 | * clearing the pages (since it is already |
855 | * allocated in lmem), or the user has |
856 | * potentially populated it, in which case we |
857 | * need to copy the CCS state as-is. |
858 | */ |
				err = emit_copy_ccs(rq,
						    dst_offset, INDIRECT_ACCESS,
						    src_offset, INDIRECT_ACCESS,
						    len);
863 | } else { |
864 | /* |
865 | * While we can't always restore/manage the CCS |
866 | * state, we still need to ensure we don't leak |
867 | * the CCS state from the previous user, so make |
868 | * sure we overwrite it with something. |
869 | */ |
				err = emit_copy_ccs(rq,
						    dst_offset, INDIRECT_ACCESS,
						    dst_offset, DIRECT_ACCESS,
						    len);
874 | } |
875 | |
876 | if (err) |
877 | goto out_rq; |
878 | |
879 | err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); |
880 | if (err) |
881 | goto out_rq; |
882 | } |
883 | |
884 | /* Arbitration is re-enabled between requests. */ |
885 | out_rq: |
886 | if (*out) |
			i915_request_put(*out);
888 | *out = i915_request_get(rq); |
889 | i915_request_add(rq); |
890 | |
891 | if (err) |
892 | break; |
893 | |
894 | if (!bytes_to_cpy && !ccs_bytes_to_cpy) { |
895 | if (src_is_lmem) |
896 | WARN_ON(it_src.sg && sg_dma_len(it_src.sg)); |
897 | else |
898 | WARN_ON(it_dst.sg && sg_dma_len(it_dst.sg)); |
899 | break; |
900 | } |
901 | |
902 | if (WARN_ON(!it_src.sg || !sg_dma_len(it_src.sg) || |
903 | !it_dst.sg || !sg_dma_len(it_dst.sg) || |
904 | (ccs_bytes_to_cpy && (!it_ccs.sg || |
905 | !sg_dma_len(it_ccs.sg))))) { |
906 | err = -EINVAL; |
907 | break; |
908 | } |
909 | |
910 | cond_resched(); |
911 | } while (1); |
912 | |
913 | out_ce: |
914 | return err; |
915 | } |
916 | |
917 | static int emit_clear(struct i915_request *rq, u32 offset, int size, |
918 | u32 value, bool is_lmem) |
919 | { |
920 | struct drm_i915_private *i915 = rq->i915; |
921 | int mocs = rq->engine->gt->mocs.uc_index << 1; |
922 | const int ver = GRAPHICS_VER(i915); |
923 | int ring_sz; |
924 | u32 *cs; |
925 | |
926 | GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX); |
927 | |
928 | if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) |
929 | ring_sz = XY_FAST_COLOR_BLT_DW; |
930 | else if (ver >= 8) |
931 | ring_sz = 8; |
932 | else |
933 | ring_sz = 6; |
934 | |
	cs = intel_ring_begin(rq, ring_sz);
	if (IS_ERR(cs))
		return PTR_ERR(cs);
938 | |
939 | if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { |
940 | *cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 | |
941 | (XY_FAST_COLOR_BLT_DW - 2); |
942 | *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) | |
943 | (PAGE_SIZE - 1); |
944 | *cs++ = 0; |
945 | *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4; |
946 | *cs++ = offset; |
947 | *cs++ = rq->engine->instance; |
948 | *cs++ = !is_lmem << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT; |
949 | /* BG7 */ |
950 | *cs++ = value; |
951 | *cs++ = 0; |
952 | *cs++ = 0; |
953 | *cs++ = 0; |
954 | /* BG11 */ |
955 | *cs++ = 0; |
956 | *cs++ = 0; |
957 | /* BG13 */ |
958 | *cs++ = 0; |
959 | *cs++ = 0; |
960 | *cs++ = 0; |
961 | } else if (ver >= 8) { |
962 | *cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2); |
963 | *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE; |
964 | *cs++ = 0; |
965 | *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4; |
966 | *cs++ = offset; |
967 | *cs++ = rq->engine->instance; |
968 | *cs++ = value; |
969 | *cs++ = MI_NOOP; |
970 | } else { |
971 | *cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (6 - 2); |
972 | *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE; |
973 | *cs++ = 0; |
974 | *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4; |
975 | *cs++ = offset; |
976 | *cs++ = value; |
977 | } |
978 | |
979 | intel_ring_advance(rq, cs); |
980 | return 0; |
981 | } |
982 | |
983 | int |
984 | intel_context_migrate_clear(struct intel_context *ce, |
985 | const struct i915_deps *deps, |
986 | struct scatterlist *sg, |
987 | unsigned int pat_index, |
988 | bool is_lmem, |
989 | u32 value, |
990 | struct i915_request **out) |
991 | { |
992 | struct drm_i915_private *i915 = ce->engine->i915; |
993 | struct sgt_dma it = sg_sgt(sg); |
994 | struct i915_request *rq; |
995 | u32 offset; |
996 | int err; |
997 | |
998 | GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm); |
999 | *out = NULL; |
1000 | |
1001 | GEM_BUG_ON(ce->ring->size < SZ_64K); |
1002 | |
1003 | offset = 0; |
1004 | if (HAS_64K_PAGES(i915) && is_lmem) |
1005 | offset = CHUNK_SZ; |
1006 | |
1007 | do { |
1008 | int len; |
1009 | |
1010 | rq = i915_request_create(ce); |
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
1013 | goto out_ce; |
1014 | } |
1015 | |
1016 | if (deps) { |
1017 | err = i915_request_await_deps(rq, deps); |
1018 | if (err) |
1019 | goto out_rq; |
1020 | |
1021 | if (rq->engine->emit_init_breadcrumb) { |
1022 | err = rq->engine->emit_init_breadcrumb(rq); |
1023 | if (err) |
1024 | goto out_rq; |
1025 | } |
1026 | |
1027 | deps = NULL; |
1028 | } |
1029 | |
1030 | /* The PTE updates + clear must not be interrupted. */ |
1031 | err = emit_no_arbitration(rq); |
1032 | if (err) |
1033 | goto out_rq; |
1034 | |
		len = emit_pte(rq, &it, pat_index, is_lmem, offset, CHUNK_SZ);
1036 | if (len <= 0) { |
1037 | err = len; |
1038 | goto out_rq; |
1039 | } |
1040 | |
1041 | err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); |
1042 | if (err) |
1043 | goto out_rq; |
1044 | |
		err = emit_clear(rq, offset, len, value, is_lmem);
1046 | if (err) |
1047 | goto out_rq; |
1048 | |
1049 | if (HAS_FLAT_CCS(i915) && is_lmem && !value) { |
1050 | /* |
1051 | * copy the content of memory into corresponding |
1052 | * ccs surface |
1053 | */ |
1054 | err = emit_copy_ccs(rq, dst_offset: offset, INDIRECT_ACCESS, src_offset: offset, |
1055 | DIRECT_ACCESS, size: len); |
1056 | if (err) |
1057 | goto out_rq; |
1058 | } |
1059 | |
1060 | err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); |
1061 | |
1062 | /* Arbitration is re-enabled between requests. */ |
1063 | out_rq: |
1064 | if (*out) |
			i915_request_put(*out);
1066 | *out = i915_request_get(rq); |
1067 | i915_request_add(rq); |
1068 | if (err || !it.sg || !sg_dma_len(it.sg)) |
1069 | break; |
1070 | |
1071 | cond_resched(); |
1072 | } while (1); |
1073 | |
1074 | out_ce: |
1075 | return err; |
1076 | } |
1077 | |
1078 | int intel_migrate_copy(struct intel_migrate *m, |
1079 | struct i915_gem_ww_ctx *ww, |
1080 | const struct i915_deps *deps, |
1081 | struct scatterlist *src, |
1082 | unsigned int src_pat_index, |
1083 | bool src_is_lmem, |
1084 | struct scatterlist *dst, |
1085 | unsigned int dst_pat_index, |
1086 | bool dst_is_lmem, |
1087 | struct i915_request **out) |
1088 | { |
1089 | struct intel_context *ce; |
1090 | int err; |
1091 | |
1092 | *out = NULL; |
1093 | if (!m->context) |
1094 | return -ENODEV; |
1095 | |
1096 | ce = intel_migrate_create_context(m); |
	if (IS_ERR(ce))
		ce = intel_context_get(m->context);
1099 | GEM_BUG_ON(IS_ERR(ce)); |
1100 | |
1101 | err = intel_context_pin_ww(ce, ww); |
1102 | if (err) |
1103 | goto out; |
1104 | |
1105 | err = intel_context_migrate_copy(ce, deps, |
1106 | src, src_pat_index, src_is_lmem, |
1107 | dst, dst_pat_index, dst_is_lmem, |
1108 | out); |
1109 | |
1110 | intel_context_unpin(ce); |
1111 | out: |
1112 | intel_context_put(ce); |
1113 | return err; |
1114 | } |
1115 | |
1116 | int |
1117 | intel_migrate_clear(struct intel_migrate *m, |
1118 | struct i915_gem_ww_ctx *ww, |
1119 | const struct i915_deps *deps, |
1120 | struct scatterlist *sg, |
1121 | unsigned int pat_index, |
1122 | bool is_lmem, |
1123 | u32 value, |
1124 | struct i915_request **out) |
1125 | { |
1126 | struct intel_context *ce; |
1127 | int err; |
1128 | |
1129 | *out = NULL; |
1130 | if (!m->context) |
1131 | return -ENODEV; |
1132 | |
1133 | ce = intel_migrate_create_context(m); |
	if (IS_ERR(ce))
		ce = intel_context_get(m->context);
1136 | GEM_BUG_ON(IS_ERR(ce)); |
1137 | |
1138 | err = intel_context_pin_ww(ce, ww); |
1139 | if (err) |
1140 | goto out; |
1141 | |
1142 | err = intel_context_migrate_clear(ce, deps, sg, pat_index, |
1143 | is_lmem, value, out); |
1144 | |
1145 | intel_context_unpin(ce); |
1146 | out: |
1147 | intel_context_put(ce); |
1148 | return err; |
1149 | } |
1150 | |
1151 | void intel_migrate_fini(struct intel_migrate *m) |
1152 | { |
1153 | struct intel_context *ce; |
1154 | |
1155 | ce = fetch_and_zero(&m->context); |
1156 | if (!ce) |
1157 | return; |
1158 | |
1159 | intel_engine_destroy_pinned_context(ce); |
1160 | } |
1161 | |
1162 | #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) |
1163 | #include "selftest_migrate.c" |
1164 | #endif |
1165 | |