1 | // SPDX-License-Identifier: MIT |
2 | /* |
3 | * Copyright © 2014 Intel Corporation |
4 | */ |
5 | |
6 | #include "gem/i915_gem_lmem.h" |
7 | |
8 | #include "gen8_engine_cs.h" |
9 | #include "i915_drv.h" |
10 | #include "i915_perf.h" |
11 | #include "i915_reg.h" |
12 | #include "intel_context.h" |
13 | #include "intel_engine.h" |
14 | #include "intel_engine_regs.h" |
15 | #include "intel_gpu_commands.h" |
16 | #include "intel_gt.h" |
17 | #include "intel_gt_regs.h" |
18 | #include "intel_lrc.h" |
19 | #include "intel_lrc_reg.h" |
20 | #include "intel_ring.h" |
21 | #include "shmem_utils.h" |
22 | |
23 | /* |
24 | * The per-platform tables are u8-encoded in @data. Decode @data and set the |
25 | * addresses' offset and commands in @regs. The following encoding is used |
26 | * for each byte. There are 2 steps: decoding commands and decoding addresses. |
27 | * |
28 | * Commands: |
29 | * [7]: create NOPs - number of NOPs are set in lower bits |
30 | * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set |
31 | * MI_LRI_FORCE_POSTED |
32 | * [5:0]: Number of NOPs or registers to set values to in case of |
33 | * MI_LOAD_REGISTER_IMM |
34 | * |
35 | * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count" |
36 | * number of registers. They are set by using the REG/REG16 macros: the former |
37 | * is used for offsets smaller than 0x200 while the latter is for values bigger |
38 | * than that. Those macros already set all the bits documented below correctly: |
39 | * |
40 | * [7]: When a register offset needs more than 6 bits, use additional bytes, to |
41 | * follow, for the lower bits |
42 | * [6:0]: Register offset, without considering the engine base. |
43 | * |
44 | * This function only tweaks the commands and register offsets. Values are not |
45 | * filled out. |
46 | */ |
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	/* A zero byte (END) terminates the table. */
	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			/* NOP(n): leave n dwords of the image untouched */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		/* LRI command byte: [5:0] register count, [6] POSTED flag */
		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			/*
			 * Variable-length offset: each byte supplies 7 bits,
			 * BIT(7) marks that another (lower) byte follows.
			 */
			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			/* Write the register address; skip its value slot. */
			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= BIT(0);
	}
}
105 | |
/* Context-image register layout for gen8 non-render (xCS) engines. */
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};
140 | |
/* Context-image register layout for gen9 non-render (xCS) engines. */
static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};
224 | |
/* Context-image register layout for gen12 non-render (xCS) engines. */
static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};
256 | |
/* Context-image register layout for DG2 non-render (xCS) engines. */
static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};
290 | |
/* Context-image register layout for the gen8 render (RCS) engine. */
static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};
327 | |
/* Context-image register layout for the gen9 render (RCS) engine. */
static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};
411 | |
/* Context-image register layout for the gen11 render (RCS) engine. */
static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};
452 | |
/* Context-image register layout for the gen12 render (RCS) engine. */
static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};
548 | |
/* Context-image register layout for the XeHP render (RCS) engine. */
static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};
589 | |
/* Context-image register layout for the DG2 render (RCS) engine. */
static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};
632 | |
/* Context-image register layout for the MTL render (RCS) engine. */
static const u8 mtl_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(2),
	LRI(2, POSTED),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};
675 | |
676 | #undef END |
677 | #undef REG16 |
678 | #undef REG |
679 | #undef LRI |
680 | #undef NOP |
681 | |
682 | static const u8 *reg_offsets(const struct intel_engine_cs *engine) |
683 | { |
684 | /* |
685 | * The gen12+ lists only have the registers we program in the basic |
686 | * default state. We rely on the context image using relative |
687 | * addressing to automatic fixup the register state between the |
688 | * physical engines for virtual engine. |
689 | */ |
690 | GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 && |
691 | !intel_engine_has_relative_mmio(engine)); |
692 | |
693 | if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) { |
694 | if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70)) |
695 | return mtl_rcs_offsets; |
696 | else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) |
697 | return dg2_rcs_offsets; |
698 | else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50)) |
699 | return xehp_rcs_offsets; |
700 | else if (GRAPHICS_VER(engine->i915) >= 12) |
701 | return gen12_rcs_offsets; |
702 | else if (GRAPHICS_VER(engine->i915) >= 11) |
703 | return gen11_rcs_offsets; |
704 | else if (GRAPHICS_VER(engine->i915) >= 9) |
705 | return gen9_rcs_offsets; |
706 | else |
707 | return gen8_rcs_offsets; |
708 | } else { |
709 | if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55)) |
710 | return dg2_xcs_offsets; |
711 | else if (GRAPHICS_VER(engine->i915) >= 12) |
712 | return gen12_xcs_offsets; |
713 | else if (GRAPHICS_VER(engine->i915) >= 9) |
714 | return gen9_xcs_offsets; |
715 | else |
716 | return gen8_xcs_offsets; |
717 | } |
718 | } |
719 | |
720 | static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) |
721 | { |
722 | if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50)) |
723 | return 0x70; |
724 | else if (GRAPHICS_VER(engine->i915) >= 12) |
725 | return 0x60; |
726 | else if (GRAPHICS_VER(engine->i915) >= 9) |
727 | return 0x54; |
728 | else if (engine->class == RENDER_CLASS) |
729 | return 0x58; |
730 | else |
731 | return -1; |
732 | } |
733 | |
734 | static int lrc_ring_bb_offset(const struct intel_engine_cs *engine) |
735 | { |
736 | if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50)) |
737 | return 0x80; |
738 | else if (GRAPHICS_VER(engine->i915) >= 12) |
739 | return 0x70; |
740 | else if (GRAPHICS_VER(engine->i915) >= 9) |
741 | return 0x64; |
742 | else if (GRAPHICS_VER(engine->i915) >= 8 && |
743 | engine->class == RENDER_CLASS) |
744 | return 0xc4; |
745 | else |
746 | return -1; |
747 | } |
748 | |
749 | static int lrc_ring_gpr0(const struct intel_engine_cs *engine) |
750 | { |
751 | if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50)) |
752 | return 0x84; |
753 | else if (GRAPHICS_VER(engine->i915) >= 12) |
754 | return 0x74; |
755 | else if (GRAPHICS_VER(engine->i915) >= 9) |
756 | return 0x68; |
757 | else if (engine->class == RENDER_CLASS) |
758 | return 0xd8; |
759 | else |
760 | return -1; |
761 | } |
762 | |
763 | static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine) |
764 | { |
765 | if (GRAPHICS_VER(engine->i915) >= 12) |
766 | return 0x12; |
767 | else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS) |
768 | return 0x18; |
769 | else |
770 | return -1; |
771 | } |
772 | |
/*
 * Dword index of the INDIRECT_CTX pointer: it sits one (reg, value) pair
 * after the per-context workaround batch-buffer entry. Propagates -1 when
 * that entry is absent.
 */
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int base = lrc_ring_wa_bb_per_ctx(engine);

	return base < 0 ? base : base + 2;
}
783 | |
/*
 * Dword index of the INDIRECT_CTX_OFFSET entry: one (reg, value) pair
 * after the INDIRECT_CTX pointer. Propagates -1 when that is absent.
 */
static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int base = lrc_ring_indirect_ptr(engine);

	return base < 0 ? base : base + 2;
}
794 | |
795 | static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine) |
796 | { |
797 | |
798 | if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50)) |
799 | /* |
800 | * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL |
801 | * simply to match the RCS context image layout. |
802 | */ |
803 | return 0xc6; |
804 | else if (engine->class != RENDER_CLASS) |
805 | return -1; |
806 | else if (GRAPHICS_VER(engine->i915) >= 12) |
807 | return 0xb6; |
808 | else if (GRAPHICS_VER(engine->i915) >= 11) |
809 | return 0xaa; |
810 | else |
811 | return -1; |
812 | } |
813 | |
814 | static u32 |
815 | lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine) |
816 | { |
817 | if (GRAPHICS_VER(engine->i915) >= 12) |
818 | return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; |
819 | else if (GRAPHICS_VER(engine->i915) >= 11) |
820 | return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; |
821 | else if (GRAPHICS_VER(engine->i915) >= 9) |
822 | return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; |
823 | else if (GRAPHICS_VER(engine->i915) >= 8) |
824 | return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; |
825 | |
826 | GEM_BUG_ON(GRAPHICS_VER(engine->i915) < 8); |
827 | |
828 | return 0; |
829 | } |
830 | |
831 | static void |
832 | lrc_setup_bb_per_ctx(u32 *regs, |
833 | const struct intel_engine_cs *engine, |
834 | u32 ctx_bb_ggtt_addr) |
835 | { |
836 | GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); |
837 | regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = |
838 | ctx_bb_ggtt_addr | |
839 | PER_CTX_BB_FORCE | |
840 | PER_CTX_BB_VALID; |
841 | } |
842 | |
843 | static void |
844 | lrc_setup_indirect_ctx(u32 *regs, |
845 | const struct intel_engine_cs *engine, |
846 | u32 ctx_bb_ggtt_addr, |
847 | u32 size) |
848 | { |
849 | GEM_BUG_ON(!size); |
850 | GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES)); |
851 | GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1); |
852 | regs[lrc_ring_indirect_ptr(engine) + 1] = |
853 | ctx_bb_ggtt_addr | (size / CACHELINE_BYTES); |
854 | |
855 | GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1); |
856 | regs[lrc_ring_indirect_offset(engine) + 1] = |
857 | lrc_ring_indirect_offset_default(engine) << 6; |
858 | } |
859 | |
860 | static bool ctx_needs_runalone(const struct intel_context *ce) |
861 | { |
862 | struct i915_gem_context *gem_ctx; |
863 | bool ctx_is_protected = false; |
864 | |
865 | /* |
866 | * On MTL and newer platforms, protected contexts require setting |
867 | * the LRC run-alone bit or else the encryption will not happen. |
868 | */ |
869 | if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) && |
870 | (ce->engine->class == COMPUTE_CLASS || ce->engine->class == RENDER_CLASS)) { |
871 | rcu_read_lock(); |
872 | gem_ctx = rcu_dereference(ce->gem_context); |
873 | if (gem_ctx) |
874 | ctx_is_protected = gem_ctx->uses_protected_content; |
875 | rcu_read_unlock(); |
876 | } |
877 | |
878 | return ctx_is_protected; |
879 | } |
880 | |
/*
 * Fill in the engine-independent registers of the context image:
 * CTX_CONTEXT_CONTROL, the saved context timestamp and (where present)
 * the batch-buffer offset slot.
 */
static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;
	int loc;

	/* Masked-write register: high half selects which bits take effect. */
	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (GRAPHICS_VER(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	if (ctx_needs_runalone(ce))
		ctl |= _MASKED_BIT_ENABLE(GEN12_CTX_CTRL_RUNALONE_MODE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	/* Carry the accumulated runtime over from the previous image. */
	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;

	loc = lrc_ring_bb_offset(engine);
	if (loc != -1)
		regs[loc + 1] = 0;
}
906 | |
907 | static void init_wa_bb_regs(u32 * const regs, |
908 | const struct intel_engine_cs *engine) |
909 | { |
910 | const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; |
911 | |
912 | if (wa_ctx->per_ctx.size) { |
913 | const u32 ggtt_offset = i915_ggtt_offset(vma: wa_ctx->vma); |
914 | |
915 | GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); |
916 | regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = |
917 | (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; |
918 | } |
919 | |
920 | if (wa_ctx->indirect_ctx.size) { |
921 | lrc_setup_indirect_ctx(regs, engine, |
922 | ctx_bb_ggtt_addr: i915_ggtt_offset(vma: wa_ctx->vma) + |
923 | wa_ctx->indirect_ctx.offset, |
924 | size: wa_ctx->indirect_ctx.size); |
925 | } |
926 | } |
927 | |
928 | static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt) |
929 | { |
930 | if (i915_vm_is_4lvl(vm: &ppgtt->vm)) { |
931 | /* 64b PPGTT (48bit canonical) |
932 | * PDP0_DESCRIPTOR contains the base address to PML4 and |
933 | * other PDP Descriptors are ignored. |
934 | */ |
935 | ASSIGN_CTX_PML4(ppgtt, regs); |
936 | } else { |
937 | ASSIGN_CTX_PDP(ppgtt, regs, 3); |
938 | ASSIGN_CTX_PDP(ppgtt, regs, 2); |
939 | ASSIGN_CTX_PDP(ppgtt, regs, 1); |
940 | ASSIGN_CTX_PDP(ppgtt, regs, 0); |
941 | } |
942 | } |
943 | |
944 | static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) |
945 | { |
946 | if (i915_is_ggtt(vm)) |
947 | return i915_vm_to_ggtt(vm)->alias; |
948 | else |
949 | return i915_vm_to_ppgtt(vm); |
950 | } |
951 | |
952 | static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) |
953 | { |
954 | int x; |
955 | |
956 | x = lrc_ring_mi_mode(engine); |
957 | if (x != -1) { |
958 | regs[x + 1] &= ~STOP_RING; |
959 | regs[x + 1] |= STOP_RING << 16; |
960 | } |
961 | } |
962 | |
963 | static void __lrc_init_regs(u32 *regs, |
964 | const struct intel_context *ce, |
965 | const struct intel_engine_cs *engine, |
966 | bool inhibit) |
967 | { |
968 | /* |
969 | * A context is actually a big batch buffer with several |
970 | * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The |
971 | * values we are setting here are only for the first context restore: |
972 | * on a subsequent save, the GPU will recreate this batchbuffer with new |
973 | * values (including all the missing MI_LOAD_REGISTER_IMM commands that |
974 | * we are not initializing here). |
975 | * |
976 | * Must keep consistent with virtual_update_register_offsets(). |
977 | */ |
978 | |
979 | if (inhibit) |
980 | memset(regs, 0, PAGE_SIZE); |
981 | |
982 | set_offsets(regs, data: reg_offsets(engine), engine, close: inhibit); |
983 | |
984 | init_common_regs(regs, ce, engine, inhibit); |
985 | init_ppgtt_regs(regs, ppgtt: vm_alias(vm: ce->vm)); |
986 | |
987 | init_wa_bb_regs(regs, engine); |
988 | |
989 | __reset_stop_ring(regs, engine); |
990 | } |
991 | |
992 | void lrc_init_regs(const struct intel_context *ce, |
993 | const struct intel_engine_cs *engine, |
994 | bool inhibit) |
995 | { |
996 | __lrc_init_regs(regs: ce->lrc_reg_state, ce, engine, inhibit); |
997 | } |
998 | |
999 | void lrc_reset_regs(const struct intel_context *ce, |
1000 | const struct intel_engine_cs *engine) |
1001 | { |
1002 | __reset_stop_ring(regs: ce->lrc_reg_state, engine); |
1003 | } |
1004 | |
1005 | static void |
1006 | set_redzone(void *vaddr, const struct intel_engine_cs *engine) |
1007 | { |
1008 | if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) |
1009 | return; |
1010 | |
1011 | vaddr += engine->context_size; |
1012 | |
1013 | memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); |
1014 | } |
1015 | |
1016 | static void |
1017 | check_redzone(const void *vaddr, const struct intel_engine_cs *engine) |
1018 | { |
1019 | if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) |
1020 | return; |
1021 | |
1022 | vaddr += engine->context_size; |
1023 | |
1024 | if (memchr_inv(p: vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) |
1025 | drm_err_once(&engine->i915->drm, |
1026 | "%s context redzone overwritten!\n" , |
1027 | engine->name); |
1028 | } |
1029 | |
1030 | static u32 context_wa_bb_offset(const struct intel_context *ce) |
1031 | { |
1032 | return PAGE_SIZE * ce->wa_bb_page; |
1033 | } |
1034 | |
1035 | /* |
1036 | * per_ctx below determines which WABB section is used. |
1037 | * When true, the function returns the location of the |
1038 | * PER_CTX_BB. When false, the function returns the |
1039 | * location of the INDIRECT_CTX. |
1040 | */ |
1041 | static u32 *context_wabb(const struct intel_context *ce, bool per_ctx) |
1042 | { |
1043 | void *ptr; |
1044 | |
1045 | GEM_BUG_ON(!ce->wa_bb_page); |
1046 | |
1047 | ptr = ce->lrc_reg_state; |
1048 | ptr -= LRC_STATE_OFFSET; /* back to start of context image */ |
1049 | ptr += context_wa_bb_offset(ce); |
1050 | ptr += per_ctx ? PAGE_SIZE : 0; |
1051 | |
1052 | return ptr; |
1053 | } |
1054 | |
1055 | void lrc_init_state(struct intel_context *ce, |
1056 | struct intel_engine_cs *engine, |
1057 | void *state) |
1058 | { |
1059 | bool inhibit = true; |
1060 | |
1061 | set_redzone(vaddr: state, engine); |
1062 | |
1063 | if (engine->default_state) { |
1064 | shmem_read(file: engine->default_state, off: 0, |
1065 | dst: state, len: engine->context_size); |
1066 | __set_bit(CONTEXT_VALID_BIT, &ce->flags); |
1067 | inhibit = false; |
1068 | } |
1069 | |
1070 | /* Clear the ppHWSP (inc. per-context counters) */ |
1071 | memset(state, 0, PAGE_SIZE); |
1072 | |
1073 | /* Clear the indirect wa and storage */ |
1074 | if (ce->wa_bb_page) |
1075 | memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE); |
1076 | |
1077 | /* |
1078 | * The second page of the context object contains some registers which |
1079 | * must be set up prior to the first execution. |
1080 | */ |
1081 | __lrc_init_regs(regs: state + LRC_STATE_OFFSET, ce, engine, inhibit); |
1082 | } |
1083 | |
1084 | u32 lrc_indirect_bb(const struct intel_context *ce) |
1085 | { |
1086 | return i915_ggtt_offset(vma: ce->state) + context_wa_bb_offset(ce); |
1087 | } |
1088 | |
/*
 * Emit the predicate-disable workaround batch into @cs: clear the
 * predication scratch slot, terminate early if predication was active,
 * then disable predication and re-arm the slot for the next batch.
 * Returns the advanced command-stream pointer.
 */
static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
{
	/* If predication is active, this will be noop'ed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 0; /* No predication */

	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;

	/* Instructions are no longer predicated (disabled), we can proceed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 1; /* enable predication before the next BB */

	*cs++ = MI_BATCH_BUFFER_END;
	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);

	return cs;
}
1112 | |
1113 | static struct i915_vma * |
1114 | __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine) |
1115 | { |
1116 | struct drm_i915_gem_object *obj; |
1117 | struct i915_vma *vma; |
1118 | u32 context_size; |
1119 | |
1120 | context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); |
1121 | |
1122 | if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) |
1123 | context_size += I915_GTT_PAGE_SIZE; /* for redzone */ |
1124 | |
1125 | if (GRAPHICS_VER(engine->i915) >= 12) { |
1126 | ce->wa_bb_page = context_size / PAGE_SIZE; |
1127 | /* INDIRECT_CTX and PER_CTX_BB need separate pages. */ |
1128 | context_size += PAGE_SIZE * 2; |
1129 | } |
1130 | |
1131 | if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) { |
1132 | ce->parallel.guc.parent_page = context_size / PAGE_SIZE; |
1133 | context_size += PARENT_SCRATCH_SIZE; |
1134 | } |
1135 | |
1136 | obj = i915_gem_object_create_lmem(i915: engine->i915, size: context_size, |
1137 | I915_BO_ALLOC_PM_VOLATILE); |
1138 | if (IS_ERR(ptr: obj)) { |
1139 | obj = i915_gem_object_create_shmem(i915: engine->i915, size: context_size); |
1140 | if (IS_ERR(ptr: obj)) |
1141 | return ERR_CAST(ptr: obj); |
1142 | |
1143 | /* |
1144 | * Wa_22016122933: For Media version 13.0, all Media GT shared |
1145 | * memory needs to be mapped as WC on CPU side and UC (PAT |
1146 | * index 2) on GPU side. |
1147 | */ |
1148 | if (intel_gt_needs_wa_22016122933(gt: engine->gt)) |
1149 | i915_gem_object_set_cache_coherency(obj, cache_level: I915_CACHE_NONE); |
1150 | } |
1151 | |
1152 | vma = i915_vma_instance(obj, vm: &engine->gt->ggtt->vm, NULL); |
1153 | if (IS_ERR(ptr: vma)) { |
1154 | i915_gem_object_put(obj); |
1155 | return vma; |
1156 | } |
1157 | |
1158 | return vma; |
1159 | } |
1160 | |
1161 | static struct intel_timeline * |
1162 | pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine) |
1163 | { |
1164 | struct intel_timeline *tl = fetch_and_zero(&ce->timeline); |
1165 | |
1166 | return intel_timeline_create_from_engine(engine, page_unmask_bits(tl)); |
1167 | } |
1168 | |
1169 | int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine) |
1170 | { |
1171 | struct intel_ring *ring; |
1172 | struct i915_vma *vma; |
1173 | int err; |
1174 | |
1175 | GEM_BUG_ON(ce->state); |
1176 | |
1177 | vma = __lrc_alloc_state(ce, engine); |
1178 | if (IS_ERR(ptr: vma)) |
1179 | return PTR_ERR(ptr: vma); |
1180 | |
1181 | ring = intel_engine_create_ring(engine, size: ce->ring_size); |
1182 | if (IS_ERR(ptr: ring)) { |
1183 | err = PTR_ERR(ptr: ring); |
1184 | goto err_vma; |
1185 | } |
1186 | |
1187 | if (!page_mask_bits(ce->timeline)) { |
1188 | struct intel_timeline *tl; |
1189 | |
1190 | /* |
1191 | * Use the static global HWSP for the kernel context, and |
1192 | * a dynamically allocated cacheline for everyone else. |
1193 | */ |
1194 | if (unlikely(ce->timeline)) |
1195 | tl = pinned_timeline(ce, engine); |
1196 | else |
1197 | tl = intel_timeline_create(gt: engine->gt); |
1198 | if (IS_ERR(ptr: tl)) { |
1199 | err = PTR_ERR(ptr: tl); |
1200 | goto err_ring; |
1201 | } |
1202 | |
1203 | ce->timeline = tl; |
1204 | } |
1205 | |
1206 | ce->ring = ring; |
1207 | ce->state = vma; |
1208 | |
1209 | return 0; |
1210 | |
1211 | err_ring: |
1212 | intel_ring_put(ring); |
1213 | err_vma: |
1214 | i915_vma_put(vma); |
1215 | return err; |
1216 | } |
1217 | |
/*
 * lrc_reset - reinitialise the register state of a pinned context
 * @ce: the context to reset
 *
 * Rewinds the ring to the last emitted position, then rewrites the context
 * image registers from scratch (with state inhibit set) and refreshes the
 * cached LRCA descriptor.
 */
void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ring: ce->ring, tail: ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, engine: ce->engine, inhibit: true);
	ce->lrc.lrca = lrc_update_regs(ce, engine: ce->engine, head: ce->ring->tail);
}
1228 | |
1229 | int |
1230 | lrc_pre_pin(struct intel_context *ce, |
1231 | struct intel_engine_cs *engine, |
1232 | struct i915_gem_ww_ctx *ww, |
1233 | void **vaddr) |
1234 | { |
1235 | GEM_BUG_ON(!ce->state); |
1236 | GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); |
1237 | |
1238 | *vaddr = i915_gem_object_pin_map(obj: ce->state->obj, |
1239 | type: intel_gt_coherent_map_type(gt: ce->engine->gt, |
1240 | obj: ce->state->obj, |
1241 | always_coherent: false) | |
1242 | I915_MAP_OVERRIDE); |
1243 | |
1244 | return PTR_ERR_OR_ZERO(ptr: *vaddr); |
1245 | } |
1246 | |
/*
 * lrc_pin - finish pinning a context once its state object is mapped
 * @ce: context being pinned
 * @engine: engine the context is pinned for
 * @vaddr: CPU mapping of the context state object (from lrc_pre_pin)
 *
 * Records the register-state pointer, initialises the context image on the
 * very first pin (CONTEXT_INIT_BIT guards the one-time setup), and refreshes
 * the cached LRCA descriptor. Always returns 0.
 */
int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	/* One-time initialisation of the context image on first pin. */
	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, state: vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, head: ce->ring->tail);
	return 0;
}
1260 | |
/*
 * lrc_unpin - release per-pin resources of a context
 * @ce: the context being unpinned
 *
 * Drops the reference on the last parallel-submission request, if any, and
 * verifies the red zone preceding the register state has not been scribbled
 * over while the context was pinned.
 */
void lrc_unpin(struct intel_context *ce)
{
	if (unlikely(ce->parallel.last_rq)) {
		i915_request_put(rq: ce->parallel.last_rq);
		ce->parallel.last_rq = NULL;
	}
	check_redzone(vaddr: (void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      engine: ce->engine);
}
1270 | |
/* Undo the kernel mapping created by lrc_pre_pin(). */
void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(obj: ce->state->obj);
}
1275 | |
1276 | void lrc_fini(struct intel_context *ce) |
1277 | { |
1278 | if (!ce->state) |
1279 | return; |
1280 | |
1281 | intel_ring_put(fetch_and_zero(&ce->ring)); |
1282 | i915_vma_put(fetch_and_zero(&ce->state)); |
1283 | } |
1284 | |
/*
 * lrc_destroy - final release of a context, called when its refcount drops
 * @kref: embedded reference count of the intel_context
 *
 * Presumably invoked via kref_put() — TODO confirm against callers. The
 * context must be idle and unpinned at this point.
 */
void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}
1297 | |
/*
 * Emit the gen12 CTX_TIMESTAMP workaround: load the saved timestamp from the
 * context image into CS GPR0, then copy GPR0 back into the ring's
 * CTX_TIMESTAMP register. Returns the advanced command-stream pointer.
 */
static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	/* Load the CTX_TIMESTAMP dword from the context image into GPR0. */
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(vma: ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	/* Copy GPR0 into RING_CTX_TIMESTAMP. */
	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	/*
	 * The same register-to-register copy is emitted a second time,
	 * back to back. This appears intentional (part of the workaround);
	 * NOTE(review): confirm the double write against the bspec.
	 */
	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}
1323 | |
1324 | static u32 * |
1325 | gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) |
1326 | { |
1327 | GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); |
1328 | |
1329 | *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | |
1330 | MI_SRM_LRM_GLOBAL_GTT | |
1331 | MI_LRI_LRM_CS_MMIO; |
1332 | *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); |
1333 | *cs++ = i915_ggtt_offset(vma: ce->state) + LRC_STATE_OFFSET + |
1334 | (lrc_ring_gpr0(engine: ce->engine) + 1) * sizeof(u32); |
1335 | *cs++ = 0; |
1336 | |
1337 | return cs; |
1338 | } |
1339 | |
/*
 * Emit the gen12 RING_CMD_BUF_CCTL workaround: load the saved CMD_BUF_CCTL
 * value from the context image into CS GPR0, then copy GPR0 into the ring's
 * CMD_BUF_CCTL register. Returns the advanced command-stream pointer.
 */
static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	/* Load the saved CMD_BUF_CCTL dword from the context image into GPR0. */
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(vma: ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(engine: ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	/* Copy GPR0 into RING_CMD_BUF_CCTL. */
	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}
1361 | |
1362 | /* |
1363 | * The bspec's tuning guide asks us to program a vertical watermark value of |
1364 | * 0x3FF. However this register is not saved/restored properly by the |
1365 | * hardware, so we're required to apply the desired value via INDIRECT_CTX |
1366 | * batch buffer to ensure the value takes effect properly. All other bits |
1367 | * in this register should remain at 0 (the hardware default). |
1368 | */ |
1369 | static u32 * |
1370 | dg2_emit_draw_watermark_setting(u32 *cs) |
1371 | { |
1372 | *cs++ = MI_LOAD_REGISTER_IMM(1); |
1373 | *cs++ = i915_mmio_reg_offset(DRAW_WATERMARK); |
1374 | *cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF); |
1375 | |
1376 | return cs; |
1377 | } |
1378 | |
/*
 * Emit an LRI that sets the (masked) instruction-state-cache invalidate bit
 * in CS_DEBUG_MODE2. Used by the caller for Wa_18022495364. Returns the
 * advanced command-stream pointer.
 */
static u32 *
gen12_invalidate_state_cache(u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(GEN12_CS_DEBUG_MODE2);
	*cs++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
	return cs;
}
1387 | |
/*
 * Build the INDIRECT_CTX workaround batch for render-class engines:
 * timestamp and CMD_BUF_CCTL fixups, scratch restore, then a set of
 * platform-conditional workarounds. The emission order is deliberate;
 * do not reorder. Returns the advanced command-stream pointer.
 */
static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_cmd_buf_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_G11(ce->engine->i915))
		cs = gen8_emit_pipe_control(batch: cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, offset: 0);

	cs = gen12_emit_aux_table_inv(engine: ce->engine, cs);

	/* Wa_18022495364 */
	if (IS_GFX_GT_IP_RANGE(ce->engine->gt, IP_VER(12, 0), IP_VER(12, 10)))
		cs = gen12_invalidate_state_cache(cs);

	/* Wa_16014892111 */
	if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) ||
	    IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) ||
	    IS_DG2(ce->engine->i915))
		cs = dg2_emit_draw_watermark_setting(cs);

	return cs;
}
1413 | |
1414 | static u32 * |
1415 | gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) |
1416 | { |
1417 | cs = gen12_emit_timestamp_wa(ce, cs); |
1418 | cs = gen12_emit_restore_scratch(ce, cs); |
1419 | |
1420 | /* Wa_16013000631:dg2 */ |
1421 | if (IS_DG2_G11(ce->engine->i915)) |
1422 | if (ce->engine->class == COMPUTE_CLASS) |
1423 | cs = gen8_emit_pipe_control(batch: cs, |
1424 | PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, |
1425 | offset: 0); |
1426 | |
1427 | return gen12_emit_aux_table_inv(engine: ce->engine, cs); |
1428 | } |
1429 | |
static u32 *xehp_emit_fastcolor_blt_wabb(const struct intel_context *ce, u32 *cs)
{
	struct intel_gt *gt = ce->engine->gt;
	int mocs = gt->mocs.uc_index << 1;

	/*
	 * Wa_16018031267 / Wa_16018063123 requires that SW forces the
	 * main copy engine arbitration into round robin mode. We
	 * additionally need to submit the following WABB blt command
	 * to produce 4 subblits with each subblit generating 0 byte
	 * write requests as WABB:
	 *
	 * XY_FAST_COLOR_BLT
	 *  BG0    -> 5100000E
	 *  BG1    -> 0000003F (Dest pitch)
	 *  BG2    -> 00000000 (X1, Y1) = (0, 0)
	 *  BG3    -> 00040001 (X2, Y2) = (1, 4)
	 *  BG4    -> scratch
	 *  BG5    -> scratch
	 *  BG6-12 -> 00000000
	 *  BG13   -> 20004004 (Surf. Width= 2,Surf. Height = 5 )
	 *  BG14   -> 00000010 (Qpitch = 4)
	 *  BG15   -> 00000000
	 */
	*cs++ = XY_FAST_COLOR_BLT_CMD | (16 - 2);
	*cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) | 0x3f;
	*cs++ = 0;
	*cs++ = 4 << 16 | 1;
	*cs++ = lower_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
	*cs++ = upper_32_bits(i915_vma_offset(ce->vm->rsvd.vma));
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = 0x20004004;
	*cs++ = 0x10;
	*cs++ = 0;

	return cs;
}
1473 | |
1474 | static u32 * |
1475 | xehp_emit_per_ctx_bb(const struct intel_context *ce, u32 *cs) |
1476 | { |
1477 | /* Wa_16018031267, Wa_16018063123 */ |
1478 | if (NEEDS_FASTCOLOR_BLT_WABB(ce->engine)) |
1479 | cs = xehp_emit_fastcolor_blt_wabb(ce, cs); |
1480 | |
1481 | return cs; |
1482 | } |
1483 | |
1484 | static void |
1485 | setup_per_ctx_bb(const struct intel_context *ce, |
1486 | const struct intel_engine_cs *engine, |
1487 | u32 *(*emit)(const struct intel_context *, u32 *)) |
1488 | { |
1489 | /* Place PER_CTX_BB on next page after INDIRECT_CTX */ |
1490 | u32 * const start = context_wabb(ce, per_ctx: true); |
1491 | u32 *cs; |
1492 | |
1493 | cs = emit(ce, start); |
1494 | |
1495 | /* PER_CTX_BB must manually terminate */ |
1496 | *cs++ = MI_BATCH_BUFFER_END; |
1497 | |
1498 | GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); |
1499 | lrc_setup_bb_per_ctx(regs: ce->lrc_reg_state, engine, |
1500 | ctx_bb_ggtt_addr: lrc_indirect_bb(ce) + PAGE_SIZE); |
1501 | } |
1502 | |
/*
 * Write the INDIRECT_CTX workaround batch into the context's batch storage
 * and program the context registers with its location and size.
 *
 * The batch is padded with NOOPs to a cacheline boundary because the
 * hardware consumes its length in cachelines. The predicate-disable
 * workaround batch lives at a fixed offset (DG2_PREDICATE_RESULT_BB)
 * in the same page, so the emitted batch must not reach that far.
 */
static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{
	u32 * const start = context_wabb(ce, per_ctx: false);
	u32 *cs;

	cs = emit(ce, start);
	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	/* Pad to a cacheline; batch length is programmed in cachelines. */
	while ((unsigned long)cs % CACHELINE_BYTES)
		*cs++ = MI_NOOP;

	/* The emitted batch must not overlap the predicate-disable WA BB. */
	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
	setup_predicate_disable_wa(ce, cs: start + DG2_PREDICATE_RESULT_BB / sizeof(*start));

	lrc_setup_indirect_ctx(regs: ce->lrc_reg_state, engine,
			       ctx_bb_ggtt_addr: lrc_indirect_bb(ce),
			       size: (cs - start) * sizeof(*cs));
}
1523 | |
1524 | /* |
1525 | * The context descriptor encodes various attributes of a context, |
1526 | * including its GTT address and some flags. Because it's fairly |
1527 | * expensive to calculate, we'll just do it once and cache the result, |
1528 | * which remains valid until the context is unpinned. |
1529 | * |
1530 | * This is what a descriptor looks like, from LSB to MSB:: |
1531 | * |
1532 | * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) |
1533 | * bits 12-31: LRCA, GTT address of (the HWSP of) this context |
1534 | * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) |
1535 | * bits 53-54: mbz, reserved for use by hardware |
1536 | * bits 55-63: group ID, currently unused and set to 0 |
1537 | * |
1538 | * Starting from Gen11, the upper dword of the descriptor has a new format: |
1539 | * |
1540 | * bits 32-36: reserved |
1541 | * bits 37-47: SW context ID |
1542 | * bits 48:53: engine instance |
1543 | * bit 54: mbz, reserved for use by hardware |
1544 | * bits 55-60: SW counter |
1545 | * bits 61-63: engine class |
1546 | * |
1547 | * On Xe_HP, the upper dword of the descriptor has a new format: |
1548 | * |
1549 | * bits 32-37: virtual function number |
1550 | * bit 38: mbz, reserved for use by hardware |
1551 | * bits 39-54: SW context ID |
1552 | * bits 55-57: reserved |
1553 | * bits 58-63: SW counter |
1554 | * |
1555 | * engine info, SW context ID and SW counter need to form a unique number |
1556 | * (Context ID) per lrc. |
1557 | */ |
1558 | static u32 lrc_descriptor(const struct intel_context *ce) |
1559 | { |
1560 | u32 desc; |
1561 | |
1562 | desc = INTEL_LEGACY_32B_CONTEXT; |
1563 | if (i915_vm_is_4lvl(vm: ce->vm)) |
1564 | desc = INTEL_LEGACY_64B_CONTEXT; |
1565 | desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; |
1566 | |
1567 | desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; |
1568 | if (GRAPHICS_VER(ce->vm->i915) == 8) |
1569 | desc |= GEN8_CTX_L3LLC_COHERENT; |
1570 | |
1571 | return i915_ggtt_offset(vma: ce->state) | desc; |
1572 | } |
1573 | |
/*
 * lrc_update_regs - refresh the ring registers in the context image
 * @ce: the context whose image is updated
 * @engine: engine the context will run on
 * @head: ring head offset to program
 *
 * Rewrites RING_START/HEAD/TAIL/CTL in the register state, refreshes the
 * render power-clock state and OA state for render engines, and (re)builds
 * the per-context workaround batch buffers when the context carries one.
 *
 * Returns the context descriptor with CTX_DESC_FORCE_RESTORE set.
 */
u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	regs[CTX_RING_START] = i915_ggtt_offset(vma: ring->vma);
	regs[CTX_RING_HEAD] = head;
	regs[CTX_RING_TAIL] = ring->tail;
	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		regs[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(gt: engine->gt, req_sseu: &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*fn)(const struct intel_context *ce, u32 *cs);

		/* Render engines get the larger RCS workaround batch. */
		fn = gen12_emit_indirect_ctx_xcs;
		if (ce->engine->class == RENDER_CLASS)
			fn = gen12_emit_indirect_ctx_rcs;

		/* Mutually exclusive wrt to global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, emit: fn);
		setup_per_ctx_bb(ce, engine, emit: xehp_emit_per_ctx_bb);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}
1612 | |
/*
 * Rewrite the per-platform register offset commands in the context image
 * (see set_offsets()), without closing the command sequence.
 */
void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{
	set_offsets(regs: ce->lrc_reg_state, data: reg_offsets(engine), engine, close: false);
}
1618 | |
1619 | void lrc_check_regs(const struct intel_context *ce, |
1620 | const struct intel_engine_cs *engine, |
1621 | const char *when) |
1622 | { |
1623 | const struct intel_ring *ring = ce->ring; |
1624 | u32 *regs = ce->lrc_reg_state; |
1625 | bool valid = true; |
1626 | int x; |
1627 | |
1628 | if (regs[CTX_RING_START] != i915_ggtt_offset(vma: ring->vma)) { |
1629 | pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n" , |
1630 | engine->name, |
1631 | regs[CTX_RING_START], |
1632 | i915_ggtt_offset(ring->vma)); |
1633 | regs[CTX_RING_START] = i915_ggtt_offset(vma: ring->vma); |
1634 | valid = false; |
1635 | } |
1636 | |
1637 | if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != |
1638 | (RING_CTL_SIZE(ring->size) | RING_VALID)) { |
1639 | pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n" , |
1640 | engine->name, |
1641 | regs[CTX_RING_CTL], |
1642 | (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); |
1643 | regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; |
1644 | valid = false; |
1645 | } |
1646 | |
1647 | x = lrc_ring_mi_mode(engine); |
1648 | if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { |
1649 | pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n" , |
1650 | engine->name, regs[x + 1]); |
1651 | regs[x + 1] &= ~STOP_RING; |
1652 | regs[x + 1] |= STOP_RING << 16; |
1653 | valid = false; |
1654 | } |
1655 | |
1656 | WARN_ONCE(!valid, "Invalid lrc state found %s submission\n" , when); |
1657 | } |
1658 | |
1659 | /* |
1660 | * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after |
1661 | * PIPE_CONTROL instruction. This is required for the flush to happen correctly |
1662 | * but there is a slight complication as this is applied in WA batch where the |
1663 | * values are only initialized once so we cannot take register value at the |
1664 | * beginning and reuse it further; hence we save its value to memory, upload a |
1665 | * constant value with bit21 set and then we restore it back with the saved value. |
1666 | * To simplify the WA, a constant value is formed by using the default value |
1667 | * of this register. This shouldn't be a problem because we are only modifying |
1668 | * it for a short period and this batch in non-premptible. We can ofcourse |
1669 | * use additional instructions that read the actual value of the register |
1670 | * at that time and set our bit of interest but it makes the WA complicated. |
1671 | * |
1672 | * This WA is also required for Gen9 so extracting as a function avoids |
1673 | * code duplication. |
1674 | */ |
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	/* Save the current L3SQCREG4 value to the scratch page. */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(gt: engine->gt,
					   field: INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	/* Load a constant (default-derived) value with the flush bit set. */
	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       offset: 0);

	/* Restore the saved L3SQCREG4 value from the scratch page. */
	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(gt: engine->gt,
					   field: INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}
1702 | |
1703 | /* |
1704 | * Typically we only have one indirect_ctx and per_ctx batch buffer which are |
1705 | * initialized at the beginning and shared across all contexts but this field |
1706 | * helps us to have multiple batches at different offsets and select them based |
1707 | * on a criteria. At the moment this batch always start at the beginning of the page |
1708 | * and at this point we don't have multiple wa_ctx batch buffers. |
1709 | * |
1710 | * The number of WA applied are not known at the beginning; we use this field |
1711 | * to return the no of DWORDS written. |
1712 | * |
1713 | * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END |
1714 | * so it adds NOOPs as padding to make it cacheline aligned. |
1715 | * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together |
1716 | * makes a complete batch buffer. |
1717 | */ |
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	/* Re-enable arbitration before handing back to the context. */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}
1750 | |
/* A single register/value pair consumed by emit_lri(). */
struct lri {
	i915_reg_t reg;
	u32 value;
};
1755 | |
1756 | static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) |
1757 | { |
1758 | GEM_BUG_ON(!count || count > 63); |
1759 | |
1760 | *batch++ = MI_LOAD_REGISTER_IMM(count); |
1761 | do { |
1762 | *batch++ = i915_mmio_reg_offset(lri->reg); |
1763 | *batch++ = lri->value; |
1764 | } while (lri++, --count); |
1765 | *batch++ = MI_NOOP; |
1766 | |
1767 | return batch; |
1768 | } |
1769 | |
/*
 * Build the gen9 INDIRECT_CTX workaround batch: flush/LRI workarounds
 * bracketed by arbitration off/on, padded with NOOPs to a cacheline
 * (execution length is programmed in cachelines, so no BB_END needed).
 */
static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is setup along with golden context
		 * during context initialization. This value depends on
		 * device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled especially for 2x6
		 * devices, however it is safe to load default
		 * configuration of 3x6 device instead of masking off
		 * corresponding bits because HW ignores bits of a disabled
		 * subslice and drops down to appropriate config. Please
		 * see render_state_setup() in i915_gem_render_state.c for
		 * possible configurations, to avoid duplication they are
		 * not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}
1841 | |
1842 | #define CTX_WA_BB_SIZE (PAGE_SIZE) |
1843 | |
1844 | static int lrc_create_wa_ctx(struct intel_engine_cs *engine) |
1845 | { |
1846 | struct drm_i915_gem_object *obj; |
1847 | struct i915_vma *vma; |
1848 | int err; |
1849 | |
1850 | obj = i915_gem_object_create_shmem(i915: engine->i915, CTX_WA_BB_SIZE); |
1851 | if (IS_ERR(ptr: obj)) |
1852 | return PTR_ERR(ptr: obj); |
1853 | |
1854 | vma = i915_vma_instance(obj, vm: &engine->gt->ggtt->vm, NULL); |
1855 | if (IS_ERR(ptr: vma)) { |
1856 | err = PTR_ERR(ptr: vma); |
1857 | goto err; |
1858 | } |
1859 | |
1860 | engine->wa_ctx.vma = vma; |
1861 | return 0; |
1862 | |
1863 | err: |
1864 | i915_gem_object_put(obj); |
1865 | return err; |
1866 | } |
1867 | |
/* Unpin and release the engine's workaround context vma, if any. */
void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(p_vma: &engine->wa_ctx.vma, flags: 0);
}
1872 | |
1873 | typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); |
1874 | |
/*
 * lrc_init_wa_ctx - build the per-engine workaround context batch buffers
 * @engine: engine to set up (render-class register state only)
 *
 * Only used on graphics versions 8 and 9; newer platforms return early.
 * Allocates the backing object, pins it with a ww retry loop, emits the
 * indirect-ctx and per-ctx batches, and records their offsets and sizes.
 * Failure is non-fatal: the wa_ctx is simply cleared and the GPU runs
 * without the workaround batches.
 *
 * NOTE(review): wa_bb_fn[] is only assigned for GRAPHICS_VER 8 and 9; any
 * other version below 11 reaching this point would use it uninitialized.
 * Presumably unreachable with currently supported hardware — confirm.
 */
void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (GRAPHICS_VER(engine->i915) >= 11 ||
	    !(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
		return;

	if (GRAPHICS_VER(engine->i915) == 9) {
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
	} else if (GRAPHICS_VER(engine->i915) == 8) {
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize WA batch
		 * because we only expect rare glitches but nothing
		 * critical to prevent us from using GPU
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n" ,
			err);
		return;
	}

	if (!engine->wa_ctx.vma)
		return;

	/* Pin the wa_ctx object, retrying on ww contention. */
	i915_gem_ww_ctx_init(ctx: &ww, intr: true);
retry:
	err = i915_gem_object_lock(obj: wa_ctx->vma->obj, ww: &ww);
	if (!err)
		err = i915_ggtt_pin(vma: wa_ctx->vma, ww: &ww, align: 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(obj: wa_ctx->vma->obj, type: I915_MAP_WB);
	if (IS_ERR(ptr: batch)) {
		err = PTR_ERR(ptr: batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(obj: wa_ctx->vma->obj, offset: 0, size: batch_ptr - batch);
	__i915_gem_object_release_map(obj: wa_ctx->vma->obj);

	/* Verify that we can handle failure to setup the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(vma: wa_ctx->vma);
err:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(ctx: &ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(ctx: &ww);

	if (err) {
		i915_vma_put(vma: engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}
1973 | |
/*
 * Record a runtime-counter underflow in the selftest statistics; compiles
 * to an empty stub when selftests are disabled.
 */
static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	stats->runtime.num_underflow++;
	stats->runtime.max_underflow =
		max_t(u32, stats->runtime.max_underflow, -dt);
#endif
}
1982 | |
/* Read the current CTX_TIMESTAMP dword from the context image. */
static u32 lrc_get_runtime(const struct intel_context *ce)
{
	/*
	 * We can use either ppHWSP[16] which is recorded before the context
	 * switch (and so excludes the cost of context switches) or use the
	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
	 *
	 * READ_ONCE because the image may be updated concurrently by the HW.
	 */
	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
}
1993 | |
1994 | void lrc_update_runtime(struct intel_context *ce) |
1995 | { |
1996 | struct intel_context_stats *stats = &ce->stats; |
1997 | u32 old; |
1998 | s32 dt; |
1999 | |
2000 | old = stats->runtime.last; |
2001 | stats->runtime.last = lrc_get_runtime(ce); |
2002 | dt = stats->runtime.last - old; |
2003 | if (!dt) |
2004 | return; |
2005 | |
2006 | if (unlikely(dt < 0)) { |
2007 | CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n" , |
2008 | old, stats->runtime.last, dt); |
2009 | st_runtime_underflow(stats, dt); |
2010 | return; |
2011 | } |
2012 | |
2013 | ewma_runtime_add(e: &stats->runtime.avg, val: dt); |
2014 | stats->runtime.total += dt; |
2015 | } |
2016 | |
2017 | #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) |
2018 | #include "selftest_lrc.c" |
2019 | #endif |
2020 | |