// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

#include "gem/i915_gem_lmem.h"

#include "gen8_engine_cs.h"
#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_reg.h"
#include "intel_context.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_regs.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_ring.h"
#include "shmem_utils.h"

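/*
 * set_offsets() expands the compact per-engine offset tables below into the
 * MI_LOAD_REGISTER_IMM layout expected in the context image. The tables are
 * byte streams built from the helper macros that follow:
 *
 *   NOP(x)            - skip x dwords of the image (bit 7 set in the byte)
 *   LRI(count, flags) - emit an MI_LOAD_REGISTER_IMM(count) header; POSTED
 *                       adds MI_LRI_FORCE_POSTED
 *   REG(x)            - a register offset below 0x200, encoded in one byte
 *   REG16(x)          - a larger offset, encoded in two 7-bit chunks with a
 *                       continuation bit
 *   END               - terminates the table
 *
 * Offsets are relative to engine->mmio_base; only the register offsets are
 * written here, the value slots are left for the default state / GPU.
 */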
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0
{
	const u32 base = engine->mmio_base;

	while (*data) {
		u8 count, flags;

		if (*data & BIT(7)) { /* skip */
			count = *data++ & ~BIT(7);
			regs += count;
			continue;
		}

		count = *data & 0x3f;
		flags = *data >> 6;
		data++;

		*regs = MI_LOAD_REGISTER_IMM(count);
		if (flags & POSTED)
			*regs |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= MI_LRI_LRM_CS_MMIO;
		regs++;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 v;

			do {
				v = *data++;
				offset <<= 7;
				offset |= v & ~BIT(7);
			} while (v & BIT(7));

			regs[0] = base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (close) {
		/* Close the batch; used mainly by live_lrc_layout() */
		*regs = MI_BATCH_BUFFER_END;
		if (GRAPHICS_VER(engine->i915) >= 11)
			*regs |= BIT(0);
	}
}

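/*
 * For example, the start of gen8_xcs_offsets below - NOP(1), LRI(11, 0),
 * REG16(0x244), REG(0x034), ... - decodes to: skip one dword, write an
 * MI_LOAD_REGISTER_IMM(11) header, then fill in mmio_base + 0x244 (the
 * engine's RING_CONTEXT_CONTROL register) and mmio_base + 0x34 (RING_HEAD)
 * as the first two register offsets of that LRI block.
 */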
static const u8 gen8_xcs_offsets[] = {
	NOP(1),
	LRI(11, 0),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),

	NOP(9),
	LRI(9, 0),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(2, 0),
	REG16(0x200),
	REG(0x028),

	END
};

static const u8 gen9_xcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, POSTED),
	REG16(0x200),

	NOP(13),
	LRI(44, POSTED),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),

	END
};

static const u8 gen12_xcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 dg2_xcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	END
};

static const u8 gen8_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen9_rcs_offsets[] = {
	NOP(1),
	LRI(14, POSTED),
	REG16(0x244),
	REG(0x34),
	REG(0x30),
	REG(0x38),
	REG(0x3c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),

	NOP(3),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	NOP(13),
	LRI(1, 0),
	REG(0xc8),

	NOP(13),
	LRI(44, POSTED),
	REG(0x28),
	REG(0x9c),
	REG(0xc0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x68),

	END
};

static const u8 gen11_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x11c),
	REG(0x114),
	REG(0x118),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(1, POSTED),
	REG(0x1b0),

	NOP(10),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 gen12_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),
	NOP(3 + 9 + 1),

	LRI(51, POSTED),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG16(0x588),
	REG(0x028),
	REG(0x09c),
	REG(0x0c0),
	REG(0x178),
	REG(0x17c),
	REG16(0x358),
	REG(0x170),
	REG(0x150),
	REG(0x154),
	REG(0x158),
	REG16(0x41c),
	REG16(0x600),
	REG16(0x604),
	REG16(0x608),
	REG16(0x60c),
	REG16(0x610),
	REG16(0x614),
	REG16(0x618),
	REG16(0x61c),
	REG16(0x620),
	REG16(0x624),
	REG16(0x628),
	REG16(0x62c),
	REG16(0x630),
	REG16(0x634),
	REG16(0x638),
	REG16(0x63c),
	REG16(0x640),
	REG16(0x644),
	REG16(0x648),
	REG16(0x64c),
	REG16(0x650),
	REG16(0x654),
	REG16(0x658),
	REG16(0x65c),
	REG16(0x660),
	REG16(0x664),
	REG16(0x668),
	REG16(0x66c),
	REG16(0x670),
	REG16(0x674),
	REG16(0x678),
	REG16(0x67c),
	REG(0x068),
	REG(0x084),
	NOP(1),

	END
};

static const u8 xehp_rcs_offsets[] = {
	NOP(1),
	LRI(13, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),

	NOP(5),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

static const u8 dg2_rcs_offsets[] = {
	NOP(1),
	LRI(15, POSTED),
	REG16(0x244),
	REG(0x034),
	REG(0x030),
	REG(0x038),
	REG(0x03c),
	REG(0x168),
	REG(0x140),
	REG(0x110),
	REG(0x1c0),
	REG(0x1c4),
	REG(0x1c8),
	REG(0x180),
	REG16(0x2b4),
	REG(0x120),
	REG(0x124),

	NOP(1),
	LRI(9, POSTED),
	REG16(0x3a8),
	REG16(0x28c),
	REG16(0x288),
	REG16(0x284),
	REG16(0x280),
	REG16(0x27c),
	REG16(0x278),
	REG16(0x274),
	REG16(0x270),

	LRI(3, POSTED),
	REG(0x1b0),
	REG16(0x5a8),
	REG16(0x5ac),

	NOP(6),
	LRI(1, 0),
	REG(0x0c8),

	END
};

#undef END
#undef REG16
#undef REG
#undef LRI
#undef NOP

static const u8 *reg_offsets(const struct intel_engine_cs *engine)
{
	/*
	 * The gen12+ lists only have the registers we program in the basic
	 * default state. We rely on the context image using relative
	 * addressing to automatically fix up the register state between the
	 * physical engines for a virtual engine.
	 */
	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
		   !intel_engine_has_relative_mmio(engine));

	if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_rcs_offsets;
		else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
			return xehp_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 11)
			return gen11_rcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_rcs_offsets;
		else
			return gen8_rcs_offsets;
	} else {
		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
			return dg2_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 12)
			return gen12_xcs_offsets;
		else if (GRAPHICS_VER(engine->i915) >= 9)
			return gen9_xcs_offsets;
		else
			return gen8_xcs_offsets;
	}
}

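/*
 * The lrc_ring_*() helpers below return the dword index of a register's LRI
 * slot within the context image (so regs[x] holds the register offset and
 * regs[x + 1] holds its value), or -1 if the register is not present in the
 * image for this engine/gen.
 */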
static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x70;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x60;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		return 0x84;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x74;
	else if (GRAPHICS_VER(engine->i915) >= 9)
		return 0x68;
	else if (engine->class == RENDER_CLASS)
		return 0xd8;
	else
		return -1;
}

static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{
	if (GRAPHICS_VER(engine->i915) >= 12)
		return 0x12;
	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
		return 0x18;
	else
		return -1;
}

static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_wa_bb_per_ctx(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_indirect_ptr(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{

	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
		/*
		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
		 * simply to match the RCS context image layout.
		 */
		return 0xc6;
	else if (engine->class != RENDER_CLASS)
		return -1;
	else if (GRAPHICS_VER(engine->i915) >= 12)
		return 0xb6;
	else if (GRAPHICS_VER(engine->i915) >= 11)
		return 0xaa;
	else
		return -1;
}

static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	switch (GRAPHICS_VER(engine->i915)) {
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		fallthrough;
	case 12:
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 11:
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 9:
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 8:
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	}
}

static void
lrc_setup_indirect_ctx(u32 *regs,
		       const struct intel_engine_cs *engine,
		       u32 ctx_bb_ggtt_addr,
		       u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}

static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl;

	ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
	if (GRAPHICS_VER(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);
	regs[CTX_CONTEXT_CONTROL] = ctl;

	regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
}

static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;

	if (wa_ctx->per_ctx.size) {
		const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
	}

	if (wa_ctx->indirect_ctx.size) {
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa_ctx->vma) +
				       wa_ctx->indirect_ctx.offset,
				       wa_ctx->indirect_ctx.size);
	}
}

static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (i915_vm_is_4lvl(&ppgtt->vm)) {
		/* 64b PPGTT (48bit canonical)
		 * PDP0_DESCRIPTOR contains the base address to PML4 and
		 * other PDP Descriptors are ignored.
		 */
		ASSIGN_CTX_PML4(ppgtt, regs);
	} else {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
	}
}

static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
{
	if (i915_is_ggtt(vm))
		return i915_vm_to_ggtt(vm)->alias;
	else
		return i915_vm_to_ppgtt(vm);
}

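/*
 * RING_MI_MODE is a masked register: the high 16 bits select which of the
 * low 16 bits are written. Clearing a stale STOP_RING left in the saved
 * image therefore needs the value bit cleared and the corresponding mask
 * bit set, which is what __reset_stop_ring() writes into the image.
 */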
static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_mi_mode(engine);
	if (x != -1) {
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
	}
}

static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */

	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	set_offsets(regs, reg_offsets(engine), engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}

void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	__lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
}

void lrc_reset_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine)
{
	__reset_stop_ring(ce->lrc_reg_state, engine);
}

static void
set_redzone(void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
}

static void
check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
{
	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		return;

	vaddr += engine->context_size;

	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
		drm_err_once(&engine->i915->drm,
			     "%s context redzone overwritten!\n",
			     engine->name);
}

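/*
 * The per-context indirect workaround batch lives ce->wa_bb_page pages into
 * the context object. context_indirect_bb() rewinds from the mapped register
 * state (which sits LRC_STATE_OFFSET into the image) back to the start of
 * the object and then steps forward to that page.
 */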
static u32 context_wa_bb_offset(const struct intel_context *ce)
{
	return PAGE_SIZE * ce->wa_bb_page;
}

static u32 *context_indirect_bb(const struct intel_context *ce)
{
	void *ptr;

	GEM_BUG_ON(!ce->wa_bb_page);

	ptr = ce->lrc_reg_state;
	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
	ptr += context_wa_bb_offset(ce);

	return ptr;
}

void lrc_init_state(struct intel_context *ce,
		    struct intel_engine_cs *engine,
		    void *state)
{
	bool inhibit = true;

	set_redzone(state, engine);

	if (engine->default_state) {
		shmem_read(engine->default_state, 0,
			   state, engine->context_size);
		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
		inhibit = false;
	}

	/* Clear the ppHWSP (inc. per-context counters) */
	memset(state, 0, PAGE_SIZE);

	/* Clear the indirect wa and storage */
	if (ce->wa_bb_page)
		memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);

	/*
	 * The second page of the context object contains some registers which
	 * must be set up prior to the first execution.
	 */
	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}

u32 lrc_indirect_bb(const struct intel_context *ce)
{
	return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
}

static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
{
	/* If predication is active, this will be noop'ed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 0; /* No predication */

	/* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
	*cs++ = MI_BATCH_BUFFER_END | BIT(15);
	*cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;

	/* Instructions are no longer predicated (disabled), we can proceed */
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
	*cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
	*cs++ = 0;
	*cs++ = 1; /* enable predication before the next BB */

	*cs++ = MI_BATCH_BUFFER_END;
	GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);

	return cs;
}

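/*
 * The backing object allocated below holds, in order: the context image
 * itself, an optional debug redzone page, an optional per-context indirect
 * workaround batch page (gen12), and, for a GuC parent context, the parallel
 * submission scratch area.
 */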
static struct i915_vma *
__lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	u32 context_size;

	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		context_size += I915_GTT_PAGE_SIZE; /* for redzone */

	if (GRAPHICS_VER(engine->i915) == 12) {
		ce->wa_bb_page = context_size / PAGE_SIZE;
		context_size += PAGE_SIZE;
	}

	if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
		ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
		context_size += PARENT_SCRATCH_SIZE;
	}

	obj = i915_gem_object_create_lmem(engine->i915, context_size,
					  I915_BO_ALLOC_PM_VOLATILE);
	if (IS_ERR(obj))
		obj = i915_gem_object_create_shmem(engine->i915, context_size);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return vma;
	}

	return vma;
}

static struct intel_timeline *
pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);

	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
}

int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{
	struct intel_ring *ring;
	struct i915_vma *vma;
	int err;

	GEM_BUG_ON(ce->state);

	vma = __lrc_alloc_state(ce, engine);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	ring = intel_engine_create_ring(engine, ce->ring_size);
	if (IS_ERR(ring)) {
		err = PTR_ERR(ring);
		goto err_vma;
	}

	if (!page_mask_bits(ce->timeline)) {
		struct intel_timeline *tl;

		/*
		 * Use the static global HWSP for the kernel context, and
		 * a dynamically allocated cacheline for everyone else.
		 */
		if (unlikely(ce->timeline))
			tl = pinned_timeline(ce, engine);
		else
			tl = intel_timeline_create(engine->gt);
		if (IS_ERR(tl)) {
			err = PTR_ERR(tl);
			goto err_ring;
		}

		ce->timeline = tl;
	}

	ce->ring = ring;
	ce->state = vma;

	return 0;

err_ring:
	intel_ring_put(ring);
err_vma:
	i915_vma_put(vma);
	return err;
}

void lrc_reset(struct intel_context *ce)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	intel_ring_reset(ce->ring, ce->ring->emit);

	/* Scrub away the garbage */
	lrc_init_regs(ce, ce->engine, true);
	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
}

int
lrc_pre_pin(struct intel_context *ce,
	    struct intel_engine_cs *engine,
	    struct i915_gem_ww_ctx *ww,
	    void **vaddr)
{
	GEM_BUG_ON(!ce->state);
	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));

	*vaddr = i915_gem_object_pin_map(ce->state->obj,
					 i915_coherent_map_type(ce->engine->i915,
								ce->state->obj,
								false) |
					 I915_MAP_OVERRIDE);

	return PTR_ERR_OR_ZERO(*vaddr);
}

int
lrc_pin(struct intel_context *ce,
	struct intel_engine_cs *engine,
	void *vaddr)
{
	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;

	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
		lrc_init_state(ce, engine, vaddr);

	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
	return 0;
}

void lrc_unpin(struct intel_context *ce)
{
	if (unlikely(ce->parallel.last_rq)) {
		i915_request_put(ce->parallel.last_rq);
		ce->parallel.last_rq = NULL;
	}
	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
		      ce->engine);
}

void lrc_post_unpin(struct intel_context *ce)
{
	i915_gem_object_unpin_map(ce->state->obj);
}

void lrc_fini(struct intel_context *ce)
{
	if (!ce->state)
		return;

	intel_ring_put(fetch_and_zero(&ce->ring));
	i915_vma_put(fetch_and_zero(&ce->state));
}

void lrc_destroy(struct kref *kref)
{
	struct intel_context *ce = container_of(kref, typeof(*ce), ref);

	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
	GEM_BUG_ON(intel_context_is_pinned(ce));

	lrc_fini(ce);

	intel_context_fini(ce);
	intel_context_free(ce);
}

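/*
 * gen12_emit_timestamp_wa() reloads the CTX_TIMESTAMP value saved in the
 * context image into CS GPR0 and then copies GPR0 back into
 * RING_CTX_TIMESTAMP (the register-to-register copy is issued twice), so the
 * context's saved timestamp is propagated into the live register on restore.
 */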
static u32 *
gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		CTX_TIMESTAMP * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));

	return cs;
}

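/*
 * gen12_emit_restore_scratch() reloads GPR0 from its saved slot in the
 * context image, undoing the scratch use of GPR0 by the workarounds above so
 * the value the context expects is back in place.
 */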
static u32 *
gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	return cs;
}

static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
{
	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
		MI_SRM_LRM_GLOBAL_GTT |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_REG |
		MI_LRR_SOURCE_CS_MMIO |
		MI_LRI_LRM_CS_MMIO;
	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));

	return cs;
}

/*
 * On DG2, during context restore of a preempted context in GPGPU mode, an
 * RCS restore hang has been observed. The failure is extremely timing
 * dependent. To address it, the SW workaround batch below is applied on
 * DG2 A steppings.
 */
static u32 *
dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
{
	*cs++ = MI_LOAD_REGISTER_IMM(1);
	*cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG);
	*cs++ = 0x21;

	*cs++ = MI_LOAD_REGISTER_REG;
	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1);

	*cs++ = MI_LOAD_REGISTER_REG;
	*cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
	*cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2);

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_cmd_buf_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_22011450934:dg2 */
	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) ||
	    IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0))
		cs = dg2_emit_rcs_hang_wabb(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
	    IS_DG2_G11(ce->engine->i915))
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);

	/* hsdes: 1809175790 */
	if (!HAS_FLAT_CCS(ce->engine->i915))
		cs = gen12_emit_aux_table_inv(cs, GEN12_GFX_CCS_AUX_NV);

	return cs;
}

static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
{
	cs = gen12_emit_timestamp_wa(ce, cs);
	cs = gen12_emit_restore_scratch(ce, cs);

	/* Wa_16013000631:dg2 */
	if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
	    IS_DG2_G11(ce->engine->i915))
		if (ce->engine->class == COMPUTE_CLASS)
			cs = gen8_emit_pipe_control(cs,
						    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
						    0);

	/* hsdes: 1809175790 */
	if (!HAS_FLAT_CCS(ce->engine->i915)) {
		if (ce->engine->class == VIDEO_DECODE_CLASS)
			cs = gen12_emit_aux_table_inv(cs, GEN12_VD0_AUX_NV);
		else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS)
			cs = gen12_emit_aux_table_inv(cs, GEN12_VE0_AUX_NV);
	}

	return cs;
}

static void
setup_indirect_ctx_bb(const struct intel_context *ce,
		      const struct intel_engine_cs *engine,
		      u32 *(*emit)(const struct intel_context *, u32 *))
{
	u32 * const start = context_indirect_bb(ce);
	u32 *cs;

	cs = emit(ce, start);
	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
	while ((unsigned long)cs % CACHELINE_BYTES)
		*cs++ = MI_NOOP;

	GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
	setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));

	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
			       lrc_indirect_bb(ce),
			       (cs - start) * sizeof(*cs));
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit 54:        mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * On Xe_HP, the upper dword of the descriptor has a new format:
 *
 *      bits 32-37:    virtual function number
 *      bit 38:        mbz, reserved for use by hardware
 *      bits 39-54:    SW context ID
 *      bits 55-57:    reserved
 *      bits 58-63:    SW counter
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static u32 lrc_descriptor(const struct intel_context *ce)
{
	u32 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (GRAPHICS_VER(ce->vm->i915) == 8)
		desc |= GEN8_CTX_L3LLC_COHERENT;

	return i915_ggtt_offset(ce->state) | desc;
}

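/*
 * lrc_update_regs() refreshes the ring state (start, head, tail, control)
 * and, for render-class engines, the power-clock state in the context image,
 * (re)builds the per-context indirect workaround batch when one is used, and
 * returns the context descriptor with CTX_DESC_FORCE_RESTORE set.
 */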
u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
	regs[CTX_RING_HEAD] = head;
	regs[CTX_RING_TAIL] = ring->tail;
	regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		regs[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(engine->gt, &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*fn)(const struct intel_context *ce, u32 *cs);

		fn = gen12_emit_indirect_ctx_xcs;
		if (ce->engine->class == RENDER_CLASS)
			fn = gen12_emit_indirect_ctx_rcs;

		/* Mutually exclusive wrt the global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, fn);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}

void lrc_update_offsets(struct intel_context *ce,
			struct intel_engine_cs *engine)
{
	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
}

void lrc_check_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    const char *when)
{
	const struct intel_ring *ring = ce->ring;
	u32 *regs = ce->lrc_reg_state;
	bool valid = true;
	int x;

	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_START],
		       i915_ggtt_offset(ring->vma));
		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
		valid = false;
	}

	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
		       engine->name,
		       regs[CTX_RING_CTL],
		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
		valid = false;
	}

	x = lrc_ring_mi_mode(engine);
	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
		       engine->name, regs[x + 1]);
		regs[x + 1] &= ~STOP_RING;
		regs[x + 1] |= STOP_RING << 16;
		valid = false;
	}

	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
}

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
 * PIPE_CONTROL instruction. This is required for the flush to happen
 * correctly, but there is a slight complication as this is applied in a WA
 * batch where the values are only initialized once, so we cannot take the
 * register value at the beginning and reuse it further; hence we save its
 * value to memory, upload a constant value with bit 21 set and then restore
 * it back with the saved value. To simplify the WA, the constant is formed
 * from the default value of this register. This shouldn't be a problem
 * because we are only modifying it for a short period and this batch is
 * non-preemptible. We could of course use additional instructions that read
 * the actual value of the register at that time and set our bit of interest,
 * but that makes the WA more complicated.
 *
 * This WA is also required for Gen9, so extracting it as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256! */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and select them based
 * on some criteria. At the moment this batch always starts at the beginning of
 * the page and we don't have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * Note that this batch does not contain MI_BATCH_BUFFER_END, so it adds NOOPs
 * as padding to make it cacheline aligned. MI_BATCH_BUFFER_END is added to the
 * per-ctx batch and the two together make a complete batch buffer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in the indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}

struct lri {
	i915_reg_t reg;
	u32 value;
};

static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
			__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
				       FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			_3D_CHICKEN3,
			__MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
				       _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};

	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	batch = emit_lri(batch, lri, ARRAY_SIZE(lri));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * EU pool configuration is set up along with the golden
		 * context during context initialization. This value depends
		 * on device type (2x6 or 3x6) and needs to be updated based
		 * on which subslice is disabled, especially for 2x6 devices.
		 * However, it is safe to load the default configuration of a
		 * 3x6 device instead of masking off the corresponding bits
		 * because HW ignores bits of a disabled subslice and drops
		 * down to the appropriate config. Please see
		 * render_state_setup() in i915_gem_render_state.c for
		 * possible configurations; to avoid duplication they are
		 * not shown here again.
		 */
		*batch++ = GEN9_MEDIA_POOL_STATE;
		*batch++ = GEN9_MEDIA_POOL_ENABLE;
		*batch++ = 0x00777000;
		*batch++ = 0;
		*batch++ = 0;
		*batch++ = 0;
	}

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	return batch;
}

#define CTX_WA_BB_SIZE (PAGE_SIZE)

static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;
	int err;

	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
	if (IS_ERR(vma)) {
		err = PTR_ERR(vma);
		goto err;
	}

	engine->wa_ctx.vma = vma;
	return 0;

err:
	i915_gem_object_put(obj);
	return err;
}

void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
{
	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
}

typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);

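/*
 * lrc_init_wa_ctx() allocates and pins the shared workaround context object,
 * emits the gen8/gen9 indirect-context batch into it (gen11+ need no batch
 * here), and records each batch's offset and size in engine->wa_ctx. Failure
 * is not fatal: the wa_ctx is simply cleared and the engine runs without it.
 */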
void lrc_init_wa_ctx(struct intel_engine_cs *engine)
{
	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
	struct i915_wa_ctx_bb *wa_bb[] = {
		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
	};
	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
	struct i915_gem_ww_ctx ww;
	void *batch, *batch_ptr;
	unsigned int i;
	int err;

	if (!(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
		return;

	switch (GRAPHICS_VER(engine->i915)) {
	case 12:
	case 11:
		return;
	case 9:
		wa_bb_fn[0] = gen9_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	case 8:
		wa_bb_fn[0] = gen8_init_indirectctx_bb;
		wa_bb_fn[1] = NULL;
		break;
	default:
		MISSING_CASE(GRAPHICS_VER(engine->i915));
		return;
	}

	err = lrc_create_wa_ctx(engine);
	if (err) {
		/*
		 * We continue even if we fail to initialize the WA batch
		 * because we only expect rare glitches and nothing critical
		 * enough to prevent us from using the GPU.
		 */
		drm_err(&engine->i915->drm,
			"Ignoring context switch w/a allocation error:%d\n",
			err);
		return;
	}

	if (!engine->wa_ctx.vma)
		return;

	i915_gem_ww_ctx_init(&ww, true);
retry:
	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
	if (!err)
		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
	if (err)
		goto err;

	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
	if (IS_ERR(batch)) {
		err = PTR_ERR(batch);
		goto err_unpin;
	}

	/*
	 * Emit the two workaround batch buffers, recording the offset from the
	 * start of the workaround batch buffer object for each and their
	 * respective sizes.
	 */
	batch_ptr = batch;
	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
		wa_bb[i]->offset = batch_ptr - batch;
		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
						  CACHELINE_BYTES))) {
			err = -EINVAL;
			break;
		}
		if (wa_bb_fn[i])
			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
	}
	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);

	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
	__i915_gem_object_release_map(wa_ctx->vma->obj);

	/* Verify that we can handle failure to setup the wa_ctx */
	if (!err)
		err = i915_inject_probe_error(engine->i915, -ENODEV);

err_unpin:
	if (err)
		i915_vma_unpin(wa_ctx->vma);
err:
	if (err == -EDEADLK) {
		err = i915_gem_ww_ctx_backoff(&ww);
		if (!err)
			goto retry;
	}
	i915_gem_ww_ctx_fini(&ww);

	if (err) {
		i915_vma_put(engine->wa_ctx.vma);

		/* Clear all flags to prevent further use */
		memset(wa_ctx, 0, sizeof(*wa_ctx));
	}
}

static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
{
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
	stats->runtime.num_underflow++;
	stats->runtime.max_underflow =
		max_t(u32, stats->runtime.max_underflow, -dt);
#endif
}

static u32 lrc_get_runtime(const struct intel_context *ce)
{
	/*
	 * We can use either ppHWSP[16] which is recorded before the context
	 * switch (and so excludes the cost of context switches) or use the
	 * value from the context image itself, which is saved/restored earlier
	 * and so includes the cost of the save.
	 */
	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
}

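/*
 * lrc_update_runtime() accumulates the difference between the current and the
 * previously sampled CTX_TIMESTAMP. The counter is an unsigned 32-bit value,
 * so the delta is taken modulo 2^32; a negative signed delta is treated as an
 * underflow and only recorded by the selftests rather than added to the total.
 */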
void lrc_update_runtime(struct intel_context *ce)
{
	struct intel_context_stats *stats = &ce->stats;
	u32 old;
	s32 dt;

	old = stats->runtime.last;
	stats->runtime.last = lrc_get_runtime(ce);
	dt = stats->runtime.last - old;
	if (!dt)
		return;

	if (unlikely(dt < 0)) {
		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
			 old, stats->runtime.last, dt);
		st_runtime_underflow(stats, dt);
		return;
	}

	ewma_runtime_add(&stats->runtime.avg, dt);
	stats->runtime.total += dt;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif