1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2014 Intel Corporation
4 */
5
6 #include "gem/i915_gem_lmem.h"
7
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "i915_reg.h"
12 #include "intel_context.h"
13 #include "intel_engine.h"
14 #include "intel_engine_regs.h"
15 #include "intel_gpu_commands.h"
16 #include "intel_gt.h"
17 #include "intel_gt_regs.h"
18 #include "intel_lrc.h"
19 #include "intel_lrc_reg.h"
20 #include "intel_ring.h"
21 #include "shmem_utils.h"
22
23 static void set_offsets(u32 *regs,
24 const u8 *data,
25 const struct intel_engine_cs *engine,
26 bool close)
27 #define NOP(x) (BIT(7) | (x))
28 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
29 #define POSTED BIT(0)
30 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
31 #define REG16(x) \
32 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
33 (((x) >> 2) & 0x7f)
34 #define END 0
35 {
36 const u32 base = engine->mmio_base;
37
38 while (*data) {
39 u8 count, flags;
40
41 if (*data & BIT(7)) { /* skip */
42 count = *data++ & ~BIT(7);
43 regs += count;
44 continue;
45 }
46
47 count = *data & 0x3f;
48 flags = *data >> 6;
49 data++;
50
51 *regs = MI_LOAD_REGISTER_IMM(count);
52 if (flags & POSTED)
53 *regs |= MI_LRI_FORCE_POSTED;
54 if (GRAPHICS_VER(engine->i915) >= 11)
55 *regs |= MI_LRI_LRM_CS_MMIO;
56 regs++;
57
58 GEM_BUG_ON(!count);
59 do {
60 u32 offset = 0;
61 u8 v;
62
63 do {
64 v = *data++;
65 offset <<= 7;
66 offset |= v & ~BIT(7);
67 } while (v & BIT(7));
68
69 regs[0] = base + (offset << 2);
70 regs += 2;
71 } while (--count);
72 }
73
74 if (close) {
75 /* Close the batch; used mainly by live_lrc_layout() */
76 *regs = MI_BATCH_BUFFER_END;
77 if (GRAPHICS_VER(engine->i915) >= 11)
78 *regs |= BIT(0);
79 }
80 }
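/*
 * Quick reference for the offset tables below, derived from the macros
 * above and the decoder in set_offsets(); shown purely as illustration:
 *
 *   NOP(x)            - one byte with BIT(7) set: skip x dwords in the image
 *   LRI(count, flags) - header byte; expanded by set_offsets() into an
 *                       MI_LOAD_REGISTER_IMM(count) header dword
 *   REG(x) / REG16(x) - register offset x encoded 7 bits per byte, MSB
 *                       first, with BIT(7) marking that more bytes follow
 *
 * e.g. REG16(0x3a8) emits the bytes 0x81 0x6a, which decode back to
 * ((0x1 << 7 | 0x6a) << 2) == 0x3a8 and are written out as
 * mmio_base + 0x3a8, leaving the following value dword untouched for the
 * default/saved context state to provide.
 */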
81
82 static const u8 gen8_xcs_offsets[] = {
83 NOP(1),
84 LRI(11, 0),
85 REG16(0x244),
86 REG(0x034),
87 REG(0x030),
88 REG(0x038),
89 REG(0x03c),
90 REG(0x168),
91 REG(0x140),
92 REG(0x110),
93 REG(0x11c),
94 REG(0x114),
95 REG(0x118),
96
97 NOP(9),
98 LRI(9, 0),
99 REG16(0x3a8),
100 REG16(0x28c),
101 REG16(0x288),
102 REG16(0x284),
103 REG16(0x280),
104 REG16(0x27c),
105 REG16(0x278),
106 REG16(0x274),
107 REG16(0x270),
108
109 NOP(13),
110 LRI(2, 0),
111 REG16(0x200),
112 REG(0x028),
113
114 END
115 };
116
117 static const u8 gen9_xcs_offsets[] = {
118 NOP(1),
119 LRI(14, POSTED),
120 REG16(0x244),
121 REG(0x034),
122 REG(0x030),
123 REG(0x038),
124 REG(0x03c),
125 REG(0x168),
126 REG(0x140),
127 REG(0x110),
128 REG(0x11c),
129 REG(0x114),
130 REG(0x118),
131 REG(0x1c0),
132 REG(0x1c4),
133 REG(0x1c8),
134
135 NOP(3),
136 LRI(9, POSTED),
137 REG16(0x3a8),
138 REG16(0x28c),
139 REG16(0x288),
140 REG16(0x284),
141 REG16(0x280),
142 REG16(0x27c),
143 REG16(0x278),
144 REG16(0x274),
145 REG16(0x270),
146
147 NOP(13),
148 LRI(1, POSTED),
149 REG16(0x200),
150
151 NOP(13),
152 LRI(44, POSTED),
153 REG(0x028),
154 REG(0x09c),
155 REG(0x0c0),
156 REG(0x178),
157 REG(0x17c),
158 REG16(0x358),
159 REG(0x170),
160 REG(0x150),
161 REG(0x154),
162 REG(0x158),
163 REG16(0x41c),
164 REG16(0x600),
165 REG16(0x604),
166 REG16(0x608),
167 REG16(0x60c),
168 REG16(0x610),
169 REG16(0x614),
170 REG16(0x618),
171 REG16(0x61c),
172 REG16(0x620),
173 REG16(0x624),
174 REG16(0x628),
175 REG16(0x62c),
176 REG16(0x630),
177 REG16(0x634),
178 REG16(0x638),
179 REG16(0x63c),
180 REG16(0x640),
181 REG16(0x644),
182 REG16(0x648),
183 REG16(0x64c),
184 REG16(0x650),
185 REG16(0x654),
186 REG16(0x658),
187 REG16(0x65c),
188 REG16(0x660),
189 REG16(0x664),
190 REG16(0x668),
191 REG16(0x66c),
192 REG16(0x670),
193 REG16(0x674),
194 REG16(0x678),
195 REG16(0x67c),
196 REG(0x068),
197
198 END
199 };
200
201 static const u8 gen12_xcs_offsets[] = {
202 NOP(1),
203 LRI(13, POSTED),
204 REG16(0x244),
205 REG(0x034),
206 REG(0x030),
207 REG(0x038),
208 REG(0x03c),
209 REG(0x168),
210 REG(0x140),
211 REG(0x110),
212 REG(0x1c0),
213 REG(0x1c4),
214 REG(0x1c8),
215 REG(0x180),
216 REG16(0x2b4),
217
218 NOP(5),
219 LRI(9, POSTED),
220 REG16(0x3a8),
221 REG16(0x28c),
222 REG16(0x288),
223 REG16(0x284),
224 REG16(0x280),
225 REG16(0x27c),
226 REG16(0x278),
227 REG16(0x274),
228 REG16(0x270),
229
230 END
231 };
232
233 static const u8 dg2_xcs_offsets[] = {
234 NOP(1),
235 LRI(15, POSTED),
236 REG16(0x244),
237 REG(0x034),
238 REG(0x030),
239 REG(0x038),
240 REG(0x03c),
241 REG(0x168),
242 REG(0x140),
243 REG(0x110),
244 REG(0x1c0),
245 REG(0x1c4),
246 REG(0x1c8),
247 REG(0x180),
248 REG16(0x2b4),
249 REG(0x120),
250 REG(0x124),
251
252 NOP(1),
253 LRI(9, POSTED),
254 REG16(0x3a8),
255 REG16(0x28c),
256 REG16(0x288),
257 REG16(0x284),
258 REG16(0x280),
259 REG16(0x27c),
260 REG16(0x278),
261 REG16(0x274),
262 REG16(0x270),
263
264 END
265 };
266
267 static const u8 gen8_rcs_offsets[] = {
268 NOP(1),
269 LRI(14, POSTED),
270 REG16(0x244),
271 REG(0x034),
272 REG(0x030),
273 REG(0x038),
274 REG(0x03c),
275 REG(0x168),
276 REG(0x140),
277 REG(0x110),
278 REG(0x11c),
279 REG(0x114),
280 REG(0x118),
281 REG(0x1c0),
282 REG(0x1c4),
283 REG(0x1c8),
284
285 NOP(3),
286 LRI(9, POSTED),
287 REG16(0x3a8),
288 REG16(0x28c),
289 REG16(0x288),
290 REG16(0x284),
291 REG16(0x280),
292 REG16(0x27c),
293 REG16(0x278),
294 REG16(0x274),
295 REG16(0x270),
296
297 NOP(13),
298 LRI(1, 0),
299 REG(0x0c8),
300
301 END
302 };
303
304 static const u8 gen9_rcs_offsets[] = {
305 NOP(1),
306 LRI(14, POSTED),
307 REG16(0x244),
308 REG(0x34),
309 REG(0x30),
310 REG(0x38),
311 REG(0x3c),
312 REG(0x168),
313 REG(0x140),
314 REG(0x110),
315 REG(0x11c),
316 REG(0x114),
317 REG(0x118),
318 REG(0x1c0),
319 REG(0x1c4),
320 REG(0x1c8),
321
322 NOP(3),
323 LRI(9, POSTED),
324 REG16(0x3a8),
325 REG16(0x28c),
326 REG16(0x288),
327 REG16(0x284),
328 REG16(0x280),
329 REG16(0x27c),
330 REG16(0x278),
331 REG16(0x274),
332 REG16(0x270),
333
334 NOP(13),
335 LRI(1, 0),
336 REG(0xc8),
337
338 NOP(13),
339 LRI(44, POSTED),
340 REG(0x28),
341 REG(0x9c),
342 REG(0xc0),
343 REG(0x178),
344 REG(0x17c),
345 REG16(0x358),
346 REG(0x170),
347 REG(0x150),
348 REG(0x154),
349 REG(0x158),
350 REG16(0x41c),
351 REG16(0x600),
352 REG16(0x604),
353 REG16(0x608),
354 REG16(0x60c),
355 REG16(0x610),
356 REG16(0x614),
357 REG16(0x618),
358 REG16(0x61c),
359 REG16(0x620),
360 REG16(0x624),
361 REG16(0x628),
362 REG16(0x62c),
363 REG16(0x630),
364 REG16(0x634),
365 REG16(0x638),
366 REG16(0x63c),
367 REG16(0x640),
368 REG16(0x644),
369 REG16(0x648),
370 REG16(0x64c),
371 REG16(0x650),
372 REG16(0x654),
373 REG16(0x658),
374 REG16(0x65c),
375 REG16(0x660),
376 REG16(0x664),
377 REG16(0x668),
378 REG16(0x66c),
379 REG16(0x670),
380 REG16(0x674),
381 REG16(0x678),
382 REG16(0x67c),
383 REG(0x68),
384
385 END
386 };
387
388 static const u8 gen11_rcs_offsets[] = {
389 NOP(1),
390 LRI(15, POSTED),
391 REG16(0x244),
392 REG(0x034),
393 REG(0x030),
394 REG(0x038),
395 REG(0x03c),
396 REG(0x168),
397 REG(0x140),
398 REG(0x110),
399 REG(0x11c),
400 REG(0x114),
401 REG(0x118),
402 REG(0x1c0),
403 REG(0x1c4),
404 REG(0x1c8),
405 REG(0x180),
406
407 NOP(1),
408 LRI(9, POSTED),
409 REG16(0x3a8),
410 REG16(0x28c),
411 REG16(0x288),
412 REG16(0x284),
413 REG16(0x280),
414 REG16(0x27c),
415 REG16(0x278),
416 REG16(0x274),
417 REG16(0x270),
418
419 LRI(1, POSTED),
420 REG(0x1b0),
421
422 NOP(10),
423 LRI(1, 0),
424 REG(0x0c8),
425
426 END
427 };
428
429 static const u8 gen12_rcs_offsets[] = {
430 NOP(1),
431 LRI(13, POSTED),
432 REG16(0x244),
433 REG(0x034),
434 REG(0x030),
435 REG(0x038),
436 REG(0x03c),
437 REG(0x168),
438 REG(0x140),
439 REG(0x110),
440 REG(0x1c0),
441 REG(0x1c4),
442 REG(0x1c8),
443 REG(0x180),
444 REG16(0x2b4),
445
446 NOP(5),
447 LRI(9, POSTED),
448 REG16(0x3a8),
449 REG16(0x28c),
450 REG16(0x288),
451 REG16(0x284),
452 REG16(0x280),
453 REG16(0x27c),
454 REG16(0x278),
455 REG16(0x274),
456 REG16(0x270),
457
458 LRI(3, POSTED),
459 REG(0x1b0),
460 REG16(0x5a8),
461 REG16(0x5ac),
462
463 NOP(6),
464 LRI(1, 0),
465 REG(0x0c8),
466 NOP(3 + 9 + 1),
467
468 LRI(51, POSTED),
469 REG16(0x588),
470 REG16(0x588),
471 REG16(0x588),
472 REG16(0x588),
473 REG16(0x588),
474 REG16(0x588),
475 REG(0x028),
476 REG(0x09c),
477 REG(0x0c0),
478 REG(0x178),
479 REG(0x17c),
480 REG16(0x358),
481 REG(0x170),
482 REG(0x150),
483 REG(0x154),
484 REG(0x158),
485 REG16(0x41c),
486 REG16(0x600),
487 REG16(0x604),
488 REG16(0x608),
489 REG16(0x60c),
490 REG16(0x610),
491 REG16(0x614),
492 REG16(0x618),
493 REG16(0x61c),
494 REG16(0x620),
495 REG16(0x624),
496 REG16(0x628),
497 REG16(0x62c),
498 REG16(0x630),
499 REG16(0x634),
500 REG16(0x638),
501 REG16(0x63c),
502 REG16(0x640),
503 REG16(0x644),
504 REG16(0x648),
505 REG16(0x64c),
506 REG16(0x650),
507 REG16(0x654),
508 REG16(0x658),
509 REG16(0x65c),
510 REG16(0x660),
511 REG16(0x664),
512 REG16(0x668),
513 REG16(0x66c),
514 REG16(0x670),
515 REG16(0x674),
516 REG16(0x678),
517 REG16(0x67c),
518 REG(0x068),
519 REG(0x084),
520 NOP(1),
521
522 END
523 };
524
525 static const u8 xehp_rcs_offsets[] = {
526 NOP(1),
527 LRI(13, POSTED),
528 REG16(0x244),
529 REG(0x034),
530 REG(0x030),
531 REG(0x038),
532 REG(0x03c),
533 REG(0x168),
534 REG(0x140),
535 REG(0x110),
536 REG(0x1c0),
537 REG(0x1c4),
538 REG(0x1c8),
539 REG(0x180),
540 REG16(0x2b4),
541
542 NOP(5),
543 LRI(9, POSTED),
544 REG16(0x3a8),
545 REG16(0x28c),
546 REG16(0x288),
547 REG16(0x284),
548 REG16(0x280),
549 REG16(0x27c),
550 REG16(0x278),
551 REG16(0x274),
552 REG16(0x270),
553
554 LRI(3, POSTED),
555 REG(0x1b0),
556 REG16(0x5a8),
557 REG16(0x5ac),
558
559 NOP(6),
560 LRI(1, 0),
561 REG(0x0c8),
562
563 END
564 };
565
566 static const u8 dg2_rcs_offsets[] = {
567 NOP(1),
568 LRI(15, POSTED),
569 REG16(0x244),
570 REG(0x034),
571 REG(0x030),
572 REG(0x038),
573 REG(0x03c),
574 REG(0x168),
575 REG(0x140),
576 REG(0x110),
577 REG(0x1c0),
578 REG(0x1c4),
579 REG(0x1c8),
580 REG(0x180),
581 REG16(0x2b4),
582 REG(0x120),
583 REG(0x124),
584
585 NOP(1),
586 LRI(9, POSTED),
587 REG16(0x3a8),
588 REG16(0x28c),
589 REG16(0x288),
590 REG16(0x284),
591 REG16(0x280),
592 REG16(0x27c),
593 REG16(0x278),
594 REG16(0x274),
595 REG16(0x270),
596
597 LRI(3, POSTED),
598 REG(0x1b0),
599 REG16(0x5a8),
600 REG16(0x5ac),
601
602 NOP(6),
603 LRI(1, 0),
604 REG(0x0c8),
605
606 END
607 };
608
609 #undef END
610 #undef REG16
611 #undef REG
612 #undef LRI
613 #undef NOP
614
615 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
616 {
617 /*
618 * The gen12+ lists only have the registers we program in the basic
619 * default state. We rely on the context image using relative
620 * addressing to automatically fix up the register state between the
621 * physical engines for a virtual engine.
622 */
623 GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
624 !intel_engine_has_relative_mmio(engine));
625
626 if (engine->flags & I915_ENGINE_HAS_RCS_REG_STATE) {
627 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
628 return dg2_rcs_offsets;
629 else if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
630 return xehp_rcs_offsets;
631 else if (GRAPHICS_VER(engine->i915) >= 12)
632 return gen12_rcs_offsets;
633 else if (GRAPHICS_VER(engine->i915) >= 11)
634 return gen11_rcs_offsets;
635 else if (GRAPHICS_VER(engine->i915) >= 9)
636 return gen9_rcs_offsets;
637 else
638 return gen8_rcs_offsets;
639 } else {
640 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 55))
641 return dg2_xcs_offsets;
642 else if (GRAPHICS_VER(engine->i915) >= 12)
643 return gen12_xcs_offsets;
644 else if (GRAPHICS_VER(engine->i915) >= 9)
645 return gen9_xcs_offsets;
646 else
647 return gen8_xcs_offsets;
648 }
649 }
650
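/*
 * The lrc_ring_*() helpers below return the dword index within the context
 * image of the register *offset* for the named register (so the value lives
 * at regs[x + 1]), or -1 if the register is not present in this engine's
 * layout; see the callers in this file for how the +1 slot is poked.
 */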
651 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
652 {
653 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
654 return 0x70;
655 else if (GRAPHICS_VER(engine->i915) >= 12)
656 return 0x60;
657 else if (GRAPHICS_VER(engine->i915) >= 9)
658 return 0x54;
659 else if (engine->class == RENDER_CLASS)
660 return 0x58;
661 else
662 return -1;
663 }
664
665 static int lrc_ring_bb_offset(const struct intel_engine_cs *engine)
666 {
667 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
668 return 0x80;
669 else if (GRAPHICS_VER(engine->i915) >= 12)
670 return 0x70;
671 else if (GRAPHICS_VER(engine->i915) >= 9)
672 return 0x64;
673 else if (GRAPHICS_VER(engine->i915) >= 8 &&
674 engine->class == RENDER_CLASS)
675 return 0xc4;
676 else
677 return -1;
678 }
679
680 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
681 {
682 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
683 return 0x84;
684 else if (GRAPHICS_VER(engine->i915) >= 12)
685 return 0x74;
686 else if (GRAPHICS_VER(engine->i915) >= 9)
687 return 0x68;
688 else if (engine->class == RENDER_CLASS)
689 return 0xd8;
690 else
691 return -1;
692 }
693
694 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
695 {
696 if (GRAPHICS_VER(engine->i915) >= 12)
697 return 0x12;
698 else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
699 return 0x18;
700 else
701 return -1;
702 }
703
704 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
705 {
706 int x;
707
708 x = lrc_ring_wa_bb_per_ctx(engine);
709 if (x < 0)
710 return x;
711
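/* Skip the (offset, value) pair of BB_PER_CTX_PTR to land on the INDIRECT_CTX slot */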
712 return x + 2;
713 }
714
715 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
716 {
717 int x;
718
719 x = lrc_ring_indirect_ptr(engine);
720 if (x < 0)
721 return x;
722
723 return x + 2;
724 }
725
726 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
727 {
728
729 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
730 /*
731 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
732 * simply to match the RCS context image layout.
733 */
734 return 0xc6;
735 else if (engine->class != RENDER_CLASS)
736 return -1;
737 else if (GRAPHICS_VER(engine->i915) >= 12)
738 return 0xb6;
739 else if (GRAPHICS_VER(engine->i915) >= 11)
740 return 0xaa;
741 else
742 return -1;
743 }
744
745 static u32
746 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
747 {
748 switch (GRAPHICS_VER(engine->i915)) {
749 default:
750 MISSING_CASE(GRAPHICS_VER(engine->i915));
751 fallthrough;
752 case 12:
753 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
754 case 11:
755 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
756 case 9:
757 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
758 case 8:
759 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
760 }
761 }
762
763 static void
764 lrc_setup_indirect_ctx(u32 *regs,
765 const struct intel_engine_cs *engine,
766 u32 ctx_bb_ggtt_addr,
767 u32 size)
768 {
769 GEM_BUG_ON(!size);
770 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
771 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
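/*
 * The low bits of INDIRECT_CTX carry the BB length in cachelines, which
 * is why the size must be CACHELINE_BYTES aligned (checked above).
 */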
772 regs[lrc_ring_indirect_ptr(engine) + 1] =
773 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
774
775 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
776 regs[lrc_ring_indirect_offset(engine) + 1] =
777 lrc_ring_indirect_offset_default(engine) << 6;
778 }
779
780 static void init_common_regs(u32 * const regs,
781 const struct intel_context *ce,
782 const struct intel_engine_cs *engine,
783 bool inhibit)
784 {
785 u32 ctl;
786 int loc;
787
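/*
 * CONTEXT_CONTROL is a masked register: the upper 16 bits of each write
 * select which of the lower 16 bits take effect, hence _MASKED_BIT_*().
 */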
788 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
789 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
790 if (inhibit)
791 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
792 if (GRAPHICS_VER(engine->i915) < 11)
793 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
794 CTX_CTRL_RS_CTX_ENABLE);
795 regs[CTX_CONTEXT_CONTROL] = ctl;
796
797 regs[CTX_TIMESTAMP] = ce->stats.runtime.last;
798
799 loc = lrc_ring_bb_offset(engine);
800 if (loc != -1)
801 regs[loc + 1] = 0;
802 }
803
804 static void init_wa_bb_regs(u32 * const regs,
805 const struct intel_engine_cs *engine)
806 {
807 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
808
809 if (wa_ctx->per_ctx.size) {
810 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
811
812 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
813 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
814 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
815 }
816
817 if (wa_ctx->indirect_ctx.size) {
818 lrc_setup_indirect_ctx(regs, engine,
819 i915_ggtt_offset(wa_ctx->vma) +
820 wa_ctx->indirect_ctx.offset,
821 wa_ctx->indirect_ctx.size);
822 }
823 }
824
825 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
826 {
827 if (i915_vm_is_4lvl(&ppgtt->vm)) {
828 /* 64b PPGTT (48bit canonical)
829 * PDP0_DESCRIPTOR contains the base address to PML4 and
830 * other PDP Descriptors are ignored.
831 */
832 ASSIGN_CTX_PML4(ppgtt, regs);
833 } else {
834 ASSIGN_CTX_PDP(ppgtt, regs, 3);
835 ASSIGN_CTX_PDP(ppgtt, regs, 2);
836 ASSIGN_CTX_PDP(ppgtt, regs, 1);
837 ASSIGN_CTX_PDP(ppgtt, regs, 0);
838 }
839 }
840
841 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
842 {
843 if (i915_is_ggtt(vm))
844 return i915_vm_to_ggtt(vm)->alias;
845 else
846 return i915_vm_to_ppgtt(vm);
847 }
848
849 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
850 {
851 int x;
852
853 x = lrc_ring_mi_mode(engine);
854 if (x != -1) {
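/* MI_MODE is a masked register: set the mask bit so clearing STOP_RING takes effect on restore */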
855 regs[x + 1] &= ~STOP_RING;
856 regs[x + 1] |= STOP_RING << 16;
857 }
858 }
859
860 static void __lrc_init_regs(u32 *regs,
861 const struct intel_context *ce,
862 const struct intel_engine_cs *engine,
863 bool inhibit)
864 {
865 /*
866 * A context is actually a big batch buffer with several
867 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
868 * values we are setting here are only for the first context restore:
869 * on a subsequent save, the GPU will recreate this batchbuffer with new
870 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
871 * we are not initializing here).
872 *
873 * Must keep consistent with virtual_update_register_offsets().
874 */
875
876 if (inhibit)
877 memset(regs, 0, PAGE_SIZE);
878
879 set_offsets(regs, reg_offsets(engine), engine, inhibit);
880
881 init_common_regs(regs, ce, engine, inhibit);
882 init_ppgtt_regs(regs, vm_alias(ce->vm));
883
884 init_wa_bb_regs(regs, engine);
885
886 __reset_stop_ring(regs, engine);
887 }
888
889 void lrc_init_regs(const struct intel_context *ce,
890 const struct intel_engine_cs *engine,
891 bool inhibit)
892 {
893 __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
894 }
895
896 void lrc_reset_regs(const struct intel_context *ce,
897 const struct intel_engine_cs *engine)
898 {
899 __reset_stop_ring(ce->lrc_reg_state, engine);
900 }
901
902 static void
903 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
904 {
905 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
906 return;
907
908 vaddr += engine->context_size;
909
910 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
911 }
912
913 static void
914 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
915 {
916 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
917 return;
918
919 vaddr += engine->context_size;
920
921 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
922 drm_err_once(&engine->i915->drm,
923 "%s context redzone overwritten!\n",
924 engine->name);
925 }
926
927 static u32 context_wa_bb_offset(const struct intel_context *ce)
928 {
929 return PAGE_SIZE * ce->wa_bb_page;
930 }
931
932 static u32 *context_indirect_bb(const struct intel_context *ce)
933 {
934 void *ptr;
935
936 GEM_BUG_ON(!ce->wa_bb_page);
937
938 ptr = ce->lrc_reg_state;
939 ptr -= LRC_STATE_OFFSET; /* back to start of context image */
940 ptr += context_wa_bb_offset(ce);
941
942 return ptr;
943 }
944
945 void lrc_init_state(struct intel_context *ce,
946 struct intel_engine_cs *engine,
947 void *state)
948 {
949 bool inhibit = true;
950
951 set_redzone(state, engine);
952
953 if (engine->default_state) {
954 shmem_read(engine->default_state, 0,
955 state, engine->context_size);
956 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
957 inhibit = false;
958 }
959
960 /* Clear the ppHWSP (inc. per-context counters) */
961 memset(state, 0, PAGE_SIZE);
962
963 /* Clear the indirect wa and storage */
964 if (ce->wa_bb_page)
965 memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
966
967 /*
968 * The second page of the context object contains some registers which
969 * must be set up prior to the first execution.
970 */
971 __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
972 }
973
974 u32 lrc_indirect_bb(const struct intel_context *ce)
975 {
976 return i915_ggtt_offset(ce->state) + context_wa_bb_offset(ce);
977 }
978
979 static u32 *setup_predicate_disable_wa(const struct intel_context *ce, u32 *cs)
980 {
981 /* If predication is active, this will be noop'ed */
982 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
983 *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
984 *cs++ = 0;
985 *cs++ = 0; /* No predication */
986
987 /* predicated end, only terminates if SET_PREDICATE_RESULT:0 is clear */
988 *cs++ = MI_BATCH_BUFFER_END | BIT(15);
989 *cs++ = MI_SET_PREDICATE | MI_SET_PREDICATE_DISABLE;
990
991 /* Instructions are no longer predicated (disabled), we can proceed */
992 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT | (4 - 2);
993 *cs++ = lrc_indirect_bb(ce) + DG2_PREDICATE_RESULT_WA;
994 *cs++ = 0;
995 *cs++ = 1; /* enable predication before the next BB */
996
997 *cs++ = MI_BATCH_BUFFER_END;
998 GEM_BUG_ON(offset_in_page(cs) > DG2_PREDICATE_RESULT_WA);
999
1000 return cs;
1001 }
1002
1003 static struct i915_vma *
1004 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
1005 {
1006 struct drm_i915_gem_object *obj;
1007 struct i915_vma *vma;
1008 u32 context_size;
1009
1010 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
1011
1012 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1013 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
1014
1015 if (GRAPHICS_VER(engine->i915) == 12) {
1016 ce->wa_bb_page = context_size / PAGE_SIZE;
1017 context_size += PAGE_SIZE;
1018 }
1019
1020 if (intel_context_is_parent(ce) && intel_engine_uses_guc(engine)) {
1021 ce->parallel.guc.parent_page = context_size / PAGE_SIZE;
1022 context_size += PARENT_SCRATCH_SIZE;
1023 }
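/*
 * Resulting layout of the state object (informal sketch): the HWSP +
 * context image sized by engine->context_size, then an optional debug
 * redzone page, an optional gen12 indirect wa_bb page, and finally an
 * optional parent scratch area for GuC parallel submission.
 */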
1024
1025 obj = i915_gem_object_create_lmem(engine->i915, context_size,
1026 I915_BO_ALLOC_PM_VOLATILE);
1027 if (IS_ERR(obj))
1028 obj = i915_gem_object_create_shmem(engine->i915, context_size);
1029 if (IS_ERR(obj))
1030 return ERR_CAST(obj);
1031
1032 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1033 if (IS_ERR(vma)) {
1034 i915_gem_object_put(obj);
1035 return vma;
1036 }
1037
1038 return vma;
1039 }
1040
1041 static struct intel_timeline *
1042 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
1043 {
1044 struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
1045
1046 return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
1047 }
1048
1049 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
1050 {
1051 struct intel_ring *ring;
1052 struct i915_vma *vma;
1053 int err;
1054
1055 GEM_BUG_ON(ce->state);
1056
1057 vma = __lrc_alloc_state(ce, engine);
1058 if (IS_ERR(vma))
1059 return PTR_ERR(vma);
1060
1061 ring = intel_engine_create_ring(engine, ce->ring_size);
1062 if (IS_ERR(ring)) {
1063 err = PTR_ERR(ring);
1064 goto err_vma;
1065 }
1066
1067 if (!page_mask_bits(ce->timeline)) {
1068 struct intel_timeline *tl;
1069
1070 /*
1071 * Use the static global HWSP for the kernel context, and
1072 * a dynamically allocated cacheline for everyone else.
1073 */
1074 if (unlikely(ce->timeline))
1075 tl = pinned_timeline(ce, engine);
1076 else
1077 tl = intel_timeline_create(engine->gt);
1078 if (IS_ERR(tl)) {
1079 err = PTR_ERR(tl);
1080 goto err_ring;
1081 }
1082
1083 ce->timeline = tl;
1084 }
1085
1086 ce->ring = ring;
1087 ce->state = vma;
1088
1089 return 0;
1090
1091 err_ring:
1092 intel_ring_put(ring);
1093 err_vma:
1094 i915_vma_put(vma);
1095 return err;
1096 }
1097
1098 void lrc_reset(struct intel_context *ce)
1099 {
1100 GEM_BUG_ON(!intel_context_is_pinned(ce));
1101
1102 intel_ring_reset(ce->ring, ce->ring->emit);
1103
1104 /* Scrub away the garbage */
1105 lrc_init_regs(ce, ce->engine, true);
1106 ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
1107 }
1108
1109 int
1110 lrc_pre_pin(struct intel_context *ce,
1111 struct intel_engine_cs *engine,
1112 struct i915_gem_ww_ctx *ww,
1113 void **vaddr)
1114 {
1115 GEM_BUG_ON(!ce->state);
1116 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
1117
1118 *vaddr = i915_gem_object_pin_map(ce->state->obj,
1119 i915_coherent_map_type(ce->engine->i915,
1120 ce->state->obj,
1121 false) |
1122 I915_MAP_OVERRIDE);
1123
1124 return PTR_ERR_OR_ZERO(*vaddr);
1125 }
1126
1127 int
1128 lrc_pin(struct intel_context *ce,
1129 struct intel_engine_cs *engine,
1130 void *vaddr)
1131 {
1132 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
1133
1134 if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
1135 lrc_init_state(ce, engine, vaddr);
1136
1137 ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
1138 return 0;
1139 }
1140
1141 void lrc_unpin(struct intel_context *ce)
1142 {
1143 if (unlikely(ce->parallel.last_rq)) {
1144 i915_request_put(ce->parallel.last_rq);
1145 ce->parallel.last_rq = NULL;
1146 }
1147 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
1148 ce->engine);
1149 }
1150
1151 void lrc_post_unpin(struct intel_context *ce)
1152 {
1153 i915_gem_object_unpin_map(ce->state->obj);
1154 }
1155
1156 void lrc_fini(struct intel_context *ce)
1157 {
1158 if (!ce->state)
1159 return;
1160
1161 intel_ring_put(fetch_and_zero(&ce->ring));
1162 i915_vma_put(fetch_and_zero(&ce->state));
1163 }
1164
1165 void lrc_destroy(struct kref *kref)
1166 {
1167 struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1168
1169 GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1170 GEM_BUG_ON(intel_context_is_pinned(ce));
1171
1172 lrc_fini(ce);
1173
1174 intel_context_fini(ce);
1175 intel_context_free(ce);
1176 }
1177
1178 static u32 *
1179 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1180 {
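/*
 * Reload the CTX_TIMESTAMP dword saved in the context image into CS GPR0
 * and copy it back into the RING_CTX_TIMESTAMP register.
 */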
1181 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1182 MI_SRM_LRM_GLOBAL_GTT |
1183 MI_LRI_LRM_CS_MMIO;
1184 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1185 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1186 CTX_TIMESTAMP * sizeof(u32);
1187 *cs++ = 0;
1188
1189 *cs++ = MI_LOAD_REGISTER_REG |
1190 MI_LRR_SOURCE_CS_MMIO |
1191 MI_LRI_LRM_CS_MMIO;
1192 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1193 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1194
1195 *cs++ = MI_LOAD_REGISTER_REG |
1196 MI_LRR_SOURCE_CS_MMIO |
1197 MI_LRI_LRM_CS_MMIO;
1198 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1199 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1200
1201 return cs;
1202 }
1203
1204 static u32 *
1205 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1206 {
1207 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1208
1209 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1210 MI_SRM_LRM_GLOBAL_GTT |
1211 MI_LRI_LRM_CS_MMIO;
1212 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1213 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1214 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1215 *cs++ = 0;
1216
1217 return cs;
1218 }
1219
1220 static u32 *
1221 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1222 {
1223 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1224
1225 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1226 MI_SRM_LRM_GLOBAL_GTT |
1227 MI_LRI_LRM_CS_MMIO;
1228 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1229 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1230 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1231 *cs++ = 0;
1232
1233 *cs++ = MI_LOAD_REGISTER_REG |
1234 MI_LRR_SOURCE_CS_MMIO |
1235 MI_LRI_LRM_CS_MMIO;
1236 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1237 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1238
1239 return cs;
1240 }
1241
1242 /*
1243 * On DG2, during context restore of a preempted context in GPGPU mode,
1244 * an RCS restore hang is detected. This is extremely timing dependent.
1245 * To address this, the SW WA batch buffer below is implemented for DG2 A steppings.
1246 */
1247 static u32 *
1248 dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs)
1249 {
1250 *cs++ = MI_LOAD_REGISTER_IMM(1);
1251 *cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG);
1252 *cs++ = 0x21;
1253
1254 *cs++ = MI_LOAD_REGISTER_REG;
1255 *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1256 *cs++ = i915_mmio_reg_offset(GEN12_CULLBIT1);
1257
1258 *cs++ = MI_LOAD_REGISTER_REG;
1259 *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base));
1260 *cs++ = i915_mmio_reg_offset(GEN12_CULLBIT2);
1261
1262 return cs;
1263 }
1264
1265 /*
1266 * The bspec's tuning guide asks us to program a vertical watermark value of
1267 * 0x3FF. However, this register is not saved/restored properly by the
1268 * hardware, so we're required to apply the desired value via the INDIRECT_CTX
1269 * batch buffer to ensure the value takes effect properly. All other bits
1270 * in this register should remain at 0 (the hardware default).
1271 */
1272 static u32 *
1273 dg2_emit_draw_watermark_setting(u32 *cs)
1274 {
1275 *cs++ = MI_LOAD_REGISTER_IMM(1);
1276 *cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
1277 *cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
1278
1279 return cs;
1280 }
1281
1282 static u32 *
1283 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1284 {
1285 cs = gen12_emit_timestamp_wa(ce, cs);
1286 cs = gen12_emit_cmd_buf_wa(ce, cs);
1287 cs = gen12_emit_restore_scratch(ce, cs);
1288
1289 /* Wa_22011450934:dg2 */
1290 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) ||
1291 IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0))
1292 cs = dg2_emit_rcs_hang_wabb(ce, cs);
1293
1294 /* Wa_16013000631:dg2 */
1295 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1296 IS_DG2_G11(ce->engine->i915))
1297 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0);
1298
1299 /* hsdes: 1809175790 */
1300 if (!HAS_FLAT_CCS(ce->engine->i915))
1301 cs = gen12_emit_aux_table_inv(ce->engine->gt,
1302 cs, GEN12_GFX_CCS_AUX_NV);
1303
1304 /* Wa_16014892111 */
1305 if (IS_DG2(ce->engine->i915))
1306 cs = dg2_emit_draw_watermark_setting(cs);
1307
1308 return cs;
1309 }
1310
1311 static u32 *
1312 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1313 {
1314 cs = gen12_emit_timestamp_wa(ce, cs);
1315 cs = gen12_emit_restore_scratch(ce, cs);
1316
1317 /* Wa_16013000631:dg2 */
1318 if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) ||
1319 IS_DG2_G11(ce->engine->i915))
1320 if (ce->engine->class == COMPUTE_CLASS)
1321 cs = gen8_emit_pipe_control(cs,
1322 PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE,
1323 0);
1324
1325 /* hsdes: 1809175790 */
1326 if (!HAS_FLAT_CCS(ce->engine->i915)) {
1327 if (ce->engine->class == VIDEO_DECODE_CLASS)
1328 cs = gen12_emit_aux_table_inv(ce->engine->gt,
1329 cs, GEN12_VD0_AUX_NV);
1330 else if (ce->engine->class == VIDEO_ENHANCEMENT_CLASS)
1331 cs = gen12_emit_aux_table_inv(ce->engine->gt,
1332 cs, GEN12_VE0_AUX_NV);
1333 }
1334
1335 return cs;
1336 }
1337
1338 static void
1339 setup_indirect_ctx_bb(const struct intel_context *ce,
1340 const struct intel_engine_cs *engine,
1341 u32 *(*emit)(const struct intel_context *, u32 *))
1342 {
1343 u32 * const start = context_indirect_bb(ce);
1344 u32 *cs;
1345
1346 cs = emit(ce, start);
1347 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1348 while ((unsigned long)cs % CACHELINE_BYTES)
1349 *cs++ = MI_NOOP;
1350
1351 GEM_BUG_ON(cs - start > DG2_PREDICATE_RESULT_BB / sizeof(*start));
1352 setup_predicate_disable_wa(ce, start + DG2_PREDICATE_RESULT_BB / sizeof(*start));
1353
1354 lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1355 lrc_indirect_bb(ce),
1356 (cs - start) * sizeof(*cs));
1357 }
1358
1359 /*
1360 * The context descriptor encodes various attributes of a context,
1361 * including its GTT address and some flags. Because it's fairly
1362 * expensive to calculate, we'll just do it once and cache the result,
1363 * which remains valid until the context is unpinned.
1364 *
1365 * This is what a descriptor looks like, from LSB to MSB::
1366 *
1367 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
1368 * bits 12-31: LRCA, GTT address of (the HWSP of) this context
1369 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
1370 * bits 53-54: mbz, reserved for use by hardware
1371 * bits 55-63: group ID, currently unused and set to 0
1372 *
1373 * Starting from Gen11, the upper dword of the descriptor has a new format:
1374 *
1375 * bits 32-36: reserved
1376 * bits 37-47: SW context ID
1377 * bits 48-53: engine instance
1378 * bit 54: mbz, reserved for use by hardware
1379 * bits 55-60: SW counter
1380 * bits 61-63: engine class
1381 *
1382 * On Xe_HP, the upper dword of the descriptor has a new format:
1383 *
1384 * bits 32-37: virtual function number
1385 * bit 38: mbz, reserved for use by hardware
1386 * bits 39-54: SW context ID
1387 * bits 55-57: reserved
1388 * bits 58-63: SW counter
1389 *
1390 * engine info, SW context ID and SW counter need to form a unique number
1391 * (Context ID) per lrc.
1392 */
1393 static u32 lrc_descriptor(const struct intel_context *ce)
1394 {
1395 u32 desc;
1396
1397 desc = INTEL_LEGACY_32B_CONTEXT;
1398 if (i915_vm_is_4lvl(ce->vm))
1399 desc = INTEL_LEGACY_64B_CONTEXT;
1400 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1401
1402 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1403 if (GRAPHICS_VER(ce->vm->i915) == 8)
1404 desc |= GEN8_CTX_L3LLC_COHERENT;
1405
1406 return i915_ggtt_offset(ce->state) | desc;
1407 }
1408
1409 u32 lrc_update_regs(const struct intel_context *ce,
1410 const struct intel_engine_cs *engine,
1411 u32 head)
1412 {
1413 struct intel_ring *ring = ce->ring;
1414 u32 *regs = ce->lrc_reg_state;
1415
1416 GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1417 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1418
1419 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1420 regs[CTX_RING_HEAD] = head;
1421 regs[CTX_RING_TAIL] = ring->tail;
1422 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1423
1424 /* RPCS */
1425 if (engine->class == RENDER_CLASS) {
1426 regs[CTX_R_PWR_CLK_STATE] =
1427 intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1428
1429 i915_oa_init_reg_state(ce, engine);
1430 }
1431
1432 if (ce->wa_bb_page) {
1433 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1434
1435 fn = gen12_emit_indirect_ctx_xcs;
1436 if (ce->engine->class == RENDER_CLASS)
1437 fn = gen12_emit_indirect_ctx_rcs;
1438
1439 /* Mutually exclusive wrt the global indirect bb */
1440 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1441 setup_indirect_ctx_bb(ce, engine, fn);
1442 }
1443
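/* Have the HW reload the whole image on the next submission of this context */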
1444 return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1445 }
1446
1447 void lrc_update_offsets(struct intel_context *ce,
1448 struct intel_engine_cs *engine)
1449 {
1450 set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1451 }
1452
1453 void lrc_check_regs(const struct intel_context *ce,
1454 const struct intel_engine_cs *engine,
1455 const char *when)
1456 {
1457 const struct intel_ring *ring = ce->ring;
1458 u32 *regs = ce->lrc_reg_state;
1459 bool valid = true;
1460 int x;
1461
1462 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1463 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1464 engine->name,
1465 regs[CTX_RING_START],
1466 i915_ggtt_offset(ring->vma));
1467 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1468 valid = false;
1469 }
1470
1471 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1472 (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1473 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1474 engine->name,
1475 regs[CTX_RING_CTL],
1476 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1477 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1478 valid = false;
1479 }
1480
1481 x = lrc_ring_mi_mode(engine);
1482 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1483 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1484 engine->name, regs[x + 1]);
1485 regs[x + 1] &= ~STOP_RING;
1486 regs[x + 1] |= STOP_RING << 16;
1487 valid = false;
1488 }
1489
1490 WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1491 }
1492
1493 /*
1494 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1495 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1496 * but there is a slight complication as this is applied in WA batch where the
1497 * values are only initialized once so we cannot take register value at the
1498 * beginning and reuse it further; hence we save its value to memory, upload a
1499 * constant value with bit21 set and then we restore it back with the saved value.
1500 * To simplify the WA, a constant value is formed by using the default value
1501 * of this register. This shouldn't be a problem because we are only modifying
1502 * it for a short period and this batch is non-preemptible. We can of course
1503 * use additional instructions that read the actual value of the register
1504 * at that time and set our bit of interest but it makes the WA complicated.
1505 *
1506 * This WA is also required for Gen9 so extracting as a function avoids
1507 * code duplication.
1508 */
1509 static u32 *
1510 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1511 {
1512 /* NB no one else is allowed to scribble over scratch + 256! */
1513 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1514 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1515 *batch++ = intel_gt_scratch_offset(engine->gt,
1516 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1517 *batch++ = 0;
1518
1519 *batch++ = MI_LOAD_REGISTER_IMM(1);
1520 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1521 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1522
1523 batch = gen8_emit_pipe_control(batch,
1524 PIPE_CONTROL_CS_STALL |
1525 PIPE_CONTROL_DC_FLUSH_ENABLE,
1526 0);
1527
1528 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1529 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1530 *batch++ = intel_gt_scratch_offset(engine->gt,
1531 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1532 *batch++ = 0;
1533
1534 return batch;
1535 }
1536
1537 /*
1538 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1539 * initialized at the beginning and shared across all contexts but this field
1540 * helps us to have multiple batches at different offsets and select them based
1541 * on some criteria. At the moment this batch always starts at the beginning of the page
1542 * and at this point we don't have multiple wa_ctx batch buffers.
1543 *
1544 * The number of WAs applied is not known at the beginning; we use this field
1545 * to return the number of DWORDs written.
1546 *
1547 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1548 * so it adds NOOPs as padding to make it cacheline aligned.
1549 * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them together
1550 * makes a complete batch buffer.
1551 */
1552 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1553 {
1554 /* WaDisableCtxRestoreArbitration:bdw,chv */
1555 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1556
1557 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1558 if (IS_BROADWELL(engine->i915))
1559 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1560
1561 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1562 /* Actual scratch location is at 128 bytes offset */
1563 batch = gen8_emit_pipe_control(batch,
1564 PIPE_CONTROL_FLUSH_L3 |
1565 PIPE_CONTROL_STORE_DATA_INDEX |
1566 PIPE_CONTROL_CS_STALL |
1567 PIPE_CONTROL_QW_WRITE,
1568 LRC_PPHWSP_SCRATCH_ADDR);
1569
1570 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1571
1572 /* Pad to end of cacheline */
1573 while ((unsigned long)batch % CACHELINE_BYTES)
1574 *batch++ = MI_NOOP;
1575
1576 /*
1577 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1578 * execution depends on the length specified in terms of cache lines
1579 * in the register CTX_RCS_INDIRECT_CTX
1580 */
1581
1582 return batch;
1583 }
1584
1585 struct lri {
1586 i915_reg_t reg;
1587 u32 value;
1588 };
1589
1590 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1591 {
1592 GEM_BUG_ON(!count || count > 63);
1593
1594 *batch++ = MI_LOAD_REGISTER_IMM(count);
1595 do {
1596 *batch++ = i915_mmio_reg_offset(lri->reg);
1597 *batch++ = lri->value;
1598 } while (lri++, --count);
1599 *batch++ = MI_NOOP;
1600
1601 return batch;
1602 }
1603
1604 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1605 {
1606 static const struct lri lri[] = {
1607 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1608 {
1609 COMMON_SLICE_CHICKEN2,
1610 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1611 0),
1612 },
1613
1614 /* BSpec: 11391 */
1615 {
1616 FF_SLICE_CHICKEN,
1617 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1618 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1619 },
1620
1621 /* BSpec: 11299 */
1622 {
1623 _3D_CHICKEN3,
1624 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1625 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1626 }
1627 };
1628
1629 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1630
1631 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1632 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1633
1634 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1635 batch = gen8_emit_pipe_control(batch,
1636 PIPE_CONTROL_FLUSH_L3 |
1637 PIPE_CONTROL_STORE_DATA_INDEX |
1638 PIPE_CONTROL_CS_STALL |
1639 PIPE_CONTROL_QW_WRITE,
1640 LRC_PPHWSP_SCRATCH_ADDR);
1641
1642 batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1643
1644 /* WaMediaPoolStateCmdInWABB:bxt,glk */
1645 if (HAS_POOLED_EU(engine->i915)) {
1646 /*
1647 * EU pool configuration is set up along with the golden context
1648 * during context initialization. This value depends on the
1649 * device type (2x6 or 3x6) and needs to be updated based
1650 * on which subslice is disabled especially for 2x6
1651 * devices, however it is safe to load default
1652 * configuration of 3x6 device instead of masking off
1653 * corresponding bits because HW ignores bits of a disabled
1654 * subslice and drops down to appropriate config. Please
1655 * see render_state_setup() in i915_gem_render_state.c for
1656 * possible configurations, to avoid duplication they are
1657 * not shown here again.
1658 */
1659 *batch++ = GEN9_MEDIA_POOL_STATE;
1660 *batch++ = GEN9_MEDIA_POOL_ENABLE;
1661 *batch++ = 0x00777000;
1662 *batch++ = 0;
1663 *batch++ = 0;
1664 *batch++ = 0;
1665 }
1666
1667 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1668
1669 /* Pad to end of cacheline */
1670 while ((unsigned long)batch % CACHELINE_BYTES)
1671 *batch++ = MI_NOOP;
1672
1673 return batch;
1674 }
1675
1676 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1677
1678 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1679 {
1680 struct drm_i915_gem_object *obj;
1681 struct i915_vma *vma;
1682 int err;
1683
1684 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1685 if (IS_ERR(obj))
1686 return PTR_ERR(obj);
1687
1688 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1689 if (IS_ERR(vma)) {
1690 err = PTR_ERR(vma);
1691 goto err;
1692 }
1693
1694 engine->wa_ctx.vma = vma;
1695 return 0;
1696
1697 err:
1698 i915_gem_object_put(obj);
1699 return err;
1700 }
1701
1702 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1703 {
1704 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1705 }
1706
1707 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1708
1709 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1710 {
1711 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1712 struct i915_wa_ctx_bb *wa_bb[] = {
1713 &wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1714 };
1715 wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1716 struct i915_gem_ww_ctx ww;
1717 void *batch, *batch_ptr;
1718 unsigned int i;
1719 int err;
1720
1721 if (!(engine->flags & I915_ENGINE_HAS_RCS_REG_STATE))
1722 return;
1723
1724 switch (GRAPHICS_VER(engine->i915)) {
1725 case 12:
1726 case 11:
1727 return;
1728 case 9:
1729 wa_bb_fn[0] = gen9_init_indirectctx_bb;
1730 wa_bb_fn[1] = NULL;
1731 break;
1732 case 8:
1733 wa_bb_fn[0] = gen8_init_indirectctx_bb;
1734 wa_bb_fn[1] = NULL;
1735 break;
1736 default:
1737 MISSING_CASE(GRAPHICS_VER(engine->i915));
1738 return;
1739 }
1740
1741 err = lrc_create_wa_ctx(engine);
1742 if (err) {
1743 /*
1744 * We continue even if we fail to initialize the WA batch
1745 * because we only expect rare glitches but nothing
1746 * critical enough to prevent us from using the GPU.
1747 */
1748 drm_err(&engine->i915->drm,
1749 "Ignoring context switch w/a allocation error:%d\n",
1750 err);
1751 return;
1752 }
1753
1754 if (!engine->wa_ctx.vma)
1755 return;
1756
1757 i915_gem_ww_ctx_init(&ww, true);
1758 retry:
1759 err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1760 if (!err)
1761 err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1762 if (err)
1763 goto err;
1764
1765 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1766 if (IS_ERR(batch)) {
1767 err = PTR_ERR(batch);
1768 goto err_unpin;
1769 }
1770
1771 /*
1772 * Emit the two workaround batch buffers, recording the offset from the
1773 * start of the workaround batch buffer object for each and their
1774 * respective sizes.
1775 */
1776 batch_ptr = batch;
1777 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1778 wa_bb[i]->offset = batch_ptr - batch;
1779 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1780 CACHELINE_BYTES))) {
1781 err = -EINVAL;
1782 break;
1783 }
1784 if (wa_bb_fn[i])
1785 batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1786 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1787 }
1788 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1789
1790 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1791 __i915_gem_object_release_map(wa_ctx->vma->obj);
1792
1793 /* Verify that we can handle failure to setup the wa_ctx */
1794 if (!err)
1795 err = i915_inject_probe_error(engine->i915, -ENODEV);
1796
1797 err_unpin:
1798 if (err)
1799 i915_vma_unpin(wa_ctx->vma);
1800 err:
1801 if (err == -EDEADLK) {
1802 err = i915_gem_ww_ctx_backoff(&ww);
1803 if (!err)
1804 goto retry;
1805 }
1806 i915_gem_ww_ctx_fini(&ww);
1807
1808 if (err) {
1809 i915_vma_put(engine->wa_ctx.vma);
1810
1811 /* Clear all flags to prevent further use */
1812 memset(wa_ctx, 0, sizeof(*wa_ctx));
1813 }
1814 }
1815
1816 static void st_runtime_underflow(struct intel_context_stats *stats, s32 dt)
1817 {
1818 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1819 stats->runtime.num_underflow++;
1820 stats->runtime.max_underflow =
1821 max_t(u32, stats->runtime.max_underflow, -dt);
1822 #endif
1823 }
1824
1825 static u32 lrc_get_runtime(const struct intel_context *ce)
1826 {
1827 /*
1828 * We can use either ppHWSP[16] which is recorded before the context
1829 * switch (and so excludes the cost of context switches) or use the
1830 * value from the context image itself, which is saved/restored earlier
1831 * and so includes the cost of the save.
1832 */
1833 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
1834 }
1835
1836 void lrc_update_runtime(struct intel_context *ce)
1837 {
1838 struct intel_context_stats *stats = &ce->stats;
1839 u32 old;
1840 s32 dt;
1841
1842 old = stats->runtime.last;
1843 stats->runtime.last = lrc_get_runtime(ce);
1844 dt = stats->runtime.last - old;
1845 if (!dt)
1846 return;
1847
1848 if (unlikely(dt < 0)) {
1849 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1850 old, stats->runtime.last, dt);
1851 st_runtime_underflow(stats, dt);
1852 return;
1853 }
1854
1855 ewma_runtime_add(&stats->runtime.avg, dt);
1856 stats->runtime.total += dt;
1857 }
1858
1859 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1860 #include "selftest_lrc.c"
1861 #endif
1862