1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2016 Intel Corporation
4 */
5
6 #include <linux/kthread.h>
7
8 #include "gem/i915_gem_context.h"
9 #include "gem/i915_gem_internal.h"
10
11 #include "i915_gem_evict.h"
12 #include "intel_gt.h"
13 #include "intel_engine_heartbeat.h"
14 #include "intel_engine_pm.h"
15 #include "selftest_engine_heartbeat.h"
16
17 #include "i915_selftest.h"
18 #include "selftests/i915_random.h"
19 #include "selftests/igt_flush_test.h"
20 #include "selftests/igt_reset.h"
21 #include "selftests/igt_atomic.h"
22 #include "selftests/igt_spinner.h"
23 #include "selftests/intel_scheduler_helpers.h"
24
25 #include "selftests/mock_drm.h"
26
27 #include "gem/selftests/mock_context.h"
28 #include "gem/selftests/igt_gem_utils.h"
29
30 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
31
32 struct hang {
33 struct intel_gt *gt;
34 struct drm_i915_gem_object *hws;
35 struct drm_i915_gem_object *obj;
36 struct i915_gem_context *ctx;
37 u32 *seqno;
38 u32 *batch;
39 };
40
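/*
 * The "hang" fixture below provides a reusable way to keep an engine busy
 * with a batch that never completes on its own: hang_init() allocates a
 * HWS page (for per-request seqno writes) plus a batch object,
 * hang_create_request() emits a self-looping batch on a given engine, and
 * hang_fini() breaks the loop by writing MI_BATCH_BUFFER_END over the
 * start of the batch before releasing everything.
 */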
static int hang_init(struct hang *h, struct intel_gt *gt)
42 {
43 void *vaddr;
44 int err;
45
46 memset(h, 0, sizeof(*h));
47 h->gt = gt;
48
49 h->ctx = kernel_context(gt->i915, NULL);
50 if (IS_ERR(h->ctx))
51 return PTR_ERR(h->ctx);
52
53 GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
54
55 h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
56 if (IS_ERR(h->hws)) {
57 err = PTR_ERR(h->hws);
58 goto err_ctx;
59 }
60
61 h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
62 if (IS_ERR(h->obj)) {
63 err = PTR_ERR(h->obj);
64 goto err_hws;
65 }
66
67 i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
68 vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
69 if (IS_ERR(vaddr)) {
70 err = PTR_ERR(vaddr);
71 goto err_obj;
72 }
73 h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
74
75 vaddr = i915_gem_object_pin_map_unlocked(h->obj,
76 i915_coherent_map_type(gt->i915, h->obj, false));
77 if (IS_ERR(vaddr)) {
78 err = PTR_ERR(vaddr);
79 goto err_unpin_hws;
80 }
81 h->batch = vaddr;
82
83 return 0;
84
85 err_unpin_hws:
86 i915_gem_object_unpin_map(h->hws);
87 err_obj:
88 i915_gem_object_put(h->obj);
89 err_hws:
90 i915_gem_object_put(h->hws);
91 err_ctx:
92 kernel_context_close(h->ctx);
93 return err;
94 }
95
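/*
 * Each request writes its seqno into its own u32 slot in the HWS page,
 * indexed by the request's fence context (modulo the page size).
 */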
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
98 {
99 return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
100 }
101
static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
105 {
106 int err;
107
108 i915_vma_lock(vma);
109 err = i915_request_await_object(rq, vma->obj,
110 flags & EXEC_OBJECT_WRITE);
111 if (err == 0)
112 err = i915_vma_move_to_active(vma, rq, flags);
113 i915_vma_unlock(vma);
114
115 return err;
116 }
117
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
120 {
121 struct intel_gt *gt = h->gt;
122 struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
123 struct drm_i915_gem_object *obj;
124 struct i915_request *rq = NULL;
125 struct i915_vma *hws, *vma;
126 unsigned int flags;
127 void *vaddr;
128 u32 *batch;
129 int err;
130
131 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
132 if (IS_ERR(obj)) {
133 i915_vm_put(vm);
134 return ERR_CAST(obj);
135 }
136
137 vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
138 if (IS_ERR(vaddr)) {
139 i915_gem_object_put(obj);
140 i915_vm_put(vm);
141 return ERR_CAST(vaddr);
142 }
143
144 i915_gem_object_unpin_map(h->obj);
145 i915_gem_object_put(h->obj);
146
147 h->obj = obj;
148 h->batch = vaddr;
149
150 vma = i915_vma_instance(h->obj, vm, NULL);
151 if (IS_ERR(vma)) {
152 i915_vm_put(vm);
153 return ERR_CAST(vma);
154 }
155
156 hws = i915_vma_instance(h->hws, vm, NULL);
157 if (IS_ERR(hws)) {
158 i915_vm_put(vm);
159 return ERR_CAST(hws);
160 }
161
162 err = i915_vma_pin(vma, 0, 0, PIN_USER);
163 if (err) {
164 i915_vm_put(vm);
165 return ERR_PTR(err);
166 }
167
168 err = i915_vma_pin(hws, 0, 0, PIN_USER);
169 if (err)
170 goto unpin_vma;
171
172 rq = igt_request_alloc(h->ctx, engine);
173 if (IS_ERR(rq)) {
174 err = PTR_ERR(rq);
175 goto unpin_hws;
176 }
177
178 err = move_to_active(vma, rq, 0);
179 if (err)
180 goto cancel_rq;
181
182 err = move_to_active(hws, rq, 0);
183 if (err)
184 goto cancel_rq;
185
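/*
 * Build the hanging batch: store the request's seqno into its HWS slot,
 * run through a block of NOP padding, then branch back to the start of
 * the batch so it spins forever. The trailing MI_BATCH_BUFFER_END is only
 * reached once hang_fini() (or the test itself) rewrites the first dword.
 */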
186 batch = h->batch;
187 if (GRAPHICS_VER(gt->i915) >= 8) {
188 *batch++ = MI_STORE_DWORD_IMM_GEN4;
189 *batch++ = lower_32_bits(hws_address(hws, rq));
190 *batch++ = upper_32_bits(hws_address(hws, rq));
191 *batch++ = rq->fence.seqno;
192 *batch++ = MI_NOOP;
193
194 memset(batch, 0, 1024);
195 batch += 1024 / sizeof(*batch);
196
197 *batch++ = MI_NOOP;
198 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
199 *batch++ = lower_32_bits(vma->node.start);
200 *batch++ = upper_32_bits(vma->node.start);
201 } else if (GRAPHICS_VER(gt->i915) >= 6) {
202 *batch++ = MI_STORE_DWORD_IMM_GEN4;
203 *batch++ = 0;
204 *batch++ = lower_32_bits(hws_address(hws, rq));
205 *batch++ = rq->fence.seqno;
206 *batch++ = MI_NOOP;
207
208 memset(batch, 0, 1024);
209 batch += 1024 / sizeof(*batch);
210
211 *batch++ = MI_NOOP;
212 *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
213 *batch++ = lower_32_bits(vma->node.start);
214 } else if (GRAPHICS_VER(gt->i915) >= 4) {
215 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
216 *batch++ = 0;
217 *batch++ = lower_32_bits(hws_address(hws, rq));
218 *batch++ = rq->fence.seqno;
219 *batch++ = MI_NOOP;
220
221 memset(batch, 0, 1024);
222 batch += 1024 / sizeof(*batch);
223
224 *batch++ = MI_NOOP;
225 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
226 *batch++ = lower_32_bits(vma->node.start);
227 } else {
228 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
229 *batch++ = lower_32_bits(hws_address(hws, rq));
230 *batch++ = rq->fence.seqno;
231 *batch++ = MI_NOOP;
232
233 memset(batch, 0, 1024);
234 batch += 1024 / sizeof(*batch);
235
236 *batch++ = MI_NOOP;
237 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
238 *batch++ = lower_32_bits(vma->node.start);
239 }
240 *batch++ = MI_BATCH_BUFFER_END; /* not reached */
241 intel_gt_chipset_flush(engine->gt);
242
243 if (rq->engine->emit_init_breadcrumb) {
244 err = rq->engine->emit_init_breadcrumb(rq);
245 if (err)
246 goto cancel_rq;
247 }
248
249 flags = 0;
250 if (GRAPHICS_VER(gt->i915) <= 5)
251 flags |= I915_DISPATCH_SECURE;
252
253 err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
254
255 cancel_rq:
256 if (err) {
257 i915_request_set_error_once(rq, err);
258 i915_request_add(rq);
259 }
260 unpin_hws:
261 i915_vma_unpin(hws);
262 unpin_vma:
263 i915_vma_unpin(vma);
264 i915_vm_put(vm);
265 return err ? ERR_PTR(err) : rq;
266 }
267
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
269 {
270 return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
271 }
272
static void hang_fini(struct hang *h)
274 {
275 *h->batch = MI_BATCH_BUFFER_END;
276 intel_gt_chipset_flush(h->gt);
277
278 i915_gem_object_unpin_map(h->obj);
279 i915_gem_object_put(h->obj);
280
281 i915_gem_object_unpin_map(h->hws);
282 i915_gem_object_put(h->hws);
283
284 kernel_context_close(h->ctx);
285
286 igt_flush_test(h->gt->i915);
287 }
288
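/*
 * Poll briefly, then wait up to a second, for the spinner to report its
 * seqno in the HWS, i.e. for the request to actually start executing.
 */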
static bool wait_until_running(struct hang *h, struct i915_request *rq)
290 {
291 return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
292 rq->fence.seqno),
293 10) &&
294 wait_for(i915_seqno_passed(hws_seqno(h, rq),
295 rq->fence.seqno),
296 1000));
297 }
298
static int igt_hang_sanitycheck(void *arg)
300 {
301 struct intel_gt *gt = arg;
302 struct i915_request *rq;
303 struct intel_engine_cs *engine;
304 enum intel_engine_id id;
305 struct hang h;
306 int err;
307
308 /* Basic check that we can execute our hanging batch */
309
310 err = hang_init(&h, gt);
311 if (err)
312 return err;
313
314 for_each_engine(engine, gt, id) {
315 struct intel_wedge_me w;
316 long timeout;
317
318 if (!intel_engine_can_store_dword(engine))
319 continue;
320
321 rq = hang_create_request(&h, engine);
322 if (IS_ERR(rq)) {
323 err = PTR_ERR(rq);
324 pr_err("Failed to create request for %s, err=%d\n",
325 engine->name, err);
326 goto fini;
327 }
328
329 i915_request_get(rq);
330
331 *h.batch = MI_BATCH_BUFFER_END;
332 intel_gt_chipset_flush(engine->gt);
333
334 i915_request_add(rq);
335
336 timeout = 0;
337 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
338 timeout = i915_request_wait(rq, 0,
339 MAX_SCHEDULE_TIMEOUT);
340 if (intel_gt_is_wedged(gt))
341 timeout = -EIO;
342
343 i915_request_put(rq);
344
345 if (timeout < 0) {
346 err = timeout;
347 pr_err("Wait for request failed on %s, err=%d\n",
348 engine->name, err);
349 goto fini;
350 }
351 }
352
353 fini:
354 hang_fini(&h);
355 return err;
356 }
357
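/* Wait up to IGT_IDLE_TIMEOUT for the engine to go idle. */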
static bool wait_for_idle(struct intel_engine_cs *engine)
359 {
360 return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
361 }
362
static int igt_reset_nop(void *arg)
364 {
365 struct intel_gt *gt = arg;
struct i915_gpu_error *global = &gt->i915->gpu_error;
367 struct intel_engine_cs *engine;
368 unsigned int reset_count, count;
369 enum intel_engine_id id;
370 IGT_TIMEOUT(end_time);
371 int err = 0;
372
373 /* Check that we can reset during non-user portions of requests */
374
375 reset_count = i915_reset_count(global);
376 count = 0;
377 do {
378 for_each_engine(engine, gt, id) {
379 struct intel_context *ce;
380 int i;
381
382 ce = intel_context_create(engine);
383 if (IS_ERR(ce)) {
384 err = PTR_ERR(ce);
385 pr_err("[%s] Create context failed: %d!\n", engine->name, err);
386 break;
387 }
388
389 for (i = 0; i < 16; i++) {
390 struct i915_request *rq;
391
392 rq = intel_context_create_request(ce);
393 if (IS_ERR(rq)) {
394 err = PTR_ERR(rq);
395 pr_err("[%s] Create request failed: %d!\n",
396 engine->name, err);
397 break;
398 }
399
400 i915_request_add(rq);
401 }
402
403 intel_context_put(ce);
404 }
405
406 igt_global_reset_lock(gt);
407 intel_gt_reset(gt, ALL_ENGINES, NULL);
408 igt_global_reset_unlock(gt);
409
410 if (intel_gt_is_wedged(gt)) {
411 pr_err("[%s] GT is wedged!\n", engine->name);
412 err = -EIO;
413 break;
414 }
415
416 if (i915_reset_count(global) != reset_count + ++count) {
417 pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
418 engine->name, i915_reset_count(global), reset_count, count);
419 err = -EINVAL;
420 break;
421 }
422
423 err = igt_flush_test(gt->i915);
424 if (err) {
425 pr_err("[%s] Flush failed: %d!\n", engine->name, err);
426 break;
427 }
428 } while (time_before(jiffies, end_time));
429 pr_info("%s: %d resets\n", __func__, count);
430
431 if (igt_flush_test(gt->i915)) {
432 pr_err("Post flush failed: %d!\n", err);
433 err = -EIO;
434 }
435
436 return err;
437 }
438
static int igt_reset_nop_engine(void *arg)
440 {
441 struct intel_gt *gt = arg;
struct i915_gpu_error *global = &gt->i915->gpu_error;
443 struct intel_engine_cs *engine;
444 enum intel_engine_id id;
445
446 /* Check that we can engine-reset during non-user portions */
447
448 if (!intel_has_reset_engine(gt))
449 return 0;
450
451 for_each_engine(engine, gt, id) {
452 unsigned int reset_count, reset_engine_count, count;
453 struct intel_context *ce;
454 IGT_TIMEOUT(end_time);
455 int err;
456
457 if (intel_engine_uses_guc(engine)) {
458 /* Engine level resets are triggered by GuC when a hang
459 * is detected. They can't be triggered by the KMD any
460 * more. Thus a nop batch cannot be used as a reset test
461 */
462 continue;
463 }
464
465 ce = intel_context_create(engine);
466 if (IS_ERR(ce)) {
467 pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
468 return PTR_ERR(ce);
469 }
470
471 reset_count = i915_reset_count(global);
472 reset_engine_count = i915_reset_engine_count(global, engine);
473 count = 0;
474
475 st_engine_heartbeat_disable(engine);
476 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
&gt->reset.flags));
478 do {
479 int i;
480
481 if (!wait_for_idle(engine)) {
482 pr_err("%s failed to idle before reset\n",
483 engine->name);
484 err = -EIO;
485 break;
486 }
487
488 for (i = 0; i < 16; i++) {
489 struct i915_request *rq;
490
491 rq = intel_context_create_request(ce);
492 if (IS_ERR(rq)) {
493 struct drm_printer p =
494 drm_info_printer(gt->i915->drm.dev);
495 intel_engine_dump(engine, &p,
496 "%s(%s): failed to submit request\n",
497 __func__,
498 engine->name);
499
500 GEM_TRACE("%s(%s): failed to submit request\n",
501 __func__,
502 engine->name);
503 GEM_TRACE_DUMP();
504
505 intel_gt_set_wedged(gt);
506
507 err = PTR_ERR(rq);
508 break;
509 }
510
511 i915_request_add(rq);
512 }
513 err = intel_engine_reset(engine, NULL);
514 if (err) {
515 pr_err("intel_engine_reset(%s) failed, err:%d\n",
516 engine->name, err);
517 break;
518 }
519
520 if (i915_reset_count(global) != reset_count) {
521 pr_err("Full GPU reset recorded! (engine reset expected)\n");
522 err = -EINVAL;
523 break;
524 }
525
526 if (i915_reset_engine_count(global, engine) !=
527 reset_engine_count + ++count) {
528 pr_err("%s engine reset not recorded!\n",
529 engine->name);
530 err = -EINVAL;
531 break;
532 }
533 } while (time_before(jiffies, end_time));
clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
535 st_engine_heartbeat_enable(engine);
536
537 pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
538
539 intel_context_put(ce);
540 if (igt_flush_test(gt->i915))
541 err = -EIO;
542 if (err)
543 return err;
544 }
545
546 return 0;
547 }
548
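/*
 * Rig the engine's reset_timeout fault-injection attributes so that
 * subsequent engine resets are forced to time out, letting the test below
 * exercise the failure path of intel_engine_reset().
 */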
static void force_reset_timeout(struct intel_engine_cs *engine)
550 {
551 engine->reset_timeout.probability = 999;
552 atomic_set(&engine->reset_timeout.times, -1);
553 }
554
static void cancel_reset_timeout(struct intel_engine_cs *engine)
556 {
557 memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
558 }
559
static int igt_reset_fail_engine(void *arg)
561 {
562 struct intel_gt *gt = arg;
563 struct intel_engine_cs *engine;
564 enum intel_engine_id id;
565
/* Check that we can recover from engine-reset failures */
567
568 if (!intel_has_reset_engine(gt))
569 return 0;
570
571 for_each_engine(engine, gt, id) {
572 unsigned int count;
573 struct intel_context *ce;
574 IGT_TIMEOUT(end_time);
575 int err;
576
577 /* Can't manually break the reset if i915 doesn't perform it */
578 if (intel_engine_uses_guc(engine))
579 continue;
580
581 ce = intel_context_create(engine);
582 if (IS_ERR(ce)) {
583 pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
584 return PTR_ERR(ce);
585 }
586
587 st_engine_heartbeat_disable(engine);
588 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
&gt->reset.flags));
590
591 force_reset_timeout(engine);
592 err = intel_engine_reset(engine, NULL);
593 cancel_reset_timeout(engine);
594 if (err == 0) /* timeouts only generated on gen8+ */
595 goto skip;
596
597 count = 0;
598 do {
599 struct i915_request *last = NULL;
600 int i;
601
602 if (!wait_for_idle(engine)) {
603 pr_err("%s failed to idle before reset\n",
604 engine->name);
605 err = -EIO;
606 break;
607 }
608
609 for (i = 0; i < count % 15; i++) {
610 struct i915_request *rq;
611
612 rq = intel_context_create_request(ce);
613 if (IS_ERR(rq)) {
614 struct drm_printer p =
615 drm_info_printer(gt->i915->drm.dev);
616 intel_engine_dump(engine, &p,
617 "%s(%s): failed to submit request\n",
618 __func__,
619 engine->name);
620
621 GEM_TRACE("%s(%s): failed to submit request\n",
622 __func__,
623 engine->name);
624 GEM_TRACE_DUMP();
625
626 intel_gt_set_wedged(gt);
627 if (last)
628 i915_request_put(last);
629
630 err = PTR_ERR(rq);
631 goto out;
632 }
633
634 if (last)
635 i915_request_put(last);
636 last = i915_request_get(rq);
637 i915_request_add(rq);
638 }
639
640 if (count & 1) {
641 err = intel_engine_reset(engine, NULL);
642 if (err) {
643 GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
644 engine->name, err);
645 GEM_TRACE_DUMP();
646 i915_request_put(last);
647 break;
648 }
649 } else {
650 force_reset_timeout(engine);
651 err = intel_engine_reset(engine, NULL);
652 cancel_reset_timeout(engine);
653 if (err != -ETIMEDOUT) {
654 pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
655 engine->name, err);
656 i915_request_put(last);
657 break;
658 }
659 }
660
661 err = 0;
662 if (last) {
663 if (i915_request_wait(last, 0, HZ / 2) < 0) {
664 struct drm_printer p =
665 drm_info_printer(gt->i915->drm.dev);
666
667 intel_engine_dump(engine, &p,
668 "%s(%s): failed to complete request\n",
669 __func__,
670 engine->name);
671
672 GEM_TRACE("%s(%s): failed to complete request\n",
673 __func__,
674 engine->name);
675 GEM_TRACE_DUMP();
676
677 err = -EIO;
678 }
679 i915_request_put(last);
680 }
681 count++;
682 } while (err == 0 && time_before(jiffies, end_time));
683 out:
684 pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
685 skip:
clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
687 st_engine_heartbeat_enable(engine);
688 intel_context_put(ce);
689
690 if (igt_flush_test(gt->i915))
691 err = -EIO;
692 if (err)
693 return err;
694 }
695
696 return 0;
697 }
698
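/*
 * Reset a single engine repeatedly, either while idle or while it is
 * running a hanging batch, and check that only engine resets (never a
 * full GPU reset) are recorded.
 */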
static int __igt_reset_engine(struct intel_gt *gt, bool active)
700 {
struct i915_gpu_error *global = &gt->i915->gpu_error;
702 struct intel_engine_cs *engine;
703 enum intel_engine_id id;
704 struct hang h;
705 int err = 0;
706
707 /* Check that we can issue an engine reset on an idle engine (no-op) */
708
709 if (!intel_has_reset_engine(gt))
710 return 0;
711
712 if (active) {
713 err = hang_init(&h, gt);
714 if (err)
715 return err;
716 }
717
718 for_each_engine(engine, gt, id) {
719 unsigned int reset_count, reset_engine_count;
720 unsigned long count;
721 bool using_guc = intel_engine_uses_guc(engine);
722 IGT_TIMEOUT(end_time);
723
724 if (using_guc && !active)
725 continue;
726
727 if (active && !intel_engine_can_store_dword(engine))
728 continue;
729
730 if (!wait_for_idle(engine)) {
731 pr_err("%s failed to idle before reset\n",
732 engine->name);
733 err = -EIO;
734 break;
735 }
736
737 reset_count = i915_reset_count(global);
738 reset_engine_count = i915_reset_engine_count(global, engine);
739
740 st_engine_heartbeat_disable(engine);
741 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
&gt->reset.flags));
743 count = 0;
744 do {
745 struct i915_request *rq = NULL;
746 struct intel_selftest_saved_policy saved;
747 int err2;
748
749 err = intel_selftest_modify_policy(engine, &saved,
750 SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
751 if (err) {
752 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
753 break;
754 }
755
756 if (active) {
757 rq = hang_create_request(&h, engine);
758 if (IS_ERR(rq)) {
759 err = PTR_ERR(rq);
760 pr_err("[%s] Create hang request failed: %d!\n",
761 engine->name, err);
762 goto restore;
763 }
764
765 i915_request_get(rq);
766 i915_request_add(rq);
767
768 if (!wait_until_running(&h, rq)) {
769 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
770
771 pr_err("%s: Failed to start request %llx, at %x\n",
772 __func__, rq->fence.seqno, hws_seqno(&h, rq));
773 intel_engine_dump(engine, &p,
774 "%s\n", engine->name);
775
776 i915_request_put(rq);
777 err = -EIO;
778 goto restore;
779 }
780 }
781
782 if (!using_guc) {
783 err = intel_engine_reset(engine, NULL);
784 if (err) {
785 pr_err("intel_engine_reset(%s) failed, err:%d\n",
786 engine->name, err);
787 goto skip;
788 }
789 }
790
791 if (rq) {
792 /* Ensure the reset happens and kills the engine */
793 err = intel_selftest_wait_for_rq(rq);
794 if (err)
795 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
796 engine->name, rq->fence.context,
797 rq->fence.seqno, rq->context->guc_id.id, err);
798 }
799
800 skip:
801 if (rq)
802 i915_request_put(rq);
803
804 if (i915_reset_count(global) != reset_count) {
805 pr_err("Full GPU reset recorded! (engine reset expected)\n");
806 err = -EINVAL;
807 goto restore;
808 }
809
810 /* GuC based resets are not logged per engine */
811 if (!using_guc) {
812 if (i915_reset_engine_count(global, engine) !=
813 ++reset_engine_count) {
814 pr_err("%s engine reset not recorded!\n",
815 engine->name);
816 err = -EINVAL;
817 goto restore;
818 }
819 }
820
821 count++;
822
823 restore:
824 err2 = intel_selftest_restore_policy(engine, &saved);
825 if (err2)
826 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err);
827 if (err == 0)
828 err = err2;
829 if (err)
830 break;
831 } while (time_before(jiffies, end_time));
clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
833 st_engine_heartbeat_enable(engine);
834 pr_info("%s: Completed %lu %s resets\n",
835 engine->name, count, active ? "active" : "idle");
836
837 if (err)
838 break;
839
840 err = igt_flush_test(gt->i915);
841 if (err) {
842 pr_err("[%s] Flush failed: %d!\n", engine->name, err);
843 break;
844 }
845 }
846
847 if (intel_gt_is_wedged(gt)) {
848 pr_err("GT is wedged!\n");
849 err = -EIO;
850 }
851
852 if (active)
853 hang_fini(&h);
854
855 return err;
856 }
857
static int igt_reset_idle_engine(void *arg)
859 {
860 return __igt_reset_engine(arg, false);
861 }
862
static int igt_reset_active_engine(void *arg)
864 {
865 return __igt_reset_engine(arg, true);
866 }
867
868 struct active_engine {
869 struct task_struct *task;
870 struct intel_engine_cs *engine;
871 unsigned long resets;
872 unsigned int flags;
873 };
874
875 #define TEST_ACTIVE BIT(0)
876 #define TEST_OTHERS BIT(1)
877 #define TEST_SELF BIT(2)
878 #define TEST_PRIORITY BIT(3)
879
static int active_request_put(struct i915_request *rq)
881 {
882 int err = 0;
883
884 if (!rq)
885 return 0;
886
887 if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
888 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
889 rq->engine->name,
890 rq->fence.context,
891 rq->fence.seqno);
892 GEM_TRACE_DUMP();
893
894 intel_gt_set_wedged(rq->engine->gt);
895 err = -EIO;
896 }
897
898 i915_request_put(rq);
899
900 return err;
901 }
902
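/*
 * Background thread run on the "other" engines while one engine is being
 * reset: keep a rolling window of 8 requests in flight (optionally at
 * random priorities) and fail if any of them does not complete.
 */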
static int active_engine(void *data)
904 {
905 I915_RND_STATE(prng);
906 struct active_engine *arg = data;
907 struct intel_engine_cs *engine = arg->engine;
908 struct i915_request *rq[8] = {};
909 struct intel_context *ce[ARRAY_SIZE(rq)];
910 unsigned long count;
911 int err = 0;
912
913 for (count = 0; count < ARRAY_SIZE(ce); count++) {
914 ce[count] = intel_context_create(engine);
915 if (IS_ERR(ce[count])) {
916 err = PTR_ERR(ce[count]);
917 pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
918 while (--count)
919 intel_context_put(ce[count]);
920 return err;
921 }
922 }
923
924 count = 0;
925 while (!kthread_should_stop()) {
926 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
927 struct i915_request *old = rq[idx];
928 struct i915_request *new;
929
930 new = intel_context_create_request(ce[idx]);
931 if (IS_ERR(new)) {
932 err = PTR_ERR(new);
933 pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
934 break;
935 }
936
937 rq[idx] = i915_request_get(new);
938 i915_request_add(new);
939
940 if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
941 struct i915_sched_attr attr = {
942 .priority =
943 i915_prandom_u32_max_state(512, &prng),
944 };
945 engine->sched_engine->schedule(rq[idx], &attr);
946 }
947
948 err = active_request_put(old);
949 if (err) {
950 pr_err("[%s] Request put failed: %d!\n", engine->name, err);
951 break;
952 }
953
954 cond_resched();
955 }
956
957 for (count = 0; count < ARRAY_SIZE(rq); count++) {
958 int err__ = active_request_put(rq[count]);
959
if (err__)
pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);
962
963 /* Keep the first error */
964 if (!err)
965 err = err__;
966
967 intel_context_put(ce[count]);
968 }
969
970 return err;
971 }
972
static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
976 {
struct i915_gpu_error *global = &gt->i915->gpu_error;
978 struct intel_engine_cs *engine, *other;
979 enum intel_engine_id id, tmp;
980 struct hang h;
981 int err = 0;
982
983 /* Check that issuing a reset on one engine does not interfere
984 * with any other engine.
985 */
986
987 if (!intel_has_reset_engine(gt))
988 return 0;
989
990 if (flags & TEST_ACTIVE) {
991 err = hang_init(&h, gt);
992 if (err)
993 return err;
994
995 if (flags & TEST_PRIORITY)
996 h.ctx->sched.priority = 1024;
997 }
998
999 for_each_engine(engine, gt, id) {
1000 struct active_engine threads[I915_NUM_ENGINES] = {};
1001 unsigned long device = i915_reset_count(global);
1002 unsigned long count = 0, reported;
1003 bool using_guc = intel_engine_uses_guc(engine);
1004 IGT_TIMEOUT(end_time);
1005
1006 if (flags & TEST_ACTIVE) {
1007 if (!intel_engine_can_store_dword(engine))
1008 continue;
1009 } else if (using_guc)
1010 continue;
1011
1012 if (!wait_for_idle(engine)) {
1013 pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
1014 engine->name, test_name);
1015 err = -EIO;
1016 break;
1017 }
1018
1019 memset(threads, 0, sizeof(threads));
1020 for_each_engine(other, gt, tmp) {
1021 struct task_struct *tsk;
1022
1023 threads[tmp].resets =
1024 i915_reset_engine_count(global, other);
1025
1026 if (other == engine && !(flags & TEST_SELF))
1027 continue;
1028
1029 if (other != engine && !(flags & TEST_OTHERS))
1030 continue;
1031
1032 threads[tmp].engine = other;
1033 threads[tmp].flags = flags;
1034
1035 tsk = kthread_run(active_engine, &threads[tmp],
1036 "igt/%s", other->name);
1037 if (IS_ERR(tsk)) {
1038 err = PTR_ERR(tsk);
1039 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1040 goto unwind;
1041 }
1042
1043 threads[tmp].task = tsk;
1044 get_task_struct(tsk);
1045 }
1046
1047 yield(); /* start all threads before we begin */
1048
1049 st_engine_heartbeat_disable_no_pm(engine);
1050 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
&gt->reset.flags));
1052 do {
1053 struct i915_request *rq = NULL;
1054 struct intel_selftest_saved_policy saved;
1055 int err2;
1056
1057 err = intel_selftest_modify_policy(engine, &saved,
1058 SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
1059 if (err) {
1060 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
1061 break;
1062 }
1063
1064 if (flags & TEST_ACTIVE) {
1065 rq = hang_create_request(&h, engine);
1066 if (IS_ERR(rq)) {
1067 err = PTR_ERR(rq);
1068 pr_err("[%s] Create hang request failed: %d!\n",
1069 engine->name, err);
1070 goto restore;
1071 }
1072
1073 i915_request_get(rq);
1074 i915_request_add(rq);
1075
1076 if (!wait_until_running(&h, rq)) {
1077 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1078
1079 pr_err("%s: Failed to start request %llx, at %x\n",
1080 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1081 intel_engine_dump(engine, &p,
1082 "%s\n", engine->name);
1083
1084 i915_request_put(rq);
1085 err = -EIO;
1086 goto restore;
1087 }
1088 } else {
1089 intel_engine_pm_get(engine);
1090 }
1091
1092 if (!using_guc) {
1093 err = intel_engine_reset(engine, NULL);
1094 if (err) {
1095 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
1096 engine->name, test_name, err);
1097 goto restore;
1098 }
1099 }
1100
1101 if (rq) {
1102 /* Ensure the reset happens and kills the engine */
1103 err = intel_selftest_wait_for_rq(rq);
1104 if (err)
1105 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
1106 engine->name, rq->fence.context,
1107 rq->fence.seqno, rq->context->guc_id.id, err);
1108 }
1109
1110 count++;
1111
1112 if (rq) {
1113 if (rq->fence.error != -EIO) {
1114 pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
1115 engine->name, test_name,
1116 rq->fence.context,
1117 rq->fence.seqno, rq->context->guc_id.id);
1118 i915_request_put(rq);
1119
1120 GEM_TRACE_DUMP();
1121 intel_gt_set_wedged(gt);
1122 err = -EIO;
1123 goto restore;
1124 }
1125
1126 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
1127 struct drm_printer p =
1128 drm_info_printer(gt->i915->drm.dev);
1129
1130 pr_err("i915_reset_engine(%s:%s):"
1131 " failed to complete request %llx:%lld after reset\n",
1132 engine->name, test_name,
1133 rq->fence.context,
1134 rq->fence.seqno);
1135 intel_engine_dump(engine, &p,
1136 "%s\n", engine->name);
1137 i915_request_put(rq);
1138
1139 GEM_TRACE_DUMP();
1140 intel_gt_set_wedged(gt);
1141 err = -EIO;
1142 goto restore;
1143 }
1144
1145 i915_request_put(rq);
1146 }
1147
1148 if (!(flags & TEST_ACTIVE))
1149 intel_engine_pm_put(engine);
1150
1151 if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
1152 struct drm_printer p =
1153 drm_info_printer(gt->i915->drm.dev);
1154
1155 pr_err("i915_reset_engine(%s:%s):"
1156 " failed to idle after reset\n",
1157 engine->name, test_name);
1158 intel_engine_dump(engine, &p,
1159 "%s\n", engine->name);
1160
1161 err = -EIO;
1162 goto restore;
1163 }
1164
1165 restore:
1166 err2 = intel_selftest_restore_policy(engine, &saved);
1167 if (err2)
1168 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
1169 if (err == 0)
1170 err = err2;
1171 if (err)
1172 break;
1173 } while (time_before(jiffies, end_time));
clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
1175 st_engine_heartbeat_enable_no_pm(engine);
1176
1177 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
1178 engine->name, test_name, count);
1179
1180 /* GuC based resets are not logged per engine */
1181 if (!using_guc) {
1182 reported = i915_reset_engine_count(global, engine);
1183 reported -= threads[engine->id].resets;
1184 if (reported != count) {
1185 pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
1186 engine->name, test_name, count, reported);
1187 if (!err)
1188 err = -EINVAL;
1189 }
1190 }
1191
1192 unwind:
1193 for_each_engine(other, gt, tmp) {
1194 int ret;
1195
1196 if (!threads[tmp].task)
1197 continue;
1198
1199 ret = kthread_stop(threads[tmp].task);
1200 if (ret) {
1201 pr_err("kthread for other engine %s failed, err=%d\n",
1202 other->name, ret);
1203 if (!err)
1204 err = ret;
1205 }
1206 put_task_struct(threads[tmp].task);
1207
1208 /* GuC based resets are not logged per engine */
1209 if (!using_guc) {
1210 if (other->uabi_class != engine->uabi_class &&
1211 threads[tmp].resets !=
1212 i915_reset_engine_count(global, other)) {
1213 pr_err("Innocent engine %s was reset (count=%ld)\n",
1214 other->name,
1215 i915_reset_engine_count(global, other) -
1216 threads[tmp].resets);
1217 if (!err)
1218 err = -EINVAL;
1219 }
1220 }
1221 }
1222
1223 if (device != i915_reset_count(global)) {
1224 pr_err("Global reset (count=%ld)!\n",
1225 i915_reset_count(global) - device);
1226 if (!err)
1227 err = -EINVAL;
1228 }
1229
1230 if (err)
1231 break;
1232
1233 err = igt_flush_test(gt->i915);
1234 if (err) {
1235 pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1236 break;
1237 }
1238 }
1239
1240 if (intel_gt_is_wedged(gt))
1241 err = -EIO;
1242
1243 if (flags & TEST_ACTIVE)
1244 hang_fini(&h);
1245
1246 return err;
1247 }
1248
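/*
 * Table-driven wrapper: run __igt_reset_engines() for each phase (idle,
 * active, others-*, self-priority), skipping the priority phases when the
 * scheduler does not support priorities.
 */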
static int igt_reset_engines(void *arg)
1250 {
1251 static const struct {
1252 const char *name;
1253 unsigned int flags;
1254 } phases[] = {
1255 { "idle", 0 },
1256 { "active", TEST_ACTIVE },
1257 { "others-idle", TEST_OTHERS },
1258 { "others-active", TEST_OTHERS | TEST_ACTIVE },
1259 {
1260 "others-priority",
1261 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
1262 },
1263 {
1264 "self-priority",
1265 TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
1266 },
1267 { }
1268 };
1269 struct intel_gt *gt = arg;
1270 typeof(*phases) *p;
1271 int err;
1272
1273 for (p = phases; p->name; p++) {
1274 if (p->flags & TEST_PRIORITY) {
1275 if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
1276 continue;
1277 }
1278
1279 err = __igt_reset_engines(arg, p->name, p->flags);
1280 if (err)
1281 return err;
1282 }
1283
1284 return 0;
1285 }
1286
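/*
 * Pretend hangcheck fired: trigger a reset of the given engines directly
 * and return the global reset count sampled beforehand, so callers can
 * verify that a reset was recorded.
 */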
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1288 {
u32 count = i915_reset_count(&gt->i915->gpu_error);
1290
1291 intel_gt_reset(gt, mask, NULL);
1292
1293 return count;
1294 }
1295
static int igt_reset_wait(void *arg)
1297 {
1298 struct intel_gt *gt = arg;
struct i915_gpu_error *global = &gt->i915->gpu_error;
1300 struct intel_engine_cs *engine = gt->engine[RCS0];
1301 struct i915_request *rq;
1302 unsigned int reset_count;
1303 struct hang h;
1304 long timeout;
1305 int err;
1306
1307 if (!engine || !intel_engine_can_store_dword(engine))
1308 return 0;
1309
1310 /* Check that we detect a stuck waiter and issue a reset */
1311
1312 igt_global_reset_lock(gt);
1313
1314 err = hang_init(&h, gt);
1315 if (err) {
1316 pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1317 goto unlock;
1318 }
1319
1320 rq = hang_create_request(&h, engine);
1321 if (IS_ERR(rq)) {
1322 err = PTR_ERR(rq);
1323 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1324 goto fini;
1325 }
1326
1327 i915_request_get(rq);
1328 i915_request_add(rq);
1329
1330 if (!wait_until_running(&h, rq)) {
1331 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1332
1333 pr_err("%s: Failed to start request %llx, at %x\n",
1334 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1335 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1336
1337 intel_gt_set_wedged(gt);
1338
1339 err = -EIO;
1340 goto out_rq;
1341 }
1342
1343 reset_count = fake_hangcheck(gt, ALL_ENGINES);
1344
1345 timeout = i915_request_wait(rq, 0, 10);
1346 if (timeout < 0) {
1347 pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1348 timeout);
1349 err = timeout;
1350 goto out_rq;
1351 }
1352
1353 if (i915_reset_count(global) == reset_count) {
1354 pr_err("No GPU reset recorded!\n");
1355 err = -EINVAL;
1356 goto out_rq;
1357 }
1358
1359 out_rq:
1360 i915_request_put(rq);
1361 fini:
1362 hang_fini(&h);
1363 unlock:
1364 igt_global_reset_unlock(gt);
1365
1366 if (intel_gt_is_wedged(gt))
1367 return -EIO;
1368
1369 return err;
1370 }
1371
1372 struct evict_vma {
1373 struct completion completion;
1374 struct i915_vma *vma;
1375 };
1376
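/*
 * Worker threads for the reset-vs-eviction tests: evict_vma() tries to
 * evict the node pinned by the hanging batch, evict_fence() tries to
 * claim a fence register for it. Both signal completion first and are
 * expected to block on the hung request until the reset below unblocks
 * them.
 */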
static int evict_vma(void *data)
1378 {
1379 struct evict_vma *arg = data;
1380 struct i915_address_space *vm = arg->vma->vm;
1381 struct drm_mm_node evict = arg->vma->node;
1382 int err;
1383
1384 complete(&arg->completion);
1385
1386 mutex_lock(&vm->mutex);
1387 err = i915_gem_evict_for_node(vm, NULL, &evict, 0);
1388 mutex_unlock(&vm->mutex);
1389
1390 return err;
1391 }
1392
static int evict_fence(void *data)
1394 {
1395 struct evict_vma *arg = data;
1396 int err;
1397
1398 complete(&arg->completion);
1399
1400 /* Mark the fence register as dirty to force the mmio update. */
1401 err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1402 if (err) {
1403 pr_err("Invalid Y-tiling settings; err:%d\n", err);
1404 return err;
1405 }
1406
1407 err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1408 if (err) {
1409 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1410 return err;
1411 }
1412
1413 err = i915_vma_pin_fence(arg->vma);
1414 i915_vma_unpin(arg->vma);
1415 if (err) {
1416 pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1417 return err;
1418 }
1419
1420 i915_vma_unpin_fence(arg->vma);
1421
1422 return 0;
1423 }
1424
static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
1429 {
1430 struct intel_engine_cs *engine = gt->engine[RCS0];
1431 struct drm_i915_gem_object *obj;
1432 struct task_struct *tsk = NULL;
1433 struct i915_request *rq;
1434 struct evict_vma arg;
1435 struct hang h;
1436 unsigned int pin_flags;
1437 int err;
1438
1439 if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
1440 return 0;
1441
1442 if (!engine || !intel_engine_can_store_dword(engine))
1443 return 0;
1444
1445 /* Check that we can recover an unbind stuck on a hanging request */
1446
1447 err = hang_init(&h, gt);
1448 if (err) {
1449 pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1450 return err;
1451 }
1452
1453 obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1454 if (IS_ERR(obj)) {
1455 err = PTR_ERR(obj);
1456 pr_err("[%s] Create object failed: %d!\n", engine->name, err);
1457 goto fini;
1458 }
1459
1460 if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1461 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1462 if (err) {
1463 pr_err("Invalid X-tiling settings; err:%d\n", err);
1464 goto out_obj;
1465 }
1466 }
1467
1468 arg.vma = i915_vma_instance(obj, vm, NULL);
1469 if (IS_ERR(arg.vma)) {
1470 err = PTR_ERR(arg.vma);
1471 pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
1472 goto out_obj;
1473 }
1474
1475 rq = hang_create_request(&h, engine);
1476 if (IS_ERR(rq)) {
1477 err = PTR_ERR(rq);
1478 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1479 goto out_obj;
1480 }
1481
1482 pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;
1483
1484 if (flags & EXEC_OBJECT_NEEDS_FENCE)
1485 pin_flags |= PIN_MAPPABLE;
1486
1487 err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
1488 if (err) {
1489 i915_request_add(rq);
1490 pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
1491 goto out_obj;
1492 }
1493
1494 if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1495 err = i915_vma_pin_fence(arg.vma);
1496 if (err) {
1497 pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1498 i915_vma_unpin(arg.vma);
1499 i915_request_add(rq);
1500 goto out_obj;
1501 }
1502 }
1503
1504 i915_vma_lock(arg.vma);
1505 err = i915_request_await_object(rq, arg.vma->obj,
1506 flags & EXEC_OBJECT_WRITE);
1507 if (err == 0) {
1508 err = i915_vma_move_to_active(arg.vma, rq, flags);
1509 if (err)
1510 pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
1511 } else {
1512 pr_err("[%s] Request await failed: %d!\n", engine->name, err);
1513 }
1514
1515 i915_vma_unlock(arg.vma);
1516
1517 if (flags & EXEC_OBJECT_NEEDS_FENCE)
1518 i915_vma_unpin_fence(arg.vma);
1519 i915_vma_unpin(arg.vma);
1520
1521 i915_request_get(rq);
1522 i915_request_add(rq);
1523 if (err)
1524 goto out_rq;
1525
1526 if (!wait_until_running(&h, rq)) {
1527 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1528
1529 pr_err("%s: Failed to start request %llx, at %x\n",
1530 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1531 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1532
1533 intel_gt_set_wedged(gt);
1534 goto out_reset;
1535 }
1536
1537 init_completion(&arg.completion);
1538
1539 tsk = kthread_run(fn, &arg, "igt/evict_vma");
1540 if (IS_ERR(tsk)) {
1541 err = PTR_ERR(tsk);
1542 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1543 tsk = NULL;
1544 goto out_reset;
1545 }
1546 get_task_struct(tsk);
1547
1548 wait_for_completion(&arg.completion);
1549
1550 if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1551 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1552
1553 pr_err("igt/evict_vma kthread did not wait\n");
1554 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1555
1556 intel_gt_set_wedged(gt);
1557 goto out_reset;
1558 }
1559
1560 out_reset:
1561 igt_global_reset_lock(gt);
1562 fake_hangcheck(gt, rq->engine->mask);
1563 igt_global_reset_unlock(gt);
1564
1565 if (tsk) {
1566 struct intel_wedge_me w;
1567
1568 /* The reset, even indirectly, should take less than 10ms. */
1569 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1570 err = kthread_stop(tsk);
1571
1572 put_task_struct(tsk);
1573 }
1574
1575 out_rq:
1576 i915_request_put(rq);
1577 out_obj:
1578 i915_gem_object_put(obj);
1579 fini:
1580 hang_fini(&h);
1581 if (intel_gt_is_wedged(gt))
1582 return -EIO;
1583
1584 return err;
1585 }
1586
static int igt_reset_evict_ggtt(void *arg)
1588 {
1589 struct intel_gt *gt = arg;
1590
return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1592 evict_vma, EXEC_OBJECT_WRITE);
1593 }
1594
static int igt_reset_evict_ppgtt(void *arg)
1596 {
1597 struct intel_gt *gt = arg;
1598 struct i915_ppgtt *ppgtt;
1599 int err;
1600
1601 /* aliasing == global gtt locking, covered above */
1602 if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1603 return 0;
1604
1605 ppgtt = i915_ppgtt_create(gt, 0);
1606 if (IS_ERR(ppgtt))
1607 return PTR_ERR(ppgtt);
1608
1609 err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1610 evict_vma, EXEC_OBJECT_WRITE);
1611 i915_vm_put(&ppgtt->vm);
1612
1613 return err;
1614 }
1615
static int igt_reset_evict_fence(void *arg)
1617 {
1618 struct intel_gt *gt = arg;
1619
return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1621 evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1622 }
1623
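/* Wait for every engine other than @exclude to become idle. */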
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
1626 {
1627 struct intel_engine_cs *engine;
1628 enum intel_engine_id id;
1629
1630 for_each_engine(engine, gt, id) {
1631 if (engine == exclude)
1632 continue;
1633
1634 if (!wait_for_idle(engine))
1635 return -EIO;
1636 }
1637
1638 return 0;
1639 }
1640
static int igt_reset_queue(void *arg)
1642 {
1643 struct intel_gt *gt = arg;
struct i915_gpu_error *global = &gt->i915->gpu_error;
1645 struct intel_engine_cs *engine;
1646 enum intel_engine_id id;
1647 struct hang h;
1648 int err;
1649
1650 /* Check that we replay pending requests following a hang */
1651
1652 igt_global_reset_lock(gt);
1653
1654 err = hang_init(&h, gt);
1655 if (err)
1656 goto unlock;
1657
1658 for_each_engine(engine, gt, id) {
1659 struct intel_selftest_saved_policy saved;
1660 struct i915_request *prev;
1661 IGT_TIMEOUT(end_time);
1662 unsigned int count;
1663 bool using_guc = intel_engine_uses_guc(engine);
1664
1665 if (!intel_engine_can_store_dword(engine))
1666 continue;
1667
1668 if (using_guc) {
1669 err = intel_selftest_modify_policy(engine, &saved,
1670 SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
1671 if (err) {
1672 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
1673 goto fini;
1674 }
1675 }
1676
1677 prev = hang_create_request(&h, engine);
1678 if (IS_ERR(prev)) {
1679 err = PTR_ERR(prev);
1680 pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
1681 goto restore;
1682 }
1683
1684 i915_request_get(prev);
1685 i915_request_add(prev);
1686
1687 count = 0;
1688 do {
1689 struct i915_request *rq;
1690 unsigned int reset_count;
1691
1692 rq = hang_create_request(&h, engine);
1693 if (IS_ERR(rq)) {
1694 err = PTR_ERR(rq);
1695 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1696 goto restore;
1697 }
1698
1699 i915_request_get(rq);
1700 i915_request_add(rq);
1701
1702 /*
1703 * XXX We don't handle resetting the kernel context
1704 * very well. If we trigger a device reset twice in
1705 * quick succession while the kernel context is
1706 * executing, we may end up skipping the breadcrumb.
1707 * This is really only a problem for the selftest as
1708 * normally there is a large interlude between resets
1709 * (hangcheck), or we focus on resetting just one
1710 * engine and so avoid repeatedly resetting innocents.
1711 */
1712 err = wait_for_others(gt, engine);
1713 if (err) {
1714 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1715 __func__, engine->name);
1716 i915_request_put(rq);
1717 i915_request_put(prev);
1718
1719 GEM_TRACE_DUMP();
1720 intel_gt_set_wedged(gt);
1721 goto restore;
1722 }
1723
1724 if (!wait_until_running(&h, prev)) {
1725 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1726
1727 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1728 __func__, engine->name,
1729 prev->fence.seqno, hws_seqno(&h, prev));
1730 intel_engine_dump(engine, &p,
1731 "%s\n", engine->name);
1732
1733 i915_request_put(rq);
1734 i915_request_put(prev);
1735
1736 intel_gt_set_wedged(gt);
1737
1738 err = -EIO;
1739 goto restore;
1740 }
1741
1742 reset_count = fake_hangcheck(gt, BIT(id));
1743
1744 if (prev->fence.error != -EIO) {
1745 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1746 prev->fence.error);
1747 i915_request_put(rq);
1748 i915_request_put(prev);
1749 err = -EINVAL;
1750 goto restore;
1751 }
1752
1753 if (rq->fence.error) {
1754 pr_err("Fence error status not zero [%d] after unrelated reset\n",
1755 rq->fence.error);
1756 i915_request_put(rq);
1757 i915_request_put(prev);
1758 err = -EINVAL;
1759 goto restore;
1760 }
1761
1762 if (i915_reset_count(global) == reset_count) {
1763 pr_err("No GPU reset recorded!\n");
1764 i915_request_put(rq);
1765 i915_request_put(prev);
1766 err = -EINVAL;
1767 goto restore;
1768 }
1769
1770 i915_request_put(prev);
1771 prev = rq;
1772 count++;
1773 } while (time_before(jiffies, end_time));
1774 pr_info("%s: Completed %d queued resets\n",
1775 engine->name, count);
1776
1777 *h.batch = MI_BATCH_BUFFER_END;
1778 intel_gt_chipset_flush(engine->gt);
1779
1780 i915_request_put(prev);
1781
1782 restore:
1783 if (using_guc) {
1784 int err2 = intel_selftest_restore_policy(engine, &saved);
1785
1786 if (err2)
1787 pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
1788 __func__, __LINE__, engine->name, err2);
1789 if (err == 0)
1790 err = err2;
1791 }
1792 if (err)
1793 goto fini;
1794
1795 err = igt_flush_test(gt->i915);
1796 if (err) {
1797 pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1798 break;
1799 }
1800 }
1801
1802 fini:
1803 hang_fini(&h);
1804 unlock:
1805 igt_global_reset_unlock(gt);
1806
1807 if (intel_gt_is_wedged(gt))
1808 return -EIO;
1809
1810 return err;
1811 }
1812
static int igt_handle_error(void *arg)
1814 {
1815 struct intel_gt *gt = arg;
struct i915_gpu_error *global = &gt->i915->gpu_error;
1817 struct intel_engine_cs *engine = gt->engine[RCS0];
1818 struct hang h;
1819 struct i915_request *rq;
1820 struct i915_gpu_coredump *error;
1821 int err;
1822
1823 /* Check that we can issue a global GPU and engine reset */
1824
1825 if (!intel_has_reset_engine(gt))
1826 return 0;
1827
1828 if (!engine || !intel_engine_can_store_dword(engine))
1829 return 0;
1830
1831 err = hang_init(&h, gt);
1832 if (err) {
1833 pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1834 return err;
1835 }
1836
1837 rq = hang_create_request(&h, engine);
1838 if (IS_ERR(rq)) {
1839 err = PTR_ERR(rq);
1840 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1841 goto err_fini;
1842 }
1843
1844 i915_request_get(rq);
1845 i915_request_add(rq);
1846
1847 if (!wait_until_running(&h, rq)) {
1848 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1849
1850 pr_err("%s: Failed to start request %llx, at %x\n",
1851 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1852 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1853
1854 intel_gt_set_wedged(gt);
1855
1856 err = -EIO;
1857 goto err_request;
1858 }
1859
1860 /* Temporarily disable error capture */
1861 error = xchg(&global->first_error, (void *)-1);
1862
1863 intel_gt_handle_error(gt, engine->mask, 0, NULL);
1864
1865 xchg(&global->first_error, error);
1866
1867 if (rq->fence.error != -EIO) {
1868 pr_err("Guilty request not identified!\n");
1869 err = -EINVAL;
1870 goto err_request;
1871 }
1872
1873 err_request:
1874 i915_request_put(rq);
1875 err_fini:
1876 hang_fini(&h);
1877 return err;
1878 }
1879
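/*
 * Perform an engine reset from inside the given atomic section (e.g. with
 * irqs, preemption or softirqs disabled), with the submission tasklet
 * parked, to check that __intel_engine_reset_bh() is safe to call from
 * atomic context.
 */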
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
1883 {
1884 struct tasklet_struct * const t = &engine->sched_engine->tasklet;
1885 int err;
1886
1887 GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1888 engine->name, mode, p->name);
1889
1890 if (t->func)
1891 tasklet_disable(t);
1892 if (strcmp(p->name, "softirq"))
1893 local_bh_disable();
1894 p->critical_section_begin();
1895
1896 err = __intel_engine_reset_bh(engine, NULL);
1897
1898 p->critical_section_end();
1899 if (strcmp(p->name, "softirq"))
1900 local_bh_enable();
1901 if (t->func) {
1902 tasklet_enable(t);
1903 tasklet_hi_schedule(t);
1904 }
1905
1906 if (err)
1907 pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1908 engine->name, mode, p->name);
1909
1910 return err;
1911 }
1912
static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
1915 {
1916 struct i915_request *rq;
1917 struct hang h;
1918 int err;
1919
1920 err = __igt_atomic_reset_engine(engine, p, "idle");
1921 if (err)
1922 return err;
1923
1924 err = hang_init(&h, engine->gt);
1925 if (err) {
1926 pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1927 return err;
1928 }
1929
1930 rq = hang_create_request(&h, engine);
1931 if (IS_ERR(rq)) {
1932 err = PTR_ERR(rq);
1933 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1934 goto out;
1935 }
1936
1937 i915_request_get(rq);
1938 i915_request_add(rq);
1939
1940 if (wait_until_running(&h, rq)) {
1941 err = __igt_atomic_reset_engine(engine, p, "active");
1942 } else {
1943 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1944 __func__, engine->name,
1945 rq->fence.seqno, hws_seqno(&h, rq));
1946 intel_gt_set_wedged(engine->gt);
1947 err = -EIO;
1948 }
1949
1950 if (err == 0) {
1951 struct intel_wedge_me w;
1952
1953 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1954 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1955 if (intel_gt_is_wedged(engine->gt))
1956 err = -EIO;
1957 }
1958
1959 i915_request_put(rq);
1960 out:
1961 hang_fini(&h);
1962 return err;
1963 }
1964
static int igt_reset_engines_atomic(void *arg)
1966 {
1967 struct intel_gt *gt = arg;
1968 const typeof(*igt_atomic_phases) *p;
1969 int err = 0;
1970
/* Check that engine resets are usable from atomic context */
1972
1973 if (!intel_has_reset_engine(gt))
1974 return 0;
1975
if (intel_uc_uses_guc_submission(&gt->uc))
1977 return 0;
1978
1979 igt_global_reset_lock(gt);
1980
1981 /* Flush any requests before we get started and check basics */
1982 if (!igt_force_reset(gt))
1983 goto unlock;
1984
1985 for (p = igt_atomic_phases; p->name; p++) {
1986 struct intel_engine_cs *engine;
1987 enum intel_engine_id id;
1988
1989 for_each_engine(engine, gt, id) {
1990 err = igt_atomic_reset_engine(engine, p);
1991 if (err)
1992 goto out;
1993 }
1994 }
1995
1996 out:
1997 /* As we poke around the guts, do a full reset before continuing. */
1998 igt_force_reset(gt);
1999 unlock:
2000 igt_global_reset_unlock(gt);
2001
2002 return err;
2003 }
2004
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
2006 {
2007 static const struct i915_subtest tests[] = {
2008 SUBTEST(igt_hang_sanitycheck),
2009 SUBTEST(igt_reset_nop),
2010 SUBTEST(igt_reset_nop_engine),
2011 SUBTEST(igt_reset_idle_engine),
2012 SUBTEST(igt_reset_active_engine),
2013 SUBTEST(igt_reset_fail_engine),
2014 SUBTEST(igt_reset_engines),
2015 SUBTEST(igt_reset_engines_atomic),
2016 SUBTEST(igt_reset_queue),
2017 SUBTEST(igt_reset_wait),
2018 SUBTEST(igt_reset_evict_ggtt),
2019 SUBTEST(igt_reset_evict_ppgtt),
2020 SUBTEST(igt_reset_evict_fence),
2021 SUBTEST(igt_handle_error),
2022 };
2023 struct intel_gt *gt = to_gt(i915);
2024 intel_wakeref_t wakeref;
2025 int err;
2026
2027 if (!intel_has_gpu_reset(gt))
2028 return 0;
2029
2030 if (intel_gt_is_wedged(gt))
2031 return -EIO; /* we're long past hope of a successful reset */
2032
2033 wakeref = intel_runtime_pm_get(gt->uncore->rpm);
2034
2035 err = intel_gt_live_subtests(tests, gt);
2036
2037 intel_runtime_pm_put(gt->uncore->rpm, wakeref);
2038
2039 return err;
2040 }
2041