1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2016 Intel Corporation
4 */
5
6 #include <linux/kthread.h>
7
8 #include "gem/i915_gem_context.h"
9 #include "gem/i915_gem_internal.h"
10
11 #include "i915_gem_evict.h"
12 #include "intel_gt.h"
13 #include "intel_engine_heartbeat.h"
14 #include "intel_engine_pm.h"
15 #include "selftest_engine_heartbeat.h"
16
17 #include "i915_selftest.h"
18 #include "selftests/i915_random.h"
19 #include "selftests/igt_flush_test.h"
20 #include "selftests/igt_reset.h"
21 #include "selftests/igt_atomic.h"
22 #include "selftests/igt_spinner.h"
23 #include "selftests/intel_scheduler_helpers.h"
24
25 #include "selftests/mock_drm.h"
26
27 #include "gem/selftests/mock_context.h"
28 #include "gem/selftests/igt_gem_utils.h"
29
30 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
31
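/*
 * Fixture shared by the hangcheck selftests: a non-bannable kernel
 * context plus two internal objects, one used as a poor man's HWS page
 * (one seqno slot per fence context) and one holding the spinning batch.
 */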
32 struct hang {
33 struct intel_gt *gt;
34 struct drm_i915_gem_object *hws;
35 struct drm_i915_gem_object *obj;
36 struct i915_gem_context *ctx;
37 u32 *seqno;
38 u32 *batch;
39 };
40
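/*
 * Allocate the context and backing objects for the hang fixture and map
 * them: the HWS page as write-back, the batch object with whatever
 * coherent mapping the platform requires.
 */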
41 static int hang_init(struct hang *h, struct intel_gt *gt)
42 {
43 void *vaddr;
44 int err;
45
46 memset(h, 0, sizeof(*h));
47 h->gt = gt;
48
49 h->ctx = kernel_context(gt->i915, NULL);
50 if (IS_ERR(h->ctx))
51 return PTR_ERR(h->ctx);
52
53 GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
54
55 h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
56 if (IS_ERR(h->hws)) {
57 err = PTR_ERR(h->hws);
58 goto err_ctx;
59 }
60
61 h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
62 if (IS_ERR(h->obj)) {
63 err = PTR_ERR(h->obj);
64 goto err_hws;
65 }
66
67 i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
68 vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
69 if (IS_ERR(vaddr)) {
70 err = PTR_ERR(vaddr);
71 goto err_obj;
72 }
73 h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
74
75 vaddr = i915_gem_object_pin_map_unlocked(h->obj,
76 i915_coherent_map_type(gt->i915, h->obj, false));
77 if (IS_ERR(vaddr)) {
78 err = PTR_ERR(vaddr);
79 goto err_unpin_hws;
80 }
81 h->batch = vaddr;
82
83 return 0;
84
85 err_unpin_hws:
86 i915_gem_object_unpin_map(h->hws);
87 err_obj:
88 i915_gem_object_put(h->obj);
89 err_hws:
90 i915_gem_object_put(h->hws);
91 err_ctx:
92 kernel_context_close(h->ctx);
93 return err;
94 }
95
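/*
 * Each request writes its seqno to a distinct dword in the HWS page,
 * indexed by its fence context, so concurrent hangs do not clobber each
 * other's breadcrumbs.
 */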
96 static u64 hws_address(const struct i915_vma *hws,
97 const struct i915_request *rq)
98 {
99 return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
100 }
101
102 static int move_to_active(struct i915_vma *vma,
103 struct i915_request *rq,
104 unsigned int flags)
105 {
106 int err;
107
108 i915_vma_lock(vma);
109 err = i915_request_await_object(rq, vma->obj,
110 flags & EXEC_OBJECT_WRITE);
111 if (err == 0)
112 err = i915_vma_move_to_active(vma, rq, flags);
113 i915_vma_unlock(vma);
114
115 return err;
116 }
117
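/*
 * Build a request whose batch stores the request seqno to the HWS page
 * and then spins forever by jumping back to its own start. Tests (and
 * hang_fini()) stop the spin by overwriting the first dword of the
 * batch with MI_BATCH_BUFFER_END.
 */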
118 static struct i915_request *
119 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
120 {
121 struct intel_gt *gt = h->gt;
122 struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
123 struct drm_i915_gem_object *obj;
124 struct i915_request *rq = NULL;
125 struct i915_vma *hws, *vma;
126 unsigned int flags;
127 void *vaddr;
128 u32 *batch;
129 int err;
130
131 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
132 if (IS_ERR(obj)) {
133 i915_vm_put(vm);
134 return ERR_CAST(obj);
135 }
136
137 vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
138 if (IS_ERR(vaddr)) {
139 i915_gem_object_put(obj);
140 i915_vm_put(vm);
141 return ERR_CAST(vaddr);
142 }
143
144 i915_gem_object_unpin_map(h->obj);
145 i915_gem_object_put(h->obj);
146
147 h->obj = obj;
148 h->batch = vaddr;
149
150 vma = i915_vma_instance(h->obj, vm, NULL);
151 if (IS_ERR(vma)) {
152 i915_vm_put(vm);
153 return ERR_CAST(vma);
154 }
155
156 hws = i915_vma_instance(h->hws, vm, NULL);
157 if (IS_ERR(hws)) {
158 i915_vm_put(vm);
159 return ERR_CAST(hws);
160 }
161
162 err = i915_vma_pin(vma, 0, 0, PIN_USER);
163 if (err) {
164 i915_vm_put(vm);
165 return ERR_PTR(err);
166 }
167
168 err = i915_vma_pin(hws, 0, 0, PIN_USER);
169 if (err)
170 goto unpin_vma;
171
172 rq = igt_request_alloc(h->ctx, engine);
173 if (IS_ERR(rq)) {
174 err = PTR_ERR(rq);
175 goto unpin_hws;
176 }
177
178 err = move_to_active(vma, rq, 0);
179 if (err)
180 goto cancel_rq;
181
182 err = move_to_active(hws, rq, 0);
183 if (err)
184 goto cancel_rq;
185
186 batch = h->batch;
187 if (GRAPHICS_VER(gt->i915) >= 8) {
188 *batch++ = MI_STORE_DWORD_IMM_GEN4;
189 *batch++ = lower_32_bits(hws_address(hws, rq));
190 *batch++ = upper_32_bits(hws_address(hws, rq));
191 *batch++ = rq->fence.seqno;
192 *batch++ = MI_NOOP;
193
194 memset(batch, 0, 1024);
195 batch += 1024 / sizeof(*batch);
196
197 *batch++ = MI_NOOP;
198 *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
199 *batch++ = lower_32_bits(vma->node.start);
200 *batch++ = upper_32_bits(vma->node.start);
201 } else if (GRAPHICS_VER(gt->i915) >= 6) {
202 *batch++ = MI_STORE_DWORD_IMM_GEN4;
203 *batch++ = 0;
204 *batch++ = lower_32_bits(hws_address(hws, rq));
205 *batch++ = rq->fence.seqno;
206 *batch++ = MI_NOOP;
207
208 memset(batch, 0, 1024);
209 batch += 1024 / sizeof(*batch);
210
211 *batch++ = MI_NOOP;
212 *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
213 *batch++ = lower_32_bits(vma->node.start);
214 } else if (GRAPHICS_VER(gt->i915) >= 4) {
215 *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
216 *batch++ = 0;
217 *batch++ = lower_32_bits(hws_address(hws, rq));
218 *batch++ = rq->fence.seqno;
219 *batch++ = MI_NOOP;
220
221 memset(batch, 0, 1024);
222 batch += 1024 / sizeof(*batch);
223
224 *batch++ = MI_NOOP;
225 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
226 *batch++ = lower_32_bits(vma->node.start);
227 } else {
228 *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
229 *batch++ = lower_32_bits(hws_address(hws, rq));
230 *batch++ = rq->fence.seqno;
231 *batch++ = MI_NOOP;
232
233 memset(batch, 0, 1024);
234 batch += 1024 / sizeof(*batch);
235
236 *batch++ = MI_NOOP;
237 *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
238 *batch++ = lower_32_bits(vma->node.start);
239 }
240 *batch++ = MI_BATCH_BUFFER_END; /* not reached */
241 intel_gt_chipset_flush(engine->gt);
242
243 if (rq->engine->emit_init_breadcrumb) {
244 err = rq->engine->emit_init_breadcrumb(rq);
245 if (err)
246 goto cancel_rq;
247 }
248
249 flags = 0;
250 if (GRAPHICS_VER(gt->i915) <= 5)
251 flags |= I915_DISPATCH_SECURE;
252
253 err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
254
255 cancel_rq:
256 if (err) {
257 i915_request_set_error_once(rq, err);
258 i915_request_add(rq);
259 }
260 unpin_hws:
261 i915_vma_unpin(hws);
262 unpin_vma:
263 i915_vma_unpin(vma);
264 i915_vm_put(vm);
265 return err ? ERR_PTR(err) : rq;
266 }
267
268 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
269 {
270 return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
271 }
272
273 static void hang_fini(struct hang *h)
274 {
275 *h->batch = MI_BATCH_BUFFER_END;
276 intel_gt_chipset_flush(h->gt);
277
278 i915_gem_object_unpin_map(h->obj);
279 i915_gem_object_put(h->obj);
280
281 i915_gem_object_unpin_map(h->hws);
282 i915_gem_object_put(h->hws);
283
284 kernel_context_close(h->ctx);
285
286 igt_flush_test(h->gt->i915);
287 }
288
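/*
 * The hanging batch is considered running once it has written its seqno
 * to the HWS page; poll for that breadcrumb, first busily for up to
 * 10us and then sleeping for up to a second.
 */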
289 static bool wait_until_running(struct hang *h, struct i915_request *rq)
290 {
291 return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
292 rq->fence.seqno),
293 10) &&
294 wait_for(i915_seqno_passed(hws_seqno(h, rq),
295 rq->fence.seqno),
296 1000));
297 }
298
299 static int igt_hang_sanitycheck(void *arg)
300 {
301 struct intel_gt *gt = arg;
302 struct i915_request *rq;
303 struct intel_engine_cs *engine;
304 enum intel_engine_id id;
305 struct hang h;
306 int err;
307
308 /* Basic check that we can execute our hanging batch */
309
310 err = hang_init(&h, gt);
311 if (err)
312 return err;
313
314 for_each_engine(engine, gt, id) {
315 struct intel_wedge_me w;
316 long timeout;
317
318 if (!intel_engine_can_store_dword(engine))
319 continue;
320
321 rq = hang_create_request(&h, engine);
322 if (IS_ERR(rq)) {
323 err = PTR_ERR(rq);
324 pr_err("Failed to create request for %s, err=%d\n",
325 engine->name, err);
326 goto fini;
327 }
328
329 i915_request_get(rq);
330
331 *h.batch = MI_BATCH_BUFFER_END;
332 intel_gt_chipset_flush(engine->gt);
333
334 i915_request_add(rq);
335
336 timeout = 0;
337 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
338 timeout = i915_request_wait(rq, 0,
339 MAX_SCHEDULE_TIMEOUT);
340 if (intel_gt_is_wedged(gt))
341 timeout = -EIO;
342
343 i915_request_put(rq);
344
345 if (timeout < 0) {
346 err = timeout;
347 pr_err("Wait for request failed on %s, err=%d\n",
348 engine->name, err);
349 goto fini;
350 }
351 }
352
353 fini:
354 hang_fini(&h);
355 return err;
356 }
357
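/* Wait up to IGT_IDLE_TIMEOUT for the engine to report idle. */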
358 static bool wait_for_idle(struct intel_engine_cs *engine)
359 {
360 return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
361 }
362
363 static int igt_reset_nop(void *arg)
364 {
365 struct intel_gt *gt = arg;
366 struct i915_gpu_error *global = &gt->i915->gpu_error;
367 struct intel_engine_cs *engine;
368 unsigned int reset_count, count;
369 enum intel_engine_id id;
370 IGT_TIMEOUT(end_time);
371 int err = 0;
372
373 /* Check that we can reset during non-user portions of requests */
374
375 reset_count = i915_reset_count(global);
376 count = 0;
377 do {
378 for_each_engine(engine, gt, id) {
379 struct intel_context *ce;
380 int i;
381
382 ce = intel_context_create(engine);
383 if (IS_ERR(ce)) {
384 err = PTR_ERR(ce);
385 pr_err("[%s] Create context failed: %d!\n", engine->name, err);
386 break;
387 }
388
389 for (i = 0; i < 16; i++) {
390 struct i915_request *rq;
391
392 rq = intel_context_create_request(ce);
393 if (IS_ERR(rq)) {
394 err = PTR_ERR(rq);
395 pr_err("[%s] Create request failed: %d!\n",
396 engine->name, err);
397 break;
398 }
399
400 i915_request_add(rq);
401 }
402
403 intel_context_put(ce);
404 }
405
406 igt_global_reset_lock(gt);
407 intel_gt_reset(gt, ALL_ENGINES, NULL);
408 igt_global_reset_unlock(gt);
409
410 if (intel_gt_is_wedged(gt)) {
411 pr_err("[%s] GT is wedged!\n", engine->name);
412 err = -EIO;
413 break;
414 }
415
416 if (i915_reset_count(global) != reset_count + ++count) {
417 pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
418 engine->name, i915_reset_count(global), reset_count, count);
419 err = -EINVAL;
420 break;
421 }
422
423 err = igt_flush_test(gt->i915);
424 if (err) {
425 pr_err("[%s] Flush failed: %d!\n", engine->name, err);
426 break;
427 }
428 } while (time_before(jiffies, end_time));
429 pr_info("%s: %d resets\n", __func__, count);
430
431 if (igt_flush_test(gt->i915)) {
432 pr_err("Post flush failed: %d!\n", err);
433 err = -EIO;
434 }
435
436 return err;
437 }
438
439 static int igt_reset_nop_engine(void *arg)
440 {
441 struct intel_gt *gt = arg;
442 struct i915_gpu_error *global = &gt->i915->gpu_error;
443 struct intel_engine_cs *engine;
444 enum intel_engine_id id;
445
446 /* Check that we can engine-reset during non-user portions */
447
448 if (!intel_has_reset_engine(gt))
449 return 0;
450
451 for_each_engine(engine, gt, id) {
452 unsigned int reset_count, reset_engine_count, count;
453 struct intel_context *ce;
454 IGT_TIMEOUT(end_time);
455 int err;
456
457 if (intel_engine_uses_guc(engine)) {
458 /* Engine level resets are triggered by GuC when a hang
459 * is detected. They can no longer be triggered by the
460 * KMD, so a nop batch cannot be used as a reset test.
461 */
462 continue;
463 }
464
465 ce = intel_context_create(engine);
466 if (IS_ERR(ce)) {
467 pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
468 return PTR_ERR(ce);
469 }
470
471 reset_count = i915_reset_count(global);
472 reset_engine_count = i915_reset_engine_count(global, engine);
473 count = 0;
474
475 st_engine_heartbeat_disable(engine);
476 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
477 &gt->reset.flags));
478 do {
479 int i;
480
481 if (!wait_for_idle(engine)) {
482 pr_err("%s failed to idle before reset\n",
483 engine->name);
484 err = -EIO;
485 break;
486 }
487
488 for (i = 0; i < 16; i++) {
489 struct i915_request *rq;
490
491 rq = intel_context_create_request(ce);
492 if (IS_ERR(rq)) {
493 struct drm_printer p =
494 drm_info_printer(gt->i915->drm.dev);
495 intel_engine_dump(engine, &p,
496 "%s(%s): failed to submit request\n",
497 __func__,
498 engine->name);
499
500 GEM_TRACE("%s(%s): failed to submit request\n",
501 __func__,
502 engine->name);
503 GEM_TRACE_DUMP();
504
505 intel_gt_set_wedged(gt);
506
507 err = PTR_ERR(rq);
508 break;
509 }
510
511 i915_request_add(rq);
512 }
513 err = intel_engine_reset(engine, NULL);
514 if (err) {
515 pr_err("intel_engine_reset(%s) failed, err:%d\n",
516 engine->name, err);
517 break;
518 }
519
520 if (i915_reset_count(global) != reset_count) {
521 pr_err("Full GPU reset recorded! (engine reset expected)\n");
522 err = -EINVAL;
523 break;
524 }
525
526 if (i915_reset_engine_count(global, engine) !=
527 reset_engine_count + ++count) {
528 pr_err("%s engine reset not recorded!\n",
529 engine->name);
530 err = -EINVAL;
531 break;
532 }
533 } while (time_before(jiffies, end_time));
534 clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
535 st_engine_heartbeat_enable(engine);
536
537 pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
538
539 intel_context_put(ce);
540 if (igt_flush_test(gt->i915))
541 err = -EIO;
542 if (err)
543 return err;
544 }
545
546 return 0;
547 }
548
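/*
 * Poke the engine's reset_timeout fault injection so that subsequent
 * engine resets report -ETIMEDOUT (on platforms that generate such
 * timeouts); cancel_reset_timeout() restores normal behaviour.
 */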
549 static void force_reset_timeout(struct intel_engine_cs *engine)
550 {
551 engine->reset_timeout.probability = 999;
552 atomic_set(&engine->reset_timeout.times, -1);
553 }
554
555 static void cancel_reset_timeout(struct intel_engine_cs *engine)
556 {
557 memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
558 }
559
560 static int igt_reset_fail_engine(void *arg)
561 {
562 struct intel_gt *gt = arg;
563 struct intel_engine_cs *engine;
564 enum intel_engine_id id;
565
566 /* Check that we can recover from engine-reset failures */
567
568 if (!intel_has_reset_engine(gt))
569 return 0;
570
571 for_each_engine(engine, gt, id) {
572 unsigned int count;
573 struct intel_context *ce;
574 IGT_TIMEOUT(end_time);
575 int err;
576
577 /* Can't manually break the reset if i915 doesn't perform it */
578 if (intel_engine_uses_guc(engine))
579 continue;
580
581 ce = intel_context_create(engine);
582 if (IS_ERR(ce)) {
583 pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
584 return PTR_ERR(ce);
585 }
586
587 st_engine_heartbeat_disable(engine);
588 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
589 &gt->reset.flags));
590
591 force_reset_timeout(engine);
592 err = intel_engine_reset(engine, NULL);
593 cancel_reset_timeout(engine);
594 if (err == 0) /* timeouts only generated on gen8+ */
595 goto skip;
596
597 count = 0;
598 do {
599 struct i915_request *last = NULL;
600 int i;
601
602 if (!wait_for_idle(engine)) {
603 pr_err("%s failed to idle before reset\n",
604 engine->name);
605 err = -EIO;
606 break;
607 }
608
609 for (i = 0; i < count % 15; i++) {
610 struct i915_request *rq;
611
612 rq = intel_context_create_request(ce);
613 if (IS_ERR(rq)) {
614 struct drm_printer p =
615 drm_info_printer(gt->i915->drm.dev);
616 intel_engine_dump(engine, &p,
617 "%s(%s): failed to submit request\n",
618 __func__,
619 engine->name);
620
621 GEM_TRACE("%s(%s): failed to submit request\n",
622 __func__,
623 engine->name);
624 GEM_TRACE_DUMP();
625
626 intel_gt_set_wedged(gt);
627 if (last)
628 i915_request_put(last);
629
630 err = PTR_ERR(rq);
631 goto out;
632 }
633
634 if (last)
635 i915_request_put(last);
636 last = i915_request_get(rq);
637 i915_request_add(rq);
638 }
639
640 if (count & 1) {
641 err = intel_engine_reset(engine, NULL);
642 if (err) {
643 GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
644 engine->name, err);
645 GEM_TRACE_DUMP();
646 i915_request_put(last);
647 break;
648 }
649 } else {
650 force_reset_timeout(engine);
651 err = intel_engine_reset(engine, NULL);
652 cancel_reset_timeout(engine);
653 if (err != -ETIMEDOUT) {
654 pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
655 engine->name, err);
656 i915_request_put(last);
657 break;
658 }
659 }
660
661 err = 0;
662 if (last) {
663 if (i915_request_wait(last, 0, HZ / 2) < 0) {
664 struct drm_printer p =
665 drm_info_printer(gt->i915->drm.dev);
666
667 intel_engine_dump(engine, &p,
668 "%s(%s): failed to complete request\n",
669 __func__,
670 engine->name);
671
672 GEM_TRACE("%s(%s): failed to complete request\n",
673 __func__,
674 engine->name);
675 GEM_TRACE_DUMP();
676
677 err = -EIO;
678 }
679 i915_request_put(last);
680 }
681 count++;
682 } while (err == 0 && time_before(jiffies, end_time));
683 out:
684 pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
685 skip:
686 clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
687 st_engine_heartbeat_enable(engine);
688 intel_context_put(ce);
689
690 if (igt_flush_test(gt->i915))
691 err = -EIO;
692 if (err)
693 return err;
694 }
695
696 return 0;
697 }
698
699 static int __igt_reset_engine(struct intel_gt *gt, bool active)
700 {
701 struct i915_gpu_error *global = &gt->i915->gpu_error;
702 struct intel_engine_cs *engine;
703 enum intel_engine_id id;
704 struct hang h;
705 int err = 0;
706
707 /* Check that we can issue an engine reset on an idle engine (no-op) */
708
709 if (!intel_has_reset_engine(gt))
710 return 0;
711
712 if (active) {
713 err = hang_init(&h, gt);
714 if (err)
715 return err;
716 }
717
718 for_each_engine(engine, gt, id) {
719 unsigned int reset_count, reset_engine_count;
720 unsigned long count;
721 bool using_guc = intel_engine_uses_guc(engine);
722 IGT_TIMEOUT(end_time);
723
724 if (using_guc && !active)
725 continue;
726
727 if (active && !intel_engine_can_store_dword(engine))
728 continue;
729
730 if (!wait_for_idle(engine)) {
731 pr_err("%s failed to idle before reset\n",
732 engine->name);
733 err = -EIO;
734 break;
735 }
736
737 reset_count = i915_reset_count(global);
738 reset_engine_count = i915_reset_engine_count(global, engine);
739
740 st_engine_heartbeat_disable(engine);
741 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
742 &gt->reset.flags));
743 count = 0;
744 do {
745 struct i915_request *rq = NULL;
746 struct intel_selftest_saved_policy saved;
747 int err2;
748
749 err = intel_selftest_modify_policy(engine, &saved,
750 SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
751 if (err) {
752 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
753 break;
754 }
755
756 if (active) {
757 rq = hang_create_request(&h, engine);
758 if (IS_ERR(rq)) {
759 err = PTR_ERR(rq);
760 pr_err("[%s] Create hang request failed: %d!\n",
761 engine->name, err);
762 goto restore;
763 }
764
765 i915_request_get(rq);
766 i915_request_add(rq);
767
768 if (!wait_until_running(&h, rq)) {
769 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
770
771 pr_err("%s: Failed to start request %llx, at %x\n",
772 __func__, rq->fence.seqno, hws_seqno(&h, rq));
773 intel_engine_dump(engine, &p,
774 "%s\n", engine->name);
775
776 i915_request_put(rq);
777 err = -EIO;
778 goto restore;
779 }
780 }
781
782 if (!using_guc) {
783 err = intel_engine_reset(engine, NULL);
784 if (err) {
785 pr_err("intel_engine_reset(%s) failed, err:%d\n",
786 engine->name, err);
787 goto skip;
788 }
789 }
790
791 if (rq) {
792 /* Ensure the reset happens and kills the engine */
793 err = intel_selftest_wait_for_rq(rq);
794 if (err)
795 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
796 engine->name, rq->fence.context,
797 rq->fence.seqno, rq->context->guc_id.id, err);
798 }
799
800 skip:
801 if (rq)
802 i915_request_put(rq);
803
804 if (i915_reset_count(global) != reset_count) {
805 pr_err("Full GPU reset recorded! (engine reset expected)\n");
806 err = -EINVAL;
807 goto restore;
808 }
809
810 /* GuC based resets are not logged per engine */
811 if (!using_guc) {
812 if (i915_reset_engine_count(global, engine) !=
813 ++reset_engine_count) {
814 pr_err("%s engine reset not recorded!\n",
815 engine->name);
816 err = -EINVAL;
817 goto restore;
818 }
819 }
820
821 count++;
822
823 restore:
824 err2 = intel_selftest_restore_policy(engine, &saved);
825 if (err2)
826 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err);
827 if (err == 0)
828 err = err2;
829 if (err)
830 break;
831 } while (time_before(jiffies, end_time));
832 clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
833 st_engine_heartbeat_enable(engine);
834 pr_info("%s: Completed %lu %s resets\n",
835 engine->name, count, active ? "active" : "idle");
836
837 if (err)
838 break;
839
840 err = igt_flush_test(gt->i915);
841 if (err) {
842 pr_err("[%s] Flush failed: %d!\n", engine->name, err);
843 break;
844 }
845 }
846
847 if (intel_gt_is_wedged(gt)) {
848 pr_err("GT is wedged!\n");
849 err = -EIO;
850 }
851
852 if (active)
853 hang_fini(&h);
854
855 return err;
856 }
857
858 static int igt_reset_idle_engine(void *arg)
859 {
860 return __igt_reset_engine(arg, false);
861 }
862
863 static int igt_reset_active_engine(void *arg)
864 {
865 return __igt_reset_engine(arg, true);
866 }
867
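/*
 * Per-engine background worker used by __igt_reset_engines(): keeps its
 * engine busy with a stream of requests while another engine is being
 * reset, recording the engine reset count seen before it started.
 */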
868 struct active_engine {
869 struct task_struct *task;
870 struct intel_engine_cs *engine;
871 unsigned long resets;
872 unsigned int flags;
873 };
874
875 #define TEST_ACTIVE BIT(0)
876 #define TEST_OTHERS BIT(1)
877 #define TEST_SELF BIT(2)
878 #define TEST_PRIORITY BIT(3)
879
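/*
 * Wait for a background request to complete; if it does not finish
 * within 10s, declare the GT wedged and report -EIO.
 */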
880 static int active_request_put(struct i915_request *rq)
881 {
882 int err = 0;
883
884 if (!rq)
885 return 0;
886
887 if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
888 GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
889 rq->engine->name,
890 rq->fence.context,
891 rq->fence.seqno);
892 GEM_TRACE_DUMP();
893
894 intel_gt_set_wedged(rq->engine->gt);
895 err = -EIO;
896 }
897
898 i915_request_put(rq);
899
900 return err;
901 }
902
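/*
 * kthread body: cycle requests across a small ring of contexts on the
 * given engine until asked to stop, optionally jittering request
 * priorities when TEST_PRIORITY is set.
 */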
903 static int active_engine(void *data)
904 {
905 I915_RND_STATE(prng);
906 struct active_engine *arg = data;
907 struct intel_engine_cs *engine = arg->engine;
908 struct i915_request *rq[8] = {};
909 struct intel_context *ce[ARRAY_SIZE(rq)];
910 unsigned long count;
911 int err = 0;
912
913 for (count = 0; count < ARRAY_SIZE(ce); count++) {
914 ce[count] = intel_context_create(engine);
915 if (IS_ERR(ce[count])) {
916 err = PTR_ERR(ce[count]);
917 pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
918 while (count--)
919 intel_context_put(ce[count]);
920 return err;
921 }
922 }
923
924 count = 0;
925 while (!kthread_should_stop()) {
926 unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
927 struct i915_request *old = rq[idx];
928 struct i915_request *new;
929
930 new = intel_context_create_request(ce[idx]);
931 if (IS_ERR(new)) {
932 err = PTR_ERR(new);
933 pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
934 break;
935 }
936
937 rq[idx] = i915_request_get(new);
938 i915_request_add(new);
939
940 if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
941 struct i915_sched_attr attr = {
942 .priority =
943 i915_prandom_u32_max_state(512, &prng),
944 };
945 engine->sched_engine->schedule(rq[idx], &attr);
946 }
947
948 err = active_request_put(old);
949 if (err) {
950 pr_err("[%s] Request put failed: %d!\n", engine->name, err);
951 break;
952 }
953
954 cond_resched();
955 }
956
957 for (count = 0; count < ARRAY_SIZE(rq); count++) {
958 int err__ = active_request_put(rq[count]);
959
960 if (err__)
961 pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);
962
963 /* Keep the first error */
964 if (!err)
965 err = err__;
966
967 intel_context_put(ce[count]);
968 }
969
970 return err;
971 }
972
973 static int __igt_reset_engines(struct intel_gt *gt,
974 const char *test_name,
975 unsigned int flags)
976 {
977 struct i915_gpu_error *global = &gt->i915->gpu_error;
978 struct intel_engine_cs *engine, *other;
979 struct active_engine *threads;
980 enum intel_engine_id id, tmp;
981 struct hang h;
982 int err = 0;
983
984 /* Check that issuing a reset on one engine does not interfere
985 * with any other engine.
986 */
987
988 if (!intel_has_reset_engine(gt))
989 return 0;
990
991 if (flags & TEST_ACTIVE) {
992 err = hang_init(&h, gt);
993 if (err)
994 return err;
995
996 if (flags & TEST_PRIORITY)
997 h.ctx->sched.priority = 1024;
998 }
999
1000 threads = kmalloc_array(I915_NUM_ENGINES, sizeof(*threads), GFP_KERNEL);
1001 if (!threads)
1002 return -ENOMEM;
1003
1004 for_each_engine(engine, gt, id) {
1005 unsigned long device = i915_reset_count(global);
1006 unsigned long count = 0, reported;
1007 bool using_guc = intel_engine_uses_guc(engine);
1008 IGT_TIMEOUT(end_time);
1009
1010 if (flags & TEST_ACTIVE) {
1011 if (!intel_engine_can_store_dword(engine))
1012 continue;
1013 } else if (using_guc)
1014 continue;
1015
1016 if (!wait_for_idle(engine)) {
1017 pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
1018 engine->name, test_name);
1019 err = -EIO;
1020 break;
1021 }
1022
1023 memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES);
1024 for_each_engine(other, gt, tmp) {
1025 struct task_struct *tsk;
1026
1027 threads[tmp].resets =
1028 i915_reset_engine_count(global, other);
1029
1030 if (other == engine && !(flags & TEST_SELF))
1031 continue;
1032
1033 if (other != engine && !(flags & TEST_OTHERS))
1034 continue;
1035
1036 threads[tmp].engine = other;
1037 threads[tmp].flags = flags;
1038
1039 tsk = kthread_run(active_engine, &threads[tmp],
1040 "igt/%s", other->name);
1041 if (IS_ERR(tsk)) {
1042 err = PTR_ERR(tsk);
1043 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1044 goto unwind;
1045 }
1046
1047 threads[tmp].task = tsk;
1048 get_task_struct(tsk);
1049 }
1050
1051 yield(); /* start all threads before we begin */
1052
1053 st_engine_heartbeat_disable_no_pm(engine);
1054 GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
1055 &gt->reset.flags));
1056 do {
1057 struct i915_request *rq = NULL;
1058 struct intel_selftest_saved_policy saved;
1059 int err2;
1060
1061 err = intel_selftest_modify_policy(engine, &saved,
1062 SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
1063 if (err) {
1064 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
1065 break;
1066 }
1067
1068 if (flags & TEST_ACTIVE) {
1069 rq = hang_create_request(&h, engine);
1070 if (IS_ERR(rq)) {
1071 err = PTR_ERR(rq);
1072 pr_err("[%s] Create hang request failed: %d!\n",
1073 engine->name, err);
1074 goto restore;
1075 }
1076
1077 i915_request_get(rq);
1078 i915_request_add(rq);
1079
1080 if (!wait_until_running(&h, rq)) {
1081 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1082
1083 pr_err("%s: Failed to start request %llx, at %x\n",
1084 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1085 intel_engine_dump(engine, &p,
1086 "%s\n", engine->name);
1087
1088 i915_request_put(rq);
1089 err = -EIO;
1090 goto restore;
1091 }
1092 } else {
1093 intel_engine_pm_get(engine);
1094 }
1095
1096 if (!using_guc) {
1097 err = intel_engine_reset(engine, NULL);
1098 if (err) {
1099 pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
1100 engine->name, test_name, err);
1101 goto restore;
1102 }
1103 }
1104
1105 if (rq) {
1106 /* Ensure the reset happens and kills the engine */
1107 err = intel_selftest_wait_for_rq(rq);
1108 if (err)
1109 pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
1110 engine->name, rq->fence.context,
1111 rq->fence.seqno, rq->context->guc_id.id, err);
1112 }
1113
1114 count++;
1115
1116 if (rq) {
1117 if (rq->fence.error != -EIO) {
1118 pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
1119 engine->name, test_name,
1120 rq->fence.context,
1121 rq->fence.seqno, rq->context->guc_id.id);
1122 i915_request_put(rq);
1123
1124 GEM_TRACE_DUMP();
1125 intel_gt_set_wedged(gt);
1126 err = -EIO;
1127 goto restore;
1128 }
1129
1130 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
1131 struct drm_printer p =
1132 drm_info_printer(gt->i915->drm.dev);
1133
1134 pr_err("i915_reset_engine(%s:%s):"
1135 " failed to complete request %llx:%lld after reset\n",
1136 engine->name, test_name,
1137 rq->fence.context,
1138 rq->fence.seqno);
1139 intel_engine_dump(engine, &p,
1140 "%s\n", engine->name);
1141 i915_request_put(rq);
1142
1143 GEM_TRACE_DUMP();
1144 intel_gt_set_wedged(gt);
1145 err = -EIO;
1146 goto restore;
1147 }
1148
1149 i915_request_put(rq);
1150 }
1151
1152 if (!(flags & TEST_ACTIVE))
1153 intel_engine_pm_put(engine);
1154
1155 if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
1156 struct drm_printer p =
1157 drm_info_printer(gt->i915->drm.dev);
1158
1159 pr_err("i915_reset_engine(%s:%s):"
1160 " failed to idle after reset\n",
1161 engine->name, test_name);
1162 intel_engine_dump(engine, &p,
1163 "%s\n", engine->name);
1164
1165 err = -EIO;
1166 goto restore;
1167 }
1168
1169 restore:
1170 err2 = intel_selftest_restore_policy(engine, &saved);
1171 if (err2)
1172 pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
1173 if (err == 0)
1174 err = err2;
1175 if (err)
1176 break;
1177 } while (time_before(jiffies, end_time));
1178 clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
1179 st_engine_heartbeat_enable_no_pm(engine);
1180
1181 pr_info("i915_reset_engine(%s:%s): %lu resets\n",
1182 engine->name, test_name, count);
1183
1184 /* GuC based resets are not logged per engine */
1185 if (!using_guc) {
1186 reported = i915_reset_engine_count(global, engine);
1187 reported -= threads[engine->id].resets;
1188 if (reported != count) {
1189 pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
1190 engine->name, test_name, count, reported);
1191 if (!err)
1192 err = -EINVAL;
1193 }
1194 }
1195
1196 unwind:
1197 for_each_engine(other, gt, tmp) {
1198 int ret;
1199
1200 if (!threads[tmp].task)
1201 continue;
1202
1203 ret = kthread_stop(threads[tmp].task);
1204 if (ret) {
1205 pr_err("kthread for other engine %s failed, err=%d\n",
1206 other->name, ret);
1207 if (!err)
1208 err = ret;
1209 }
1210 put_task_struct(threads[tmp].task);
1211
1212 /* GuC based resets are not logged per engine */
1213 if (!using_guc) {
1214 if (other->uabi_class != engine->uabi_class &&
1215 threads[tmp].resets !=
1216 i915_reset_engine_count(global, other)) {
1217 pr_err("Innocent engine %s was reset (count=%ld)\n",
1218 other->name,
1219 i915_reset_engine_count(global, other) -
1220 threads[tmp].resets);
1221 if (!err)
1222 err = -EINVAL;
1223 }
1224 }
1225 }
1226
1227 if (device != i915_reset_count(global)) {
1228 pr_err("Global reset (count=%ld)!\n",
1229 i915_reset_count(global) - device);
1230 if (!err)
1231 err = -EINVAL;
1232 }
1233
1234 if (err)
1235 break;
1236
1237 err = igt_flush_test(gt->i915);
1238 if (err) {
1239 pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1240 break;
1241 }
1242 }
1243 kfree(threads);
1244
1245 if (intel_gt_is_wedged(gt))
1246 err = -EIO;
1247
1248 if (flags & TEST_ACTIVE)
1249 hang_fini(&h);
1250
1251 return err;
1252 }
1253
1254 static int igt_reset_engines(void *arg)
1255 {
1256 static const struct {
1257 const char *name;
1258 unsigned int flags;
1259 } phases[] = {
1260 { "idle", 0 },
1261 { "active", TEST_ACTIVE },
1262 { "others-idle", TEST_OTHERS },
1263 { "others-active", TEST_OTHERS | TEST_ACTIVE },
1264 {
1265 "others-priority",
1266 TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
1267 },
1268 {
1269 "self-priority",
1270 TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
1271 },
1272 { }
1273 };
1274 struct intel_gt *gt = arg;
1275 typeof(*phases) *p;
1276 int err;
1277
1278 for (p = phases; p->name; p++) {
1279 if (p->flags & TEST_PRIORITY) {
1280 if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
1281 continue;
1282 }
1283
1284 err = __igt_reset_engines(arg, p->name, p->flags);
1285 if (err)
1286 return err;
1287 }
1288
1289 return 0;
1290 }
1291
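/*
 * Pretend hangcheck fired: record the current global reset count and
 * then reset the selected engines directly.
 */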
1292 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1293 {
1294 u32 count = i915_reset_count(&gt->i915->gpu_error);
1295
1296 intel_gt_reset(gt, mask, NULL);
1297
1298 return count;
1299 }
1300
1301 static int igt_reset_wait(void *arg)
1302 {
1303 struct intel_gt *gt = arg;
1304 struct i915_gpu_error *global = &gt->i915->gpu_error;
1305 struct intel_engine_cs *engine;
1306 struct i915_request *rq;
1307 unsigned int reset_count;
1308 struct hang h;
1309 long timeout;
1310 int err;
1311
1312 engine = intel_selftest_find_any_engine(gt);
1313
1314 if (!engine || !intel_engine_can_store_dword(engine))
1315 return 0;
1316
1317 /* Check that we detect a stuck waiter and issue a reset */
1318
1319 igt_global_reset_lock(gt);
1320
1321 err = hang_init(&h, gt);
1322 if (err) {
1323 pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1324 goto unlock;
1325 }
1326
1327 rq = hang_create_request(&h, engine);
1328 if (IS_ERR(rq)) {
1329 err = PTR_ERR(rq);
1330 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1331 goto fini;
1332 }
1333
1334 i915_request_get(rq);
1335 i915_request_add(rq);
1336
1337 if (!wait_until_running(&h, rq)) {
1338 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1339
1340 pr_err("%s: Failed to start request %llx, at %x\n",
1341 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1342 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1343
1344 intel_gt_set_wedged(gt);
1345
1346 err = -EIO;
1347 goto out_rq;
1348 }
1349
1350 reset_count = fake_hangcheck(gt, ALL_ENGINES);
1351
1352 timeout = i915_request_wait(rq, 0, 10);
1353 if (timeout < 0) {
1354 pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1355 timeout);
1356 err = timeout;
1357 goto out_rq;
1358 }
1359
1360 if (i915_reset_count(global) == reset_count) {
1361 pr_err("No GPU reset recorded!\n");
1362 err = -EINVAL;
1363 goto out_rq;
1364 }
1365
1366 out_rq:
1367 i915_request_put(rq);
1368 fini:
1369 hang_fini(&h);
1370 unlock:
1371 igt_global_reset_unlock(gt);
1372
1373 if (intel_gt_is_wedged(gt))
1374 return -EIO;
1375
1376 return err;
1377 }
1378
1379 struct evict_vma {
1380 struct completion completion;
1381 struct i915_vma *vma;
1382 };
1383
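/*
 * Runs in a kthread: try to evict the target node from its address
 * space; this is expected to block behind the hanging request until the
 * reset kicks that request off the GPU.
 */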
1384 static int evict_vma(void *data)
1385 {
1386 struct evict_vma *arg = data;
1387 struct i915_address_space *vm = arg->vma->vm;
1388 struct drm_mm_node evict = arg->vma->node;
1389 int err;
1390
1391 complete(&arg->completion);
1392
1393 mutex_lock(&vm->mutex);
1394 err = i915_gem_evict_for_node(vm, NULL, &evict, 0);
1395 mutex_unlock(&vm->mutex);
1396
1397 return err;
1398 }
1399
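/*
 * Runs in a kthread: flip the object to Y-tiling and re-acquire a fence
 * register, forcing an mmio fence update that should stall behind the
 * hanging request until the reset completes it.
 */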
1400 static int evict_fence(void *data)
1401 {
1402 struct evict_vma *arg = data;
1403 int err;
1404
1405 complete(&arg->completion);
1406
1407 /* Mark the fence register as dirty to force the mmio update. */
1408 err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1409 if (err) {
1410 pr_err("Invalid Y-tiling settings; err:%d\n", err);
1411 return err;
1412 }
1413
1414 err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1415 if (err) {
1416 pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1417 return err;
1418 }
1419
1420 err = i915_vma_pin_fence(arg->vma);
1421 i915_vma_unpin(arg->vma);
1422 if (err) {
1423 pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1424 return err;
1425 }
1426
1427 i915_vma_unpin_fence(arg->vma);
1428
1429 return 0;
1430 }
1431
1432 static int __igt_reset_evict_vma(struct intel_gt *gt,
1433 struct i915_address_space *vm,
1434 int (*fn)(void *),
1435 unsigned int flags)
1436 {
1437 struct intel_engine_cs *engine;
1438 struct drm_i915_gem_object *obj;
1439 struct task_struct *tsk = NULL;
1440 struct i915_request *rq;
1441 struct evict_vma arg;
1442 struct hang h;
1443 unsigned int pin_flags;
1444 int err;
1445
1446 if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
1447 return 0;
1448
1449 engine = intel_selftest_find_any_engine(gt);
1450
1451 if (!engine || !intel_engine_can_store_dword(engine))
1452 return 0;
1453
1454 /* Check that we can recover an unbind stuck on a hanging request */
1455
1456 err = hang_init(&h, gt);
1457 if (err) {
1458 pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1459 return err;
1460 }
1461
1462 obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1463 if (IS_ERR(obj)) {
1464 err = PTR_ERR(obj);
1465 pr_err("[%s] Create object failed: %d!\n", engine->name, err);
1466 goto fini;
1467 }
1468
1469 if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1470 err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1471 if (err) {
1472 pr_err("Invalid X-tiling settings; err:%d\n", err);
1473 goto out_obj;
1474 }
1475 }
1476
1477 arg.vma = i915_vma_instance(obj, vm, NULL);
1478 if (IS_ERR(arg.vma)) {
1479 err = PTR_ERR(arg.vma);
1480 pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
1481 goto out_obj;
1482 }
1483
1484 rq = hang_create_request(&h, engine);
1485 if (IS_ERR(rq)) {
1486 err = PTR_ERR(rq);
1487 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1488 goto out_obj;
1489 }
1490
1491 pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;
1492
1493 if (flags & EXEC_OBJECT_NEEDS_FENCE)
1494 pin_flags |= PIN_MAPPABLE;
1495
1496 err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
1497 if (err) {
1498 i915_request_add(rq);
1499 pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
1500 goto out_obj;
1501 }
1502
1503 if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1504 err = i915_vma_pin_fence(arg.vma);
1505 if (err) {
1506 pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1507 i915_vma_unpin(arg.vma);
1508 i915_request_add(rq);
1509 goto out_obj;
1510 }
1511 }
1512
1513 i915_vma_lock(arg.vma);
1514 err = i915_request_await_object(rq, arg.vma->obj,
1515 flags & EXEC_OBJECT_WRITE);
1516 if (err == 0) {
1517 err = i915_vma_move_to_active(arg.vma, rq, flags);
1518 if (err)
1519 pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
1520 } else {
1521 pr_err("[%s] Request await failed: %d!\n", engine->name, err);
1522 }
1523
1524 i915_vma_unlock(arg.vma);
1525
1526 if (flags & EXEC_OBJECT_NEEDS_FENCE)
1527 i915_vma_unpin_fence(arg.vma);
1528 i915_vma_unpin(arg.vma);
1529
1530 i915_request_get(rq);
1531 i915_request_add(rq);
1532 if (err)
1533 goto out_rq;
1534
1535 if (!wait_until_running(&h, rq)) {
1536 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1537
1538 pr_err("%s: Failed to start request %llx, at %x\n",
1539 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1540 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1541
1542 intel_gt_set_wedged(gt);
1543 goto out_reset;
1544 }
1545
1546 init_completion(&arg.completion);
1547
1548 tsk = kthread_run(fn, &arg, "igt/evict_vma");
1549 if (IS_ERR(tsk)) {
1550 err = PTR_ERR(tsk);
1551 pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1552 tsk = NULL;
1553 goto out_reset;
1554 }
1555 get_task_struct(tsk);
1556
1557 wait_for_completion(&arg.completion);
1558
1559 if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1560 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1561
1562 pr_err("igt/evict_vma kthread did not wait\n");
1563 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1564
1565 intel_gt_set_wedged(gt);
1566 goto out_reset;
1567 }
1568
1569 out_reset:
1570 igt_global_reset_lock(gt);
1571 fake_hangcheck(gt, rq->engine->mask);
1572 igt_global_reset_unlock(gt);
1573
1574 if (tsk) {
1575 struct intel_wedge_me w;
1576
1577 /* The reset, even indirectly, should take less than 10ms. */
1578 intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1579 err = kthread_stop(tsk);
1580
1581 put_task_struct(tsk);
1582 }
1583
1584 out_rq:
1585 i915_request_put(rq);
1586 out_obj:
1587 i915_gem_object_put(obj);
1588 fini:
1589 hang_fini(&h);
1590 if (intel_gt_is_wedged(gt))
1591 return -EIO;
1592
1593 return err;
1594 }
1595
1596 static int igt_reset_evict_ggtt(void *arg)
1597 {
1598 struct intel_gt *gt = arg;
1599
1600 return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1601 evict_vma, EXEC_OBJECT_WRITE);
1602 }
1603
1604 static int igt_reset_evict_ppgtt(void *arg)
1605 {
1606 struct intel_gt *gt = arg;
1607 struct i915_ppgtt *ppgtt;
1608 int err;
1609
1610 /* aliasing == global gtt locking, covered above */
1611 if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1612 return 0;
1613
1614 ppgtt = i915_ppgtt_create(gt, 0);
1615 if (IS_ERR(ppgtt))
1616 return PTR_ERR(ppgtt);
1617
1618 err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1619 evict_vma, EXEC_OBJECT_WRITE);
1620 i915_vm_put(&ppgtt->vm);
1621
1622 return err;
1623 }
1624
1625 static int igt_reset_evict_fence(void *arg)
1626 {
1627 struct intel_gt *gt = arg;
1628
1629 return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1630 evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1631 }
1632
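/*
 * Wait for every engine except @exclude to return to idle; used by
 * igt_reset_queue() before a device reset (see the XXX note there).
 */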
1633 static int wait_for_others(struct intel_gt *gt,
1634 struct intel_engine_cs *exclude)
1635 {
1636 struct intel_engine_cs *engine;
1637 enum intel_engine_id id;
1638
1639 for_each_engine(engine, gt, id) {
1640 if (engine == exclude)
1641 continue;
1642
1643 if (!wait_for_idle(engine))
1644 return -EIO;
1645 }
1646
1647 return 0;
1648 }
1649
1650 static int igt_reset_queue(void *arg)
1651 {
1652 struct intel_gt *gt = arg;
1653 struct i915_gpu_error *global = &gt->i915->gpu_error;
1654 struct intel_engine_cs *engine;
1655 enum intel_engine_id id;
1656 struct hang h;
1657 int err;
1658
1659 /* Check that we replay pending requests following a hang */
1660
1661 igt_global_reset_lock(gt);
1662
1663 err = hang_init(&h, gt);
1664 if (err)
1665 goto unlock;
1666
1667 for_each_engine(engine, gt, id) {
1668 struct intel_selftest_saved_policy saved;
1669 struct i915_request *prev;
1670 IGT_TIMEOUT(end_time);
1671 unsigned int count;
1672 bool using_guc = intel_engine_uses_guc(engine);
1673
1674 if (!intel_engine_can_store_dword(engine))
1675 continue;
1676
1677 if (using_guc) {
1678 err = intel_selftest_modify_policy(engine, &saved,
1679 SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
1680 if (err) {
1681 pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
1682 goto fini;
1683 }
1684 }
1685
1686 prev = hang_create_request(&h, engine);
1687 if (IS_ERR(prev)) {
1688 err = PTR_ERR(prev);
1689 pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
1690 goto restore;
1691 }
1692
1693 i915_request_get(prev);
1694 i915_request_add(prev);
1695
1696 count = 0;
1697 do {
1698 struct i915_request *rq;
1699 unsigned int reset_count;
1700
1701 rq = hang_create_request(&h, engine);
1702 if (IS_ERR(rq)) {
1703 err = PTR_ERR(rq);
1704 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1705 goto restore;
1706 }
1707
1708 i915_request_get(rq);
1709 i915_request_add(rq);
1710
1711 /*
1712 * XXX We don't handle resetting the kernel context
1713 * very well. If we trigger a device reset twice in
1714 * quick succession while the kernel context is
1715 * executing, we may end up skipping the breadcrumb.
1716 * This is really only a problem for the selftest as
1717 * normally there is a large interlude between resets
1718 * (hangcheck), or we focus on resetting just one
1719 * engine and so avoid repeatedly resetting innocents.
1720 */
1721 err = wait_for_others(gt, engine);
1722 if (err) {
1723 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1724 __func__, engine->name);
1725 i915_request_put(rq);
1726 i915_request_put(prev);
1727
1728 GEM_TRACE_DUMP();
1729 intel_gt_set_wedged(gt);
1730 goto restore;
1731 }
1732
1733 if (!wait_until_running(&h, prev)) {
1734 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1735
1736 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1737 __func__, engine->name,
1738 prev->fence.seqno, hws_seqno(&h, prev));
1739 intel_engine_dump(engine, &p,
1740 "%s\n", engine->name);
1741
1742 i915_request_put(rq);
1743 i915_request_put(prev);
1744
1745 intel_gt_set_wedged(gt);
1746
1747 err = -EIO;
1748 goto restore;
1749 }
1750
1751 reset_count = fake_hangcheck(gt, BIT(id));
1752
1753 if (prev->fence.error != -EIO) {
1754 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1755 prev->fence.error);
1756 i915_request_put(rq);
1757 i915_request_put(prev);
1758 err = -EINVAL;
1759 goto restore;
1760 }
1761
1762 if (rq->fence.error) {
1763 pr_err("Fence error status not zero [%d] after unrelated reset\n",
1764 rq->fence.error);
1765 i915_request_put(rq);
1766 i915_request_put(prev);
1767 err = -EINVAL;
1768 goto restore;
1769 }
1770
1771 if (i915_reset_count(global) == reset_count) {
1772 pr_err("No GPU reset recorded!\n");
1773 i915_request_put(rq);
1774 i915_request_put(prev);
1775 err = -EINVAL;
1776 goto restore;
1777 }
1778
1779 i915_request_put(prev);
1780 prev = rq;
1781 count++;
1782 } while (time_before(jiffies, end_time));
1783 pr_info("%s: Completed %d queued resets\n",
1784 engine->name, count);
1785
1786 *h.batch = MI_BATCH_BUFFER_END;
1787 intel_gt_chipset_flush(engine->gt);
1788
1789 i915_request_put(prev);
1790
1791 restore:
1792 if (using_guc) {
1793 int err2 = intel_selftest_restore_policy(engine, &saved);
1794
1795 if (err2)
1796 pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
1797 __func__, __LINE__, engine->name, err2);
1798 if (err == 0)
1799 err = err2;
1800 }
1801 if (err)
1802 goto fini;
1803
1804 err = igt_flush_test(gt->i915);
1805 if (err) {
1806 pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1807 break;
1808 }
1809 }
1810
1811 fini:
1812 hang_fini(&h);
1813 unlock:
1814 igt_global_reset_unlock(gt);
1815
1816 if (intel_gt_is_wedged(gt))
1817 return -EIO;
1818
1819 return err;
1820 }
1821
1822 static int igt_handle_error(void *arg)
1823 {
1824 struct intel_gt *gt = arg;
1825 struct i915_gpu_error *global = &gt->i915->gpu_error;
1826 struct intel_engine_cs *engine;
1827 struct hang h;
1828 struct i915_request *rq;
1829 struct i915_gpu_coredump *error;
1830 int err;
1831
1832 engine = intel_selftest_find_any_engine(gt);
1833
1834 /* Check that we can issue a global GPU and engine reset */
1835
1836 if (!intel_has_reset_engine(gt))
1837 return 0;
1838
1839 if (!engine || !intel_engine_can_store_dword(engine))
1840 return 0;
1841
1842 err = hang_init(&h, gt);
1843 if (err) {
1844 pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1845 return err;
1846 }
1847
1848 rq = hang_create_request(&h, engine);
1849 if (IS_ERR(rq)) {
1850 err = PTR_ERR(rq);
1851 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1852 goto err_fini;
1853 }
1854
1855 i915_request_get(rq);
1856 i915_request_add(rq);
1857
1858 if (!wait_until_running(&h, rq)) {
1859 struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1860
1861 pr_err("%s: Failed to start request %llx, at %x\n",
1862 __func__, rq->fence.seqno, hws_seqno(&h, rq));
1863 intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1864
1865 intel_gt_set_wedged(gt);
1866
1867 err = -EIO;
1868 goto err_request;
1869 }
1870
1871 /* Temporarily disable error capture */
1872 error = xchg(&global->first_error, (void *)-1);
1873
1874 intel_gt_handle_error(gt, engine->mask, 0, NULL);
1875
1876 xchg(&global->first_error, error);
1877
1878 if (rq->fence.error != -EIO) {
1879 pr_err("Guilty request not identified!\n");
1880 err = -EINVAL;
1881 goto err_request;
1882 }
1883
1884 err_request:
1885 i915_request_put(rq);
1886 err_fini:
1887 hang_fini(&h);
1888 return err;
1889 }
1890
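/*
 * Perform an engine reset from inside the atomic section provided by
 * @p (interrupts, softirqs or preemption disabled as appropriate), with
 * the engine's submission tasklet disabled so it cannot run
 * concurrently with the reset.
 */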
1891 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1892 const struct igt_atomic_section *p,
1893 const char *mode)
1894 {
1895 struct tasklet_struct * const t = &engine->sched_engine->tasklet;
1896 int err;
1897
1898 GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1899 engine->name, mode, p->name);
1900
1901 if (t->func)
1902 tasklet_disable(t);
1903 if (strcmp(p->name, "softirq"))
1904 local_bh_disable();
1905 p->critical_section_begin();
1906
1907 err = __intel_engine_reset_bh(engine, NULL);
1908
1909 p->critical_section_end();
1910 if (strcmp(p->name, "softirq"))
1911 local_bh_enable();
1912 if (t->func) {
1913 tasklet_enable(t);
1914 tasklet_hi_schedule(t);
1915 }
1916
1917 if (err)
1918 pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1919 engine->name, mode, p->name);
1920
1921 return err;
1922 }
1923
1924 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1925 const struct igt_atomic_section *p)
1926 {
1927 struct i915_request *rq;
1928 struct hang h;
1929 int err;
1930
1931 err = __igt_atomic_reset_engine(engine, p, "idle");
1932 if (err)
1933 return err;
1934
1935 err = hang_init(&h, engine->gt);
1936 if (err) {
1937 pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1938 return err;
1939 }
1940
1941 rq = hang_create_request(&h, engine);
1942 if (IS_ERR(rq)) {
1943 err = PTR_ERR(rq);
1944 pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1945 goto out;
1946 }
1947
1948 i915_request_get(rq);
1949 i915_request_add(rq);
1950
1951 if (wait_until_running(&h, rq)) {
1952 err = __igt_atomic_reset_engine(engine, p, "active");
1953 } else {
1954 pr_err("%s(%s): Failed to start request %llx, at %x\n",
1955 __func__, engine->name,
1956 rq->fence.seqno, hws_seqno(&h, rq));
1957 intel_gt_set_wedged(engine->gt);
1958 err = -EIO;
1959 }
1960
1961 if (err == 0) {
1962 struct intel_wedge_me w;
1963
1964 intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1965 i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1966 if (intel_gt_is_wedged(engine->gt))
1967 err = -EIO;
1968 }
1969
1970 i915_request_put(rq);
1971 out:
1972 hang_fini(&h);
1973 return err;
1974 }
1975
1976 static int igt_reset_engines_atomic(void *arg)
1977 {
1978 struct intel_gt *gt = arg;
1979 const typeof(*igt_atomic_phases) *p;
1980 int err = 0;
1981
1982 /* Check that engine resets are usable from atomic context */
1983
1984 if (!intel_has_reset_engine(gt))
1985 return 0;
1986
1987 if (intel_uc_uses_guc_submission(&gt->uc))
1988 return 0;
1989
1990 igt_global_reset_lock(gt);
1991
1992 /* Flush any requests before we get started and check basics */
1993 if (!igt_force_reset(gt))
1994 goto unlock;
1995
1996 for (p = igt_atomic_phases; p->name; p++) {
1997 struct intel_engine_cs *engine;
1998 enum intel_engine_id id;
1999
2000 for_each_engine(engine, gt, id) {
2001 err = igt_atomic_reset_engine(engine, p);
2002 if (err)
2003 goto out;
2004 }
2005 }
2006
2007 out:
2008 /* As we poke around the guts, do a full reset before continuing. */
2009 igt_force_reset(gt);
2010 unlock:
2011 igt_global_reset_unlock(gt);
2012
2013 return err;
2014 }
2015
2016 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
2017 {
2018 static const struct i915_subtest tests[] = {
2019 SUBTEST(igt_hang_sanitycheck),
2020 SUBTEST(igt_reset_nop),
2021 SUBTEST(igt_reset_nop_engine),
2022 SUBTEST(igt_reset_idle_engine),
2023 SUBTEST(igt_reset_active_engine),
2024 SUBTEST(igt_reset_fail_engine),
2025 SUBTEST(igt_reset_engines),
2026 SUBTEST(igt_reset_engines_atomic),
2027 SUBTEST(igt_reset_queue),
2028 SUBTEST(igt_reset_wait),
2029 SUBTEST(igt_reset_evict_ggtt),
2030 SUBTEST(igt_reset_evict_ppgtt),
2031 SUBTEST(igt_reset_evict_fence),
2032 SUBTEST(igt_handle_error),
2033 };
2034 struct intel_gt *gt = to_gt(i915);
2035 intel_wakeref_t wakeref;
2036 int err;
2037
2038 if (!intel_has_gpu_reset(gt))
2039 return 0;
2040
2041 if (intel_gt_is_wedged(gt))
2042 return -EIO; /* we're long past hope of a successful reset */
2043
2044 wakeref = intel_runtime_pm_get(gt->uncore->rpm);
2045
2046 err = intel_gt_live_subtests(tests, gt);
2047
2048 intel_runtime_pm_put(gt->uncore->rpm, wakeref);
2049
2050 return err;
2051 }
2052