1 /*
2 * Copyright © 2016 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include <linux/prime_numbers.h>
26 #include <linux/pm_qos.h>
27 #include <linux/sort.h>
28
29 #include "gem/i915_gem_internal.h"
30 #include "gem/i915_gem_pm.h"
31 #include "gem/selftests/mock_context.h"
32
33 #include "gt/intel_engine_heartbeat.h"
34 #include "gt/intel_engine_pm.h"
35 #include "gt/intel_engine_user.h"
36 #include "gt/intel_gt.h"
37 #include "gt/intel_gt_clock_utils.h"
38 #include "gt/intel_gt_requests.h"
39 #include "gt/selftest_engine_heartbeat.h"
40
41 #include "i915_random.h"
42 #include "i915_selftest.h"
43 #include "igt_flush_test.h"
44 #include "igt_live_test.h"
45 #include "igt_spinner.h"
46 #include "lib_sw_fence.h"
47
48 #include "mock_drm.h"
49 #include "mock_gem_device.h"
50
51 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
52 {
53 struct intel_engine_cs *engine;
54 unsigned int count;
55
56 count = 0;
57 for_each_uabi_engine(engine, i915)
58 count++;
59
60 return count;
61 }
62
63 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
64 {
65 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
66 }
67
68 static int igt_add_request(void *arg)
69 {
70 struct drm_i915_private *i915 = arg;
71 struct i915_request *request;
72
73 /* Basic preliminary test to create a request and let it loose! */
74
75 request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
76 if (!request)
77 return -ENOMEM;
78
79 i915_request_add(request);
80
81 return 0;
82 }
83
84 static int igt_wait_request(void *arg)
85 {
86 const long T = HZ / 4;
87 struct drm_i915_private *i915 = arg;
88 struct i915_request *request;
89 int err = -EINVAL;
90
91 /* Submit a request, then wait upon it */
92
93 request = mock_request(rcs0(i915)->kernel_context, T);
94 if (!request)
95 return -ENOMEM;
96
97 i915_request_get(request);
98
99 if (i915_request_wait(request, 0, 0) != -ETIME) {
100 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
101 goto out_request;
102 }
103
104 if (i915_request_wait(request, 0, T) != -ETIME) {
105 pr_err("request wait succeeded (expected timeout before submit!)\n");
106 goto out_request;
107 }
108
109 if (i915_request_completed(request)) {
110 pr_err("request completed before submit!!\n");
111 goto out_request;
112 }
113
114 i915_request_add(request);
115
116 if (i915_request_wait(request, 0, 0) != -ETIME) {
117 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
118 goto out_request;
119 }
120
121 if (i915_request_completed(request)) {
122 pr_err("request completed immediately!\n");
123 goto out_request;
124 }
125
126 if (i915_request_wait(request, 0, T / 2) != -ETIME) {
127 pr_err("request wait succeeded (expected timeout!)\n");
128 goto out_request;
129 }
130
131 if (i915_request_wait(request, 0, T) == -ETIME) {
132 pr_err("request wait timed out!\n");
133 goto out_request;
134 }
135
136 if (!i915_request_completed(request)) {
137 pr_err("request not complete after waiting!\n");
138 goto out_request;
139 }
140
141 if (i915_request_wait(request, 0, T) == -ETIME) {
142 pr_err("request wait timed out when already complete!\n");
143 goto out_request;
144 }
145
146 err = 0;
147 out_request:
148 i915_request_put(request);
149 mock_device_flush(i915);
150 return err;
151 }
152
153 static int igt_fence_wait(void *arg)
154 {
155 const long T = HZ / 4;
156 struct drm_i915_private *i915 = arg;
157 struct i915_request *request;
158 int err = -EINVAL;
159
160 /* Submit a request, treat it as a fence and wait upon it */
161
162 request = mock_request(rcs0(i915)->kernel_context, T);
163 if (!request)
164 return -ENOMEM;
165
166 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
167 pr_err("fence wait success before submit (expected timeout)!\n");
168 goto out;
169 }
170
171 i915_request_add(request);
172
173 if (dma_fence_is_signaled(&request->fence)) {
174 pr_err("fence signaled immediately!\n");
175 goto out;
176 }
177
178 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
179 pr_err("fence wait success after submit (expected timeout)!\n");
180 goto out;
181 }
182
183 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
184 pr_err("fence wait timed out (expected success)!\n");
185 goto out;
186 }
187
188 if (!dma_fence_is_signaled(&request->fence)) {
189 pr_err("fence unsignaled after waiting!\n");
190 goto out;
191 }
192
193 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
194 pr_err("fence wait timed out when complete (expected success)!\n");
195 goto out;
196 }
197
198 err = 0;
199 out:
200 mock_device_flush(i915);
201 return err;
202 }
203
204 static int igt_request_rewind(void *arg)
205 {
206 struct drm_i915_private *i915 = arg;
207 struct i915_request *request, *vip;
208 struct i915_gem_context *ctx[2];
209 struct intel_context *ce;
210 int err = -EINVAL;
211
212 ctx[0] = mock_context(i915, "A");
213 if (!ctx[0]) {
214 err = -ENOMEM;
215 goto err_ctx_0;
216 }
217
218 ce = i915_gem_context_get_engine(ctx[0], RCS0);
219 GEM_BUG_ON(IS_ERR(ce));
220 request = mock_request(ce, 2 * HZ);
221 intel_context_put(ce);
222 if (!request) {
223 err = -ENOMEM;
224 goto err_context_0;
225 }
226
227 i915_request_get(request);
228 i915_request_add(request);
229
230 ctx[1] = mock_context(i915, "B");
231 if (!ctx[1]) {
232 err = -ENOMEM;
233 goto err_ctx_1;
234 }
235
236 ce = i915_gem_context_get_engine(ctx[1], RCS0);
237 GEM_BUG_ON(IS_ERR(ce));
238 vip = mock_request(ce, 0);
239 intel_context_put(ce);
240 if (!vip) {
241 err = -ENOMEM;
242 goto err_context_1;
243 }
244
245 /* Simulate preemption by manual reordering */
246 if (!mock_cancel_request(request)) {
247 pr_err("failed to cancel request (already executed)!\n");
248 i915_request_add(vip);
249 goto err_context_1;
250 }
251 i915_request_get(vip);
252 i915_request_add(vip);
253 rcu_read_lock();
254 request->engine->submit_request(request);
255 rcu_read_unlock();
256
257
258 if (i915_request_wait(vip, 0, HZ) == -ETIME) {
259 pr_err("timed out waiting for high priority request\n");
260 goto err;
261 }
262
263 if (i915_request_completed(request)) {
264 pr_err("low priority request already completed\n");
265 goto err;
266 }
267
268 err = 0;
269 err:
270 i915_request_put(vip);
271 err_context_1:
272 mock_context_close(ctx[1]);
273 err_ctx_1:
274 i915_request_put(request);
275 err_context_0:
276 mock_context_close(ctx[0]);
277 err_ctx_0:
278 mock_device_flush(i915);
279 return err;
280 }
281
282 struct smoketest {
283 struct intel_engine_cs *engine;
284 struct i915_gem_context **contexts;
285 atomic_long_t num_waits, num_fences;
286 int ncontexts, max_batch;
287 struct i915_request *(*request_alloc)(struct intel_context *ce);
288 };
289
290 static struct i915_request *
291 __mock_request_alloc(struct intel_context *ce)
292 {
293 return mock_request(ce, 0);
294 }
295
296 static struct i915_request *
297 __live_request_alloc(struct intel_context *ce)
298 {
299 return intel_context_create_request(ce);
300 }
301
302 static int __igt_breadcrumbs_smoketest(void *arg)
303 {
304 struct smoketest *t = arg;
305 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
306 const unsigned int total = 4 * t->ncontexts + 1;
307 unsigned int num_waits = 0, num_fences = 0;
308 struct i915_request **requests;
309 I915_RND_STATE(prng);
310 unsigned int *order;
311 int err = 0;
312
313 /*
314 * A very simple test to catch the most egregious of list handling bugs.
315 *
316 * At its heart, we simply create oodles of requests running across
317 * multiple kthreads and enable signaling on them, for the sole purpose
318 * of stressing our breadcrumb handling. The only inspection we do is
319 * that the fences were marked as signaled.
320 */
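/*
 * Rough shape of one iteration of the loop below (illustrative only):
 *
 *   submit (i915_sw_fence) -----> rq[0].submit ... rq[count-1].submit
 *   rq[0].fence ... rq[count-1].fence -----> wait (i915_sw_fence)
 *
 * Every request is gated behind the single 'submit' fence and feeds the
 * single 'wait' fence, so committing 'submit' releases the whole batch
 * at once and 'wait' only signals once every breadcrumb has fired.
 */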
321
322 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
323 if (!requests)
324 return -ENOMEM;
325
326 order = i915_random_order(total, &prng);
327 if (!order) {
328 err = -ENOMEM;
329 goto out_requests;
330 }
331
332 while (!kthread_should_stop()) {
333 struct i915_sw_fence *submit, *wait;
334 unsigned int n, count;
335
336 submit = heap_fence_create(GFP_KERNEL);
337 if (!submit) {
338 err = -ENOMEM;
339 break;
340 }
341
342 wait = heap_fence_create(GFP_KERNEL);
343 if (!wait) {
344 i915_sw_fence_commit(submit);
345 heap_fence_put(submit);
346 err = -ENOMEM;
347 break;
348 }
349
350 i915_random_reorder(order, total, &prng);
351 count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
352
353 for (n = 0; n < count; n++) {
354 struct i915_gem_context *ctx =
355 t->contexts[order[n] % t->ncontexts];
356 struct i915_request *rq;
357 struct intel_context *ce;
358
359 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
360 GEM_BUG_ON(IS_ERR(ce));
361 rq = t->request_alloc(ce);
362 intel_context_put(ce);
363 if (IS_ERR(rq)) {
364 err = PTR_ERR(rq);
365 count = n;
366 break;
367 }
368
369 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
370 submit,
371 GFP_KERNEL);
372
373 requests[n] = i915_request_get(rq);
374 i915_request_add(rq);
375
376 if (err >= 0)
377 err = i915_sw_fence_await_dma_fence(wait,
378 &rq->fence,
379 0,
380 GFP_KERNEL);
381
382 if (err < 0) {
383 i915_request_put(rq);
384 count = n;
385 break;
386 }
387 }
388
389 i915_sw_fence_commit(submit);
390 i915_sw_fence_commit(wait);
391
392 if (!wait_event_timeout(wait->wait,
393 i915_sw_fence_done(wait),
394 5 * HZ)) {
395 struct i915_request *rq = requests[count - 1];
396
397 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
398 atomic_read(&wait->pending), count,
399 rq->fence.context, rq->fence.seqno,
400 t->engine->name);
401 GEM_TRACE_DUMP();
402
403 intel_gt_set_wedged(t->engine->gt);
404 GEM_BUG_ON(!i915_request_completed(rq));
405 i915_sw_fence_wait(wait);
406 err = -EIO;
407 }
408
409 for (n = 0; n < count; n++) {
410 struct i915_request *rq = requests[n];
411
412 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
413 &rq->fence.flags)) {
414 pr_err("%llu:%llu was not signaled!\n",
415 rq->fence.context, rq->fence.seqno);
416 err = -EINVAL;
417 }
418
419 i915_request_put(rq);
420 }
421
422 heap_fence_put(wait);
423 heap_fence_put(submit);
424
425 if (err < 0)
426 break;
427
428 num_fences += count;
429 num_waits++;
430
431 cond_resched();
432 }
433
434 atomic_long_add(num_fences, &t->num_fences);
435 atomic_long_add(num_waits, &t->num_waits);
436
437 kfree(order);
438 out_requests:
439 kfree(requests);
440 return err;
441 }
442
443 static int mock_breadcrumbs_smoketest(void *arg)
444 {
445 struct drm_i915_private *i915 = arg;
446 struct smoketest t = {
447 .engine = rcs0(i915),
448 .ncontexts = 1024,
449 .max_batch = 1024,
450 .request_alloc = __mock_request_alloc
451 };
452 unsigned int ncpus = num_online_cpus();
453 struct task_struct **threads;
454 unsigned int n;
455 int ret = 0;
456
457 /*
458 * Smoketest our breadcrumb/signal handling for requests across multiple
459 * threads. A very simple test to only catch the most egregious of bugs.
460 * See __igt_breadcrumbs_smoketest();
461 */
462
463 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
464 if (!threads)
465 return -ENOMEM;
466
467 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
468 if (!t.contexts) {
469 ret = -ENOMEM;
470 goto out_threads;
471 }
472
473 for (n = 0; n < t.ncontexts; n++) {
474 t.contexts[n] = mock_context(t.engine->i915, "mock");
475 if (!t.contexts[n]) {
476 ret = -ENOMEM;
477 goto out_contexts;
478 }
479 }
480
481 for (n = 0; n < ncpus; n++) {
482 threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
483 &t, "igt/%d", n);
484 if (IS_ERR(threads[n])) {
485 ret = PTR_ERR(threads[n]);
486 ncpus = n;
487 break;
488 }
489
490 get_task_struct(threads[n]);
491 }
492
493 yield(); /* start all threads before we begin */
494 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
495
496 for (n = 0; n < ncpus; n++) {
497 int err;
498
499 err = kthread_stop(threads[n]);
500 if (err < 0 && !ret)
501 ret = err;
502
503 put_task_struct(threads[n]);
504 }
505 pr_info("Completed %lu waits for %lu fence across %d cpus\n",
506 atomic_long_read(&t.num_waits),
507 atomic_long_read(&t.num_fences),
508 ncpus);
509
510 out_contexts:
511 for (n = 0; n < t.ncontexts; n++) {
512 if (!t.contexts[n])
513 break;
514 mock_context_close(t.contexts[n]);
515 }
516 kfree(t.contexts);
517 out_threads:
518 kfree(threads);
519 return ret;
520 }
521
522 int i915_request_mock_selftests(void)
523 {
524 static const struct i915_subtest tests[] = {
525 SUBTEST(igt_add_request),
526 SUBTEST(igt_wait_request),
527 SUBTEST(igt_fence_wait),
528 SUBTEST(igt_request_rewind),
529 SUBTEST(mock_breadcrumbs_smoketest),
530 };
531 struct drm_i915_private *i915;
532 intel_wakeref_t wakeref;
533 int err = 0;
534
535 i915 = mock_gem_device();
536 if (!i915)
537 return -ENOMEM;
538
539 with_intel_runtime_pm(&i915->runtime_pm, wakeref)
540 err = i915_subtests(tests, i915);
541
542 mock_destroy_device(i915);
543
544 return err;
545 }
546
547 static int live_nop_request(void *arg)
548 {
549 struct drm_i915_private *i915 = arg;
550 struct intel_engine_cs *engine;
551 struct igt_live_test t;
552 int err = -ENODEV;
553
554 /*
555 * Submit various sized batches of empty requests, to each engine
556 * (individually), and wait for the batch to complete. We can check
557 * the overhead of submitting requests to the hardware.
558 */
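/*
 * For reference, the pr_info() at the end of each engine's loop reports
 * times[0] (a single request created, submitted and waited upon on its
 * own) against times[1] / prime (the amortised per-request cost over
 * the largest batch that ran before the timeout).
 */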
559
560 for_each_uabi_engine(engine, i915) {
561 unsigned long n, prime;
562 IGT_TIMEOUT(end_time);
563 ktime_t times[2] = {};
564
565 err = igt_live_test_begin(&t, i915, __func__, engine->name);
566 if (err)
567 return err;
568
569 intel_engine_pm_get(engine);
570 for_each_prime_number_from(prime, 1, 8192) {
571 struct i915_request *request = NULL;
572
573 times[1] = ktime_get_raw();
574
575 for (n = 0; n < prime; n++) {
576 i915_request_put(request);
577 request = i915_request_create(engine->kernel_context);
578 if (IS_ERR(request))
579 return PTR_ERR(request);
580
581 /*
582 * This space is left intentionally blank.
583 *
584 * We do not actually want to perform any
585 * action with this request, we just want
586 * to measure the latency in allocation
587 * and submission of our breadcrumbs -
588 * ensuring that the bare request is sufficient
589 * for the system to work (i.e. proper HEAD
590 * tracking of the rings, interrupt handling,
591 * etc). It also gives us the lowest bounds
592 * for latency.
593 */
594
595 i915_request_get(request);
596 i915_request_add(request);
597 }
598 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
599 i915_request_put(request);
600
601 times[1] = ktime_sub(ktime_get_raw(), times[1]);
602 if (prime == 1)
603 times[0] = times[1];
604
605 if (__igt_timeout(end_time, NULL))
606 break;
607 }
608 intel_engine_pm_put(engine);
609
610 err = igt_live_test_end(&t);
611 if (err)
612 return err;
613
614 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
615 engine->name,
616 ktime_to_ns(times[0]),
617 prime, div64_u64(ktime_to_ns(times[1]), prime));
618 }
619
620 return err;
621 }
622
623 static int __cancel_inactive(struct intel_engine_cs *engine)
624 {
625 struct intel_context *ce;
626 struct igt_spinner spin;
627 struct i915_request *rq;
628 int err = 0;
629
630 if (igt_spinner_init(&spin, engine->gt))
631 return -ENOMEM;
632
633 ce = intel_context_create(engine);
634 if (IS_ERR(ce)) {
635 err = PTR_ERR(ce);
636 goto out_spin;
637 }
638
639 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
640 if (IS_ERR(rq)) {
641 err = PTR_ERR(rq);
642 goto out_ce;
643 }
644
645 pr_debug("%s: Cancelling inactive request\n", engine->name);
646 i915_request_cancel(rq, -EINTR);
647 i915_request_get(rq);
648 i915_request_add(rq);
649
650 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
651 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
652
653 pr_err("%s: Failed to cancel inactive request\n", engine->name);
654 intel_engine_dump(engine, &p, "%s\n", engine->name);
655 err = -ETIME;
656 goto out_rq;
657 }
658
659 if (rq->fence.error != -EINTR) {
660 pr_err("%s: fence not cancelled (%u)\n",
661 engine->name, rq->fence.error);
662 err = -EINVAL;
663 }
664
665 out_rq:
666 i915_request_put(rq);
667 out_ce:
668 intel_context_put(ce);
669 out_spin:
670 igt_spinner_fini(&spin);
671 if (err)
672 pr_err("%s: %s error %d\n", __func__, engine->name, err);
673 return err;
674 }
675
676 static int __cancel_active(struct intel_engine_cs *engine)
677 {
678 struct intel_context *ce;
679 struct igt_spinner spin;
680 struct i915_request *rq;
681 int err = 0;
682
683 if (igt_spinner_init(&spin, engine->gt))
684 return -ENOMEM;
685
686 ce = intel_context_create(engine);
687 if (IS_ERR(ce)) {
688 err = PTR_ERR(ce);
689 goto out_spin;
690 }
691
692 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
693 if (IS_ERR(rq)) {
694 err = PTR_ERR(rq);
695 goto out_ce;
696 }
697
698 pr_debug("%s: Cancelling active request\n", engine->name);
699 i915_request_get(rq);
700 i915_request_add(rq);
701 if (!igt_wait_for_spinner(&spin, rq)) {
702 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
703
704 pr_err("Failed to start spinner on %s\n", engine->name);
705 intel_engine_dump(engine, &p, "%s\n", engine->name);
706 err = -ETIME;
707 goto out_rq;
708 }
709 i915_request_cancel(rq, -EINTR);
710
711 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
712 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
713
714 pr_err("%s: Failed to cancel active request\n", engine->name);
715 intel_engine_dump(engine, &p, "%s\n", engine->name);
716 err = -ETIME;
717 goto out_rq;
718 }
719
720 if (rq->fence.error != -EINTR) {
721 pr_err("%s: fence not cancelled (%u)\n",
722 engine->name, rq->fence.error);
723 err = -EINVAL;
724 }
725
726 out_rq:
727 i915_request_put(rq);
728 out_ce:
729 intel_context_put(ce);
730 out_spin:
731 igt_spinner_fini(&spin);
732 if (err)
733 pr_err("%s: %s error %d\n", __func__, engine->name, err);
734 return err;
735 }
736
737 static int __cancel_completed(struct intel_engine_cs *engine)
738 {
739 struct intel_context *ce;
740 struct igt_spinner spin;
741 struct i915_request *rq;
742 int err = 0;
743
744 if (igt_spinner_init(&spin, engine->gt))
745 return -ENOMEM;
746
747 ce = intel_context_create(engine);
748 if (IS_ERR(ce)) {
749 err = PTR_ERR(ce);
750 goto out_spin;
751 }
752
753 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
754 if (IS_ERR(rq)) {
755 err = PTR_ERR(rq);
756 goto out_ce;
757 }
758 igt_spinner_end(&spin);
759 i915_request_get(rq);
760 i915_request_add(rq);
761
762 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
763 err = -ETIME;
764 goto out_rq;
765 }
766
767 pr_debug("%s: Cancelling completed request\n", engine->name);
768 i915_request_cancel(rq, -EINTR);
769 if (rq->fence.error) {
770 pr_err("%s: fence not cancelled (%u)\n",
771 engine->name, rq->fence.error);
772 err = -EINVAL;
773 }
774
775 out_rq:
776 i915_request_put(rq);
777 out_ce:
778 intel_context_put(ce);
779 out_spin:
780 igt_spinner_fini(&spin);
781 if (err)
782 pr_err("%s: %s error %d\n", __func__, engine->name, err);
783 return err;
784 }
785
786 /*
787 * Test to prove a non-preemptible request can be cancelled and a subsequent
788 * request on the same context can successfully complete after cancellation.
789 *
790 * Testing methodology is to create a non-preemptible request and submit it,
791 * wait for spinner to start, create a NOP request and submit it, cancel the
792 * spinner, wait for spinner to complete and verify it failed with an error,
793 * finally wait for the NOP request to complete and verify it succeeded
794 * without an error. The preemption timeout is also reduced / restored so
795 * the test runs in a timely manner.
796 */
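/*
 * Illustrative timeline, assuming the reduced 100ms preemption timeout
 * set below:
 *
 *   submit non-preemptible spinner -> submit nop on the same context
 *   -> i915_request_cancel(spinner, -EINTR)
 *   -> preemption attempt times out -> engine reset
 *   -> spinner fence signals -EINTR, nop completes with no error
 */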
797 static int __cancel_reset(struct drm_i915_private *i915,
798 struct intel_engine_cs *engine)
799 {
800 struct intel_context *ce;
801 struct igt_spinner spin;
802 struct i915_request *rq, *nop;
803 unsigned long preempt_timeout_ms;
804 int err = 0;
805
806 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
807 !intel_has_reset_engine(engine->gt))
808 return 0;
809
810 preempt_timeout_ms = engine->props.preempt_timeout_ms;
811 engine->props.preempt_timeout_ms = 100;
812
if (igt_spinner_init(&spin, engine->gt)) {
	err = -ENOMEM;
	goto out_restore;
}
815
816 ce = intel_context_create(engine);
817 if (IS_ERR(ce)) {
818 err = PTR_ERR(ce);
819 goto out_spin;
820 }
821
822 rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
823 if (IS_ERR(rq)) {
824 err = PTR_ERR(rq);
825 goto out_ce;
826 }
827
828 pr_debug("%s: Cancelling active non-preemptable request\n",
829 engine->name);
830 i915_request_get(rq);
831 i915_request_add(rq);
832 if (!igt_wait_for_spinner(&spin, rq)) {
833 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
834
835 pr_err("Failed to start spinner on %s\n", engine->name);
836 intel_engine_dump(engine, &p, "%s\n", engine->name);
837 err = -ETIME;
838 goto out_rq;
839 }
840
841 nop = intel_context_create_request(ce);
if (IS_ERR(nop)) {
	err = PTR_ERR(nop);
	goto out_rq;
}
844 i915_request_get(nop);
845 i915_request_add(nop);
846
847 i915_request_cancel(rq, -EINTR);
848
849 if (i915_request_wait(rq, 0, HZ) < 0) {
850 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
851
852 pr_err("%s: Failed to cancel hung request\n", engine->name);
853 intel_engine_dump(engine, &p, "%s\n", engine->name);
854 err = -ETIME;
855 goto out_nop;
856 }
857
858 if (rq->fence.error != -EINTR) {
859 pr_err("%s: fence not cancelled (%u)\n",
860 engine->name, rq->fence.error);
861 err = -EINVAL;
862 goto out_nop;
863 }
864
865 if (i915_request_wait(nop, 0, HZ) < 0) {
866 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
867
868 pr_err("%s: Failed to complete nop request\n", engine->name);
869 intel_engine_dump(engine, &p, "%s\n", engine->name);
870 err = -ETIME;
871 goto out_nop;
872 }
873
874 if (nop->fence.error != 0) {
875 pr_err("%s: Nop request errored (%u)\n",
876 engine->name, nop->fence.error);
877 err = -EINVAL;
878 }
879
880 out_nop:
881 i915_request_put(nop);
882 out_rq:
883 i915_request_put(rq);
884 out_ce:
885 intel_context_put(ce);
886 out_spin:
887 igt_spinner_fini(&spin);
888 out_restore:
889 engine->props.preempt_timeout_ms = preempt_timeout_ms;
890 if (err)
891 pr_err("%s: %s error %d\n", __func__, engine->name, err);
892 return err;
893 }
894
895 static int live_cancel_request(void *arg)
896 {
897 struct drm_i915_private *i915 = arg;
898 struct intel_engine_cs *engine;
899
900 /*
901 * Check cancellation of requests. We expect to be able to immediately
902 * cancel active requests, even if they are currently on the GPU.
903 */
904
905 for_each_uabi_engine(engine, i915) {
906 struct igt_live_test t;
907 int err, err2;
908
909 if (!intel_engine_has_preemption(engine))
910 continue;
911
912 err = igt_live_test_begin(&t, i915, __func__, engine->name);
913 if (err)
914 return err;
915
916 err = __cancel_inactive(engine);
917 if (err == 0)
918 err = __cancel_active(engine);
919 if (err == 0)
920 err = __cancel_completed(engine);
921
922 err2 = igt_live_test_end(&t);
923 if (err)
924 return err;
925 if (err2)
926 return err2;
927
928 /* Expects reset so call outside of igt_live_test_* */
929 err = __cancel_reset(i915, engine);
930 if (err)
931 return err;
932
933 if (igt_flush_test(i915))
934 return -EIO;
935 }
936
937 return 0;
938 }
939
940 static struct i915_vma *empty_batch(struct drm_i915_private *i915)
941 {
942 struct drm_i915_gem_object *obj;
943 struct i915_vma *vma;
944 u32 *cmd;
945 int err;
946
947 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
948 if (IS_ERR(obj))
949 return ERR_CAST(obj);
950
951 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
952 if (IS_ERR(cmd)) {
953 err = PTR_ERR(cmd);
954 goto err;
955 }
956
957 *cmd = MI_BATCH_BUFFER_END;
958
959 __i915_gem_object_flush_map(obj, 0, 64);
960 i915_gem_object_unpin_map(obj);
961
962 intel_gt_chipset_flush(to_gt(i915));
963
964 vma = i915_vma_instance(obj, &to_gt(i915)->ggtt->vm, NULL);
965 if (IS_ERR(vma)) {
966 err = PTR_ERR(vma);
967 goto err;
968 }
969
970 err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
971 if (err)
972 goto err;
973
974 /* Force the wait now to avoid including it in the benchmark */
975 err = i915_vma_sync(vma);
976 if (err)
977 goto err_pin;
978
979 return vma;
980
981 err_pin:
982 i915_vma_unpin(vma);
983 err:
984 i915_gem_object_put(obj);
985 return ERR_PTR(err);
986 }
987
988 static struct i915_request *
989 empty_request(struct intel_engine_cs *engine,
990 struct i915_vma *batch)
991 {
992 struct i915_request *request;
993 int err;
994
995 request = i915_request_create(engine->kernel_context);
996 if (IS_ERR(request))
997 return request;
998
999 err = engine->emit_bb_start(request,
1000 batch->node.start,
1001 batch->node.size,
1002 I915_DISPATCH_SECURE);
1003 if (err)
1004 goto out_request;
1005
1006 i915_request_get(request);
1007 out_request:
1008 i915_request_add(request);
1009 return err ? ERR_PTR(err) : request;
1010 }
1011
1012 static int live_empty_request(void *arg)
1013 {
1014 struct drm_i915_private *i915 = arg;
1015 struct intel_engine_cs *engine;
1016 struct igt_live_test t;
1017 struct i915_vma *batch;
1018 int err = 0;
1019
1020 /*
1021 * Submit various sized batches of empty requests, to each engine
1022 * (individually), and wait for the batch to complete. We can check
1023 * the overhead of submitting requests to the hardware.
1024 */
1025
1026 batch = empty_batch(i915);
1027 if (IS_ERR(batch))
1028 return PTR_ERR(batch);
1029
1030 for_each_uabi_engine(engine, i915) {
1031 IGT_TIMEOUT(end_time);
1032 struct i915_request *request;
1033 unsigned long n, prime;
1034 ktime_t times[2] = {};
1035
1036 err = igt_live_test_begin(&t, i915, __func__, engine->name);
1037 if (err)
1038 goto out_batch;
1039
1040 intel_engine_pm_get(engine);
1041
1042 /* Warmup / preload */
1043 request = empty_request(engine, batch);
1044 if (IS_ERR(request)) {
1045 err = PTR_ERR(request);
1046 intel_engine_pm_put(engine);
1047 goto out_batch;
1048 }
1049 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1050
1051 for_each_prime_number_from(prime, 1, 8192) {
1052 times[1] = ktime_get_raw();
1053
1054 for (n = 0; n < prime; n++) {
1055 i915_request_put(request);
1056 request = empty_request(engine, batch);
1057 if (IS_ERR(request)) {
1058 err = PTR_ERR(request);
1059 intel_engine_pm_put(engine);
1060 goto out_batch;
1061 }
1062 }
1063 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1064
1065 times[1] = ktime_sub(ktime_get_raw(), times[1]);
1066 if (prime == 1)
1067 times[0] = times[1];
1068
1069 if (__igt_timeout(end_time, NULL))
1070 break;
1071 }
1072 i915_request_put(request);
1073 intel_engine_pm_put(engine);
1074
1075 err = igt_live_test_end(&t);
1076 if (err)
1077 goto out_batch;
1078
1079 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
1080 engine->name,
1081 ktime_to_ns(times[0]),
1082 prime, div64_u64(ktime_to_ns(times[1]), prime));
1083 }
1084
1085 out_batch:
1086 i915_vma_unpin(batch);
1087 i915_vma_put(batch);
1088 return err;
1089 }
1090
1091 static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
1092 {
1093 struct drm_i915_gem_object *obj;
1094 const int ver = GRAPHICS_VER(i915);
1095 struct i915_vma *vma;
1096 u32 *cmd;
1097 int err;
1098
1099 obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
1100 if (IS_ERR(obj))
1101 return ERR_CAST(obj);
1102
1103 vma = i915_vma_instance(obj, to_gt(i915)->vm, NULL);
1104 if (IS_ERR(vma)) {
1105 err = PTR_ERR(vma);
1106 goto err;
1107 }
1108
1109 err = i915_vma_pin(vma, 0, 0, PIN_USER);
1110 if (err)
1111 goto err;
1112
1113 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
1114 if (IS_ERR(cmd)) {
1115 err = PTR_ERR(cmd);
1116 goto err;
1117 }
1118
1119 if (ver >= 8) {
1120 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
1121 *cmd++ = lower_32_bits(vma->node.start);
1122 *cmd++ = upper_32_bits(vma->node.start);
1123 } else if (ver >= 6) {
1124 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
1125 *cmd++ = lower_32_bits(vma->node.start);
1126 } else {
1127 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1128 *cmd++ = lower_32_bits(vma->node.start);
1129 }
1130 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1131
1132 __i915_gem_object_flush_map(obj, 0, 64);
1133 i915_gem_object_unpin_map(obj);
1134
1135 intel_gt_chipset_flush(to_gt(i915));
1136
1137 return vma;
1138
1139 err:
1140 i915_gem_object_put(obj);
1141 return ERR_PTR(err);
1142 }
1143
1144 static int recursive_batch_resolve(struct i915_vma *batch)
1145 {
1146 u32 *cmd;
1147
1148 cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1149 if (IS_ERR(cmd))
1150 return PTR_ERR(cmd);
1151
1152 *cmd = MI_BATCH_BUFFER_END;
1153
1154 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1155 i915_gem_object_unpin_map(batch->obj);
1156
1157 intel_gt_chipset_flush(batch->vm->gt);
1158
1159 return 0;
1160 }
1161
1162 static int live_all_engines(void *arg)
1163 {
1164 struct drm_i915_private *i915 = arg;
1165 const unsigned int nengines = num_uabi_engines(i915);
1166 struct intel_engine_cs *engine;
1167 struct i915_request **request;
1168 struct igt_live_test t;
1169 struct i915_vma *batch;
1170 unsigned int idx;
1171 int err;
1172
1173 /*
1174 * Check we can submit requests to all engines simultaneously. We
1175 * send a recursive batch to each engine - checking that we don't
1176 * block doing so, and that they don't complete too soon.
1177 */
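/*
 * The "recursive" batch is simply a buffer whose first instruction is
 * an MI_BATCH_BUFFER_START pointing back at its own start, so it loops
 * on the GPU indefinitely; recursive_batch_resolve() later rewrites
 * that first dword to MI_BATCH_BUFFER_END so every engine can retire
 * its request.
 */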
1178
1179 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1180 if (!request)
1181 return -ENOMEM;
1182
1183 err = igt_live_test_begin(&t, i915, __func__, "");
1184 if (err)
1185 goto out_free;
1186
1187 batch = recursive_batch(i915);
1188 if (IS_ERR(batch)) {
1189 err = PTR_ERR(batch);
1190 pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
1191 goto out_free;
1192 }
1193
1194 i915_vma_lock(batch);
1195
1196 idx = 0;
1197 for_each_uabi_engine(engine, i915) {
1198 request[idx] = intel_engine_create_kernel_request(engine);
1199 if (IS_ERR(request[idx])) {
1200 err = PTR_ERR(request[idx]);
1201 pr_err("%s: Request allocation failed with err=%d\n",
1202 __func__, err);
1203 goto out_request;
1204 }
1205
1206 err = i915_request_await_object(request[idx], batch->obj, 0);
1207 if (err == 0)
1208 err = i915_vma_move_to_active(batch, request[idx], 0);
1209 GEM_BUG_ON(err);
1210
1211 err = engine->emit_bb_start(request[idx],
1212 batch->node.start,
1213 batch->node.size,
1214 0);
1215 GEM_BUG_ON(err);
1216 request[idx]->batch = batch;
1217
1218 i915_request_get(request[idx]);
1219 i915_request_add(request[idx]);
1220 idx++;
1221 }
1222
1223 i915_vma_unlock(batch);
1224
1225 idx = 0;
1226 for_each_uabi_engine(engine, i915) {
1227 if (i915_request_completed(request[idx])) {
1228 pr_err("%s(%s): request completed too early!\n",
1229 __func__, engine->name);
1230 err = -EINVAL;
1231 goto out_request;
1232 }
1233 idx++;
1234 }
1235
1236 err = recursive_batch_resolve(batch);
1237 if (err) {
1238 pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
1239 goto out_request;
1240 }
1241
1242 idx = 0;
1243 for_each_uabi_engine(engine, i915) {
1244 long timeout;
1245
1246 timeout = i915_request_wait(request[idx], 0,
1247 MAX_SCHEDULE_TIMEOUT);
1248 if (timeout < 0) {
1249 err = timeout;
1250 pr_err("%s: error waiting for request on %s, err=%d\n",
1251 __func__, engine->name, err);
1252 goto out_request;
1253 }
1254
1255 GEM_BUG_ON(!i915_request_completed(request[idx]));
1256 i915_request_put(request[idx]);
1257 request[idx] = NULL;
1258 idx++;
1259 }
1260
1261 err = igt_live_test_end(&t);
1262
1263 out_request:
1264 idx = 0;
1265 for_each_uabi_engine(engine, i915) {
1266 if (request[idx])
1267 i915_request_put(request[idx]);
1268 idx++;
1269 }
1270 i915_vma_unpin(batch);
1271 i915_vma_put(batch);
1272 out_free:
1273 kfree(request);
1274 return err;
1275 }
1276
1277 static int live_sequential_engines(void *arg)
1278 {
1279 struct drm_i915_private *i915 = arg;
1280 const unsigned int nengines = num_uabi_engines(i915);
1281 struct i915_request **request;
1282 struct i915_request *prev = NULL;
1283 struct intel_engine_cs *engine;
1284 struct igt_live_test t;
1285 unsigned int idx;
1286 int err;
1287
1288 /*
1289 * Check we can submit requests to all engines sequentially, such
1290 * that each successive request waits for the earlier ones. This
1291 * tests that we don't execute requests out of order, even though
1292 * they are running on independent engines.
1293 */
1294
1295 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1296 if (!request)
1297 return -ENOMEM;
1298
1299 err = igt_live_test_begin(&t, i915, __func__, "");
1300 if (err)
1301 goto out_free;
1302
1303 idx = 0;
1304 for_each_uabi_engine(engine, i915) {
1305 struct i915_vma *batch;
1306
1307 batch = recursive_batch(i915);
1308 if (IS_ERR(batch)) {
1309 err = PTR_ERR(batch);
1310 pr_err("%s: Unable to create batch for %s, err=%d\n",
1311 __func__, engine->name, err);
1312 goto out_free;
1313 }
1314
1315 i915_vma_lock(batch);
1316 request[idx] = intel_engine_create_kernel_request(engine);
1317 if (IS_ERR(request[idx])) {
1318 err = PTR_ERR(request[idx]);
1319 pr_err("%s: Request allocation failed for %s with err=%d\n",
1320 __func__, engine->name, err);
1321 goto out_unlock;
1322 }
1323
1324 if (prev) {
1325 err = i915_request_await_dma_fence(request[idx],
1326 &prev->fence);
1327 if (err) {
1328 i915_request_add(request[idx]);
1329 pr_err("%s: Request await failed for %s with err=%d\n",
1330 __func__, engine->name, err);
1331 goto out_unlock;
1332 }
1333 }
1334
1335 err = i915_request_await_object(request[idx],
1336 batch->obj, false);
1337 if (err == 0)
1338 err = i915_vma_move_to_active(batch, request[idx], 0);
1339 GEM_BUG_ON(err);
1340
1341 err = engine->emit_bb_start(request[idx],
1342 batch->node.start,
1343 batch->node.size,
1344 0);
1345 GEM_BUG_ON(err);
1346 request[idx]->batch = batch;
1347
1348 i915_request_get(request[idx]);
1349 i915_request_add(request[idx]);
1350
1351 prev = request[idx];
1352 idx++;
1353
1354 out_unlock:
1355 i915_vma_unlock(batch);
1356 if (err)
1357 goto out_request;
1358 }
1359
1360 idx = 0;
1361 for_each_uabi_engine(engine, i915) {
1362 long timeout;
1363
1364 if (i915_request_completed(request[idx])) {
1365 pr_err("%s(%s): request completed too early!\n",
1366 __func__, engine->name);
1367 err = -EINVAL;
1368 goto out_request;
1369 }
1370
1371 err = recursive_batch_resolve(request[idx]->batch);
1372 if (err) {
1373 pr_err("%s: failed to resolve batch, err=%d\n",
1374 __func__, err);
1375 goto out_request;
1376 }
1377
1378 timeout = i915_request_wait(request[idx], 0,
1379 MAX_SCHEDULE_TIMEOUT);
1380 if (timeout < 0) {
1381 err = timeout;
1382 pr_err("%s: error waiting for request on %s, err=%d\n",
1383 __func__, engine->name, err);
1384 goto out_request;
1385 }
1386
1387 GEM_BUG_ON(!i915_request_completed(request[idx]));
1388 idx++;
1389 }
1390
1391 err = igt_live_test_end(&t);
1392
1393 out_request:
1394 idx = 0;
1395 for_each_uabi_engine(engine, i915) {
1396 u32 *cmd;
1397
1398 if (!request[idx])
1399 break;
1400
1401 cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1402 I915_MAP_WC);
1403 if (!IS_ERR(cmd)) {
1404 *cmd = MI_BATCH_BUFFER_END;
1405
1406 __i915_gem_object_flush_map(request[idx]->batch->obj,
1407 0, sizeof(*cmd));
1408 i915_gem_object_unpin_map(request[idx]->batch->obj);
1409
1410 intel_gt_chipset_flush(engine->gt);
1411 }
1412
1413 i915_vma_put(request[idx]->batch);
1414 i915_request_put(request[idx]);
1415 idx++;
1416 }
1417 out_free:
1418 kfree(request);
1419 return err;
1420 }
1421
1422 static int __live_parallel_engine1(void *arg)
1423 {
1424 struct intel_engine_cs *engine = arg;
1425 IGT_TIMEOUT(end_time);
1426 unsigned long count;
1427 int err = 0;
1428
1429 count = 0;
1430 intel_engine_pm_get(engine);
1431 do {
1432 struct i915_request *rq;
1433
1434 rq = i915_request_create(engine->kernel_context);
1435 if (IS_ERR(rq)) {
1436 err = PTR_ERR(rq);
1437 break;
1438 }
1439
1440 i915_request_get(rq);
1441 i915_request_add(rq);
1442
1443 err = 0;
1444 if (i915_request_wait(rq, 0, HZ) < 0)
1445 err = -ETIME;
1446 i915_request_put(rq);
1447 if (err)
1448 break;
1449
1450 count++;
1451 } while (!__igt_timeout(end_time, NULL));
1452 intel_engine_pm_put(engine);
1453
1454 pr_info("%s: %lu request + sync\n", engine->name, count);
1455 return err;
1456 }
1457
1458 static int __live_parallel_engineN(void *arg)
1459 {
1460 struct intel_engine_cs *engine = arg;
1461 IGT_TIMEOUT(end_time);
1462 unsigned long count;
1463 int err = 0;
1464
1465 count = 0;
1466 intel_engine_pm_get(engine);
1467 do {
1468 struct i915_request *rq;
1469
1470 rq = i915_request_create(engine->kernel_context);
1471 if (IS_ERR(rq)) {
1472 err = PTR_ERR(rq);
1473 break;
1474 }
1475
1476 i915_request_add(rq);
1477 count++;
1478 } while (!__igt_timeout(end_time, NULL));
1479 intel_engine_pm_put(engine);
1480
1481 pr_info("%s: %lu requests\n", engine->name, count);
1482 return err;
1483 }
1484
1485 static bool wake_all(struct drm_i915_private *i915)
1486 {
1487 if (atomic_dec_and_test(&i915->selftest.counter)) {
1488 wake_up_var(&i915->selftest.counter);
1489 return true;
1490 }
1491
1492 return false;
1493 }
1494
1495 static int wait_for_all(struct drm_i915_private *i915)
1496 {
1497 if (wake_all(i915))
1498 return 0;
1499
1500 if (wait_var_event_timeout(&i915->selftest.counter,
1501 !atomic_read(&i915->selftest.counter),
1502 i915_selftest.timeout_jiffies))
1503 return 0;
1504
1505 return -ETIME;
1506 }
1507
1508 static int __live_parallel_spin(void *arg)
1509 {
1510 struct intel_engine_cs *engine = arg;
1511 struct igt_spinner spin;
1512 struct i915_request *rq;
1513 int err = 0;
1514
1515 /*
1516 * Create a spinner running for eternity on each engine. If a second
1517 * spinner is incorrectly placed on the same engine, it will not be
1518 * able to start in time.
1519 */
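/*
 * wake_all()/wait_for_all() act as a crude barrier on
 * i915->selftest.counter: each per-engine thread decrements it once its
 * spinner is running (or it has bailed out on error), and every thread
 * then holds its engine busy until the counter reaches zero or the
 * selftest timeout expires.
 */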
1520
1521 if (igt_spinner_init(&spin, engine->gt)) {
1522 wake_all(engine->i915);
1523 return -ENOMEM;
1524 }
1525
1526 intel_engine_pm_get(engine);
1527 rq = igt_spinner_create_request(&spin,
1528 engine->kernel_context,
1529 MI_NOOP); /* no preemption */
1530 intel_engine_pm_put(engine);
1531 if (IS_ERR(rq)) {
1532 err = PTR_ERR(rq);
1533 if (err == -ENODEV)
1534 err = 0;
1535 wake_all(engine->i915);
1536 goto out_spin;
1537 }
1538
1539 i915_request_get(rq);
1540 i915_request_add(rq);
1541 if (igt_wait_for_spinner(&spin, rq)) {
1542 /* Occupy this engine for the whole test */
1543 err = wait_for_all(engine->i915);
1544 } else {
1545 pr_err("Failed to start spinner on %s\n", engine->name);
1546 err = -EINVAL;
1547 }
1548 igt_spinner_end(&spin);
1549
1550 if (err == 0 && i915_request_wait(rq, 0, HZ) < 0)
1551 err = -EIO;
1552 i915_request_put(rq);
1553
1554 out_spin:
1555 igt_spinner_fini(&spin);
1556 return err;
1557 }
1558
1559 static int live_parallel_engines(void *arg)
1560 {
1561 struct drm_i915_private *i915 = arg;
1562 static int (* const func[])(void *arg) = {
1563 __live_parallel_engine1,
1564 __live_parallel_engineN,
1565 __live_parallel_spin,
1566 NULL,
1567 };
1568 const unsigned int nengines = num_uabi_engines(i915);
1569 struct intel_engine_cs *engine;
1570 int (* const *fn)(void *arg);
1571 struct task_struct **tsk;
1572 int err = 0;
1573
1574 /*
1575 * Check we can submit requests to all engines concurrently. This
1576 * tests that we load up the system maximally.
1577 */
1578
1579 tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1580 if (!tsk)
1581 return -ENOMEM;
1582
1583 for (fn = func; !err && *fn; fn++) {
1584 char name[KSYM_NAME_LEN];
1585 struct igt_live_test t;
1586 unsigned int idx;
1587
1588 snprintf(name, sizeof(name), "%ps", *fn);
1589 err = igt_live_test_begin(&t, i915, __func__, name);
1590 if (err)
1591 break;
1592
1593 atomic_set(&i915->selftest.counter, nengines);
1594
1595 idx = 0;
1596 for_each_uabi_engine(engine, i915) {
1597 tsk[idx] = kthread_run(*fn, engine,
1598 "igt/parallel:%s",
1599 engine->name);
1600 if (IS_ERR(tsk[idx])) {
1601 err = PTR_ERR(tsk[idx]);
1602 break;
1603 }
1604 get_task_struct(tsk[idx++]);
1605 }
1606
1607 yield(); /* start all threads before we kthread_stop() */
1608
1609 idx = 0;
1610 for_each_uabi_engine(engine, i915) {
1611 int status;
1612
1613 if (IS_ERR(tsk[idx]))
1614 break;
1615
1616 status = kthread_stop(tsk[idx]);
1617 if (status && !err)
1618 err = status;
1619
1620 put_task_struct(tsk[idx++]);
1621 }
1622
1623 if (igt_live_test_end(&t))
1624 err = -EIO;
1625 }
1626
1627 kfree(tsk);
1628 return err;
1629 }
1630
1631 static int
1632 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1633 {
1634 struct i915_request *rq;
1635 int ret;
1636
1637 /*
1638 * Before execlists, all contexts share the same ringbuffer. With
1639 * execlists, each context/engine has a separate ringbuffer and
1640 * for the purposes of this test, inexhaustible.
1641 *
1642 * For the global ringbuffer though, we have to be very careful
1643 * that we do not wrap while preventing the execution of requests
1644 * with an unsignaled fence.
1645 */
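/*
 * Purely illustrative arithmetic (hypothetical numbers): with, say, a
 * 16KiB legacy ring, ~160 bytes of reserved space and ~192 bytes
 * emitted per request, the calculation below yields roughly
 * (16384 - 160) / 192 / 2 ~= 42 requests per ring, i.e. half of what
 * would fit, leaving headroom for emergencies.
 */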
1646 if (HAS_EXECLISTS(ctx->i915))
1647 return INT_MAX;
1648
1649 rq = igt_request_alloc(ctx, engine);
1650 if (IS_ERR(rq)) {
1651 ret = PTR_ERR(rq);
1652 } else {
1653 int sz;
1654
1655 ret = rq->ring->size - rq->reserved_space;
1656 i915_request_add(rq);
1657
1658 sz = rq->ring->emit - rq->head;
1659 if (sz < 0)
1660 sz += rq->ring->size;
1661 ret /= sz;
1662 ret /= 2; /* leave half spare, in case of emergency! */
1663 }
1664
1665 return ret;
1666 }
1667
1668 static int live_breadcrumbs_smoketest(void *arg)
1669 {
1670 struct drm_i915_private *i915 = arg;
1671 const unsigned int nengines = num_uabi_engines(i915);
1672 const unsigned int ncpus = num_online_cpus();
1673 unsigned long num_waits, num_fences;
1674 struct intel_engine_cs *engine;
1675 struct task_struct **threads;
1676 struct igt_live_test live;
1677 intel_wakeref_t wakeref;
1678 struct smoketest *smoke;
1679 unsigned int n, idx;
1680 struct file *file;
1681 int ret = 0;
1682
1683 /*
1684 * Smoketest our breadcrumb/signal handling for requests across multiple
1685 * threads. A very simple test to only catch the most egregious of bugs.
1686 * See __igt_breadcrumbs_smoketest();
1687 *
1688 * On real hardware this time.
1689 */
1690
1691 wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1692
1693 file = mock_file(i915);
1694 if (IS_ERR(file)) {
1695 ret = PTR_ERR(file);
1696 goto out_rpm;
1697 }
1698
1699 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1700 if (!smoke) {
1701 ret = -ENOMEM;
1702 goto out_file;
1703 }
1704
1705 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1706 if (!threads) {
1707 ret = -ENOMEM;
1708 goto out_smoke;
1709 }
1710
1711 smoke[0].request_alloc = __live_request_alloc;
1712 smoke[0].ncontexts = 64;
1713 smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1714 sizeof(*smoke[0].contexts),
1715 GFP_KERNEL);
1716 if (!smoke[0].contexts) {
1717 ret = -ENOMEM;
1718 goto out_threads;
1719 }
1720
1721 for (n = 0; n < smoke[0].ncontexts; n++) {
1722 smoke[0].contexts[n] = live_context(i915, file);
1723 if (IS_ERR(smoke[0].contexts[n])) {
1724 ret = PTR_ERR(smoke[0].contexts[n]);
1725 goto out_contexts;
1726 }
1727 }
1728
1729 ret = igt_live_test_begin(&live, i915, __func__, "");
1730 if (ret)
1731 goto out_contexts;
1732
1733 idx = 0;
1734 for_each_uabi_engine(engine, i915) {
1735 smoke[idx] = smoke[0];
1736 smoke[idx].engine = engine;
1737 smoke[idx].max_batch =
1738 max_batches(smoke[0].contexts[0], engine);
1739 if (smoke[idx].max_batch < 0) {
1740 ret = smoke[idx].max_batch;
1741 goto out_flush;
1742 }
1743 /* One ring interleaved between requests from all cpus */
1744 smoke[idx].max_batch /= num_online_cpus() + 1;
1745 pr_debug("Limiting batches to %d requests on %s\n",
1746 smoke[idx].max_batch, engine->name);
1747
1748 for (n = 0; n < ncpus; n++) {
1749 struct task_struct *tsk;
1750
1751 tsk = kthread_run(__igt_breadcrumbs_smoketest,
1752 &smoke[idx], "igt/%d.%d", idx, n);
1753 if (IS_ERR(tsk)) {
1754 ret = PTR_ERR(tsk);
1755 goto out_flush;
1756 }
1757
1758 get_task_struct(tsk);
1759 threads[idx * ncpus + n] = tsk;
1760 }
1761
1762 idx++;
1763 }
1764
1765 yield(); /* start all threads before we begin */
1766 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1767
1768 out_flush:
1769 idx = 0;
1770 num_waits = 0;
1771 num_fences = 0;
1772 for_each_uabi_engine(engine, i915) {
1773 for (n = 0; n < ncpus; n++) {
1774 struct task_struct *tsk = threads[idx * ncpus + n];
1775 int err;
1776
1777 if (!tsk)
1778 continue;
1779
1780 err = kthread_stop(tsk);
1781 if (err < 0 && !ret)
1782 ret = err;
1783
1784 put_task_struct(tsk);
1785 }
1786
1787 num_waits += atomic_long_read(&smoke[idx].num_waits);
1788 num_fences += atomic_long_read(&smoke[idx].num_fences);
1789 idx++;
1790 }
1791 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1792 num_waits, num_fences, idx, ncpus);
1793
1794 ret = igt_live_test_end(&live) ?: ret;
1795 out_contexts:
1796 kfree(smoke[0].contexts);
1797 out_threads:
1798 kfree(threads);
1799 out_smoke:
1800 kfree(smoke);
1801 out_file:
1802 fput(file);
1803 out_rpm:
1804 intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1805
1806 return ret;
1807 }
1808
1809 int i915_request_live_selftests(struct drm_i915_private *i915)
1810 {
1811 static const struct i915_subtest tests[] = {
1812 SUBTEST(live_nop_request),
1813 SUBTEST(live_all_engines),
1814 SUBTEST(live_sequential_engines),
1815 SUBTEST(live_parallel_engines),
1816 SUBTEST(live_empty_request),
1817 SUBTEST(live_cancel_request),
1818 SUBTEST(live_breadcrumbs_smoketest),
1819 };
1820
1821 if (intel_gt_is_wedged(to_gt(i915)))
1822 return 0;
1823
1824 return i915_live_subtests(tests, i915);
1825 }
1826
1827 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1828 {
1829 struct i915_request *rq;
1830 struct dma_fence *fence;
1831
1832 rq = intel_engine_create_kernel_request(ce->engine);
1833 if (IS_ERR(rq))
1834 return PTR_ERR(rq);
1835
1836 fence = i915_active_fence_get(&ce->timeline->last_request);
1837 if (fence) {
1838 i915_request_await_dma_fence(rq, fence);
1839 dma_fence_put(fence);
1840 }
1841
1842 rq = i915_request_get(rq);
1843 i915_request_add(rq);
1844 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1845 err = -ETIME;
1846 i915_request_put(rq);
1847
1848 while (!err && !intel_engine_is_idle(ce->engine))
1849 intel_engine_flush_submission(ce->engine);
1850
1851 return err;
1852 }
1853
1854 struct perf_stats {
1855 struct intel_engine_cs *engine;
1856 unsigned long count;
1857 ktime_t time;
1858 ktime_t busy;
1859 u64 runtime;
1860 };
1861
1862 struct perf_series {
1863 struct drm_i915_private *i915;
1864 unsigned int nengines;
1865 struct intel_context *ce[];
1866 };
1867
1868 static int cmp_u32(const void *A, const void *B)
1869 {
1870 const u32 *a = A, *b = B;
1871
1872 return *a - *b;
1873 }
1874
1875 static u32 trifilter(u32 *a)
1876 {
1877 u64 sum;
1878
1879 #define TF_COUNT 5
1880 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1881
1882 sum = mul_u32_u32(a[2], 2);
1883 sum += a[1];
1884 sum += a[3];
1885
1886 GEM_BUG_ON(sum > U32_MAX);
1887 return sum;
1888 #define TF_BIAS 2
1889 }
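/*
 * Note on scaling: with TF_COUNT == 5 samples sorted ascending,
 * trifilter() returns a[1] + 2 * a[2] + a[3], i.e. 4x a weighted median
 * that discards the two extremes. Callers undo that fixed-point bias by
 * shifting right by TF_BIAS (2) when printing raw cycles, or via
 * cycles_to_ns() below. For example, samples {9, 10, 12, 13, 40} give
 * 10 + 2 * 12 + 13 = 47, reported as 47 >> 2 = 11 cycles, so the
 * outlier 40 never skews the result.
 */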
1890
1891 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1892 {
1893 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1894
1895 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1896 }
1897
1898 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1899 {
1900 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1901 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1902 *cs++ = offset;
1903 *cs++ = 0;
1904
1905 return cs;
1906 }
1907
1908 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1909 {
1910 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1911 *cs++ = offset;
1912 *cs++ = 0;
1913 *cs++ = value;
1914
1915 return cs;
1916 }
1917
1918 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1919 {
1920 *cs++ = MI_SEMAPHORE_WAIT |
1921 MI_SEMAPHORE_GLOBAL_GTT |
1922 MI_SEMAPHORE_POLL |
1923 mode;
1924 *cs++ = value;
1925 *cs++ = offset;
1926 *cs++ = 0;
1927
1928 return cs;
1929 }
1930
1931 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1932 {
1933 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1934 }
1935
1936 static void semaphore_set(u32 *sema, u32 value)
1937 {
1938 WRITE_ONCE(*sema, value);
1939 wmb(); /* flush the update to the cache, and beyond */
1940 }
1941
1942 static u32 *hwsp_scratch(const struct intel_context *ce)
1943 {
1944 return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1945 }
1946
1947 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1948 {
1949 return (i915_ggtt_offset(ce->engine->status_page.vma) +
1950 offset_in_page(dw));
1951 }
1952
1953 static int measure_semaphore_response(struct intel_context *ce)
1954 {
1955 u32 *sema = hwsp_scratch(ce);
1956 const u32 offset = hwsp_offset(ce, sema);
1957 u32 elapsed[TF_COUNT], cycles;
1958 struct i915_request *rq;
1959 u32 *cs;
1960 int err;
1961 int i;
1962
1963 /*
1964 * Measure how many cycles it takes for the HW to detect the change
1965 * in a semaphore value.
1966 *
1967 * A: read CS_TIMESTAMP from CPU
1968 * poke semaphore
1969 * B: read CS_TIMESTAMP on GPU
1970 *
1971 * Semaphore latency: B - A
1972 */
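/*
 * Sketch of the command sequence built below: for each sample i the
 * ring contains
 *
 *   MI_SEMAPHORE_WAIT     (poll until *sema == i)
 *   MI_STORE_REGISTER_MEM (RING_TIMESTAMP -> sema[i])
 *   MI_STORE_DWORD_IMM    (*sema = 0, handshake back to the CPU)
 *
 * while the CPU samples RING_TIMESTAMP just before poking *sema = i,
 * so elapsed[i - 1] = sema[i] - cycles is the hardware's semaphore
 * wake-up latency in timestamp ticks.
 */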
1973
1974 semaphore_set(sema, -1);
1975
1976 rq = i915_request_create(ce);
1977 if (IS_ERR(rq))
1978 return PTR_ERR(rq);
1979
1980 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1981 if (IS_ERR(cs)) {
1982 i915_request_add(rq);
1983 err = PTR_ERR(cs);
1984 goto err;
1985 }
1986
1987 cs = emit_store_dw(cs, offset, 0);
1988 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1989 cs = emit_semaphore_poll_until(cs, offset, i);
1990 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1991 cs = emit_store_dw(cs, offset, 0);
1992 }
1993
1994 intel_ring_advance(rq, cs);
1995 i915_request_add(rq);
1996
1997 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1998 err = -EIO;
1999 goto err;
2000 }
2001
2002 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2003 preempt_disable();
2004 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2005 semaphore_set(sema, i);
2006 preempt_enable();
2007
2008 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2009 err = -EIO;
2010 goto err;
2011 }
2012
2013 elapsed[i - 1] = sema[i] - cycles;
2014 }
2015
2016 cycles = trifilter(elapsed);
2017 pr_info("%s: semaphore response %d cycles, %lluns\n",
2018 ce->engine->name, cycles >> TF_BIAS,
2019 cycles_to_ns(ce->engine, cycles));
2020
2021 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2022
2023 err:
2024 intel_gt_set_wedged(ce->engine->gt);
2025 return err;
2026 }
2027
2028 static int measure_idle_dispatch(struct intel_context *ce)
2029 {
2030 u32 *sema = hwsp_scratch(ce);
2031 const u32 offset = hwsp_offset(ce, sema);
2032 u32 elapsed[TF_COUNT], cycles;
2033 u32 *cs;
2034 int err;
2035 int i;
2036
2037 /*
2038 * Measure how long it takes for us to submit a request while the
2039 * engine is idle, but is resting in our context.
2040 *
2041 * A: read CS_TIMESTAMP from CPU
2042 * submit request
2043 * B: read CS_TIMESTAMP on GPU
2044 *
2045 * Submission latency: B - A
2046 */
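/*
 * The preempt_disable()/local_bh_disable() pair below brackets the
 * CPU-side RING_TIMESTAMP sample and i915_request_add() so that a
 * softirq or context switch on this CPU cannot land inside the measured
 * window and inflate the reported dispatch latency.
 */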
2047
2048 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2049 struct i915_request *rq;
2050
2051 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2052 if (err)
2053 return err;
2054
2055 rq = i915_request_create(ce);
2056 if (IS_ERR(rq)) {
2057 err = PTR_ERR(rq);
2058 goto err;
2059 }
2060
2061 cs = intel_ring_begin(rq, 4);
2062 if (IS_ERR(cs)) {
2063 i915_request_add(rq);
2064 err = PTR_ERR(cs);
2065 goto err;
2066 }
2067
2068 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2069
2070 intel_ring_advance(rq, cs);
2071
2072 preempt_disable();
2073 local_bh_disable();
2074 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2075 i915_request_add(rq);
2076 local_bh_enable();
2077 preempt_enable();
2078 }
2079
2080 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2081 if (err)
2082 goto err;
2083
2084 for (i = 0; i < ARRAY_SIZE(elapsed); i++)
2085 elapsed[i] = sema[i] - elapsed[i];
2086
2087 cycles = trifilter(elapsed);
2088 pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
2089 ce->engine->name, cycles >> TF_BIAS,
2090 cycles_to_ns(ce->engine, cycles));
2091
2092 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2093
2094 err:
2095 intel_gt_set_wedged(ce->engine->gt);
2096 return err;
2097 }
2098
2099 static int measure_busy_dispatch(struct intel_context *ce)
2100 {
2101 u32 *sema = hwsp_scratch(ce);
2102 const u32 offset = hwsp_offset(ce, sema);
2103 u32 elapsed[TF_COUNT + 1], cycles;
2104 u32 *cs;
2105 int err;
2106 int i;
2107
2108 /*
2109 * Measure how long it takes for us to submit a request while the
2110 * engine is busy, polling on a semaphore in our context. With
2111 * direct submission, this will include the cost of a lite restore.
2112 *
2113 * A: read CS_TIMESTAMP from CPU
2114 * submit request
2115 * B: read CS_TIMESTAMP on GPU
2116 *
2117 * Submission latency: B - A
2118 */
2119
2120 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2121 struct i915_request *rq;
2122
2123 rq = i915_request_create(ce);
2124 if (IS_ERR(rq)) {
2125 err = PTR_ERR(rq);
2126 goto err;
2127 }
2128
2129 cs = intel_ring_begin(rq, 12);
2130 if (IS_ERR(cs)) {
2131 i915_request_add(rq);
2132 err = PTR_ERR(cs);
2133 goto err;
2134 }
2135
2136 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2137 cs = emit_semaphore_poll_until(cs, offset, i);
2138 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2139
2140 intel_ring_advance(rq, cs);
2141
2142 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2143 err = -EIO;
2144 goto err;
2145 }
2146
2147 preempt_disable();
2148 local_bh_disable();
2149 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2150 i915_request_add(rq);
2151 local_bh_enable();
2152 semaphore_set(sema, i - 1);
2153 preempt_enable();
2154 }
2155
2156 wait_for(READ_ONCE(sema[i - 1]), 500);
2157 semaphore_set(sema, i - 1);
2158
2159 for (i = 1; i <= TF_COUNT; i++) {
2160 GEM_BUG_ON(sema[i] == -1);
2161 elapsed[i - 1] = sema[i] - elapsed[i];
2162 }
2163
2164 cycles = trifilter(elapsed);
2165 pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2166 ce->engine->name, cycles >> TF_BIAS,
2167 cycles_to_ns(ce->engine, cycles));
2168
2169 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2170
2171 err:
2172 intel_gt_set_wedged(ce->engine->gt);
2173 return err;
2174 }
2175
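/*
 * Plug the engine: queue a request on the kernel context that spins on a
 * semaphore in the engine's status page until the CPU writes the release
 * value (see semaphore_set()). Equal-priority requests queued behind the
 * plug are held back until it is pulled.
 */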
2176 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2177 {
2178 const u32 offset =
2179 i915_ggtt_offset(engine->status_page.vma) +
2180 offset_in_page(sema);
2181 struct i915_request *rq;
2182 u32 *cs;
2183
2184 rq = i915_request_create(engine->kernel_context);
2185 if (IS_ERR(rq))
2186 return PTR_ERR(rq);
2187
2188 cs = intel_ring_begin(rq, 4);
2189 if (IS_ERR(cs)) {
2190 i915_request_add(rq);
2191 return PTR_ERR(cs);
2192 }
2193
2194 cs = emit_semaphore_poll(cs, mode, value, offset);
2195
2196 intel_ring_advance(rq, cs);
2197 i915_request_add(rq);
2198
2199 return 0;
2200 }
2201
2202 static int measure_inter_request(struct intel_context *ce)
2203 {
2204 u32 *sema = hwsp_scratch(ce);
2205 const u32 offset = hwsp_offset(ce, sema);
2206 u32 elapsed[TF_COUNT + 1], cycles;
2207 struct i915_sw_fence *submit;
2208 int i, err;
2209
2210 /*
2211 * Measure how long it takes to advance from one request into the
2212 * next. Between each request we flush the GPU caches to memory,
2213 * update the breadcrumbs, and then invalidate those caches.
2214 * We queue up all the requests to be submitted in one batch so
2215 * it should be one set of contiguous measurements.
2216 *
2217 * A: read CS_TIMESTAMP on GPU
2218 * advance request
2219 * B: read CS_TIMESTAMP on GPU
2220 *
2221 * Request latency: B - A
2222 */
2223
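	/*
	 * Plug the engine and gate every request on a single submit fence so
	 * that the whole chain is queued up front and then executes
	 * back-to-back once the plug is released.
	 */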
2224 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2225 if (err)
2226 return err;
2227
2228 submit = heap_fence_create(GFP_KERNEL);
2229 if (!submit) {
2230 semaphore_set(sema, 1);
2231 return -ENOMEM;
2232 }
2233
2234 intel_engine_flush_submission(ce->engine);
2235 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2236 struct i915_request *rq;
2237 u32 *cs;
2238
2239 rq = i915_request_create(ce);
2240 if (IS_ERR(rq)) {
2241 err = PTR_ERR(rq);
2242 goto err_submit;
2243 }
2244
2245 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2246 submit,
2247 GFP_KERNEL);
2248 if (err < 0) {
2249 i915_request_add(rq);
2250 goto err_submit;
2251 }
2252
2253 cs = intel_ring_begin(rq, 4);
2254 if (IS_ERR(cs)) {
2255 i915_request_add(rq);
2256 err = PTR_ERR(cs);
2257 goto err_submit;
2258 }
2259
2260 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2261
2262 intel_ring_advance(rq, cs);
2263 i915_request_add(rq);
2264 }
2265 i915_sw_fence_commit(submit);
2266 intel_engine_flush_submission(ce->engine);
2267 heap_fence_put(submit);
2268
2269 semaphore_set(sema, 1);
2270 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2271 if (err)
2272 goto err;
2273
2274 for (i = 1; i <= TF_COUNT; i++)
2275 elapsed[i - 1] = sema[i + 1] - sema[i];
2276
2277 cycles = trifilter(elapsed);
2278 pr_info("%s: inter-request latency %d cycles, %lluns\n",
2279 ce->engine->name, cycles >> TF_BIAS,
2280 cycles_to_ns(ce->engine, cycles));
2281
2282 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2283
2284 err_submit:
2285 i915_sw_fence_commit(submit);
2286 heap_fence_put(submit);
2287 semaphore_set(sema, 1);
2288 err:
2289 intel_gt_set_wedged(ce->engine->gt);
2290 return err;
2291 }
2292
2293 static int measure_context_switch(struct intel_context *ce)
2294 {
2295 u32 *sema = hwsp_scratch(ce);
2296 const u32 offset = hwsp_offset(ce, sema);
2297 struct i915_request *fence = NULL;
2298 u32 elapsed[TF_COUNT + 1], cycles;
2299 int i, j, err;
2300 u32 *cs;
2301
2302 /*
2303 * Measure how long it takes to advance from one request in one
2304 * context to a request in another context. This allows us to
2305 	 * measure how long the context save/restore takes, along with all
2306 * the inter-context setup we require.
2307 *
2308 * A: read CS_TIMESTAMP on GPU
2309 * switch context
2310 * B: read CS_TIMESTAMP on GPU
2311 *
2312 * Context switch latency: B - A
2313 */
2314
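	/*
	 * Alternate requests between our context and the kernel context so that
	 * each consecutive pair of timestamps straddles exactly one context
	 * switch; plug the engine first so everything is queued before any of
	 * it runs.
	 */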
2315 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2316 if (err)
2317 return err;
2318
2319 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2320 struct intel_context *arr[] = {
2321 ce, ce->engine->kernel_context
2322 };
2323 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
2324
2325 for (j = 0; j < ARRAY_SIZE(arr); j++) {
2326 struct i915_request *rq;
2327
2328 rq = i915_request_create(arr[j]);
2329 if (IS_ERR(rq)) {
2330 err = PTR_ERR(rq);
2331 goto err_fence;
2332 }
2333
2334 if (fence) {
2335 err = i915_request_await_dma_fence(rq,
2336 &fence->fence);
2337 if (err) {
2338 i915_request_add(rq);
2339 goto err_fence;
2340 }
2341 }
2342
2343 cs = intel_ring_begin(rq, 4);
2344 if (IS_ERR(cs)) {
2345 i915_request_add(rq);
2346 err = PTR_ERR(cs);
2347 goto err_fence;
2348 }
2349
2350 cs = emit_timestamp_store(cs, ce, addr);
2351 addr += sizeof(u32);
2352
2353 intel_ring_advance(rq, cs);
2354
2355 i915_request_put(fence);
2356 fence = i915_request_get(rq);
2357
2358 i915_request_add(rq);
2359 }
2360 }
2361 i915_request_put(fence);
2362 intel_engine_flush_submission(ce->engine);
2363
2364 semaphore_set(sema, 1);
2365 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2366 if (err)
2367 goto err;
2368
2369 for (i = 1; i <= TF_COUNT; i++)
2370 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2371
2372 cycles = trifilter(elapsed);
2373 pr_info("%s: context switch latency %d cycles, %lluns\n",
2374 ce->engine->name, cycles >> TF_BIAS,
2375 cycles_to_ns(ce->engine, cycles));
2376
2377 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2378
2379 err_fence:
2380 i915_request_put(fence);
2381 semaphore_set(sema, 1);
2382 err:
2383 intel_gt_set_wedged(ce->engine->gt);
2384 return err;
2385 }
2386
2387 static int measure_preemption(struct intel_context *ce)
2388 {
2389 u32 *sema = hwsp_scratch(ce);
2390 const u32 offset = hwsp_offset(ce, sema);
2391 u32 elapsed[TF_COUNT], cycles;
2392 u32 *cs;
2393 int err;
2394 int i;
2395
2396 /*
2397 * We measure two latencies while triggering preemption. The first
2398 * latency is how long it takes for us to submit a preempting request.
2399 	 * The second latency is how long it takes for us to return from the
2400 * preemption back to the original context.
2401 *
2402 * A: read CS_TIMESTAMP from CPU
2403 * submit preemption
2404 * B: read CS_TIMESTAMP on GPU (in preempting context)
2405 * context switch
2406 * C: read CS_TIMESTAMP on GPU (in original context)
2407 *
2408 * Preemption dispatch latency: B - A
2409 * Preemption switch latency: C - B
2410 */
2411
2412 if (!intel_engine_has_preemption(ce->engine))
2413 return 0;
2414
2415 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2416 u32 addr = offset + 2 * i * sizeof(u32);
2417 struct i915_request *rq;
2418
2419 rq = i915_request_create(ce);
2420 if (IS_ERR(rq)) {
2421 err = PTR_ERR(rq);
2422 goto err;
2423 }
2424
2425 cs = intel_ring_begin(rq, 12);
2426 if (IS_ERR(cs)) {
2427 i915_request_add(rq);
2428 err = PTR_ERR(cs);
2429 goto err;
2430 }
2431
2432 cs = emit_store_dw(cs, addr, -1);
2433 cs = emit_semaphore_poll_until(cs, offset, i);
2434 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2435
2436 intel_ring_advance(rq, cs);
2437 i915_request_add(rq);
2438
2439 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2440 err = -EIO;
2441 goto err;
2442 }
2443
2444 rq = i915_request_create(ce->engine->kernel_context);
2445 if (IS_ERR(rq)) {
2446 err = PTR_ERR(rq);
2447 goto err;
2448 }
2449
2450 cs = intel_ring_begin(rq, 8);
2451 if (IS_ERR(cs)) {
2452 i915_request_add(rq);
2453 err = PTR_ERR(cs);
2454 goto err;
2455 }
2456
2457 cs = emit_timestamp_store(cs, ce, addr);
2458 cs = emit_store_dw(cs, offset, i);
2459
2460 intel_ring_advance(rq, cs);
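		/* Highest priority: force immediate preemption of the spinner. */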
2461 rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2462
2463 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2464 i915_request_add(rq);
2465 }
2466
2467 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2468 err = -EIO;
2469 goto err;
2470 }
2471
2472 for (i = 1; i <= TF_COUNT; i++)
2473 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2474
2475 cycles = trifilter(elapsed);
2476 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2477 ce->engine->name, cycles >> TF_BIAS,
2478 cycles_to_ns(ce->engine, cycles));
2479
2480 for (i = 1; i <= TF_COUNT; i++)
2481 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2482
2483 cycles = trifilter(elapsed);
2484 pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2485 ce->engine->name, cycles >> TF_BIAS,
2486 cycles_to_ns(ce->engine, cycles));
2487
2488 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2489
2490 err:
2491 intel_gt_set_wedged(ce->engine->gt);
2492 return err;
2493 }
2494
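/*
 * Fence callback used by measure_completion(): flags when the CPU has
 * processed the request's completion interrupt.
 */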
2495 struct signal_cb {
2496 struct dma_fence_cb base;
2497 bool seen;
2498 };
2499
2500 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2501 {
2502 struct signal_cb *s = container_of(cb, typeof(*s), base);
2503
2504 smp_store_mb(s->seen, true); /* be safe, be strong */
2505 }
2506
2507 static int measure_completion(struct intel_context *ce)
2508 {
2509 u32 *sema = hwsp_scratch(ce);
2510 const u32 offset = hwsp_offset(ce, sema);
2511 u32 elapsed[TF_COUNT], cycles;
2512 u32 *cs;
2513 int err;
2514 int i;
2515
2516 /*
2517 	 * Measure how long it takes for the signal (interrupt) sent by
2518 	 * the GPU to be processed by the CPU.
2519 *
2520 * A: read CS_TIMESTAMP on GPU
2521 * signal
2522 * B: read CS_TIMESTAMP from CPU
2523 *
2524 * Completion latency: B - A
2525 */
2526
2527 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2528 struct signal_cb cb = { .seen = false };
2529 struct i915_request *rq;
2530
2531 rq = i915_request_create(ce);
2532 if (IS_ERR(rq)) {
2533 err = PTR_ERR(rq);
2534 goto err;
2535 }
2536
2537 cs = intel_ring_begin(rq, 12);
2538 if (IS_ERR(cs)) {
2539 i915_request_add(rq);
2540 err = PTR_ERR(cs);
2541 goto err;
2542 }
2543
2544 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2545 cs = emit_semaphore_poll_until(cs, offset, i);
2546 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2547
2548 intel_ring_advance(rq, cs);
2549
2550 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2551 i915_request_add(rq);
2552
2553 intel_engine_flush_submission(ce->engine);
2554 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2555 err = -EIO;
2556 goto err;
2557 }
2558
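		/*
		 * Release the spinner and busy-wait for the fence callback; the
		 * CPU timestamp (B) is taken as soon as the completion interrupt
		 * has been serviced.
		 */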
2559 preempt_disable();
2560 semaphore_set(sema, i);
2561 while (!READ_ONCE(cb.seen))
2562 cpu_relax();
2563
2564 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2565 preempt_enable();
2566 }
2567
2568 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2569 if (err)
2570 goto err;
2571
2572 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2573 GEM_BUG_ON(sema[i + 1] == -1);
2574 elapsed[i] = elapsed[i] - sema[i + 1];
2575 }
2576
2577 cycles = trifilter(elapsed);
2578 pr_info("%s: completion latency %d cycles, %lluns\n",
2579 ce->engine->name, cycles >> TF_BIAS,
2580 cycles_to_ns(ce->engine, cycles));
2581
2582 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2583
2584 err:
2585 intel_gt_set_wedged(ce->engine->gt);
2586 return err;
2587 }
2588
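/*
 * Hold the GPU at its maximum frequency (and keep forcewake) for the
 * duration of the measurements so the results are not skewed by frequency
 * ramping.
 */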
2589 static void rps_pin(struct intel_gt *gt)
2590 {
2591 /* Pin the frequency to max */
2592 	atomic_inc(&gt->rps.num_waiters);
2593 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2594
2595 	mutex_lock(&gt->rps.lock);
2596 	intel_rps_set(&gt->rps, gt->rps.max_freq);
2597 	mutex_unlock(&gt->rps.lock);
2598 }
2599
2600 static void rps_unpin(struct intel_gt *gt)
2601 {
2602 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2603 	atomic_dec(&gt->rps.num_waiters);
2604 }
2605
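/*
 * Walk each user-visible engine with a fresh pinned context, quiesce the
 * engine (no heartbeats, RPS pinned to max) and run the individual latency
 * probes above, reporting each result via pr_info().
 */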
2606 static int perf_request_latency(void *arg)
2607 {
2608 struct drm_i915_private *i915 = arg;
2609 struct intel_engine_cs *engine;
2610 struct pm_qos_request qos;
2611 int err = 0;
2612
2613 if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
2614 return 0;
2615
2616 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2617
2618 for_each_uabi_engine(engine, i915) {
2619 struct intel_context *ce;
2620
2621 ce = intel_context_create(engine);
2622 if (IS_ERR(ce)) {
2623 err = PTR_ERR(ce);
2624 goto out;
2625 }
2626
2627 err = intel_context_pin(ce);
2628 if (err) {
2629 intel_context_put(ce);
2630 goto out;
2631 }
2632
2633 st_engine_heartbeat_disable(engine);
2634 rps_pin(engine->gt);
2635
2636 if (err == 0)
2637 err = measure_semaphore_response(ce);
2638 if (err == 0)
2639 err = measure_idle_dispatch(ce);
2640 if (err == 0)
2641 err = measure_busy_dispatch(ce);
2642 if (err == 0)
2643 err = measure_inter_request(ce);
2644 if (err == 0)
2645 err = measure_context_switch(ce);
2646 if (err == 0)
2647 err = measure_preemption(ce);
2648 if (err == 0)
2649 err = measure_completion(ce);
2650
2651 rps_unpin(engine->gt);
2652 st_engine_heartbeat_enable(engine);
2653
2654 intel_context_unpin(ce);
2655 intel_context_put(ce);
2656 if (err)
2657 goto out;
2658 }
2659
2660 out:
2661 if (igt_flush_test(i915))
2662 err = -EIO;
2663
2664 cpu_latency_qos_remove_request(&qos);
2665 return err;
2666 }
2667
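/*
 * Series workloads, driven from a single thread across all engines in
 * round-robin order:
 *   s_sync0 - wait for each request to complete before emitting the next.
 *   s_sync1 - keep one request in flight; wait on the previous request.
 *   s_many  - emit requests as fast as possible, never waiting.
 */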
2668 static int s_sync0(void *arg)
2669 {
2670 struct perf_series *ps = arg;
2671 IGT_TIMEOUT(end_time);
2672 unsigned int idx = 0;
2673 int err = 0;
2674
2675 GEM_BUG_ON(!ps->nengines);
2676 do {
2677 struct i915_request *rq;
2678
2679 rq = i915_request_create(ps->ce[idx]);
2680 if (IS_ERR(rq)) {
2681 err = PTR_ERR(rq);
2682 break;
2683 }
2684
2685 i915_request_get(rq);
2686 i915_request_add(rq);
2687
2688 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2689 err = -ETIME;
2690 i915_request_put(rq);
2691 if (err)
2692 break;
2693
2694 if (++idx == ps->nengines)
2695 idx = 0;
2696 } while (!__igt_timeout(end_time, NULL));
2697
2698 return err;
2699 }
2700
2701 static int s_sync1(void *arg)
2702 {
2703 struct perf_series *ps = arg;
2704 struct i915_request *prev = NULL;
2705 IGT_TIMEOUT(end_time);
2706 unsigned int idx = 0;
2707 int err = 0;
2708
2709 GEM_BUG_ON(!ps->nengines);
2710 do {
2711 struct i915_request *rq;
2712
2713 rq = i915_request_create(ps->ce[idx]);
2714 if (IS_ERR(rq)) {
2715 err = PTR_ERR(rq);
2716 break;
2717 }
2718
2719 i915_request_get(rq);
2720 i915_request_add(rq);
2721
2722 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2723 err = -ETIME;
2724 i915_request_put(prev);
2725 prev = rq;
2726 if (err)
2727 break;
2728
2729 if (++idx == ps->nengines)
2730 idx = 0;
2731 } while (!__igt_timeout(end_time, NULL));
2732 i915_request_put(prev);
2733
2734 return err;
2735 }
2736
2737 static int s_many(void *arg)
2738 {
2739 struct perf_series *ps = arg;
2740 IGT_TIMEOUT(end_time);
2741 unsigned int idx = 0;
2742
2743 GEM_BUG_ON(!ps->nengines);
2744 do {
2745 struct i915_request *rq;
2746
2747 rq = i915_request_create(ps->ce[idx]);
2748 if (IS_ERR(rq))
2749 return PTR_ERR(rq);
2750
2751 i915_request_add(rq);
2752
2753 if (++idx == ps->nengines)
2754 idx = 0;
2755 } while (!__igt_timeout(end_time, NULL));
2756
2757 return 0;
2758 }
2759
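/*
 * Run each series workload over every engine from one thread, bracketing it
 * with busyness and runtime sampling so utilisation can be reported per
 * engine.
 */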
2760 static int perf_series_engines(void *arg)
2761 {
2762 struct drm_i915_private *i915 = arg;
2763 static int (* const func[])(void *arg) = {
2764 s_sync0,
2765 s_sync1,
2766 s_many,
2767 NULL,
2768 };
2769 const unsigned int nengines = num_uabi_engines(i915);
2770 struct intel_engine_cs *engine;
2771 int (* const *fn)(void *arg);
2772 struct pm_qos_request qos;
2773 struct perf_stats *stats;
2774 struct perf_series *ps;
2775 unsigned int idx;
2776 int err = 0;
2777
2778 stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2779 if (!stats)
2780 return -ENOMEM;
2781
2782 ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2783 if (!ps) {
2784 kfree(stats);
2785 return -ENOMEM;
2786 }
2787
2788 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2789
2790 ps->i915 = i915;
2791 ps->nengines = nengines;
2792
2793 idx = 0;
2794 for_each_uabi_engine(engine, i915) {
2795 struct intel_context *ce;
2796
2797 ce = intel_context_create(engine);
2798 if (IS_ERR(ce)) {
2799 err = PTR_ERR(ce);
2800 goto out;
2801 }
2802
2803 err = intel_context_pin(ce);
2804 if (err) {
2805 intel_context_put(ce);
2806 goto out;
2807 }
2808
2809 ps->ce[idx++] = ce;
2810 }
2811 GEM_BUG_ON(idx != ps->nengines);
2812
2813 for (fn = func; *fn && !err; fn++) {
2814 char name[KSYM_NAME_LEN];
2815 struct igt_live_test t;
2816
2817 snprintf(name, sizeof(name), "%ps", *fn);
2818 err = igt_live_test_begin(&t, i915, __func__, name);
2819 if (err)
2820 break;
2821
2822 for (idx = 0; idx < nengines; idx++) {
2823 struct perf_stats *p =
2824 memset(&stats[idx], 0, sizeof(stats[idx]));
2825 struct intel_context *ce = ps->ce[idx];
2826
2827 p->engine = ps->ce[idx]->engine;
2828 intel_engine_pm_get(p->engine);
2829
2830 if (intel_engine_supports_stats(p->engine))
2831 p->busy = intel_engine_get_busy_time(p->engine,
2832 &p->time) + 1;
2833 else
2834 p->time = ktime_get();
2835 p->runtime = -intel_context_get_total_runtime_ns(ce);
2836 }
2837
2838 err = (*fn)(ps);
2839 if (igt_live_test_end(&t))
2840 err = -EIO;
2841
2842 for (idx = 0; idx < nengines; idx++) {
2843 struct perf_stats *p = &stats[idx];
2844 struct intel_context *ce = ps->ce[idx];
2845 int integer, decimal;
2846 u64 busy, dt, now;
2847
2848 if (p->busy)
2849 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2850 &now),
2851 p->busy - 1);
2852 else
2853 now = ktime_get();
2854 p->time = ktime_sub(now, p->time);
2855
2856 err = switch_to_kernel_sync(ce, err);
2857 p->runtime += intel_context_get_total_runtime_ns(ce);
2858 intel_engine_pm_put(p->engine);
2859
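			/* Express busyness as a percentage of walltime, to two decimal places. */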
2860 busy = 100 * ktime_to_ns(p->busy);
2861 dt = ktime_to_ns(p->time);
2862 if (dt) {
2863 integer = div64_u64(busy, dt);
2864 busy -= integer * dt;
2865 decimal = div64_u64(100 * busy, dt);
2866 } else {
2867 integer = 0;
2868 decimal = 0;
2869 }
2870
2871 pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2872 name, p->engine->name, ce->timeline->seqno,
2873 integer, decimal,
2874 div_u64(p->runtime, 1000 * 1000),
2875 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2876 }
2877 }
2878
2879 out:
2880 for (idx = 0; idx < nengines; idx++) {
2881 if (IS_ERR_OR_NULL(ps->ce[idx]))
2882 break;
2883
2884 intel_context_unpin(ps->ce[idx]);
2885 intel_context_put(ps->ce[idx]);
2886 }
2887 kfree(ps);
2888
2889 cpu_latency_qos_remove_request(&qos);
2890 kfree(stats);
2891 return err;
2892 }
2893
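/*
 * Parallel workloads: the same sync0/sync1/many patterns as above, but each
 * instance runs on its own context and engine from a dedicated kthread.
 */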
2894 static int p_sync0(void *arg)
2895 {
2896 struct perf_stats *p = arg;
2897 struct intel_engine_cs *engine = p->engine;
2898 struct intel_context *ce;
2899 IGT_TIMEOUT(end_time);
2900 unsigned long count;
2901 bool busy;
2902 int err = 0;
2903
2904 ce = intel_context_create(engine);
2905 if (IS_ERR(ce))
2906 return PTR_ERR(ce);
2907
2908 err = intel_context_pin(ce);
2909 if (err) {
2910 intel_context_put(ce);
2911 return err;
2912 }
2913
2914 if (intel_engine_supports_stats(engine)) {
2915 p->busy = intel_engine_get_busy_time(engine, &p->time);
2916 busy = true;
2917 } else {
2918 p->time = ktime_get();
2919 busy = false;
2920 }
2921
2922 count = 0;
2923 do {
2924 struct i915_request *rq;
2925
2926 rq = i915_request_create(ce);
2927 if (IS_ERR(rq)) {
2928 err = PTR_ERR(rq);
2929 break;
2930 }
2931
2932 i915_request_get(rq);
2933 i915_request_add(rq);
2934
2935 err = 0;
2936 if (i915_request_wait(rq, 0, HZ) < 0)
2937 err = -ETIME;
2938 i915_request_put(rq);
2939 if (err)
2940 break;
2941
2942 count++;
2943 } while (!__igt_timeout(end_time, NULL));
2944
2945 if (busy) {
2946 ktime_t now;
2947
2948 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2949 p->busy);
2950 p->time = ktime_sub(now, p->time);
2951 } else {
2952 p->time = ktime_sub(ktime_get(), p->time);
2953 }
2954
2955 err = switch_to_kernel_sync(ce, err);
2956 p->runtime = intel_context_get_total_runtime_ns(ce);
2957 p->count = count;
2958
2959 intel_context_unpin(ce);
2960 intel_context_put(ce);
2961 return err;
2962 }
2963
2964 static int p_sync1(void *arg)
2965 {
2966 struct perf_stats *p = arg;
2967 struct intel_engine_cs *engine = p->engine;
2968 struct i915_request *prev = NULL;
2969 struct intel_context *ce;
2970 IGT_TIMEOUT(end_time);
2971 unsigned long count;
2972 bool busy;
2973 int err = 0;
2974
2975 ce = intel_context_create(engine);
2976 if (IS_ERR(ce))
2977 return PTR_ERR(ce);
2978
2979 err = intel_context_pin(ce);
2980 if (err) {
2981 intel_context_put(ce);
2982 return err;
2983 }
2984
2985 if (intel_engine_supports_stats(engine)) {
2986 p->busy = intel_engine_get_busy_time(engine, &p->time);
2987 busy = true;
2988 } else {
2989 p->time = ktime_get();
2990 busy = false;
2991 }
2992
2993 count = 0;
2994 do {
2995 struct i915_request *rq;
2996
2997 rq = i915_request_create(ce);
2998 if (IS_ERR(rq)) {
2999 err = PTR_ERR(rq);
3000 break;
3001 }
3002
3003 i915_request_get(rq);
3004 i915_request_add(rq);
3005
3006 err = 0;
3007 if (prev && i915_request_wait(prev, 0, HZ) < 0)
3008 err = -ETIME;
3009 i915_request_put(prev);
3010 prev = rq;
3011 if (err)
3012 break;
3013
3014 count++;
3015 } while (!__igt_timeout(end_time, NULL));
3016 i915_request_put(prev);
3017
3018 if (busy) {
3019 ktime_t now;
3020
3021 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3022 p->busy);
3023 p->time = ktime_sub(now, p->time);
3024 } else {
3025 p->time = ktime_sub(ktime_get(), p->time);
3026 }
3027
3028 err = switch_to_kernel_sync(ce, err);
3029 p->runtime = intel_context_get_total_runtime_ns(ce);
3030 p->count = count;
3031
3032 intel_context_unpin(ce);
3033 intel_context_put(ce);
3034 return err;
3035 }
3036
3037 static int p_many(void *arg)
3038 {
3039 struct perf_stats *p = arg;
3040 struct intel_engine_cs *engine = p->engine;
3041 struct intel_context *ce;
3042 IGT_TIMEOUT(end_time);
3043 unsigned long count;
3044 int err = 0;
3045 bool busy;
3046
3047 ce = intel_context_create(engine);
3048 if (IS_ERR(ce))
3049 return PTR_ERR(ce);
3050
3051 err = intel_context_pin(ce);
3052 if (err) {
3053 intel_context_put(ce);
3054 return err;
3055 }
3056
3057 if (intel_engine_supports_stats(engine)) {
3058 p->busy = intel_engine_get_busy_time(engine, &p->time);
3059 busy = true;
3060 } else {
3061 p->time = ktime_get();
3062 busy = false;
3063 }
3064
3065 count = 0;
3066 do {
3067 struct i915_request *rq;
3068
3069 rq = i915_request_create(ce);
3070 if (IS_ERR(rq)) {
3071 err = PTR_ERR(rq);
3072 break;
3073 }
3074
3075 i915_request_add(rq);
3076 count++;
3077 } while (!__igt_timeout(end_time, NULL));
3078
3079 if (busy) {
3080 ktime_t now;
3081
3082 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3083 p->busy);
3084 p->time = ktime_sub(now, p->time);
3085 } else {
3086 p->time = ktime_sub(ktime_get(), p->time);
3087 }
3088
3089 err = switch_to_kernel_sync(ce, err);
3090 p->runtime = intel_context_get_total_runtime_ns(ce);
3091 p->count = count;
3092
3093 intel_context_unpin(ce);
3094 intel_context_put(ce);
3095 return err;
3096 }
3097
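/*
 * Spawn one kthread per engine, all running the same workload concurrently,
 * then collect and report per-engine throughput and busyness.
 */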
3098 static int perf_parallel_engines(void *arg)
3099 {
3100 struct drm_i915_private *i915 = arg;
3101 static int (* const func[])(void *arg) = {
3102 p_sync0,
3103 p_sync1,
3104 p_many,
3105 NULL,
3106 };
3107 const unsigned int nengines = num_uabi_engines(i915);
3108 struct intel_engine_cs *engine;
3109 int (* const *fn)(void *arg);
3110 struct pm_qos_request qos;
3111 struct {
3112 struct perf_stats p;
3113 struct task_struct *tsk;
3114 } *engines;
3115 int err = 0;
3116
3117 engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
3118 if (!engines)
3119 return -ENOMEM;
3120
3121 cpu_latency_qos_add_request(&qos, 0);
3122
3123 for (fn = func; *fn; fn++) {
3124 char name[KSYM_NAME_LEN];
3125 struct igt_live_test t;
3126 unsigned int idx;
3127
3128 snprintf(name, sizeof(name), "%ps", *fn);
3129 err = igt_live_test_begin(&t, i915, __func__, name);
3130 if (err)
3131 break;
3132
3133 atomic_set(&i915->selftest.counter, nengines);
3134
3135 idx = 0;
3136 for_each_uabi_engine(engine, i915) {
3137 intel_engine_pm_get(engine);
3138
3139 memset(&engines[idx].p, 0, sizeof(engines[idx].p));
3140 engines[idx].p.engine = engine;
3141
3142 engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
3143 "igt:%s", engine->name);
3144 if (IS_ERR(engines[idx].tsk)) {
3145 err = PTR_ERR(engines[idx].tsk);
3146 intel_engine_pm_put(engine);
3147 break;
3148 }
3149 get_task_struct(engines[idx++].tsk);
3150 }
3151
3152 yield(); /* start all threads before we kthread_stop() */
3153
3154 idx = 0;
3155 for_each_uabi_engine(engine, i915) {
3156 int status;
3157
3158 if (IS_ERR(engines[idx].tsk))
3159 break;
3160
3161 status = kthread_stop(engines[idx].tsk);
3162 if (status && !err)
3163 err = status;
3164
3165 intel_engine_pm_put(engine);
3166 put_task_struct(engines[idx++].tsk);
3167 }
3168
3169 if (igt_live_test_end(&t))
3170 err = -EIO;
3171 if (err)
3172 break;
3173
3174 idx = 0;
3175 for_each_uabi_engine(engine, i915) {
3176 struct perf_stats *p = &engines[idx].p;
3177 u64 busy = 100 * ktime_to_ns(p->busy);
3178 u64 dt = ktime_to_ns(p->time);
3179 int integer, decimal;
3180
3181 if (dt) {
3182 integer = div64_u64(busy, dt);
3183 busy -= integer * dt;
3184 decimal = div64_u64(100 * busy, dt);
3185 } else {
3186 integer = 0;
3187 decimal = 0;
3188 }
3189
3190 GEM_BUG_ON(engine != p->engine);
3191 pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
3192 name, engine->name, p->count, integer, decimal,
3193 div_u64(p->runtime, 1000 * 1000),
3194 div_u64(ktime_to_ns(p->time), 1000 * 1000));
3195 idx++;
3196 }
3197 }
3198
3199 cpu_latency_qos_remove_request(&qos);
3200 kfree(engines);
3201 return err;
3202 }
3203
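/*
 * Entry point for the request perf selftests; skipped if the GT is already
 * wedged.
 */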
3204 int i915_request_perf_selftests(struct drm_i915_private *i915)
3205 {
3206 static const struct i915_subtest tests[] = {
3207 SUBTEST(perf_request_latency),
3208 SUBTEST(perf_series_engines),
3209 SUBTEST(perf_parallel_engines),
3210 };
3211
3212 if (intel_gt_is_wedged(to_gt(i915)))
3213 return 0;
3214
3215 return i915_subtests(tests, i915);
3216 }
3217