1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/iommu.h>
34 #include <linux/pci.h>
35
36 #include <drm/drm_atomic_helper.h>
37 #include <drm/drm_probe_helper.h>
38 #include <drm/amdgpu_drm.h>
39 #include <linux/vgaarb.h>
40 #include <linux/vga_switcheroo.h>
41 #include <linux/efi.h>
42 #include "amdgpu.h"
43 #include "amdgpu_trace.h"
44 #include "amdgpu_i2c.h"
45 #include "atom.h"
46 #include "amdgpu_atombios.h"
47 #include "amdgpu_atomfirmware.h"
48 #include "amd_pcie.h"
49 #ifdef CONFIG_DRM_AMDGPU_SI
50 #include "si.h"
51 #endif
52 #ifdef CONFIG_DRM_AMDGPU_CIK
53 #include "cik.h"
54 #endif
55 #include "vi.h"
56 #include "soc15.h"
57 #include "nv.h"
58 #include "bif/bif_4_1_d.h"
59 #include <linux/firmware.h>
60 #include "amdgpu_vf_error.h"
61
62 #include "amdgpu_amdkfd.h"
63 #include "amdgpu_pm.h"
64
65 #include "amdgpu_xgmi.h"
66 #include "amdgpu_ras.h"
67 #include "amdgpu_pmu.h"
68 #include "amdgpu_fru_eeprom.h"
69 #include "amdgpu_reset.h"
70
71 #include <linux/suspend.h>
72 #include <drm/task_barrier.h>
73 #include <linux/pm_runtime.h>
74
75 #include <drm/drm_drv.h>
76
77 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
84
85 #define AMDGPU_RESUME_MS 2000
86 #define AMDGPU_MAX_RETRY_LIMIT 2
87 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
88
89 const char *amdgpu_asic_name[] = {
90 "TAHITI",
91 "PITCAIRN",
92 "VERDE",
93 "OLAND",
94 "HAINAN",
95 "BONAIRE",
96 "KAVERI",
97 "KABINI",
98 "HAWAII",
99 "MULLINS",
100 "TOPAZ",
101 "TONGA",
102 "FIJI",
103 "CARRIZO",
104 "STONEY",
105 "POLARIS10",
106 "POLARIS11",
107 "POLARIS12",
108 "VEGAM",
109 "VEGA10",
110 "VEGA12",
111 "VEGA20",
112 "RAVEN",
113 "ARCTURUS",
114 "RENOIR",
115 "ALDEBARAN",
116 "NAVI10",
117 "CYAN_SKILLFISH",
118 "NAVI14",
119 "NAVI12",
120 "SIENNA_CICHLID",
121 "NAVY_FLOUNDER",
122 "VANGOGH",
123 "DIMGREY_CAVEFISH",
124 "BEIGE_GOBY",
125 "YELLOW_CARP",
126 "IP DISCOVERY",
127 "LAST",
128 };
129
130 /**
131 * DOC: pcie_replay_count
132 *
133 * The amdgpu driver provides a sysfs API for reporting the total number
134 * of PCIe replays (NAKs)
135 * The file pcie_replay_count is used for this and returns the total
136 * number of replays as a sum of the NAKs generated and NAKs received
137 */
138
139 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
140 struct device_attribute *attr, char *buf)
141 {
142 struct drm_device *ddev = dev_get_drvdata(dev);
143 struct amdgpu_device *adev = drm_to_adev(ddev);
144 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
145
146 return sysfs_emit(buf, "%llu\n", cnt);
147 }
148
149 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
150 amdgpu_device_get_pcie_replay_count, NULL);
151
152 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
153
154 /**
155 * DOC: product_name
156 *
157 * The amdgpu driver provides a sysfs API for reporting the product name
158 * for the device
159 * The file product_name is used for this and returns the product name
160 * as returned from the FRU.
161 * NOTE: This is only available for certain server cards
162 */
163
164 static ssize_t amdgpu_device_get_product_name(struct device *dev,
165 struct device_attribute *attr, char *buf)
166 {
167 struct drm_device *ddev = dev_get_drvdata(dev);
168 struct amdgpu_device *adev = drm_to_adev(ddev);
169
170 return sysfs_emit(buf, "%s\n", adev->product_name);
171 }
172
173 static DEVICE_ATTR(product_name, S_IRUGO,
174 amdgpu_device_get_product_name, NULL);
175
176 /**
177 * DOC: product_number
178 *
179 * The amdgpu driver provides a sysfs API for reporting the part number
180 * for the device
181 * The file product_number is used for this and returns the part number
182 * as returned from the FRU.
183 * NOTE: This is only available for certain server cards
184 */
185
186 static ssize_t amdgpu_device_get_product_number(struct device *dev,
187 struct device_attribute *attr, char *buf)
188 {
189 struct drm_device *ddev = dev_get_drvdata(dev);
190 struct amdgpu_device *adev = drm_to_adev(ddev);
191
192 return sysfs_emit(buf, "%s\n", adev->product_number);
193 }
194
195 static DEVICE_ATTR(product_number, S_IRUGO,
196 amdgpu_device_get_product_number, NULL);
197
198 /**
199 * DOC: serial_number
200 *
201 * The amdgpu driver provides a sysfs API for reporting the serial number
202 * for the device
203 * The file serial_number is used for this and returns the serial number
204 * as returned from the FRU.
205 * NOTE: This is only available for certain server cards
206 */
207
208 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
209 struct device_attribute *attr, char *buf)
210 {
211 struct drm_device *ddev = dev_get_drvdata(dev);
212 struct amdgpu_device *adev = drm_to_adev(ddev);
213
214 return sysfs_emit(buf, "%s\n", adev->serial);
215 }
216
217 static DEVICE_ATTR(serial_number, S_IRUGO,
218 amdgpu_device_get_serial_number, NULL);
219
220 /**
221 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
222 *
223 * @dev: drm_device pointer
224 *
225 * Returns true if the device is a dGPU with ATPX power control,
226 * otherwise return false.
227 */
228 bool amdgpu_device_supports_px(struct drm_device *dev)
229 {
230 struct amdgpu_device *adev = drm_to_adev(dev);
231
232 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
233 return true;
234 return false;
235 }
236
237 /**
238 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
239 *
240 * @dev: drm_device pointer
241 *
242 * Returns true if the device is a dGPU with ACPI power control,
243 * otherwise return false.
244 */
245 bool amdgpu_device_supports_boco(struct drm_device *dev)
246 {
247 struct amdgpu_device *adev = drm_to_adev(dev);
248
249 if (adev->has_pr3 ||
250 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
251 return true;
252 return false;
253 }
254
255 /**
256 * amdgpu_device_supports_baco - Does the device support BACO
257 *
258 * @dev: drm_device pointer
259 *
260 * Returns true if the device supports BACO,
261 * otherwise return false.
262 */
263 bool amdgpu_device_supports_baco(struct drm_device *dev)
264 {
265 struct amdgpu_device *adev = drm_to_adev(dev);
266
267 return amdgpu_asic_supports_baco(adev);
268 }
269
270 /**
271 * amdgpu_device_supports_smart_shift - Is the device dGPU with
272 * smart shift support
273 *
274 * @dev: drm_device pointer
275 *
276 * Returns true if the device is a dGPU with Smart Shift support,
277 * otherwise returns false.
278 */
279 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
280 {
281 return (amdgpu_device_supports_boco(dev) &&
282 amdgpu_acpi_is_power_shift_control_supported());
283 }
284
285 /*
286 * VRAM access helper functions
287 */
288
289 /**
290 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
291 *
292 * @adev: amdgpu_device pointer
293 * @pos: offset of the buffer in vram
294 * @buf: virtual address of the buffer in system memory
295 * @size: read/write size; @buf must be at least @size bytes
296 * @write: true - write to vram, otherwise - read from vram
297 */
298 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
299 void *buf, size_t size, bool write)
300 {
301 unsigned long flags;
302 uint32_t hi = ~0, tmp = 0;
303 uint32_t *data = buf;
304 uint64_t last;
305 int idx;
306
307 if (!drm_dev_enter(adev_to_drm(adev), &idx))
308 return;
309
310 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
311
312 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
313 for (last = pos + size; pos < last; pos += 4) {
314 tmp = pos >> 31;
315
316 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
317 if (tmp != hi) {
318 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
319 hi = tmp;
320 }
321 if (write)
322 WREG32_NO_KIQ(mmMM_DATA, *data++);
323 else
324 *data++ = RREG32_NO_KIQ(mmMM_DATA);
325 }
326
327 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
328 drm_dev_exit(idx);
329 }
330
331 /**
332 * amdgpu_device_aper_access - access VRAM through the VRAM aperture
333 *
334 * @adev: amdgpu_device pointer
335 * @pos: offset of the buffer in vram
336 * @buf: virtual address of the buffer in system memory
337 * @size: read/write size; @buf must be at least @size bytes
338 * @write: true - write to vram, otherwise - read from vram
339 *
340 * Returns the number of bytes transferred.
341 */
342 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
343 void *buf, size_t size, bool write)
344 {
345 #ifdef CONFIG_64BIT
346 void __iomem *addr;
347 size_t count = 0;
348 uint64_t last;
349
350 if (!adev->mman.aper_base_kaddr)
351 return 0;
352
353 last = min(pos + size, adev->gmc.visible_vram_size);
354 if (last > pos) {
355 addr = adev->mman.aper_base_kaddr + pos;
356 count = last - pos;
357
358 if (write) {
359 memcpy_toio(addr, buf, count);
360 mb();
361 amdgpu_device_flush_hdp(adev, NULL);
362 } else {
363 amdgpu_device_invalidate_hdp(adev, NULL);
364 mb();
365 memcpy_fromio(buf, addr, count);
366 }
367
368 }
369
370 return count;
371 #else
372 return 0;
373 #endif
374 }
375
376 /**
377 * amdgpu_device_vram_access - read/write a buffer in vram
378 *
379 * @adev: amdgpu_device pointer
380 * @pos: offset of the buffer in vram
381 * @buf: virtual address of the buffer in system memory
382 * @size: read/write size; @buf must be at least @size bytes
383 * @write: true - write to vram, otherwise - read from vram
384 */
385 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
386 void *buf, size_t size, bool write)
387 {
388 size_t count;
389
390 /* try to use the VRAM aperture to access VRAM first */
391 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
392 size -= count;
393 if (size) {
394 /* use MM_INDEX/MM_DATA to access the rest of VRAM */
395 pos += count;
396 buf += count;
397 amdgpu_device_mm_access(adev, pos, buf, size, write);
398 }
399 }
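/*
 * Illustrative sketch (not part of the driver): read the first dword of VRAM
 * into a stack buffer, letting the helper above fall back to MM_INDEX/MM_DATA
 * for any part outside the CPU-visible aperture. The offset used here is a
 * hypothetical placeholder.
 *
 *	uint32_t val;
 *
 *	amdgpu_device_vram_access(adev, 0, &val, sizeof(val), false);
 */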
400
401 /*
402 * register access helper functions.
403 */
404
405 /* Check if hw access should be skipped because of hotplug or device error */
406 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
407 {
408 if (adev->no_hw_access)
409 return true;
410
411 #ifdef CONFIG_LOCKDEP
412 /*
413 * This is a bit complicated to understand, so worth a comment. What we assert
414 * here is that the GPU reset is not running on another thread in parallel.
415 *
416 * For this we trylock the read side of the reset semaphore, if that succeeds
417 * we know that the reset is not running in parallel.
418 *
419 * If the trylock fails we assert that we are either already holding the read
420 * side of the lock or are the reset thread itself and hold the write side of
421 * the lock.
422 */
423 if (in_task()) {
424 if (down_read_trylock(&adev->reset_domain->sem))
425 up_read(&adev->reset_domain->sem);
426 else
427 lockdep_assert_held(&adev->reset_domain->sem);
428 }
429 #endif
430 return false;
431 }
432
433 /**
434 * amdgpu_device_rreg - read a memory mapped IO or indirect register
435 *
436 * @adev: amdgpu_device pointer
437 * @reg: dword aligned register offset
438 * @acc_flags: access flags which require special behavior
439 *
440 * Returns the 32 bit value from the offset specified.
441 */
442 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
443 uint32_t reg, uint32_t acc_flags)
444 {
445 uint32_t ret;
446
447 if (amdgpu_device_skip_hw_access(adev))
448 return 0;
449
450 if ((reg * 4) < adev->rmmio_size) {
451 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
452 amdgpu_sriov_runtime(adev) &&
453 down_read_trylock(&adev->reset_domain->sem)) {
454 ret = amdgpu_kiq_rreg(adev, reg);
455 up_read(&adev->reset_domain->sem);
456 } else {
457 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
458 }
459 } else {
460 ret = adev->pcie_rreg(adev, reg * 4);
461 }
462
463 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
464
465 return ret;
466 }
467
468 /*
469 * MMIO register read helper functions with byte granularity
470 * @offset: byte offset from MMIO start
471 *
472 */
473
474 /**
475 * amdgpu_mm_rreg8 - read a memory mapped IO register
476 *
477 * @adev: amdgpu_device pointer
478 * @offset: byte aligned register offset
479 *
480 * Returns the 8 bit value from the offset specified.
481 */
482 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
483 {
484 if (amdgpu_device_skip_hw_access(adev))
485 return 0;
486
487 if (offset < adev->rmmio_size)
488 return (readb(adev->rmmio + offset));
489 BUG();
490 }
491
492 /*
493 * MMIO register write helper functions with byte granularity
494 * @offset: byte offset from MMIO start
495 * @value: the value to be written to the register
496 *
497 */
498 /**
499 * amdgpu_mm_wreg8 - write a memory mapped IO register
500 *
501 * @adev: amdgpu_device pointer
502 * @offset: byte aligned register offset
503 * @value: 8 bit value to write
504 *
505 * Writes the value specified to the offset specified.
506 */
507 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
508 {
509 if (amdgpu_device_skip_hw_access(adev))
510 return;
511
512 if (offset < adev->rmmio_size)
513 writeb(value, adev->rmmio + offset);
514 else
515 BUG();
516 }
517
518 /**
519 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
520 *
521 * @adev: amdgpu_device pointer
522 * @reg: dword aligned register offset
523 * @v: 32 bit value to write to the register
524 * @acc_flags: access flags which require special behavior
525 *
526 * Writes the value specified to the offset specified.
527 */
528 void amdgpu_device_wreg(struct amdgpu_device *adev,
529 uint32_t reg, uint32_t v,
530 uint32_t acc_flags)
531 {
532 if (amdgpu_device_skip_hw_access(adev))
533 return;
534
535 if ((reg * 4) < adev->rmmio_size) {
536 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
537 amdgpu_sriov_runtime(adev) &&
538 down_read_trylock(&adev->reset_domain->sem)) {
539 amdgpu_kiq_wreg(adev, reg, v);
540 up_read(&adev->reset_domain->sem);
541 } else {
542 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
543 }
544 } else {
545 adev->pcie_wreg(adev, reg * 4, v);
546 }
547
548 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
549 }
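/*
 * Illustrative read-modify-write sketch (not part of the driver): set a bit
 * in a register through the helpers above. The register offset and bit mask
 * below are hypothetical placeholders; most code uses the RREG32()/WREG32()
 * macros, which expand to these helpers with acc_flags == 0.
 *
 *	uint32_t tmp;
 *
 *	tmp = amdgpu_device_rreg(adev, example_reg_offset, 0);
 *	tmp |= example_enable_bit;
 *	amdgpu_device_wreg(adev, example_reg_offset, tmp, 0);
 */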
550
551 /**
552 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
553 *
554 * @adev: amdgpu_device pointer
555 * @reg: mmio/rlc register
556 * @v: value to write
557 *
558 * This function is invoked only for debugfs register access.
559 */
560 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
561 uint32_t reg, uint32_t v)
562 {
563 if (amdgpu_device_skip_hw_access(adev))
564 return;
565
566 if (amdgpu_sriov_fullaccess(adev) &&
567 adev->gfx.rlc.funcs &&
568 adev->gfx.rlc.funcs->is_rlcg_access_range) {
569 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
570 return amdgpu_sriov_wreg(adev, reg, v, 0, 0);
571 } else if ((reg * 4) >= adev->rmmio_size) {
572 adev->pcie_wreg(adev, reg * 4, v);
573 } else {
574 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
575 }
576 }
577
578 /**
579 * amdgpu_mm_rdoorbell - read a doorbell dword
580 *
581 * @adev: amdgpu_device pointer
582 * @index: doorbell index
583 *
584 * Returns the value in the doorbell aperture at the
585 * requested doorbell index (CIK).
586 */
587 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
588 {
589 if (amdgpu_device_skip_hw_access(adev))
590 return 0;
591
592 if (index < adev->doorbell.num_doorbells) {
593 return readl(adev->doorbell.ptr + index);
594 } else {
595 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
596 return 0;
597 }
598 }
599
600 /**
601 * amdgpu_mm_wdoorbell - write a doorbell dword
602 *
603 * @adev: amdgpu_device pointer
604 * @index: doorbell index
605 * @v: value to write
606 *
607 * Writes @v to the doorbell aperture at the
608 * requested doorbell index (CIK).
609 */
610 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
611 {
612 if (amdgpu_device_skip_hw_access(adev))
613 return;
614
615 if (index < adev->doorbell.num_doorbells) {
616 writel(v, adev->doorbell.ptr + index);
617 } else {
618 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
619 }
620 }
621
622 /**
623 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
624 *
625 * @adev: amdgpu_device pointer
626 * @index: doorbell index
627 *
628 * Returns the value in the doorbell aperture at the
629 * requested doorbell index (VEGA10+).
630 */
631 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
632 {
633 if (amdgpu_device_skip_hw_access(adev))
634 return 0;
635
636 if (index < adev->doorbell.num_doorbells) {
637 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
638 } else {
639 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
640 return 0;
641 }
642 }
643
644 /**
645 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
646 *
647 * @adev: amdgpu_device pointer
648 * @index: doorbell index
649 * @v: value to write
650 *
651 * Writes @v to the doorbell aperture at the
652 * requested doorbell index (VEGA10+).
653 */
654 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
655 {
656 if (amdgpu_device_skip_hw_access(adev))
657 return;
658
659 if (index < adev->doorbell.num_doorbells) {
660 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
661 } else {
662 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
663 }
664 }
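/*
 * Illustrative sketch (not part of the driver): publish a ring's new 64-bit
 * write pointer through its doorbell. The index and value names are
 * hypothetical; real callers use the doorbell index assigned to their ring.
 *
 *	amdgpu_mm_wdoorbell64(adev, example_doorbell_index, new_wptr);
 */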
665
666 /**
667 * amdgpu_device_indirect_rreg - read an indirect register
668 *
669 * @adev: amdgpu_device pointer
670 * @pcie_index: mmio register offset
671 * @pcie_data: mmio register offset
672 * @reg_addr: indirect register address to read from
673 *
674 * Returns the value of indirect register @reg_addr
675 */
676 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
677 u32 pcie_index, u32 pcie_data,
678 u32 reg_addr)
679 {
680 unsigned long flags;
681 u32 r;
682 void __iomem *pcie_index_offset;
683 void __iomem *pcie_data_offset;
684
685 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
686 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
687 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
688
689 writel(reg_addr, pcie_index_offset);
690 readl(pcie_index_offset);
691 r = readl(pcie_data_offset);
692 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
693
694 return r;
695 }
696
697 /**
698 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
699 *
700 * @adev: amdgpu_device pointer
701 * @pcie_index: mmio register offset
702 * @pcie_data: mmio register offset
703 * @reg_addr: indirect register address to read from
704 *
705 * Returns the value of indirect register @reg_addr
706 */
707 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
708 u32 pcie_index, u32 pcie_data,
709 u32 reg_addr)
710 {
711 unsigned long flags;
712 u64 r;
713 void __iomem *pcie_index_offset;
714 void __iomem *pcie_data_offset;
715
716 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
717 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
718 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
719
720 /* read low 32 bits */
721 writel(reg_addr, pcie_index_offset);
722 readl(pcie_index_offset);
723 r = readl(pcie_data_offset);
724 /* read high 32 bits */
725 writel(reg_addr + 4, pcie_index_offset);
726 readl(pcie_index_offset);
727 r |= ((u64)readl(pcie_data_offset) << 32);
728 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
729
730 return r;
731 }
732
733 /**
734 * amdgpu_device_indirect_wreg - write an indirect register address
735 *
736 * @adev: amdgpu_device pointer
737 * @pcie_index: mmio register offset
738 * @pcie_data: mmio register offset
739 * @reg_addr: indirect register offset
740 * @reg_data: indirect register data
741 *
742 */
743 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
744 u32 pcie_index, u32 pcie_data,
745 u32 reg_addr, u32 reg_data)
746 {
747 unsigned long flags;
748 void __iomem *pcie_index_offset;
749 void __iomem *pcie_data_offset;
750
751 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
752 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
753 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
754
755 writel(reg_addr, pcie_index_offset);
756 readl(pcie_index_offset);
757 writel(reg_data, pcie_data_offset);
758 readl(pcie_data_offset);
759 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
760 }
761
762 /**
763 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
764 *
765 * @adev: amdgpu_device pointer
766 * @pcie_index: mmio register offset
767 * @pcie_data: mmio register offset
768 * @reg_addr: indirect register offset
769 * @reg_data: indirect register data
770 *
771 */
772 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
773 u32 pcie_index, u32 pcie_data,
774 u32 reg_addr, u64 reg_data)
775 {
776 unsigned long flags;
777 void __iomem *pcie_index_offset;
778 void __iomem *pcie_data_offset;
779
780 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
781 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
782 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
783
784 /* write low 32 bits */
785 writel(reg_addr, pcie_index_offset);
786 readl(pcie_index_offset);
787 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
788 readl(pcie_data_offset);
789 /* write high 32 bits */
790 writel(reg_addr + 4, pcie_index_offset);
791 readl(pcie_index_offset);
792 writel((u32)(reg_data >> 32), pcie_data_offset);
793 readl(pcie_data_offset);
794 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
795 }
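/*
 * Illustrative sketch (not part of the driver): an ASIC file can implement
 * its adev->pcie_rreg callback on top of the indirect helpers above. The
 * example_* names are hypothetical; real implementations typically obtain
 * the index/data register offsets from their NBIO callbacks.
 *
 *	static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		u32 pcie_index = example_get_pcie_index_offset(adev);
 *		u32 pcie_data = example_get_pcie_data_offset(adev);
 *
 *		return amdgpu_device_indirect_rreg(adev, pcie_index,
 *						   pcie_data, reg);
 *	}
 */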
796
797 /**
798 * amdgpu_invalid_rreg - dummy reg read function
799 *
800 * @adev: amdgpu_device pointer
801 * @reg: offset of register
802 *
803 * Dummy register read function. Used for register blocks
804 * that certain asics don't have (all asics).
805 * Returns the value in the register.
806 */
807 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
808 {
809 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
810 BUG();
811 return 0;
812 }
813
814 /**
815 * amdgpu_invalid_wreg - dummy reg write function
816 *
817 * @adev: amdgpu_device pointer
818 * @reg: offset of register
819 * @v: value to write to the register
820 *
821 * Dummy register write function. Used for register blocks
822 * that certain asics don't have (all asics).
823 */
824 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
825 {
826 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
827 reg, v);
828 BUG();
829 }
830
831 /**
832 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
833 *
834 * @adev: amdgpu_device pointer
835 * @reg: offset of register
836 *
837 * Dummy register read function. Used for register blocks
838 * that certain asics don't have (all asics).
839 * Returns the value in the register.
840 */
841 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
842 {
843 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
844 BUG();
845 return 0;
846 }
847
848 /**
849 * amdgpu_invalid_wreg64 - dummy reg write function
850 *
851 * @adev: amdgpu_device pointer
852 * @reg: offset of register
853 * @v: value to write to the register
854 *
855 * Dummy register write function. Used for register blocks
856 * that certain asics don't have (all asics).
857 */
858 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
859 {
860 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
861 reg, v);
862 BUG();
863 }
864
865 /**
866 * amdgpu_block_invalid_rreg - dummy reg read function
867 *
868 * @adev: amdgpu_device pointer
869 * @block: offset of instance
870 * @reg: offset of register
871 *
872 * Dummy register read function. Used for register blocks
873 * that certain asics don't have (all asics).
874 * Returns the value in the register.
875 */
876 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
877 uint32_t block, uint32_t reg)
878 {
879 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
880 reg, block);
881 BUG();
882 return 0;
883 }
884
885 /**
886 * amdgpu_block_invalid_wreg - dummy reg write function
887 *
888 * @adev: amdgpu_device pointer
889 * @block: offset of instance
890 * @reg: offset of register
891 * @v: value to write to the register
892 *
893 * Dummy register write function. Used for register blocks
894 * that certain asics don't have (all asics).
895 */
896 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
897 uint32_t block,
898 uint32_t reg, uint32_t v)
899 {
900 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
901 reg, block, v);
902 BUG();
903 }
904
905 /**
906 * amdgpu_device_asic_init - Wrapper for atom asic_init
907 *
908 * @adev: amdgpu_device pointer
909 *
910 * Does any asic specific work and then calls atom asic init.
911 */
912 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
913 {
914 amdgpu_asic_pre_asic_init(adev);
915
916 if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
917 return amdgpu_atomfirmware_asic_init(adev, true);
918 else
919 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
920 }
921
922 /**
923 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
924 *
925 * @adev: amdgpu_device pointer
926 *
927 * Allocates a scratch page of VRAM for use by various things in the
928 * driver.
929 */
930 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
931 {
932 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
933 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
934 &adev->vram_scratch.robj,
935 &adev->vram_scratch.gpu_addr,
936 (void **)&adev->vram_scratch.ptr);
937 }
938
939 /**
940 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
941 *
942 * @adev: amdgpu_device pointer
943 *
944 * Frees the VRAM scratch page.
945 */
946 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
947 {
948 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
949 }
950
951 /**
952 * amdgpu_device_program_register_sequence - program an array of registers.
953 *
954 * @adev: amdgpu_device pointer
955 * @registers: pointer to the register array
956 * @array_size: size of the register array
957 *
958 * Programs an array of registers with AND and OR masks.
959 * This is a helper for setting golden registers.
960 */
961 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
962 const u32 *registers,
963 const u32 array_size)
964 {
965 u32 tmp, reg, and_mask, or_mask;
966 int i;
967
968 if (array_size % 3)
969 return;
970
971 for (i = 0; i < array_size; i += 3) {
972 reg = registers[i + 0];
973 and_mask = registers[i + 1];
974 or_mask = registers[i + 2];
975
976 if (and_mask == 0xffffffff) {
977 tmp = or_mask;
978 } else {
979 tmp = RREG32(reg);
980 tmp &= ~and_mask;
981 if (adev->family >= AMDGPU_FAMILY_AI)
982 tmp |= (or_mask & and_mask);
983 else
984 tmp |= or_mask;
985 }
986 WREG32(reg, tmp);
987 }
988 }
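/*
 * Illustrative sketch (not part of the driver): a golden-register table is a
 * flat array of (offset, AND mask, OR mask) triplets. The offsets and values
 * below are hypothetical placeholders.
 *
 *	static const u32 example_golden_settings[] = {
 *		0x0123, 0xffffffff, 0x00000001,
 *		0x0456, 0x0000ff00, 0x00003400,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */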
989
990 /**
991 * amdgpu_device_pci_config_reset - reset the GPU
992 *
993 * @adev: amdgpu_device pointer
994 *
995 * Resets the GPU using the pci config reset sequence.
996 * Only applicable to asics prior to vega10.
997 */
998 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
999 {
1000 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1001 }
1002
1003 /**
1004 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1005 *
1006 * @adev: amdgpu_device pointer
1007 *
1008 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1009 */
1010 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1011 {
1012 return pci_reset_function(adev->pdev);
1013 }
1014
1015 /*
1016 * GPU doorbell aperture helpers function.
1017 */
1018 /**
1019 * amdgpu_device_doorbell_init - Init doorbell driver information.
1020 *
1021 * @adev: amdgpu_device pointer
1022 *
1023 * Init doorbell driver information (CIK)
1024 * Returns 0 on success, error on failure.
1025 */
1026 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
1027 {
1028
1029 /* No doorbell on SI hardware generation */
1030 if (adev->asic_type < CHIP_BONAIRE) {
1031 adev->doorbell.base = 0;
1032 adev->doorbell.size = 0;
1033 adev->doorbell.num_doorbells = 0;
1034 adev->doorbell.ptr = NULL;
1035 return 0;
1036 }
1037
1038 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
1039 return -EINVAL;
1040
1041 amdgpu_asic_init_doorbell_index(adev);
1042
1043 /* doorbell bar mapping */
1044 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
1045 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
1046
1047 if (adev->enable_mes) {
1048 adev->doorbell.num_doorbells =
1049 adev->doorbell.size / sizeof(u32);
1050 } else {
1051 adev->doorbell.num_doorbells =
1052 min_t(u32, adev->doorbell.size / sizeof(u32),
1053 adev->doorbell_index.max_assignment+1);
1054 if (adev->doorbell.num_doorbells == 0)
1055 return -EINVAL;
1056
1057 /* For Vega, reserve and map two pages on doorbell BAR since SDMA
1058 * paging queue doorbells use the second page. The
1059 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
1060 * doorbells are in the first page. So with the paging queue enabled,
1061 * the max num_doorbells needs an extra page (0x400 dwords).
1062 */
1063 if (adev->asic_type >= CHIP_VEGA10)
1064 adev->doorbell.num_doorbells += 0x400;
1065 }
1066
1067 adev->doorbell.ptr = ioremap(adev->doorbell.base,
1068 adev->doorbell.num_doorbells *
1069 sizeof(u32));
1070 if (adev->doorbell.ptr == NULL)
1071 return -ENOMEM;
1072
1073 return 0;
1074 }
1075
1076 /**
1077 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
1078 *
1079 * @adev: amdgpu_device pointer
1080 *
1081 * Tear down doorbell driver information (CIK)
1082 */
1083 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
1084 {
1085 iounmap(adev->doorbell.ptr);
1086 adev->doorbell.ptr = NULL;
1087 }
1088
1089
1090
1091 /*
1092 * amdgpu_device_wb_*()
1093 * Writeback is the method by which the GPU updates special pages in memory
1094 * with the status of certain GPU events (fences, ring pointers, etc.).
1095 */
1096
1097 /**
1098 * amdgpu_device_wb_fini - Disable Writeback and free memory
1099 *
1100 * @adev: amdgpu_device pointer
1101 *
1102 * Disables Writeback and frees the Writeback memory (all asics).
1103 * Used at driver shutdown.
1104 */
1105 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1106 {
1107 if (adev->wb.wb_obj) {
1108 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1109 &adev->wb.gpu_addr,
1110 (void **)&adev->wb.wb);
1111 adev->wb.wb_obj = NULL;
1112 }
1113 }
1114
1115 /**
1116 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1117 *
1118 * @adev: amdgpu_device pointer
1119 *
1120 * Initializes writeback and allocates writeback memory (all asics).
1121 * Used at driver startup.
1122 * Returns 0 on success or a negative error code on failure.
1123 */
1124 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1125 {
1126 int r;
1127
1128 if (adev->wb.wb_obj == NULL) {
1129 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1130 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1131 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1132 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1133 (void **)&adev->wb.wb);
1134 if (r) {
1135 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1136 return r;
1137 }
1138
1139 adev->wb.num_wb = AMDGPU_MAX_WB;
1140 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1141
1142 /* clear wb memory */
1143 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1144 }
1145
1146 return 0;
1147 }
1148
1149 /**
1150 * amdgpu_device_wb_get - Allocate a wb entry
1151 *
1152 * @adev: amdgpu_device pointer
1153 * @wb: wb index
1154 *
1155 * Allocate a wb slot for use by the driver (all asics).
1156 * Returns 0 on success or -EINVAL on failure.
1157 */
1158 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1159 {
1160 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1161
1162 if (offset < adev->wb.num_wb) {
1163 __set_bit(offset, adev->wb.used);
1164 *wb = offset << 3; /* convert to dw offset */
1165 return 0;
1166 } else {
1167 return -EINVAL;
1168 }
1169 }
1170
1171 /**
1172 * amdgpu_device_wb_free - Free a wb entry
1173 *
1174 * @adev: amdgpu_device pointer
1175 * @wb: wb index
1176 *
1177 * Free a wb slot allocated for use by the driver (all asics)
1178 */
1179 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1180 {
1181 wb >>= 3;
1182 if (wb < adev->wb.num_wb)
1183 __clear_bit(wb, adev->wb.used);
1184 }
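/*
 * Illustrative sketch (not part of the driver): allocate a writeback slot,
 * derive its GPU address, and free it again. Error handling and the hardware
 * side are omitted; the variable names are hypothetical.
 *
 *	u32 wb_idx;
 *	u64 wb_gpu_addr;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb_idx)) {
 *		wb_gpu_addr = adev->wb.gpu_addr + wb_idx * 4;
 *		...hand wb_gpu_addr to an engine, read back adev->wb.wb[wb_idx]...
 *		amdgpu_device_wb_free(adev, wb_idx);
 *	}
 */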
1185
1186 /**
1187 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1188 *
1189 * @adev: amdgpu_device pointer
1190 *
1191 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1192 * to fail, but if any of the BARs is not accessible after the resize we abort
1193 * driver loading by returning -ENODEV.
1194 */
1195 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1196 {
1197 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1198 struct pci_bus *root;
1199 struct resource *res;
1200 unsigned i;
1201 u16 cmd;
1202 int r;
1203
1204 /* Bypass for VF */
1205 if (amdgpu_sriov_vf(adev))
1206 return 0;
1207
1208 /* skip if the bios has already enabled large BAR */
1209 if (adev->gmc.real_vram_size &&
1210 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1211 return 0;
1212
1213 /* Check if the root BUS has 64bit memory resources */
1214 root = adev->pdev->bus;
1215 while (root->parent)
1216 root = root->parent;
1217
1218 pci_bus_for_each_resource(root, res, i) {
1219 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1220 res->start > 0x100000000ull)
1221 break;
1222 }
1223
1224 /* Trying to resize is pointless without a root hub window above 4GB */
1225 if (!res)
1226 return 0;
1227
1228 /* Limit the BAR size to what is available */
1229 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1230 rbar_size);
1231
1232 /* Disable memory decoding while we change the BAR addresses and size */
1233 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1234 pci_write_config_word(adev->pdev, PCI_COMMAND,
1235 cmd & ~PCI_COMMAND_MEMORY);
1236
1237 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1238 amdgpu_device_doorbell_fini(adev);
1239 if (adev->asic_type >= CHIP_BONAIRE)
1240 pci_release_resource(adev->pdev, 2);
1241
1242 pci_release_resource(adev->pdev, 0);
1243
1244 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1245 if (r == -ENOSPC)
1246 DRM_INFO("Not enough PCI address space for a large BAR.");
1247 else if (r && r != -ENOTSUPP)
1248 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1249
1250 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1251
1252 /* When the doorbell or fb BAR isn't available we have no chance of
1253 * using the device.
1254 */
1255 r = amdgpu_device_doorbell_init(adev);
1256 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1257 return -ENODEV;
1258
1259 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1260
1261 return 0;
1262 }
1263
1264 /*
1265 * GPU helpers function.
1266 */
1267 /**
1268 * amdgpu_device_need_post - check if the hw need post or not
1269 *
1270 * @adev: amdgpu_device pointer
1271 *
1272 * Check if the asic has been initialized (all asics) at driver startup
1273 * or post is needed if hw reset is performed.
1274 * Returns true if need or false if not.
1275 */
1276 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1277 {
1278 uint32_t reg;
1279
1280 if (amdgpu_sriov_vf(adev))
1281 return false;
1282
1283 if (amdgpu_passthrough(adev)) {
1284 /* for FIJI: in the whole-GPU pass-through virtualization case, after a VM
1285 * reboot some old SMC firmware still needs the driver to do a vPost or the
1286 * GPU hangs. SMC firmware versions above 22.15 don't have this flaw, so
1287 * force vPost for SMC versions below 22.15.
1288 */
1289 if (adev->asic_type == CHIP_FIJI) {
1290 int err;
1291 uint32_t fw_ver;
1292 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1293 /* force vPost if an error occurred */
1294 if (err)
1295 return true;
1296
1297 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1298 if (fw_ver < 0x00160e00)
1299 return true;
1300 }
1301 }
1302
1303 /* Don't post if we need to reset whole hive on init */
1304 if (adev->gmc.xgmi.pending_reset)
1305 return false;
1306
1307 if (adev->has_hw_reset) {
1308 adev->has_hw_reset = false;
1309 return true;
1310 }
1311
1312 /* bios scratch used on CIK+ */
1313 if (adev->asic_type >= CHIP_BONAIRE)
1314 return amdgpu_atombios_scratch_need_asic_init(adev);
1315
1316 /* check MEM_SIZE for older asics */
1317 reg = amdgpu_asic_get_config_memsize(adev);
1318
1319 if ((reg != 0) && (reg != 0xffffffff))
1320 return false;
1321
1322 return true;
1323 }
1324
1325 /**
1326 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1327 *
1328 * @adev: amdgpu_device pointer
1329 *
1330 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1331 * be set for this device.
1332 *
1333 * Returns true if it should be used or false if not.
1334 */
1335 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1336 {
1337 switch (amdgpu_aspm) {
1338 case -1:
1339 break;
1340 case 0:
1341 return false;
1342 case 1:
1343 return true;
1344 default:
1345 return false;
1346 }
1347 return pcie_aspm_enabled(adev->pdev);
1348 }
1349
1350 /* if we get transitioned to only one device, take VGA back */
1351 /**
1352 * amdgpu_device_vga_set_decode - enable/disable vga decode
1353 *
1354 * @pdev: PCI device pointer
1355 * @state: enable/disable vga decode
1356 *
1357 * Enable/disable vga decode (all asics).
1358 * Returns VGA resource flags.
1359 */
1360 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1361 bool state)
1362 {
1363 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1364 amdgpu_asic_set_vga_state(adev, state);
1365 if (state)
1366 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1367 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1368 else
1369 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1370 }
1371
1372 /**
1373 * amdgpu_device_check_block_size - validate the vm block size
1374 *
1375 * @adev: amdgpu_device pointer
1376 *
1377 * Validates the vm block size specified via module parameter.
1378 * The vm block size defines the number of bits in the page table versus the page directory;
1379 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1380 * page table and the remaining bits are in the page directory.
1381 */
1382 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1383 {
1384 /* defines number of bits in page table versus page directory,
1385 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1386 * page table and the remaining bits are in the page directory */
1387 if (amdgpu_vm_block_size == -1)
1388 return;
1389
1390 if (amdgpu_vm_block_size < 9) {
1391 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1392 amdgpu_vm_block_size);
1393 amdgpu_vm_block_size = -1;
1394 }
1395 }
1396
1397 /**
1398 * amdgpu_device_check_vm_size - validate the vm size
1399 *
1400 * @adev: amdgpu_device pointer
1401 *
1402 * Validates the vm size in GB specified via module parameter.
1403 * The VM size is the size of the GPU virtual memory space in GB.
1404 */
1405 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1406 {
1407 /* no need to check the default value */
1408 if (amdgpu_vm_size == -1)
1409 return;
1410
1411 if (amdgpu_vm_size < 1) {
1412 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1413 amdgpu_vm_size);
1414 amdgpu_vm_size = -1;
1415 }
1416 }
1417
1418 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1419 {
1420 struct sysinfo si;
1421 bool is_os_64 = (sizeof(void *) == 8);
1422 uint64_t total_memory;
1423 uint64_t dram_size_seven_GB = 0x1B8000000;
1424 uint64_t dram_size_three_GB = 0xB8000000;
1425
1426 if (amdgpu_smu_memory_pool_size == 0)
1427 return;
1428
1429 if (!is_os_64) {
1430 DRM_WARN("Not 64-bit OS, feature not supported\n");
1431 goto def_value;
1432 }
1433 si_meminfo(&si);
1434 total_memory = (uint64_t)si.totalram * si.mem_unit;
1435
1436 if ((amdgpu_smu_memory_pool_size == 1) ||
1437 (amdgpu_smu_memory_pool_size == 2)) {
1438 if (total_memory < dram_size_three_GB)
1439 goto def_value1;
1440 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1441 (amdgpu_smu_memory_pool_size == 8)) {
1442 if (total_memory < dram_size_seven_GB)
1443 goto def_value1;
1444 } else {
1445 DRM_WARN("Smu memory pool size not supported\n");
1446 goto def_value;
1447 }
1448 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1449
1450 return;
1451
1452 def_value1:
1453 DRM_WARN("Not enough system memory\n");
1454 def_value:
1455 adev->pm.smu_prv_buffer_size = 0;
1456 }
1457
1458 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1459 {
1460 if (!(adev->flags & AMD_IS_APU) ||
1461 adev->asic_type < CHIP_RAVEN)
1462 return 0;
1463
1464 switch (adev->asic_type) {
1465 case CHIP_RAVEN:
1466 if (adev->pdev->device == 0x15dd)
1467 adev->apu_flags |= AMD_APU_IS_RAVEN;
1468 if (adev->pdev->device == 0x15d8)
1469 adev->apu_flags |= AMD_APU_IS_PICASSO;
1470 break;
1471 case CHIP_RENOIR:
1472 if ((adev->pdev->device == 0x1636) ||
1473 (adev->pdev->device == 0x164c))
1474 adev->apu_flags |= AMD_APU_IS_RENOIR;
1475 else
1476 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1477 break;
1478 case CHIP_VANGOGH:
1479 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1480 break;
1481 case CHIP_YELLOW_CARP:
1482 break;
1483 case CHIP_CYAN_SKILLFISH:
1484 if ((adev->pdev->device == 0x13FE) ||
1485 (adev->pdev->device == 0x143F))
1486 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1487 break;
1488 default:
1489 break;
1490 }
1491
1492 return 0;
1493 }
1494
1495 /**
1496 * amdgpu_device_check_arguments - validate module params
1497 *
1498 * @adev: amdgpu_device pointer
1499 *
1500 * Validates certain module parameters and updates
1501 * the associated values used by the driver (all asics).
1502 */
1503 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1504 {
1505 if (amdgpu_sched_jobs < 4) {
1506 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1507 amdgpu_sched_jobs);
1508 amdgpu_sched_jobs = 4;
1509 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1510 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1511 amdgpu_sched_jobs);
1512 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1513 }
1514
1515 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1516 /* gart size must be greater or equal to 32M */
1517 dev_warn(adev->dev, "gart size (%d) too small\n",
1518 amdgpu_gart_size);
1519 amdgpu_gart_size = -1;
1520 }
1521
1522 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1523 /* gtt size must be greater or equal to 32M */
1524 dev_warn(adev->dev, "gtt size (%d) too small\n",
1525 amdgpu_gtt_size);
1526 amdgpu_gtt_size = -1;
1527 }
1528
1529 /* valid range is between 4 and 9 inclusive */
1530 if (amdgpu_vm_fragment_size != -1 &&
1531 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1532 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1533 amdgpu_vm_fragment_size = -1;
1534 }
1535
1536 if (amdgpu_sched_hw_submission < 2) {
1537 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1538 amdgpu_sched_hw_submission);
1539 amdgpu_sched_hw_submission = 2;
1540 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1541 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1542 amdgpu_sched_hw_submission);
1543 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1544 }
1545
1546 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1547 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1548 amdgpu_reset_method = -1;
1549 }
1550
1551 amdgpu_device_check_smu_prv_buffer_size(adev);
1552
1553 amdgpu_device_check_vm_size(adev);
1554
1555 amdgpu_device_check_block_size(adev);
1556
1557 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1558
1559 return 0;
1560 }
1561
1562 /**
1563 * amdgpu_switcheroo_set_state - set switcheroo state
1564 *
1565 * @pdev: pci dev pointer
1566 * @state: vga_switcheroo state
1567 *
1568 * Callback for the switcheroo driver. Suspends or resumes the
1569 * asics before or after it is powered up using ACPI methods.
1570 */
1571 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1572 enum vga_switcheroo_state state)
1573 {
1574 struct drm_device *dev = pci_get_drvdata(pdev);
1575 int r;
1576
1577 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1578 return;
1579
1580 if (state == VGA_SWITCHEROO_ON) {
1581 pr_info("switched on\n");
1582 /* don't suspend or resume card normally */
1583 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1584
1585 pci_set_power_state(pdev, PCI_D0);
1586 amdgpu_device_load_pci_state(pdev);
1587 r = pci_enable_device(pdev);
1588 if (r)
1589 DRM_WARN("pci_enable_device failed (%d)\n", r);
1590 amdgpu_device_resume(dev, true);
1591
1592 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1593 } else {
1594 pr_info("switched off\n");
1595 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1596 amdgpu_device_suspend(dev, true);
1597 amdgpu_device_cache_pci_state(pdev);
1598 /* Shut down the device */
1599 pci_disable_device(pdev);
1600 pci_set_power_state(pdev, PCI_D3cold);
1601 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1602 }
1603 }
1604
1605 /**
1606 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1607 *
1608 * @pdev: pci dev pointer
1609 *
1610 * Callback for the switcheroo driver. Check if the switcheroo
1611 * state can be changed.
1612 * Returns true if the state can be changed, false if not.
1613 */
1614 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1615 {
1616 struct drm_device *dev = pci_get_drvdata(pdev);
1617
1618 /*
1619 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1620 * locking inversion with the driver load path. And the access here is
1621 * completely racy anyway. So don't bother with locking for now.
1622 */
1623 return atomic_read(&dev->open_count) == 0;
1624 }
1625
1626 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1627 .set_gpu_state = amdgpu_switcheroo_set_state,
1628 .reprobe = NULL,
1629 .can_switch = amdgpu_switcheroo_can_switch,
1630 };
1631
1632 /**
1633 * amdgpu_device_ip_set_clockgating_state - set the CG state
1634 *
1635 * @dev: amdgpu_device pointer
1636 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1637 * @state: clockgating state (gate or ungate)
1638 *
1639 * Sets the requested clockgating state for all instances of
1640 * the hardware IP specified.
1641 * Returns the error code from the last instance.
1642 */
1643 int amdgpu_device_ip_set_clockgating_state(void *dev,
1644 enum amd_ip_block_type block_type,
1645 enum amd_clockgating_state state)
1646 {
1647 struct amdgpu_device *adev = dev;
1648 int i, r = 0;
1649
1650 for (i = 0; i < adev->num_ip_blocks; i++) {
1651 if (!adev->ip_blocks[i].status.valid)
1652 continue;
1653 if (adev->ip_blocks[i].version->type != block_type)
1654 continue;
1655 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1656 continue;
1657 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1658 (void *)adev, state);
1659 if (r)
1660 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1661 adev->ip_blocks[i].version->funcs->name, r);
1662 }
1663 return r;
1664 }
1665
1666 /**
1667 * amdgpu_device_ip_set_powergating_state - set the PG state
1668 *
1669 * @dev: amdgpu_device pointer
1670 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1671 * @state: powergating state (gate or ungate)
1672 *
1673 * Sets the requested powergating state for all instances of
1674 * the hardware IP specified.
1675 * Returns the error code from the last instance.
1676 */
1677 int amdgpu_device_ip_set_powergating_state(void *dev,
1678 enum amd_ip_block_type block_type,
1679 enum amd_powergating_state state)
1680 {
1681 struct amdgpu_device *adev = dev;
1682 int i, r = 0;
1683
1684 for (i = 0; i < adev->num_ip_blocks; i++) {
1685 if (!adev->ip_blocks[i].status.valid)
1686 continue;
1687 if (adev->ip_blocks[i].version->type != block_type)
1688 continue;
1689 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1690 continue;
1691 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1692 (void *)adev, state);
1693 if (r)
1694 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1695 adev->ip_blocks[i].version->funcs->name, r);
1696 }
1697 return r;
1698 }
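/*
 * Illustrative sketch (not part of the driver): gate clocks and power for
 * every instance of the GFX block, using the standard block-type and
 * gate/ungate enums from amd_shared.h.
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 *	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_PG_STATE_GATE);
 */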
1699
1700 /**
1701 * amdgpu_device_ip_get_clockgating_state - get the CG state
1702 *
1703 * @adev: amdgpu_device pointer
1704 * @flags: clockgating feature flags
1705 *
1706 * Walks the list of IPs on the device and updates the clockgating
1707 * flags for each IP.
1708 * Updates @flags with the feature flags for each hardware IP where
1709 * clockgating is enabled.
1710 */
1711 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1712 u64 *flags)
1713 {
1714 int i;
1715
1716 for (i = 0; i < adev->num_ip_blocks; i++) {
1717 if (!adev->ip_blocks[i].status.valid)
1718 continue;
1719 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1720 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1721 }
1722 }
1723
1724 /**
1725 * amdgpu_device_ip_wait_for_idle - wait for idle
1726 *
1727 * @adev: amdgpu_device pointer
1728 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1729 *
1730 * Waits for the requested hardware IP to be idle.
1731 * Returns 0 for success or a negative error code on failure.
1732 */
1733 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1734 enum amd_ip_block_type block_type)
1735 {
1736 int i, r;
1737
1738 for (i = 0; i < adev->num_ip_blocks; i++) {
1739 if (!adev->ip_blocks[i].status.valid)
1740 continue;
1741 if (adev->ip_blocks[i].version->type == block_type) {
1742 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1743 if (r)
1744 return r;
1745 break;
1746 }
1747 }
1748 return 0;
1749
1750 }
1751
1752 /**
1753 * amdgpu_device_ip_is_idle - is the hardware IP idle
1754 *
1755 * @adev: amdgpu_device pointer
1756 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1757 *
1758 * Check if the hardware IP is idle or not.
1759 * Returns true if the IP is idle, false if not.
1760 */
1761 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1762 enum amd_ip_block_type block_type)
1763 {
1764 int i;
1765
1766 for (i = 0; i < adev->num_ip_blocks; i++) {
1767 if (!adev->ip_blocks[i].status.valid)
1768 continue;
1769 if (adev->ip_blocks[i].version->type == block_type)
1770 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1771 }
1772 return true;
1773
1774 }
1775
1776 /**
1777 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1778 *
1779 * @adev: amdgpu_device pointer
1780 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1781 *
1782 * Returns a pointer to the hardware IP block structure
1783 * if it exists for the asic, otherwise NULL.
1784 */
1785 struct amdgpu_ip_block *
1786 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1787 enum amd_ip_block_type type)
1788 {
1789 int i;
1790
1791 for (i = 0; i < adev->num_ip_blocks; i++)
1792 if (adev->ip_blocks[i].version->type == type)
1793 return &adev->ip_blocks[i];
1794
1795 return NULL;
1796 }
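
/*
 * Example (illustrative sketch): looking up the GMC block and checking its
 * major version:
 *
 *	struct amdgpu_ip_block *ip_block =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GMC);
 *
 *	if (ip_block && ip_block->version->major >= 9)
 *		; // GMC v9 or newer is present
 */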
1797
1798 /**
1799 * amdgpu_device_ip_block_version_cmp
1800 *
1801 * @adev: amdgpu_device pointer
1802 * @type: enum amd_ip_block_type
1803 * @major: major version
1804 * @minor: minor version
1805 *
1806 * Returns 0 if the IP block's version is greater than or equal to the given
1807 * version, 1 if it is smaller or the ip_block doesn't exist.
1808 */
1809 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1810 enum amd_ip_block_type type,
1811 u32 major, u32 minor)
1812 {
1813 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1814
1815 if (ip_block && ((ip_block->version->major > major) ||
1816 ((ip_block->version->major == major) &&
1817 (ip_block->version->minor >= minor))))
1818 return 0;
1819
1820 return 1;
1821 }
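
/*
 * Example (illustrative sketch): gating a workaround on a minimum GFX IP
 * version could be written as:
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       8, 0) == 0)
 *		; // GFX IP is version 8.0 or newer
 */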
1822
1823 /**
1824 * amdgpu_device_ip_block_add
1825 *
1826 * @adev: amdgpu_device pointer
1827 * @ip_block_version: pointer to the IP to add
1828 *
1829 * Adds the IP block driver information to the collection of IPs
1830 * on the asic.
1831 */
1832 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1833 const struct amdgpu_ip_block_version *ip_block_version)
1834 {
1835 if (!ip_block_version)
1836 return -EINVAL;
1837
1838 switch (ip_block_version->type) {
1839 case AMD_IP_BLOCK_TYPE_VCN:
1840 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1841 return 0;
1842 break;
1843 case AMD_IP_BLOCK_TYPE_JPEG:
1844 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1845 return 0;
1846 break;
1847 default:
1848 break;
1849 }
1850
1851 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1852 ip_block_version->funcs->name);
1853
1854 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1855
1856 return 0;
1857 }
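
/*
 * Example (illustrative sketch, block names taken from vi.c purely for
 * illustration): an ASIC setup function such as vi_set_ip_blocks() registers
 * its IP blocks in initialization order, roughly:
 *
 *	r = amdgpu_device_ip_block_add(adev, &vi_common_ip_block);
 *	if (r)
 *		return r;
 *	// ... followed by the GMC, IH, PSP/SMC, DCE, GFX, SDMA, ... blocks
 */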
1858
1859 /**
1860 * amdgpu_device_enable_virtual_display - enable virtual display feature
1861 *
1862 * @adev: amdgpu_device pointer
1863 *
1864 * Enables the virtual display feature if the user has enabled it via
1865 * the module parameter virtual_display. This feature provides virtual
1866 * display hardware on headless boards or in virtualized environments.
1867 * This function parses and validates the configuration string specified by
1868 * the user and configures the virtual display settings (number of
1869 * virtual connectors, crtcs, etc.) specified.
1870 */
1871 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1872 {
1873 adev->enable_virtual_display = false;
1874
1875 if (amdgpu_virtual_display) {
1876 const char *pci_address_name = pci_name(adev->pdev);
1877 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1878
1879 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1880 pciaddstr_tmp = pciaddstr;
1881 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1882 pciaddname = strsep(&pciaddname_tmp, ",");
1883 if (!strcmp("all", pciaddname)
1884 || !strcmp(pci_address_name, pciaddname)) {
1885 long num_crtc;
1886 int res = -1;
1887
1888 adev->enable_virtual_display = true;
1889
1890 if (pciaddname_tmp)
1891 res = kstrtol(pciaddname_tmp, 10,
1892 &num_crtc);
1893
1894 if (!res) {
1895 if (num_crtc < 1)
1896 num_crtc = 1;
1897 if (num_crtc > 6)
1898 num_crtc = 6;
1899 adev->mode_info.num_crtc = num_crtc;
1900 } else {
1901 adev->mode_info.num_crtc = 1;
1902 }
1903 break;
1904 }
1905 }
1906
1907 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1908 amdgpu_virtual_display, pci_address_name,
1909 adev->enable_virtual_display, adev->mode_info.num_crtc);
1910
1911 kfree(pciaddstr);
1912 }
1913 }
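
/*
 * Example (illustrative): based on the parsing above, the virtual_display
 * module parameter is a semicolon separated list of PCI addresses (or "all"),
 * each optionally followed by a crtc count, e.g.:
 *
 *	amdgpu.virtual_display=0000:03:00.0,2
 *
 * which enables virtual display with two virtual crtcs on that device
 * (the count is clamped to the 1..6 range).
 */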
1914
1915 /**
1916 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1917 *
1918 * @adev: amdgpu_device pointer
1919 *
1920 * Parses the asic configuration parameters specified in the gpu info
1921 * firmware and makes them available to the driver for use in configuring
1922 * the asic.
1923 * Returns 0 on success, -EINVAL on failure.
1924 */
1925 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1926 {
1927 const char *chip_name;
1928 char fw_name[40];
1929 int err;
1930 const struct gpu_info_firmware_header_v1_0 *hdr;
1931
1932 adev->firmware.gpu_info_fw = NULL;
1933
1934 if (adev->mman.discovery_bin) {
1935 /*
1936 * FIXME: The bounding box is still needed by Navi12, so
1937 * temporarily read it from gpu_info firmware. Should be dropped
1938 * when DAL no longer needs it.
1939 */
1940 if (adev->asic_type != CHIP_NAVI12)
1941 return 0;
1942 }
1943
1944 switch (adev->asic_type) {
1945 #ifdef CONFIG_DRM_AMDGPU_SI
1946 case CHIP_VERDE:
1947 case CHIP_TAHITI:
1948 case CHIP_PITCAIRN:
1949 case CHIP_OLAND:
1950 case CHIP_HAINAN:
1951 #endif
1952 #ifdef CONFIG_DRM_AMDGPU_CIK
1953 case CHIP_BONAIRE:
1954 case CHIP_HAWAII:
1955 case CHIP_KAVERI:
1956 case CHIP_KABINI:
1957 case CHIP_MULLINS:
1958 #endif
1959 case CHIP_TOPAZ:
1960 case CHIP_TONGA:
1961 case CHIP_FIJI:
1962 case CHIP_POLARIS10:
1963 case CHIP_POLARIS11:
1964 case CHIP_POLARIS12:
1965 case CHIP_VEGAM:
1966 case CHIP_CARRIZO:
1967 case CHIP_STONEY:
1968 case CHIP_VEGA20:
1969 case CHIP_ALDEBARAN:
1970 case CHIP_SIENNA_CICHLID:
1971 case CHIP_NAVY_FLOUNDER:
1972 case CHIP_DIMGREY_CAVEFISH:
1973 case CHIP_BEIGE_GOBY:
1974 default:
1975 return 0;
1976 case CHIP_VEGA10:
1977 chip_name = "vega10";
1978 break;
1979 case CHIP_VEGA12:
1980 chip_name = "vega12";
1981 break;
1982 case CHIP_RAVEN:
1983 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1984 chip_name = "raven2";
1985 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1986 chip_name = "picasso";
1987 else
1988 chip_name = "raven";
1989 break;
1990 case CHIP_ARCTURUS:
1991 chip_name = "arcturus";
1992 break;
1993 case CHIP_NAVI12:
1994 chip_name = "navi12";
1995 break;
1996 }
1997
1998 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1999 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
2000 if (err) {
2001 dev_err(adev->dev,
2002 "Failed to load gpu_info firmware \"%s\"\n",
2003 fw_name);
2004 goto out;
2005 }
2006 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
2007 if (err) {
2008 dev_err(adev->dev,
2009 "Failed to validate gpu_info firmware \"%s\"\n",
2010 fw_name);
2011 goto out;
2012 }
2013
2014 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2015 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2016
2017 switch (hdr->version_major) {
2018 case 1:
2019 {
2020 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2021 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2022 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2023
2024 /*
2025 * Should be dropped when DAL no longer needs it.
2026 */
2027 if (adev->asic_type == CHIP_NAVI12)
2028 goto parse_soc_bounding_box;
2029
2030 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2031 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2032 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2033 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2034 adev->gfx.config.max_texture_channel_caches =
2035 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2036 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2037 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2038 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2039 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2040 adev->gfx.config.double_offchip_lds_buf =
2041 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2042 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2043 adev->gfx.cu_info.max_waves_per_simd =
2044 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2045 adev->gfx.cu_info.max_scratch_slots_per_cu =
2046 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2047 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2048 if (hdr->version_minor >= 1) {
2049 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2050 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2051 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2052 adev->gfx.config.num_sc_per_sh =
2053 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2054 adev->gfx.config.num_packer_per_sc =
2055 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2056 }
2057
2058 parse_soc_bounding_box:
2059 /*
2060 * soc bounding box info is not integrated into the discovery table,
2061 * so we always need to parse it from the gpu info firmware when needed.
2062 */
2063 if (hdr->version_minor == 2) {
2064 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2065 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2066 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2067 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2068 }
2069 break;
2070 }
2071 default:
2072 dev_err(adev->dev,
2073 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2074 err = -EINVAL;
2075 goto out;
2076 }
2077 out:
2078 return err;
2079 }
2080
2081 /**
2082 * amdgpu_device_ip_early_init - run early init for hardware IPs
2083 *
2084 * @adev: amdgpu_device pointer
2085 *
2086 * Early initialization pass for hardware IPs. The hardware IPs that make
2087 * up each asic are discovered and each IP's early_init callback is run. This
2088 * is the first stage in initializing the asic.
2089 * Returns 0 on success, negative error code on failure.
2090 */
2091 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2092 {
2093 struct drm_device *dev = adev_to_drm(adev);
2094 struct pci_dev *parent;
2095 int i, r;
2096
2097 amdgpu_device_enable_virtual_display(adev);
2098
2099 if (amdgpu_sriov_vf(adev)) {
2100 r = amdgpu_virt_request_full_gpu(adev, true);
2101 if (r)
2102 return r;
2103 }
2104
2105 switch (adev->asic_type) {
2106 #ifdef CONFIG_DRM_AMDGPU_SI
2107 case CHIP_VERDE:
2108 case CHIP_TAHITI:
2109 case CHIP_PITCAIRN:
2110 case CHIP_OLAND:
2111 case CHIP_HAINAN:
2112 adev->family = AMDGPU_FAMILY_SI;
2113 r = si_set_ip_blocks(adev);
2114 if (r)
2115 return r;
2116 break;
2117 #endif
2118 #ifdef CONFIG_DRM_AMDGPU_CIK
2119 case CHIP_BONAIRE:
2120 case CHIP_HAWAII:
2121 case CHIP_KAVERI:
2122 case CHIP_KABINI:
2123 case CHIP_MULLINS:
2124 if (adev->flags & AMD_IS_APU)
2125 adev->family = AMDGPU_FAMILY_KV;
2126 else
2127 adev->family = AMDGPU_FAMILY_CI;
2128
2129 r = cik_set_ip_blocks(adev);
2130 if (r)
2131 return r;
2132 break;
2133 #endif
2134 case CHIP_TOPAZ:
2135 case CHIP_TONGA:
2136 case CHIP_FIJI:
2137 case CHIP_POLARIS10:
2138 case CHIP_POLARIS11:
2139 case CHIP_POLARIS12:
2140 case CHIP_VEGAM:
2141 case CHIP_CARRIZO:
2142 case CHIP_STONEY:
2143 if (adev->flags & AMD_IS_APU)
2144 adev->family = AMDGPU_FAMILY_CZ;
2145 else
2146 adev->family = AMDGPU_FAMILY_VI;
2147
2148 r = vi_set_ip_blocks(adev);
2149 if (r)
2150 return r;
2151 break;
2152 default:
2153 r = amdgpu_discovery_set_ip_blocks(adev);
2154 if (r)
2155 return r;
2156 break;
2157 }
2158
2159 if (amdgpu_has_atpx() &&
2160 (amdgpu_is_atpx_hybrid() ||
2161 amdgpu_has_atpx_dgpu_power_cntl()) &&
2162 ((adev->flags & AMD_IS_APU) == 0) &&
2163 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2164 adev->flags |= AMD_IS_PX;
2165
2166 if (!(adev->flags & AMD_IS_APU)) {
2167 parent = pci_upstream_bridge(adev->pdev);
2168 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2169 }
2170
2171 amdgpu_amdkfd_device_probe(adev);
2172
2173 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2174 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2175 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2176 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2177 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2178
2179 for (i = 0; i < adev->num_ip_blocks; i++) {
2180 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2181 DRM_ERROR("disabled ip block: %d <%s>\n",
2182 i, adev->ip_blocks[i].version->funcs->name);
2183 adev->ip_blocks[i].status.valid = false;
2184 } else {
2185 if (adev->ip_blocks[i].version->funcs->early_init) {
2186 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2187 if (r == -ENOENT) {
2188 adev->ip_blocks[i].status.valid = false;
2189 } else if (r) {
2190 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2191 adev->ip_blocks[i].version->funcs->name, r);
2192 return r;
2193 } else {
2194 adev->ip_blocks[i].status.valid = true;
2195 }
2196 } else {
2197 adev->ip_blocks[i].status.valid = true;
2198 }
2199 }
2200 /* get the vbios after the asic_funcs are set up */
2201 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2202 r = amdgpu_device_parse_gpu_info_fw(adev);
2203 if (r)
2204 return r;
2205
2206 /* Read BIOS */
2207 if (!amdgpu_get_bios(adev))
2208 return -EINVAL;
2209
2210 r = amdgpu_atombios_init(adev);
2211 if (r) {
2212 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2213 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2214 return r;
2215 }
2216
2217 /* get pf2vf msg info at its earliest time */
2218 if (amdgpu_sriov_vf(adev))
2219 amdgpu_virt_init_data_exchange(adev);
2220
2221 }
2222 }
2223
2224 adev->cg_flags &= amdgpu_cg_mask;
2225 adev->pg_flags &= amdgpu_pg_mask;
2226
2227 return 0;
2228 }
2229
2230 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2231 {
2232 int i, r;
2233
2234 for (i = 0; i < adev->num_ip_blocks; i++) {
2235 if (!adev->ip_blocks[i].status.sw)
2236 continue;
2237 if (adev->ip_blocks[i].status.hw)
2238 continue;
2239 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2240 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2241 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2242 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2243 if (r) {
2244 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2245 adev->ip_blocks[i].version->funcs->name, r);
2246 return r;
2247 }
2248 adev->ip_blocks[i].status.hw = true;
2249 }
2250 }
2251
2252 return 0;
2253 }
2254
2255 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2256 {
2257 int i, r;
2258
2259 for (i = 0; i < adev->num_ip_blocks; i++) {
2260 if (!adev->ip_blocks[i].status.sw)
2261 continue;
2262 if (adev->ip_blocks[i].status.hw)
2263 continue;
2264 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2265 if (r) {
2266 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2267 adev->ip_blocks[i].version->funcs->name, r);
2268 return r;
2269 }
2270 adev->ip_blocks[i].status.hw = true;
2271 }
2272
2273 return 0;
2274 }
2275
2276 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2277 {
2278 int r = 0;
2279 int i;
2280 uint32_t smu_version;
2281
2282 if (adev->asic_type >= CHIP_VEGA10) {
2283 for (i = 0; i < adev->num_ip_blocks; i++) {
2284 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2285 continue;
2286
2287 if (!adev->ip_blocks[i].status.sw)
2288 continue;
2289
2290 /* no need to do the fw loading again if already done*/
2291 if (adev->ip_blocks[i].status.hw == true)
2292 break;
2293
2294 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2295 r = adev->ip_blocks[i].version->funcs->resume(adev);
2296 if (r) {
2297 DRM_ERROR("resume of IP block <%s> failed %d\n",
2298 adev->ip_blocks[i].version->funcs->name, r);
2299 return r;
2300 }
2301 } else {
2302 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2303 if (r) {
2304 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2305 adev->ip_blocks[i].version->funcs->name, r);
2306 return r;
2307 }
2308 }
2309
2310 adev->ip_blocks[i].status.hw = true;
2311 break;
2312 }
2313 }
2314
2315 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2316 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2317
2318 return r;
2319 }
2320
2321 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2322 {
2323 long timeout;
2324 int r, i;
2325
2326 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2327 struct amdgpu_ring *ring = adev->rings[i];
2328
2329 /* No need to setup the GPU scheduler for rings that don't need it */
2330 if (!ring || ring->no_scheduler)
2331 continue;
2332
2333 switch (ring->funcs->type) {
2334 case AMDGPU_RING_TYPE_GFX:
2335 timeout = adev->gfx_timeout;
2336 break;
2337 case AMDGPU_RING_TYPE_COMPUTE:
2338 timeout = adev->compute_timeout;
2339 break;
2340 case AMDGPU_RING_TYPE_SDMA:
2341 timeout = adev->sdma_timeout;
2342 break;
2343 default:
2344 timeout = adev->video_timeout;
2345 break;
2346 }
2347
2348 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2349 ring->num_hw_submission, amdgpu_job_hang_limit,
2350 timeout, adev->reset_domain->wq,
2351 ring->sched_score, ring->name,
2352 adev->dev);
2353 if (r) {
2354 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2355 ring->name);
2356 return r;
2357 }
2358 }
2359
2360 return 0;
2361 }
2362
2363
2364 /**
2365 * amdgpu_device_ip_init - run init for hardware IPs
2366 *
2367 * @adev: amdgpu_device pointer
2368 *
2369 * Main initialization pass for hardware IPs. The list of all the hardware
2370 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2371 * are run. sw_init initializes the software state associated with each IP
2372 * and hw_init initializes the hardware associated with each IP.
2373 * Returns 0 on success, negative error code on failure.
2374 */
2375 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2376 {
2377 int i, r;
2378
2379 r = amdgpu_ras_init(adev);
2380 if (r)
2381 return r;
2382
2383 for (i = 0; i < adev->num_ip_blocks; i++) {
2384 if (!adev->ip_blocks[i].status.valid)
2385 continue;
2386 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2387 if (r) {
2388 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2389 adev->ip_blocks[i].version->funcs->name, r);
2390 goto init_failed;
2391 }
2392 adev->ip_blocks[i].status.sw = true;
2393
2394 /* need to do gmc hw init early so we can allocate gpu mem */
2395 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2396 /* Try to reserve bad pages early */
2397 if (amdgpu_sriov_vf(adev))
2398 amdgpu_virt_exchange_data(adev);
2399
2400 r = amdgpu_device_vram_scratch_init(adev);
2401 if (r) {
2402 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2403 goto init_failed;
2404 }
2405 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2406 if (r) {
2407 DRM_ERROR("hw_init %d failed %d\n", i, r);
2408 goto init_failed;
2409 }
2410 r = amdgpu_device_wb_init(adev);
2411 if (r) {
2412 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2413 goto init_failed;
2414 }
2415 adev->ip_blocks[i].status.hw = true;
2416
2417 /* right after GMC hw init, we create CSA */
2418 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2419 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2420 AMDGPU_GEM_DOMAIN_VRAM,
2421 AMDGPU_CSA_SIZE);
2422 if (r) {
2423 DRM_ERROR("allocate CSA failed %d\n", r);
2424 goto init_failed;
2425 }
2426 }
2427 }
2428 }
2429
2430 if (amdgpu_sriov_vf(adev))
2431 amdgpu_virt_init_data_exchange(adev);
2432
2433 r = amdgpu_ib_pool_init(adev);
2434 if (r) {
2435 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2436 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2437 goto init_failed;
2438 }
2439
2440 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2441 if (r)
2442 goto init_failed;
2443
2444 r = amdgpu_device_ip_hw_init_phase1(adev);
2445 if (r)
2446 goto init_failed;
2447
2448 r = amdgpu_device_fw_loading(adev);
2449 if (r)
2450 goto init_failed;
2451
2452 r = amdgpu_device_ip_hw_init_phase2(adev);
2453 if (r)
2454 goto init_failed;
2455
2456 /*
2457 * retired pages will be loaded from eeprom and reserved here;
2458 * this should be called after amdgpu_device_ip_hw_init_phase2, since
2459 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2460 * functional for I2C communication, which is only true at this point.
2461 *
2462 * amdgpu_ras_recovery_init may fail, but the upper layers only care about
2463 * failures caused by a bad gpu state and stop the amdgpu init process
2464 * accordingly. For other failure cases, it still releases all the
2465 * resources and prints an error message, rather than returning a
2466 * negative value to the upper level.
2467 *
2468 * Note: theoretically, this should be called before all vram allocations
2469 * to protect retired pages from being reused.
2470 */
2471 r = amdgpu_ras_recovery_init(adev);
2472 if (r)
2473 goto init_failed;
2474
2475 /*
2476 * In case of XGMI, grab an extra reference on the reset domain for this device.
2477 */
2478 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2479 if (amdgpu_xgmi_add_device(adev) == 0) {
2480 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2481
2482 if (!hive->reset_domain ||
2483 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2484 r = -ENOENT;
2485 amdgpu_put_xgmi_hive(hive);
2486 goto init_failed;
2487 }
2488
2489 /* Drop the early temporary reset domain we created for device */
2490 amdgpu_reset_put_reset_domain(adev->reset_domain);
2491 adev->reset_domain = hive->reset_domain;
2492 amdgpu_put_xgmi_hive(hive);
2493 }
2494 }
2495
2496 r = amdgpu_device_init_schedulers(adev);
2497 if (r)
2498 goto init_failed;
2499
2500 /* Don't init kfd if whole hive need to be reset during init */
2501 if (!adev->gmc.xgmi.pending_reset)
2502 amdgpu_amdkfd_device_init(adev);
2503
2504 amdgpu_fru_get_product_info(adev);
2505
2506 init_failed:
2507 if (amdgpu_sriov_vf(adev))
2508 amdgpu_virt_release_full_gpu(adev, true);
2509
2510 return r;
2511 }
2512
2513 /**
2514 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2515 *
2516 * @adev: amdgpu_device pointer
2517 *
2518 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2519 * this function before a GPU reset. If the value is retained after a
2520 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2521 */
2522 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2523 {
2524 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2525 }
2526
2527 /**
2528 * amdgpu_device_check_vram_lost - check if vram is valid
2529 *
2530 * @adev: amdgpu_device pointer
2531 *
2532 * Checks the reset magic value written to the gart pointer in VRAM.
2533 * The driver calls this after a GPU reset to see if the contents of
2534 * VRAM have been lost or not.
2535 * Returns true if vram is lost, false if not.
2536 */
2537 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2538 {
2539 if (memcmp(adev->gart.ptr, adev->reset_magic,
2540 AMDGPU_RESET_MAGIC_NUM))
2541 return true;
2542
2543 if (!amdgpu_in_reset(adev))
2544 return false;
2545
2546 /*
2547 * For all ASICs with baco/mode1 reset, the VRAM is
2548 * always assumed to be lost.
2549 */
2550 switch (amdgpu_asic_reset_method(adev)) {
2551 case AMD_RESET_METHOD_BACO:
2552 case AMD_RESET_METHOD_MODE1:
2553 return true;
2554 default:
2555 return false;
2556 }
2557 }
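
/*
 * Example (illustrative sketch): a reset path typically pairs the two helpers
 * above, roughly:
 *
 *	amdgpu_device_fill_reset_magic(adev);            // before the reset
 *	// ... perform the ASIC reset ...
 *	vram_lost = amdgpu_device_check_vram_lost(adev); // after the reset
 *	if (vram_lost)
 *		; // restore VRAM contents (firmware, ring buffers, etc.)
 */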
2558
2559 /**
2560 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2561 *
2562 * @adev: amdgpu_device pointer
2563 * @state: clockgating state (gate or ungate)
2564 *
2565 * The list of all the hardware IPs that make up the asic is walked and the
2566 * set_clockgating_state callbacks are run.
2567 * During the late init pass this enables clockgating for the hardware IPs;
2568 * during the fini or suspend pass it disables clockgating.
2569 * Returns 0 on success, negative error code on failure.
2570 */
2571
2572 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2573 enum amd_clockgating_state state)
2574 {
2575 int i, j, r;
2576
2577 if (amdgpu_emu_mode == 1)
2578 return 0;
2579
2580 for (j = 0; j < adev->num_ip_blocks; j++) {
2581 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2582 if (!adev->ip_blocks[i].status.late_initialized)
2583 continue;
2584 /* skip CG for GFX on S0ix */
2585 if (adev->in_s0ix &&
2586 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2587 continue;
2588 /* skip CG for VCE/UVD, it's handled specially */
2589 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2590 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2591 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2592 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2593 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2594 /* enable clockgating to save power */
2595 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2596 state);
2597 if (r) {
2598 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2599 adev->ip_blocks[i].version->funcs->name, r);
2600 return r;
2601 }
2602 }
2603 }
2604
2605 return 0;
2606 }
2607
2608 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2609 enum amd_powergating_state state)
2610 {
2611 int i, j, r;
2612
2613 if (amdgpu_emu_mode == 1)
2614 return 0;
2615
2616 for (j = 0; j < adev->num_ip_blocks; j++) {
2617 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2618 if (!adev->ip_blocks[i].status.late_initialized)
2619 continue;
2620 /* skip PG for GFX on S0ix */
2621 if (adev->in_s0ix &&
2622 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2623 continue;
2624 /* skip PG for VCE/UVD, it's handled specially */
2625 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2626 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2627 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2628 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2629 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2630 /* enable powergating to save power */
2631 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2632 state);
2633 if (r) {
2634 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2635 adev->ip_blocks[i].version->funcs->name, r);
2636 return r;
2637 }
2638 }
2639 }
2640 return 0;
2641 }
2642
2643 static int amdgpu_device_enable_mgpu_fan_boost(void)
2644 {
2645 struct amdgpu_gpu_instance *gpu_ins;
2646 struct amdgpu_device *adev;
2647 int i, ret = 0;
2648
2649 mutex_lock(&mgpu_info.mutex);
2650
2651 /*
2652 * MGPU fan boost feature should be enabled
2653 * only when there are two or more dGPUs in
2654 * the system
2655 */
2656 if (mgpu_info.num_dgpu < 2)
2657 goto out;
2658
2659 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2660 gpu_ins = &(mgpu_info.gpu_ins[i]);
2661 adev = gpu_ins->adev;
2662 if (!(adev->flags & AMD_IS_APU) &&
2663 !gpu_ins->mgpu_fan_enabled) {
2664 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2665 if (ret)
2666 break;
2667
2668 gpu_ins->mgpu_fan_enabled = 1;
2669 }
2670 }
2671
2672 out:
2673 mutex_unlock(&mgpu_info.mutex);
2674
2675 return ret;
2676 }
2677
2678 /**
2679 * amdgpu_device_ip_late_init - run late init for hardware IPs
2680 *
2681 * @adev: amdgpu_device pointer
2682 *
2683 * Late initialization pass for hardware IPs. The list of all the hardware
2684 * IPs that make up the asic is walked and the late_init callbacks are run.
2685 * late_init covers any special initialization that an IP requires
2686 * after all of the IPs have been initialized or something that needs to happen
2687 * late in the init process.
2688 * Returns 0 on success, negative error code on failure.
2689 */
2690 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2691 {
2692 struct amdgpu_gpu_instance *gpu_instance;
2693 int i = 0, r;
2694
2695 for (i = 0; i < adev->num_ip_blocks; i++) {
2696 if (!adev->ip_blocks[i].status.hw)
2697 continue;
2698 if (adev->ip_blocks[i].version->funcs->late_init) {
2699 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2700 if (r) {
2701 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2702 adev->ip_blocks[i].version->funcs->name, r);
2703 return r;
2704 }
2705 }
2706 adev->ip_blocks[i].status.late_initialized = true;
2707 }
2708
2709 r = amdgpu_ras_late_init(adev);
2710 if (r) {
2711 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2712 return r;
2713 }
2714
2715 amdgpu_ras_set_error_query_ready(adev, true);
2716
2717 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2718 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2719
2720 amdgpu_device_fill_reset_magic(adev);
2721
2722 r = amdgpu_device_enable_mgpu_fan_boost();
2723 if (r)
2724 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2725
2726 /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
2727 if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2728 adev->asic_type == CHIP_ALDEBARAN))
2729 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2730
2731 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2732 mutex_lock(&mgpu_info.mutex);
2733
2734 /*
2735 * Reset device p-state to low as this was booted with high.
2736 *
2737 * This should be performed only after all devices from the same
2738 * hive get initialized.
2739 *
2740 * However, the number of devices in a hive is not known in advance;
2741 * it is counted one by one as the devices are initialized.
2742 *
2743 * So, we wait for all XGMI interlinked devices to be initialized.
2744 * This may bring some delay as those devices may come from
2745 * different hives. But that should be OK.
2746 */
2747 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2748 for (i = 0; i < mgpu_info.num_gpu; i++) {
2749 gpu_instance = &(mgpu_info.gpu_ins[i]);
2750 if (gpu_instance->adev->flags & AMD_IS_APU)
2751 continue;
2752
2753 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2754 AMDGPU_XGMI_PSTATE_MIN);
2755 if (r) {
2756 DRM_ERROR("pstate setting failed (%d).\n", r);
2757 break;
2758 }
2759 }
2760 }
2761
2762 mutex_unlock(&mgpu_info.mutex);
2763 }
2764
2765 return 0;
2766 }
2767
2768 /**
2769 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2770 *
2771 * @adev: amdgpu_device pointer
2772 *
2773 * For ASICs that need to disable the SMC first
2774 */
2775 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2776 {
2777 int i, r;
2778
2779 if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2780 return;
2781
2782 for (i = 0; i < adev->num_ip_blocks; i++) {
2783 if (!adev->ip_blocks[i].status.hw)
2784 continue;
2785 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2786 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2787 /* XXX handle errors */
2788 if (r) {
2789 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2790 adev->ip_blocks[i].version->funcs->name, r);
2791 }
2792 adev->ip_blocks[i].status.hw = false;
2793 break;
2794 }
2795 }
2796 }
2797
2798 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2799 {
2800 int i, r;
2801
2802 for (i = 0; i < adev->num_ip_blocks; i++) {
2803 if (!adev->ip_blocks[i].version->funcs->early_fini)
2804 continue;
2805
2806 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2807 if (r) {
2808 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2809 adev->ip_blocks[i].version->funcs->name, r);
2810 }
2811 }
2812
2813 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2814 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2815
2816 amdgpu_amdkfd_suspend(adev, false);
2817
2818 /* Workaround for ASICs that need to disable the SMC first */
2819 amdgpu_device_smu_fini_early(adev);
2820
2821 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2822 if (!adev->ip_blocks[i].status.hw)
2823 continue;
2824
2825 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2826 /* XXX handle errors */
2827 if (r) {
2828 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2829 adev->ip_blocks[i].version->funcs->name, r);
2830 }
2831
2832 adev->ip_blocks[i].status.hw = false;
2833 }
2834
2835 if (amdgpu_sriov_vf(adev)) {
2836 if (amdgpu_virt_release_full_gpu(adev, false))
2837 DRM_ERROR("failed to release exclusive mode on fini\n");
2838 }
2839
2840 return 0;
2841 }
2842
2843 /**
2844 * amdgpu_device_ip_fini - run fini for hardware IPs
2845 *
2846 * @adev: amdgpu_device pointer
2847 *
2848 * Main teardown pass for hardware IPs. The list of all the hardware
2849 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2850 * are run. hw_fini tears down the hardware associated with each IP
2851 * and sw_fini tears down any software state associated with each IP.
2852 * Returns 0 on success, negative error code on failure.
2853 */
2854 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2855 {
2856 int i, r;
2857
2858 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2859 amdgpu_virt_release_ras_err_handler_data(adev);
2860
2861 if (adev->gmc.xgmi.num_physical_nodes > 1)
2862 amdgpu_xgmi_remove_device(adev);
2863
2864 amdgpu_amdkfd_device_fini_sw(adev);
2865
2866 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2867 if (!adev->ip_blocks[i].status.sw)
2868 continue;
2869
2870 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2871 amdgpu_ucode_free_bo(adev);
2872 amdgpu_free_static_csa(&adev->virt.csa_obj);
2873 amdgpu_device_wb_fini(adev);
2874 amdgpu_device_vram_scratch_fini(adev);
2875 amdgpu_ib_pool_fini(adev);
2876 }
2877
2878 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2879 /* XXX handle errors */
2880 if (r) {
2881 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2882 adev->ip_blocks[i].version->funcs->name, r);
2883 }
2884 adev->ip_blocks[i].status.sw = false;
2885 adev->ip_blocks[i].status.valid = false;
2886 }
2887
2888 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2889 if (!adev->ip_blocks[i].status.late_initialized)
2890 continue;
2891 if (adev->ip_blocks[i].version->funcs->late_fini)
2892 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2893 adev->ip_blocks[i].status.late_initialized = false;
2894 }
2895
2896 amdgpu_ras_fini(adev);
2897
2898 return 0;
2899 }
2900
2901 /**
2902 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2903 *
2904 * @work: work_struct.
2905 */
2906 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2907 {
2908 struct amdgpu_device *adev =
2909 container_of(work, struct amdgpu_device, delayed_init_work.work);
2910 int r;
2911
2912 r = amdgpu_ib_ring_tests(adev);
2913 if (r)
2914 DRM_ERROR("ib ring test failed (%d).\n", r);
2915 }
2916
2917 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2918 {
2919 struct amdgpu_device *adev =
2920 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2921
2922 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2923 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2924
2925 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2926 adev->gfx.gfx_off_state = true;
2927 }
2928
2929 /**
2930 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2931 *
2932 * @adev: amdgpu_device pointer
2933 *
2934 * Main suspend function for hardware IPs. The list of all the hardware
2935 * IPs that make up the asic is walked, clockgating is disabled and the
2936 * suspend callbacks are run. suspend puts the hardware and software state
2937 * in each IP into a state suitable for suspend.
2938 * Returns 0 on success, negative error code on failure.
2939 */
2940 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2941 {
2942 int i, r;
2943
2944 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2945 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2946
2947 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2948 if (!adev->ip_blocks[i].status.valid)
2949 continue;
2950
2951 /* displays are handled separately */
2952 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2953 continue;
2954
2955 /* XXX handle errors */
2956 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2957 /* XXX handle errors */
2958 if (r) {
2959 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2960 adev->ip_blocks[i].version->funcs->name, r);
2961 return r;
2962 }
2963
2964 adev->ip_blocks[i].status.hw = false;
2965 }
2966
2967 return 0;
2968 }
2969
2970 /**
2971 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2972 *
2973 * @adev: amdgpu_device pointer
2974 *
2975 * Main suspend function for hardware IPs. The list of all the hardware
2976 * IPs that make up the asic is walked, clockgating is disabled and the
2977 * suspend callbacks are run. suspend puts the hardware and software state
2978 * in each IP into a state suitable for suspend.
2979 * Returns 0 on success, negative error code on failure.
2980 */
2981 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2982 {
2983 int i, r;
2984
2985 if (adev->in_s0ix)
2986 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
2987
2988 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2989 if (!adev->ip_blocks[i].status.valid)
2990 continue;
2991 /* displays are handled in phase1 */
2992 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2993 continue;
2994 /* PSP lost connection when err_event_athub occurs */
2995 if (amdgpu_ras_intr_triggered() &&
2996 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2997 adev->ip_blocks[i].status.hw = false;
2998 continue;
2999 }
3000
3001 /* skip unnecessary suspend if we have not initialized them yet */
3002 if (adev->gmc.xgmi.pending_reset &&
3003 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3004 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3005 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3006 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3007 adev->ip_blocks[i].status.hw = false;
3008 continue;
3009 }
3010
3011 /* skip suspend of gfx and psp for S0ix
3012 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3013 * like at runtime. PSP is also part of the always on hardware
3014 * so no need to suspend it.
3015 */
3016 if (adev->in_s0ix &&
3017 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3018 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
3019 continue;
3020
3021 /* XXX handle errors */
3022 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3023 /* XXX handle errors */
3024 if (r) {
3025 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3026 adev->ip_blocks[i].version->funcs->name, r);
3027 }
3028 adev->ip_blocks[i].status.hw = false;
3029 /* handle putting the SMC in the appropriate state */
3030 if (!amdgpu_sriov_vf(adev)) {
3031 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3032 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3033 if (r) {
3034 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3035 adev->mp1_state, r);
3036 return r;
3037 }
3038 }
3039 }
3040 }
3041
3042 return 0;
3043 }
3044
3045 /**
3046 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3047 *
3048 * @adev: amdgpu_device pointer
3049 *
3050 * Main suspend function for hardware IPs. The list of all the hardware
3051 * IPs that make up the asic is walked, clockgating is disabled and the
3052 * suspend callbacks are run. suspend puts the hardware and software state
3053 * in each IP into a state suitable for suspend.
3054 * Returns 0 on success, negative error code on failure.
3055 */
3056 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3057 {
3058 int r;
3059
3060 if (amdgpu_sriov_vf(adev)) {
3061 amdgpu_virt_fini_data_exchange(adev);
3062 amdgpu_virt_request_full_gpu(adev, false);
3063 }
3064
3065 r = amdgpu_device_ip_suspend_phase1(adev);
3066 if (r)
3067 return r;
3068 r = amdgpu_device_ip_suspend_phase2(adev);
3069
3070 if (amdgpu_sriov_vf(adev))
3071 amdgpu_virt_release_full_gpu(adev, false);
3072
3073 return r;
3074 }
3075
3076 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3077 {
3078 int i, r;
3079
3080 static enum amd_ip_block_type ip_order[] = {
3081 AMD_IP_BLOCK_TYPE_GMC,
3082 AMD_IP_BLOCK_TYPE_COMMON,
3083 AMD_IP_BLOCK_TYPE_PSP,
3084 AMD_IP_BLOCK_TYPE_IH,
3085 };
3086
3087 for (i = 0; i < adev->num_ip_blocks; i++) {
3088 int j;
3089 struct amdgpu_ip_block *block;
3090
3091 block = &adev->ip_blocks[i];
3092 block->status.hw = false;
3093
3094 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3095
3096 if (block->version->type != ip_order[j] ||
3097 !block->status.valid)
3098 continue;
3099
3100 r = block->version->funcs->hw_init(adev);
3101 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3102 if (r)
3103 return r;
3104 block->status.hw = true;
3105 }
3106 }
3107
3108 return 0;
3109 }
3110
3111 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3112 {
3113 int i, r;
3114
3115 static enum amd_ip_block_type ip_order[] = {
3116 AMD_IP_BLOCK_TYPE_SMC,
3117 AMD_IP_BLOCK_TYPE_DCE,
3118 AMD_IP_BLOCK_TYPE_GFX,
3119 AMD_IP_BLOCK_TYPE_SDMA,
3120 AMD_IP_BLOCK_TYPE_UVD,
3121 AMD_IP_BLOCK_TYPE_VCE,
3122 AMD_IP_BLOCK_TYPE_VCN
3123 };
3124
3125 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3126 int j;
3127 struct amdgpu_ip_block *block;
3128
3129 for (j = 0; j < adev->num_ip_blocks; j++) {
3130 block = &adev->ip_blocks[j];
3131
3132 if (block->version->type != ip_order[i] ||
3133 !block->status.valid ||
3134 block->status.hw)
3135 continue;
3136
3137 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3138 r = block->version->funcs->resume(adev);
3139 else
3140 r = block->version->funcs->hw_init(adev);
3141
3142 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3143 if (r)
3144 return r;
3145 block->status.hw = true;
3146 }
3147 }
3148
3149 return 0;
3150 }
3151
3152 /**
3153 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3154 *
3155 * @adev: amdgpu_device pointer
3156 *
3157 * First resume function for hardware IPs. The list of all the hardware
3158 * IPs that make up the asic is walked and the resume callbacks are run for
3159 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3160 * after a suspend and updates the software state as necessary. This
3161 * function is also used for restoring the GPU after a GPU reset.
3162 * Returns 0 on success, negative error code on failure.
3163 */
3164 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3165 {
3166 int i, r;
3167
3168 for (i = 0; i < adev->num_ip_blocks; i++) {
3169 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3170 continue;
3171 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3172 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3173 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
3174
3175 r = adev->ip_blocks[i].version->funcs->resume(adev);
3176 if (r) {
3177 DRM_ERROR("resume of IP block <%s> failed %d\n",
3178 adev->ip_blocks[i].version->funcs->name, r);
3179 return r;
3180 }
3181 adev->ip_blocks[i].status.hw = true;
3182 }
3183 }
3184
3185 return 0;
3186 }
3187
3188 /**
3189 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3190 *
3191 * @adev: amdgpu_device pointer
3192 *
3193 * Second resume function for hardware IPs. The list of all the hardware
3194 * IPs that make up the asic is walked and the resume callbacks are run for
3195 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3196 * functional state after a suspend and updates the software state as
3197 * necessary. This function is also used for restoring the GPU after a GPU
3198 * reset.
3199 * Returns 0 on success, negative error code on failure.
3200 */
3201 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3202 {
3203 int i, r;
3204
3205 for (i = 0; i < adev->num_ip_blocks; i++) {
3206 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3207 continue;
3208 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3209 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3210 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3211 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3212 continue;
3213 r = adev->ip_blocks[i].version->funcs->resume(adev);
3214 if (r) {
3215 DRM_ERROR("resume of IP block <%s> failed %d\n",
3216 adev->ip_blocks[i].version->funcs->name, r);
3217 return r;
3218 }
3219 adev->ip_blocks[i].status.hw = true;
3220 }
3221
3222 return 0;
3223 }
3224
3225 /**
3226 * amdgpu_device_ip_resume - run resume for hardware IPs
3227 *
3228 * @adev: amdgpu_device pointer
3229 *
3230 * Main resume function for hardware IPs. The hardware IPs
3231 * are split into two resume functions because they are
3232 * also used in recovering from a GPU reset and some additional
3233 * steps need to be taken between them. In this case (S3/S4) they are
3234 * run sequentially.
3235 * Returns 0 on success, negative error code on failure.
3236 */
3237 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3238 {
3239 int r;
3240
3241 r = amdgpu_amdkfd_resume_iommu(adev);
3242 if (r)
3243 return r;
3244
3245 r = amdgpu_device_ip_resume_phase1(adev);
3246 if (r)
3247 return r;
3248
3249 r = amdgpu_device_fw_loading(adev);
3250 if (r)
3251 return r;
3252
3253 r = amdgpu_device_ip_resume_phase2(adev);
3254
3255 return r;
3256 }
3257
3258 /**
3259 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3260 *
3261 * @adev: amdgpu_device pointer
3262 *
3263 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3264 */
3265 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3266 {
3267 if (amdgpu_sriov_vf(adev)) {
3268 if (adev->is_atom_fw) {
3269 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3270 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3271 } else {
3272 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3273 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3274 }
3275
3276 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3277 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3278 }
3279 }
3280
3281 /**
3282 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3283 *
3284 * @asic_type: AMD asic type
3285 *
3286 * Check if there is DC (new modesetting infrastructure) support for an asic.
3287 * returns true if DC has support, false if not.
3288 */
3289 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3290 {
3291 switch (asic_type) {
3292 #ifdef CONFIG_DRM_AMDGPU_SI
3293 case CHIP_HAINAN:
3294 #endif
3295 case CHIP_TOPAZ:
3296 /* chips with no display hardware */
3297 return false;
3298 #if defined(CONFIG_DRM_AMD_DC)
3299 case CHIP_TAHITI:
3300 case CHIP_PITCAIRN:
3301 case CHIP_VERDE:
3302 case CHIP_OLAND:
3303 /*
3304 * We have systems in the wild with these ASICs that require
3305 * LVDS and VGA support which is not supported with DC.
3306 *
3307 * Fallback to the non-DC driver here by default so as not to
3308 * cause regressions.
3309 */
3310 #if defined(CONFIG_DRM_AMD_DC_SI)
3311 return amdgpu_dc > 0;
3312 #else
3313 return false;
3314 #endif
3315 case CHIP_BONAIRE:
3316 case CHIP_KAVERI:
3317 case CHIP_KABINI:
3318 case CHIP_MULLINS:
3319 /*
3320 * We have systems in the wild with these ASICs that require
3321 * LVDS and VGA support which is not supported with DC.
3322 *
3323 * Fallback to the non-DC driver here by default so as not to
3324 * cause regressions.
3325 */
3326 return amdgpu_dc > 0;
3327 case CHIP_HAWAII:
3328 case CHIP_CARRIZO:
3329 case CHIP_STONEY:
3330 case CHIP_POLARIS10:
3331 case CHIP_POLARIS11:
3332 case CHIP_POLARIS12:
3333 case CHIP_VEGAM:
3334 case CHIP_TONGA:
3335 case CHIP_FIJI:
3336 case CHIP_VEGA10:
3337 case CHIP_VEGA12:
3338 case CHIP_VEGA20:
3339 #if defined(CONFIG_DRM_AMD_DC_DCN)
3340 case CHIP_RAVEN:
3341 case CHIP_NAVI10:
3342 case CHIP_NAVI14:
3343 case CHIP_NAVI12:
3344 case CHIP_RENOIR:
3345 case CHIP_CYAN_SKILLFISH:
3346 case CHIP_SIENNA_CICHLID:
3347 case CHIP_NAVY_FLOUNDER:
3348 case CHIP_DIMGREY_CAVEFISH:
3349 case CHIP_BEIGE_GOBY:
3350 case CHIP_VANGOGH:
3351 case CHIP_YELLOW_CARP:
3352 #endif
3353 default:
3354 return amdgpu_dc != 0;
3355 #else
3356 default:
3357 if (amdgpu_dc > 0)
3358 DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3359 "but isn't supported by ASIC, ignoring\n");
3360 return false;
3361 #endif
3362 }
3363 }
3364
3365 /**
3366 * amdgpu_device_has_dc_support - check if dc is supported
3367 *
3368 * @adev: amdgpu_device pointer
3369 *
3370 * Returns true for supported, false for not supported
3371 */
3372 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3373 {
3374 if (amdgpu_sriov_vf(adev) ||
3375 adev->enable_virtual_display ||
3376 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3377 return false;
3378
3379 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3380 }
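
/*
 * Example (illustrative sketch): display setup code typically branches on
 * this helper, e.g.:
 *
 *	if (amdgpu_device_has_dc_support(adev))
 *		; // register the DC (amdgpu_dm) based display backend
 *	else
 *		; // fall back to the legacy DCE modesetting path
 */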
3381
3382 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3383 {
3384 struct amdgpu_device *adev =
3385 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3386 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3387
3388 /* It's a bug to not have a hive within this function */
3389 if (WARN_ON(!hive))
3390 return;
3391
3392 /*
3393 * Use task barrier to synchronize all xgmi reset works across the
3394 * hive. task_barrier_enter and task_barrier_exit will block
3395 * until all the threads running the xgmi reset works reach
3396 * those points. task_barrier_full will do both blocks.
3397 */
3398 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3399
3400 task_barrier_enter(&hive->tb);
3401 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3402
3403 if (adev->asic_reset_res)
3404 goto fail;
3405
3406 task_barrier_exit(&hive->tb);
3407 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3408
3409 if (adev->asic_reset_res)
3410 goto fail;
3411
3412 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3413 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3414 adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3415 } else {
3416
3417 task_barrier_full(&hive->tb);
3418 adev->asic_reset_res = amdgpu_asic_reset(adev);
3419 }
3420
3421 fail:
3422 if (adev->asic_reset_res)
3423 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3424 adev->asic_reset_res, adev_to_drm(adev)->unique);
3425 amdgpu_put_xgmi_hive(hive);
3426 }
3427
3428 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3429 {
3430 char *input = amdgpu_lockup_timeout;
3431 char *timeout_setting = NULL;
3432 int index = 0;
3433 long timeout;
3434 int ret = 0;
3435
3436 /*
3437 * By default the timeout for non-compute jobs is 10000 ms
3438 * and 60000 ms for compute jobs.
3439 * In SR-IOV or passthrough mode, the timeout for compute
3440 * jobs is 60000 ms by default.
3441 */
3442 adev->gfx_timeout = msecs_to_jiffies(10000);
3443 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3444 if (amdgpu_sriov_vf(adev))
3445 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3446 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3447 else
3448 adev->compute_timeout = msecs_to_jiffies(60000);
3449
3450 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3451 while ((timeout_setting = strsep(&input, ",")) &&
3452 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3453 ret = kstrtol(timeout_setting, 0, &timeout);
3454 if (ret)
3455 return ret;
3456
3457 if (timeout == 0) {
3458 index++;
3459 continue;
3460 } else if (timeout < 0) {
3461 timeout = MAX_SCHEDULE_TIMEOUT;
3462 dev_warn(adev->dev, "lockup timeout disabled");
3463 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3464 } else {
3465 timeout = msecs_to_jiffies(timeout);
3466 }
3467
3468 switch (index++) {
3469 case 0:
3470 adev->gfx_timeout = timeout;
3471 break;
3472 case 1:
3473 adev->compute_timeout = timeout;
3474 break;
3475 case 2:
3476 adev->sdma_timeout = timeout;
3477 break;
3478 case 3:
3479 adev->video_timeout = timeout;
3480 break;
3481 default:
3482 break;
3483 }
3484 }
3485 /*
3486 * There is only one value specified and
3487 * it should apply to all non-compute jobs.
3488 */
3489 if (index == 1) {
3490 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3491 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3492 adev->compute_timeout = adev->gfx_timeout;
3493 }
3494 }
3495
3496 return ret;
3497 }
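
/*
 * Example (illustrative): with the parsing above, a module parameter such as
 *
 *	amdgpu.lockup_timeout=10000,20000,15000,30000
 *
 * sets the gfx, compute, sdma and video timeouts (in ms) in that order.
 * A single value applies to all non-compute jobs (and also to compute jobs
 * under SR-IOV or passthrough), a value of 0 keeps the default, and a
 * negative value disables the timeout for that queue type.
 */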
3498
3499 /**
3500 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3501 *
3502 * @adev: amdgpu_device pointer
3503 *
3504 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
3505 */
3506 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3507 {
3508 struct iommu_domain *domain;
3509
3510 domain = iommu_get_domain_for_dev(adev->dev);
3511 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3512 adev->ram_is_direct_mapped = true;
3513 }
3514
3515 static const struct attribute *amdgpu_dev_attributes[] = {
3516 &dev_attr_product_name.attr,
3517 &dev_attr_product_number.attr,
3518 &dev_attr_serial_number.attr,
3519 &dev_attr_pcie_replay_count.attr,
3520 NULL
3521 };
3522
3523 /**
3524 * amdgpu_device_init - initialize the driver
3525 *
3526 * @adev: amdgpu_device pointer
3527 * @flags: driver flags
3528 *
3529 * Initializes the driver info and hw (all asics).
3530 * Returns 0 for success or an error on failure.
3531 * Called at driver startup.
3532 */
3533 int amdgpu_device_init(struct amdgpu_device *adev,
3534 uint32_t flags)
3535 {
3536 struct drm_device *ddev = adev_to_drm(adev);
3537 struct pci_dev *pdev = adev->pdev;
3538 int r, i;
3539 bool px = false;
3540 u32 max_MBps;
3541
3542 adev->shutdown = false;
3543 adev->flags = flags;
3544
3545 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3546 adev->asic_type = amdgpu_force_asic_type;
3547 else
3548 adev->asic_type = flags & AMD_ASIC_MASK;
3549
3550 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3551 if (amdgpu_emu_mode == 1)
3552 adev->usec_timeout *= 10;
3553 adev->gmc.gart_size = 512 * 1024 * 1024;
3554 adev->accel_working = false;
3555 adev->num_rings = 0;
3556 adev->mman.buffer_funcs = NULL;
3557 adev->mman.buffer_funcs_ring = NULL;
3558 adev->vm_manager.vm_pte_funcs = NULL;
3559 adev->vm_manager.vm_pte_num_scheds = 0;
3560 adev->gmc.gmc_funcs = NULL;
3561 adev->harvest_ip_mask = 0x0;
3562 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3563 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3564
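	/* Default all indirect register access callbacks to the invalid
	 * stubs; the ASIC-specific code installs the real ones during init.
	 */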
3565 adev->smc_rreg = &amdgpu_invalid_rreg;
3566 adev->smc_wreg = &amdgpu_invalid_wreg;
3567 adev->pcie_rreg = &amdgpu_invalid_rreg;
3568 adev->pcie_wreg = &amdgpu_invalid_wreg;
3569 adev->pciep_rreg = &amdgpu_invalid_rreg;
3570 adev->pciep_wreg = &amdgpu_invalid_wreg;
3571 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3572 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3573 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3574 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3575 adev->didt_rreg = &amdgpu_invalid_rreg;
3576 adev->didt_wreg = &amdgpu_invalid_wreg;
3577 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3578 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3579 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3580 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3581
3582 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3583 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3584 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3585
3586 	/* mutex initializations are all done here so we
3587 	 * can recall functions without having locking issues */
3588 mutex_init(&adev->firmware.mutex);
3589 mutex_init(&adev->pm.mutex);
3590 mutex_init(&adev->gfx.gpu_clock_mutex);
3591 mutex_init(&adev->srbm_mutex);
3592 mutex_init(&adev->gfx.pipe_reserve_mutex);
3593 mutex_init(&adev->gfx.gfx_off_mutex);
3594 mutex_init(&adev->grbm_idx_mutex);
3595 mutex_init(&adev->mn_lock);
3596 mutex_init(&adev->virt.vf_errors.lock);
3597 hash_init(adev->mn_hash);
3598 mutex_init(&adev->psp.mutex);
3599 mutex_init(&adev->notifier_lock);
3600 mutex_init(&adev->pm.stable_pstate_ctx_lock);
3601 mutex_init(&adev->benchmark_mutex);
3602
3603 amdgpu_device_init_apu_flags(adev);
3604
3605 r = amdgpu_device_check_arguments(adev);
3606 if (r)
3607 return r;
3608
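	/* Spinlocks protecting the indexed/indirect register access paths
	 * and other small bits of per-device state.
	 */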
3609 spin_lock_init(&adev->mmio_idx_lock);
3610 spin_lock_init(&adev->smc_idx_lock);
3611 spin_lock_init(&adev->pcie_idx_lock);
3612 spin_lock_init(&adev->uvd_ctx_idx_lock);
3613 spin_lock_init(&adev->didt_idx_lock);
3614 spin_lock_init(&adev->gc_cac_idx_lock);
3615 spin_lock_init(&adev->se_cac_idx_lock);
3616 spin_lock_init(&adev->audio_endpt_idx_lock);
3617 spin_lock_init(&adev->mm_stats.lock);
3618
3619 INIT_LIST_HEAD(&adev->shadow_list);
3620 mutex_init(&adev->shadow_list_lock);
3621
3622 INIT_LIST_HEAD(&adev->reset_list);
3623
3624 INIT_LIST_HEAD(&adev->ras_list);
3625
3626 INIT_DELAYED_WORK(&adev->delayed_init_work,
3627 amdgpu_device_delayed_init_work_handler);
3628 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3629 amdgpu_device_delay_enable_gfx_off);
3630
3631 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3632
3633 adev->gfx.gfx_off_req_count = 1;
3634 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3635
3636 atomic_set(&adev->throttling_logging_enabled, 1);
3637 /*
3638 * If throttling continues, logging will be performed every minute
3639 * to avoid log flooding. "-1" is subtracted since the thermal
3640 * throttling interrupt comes every second. Thus, the total logging
3641 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3642 	 * for throttling interrupt) = 60 seconds.
3643 */
3644 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3645 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3646
3647 /* Registers mapping */
3648 /* TODO: block userspace mapping of io register */
3649 if (adev->asic_type >= CHIP_BONAIRE) {
3650 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3651 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3652 } else {
3653 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3654 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3655 }
3656
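	/* The power state of each IP block is unknown until it is explicitly set. */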
3657 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3658 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3659
3660 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3661 	if (!adev->rmmio)
3662 		return -ENOMEM;
3664 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3665 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3666
3667 amdgpu_device_get_pcie_info(adev);
3668
3669 if (amdgpu_mcbp)
3670 DRM_INFO("MCBP is enabled\n");
3671
3672 if (adev->asic_type >= CHIP_NAVI10) {
3673 if (amdgpu_mes || amdgpu_mes_kiq)
3674 adev->enable_mes = true;
3675
3676 if (amdgpu_mes_kiq)
3677 adev->enable_mes_kiq = true;
3678 }
3679
3680 /*
3681 	 * The reset domain needs to be present early, before the XGMI hive is
3682 	 * discovered (if any) and initialized, so that the reset sem and in_gpu_reset
3683 	 * flag can be used early on during init and before calling RREG32.
3684 */
3685 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3686 if (!adev->reset_domain)
3687 return -ENOMEM;
3688
3689 /* detect hw virtualization here */
3690 amdgpu_detect_virtualization(adev);
3691
3692 r = amdgpu_device_get_job_timeout_settings(adev);
3693 if (r) {
3694 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3695 return r;
3696 }
3697
3698 /* early init functions */
3699 r = amdgpu_device_ip_early_init(adev);
3700 if (r)
3701 return r;
3702
3703 /* Enable TMZ based on IP_VERSION */
3704 amdgpu_gmc_tmz_set(adev);
3705
3706 amdgpu_gmc_noretry_set(adev);
3707 	/* Need to get xgmi info early to decide the reset behavior */
3708 if (adev->gmc.xgmi.supported) {
3709 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3710 if (r)
3711 return r;
3712 }
3713
3714 /* enable PCIE atomic ops */
3715 if (amdgpu_sriov_vf(adev))
3716 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3717 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3718 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3719 else
3720 adev->have_atomics_support =
3721 !pci_enable_atomic_ops_to_root(adev->pdev,
3722 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3723 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3724 if (!adev->have_atomics_support)
3725 		dev_info(adev->dev, "PCIE atomic ops are not supported\n");
3726
3727 	/* doorbell bar mapping and doorbell index init */
3728 amdgpu_device_doorbell_init(adev);
3729
3730 if (amdgpu_emu_mode == 1) {
3731 /* post the asic on emulation mode */
3732 emu_soc_asic_init(adev);
3733 goto fence_driver_init;
3734 }
3735
3736 amdgpu_reset_init(adev);
3737
3738 /* detect if we are with an SRIOV vbios */
3739 amdgpu_device_detect_sriov_bios(adev);
3740
3741 /* check if we need to reset the asic
3742 * E.g., driver was not cleanly unloaded previously, etc.
3743 */
3744 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3745 if (adev->gmc.xgmi.num_physical_nodes) {
3746 dev_info(adev->dev, "Pending hive reset.\n");
3747 adev->gmc.xgmi.pending_reset = true;
3748 			/* Only need to init the necessary blocks for SMU to handle the reset */
3749 for (i = 0; i < adev->num_ip_blocks; i++) {
3750 if (!adev->ip_blocks[i].status.valid)
3751 continue;
3752 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3753 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3754 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3755 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3756 DRM_DEBUG("IP %s disabled for hw_init.\n",
3757 adev->ip_blocks[i].version->funcs->name);
3758 adev->ip_blocks[i].status.hw = true;
3759 }
3760 }
3761 } else {
3762 r = amdgpu_asic_reset(adev);
3763 if (r) {
3764 dev_err(adev->dev, "asic reset on init failed\n");
3765 goto failed;
3766 }
3767 }
3768 }
3769
3770 pci_enable_pcie_error_reporting(adev->pdev);
3771
3772 /* Post card if necessary */
3773 if (amdgpu_device_need_post(adev)) {
3774 if (!adev->bios) {
3775 dev_err(adev->dev, "no vBIOS found\n");
3776 r = -EINVAL;
3777 goto failed;
3778 }
3779 DRM_INFO("GPU posting now...\n");
3780 r = amdgpu_device_asic_init(adev);
3781 if (r) {
3782 dev_err(adev->dev, "gpu post error!\n");
3783 goto failed;
3784 }
3785 }
3786
3787 if (adev->is_atom_fw) {
3788 /* Initialize clocks */
3789 r = amdgpu_atomfirmware_get_clock_info(adev);
3790 if (r) {
3791 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3792 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3793 goto failed;
3794 }
3795 } else {
3796 /* Initialize clocks */
3797 r = amdgpu_atombios_get_clock_info(adev);
3798 if (r) {
3799 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3800 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3801 goto failed;
3802 }
3803 /* init i2c buses */
3804 if (!amdgpu_device_has_dc_support(adev))
3805 amdgpu_atombios_i2c_init(adev);
3806 }
3807
3808 fence_driver_init:
3809 /* Fence driver */
3810 r = amdgpu_fence_driver_sw_init(adev);
3811 if (r) {
3812 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3813 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3814 goto failed;
3815 }
3816
3817 /* init the mode config */
3818 drm_mode_config_init(adev_to_drm(adev));
3819
3820 r = amdgpu_device_ip_init(adev);
3821 if (r) {
3822 /* failed in exclusive mode due to timeout */
3823 if (amdgpu_sriov_vf(adev) &&
3824 !amdgpu_sriov_runtime(adev) &&
3825 amdgpu_virt_mmio_blocked(adev) &&
3826 !amdgpu_virt_wait_reset(adev)) {
3827 dev_err(adev->dev, "VF exclusive mode timeout\n");
3828 /* Don't send request since VF is inactive. */
3829 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3830 adev->virt.ops = NULL;
3831 r = -EAGAIN;
3832 goto release_ras_con;
3833 }
3834 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3835 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3836 goto release_ras_con;
3837 }
3838
3839 amdgpu_fence_driver_hw_init(adev);
3840
3841 dev_info(adev->dev,
3842 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3843 adev->gfx.config.max_shader_engines,
3844 adev->gfx.config.max_sh_per_se,
3845 adev->gfx.config.max_cu_per_sh,
3846 adev->gfx.cu_info.number);
3847
3848 adev->accel_working = true;
3849
3850 amdgpu_vm_check_compute_bug(adev);
3851
3852 /* Initialize the buffer migration limit. */
3853 if (amdgpu_moverate >= 0)
3854 max_MBps = amdgpu_moverate;
3855 else
3856 max_MBps = 8; /* Allow 8 MB/s. */
3857 /* Get a log2 for easy divisions. */
3858 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3859
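	/* Create the pm, firmware and psp sysfs interfaces; failures are
	 * logged but not fatal, the *_sysfs_en flags gate teardown later.
	 */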
3860 r = amdgpu_pm_sysfs_init(adev);
3861 if (r) {
3862 adev->pm_sysfs_en = false;
3863 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3864 } else
3865 adev->pm_sysfs_en = true;
3866
3867 r = amdgpu_ucode_sysfs_init(adev);
3868 if (r) {
3869 adev->ucode_sysfs_en = false;
3870 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3871 } else
3872 adev->ucode_sysfs_en = true;
3873
3874 r = amdgpu_psp_sysfs_init(adev);
3875 if (r) {
3876 adev->psp_sysfs_en = false;
3877 if (!amdgpu_sriov_vf(adev))
3878 DRM_ERROR("Creating psp sysfs failed\n");
3879 } else
3880 adev->psp_sysfs_en = true;
3881
3882 /*
3883 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3884 	 * Otherwise the mgpu fan boost feature will be skipped because the
3885 	 * gpu instance count would be too low.
3886 */
3887 amdgpu_register_gpu_instance(adev);
3888
3889 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3890 * explicit gating rather than handling it automatically.
3891 */
3892 if (!adev->gmc.xgmi.pending_reset) {
3893 r = amdgpu_device_ip_late_init(adev);
3894 if (r) {
3895 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3896 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3897 goto release_ras_con;
3898 }
3899 /* must succeed. */
3900 amdgpu_ras_resume(adev);
3901 queue_delayed_work(system_wq, &adev->delayed_init_work,
3902 msecs_to_jiffies(AMDGPU_RESUME_MS));
3903 }
3904
3905 if (amdgpu_sriov_vf(adev))
3906 flush_delayed_work(&adev->delayed_init_work);
3907
3908 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3909 if (r)
3910 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3911
3912 	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3913 		r = amdgpu_pmu_init(adev);
3914 		if (r)
3915 			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}
3916
3917 	/* Keep the stored PCI config space at hand for restore on sudden PCI error */
3918 if (amdgpu_device_cache_pci_state(adev->pdev))
3919 pci_restore_state(pdev);
3920
3921 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3922 /* this will fail for cards that aren't VGA class devices, just
3923 * ignore it */
3924 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3925 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3926
3927 if (amdgpu_device_supports_px(ddev)) {
3928 px = true;
3929 vga_switcheroo_register_client(adev->pdev,
3930 &amdgpu_switcheroo_ops, px);
3931 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3932 }
3933
3934 if (adev->gmc.xgmi.pending_reset)
3935 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3936 msecs_to_jiffies(AMDGPU_RESUME_MS));
3937
3938 amdgpu_device_check_iommu_direct_map(adev);
3939
3940 return 0;
3941
3942 release_ras_con:
3943 amdgpu_release_ras_context(adev);
3944
3945 failed:
3946 amdgpu_vf_error_trans_all(adev);
3947
3948 return r;
3949 }
3950
3951 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3952 {
3953
3954 /* Clear all CPU mappings pointing to this device */
3955 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3956
3957 /* Unmap all mapped bars - Doorbell, registers and VRAM */
3958 amdgpu_device_doorbell_fini(adev);
3959
3960 iounmap(adev->rmmio);
3961 adev->rmmio = NULL;
3962 if (adev->mman.aper_base_kaddr)
3963 iounmap(adev->mman.aper_base_kaddr);
3964 adev->mman.aper_base_kaddr = NULL;
3965
3966 /* Memory manager related */
3967 if (!adev->gmc.xgmi.connected_to_cpu) {
3968 arch_phys_wc_del(adev->gmc.vram_mtrr);
3969 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3970 }
3971 }
3972
3973 /**
3974 * amdgpu_device_fini_hw - tear down the driver
3975 *
3976 * @adev: amdgpu_device pointer
3977 *
3978 * Tear down the driver info (all asics).
3979 * Called at driver shutdown.
3980 */
3981 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3982 {
3983 dev_info(adev->dev, "amdgpu: finishing device.\n");
3984 flush_delayed_work(&adev->delayed_init_work);
3985 adev->shutdown = true;
3986
3987 	/* make sure IB tests finished before entering exclusive mode
3988 	 * to avoid preemption on IB tests
3989 	 */
3990 if (amdgpu_sriov_vf(adev)) {
3991 amdgpu_virt_request_full_gpu(adev, false);
3992 amdgpu_virt_fini_data_exchange(adev);
3993 }
3994
3995 /* disable all interrupts */
3996 amdgpu_irq_disable_all(adev);
3997 	if (adev->mode_info.mode_config_initialized) {
3998 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
3999 drm_helper_force_disable_all(adev_to_drm(adev));
4000 else
4001 drm_atomic_helper_shutdown(adev_to_drm(adev));
4002 }
4003 amdgpu_fence_driver_hw_fini(adev);
4004
4005 if (adev->mman.initialized) {
4006 flush_delayed_work(&adev->mman.bdev.wq);
4007 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
4008 }
4009
4010 if (adev->pm_sysfs_en)
4011 amdgpu_pm_sysfs_fini(adev);
4012 if (adev->ucode_sysfs_en)
4013 amdgpu_ucode_sysfs_fini(adev);
4014 if (adev->psp_sysfs_en)
4015 amdgpu_psp_sysfs_fini(adev);
4016 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4017
4018 	/* RAS features must be disabled before hw fini */
4019 amdgpu_ras_pre_fini(adev);
4020
4021 amdgpu_device_ip_fini_early(adev);
4022
4023 amdgpu_irq_fini_hw(adev);
4024
4025 if (adev->mman.initialized)
4026 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4027
4028 amdgpu_gart_dummy_page_fini(adev);
4029
4030 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4031 amdgpu_device_unmap_mmio(adev);
4032
4033 }
4034
4035 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4036 {
4037 int idx;
4038
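	/* Tear down software state roughly in the reverse order of amdgpu_device_init(). */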
4039 amdgpu_fence_driver_sw_fini(adev);
4040 amdgpu_device_ip_fini(adev);
4041 release_firmware(adev->firmware.gpu_info_fw);
4042 adev->firmware.gpu_info_fw = NULL;
4043 adev->accel_working = false;
4044
4045 amdgpu_reset_fini(adev);
4046
4047 /* free i2c buses */
4048 if (!amdgpu_device_has_dc_support(adev))
4049 amdgpu_i2c_fini(adev);
4050
4051 if (amdgpu_emu_mode != 1)
4052 amdgpu_atombios_fini(adev);
4053
4054 kfree(adev->bios);
4055 adev->bios = NULL;
4056 if (amdgpu_device_supports_px(adev_to_drm(adev))) {
4057 vga_switcheroo_unregister_client(adev->pdev);
4058 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4059 }
4060 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4061 vga_client_unregister(adev->pdev);
4062
4063 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4064
4065 iounmap(adev->rmmio);
4066 adev->rmmio = NULL;
4067 amdgpu_device_doorbell_fini(adev);
4068 drm_dev_exit(idx);
4069 }
4070
4071 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4072 amdgpu_pmu_fini(adev);
4073 if (adev->mman.discovery_bin)
4074 amdgpu_discovery_fini(adev);
4075
4076 amdgpu_reset_put_reset_domain(adev->reset_domain);
4077 adev->reset_domain = NULL;
4078
4079 kfree(adev->pci_state);
4080
4081 }
4082
4083 /**
4084 * amdgpu_device_evict_resources - evict device resources
4085 * @adev: amdgpu device object
4086 *
4087  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4088 * of the vram memory type. Mainly used for evicting device resources
4089 * at suspend time.
4090 *
4091 */
4092 static void amdgpu_device_evict_resources(struct amdgpu_device *adev)
4093 {
4094 /* No need to evict vram on APUs for suspend to ram or s2idle */
4095 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4096 return;
4097
4098 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))
4099 DRM_WARN("evicting device resources failed\n");
4100
4101 }
4102
4103 /*
4104 * Suspend & resume.
4105 */
4106 /**
4107 * amdgpu_device_suspend - initiate device suspend
4108 *
4109 * @dev: drm dev pointer
4110  * @fbcon: notify the fbdev of suspend
4111 *
4112 * Puts the hw in the suspend state (all asics).
4113 * Returns 0 for success or an error on failure.
4114 * Called at driver suspend.
4115 */
4116 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4117 {
4118 struct amdgpu_device *adev = drm_to_adev(dev);
4119
4120 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4121 return 0;
4122
4123 adev->in_suspend = true;
4124
4125 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4126 DRM_WARN("smart shift update failed\n");
4127
4128 drm_kms_helper_poll_disable(dev);
4129
4130 if (fbcon)
4131 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4132
4133 cancel_delayed_work_sync(&adev->delayed_init_work);
4134
4135 amdgpu_ras_suspend(adev);
4136
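	/* Phase 1 suspends the display hardware first; KFD, BO eviction and
	 * the fence driver are handled before phase 2 suspends the remaining
	 * IP blocks.
	 */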
4137 amdgpu_device_ip_suspend_phase1(adev);
4138
4139 if (!adev->in_s0ix)
4140 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4141
4142 amdgpu_device_evict_resources(adev);
4143
4144 amdgpu_fence_driver_hw_fini(adev);
4145
4146 amdgpu_device_ip_suspend_phase2(adev);
4147
4148 return 0;
4149 }
4150
4151 /**
4152 * amdgpu_device_resume - initiate device resume
4153 *
4154 * @dev: drm dev pointer
4155  * @fbcon: notify the fbdev of resume
4156 *
4157 * Bring the hw back to operating state (all asics).
4158 * Returns 0 for success or an error on failure.
4159 * Called at driver resume.
4160 */
4161 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4162 {
4163 struct amdgpu_device *adev = drm_to_adev(dev);
4164 int r = 0;
4165
4166 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4167 return 0;
4168
4169 if (adev->in_s0ix)
4170 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4171
4172 /* post card */
4173 if (amdgpu_device_need_post(adev)) {
4174 r = amdgpu_device_asic_init(adev);
4175 if (r)
4176 dev_err(adev->dev, "amdgpu asic init failed\n");
4177 }
4178
4179 r = amdgpu_device_ip_resume(adev);
4180 if (r) {
4181 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4182 return r;
4183 }
4184 amdgpu_fence_driver_hw_init(adev);
4185
4186 r = amdgpu_device_ip_late_init(adev);
4187 if (r)
4188 return r;
4189
4190 queue_delayed_work(system_wq, &adev->delayed_init_work,
4191 msecs_to_jiffies(AMDGPU_RESUME_MS));
4192
4193 if (!adev->in_s0ix) {
4194 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4195 if (r)
4196 return r;
4197 }
4198
4199 /* Make sure IB tests flushed */
4200 flush_delayed_work(&adev->delayed_init_work);
4201
4202 if (fbcon)
4203 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4204
4205 drm_kms_helper_poll_enable(dev);
4206
4207 amdgpu_ras_resume(adev);
4208
4209 /*
4210 * Most of the connector probing functions try to acquire runtime pm
4211 * refs to ensure that the GPU is powered on when connector polling is
4212 * performed. Since we're calling this from a runtime PM callback,
4213 * trying to acquire rpm refs will cause us to deadlock.
4214 *
4215 * Since we're guaranteed to be holding the rpm lock, it's safe to
4216 * temporarily disable the rpm helpers so this doesn't deadlock us.
4217 */
4218 #ifdef CONFIG_PM
4219 dev->dev->power.disable_depth++;
4220 #endif
4221 if (!amdgpu_device_has_dc_support(adev))
4222 drm_helper_hpd_irq_event(dev);
4223 else
4224 drm_kms_helper_hotplug_event(dev);
4225 #ifdef CONFIG_PM
4226 dev->dev->power.disable_depth--;
4227 #endif
4228 adev->in_suspend = false;
4229
4230 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4231 DRM_WARN("smart shift update failed\n");
4232
4233 return 0;
4234 }
4235
4236 /**
4237 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4238 *
4239 * @adev: amdgpu_device pointer
4240 *
4241 * The list of all the hardware IPs that make up the asic is walked and
4242 * the check_soft_reset callbacks are run. check_soft_reset determines
4243 * if the asic is still hung or not.
4244 * Returns true if any of the IPs are still in a hung state, false if not.
4245 */
4246 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4247 {
4248 int i;
4249 bool asic_hang = false;
4250
4251 if (amdgpu_sriov_vf(adev))
4252 return true;
4253
4254 if (amdgpu_asic_need_full_reset(adev))
4255 return true;
4256
4257 for (i = 0; i < adev->num_ip_blocks; i++) {
4258 if (!adev->ip_blocks[i].status.valid)
4259 continue;
4260 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4261 adev->ip_blocks[i].status.hang =
4262 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4263 if (adev->ip_blocks[i].status.hang) {
4264 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4265 asic_hang = true;
4266 }
4267 }
4268 return asic_hang;
4269 }
4270
4271 /**
4272 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4273 *
4274 * @adev: amdgpu_device pointer
4275 *
4276 * The list of all the hardware IPs that make up the asic is walked and the
4277 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4278 * handles any IP specific hardware or software state changes that are
4279 * necessary for a soft reset to succeed.
4280 * Returns 0 on success, negative error code on failure.
4281 */
4282 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4283 {
4284 int i, r = 0;
4285
4286 for (i = 0; i < adev->num_ip_blocks; i++) {
4287 if (!adev->ip_blocks[i].status.valid)
4288 continue;
4289 if (adev->ip_blocks[i].status.hang &&
4290 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4291 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4292 if (r)
4293 return r;
4294 }
4295 }
4296
4297 return 0;
4298 }
4299
4300 /**
4301 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4302 *
4303 * @adev: amdgpu_device pointer
4304 *
4305 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4306 * reset is necessary to recover.
4307 * Returns true if a full asic reset is required, false if not.
4308 */
4309 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4310 {
4311 int i;
4312
4313 if (amdgpu_asic_need_full_reset(adev))
4314 return true;
4315
4316 for (i = 0; i < adev->num_ip_blocks; i++) {
4317 if (!adev->ip_blocks[i].status.valid)
4318 continue;
4319 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4320 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4321 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4322 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4323 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4324 if (adev->ip_blocks[i].status.hang) {
4325 				dev_info(adev->dev, "Some blocks need full reset!\n");
4326 return true;
4327 }
4328 }
4329 }
4330 return false;
4331 }
4332
4333 /**
4334 * amdgpu_device_ip_soft_reset - do a soft reset
4335 *
4336 * @adev: amdgpu_device pointer
4337 *
4338 * The list of all the hardware IPs that make up the asic is walked and the
4339 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4340 * IP specific hardware or software state changes that are necessary to soft
4341 * reset the IP.
4342 * Returns 0 on success, negative error code on failure.
4343 */
4344 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4345 {
4346 int i, r = 0;
4347
4348 for (i = 0; i < adev->num_ip_blocks; i++) {
4349 if (!adev->ip_blocks[i].status.valid)
4350 continue;
4351 if (adev->ip_blocks[i].status.hang &&
4352 adev->ip_blocks[i].version->funcs->soft_reset) {
4353 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4354 if (r)
4355 return r;
4356 }
4357 }
4358
4359 return 0;
4360 }
4361
4362 /**
4363 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4364 *
4365 * @adev: amdgpu_device pointer
4366 *
4367 * The list of all the hardware IPs that make up the asic is walked and the
4368 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4369 * handles any IP specific hardware or software state changes that are
4370 * necessary after the IP has been soft reset.
4371 * Returns 0 on success, negative error code on failure.
4372 */
4373 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4374 {
4375 int i, r = 0;
4376
4377 for (i = 0; i < adev->num_ip_blocks; i++) {
4378 if (!adev->ip_blocks[i].status.valid)
4379 continue;
4380 if (adev->ip_blocks[i].status.hang &&
4381 adev->ip_blocks[i].version->funcs->post_soft_reset)
4382 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4383 if (r)
4384 return r;
4385 }
4386
4387 return 0;
4388 }
4389
4390 /**
4391 * amdgpu_device_recover_vram - Recover some VRAM contents
4392 *
4393 * @adev: amdgpu_device pointer
4394 *
4395 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4396 * restore things like GPUVM page tables after a GPU reset where
4397 * the contents of VRAM might be lost.
4398 *
4399 * Returns:
4400 * 0 on success, negative error code on failure.
4401 */
4402 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4403 {
4404 struct dma_fence *fence = NULL, *next = NULL;
4405 struct amdgpu_bo *shadow;
4406 struct amdgpu_bo_vm *vmbo;
4407 long r = 1, tmo;
4408
4409 if (amdgpu_sriov_runtime(adev))
4410 tmo = msecs_to_jiffies(8000);
4411 else
4412 tmo = msecs_to_jiffies(100);
4413
4414 dev_info(adev->dev, "recover vram bo from shadow start\n");
4415 mutex_lock(&adev->shadow_list_lock);
4416 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4417 shadow = &vmbo->bo;
4418 /* No need to recover an evicted BO */
4419 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4420 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4421 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4422 continue;
4423
4424 r = amdgpu_bo_restore_shadow(shadow, &next);
4425 if (r)
4426 break;
4427
4428 if (fence) {
4429 tmo = dma_fence_wait_timeout(fence, false, tmo);
4430 dma_fence_put(fence);
4431 fence = next;
4432 if (tmo == 0) {
4433 r = -ETIMEDOUT;
4434 break;
4435 } else if (tmo < 0) {
4436 r = tmo;
4437 break;
4438 }
4439 } else {
4440 fence = next;
4441 }
4442 }
4443 mutex_unlock(&adev->shadow_list_lock);
4444
4445 if (fence)
4446 tmo = dma_fence_wait_timeout(fence, false, tmo);
4447 dma_fence_put(fence);
4448
4449 if (r < 0 || tmo <= 0) {
4450 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4451 return -EIO;
4452 }
4453
4454 dev_info(adev->dev, "recover vram bo from shadow done\n");
4455 return 0;
4456 }
4457
4458
4459 /**
4460 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4461 *
4462 * @adev: amdgpu_device pointer
4463 * @from_hypervisor: request from hypervisor
4464 *
4465  * Do a VF FLR and reinitialize the ASIC.
4466  * Returns 0 on success, negative error code on failure.
4467 */
4468 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4469 bool from_hypervisor)
4470 {
4471 int r;
4472 struct amdgpu_hive_info *hive = NULL;
4473 int retry_limit = 0;
4474
4475 retry:
4476 amdgpu_amdkfd_pre_reset(adev);
4477
4478 if (from_hypervisor)
4479 r = amdgpu_virt_request_full_gpu(adev, true);
4480 else
4481 r = amdgpu_virt_reset_gpu(adev);
4482 if (r)
4483 return r;
4484
4485 /* Resume IP prior to SMC */
4486 r = amdgpu_device_ip_reinit_early_sriov(adev);
4487 if (r)
4488 goto error;
4489
4490 amdgpu_virt_init_data_exchange(adev);
4491
4492 r = amdgpu_device_fw_loading(adev);
4493 if (r)
4494 return r;
4495
4496 /* now we are okay to resume SMC/CP/SDMA */
4497 r = amdgpu_device_ip_reinit_late_sriov(adev);
4498 if (r)
4499 goto error;
4500
4501 hive = amdgpu_get_xgmi_hive(adev);
4502 /* Update PSP FW topology after reset */
4503 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4504 r = amdgpu_xgmi_update_topology(hive, adev);
4505
4506 if (hive)
4507 amdgpu_put_xgmi_hive(hive);
4508
4509 if (!r) {
4510 amdgpu_irq_gpu_reset_resume_helper(adev);
4511 r = amdgpu_ib_ring_tests(adev);
4512
4513 amdgpu_amdkfd_post_reset(adev);
4514 }
4515
4516 error:
4517 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4518 amdgpu_inc_vram_lost(adev);
4519 r = amdgpu_device_recover_vram(adev);
4520 }
4521 amdgpu_virt_release_full_gpu(adev, true);
4522
4523 if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4524 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4525 retry_limit++;
4526 goto retry;
4527 } else
4528 DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4529 }
4530
4531 return r;
4532 }
4533
4534 /**
4535  * amdgpu_device_has_job_running - check if there is any job in the pending list
4536 *
4537 * @adev: amdgpu_device pointer
4538 *
4539  * check if there is any job in the pending list
4540 */
4541 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4542 {
4543 int i;
4544 struct drm_sched_job *job;
4545
4546 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4547 struct amdgpu_ring *ring = adev->rings[i];
4548
4549 if (!ring || !ring->sched.thread)
4550 continue;
4551
4552 spin_lock(&ring->sched.job_list_lock);
4553 job = list_first_entry_or_null(&ring->sched.pending_list,
4554 struct drm_sched_job, list);
4555 spin_unlock(&ring->sched.job_list_lock);
4556 if (job)
4557 return true;
4558 }
4559 return false;
4560 }
4561
4562 /**
4563 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4564 *
4565 * @adev: amdgpu_device pointer
4566 *
4567 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4568 * a hung GPU.
4569 */
4570 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4571 {
4572 if (!amdgpu_device_ip_check_soft_reset(adev)) {
4573 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4574 return false;
4575 }
4576
4577 if (amdgpu_gpu_recovery == 0)
4578 goto disabled;
4579
4580 if (amdgpu_sriov_vf(adev))
4581 return true;
4582
4583 if (amdgpu_gpu_recovery == -1) {
4584 switch (adev->asic_type) {
4585 #ifdef CONFIG_DRM_AMDGPU_SI
4586 case CHIP_VERDE:
4587 case CHIP_TAHITI:
4588 case CHIP_PITCAIRN:
4589 case CHIP_OLAND:
4590 case CHIP_HAINAN:
4591 #endif
4592 #ifdef CONFIG_DRM_AMDGPU_CIK
4593 case CHIP_KAVERI:
4594 case CHIP_KABINI:
4595 case CHIP_MULLINS:
4596 #endif
4597 case CHIP_CARRIZO:
4598 case CHIP_STONEY:
4599 case CHIP_CYAN_SKILLFISH:
4600 goto disabled;
4601 default:
4602 break;
4603 }
4604 }
4605
4606 return true;
4607
4608 disabled:
4609 dev_info(adev->dev, "GPU recovery disabled.\n");
4610 return false;
4611 }
4612
4613 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4614 {
4615 u32 i;
4616 int ret = 0;
4617
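	/* Mode1 reset resets the whole ASIC: disable bus mastering and cache
	 * the PCI config space, reset through the SMU if it supports mode1
	 * reset, otherwise through the PSP, then restore the config space.
	 */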
4618 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4619
4620 dev_info(adev->dev, "GPU mode1 reset\n");
4621
4622 /* disable BM */
4623 pci_clear_master(adev->pdev);
4624
4625 amdgpu_device_cache_pci_state(adev->pdev);
4626
4627 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4628 dev_info(adev->dev, "GPU smu mode1 reset\n");
4629 ret = amdgpu_dpm_mode1_reset(adev);
4630 } else {
4631 dev_info(adev->dev, "GPU psp mode1 reset\n");
4632 ret = psp_gpu_reset(adev);
4633 }
4634
4635 if (ret)
4636 dev_err(adev->dev, "GPU mode1 reset failed\n");
4637
4638 amdgpu_device_load_pci_state(adev->pdev);
4639
4640 /* wait for asic to come out of reset */
4641 for (i = 0; i < adev->usec_timeout; i++) {
4642 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4643
4644 if (memsize != 0xffffffff)
4645 break;
4646 udelay(1);
4647 }
4648
4649 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4650 return ret;
4651 }
4652
4653 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4654 struct amdgpu_reset_context *reset_context)
4655 {
4656 int i, r = 0;
4657 struct amdgpu_job *job = NULL;
4658 bool need_full_reset =
4659 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4660
4661 if (reset_context->reset_req_dev == adev)
4662 job = reset_context->job;
4663
4664 if (amdgpu_sriov_vf(adev)) {
4665 /* stop the data exchange thread */
4666 amdgpu_virt_fini_data_exchange(adev);
4667 }
4668
4669 /* block all schedulers and reset given job's ring */
4670 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4671 struct amdgpu_ring *ring = adev->rings[i];
4672
4673 if (!ring || !ring->sched.thread)
4674 continue;
4675
4676 		/* clear job fences from fence drv to avoid force_completion
4677 		 * leaving NULL and vm flush fences in fence drv */
4678 amdgpu_fence_driver_clear_job_fences(ring);
4679
4680 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4681 amdgpu_fence_driver_force_completion(ring);
4682 }
4683
4684 if (job && job->vm)
4685 drm_sched_increase_karma(&job->base);
4686
4687 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4688 /* If reset handler not implemented, continue; otherwise return */
4689 if (r == -ENOSYS)
4690 r = 0;
4691 else
4692 return r;
4693
4694 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4695 if (!amdgpu_sriov_vf(adev)) {
4696
4697 if (!need_full_reset)
4698 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4699
4700 if (!need_full_reset) {
4701 amdgpu_device_ip_pre_soft_reset(adev);
4702 r = amdgpu_device_ip_soft_reset(adev);
4703 amdgpu_device_ip_post_soft_reset(adev);
4704 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4705 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4706 need_full_reset = true;
4707 }
4708 }
4709
4710 if (need_full_reset)
4711 r = amdgpu_device_ip_suspend(adev);
4712 if (need_full_reset)
4713 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4714 else
4715 clear_bit(AMDGPU_NEED_FULL_RESET,
4716 &reset_context->flags);
4717 }
4718
4719 return r;
4720 }
4721
4722 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4723 {
4724 uint32_t reg_value;
4725 int i;
4726
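	/* Dump the user-selected registers in reset_dump_reg_list into the
	 * trace buffer for post-mortem analysis of the hang.
	 */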
4727 lockdep_assert_held(&adev->reset_domain->sem);
4728 dump_stack();
4729
4730 for (i = 0; i < adev->num_regs; i++) {
4731 reg_value = RREG32(adev->reset_dump_reg_list[i]);
4732 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value);
4733 }
4734
4735 return 0;
4736 }
4737
4738 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4739 struct amdgpu_reset_context *reset_context)
4740 {
4741 struct amdgpu_device *tmp_adev = NULL;
4742 bool need_full_reset, skip_hw_reset, vram_lost = false;
4743 int r = 0;
4744
4745 /* Try reset handler method first */
4746 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4747 reset_list);
4748 amdgpu_reset_reg_dumps(tmp_adev);
4749
4750 reset_context->reset_device_list = device_list_handle;
4751 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4752 /* If reset handler not implemented, continue; otherwise return */
4753 if (r == -ENOSYS)
4754 r = 0;
4755 else
4756 return r;
4757
4758 /* Reset handler not implemented, use the default method */
4759 need_full_reset =
4760 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4761 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4762
4763 /*
4764 * ASIC reset has to be done on all XGMI hive nodes ASAP
4765 * to allow proper links negotiation in FW (within 1 sec)
4766 */
4767 if (!skip_hw_reset && need_full_reset) {
4768 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4769 /* For XGMI run all resets in parallel to speed up the process */
4770 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4771 tmp_adev->gmc.xgmi.pending_reset = false;
4772 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4773 r = -EALREADY;
4774 } else
4775 r = amdgpu_asic_reset(tmp_adev);
4776
4777 if (r) {
4778 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4779 r, adev_to_drm(tmp_adev)->unique);
4780 break;
4781 }
4782 }
4783
4784 /* For XGMI wait for all resets to complete before proceed */
4785 if (!r) {
4786 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4787 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4788 flush_work(&tmp_adev->xgmi_reset_work);
4789 r = tmp_adev->asic_reset_res;
4790 if (r)
4791 break;
4792 }
4793 }
4794 }
4795 }
4796
4797 if (!r && amdgpu_ras_intr_triggered()) {
4798 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4799 if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4800 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4801 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
4802 }
4803
4804 amdgpu_ras_intr_cleared();
4805 }
4806
4807 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4808 if (need_full_reset) {
4809 /* post card */
4810 r = amdgpu_device_asic_init(tmp_adev);
4811 if (r) {
4812 dev_warn(tmp_adev->dev, "asic atom init failed!");
4813 } else {
4814 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4815 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4816 if (r)
4817 goto out;
4818
4819 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4820 if (r)
4821 goto out;
4822
4823 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4824 if (vram_lost) {
4825 DRM_INFO("VRAM is lost due to GPU reset!\n");
4826 amdgpu_inc_vram_lost(tmp_adev);
4827 }
4828
4829 r = amdgpu_device_fw_loading(tmp_adev);
4830 if (r)
4831 return r;
4832
4833 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4834 if (r)
4835 goto out;
4836
4837 if (vram_lost)
4838 amdgpu_device_fill_reset_magic(tmp_adev);
4839
4840 /*
4841 				 * Add this ASIC as tracked, as the reset has already
4842 				 * completed successfully.
4843 */
4844 amdgpu_register_gpu_instance(tmp_adev);
4845
4846 if (!reset_context->hive &&
4847 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4848 amdgpu_xgmi_add_device(tmp_adev);
4849
4850 r = amdgpu_device_ip_late_init(tmp_adev);
4851 if (r)
4852 goto out;
4853
4854 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
4855
4856 /*
4857 				 * The GPU enters a bad state once the number of faulty
4858 				 * pages reported by ECC reaches the threshold, and RAS
4859 				 * recovery is scheduled next. So add a check here to
4860 				 * break recovery if the bad page threshold has indeed
4861 				 * been exceeded, and remind the user to retire this GPU
4862 				 * or set a bigger bad_page_threshold value to work around
4863 				 * it the next time the driver is probed.
4865 */
4866 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4867 /* must succeed. */
4868 amdgpu_ras_resume(tmp_adev);
4869 } else {
4870 r = -EINVAL;
4871 goto out;
4872 }
4873
4874 /* Update PSP FW topology after reset */
4875 if (reset_context->hive &&
4876 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4877 r = amdgpu_xgmi_update_topology(
4878 reset_context->hive, tmp_adev);
4879 }
4880 }
4881
4882 out:
4883 if (!r) {
4884 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4885 r = amdgpu_ib_ring_tests(tmp_adev);
4886 if (r) {
4887 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4888 need_full_reset = true;
4889 r = -EAGAIN;
4890 goto end;
4891 }
4892 }
4893
4894 if (!r)
4895 r = amdgpu_device_recover_vram(tmp_adev);
4896 else
4897 tmp_adev->asic_reset_res = r;
4898 }
4899
4900 end:
4901 if (need_full_reset)
4902 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4903 else
4904 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4905 return r;
4906 }
4907
4908 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
4909 {
4910
4911 switch (amdgpu_asic_reset_method(adev)) {
4912 case AMD_RESET_METHOD_MODE1:
4913 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4914 break;
4915 case AMD_RESET_METHOD_MODE2:
4916 adev->mp1_state = PP_MP1_STATE_RESET;
4917 break;
4918 default:
4919 adev->mp1_state = PP_MP1_STATE_NONE;
4920 break;
4921 }
4922 }
4923
4924 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
4925 {
4926 amdgpu_vf_error_trans_all(adev);
4927 adev->mp1_state = PP_MP1_STATE_NONE;
4928 }
4929
4930 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4931 {
4932 struct pci_dev *p = NULL;
4933
4934 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4935 adev->pdev->bus->number, 1);
4936 if (p) {
4937 pm_runtime_enable(&(p->dev));
4938 pm_runtime_resume(&(p->dev));
4939 }
4940 }
4941
4942 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4943 {
4944 enum amd_reset_method reset_method;
4945 struct pci_dev *p = NULL;
4946 u64 expires;
4947
4948 /*
4949 * For now, only BACO and mode1 reset are confirmed
4950 	 * to suffer from the audio issue if not properly suspended.
4951 */
4952 reset_method = amdgpu_asic_reset_method(adev);
4953 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4954 (reset_method != AMD_RESET_METHOD_MODE1))
4955 return -EINVAL;
4956
4957 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4958 adev->pdev->bus->number, 1);
4959 if (!p)
4960 return -ENODEV;
4961
4962 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4963 if (!expires)
4964 /*
4965 		 * If we cannot get the audio device autosuspend delay,
4966 		 * a fixed 4s interval will be used. Since 3s is the audio
4967 		 * controller's default autosuspend delay, the 4s used here
4968 		 * is guaranteed to cover it.
4969 */
4970 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4971
4972 while (!pm_runtime_status_suspended(&(p->dev))) {
4973 if (!pm_runtime_suspend(&(p->dev)))
4974 break;
4975
4976 if (expires < ktime_get_mono_fast_ns()) {
4977 dev_warn(adev->dev, "failed to suspend display audio\n");
4978 /* TODO: abort the succeeding gpu reset? */
4979 return -ETIMEDOUT;
4980 }
4981 }
4982
4983 pm_runtime_disable(&(p->dev));
4984
4985 return 0;
4986 }
4987
4988 static void amdgpu_device_recheck_guilty_jobs(
4989 struct amdgpu_device *adev, struct list_head *device_list_handle,
4990 struct amdgpu_reset_context *reset_context)
4991 {
4992 int i, r = 0;
4993
4994 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4995 struct amdgpu_ring *ring = adev->rings[i];
4996 int ret = 0;
4997 struct drm_sched_job *s_job;
4998
4999 if (!ring || !ring->sched.thread)
5000 continue;
5001
5002 s_job = list_first_entry_or_null(&ring->sched.pending_list,
5003 struct drm_sched_job, list);
5004 if (s_job == NULL)
5005 continue;
5006
5007 		/* clear the job's guilty status and rely on the following step to decide the real one */
5008 drm_sched_reset_karma(s_job);
5009 		/* the real bad job will be resubmitted twice, so add a dma_fence_get
5010 		 * here to make sure the fence refcount stays balanced */
5011 dma_fence_get(s_job->s_fence->parent);
5012 drm_sched_resubmit_jobs_ext(&ring->sched, 1);
5013
5014 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
5015 if (ret == 0) { /* timeout */
5016 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
5017 ring->sched.name, s_job->id);
5018
5019 /* set guilty */
5020 drm_sched_increase_karma(s_job);
5021 retry:
5022 /* do hw reset */
5023 if (amdgpu_sriov_vf(adev)) {
5024 amdgpu_virt_fini_data_exchange(adev);
5025 r = amdgpu_device_reset_sriov(adev, false);
5026 if (r)
5027 adev->asic_reset_res = r;
5028 } else {
5029 clear_bit(AMDGPU_SKIP_HW_RESET,
5030 &reset_context->flags);
5031 r = amdgpu_do_asic_reset(device_list_handle,
5032 reset_context);
5033 if (r && r == -EAGAIN)
5034 goto retry;
5035 }
5036
5037 /*
5038 * add reset counter so that the following
5039 * resubmitted job could flush vmid
5040 */
5041 atomic_inc(&adev->gpu_reset_counter);
5042 continue;
5043 }
5044
5045 /* got the hw fence, signal finished fence */
5046 atomic_dec(ring->sched.score);
5047 dma_fence_put(s_job->s_fence->parent);
5048 dma_fence_get(&s_job->s_fence->finished);
5049 dma_fence_signal(&s_job->s_fence->finished);
5050 dma_fence_put(&s_job->s_fence->finished);
5051
5052 /* remove node from list and free the job */
5053 spin_lock(&ring->sched.job_list_lock);
5054 list_del_init(&s_job->list);
5055 spin_unlock(&ring->sched.job_list_lock);
5056 ring->sched.ops->free_job(s_job);
5057 }
5058 }
5059
5060 /**
5061 * amdgpu_device_gpu_recover_imp - reset the asic and recover scheduler
5062 *
5063 * @adev: amdgpu_device pointer
5064  * @job: which job triggered the hang
5065 *
5066 * Attempt to reset the GPU if it has hung (all asics).
5067  * Attempt to do a soft reset or full reset and reinitialize the ASIC.
5068 * Returns 0 for success or an error on failure.
5069 */
5070
5071 int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
5072 struct amdgpu_job *job)
5073 {
5074 struct list_head device_list, *device_list_handle = NULL;
5075 bool job_signaled = false;
5076 struct amdgpu_hive_info *hive = NULL;
5077 struct amdgpu_device *tmp_adev = NULL;
5078 int i, r = 0;
5079 bool need_emergency_restart = false;
5080 bool audio_suspended = false;
5081 int tmp_vram_lost_counter;
5082 struct amdgpu_reset_context reset_context;
5083
5084 memset(&reset_context, 0, sizeof(reset_context));
5085
5086 /*
5087 * Special case: RAS triggered and full reset isn't supported
5088 */
5089 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5090
5091 /*
5092 * Flush RAM to disk so that after reboot
5093 	 * the user can read the log and see why the system rebooted.
5094 */
5095 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5096 DRM_WARN("Emergency reboot.");
5097
5098 ksys_sync_helper();
5099 emergency_restart();
5100 }
5101
5102 dev_info(adev->dev, "GPU %s begin!\n",
5103 need_emergency_restart ? "jobs stop":"reset");
5104
5105 if (!amdgpu_sriov_vf(adev))
5106 hive = amdgpu_get_xgmi_hive(adev);
5107 if (hive)
5108 mutex_lock(&hive->hive_lock);
5109
5110 reset_context.method = AMD_RESET_METHOD_NONE;
5111 reset_context.reset_req_dev = adev;
5112 reset_context.job = job;
5113 reset_context.hive = hive;
5114 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5115
5116 /*
5117 * Build list of devices to reset.
5118 * In case we are in XGMI hive mode, resort the device list
5119 * to put adev in the 1st position.
5120 */
5121 INIT_LIST_HEAD(&device_list);
5122 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5123 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
5124 list_add_tail(&tmp_adev->reset_list, &device_list);
5125 if (!list_is_first(&adev->reset_list, &device_list))
5126 list_rotate_to_front(&adev->reset_list, &device_list);
5127 device_list_handle = &device_list;
5128 } else {
5129 list_add_tail(&adev->reset_list, &device_list);
5130 device_list_handle = &device_list;
5131 }
5132
5133 /* We need to lock reset domain only once both for XGMI and single device */
5134 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5135 reset_list);
5136 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5137
5138 /* block all schedulers and reset given job's ring */
5139 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5140
5141 amdgpu_device_set_mp1_state(tmp_adev);
5142
5143 /*
5144 * Try to put the audio codec into suspend state
5145 		 * before the gpu reset is started.
5146 		 *
5147 		 * The power domain of the graphics device is shared
5148 		 * with the AZ power domain. Without this, we may
5149 		 * change the audio hardware from behind the audio
5150 		 * driver's back and trigger audio codec errors.
5152 */
5153 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5154 audio_suspended = true;
5155
5156 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5157
5158 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5159
5160 if (!amdgpu_sriov_vf(tmp_adev))
5161 amdgpu_amdkfd_pre_reset(tmp_adev);
5162
5163 /*
5164 		 * Mark the ASICs to be reset as untracked first,
5165 		 * and add them back after the reset completes.
5166 */
5167 amdgpu_unregister_gpu_instance(tmp_adev);
5168
5169 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5170
5171 /* disable ras on ALL IPs */
5172 if (!need_emergency_restart &&
5173 amdgpu_device_ip_need_full_reset(tmp_adev))
5174 amdgpu_ras_suspend(tmp_adev);
5175
5176 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5177 struct amdgpu_ring *ring = tmp_adev->rings[i];
5178
5179 if (!ring || !ring->sched.thread)
5180 continue;
5181
5182 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5183
5184 if (need_emergency_restart)
5185 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5186 }
5187 atomic_inc(&tmp_adev->gpu_reset_counter);
5188 }
5189
5190 if (need_emergency_restart)
5191 goto skip_sched_resume;
5192
5193 /*
5194 * Must check guilty signal here since after this point all old
5195 * HW fences are force signaled.
5196 *
5197 * job->base holds a reference to parent fence
5198 */
5199 if (job && job->base.s_fence->parent &&
5200 dma_fence_is_signaled(job->base.s_fence->parent)) {
5201 job_signaled = true;
5202 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5203 goto skip_hw_reset;
5204 }
5205
5206 retry: /* Rest of adevs pre asic reset from XGMI hive. */
5207 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5208 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
5209 		/* TODO: Should we stop? */
5210 if (r) {
5211 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5212 r, adev_to_drm(tmp_adev)->unique);
5213 tmp_adev->asic_reset_res = r;
5214 }
5215 }
5216
5217 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
5218 /* Actual ASIC resets if needed.*/
5219 /* Host driver will handle XGMI hive reset for SRIOV */
5220 if (amdgpu_sriov_vf(adev)) {
5221 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5222 if (r)
5223 adev->asic_reset_res = r;
5224
5225 /* Aldebaran supports ras in SRIOV, so need resume ras during reset */
5226 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
5227 amdgpu_ras_resume(adev);
5228 } else {
5229 r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
5230 if (r && r == -EAGAIN)
5231 goto retry;
5232 }
5233
5234 skip_hw_reset:
5235
5236 /* Post ASIC reset for all devs .*/
5237 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5238
5239 /*
5240 		 * Sometimes a later bad compute job can block a good gfx job, as the gfx
5241 		 * and compute rings share internal GC hardware. Add an additional
5242 		 * guilty-job recheck step to find the real guilty job: it synchronously
5243 		 * resubmits and waits for the first job to be signaled. If that wait
5244 		 * times out, the job is identified as the real guilty one.
5245 */
5246 if (amdgpu_gpu_recovery == 2 &&
5247 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
5248 amdgpu_device_recheck_guilty_jobs(
5249 tmp_adev, device_list_handle, &reset_context);
5250
5251 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5252 struct amdgpu_ring *ring = tmp_adev->rings[i];
5253
5254 if (!ring || !ring->sched.thread)
5255 continue;
5256
5257 			/* No point in resubmitting jobs if we didn't HW reset */
5258 if (!tmp_adev->asic_reset_res && !job_signaled)
5259 drm_sched_resubmit_jobs(&ring->sched);
5260
5261 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
5262 }
5263
5264 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5265 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5266 }
5267
5268 if (tmp_adev->asic_reset_res)
5269 r = tmp_adev->asic_reset_res;
5270
5271 tmp_adev->asic_reset_res = 0;
5272
5273 if (r) {
5274 /* bad news, how to tell it to userspace ? */
5275 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5276 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5277 } else {
5278 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5279 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5280 DRM_WARN("smart shift update failed\n");
5281 }
5282 }
5283
5284 skip_sched_resume:
5285 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5286 /* unlock kfd: SRIOV would do it separately */
5287 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5288 amdgpu_amdkfd_post_reset(tmp_adev);
5289
5290 		/* kfd_post_reset will do nothing if the kfd device is not initialized;
5291 		 * bring up kfd here if it was not initialized before
5292 */
5293 if (!adev->kfd.init_complete)
5294 amdgpu_amdkfd_device_init(adev);
5295
5296 if (audio_suspended)
5297 amdgpu_device_resume_display_audio(tmp_adev);
5298
5299 amdgpu_device_unset_mp1_state(tmp_adev);
5300 }
5301
5302 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5303 reset_list);
5304 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5305
5306 if (hive) {
5307 mutex_unlock(&hive->hive_lock);
5308 amdgpu_put_xgmi_hive(hive);
5309 }
5310
5311 if (r)
5312 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5313 return r;
5314 }
5315
5316 struct amdgpu_recover_work_struct {
5317 struct work_struct base;
5318 struct amdgpu_device *adev;
5319 struct amdgpu_job *job;
5320 int ret;
5321 };
5322
5323 static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work)
5324 {
5325 struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base);
5326
5327 recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
5328 }
5329 /*
5330 * Serialize gpu recover into reset domain single threaded wq
5331 */
5332 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5333 struct amdgpu_job *job)
5334 {
5335 struct amdgpu_recover_work_struct work = {.adev = adev, .job = job};
5336
5337 INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
5338
5339 if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
5340 return -EAGAIN;
5341
5342 flush_work(&work.base);
5343
5344 return work.ret;
5345 }
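
/*
 * Editor's sketch (not part of the driver source): a hedged illustration of a
 * caller of amdgpu_device_gpu_recover(). In practice the job timeout handler
 * takes this path; the helper below is hypothetical and only shows the call
 * shape and the meaning of the return value.
 */
static int example_trigger_recovery(struct amdgpu_ring *ring, struct amdgpu_job *job)
{
	/* Schedules the recovery work on the reset domain's ordered workqueue
	 * and waits for it to complete; -EAGAIN means the work could not be
	 * queued, otherwise the recovery result is returned. */
	return amdgpu_device_gpu_recover(ring->adev, job);
}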
5346
5347 /**
5348 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5349 *
5350 * @adev: amdgpu_device pointer
5351 *
5352 * Fetches and stores in the driver the PCIe capabilities (gen speed
5353 * and lanes) of the slot the device is in. Handles APUs and
5354 * virtualized environments where PCIE config space may not be available.
5355 */
5356 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5357 {
5358 struct pci_dev *pdev;
5359 enum pci_bus_speed speed_cap, platform_speed_cap;
5360 enum pcie_link_width platform_link_width;
5361
5362 if (amdgpu_pcie_gen_cap)
5363 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5364
5365 if (amdgpu_pcie_lane_cap)
5366 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5367
5368 /* covers APUs as well */
5369 if (pci_is_root_bus(adev->pdev->bus)) {
5370 if (adev->pm.pcie_gen_mask == 0)
5371 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5372 if (adev->pm.pcie_mlw_mask == 0)
5373 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5374 return;
5375 }
5376
5377 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5378 return;
5379
5380 pcie_bandwidth_available(adev->pdev, NULL,
5381 &platform_speed_cap, &platform_link_width);
5382
5383 if (adev->pm.pcie_gen_mask == 0) {
5384 /* asic caps */
5385 pdev = adev->pdev;
5386 speed_cap = pcie_get_speed_cap(pdev);
5387 if (speed_cap == PCI_SPEED_UNKNOWN) {
5388 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5389 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5390 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5391 } else {
5392 if (speed_cap == PCIE_SPEED_32_0GT)
5393 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5394 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5395 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5396 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5397 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5398 else if (speed_cap == PCIE_SPEED_16_0GT)
5399 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5400 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5401 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5402 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5403 else if (speed_cap == PCIE_SPEED_8_0GT)
5404 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5405 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5406 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5407 else if (speed_cap == PCIE_SPEED_5_0GT)
5408 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5409 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5410 else
5411 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5412 }
5413 /* platform caps */
5414 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5415 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5416 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5417 } else {
5418 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5419 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5420 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5421 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5422 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5423 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5424 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5425 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5426 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5427 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5428 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5429 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5430 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5431 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5432 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5433 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5434 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5435 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5436 else
5437 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5438
5439 }
5440 }
5441 if (adev->pm.pcie_mlw_mask == 0) {
5442 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5443 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5444 } else {
5445 switch (platform_link_width) {
5446 case PCIE_LNK_X32:
5447 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5448 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5449 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5450 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5451 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5452 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5453 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5454 break;
5455 case PCIE_LNK_X16:
5456 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5457 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5458 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5459 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5460 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5461 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5462 break;
5463 case PCIE_LNK_X12:
5464 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5465 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5466 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5467 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5468 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5469 break;
5470 case PCIE_LNK_X8:
5471 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5472 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5473 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5474 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5475 break;
5476 case PCIE_LNK_X4:
5477 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5478 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5479 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5480 break;
5481 case PCIE_LNK_X2:
5482 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5483 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5484 break;
5485 case PCIE_LNK_X1:
5486 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5487 break;
5488 default:
5489 break;
5490 }
5491 }
5492 }
5493 }
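
/*
 * Editor's note (illustrative, not from the original file): the masks built
 * above are later consumed by the power-management code when picking a link
 * speed. A minimal sketch of such a check, reusing the same CAIL_* flags that
 * the function above ORs into pcie_gen_mask; the helper name is hypothetical.
 */
static inline bool example_supports_pcie_gen3(struct amdgpu_device *adev)
{
	/* True only if both the ASIC and the platform advertise Gen3. */
	return (adev->pm.pcie_gen_mask & CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3) &&
	       (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
}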
5494
5495 int amdgpu_device_baco_enter(struct drm_device *dev)
5496 {
5497 struct amdgpu_device *adev = drm_to_adev(dev);
5498 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5499
5500 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5501 return -ENOTSUPP;
5502
5503 if (ras && adev->ras_enabled &&
5504 adev->nbio.funcs->enable_doorbell_interrupt)
5505 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5506
5507 return amdgpu_dpm_baco_enter(adev);
5508 }
5509
5510 int amdgpu_device_baco_exit(struct drm_device *dev)
5511 {
5512 struct amdgpu_device *adev = drm_to_adev(dev);
5513 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5514 int ret = 0;
5515
5516 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5517 return -ENOTSUPP;
5518
5519 ret = amdgpu_dpm_baco_exit(adev);
5520 if (ret)
5521 return ret;
5522
5523 if (ras && adev->ras_enabled &&
5524 adev->nbio.funcs->enable_doorbell_interrupt)
5525 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5526
5527 if (amdgpu_passthrough(adev) &&
5528 adev->nbio.funcs->clear_doorbell_interrupt)
5529 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5530
5531 return 0;
5532 }
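
/*
 * Editor's sketch: how a runtime-PM style path might pair the two BACO helpers
 * above. This is a hedged illustration only; the real runtime suspend/resume
 * hooks live elsewhere in the driver and do considerably more work.
 */
static int example_baco_cycle(struct drm_device *dev)
{
	int r;

	r = amdgpu_device_baco_enter(dev);	/* -ENOTSUPP if BACO is unavailable */
	if (r)
		return r;

	/* ... device sits in BACO until a wake event ... */

	return amdgpu_device_baco_exit(dev);
}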
5533
5534 /**
5535 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5536 * @pdev: PCI device struct
5537 * @state: PCI channel state
5538 *
5539 * Description: Called when a PCI error is detected.
5540 *
5541 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5542 */
5543 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5544 {
5545 struct drm_device *dev = pci_get_drvdata(pdev);
5546 struct amdgpu_device *adev = drm_to_adev(dev);
5547 int i;
5548
5549 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5550
5551 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5552 DRM_WARN("No support for XGMI hive yet...");
5553 return PCI_ERS_RESULT_DISCONNECT;
5554 }
5555
5556 adev->pci_channel_state = state;
5557
5558 switch (state) {
5559 case pci_channel_io_normal:
5560 return PCI_ERS_RESULT_CAN_RECOVER;
5561 /* Fatal error, prepare for slot reset */
5562 case pci_channel_io_frozen:
5563 /*
5564 * Locking adev->reset_domain->sem will prevent any external access
5565 * to GPU during PCI error recovery
5566 */
5567 amdgpu_device_lock_reset_domain(adev->reset_domain);
5568 amdgpu_device_set_mp1_state(adev);
5569
5570 /*
5571 * Block any work scheduling as we do for regular GPU reset
5572 * for the duration of the recovery
5573 */
5574 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5575 struct amdgpu_ring *ring = adev->rings[i];
5576
5577 if (!ring || !ring->sched.thread)
5578 continue;
5579
5580 drm_sched_stop(&ring->sched, NULL);
5581 }
5582 atomic_inc(&adev->gpu_reset_counter);
5583 return PCI_ERS_RESULT_NEED_RESET;
5584 case pci_channel_io_perm_failure:
5585 /* Permanent error, prepare for device removal */
5586 return PCI_ERS_RESULT_DISCONNECT;
5587 }
5588
5589 return PCI_ERS_RESULT_NEED_RESET;
5590 }
5591
5592 /**
5593 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5594 * @pdev: pointer to PCI device
5595 */
5596 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5597 {
5598
5599 DRM_INFO("PCI error: mmio enabled callback!!\n");
5600
5601 /* TODO - dump whatever for debugging purposes */
5602
5603 /* This is called only if amdgpu_pci_error_detected returns
5604 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5605 * works, so there is no need to reset the slot.
5606 */
5607
5608 return PCI_ERS_RESULT_RECOVERED;
5609 }
5610
5611 /**
5612 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5613 * @pdev: PCI device struct
5614 *
5615 * Description: This routine is called by the pci error recovery
5616 * code after the PCI slot has been reset, just before we
5617 * should resume normal operations.
5618 */
5619 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5620 {
5621 struct drm_device *dev = pci_get_drvdata(pdev);
5622 struct amdgpu_device *adev = drm_to_adev(dev);
5623 int r, i;
5624 struct amdgpu_reset_context reset_context;
5625 u32 memsize;
5626 struct list_head device_list;
5627
5628 DRM_INFO("PCI error: slot reset callback!!\n");
5629
5630 memset(&reset_context, 0, sizeof(reset_context));
5631
5632 INIT_LIST_HEAD(&device_list);
5633 list_add_tail(&adev->reset_list, &device_list);
5634
5635 /* wait for asic to come out of reset */
5636 msleep(500);
5637
5638 /* Restore PCI config space */
5639 amdgpu_device_load_pci_state(pdev);
5640
5641 /* confirm ASIC came out of reset */
5642 for (i = 0; i < adev->usec_timeout; i++) {
5643 memsize = amdgpu_asic_get_config_memsize(adev);
5644
5645 if (memsize != 0xffffffff)
5646 break;
5647 udelay(1);
5648 }
5649 if (memsize == 0xffffffff) {
5650 r = -ETIME;
5651 goto out;
5652 }
5653
5654 reset_context.method = AMD_RESET_METHOD_NONE;
5655 reset_context.reset_req_dev = adev;
5656 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5657 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5658
5659 adev->no_hw_access = true;
5660 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5661 adev->no_hw_access = false;
5662 if (r)
5663 goto out;
5664
5665 r = amdgpu_do_asic_reset(&device_list, &reset_context);
5666
5667 out:
5668 if (!r) {
5669 if (amdgpu_device_cache_pci_state(adev->pdev))
5670 pci_restore_state(adev->pdev);
5671
5672 DRM_INFO("PCIe error recovery succeeded\n");
5673 } else {
5674 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5675 amdgpu_device_unset_mp1_state(adev);
5676 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5677 }
5678
5679 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5680 }
5681
5682 /**
5683 * amdgpu_pci_resume() - resume normal ops after PCI reset
5684 * @pdev: pointer to PCI device
5685 *
5686 * Called when the error recovery driver tells us that it's
5687 * OK to resume normal operation.
5688 */
5689 void amdgpu_pci_resume(struct pci_dev *pdev)
5690 {
5691 struct drm_device *dev = pci_get_drvdata(pdev);
5692 struct amdgpu_device *adev = drm_to_adev(dev);
5693 int i;
5694
5695
5696 DRM_INFO("PCI error: resume callback!!\n");
5697
5698 /* Only continue execution for the case of pci_channel_io_frozen */
5699 if (adev->pci_channel_state != pci_channel_io_frozen)
5700 return;
5701
5702 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5703 struct amdgpu_ring *ring = adev->rings[i];
5704
5705 if (!ring || !ring->sched.thread)
5706 continue;
5707
5708
5709 drm_sched_resubmit_jobs(&ring->sched);
5710 drm_sched_start(&ring->sched, true);
5711 }
5712
5713 amdgpu_device_unset_mp1_state(adev);
5714 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5715 }
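
/*
 * Editor's note: the four callbacks above are meant to be wired into the PCI
 * core through a struct pci_error_handlers attached to the driver's
 * struct pci_driver. A hedged sketch of that hookup; the actual table lives in
 * amdgpu_drv.c and its name there may differ.
 */
static const struct pci_error_handlers example_amdgpu_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};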
5716
5717 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5718 {
5719 struct drm_device *dev = pci_get_drvdata(pdev);
5720 struct amdgpu_device *adev = drm_to_adev(dev);
5721 int r;
5722
5723 r = pci_save_state(pdev);
5724 if (!r) {
5725 kfree(adev->pci_state);
5726
5727 adev->pci_state = pci_store_saved_state(pdev);
5728
5729 if (!adev->pci_state) {
5730 DRM_ERROR("Failed to store PCI saved state");
5731 return false;
5732 }
5733 } else {
5734 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5735 return false;
5736 }
5737
5738 return true;
5739 }
5740
5741 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5742 {
5743 struct drm_device *dev = pci_get_drvdata(pdev);
5744 struct amdgpu_device *adev = drm_to_adev(dev);
5745 int r;
5746
5747 if (!adev->pci_state)
5748 return false;
5749
5750 r = pci_load_saved_state(pdev, adev->pci_state);
5751
5752 if (!r) {
5753 pci_restore_state(pdev);
5754 } else {
5755 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5756 return false;
5757 }
5758
5759 return true;
5760 }
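
/*
 * Editor's sketch: the typical save/restore pairing around a reset, mirroring
 * what amdgpu_pci_slot_reset() does above. The wrapper below is hypothetical
 * and only makes the intended ordering explicit.
 */
static void example_pci_state_roundtrip(struct amdgpu_device *adev)
{
	/* Snapshot config space while the device is healthy... */
	if (!amdgpu_device_cache_pci_state(adev->pdev))
		return;

	/* ...a reset happens here... */

	/* ...then restore the cached state once the device is back. */
	amdgpu_device_load_pci_state(adev->pdev);
}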
5761
5762 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5763 struct amdgpu_ring *ring)
5764 {
5765 #ifdef CONFIG_X86_64
5766 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5767 return;
5768 #endif
5769 if (adev->gmc.xgmi.connected_to_cpu)
5770 return;
5771
5772 if (ring && ring->funcs->emit_hdp_flush)
5773 amdgpu_ring_emit_hdp_flush(ring);
5774 else
5775 amdgpu_asic_flush_hdp(adev, ring);
5776 }
5777
5778 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5779 struct amdgpu_ring *ring)
5780 {
5781 #ifdef CONFIG_X86_64
5782 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5783 return;
5784 #endif
5785 if (adev->gmc.xgmi.connected_to_cpu)
5786 return;
5787
5788 amdgpu_asic_invalidate_hdp(adev, ring);
5789 }
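
/*
 * Editor's note: a hedged example of where the HDP helpers are typically
 * needed - after CPU writes into VRAM through the BAR and before the GPU
 * consumes them (flush), and before the CPU reads back GPU-written data
 * (invalidate). The surrounding buffer handling is elided and hypothetical.
 */
static void example_hdp_usage(struct amdgpu_device *adev,
			      void __iomem *vram_cpu_addr, u32 value)
{
	writel(value, vram_cpu_addr);		/* CPU write that goes through HDP */
	amdgpu_device_flush_hdp(adev, NULL);	/* make it visible to the GPU */

	amdgpu_device_invalidate_hdp(adev, NULL); /* before CPU reads GPU output */
}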
5790
5791 int amdgpu_in_reset(struct amdgpu_device *adev)
5792 {
5793 return atomic_read(&adev->reset_domain->in_gpu_reset);
5794 }
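
/*
 * Editor's sketch: amdgpu_in_reset() is commonly used as a cheap guard so that
 * unsafe paths back off while recovery owns the hardware. The helper below is
 * hypothetical and shown for illustration only.
 */
static bool example_can_touch_hw(struct amdgpu_device *adev)
{
	return !amdgpu_in_reset(adev) && !adev->no_hw_access;
}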
5795
5796 /**
5797 * amdgpu_device_halt() - bring hardware to some kind of halt state
5798 *
5799 * @adev: amdgpu_device pointer
5800 *
5801 * Bring the hardware to some kind of halt state so that no one can touch it
5802 * anymore. This helps preserve the error context when an error occurs.
5803 * Compared to a simple hang, the system stays stable at least for SSH
5804 * access. It should then be trivial to inspect the hardware state and
5805 * see what's going on. Implemented as follows:
5806 *
5807 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
5808 * clears all CPU mappings to device, disallows remappings through page faults
5809 * 2. amdgpu_irq_disable_all() disables all interrupts
5810 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5811 * 4. set adev->no_hw_access to avoid potential crashes after step 5
5812 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5813 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5814 * flush any in flight DMA operations
5815 */
5816 void amdgpu_device_halt(struct amdgpu_device *adev)
5817 {
5818 struct pci_dev *pdev = adev->pdev;
5819 struct drm_device *ddev = adev_to_drm(adev);
5820
5821 drm_dev_unplug(ddev);
5822
5823 amdgpu_irq_disable_all(adev);
5824
5825 amdgpu_fence_driver_hw_fini(adev);
5826
5827 adev->no_hw_access = true;
5828
5829 amdgpu_device_unmap_mmio(adev);
5830
5831 pci_disable_device(pdev);
5832 pci_wait_for_pending_transaction(pdev);
5833 }
5834
5835 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
5836 u32 reg)
5837 {
5838 unsigned long flags, address, data;
5839 u32 r;
5840
5841 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5842 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5843
5844 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5845 WREG32(address, reg * 4);
5846 (void)RREG32(address);
5847 r = RREG32(data);
5848 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5849 return r;
5850 }
5851
5852 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
5853 u32 reg, u32 v)
5854 {
5855 unsigned long flags, address, data;
5856
5857 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5858 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5859
5860 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5861 WREG32(address, reg * 4);
5862 (void)RREG32(address);
5863 WREG32(data, v);
5864 (void)RREG32(data);
5865 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5866 }
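
/*
 * Editor's sketch: a read-modify-write on a PCIe port register using the two
 * accessors above. The register offset, mask and bits are placeholders, not
 * real hardware definitions.
 */
static void example_pcie_port_rmw(struct amdgpu_device *adev,
				  u32 reg, u32 mask, u32 bits)
{
	u32 v = amdgpu_device_pcie_port_rreg(adev, reg);

	v = (v & ~mask) | (bits & mask);
	amdgpu_device_pcie_port_wreg(adev, reg, v);
}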
5867