Lines Matching refs:adev

128 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
131 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
139 void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) in amdgpu_ras_set_error_query_ready() argument
141 if (adev && amdgpu_ras_get_context(adev)) in amdgpu_ras_set_error_query_ready()
142 amdgpu_ras_get_context(adev)->error_query_ready = ready; in amdgpu_ras_set_error_query_ready()
145 static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) in amdgpu_ras_get_error_query_ready() argument
147 if (adev && amdgpu_ras_get_context(adev)) in amdgpu_ras_get_error_query_ready()
148 return amdgpu_ras_get_context(adev)->error_query_ready; in amdgpu_ras_get_error_query_ready()
153 static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address) in amdgpu_reserve_page_direct() argument
158 if ((address >= adev->gmc.mc_vram_size) || in amdgpu_reserve_page_direct()
160 dev_warn(adev->dev, in amdgpu_reserve_page_direct()
166 if (amdgpu_ras_check_bad_page(adev, address)) { in amdgpu_reserve_page_direct()
167 dev_warn(adev->dev, in amdgpu_reserve_page_direct()
178 amdgpu_ras_add_bad_pages(adev, err_data.err_addr, in amdgpu_reserve_page_direct()
180 amdgpu_ras_save_bad_pages(adev, NULL); in amdgpu_reserve_page_direct()
183 dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); in amdgpu_reserve_page_direct()
184 dev_warn(adev->dev, "Clear EEPROM:\n"); in amdgpu_reserve_page_direct()
185 dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); in amdgpu_reserve_page_direct()
200 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_debugfs_read()
204 if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && in amdgpu_ras_debugfs_read()
205 obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { in amdgpu_ras_debugfs_read()
206 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) in amdgpu_ras_debugfs_read()
207 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_debugfs_read()
336 static void amdgpu_ras_instance_mask_check(struct amdgpu_device *adev, in amdgpu_ras_instance_mask_check() argument
339 int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1; in amdgpu_ras_instance_mask_check()
345 dev_dbg(adev->dev, in amdgpu_ras_instance_mask_check()
357 mask = GENMASK(adev->sdma.num_instances - 1, 0); in amdgpu_ras_instance_mask_check()
361 mask = GENMASK(adev->vcn.num_vcn_inst - 1, 0); in amdgpu_ras_instance_mask_check()
371 dev_dbg(adev->dev, in amdgpu_ras_instance_mask_check()
460 struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; in amdgpu_ras_debugfs_ctrl_write() local
464 if (!amdgpu_ras_get_error_query_ready(adev)) { in amdgpu_ras_debugfs_ctrl_write()
465 dev_warn(adev->dev, "RAS WARN: error injection " in amdgpu_ras_debugfs_ctrl_write()
475 ret = amdgpu_reserve_page_direct(adev, data.inject.address); in amdgpu_ras_debugfs_ctrl_write()
482 if (!amdgpu_ras_is_supported(adev, data.head.block)) in amdgpu_ras_debugfs_ctrl_write()
487 ret = amdgpu_ras_feature_enable(adev, &data.head, 0); in amdgpu_ras_debugfs_ctrl_write()
490 ret = amdgpu_ras_feature_enable(adev, &data.head, 1); in amdgpu_ras_debugfs_ctrl_write()
493 if ((data.inject.address >= adev->gmc.mc_vram_size && in amdgpu_ras_debugfs_ctrl_write()
494 adev->gmc.mc_vram_size) || in amdgpu_ras_debugfs_ctrl_write()
496 dev_warn(adev->dev, "RAS WARN: input address " in amdgpu_ras_debugfs_ctrl_write()
505 amdgpu_ras_check_bad_page(adev, data.inject.address)) { in amdgpu_ras_debugfs_ctrl_write()
506 dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has " in amdgpu_ras_debugfs_ctrl_write()
512 amdgpu_ras_instance_mask_check(adev, &data); in amdgpu_ras_debugfs_ctrl_write()
515 ret = amdgpu_ras_error_inject(adev, &data.inject); in amdgpu_ras_debugfs_ctrl_write()
548 struct amdgpu_device *adev = in amdgpu_ras_debugfs_eeprom_write() local
553 &(amdgpu_ras_get_context(adev)->eeprom_control)); in amdgpu_ras_debugfs_eeprom_write()
558 amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS; in amdgpu_ras_debugfs_eeprom_write()
608 if (!amdgpu_ras_get_error_query_ready(obj->adev)) in amdgpu_ras_sysfs_read()
611 if (amdgpu_ras_query_error_status(obj->adev, &info)) in amdgpu_ras_sysfs_read()
614 if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && in amdgpu_ras_sysfs_read()
615 obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { in amdgpu_ras_sysfs_read()
616 if (amdgpu_ras_reset_error_status(obj->adev, info.head.block)) in amdgpu_ras_sysfs_read()
617 dev_warn(obj->adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_sysfs_read()
638 static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, in amdgpu_ras_create_obj() argument
641 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_create_obj()
644 if (!adev->ras_enabled || !con) in amdgpu_ras_create_obj()
663 obj->adev = adev; in amdgpu_ras_create_obj()
671 struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, in amdgpu_ras_find_obj() argument
674 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_find_obj()
678 if (!adev->ras_enabled || !con) in amdgpu_ras_find_obj()
708 static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, in amdgpu_ras_is_feature_allowed() argument
711 return adev->ras_hw_enabled & BIT(head->block); in amdgpu_ras_is_feature_allowed()
714 static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, in amdgpu_ras_is_feature_enabled() argument
717 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_is_feature_enabled()
726 static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, in __amdgpu_ras_feature_enable() argument
729 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in __amdgpu_ras_feature_enable()
730 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); in __amdgpu_ras_feature_enable()
738 if (!amdgpu_ras_is_feature_allowed(adev, head)) in __amdgpu_ras_feature_enable()
743 obj = amdgpu_ras_create_obj(adev, head); in __amdgpu_ras_feature_enable()
752 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) { in __amdgpu_ras_feature_enable()
762 int amdgpu_ras_feature_enable(struct amdgpu_device *adev, in amdgpu_ras_feature_enable() argument
765 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_feature_enable()
775 !amdgpu_ras_is_feature_allowed(adev, head)) in amdgpu_ras_feature_enable()
780 !amdgpu_sriov_vf(adev) && in amdgpu_ras_feature_enable()
798 ret = psp_ras_enable_features(&adev->psp, info, enable); in amdgpu_ras_feature_enable()
800 dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n", in amdgpu_ras_feature_enable()
803 amdgpu_ras_is_poison_mode_supported(adev), ret); in amdgpu_ras_feature_enable()
812 __amdgpu_ras_feature_enable(adev, head, enable); in amdgpu_ras_feature_enable()
818 int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, in amdgpu_ras_feature_enable_on_boot() argument
821 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_feature_enable_on_boot()
835 ret = amdgpu_ras_feature_enable(adev, head, 1); in amdgpu_ras_feature_enable_on_boot()
841 ret = __amdgpu_ras_feature_enable(adev, head, 1); in amdgpu_ras_feature_enable_on_boot()
843 dev_info(adev->dev, in amdgpu_ras_feature_enable_on_boot()
849 ret = __amdgpu_ras_feature_enable(adev, head, 1); in amdgpu_ras_feature_enable_on_boot()
857 ret = amdgpu_ras_feature_enable(adev, head, 0); in amdgpu_ras_feature_enable_on_boot()
860 if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX) in amdgpu_ras_feature_enable_on_boot()
864 ret = amdgpu_ras_feature_enable(adev, head, enable); in amdgpu_ras_feature_enable_on_boot()
869 static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev, in amdgpu_ras_disable_all_features() argument
872 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_disable_all_features()
880 if (__amdgpu_ras_feature_enable(adev, &obj->head, 0)) in amdgpu_ras_disable_all_features()
883 if (amdgpu_ras_feature_enable(adev, &obj->head, 0)) in amdgpu_ras_disable_all_features()
891 static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev, in amdgpu_ras_enable_all_features() argument
894 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_enable_all_features()
913 if (__amdgpu_ras_feature_enable(adev, &head, 1)) in amdgpu_ras_enable_all_features()
916 if (amdgpu_ras_feature_enable(adev, &head, 1)) in amdgpu_ras_enable_all_features()
933 if (__amdgpu_ras_feature_enable(adev, &head, 1)) in amdgpu_ras_enable_all_features()
936 if (amdgpu_ras_feature_enable(adev, &head, 1)) in amdgpu_ras_enable_all_features()
957 static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_device *adev, in amdgpu_ras_get_ras_block() argument
966 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { in amdgpu_ras_get_ras_block()
968 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); in amdgpu_ras_get_ras_block()
985 static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_data *err_data) in amdgpu_ras_get_ecc_info() argument
987 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_get_ecc_info()
994 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(ras->umc_ecc)); in amdgpu_ras_get_ecc_info()
996 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_ras_get_ecc_info()
997 adev->umc.ras->ras_block.hw_ops->query_ras_error_count) in amdgpu_ras_get_ecc_info()
998 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, err_data); in amdgpu_ras_get_ecc_info()
1003 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && in amdgpu_ras_get_ecc_info()
1004 adev->umc.ras->ras_block.hw_ops->query_ras_error_address) in amdgpu_ras_get_ecc_info()
1005 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, err_data); in amdgpu_ras_get_ecc_info()
1007 if (adev->umc.ras && in amdgpu_ras_get_ecc_info()
1008 adev->umc.ras->ecc_info_query_ras_error_count) in amdgpu_ras_get_ecc_info()
1009 adev->umc.ras->ecc_info_query_ras_error_count(adev, err_data); in amdgpu_ras_get_ecc_info()
1011 if (adev->umc.ras && in amdgpu_ras_get_ecc_info()
1012 adev->umc.ras->ecc_info_query_ras_error_address) in amdgpu_ras_get_ecc_info()
1013 adev->umc.ras->ecc_info_query_ras_error_address(adev, err_data); in amdgpu_ras_get_ecc_info()
1018 int amdgpu_ras_query_error_status(struct amdgpu_device *adev, in amdgpu_ras_query_error_status() argument
1022 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_query_error_status()
1029 amdgpu_ras_get_ecc_info(adev, &err_data); in amdgpu_ras_query_error_status()
1031 block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0); in amdgpu_ras_query_error_status()
1033 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_query_error_status()
1039 block_obj->hw_ops->query_ras_error_count(adev, &err_data); in amdgpu_ras_query_error_status()
1045 block_obj->hw_ops->query_ras_error_status(adev); in amdgpu_ras_query_error_status()
1056 if (!adev->aid_mask && in amdgpu_ras_query_error_status()
1057 adev->smuio.funcs && in amdgpu_ras_query_error_status()
1058 adev->smuio.funcs->get_socket_id && in amdgpu_ras_query_error_status()
1059 adev->smuio.funcs->get_die_id) { in amdgpu_ras_query_error_status()
1060 dev_info(adev->dev, "socket: %d, die: %d " in amdgpu_ras_query_error_status()
1064 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_query_error_status()
1065 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_query_error_status()
1069 dev_info(adev->dev, "%ld correctable hardware errors " in amdgpu_ras_query_error_status()
1077 if (!adev->aid_mask && in amdgpu_ras_query_error_status()
1078 adev->smuio.funcs && in amdgpu_ras_query_error_status()
1079 adev->smuio.funcs->get_socket_id && in amdgpu_ras_query_error_status()
1080 adev->smuio.funcs->get_die_id) { in amdgpu_ras_query_error_status()
1081 dev_info(adev->dev, "socket: %d, die: %d " in amdgpu_ras_query_error_status()
1084 adev->smuio.funcs->get_socket_id(adev), in amdgpu_ras_query_error_status()
1085 adev->smuio.funcs->get_die_id(adev), in amdgpu_ras_query_error_status()
1089 dev_info(adev->dev, "%ld uncorrectable hardware errors " in amdgpu_ras_query_error_status()
1099 int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, in amdgpu_ras_reset_error_status() argument
1102 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); in amdgpu_ras_reset_error_status()
1104 if (!amdgpu_ras_is_supported(adev, block)) in amdgpu_ras_reset_error_status()
1108 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_reset_error_status()
1114 block_obj->hw_ops->reset_ras_error_count(adev); in amdgpu_ras_reset_error_status()
1119 block_obj->hw_ops->reset_ras_error_status(adev); in amdgpu_ras_reset_error_status()
1126 int amdgpu_ras_error_inject(struct amdgpu_device *adev, in amdgpu_ras_error_inject() argument
1129 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_error_inject()
1138 struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, in amdgpu_ras_error_inject()
1143 if (amdgpu_sriov_vf(adev)) in amdgpu_ras_error_inject()
1150 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_error_inject()
1156 if (adev->gmc.xgmi.num_physical_nodes > 1 && in amdgpu_ras_error_inject()
1159 amdgpu_xgmi_get_relative_phy_addr(adev, in amdgpu_ras_error_inject()
1165 ret = block_obj->hw_ops->ras_error_inject(adev, info, info->instance_mask); in amdgpu_ras_error_inject()
1167 ret = block_obj->hw_ops->ras_error_inject(adev, &block_info, in amdgpu_ras_error_inject()
1171 ret = psp_ras_trigger_error(&adev->psp, &block_info, info->instance_mask); in amdgpu_ras_error_inject()
1175 dev_err(adev->dev, "ras inject %s failed %d\n", in amdgpu_ras_error_inject()
1191 static int amdgpu_ras_query_error_count_helper(struct amdgpu_device *adev, in amdgpu_ras_query_error_count_helper() argument
1202 ret = amdgpu_ras_query_error_status(adev, query_info); in amdgpu_ras_query_error_count_helper()
1211 if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && in amdgpu_ras_query_error_count_helper()
1212 adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) { in amdgpu_ras_query_error_count_helper()
1213 if (amdgpu_ras_reset_error_status(adev, query_info->head.block)) in amdgpu_ras_query_error_count_helper()
1214 dev_warn(adev->dev, in amdgpu_ras_query_error_count_helper()
1235 int amdgpu_ras_query_error_count(struct amdgpu_device *adev, in amdgpu_ras_query_error_count() argument
1240 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_query_error_count()
1245 if (!adev->ras_enabled || !con) in amdgpu_ras_query_error_count()
1262 ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, &info); in amdgpu_ras_query_error_count()
1266 ret = amdgpu_ras_query_error_count_helper(adev, &ce, &ue, query_info); in amdgpu_ras_query_error_count()
1285 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1337 struct amdgpu_device *adev = con->adev; in amdgpu_ras_sysfs_badpages_read() local
1348 if (amdgpu_ras_badpages_read(adev, &bps, &bps_count)) in amdgpu_ras_sysfs_badpages_read()
1372 static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev) in amdgpu_ras_sysfs_remove_bad_page_node() argument
1374 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_sysfs_remove_bad_page_node()
1376 if (adev->dev->kobj.sd) in amdgpu_ras_sysfs_remove_bad_page_node()
1377 sysfs_remove_file_from_group(&adev->dev->kobj, in amdgpu_ras_sysfs_remove_bad_page_node()
1382 static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) in amdgpu_ras_sysfs_remove_feature_node() argument
1384 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_sysfs_remove_feature_node()
1394 if (adev->dev->kobj.sd) in amdgpu_ras_sysfs_remove_feature_node()
1395 sysfs_remove_group(&adev->dev->kobj, &group); in amdgpu_ras_sysfs_remove_feature_node()
1400 int amdgpu_ras_sysfs_create(struct amdgpu_device *adev, in amdgpu_ras_sysfs_create() argument
1403 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); in amdgpu_ras_sysfs_create()
1422 if (sysfs_add_file_to_group(&adev->dev->kobj, in amdgpu_ras_sysfs_create()
1434 int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev, in amdgpu_ras_sysfs_remove() argument
1437 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); in amdgpu_ras_sysfs_remove()
1442 if (adev->dev->kobj.sd) in amdgpu_ras_sysfs_remove()
1443 sysfs_remove_file_from_group(&adev->dev->kobj, in amdgpu_ras_sysfs_remove()
1452 static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) in amdgpu_ras_sysfs_remove_all() argument
1454 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_sysfs_remove_all()
1458 amdgpu_ras_sysfs_remove(adev, &obj->head); in amdgpu_ras_sysfs_remove_all()
1462 amdgpu_ras_sysfs_remove_bad_page_node(adev); in amdgpu_ras_sysfs_remove_all()
1464 amdgpu_ras_sysfs_remove_feature_node(adev); in amdgpu_ras_sysfs_remove_all()
1489 static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) in amdgpu_ras_debugfs_create_ctrl_node() argument
1491 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_debugfs_create_ctrl_node()
1493 struct drm_minor *minor = adev_to_drm(adev)->primary; in amdgpu_ras_debugfs_create_ctrl_node()
1497 debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev, in amdgpu_ras_debugfs_create_ctrl_node()
1499 debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, dir, adev, in amdgpu_ras_debugfs_create_ctrl_node()
1504 debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled); in amdgpu_ras_debugfs_create_ctrl_node()
1505 debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled); in amdgpu_ras_debugfs_create_ctrl_node()
1506 debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev, in amdgpu_ras_debugfs_create_ctrl_node()
1509 S_IRUGO, dir, adev, in amdgpu_ras_debugfs_create_ctrl_node()
1532 static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, in amdgpu_ras_debugfs_create() argument
1536 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head); in amdgpu_ras_debugfs_create()
1551 void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) in amdgpu_ras_debugfs_create_all() argument
1553 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_debugfs_create_all()
1565 dir = amdgpu_ras_debugfs_create_ctrl_node(adev); in amdgpu_ras_debugfs_create_all()
1568 if (amdgpu_ras_is_supported(adev, obj->head.block) && in amdgpu_ras_debugfs_create_all()
1573 amdgpu_ras_debugfs_create(adev, &fs_info, dir); in amdgpu_ras_debugfs_create_all()
1585 static int amdgpu_ras_fs_init(struct amdgpu_device *adev) in amdgpu_ras_fs_init() argument
1587 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_fs_init()
1615 r = sysfs_create_group(&adev->dev->kobj, &group); in amdgpu_ras_fs_init()
1617 dev_err(adev->dev, "Failed to create RAS sysfs group!"); in amdgpu_ras_fs_init()
1622 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) in amdgpu_ras_fs_fini() argument
1624 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_fs_fini()
1629 ip_obj = amdgpu_ras_find_obj(adev, &con_obj->head); in amdgpu_ras_fs_fini()
1635 amdgpu_ras_sysfs_remove_all(adev); in amdgpu_ras_fs_fini()
1647 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) in amdgpu_ras_interrupt_fatal_error_handler() argument
1650 if (amdgpu_sriov_vf(adev)) in amdgpu_ras_interrupt_fatal_error_handler()
1653 if (adev->nbio.ras && in amdgpu_ras_interrupt_fatal_error_handler()
1654 adev->nbio.ras->handle_ras_controller_intr_no_bifring) in amdgpu_ras_interrupt_fatal_error_handler()
1655 adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev); in amdgpu_ras_interrupt_fatal_error_handler()
1657 if (adev->nbio.ras && in amdgpu_ras_interrupt_fatal_error_handler()
1658 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring) in amdgpu_ras_interrupt_fatal_error_handler()
1659 adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev); in amdgpu_ras_interrupt_fatal_error_handler()
1666 struct amdgpu_device *adev = obj->adev; in amdgpu_ras_interrupt_poison_consumption_handler() local
1668 amdgpu_ras_get_ras_block(adev, obj->head.block, 0); in amdgpu_ras_interrupt_poison_consumption_handler()
1678 poison_stat = block_obj->hw_ops->query_poison_status(adev); in amdgpu_ras_interrupt_poison_consumption_handler()
1681 dev_info(adev->dev, "No RAS poison status in %s poison IH.\n", in amdgpu_ras_interrupt_poison_consumption_handler()
1688 amdgpu_umc_poison_handler(adev, false); in amdgpu_ras_interrupt_poison_consumption_handler()
1691 poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); in amdgpu_ras_interrupt_poison_consumption_handler()
1695 dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", in amdgpu_ras_interrupt_poison_consumption_handler()
1697 amdgpu_ras_reset_gpu(adev); in amdgpu_ras_interrupt_poison_consumption_handler()
1699 amdgpu_gfx_poison_consumption_handler(adev, entry); in amdgpu_ras_interrupt_poison_consumption_handler()
1706 dev_info(obj->adev->dev, in amdgpu_ras_interrupt_poison_creation_handler()
1723 ret = data->cb(obj->adev, &err_data, entry); in amdgpu_ras_interrupt_umc_handler()
1752 if (amdgpu_ras_is_poison_mode_supported(obj->adev)) { in amdgpu_ras_interrupt_handler()
1761 dev_warn(obj->adev->dev, in amdgpu_ras_interrupt_handler()
1777 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, in amdgpu_ras_interrupt_dispatch() argument
1780 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head); in amdgpu_ras_interrupt_dispatch()
1802 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, in amdgpu_ras_interrupt_remove_handler() argument
1805 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); in amdgpu_ras_interrupt_remove_handler()
1824 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev, in amdgpu_ras_interrupt_add_handler() argument
1827 struct ras_manager *obj = amdgpu_ras_find_obj(adev, head); in amdgpu_ras_interrupt_add_handler()
1833 obj = amdgpu_ras_create_obj(adev, head); in amdgpu_ras_interrupt_add_handler()
1868 static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) in amdgpu_ras_interrupt_remove_all() argument
1870 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_interrupt_remove_all()
1874 amdgpu_ras_interrupt_remove_handler(adev, &obj->head); in amdgpu_ras_interrupt_remove_all()
1882 static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) in amdgpu_ras_log_on_err_counter() argument
1884 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_log_on_err_counter()
1887 if (!adev->ras_enabled || !con) in amdgpu_ras_log_on_err_counter()
1911 (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2))) in amdgpu_ras_log_on_err_counter()
1914 amdgpu_ras_query_error_status(adev, &info); in amdgpu_ras_log_on_err_counter()
1916 if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) && in amdgpu_ras_log_on_err_counter()
1917 adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) && in amdgpu_ras_log_on_err_counter()
1918 adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) { in amdgpu_ras_log_on_err_counter()
1919 if (amdgpu_ras_reset_error_status(adev, info.head.block)) in amdgpu_ras_log_on_err_counter()
1920 dev_warn(adev->dev, "Failed to reset error counter and error status"); in amdgpu_ras_log_on_err_counter()
1926 static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, in amdgpu_ras_error_status_query() argument
1938 block_obj = amdgpu_ras_get_ras_block(adev, in amdgpu_ras_error_status_query()
1943 dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", in amdgpu_ras_error_status_query()
1949 block_obj->hw_ops->query_ras_error_status(adev); in amdgpu_ras_error_status_query()
1953 static void amdgpu_ras_query_err_status(struct amdgpu_device *adev) in amdgpu_ras_query_err_status() argument
1955 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_query_err_status()
1958 if (!adev->ras_enabled || !con) in amdgpu_ras_query_err_status()
1966 amdgpu_ras_error_status_query(adev, &info); in amdgpu_ras_query_err_status()
1975 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, in amdgpu_ras_badpages_read() argument
1978 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_badpages_read()
2006 status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, in amdgpu_ras_badpages_read()
2025 struct amdgpu_device *adev = ras->adev; in amdgpu_ras_do_recovery() local
2029 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); in amdgpu_ras_do_recovery()
2032 if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { in amdgpu_ras_do_recovery()
2036 list_add_tail(&adev->gmc.xgmi.head, &device_list); in amdgpu_ras_do_recovery()
2049 if (amdgpu_device_should_recover_gpu(ras->adev)) { in amdgpu_ras_do_recovery()
2054 reset_context.reset_req_dev = adev; in amdgpu_ras_do_recovery()
2057 if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) in amdgpu_ras_do_recovery()
2074 psp_fatal_error_recovery_quirk(&adev->psp); in amdgpu_ras_do_recovery()
2078 amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context); in amdgpu_ras_do_recovery()
2084 static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, in amdgpu_ras_realloc_eh_data_space() argument
2108 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, in amdgpu_ras_add_bad_pages() argument
2111 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_add_bad_pages()
2130 amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { in amdgpu_ras_add_bad_pages()
2135 amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr, in amdgpu_ras_add_bad_pages()
2154 int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, in amdgpu_ras_save_bad_pages() argument
2157 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_save_bad_pages()
2176 *new_cnt = save_count / adev->umc.retire_unit; in amdgpu_ras_save_bad_pages()
2183 dev_err(adev->dev, "Failed to save EEPROM table data!"); in amdgpu_ras_save_bad_pages()
2187 dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); in amdgpu_ras_save_bad_pages()
2197 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) in amdgpu_ras_load_bad_pages() argument
2200 &adev->psp.ras_context.ras->eeprom_control; in amdgpu_ras_load_bad_pages()
2214 dev_err(adev->dev, "Failed to load EEPROM table records!"); in amdgpu_ras_load_bad_pages()
2216 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs); in amdgpu_ras_load_bad_pages()
2241 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, in amdgpu_ras_check_bad_page() argument
2244 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_check_bad_page()
2256 static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, in amdgpu_ras_validate_threshold() argument
2259 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_validate_threshold()
2282 u64 val = adev->gmc.mc_vram_size; in amdgpu_ras_validate_threshold()
2293 int amdgpu_ras_recovery_init(struct amdgpu_device *adev) in amdgpu_ras_recovery_init() argument
2295 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_recovery_init()
2301 if (!con || amdgpu_sriov_vf(adev)) in amdgpu_ras_recovery_init()
2309 con->adev = adev; in amdgpu_ras_recovery_init()
2311 if (!adev->ras_enabled) in amdgpu_ras_recovery_init()
2327 amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); in amdgpu_ras_recovery_init()
2333 if (adev->gmc.xgmi.pending_reset) in amdgpu_ras_recovery_init()
2344 ret = amdgpu_ras_load_bad_pages(adev); in amdgpu_ras_recovery_init()
2348 amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs); in amdgpu_ras_recovery_init()
2351 amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap); in amdgpu_ras_recovery_init()
2357 if ((adev->asic_type == CHIP_ALDEBARAN) && in amdgpu_ras_recovery_init()
2358 (adev->gmc.xgmi.connected_to_cpu)) in amdgpu_ras_recovery_init()
2359 amdgpu_register_bad_pages_mca_notifier(adev); in amdgpu_ras_recovery_init()
2368 dev_warn(adev->dev, "Failed to initialize ras recovery! (%d)\n", ret); in amdgpu_ras_recovery_init()
2382 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) in amdgpu_ras_recovery_fini() argument
2384 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_recovery_fini()
2403 static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) in amdgpu_ras_asic_supported() argument
2405 if (amdgpu_sriov_vf(adev)) { in amdgpu_ras_asic_supported()
2406 switch (adev->ip_versions[MP0_HWIP][0]) { in amdgpu_ras_asic_supported()
2415 if (adev->asic_type == CHIP_IP_DISCOVERY) { in amdgpu_ras_asic_supported()
2416 switch (adev->ip_versions[MP0_HWIP][0]) { in amdgpu_ras_asic_supported()
2426 return adev->asic_type == CHIP_VEGA10 || in amdgpu_ras_asic_supported()
2427 adev->asic_type == CHIP_VEGA20 || in amdgpu_ras_asic_supported()
2428 adev->asic_type == CHIP_ARCTURUS || in amdgpu_ras_asic_supported()
2429 adev->asic_type == CHIP_ALDEBARAN || in amdgpu_ras_asic_supported()
2430 adev->asic_type == CHIP_SIENNA_CICHLID; in amdgpu_ras_asic_supported()
2438 static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) in amdgpu_ras_get_quirks() argument
2440 struct atom_context *ctx = adev->mode_info.atom_context; in amdgpu_ras_get_quirks()
2449 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); in amdgpu_ras_get_quirks()
2461 static void amdgpu_ras_check_supported(struct amdgpu_device *adev) in amdgpu_ras_check_supported() argument
2463 adev->ras_hw_enabled = adev->ras_enabled = 0; in amdgpu_ras_check_supported()
2465 if (!amdgpu_ras_asic_supported(adev)) in amdgpu_ras_check_supported()
2468 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { in amdgpu_ras_check_supported()
2469 if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { in amdgpu_ras_check_supported()
2470 dev_info(adev->dev, "MEM ECC is active.\n"); in amdgpu_ras_check_supported()
2471 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | in amdgpu_ras_check_supported()
2474 dev_info(adev->dev, "MEM ECC is not presented.\n"); in amdgpu_ras_check_supported()
2477 if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { in amdgpu_ras_check_supported()
2478 dev_info(adev->dev, "SRAM ECC is active.\n"); in amdgpu_ras_check_supported()
2479 if (!amdgpu_sriov_vf(adev)) in amdgpu_ras_check_supported()
2480 adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | in amdgpu_ras_check_supported()
2483 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF | in amdgpu_ras_check_supported()
2490 if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0) || in amdgpu_ras_check_supported()
2491 adev->ip_versions[VCN_HWIP][0] == IP_VERSION(4, 0, 0)) in amdgpu_ras_check_supported()
2492 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | in amdgpu_ras_check_supported()
2495 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | in amdgpu_ras_check_supported()
2502 if (!adev->gmc.xgmi.num_physical_nodes) in amdgpu_ras_check_supported()
2503 adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__XGMI_WAFL); in amdgpu_ras_check_supported()
2505 dev_info(adev->dev, "SRAM ECC is not presented.\n"); in amdgpu_ras_check_supported()
2510 adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX | in amdgpu_ras_check_supported()
2515 amdgpu_ras_get_quirks(adev); in amdgpu_ras_check_supported()
2518 adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; in amdgpu_ras_check_supported()
2525 if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 6) && in amdgpu_ras_check_supported()
2526 adev->gmc.is_app_apu) in amdgpu_ras_check_supported()
2527 adev->ras_enabled = amdgpu_ras_enable != 1 ? 0 : in amdgpu_ras_check_supported()
2528 adev->ras_hw_enabled & amdgpu_ras_mask; in amdgpu_ras_check_supported()
2530 adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : in amdgpu_ras_check_supported()
2531 adev->ras_hw_enabled & amdgpu_ras_mask; in amdgpu_ras_check_supported()
2538 struct amdgpu_device *adev = con->adev; in amdgpu_ras_counte_dw() local
2539 struct drm_device *dev = adev_to_drm(adev); in amdgpu_ras_counte_dw()
2549 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL) == 0) { in amdgpu_ras_counte_dw()
2559 static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) in amdgpu_ras_query_poison_mode() argument
2561 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_query_poison_mode()
2565 if (amdgpu_sriov_vf(adev) || !con) in amdgpu_ras_query_poison_mode()
2569 if (adev->gmc.xgmi.connected_to_cpu) { in amdgpu_ras_query_poison_mode()
2572 } else if (adev->df.funcs && in amdgpu_ras_query_poison_mode()
2573 adev->df.funcs->query_ras_poison_mode && in amdgpu_ras_query_poison_mode()
2574 adev->umc.ras && in amdgpu_ras_query_poison_mode()
2575 adev->umc.ras->query_ras_poison_mode) { in amdgpu_ras_query_poison_mode()
2577 adev->df.funcs->query_ras_poison_mode(adev); in amdgpu_ras_query_poison_mode()
2579 adev->umc.ras->query_ras_poison_mode(adev); in amdgpu_ras_query_poison_mode()
2585 dev_warn(adev->dev, in amdgpu_ras_query_poison_mode()
2591 int amdgpu_ras_init(struct amdgpu_device *adev) in amdgpu_ras_init() argument
2593 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_init()
2606 con->adev = adev; in amdgpu_ras_init()
2613 amdgpu_ras_set_context(adev, con); in amdgpu_ras_init()
2615 amdgpu_ras_check_supported(adev); in amdgpu_ras_init()
2617 if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) { in amdgpu_ras_init()
2621 if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) { in amdgpu_ras_init()
2640 switch (adev->ip_versions[NBIO_HWIP][0]) { in amdgpu_ras_init()
2644 if (!adev->gmc.xgmi.connected_to_cpu) in amdgpu_ras_init()
2645 adev->nbio.ras = &nbio_v7_4_ras; in amdgpu_ras_init()
2648 if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF)) in amdgpu_ras_init()
2655 adev->nbio.ras = &nbio_v4_3_ras; in amdgpu_ras_init()
2658 if (!adev->gmc.is_app_apu) in amdgpu_ras_init()
2659 adev->nbio.ras = &nbio_v7_9_ras; in amdgpu_ras_init()
2668 r = amdgpu_nbio_ras_sw_init(adev); in amdgpu_ras_init()
2672 if (adev->nbio.ras && in amdgpu_ras_init()
2673 adev->nbio.ras->init_ras_controller_interrupt) { in amdgpu_ras_init()
2674 r = adev->nbio.ras->init_ras_controller_interrupt(adev); in amdgpu_ras_init()
2679 if (adev->nbio.ras && in amdgpu_ras_init()
2680 adev->nbio.ras->init_ras_err_event_athub_interrupt) { in amdgpu_ras_init()
2681 r = adev->nbio.ras->init_ras_err_event_athub_interrupt(adev); in amdgpu_ras_init()
2686 amdgpu_ras_query_poison_mode(adev); in amdgpu_ras_init()
2688 if (amdgpu_ras_fs_init(adev)) { in amdgpu_ras_init()
2693 dev_info(adev->dev, "RAS INFO: ras initialized successfully, " in amdgpu_ras_init()
2695 adev->ras_hw_enabled, adev->ras_enabled); in amdgpu_ras_init()
2699 amdgpu_ras_set_context(adev, NULL); in amdgpu_ras_init()
2705 int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev) in amdgpu_persistent_edc_harvesting_supported() argument
2707 if (adev->gmc.xgmi.connected_to_cpu || in amdgpu_persistent_edc_harvesting_supported()
2708 adev->gmc.is_app_apu) in amdgpu_persistent_edc_harvesting_supported()
2713 static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev, in amdgpu_persistent_edc_harvesting() argument
2720 if (!amdgpu_persistent_edc_harvesting_supported(adev)) in amdgpu_persistent_edc_harvesting()
2723 if (amdgpu_ras_query_error_status(adev, &info) != 0) in amdgpu_persistent_edc_harvesting()
2726 if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0) in amdgpu_persistent_edc_harvesting()
2732 bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev) in amdgpu_ras_is_poison_mode_supported() argument
2734 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_is_poison_mode_supported()
2743 int amdgpu_ras_block_late_init(struct amdgpu_device *adev, in amdgpu_ras_block_late_init() argument
2747 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_block_late_init()
2753 if (!amdgpu_ras_is_supported(adev, ras_block->block)) { in amdgpu_ras_block_late_init()
2754 amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0); in amdgpu_ras_block_late_init()
2758 r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1); in amdgpu_ras_block_late_init()
2760 if (adev->in_suspend || amdgpu_in_reset(adev)) { in amdgpu_ras_block_late_init()
2769 amdgpu_persistent_edc_harvesting(adev, ras_block); in amdgpu_ras_block_late_init()
2772 if (adev->in_suspend || amdgpu_in_reset(adev)) in amdgpu_ras_block_late_init()
2779 r = amdgpu_ras_interrupt_add_handler(adev, ras_block); in amdgpu_ras_block_late_init()
2787 r = amdgpu_ras_sysfs_create(adev, ras_block); in amdgpu_ras_block_late_init()
2798 if (amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, query_info) == 0) { in amdgpu_ras_block_late_init()
2810 amdgpu_ras_interrupt_remove_handler(adev, ras_block); in amdgpu_ras_block_late_init()
2812 amdgpu_ras_feature_enable(adev, ras_block, 0); in amdgpu_ras_block_late_init()
2816 static int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev, in amdgpu_ras_block_late_init_default() argument
2819 return amdgpu_ras_block_late_init(adev, ras_block); in amdgpu_ras_block_late_init_default()
2823 void amdgpu_ras_block_late_fini(struct amdgpu_device *adev, in amdgpu_ras_block_late_fini() argument
2830 amdgpu_ras_sysfs_remove(adev, ras_block); in amdgpu_ras_block_late_fini()
2834 amdgpu_ras_interrupt_remove_handler(adev, ras_block); in amdgpu_ras_block_late_fini()
2837 static void amdgpu_ras_block_late_fini_default(struct amdgpu_device *adev, in amdgpu_ras_block_late_fini_default() argument
2840 return amdgpu_ras_block_late_fini(adev, ras_block); in amdgpu_ras_block_late_fini_default()
2846 void amdgpu_ras_resume(struct amdgpu_device *adev) in amdgpu_ras_resume() argument
2848 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_resume()
2851 if (!adev->ras_enabled || !con) { in amdgpu_ras_resume()
2853 amdgpu_release_ras_context(adev); in amdgpu_ras_resume()
2864 amdgpu_ras_enable_all_features(adev, 1); in amdgpu_ras_resume()
2871 if (!amdgpu_ras_is_supported(adev, obj->head.block)) { in amdgpu_ras_resume()
2872 amdgpu_ras_feature_enable(adev, &obj->head, 0); in amdgpu_ras_resume()
2880 void amdgpu_ras_suspend(struct amdgpu_device *adev) in amdgpu_ras_suspend() argument
2882 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_suspend()
2884 if (!adev->ras_enabled || !con) in amdgpu_ras_suspend()
2887 amdgpu_ras_disable_all_features(adev, 0); in amdgpu_ras_suspend()
2890 amdgpu_ras_disable_all_features(adev, 1); in amdgpu_ras_suspend()
2893 int amdgpu_ras_late_init(struct amdgpu_device *adev) in amdgpu_ras_late_init() argument
2900 if (amdgpu_sriov_vf(adev)) in amdgpu_ras_late_init()
2903 list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { in amdgpu_ras_late_init()
2905 dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); in amdgpu_ras_late_init()
2911 r = obj->ras_late_init(adev, &obj->ras_comm); in amdgpu_ras_late_init()
2913 dev_err(adev->dev, "%s failed to execute ras_late_init! ret:%d\n", in amdgpu_ras_late_init()
2918 amdgpu_ras_block_late_init_default(adev, &obj->ras_comm); in amdgpu_ras_late_init()
2925 int amdgpu_ras_pre_fini(struct amdgpu_device *adev) in amdgpu_ras_pre_fini() argument
2927 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_pre_fini()
2929 if (!adev->ras_enabled || !con) in amdgpu_ras_pre_fini()
2935 amdgpu_ras_disable_all_features(adev, 0); in amdgpu_ras_pre_fini()
2936 amdgpu_ras_recovery_fini(adev); in amdgpu_ras_pre_fini()
2940 int amdgpu_ras_fini(struct amdgpu_device *adev) in amdgpu_ras_fini() argument
2944 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_ras_fini()
2946 if (!adev->ras_enabled || !con) in amdgpu_ras_fini()
2949 list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { in amdgpu_ras_fini()
2952 if (amdgpu_ras_is_supported(adev, obj->ras_comm.block) && in amdgpu_ras_fini()
2954 obj->ras_fini(adev, &obj->ras_comm); in amdgpu_ras_fini()
2956 amdgpu_ras_block_late_fini_default(adev, &obj->ras_comm); in amdgpu_ras_fini()
2964 amdgpu_ras_fs_fini(adev); in amdgpu_ras_fini()
2965 amdgpu_ras_interrupt_remove_all(adev); in amdgpu_ras_fini()
2970 amdgpu_ras_disable_all_features(adev, 1); in amdgpu_ras_fini()
2974 amdgpu_ras_set_context(adev, NULL); in amdgpu_ras_fini()
2980 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) in amdgpu_ras_global_ras_isr() argument
2983 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_global_ras_isr()
2985 dev_info(adev->dev, "uncorrectable hardware error" in amdgpu_ras_global_ras_isr()
2989 amdgpu_ras_reset_gpu(adev); in amdgpu_ras_global_ras_isr()
2993 bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) in amdgpu_ras_need_emergency_restart() argument
2995 if (adev->asic_type == CHIP_VEGA20 && in amdgpu_ras_need_emergency_restart()
2996 adev->pm.fw_version <= 0x283400) { in amdgpu_ras_need_emergency_restart()
2997 return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) && in amdgpu_ras_need_emergency_restart()
3004 void amdgpu_release_ras_context(struct amdgpu_device *adev) in amdgpu_release_ras_context() argument
3006 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); in amdgpu_release_ras_context()
3011 if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { in amdgpu_release_ras_context()
3013 amdgpu_ras_set_context(adev, NULL); in amdgpu_release_ras_context()
3022 struct amdgpu_device *adev = NULL; in find_adev() local
3025 adev = mce_adev_list.devs[i]; in find_adev()
3027 if (adev && adev->gmc.xgmi.connected_to_cpu && in find_adev()
3028 adev->gmc.xgmi.physical_node_id == node_id) in find_adev()
3030 adev = NULL; in find_adev()
3033 return adev; in find_adev()
3045 struct amdgpu_device *adev = NULL; in amdgpu_bad_page_notifier() local
3069 adev = find_adev(gpu_id); in amdgpu_bad_page_notifier()
3070 if (!adev) { in amdgpu_bad_page_notifier()
3083 dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d", in amdgpu_bad_page_notifier()
3086 if (!amdgpu_umc_page_retirement_mca(adev, m->addr, ch_inst, umc_inst)) in amdgpu_bad_page_notifier()
3097 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev) in amdgpu_register_bad_pages_mca_notifier() argument
3107 mce_adev_list.devs[mce_adev_list.num_gpu++] = adev; in amdgpu_register_bad_pages_mca_notifier()
3120 struct amdgpu_ras *amdgpu_ras_get_context(struct amdgpu_device *adev) in amdgpu_ras_get_context() argument
3122 if (!adev) in amdgpu_ras_get_context()
3125 return adev->psp.ras_context.ras; in amdgpu_ras_get_context()
3128 int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con) in amdgpu_ras_set_context() argument
3130 if (!adev) in amdgpu_ras_set_context()
3133 adev->psp.ras_context.ras = ras_con; in amdgpu_ras_set_context()
3138 int amdgpu_ras_is_supported(struct amdgpu_device *adev, in amdgpu_ras_is_supported() argument
3142 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_is_supported()
3147 ret = ras && (adev->ras_enabled & (1 << block)); in amdgpu_ras_is_supported()
3160 amdgpu_ras_is_poison_mode_supported(adev) && in amdgpu_ras_is_supported()
3161 amdgpu_ras_get_ras_block(adev, block, 0)) in amdgpu_ras_is_supported()
3167 int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) in amdgpu_ras_reset_gpu() argument
3169 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); in amdgpu_ras_reset_gpu()
3172 amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); in amdgpu_ras_reset_gpu()
3178 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, in amdgpu_ras_register_ras_block() argument
3182 if (!adev || !ras_block_obj) in amdgpu_ras_register_ras_block()
3191 list_add_tail(&ras_node->node, &adev->ras_list); in amdgpu_ras_register_ras_block()
3214 bool amdgpu_ras_inst_get_memory_id_field(struct amdgpu_device *adev, in amdgpu_ras_inst_get_memory_id_field() argument
3238 bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev, in amdgpu_ras_inst_get_err_cnt_field() argument
3256 dev_dbg(adev->dev, "Invalid err_info field\n"); in amdgpu_ras_inst_get_err_cnt_field()
3264 void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, in amdgpu_ras_inst_query_ras_error_count() argument
3280 if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i], in amdgpu_ras_inst_query_ras_error_count()
3285 if (!amdgpu_ras_inst_get_err_cnt_field(adev, &reg_list[i], in amdgpu_ras_inst_query_ras_error_count()
3296 dev_info(adev->dev, in amdgpu_ras_inst_query_ras_error_count()
3304 dev_info(adev->dev, in amdgpu_ras_inst_query_ras_error_count()
3316 void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev, in amdgpu_ras_inst_reset_ras_error_count() argument