diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 538 |
1 files changed, 355 insertions, 183 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index affde2de2a0d..eb7cfe87042e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -64,9 +64,11 @@ #include "amdgpu_xgmi.h" #include "amdgpu_ras.h" #include "amdgpu_pmu.h" +#include "amdgpu_fru_eeprom.h" #include <linux/suspend.h> #include <drm/task_barrier.h> +#include <linux/pm_runtime.h> MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); @@ -78,6 +80,8 @@ MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); +MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin"); +MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin"); #define AMDGPU_RESUME_MS 2000 @@ -110,6 +114,8 @@ const char *amdgpu_asic_name[] = { "NAVI10", "NAVI14", "NAVI12", + "SIENNA_CICHLID", + "NAVY_FLOUNDER", "LAST", }; @@ -138,6 +144,72 @@ static DEVICE_ATTR(pcie_replay_count, S_IRUGO, static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev); /** + * DOC: product_name + * + * The amdgpu driver provides a sysfs API for reporting the product name + * for the device + * The file serial_number is used for this and returns the product name + * as returned from the FRU. + * NOTE: This is only available for certain server cards + */ + +static ssize_t amdgpu_device_get_product_name(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + + return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name); +} + +static DEVICE_ATTR(product_name, S_IRUGO, + amdgpu_device_get_product_name, NULL); + +/** + * DOC: product_number + * + * The amdgpu driver provides a sysfs API for reporting the part number + * for the device + * The file serial_number is used for this and returns the part number + * as returned from the FRU. + * NOTE: This is only available for certain server cards + */ + +static ssize_t amdgpu_device_get_product_number(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + + return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number); +} + +static DEVICE_ATTR(product_number, S_IRUGO, + amdgpu_device_get_product_number, NULL); + +/** + * DOC: serial_number + * + * The amdgpu driver provides a sysfs API for reporting the serial number + * for the device + * The file serial_number is used for this and returns the serial number + * as returned from the FRU. + * NOTE: This is only available for certain server cards + */ + +static ssize_t amdgpu_device_get_serial_number(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = ddev->dev_private; + + return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial); +} + +static DEVICE_ATTR(serial_number, S_IRUGO, + amdgpu_device_get_serial_number, NULL); + +/** * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control * * @dev: drm_device pointer @@ -247,10 +319,10 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, { uint32_t ret; - if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))) + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) return amdgpu_kiq_rreg(adev, reg); - if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX)) + if ((reg * 4) < adev->rmmio_size) ret = readl(((void __iomem *)adev->rmmio) + (reg * 4)); else { unsigned long flags; @@ -310,7 +382,7 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, { trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); - if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX)) + if ((reg * 4) < adev->rmmio_size) writel(v, ((void __iomem *)adev->rmmio) + (reg * 4)); else { unsigned long flags; @@ -320,10 +392,6 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4)); spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); } - - if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) { - udelay(500); - } } /** @@ -339,11 +407,7 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) { - if (adev->asic_type >= CHIP_VEGA10 && reg == 0) { - adev->last_mm_index = v; - } - - if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))) + if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) return amdgpu_kiq_wreg(adev, reg, v); amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags); @@ -397,20 +461,12 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) */ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) { - if (adev->asic_type >= CHIP_VEGA10 && reg == 0) { - adev->last_mm_index = v; - } - if ((reg * 4) < adev->rio_mem_size) iowrite32(v, adev->rio_mem + (reg * 4)); else { iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4)); iowrite32(v, adev->rio_mem + (mmMM_DATA * 4)); } - - if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) { - udelay(500); - } } /** @@ -866,6 +922,11 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev)) return 0; + /* skip if the bios has already enabled large BAR */ + if (adev->gmc.real_vram_size && + (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size)) + return 0; + /* Check if the root BUS has 64bit memory resources */ root = adev->pdev->bus; while (root->parent) @@ -1118,6 +1179,16 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) amdgpu_vm_fragment_size = -1; } + if (amdgpu_sched_hw_submission < 2) { + dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n", + amdgpu_sched_hw_submission); + amdgpu_sched_hw_submission = 2; + } else if (!is_power_of_2(amdgpu_sched_hw_submission)) { + dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n", + amdgpu_sched_hw_submission); + amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission); + } + amdgpu_device_check_smu_prv_buffer_size(adev); amdgpu_device_check_vm_size(adev); @@ -1126,6 +1197,8 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); + amdgpu_gmc_tmz_set(adev); + return 0; } @@ -1147,7 +1220,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero return; if (state == VGA_SWITCHEROO_ON) { - pr_info("amdgpu: switched on\n"); + pr_info("switched on\n"); /* don't suspend or resume card normally */ dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; @@ -1161,7 +1234,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero dev->switch_power_state = DRM_SWITCH_POWER_ON; drm_kms_helper_poll_enable(dev); } else { - pr_info("amdgpu: switched off\n"); + pr_info("switched off\n"); drm_kms_helper_poll_disable(dev); dev->switch_power_state = DRM_SWITCH_POWER_CHANGING; amdgpu_device_suspend(dev, true); @@ -1484,22 +1557,25 @@ static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev) static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) { const char *chip_name; - char fw_name[30]; + char fw_name[40]; int err; const struct gpu_info_firmware_header_v1_0 *hdr; adev->firmware.gpu_info_fw = NULL; + if (adev->discovery_bin) { + amdgpu_discovery_get_gfx_info(adev); + + /* + * FIXME: The bounding box is still needed by Navi12, so + * temporarily read it from gpu_info firmware. Should be droped + * when DAL no longer needs it. + */ + if (adev->asic_type != CHIP_NAVI12) + return 0; + } + switch (adev->asic_type) { - case CHIP_TOPAZ: - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: - case CHIP_POLARIS12: - case CHIP_VEGAM: - case CHIP_CARRIZO: - case CHIP_STONEY: #ifdef CONFIG_DRM_AMDGPU_SI case CHIP_VERDE: case CHIP_TAHITI: @@ -1514,6 +1590,15 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) case CHIP_KABINI: case CHIP_MULLINS: #endif + case CHIP_TOPAZ: + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + case CHIP_POLARIS12: + case CHIP_VEGAM: + case CHIP_CARRIZO: + case CHIP_STONEY: case CHIP_VEGA20: default: return 0; @@ -1524,9 +1609,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) chip_name = "vega12"; break; case CHIP_RAVEN: - if (adev->rev_id >= 8) + if (adev->apu_flags & AMD_APU_IS_RAVEN2) chip_name = "raven2"; - else if (adev->pdev->device == 0x15d8) + else if (adev->apu_flags & AMD_APU_IS_PICASSO) chip_name = "picasso"; else chip_name = "raven"; @@ -1546,6 +1631,12 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) case CHIP_NAVI12: chip_name = "navi12"; break; + case CHIP_SIENNA_CICHLID: + chip_name = "sienna_cichlid"; + break; + case CHIP_NAVY_FLOUNDER: + chip_name = "navy_flounder"; + break; } snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name); @@ -1574,7 +1665,10 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + le32_to_cpu(hdr->header.ucode_array_offset_bytes)); - if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) + /* + * Should be droped when DAL no longer needs it. + */ + if (adev->asic_type == CHIP_NAVI12) goto parse_soc_bounding_box; adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); @@ -1608,7 +1702,7 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) parse_soc_bounding_box: /* * soc bounding box info is not integrated in disocovery table, - * we always need to parse it from gpu info firmware. + * we always need to parse it from gpu info firmware if needed. */ if (hdr->version_minor == 2) { const struct gpu_info_firmware_v1_2 *gpu_info_fw = @@ -1644,25 +1738,13 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) amdgpu_device_enable_virtual_display(adev); - switch (adev->asic_type) { - case CHIP_TOPAZ: - case CHIP_TONGA: - case CHIP_FIJI: - case CHIP_POLARIS10: - case CHIP_POLARIS11: - case CHIP_POLARIS12: - case CHIP_VEGAM: - case CHIP_CARRIZO: - case CHIP_STONEY: - if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY) - adev->family = AMDGPU_FAMILY_CZ; - else - adev->family = AMDGPU_FAMILY_VI; - - r = vi_set_ip_blocks(adev); + if (amdgpu_sriov_vf(adev)) { + r = amdgpu_virt_request_full_gpu(adev, true); if (r) return r; - break; + } + + switch (adev->asic_type) { #ifdef CONFIG_DRM_AMDGPU_SI case CHIP_VERDE: case CHIP_TAHITI: @@ -1681,24 +1763,41 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) case CHIP_KAVERI: case CHIP_KABINI: case CHIP_MULLINS: - if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII)) - adev->family = AMDGPU_FAMILY_CI; - else + if (adev->flags & AMD_IS_APU) adev->family = AMDGPU_FAMILY_KV; + else + adev->family = AMDGPU_FAMILY_CI; r = cik_set_ip_blocks(adev); if (r) return r; break; #endif + case CHIP_TOPAZ: + case CHIP_TONGA: + case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: + case CHIP_POLARIS12: + case CHIP_VEGAM: + case CHIP_CARRIZO: + case CHIP_STONEY: + if (adev->flags & AMD_IS_APU) + adev->family = AMDGPU_FAMILY_CZ; + else + adev->family = AMDGPU_FAMILY_VI; + + r = vi_set_ip_blocks(adev); + if (r) + return r; + break; case CHIP_VEGA10: case CHIP_VEGA12: case CHIP_VEGA20: case CHIP_RAVEN: case CHIP_ARCTURUS: case CHIP_RENOIR: - if (adev->asic_type == CHIP_RAVEN || - adev->asic_type == CHIP_RENOIR) + if (adev->flags & AMD_IS_APU) adev->family = AMDGPU_FAMILY_RV; else adev->family = AMDGPU_FAMILY_AI; @@ -1710,6 +1809,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) case CHIP_NAVI10: case CHIP_NAVI14: case CHIP_NAVI12: + case CHIP_SIENNA_CICHLID: + case CHIP_NAVY_FLOUNDER: adev->family = AMDGPU_FAMILY_NV; r = nv_set_ip_blocks(adev); @@ -1721,21 +1822,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) return -EINVAL; } - r = amdgpu_device_parse_gpu_info_fw(adev); - if (r) - return r; - - if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) - amdgpu_discovery_get_gfx_info(adev); - amdgpu_amdkfd_device_probe(adev); - if (amdgpu_sriov_vf(adev)) { - r = amdgpu_virt_request_full_gpu(adev, true); - if (r) - return -EAGAIN; - } - adev->pm.pp_feature = amdgpu_pp_feature_mask; if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) adev->pm.pp_feature &= ~PP_GFXOFF_MASK; @@ -1763,6 +1851,10 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) } /* get the vbios after the asic_funcs are set up */ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { + r = amdgpu_device_parse_gpu_info_fw(adev); + if (r) + return r; + /* Read BIOS */ if (!amdgpu_get_bios(adev)) return -EINVAL; @@ -1975,6 +2067,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) amdgpu_xgmi_add_device(adev); amdgpu_amdkfd_device_init(adev); + amdgpu_fru_get_product_info(adev); + init_failed: if (amdgpu_sriov_vf(adev)) amdgpu_virt_release_full_gpu(adev, true); @@ -2171,6 +2265,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) adev->ip_blocks[i].status.late_initialized = true; } + amdgpu_ras_set_error_query_ready(adev, true); + amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE); @@ -2203,7 +2299,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) if (gpu_instance->adev->flags & AMD_IS_APU) continue; - r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0); + r = amdgpu_xgmi_set_pstate(gpu_instance->adev, + AMDGPU_XGMI_PSTATE_MIN); if (r) { DRM_ERROR("pstate setting failed (%d).\n", r); break; @@ -2232,6 +2329,9 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) { int i, r; + if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) + amdgpu_virt_release_ras_err_handler_data(adev); + amdgpu_ras_pre_fini(adev); if (adev->gmc.xgmi.num_physical_nodes > 1) @@ -2362,18 +2462,21 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev) for (i = adev->num_ip_blocks - 1; i >= 0; i--) { if (!adev->ip_blocks[i].status.valid) continue; + /* displays are handled separately */ - if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) { - /* XXX handle errors */ - r = adev->ip_blocks[i].version->funcs->suspend(adev); - /* XXX handle errors */ - if (r) { - DRM_ERROR("suspend of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); - return r; - } - adev->ip_blocks[i].status.hw = false; + if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE) + continue; + + /* XXX handle errors */ + r = adev->ip_blocks[i].version->funcs->suspend(adev); + /* XXX handle errors */ + if (r) { + DRM_ERROR("suspend of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, r); + return r; } + + adev->ip_blocks[i].status.hw = false; } return 0; @@ -2471,6 +2574,9 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) AMD_IP_BLOCK_TYPE_IH, }; + for (i = 0; i < adev->num_ip_blocks; i++) + adev->ip_blocks[i].status.hw = false; + for (i = 0; i < ARRAY_SIZE(ip_order); i++) { int j; struct amdgpu_ip_block *block; @@ -2478,7 +2584,6 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) for (j = 0; j < adev->num_ip_blocks; j++) { block = &adev->ip_blocks[j]; - block->status.hw = false; if (block->version->type != ip_order[i] || !block->status.valid) continue; @@ -2703,6 +2808,10 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) case CHIP_NAVI12: case CHIP_RENOIR: #endif +#if defined(CONFIG_DRM_AMD_DC_DCN3_0) + case CHIP_SIENNA_CICHLID: + case CHIP_NAVY_FLOUNDER: +#endif return amdgpu_dc != 0; #endif default: @@ -2785,12 +2894,12 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) * By default timeout for non compute jobs is 10000. * And there is no timeout enforced on compute jobs. * In SR-IOV or passthrough mode, timeout for compute - * jobs are 10000 by default. + * jobs are 60000 by default. */ adev->gfx_timeout = msecs_to_jiffies(10000); adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) - adev->compute_timeout = adev->gfx_timeout; + adev->compute_timeout = msecs_to_jiffies(60000); else adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; @@ -2841,6 +2950,14 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) return ret; } +static const struct attribute *amdgpu_dev_attributes[] = { + &dev_attr_product_name.attr, + &dev_attr_product_number.attr, + &dev_attr_serial_number.attr, + &dev_attr_pcie_replay_count.attr, + NULL +}; + /** * amdgpu_device_init - initialize the driver * @@ -2942,9 +3059,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_LIST_HEAD(&adev->shadow_list); mutex_init(&adev->shadow_list_lock); - INIT_LIST_HEAD(&adev->ring_lru_list); - spin_lock_init(&adev->ring_lru_list_lock); - INIT_DELAYED_WORK(&adev->delayed_init_work, amdgpu_device_delayed_init_work_handler); INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, @@ -2953,7 +3067,18 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); adev->gfx.gfx_off_req_count = 1; - adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false; + adev->pm.ac_power = power_supply_is_system_supplied() > 0; + + atomic_set(&adev->throttling_logging_enabled, 1); + /* + * If throttling continues, logging will be performed every minute + * to avoid log flooding. "-1" is subtracted since the thermal + * throttling interrupt comes every second. Thus, the total logging + * interval is 59 seconds(retelimited printk interval) + 1(waiting + * for throttling interrupt) = 60 seconds. + */ + ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); + ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); /* Registers mapping */ /* TODO: block userspace mapping of io register */ @@ -3002,18 +3127,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10) adev->enable_mes = true; - if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) { - r = amdgpu_discovery_init(adev); - if (r) { - dev_err(adev->dev, "amdgpu_discovery_init failed\n"); - return r; - } - } - - /* early init functions */ - r = amdgpu_device_ip_early_init(adev); - if (r) - return r; + /* detect hw virtualization here */ + amdgpu_detect_virtualization(adev); r = amdgpu_device_get_job_timeout_settings(adev); if (r) { @@ -3021,6 +3136,11 @@ int amdgpu_device_init(struct amdgpu_device *adev, return r; } + /* early init functions */ + r = amdgpu_device_ip_early_init(adev); + if (r) + return r; + /* doorbell bar mapping and doorbell index init*/ amdgpu_device_doorbell_init(adev); @@ -3127,14 +3247,13 @@ fence_driver_init: goto failed; } - DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", + dev_info(adev->dev, + "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n", adev->gfx.config.max_shader_engines, adev->gfx.config.max_sh_per_se, adev->gfx.config.max_cu_per_sh, adev->gfx.cu_info.number); - amdgpu_ctx_init_sched(adev); - adev->accel_working = true; amdgpu_vm_check_compute_bug(adev); @@ -3199,9 +3318,12 @@ fence_driver_init: queue_delayed_work(system_wq, &adev->delayed_init_work, msecs_to_jiffies(AMDGPU_RESUME_MS)); - r = device_create_file(adev->dev, &dev_attr_pcie_replay_count); + if (amdgpu_sriov_vf(adev)) + flush_delayed_work(&adev->delayed_init_work); + + r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes); if (r) { - dev_err(adev->dev, "Could not create pcie_replay_count"); + dev_err(adev->dev, "Could not create amdgpu device attr\n"); return r; } @@ -3255,10 +3377,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) amdgpu_pm_sysfs_fini(adev); amdgpu_fbdev_fini(adev); r = amdgpu_device_ip_fini(adev); - if (adev->firmware.gpu_info_fw) { - release_firmware(adev->firmware.gpu_info_fw); - adev->firmware.gpu_info_fw = NULL; - } + release_firmware(adev->firmware.gpu_info_fw); + adev->firmware.gpu_info_fw = NULL; adev->accel_working = false; /* free i2c buses */ if (!amdgpu_device_has_dc_support(adev)) @@ -3284,12 +3404,13 @@ void amdgpu_device_fini(struct amdgpu_device *adev) adev->rmmio = NULL; amdgpu_device_doorbell_fini(adev); - device_remove_file(adev->dev, &dev_attr_pcie_replay_count); if (adev->ucode_sysfs_en) amdgpu_ucode_sysfs_fini(adev); + + sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); if (IS_ENABLED(CONFIG_PERF_EVENTS)) amdgpu_pmu_fini(adev); - if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) + if (adev->discovery_bin) amdgpu_discovery_fini(adev); } @@ -3301,7 +3422,6 @@ void amdgpu_device_fini(struct amdgpu_device *adev) * amdgpu_device_suspend - initiate device suspend * * @dev: drm dev pointer - * @suspend: suspend state * @fbcon : notify the fbdev of suspend * * Puts the hw in the suspend state (all asics). @@ -3398,7 +3518,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) * amdgpu_device_resume - initiate device resume * * @dev: drm dev pointer - * @resume: resume state * @fbcon : notify the fbdev of resume * * Bring the hw back to operating state (all asics). @@ -3754,6 +3873,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; + amdgpu_amdkfd_pre_reset(adev); + /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) @@ -3827,6 +3948,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) case CHIP_NAVI10: case CHIP_NAVI14: case CHIP_NAVI12: + case CHIP_SIENNA_CICHLID: break; default: goto disabled; @@ -3848,6 +3970,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, int i, r = 0; bool need_full_reset = *need_full_reset_arg; + amdgpu_debugfs_wait_dump(adev); + /* block all schedulers and reset given job's ring */ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = adev->rings[i]; @@ -4052,6 +4176,64 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) mutex_unlock(&adev->lock_reset); } +static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev) +{ + struct pci_dev *p = NULL; + + p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), + adev->pdev->bus->number, 1); + if (p) { + pm_runtime_enable(&(p->dev)); + pm_runtime_resume(&(p->dev)); + } +} + +static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev) +{ + enum amd_reset_method reset_method; + struct pci_dev *p = NULL; + u64 expires; + + /* + * For now, only BACO and mode1 reset are confirmed + * to suffer the audio issue without proper suspended. + */ + reset_method = amdgpu_asic_reset_method(adev); + if ((reset_method != AMD_RESET_METHOD_BACO) && + (reset_method != AMD_RESET_METHOD_MODE1)) + return -EINVAL; + + p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus), + adev->pdev->bus->number, 1); + if (!p) + return -ENODEV; + + expires = pm_runtime_autosuspend_expiration(&(p->dev)); + if (!expires) + /* + * If we cannot get the audio device autosuspend delay, + * a fixed 4S interval will be used. Considering 3S is + * the audio controller default autosuspend delay setting. + * 4S used here is guaranteed to cover that. + */ + expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL; + + while (!pm_runtime_status_suspended(&(p->dev))) { + if (!pm_runtime_suspend(&(p->dev))) + break; + + if (expires < ktime_get_mono_fast_ns()) { + dev_warn(adev->dev, "failed to suspend display audio\n"); + /* TODO: abort the succeeding gpu reset? */ + return -ETIMEDOUT; + } + } + + pm_runtime_disable(&(p->dev)); + + return 0; +} + /** * amdgpu_device_gpu_recover - reset the asic and recover scheduler * @@ -4067,36 +4249,32 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job) { struct list_head device_list, *device_list_handle = NULL; - bool need_full_reset, job_signaled; + bool need_full_reset = false; + bool job_signaled = false; struct amdgpu_hive_info *hive = NULL; struct amdgpu_device *tmp_adev = NULL; int i, r = 0; - bool in_ras_intr = amdgpu_ras_intr_triggered(); - bool use_baco = - (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? - true : false; + bool need_emergency_restart = false; + bool audio_suspended = false; + + /** + * Special case: RAS triggered and full reset isn't supported + */ + need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); /* * Flush RAM to disk so that after reboot * the user can read log and see why the system rebooted. */ - if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) { - + if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { DRM_WARN("Emergency reboot."); ksys_sync_helper(); emergency_restart(); } - need_full_reset = job_signaled = false; - INIT_LIST_HEAD(&device_list); - dev_info(adev->dev, "GPU %s begin!\n", - (in_ras_intr && !use_baco) ? "jobs stop":"reset"); - - cancel_delayed_work_sync(&adev->delayed_init_work); - - hive = amdgpu_get_xgmi_hive(adev, false); + need_emergency_restart ? "jobs stop":"reset"); /* * Here we trylock to avoid chain of resets executing from @@ -4105,39 +4283,25 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * We always reset all schedulers for device and all devices for XGMI * hive so that should take care of them too. */ - + hive = amdgpu_get_xgmi_hive(adev, true); if (hive && !mutex_trylock(&hive->reset_lock)) { DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", job ? job->base.id : -1, hive->hive_id); + mutex_unlock(&hive->hive_lock); return 0; } - /* Start with adev pre asic reset first for soft reset check.*/ - if (!amdgpu_device_lock_adev(adev, !hive)) { - DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", - job ? job->base.id : -1); - return 0; - } - - /* Block kfd: SRIOV would do it separately */ - if (!amdgpu_sriov_vf(adev)) - amdgpu_amdkfd_pre_reset(adev); - - /* Build list of devices to reset */ - if (adev->gmc.xgmi.num_physical_nodes > 1) { - if (!hive) { - /*unlock kfd: SRIOV would do it separately */ - if (!amdgpu_sriov_vf(adev)) - amdgpu_amdkfd_post_reset(adev); - amdgpu_device_unlock_adev(adev); + /* + * Build list of devices to reset. + * In case we are in XGMI hive mode, resort the device list + * to put adev in the 1st position. + */ + INIT_LIST_HEAD(&device_list); + if (adev->gmc.xgmi.num_physical_nodes > 1) { + if (!hive) return -ENODEV; - } - - /* - * In case we are in XGMI hive mode device reset is done for all the - * nodes in the hive to retrain all XGMI links and hence the reset - * sequence is executed in loop on all nodes. - */ + if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list)) + list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list); device_list_handle = &hive->device_list; } else { list_add_tail(&adev->gmc.xgmi.head, &device_list); @@ -4146,22 +4310,43 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - if (tmp_adev != adev) { - amdgpu_device_lock_adev(tmp_adev, false); - if (!amdgpu_sriov_vf(tmp_adev)) - amdgpu_amdkfd_pre_reset(tmp_adev); + if (!amdgpu_device_lock_adev(tmp_adev, !hive)) { + DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress", + job ? job->base.id : -1); + mutex_unlock(&hive->hive_lock); + return 0; } /* + * Try to put the audio codec into suspend state + * before gpu reset started. + * + * Due to the power domain of the graphics device + * is shared with AZ power domain. Without this, + * we may change the audio hardware from behind + * the audio driver's back. That will trigger + * some audio codec errors. + */ + if (!amdgpu_device_suspend_display_audio(tmp_adev)) + audio_suspended = true; + + amdgpu_ras_set_error_query_ready(tmp_adev, false); + + cancel_delayed_work_sync(&tmp_adev->delayed_init_work); + + if (!amdgpu_sriov_vf(tmp_adev)) + amdgpu_amdkfd_pre_reset(tmp_adev); + + /* * Mark these ASICs to be reseted as untracked first * And add them back after reset completed */ amdgpu_unregister_gpu_instance(tmp_adev); - amdgpu_fbdev_set_suspend(adev, 1); + amdgpu_fbdev_set_suspend(tmp_adev, 1); /* disable ras on ALL IPs */ - if (!(in_ras_intr && !use_baco) && + if (!need_emergency_restart && amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_ras_suspend(tmp_adev); @@ -4173,13 +4358,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, drm_sched_stop(&ring->sched, job ? &job->base : NULL); - if (in_ras_intr && !use_baco) + if (need_emergency_restart) amdgpu_job_stop_all_jobs_on_sched(&ring->sched); } } - - if (in_ras_intr && !use_baco) + if (need_emergency_restart) goto skip_sched_resume; /* @@ -4189,30 +4373,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * job->base holds a reference to parent fence */ if (job && job->base.s_fence->parent && - dma_fence_is_signaled(job->base.s_fence->parent)) + dma_fence_is_signaled(job->base.s_fence->parent)) { job_signaled = true; - - if (job_signaled) { dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); goto skip_hw_reset; } - - /* Guilty job will be freed after this*/ - r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset); - if (r) { - /*TODO Should we stop ?*/ - DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ", - r, adev->ddev->unique); - adev->asic_reset_res = r; - } - retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - - if (tmp_adev == adev) - continue; - r = amdgpu_device_pre_asic_reset(tmp_adev, NULL, &need_full_reset); @@ -4272,13 +4440,17 @@ skip_hw_reset: skip_sched_resume: list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { /*unlock kfd: SRIOV would do it separately */ - if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev)) + if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) amdgpu_amdkfd_post_reset(tmp_adev); + if (audio_suspended) + amdgpu_device_resume_display_audio(tmp_adev); amdgpu_device_unlock_adev(tmp_adev); } - if (hive) + if (hive) { mutex_unlock(&hive->reset_lock); + mutex_unlock(&hive->hive_lock); + } if (r) dev_info(adev->dev, "GPU reset end with ret = %d\n", r); |
