Skip to content

Commit

Permalink
Merge tag 'amd-drm-fixes-6.8-2024-01-18' of https://gitlab.freedeskto…
Browse files Browse the repository at this point in the history
…p.org/agd5f/linux into drm-next

amd-drm-fixes-6.8-2024-01-18:

amdgpu:
- DSC fixes
- DC resource pool fixes
- OTG fix
- DML2 fixes
- Aux fix
- GFX10 RLC firmware handling fix
- Revert a broken workaround for SMU 13.0.2
- DC writeback fix
- Enable gfxoff when ROCm apps are active on gfx11 with the proper FW version

amdkfd:
- Fix dma-buf exports using GEM handles

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240118223127.4904-1-alexander.deucher@amd.com
  • Loading branch information
airlied committed Jan 19, 2024
2 parents 205e18c + aa0901a commit 0a1123c
Show file tree
Hide file tree
Showing 67 changed files with 554 additions and 373 deletions.
3 changes: 1 addition & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,6 @@ extern int amdgpu_agp;

extern int amdgpu_wbrf;

extern int fw_bo_location;

#define AMDGPU_VM_MAX_NUM_CTX 4096
#define AMDGPU_SG_THRESHOLD (256*1024*1024)
#define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS 3000
Expand Down Expand Up @@ -1146,6 +1144,7 @@ struct amdgpu_device {
bool debug_vm;
bool debug_largebar;
bool debug_disable_soft_recovery;
bool debug_use_vram_fw_buf;
};

static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
Expand Down
11 changes: 6 additions & 5 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}

static const struct drm_client_funcs kfd_client_funcs = {
.unregister = drm_client_release,
};
void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
{
int i;
Expand All @@ -161,7 +164,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
.enable_mes = adev->enable_mes,
};

ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd", NULL);
ret = drm_client_init(&adev->ddev, &adev->kfd.client, "kfd", &kfd_client_funcs);
if (ret) {
dev_err(adev->dev, "Failed to init DRM client: %d\n", ret);
return;
Expand Down Expand Up @@ -695,10 +698,8 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle)
{
enum amd_powergating_state state = idle ? AMD_PG_STATE_GATE : AMD_PG_STATE_UNGATE;
/* Temporary workaround to fix issues observed in some
* compute applications when GFXOFF is enabled on GFX11.
*/
if (IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 11) {
if (IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 11 &&
((adev->mes.kiq_version & AMDGPU_MES_VERSION_MASK) <= 64)) {
pr_debug("GFXOFF is %s\n", idle ? "enabled" : "disabled");
amdgpu_gfx_off_ctrl(adev, idle);
} else if ((IP_VERSION_MAJ(amdgpu_ip_version(adev, GC_HWIP, 0)) == 9) &&
Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo);

int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
struct dma_fence **ef);
struct dma_fence __rcu **ef);
int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,
struct kfd_vm_fault_info *info);
int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd,
Expand Down
4 changes: 2 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -2802,7 +2802,7 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
put_task_struct(usertask);
}

static void replace_eviction_fence(struct dma_fence **ef,
static void replace_eviction_fence(struct dma_fence __rcu **ef,
struct dma_fence *new_ef)
{
struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true
Expand Down Expand Up @@ -2837,7 +2837,7 @@ static void replace_eviction_fence(struct dma_fence **ef,
* 7. Add fence to all PD and PT BOs.
* 8. Unreserve all BOs
*/
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu **ef)
{
struct amdkfd_process_info *process_info = info;
struct amdgpu_vm *peer_vm;
Expand Down
33 changes: 2 additions & 31 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -1544,6 +1544,7 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
return true;

fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
release_firmware(adev->pm.fw);
if (fw_ver < 0x00160e00)
return true;
}
Expand Down Expand Up @@ -5245,7 +5246,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset, skip_hw_reset, vram_lost = false;
int r = 0;
bool gpu_reset_for_dev_remove = 0;

/* Try reset handler method first */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
Expand All @@ -5265,10 +5265,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);

gpu_reset_for_dev_remove =
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

/*
* ASIC reset has to be done on all XGMI hive nodes ASAP
* to allow proper links negotiation in FW (within 1 sec)
Expand Down Expand Up @@ -5311,18 +5307,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
amdgpu_ras_intr_cleared();
}

/* Since the mode1 reset affects base ip blocks, the
* phase1 ip blocks need to be resumed. Otherwise there
* will be a BIOS signature error and the psp bootloader
* can't load kdb on the next amdgpu install.
*/
if (gpu_reset_for_dev_remove) {
list_for_each_entry(tmp_adev, device_list_handle, reset_list)
amdgpu_device_ip_resume_phase1(tmp_adev);

goto end;
}

list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (need_full_reset) {
/* post card */
Expand Down Expand Up @@ -5559,11 +5543,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
int i, r = 0;
bool need_emergency_restart = false;
bool audio_suspended = false;
bool gpu_reset_for_dev_remove = false;

gpu_reset_for_dev_remove =
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);

/*
* Special case: RAS triggered and full reset isn't supported
Expand Down Expand Up @@ -5601,7 +5580,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
list_add_tail(&tmp_adev->reset_list, &device_list);
if (gpu_reset_for_dev_remove && adev->shutdown)
if (adev->shutdown)
tmp_adev->shutdown = true;
}
if (!list_is_first(&adev->reset_list, &device_list))
Expand Down Expand Up @@ -5686,10 +5665,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

retry: /* Rest of adevs pre asic reset from XGMI hive. */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (gpu_reset_for_dev_remove) {
/* Workaroud for ASICs need to disable SMC first */
amdgpu_device_smu_fini_early(tmp_adev);
}
r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
/*TODO Should we stop ?*/
if (r) {
Expand Down Expand Up @@ -5721,9 +5696,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
r = amdgpu_do_asic_reset(device_list_handle, reset_context);
if (r && r == -EAGAIN)
goto retry;

if (!r && gpu_reset_for_dev_remove)
goto recover_end;
}

skip_hw_reset:
Expand Down Expand Up @@ -5779,7 +5751,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
amdgpu_ras_set_error_query_ready(tmp_adev, true);
}

recover_end:
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
Expand Down
2 changes: 0 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
Original file line number Diff line number Diff line change
Expand Up @@ -1963,8 +1963,6 @@ static int amdgpu_discovery_set_gc_ip_blocks(struct amdgpu_device *adev)
amdgpu_device_ip_block_add(adev, &gfx_v9_0_ip_block);
break;
case IP_VERSION(9, 4, 3):
if (!amdgpu_exp_hw_support)
return -EINVAL;
amdgpu_device_ip_block_add(adev, &gfx_v9_4_3_ip_block);
break;
case IP_VERSION(10, 1, 10):
Expand Down
47 changes: 8 additions & 39 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ enum AMDGPU_DEBUG_MASK {
AMDGPU_DEBUG_VM = BIT(0),
AMDGPU_DEBUG_LARGEBAR = BIT(1),
AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3),
};

unsigned int amdgpu_vram_limit = UINT_MAX;
Expand Down Expand Up @@ -210,7 +211,6 @@ int amdgpu_seamless = -1; /* auto */
uint amdgpu_debug_mask;
int amdgpu_agp = -1; /* auto */
int amdgpu_wbrf = -1;
int fw_bo_location = -1;

static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);

Expand Down Expand Up @@ -990,10 +990,6 @@ MODULE_PARM_DESC(wbrf,
"Enable Wifi RFI interference mitigation (0 = disabled, 1 = enabled, -1 = auto(default)");
module_param_named(wbrf, amdgpu_wbrf, int, 0444);

MODULE_PARM_DESC(fw_bo_location,
"location to put firmware bo for frontdoor loading (-1 = auto (default), 0 = on ram, 1 = on vram");
module_param(fw_bo_location, int, 0644);

/* These devices are not supported by amdgpu.
* They are supported by the mach64, r128, radeon drivers
*/
Expand Down Expand Up @@ -2122,6 +2118,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
pr_info("debug: soft reset for GPU recovery disabled\n");
adev->debug_disable_soft_recovery = true;
}

if (amdgpu_debug_mask & AMDGPU_DEBUG_USE_VRAM_FW_BUF) {
pr_info("debug: place fw in vram for frontdoor loading\n");
adev->debug_use_vram_fw_buf = true;
}
}

static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)
Expand Down Expand Up @@ -2233,6 +2234,8 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,

pci_set_drvdata(pdev, ddev);

amdgpu_init_debug_options(adev);

ret = amdgpu_driver_load_kms(adev, flags);
if (ret)
goto err_pci;
Expand Down Expand Up @@ -2313,8 +2316,6 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
amdgpu_get_secondary_funcs(adev);
}

amdgpu_init_debug_options(adev);

return 0;

err_pci:
Expand All @@ -2336,38 +2337,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)
pm_runtime_forbid(dev->dev);
}

if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 2) &&
!amdgpu_sriov_vf(adev)) {
bool need_to_reset_gpu = false;

if (adev->gmc.xgmi.num_physical_nodes > 1) {
struct amdgpu_hive_info *hive;

hive = amdgpu_get_xgmi_hive(adev);
if (hive->device_remove_count == 0)
need_to_reset_gpu = true;
hive->device_remove_count++;
amdgpu_put_xgmi_hive(hive);
} else {
need_to_reset_gpu = true;
}

/* Workaround for ASICs need to reset SMU.
* Called only when the first device is removed.
*/
if (need_to_reset_gpu) {
struct amdgpu_reset_context reset_context;

adev->shutdown = true;
memset(&reset_context, 0, sizeof(reset_context));
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}
}

amdgpu_driver_unload_kms(dev);

/*
Expand Down
21 changes: 14 additions & 7 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1045,21 +1045,28 @@ int amdgpu_gmc_vram_checking(struct amdgpu_device *adev)
* seconds, so here, we just pick up three parts for emulation.
*/
ret = memcmp(vram_ptr, cptr, 10);
if (ret)
return ret;
if (ret) {
ret = -EIO;
goto release_buffer;
}

ret = memcmp(vram_ptr + (size / 2), cptr, 10);
if (ret)
return ret;
if (ret) {
ret = -EIO;
goto release_buffer;
}

ret = memcmp(vram_ptr + size - 10, cptr, 10);
if (ret)
return ret;
if (ret) {
ret = -EIO;
goto release_buffer;
}

release_buffer:
amdgpu_bo_free_kernel(&vram_bo, &vram_gpu,
&vram_ptr);

return 0;
return ret;
}

static ssize_t current_memory_partition_show(
Expand Down
7 changes: 6 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
Original file line number Diff line number Diff line change
Expand Up @@ -1105,7 +1105,12 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
if (amdgpu_dpm_read_sensor(adev,
AMDGPU_PP_SENSOR_GPU_AVG_POWER,
(void *)&ui32, &ui32_size)) {
return -EINVAL;
/* fall back to input power for backwards compat */
if (amdgpu_dpm_read_sensor(adev,
AMDGPU_PP_SENSOR_GPU_INPUT_POWER,
(void *)&ui32, &ui32_size)) {
return -EINVAL;
}
}
ui32 >>= 8;
break;
Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,7 @@ static int psp_sw_init(void *handle)
}

ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG,
(amdgpu_sriov_vf(adev) || fw_bo_location == 1) ?
(amdgpu_sriov_vf(adev) || adev->debug_use_vram_fw_buf) ?
AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
&psp->fw_pri_bo,
&psp->fw_pri_mc_addr,
Expand Down
Loading

0 comments on commit 0a1123c

Please sign in to comment.