drm/amdgpu: block ring buffer access during GPU recovery
When the GPU is in reset, its status isn't stable and the ring buffer
also needs to be reset when resuming. The driver should therefore
protect the GPU recovery thread from ring buffer accesses by other
threads; otherwise the GPU will randomly hang during recovery.

v2: correct indent

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Dennis Li authored and alexdeucher committed Sep 3, 2020
1 parent f6eb433 commit 8120280
Showing 3 changed files with 23 additions and 10 deletions.
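
For context on the locking scheme: adev->reset_sem is an rw_semaphore. The recovery thread is expected to hold it for write across the whole reset (that write-side locking lives in the recovery path, not in this diff), while everyone else takes the read side with down_read_trylock() and, on failure, avoids the hardware paths that a reset invalidates. Below is a minimal self-contained sketch of the pattern, with generic hypothetical names (dev_ctx, hw_reset(), hw_access()):

#include <linux/errno.h>
#include <linux/rwsem.h>

struct dev_ctx {
	struct rw_semaphore reset_sem;
};

/* Hypothetical hardware ops, stubbed so the sketch is self-contained. */
static void hw_reset(struct dev_ctx *ctx) { }
static int hw_access(struct dev_ctx *ctx) { return 0; }

static void ctx_init(struct dev_ctx *ctx)
{
	init_rwsem(&ctx->reset_sem);
}

/* Recovery thread: hold the write side for the whole reset so no
 * reader can touch the hardware while its state is unstable.
 */
static void recover(struct dev_ctx *ctx)
{
	down_write(&ctx->reset_sem);	/* waits for in-flight readers */
	hw_reset(ctx);
	up_write(&ctx->reset_sem);
}

/* Every other thread: try the read side; if a reset is in flight,
 * report -EBUSY here (the patch instead falls back to direct MMIO).
 */
static int access_hw(struct dev_ctx *ctx)
{
	int ret;

	if (!down_read_trylock(&ctx->reset_sem))
		return -EBUSY;

	ret = hw_access(ctx);
	up_read(&ctx->reset_sem);
	return ret;
}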
17 changes: 13 additions & 4 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -319,8 +319,12 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
 {
 	uint32_t ret;
 
-	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
-		return amdgpu_kiq_rreg(adev, reg);
+	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
+	    down_read_trylock(&adev->reset_sem)) {
+		ret = amdgpu_kiq_rreg(adev, reg);
+		up_read(&adev->reset_sem);
+		return ret;
+	}
 
 	if ((reg * 4) < adev->rmmio_size)
 		ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
@@ -332,6 +336,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
 		ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
 		spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 	}
+
 	trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
 	return ret;
 }
@@ -409,8 +414,12 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev,
 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
 		    uint32_t acc_flags)
 {
-	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
-		return amdgpu_kiq_wreg(adev, reg, v);
+	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
+	    down_read_trylock(&adev->reset_sem)) {
+		amdgpu_kiq_wreg(adev, reg, v);
+		up_read(&adev->reset_sem);
+		return;
+	}
 
 	amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
 }
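
One detail worth calling out in these two hunks: down_read_trylock() never sleeps; it returns nonzero when the read lock was acquired and 0 otherwise, which is why it can sit directly in the if condition. When a reset holds the write side, the accessors skip the KIQ path and fall through to the plain MMIO register access (amdgpu_mm_wreg_mmio() in the write case) rather than blocking a caller that may not be allowed to sleep. A tiny sketch of that control flow, with kiq_path()/mmio_path() as hypothetical stand-ins:

#include <linux/rwsem.h>
#include <linux/types.h>

static DECLARE_RWSEM(reset_sem);	/* statically initialized rwsem */

/* Hypothetical stand-ins for the two register paths. */
static void kiq_path(u32 reg, u32 val) { }
static void mmio_path(u32 reg, u32 val) { }

static void write_reg(u32 reg, u32 val)
{
	/* Nonzero return means the read lock is held; the call itself
	 * never sleeps.
	 */
	if (down_read_trylock(&reset_sem)) {
		kiq_path(reg, val);
		up_read(&reset_sem);
		return;
	}
	mmio_path(reg, val);	/* reset in flight: direct MMIO fallback */
}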
5 changes: 3 additions & 2 deletions drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -287,8 +287,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 	 */
 	if (adev->gfx.kiq.ring.sched.ready &&
 	    (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
-	    !amdgpu_in_reset(adev)) {
-
+	    down_read_trylock(&adev->reset_sem)) {
 		struct amdgpu_vmhub *hub = &adev->vmhub[vmhub];
 		const unsigned eng = 17;
 		u32 inv_req = hub->vmhub_funcs->get_invalidate_req(vmid, flush_type);
@@ -297,6 +296,8 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 
 		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
 				1 << vmid);
+
+		up_read(&adev->reset_sem);
 		return;
 	}
11 changes: 7 additions & 4 deletions drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -503,13 +503,14 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
 	 * as GFXOFF under bare metal
 	 */
 	if (adev->gfx.kiq.ring.sched.ready &&
-			(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
-			!amdgpu_in_reset(adev)) {
+	    (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) &&
+	    down_read_trylock(&adev->reset_sem)) {
 		uint32_t req = hub->vm_inv_eng0_req + hub->eng_distance * eng;
 		uint32_t ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng;
 
 		amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
 						   1 << vmid);
+		up_read(&adev->reset_sem);
 		return;
 	}

@@ -602,7 +603,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 	if (amdgpu_in_reset(adev))
 		return -EIO;
 
-	if (ring->sched.ready) {
+	if (ring->sched.ready && down_read_trylock(&adev->reset_sem)) {
 		/* Vega20+XGMI caches PTEs in TC and TLB. Add a
 		 * heavy-weight TLB flush (type 2), which flushes
 		 * both. Due to a race condition with concurrent
@@ -629,6 +630,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 		if (r) {
 			amdgpu_ring_undo(ring);
 			spin_unlock(&adev->gfx.kiq.ring_lock);
+			up_read(&adev->reset_sem);
 			return -ETIME;
 		}

@@ -637,9 +639,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
 		r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
 		if (r < 1) {
 			dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
+			up_read(&adev->reset_sem);
 			return -ETIME;
 		}
 
+		up_read(&adev->reset_sem);
 		return 0;
 	}
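
The pasid-flush change above is the trickiest of the set: after a successful trylock there are three exit paths (ring error, fence timeout, success), and each must drop reset_sem before returning. The same shape written with the kernel's usual goto-cleanup idiom keeps the release in one place; this is purely an illustrative alternative under hypothetical names (submit_flush, wait_flush_fence), not what the patch does:

#include <linux/errno.h>
#include <linux/rwsem.h>

static DECLARE_RWSEM(reset_sem);

/* Hypothetical stand-ins for the KIQ ring submission and fence wait. */
static int submit_flush(void) { return 0; }
static int wait_flush_fence(void) { return 0; }

static int flush_tlb_pasid(void)
{
	int r = 0;

	if (!down_read_trylock(&reset_sem))
		return -EBUSY;	/* the real code falls through to the non-KIQ flush path */

	if (submit_flush()) {
		r = -ETIME;
		goto out_unlock;
	}
	if (wait_flush_fence()) {
		r = -ETIME;
		goto out_unlock;
	}

out_unlock:
	up_read(&reset_sem);	/* single release point for all three exits */
	return r;
}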
