summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPhilip Yang <Philip.Yang@amd.com>2024-04-30 13:51:51 -0400
committerAlex Deucher <alexander.deucher@amd.com>2024-05-13 15:44:02 -0400
commit9095e5544061b16d1b331aca3f32c76cbd656d72 (patch)
tree1fc184dd13bc267638a4954ef3292ba7c144d77d
parent10f624ef239bd136cdcc5bbc626157a57b938a31 (diff)
drm/amdkfd: Remove arbitrary timeout for hmm_range_fault
On systems with khugepaged enabled and use cases with THP buffers, hmm_range_fault may take more than 15 seconds to return -EBUSY. The arbitrary timeout value is not accurate and causes memory allocation failures. Remove the arbitrary timeout value and return -EAGAIN to the application if hmm_range_fault returns -EBUSY; the userspace libdrm and Thunk will then call the ioctl again. Change the -EAGAIN message to a debug message, as this is not an error. Signed-off-by: Philip Yang <Philip.Yang@amd.com> Reviewed-by: Felix Kuehling <felix.kuehling@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c12
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_svm.c5
3 files changed, 8 insertions, 14 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 75d49390cae0..3314821e4cf3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1088,7 +1088,10 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range);
if (ret) {
- pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
+ if (ret == -EAGAIN)
+ pr_debug("Failed to get user pages, try again\n");
+ else
+ pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
goto unregister_out;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
index 431ec72655ec..e36fede7f74c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
@@ -202,20 +202,12 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier,
pr_debug("hmm range: start = 0x%lx, end = 0x%lx",
hmm_range->start, hmm_range->end);
- /* Assuming 64MB takes maximum 1 second to fault page address */
- timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL);
- timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
- timeout = jiffies + msecs_to_jiffies(timeout);
+ timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
retry:
hmm_range->notifier_seq = mmu_interval_read_begin(notifier);
r = hmm_range_fault(hmm_range);
if (unlikely(r)) {
- schedule();
- /*
- * FIXME: This timeout should encompass the retry from
- * mmu_interval_read_retry() as well.
- */
if (r == -EBUSY && !time_after(jiffies, timeout))
goto retry;
goto out_free_pfns;
@@ -247,6 +239,8 @@ out_free_pfns:
out_free_range:
kfree(hmm_range);
+ if (r == -EBUSY)
+ r = -EAGAIN;
return r;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 4885d1b2cc29..4e5851bdf832 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1690,11 +1690,8 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
readonly, owner, NULL,
&hmm_range);
WRITE_ONCE(p->svms.faulting_task, NULL);
- if (r) {
+ if (r)
pr_debug("failed %d to get svm range pages\n", r);
- if (r == -EBUSY)
- r = -EAGAIN;
- }
} else {
r = -EFAULT;
}