From 30f0b804b8a5324346fca9de1ffd4671094c5f66 Mon Sep 17 00:00:00 2001 From: Harish Kasiviswanathan Date: Mon, 26 Feb 2024 14:10:12 -0500 Subject: [PATCH] libhsakmt: Associate correct GPU with queue memory Pass the correct gpu_id to KFD for system memory that is allocated for the queue and eop buffer Change-Id: I43bb6333560a7d9d38293c191303161ab1443b5d Signed-off-by: Harish Kasiviswanathan --- src/events.c | 2 +- src/libhsakmt.h | 1 + src/queues.c | 53 ++++++++++++++++++++++++++++++------------------- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/src/events.c b/src/events.c index 60e9f6d..07356db 100644 --- a/src/events.c +++ b/src/events.c @@ -76,7 +76,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateEvent(HsaEventDescriptor *EventDesc, if (is_dgpu && !events_page) { events_page = allocate_exec_aligned_memory_gpu( - KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0, true, false, true); + KFD_SIGNAL_EVENT_LIMIT * 8, PAGE_SIZE, 0, 0, true, false, true); if (!events_page) { pthread_mutex_unlock(&hsakmt_mutex); return HSAKMT_STATUS_ERROR; diff --git a/src/libhsakmt.h b/src/libhsakmt.h index 62f65d0..0f19434 100644 --- a/src/libhsakmt.h +++ b/src/libhsakmt.h @@ -195,6 +195,7 @@ bool topology_is_svm_needed(HSA_ENGINE_ID EngineId); HSAuint32 PageSizeFromFlags(unsigned int pageSizeFlags); void* allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, + uint32_t gpu_id, uint32_t NodeId, bool NonPaged, bool DeviceLocal, bool Uncached); void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align); diff --git a/src/queues.c b/src/queues.c index f9e5612..e6e0963 100644 --- a/src/queues.c +++ b/src/queues.c @@ -310,15 +310,14 @@ static bool update_ctx_save_restore_size(uint32_t nodeid, struct queue *q) return false; } -void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, +void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, uint32_t gpu_id, uint32_t NodeId, bool nonPaged, bool DeviceLocal, bool Uncached) { - void *mem; + void *mem = NULL; HSAuint64 gpu_va; HsaMemFlags flags; - HSAKMT_STATUS ret; HSAuint32 cpu_id = 0; flags.Value = 0; @@ -329,22 +328,32 @@ void *allocate_exec_aligned_memory_gpu(uint32_t size, uint32_t align, flags.ui32.CoarseGrain = DeviceLocal; flags.ui32.Uncached = Uncached; - /* Get the closest cpu_id to GPU NodeId for system memory allocation - * nonPaged=1 system memory allocation uses GTT path - */ - if (!DeviceLocal && !nonPaged) { - cpu_id = get_direct_link_cpu(NodeId); - if (cpu_id == INVALID_NODEID) { - flags.ui32.NoNUMABind = 1; - cpu_id = 0; + size = ALIGN_UP(size, align); + + if (DeviceLocal && !zfb_support) + mem = fmm_allocate_device(gpu_id, NodeId, mem, size, flags); + else { + /* VRAM under ZFB mode should be supported here without any + * additional code + */ + /* Get the closest cpu_id to GPU NodeId for system memory allocation + * nonPaged=0 system memory allocation uses GTT path + */ + if (!nonPaged) { + cpu_id = get_direct_link_cpu(NodeId); + if (cpu_id == INVALID_NODEID) { + flags.ui32.NoNUMABind = 1; + cpu_id = 0; + } } + mem = fmm_allocate_host(gpu_id, cpu_id, mem, size, flags); } - size = ALIGN_UP(size, align); - - ret = hsaKmtAllocMemory(DeviceLocal ? NodeId : cpu_id, size, flags, &mem); - if (ret != HSAKMT_STATUS_SUCCESS) + if (!mem) { + pr_err("Alloc %s memory failed size %d\n", + DeviceLocal ? "VRAM" : "GTT", size); return NULL; + } if (NodeId != 0) { uint32_t nodes_array[1] = {NodeId}; @@ -376,13 +385,14 @@ void free_exec_aligned_memory_gpu(void *addr, uint32_t size, uint32_t align) */ static void *allocate_exec_aligned_memory(uint32_t size, bool use_ats, + uint32_t gpu_id, uint32_t NodeId, bool nonPaged, bool DeviceLocal, bool Uncached) { if (!use_ats) - return allocate_exec_aligned_memory_gpu(size, PAGE_SIZE, NodeId, + return allocate_exec_aligned_memory_gpu(size, PAGE_SIZE, gpu_id, NodeId, nonPaged, DeviceLocal, Uncached); return allocate_exec_aligned_memory_cpu(size); @@ -469,6 +479,7 @@ static inline void fill_cwsr_header(struct queue *q, void *addr, static int handle_concrete_asic(struct queue *q, struct kfd_ioctl_create_queue_args *args, + uint32_t gpu_id, uint32_t NodeId, HsaEvent *Event, volatile HSAint64 *ErrPayload) @@ -480,8 +491,9 @@ static int handle_concrete_asic(struct queue *q, return HSAKMT_STATUS_SUCCESS; if (q->eop_buffer_size > 0) { + pr_info("Allocating VRAM for EOP\n"); q->eop_buffer = allocate_exec_aligned_memory(q->eop_buffer_size, - q->use_ats, + q->use_ats, gpu_id, NodeId, true, true, /* Unused for VRAM */false); if (!q->eop_buffer) return HSAKMT_STATUS_NO_MEMORY; @@ -516,6 +528,7 @@ static int handle_concrete_asic(struct queue *q, void *addr; HSAKMT_STATUS r = HSAKMT_STATUS_ERROR; + pr_info("Allocating GTT for CWSR\n"); addr = mmap_allocate_aligned(PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, size, GPU_HUGE_PAGE_SIZE, 0, @@ -548,7 +561,7 @@ static int handle_concrete_asic(struct queue *q, if (!q->unified_ctx_save_restore) { q->ctx_save_restore = allocate_exec_aligned_memory( q->total_mem_alloc_size, - q->use_ats, NodeId, + q->use_ats, gpu_id, NodeId, false, false, false); if (!q->ctx_save_restore) @@ -597,7 +610,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId, return result; struct queue *q = allocate_exec_aligned_memory(sizeof(*q), - false, NodeId, true, false, true); + false, gpu_id, NodeId, true, false, true); if (!q) return HSAKMT_STATUS_NO_MEMORY; @@ -652,7 +665,7 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtCreateQueue(HSAuint32 NodeId, QueueResource->QueueWptrValue = (uintptr_t)&q->wptr; } - err = handle_concrete_asic(q, &args, NodeId, Event, QueueResource->ErrorReason); + err = handle_concrete_asic(q, &args, gpu_id, NodeId, Event, QueueResource->ErrorReason); if (err != HSAKMT_STATUS_SUCCESS) { free_queue(q); return err;