diff --git a/README.md b/README.md index 231ecb7e..13a8c4b0 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ This repository is for the AMD XDNA™️ Driver (amdxdna.ko) for Linux®️ and - [System Requirements](#system-requirements) - [Linux compilation and installation](#linux-compilation-and-installation) - [Clone](#clone) + - [Build](#build) - [Test](#test) - [Q&A](#qa) @@ -88,7 +89,7 @@ cd /build cd xrt/build ./build.sh -noert -noalveo # To adapt according to your OS & version -sudo apt reinstall ./Release/xrt_202410.2.17.0_23.10-amd64-xrt.deb ./Release/xrt_202410.2.17.0_23.10-amd64-xbflash2.deb +sudo apt reinstall ./Release/xrt_202410.2.17.0_23.10-amd64-xrt.deb cd ../../build # Start XDNA driver release build diff --git a/WHENCE b/WHENCE index 46fe77ea..800b3567 100644 --- a/WHENCE +++ b/WHENCE @@ -11,5 +11,6 @@ File: tools/bins/1502_00/validate.xclbin tools/bins/17f0_10/validate.xclbin tools/bins/17f0_11/validate.xclbin + tools/bins/17f0_20/validate.xclbin Licence: Redistributable. See LICENSE.amdnpu for details. diff --git a/src/driver/CMakeLists.txt b/src/driver/CMakeLists.txt index 1687d274..ad33fb76 100644 --- a/src/driver/CMakeLists.txt +++ b/src/driver/CMakeLists.txt @@ -92,6 +92,8 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/dkms.conf set(amdxdna_drv_tools ${CMAKE_CURRENT_SOURCE_DIR}/tools/dkms_driver.sh + ${CMAKE_CURRENT_SOURCE_DIR}/tools/npu_perf_trace.sh + ${CMAKE_CURRENT_SOURCE_DIR}/tools/npu_perf_analyze.sh ) install(FILES ${amdxdna_drv_tools} PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE diff --git a/src/driver/amdxdna/aie2_ctx.c b/src/driver/amdxdna/aie2_ctx.c index c630f048..1cab054c 100644 --- a/src/driver/amdxdna/aie2_ctx.c +++ b/src/driver/amdxdna/aie2_ctx.c @@ -47,12 +47,6 @@ aie2_hwctx_get_job(struct amdxdna_hwctx *hwctx, u64 seq) { int idx; - /* Special sequence number for oldest fence if exist */ - if (seq == AMDXDNA_INVALID_CMD_HANDLE) { - idx = get_job_idx(hwctx->submitted); - goto out; - } - if (seq >= hwctx->submitted) return ERR_PTR(-EINVAL); @@ -60,8 +54,6 @@ aie2_hwctx_get_job(struct amdxdna_hwctx *hwctx, u64 seq) return NULL; idx = get_job_idx(seq); - -out: return hwctx->priv->pending[idx]; } @@ -230,8 +222,8 @@ aie2_sched_notify(struct amdxdna_sched_job *job) struct dma_fence *fence = job->fence; job->hwctx->completed++; + trace_xdna_job(&job->base, job->hwctx->name, "signaling fence", job->seq); dma_fence_signal(fence); - trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", job->seq); dma_fence_put(fence); mmput(job->mm); amdxdna_job_put(job); @@ -257,7 +249,7 @@ aie2_sched_resp_handler(void *handle, const u32 *data, size_t size) } status = *data; - XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status); + XDNA_DBG(job->hwctx->client->xdna, "Response status 0x%x", status); if (status == AIE2_STATUS_SUCCESS) amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED); else @@ -284,7 +276,7 @@ aie2_sched_nocmd_resp_handler(void *handle, const u32 *data, size_t size) } status = *data; - XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status); + XDNA_DBG(job->hwctx->client->xdna, "Response status 0x%x", status); out: aie2_sched_notify(job); @@ -540,6 +532,7 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) struct drm_gpu_scheduler *sched; struct amdxdna_hwctx_priv *priv; struct amdxdna_gem_obj *heap; + unsigned int wq_flags; int i, ret; priv = kzalloc(sizeof(*hwctx->priv), GFP_KERNEL); @@ -587,12 +580,21 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) sched = &priv->sched; 
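	/*
	 * Note on the submit workqueue introduced just below: it is created
	 * with __WQ_ORDERED and max_active == 1, so jobs handed to the DRM
	 * scheduler are processed strictly in submission order.  WQ_UNBOUND
	 * is added only outside turbo mode; in turbo mode the queue stays
	 * bound, presumably to keep submission work on the local CPU.
	 */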
mutex_init(&priv->io_lock); - ret = drm_sched_init(sched, &sched_ops, NULL, DRM_SCHED_PRIORITY_COUNT, + + wq_flags = __WQ_ORDERED; + if (!aie2_pm_is_turbo(xdna->dev_handle)) + wq_flags |= WQ_UNBOUND; + priv->submit_wq = alloc_workqueue(hwctx->name, wq_flags, 1); + if (!priv->submit_wq) { + XDNA_ERR(xdna, "Failed to alloc submit wq"); + goto free_cmd_bufs; + } + ret = drm_sched_init(sched, &sched_ops, priv->submit_wq, DRM_SCHED_PRIORITY_COUNT, HWCTX_MAX_CMDS, 0, MAX_SCHEDULE_TIMEOUT, NULL, NULL, hwctx->name, xdna->ddev.dev); if (ret) { XDNA_ERR(xdna, "Failed to init DRM scheduler. ret %d", ret); - goto free_cmd_bufs; + goto free_wq; } ret = drm_sched_entity_init(&priv->entity, DRM_SCHED_PRIORITY_NORMAL, @@ -645,6 +647,8 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) drm_sched_entity_destroy(&priv->entity); free_sched: drm_sched_fini(&priv->sched); +free_wq: + destroy_workqueue(priv->submit_wq); free_cmd_bufs: for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) { if (!priv->cmd_buf[i]) @@ -681,6 +685,7 @@ void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx) aie2_hwctx_wait_for_idle(hwctx); drm_sched_entity_destroy(&hwctx->priv->entity); drm_sched_fini(&hwctx->priv->sched); + destroy_workqueue(hwctx->priv->submit_wq); for (idx = 0; idx < HWCTX_MAX_CMDS; idx++) { job = hwctx->priv->pending[idx]; @@ -928,7 +933,7 @@ int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, job->out_fence = dma_fence_get(&job->base.s_fence->finished); retry: - ret = drm_gem_lock_reservations(job->bos, job->bo_cnt, &acquire_ctx); + ret = amdxdna_lock_objects(job, &acquire_ctx); if (ret) { XDNA_WARN(xdna, "Failed to reverve fence, ret %d", ret); goto put_fence; @@ -937,7 +942,7 @@ int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, for (i = 0; i < job->bo_cnt; i++) { abo = to_xdna_obj(job->bos[i]); if (abo->mem.map_invalid) { - drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx); + amdxdna_unlock_objects(job, &acquire_ctx); if (!timeout) { timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); @@ -955,19 +960,27 @@ int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, ret = dma_resv_reserve_fences(job->bos[i]->resv, 1); if (ret) { XDNA_WARN(xdna, "Failed to reserve fences %d", ret); - drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx); + amdxdna_unlock_objects(job, &acquire_ctx); goto put_fence; } } for (i = 0; i < job->bo_cnt; i++) dma_resv_add_fence(job->bos[i]->resv, job->out_fence, DMA_RESV_USAGE_WRITE); - drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx); + amdxdna_unlock_objects(job, &acquire_ctx); +again: mutex_lock(&hwctx->priv->io_lock); ret = aie2_hwctx_add_job(hwctx, job); if (ret) { mutex_unlock(&hwctx->priv->io_lock); + + if (ret == -EAGAIN) { + // Waiting for the first pending cmd to complete before trying again. 
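			// With a ring of HWCTX_MAX_CMDS pending slots, the
			// oldest in-flight command carries sequence number
			// hwctx->submitted - HWCTX_MAX_CMDS; once that command
			// signals, a slot frees up and the add is retried.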
+ int res = aie2_cmd_wait(hwctx, hwctx->submitted - HWCTX_MAX_CMDS, 0); + if (!res) + goto again; + } goto signal_fence; } diff --git a/src/driver/amdxdna/aie2_debugfs.c b/src/driver/amdxdna/aie2_debugfs.c index ea5105f7..4023e6b2 100644 --- a/src/driver/amdxdna/aie2_debugfs.c +++ b/src/driver/amdxdna/aie2_debugfs.c @@ -77,71 +77,6 @@ static int aie2_dbgfs_entry_release(struct inode *inode, struct file *file) #define file_to_ndev_rw(file) \ (((struct seq_file *)(file)->private_data)->private) -static ssize_t -aie2_dbgfs_clock_write(struct amdxdna_dev_hdl *ndev, struct clock *clock, - const char __user *ptr, size_t len, loff_t *off) -{ - u32 val; - int ret; - - ret = kstrtouint_from_user(ptr, len, 10, &val); - if (ret) { - XDNA_ERR(ndev->xdna, "Invalid input value: %d", val); - return ret; - } - - clock->dbg_freq_mhz = val; - if (!clock->dbg_freq_mhz) { - XDNA_INFO(ndev->xdna, "Auto %s", clock->name); - return 0; - } - - ret = aie2_smu_set_clock_freq(ndev, clock, val); - if (ret) { - clock->dbg_freq_mhz = 0; - XDNA_ERR(ndev->xdna, "Set %s ret %d, use auto clock", clock->name, ret); - return ret; - } - - return len; -} - -static ssize_t aie2_dbgfs_mpnpu_clock_write(struct file *file, const char __user *ptr, - size_t len, loff_t *off) -{ - struct amdxdna_dev_hdl *ndev = file_to_ndev_rw(file); - - return aie2_dbgfs_clock_write(ndev, &ndev->smu.mp_npu_clock, ptr, len, off); -} - -static int aie2_dbgfs_mpnpu_clock_show(struct seq_file *m, void *unused) -{ - struct amdxdna_dev_hdl *ndev = m->private; - - seq_printf(m, "%d\n", aie2_smu_get_mpnpu_clock_freq(ndev)); - return 0; -} - -AIE2_DBGFS_FOPS(npuclock, aie2_dbgfs_mpnpu_clock_show, aie2_dbgfs_mpnpu_clock_write); - -static ssize_t aie2_dbgfs_hclock_write(struct file *file, const char __user *ptr, - size_t len, loff_t *off) -{ - struct amdxdna_dev_hdl *ndev = file_to_ndev_rw(file); - - return aie2_dbgfs_clock_write(ndev, &ndev->smu.h_clock, ptr, len, off); -} - -static int aie2_dbgfs_hclock_show(struct seq_file *m, void *unused) -{ - struct amdxdna_dev_hdl *ndev = m->private; - - seq_printf(m, "%d\n", aie2_smu_get_hclock_freq(ndev)); - return 0; -} - -AIE2_DBGFS_FOPS(hclock, aie2_dbgfs_hclock_show, aie2_dbgfs_hclock_write); - static ssize_t aie2_pasid_write(struct file *file, const char __user *ptr, size_t len, loff_t *off) { @@ -291,7 +226,7 @@ static ssize_t aie2_dpm_level_set(struct file *file, const char __user *ptr, return ret; } - ret = aie2_smu_set_dpm_level(ndev, val, true); + ret = aie2_smu_set_dpm_level(ndev, val); if (ret) { XDNA_ERR(ndev->xdna, "Setting dpm_level:%d failed, ret: %d", val, ret); return ret; @@ -302,8 +237,24 @@ static ssize_t aie2_dpm_level_set(struct file *file, const char __user *ptr, static int aie2_dpm_level_get(struct seq_file *m, void *unused) { struct amdxdna_dev_hdl *ndev = m->private; + const struct dpm_clk *dpm_table; + u32 num_dpm_levels; + int dpm_level; + int i; - seq_printf(m, "%d\n", aie2_smu_get_dpm_level(ndev)); + dpm_table = SMU_DPM_TABLE_ENTRY(ndev, 0); + dpm_level = aie2_smu_get_dpm_level(ndev); + num_dpm_levels = SMU_DPM_MAX(ndev); + for (i = 0; i <= num_dpm_levels; i++) { + u32 npuclk = dpm_table[i].npuclk; + u32 hclk = dpm_table[i].hclk; + + if (dpm_level == i) + seq_printf(m, " [%d,%d] ", npuclk, hclk); + else + seq_printf(m, " %d,%d ", npuclk, hclk); + } + seq_puts(m, "\n"); return 0; } @@ -511,8 +462,6 @@ seq_printf(m, "%ld:%s\n", _name, #_name) drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_WAIT_CMD); drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_GET_INFO); 
drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_SET_STATE); - drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL); - drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_SUBMIT_WAIT); drm_ioctl_id_seq_print(DRM_IOCTL_GEM_CLOSE); drm_ioctl_id_seq_print(DRM_IOCTL_PRIME_HANDLE_TO_FD); @@ -609,8 +558,6 @@ const struct { umode_t mode; } aie2_dbgfs_files[] = { AIE2_DBGFS_FILE(nputest, 0600), - AIE2_DBGFS_FILE(hclock, 0600), - AIE2_DBGFS_FILE(npuclock, 0600), AIE2_DBGFS_FILE(pasid, 0600), AIE2_DBGFS_FILE(state, 0600), AIE2_DBGFS_FILE(powerstate, 0600), diff --git a/src/driver/amdxdna/aie2_message.c b/src/driver/amdxdna/aie2_message.c index 66b3299a..e1ccd9af 100644 --- a/src/driver/amdxdna/aie2_message.c +++ b/src/driver/amdxdna/aie2_message.c @@ -16,8 +16,13 @@ #define DECLARE_AIE2_MSG(name, op) \ DECLARE_XDNA_MSG_COMMON(name, op, MAX_AIE2_STATUS_CODE) -static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev, - struct xdna_mailbox_msg *msg) +#define aie2_send_mgmt_msg_wait(ndev, msg) \ + aie2_send_mgmt_msg_wait_offset(ndev, msg, 0) + +static int +aie2_send_mgmt_msg_wait_offset(struct amdxdna_dev_hdl *ndev, + struct xdna_mailbox_msg *msg, + u32 offset) { struct amdxdna_dev *xdna = ndev->xdna; struct xdna_notify *hdl = msg->handle; @@ -34,7 +39,7 @@ static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev, ndev->mgmt_chann = NULL; } - if (!ret && *hdl->data != AIE2_STATUS_SUCCESS) { + if (!ret && hdl->data[offset] != AIE2_STATUS_SUCCESS) { XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x", msg->opcode, *hdl->data); ret = -EINVAL; @@ -95,18 +100,9 @@ int aie2_check_protocol_version(struct amdxdna_dev_hdl *ndev) return ret; } - if (resp.major != ndev->priv->protocol_major) { - XDNA_ERR(xdna, "Incompatible firmware protocol version major %d minor %d", - resp.major, resp.minor); - return -EINVAL; - } - - /* - * Greater protocol minor version means new messages/status/emun are - * added into the firmware interface protocol. - */ - if (resp.minor < ndev->priv->protocol_minor) { - XDNA_ERR(xdna, "Firmware minor version smaller than supported"); + ret = aie2_check_protocol(ndev, resp.major, resp.minor); + if (ret) { + XDNA_ERR(xdna, "Failed check protocol %d.%d", resp.major, resp.minor); return -EINVAL; } @@ -114,36 +110,6 @@ int aie2_check_protocol_version(struct amdxdna_dev_hdl *ndev) } #ifdef AMDXDNA_DEVEL -/* TODO: Delete this. 
move status to the first word of struct get_telemetry_resp */ -static int aie2_send_mgmt_msg_wait_for_telemetry(struct amdxdna_dev_hdl *ndev, - struct xdna_mailbox_msg *msg) -{ - struct amdxdna_dev *xdna = ndev->xdna; - struct xdna_notify *hdl = msg->handle; - struct get_telemetry_resp *resp; - int ret; - - if (!ndev->mgmt_chann) - return -ENODEV; - - drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); - ret = xdna_send_msg_wait(xdna, ndev->mgmt_chann, msg); - if (ret == -ETIME) { - xdna_mailbox_stop_channel(ndev->mgmt_chann); - xdna_mailbox_destroy_channel(ndev->mgmt_chann); - ndev->mgmt_chann = NULL; - } - - resp = (struct get_telemetry_resp *)hdl->data; - if (!ret && resp->status != AIE2_STATUS_SUCCESS) { - XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x", - msg->opcode, resp->status); - ret = -EINVAL; - } - - return ret; -} - int aie2_get_telemetry(struct amdxdna_dev_hdl *ndev, u32 type, dma_addr_t addr, u32 size) { DECLARE_AIE2_MSG(get_telemetry, MSG_OP_GET_TELEMETRY); @@ -159,7 +125,7 @@ int aie2_get_telemetry(struct amdxdna_dev_hdl *ndev, u32 type, dma_addr_t addr, req.buf_size = size; req.type = type; - ret = aie2_send_mgmt_msg_wait_for_telemetry(ndev, &msg); + ret = aie2_send_mgmt_msg_wait_offset(ndev, &msg, XDNA_STATUS_OFFSET(get_telemetry)); if (ret) { XDNA_ERR(xdna, "Failed to get telemetry, ret %d", ret); return ret; @@ -258,6 +224,7 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct { DECLARE_AIE2_MSG(create_ctx, MSG_OP_CREATE_CONTEXT); struct amdxdna_dev *xdna = ndev->xdna; + enum xdna_mailbox_channel_type type; struct xdna_mailbox_chann_res x2i; struct xdna_mailbox_chann_res i2x; struct cq_pair *cq_pair; @@ -296,8 +263,12 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct } intr_reg = i2x.mb_head_ptr_reg + 4; + if (aie2_pm_is_turbo(ndev)) + type = MB_CHANNEL_USER_POLL; + else + type = MB_CHANNEL_USER_NORMAL; hwctx->priv->mbox_chann = xdna_mailbox_create_channel(ndev->mbox, &x2i, &i2x, - intr_reg, ret); + intr_reg, ret, type); if (!hwctx->priv->mbox_chann) { XDNA_ERR(xdna, "not able to create channel"); ret = -EINVAL; diff --git a/src/driver/amdxdna/aie2_msg_priv.h b/src/driver/amdxdna/aie2_msg_priv.h index 663a6084..2d18ef63 100644 --- a/src/driver/amdxdna/aie2_msg_priv.h +++ b/src/driver/amdxdna/aie2_msg_priv.h @@ -186,7 +186,6 @@ struct exec_dpu_req { u32 inst_prop_cnt; u32 cu_idx; u32 payload[35]; - } __packed; struct exec_dpu_preempt_req { diff --git a/src/driver/amdxdna/aie2_pci.c b/src/driver/amdxdna/aie2_pci.c index 05bfe03e..2825710c 100644 --- a/src/driver/amdxdna/aie2_pci.c +++ b/src/driver/amdxdna/aie2_pci.c @@ -21,15 +21,31 @@ #include "aie2_internal.h" #endif -int aie2_max_col = XRS_MAX_COL; -module_param(aie2_max_col, int, 0600); +uint aie2_max_col = XRS_MAX_COL; +module_param(aie2_max_col, uint, 0600); MODULE_PARM_DESC(aie2_max_col, "Maximum column could be used"); +uint aie2_control_flags; +module_param(aie2_control_flags, uint, 0400); +MODULE_PARM_DESC(aie2_control_flags, + " Bit " __stringify(AIE2_BIT_BYPASS_POWER_SWITCH) ": Bypass power on/off," + " Bit " __stringify(AIE2_BIT_BYPASS_SET_FREQ) ": Bypass set freq," + " Bit " __stringify(AIE2_BIT_BYPASS_FW_LOAD) ": Bypass FW loading"); + /* * The management mailbox channel is allocated by firmware. * The related register and ring buffer information is on SRAM BAR. * This struct is the register layout. + * + * Mgmt channel info query flow: + * 1. Poll alive pointer register until it is non zero + * 2. 
The alive pointer pointing to Mgmt Mbox Info on SRAM bar + * 4. Read x2i_* and i2x_* + * 3. If magic number MGMT_MBOX_MAGIC not presented, done; + * Otherwise, read msi_id, major, minor etc.. */ +#define MGMT_MBOX_MAGIC 0x55504e5f /* _NPU */ +#define MAGIC_OFFSET offsetof(struct mgmt_mbox_chann_info, magic[0]) struct mgmt_mbox_chann_info { u32 x2i_tail; u32 x2i_head; @@ -39,8 +55,45 @@ struct mgmt_mbox_chann_info { u32 i2x_head; u32 i2x_buf; u32 i2x_buf_sz; + u32 magic; + u32 msi_id; + u32 prot_major; + u32 prot_minor; + u32 rsvd[4]; }; +int aie2_check_protocol(struct amdxdna_dev_hdl *ndev, u32 fw_major, u32 fw_minor) +{ + struct amdxdna_dev *xdna = ndev->xdna; + + /* + * The driver supported mailbox behavior is defined by + * ndev->priv->protocol_major and protocol_minor. + * + * When major different, it means incompatible behavior. + * When only minor different, the greater minor means more opcode etc. + * + * Thus, + * 1. driver and fw major must be the same + * 2. driver minor must smaller than or equal to fw minor + */ + if (ndev->priv->protocol_major != fw_major) { + XDNA_ERR(xdna, "Incompatible firmware protocol major %d minor %d", + fw_major, fw_minor); + return -EINVAL; + } + + /* + * Greater protocol minor version means new messages/status/emun are + * added into the firmware interface protocol. + */ + if (ndev->priv->protocol_minor > fw_minor) { + XDNA_ERR(xdna, "Firmware minor version smaller than supported"); + return -EINVAL; + } + return 0; +} + static inline void aie2_dump_chann_info_debug(struct amdxdna_dev_hdl *ndev) { struct amdxdna_dev *xdna = ndev->xdna; @@ -54,6 +107,11 @@ static inline void aie2_dump_chann_info_debug(struct amdxdna_dev_hdl *ndev) XDNA_DBG(xdna, "x2i ringbuf 0x%x", ndev->mgmt_x2i.rb_start_addr); XDNA_DBG(xdna, "x2i rsize 0x%x", ndev->mgmt_x2i.rb_size); XDNA_DBG(xdna, "x2i chann index 0x%x", ndev->mgmt_chan_idx); + if (!ndev->mgmt_prot_major) + return; + + XDNA_DBG(xdna, "mailbox protocol major 0x%x", ndev->mgmt_prot_major); + XDNA_DBG(xdna, "mailbox protocol minor 0x%x", ndev->mgmt_prot_minor); } static int aie2_get_mgmt_chann_info(struct amdxdna_dev_hdl *ndev) @@ -96,14 +154,25 @@ static int aie2_get_mgmt_chann_info(struct amdxdna_dev_hdl *ndev) x2i->mb_tail_ptr_reg = AIE2_MBOX_OFF(ndev, info_regs.x2i_tail); x2i->rb_start_addr = AIE2_SRAM_OFF(ndev, info_regs.x2i_buf); x2i->rb_size = info_regs.x2i_buf_sz; - ndev->mgmt_chan_idx = CHANN_INDEX(ndev, x2i->rb_start_addr); + if (info_regs.magic != MGMT_MBOX_MAGIC) { + ndev->mgmt_chan_idx = CHANN_INDEX(ndev, x2i->rb_start_addr); + goto done; + } + + ndev->mgmt_chan_idx = info_regs.msi_id; + ndev->mgmt_prot_major = info_regs.prot_major; + ndev->mgmt_prot_minor = info_regs.prot_minor; + if (aie2_check_protocol(ndev, ndev->mgmt_prot_major, ndev->mgmt_prot_minor)) + ret = -EINVAL; + +done: aie2_dump_chann_info_debug(ndev); /* Must clear address at FW_ALIVE_OFF */ writel(0, SRAM_GET_ADDR(ndev, FW_ALIVE_OFF)); - return 0; + return ret; } static int aie2_runtime_cfg(struct amdxdna_dev_hdl *ndev) @@ -165,10 +234,12 @@ static int aie2_mgmt_fw_init(struct amdxdna_dev_hdl *ndev) { int ret; - ret = aie2_check_protocol_version(ndev); - if (ret) { - XDNA_ERR(ndev->xdna, "Check header hash failed"); - return ret; + if (!ndev->mgmt_prot_major) { + ret = aie2_check_protocol_version(ndev); + if (ret) { + XDNA_ERR(ndev->xdna, "Check protocol version failed"); + return ret; + } } ret = aie2_runtime_cfg(ndev); @@ -242,7 +313,7 @@ static int aie2_set_dpm_level(void *cb_arg, u32 dpm_level) xdna = hwctx->client->xdna; - ret = 
aie2_smu_set_dpm_level(xdna->dev_handle, dpm_level, true); + ret = aie2_smu_set_dpm_level(xdna->dev_handle, dpm_level); if (ret) XDNA_ERR(xdna, "set dpm level failed, ret %d", ret); @@ -297,8 +368,10 @@ static void aie2_hw_stop(struct amdxdna_dev *xdna) xdna_mailbox_stop_channel(ndev->mgmt_chann); xdna_mailbox_destroy_channel(ndev->mgmt_chann); ndev->mgmt_chann = NULL; - xdna_mailbox_destroy(ndev->mbox); - ndev->mbox = NULL; + if (ndev->mbox) { + xdna_mailbox_destroy(ndev->mbox); + ndev->mbox = NULL; + } aie2_psp_stop(ndev->psp_hdl); aie2_smu_stop(ndev); pci_clear_master(pdev); @@ -334,7 +407,7 @@ static int aie2_hw_start(struct amdxdna_dev *xdna) ret = aie2_get_mgmt_chann_info(ndev); if (ret) { - XDNA_ERR(xdna, "firmware is not alive"); + XDNA_ERR(xdna, "firmware mgmt info ret %d", ret); goto stop_psp; } @@ -362,7 +435,7 @@ static int aie2_hw_start(struct amdxdna_dev *xdna) &ndev->mgmt_x2i, &ndev->mgmt_i2x, xdna_mailbox_intr_reg, - mgmt_mb_irq); + mgmt_mb_irq, MB_CHANNEL_MGMT); if (!ndev->mgmt_chann) { XDNA_ERR(xdna, "failed to create management mailbox channel"); ret = -EINVAL; @@ -411,6 +484,7 @@ static int aie2_init(struct amdxdna_dev *xdna) void __iomem * const *tbl; int i, bars, nvec, ret; + XDNA_DBG(xdna, "Control flags 0x%x", aie2_control_flags); ndev = devm_kzalloc(&pdev->dev, sizeof(*ndev), GFP_KERNEL); if (!ndev) return -ENOMEM; @@ -510,6 +584,7 @@ static int aie2_init(struct amdxdna_dev *xdna) aie2_smu_setup(ndev); ndev->pw_mode = POWER_MODE_DEFAULT; + ndev->clk_gate_enabled = true; ret = aie2_hw_start(xdna); if (ret) { XDNA_ERR(xdna, "start npu failed, ret %d", ret); @@ -523,6 +598,7 @@ static int aie2_init(struct amdxdna_dev *xdna) } ndev->total_col = min(aie2_max_col, ndev->metadata.cols); + xrs_cfg.max_dpm_level = SMU_DPM_MAX(ndev); xrs_cfg.clk_list.num_levels = ndev->priv->smu_npu_dpm_levels; xrs_cfg.clk_list.cu_clk_list = ndev->priv->smu_npu_dpm_clk_table; xrs_cfg.sys_eff_factor = 1; @@ -722,7 +798,7 @@ static int aie2_get_firmware_version(struct amdxdna_client *client, static int aie2_get_power_mode(struct amdxdna_client *client, struct amdxdna_drm_get_info *args) { - struct amdxdna_drm_get_power_mode mode; + struct amdxdna_drm_get_power_mode mode = {}; struct amdxdna_dev *xdna = client->xdna; struct amdxdna_dev_hdl *ndev; @@ -919,9 +995,8 @@ static int aie2_set_power_mode(struct amdxdna_client *client, struct amdxdna_drm return -EFAULT; } - /* Interpret the given buf->power_mode into the correct power mode*/ power_mode = power_state.power_mode; - if (power_mode > POWER_MODE_HIGH) { + if (power_mode > POWER_MODE_TURBO) { XDNA_ERR(xdna, "Invalid power mode %d", power_mode); return -EINVAL; } diff --git a/src/driver/amdxdna/aie2_pci.h b/src/driver/amdxdna/aie2_pci.h index b00e93ea..51cf66c9 100644 --- a/src/driver/amdxdna/aie2_pci.h +++ b/src/driver/amdxdna/aie2_pci.h @@ -67,15 +67,10 @@ pci_resource_len(NDEV2PDEV(_ndev), (_ndev)->xdna->dev_info->mbox_bar); \ }) -#define SMU_MPNPUCLK_FREQ_MAX(ndev) \ - ((ndev)->priv->smu_mpnpuclk_freq_max) -#define SMU_HCLK_FREQ_MAX(ndev) \ - ((ndev)->priv->smu_hclk_freq_max) #define SMU_DPM_MAX(ndev) \ - ((ndev)->priv->smu_dpm_max) - -#define SMU_NPU_DPM_TABLE_ENTRY(ndev, level) \ - (&ndev->priv->smu_npu_dpm_clk_table[level]) + ((ndev)->smu.num_dpm_levels - 1) +#define SMU_DPM_TABLE_ENTRY(ndev, level) \ + (&(ndev)->smu.dpm_table[level]) enum aie2_smu_reg_idx { SMU_CMD_REG = 0, @@ -154,12 +149,11 @@ struct clock { char name[16]; u32 max_freq_mhz; u32 freq_mhz; -#if defined(CONFIG_DEBUG_FS) - u32 dbg_freq_mhz; -#endif }; struct smu { + 
const struct dpm_clk *dpm_table; + u32 num_dpm_levels; struct clock mp_npu_clock; struct clock h_clock; u32 curr_dpm_level; @@ -199,6 +193,7 @@ struct amdxdna_hwctx_priv { u32 num_pending; struct amdxdna_gem_obj *cmd_buf[HWCTX_MAX_CMDS]; + struct workqueue_struct *submit_wq; }; struct async_events; @@ -214,6 +209,8 @@ struct amdxdna_dev_hdl { struct xdna_mailbox_chann_res mgmt_x2i; struct xdna_mailbox_chann_res mgmt_i2x; u32 mgmt_chan_idx; + u32 mgmt_prot_major; + u32 mgmt_prot_minor; u32 total_col; u32 smu_curr_dpm_level; @@ -221,6 +218,7 @@ struct amdxdna_dev_hdl { struct aie_metadata metadata; struct smu smu; enum amdxdna_power_mode_type pw_mode; + bool clk_gate_enabled; /* Mailbox and the management channel */ struct mailbox *mbox; @@ -265,10 +263,6 @@ struct amdxdna_dev_priv { struct aie2_bar_off_pair psp_regs_off[PSP_MAX_REGS]; struct aie2_bar_off_pair smu_regs_off[SMU_MAX_REGS]; struct rt_config_clk_gating clk_gating; - u32 smu_mpnpuclk_freq_max; - u32 smu_hclk_freq_max; - /* npu1: 0, not support dpm; npu2+: support dpm up to 7 */ - u32 smu_dpm_max; u32 smu_rev; const struct dpm_clk *smu_npu_dpm_clk_table; u32 smu_npu_dpm_levels; @@ -278,13 +272,17 @@ struct amdxdna_dev_priv { }; /* aie2_pci.c */ +#define AIE2_BIT_BYPASS_POWER_SWITCH 0 /* NOSYS */ +#define AIE2_BIT_BYPASS_SET_FREQ 1 +#define AIE2_BIT_BYPASS_FW_LOAD 2 /* NOSYS */ +extern uint aie2_control_flags; extern const struct amdxdna_dev_ops aie2_ops; +int aie2_check_protocol(struct amdxdna_dev_hdl *ndev, u32 fw_major, u32 fw_minor); /* aie2_smu.c */ void aie2_smu_setup(struct amdxdna_dev_hdl *ndev); int aie2_smu_start(struct amdxdna_dev_hdl *ndev); void aie2_smu_stop(struct amdxdna_dev_hdl *ndev); -int aie2_smu_set_clock_freq(struct amdxdna_dev_hdl *ndev, struct clock *clock, u32 freq_mhz); char *aie2_smu_get_mpnpu_clock_name(struct amdxdna_dev_hdl *ndev); char *aie2_smu_get_hclock_name(struct amdxdna_dev_hdl *ndev); int aie2_smu_get_mpnpu_clock_freq(struct amdxdna_dev_hdl *ndev); @@ -293,8 +291,7 @@ int aie2_smu_set_power_on(struct amdxdna_dev_hdl *ndev); int aie2_smu_set_power_off(struct amdxdna_dev_hdl *ndev); int aie2_smu_get_power_state(struct amdxdna_dev_hdl *ndev); int aie2_smu_get_dpm_level(struct amdxdna_dev_hdl *ndev); -int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level, bool cache); -void aie2_smu_prepare_s0i3(struct amdxdna_dev_hdl *ndev); +int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level); /* aie2_psp.c */ struct psp_device *aie2m_psp_create(struct device *dev, struct psp_config *conf); @@ -365,6 +362,7 @@ void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map); /* aie2_pm.c */ int aie2_pm_start(struct amdxdna_dev_hdl *ndev); void aie2_pm_stop(struct amdxdna_dev_hdl *ndev); +bool aie2_pm_is_turbo(struct amdxdna_dev_hdl *ndev); int aie2_pm_set_mode(struct amdxdna_dev_hdl *ndev, enum amdxdna_power_mode_type target); #endif /* _AIE2_PCI_H_ */ diff --git a/src/driver/amdxdna/aie2_pm.c b/src/driver/amdxdna/aie2_pm.c index 64468c72..0a71e5b2 100644 --- a/src/driver/amdxdna/aie2_pm.c +++ b/src/driver/amdxdna/aie2_pm.c @@ -5,13 +5,19 @@ #include "aie2_pci.h" -static int aie2_pm_clock_gating(struct amdxdna_dev_hdl *ndev, bool enable) +static int aie2_pm_clock_gating(struct amdxdna_dev_hdl *ndev, + enum amdxdna_power_mode_type target) { const struct rt_config_clk_gating *config; + bool enable; u32 value; int ret; int i; + enable = (target != POWER_MODE_TURBO && target != POWER_MODE_HIGH); + if (enable == ndev->clk_gate_enabled) + return 0; + config = 
&ndev->priv->clk_gating; if (enable) value = config->value_enable; @@ -30,9 +36,40 @@ static int aie2_pm_clock_gating(struct amdxdna_dev_hdl *ndev, bool enable) } } + if (!ret) + ndev->clk_gate_enabled = enable; + return ret; } +bool aie2_pm_is_turbo(struct amdxdna_dev_hdl *ndev) +{ + return ndev->pw_mode == POWER_MODE_TURBO; +} + +static int aie2_pm_check_turbo(struct amdxdna_dev_hdl *ndev, + enum amdxdna_power_mode_type prev, + enum amdxdna_power_mode_type next) +{ + struct amdxdna_dev *xdna = ndev->xdna; + struct amdxdna_client *client; + + if (prev != POWER_MODE_TURBO && next != POWER_MODE_TURBO) + return 0; + + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + list_for_each_entry(client, &xdna->client_list, node) { + bool empty; + + mutex_lock(&client->hwctx_lock); + empty = idr_is_empty(&client->hwctx_idr); + mutex_unlock(&client->hwctx_lock); + if (!empty) + return -EBUSY; + } + return 0; +} + int aie2_pm_set_mode(struct amdxdna_dev_hdl *ndev, enum amdxdna_power_mode_type target) { struct amdxdna_dev *xdna = ndev->xdna; @@ -44,23 +81,29 @@ int aie2_pm_set_mode(struct amdxdna_dev_hdl *ndev, enum amdxdna_power_mode_type if (target == POWER_MODE_LOW || target == POWER_MODE_MEDIUM) return -EOPNOTSUPP; - XDNA_DBG(xdna, "Changing power mode from %d to %d", ndev->pw_mode, target); - /* Set resource solver power property to the user choice */ + ret = aie2_pm_check_turbo(ndev, ndev->pw_mode, target); + if (ret) { + XDNA_WARN(xdna, "Change Turbo mode failed"); + return ret; + } - /* Set power level within the device */ + XDNA_DBG(xdna, "Changing power mode from %d to %d", ndev->pw_mode, target); - /* - * Other mode -> POWER_MODE_HIGH: Turn off clock gating - * POWER_MODE_HIGH -> Other mode: Turn on clock gating - * Otherwise, no change + /* TODO: + *switch (ndev->pw_mode) { + *case POWER_MODE_LOW: + * Set to low DPM level + *case POWER_MODE_MEDIUM: + * Set to medium DPM level + *case POWER_MODE_HIGH: + *case POWER_MODE_TURBO: + * Set to highest DPM level + *default: + * Let driver decides DPM level + *} */ - if (target == POWER_MODE_HIGH) { - XDNA_DBG(xdna, "Clock gating turning off"); - ret = aie2_pm_clock_gating(ndev, false); - } else if (ndev->pw_mode == POWER_MODE_HIGH) { - XDNA_DBG(xdna, "Clock gating turning on"); - ret = aie2_pm_clock_gating(ndev, true); - } + + ret = aie2_pm_clock_gating(ndev, target); if (ret) { XDNA_ERR(xdna, "Failed to config clock gating"); return ret; @@ -73,21 +116,10 @@ int aie2_pm_set_mode(struct amdxdna_dev_hdl *ndev, enum amdxdna_power_mode_type int aie2_pm_start(struct amdxdna_dev_hdl *ndev) { - /* - * TODO: should only skip POWER_MODE_DEFAULT. 
- * Let's make it right after full DPM support is ready - */ - if (ndev->pw_mode != POWER_MODE_HIGH) - return 0; - - return aie2_pm_clock_gating(ndev, false); + return aie2_pm_clock_gating(ndev, ndev->pw_mode); } void aie2_pm_stop(struct amdxdna_dev_hdl *ndev) { - if (ndev->pw_mode != POWER_MODE_HIGH) - return; - - /* Clock gating must be turned ON before suspend firmware */ - aie2_pm_clock_gating(ndev, true); + aie2_pm_clock_gating(ndev, POWER_MODE_DEFAULT); } diff --git a/src/driver/amdxdna/aie2_smu.c b/src/driver/amdxdna/aie2_smu.c index 3675b4e5..c01a44ba 100644 --- a/src/driver/amdxdna/aie2_smu.c +++ b/src/driver/amdxdna/aie2_smu.c @@ -10,12 +10,21 @@ /* SMU commands */ #define AIE2_SMU_POWER_ON 0x3 #define AIE2_SMU_POWER_OFF 0x4 +/* For SMU v0 */ #define AIE2_SMU_SET_MPNPUCLK_FREQ 0x5 #define AIE2_SMU_SET_HCLK_FREQ 0x6 +/* For SMU v1 */ #define AIE2_SMU_SET_SOFT_DPMLEVEL 0x7 #define AIE2_SMU_SET_HARD_DPMLEVEL 0x8 -static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd, u32 reg_arg) +/* This is a hack for NPU1 device */ +const struct dpm_clk npu1_hack_dpm_clk_table[] = { + {400, 800}, + {600, 1024}, +}; + +static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd, + u32 reg_arg, u32 *out) { u32 resp; int ret; @@ -35,6 +44,9 @@ static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd, u32 reg_arg) return ret; } + if (out) + *out = readl(SMU_REG(ndev, SMU_OUT_REG)); + if (resp != SMU_RESULT_OK) { XDNA_ERR(ndev->xdna, "SMU cmd %d failed, 0x%x", reg_cmd, resp); return -EINVAL; @@ -43,26 +55,12 @@ static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd, u32 reg_arg) return 0; } -static int aie2_smu_update_clock_freq(struct amdxdna_dev_hdl *ndev, u32 cmd, - struct clock *clock, u32 freq_mhz) -{ - int ret; - - ret = aie2_smu_exec(ndev, cmd, freq_mhz); - if (ret) - return ret; - - clock->freq_mhz = freq_mhz; - - return 0; -} - /* * Depending on the current running frequency and debugfs setting, * aie2_smu_set_clock_freq() might or might not update freqency. 
*/ -int aie2_smu_set_clock_freq(struct amdxdna_dev_hdl *ndev, - struct clock *clock, u32 freq_mhz) +static int aie2_smu_set_clock_freq(struct amdxdna_dev_hdl *ndev, + struct clock *clock, u32 freq_mhz) { u32 smu_cmd; int ret; @@ -82,18 +80,11 @@ int aie2_smu_set_clock_freq(struct amdxdna_dev_hdl *ndev, if (freq_mhz == clock->freq_mhz) return 0; -#if defined(CONFIG_DEBUG_FS) - /* If freq is set by debugfs, respect it until debugfs write freq 0 */ - if (clock->dbg_freq_mhz && freq_mhz != clock->dbg_freq_mhz) { - XDNA_DBG(ndev->xdna, "%s debug freq %d, ignore target freq %d", - clock->name, clock->dbg_freq_mhz, freq_mhz); - return 0; - } -#endif - ret = aie2_smu_update_clock_freq(ndev, smu_cmd, clock, freq_mhz); + ret = aie2_smu_exec(ndev, smu_cmd, freq_mhz, NULL); if (ret) return ret; + clock->freq_mhz = freq_mhz; XDNA_DBG(ndev->xdna, "Set %s = %d mhz", clock->name, clock->freq_mhz); return 0; } @@ -120,23 +111,23 @@ char *aie2_smu_get_hclock_name(struct amdxdna_dev_hdl *ndev) static int aie2_smu_set_dpm_level_v0(struct amdxdna_dev_hdl *ndev, u32 dpm_level) { - int ret; - const struct dpm_clk *dpm_entry = SMU_NPU_DPM_TABLE_ENTRY(ndev, dpm_level); + const struct dpm_clk *dpm_entry = SMU_DPM_TABLE_ENTRY(ndev, dpm_level); struct clock *clk; + int ret; clk = &ndev->smu.mp_npu_clock; - ret = aie2_smu_set_clock_freq(ndev, clk, dpm_entry->npuclk); if (ret) { - XDNA_ERR(ndev->xdna, "setting npu clk failed for dpm level %d, ret: %d", dpm_level, ret); + XDNA_ERR(ndev->xdna, "setting npu clk failed for dpm level %d, ret: %d", + dpm_level, ret); return ret; } clk = &ndev->smu.h_clock; - ret = aie2_smu_set_clock_freq(ndev, clk, dpm_entry->hclk); if (ret) { - XDNA_ERR(ndev->xdna, "setting hclk failed for dpm level %d, ret: %d", dpm_level, ret); + XDNA_ERR(ndev->xdna, "setting hclk failed for dpm level %d, ret: %d", + dpm_level, ret); return ret; } @@ -147,16 +138,19 @@ static int aie2_smu_set_dpm_level_v1(struct amdxdna_dev_hdl *ndev, u32 dpm_level { int ret; - ret = aie2_smu_exec(ndev, AIE2_SMU_SET_HARD_DPMLEVEL, dpm_level); + ret = aie2_smu_exec(ndev, AIE2_SMU_SET_HARD_DPMLEVEL, dpm_level, NULL); if (!ret) XDNA_INFO_ONCE(ndev->xdna, "Set hard dpm level = %d", dpm_level); else return ret; - ret = aie2_smu_exec(ndev, AIE2_SMU_SET_SOFT_DPMLEVEL, dpm_level); + ret = aie2_smu_exec(ndev, AIE2_SMU_SET_SOFT_DPMLEVEL, dpm_level, NULL); if (!ret) XDNA_INFO_ONCE(ndev->xdna, "Set soft dpm level = %d", dpm_level); + ndev->smu.mp_npu_clock.freq_mhz = SMU_DPM_TABLE_ENTRY(ndev, dpm_level)->npuclk; + ndev->smu.h_clock.freq_mhz = SMU_DPM_TABLE_ENTRY(ndev, dpm_level)->hclk; + return ret; } @@ -165,11 +159,16 @@ int aie2_smu_get_dpm_level(struct amdxdna_dev_hdl *ndev) return ndev->smu.curr_dpm_level; } -int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level, bool cache) +int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level) { int ret; - if (dpm_level < 0 || dpm_level > SMU_DPM_MAX(ndev)) + if (aie2_control_flags & BIT(AIE2_BIT_BYPASS_SET_FREQ)) { + XDNA_DBG(ndev->xdna, "Bypassed set dpm level"); + return 0; + } + + if (dpm_level > SMU_DPM_MAX(ndev)) return -EINVAL; if (!ndev->priv->smu_rev) @@ -177,8 +176,10 @@ int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level, bool cac else ret = aie2_smu_set_dpm_level_v1(ndev, dpm_level); - if (!ret & cache) + if (!ret) { ndev->smu.curr_dpm_level = dpm_level; + XDNA_DBG(ndev->xdna, "The dpm level is set to %d", dpm_level); + } return ret; } @@ -187,7 +188,7 @@ int aie2_smu_set_power_on(struct amdxdna_dev_hdl *ndev) { int ret; - ret = 
aie2_smu_exec(ndev, AIE2_SMU_POWER_ON, 0); + ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_ON, 0, NULL); if (ret) return ret; @@ -199,7 +200,7 @@ int aie2_smu_set_power_off(struct amdxdna_dev_hdl *ndev) { int ret; - ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_OFF, 0); + ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_OFF, 0, NULL); if (ret) return ret; @@ -215,7 +216,6 @@ int aie2_smu_get_power_state(struct amdxdna_dev_hdl *ndev) int aie2_smu_start(struct amdxdna_dev_hdl *ndev) { struct smu *smu = &ndev->smu; - u32 freq_mhz; int ret; ret = aie2_smu_set_power_on(ndev); @@ -224,63 +224,23 @@ int aie2_smu_start(struct amdxdna_dev_hdl *ndev) return ret; } - freq_mhz = smu->mp_npu_clock.freq_mhz; - ret = aie2_smu_update_clock_freq(ndev, AIE2_SMU_SET_MPNPUCLK_FREQ, - &smu->mp_npu_clock, freq_mhz); + ret = aie2_smu_set_dpm_level(ndev, smu->curr_dpm_level); if (ret) { - XDNA_ERR(ndev->xdna, "Set mpnpu clk freq failed, ret %d", ret); + XDNA_ERR(ndev->xdna, "Set dpm level failed, ret %d", ret); return ret; } - XDNA_INFO_ONCE(ndev->xdna, "Set %s = %d mhz", smu->mp_npu_clock.name, freq_mhz); - - freq_mhz = smu->h_clock.freq_mhz; - ret = aie2_smu_update_clock_freq(ndev, AIE2_SMU_SET_HCLK_FREQ, - &smu->h_clock, freq_mhz); - if (ret) { - XDNA_ERR(ndev->xdna, "Set hclk freq failed, ret %d", ret); - return ret; - } - XDNA_INFO_ONCE(ndev->xdna, "Set %s = %d mhz", smu->h_clock.name, freq_mhz); - - if (SMU_DPM_MAX(ndev) > 0) { - ret = aie2_smu_set_dpm_level(ndev, smu->curr_dpm_level, true); - if (ret) { - XDNA_ERR(ndev->xdna, "Set dpm level failed, ret %d", ret); - return ret; - } - } return 0; } -void aie2_smu_prepare_s0i3(struct amdxdna_dev_hdl *ndev) -{ - u32 freq_mhz; - int ret; - - freq_mhz = 400; - ret = aie2_smu_exec(ndev, AIE2_SMU_SET_MPNPUCLK_FREQ, freq_mhz); - if (ret) - XDNA_ERR(ndev->xdna, "Set mpnpu clk freq %d mhz failed, ret %d", freq_mhz, ret); - - freq_mhz = 800; - ret = aie2_smu_exec(ndev, AIE2_SMU_SET_HCLK_FREQ, freq_mhz); - if (ret) - XDNA_ERR(ndev->xdna, "Set hclk freq %d mhz failed, ret %d", freq_mhz, ret); - - if (SMU_DPM_MAX(ndev) > 0) { - ret = aie2_smu_set_dpm_level(ndev, 0, false); - if (ret) - XDNA_ERR(ndev->xdna, "Set dpm level 0 failed, ret %d", ret); - } -} - void aie2_smu_stop(struct amdxdna_dev_hdl *ndev) { int ret; /* Minimize clocks/dpm level prior to power off */ - aie2_smu_prepare_s0i3(ndev); + ret = aie2_smu_set_dpm_level(ndev, 0); + if (ret) + XDNA_WARN(ndev->xdna, "Set dpm level 0 failed, ret %d", ret); ret = aie2_smu_set_power_off(ndev); if (ret) @@ -292,13 +252,26 @@ void aie2_smu_setup(struct amdxdna_dev_hdl *ndev) struct smu *smu = &ndev->smu; snprintf(smu->mp_npu_clock.name, sizeof(smu->mp_npu_clock.name), "MP-NPU Clock"); - smu->mp_npu_clock.max_freq_mhz = SMU_MPNPUCLK_FREQ_MAX(ndev); - snprintf(smu->h_clock.name, sizeof(smu->h_clock.name), "H Clock"); - smu->h_clock.max_freq_mhz = SMU_HCLK_FREQ_MAX(ndev); - - /* The first time SMU start, it will use below clock frequency */ - smu->mp_npu_clock.freq_mhz = smu->mp_npu_clock.max_freq_mhz; - smu->h_clock.freq_mhz = smu->h_clock.max_freq_mhz; + smu->dpm_table = ndev->priv->smu_npu_dpm_clk_table; + smu->num_dpm_levels = ndev->priv->smu_npu_dpm_levels; smu->curr_dpm_level = SMU_DPM_MAX(ndev); + + if (!ndev->priv->smu_rev) { + u32 npuclk_freq; + u32 out; + + /* This is a hack for special NPU1 device */ + npuclk_freq = SMU_DPM_TABLE_ENTRY(ndev, SMU_DPM_MAX(ndev))->npuclk; + aie2_smu_exec(ndev, AIE2_SMU_SET_MPNPUCLK_FREQ, npuclk_freq, &out); + if (npuclk_freq != out) { + XDNA_DBG(ndev->xdna, "Use small DPM table"); + smu->dpm_table = 
npu1_hack_dpm_clk_table; + smu->num_dpm_levels = ARRAY_SIZE(npu1_hack_dpm_clk_table); + smu->curr_dpm_level = SMU_DPM_MAX(ndev); + } + } + + smu->mp_npu_clock.max_freq_mhz = SMU_DPM_TABLE_ENTRY(ndev, SMU_DPM_MAX(ndev))->npuclk; + smu->h_clock.max_freq_mhz = SMU_DPM_TABLE_ENTRY(ndev, SMU_DPM_MAX(ndev))->hclk; } diff --git a/src/driver/amdxdna/aie2_solver.c b/src/driver/amdxdna/aie2_solver.c index 289f2ec7..38de5a9d 100644 --- a/src/driver/amdxdna/aie2_solver.c +++ b/src/driver/amdxdna/aie2_solver.c @@ -95,6 +95,19 @@ static int sanity_check(struct solver_state *xrs, struct alloc_requests *req) return 0; } +static bool is_valid_qos_dpm_params(struct aie_qos *rqos) +{ + /* + * gops is retrieved from the xmodel, so it's always set + * fps and latency are the configurable params from the application + */ + if (rqos->gops > 0 && (rqos->fps > 0 || rqos->latency > 0)) { + return true; + } + + return false; +} + static u32 find_dpm_level(struct solver_state *xrs, struct alloc_requests *req) { struct cdo_parts *cdop = &req->cdo; @@ -103,8 +116,9 @@ static u32 find_dpm_level(struct solver_state *xrs, struct alloc_requests *req) struct solver_node *node; u32 cu_clk_freq, dpm_level; - if (cdop->ncols > xrs->cfg.total_col) - return -EINVAL; + /* If no QoS parameters are passed, set it to the max DPM level */ + if (!is_valid_qos_dpm_params(rqos)) + return xrs->cfg.max_dpm_level; /* * We can find at least one CDOs groups that meet the diff --git a/src/driver/amdxdna/aie2_solver.h b/src/driver/amdxdna/aie2_solver.h index 19fd4b87..98b16380 100644 --- a/src/driver/amdxdna/aie2_solver.h +++ b/src/driver/amdxdna/aie2_solver.h @@ -91,6 +91,7 @@ struct init_config { u32 total_col; u32 sys_eff_factor; /* system efficiency factor */ u32 latency_adj; /* latency adjustment in ms */ + u32 max_dpm_level; /* Max dpm level in the system */ struct clk_list_info clk_list; /* List of frequencies available in system */ struct device *dev; struct xrs_action_ops *actions; diff --git a/src/driver/amdxdna/amdxdna_ctx.c b/src/driver/amdxdna/amdxdna_ctx.c index ba263879..9bb219f7 100644 --- a/src/driver/amdxdna/amdxdna_ctx.c +++ b/src/driver/amdxdna/amdxdna_ctx.c @@ -333,7 +333,7 @@ amdxdna_arg_bos_lookup(struct amdxdna_client *client, abo = to_xdna_obj(gobj); mutex_lock(&abo->lock); - if (abo->pinned) { + if (abo->flags & BO_SUBMIT_PINNED) { mutex_unlock(&abo->lock); job->bos[i] = gobj; continue; @@ -345,7 +345,7 @@ amdxdna_arg_bos_lookup(struct amdxdna_client *client, drm_gem_object_put(gobj); goto put_arg_bos; } - abo->pinned = true; + abo->flags |= BO_SUBMIT_PINNED; mutex_unlock(&abo->lock); job->bos[i] = gobj; @@ -375,6 +375,77 @@ void amdxdna_job_put(struct amdxdna_sched_job *job) kref_put(&job->refcnt, amdxdna_sched_job_release); } +int amdxdna_lock_objects(struct amdxdna_sched_job *job, struct ww_acquire_ctx *ctx) +{ + struct amdxdna_dev *xdna = job->hwctx->client->xdna; + struct amdxdna_gem_obj *abo; + int contended = -1, i, ret; + + ww_acquire_init(ctx, &reservation_ww_class); + +retry: + if (contended != -1) { + ret = dma_resv_lock_slow_interruptible(job->bos[contended]->resv, ctx); + if (ret) { + ww_acquire_fini(ctx); + return ret; + } + abo->flags |= BO_SUBMIT_LOCKED; + } + + for (i = 0; i < job->bo_cnt; i++) { + abo = to_xdna_obj(job->bos[i]); + if (abo->flags & BO_SUBMIT_LOCKED) + continue; + + ret = dma_resv_lock_interruptible(job->bos[i]->resv, ctx); + if (ret) { + int j; + + for (j = 0; j < i; j++) { + abo = to_xdna_obj(job->bos[j]); + dma_resv_unlock(job->bos[j]->resv); + abo->flags &= ~BO_SUBMIT_LOCKED; + } + 
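			/*
			 * Everything locked in this pass has now been dropped.
			 * The checks below finish the standard ww-mutex backoff:
			 * drop the slow-path lock if it was taken for a BO not
			 * yet reached, and on -EDEADLK remember the contended BO
			 * so the retry pass re-acquires it first with
			 * dma_resv_lock_slow_interruptible() before relocking
			 * the rest.
			 */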
+ if (contended != -1 && contended >= i) + dma_resv_unlock(job->bos[contended]->resv); + + if (ret == -EDEADLK) { + contended = i; + goto retry; + } + + ww_acquire_fini(ctx); + + XDNA_ERR(xdna, "Lock BO failed, ret %d", ret); + return ret; + } + abo->flags |= BO_SUBMIT_LOCKED; + } + + ww_acquire_done(ctx); + + return 0; +} + +void amdxdna_unlock_objects(struct amdxdna_sched_job *job, struct ww_acquire_ctx *ctx) +{ + struct amdxdna_gem_obj *abo; + int i; + + for (i = 0; i < job->bo_cnt; i++) { + abo = to_xdna_obj(job->bos[i]); + if (!(abo->flags & BO_SUBMIT_LOCKED)) + continue; + + dma_resv_unlock(job->bos[i]->resv); + abo->flags &= ~BO_SUBMIT_LOCKED; + } + + ww_acquire_fini(ctx); +} + int amdxdna_cmd_submit(struct amdxdna_client *client, u32 opcode, u32 cmd_bo_hdl, u32 *arg_bo_hdls, u32 arg_bo_cnt, u32 hwctx_hdl, u64 *seq) diff --git a/src/driver/amdxdna/amdxdna_ctx.h b/src/driver/amdxdna/amdxdna_ctx.h index c1d7ba17..6ccaa45a 100644 --- a/src/driver/amdxdna/amdxdna_ctx.h +++ b/src/driver/amdxdna/amdxdna_ctx.h @@ -228,6 +228,8 @@ void amdxdna_hwctx_remove_all(struct amdxdna_client *client); void amdxdna_hwctx_suspend(struct amdxdna_client *client); void amdxdna_hwctx_resume(struct amdxdna_client *client); +int amdxdna_lock_objects(struct amdxdna_sched_job *job, struct ww_acquire_ctx *ctx); +void amdxdna_unlock_objects(struct amdxdna_sched_job *job, struct ww_acquire_ctx *ctx); int amdxdna_cmd_submit(struct amdxdna_client *client, u32 opcode, u32 cmd_bo_hdls, u32 *arg_bo_hdls, u32 arg_bo_cnt, u32 hwctx_hdl, u64 *seq); diff --git a/src/driver/amdxdna/amdxdna_devel.c b/src/driver/amdxdna/amdxdna_devel.c index c4dd3ee5..3f6698a0 100644 --- a/src/driver/amdxdna/amdxdna_devel.c +++ b/src/driver/amdxdna/amdxdna_devel.c @@ -171,6 +171,9 @@ void amdxdna_bo_dma_unmap(struct amdxdna_gem_obj *abo) struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev); XDNA_DBG(xdna, "BO type %d dma_addr 0x%llx", abo->type, abo->mem.dma_addr); + if (is_import_bo(abo)) + return; + drm_gem_shmem_put_pages(&abo->base); } #else diff --git a/src/driver/amdxdna/amdxdna_drm.c b/src/driver/amdxdna/amdxdna_drm.c index e44062c5..b080d1ea 100644 --- a/src/driver/amdxdna/amdxdna_drm.c +++ b/src/driver/amdxdna/amdxdna_drm.c @@ -117,8 +117,17 @@ static int amdxdna_flush(struct file *f, fl_owner_t id) struct drm_file *filp = f->private_data; struct amdxdna_client *client = filp->driver_priv; struct amdxdna_dev *xdna = client->xdna; + pid_t pid = task_tgid_nr(current); int idx; + /* When current PID not equals to Client PID, this is a flush() + * triggered by closing a child process. If this is the case, flush() is + * just a no-op. The process which open() device should finally flush() + * and close() device. 
+ */ + if (pid != client->pid) + return 0; + XDNA_DBG(xdna, "PID %d flushing...", client->pid); if (!drm_dev_enter(&xdna->ddev, &idx)) return 0; @@ -229,7 +238,7 @@ const struct drm_driver amdxdna_drm_drv = { /* For shmem object create */ .gem_create_object = amdxdna_gem_create_object_cb, #ifdef AMDXDNA_SHMEM - .gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table, + .gem_prime_import = amdxdna_gem_prime_import, #else .gem_prime_import_sg_table = drm_gem_dma_prime_import_sg_table, #endif diff --git a/src/driver/amdxdna/amdxdna_gem.c b/src/driver/amdxdna/amdxdna_gem.c index 6ef81a9b..eb25f5e6 100644 --- a/src/driver/amdxdna/amdxdna_gem.c +++ b/src/driver/amdxdna/amdxdna_gem.c @@ -60,48 +60,6 @@ amdxdna_gem_insert_node_locked(struct amdxdna_gem_obj *abo, bool use_vmap) return 0; } -static void amdxdna_gem_obj_free(struct drm_gem_object *gobj) -{ - struct amdxdna_dev *xdna = to_xdna_dev(gobj->dev); - struct amdxdna_gem_obj *abo = to_xdna_obj(gobj); - struct iosys_map map = IOSYS_MAP_INIT_VADDR(abo->mem.kva); - - XDNA_DBG(xdna, "BO type %d xdna_addr 0x%llx", abo->type, abo->mem.dev_addr); - if (abo->pinned) - amdxdna_gem_unpin(abo); - - flush_work(&abo->hmm_unreg_work); - if (abo->type == AMDXDNA_BO_DEV) { - mutex_lock(&abo->client->mm_lock); - drm_mm_remove_node(&abo->mm_node); - mutex_unlock(&abo->client->mm_lock); - - vunmap(abo->mem.kva); - drm_gem_object_put(to_gobj(abo->dev_heap)); - drm_gem_object_release(gobj); - mutex_destroy(&abo->lock); - kfree(abo); - return; - } - - if (abo->type == AMDXDNA_BO_DEV_HEAP) - drm_mm_takedown(&abo->mm); - -#ifdef AMDXDNA_DEVEL - if (abo->type == AMDXDNA_BO_CMD) - amdxdna_mem_unmap(xdna, &abo->mem); - else if (iommu_mode == AMDXDNA_IOMMU_NO_PASID) - amdxdna_bo_dma_unmap(abo); -#endif - drm_gem_vunmap_unlocked(gobj, &map); - mutex_destroy(&abo->lock); - drm_gem_shmem_free(&abo->base); -} - -static const struct drm_gem_object_funcs amdxdna_gem_dev_obj_funcs = { - .free = amdxdna_gem_obj_free, -}; - static bool amdxdna_hmm_invalidate(struct mmu_interval_notifier *mni, const struct mmu_notifier_range *range, unsigned long cur_seq) @@ -136,8 +94,11 @@ static void amdxdna_hmm_unregister(struct amdxdna_gem_obj *abo) if (!xdna->dev_info->ops->hmm_invalidate) return; - if (!abo->mem.pfns) + mutex_lock(&abo->lock); + if (!abo->mem.pfns) { + mutex_unlock(&abo->lock); return; + } mmu_interval_notifier_remove(&abo->mem.notifier); kvfree(abo->mem.pfns); @@ -145,6 +106,8 @@ static void amdxdna_hmm_unregister(struct amdxdna_gem_obj *abo) if (is_import_bo(abo) && vma->vm_file && vma->vm_file->f_mapping) mapping_clear_unevictable(vma->vm_file->f_mapping); + + mutex_unlock(&abo->lock); } static int amdxdna_hmm_register(struct amdxdna_gem_obj *abo, @@ -159,14 +122,19 @@ static int amdxdna_hmm_register(struct amdxdna_gem_obj *abo, if (!xdna->dev_info->ops->hmm_invalidate) return 0; - if (abo->mem.pfns) - return -EEXIST; + mutex_lock(&abo->lock); + if (abo->mem.pfns) { + ret = -EEXIST; + goto out_unlock; + } nr_pages = (PAGE_ALIGN(addr + len) - (addr & PAGE_MASK)) >> PAGE_SHIFT; abo->mem.pfns = kvcalloc(nr_pages, sizeof(unsigned long), GFP_KERNEL); - if (!abo->mem.pfns) - return -ENOMEM; + if (!abo->mem.pfns) { + ret = -ENOMEM; + goto out_unlock; + } ret = mmu_interval_notifier_insert_locked(&abo->mem.notifier, current->mm, @@ -175,65 +143,122 @@ static int amdxdna_hmm_register(struct amdxdna_gem_obj *abo, &amdxdna_hmm_ops); if (ret) { XDNA_ERR(xdna, "Insert mmu notifier failed, ret %d", ret); - kvfree(abo->mem.pfns); - abo->mem.pfns = NULL; - return ret; + goto 
free_pfns; } abo->mem.userptr = addr; abo->mem.vma = vma; if (is_import_bo(abo) && vma->vm_file && vma->vm_file->f_mapping) mapping_set_unevictable(vma->vm_file->f_mapping); + mutex_unlock(&abo->lock); + return 0; + +free_pfns: + kvfree(abo->mem.pfns); + abo->mem.pfns = NULL; +out_unlock: + mutex_unlock(&abo->lock); + return ret; } -static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data) +static void amdxdna_gem_obj_free(struct drm_gem_object *gobj) { - if (pte_none(ptep_get(pte))) - return -EINVAL; + struct amdxdna_dev *xdna = to_xdna_dev(gobj->dev); + struct amdxdna_gem_obj *abo = to_xdna_obj(gobj); + struct iosys_map map = IOSYS_MAP_INIT_VADDR(abo->mem.kva); - *(bool *)data = true; - return 0; + XDNA_DBG(xdna, "BO type %d xdna_addr 0x%llx", abo->type, abo->mem.dev_addr); + if (abo->flags & BO_SUBMIT_PINNED) + amdxdna_gem_unpin(abo); + + amdxdna_hmm_unregister(abo); + flush_work(&abo->hmm_unreg_work); + if (abo->type == AMDXDNA_BO_DEV) { + mutex_lock(&abo->client->mm_lock); + drm_mm_remove_node(&abo->mm_node); + mutex_unlock(&abo->client->mm_lock); + + vunmap(abo->mem.kva); + drm_gem_object_put(to_gobj(abo->dev_heap)); + drm_gem_object_release(gobj); + mutex_destroy(&abo->lock); + kfree(abo); + return; + } + + if (abo->type == AMDXDNA_BO_DEV_HEAP) + drm_mm_takedown(&abo->mm); + +#ifdef AMDXDNA_DEVEL + if (abo->type == AMDXDNA_BO_CMD) + amdxdna_mem_unmap(xdna, &abo->mem); + else if (iommu_mode == AMDXDNA_IOMMU_NO_PASID) + amdxdna_bo_dma_unmap(abo); +#endif + drm_gem_vunmap_unlocked(gobj, &map); + mutex_destroy(&abo->lock); + drm_gem_shmem_free(&abo->base); } +static const struct drm_gem_object_funcs amdxdna_gem_dev_obj_funcs = { + .free = amdxdna_gem_obj_free, +}; + static int amdxdna_insert_pages(struct amdxdna_gem_obj *abo, struct vm_area_struct *vma) { - unsigned long num_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - struct sg_dma_page_iter sg_iter; - bool has_mapped_page = false; + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev); + unsigned long num_pages = vma_pages(vma); unsigned long offset = 0; int ret; if (!is_import_bo(abo)) { + ret = drm_gem_shmem_mmap(&abo->base, vma); + if (ret) { + XDNA_ERR(xdna, "Failed shmem mmap %d", ret); + return ret; + } + /* The buffer is based on memory pages. Fix the flag. 
*/ vm_flags_mod(vma, VM_MIXEDMAP, VM_PFNMAP); - return vm_insert_pages(vma, vma->vm_start, abo->base.pages, - &num_pages); - } + ret = vm_insert_pages(vma, vma->vm_start, abo->base.pages, + &num_pages); + if (ret) { + XDNA_ERR(xdna, "Failed insert pages %d", ret); + vma->vm_ops->close(vma); + return ret; + } - ret = apply_to_page_range(vma->vm_mm, vma->vm_start, num_pages, - is_mapped_fn, &has_mapped_page); - if (!ret) return 0; + } - if (has_mapped_page) - return -EBUSY; + vma->vm_private_data = NULL; + vma->vm_ops = NULL; + ret = dma_buf_mmap(to_gobj(abo)->dma_buf, vma, 0); + if (ret) { + XDNA_ERR(xdna, "Failed to mmap dma buf %d", ret); + return ret; + } - for_each_sgtable_dma_page(abo->base.sgt, &sg_iter, 0) { - dma_addr_t addr = sg_page_iter_dma_address(&sg_iter); - unsigned long pfn; + do { + vm_fault_t fault_ret; - pfn = PFN_DOWN(dma_to_phys(to_gobj(abo)->dev->dev, addr)); - ret = io_remap_pfn_range(vma, vma->vm_start + offset, pfn, - PAGE_SIZE, vma->vm_page_prot); - if (ret) - break; + fault_ret = handle_mm_fault(vma, vma->vm_start+offset, + FAULT_FLAG_WRITE, NULL); + if (fault_ret & VM_FAULT_ERROR) { + vma->vm_ops->close(vma); + XDNA_ERR(xdna, "Fault in page failed"); + return -EFAULT; + } offset += PAGE_SIZE; - } + } while (--num_pages); - return ret; + /* Drop the reference drm_gem_mmap_obj() acquired.*/ + drm_gem_object_put(to_gobj(abo)); + + return 0; } static int amdxdna_gem_obj_mmap(struct drm_gem_object *gobj, @@ -250,16 +275,10 @@ static int amdxdna_gem_obj_mmap(struct drm_gem_object *gobj, if (ret) return ret; - ret = drm_gem_shmem_mmap(&abo->base, vma); - if (ret) { - XDNA_ERR(xdna, "failed shmem mmap %d", ret); - goto hmm_unreg; - } - ret = amdxdna_insert_pages(abo, vma); if (ret) { XDNA_ERR(xdna, "Failed insert pages, ret %d", ret); - goto close_vma; + goto hmm_unreg; } XDNA_DBG(xdna, "BO map_offset 0x%llx type %d userptr 0x%llx size 0x%lx", @@ -267,8 +286,6 @@ static int amdxdna_gem_obj_mmap(struct drm_gem_object *gobj, abo->mem.userptr, gobj->size); return 0; -close_vma: - vma->vm_ops->close(vma); hmm_unreg: amdxdna_hmm_unregister(abo); return ret; @@ -354,7 +371,6 @@ amdxdna_gem_create_obj(struct drm_device *dev, size_t size) if (!abo) return ERR_PTR(-ENOMEM); - abo->pinned = false; abo->assigned_hwctx = AMDXDNA_INVALID_CTX_HANDLE; mutex_init(&abo->lock); INIT_WORK(&abo->hmm_unreg_work, amdxdna_hmm_unreg_work); @@ -381,6 +397,60 @@ amdxdna_gem_create_object_cb(struct drm_device *dev, size_t size) return to_gobj(abo); } +struct drm_gem_object * +amdxdna_gem_prime_import(struct drm_device *dev, struct dma_buf *dma_buf) +{ + struct dma_buf_attachment *attach; + struct drm_gem_object *gobj; + struct sg_table *sgt; + int ret; + + attach = dma_buf_attach(dma_buf, dev->dev); + if (IS_ERR(attach)) + return ERR_CAST(attach); + + get_dma_buf(dma_buf); + + sgt = dma_buf_map_attachment_unlocked(attach, DMA_BIDIRECTIONAL); + if (IS_ERR(sgt)) { + ret = PTR_ERR(sgt); + goto fail_detach; + } + + gobj = drm_gem_shmem_prime_import_sg_table(dev, attach, sgt); + if (IS_ERR(gobj)) { + ret = PTR_ERR(gobj); + goto fail_unmap; + } + + gobj->import_attach = attach; + gobj->resv = dma_buf->resv; + +#ifdef AMDXDNA_DEVEL + if (iommu_mode == AMDXDNA_IOMMU_NO_PASID) { + struct amdxdna_gem_obj *abo; + + abo = to_xdna_obj(gobj); + ret = amdxdna_bo_dma_map(abo); + if (ret) { + drm_gem_object_put(gobj); + goto fail_unmap; + } + abo->mem.dev_addr = abo->mem.dma_addr; + } +#endif + + return gobj; + +fail_unmap: + dma_buf_unmap_attachment_unlocked(attach, sgt, DMA_BIDIRECTIONAL); +fail_detach: + 
dma_buf_detach(dma_buf, attach); + dma_buf_put(dma_buf); + + return ERR_PTR(ret); +} + static struct amdxdna_gem_obj * amdxdna_drm_alloc_shmem(struct drm_device *dev, struct amdxdna_drm_create_bo *args, diff --git a/src/driver/amdxdna/amdxdna_gem.h b/src/driver/amdxdna/amdxdna_gem.h index 3429a3ee..24a61608 100644 --- a/src/driver/amdxdna/amdxdna_gem.h +++ b/src/driver/amdxdna/amdxdna_gem.h @@ -27,11 +27,13 @@ struct amdxdna_mem { #endif }; +#define BO_SUBMIT_PINNED BIT(0) +#define BO_SUBMIT_LOCKED BIT(1) struct amdxdna_gem_obj { struct drm_gem_shmem_object base; struct amdxdna_client *client; u8 type; - bool pinned; + u64 flags; struct mutex lock; /* Protects: pinned, assigned_hwctx */ struct amdxdna_mem mem; struct work_struct hmm_unreg_work; @@ -60,6 +62,8 @@ static inline void amdxdna_gem_put_obj(struct amdxdna_gem_obj *abo) struct drm_gem_object * amdxdna_gem_create_object_cb(struct drm_device *dev, size_t size); +struct drm_gem_object * +amdxdna_gem_prime_import(struct drm_device *dev, struct dma_buf *dma_buf); struct amdxdna_gem_obj * amdxdna_drm_alloc_dev_bo(struct drm_device *dev, struct amdxdna_drm_create_bo *args, diff --git a/src/driver/amdxdna/amdxdna_mailbox.c b/src/driver/amdxdna/amdxdna_mailbox.c index 2f58fd3e..409ab26f 100644 --- a/src/driver/amdxdna/amdxdna_mailbox.c +++ b/src/driver/amdxdna/amdxdna_mailbox.c @@ -52,8 +52,11 @@ #ifdef AMDXDNA_DEVEL int mailbox_polling; -module_param(mailbox_polling, int, 0644); -MODULE_PARM_DESC(mailbox_polling, "0:interrupt(default); >0:poll interval in ms; <0: busy poll"); +module_param(mailbox_polling, int, 0444); +MODULE_PARM_DESC(mailbox_polling, "<=0:interrupt(default); >0:poll interval in ms; <0: busy poll"); +#define MB_DEFAULT_NO_POLL (mailbox_polling <= 0) +#define MB_PERIODIC_POLL (mailbox_polling > 0) +#define MB_FORCE_USER_POLL (mailbox_polling < 0) #define MB_TIMER_JIFF msecs_to_jiffies(mailbox_polling) #endif @@ -70,29 +73,39 @@ struct mailbox { /* protect channel list */ struct mutex mbox_lock; struct list_head chann_list; -#ifdef AMDXDNA_DEVEL + struct list_head poll_chann_list; struct task_struct *polld; struct wait_queue_head poll_wait; bool sent_msg; /* For polld */ -#endif - #if defined(CONFIG_DEBUG_FS) struct list_head res_records; #endif /* CONFIG_DEBUG_FS */ +}; +#if defined(CONFIG_DEBUG_FS) +struct mailbox_res_record { + enum xdna_mailbox_channel_type type; + struct list_head re_entry; + struct xdna_mailbox_chann_res re_x2i; + struct xdna_mailbox_chann_res re_i2x; + int re_irq; + int active; }; +#endif /* CONFIG_DEBUG_FS */ struct mailbox_channel { struct mailbox *mb; #if defined(CONFIG_DEBUG_FS) - struct list_head chann_entry; + struct mailbox_res_record *record; #endif + struct list_head chann_entry; struct xdna_mailbox_chann_res res[CHAN_RES_NUM]; int msix_irq; + u32 x2i_tail; u32 iohub_int_addr; + enum xdna_mailbox_channel_type type; struct idr chan_idr; spinlock_t chan_idr_lock; /* protect idr operations */ - u32 x2i_tail; /* Received msg related fields */ struct workqueue_struct *work_q; @@ -134,15 +147,6 @@ struct mailbox_msg { struct mailbox_pkg pkg; }; -#if defined(CONFIG_DEBUG_FS) -struct mailbox_res_record { - struct list_head re_entry; - struct xdna_mailbox_chann_res re_x2i; - struct xdna_mailbox_chann_res re_i2x; - int re_irq; -}; -#endif /* CONFIG_DEBUG_FS */ - static void mailbox_reg_write(struct mailbox_channel *mb_chann, u32 mbox_reg, u32 data) { struct xdna_mailbox_res *mb_res = &mb_chann->mb->res; @@ -162,17 +166,23 @@ static u32 mailbox_reg_read(struct mailbox_channel *mb_chann, u32 mbox_reg) 
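/*
 * Poll the I2X tail-pointer register until firmware publishes a non-zero
 * value, then sanity-check it: the tail must fall inside the ring buffer
 * and be 4-byte aligned.  Returns 0 with *val set on success, -ETIMEDOUT
 * if the register stays zero, or -EINVAL for a corrupt tail value.
 */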
static int mailbox_tail_read_non_zero(struct mailbox_channel *mb_chann, u32 *val) { u32 mbox_reg = mb_chann->res[CHAN_RES_I2X].mb_tail_ptr_reg; + u32 ringbuf_size = mb_chann->res[CHAN_RES_I2X].rb_size; struct xdna_mailbox_res *mb_res = &mb_chann->mb->res; u64 ringbuf_addr = mb_res->mbox_base + mbox_reg; - int ret, value; + int ret, tail; - /* Poll till value is not zero */ - ret = readx_poll_timeout(ioread32, (void *)ringbuf_addr, value, - value, 1 /* us */, 100); + /* Poll till tail is not zero */ + ret = readx_poll_timeout(ioread32, (void *)ringbuf_addr, tail, + tail, 0 /* tight-loops */, 100 /* us timeout */); if (ret < 0) return ret; - *val = value; + if (unlikely(tail > ringbuf_size || !IS_ALIGNED(tail, 4))) { + MB_WARN_ONCE(mb_chann, "Invalid tail 0x%x", tail); + return -EINVAL; + } + + *val = tail; return 0; } @@ -349,6 +359,12 @@ mailbox_get_resp(struct mailbox_channel *mb_chann, struct xdna_msg_header *heade return ret; } +/* + * mailbox_get_msg() is the key function to get message from ring buffer. + * If it returns 0, means 1 message was consumed. + * If it returns -ENOENT, means ring buffer is emtpy. + * If it returns other value, means ERROR. + */ static inline int mailbox_get_msg(struct mailbox_channel *mb_chann) { struct xdna_msg_header header; @@ -359,19 +375,15 @@ static inline int mailbox_get_msg(struct mailbox_channel *mb_chann) u64 read_addr; int ret; - if (mailbox_tail_read_non_zero(mb_chann, &tail)) { + ret = mailbox_tail_read_non_zero(mb_chann, &tail); + if (ret) { MB_WARN_ONCE(mb_chann, "Zero tail too long"); - return -EINVAL; + return ret; } head = mb_chann->i2x_head; ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_I2X); start_addr = mb_chann->res[CHAN_RES_I2X].rb_start_addr; - if (unlikely(tail > ringbuf_size || !IS_ALIGNED(tail, 4))) { - MB_WARN_ONCE(mb_chann, "Invalid tail 0x%x", tail); - return -EINVAL; - } - /* ringbuf empty */ if (head == tail) return -ENOENT; @@ -389,8 +401,17 @@ static inline int mailbox_get_msg(struct mailbox_channel *mb_chann) head, tail); return -EINVAL; } - mailbox_set_headptr(mb_chann, 0); - return 0; + + /* Read from beginning of ringbuf */ + head = 0; + ret = mailbox_tail_read_non_zero(mb_chann, &tail); + if (ret) { + MB_WARN_ONCE(mb_chann, "Hit tombstone, re-read tail failed"); + return -EINVAL; + } + /* Re-peek size of the message */ + read_addr = mb_chann->mb->res.ringbuf_base + start_addr; + header.total_size = ioread32((void *)read_addr); } if (unlikely(!header.total_size || !IS_ALIGNED(header.total_size, 4))) { @@ -416,20 +437,64 @@ static inline int mailbox_get_msg(struct mailbox_channel *mb_chann) /* After update head, it can equal to ringbuf_size. This is expected. */ trace_mbox_set_head(MAILBOX_NAME, mb_chann->msix_irq, header.opcode, header.id); - return ret; } +static void mailbox_rx_worker(struct work_struct *rx_work) +{ + struct mailbox_channel *mb_chann; + int ret; + + mb_chann = container_of(rx_work, struct mailbox_channel, rx_work); + trace_mbox_rx_worker(MAILBOX_NAME, mb_chann->msix_irq); + + if (READ_ONCE(mb_chann->bad_state)) { + MB_ERR(mb_chann, "Channel in bad state, work aborted"); + return; + } + + while (1) { + /* + * If return is 0, keep consuming next message, until there is + * no messages or an error happened. + */ + ret = mailbox_get_msg(mb_chann); + if (ret == -ENOENT) + break; + + /* Other error means device doesn't look good, disable irq. 
*/ + if (unlikely(ret)) { + MB_ERR(mb_chann, "Unexpected ret %d, disable irq", ret); + WRITE_ONCE(mb_chann->bad_state, true); + disable_irq(mb_chann->msix_irq); + break; + } + } +} + static irqreturn_t mailbox_irq_handler(int irq, void *p) { struct mailbox_channel *mb_chann = p; + u32 iohub; + int i; trace_mbox_irq_handle(MAILBOX_NAME, irq); - /* Schedule a rx_work to call the callback functions */ - queue_work(mb_chann->work_q, &mb_chann->rx_work); + if (mb_chann->type == MB_CHANNEL_USER_POLL) + return IRQ_HANDLED; /* Clear IOHUB register */ mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0); + /* Schedule a rx_work to call the callback functions */ + queue_work(mb_chann->work_q, &mb_chann->rx_work); + for (i = 0; i < 4; i++) { + iohub = mailbox_reg_read(mb_chann, mb_chann->iohub_int_addr); + if (iohub) + goto race; + } + return IRQ_HANDLED; +race: + mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0); + queue_work(mb_chann->work_q, &mb_chann->rx_work); return IRQ_HANDLED; } @@ -441,13 +506,12 @@ static void mailbox_timer(struct timer_list *t) /* The timer mimic interrupt. It is good to reuse irq routine */ tail = mailbox_get_tailptr(mb_chann, CHAN_RES_I2X); - if (tail) { - MB_DBG(mb_chann, "Mimic interrupt..."); + if (tail) mailbox_irq_handler(0, mb_chann); - } mod_timer(&mb_chann->timer, jiffies + MB_TIMER_JIFF); } +#endif static void mailbox_polld_handle_chann(struct mailbox_channel *mb_chann) { @@ -501,7 +565,10 @@ static bool mailbox_polld_event(struct mailbox *mb) struct mailbox_channel *mb_chann; mutex_lock(&mb->mbox_lock); - list_for_each_entry(mb_chann, &mb->chann_list, chann_entry) { + list_for_each_entry(mb_chann, &mb->poll_chann_list, chann_entry) { + if (mb_chann->type == MB_CHANNEL_MGMT) + break; + if (mailbox_channel_no_msg(mb_chann)) continue; @@ -530,13 +597,11 @@ static int mailbox_polld(void *data) continue; mutex_lock(&mb->mbox_lock); - if (unlikely(list_empty(&mb->chann_list))) { - mutex_unlock(&mb->mbox_lock); - continue; - } - chann_all_empty = true; - list_for_each_entry(mb_chann, &mb->chann_list, chann_entry) { + list_for_each_entry(mb_chann, &mb->poll_chann_list, chann_entry) { + if (mb_chann->type == MB_CHANNEL_MGMT) + break; + if (mailbox_channel_no_msg(mb_chann)) continue; @@ -558,38 +623,6 @@ static int mailbox_polld(void *data) return 0; } -#endif - -static void mailbox_rx_worker(struct work_struct *rx_work) -{ - struct mailbox_channel *mb_chann; - int ret; - - mb_chann = container_of(rx_work, struct mailbox_channel, rx_work); - - if (READ_ONCE(mb_chann->bad_state)) { - MB_ERR(mb_chann, "Channel in bad state, work aborted"); - return; - } - - while (1) { - /* - * If return is 0, keep consuming next message, until there is - * no messages or an error happened. - */ - ret = mailbox_get_msg(mb_chann); - if (ret == -ENOENT) - break; - - /* Other error means device doesn't look good, disable irq. 
*/ - if (unlikely(ret)) { - MB_ERR(mb_chann, "Unexpected ret %d, disable irq", ret); - WRITE_ONCE(mb_chann->bad_state, true); - disable_irq(mb_chann->msix_irq); - break; - } - } -} int xdna_mailbox_send_msg(struct mailbox_channel *mb_chann, const struct xdna_mailbox_msg *msg, u64 tx_timeout) @@ -656,10 +689,8 @@ int xdna_mailbox_send_msg(struct mailbox_channel *mb_chann, goto release_id; } -#ifdef AMDXDNA_DEVEL - if (mb_chann->mb->polld) + if (mb_chann->type == MB_CHANNEL_USER_POLL) mailbox_polld_wakeup(mb_chann->mb); -#endif return 0; release_id: @@ -670,42 +701,75 @@ int xdna_mailbox_send_msg(struct mailbox_channel *mb_chann, } #if defined(CONFIG_DEBUG_FS) +static struct mailbox_res_record * +xdna_mailbox_get_record(struct mailbox *mb, int mb_irq, + const struct xdna_mailbox_chann_res *x2i, + const struct xdna_mailbox_chann_res *i2x, + enum xdna_mailbox_channel_type type) +{ + struct mailbox_res_record *record; + int record_found = 0; + + mutex_lock(&mb->mbox_lock); + list_for_each_entry(record, &mb->res_records, re_entry) { + if (record->re_irq != mb_irq) + continue; + + record_found = 1; + break; + } + + if (record_found) { + record->type = type; + goto found; + } + + record = kzalloc(sizeof(*record), GFP_KERNEL); + if (!record) + goto out; + list_add_tail(&record->re_entry, &mb->res_records); + record->re_irq = mb_irq; + +found: + record->type = type; + memcpy(&record->re_x2i, x2i, sizeof(*x2i)); + memcpy(&record->re_i2x, i2x, sizeof(*i2x)); +out: + mutex_unlock(&mb->mbox_lock); + return record; +} + int xdna_mailbox_info_show(struct mailbox *mb, struct seq_file *m) { - static const char ring_fmt[] = "%4d %3s %5d 0x%08x 0x%04x "; + static const char ring_fmt[] = "%4d %3s %5d %4d 0x%08x 0x%04x "; static const char mbox_fmt[] = "0x%08x 0x%08x 0x%04x 0x%04x\n"; struct mailbox_res_record *record; - struct mailbox_channel *chann; /* If below two puts changed, make sure update fmt[] as well */ - seq_puts(m, "mbox dir alive ring addr size "); + seq_puts(m, "mbox dir alive type ring addr size "); seq_puts(m, "head ptr tail ptr head val tail val\n"); #define xdna_mbox_dump_queue(_dir, _act) \ - { \ - u32 head_ptr, tail_ptr, head_val, tail_val; \ - u32 rb_start, rb_size; \ - u32 mbox_irq; \ - mbox_irq = record->re_irq; \ - rb_start = record->re_##_dir.rb_start_addr; \ - rb_size = record->re_##_dir.rb_size; \ - head_ptr = record->re_##_dir.mb_head_ptr_reg; \ - tail_ptr = record->re_##_dir.mb_tail_ptr_reg; \ - head_val = ioread32((void *)(mb->res.mbox_base + head_ptr)); \ - tail_val = ioread32((void *)(mb->res.mbox_base + tail_ptr)); \ - seq_printf(m, ring_fmt, mbox_irq, #_dir, _act, rb_start, rb_size); \ - seq_printf(m, mbox_fmt, head_ptr, tail_ptr, head_val, tail_val); \ - } +{ \ + u32 head_ptr, tail_ptr, head_val, tail_val; \ + u32 rb_start, rb_size; \ + u32 mbox_irq; \ + u32 type; \ + type = record->type; \ + mbox_irq = record->re_irq; \ + rb_start = record->re_##_dir.rb_start_addr; \ + rb_size = record->re_##_dir.rb_size; \ + head_ptr = record->re_##_dir.mb_head_ptr_reg; \ + tail_ptr = record->re_##_dir.mb_tail_ptr_reg; \ + head_val = ioread32((void *)(mb->res.mbox_base + head_ptr)); \ + tail_val = ioread32((void *)(mb->res.mbox_base + tail_ptr)); \ + seq_printf(m, ring_fmt, mbox_irq, #_dir, _act, type, rb_start, rb_size); \ + seq_printf(m, mbox_fmt, head_ptr, tail_ptr, head_val, tail_val); \ +} mutex_lock(&mb->mbox_lock); list_for_each_entry(record, &mb->res_records, re_entry) { - int active = 0; - - list_for_each_entry(chann, &mb->chann_list, chann_entry) { - if (record->re_irq == 
chann->msix_irq) - active = 1; - } - xdna_mbox_dump_queue(x2i, active); - xdna_mbox_dump_queue(i2x, active); + xdna_mbox_dump_queue(x2i, record->active); + xdna_mbox_dump_queue(i2x, record->active); } mutex_unlock(&mb->mbox_lock); @@ -747,42 +811,17 @@ struct mailbox_channel * xdna_mailbox_create_channel(struct mailbox *mb, const struct xdna_mailbox_chann_res *x2i, const struct xdna_mailbox_chann_res *i2x, - u32 iohub_int_addr, - int mb_irq) + u32 iohub_int_addr, int mb_irq, + enum xdna_mailbox_channel_type type) { struct mailbox_channel *mb_chann; int ret; #if defined(CONFIG_DEBUG_FS) struct mailbox_res_record *record; - int record_found = 0; - - mutex_lock(&mb->mbox_lock); - list_for_each_entry(record, &mb->res_records, re_entry) { - if (record->re_irq != mb_irq) - continue; - - record_found = 1; - break; - } - - if (record_found) - goto skip_record; - - record = kzalloc(sizeof(*record), GFP_KERNEL); - if (!record) { - mutex_unlock(&mb->mbox_lock); - return NULL; - } - - memcpy(&record->re_x2i, x2i, sizeof(*x2i)); - memcpy(&record->re_i2x, i2x, sizeof(*i2x)); - record->re_irq = mb_irq; - /* Record will be released when mailbox device destroy*/ - list_add_tail(&record->re_entry, &mb->res_records); - -skip_record: - mutex_unlock(&mb->mbox_lock); + record = xdna_mailbox_get_record(mb, mb_irq, x2i, i2x, type); + if (!record) + return NULL; #endif /* CONFIG_DEBUG_FS */ if (!is_power_of_2(x2i->rb_size) || !is_power_of_2(i2x->rb_size)) { @@ -795,6 +834,11 @@ xdna_mailbox_create_channel(struct mailbox *mb, return NULL; mb_chann->mb = mb; + mb_chann->type = type; +#ifdef AMDXDNA_DEVEL + if (type != MB_CHANNEL_MGMT && MB_FORCE_USER_POLL) + mb_chann->type = MB_CHANNEL_USER_POLL; +#endif mb_chann->msix_irq = mb_irq; mb_chann->iohub_int_addr = iohub_int_addr; memcpy(&mb_chann->res[CHAN_RES_X2I], x2i, sizeof(*x2i)); @@ -804,20 +848,17 @@ xdna_mailbox_create_channel(struct mailbox *mb, idr_init(&mb_chann->chan_idr); mb_chann->x2i_tail = mailbox_get_tailptr(mb_chann, CHAN_RES_X2I); mb_chann->i2x_head = mailbox_get_headptr(mb_chann, CHAN_RES_I2X); -#ifdef AMDXDNA_DEVEL - if (mb->polld) - goto skip_irq; -#endif + mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0); INIT_WORK(&mb_chann->rx_work, mailbox_rx_worker); - mb_chann->work_q = create_singlethread_workqueue(MAILBOX_NAME); + mb_chann->work_q = alloc_ordered_workqueue(MAILBOX_NAME, 0); if (!mb_chann->work_q) { MB_ERR(mb_chann, "Create workqueue failed"); goto free_and_out; } #ifdef AMDXDNA_DEVEL - if (mailbox_polling > 0) { + if (MB_PERIODIC_POLL) { /* Poll response every few ms. 
Good for bring up a new device */ timer_setup(&mb_chann->timer, mailbox_timer, 0); @@ -839,10 +880,18 @@ xdna_mailbox_create_channel(struct mailbox *mb, #endif mb_chann->bad_state = false; mutex_lock(&mb->mbox_lock); - list_add(&mb_chann->chann_entry, &mb->chann_list); + if (mb_chann->type == MB_CHANNEL_USER_POLL) + list_add_tail(&mb_chann->chann_entry, &mb->poll_chann_list); + else + list_add_tail(&mb_chann->chann_entry, &mb->chann_list); +#if defined(CONFIG_DEBUG_FS) + mb_chann->record = record; + record->active = 1; +#endif mutex_unlock(&mb->mbox_lock); - MB_DBG(mb_chann, "Mailbox channel created (irq: %d)", mb_chann->msix_irq); + MB_DBG(mb_chann, "Mailbox channel created type %d (irq: %d)", + mb_chann->type, mb_chann->msix_irq); return mb_chann; destroy_wq: @@ -859,13 +908,13 @@ int xdna_mailbox_destroy_channel(struct mailbox_channel *mb_chann) mutex_lock(&mb_chann->mb->mbox_lock); list_del(&mb_chann->chann_entry); +#if defined(CONFIG_DEBUG_FS) + mb_chann->record->active = 0; +#endif mutex_unlock(&mb_chann->mb->mbox_lock); #ifdef AMDXDNA_DEVEL - if (mb_chann->mb->polld) - goto free_msg; - - if (mailbox_polling > 0) + if (MB_PERIODIC_POLL) goto destroy_wq; #endif free_irq(mb_chann->msix_irq, mb_chann); @@ -876,13 +925,11 @@ int xdna_mailbox_destroy_channel(struct mailbox_channel *mb_chann) destroy_workqueue(mb_chann->work_q); /* We can clean up and release resources */ -#ifdef AMDXDNA_DEVEL -free_msg: -#endif idr_for_each(&mb_chann->chan_idr, mailbox_release_msg, mb_chann); idr_destroy(&mb_chann->chan_idr); - MB_DBG(mb_chann, "Mailbox channel destroyed, irq: %d", mb_chann->msix_irq); + MB_DBG(mb_chann, "Mailbox channel destroyed type %d irq: %d", + mb_chann->type, mb_chann->msix_irq); kfree(mb_chann); return 0; } @@ -893,10 +940,7 @@ void xdna_mailbox_stop_channel(struct mailbox_channel *mb_chann) return; #ifdef AMDXDNA_DEVEL - if (mb_chann->mb->polld) - return; - - if (mailbox_polling > 0) { + if (MB_PERIODIC_POLL) { timer_delete_sync(&mb_chann->timer); goto skip_irq; } @@ -927,11 +971,13 @@ struct mailbox *xdna_mailbox_create(struct device *dev, mutex_init(&mb->mbox_lock); INIT_LIST_HEAD(&mb->chann_list); -#ifdef AMDXDNA_DEVEL - if (mailbox_polling >= 0) - goto skip_polld; + INIT_LIST_HEAD(&mb->poll_chann_list); - /* Launch per device busy polling kthread */ + /* + * The polld kthread will only wakeup and handle those + * MB_CHANNEL_USER_POLL channels. If no thing to do, polld should + * just sleep. It is a per device kthread. 
+ */ mb->polld = kthread_run(mailbox_polld, mb, MAILBOX_NAME); if (IS_ERR(mb->polld)) { dev_err(mb->dev, "Failed to create polld ret %ld", PTR_ERR(mb->polld)); @@ -940,8 +986,6 @@ struct mailbox *xdna_mailbox_create(struct device *dev, } init_waitqueue_head(&mb->poll_wait); mb->sent_msg = false; -skip_polld: -#endif #if defined(CONFIG_DEBUG_FS) INIT_LIST_HEAD(&mb->res_records); @@ -965,18 +1009,11 @@ void xdna_mailbox_destroy(struct mailbox *mb) } done_release_record: #endif /* CONFIG_DEBUG_FS */ -#ifdef AMDXDNA_DEVEL - if (mailbox_polling >= 0) - goto skip_polld; - dev_dbg(mb->dev, "Stopping polld"); (void)kthread_stop(mb->polld); -skip_polld: -#endif mutex_lock(&mb->mbox_lock); - if (!list_empty(&mb->chann_list)) - WARN_ON("Channel not destroy"); + WARN_ONCE(!list_empty(&mb->chann_list), "Channel not destroy"); mutex_unlock(&mb->mbox_lock); mutex_destroy(&mb->mbox_lock); diff --git a/src/driver/amdxdna/amdxdna_mailbox.h b/src/driver/amdxdna/amdxdna_mailbox.h index 2e114644..8ac677d9 100644 --- a/src/driver/amdxdna/amdxdna_mailbox.h +++ b/src/driver/amdxdna/amdxdna_mailbox.h @@ -80,6 +80,13 @@ struct mailbox *xdna_mailbox_create(struct device *dev, */ void xdna_mailbox_destroy(struct mailbox *mailbox); +enum xdna_mailbox_channel_type { + MB_CHANNEL_MGMT = 0, + MB_CHANNEL_USER_NORMAL, + MB_CHANNEL_USER_POLL, + MB_CHANNEL_MAX_TYPE, +}; + /* * xdna_mailbox_create_channel() -- Create a mailbox channel instance * @@ -88,6 +95,7 @@ void xdna_mailbox_destroy(struct mailbox *mailbox); * @i2x: firmware to host mailbox resources * @xdna_mailbox_intr_reg: register addr of MSI-X interrupt * @mb_irq: Linux IRQ number associated with mailbox MSI-X interrupt vector index + * @type: Type of channel * * Return: If success, return a handle of mailbox channel. Otherwise, return NULL. */ @@ -96,7 +104,7 @@ xdna_mailbox_create_channel(struct mailbox *mailbox, const struct xdna_mailbox_chann_res *x2i, const struct xdna_mailbox_chann_res *i2x, u32 xdna_mailbox_intr_reg, - int mb_irq); + int mb_irq, enum xdna_mailbox_channel_type type); /* * xdna_mailbox_destroy_channel() -- destroy mailbox channel diff --git a/src/driver/amdxdna/amdxdna_mailbox_helper.h b/src/driver/amdxdna/amdxdna_mailbox_helper.h index 20c1fe7b..e1c3f16f 100644 --- a/src/driver/amdxdna/amdxdna_mailbox_helper.h +++ b/src/driver/amdxdna/amdxdna_mailbox_helper.h @@ -37,6 +37,8 @@ struct xdna_notify { .notify_cb = xdna_msg_cb, \ } +#define XDNA_STATUS_OFFSET(name) (offsetof(struct name##_resp, status) / sizeof(u32)) + int xdna_msg_cb(void *handle, const u32 *data, size_t size); int xdna_send_msg_wait(struct amdxdna_dev *xdna, struct mailbox_channel *chann, struct xdna_mailbox_msg *msg); diff --git a/src/driver/amdxdna/amdxdna_pci_drv.c b/src/driver/amdxdna/amdxdna_pci_drv.c index 9f6f7fed..fab4c295 100644 --- a/src/driver/amdxdna/amdxdna_pci_drv.c +++ b/src/driver/amdxdna/amdxdna_pci_drv.c @@ -13,7 +13,9 @@ #include "amdxdna_pci_drv.h" #include "amdxdna_sysfs.h" -#define AMDXDNA_AUTOSUSPEND_DELAY 5000 /* miliseconds */ +int autosuspend_ms = -1; +module_param(autosuspend_ms, int, 0644); +MODULE_PARM_DESC(autosuspend_ms, "runtime suspend delay in miliseconds. 
< 0: prevent it"); /* * There are platforms which share the same PCI device ID @@ -100,7 +102,7 @@ static int amdxdna_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto failed_dev_fini; } - pm_runtime_set_autosuspend_delay(dev, AMDXDNA_AUTOSUSPEND_DELAY); + pm_runtime_set_autosuspend_delay(dev, autosuspend_ms); pm_runtime_use_autosuspend(dev); pm_runtime_allow(dev); diff --git a/src/driver/amdxdna/amdxdna_tdr.c b/src/driver/amdxdna/amdxdna_tdr.c index f5640b3c..716fe198 100644 --- a/src/driver/amdxdna/amdxdna_tdr.c +++ b/src/driver/amdxdna/amdxdna_tdr.c @@ -6,8 +6,8 @@ #include "amdxdna_drm.h" #include "amdxdna_tdr.h" -int timeout_in_sec = 2; -module_param(timeout_in_sec, int, 0644); +uint timeout_in_sec = 2; +module_param(timeout_in_sec, uint, 0644); MODULE_PARM_DESC(timeout_in_sec, "Seconds to timeout and recovery, default 2; 0 - No TDR"); #define TDR_TIMEOUT_JIFF msecs_to_jiffies(timeout_in_sec * 1000) diff --git a/src/driver/amdxdna/amdxdna_trace.h b/src/driver/amdxdna/amdxdna_trace.h index 6d73c823..4620d2e4 100644 --- a/src/driver/amdxdna/amdxdna_trace.h +++ b/src/driver/amdxdna/amdxdna_trace.h @@ -130,6 +130,11 @@ DEFINE_EVENT(xdna_mbox_name_id, mbox_irq_handle, TP_ARGS(name, irq) ); +DEFINE_EVENT(xdna_mbox_name_id, mbox_rx_worker, + TP_PROTO(char *name, int irq), + TP_ARGS(name, irq) +); + DEFINE_EVENT(xdna_mbox_name_id, mbox_poll_handle, TP_PROTO(char *name, int irq), TP_ARGS(name, irq) diff --git a/src/driver/amdxdna/npu1_regs.c b/src/driver/amdxdna/npu1_regs.c index 3c442021..11a1e3fd 100644 --- a/src/driver/amdxdna/npu1_regs.c +++ b/src/driver/amdxdna/npu1_regs.c @@ -51,9 +51,6 @@ #define NPU1_RT_CFG_VAL_DEBUG_BO_DEFAULT 0 #define NPU1_RT_CFG_VAL_DEBUG_BO_LARGE 1 -#define NPU1_MPNPUCLK_FREQ_MAX 847 -#define NPU1_HCLK_FREQ_MAX 1600 - /*fill in the dpm clock frequencies */ const struct dpm_clk npu1_dpm_clk_table[] = { {400, 800}, @@ -109,9 +106,6 @@ const struct amdxdna_dev_priv npu1_dev_priv = { .value_enable = NPU1_RT_CFG_VAL_CLK_GATING_ON, .value_disable = NPU1_RT_CFG_VAL_CLK_GATING_OFF, }, - .smu_mpnpuclk_freq_max = NPU1_MPNPUCLK_FREQ_MAX, - .smu_hclk_freq_max = NPU1_HCLK_FREQ_MAX, - .smu_dpm_max = 7, .smu_rev = SMU_REVISION_V0, .smu_npu_dpm_clk_table = npu1_dpm_clk_table, .smu_npu_dpm_levels = ARRAY_SIZE(npu1_dpm_clk_table), diff --git a/src/driver/amdxdna/npu2_regs.c b/src/driver/amdxdna/npu2_regs.c index f84c726e..3a10be1e 100644 --- a/src/driver/amdxdna/npu2_regs.c +++ b/src/driver/amdxdna/npu2_regs.c @@ -3,159 +3,19 @@ * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. 
*/ -#include "drm_local/amdxdna_accel.h" -#include "aie2_pci.h" +#include "npu4_family.h" -/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */ -#define MPNPU_PUB_SEC_INTR 0x3010060 -#define MPNPU_PUB_PWRMGMT_INTR 0x3010064 -#define MPNPU_PUB_SCRATCH0 0x301006C -#define MPNPU_PUB_SCRATCH1 0x3010070 -#define MPNPU_PUB_SCRATCH2 0x3010074 -#define MPNPU_PUB_SCRATCH3 0x3010078 -#define MPNPU_PUB_SCRATCH4 0x301007C -#define MPNPU_PUB_SCRATCH5 0x3010080 -#define MPNPU_PUB_SCRATCH6 0x3010084 -#define MPNPU_PUB_SCRATCH7 0x3010088 -#define MPNPU_PUB_SCRATCH8 0x301008C -#define MPNPU_PUB_SCRATCH9 0x3010090 -#define MPNPU_PUB_SCRATCH10 0x3010094 -#define MPNPU_PUB_SCRATCH11 0x3010098 -#define MPNPU_PUB_SCRATCH12 0x301009C -#define MPNPU_PUB_SCRATCH13 0x30100A0 -#define MPNPU_PUB_SCRATCH14 0x30100A4 -#define MPNPU_PUB_SCRATCH15 0x30100A8 -#define MP0_C2PMSG_73 0x3810A24 -#define MP0_C2PMSG_123 0x3810AEC - -#define MP1_C2PMSG_0 0x3B10900 -#define MP1_C2PMSG_60 0x3B109F0 -#define MP1_C2PMSG_61 0x3B109F4 - -#define MPNPU_SRAM_X2I_MAILBOX_0 0x3600000 -#define MPNPU_SRAM_X2I_MAILBOX_15 0x361E000 -#define MPNPU_SRAM_X2I_MAILBOX_31 0x363E000 -#define MPNPU_SRAM_I2X_MAILBOX_31 0x363F000 - -#define MMNPU_APERTURE0_BASE 0x3000000 -#define MMNPU_APERTURE1_BASE 0x3600000 -#define MMNPU_APERTURE3_BASE 0x3810000 -#define MMNPU_APERTURE4_BASE 0x3B10000 - -/* PCIe BAR Index for NPU2 */ -#define NPU2_REG_BAR_INDEX 0 -#define NPU2_MBOX_BAR_INDEX 0 -#define NPU2_PSP_BAR_INDEX 4 -#define NPU2_SMU_BAR_INDEX 5 -#define NPU2_SRAM_BAR_INDEX 2 -/* Associated BARs and Apertures */ -#define NPU2_REG_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU2_MBOX_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU2_PSP_BAR_BASE MMNPU_APERTURE3_BASE -#define NPU2_SMU_BAR_BASE MMNPU_APERTURE4_BASE -#define NPU2_SRAM_BAR_BASE MMNPU_APERTURE1_BASE - -#define NPU2_RT_CFG_TYPE_CLK_GATING 1 -#define NPU2_RT_CFG_TYPE_HCLK_GATING 2 -#define NPU2_RT_CFG_TYPE_PWR_GATING 3 -#define NPU2_RT_CFG_TYPE_L1IMU_GATING 4 -#define NPU2_RT_CFG_TYPE_PDI_LOAD 5 -#define NPU2_RT_CFG_TYPE_DEBUG_BO 10 - -#define NPU2_RT_CFG_VAL_CLK_GATING_OFF 0 -#define NPU2_RT_CFG_VAL_CLK_GATING_ON 1 - -#define NPU2_RT_CFG_VAL_PDI_LOAD_MGMT 0 -#define NPU2_RT_CFG_VAL_PDI_LOAD_APP 1 - -#define NPU2_RT_CFG_VAL_DEBUG_BO_DEFAULT 0 -#define NPU2_RT_CFG_VAL_DEBUG_BO_LARGE 1 - -#define NPU2_MPNPUCLK_FREQ_MAX 1267 -#define NPU2_HCLK_FREQ_MAX 1800 - -const struct dpm_clk npu2_dpm_clk_table[DPM_LEVEL_MAX] = { - {396, 792}, - {600, 1056}, - {792, 1152}, - {975, 1267}, - {975, 1267}, - {1056, 1408}, - {1152, 1584}, - {1267, 1800} -}; - -const struct rt_config npu2_rt_cfg[] = { - {NPU2_RT_CFG_TYPE_PDI_LOAD, NPU2_RT_CFG_VAL_PDI_LOAD_APP}, - {NPU2_RT_CFG_TYPE_DEBUG_BO, NPU2_RT_CFG_VAL_DEBUG_BO_LARGE}, -}; - -const u32 npu2_clk_gating_types[] = { - NPU2_RT_CFG_TYPE_CLK_GATING, - NPU2_RT_CFG_TYPE_HCLK_GATING, - NPU2_RT_CFG_TYPE_PWR_GATING, - NPU2_RT_CFG_TYPE_L1IMU_GATING, -}; +/* NPU2 is the prototype of NPU4. It will be obsoleted in near future. 
*/ const struct amdxdna_dev_priv npu2_dev_priv = { .fw_path = "amdnpu/17f0_00/npu.sbin", .protocol_major = 0x6, .protocol_minor = 0x6, - .rt_config = npu2_rt_cfg, - .num_rt_cfg = ARRAY_SIZE(npu2_rt_cfg), - .col_align = COL_ALIGN_NATURE, - .mbox_dev_addr = NPU2_MBOX_BAR_BASE, - .mbox_size = 0, /* Use BAR size */ - .sram_dev_addr = NPU2_SRAM_BAR_BASE, - .sram_offs = { - DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU2_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), - DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU2_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), - }, - .psp_regs_off = { - DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU2_PSP, MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU2_REG, MPNPU_PUB_SCRATCH3), - DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU2_REG, MPNPU_PUB_SCRATCH4), - DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU2_REG, MPNPU_PUB_SCRATCH9), - DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU2_PSP, MP0_C2PMSG_73), - DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU2_PSP, MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU2_REG, MPNPU_PUB_SCRATCH3), - }, - .smu_regs_off = { - DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU2_SMU, MP1_C2PMSG_0), - DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU2_SMU, MP1_C2PMSG_60), - DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU2_SMU, MMNPU_APERTURE4_BASE), - DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU2_SMU, MP1_C2PMSG_61), - DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU2_SMU, MP1_C2PMSG_60), - }, - .clk_gating = { - .types = npu2_clk_gating_types, - .num_types = ARRAY_SIZE(npu2_clk_gating_types), - .value_enable = NPU2_RT_CFG_VAL_CLK_GATING_ON, - .value_disable = NPU2_RT_CFG_VAL_CLK_GATING_OFF, - }, - .smu_mpnpuclk_freq_max = NPU2_MPNPUCLK_FREQ_MAX, - .smu_hclk_freq_max = NPU2_HCLK_FREQ_MAX, - .smu_dpm_max = 7, - .smu_rev = SMU_REVISION_V1, - .smu_npu_dpm_clk_table = npu2_dpm_clk_table, - .smu_npu_dpm_levels = ARRAY_SIZE(npu2_dpm_clk_table), -#ifdef AMDXDNA_DEVEL - .priv_load_cfg = {NPU2_RT_CFG_TYPE_PDI_LOAD, NPU2_RT_CFG_VAL_PDI_LOAD_MGMT}, -#endif + NPU4_COMMON_DEV_PRIV, }; const struct amdxdna_dev_info dev_npu2_info = { - .reg_bar = NPU2_REG_BAR_INDEX, - .mbox_bar = NPU2_MBOX_BAR_INDEX, - .sram_bar = NPU2_SRAM_BAR_INDEX, - .psp_bar = NPU2_PSP_BAR_INDEX, - .smu_bar = NPU2_SMU_BAR_INDEX, - .first_col = 0, - .dev_mem_buf_shift = 15, /* 32 KiB aligned */ - .dev_mem_base = AIE2_DEVM_BASE, - .dev_mem_size = AIE2_DEVM_SIZE, .vbnv = "RyzenAI-npu2", - .device_type = AMDXDNA_DEV_TYPE_KMQ, .dev_priv = &npu2_dev_priv, - .ops = &aie2_ops, /* NPU2 can share NPU1's callback */ + NPU4_COMMON_DEV_INFO, }; diff --git a/src/driver/amdxdna/npu4_family.h b/src/driver/amdxdna/npu4_family.h new file mode 100644 index 00000000..9da6d971 --- /dev/null +++ b/src/driver/amdxdna/npu4_family.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024, Advanced Micro Devices, Inc. 
+ */ + +#ifndef _NPU4_FAMILY_H_ +#define _NPU4_FAMILY_H_ + +#include "drm_local/amdxdna_accel.h" +#include "aie2_pci.h" + +/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */ +#define MPNPU_PUB_SEC_INTR 0x3010060 +#define MPNPU_PUB_PWRMGMT_INTR 0x3010064 +#define MPNPU_PUB_SCRATCH0 0x301006C +#define MPNPU_PUB_SCRATCH1 0x3010070 +#define MPNPU_PUB_SCRATCH2 0x3010074 +#define MPNPU_PUB_SCRATCH3 0x3010078 +#define MPNPU_PUB_SCRATCH4 0x301007C +#define MPNPU_PUB_SCRATCH5 0x3010080 +#define MPNPU_PUB_SCRATCH6 0x3010084 +#define MPNPU_PUB_SCRATCH7 0x3010088 +#define MPNPU_PUB_SCRATCH8 0x301008C +#define MPNPU_PUB_SCRATCH9 0x3010090 +#define MPNPU_PUB_SCRATCH10 0x3010094 +#define MPNPU_PUB_SCRATCH11 0x3010098 +#define MPNPU_PUB_SCRATCH12 0x301009C +#define MPNPU_PUB_SCRATCH13 0x30100A0 +#define MPNPU_PUB_SCRATCH14 0x30100A4 +#define MPNPU_PUB_SCRATCH15 0x30100A8 +#define MP0_C2PMSG_73 0x3810A24 +#define MP0_C2PMSG_123 0x3810AEC + +#define MP1_C2PMSG_0 0x3B10900 +#define MP1_C2PMSG_60 0x3B109F0 +#define MP1_C2PMSG_61 0x3B109F4 + +#define MPNPU_SRAM_X2I_MAILBOX_0 0x3600000 +#define MPNPU_SRAM_X2I_MAILBOX_15 0x361E000 +#define MPNPU_SRAM_X2I_MAILBOX_31 0x363E000 +#define MPNPU_SRAM_I2X_MAILBOX_31 0x363F000 + +#define MMNPU_APERTURE0_BASE 0x3000000 +#define MMNPU_APERTURE1_BASE 0x3600000 +#define MMNPU_APERTURE3_BASE 0x3810000 +#define MMNPU_APERTURE4_BASE 0x3B10000 + +/* PCIe BAR Index for NPU4 */ +#define NPU4_REG_BAR_INDEX 0 +#define NPU4_MBOX_BAR_INDEX 0 +#define NPU4_PSP_BAR_INDEX 4 +#define NPU4_SMU_BAR_INDEX 5 +#define NPU4_SRAM_BAR_INDEX 2 +/* Associated BARs and Apertures */ +#define NPU4_REG_BAR_BASE MMNPU_APERTURE0_BASE +#define NPU4_MBOX_BAR_BASE MMNPU_APERTURE0_BASE +#define NPU4_PSP_BAR_BASE MMNPU_APERTURE3_BASE +#define NPU4_SMU_BAR_BASE MMNPU_APERTURE4_BASE +#define NPU4_SRAM_BAR_BASE MMNPU_APERTURE1_BASE + +#define NPU4_RT_CFG_TYPE_CLK_GATING 1 +#define NPU4_RT_CFG_TYPE_HCLK_GATING 2 +#define NPU4_RT_CFG_TYPE_PWR_GATING 3 +#define NPU4_RT_CFG_TYPE_L1IMU_GATING 4 +#define NPU4_RT_CFG_TYPE_PDI_LOAD 5 +#define NPU4_RT_CFG_TYPE_DEBUG_BO 10 + +#define NPU4_RT_CFG_VAL_CLK_GATING_OFF 0 +#define NPU4_RT_CFG_VAL_CLK_GATING_ON 1 + +#define NPU4_RT_CFG_VAL_PDI_LOAD_MGMT 0 +#define NPU4_RT_CFG_VAL_PDI_LOAD_APP 1 + +#define NPU4_RT_CFG_VAL_DEBUG_BO_DEFAULT 0 +#define NPU4_RT_CFG_VAL_DEBUG_BO_LARGE 1 + +#define NPU4_INIT_RT_CFG_NUM 2 +#define NPU4_CLK_GATING_CFG_NUM 4 + +extern const struct dpm_clk npu4_dpm_clk_table[DPM_LEVEL_MAX]; +extern const struct rt_config npu4_rt_cfg[NPU4_INIT_RT_CFG_NUM]; +extern const u32 npu4_clk_gating_types[NPU4_CLK_GATING_CFG_NUM]; + +#define NPU4_COMMON_DEV_PRIV \ + .rt_config = npu4_rt_cfg, \ + .num_rt_cfg = ARRAY_SIZE(npu4_rt_cfg), \ + .priv_load_cfg = {NPU4_RT_CFG_TYPE_PDI_LOAD, NPU4_RT_CFG_VAL_PDI_LOAD_MGMT}, \ + .col_align = COL_ALIGN_NATURE, \ + .mbox_dev_addr = NPU4_MBOX_BAR_BASE, \ + .mbox_size = 0, /* Use BAR size */ \ + .sram_dev_addr = NPU4_SRAM_BAR_BASE, \ + .sram_offs = { \ + DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), \ + DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), \ + }, \ + .psp_regs_off = { \ + DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU4_PSP, MP0_C2PMSG_123), \ + DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU4_REG, MPNPU_PUB_SCRATCH3), \ + DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU4_REG, MPNPU_PUB_SCRATCH4), \ + DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU4_REG, MPNPU_PUB_SCRATCH9), \ + DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU4_PSP, MP0_C2PMSG_73), \ + DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU4_PSP, 
MP0_C2PMSG_123), \ + DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU4_REG, MPNPU_PUB_SCRATCH3), \ + }, \ + .smu_regs_off = { \ + DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU4_SMU, MP1_C2PMSG_0), \ + DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU4_SMU, MP1_C2PMSG_60), \ + DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU4_SMU, MMNPU_APERTURE4_BASE), \ + DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU4_SMU, MP1_C2PMSG_61), \ + DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU4_SMU, MP1_C2PMSG_60), \ + }, \ + .clk_gating = { \ + .types = npu4_clk_gating_types, \ + .num_types = ARRAY_SIZE(npu4_clk_gating_types), \ + .value_enable = NPU4_RT_CFG_VAL_CLK_GATING_ON, \ + .value_disable = NPU4_RT_CFG_VAL_CLK_GATING_OFF, \ + }, \ + .smu_rev = SMU_REVISION_V1, \ + .smu_npu_dpm_clk_table = npu4_dpm_clk_table, \ + .smu_npu_dpm_levels = ARRAY_SIZE(npu4_dpm_clk_table) + +#define NPU4_COMMON_DEV_INFO \ + .reg_bar = NPU4_REG_BAR_INDEX, \ + .mbox_bar = NPU4_MBOX_BAR_INDEX, \ + .sram_bar = NPU4_SRAM_BAR_INDEX, \ + .psp_bar = NPU4_PSP_BAR_INDEX, \ + .smu_bar = NPU4_SMU_BAR_INDEX, \ + .first_col = 0, \ + .dev_mem_buf_shift = 15, /* 32 KiB aligned */ \ + .dev_mem_base = AIE2_DEVM_BASE, \ + .dev_mem_size = AIE2_DEVM_SIZE, \ + .device_type = AMDXDNA_DEV_TYPE_KMQ, \ + .ops = &aie2_ops + +#endif /* _NPU4_FAMILY_H_ */ diff --git a/src/driver/amdxdna/npu4_regs.c b/src/driver/amdxdna/npu4_regs.c index b86958e3..50d0bb10 100644 --- a/src/driver/amdxdna/npu4_regs.c +++ b/src/driver/amdxdna/npu4_regs.c @@ -3,76 +3,7 @@ * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. */ -#include "drm_local/amdxdna_accel.h" -#include "aie2_pci.h" - -/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */ -#define MPNPU_PUB_SEC_INTR 0x3010060 -#define MPNPU_PUB_PWRMGMT_INTR 0x3010064 -#define MPNPU_PUB_SCRATCH0 0x301006C -#define MPNPU_PUB_SCRATCH1 0x3010070 -#define MPNPU_PUB_SCRATCH2 0x3010074 -#define MPNPU_PUB_SCRATCH3 0x3010078 -#define MPNPU_PUB_SCRATCH4 0x301007C -#define MPNPU_PUB_SCRATCH5 0x3010080 -#define MPNPU_PUB_SCRATCH6 0x3010084 -#define MPNPU_PUB_SCRATCH7 0x3010088 -#define MPNPU_PUB_SCRATCH8 0x301008C -#define MPNPU_PUB_SCRATCH9 0x3010090 -#define MPNPU_PUB_SCRATCH10 0x3010094 -#define MPNPU_PUB_SCRATCH11 0x3010098 -#define MPNPU_PUB_SCRATCH12 0x301009C -#define MPNPU_PUB_SCRATCH13 0x30100A0 -#define MPNPU_PUB_SCRATCH14 0x30100A4 -#define MPNPU_PUB_SCRATCH15 0x30100A8 -#define MP0_C2PMSG_73 0x3810A24 -#define MP0_C2PMSG_123 0x3810AEC - -#define MP1_C2PMSG_0 0x3B10900 -#define MP1_C2PMSG_60 0x3B109F0 -#define MP1_C2PMSG_61 0x3B109F4 - -#define MPNPU_SRAM_X2I_MAILBOX_0 0x3600000 -#define MPNPU_SRAM_X2I_MAILBOX_15 0x361E000 -#define MPNPU_SRAM_X2I_MAILBOX_31 0x363E000 -#define MPNPU_SRAM_I2X_MAILBOX_31 0x363F000 - -#define MMNPU_APERTURE0_BASE 0x3000000 -#define MMNPU_APERTURE1_BASE 0x3600000 -#define MMNPU_APERTURE3_BASE 0x3810000 -#define MMNPU_APERTURE4_BASE 0x3B10000 - -/* PCIe BAR Index for NPU4 */ -#define NPU4_REG_BAR_INDEX 0 -#define NPU4_MBOX_BAR_INDEX 0 -#define NPU4_PSP_BAR_INDEX 4 -#define NPU4_SMU_BAR_INDEX 5 -#define NPU4_SRAM_BAR_INDEX 2 -/* Associated BARs and Apertures */ -#define NPU4_REG_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU4_MBOX_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU4_PSP_BAR_BASE MMNPU_APERTURE3_BASE -#define NPU4_SMU_BAR_BASE MMNPU_APERTURE4_BASE -#define NPU4_SRAM_BAR_BASE MMNPU_APERTURE1_BASE - -#define NPU4_RT_CFG_TYPE_CLK_GATING 1 -#define NPU4_RT_CFG_TYPE_HCLK_GATING 2 -#define NPU4_RT_CFG_TYPE_PWR_GATING 3 -#define NPU4_RT_CFG_TYPE_L1IMU_GATING 4 -#define NPU4_RT_CFG_TYPE_PDI_LOAD 5 -#define NPU4_RT_CFG_TYPE_DEBUG_BO 10 - -#define 
NPU4_RT_CFG_VAL_CLK_GATING_OFF 0 -#define NPU4_RT_CFG_VAL_CLK_GATING_ON 1 - -#define NPU4_RT_CFG_VAL_PDI_LOAD_MGMT 0 -#define NPU4_RT_CFG_VAL_PDI_LOAD_APP 1 - -#define NPU4_RT_CFG_VAL_DEBUG_BO_DEFAULT 0 -#define NPU4_RT_CFG_VAL_DEBUG_BO_LARGE 1 - -#define NPU4_MPNPUCLK_FREQ_MAX 1267 -#define NPU4_HCLK_FREQ_MAX 1800 +#include "npu4_family.h" const struct dpm_clk npu4_dpm_clk_table[DPM_LEVEL_MAX] = { {396, 792}, @@ -85,12 +16,12 @@ const struct dpm_clk npu4_dpm_clk_table[DPM_LEVEL_MAX] = { {1267, 1800} }; -const struct rt_config npu4_rt_cfg[] = { +const struct rt_config npu4_rt_cfg[NPU4_INIT_RT_CFG_NUM] = { {NPU4_RT_CFG_TYPE_PDI_LOAD, NPU4_RT_CFG_VAL_PDI_LOAD_APP}, {NPU4_RT_CFG_TYPE_DEBUG_BO, NPU4_RT_CFG_VAL_DEBUG_BO_LARGE}, }; -const u32 npu4_clk_gating_types[] = { +const u32 npu4_clk_gating_types[NPU4_CLK_GATING_CFG_NUM] = { NPU4_RT_CFG_TYPE_CLK_GATING, NPU4_RT_CFG_TYPE_HCLK_GATING, NPU4_RT_CFG_TYPE_PWR_GATING, @@ -101,61 +32,11 @@ const struct amdxdna_dev_priv npu4_dev_priv = { .fw_path = "amdnpu/17f0_10/npu.sbin", .protocol_major = 0x6, .protocol_minor = 0x6, - .rt_config = npu4_rt_cfg, - .num_rt_cfg = ARRAY_SIZE(npu4_rt_cfg), - .col_align = COL_ALIGN_NATURE, - .mbox_dev_addr = NPU4_MBOX_BAR_BASE, - .mbox_size = 0, /* Use BAR size */ - .sram_dev_addr = NPU4_SRAM_BAR_BASE, - .sram_offs = { - DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), - DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), - }, - .psp_regs_off = { - DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU4_PSP, MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU4_REG, MPNPU_PUB_SCRATCH3), - DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU4_REG, MPNPU_PUB_SCRATCH4), - DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU4_REG, MPNPU_PUB_SCRATCH9), - DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU4_PSP, MP0_C2PMSG_73), - DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU4_PSP, MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU4_REG, MPNPU_PUB_SCRATCH3), - }, - .smu_regs_off = { - DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU4_SMU, MP1_C2PMSG_0), - DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU4_SMU, MP1_C2PMSG_60), - DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU4_SMU, MMNPU_APERTURE4_BASE), - DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU4_SMU, MP1_C2PMSG_61), - DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU4_SMU, MP1_C2PMSG_60), - }, - .clk_gating = { - .types = npu4_clk_gating_types, - .num_types = ARRAY_SIZE(npu4_clk_gating_types), - .value_enable = NPU4_RT_CFG_VAL_CLK_GATING_ON, - .value_disable = NPU4_RT_CFG_VAL_CLK_GATING_OFF, - }, - .smu_mpnpuclk_freq_max = NPU4_MPNPUCLK_FREQ_MAX, - .smu_hclk_freq_max = NPU4_HCLK_FREQ_MAX, - .smu_dpm_max = 7, - .smu_rev = SMU_REVISION_V1, - .smu_npu_dpm_clk_table = npu4_dpm_clk_table, - .smu_npu_dpm_levels = ARRAY_SIZE(npu4_dpm_clk_table), -#ifdef AMDXDNA_DEVEL - .priv_load_cfg = {NPU4_RT_CFG_TYPE_PDI_LOAD, NPU4_RT_CFG_VAL_PDI_LOAD_MGMT}, -#endif + NPU4_COMMON_DEV_PRIV, }; const struct amdxdna_dev_info dev_npu4_info = { - .reg_bar = NPU4_REG_BAR_INDEX, - .mbox_bar = NPU4_MBOX_BAR_INDEX, - .sram_bar = NPU4_SRAM_BAR_INDEX, - .psp_bar = NPU4_PSP_BAR_INDEX, - .smu_bar = NPU4_SMU_BAR_INDEX, - .first_col = 0, - .dev_mem_buf_shift = 15, /* 32 KiB aligned */ - .dev_mem_base = AIE2_DEVM_BASE, - .dev_mem_size = AIE2_DEVM_SIZE, .vbnv = "RyzenAI-npu4", - .device_type = AMDXDNA_DEV_TYPE_KMQ, .dev_priv = &npu4_dev_priv, - .ops = &aie2_ops, /* NPU4 can share NPU1's callback */ + NPU4_COMMON_DEV_INFO, }; diff --git a/src/driver/amdxdna/npu5_regs.c b/src/driver/amdxdna/npu5_regs.c index ed7d81df..7f0050d1 100644 --- a/src/driver/amdxdna/npu5_regs.c +++ 
b/src/driver/amdxdna/npu5_regs.c @@ -3,159 +3,17 @@ * Copyright (C) 2024, Advanced Micro Devices, Inc. */ -#include "drm_local/amdxdna_accel.h" -#include "aie2_pci.h" - -/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */ -#define MPNPU_PUB_SEC_INTR 0x3010060 -#define MPNPU_PUB_PWRMGMT_INTR 0x3010064 -#define MPNPU_PUB_SCRATCH0 0x301006C -#define MPNPU_PUB_SCRATCH1 0x3010070 -#define MPNPU_PUB_SCRATCH2 0x3010074 -#define MPNPU_PUB_SCRATCH3 0x3010078 -#define MPNPU_PUB_SCRATCH4 0x301007C -#define MPNPU_PUB_SCRATCH5 0x3010080 -#define MPNPU_PUB_SCRATCH6 0x3010084 -#define MPNPU_PUB_SCRATCH7 0x3010088 -#define MPNPU_PUB_SCRATCH8 0x301008C -#define MPNPU_PUB_SCRATCH9 0x3010090 -#define MPNPU_PUB_SCRATCH10 0x3010094 -#define MPNPU_PUB_SCRATCH11 0x3010098 -#define MPNPU_PUB_SCRATCH12 0x301009C -#define MPNPU_PUB_SCRATCH13 0x30100A0 -#define MPNPU_PUB_SCRATCH14 0x30100A4 -#define MPNPU_PUB_SCRATCH15 0x30100A8 -#define MP0_C2PMSG_73 0x3810A24 -#define MP0_C2PMSG_123 0x3810AEC - -#define MP1_C2PMSG_0 0x3B10900 -#define MP1_C2PMSG_60 0x3B109F0 -#define MP1_C2PMSG_61 0x3B109F4 - -#define MPNPU_SRAM_X2I_MAILBOX_0 0x3600000 -#define MPNPU_SRAM_X2I_MAILBOX_15 0x361E000 -#define MPNPU_SRAM_X2I_MAILBOX_31 0x363E000 -#define MPNPU_SRAM_I2X_MAILBOX_31 0x363F000 - -#define MMNPU_APERTURE0_BASE 0x3000000 -#define MMNPU_APERTURE1_BASE 0x3600000 -#define MMNPU_APERTURE3_BASE 0x3810000 -#define MMNPU_APERTURE4_BASE 0x3B10000 - -/* PCIe BAR Index for NPU5 */ -#define NPU5_REG_BAR_INDEX 0 -#define NPU5_MBOX_BAR_INDEX 0 -#define NPU5_PSP_BAR_INDEX 4 -#define NPU5_SMU_BAR_INDEX 5 -#define NPU5_SRAM_BAR_INDEX 2 -/* Associated BARs and Apertures */ -#define NPU5_REG_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU5_MBOX_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU5_PSP_BAR_BASE MMNPU_APERTURE3_BASE -#define NPU5_SMU_BAR_BASE MMNPU_APERTURE4_BASE -#define NPU5_SRAM_BAR_BASE MMNPU_APERTURE1_BASE - -#define NPU5_RT_CFG_TYPE_CLK_GATING 1 -#define NPU5_RT_CFG_TYPE_HCLK_GATING 2 -#define NPU5_RT_CFG_TYPE_PWR_GATING 3 -#define NPU5_RT_CFG_TYPE_L1IMU_GATING 4 -#define NPU5_RT_CFG_TYPE_PDI_LOAD 5 -#define NPU5_RT_CFG_TYPE_DEBUG_BO 10 - -#define NPU5_RT_CFG_VAL_CLK_GATING_OFF 0 -#define NPU5_RT_CFG_VAL_CLK_GATING_ON 1 - -#define NPU5_RT_CFG_VAL_PDI_LOAD_MGMT 0 -#define NPU5_RT_CFG_VAL_PDI_LOAD_APP 1 - -#define NPU5_RT_CFG_VAL_DEBUG_BO_DEFAULT 0 -#define NPU5_RT_CFG_VAL_DEBUG_BO_LARGE 1 - -#define NPU5_MPNPUCLK_FREQ_MAX 1267 -#define NPU5_HCLK_FREQ_MAX 1800 - -const struct dpm_clk npu5_dpm_clk_table[DPM_LEVEL_MAX] = { - {396, 792}, - {600, 1056}, - {792, 1152}, - {975, 1267}, - {975, 1267}, - {1056, 1408}, - {1152, 1584}, - {1267, 1800} -}; - -const struct rt_config npu5_rt_cfg[] = { - {NPU5_RT_CFG_TYPE_PDI_LOAD, NPU5_RT_CFG_VAL_PDI_LOAD_APP}, - {NPU5_RT_CFG_TYPE_DEBUG_BO, NPU5_RT_CFG_VAL_DEBUG_BO_LARGE}, -}; - -const u32 npu5_clk_gating_types[] = { - NPU5_RT_CFG_TYPE_CLK_GATING, - NPU5_RT_CFG_TYPE_HCLK_GATING, - NPU5_RT_CFG_TYPE_PWR_GATING, - NPU5_RT_CFG_TYPE_L1IMU_GATING, -}; +#include "npu4_family.h" const struct amdxdna_dev_priv npu5_dev_priv = { .fw_path = "amdnpu/17f0_11/npu.sbin", .protocol_major = 0x6, .protocol_minor = 0x6, - .rt_config = npu5_rt_cfg, - .num_rt_cfg = ARRAY_SIZE(npu5_rt_cfg), - .col_align = COL_ALIGN_NATURE, - .mbox_dev_addr = NPU5_MBOX_BAR_BASE, - .mbox_size = 0, /* Use BAR size */ - .sram_dev_addr = NPU5_SRAM_BAR_BASE, - .sram_offs = { - DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU5_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), - DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU5_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), - }, - 
.psp_regs_off = { - DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU5_PSP, MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU5_REG, MPNPU_PUB_SCRATCH3), - DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU5_REG, MPNPU_PUB_SCRATCH4), - DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU5_REG, MPNPU_PUB_SCRATCH9), - DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU5_PSP, MP0_C2PMSG_73), - DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU5_PSP, MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU5_REG, MPNPU_PUB_SCRATCH3), - }, - .smu_regs_off = { - DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU5_SMU, MP1_C2PMSG_0), - DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU5_SMU, MP1_C2PMSG_60), - DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU5_SMU, MMNPU_APERTURE4_BASE), - DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU5_SMU, MP1_C2PMSG_61), - DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU5_SMU, MP1_C2PMSG_60), - }, - .clk_gating = { - .types = npu5_clk_gating_types, - .num_types = ARRAY_SIZE(npu5_clk_gating_types), - .value_enable = NPU5_RT_CFG_VAL_CLK_GATING_ON, - .value_disable = NPU5_RT_CFG_VAL_CLK_GATING_OFF, - }, - .smu_mpnpuclk_freq_max = NPU5_MPNPUCLK_FREQ_MAX, - .smu_hclk_freq_max = NPU5_HCLK_FREQ_MAX, - .smu_dpm_max = 7, - .smu_rev = SMU_REVISION_V1, - .smu_npu_dpm_clk_table = npu5_dpm_clk_table, - .smu_npu_dpm_levels = ARRAY_SIZE(npu5_dpm_clk_table), -#ifdef AMDXDNA_DEVEL - .priv_load_cfg = {NPU5_RT_CFG_TYPE_PDI_LOAD, NPU5_RT_CFG_VAL_PDI_LOAD_MGMT}, -#endif + NPU4_COMMON_DEV_PRIV, }; const struct amdxdna_dev_info dev_npu5_info = { - .reg_bar = NPU5_REG_BAR_INDEX, - .mbox_bar = NPU5_MBOX_BAR_INDEX, - .sram_bar = NPU5_SRAM_BAR_INDEX, - .psp_bar = NPU5_PSP_BAR_INDEX, - .smu_bar = NPU5_SMU_BAR_INDEX, - .first_col = 0, - .dev_mem_buf_shift = 15, /* 32 KiB aligned */ - .dev_mem_base = AIE2_DEVM_BASE, - .dev_mem_size = AIE2_DEVM_SIZE, .vbnv = "RyzenAI-npu5", - .device_type = AMDXDNA_DEV_TYPE_KMQ, .dev_priv = &npu5_dev_priv, - .ops = &aie2_ops, /* NPU5 can share NPU1's callback */ + NPU4_COMMON_DEV_INFO, }; diff --git a/src/driver/amdxdna/npu6_regs.c b/src/driver/amdxdna/npu6_regs.c index f418896a..efa01321 100644 --- a/src/driver/amdxdna/npu6_regs.c +++ b/src/driver/amdxdna/npu6_regs.c @@ -3,159 +3,17 @@ * Copyright (C) 2024, Advanced Micro Devices, Inc. 
*/ -#include "drm_local/amdxdna_accel.h" -#include "aie2_pci.h" - -/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */ -#define MPNPU_PUB_SEC_INTR 0x3010060 -#define MPNPU_PUB_PWRMGMT_INTR 0x3010064 -#define MPNPU_PUB_SCRATCH0 0x301006C -#define MPNPU_PUB_SCRATCH1 0x3010070 -#define MPNPU_PUB_SCRATCH2 0x3010074 -#define MPNPU_PUB_SCRATCH3 0x3010078 -#define MPNPU_PUB_SCRATCH4 0x301007C -#define MPNPU_PUB_SCRATCH5 0x3010080 -#define MPNPU_PUB_SCRATCH6 0x3010084 -#define MPNPU_PUB_SCRATCH7 0x3010088 -#define MPNPU_PUB_SCRATCH8 0x301008C -#define MPNPU_PUB_SCRATCH9 0x3010090 -#define MPNPU_PUB_SCRATCH10 0x3010094 -#define MPNPU_PUB_SCRATCH11 0x3010098 -#define MPNPU_PUB_SCRATCH12 0x301009C -#define MPNPU_PUB_SCRATCH13 0x30100A0 -#define MPNPU_PUB_SCRATCH14 0x30100A4 -#define MPNPU_PUB_SCRATCH15 0x30100A8 -#define MP0_C2PMSG_73 0x3810A24 -#define MP0_C2PMSG_123 0x3810AEC - -#define MP1_C2PMSG_0 0x3B10900 -#define MP1_C2PMSG_60 0x3B109F0 -#define MP1_C2PMSG_61 0x3B109F4 - -#define MPNPU_SRAM_X2I_MAILBOX_0 0x3600000 -#define MPNPU_SRAM_X2I_MAILBOX_15 0x361E000 -#define MPNPU_SRAM_X2I_MAILBOX_31 0x363E000 -#define MPNPU_SRAM_I2X_MAILBOX_31 0x363F000 - -#define MMNPU_APERTURE0_BASE 0x3000000 -#define MMNPU_APERTURE1_BASE 0x3600000 -#define MMNPU_APERTURE3_BASE 0x3810000 -#define MMNPU_APERTURE4_BASE 0x3B10000 - -/* PCIe BAR Index for NPU6 */ -#define NPU6_REG_BAR_INDEX 0 -#define NPU6_MBOX_BAR_INDEX 0 -#define NPU6_PSP_BAR_INDEX 4 -#define NPU6_SMU_BAR_INDEX 5 -#define NPU6_SRAM_BAR_INDEX 2 -/* Associated BARs and Apertures */ -#define NPU6_REG_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU6_MBOX_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU6_PSP_BAR_BASE MMNPU_APERTURE3_BASE -#define NPU6_SMU_BAR_BASE MMNPU_APERTURE4_BASE -#define NPU6_SRAM_BAR_BASE MMNPU_APERTURE1_BASE - -#define NPU6_RT_CFG_TYPE_CLK_GATING 1 -#define NPU6_RT_CFG_TYPE_HCLK_GATING 2 -#define NPU6_RT_CFG_TYPE_PWR_GATING 3 -#define NPU6_RT_CFG_TYPE_L1IMU_GATING 4 -#define NPU6_RT_CFG_TYPE_PDI_LOAD 5 -#define NPU6_RT_CFG_TYPE_DEBUG_BO 10 - -#define NPU6_RT_CFG_VAL_CLK_GATING_OFF 0 -#define NPU6_RT_CFG_VAL_CLK_GATING_ON 1 - -#define NPU6_RT_CFG_VAL_PDI_LOAD_MGMT 0 -#define NPU6_RT_CFG_VAL_PDI_LOAD_APP 1 - -#define NPU6_RT_CFG_VAL_DEBUG_BO_DEFAULT 0 -#define NPU6_RT_CFG_VAL_DEBUG_BO_LARGE 1 - -#define NPU6_MPNPUCLK_FREQ_MAX 1267 -#define NPU6_HCLK_FREQ_MAX 1800 - -const struct dpm_clk npu6_dpm_clk_table[DPM_LEVEL_MAX] = { - {396, 792}, - {600, 1056}, - {792, 1152}, - {975, 1267}, - {975, 1267}, - {1056, 1408}, - {1152, 1584}, - {1267, 1800} -}; - -const struct rt_config npu6_rt_cfg[] = { - {NPU6_RT_CFG_TYPE_PDI_LOAD, NPU6_RT_CFG_VAL_PDI_LOAD_APP}, - {NPU6_RT_CFG_TYPE_DEBUG_BO, NPU6_RT_CFG_VAL_DEBUG_BO_LARGE}, -}; - -const u32 npu6_clk_gating_types[] = { - NPU6_RT_CFG_TYPE_CLK_GATING, - NPU6_RT_CFG_TYPE_HCLK_GATING, - NPU6_RT_CFG_TYPE_PWR_GATING, - NPU6_RT_CFG_TYPE_L1IMU_GATING, -}; +#include "npu4_family.h" const struct amdxdna_dev_priv npu6_dev_priv = { - .fw_path = "amdnpu/17f0_20/npu.sbin", + .fw_path = "amdnpu/17f0_10/npu.sbin", .protocol_major = 0x6, .protocol_minor = 0x6, - .rt_config = npu6_rt_cfg, - .num_rt_cfg = ARRAY_SIZE(npu6_rt_cfg), - .col_align = COL_ALIGN_NATURE, - .mbox_dev_addr = NPU6_MBOX_BAR_BASE, - .mbox_size = 0, /* Use BAR size */ - .sram_dev_addr = NPU6_SRAM_BAR_BASE, - .sram_offs = { - DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU6_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), - DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU6_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), - }, - .psp_regs_off = { - DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU6_PSP, 
MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU6_REG, MPNPU_PUB_SCRATCH3), - DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU6_REG, MPNPU_PUB_SCRATCH4), - DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU6_REG, MPNPU_PUB_SCRATCH9), - DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU6_PSP, MP0_C2PMSG_73), - DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU6_PSP, MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU6_REG, MPNPU_PUB_SCRATCH3), - }, - .smu_regs_off = { - DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU6_SMU, MP1_C2PMSG_0), - DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU6_SMU, MP1_C2PMSG_60), - DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU6_SMU, MMNPU_APERTURE4_BASE), - DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU6_SMU, MP1_C2PMSG_61), - DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU6_SMU, MP1_C2PMSG_60), - }, - .clk_gating = { - .types = npu6_clk_gating_types, - .num_types = ARRAY_SIZE(npu6_clk_gating_types), - .value_enable = NPU6_RT_CFG_VAL_CLK_GATING_ON, - .value_disable = NPU6_RT_CFG_VAL_CLK_GATING_OFF, - }, - .smu_mpnpuclk_freq_max = NPU6_MPNPUCLK_FREQ_MAX, - .smu_hclk_freq_max = NPU6_HCLK_FREQ_MAX, - .smu_dpm_max = 7, - .smu_rev = SMU_REVISION_V1, - .smu_npu_dpm_clk_table = npu6_dpm_clk_table, - .smu_npu_dpm_levels = ARRAY_SIZE(npu6_dpm_clk_table), -#ifdef AMDXDNA_DEVEL - .priv_load_cfg = {NPU6_RT_CFG_TYPE_PDI_LOAD, NPU6_RT_CFG_VAL_PDI_LOAD_MGMT}, -#endif + NPU4_COMMON_DEV_PRIV, }; const struct amdxdna_dev_info dev_npu6_info = { - .reg_bar = NPU6_REG_BAR_INDEX, - .mbox_bar = NPU6_MBOX_BAR_INDEX, - .sram_bar = NPU6_SRAM_BAR_INDEX, - .psp_bar = NPU6_PSP_BAR_INDEX, - .smu_bar = NPU6_SMU_BAR_INDEX, - .first_col = 0, - .dev_mem_buf_shift = 15, /* 32 KiB aligned */ - .dev_mem_base = AIE2_DEVM_BASE, - .dev_mem_size = AIE2_DEVM_SIZE, .vbnv = "RyzenAI-npu6", - .device_type = AMDXDNA_DEV_TYPE_KMQ, .dev_priv = &npu6_dev_priv, - .ops = &aie2_ops, + NPU4_COMMON_DEV_INFO, }; diff --git a/src/driver/doc/amdnpu.rst b/src/driver/doc/amdnpu.rst new file mode 100644 index 00000000..7ea04261 --- /dev/null +++ b/src/driver/doc/amdnpu.rst @@ -0,0 +1,277 @@ +.. SPDX-License-Identifier: GPL-2.0-only + +========= + AMD NPU +========= + +:Copyright: |copy| 2024 Advanced Micro Devices, Inc. +:Author: Sonal Santan + +Overview +======== + +AMD NPU (Neural Processing Unit) is a multi-user AI inference accelerator +integrated into AMD client APU. NPU enables efficient execution of Machine +Learning applications like CNN, LLM, etc. NPU is based on +`AMD XDNA Architecture`_. NPU is managed by **amdxdna** driver. + + +Hardware Description +==================== + +AMD NPU consists of the following hardware components: + +AMD XDNA Array +-------------- + +AMD XDNA Array comprises of 2D array of compute and memory tiles built with +`AMD AI Engine Technology`_. Each column has 4 rows of compute tiles and 1 +row of memory tile. Each compute tile contains a VLIW processor with its own +dedicated program and data memory. The memory tile acts as L2 memory. The 2D +array can be partitioned at a column boundary creating a spatially isolated +partition which can be bound to a workload context. + +Each column also has dedicated DMA engines to move data between host DDR and +memory tile. + +AMD Phoenix and AMD Hawk Point client NPU have a 4x5 topology, i.e., 4 rows of +compute tiles arranged into 5 columns. AMD Strix Point client APU have 4x8 +topology, i.e., 4 rows of compute tiles arranged into 8 columns. + +Shared L2 Memory +................ + +The single row of memory tiles create a pool of software managed on chip L2 +memory. DMA engines are used to move data between host DDR and memory tiles. 
+AMD Phoenix and AMD Hawk Point NPUs have a total of 2560 KB of L2 memory. +AMD Strix Point NPU has a total of 4096 KB of L2 memory. + +Microcontroller +--------------- + +A microcontroller runs NPU Firmware which is responsible for command processing, +XDNA Array partition setup, XDNA Array configuration, workload context +management and workload orchestration. + +NPU Firmware uses a dedicated instance of an isolated non-privileged context +called ERT to service each workload context. ERT is also used to execute user +provided ``ctrlcode`` associated with the workload context. + +NPU Firmware uses a single isolated privileged context called MERT to service +management commands from the amdxdna driver. + +Mailboxes +......... + +The microcontroller and amdxdna driver use a privileged channel for management +tasks like setting up of contexts, telemetry, query, error handling, setting up +user channel, etc. As mentioned before, privileged channel requests are +serviced by MERT. The privileged channel is bound to a single mailbox. + +The microcontroller and amdxdna driver use a dedicated user channel per +workload context. The user channel is primarily used for submitting work to +the NPU. As mentioned before, a user channel requests are serviced by an +instance of ERT. Each user channel is bound to its own dedicated mailbox. + +PCIe EP +------- + +NPU is visible to the x86 as a PCIe device with multiple BARs and some MSI-X interrupt +vectors. NPU uses a dedicated high bandwidth SoC level fabric for reading +writing into host memory. Each instance of ERT gets its own dedicated MSI-X +interrupt. MERT gets a single instance of MSI-X interrupt. + +The number of PCIe BARs varies depending on the specific device. +Based on their functions, PCIe BARs can generally be categorized into the +following types. + +* PSP BAR: Expose the AMD PSP (Platform Security Processor) function +* SMU BAR: Expose the AMD SMU (System Management Unit) function +* SRAM BAR: Expose ring buffers for the mailbox +* Mailbox BAR: Expose the mailbox control registers (head, tail and ISR registers etc.) +* Public Register BAR: Expose public registers + +On specific devices, the above-mentioned BAR type might be combined into a single physical PCIe BAR. +Or a module might require two physical PCIe BARs to be fully functional. +For example, + +* On AMD Phoenix device, PSP, SMU, Public Register BARs are on PCIe BAR index 0. +* On AMD Strix Point device, Mailbox and Public Register BARs are on PCIe BAR index 0. + The PSP has some registers in PCIe BAR index 0 (Public Register BAR) and PCIe BAR index 4 (PSP BAR). + +Process Isolation Hardware +-------------------------- + +As explained before, XDNA Array can be dynamically divided into isolated +spatial partitions, each of which may have one or more columns. The spatial +partition is setup by programming the column isolation registers by the +microcontroller. Each spatial partition is associated with a PASID which is +also programmed by the microcontroller. Hence multiple spatial partitions in +the NPU can make concurrent host access protected by PASID. + +The NPU FW itself uses microcontroller MMU enforced isolated contexts for +servicing user and privileged channel requests. + + +Mixed Spatial and Temporal Scheduling +===================================== + +AMD XDNA architecture supports mixed spatial and temporal (time sharing) +scheduling of 2D array. This means that spatial partitions may be setup and +torn down dynamically to accommodate various workloads. 
A *spatial* partition +may be *exclusively* bound to one workload context while another partition may +be *temporarily* bound to more than one workload contexts. The microcontroller +updates the PASID for a temporarily shared partition to match the context that +has been bound to the partition at any moment. + +Resource Solver +--------------- + +The Resource Solver component of the amdxdna driver manages the allocation +of 2D array among various workloads. Every workload describes the number +of columns required to run the NPU binary in its metadata. The Resource Solver +component uses hints passed by the workload and its own heuristics to +decide 2D array (re)partition strategy and mapping of workloads for spatial and +temporal sharing of columns. The FW enforces the context-to-column(s) resource +binding decisions made by the Resource Solver. + +AMD Phoenix and AMD Hawk Point client NPU can support 6 concurrent workload +contexts. AMD Strix Point can support 16 concurrent workload contexts. + + +Application Binaries +==================== + +A NPU application workload is comprised of two separate binaries which are +generated by the NPU compiler. + +1. AMD XDNA Array overlay, which is used to configure a NPU spatial partition. + The overlay contains instructions for setting up the stream switch + configuration and ELF for the compute tiles. The overlay is loaded on the + spatial partition bound to the workload by the associated ERT instance. + Refer to the + `Versal Adaptive SoC AIE-ML Architecture Manual (AM020)`_ for more details. + +2. ``ctrlcode``, used for orchestrating the overlay loaded on the spatial + partition. ``ctrlcode`` is executed by the ERT running in protected mode on + the microcontroller in the context of the workload. ``ctrlcode`` is made up + of a sequence of opcodes named ``XAie_TxnOpcode``. Refer to the + `AI Engine Run Time`_ for more details. + + +Special Host Buffers +==================== + +Per-context Instruction Buffer +------------------------------ + +Every workload context uses a host resident 64 MB buffer which is memory +mapped into the ERT instance created to service the workload. The ``ctrlcode`` +used by the workload is copied into this special memory. This buffer is +protected by PASID like all other input/output buffers used by that workload. +Instruction buffer is also mapped into the user space of the workload. + +Global Privileged Buffer +------------------------ + +In addition, the driver also allocates a single buffer for maintenance tasks +like recording errors from MERT. This global buffer uses the global IOMMU +domain and is only accessible by MERT. + + +High-level Use Flow +=================== + +Here are the steps to run a workload on AMD NPU: + +1. Compile the workload into an overlay and a ``ctrlcode`` binary. +2. Userspace opens a context in the driver and provides the overlay. +3. The driver checks with the Resource Solver for provisioning a set of columns + for the workload. +4. The driver then asks MERT to create a context on the device with the desired + columns. +5. MERT then creates an instance of ERT. MERT also maps the Instruction Buffer + into ERT memory. +6. The userspace then copies the ``ctrlcode`` to the Instruction Buffer. +7. Userspace then creates a command buffer with pointers to input, output, and + instruction buffer; it then submits command buffer with the driver and goes + to sleep waiting for completion. +8. The driver sends the command over the Mailbox to ERT. +9. 
ERT *executes* the ``ctrlcode`` in the instruction buffer. +10. Execution of the ``ctrlcode`` kicks off DMAs to and from the host DDR while + AMD XDNA Array is running. +11. When ERT reaches end of ``ctrlcode``, it raises an MSI-X to send completion + signal to the driver which then wakes up the waiting workload. + + +Boot Flow +========= + +amdxdna driver uses PSP to securely load signed NPU FW and kick off the boot +of the NPU microcontroller. amdxdna driver then waits for the alive signal in +a special location on BAR 0. The NPU is switched off during SoC suspend and +turned on after resume where the NPU FW is reloaded, and the handshake is +performed again. + + +Userspace components +==================== + +Compiler +-------- + +Peano is an LLVM based open-source compiler for AMD XDNA Array compute tile +available at: +https://github.com/Xilinx/llvm-aie + +The open-source IREE compiler supports graph compilation of ML models for AMD +NPU and uses Peano underneath. It is available at: +https://github.com/nod-ai/iree-amd-aie + +Usermode Driver (UMD) +--------------------- + +The open-source XRT runtime stack interfaces with amdxdna kernel driver. XRT +can be found at: +https://github.com/Xilinx/XRT + +The open-source XRT shim for NPU is can be found at: +https://github.com/amd/xdna-driver + + +DMA Operation +============= + +DMA operation instructions are encoded in the ``ctrlcode`` as +``XAIE_IO_BLOCKWRITE`` opcode. When ERT executes ``XAIE_IO_BLOCKWRITE``, DMA +operations between host DDR and L2 memory are effected. + + +Error Handling +============== + +When MERT detects an error in AMD XDNA Array, it pauses execution for that +workload context and sends an asynchronous message to the driver over the +privileged channel. The driver then sends a buffer pointer to MERT to capture +the register states for the partition bound to faulting workload context. The +driver then decodes the error by reading the contents of the buffer pointer. + + +Telemetry +========= + +MERT can report various kinds of telemetry information like the following: +* L1 interrupt counter +* DMA counter +* Deep Sleep counter +* etc. + + +References +========== + +- `AMD XDNA Architecture `_ +- `AMD AI Engine Technology `_ +- `Peano `_ +- `Versal Adaptive SoC AIE-ML Architecture Manual (AM020) `_ +- `AI Engine Run Time `_ diff --git a/src/driver/tools/npu_perf_analyze.sh b/src/driver/tools/npu_perf_analyze.sh new file mode 100755 index 00000000..d32f2f3c --- /dev/null +++ b/src/driver/tools/npu_perf_analyze.sh @@ -0,0 +1,171 @@ +#!/usr/bin/bash + +# SPDX-License-Identifier: Apache-2.0 +# Copyright (C) 2024, Advanced Micro Devices, Inc. + +usage() +{ + cat << USAGE_END +Usage: $0 [options] event1_pattern event2_pattern +Options: + -file/-f: Trace log file for parsing + -range/-r: [entry_index_begin:entry_index_end), e.g.: 100:200 +Parsing trace log file to find time interval from event1 to event2. +event pattern examples: + "sdt_xrt:ioctl_exit: \(.+\) arg1=DRM_IOCTL_AMDXDNA_WAIT_CMD" +USAGE_END +} + +read_timestamps() +{ + timestamps=() + + while IFS= read -r line; do + if [ "$line" != "" ]; then + timestamps+=($(("10#${line}"))) + fi + done <<< `egrep "$1" ${perf_out_file} | awk '{print $4}' | tr -d '.' 
| tr -d ':'` + echo ${timestamps[@]} +} + +if [ "$#" -eq 0 ]; then + usage + exit 1 +fi + +range_start=-1 +range_end=-1 +event1="" +event2="" +perf_out_file="perf.converted.out" +while [ $# -gt 0 ]; do + case "$1" in + -range | -r) + st=$(echo $2 | cut -d':' -f1) + end=$(echo $2 | cut -d':' -f2) + if [ "${st}" != "" ]; then + if [[ "${st}" =~ ^[0-9]+$ ]]; then + range_start=$(("10#${st}")) + else + echo Invalid range start: ${st} + exit 1 + fi + fi + if [ "${end}" != "" ]; then + if [[ "${end}" =~ ^[0-9]+$ ]]; then + range_end=$(("10#${end}")) + else + echo Invalid range end: ${end} + exit 1 + fi + fi + shift + ;; + -file | -f) + perf_out_file=$2 + shift + ;; + *) + break + ;; + esac + shift +done +event1=$1 +event2=$2 + +if [ ! -f ${perf_out_file} ]; then + echo "${perf_out_file} is not found" + exit 1 +else + echo "Parsing ${perf_out_file}..." +fi + +event1_ts=($(read_timestamps "${event1}")) +event1_ts_num=${#event1_ts[@]} +if [ ${event1_ts_num} -eq 0 ]; then + echo No events found for ${event1} + exit 1 +fi +echo "${event1_ts_num} events for: '${event1}'" + +event2_ts=($(read_timestamps "${event2}")) +event2_ts_num=${#event2_ts[@]} +if [ ${event2_ts_num} -eq 0 ]; then + echo No events found for ${event2} + exit 1 +fi +echo "${event2_ts_num} events for: '${event2}'" + +# Caculate time difference between two events +diffs_event1=() +diffs_event2=() +diffs=() +i1=0 +i2=0 +while [ 1 ]; do + while [[ ${i2} -lt ${event2_ts_num} && ${event2_ts[i2]} -lt ${event1_ts[i1]} ]]; do + (( i2++ )) + done + if [ ${i2} -eq ${event2_ts_num} ]; then + break + fi + + while [[ ${i1} -lt ${event1_ts_num} && ${event1_ts[i1]} -lt ${event2_ts[i2]} ]]; do + (( i1++ )) + done + if [ ${i1} -eq ${event1_ts_num} ]; then + break + fi + + + (( i1-- )) + diffs_event1+=( $((event1_ts[i1])) ) + diffs_event2+=( $((event2_ts[i2])) ) + diffs+=( $((event2_ts[i2] - event1_ts[i1])) ) + (( i1++ )) + (( i2++ )) +done +#echo ${event1_ts[@]} > /tmp/e1 +#echo ${event2_ts[@]} > /tmp/e2 +#echo ${diffs[@]} > /tmp/diffs + + +# Data mining within specified range + +if [ ${range_start} -eq -1 ]; then + range_start=0 +fi +if [ ${range_end} -eq -1 ]; then + range_end=${#diffs[@]} +fi +if [ ${range_end} -eq ${range_start} ]; then + echo Range start and end are the same + exit 1 +elif [ ${range_end} -lt ${range_start} ]; then + echo Range start after end + exit 1 +fi + +total=0 +largest=${diffs[${range_start}]} +largest_idx=${range_start} +smallest=${diffs[${range_start}]} +smallest_idx=${range_start} +for (( i=${range_start}; i<${range_end}; i++ )); do + total=$(( total + diffs[i] )) + if [[ ${largest} -lt ${diffs[i]} ]]; then + largest=${diffs[i]} + largest_idx=${i} + fi + if [[ ${smallest} -gt ${diffs[i]} ]]; then + smallest=${diffs[i]} + smallest_idx=${i} + fi +done + +# Output result +total_events=$(( range_end - range_start )) +echo Average over ${total_events} events: $(( total / total_events ))ns +echo Largest: ${largest}ns@${largest_idx}: event1=${diffs_event1[largest_idx]}, event2=${diffs_event2[largest_idx]} +echo Smallest: ${smallest}ns@${smallest_idx}: event1=${diffs_event1[smallest_idx]}, event2=${diffs_event2[smallest_idx]} diff --git a/src/driver/tools/npu_perf_trace.sh b/src/driver/tools/npu_perf_trace.sh new file mode 100755 index 00000000..97892757 --- /dev/null +++ b/src/driver/tools/npu_perf_trace.sh @@ -0,0 +1,136 @@ +#! /bin/bash -- + +# SPDX-License-Identifier: Apache-2.0 +# Copyright (C) 2024, Advanced Micro Devices, Inc. 
+ +#set -eu + +bold=$(tput bold) +normal=$(tput sgr0) +red=$(tput setaf 1) +yellow=$(tput setaf 3) +blue=$(tput setaf 4) + +trace_info() +{ + what=$1 + echo -e "[INFO]: $what" +} + +trace_warn() +{ + what=$1 + echo -e "[${yellow}WARNING${normal}]: $what" +} + +trace_error() +{ + what=$1 + echo -e "[${red}ERROR${normal}]: $what" 1>&2 + exit 1 +} + +add_sdt_xrt() +{ + perf list | grep sdt_xrt > /dev/null && sdt_pre_enabled=1 + if [[ $sdt_pre_enabled == 1 ]]; then + remove_sdt_xrt + #trace_warn "XRT SDT had beed added. Skip..." + #return + fi + + # Add XRT SDT events + perf buildid-cache --add $xrt_libs + # Convert SDT events to trace points + perf probe --add=sdt_xrt:* &> /dev/null + + trace_info "XRT SDT is added" +} + +remove_sdt_xrt() +{ + #if [[ $sdt_pre_enabled == 1 ]]; then + # trace_warn "XRT SDT was pre added. Skip..." + # return + #fi + + # Delete SDT trace points + perf probe --del=sdt_xrt:* &> /dev/null + # Remove XRT STD events + perf buildid-cache --remove $xrt_libs + trace_info "XRT SDT is removed" +} + +## -------- trace flow start -------- +if [ "$EUID" -ne 0 ]; then + trace_error "Please run as root" +fi + +# Global variables +sdt_pre_enabled=0 +xrt_lib_prefix="/opt/xilinx/xrt/lib" +while [ $# -gt 0 ]; do + case "$1" in + -libdir | -l) + xrt_lib_prefix=$2 + shift + ;; + *) + break + ;; + esac + shift +done +accel_debugfs="/sys/kernel/debug/accel" +xrt_libs="${xrt_lib_prefix}/libxrt_coreutil.so,${xrt_lib_prefix}/libxrt_driver_xdna.so" +perf_record_args="-e amdxdna_trace:* " +perf_record_args+="-e sdt_xrt:* " +exec_cmd="" + +perf --version > /dev/null + +# Argument parsing +exec_cmd=$@ +if [[ -z "$exec_cmd" ]]; then + trace_error "Please put execute application at the end" +fi + +dev="" +ioctl_sed_expr="" +for dir in $(ls $accel_debugfs); do + accel_fs_name=$(cat ${accel_debugfs}/$dir/name) + driver_name=$(echo $accel_fs_name | awk '{print $1}') + if [[ ! "$driver_name" =~ "amdxdna" ]]; then + continue + fi + + if [[ ! -f ${accel_debugfs}/$dir/ioctl_id ]]; then + trace_error "${accel_debugfs}/$dir/ioctl_id not exist. amdxdna driver too old?" 
+ fi + + dev=$(echo $accel_fs_name | awk -F'[ =]' '{print $3}') + ioctl_sed_expr=$(awk -F ':' '{print "s/"$1"/"$2"/g"}' ${accel_debugfs}/$dir/ioctl_id) +done + +if [[ -z "$dev" ]]; then + trace_error "No device found" +fi + +trace_info "Found NPU device $dev at ${accel_debugfs}" + +add_sdt_xrt + +command="perf record $perf_record_args -a $exec_cmd" +trace_info "$command" +eval $command + +tmp_file=/tmp/perf.out +# convert timestamp from second to microsecond to avoid floating numbers +#perf script | awk '{ $4=$4*1000000; print }' > ${tmp_file} +perf script --reltime --ns > ${tmp_file} +# replace IOCTL cmd number to name +sed "$ioctl_sed_expr" "${tmp_file}" > perf.converted.out +rm -rf ${tmp_file} + +remove_sdt_xrt +## -------- trace flow end -------- diff --git a/src/include/uapi/drm_local/amdxdna_accel.h b/src/include/uapi/drm_local/amdxdna_accel.h index 134ef87b..fe41f6ee 100644 --- a/src/include/uapi/drm_local/amdxdna_accel.h +++ b/src/include/uapi/drm_local/amdxdna_accel.h @@ -17,7 +17,6 @@ extern "C" { #define AMDXDNA_DRIVER_MAJOR 1 #define AMDXDNA_DRIVER_MINOR 0 -#define AMDXDNA_INVALID_CMD_HANDLE (~0UL) #define AMDXDNA_INVALID_ADDR (~0UL) #define AMDXDNA_INVALID_CTX_HANDLE 0 #define AMDXDNA_INVALID_BO_HANDLE 0 @@ -49,8 +48,6 @@ enum amdxdna_drm_ioctl_id { DRM_AMDXDNA_WAIT_CMD, DRM_AMDXDNA_GET_INFO, DRM_AMDXDNA_SET_STATE, - DRM_AMDXDNA_SUBMIT_WAIT, - DRM_AMDXDNA_SUBMIT_SIGNAL, DRM_AMDXDNA_NUM_IOCTLS }; @@ -273,8 +270,6 @@ struct amdxdna_drm_exec_cmd { * @seq: sequence number of the command returned by execute command. * * Wait a command specified by seq to be completed. - * Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot - * to submit a new command. */ struct amdxdna_drm_wait_cmd { __u32 hwctx; @@ -461,6 +456,7 @@ enum amdxdna_power_mode_type { POWER_MODE_LOW, /**< Set frequency to lowest DPM */ POWER_MODE_MEDIUM, /**< Set frequency to medium DPM */ POWER_MODE_HIGH, /**< Set frequency to highest DPM */ + POWER_MODE_TURBO, /**< More power, more performance */ }; /** @@ -542,20 +538,6 @@ struct amdxdna_drm_set_state { __u64 buffer; /* in */ }; - -/** - * struct amdxdna_drm_syncobjs - Signal or wait on array of DRM timelined sync objects. - * @handles: Array of handles of sync objects. - * @points: Array of time points for each sync objects. - * @count: Number of elements in the above array. 
- */ -struct amdxdna_drm_syncobjs { - __u64 handles; /* in */ - __u64 points; /* in */ - __u32 count; /* in */ - __u32 pad; -}; - #define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \ DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \ struct amdxdna_drm_create_hwctx) @@ -596,14 +578,6 @@ struct amdxdna_drm_syncobjs { DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \ struct amdxdna_drm_set_state) -#define DRM_IOCTL_AMDXDNA_SUBMIT_WAIT \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_WAIT, \ - struct amdxdna_drm_syncobjs) - -#define DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_SIGNAL, \ - struct amdxdna_drm_syncobjs) - #if defined(__cplusplus) } /* extern c end */ #endif diff --git a/src/shim/bo.cpp b/src/shim/bo.cpp index 4fdeebe5..36142f63 100644 --- a/src/shim/bo.cpp +++ b/src/shim/bo.cpp @@ -230,7 +230,6 @@ alloc_bo() amdxdna_drm_get_bo_info bo_info = {}; get_drm_bo_info(m_pdev, boh, &bo_info); m_bo = std::make_unique(*this, bo_info); - m_pdev.insert_hdl_mapping(boh, reinterpret_cast(this)); } void @@ -248,7 +247,6 @@ void bo:: free_bo() { - m_pdev.remove_hdl_mapping(get_drm_bo_handle()); m_bo.reset(); } diff --git a/src/shim/device.cpp b/src/shim/device.cpp index 278298d5..cef74159 100644 --- a/src/shim/device.cpp +++ b/src/shim/device.cpp @@ -262,7 +262,7 @@ struct partition_info for (uint32_t i = 0; i < data_size; i++) { const auto& entry = data[i]; - xrt_core::query::aie_partition_info::data new_entry; + xrt_core::query::aie_partition_info::data new_entry{}; new_entry.metadata.id = std::to_string(entry.context_id); new_entry.metadata.xclbin_uuid = "N/A"; new_entry.start_col = entry.start_col; diff --git a/src/shim/fence.cpp b/src/shim/fence.cpp index b67ce0ef..26cb428f 100644 --- a/src/shim/fence.cpp +++ b/src/shim/fence.cpp @@ -107,28 +107,35 @@ wait_syncobj_available(const shim_xdna::pdev& dev, } void -submit_wait_syncobjs(const shim_xdna::pdev& dev, +submit_wait_syncobjs(const shim_xdna::pdev& dev, const shim_xdna::hw_ctx *ctx, const uint32_t* sobj_hdls, const uint64_t* points, uint32_t num) { wait_syncobj_available(dev, sobj_hdls, points, num); - amdxdna_drm_syncobjs swsobj = { - .handles = reinterpret_cast(sobj_hdls), - .points = reinterpret_cast(points), - .count = num, + amdxdna_drm_exec_cmd ecmd = { + .hwctx = ctx->get_slotidx(), + .type = AMDXDNA_CMD_SUBMIT_DEPENDENCY, + .cmd_handles = reinterpret_cast(sobj_hdls), + .args = reinterpret_cast(points), + .cmd_count = num, + .arg_count = num, }; - dev.ioctl(DRM_IOCTL_AMDXDNA_SUBMIT_WAIT, &swsobj); + dev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); } void -submit_signal_syncobj(const shim_xdna::pdev& dev, uint32_t sobj_hdl, uint64_t point) +submit_signal_syncobj(const shim_xdna::pdev& dev, const shim_xdna::hw_ctx *ctx, + uint32_t sobj_hdl, uint64_t point) { - amdxdna_drm_syncobjs sssobj = { - .handles = reinterpret_cast(&sobj_hdl), - .points = reinterpret_cast(&point), - .count = 1, + amdxdna_drm_exec_cmd ecmd = { + .hwctx = ctx->get_slotidx(), + .type = AMDXDNA_CMD_SUBMIT_SIGNAL, + .cmd_handles = reinterpret_cast(&sobj_hdl), + .args = reinterpret_cast(&point), + .cmd_count = 1, + .arg_count = 1, }; - dev.ioctl(DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL, &sssobj); + dev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); } } @@ -199,25 +206,15 @@ clone() const return std::make_unique(*this); } -void +uint64_t fence:: -wait(bool async) const +wait_next_state() const { std::lock_guard guard(m_lock); - auto st = m_state; - if (st != initial_state && m_signaled) + if (m_state != initial_state && m_signaled) 
shim_err(-EINVAL, "Can't wait on fence that has been signaled before."); - - st++; - shim_debug("%s for command fence %d@%ld", - async ? "Submitting wait" : "Waiting", m_syncobj_hdl, st); - if (async) - submit_wait_syncobjs(m_pdev, &m_syncobj_hdl, &st, 1); - else - wait_syncobj_done(m_pdev, m_syncobj_hdl, st); - - m_state = st; + return ++m_state; } // Timeout value is ignored for now. @@ -225,57 +222,54 @@ void fence:: wait(uint32_t timeout_ms) const { - wait(false); + auto st = signal_next_state(); + shim_debug("Waiting for command fence %d@%ld", m_syncobj_hdl, st); + wait_syncobj_done(m_pdev, m_syncobj_hdl, st); } void fence:: -submit_wait() const +submit_wait(const hw_ctx *ctx) const { - wait(true); + auto st = signal_next_state(); + shim_debug("Submitting wait for command fence %d@%ld", m_syncobj_hdl, st); + submit_wait_syncobjs(m_pdev, ctx, &m_syncobj_hdl, &st, 1); } -void +uint64_t fence:: -signal(bool async) const +signal_next_state() const { std::lock_guard guard(m_lock); - auto st = m_state; - if (st != initial_state && !m_signaled) + if (m_state != initial_state && !m_signaled) shim_err(-EINVAL, "Can't signal fence that has been waited before."); - - if (st == initial_state) + if (m_state == initial_state) m_signaled = true; - - st++; - shim_debug("%s command fence %d@%ld", - async ? "Submitting signal" : "Signaling", m_syncobj_hdl, st); - if (async) - submit_signal_syncobj(m_pdev, m_syncobj_hdl, st); - else - signal_syncobj(m_pdev, m_syncobj_hdl, st); - - m_state = st; + return ++m_state; } void fence:: signal() const { - signal(false); + auto st = signal_next_state(); + shim_debug("Signaling command fence %d@%ld", m_syncobj_hdl, st); + signal_syncobj(m_pdev, m_syncobj_hdl, st); } void fence:: -submit_signal() const +submit_signal(const hw_ctx *ctx) const { - signal(true); + auto st = signal_next_state(); + shim_debug("Submitting signal command fence %d@%ld", m_syncobj_hdl, st); + submit_signal_syncobj(m_pdev, ctx, m_syncobj_hdl, st); } void fence:: -submit_wait(const pdev& dev, const std::vector& fences) +submit_wait(const pdev& dev, const hw_ctx *ctx, const std::vector& fences) { constexpr int max_fences = 1024; uint32_t hdls[max_fences]; @@ -287,12 +281,13 @@ submit_wait(const pdev& dev, const std::vector& fences) for (auto f : fences) { auto fh = static_cast(f); - std::lock_guard guard(fh->m_lock); + auto st = fh->wait_next_state(); + shim_debug("Waiting for command fence %d@%ld", fh->m_syncobj_hdl, st); hdls[i] = fh->m_syncobj_hdl; - pts[i] = ++fh->m_state; + pts[i] = st; i++; } - submit_wait_syncobjs(dev, hdls, pts, i); + submit_wait_syncobjs(dev, ctx, hdls, pts, i); } } // shim_xdna diff --git a/src/shim/fence.h b/src/shim/fence.h index fe3ff295..1b6cdbca 100644 --- a/src/shim/fence.h +++ b/src/shim/fence.h @@ -4,6 +4,7 @@ #ifndef _FENCE_XDNA_H_ #define _FENCE_XDNA_H_ +#include "hwctx.h" #include "device.h" #include "shared.h" @@ -41,20 +42,20 @@ class fence : public xrt_core::fence_handle public: void - submit_wait() const; + submit_wait(const hw_ctx*) const; static void - submit_wait(const pdev& dev, const std::vector& fences); + submit_wait(const pdev& dev, const hw_ctx*, const std::vector& fences); void - submit_signal() const; + submit_signal(const hw_ctx*) const; private: - void - wait(bool async) const; + uint64_t + wait_next_state() const; - void - signal(bool async) const; + uint64_t + signal_next_state() const; const pdev& m_pdev; const std::unique_ptr m_import; diff --git a/src/shim/hwq.cpp b/src/shim/hwq.cpp index 2bda0db1..14a31a52 100644 --- 
a/src/shim/hwq.cpp +++ b/src/shim/hwq.cpp @@ -5,6 +5,7 @@ #include "hwq.h" #include "fence.h" #include "shim_debug.h" +#include "core/common/trace.h" namespace { @@ -82,60 +83,29 @@ void hw_q:: submit_command(xrt_core::buffer_handle *cmd) { - auto pkt = get_chained_command_pkt(cmd); - if (!m_pdev.is_force_unchained_command() || !pkt) { - issue_command(cmd); - return; - } - - // HACK: Forcibly unchain commands, to be removed later. - // - // Forcibly unchain commands and send to driver one by one. - auto payload = get_ert_cmd_chain_data(pkt); - for (size_t i = 0; i < payload->command_count; i++) { - auto boh = reinterpret_cast( - m_pdev.lookup_hdl_mapping(static_cast(payload->data[i]))); - issue_command(boh); - } + issue_command(cmd); } int hw_q:: -wait_command(xrt_core::buffer_handle *cmd, uint32_t timeout_ms) const +poll_command(xrt_core::buffer_handle *cmd) const { - auto pkt = get_chained_command_pkt(cmd); - if (!m_pdev.is_force_unchained_command() || !pkt) - return wait_cmd(m_pdev, m_hwctx, cmd, timeout_ms); - - // HACK: handling forcibly unchained commands, to be removed later. - // - // Wait for the last unchained command. - auto payload = get_ert_cmd_chain_data(pkt); - auto last_boh = reinterpret_cast( - m_pdev.lookup_hdl_mapping(static_cast(payload->data[payload->command_count-1]))); - auto ret = wait_cmd(m_pdev, m_hwctx, last_boh, timeout_ms); - if (ret != 1) - return ret; - - // Check the state of the last command. - auto cmdpkt = reinterpret_cast(last_boh->map(xrt_core::buffer_handle::map_type::read)); - if (cmdpkt->state == ERT_CMD_STATE_COMPLETED) { - pkt->state = ERT_CMD_STATE_COMPLETED; + auto cmdpkt = reinterpret_cast(cmd->map(xrt_core::buffer_handle::map_type::write)); + + if (cmdpkt->state >= ERT_CMD_STATE_COMPLETED) { + XRT_TRACE_POINT_LOG(poll_command_done); return 1; } + return 0; +} - // Find out the first command failed. 
- for (int i = 0; i < payload->command_count; i++) { - auto boh = reinterpret_cast( - m_pdev.lookup_hdl_mapping(static_cast(payload->data[i]))); - cmdpkt = reinterpret_cast(boh->map(xrt_core::buffer_handle::map_type::read)); - if (cmdpkt->state != ERT_CMD_STATE_COMPLETED) { - pkt->state = cmdpkt->state; - payload->error_index = i; - break; - } - } - return 1; +int +hw_q:: +wait_command(xrt_core::buffer_handle *cmd, uint32_t timeout_ms) const +{ + if (poll_command(cmd)) + return 1; + return wait_cmd(m_pdev, m_hwctx, cmd, timeout_ms); } void @@ -143,14 +113,14 @@ hw_q:: submit_wait(const xrt_core::fence_handle* f) { auto fh = static_cast(f); - fh->submit_wait(); + fh->submit_wait(m_hwctx); } void hw_q:: submit_wait(const std::vector& fences) { - fence::submit_wait(m_pdev, fences); + fence::submit_wait(m_pdev, m_hwctx, fences); } void @@ -158,7 +128,7 @@ hw_q:: submit_signal(const xrt_core::fence_handle* f) { auto fh = static_cast(f); - fh->submit_signal(); + fh->submit_signal(m_hwctx); } } // shim_xdna diff --git a/src/shim/hwq.h b/src/shim/hwq.h index ce2c1c83..afb9ca97 100644 --- a/src/shim/hwq.h +++ b/src/shim/hwq.h @@ -20,6 +20,9 @@ class hw_q : public xrt_core::hwqueue_handle void submit_command(xrt_core::buffer_handle *) override; + int + poll_command(xrt_core::buffer_handle *) const override; + int wait_command(xrt_core::buffer_handle *, uint32_t timeout_ms) const override; diff --git a/src/shim/kmq/device.cpp b/src/shim/kmq/device.cpp index ec703d7c..1479a3c5 100644 --- a/src/shim/kmq/device.cpp +++ b/src/shim/kmq/device.cpp @@ -6,21 +6,12 @@ #include "hwctx.h" #include "drm_local/amdxdna_accel.h" -namespace { - -// Device memory heap needs to be within one 64MB page. The maximum size is 64MB. -const size_t dev_mem_size = (64 << 20); - -} - namespace shim_xdna { device_kmq:: device_kmq(const pdev& pdev, handle_type shim_handle, id_type device_id) : device(pdev, shim_handle, device_id) { - // Alloc and register device memory w/ driver. 
- m_dev_heap_bo = std::make_unique(*this, dev_mem_size, AMDXDNA_BO_DEV_HEAP); shim_debug("Created KMQ device (%s) ...", get_pdev().m_sysfs_name.c_str()); } diff --git a/src/shim/kmq/device.h b/src/shim/kmq/device.h index 8fa76362..768aee14 100644 --- a/src/shim/kmq/device.h +++ b/src/shim/kmq/device.h @@ -26,9 +26,6 @@ class device_kmq : public device { std::unique_ptr import_bo(xrt_core::shared_handle::export_handle ehdl) const override; - -private: - std::unique_ptr m_dev_heap_bo; }; } // namespace shim_xdna diff --git a/src/shim/kmq/hwq.cpp b/src/shim/kmq/hwq.cpp index b785aac6..ebb292dd 100644 --- a/src/shim/kmq/hwq.cpp +++ b/src/shim/kmq/hwq.cpp @@ -31,30 +31,13 @@ issue_command(xrt_core::buffer_handle *cmd_bo) amdxdna_drm_exec_cmd ecmd = { .hwctx = m_hwctx->get_slotidx(), + .type = AMDXDNA_CMD_SUBMIT_EXEC_BUF, .cmd_handles = cmd_bo_hdl, .args = reinterpret_cast(arg_bo_hdls), .cmd_count = 1, .arg_count = static_cast(boh->get_arg_bo_handles(arg_bo_hdls, max_arg_bos)), }; - - int ret = EAGAIN; - while (ret == EAGAIN) { - try { - m_pdev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); - ret = 0; - } - catch (const xrt_core::system_error& ex) { - ret = ex.get_code(); - if (ret != EAGAIN) - throw; - amdxdna_drm_wait_cmd wcmd = { - .hwctx = ecmd.hwctx, - .timeout = 0, // Infinite waiting - .seq = AMDXDNA_INVALID_CMD_HANDLE, // Wait for free slot - }; - m_pdev.ioctl(DRM_IOCTL_AMDXDNA_WAIT_CMD, &wcmd); - } - } + m_pdev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); auto id = ecmd.seq; boh->set_cmd_id(id); diff --git a/src/shim/kmq/pcidev.cpp b/src/shim/kmq/pcidev.cpp index 26a3ae8f..0d271ce5 100644 --- a/src/shim/kmq/pcidev.cpp +++ b/src/shim/kmq/pcidev.cpp @@ -1,9 +1,17 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. +#include "bo.h" #include "device.h" #include "pcidev.h" +namespace { + +// Device memory heap needs to be within one 64MB page. The maximum size is 64MB. +const size_t dev_mem_size = (64 << 20); + +} + namespace shim_xdna { pdev_kmq:: @@ -23,7 +31,24 @@ std::shared_ptr pdev_kmq:: create_device(xrt_core::device::handle_type handle, xrt_core::device::id_type id) const { - return std::make_shared(*this, handle, id); + auto dev = std::make_shared(*this, handle, id); + try { + // Alloc device memory on first device creation. + // No locking is needed since driver will ensure only one heap BO is created. 
+ if (m_dev_heap_bo == nullptr) + m_dev_heap_bo = std::make_unique(*dev, dev_mem_size, AMDXDNA_BO_DEV_HEAP); + } catch (const xrt_core::system_error& ex) { + if (ex.get_code() != EBUSY) + throw; + } + return dev; +} + +void +pdev_kmq:: +on_last_close() const +{ + m_dev_heap_bo.reset(); } } // namespace shim_xdna diff --git a/src/shim/kmq/pcidev.h b/src/shim/kmq/pcidev.h index 03ded1ec..65585924 100644 --- a/src/shim/kmq/pcidev.h +++ b/src/shim/kmq/pcidev.h @@ -17,6 +17,13 @@ class pdev_kmq : public pdev std::shared_ptr create_device(xrt_core::device::handle_type handle, xrt_core::device::id_type id) const override; + +private: + // Create on first device creation and removed right before device is closed + mutable std::unique_ptr m_dev_heap_bo; + + virtual void + on_last_close() const override; }; } // namespace shim_xdna diff --git a/src/shim/pcidev.cpp b/src/shim/pcidev.cpp index 5d66e372..faa089a4 100644 --- a/src/shim/pcidev.cpp +++ b/src/shim/pcidev.cpp @@ -6,7 +6,6 @@ #include "pcidrv.h" #include "shim_debug.h" #include "drm_local/amdxdna_accel.h" -#include "core/common/config_reader.h" #include "core/common/trace.h" namespace { @@ -35,10 +34,6 @@ namespace { return "DRM_IOCTL_AMDXDNA_GET_INFO"; case DRM_IOCTL_AMDXDNA_SET_STATE: return "DRM_IOCTL_AMDXDNA_SET_STATE"; - case DRM_IOCTL_AMDXDNA_SUBMIT_WAIT: - return "DRM_IOCTL_AMDXDNA_SUBMIT_WAIT"; - case DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL: - return "DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL"; case DRM_IOCTL_GEM_CLOSE: return "DRM_IOCTL_GEM_CLOSE"; case DRM_IOCTL_PRIME_HANDLE_TO_FD: @@ -71,10 +66,6 @@ namespace shim_xdna { pdev:: pdev(std::shared_ptr driver, std::string sysfs_name) : xrt_core::pci::dev(driver, std::move(sysfs_name)) - // Default of force_unchained_command should be false once command - // chaining is natively supported by driver/firmware. - , m_force_unchained_command(xrt_core::config::detail::get_bool_value( - "Debug.force_unchained_command", false)) { m_is_ready = true; // We're always ready. } @@ -111,6 +102,8 @@ open() const m_dev_fd = fd; } ++m_dev_users; + + on_first_open(); } void @@ -122,6 +115,8 @@ close() const --m_dev_users; if (m_dev_users == 0) { + on_last_close(); + // Stop new users of the fd from other threads. fd = m_dev_fd; m_dev_fd = -1; @@ -158,12 +153,5 @@ munmap(void* addr, size_t len) const ::munmap(addr, len); } -bool -pdev:: -is_force_unchained_command() const -{ - return m_force_unchained_command; -} - } // namespace shim_xdna diff --git a/src/shim/pcidev.h b/src/shim/pcidev.h index 0d487518..da0cdeda 100644 --- a/src/shim/pcidev.h +++ b/src/shim/pcidev.h @@ -43,37 +43,15 @@ class pdev : public xrt_core::pci::dev void close() const; - bool - is_force_unchained_command() const; - - // Below routines are for managing drm_bo_hdl -> buffer_handle* mapping. - // This is only a temporary hack for supporting forcibly unchained runlist. 
- void - insert_hdl_mapping(uint32_t hdl, uint64_t ptr) const - { - const std::lock_guard lock(m_lock); - m_hdl_map[hdl] = ptr; - } - void - remove_hdl_mapping(uint32_t hdl) const - { - const std::lock_guard lock(m_lock); - m_hdl_map.erase(hdl); - } - uint64_t - lookup_hdl_mapping(uint32_t hdl) const - { - const std::lock_guard lock(m_lock); - return m_hdl_map[hdl]; - } - private: + virtual void + on_first_open() const {} + virtual void + on_last_close() const {} + mutable int m_dev_fd = -1; mutable int m_dev_users = 0; mutable std::mutex m_lock; - const bool m_force_unchained_command = true; - // Mark it as mutable since pdev does not look at what is saved in this map - mutable std::map m_hdl_map; }; } // namespace shim_xdna diff --git a/src/shim/umq/host_queue.h b/src/shim/umq/host_queue.h index 14cb41e0..fe8dc8bc 100644 --- a/src/shim/umq/host_queue.h +++ b/src/shim/umq/host_queue.h @@ -1,60 +1,9 @@ -/* (c) Copyright 2014 - 2022 Xilinx, Inc. All rights reserved. - - This file contains confidential and proprietary information - of Xilinx, Inc. and is protected under U.S. and - international copyright and other intellectual property - laws. - - DISCLAIMER - This disclaimer is not a license and does not grant any - rights to the materials distributed herewith. Except as - otherwise provided in a valid license issued to you by - Xilinx, and to the maximum extent permitted by applicable - law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND - WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES - AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING - BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- - INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and - (2) Xilinx shall not be liable (whether in contract or tort, - including negligence, or under any other theory of - liability) for any loss or damage of any kind or nature - related to, arising under or in connection with these - materials, including for any direct, or any indirect, - special, incidental, or consequential loss or damage - (including loss of data, profits, goodwill, or any type of - loss or damage suffered as a result of any action brought - by a third party) even if such damage or loss was - reasonably foreseeable or Xilinx had been advised of the - possibility of the same. - - CRITICAL APPLICATIONS - Xilinx products are not designed or intended to be fail- - safe, or for use in any application requiring fail-safe - performance, such as life-support or safety devices or - systems, Class III medical devices, nuclear facilities, - applications related to the deployment of airbags, or any - other applications that could lead to death, personal - injury, or severe property or environmental damage - (individually and collectively, "Critical - Applications"). Customer assumes the sole risk and - liability of any use of Xilinx products in Critical - Applications, subject only to applicable laws and - regulations governing limitations on product liability. - - THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS - PART OF THIS FILE AT ALL TIMES. */ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #ifndef _HOST_QUEUE_H_ #define _HOST_QUEUE_H_ -#include -#include - -#define SHIM_USER_EVENT_0_ID 0xb6 -#define DOORBELL_EVENT_ID SHIM_USER_EVENT_0_ID - -#define PDI_TABLE_SIZE 64 - #define HSA_PKT_SUCCESS (0) /* * 32-bit return code in completion of HSA pkt back to host. @@ -62,7 +11,8 @@ * will check them on all devices/platforms. 
* HSA specific error code will be on high 28 bits. */ -enum hsa_cmd_state { // ert_cmd_state essentially +enum hsa_cmd_state +{ // ert_cmd_state essentially HSA_CMD_STATE_NEW = 1, HSA_CMD_STATE_QUEUED = 2, HSA_CMD_STATE_RUNNING = 3, @@ -83,46 +33,14 @@ enum hsa_cmd_state { // ert_cmd_state essentially #define HSA_INVALID_OPCODE HSA_ERR(column_index_rel * 100 + 3) #define HSA_INVALID_PKT HSA_ERR(4) #define HSA_INVALID_PAGE HSA_ERR(column_index_rel * 100 + 5) +#define HSA_INDIRECT_PKT_NUM 6 -typedef enum +enum host_queue_packet_opcode { HOST_QUEUE_PACKET_EXEC_BUF = 1, HOST_QUEUE_PACKET_TEST = 2, HOST_QUEUE_PACKET_EXIT = 3, -} -host_queue_packet_opcode_t; - -/* - * cu_config contains cu <-> pdi mapping info - * - * due to memory footprint limitation, the pdi info (host address) is not saved in CERT - * if num_mappings == 1, then pdi_info_host_addr contains the host addr of the pdi - * if num_mappings > 1, then pdi_info_host_addr contains the host addr of a table, in which - * the host addr of all the pdi are saved. - * - * note: both cu_index and pdi_index should be start from 0 - * e.g mapping[0] = 0, mapping[1] = 1, mapping[2] = 0, - * means, - * cu0 <-> pdi0 - * cu1 <-> pdi1 - * cu2 <-> pdi0 - * there are 3 mappings, and 2 pdi in pdi_info_host_addr table - */ -typedef struct -{ - uint32_t num_mappings; - uint32_t pdi_info_host_addr_low; - uint32_t pdi_info_host_addr_high; - uint8_t mapping[PDI_TABLE_SIZE]; -} -config_cu_t; - -#define INVALID_PDI_ID (0xFF) - -/* - * Maximum number of exec buf args in 4B - */ -#define EXEC_BUF_ARGS_MAX_LEN (20) +}; /* * hsa pkt payload of exec_buf @@ -134,7 +52,7 @@ config_cu_t; * args contains the info of input/output frame, parameter of network * etc, which are all transparent to CERT */ -typedef struct +struct exec_buf { uint16_t cu_index; uint16_t reserved0; @@ -144,48 +62,28 @@ typedef struct uint16_t reserved1; uint32_t args_host_addr_low; uint32_t args_host_addr_high; -} -exec_buf_t; - +}; -typedef struct +struct host_queue_header { uint64_t read_index; - - uint32_t reserved; - - //! @note Queue capacity, must be a power of two. - uint32_t capacity; - - /* - * NOTE!!! - * Due to the cache is not cache coherence between host and device. We have - * to flush the cache of the host queue. - * - * Most importantly, the read_index has to be in different cache line - * (64Bytes in linux) than the write_index. Because the read_index might be - * flushed from a different context from kernel driver that is monitoring - * the completed message. While at the same time, the write_index might be - * being flushed from UMQ. - */ - //Note: temporary disable padding because FW has not been fully changed yet. - //uint64_t padding[6]; - + struct + { + uint16_t major; + uint16_t minor; + } + version; + uint32_t capacity; //Queue capacity, must be a power of two. uint64_t write_index; - uint64_t data_address; - - // TODO Ready signal? -} -host_queue_header_t; +}; -typedef enum +enum host_queue_packet_type { HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC = 0, HOST_QUEUE_PACKET_TYPE_INVALID = 1, -} -host_queue_packet_type_t; +}; /* * 8 Bytes common header of hsa pkt used in CERT. 
@@ -199,7 +97,7 @@ host_queue_packet_type_t; * for 'indirect', 'count' is used to calc the number of indirect pkt entry, * see below */ -typedef struct +struct common_header { union { struct { @@ -214,29 +112,24 @@ typedef struct uint16_t count; uint8_t distribute; uint8_t indirect; -} -common_header_t; +}; -typedef struct +struct xrt_packet_header { - common_header_t common_header; + struct common_header common_header; uint64_t completion_signal; -} -xrt_packet_header_t; +}; /* * format of indirect pkt. multiple-indirect-level is supported * there is vendor specific header (common header plus completion_signal) in 1st indirect level * there is common header in all the remaining indirect levels */ -typedef struct +struct host_indirect_packet_entry { - uint16_t column_index; - uint16_t reserved; uint32_t host_addr_low; uint32_t host_addr_high; -} -host_indirect_packet_entry_t; +}; /* * hsa pkt format -- 64Bytes fixed length @@ -245,23 +138,23 @@ host_indirect_packet_entry_t; * xrt_packet_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 24 (sizeof(exec_buf_t)) + * count: 24 (sizeof(struct exec_buf)) * distribute: 0 * indirect: 0 * completion_signal: xxx * data: - * exec_buf_t + * struct exec_buf * * case 2 -- indirect config_cu * xrt_packet_header: * type: 0 (vendor specific) * opcode: 0 (config_cu) - * count: 12 (1 * sizeof(host_indirect_packet_entry_t)) + * count: 12 (1 * sizeof(struct host_indirect_packet_entry)) * distribute: 0 * indirect: 1 // common header of indirect * completion_signal: xxx * data: - * host_indirect_packet_entry_t: + * struct host_indirect_packet_entry: * column_index: index of lead uc * host_addr*: host addr of next level * common_header: @@ -270,123 +163,97 @@ host_indirect_packet_entry_t; * count: 72 (config_cu with 16 entries)) //10 entry config_cu can fit in direct pkt * indirect: 0 // common header of direct * payload: - * config_cu_t: 16 entries of mapping table + * struct config_cu: 16 entries of mapping table * * case 3 -- indirect exec_buf on 4 column partition * xrt_packet_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 48 (4 *sizeof(host_indirect_packet_entry_t)) + * count: 48 (6 * sizeof(struct host_indirect_packet_entry)) * distribute: 1 * indirect: 1 // common header of indirect * completion_signal: xxx * data: - * host_indirect_packet_entry_t: - * column_index: index of lead uc + * struct host_indirect_packet_entry: * host_addr*: host addr of next level * common_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 24 (sizeof(exec_buf_t)) + * count: 24 (sizeof(struct exec_buf)) * indirect: 0 // common header of direct * payload: - * exec_buf_t - * host_indirect_packet_entry_t: - * column_index: index of slave1 + * struct exec_buf + * struct host_indirect_packet_entry: * host_addr*: host addr of next level * common_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 24 (sizeof(exec_buf_t)) + * count: 24 (struct sizeof(exec_buf)) * indirect: 0 // common header of direct * payload: - * exec_buf_t - * host_indirect_packet_entry_t: + * struct exec_buf + * struct host_indirect_packet_entry: * slave2,3,etc... 
* * case 4 -- indirect exec_buf on 8 column partition * xrt_packet_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 12 (sizeof(host_indirect_packet_entry_t)) + * count: 12 (sizeof(struct host_indirect_packet_entry)) * distribute: 1 * indirect: 1 // common_header of level-1 indirect * completion_signal: xxx * data: - * host_indirect_packet_entry_t: - * column_index: index of lead uc + * struct host_indirect_packet_entry: * host_addr*: host addr of next level * common_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 12*8 (12 * sizeof(host_indirect_packet_entry_t)) + * count: 12*8 (12 * sizeof(struct host_indirect_packet_entry)) * distribute: 1 * indirect: 1 // common header of level-2 indirect * indirect_payload: - * host_indirect_packet_entry_t: - * column_index: index of lead uc + * struct host_indirect_packet_entry: * host_addr*: host addr of next level * common_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 24 (sizeof(exec_buf_t)) + * count: 24 (sizeof(struct exec_buf)) * distribute: 1 * indirect: 0 // common_header of direct * payload: - * exec_buf_t - * host_indirect_packet_entry_t: - * column_index: index of slave1 + * struct exec_buf + * struct host_indirect_packet_entry: * host_addr*: host addr of next level * common_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 24 (sizeof(exec_buf_t)) + * count: 24 (sizeof(struct exec_buf)) * distribute: 1 * indirect: 0 // common_header of direct * payload: - * exec_buf_t - * host_indirect_packet_entry_t: + * struct exec_buf + * struct host_indirect_packet_entry: * slave2,3,etc... */ -typedef struct +struct host_queue_packet { - xrt_packet_header_t xrt_header; + struct xrt_packet_header xrt_header; uint32_t data[12]; -} -host_queue_packet_t; +}; /* * xrt pkt with random length. 
*/ -typedef struct +struct xrt_packet { - xrt_packet_header_t xrt_header; + struct xrt_packet_header xrt_header; uint64_t xrt_payload_host_addr; -} -xrt_packet_t; - -#define XRT_PKT_TYPE(p) ((p)->xrt_header.common_header.type) -#define XRT_PKT_OPCODE(p) ((p)->xrt_header.common_header.opcode) -#define XRT_PKT_LEN(p) ((p)->xrt_header.common_header.count) -#define XRT_PKT_DISTRIBUTE(p) ((p)->xrt_header.common_header.distribute) -#define XRT_PKT_INDIRECT(p) ((p)->xrt_header.common_header.indirect) -#define XRT_PKT_COMPLETION(p) ((p)->xrt_header.completion_signal) -#define XRT_PKT_PAYLOAD(p) ((p)->xrt_payload_host_addr) - -#define ADDR_HIGH(x) ((x) >> 32) -#define ADDR_LOW(x) ((x) & 0xFFFFFFFF) -#define MOD_POW2(x, y) ((x) & ((y) - 1)) +}; -typedef struct +struct host_queue { uint64_t address; -} -host_queue_t; - -void host_queue_init(host_queue_t *queue, uint64_t address); - -xrt_packet_t *host_queue_pop(host_queue_t *queue, bool block); - -void host_queue_finish_packet(host_queue_t *queue, xrt_packet_t *packet, uint32_t completion); +}; #endif diff --git a/src/shim/umq/hwq.cpp b/src/shim/umq/hwq.cpp index cbdfde5e..5c41fa47 100644 --- a/src/shim/umq/hwq.cpp +++ b/src/shim/umq/hwq.cpp @@ -22,13 +22,13 @@ clflush_data(void *data, int len) } inline void -mark_slot_invalid(volatile host_queue_packet_t *pkt) +mark_slot_invalid(volatile struct host_queue_packet *pkt) { pkt->xrt_header.common_header.type = HOST_QUEUE_PACKET_TYPE_INVALID; } inline void -mark_slot_valid(volatile host_queue_packet_t *pkt) +mark_slot_valid(volatile struct host_queue_packet *pkt) { /* Issue mfence instruction to make sure all writes to the slot before is done */ std::atomic_thread_fence(std::memory_order::memory_order_seq_cst); @@ -38,7 +38,7 @@ mark_slot_valid(volatile host_queue_packet_t *pkt) } inline bool -is_slot_valid(volatile host_queue_packet_t *pkt) +is_slot_valid(volatile struct host_queue_packet *pkt) { return pkt->xrt_header.common_header.type == HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC; } @@ -47,34 +47,63 @@ is_slot_valid(volatile host_queue_packet_t *pkt) namespace shim_xdna { +void +hw_q_umq:: +init_indirect_buf(volatile struct host_indirect_data *indirect_buf, int size) +{ + for (int i = 0; i < size; i++) { + indirect_buf[i].header.type = HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC; + indirect_buf[i].header.opcode = HOST_QUEUE_PACKET_EXEC_BUF; + indirect_buf[i].header.count = sizeof(struct exec_buf); + indirect_buf[i].header.distribute = 1; + indirect_buf[i].header.indirect = 0; + } +} + hw_q_umq:: hw_q_umq(const device& dev, size_t nslots) : hw_q(dev) { #ifdef UMQ_HELLO_TEST const size_t header_sz = 8192; // Hard code to 2 pages const size_t queue_sz = 0; + const size_t indirect_sz = 0; #else - const size_t header_sz = sizeof(host_queue_header_t); - const size_t queue_sz = sizeof(host_queue_packet_t) * nslots; + // + // host queue layout: + // host_queue_header_t + // host_queue_packet_t [nslots] + // indirect [4 * indirect_buffer * nslots] + const size_t header_sz = sizeof(struct host_queue_header); + const size_t queue_sz = sizeof(struct host_queue_packet) * nslots; + const size_t indirect_sz = (sizeof(struct host_indirect_data) * HSA_INDIRECT_PKT_NUM) * nslots; #endif - const size_t umq_sz = header_sz + queue_sz; + const size_t umq_sz = header_sz + queue_sz + indirect_sz; + shim_debug("umq sz %ld", umq_sz); m_umq_bo = const_cast(dev).alloc_bo(umq_sz, XCL_BO_FLAGS_EXECBUF); m_umq_bo_buf = m_umq_bo->map(bo::map_type::write); - m_umq_hdr = reinterpret_cast(m_umq_bo_buf); - m_umq_pkt = reinterpret_cast + 
m_umq_hdr = reinterpret_cast(m_umq_bo_buf); + m_umq_pkt = reinterpret_cast ((char *)m_umq_bo_buf + header_sz); + m_umq_indirect_buf = reinterpret_cast + ((char *)m_umq_bo_buf + header_sz + queue_sz); // set all mapped memory to 0 std::memset(m_umq_bo_buf, 0, umq_sz); - for (int i = 0; i < nslots; i++) + // init slots and indirect buf + for (int i = 0; i < nslots; i++) { mark_slot_invalid(&m_umq_pkt[i]); + init_indirect_buf(&m_umq_indirect_buf[i * HSA_INDIRECT_PKT_NUM], HSA_INDIRECT_PKT_NUM); + } m_umq_hdr->capacity = nslots; // data_address starts after header m_umq_hdr->data_address = m_umq_bo->get_properties().paddr + header_sz; + // indirect buf starts after queue + m_indirect_paddr = m_umq_hdr->data_address + queue_sz; + // this is the bo handler defined in parent class m_queue_boh = static_cast(m_umq_bo.get())->get_drm_bo_handle(); @@ -98,11 +127,11 @@ map_doorbell(uint32_t doorbell_offset) m_pdev.mmap(0, sizeof(uint32_t), PROT_WRITE, MAP_SHARED, doorbell_offset)); } -volatile host_queue_header_t * +volatile struct host_queue_header * hw_q_umq:: get_header_ptr() const { - return reinterpret_cast(m_umq_bo_buf); + return reinterpret_cast(m_umq_bo_buf); } void @@ -129,9 +158,33 @@ dump() const shim_debug("\tdistribute:\t%u", pkt->xrt_header.common_header.distribute); shim_debug("\tindirect:\t%u", pkt->xrt_header.common_header.indirect); shim_debug("\tcomplete addr:\t%p", pkt->xrt_header.completion_signal); - for (int j = 0; j < sizeof(pkt->data) / sizeof(pkt->data[0]); j++) - shim_debug("\tdata[%d]:\t0x%08x", j, pkt->data[j]); + if (pkt->xrt_header.common_header.indirect == 0) { + volatile struct exec_buf *ebp = + reinterpret_cast(pkt->data); + + shim_debug("\tcu_index:\t%d", ebp->cu_index); + shim_debug("\tdpu: [0x%x 0x%x]", + ebp->dpu_control_code_host_addr_high, + ebp->dpu_control_code_host_addr_low); + } else { + volatile struct host_indirect_packet_entry *hp = + reinterpret_cast(pkt->data); + + for (int i = 0; i < HSA_INDIRECT_PKT_NUM; i++, hp++) { + shim_debug("\thost addr: [0x%x 0x%x]", hp->host_addr_high, hp->host_addr_low); + + volatile struct host_indirect_data *data = + reinterpret_cast(m_umq_indirect_buf); + shim_debug("\t\th:distribute:\t%d", data[i].header.distribute); + shim_debug("\t\th:indirect:\t%d", data[i].header.indirect); + shim_debug("\t\tp:cu_index:\t%d", data[i].payload.cu_index); + shim_debug("\t\tp:dpu: [0x%x 0x%x]", + data[i].payload.dpu_control_code_host_addr_high, + data[i].payload.dpu_control_code_host_addr_low); + } + } } + shim_debug("dump finished\r\n"); } void @@ -139,7 +192,7 @@ hw_q_umq:: dump_raw() const { auto d = reinterpret_cast(m_umq_pkt); - auto sz = get_header_ptr()->capacity * sizeof(host_queue_packet_t) / sizeof(uint32_t); + auto sz = get_header_ptr()->capacity * sizeof(struct host_queue_packet) / sizeof(uint32_t); shim_debug("Dumping raw UMQ queue slot data @%p, len=%ld WORDs:", m_umq_pkt, sz); for (int i = 0; i < sz; i++) shim_debug("0x%08x", d[i]); @@ -172,11 +225,18 @@ reserve_slot() return cur_slot; } -volatile host_queue_packet_t * +int +hw_q_umq:: +get_pkt_idx(uint64_t index) +{ + return index & (get_header_ptr()->capacity - 1); +} + +volatile struct host_queue_packet * hw_q_umq:: -get_slot(uint64_t index) +get_pkt(uint64_t index) { - auto pkt = &m_umq_pkt[index & (get_header_ptr()->capacity - 1)]; + auto pkt = &m_umq_pkt[get_pkt_idx(index)]; if (is_slot_valid(pkt)) { shim_err(EINVAL, "Slot is ready before use! 
index=0x%lx", index); dump(); @@ -188,26 +248,101 @@ uint64_t hw_q_umq:: issue_exec_buf(uint16_t cu_idx, ert_dpu_data *dpu, uint64_t comp) { - auto idx = reserve_slot(); - auto pkt = get_slot(idx); + auto slot_idx = reserve_slot(); + auto pkt = get_pkt(slot_idx); + size_t pkt_size; + + if (get_ert_dpu_data_next(dpu)) + pkt_size = fill_indirect_exec_buf(slot_idx, cu_idx, pkt, dpu); + else + pkt_size = fill_direct_exec_buf(cu_idx, pkt, dpu); + auto hdr = &pkt->xrt_header; hdr->common_header.opcode = HOST_QUEUE_PACKET_EXEC_BUF; - hdr->common_header.distribute = 0; - hdr->common_header.indirect = 0; hdr->completion_signal = comp; - exec_buf_t payload = {}; - payload.cu_index = cu_idx; - payload.dpu_control_code_host_addr_low = static_cast(dpu->instruction_buffer); - payload.dpu_control_code_host_addr_high = static_cast(dpu->instruction_buffer >> 32); + fill_slot_and_send(pkt, pkt_size); - fill_slot_and_send(pkt, &payload, sizeof(payload)); - return idx; + return slot_idx; +} + +size_t +hw_q_umq:: +fill_indirect_exec_buf(uint64_t slot_idx, uint16_t cu_idx, + volatile struct host_queue_packet *pkt, + ert_dpu_data *dpu) { + auto pkt_size = (dpu->chained + 1) * sizeof(struct host_indirect_packet_entry); + + if (dpu->chained + 1 >= HSA_INDIRECT_PKT_NUM) + shim_err(EINVAL, "unsupported indirect number %d, valid number <= %d", + dpu->chained + 1, HSA_INDIRECT_PKT_NUM); + + if (pkt_size > sizeof(pkt->data)) + shim_err(EINVAL, "dpu pkt_size=0x%lx > pkt_data max size=%x%lx", + pkt_size, sizeof(pkt->data)); + + // no need to memset to zero, all buffer will be set + volatile struct host_indirect_packet_entry *hp = + reinterpret_cast(pkt->data); + + for (int i = 0; dpu; i++, hp++, dpu = get_ert_dpu_data_next(dpu)) { + auto data_size = sizeof(struct host_indirect_data) * HSA_INDIRECT_PKT_NUM; + auto prefix_off = get_pkt_idx(slot_idx) * data_size; + auto prefix_idx = get_pkt_idx(slot_idx) * HSA_INDIRECT_PKT_NUM; + auto buf_paddr = m_indirect_paddr + prefix_off + + sizeof(struct host_indirect_data) * i; + + hp->host_addr_low = static_cast(buf_paddr); + hp->host_addr_high = static_cast(buf_paddr >> 32); + + auto cebp = &m_umq_indirect_buf[prefix_idx + i]; + // do not zero this buffer, the cebp->header is pre-set + // set every cebp->payload field in case of garbage data + cebp->payload.cu_index = cu_idx; + cebp->payload.dpu_control_code_host_addr_low = + static_cast(dpu->instruction_buffer); + cebp->payload.dpu_control_code_host_addr_high = + static_cast(dpu->instruction_buffer >> 32); + cebp->payload.args_len = 0; + cebp->payload.args_host_addr_low = 0; + cebp->payload.args_host_addr_high = 0; + } + + auto hdr = &pkt->xrt_header; + hdr->common_header.distribute = 1; + hdr->common_header.indirect = 1; + + return pkt_size; +} + +size_t +hw_q_umq:: +fill_direct_exec_buf(uint16_t cu_idx, volatile struct host_queue_packet *pkt, + ert_dpu_data *dpu) { + auto pkt_size = sizeof(struct exec_buf); + if (pkt_size > sizeof(pkt->data)) + shim_err(EINVAL, "dpu pkt_size=0x%lx > pkt_data max size=%x%lx", + pkt_size, sizeof(pkt->data)); + + // zero this buffer + auto data = const_cast(pkt->data); + std::memset(data, 0, pkt_size); + // set correct dpu control code + volatile struct exec_buf *ebp = reinterpret_cast(pkt->data); + ebp->cu_index = cu_idx; + ebp->dpu_control_code_host_addr_low = static_cast(dpu->instruction_buffer); + ebp->dpu_control_code_host_addr_high = static_cast(dpu->instruction_buffer >> 32); + + auto hdr = &pkt->xrt_header; + hdr->common_header.distribute = 0; + hdr->common_header.indirect = 0; + + return 
pkt_size; } void hw_q_umq:: -fill_slot_and_send(volatile host_queue_packet_t *pkt, void *payload, size_t size) +fill_slot_and_send(volatile struct host_queue_packet *pkt, size_t size) { if (size > sizeof(pkt->data)) shim_err(EINVAL, "HSA packet payload too big, size=0x%lx", size); @@ -215,10 +350,11 @@ fill_slot_and_send(volatile host_queue_packet_t *pkt, void *payload, size_t size auto hdr = &pkt->xrt_header; hdr->common_header.count = size; - auto data = const_cast(pkt->data); - std::memcpy(data, payload, size); /* must flush data to make cache coherence */ - clflush_data((void *)data, size); + clflush_data((void *)(pkt->data), size); + + //comment this out, debug only + //dump(); /* Always done as last step. */ mark_slot_valid(pkt); @@ -247,9 +383,9 @@ issue_command(xrt_core::buffer_handle *cmd_bo) } if (get_ert_dpu_data_next(dpu_data)) - shim_err(EOPNOTSUPP, "chained dpu data is not supported yet"); + shim_debug("this is a multi-column dpu request."); - // Completion signal area has to be a full WORD + // Completion signal area has to be a full WORD, we utilze the command_bo uint64_t comp = boh->get_properties().paddr + offsetof(ert_start_kernel_cmd, header); auto id = issue_exec_buf(ffs(cmd->cu_mask) - 1, dpu_data, comp); diff --git a/src/shim/umq/hwq.h b/src/shim/umq/hwq.h index 1d484569..8c8cc707 100644 --- a/src/shim/umq/hwq.h +++ b/src/shim/umq/hwq.h @@ -30,14 +30,22 @@ class hw_q_umq : public hw_q void bind_hwctx(const hw_ctx *ctx); - volatile host_queue_header_t * + volatile struct host_queue_header * get_header_ptr() const; private: + + struct host_indirect_data { + struct common_header header; + struct exec_buf payload; + }; + std::unique_ptr m_umq_bo; void *m_umq_bo_buf; - volatile host_queue_header_t *m_umq_hdr = nullptr; - volatile host_queue_packet_t *m_umq_pkt = nullptr; + volatile struct host_queue_header *m_umq_hdr = nullptr; + volatile struct host_queue_packet *m_umq_pkt = nullptr; + volatile struct host_indirect_data *m_umq_indirect_buf = nullptr; + uint64_t m_indirect_paddr; volatile uint32_t *m_mapped_doorbell = nullptr; @@ -46,11 +54,25 @@ class hw_q_umq : public hw_q uint64_t reserve_slot(); - volatile host_queue_packet_t * - get_slot(uint64_t index); + int + get_pkt_idx(uint64_t index); + + volatile struct host_queue_packet * + get_pkt(uint64_t index); + + void + init_indirect_buf(volatile struct host_indirect_data *indirect_buf, int size); + + size_t + fill_direct_exec_buf(uint16_t cu_idx, + volatile struct host_queue_packet *pkt, ert_dpu_data *dpu); + + size_t + fill_indirect_exec_buf(uint64_t idx, uint16_t cu_idx, + volatile struct host_queue_packet *pkt, ert_dpu_data *dpu); void - fill_slot_and_send(volatile host_queue_packet_t *pkt, void *payload, size_t size); + fill_slot_and_send(volatile struct host_queue_packet *pkt, size_t size); uint64_t issue_exec_buf(uint16_t cu_idx, ert_dpu_data *dpu_data, uint64_t comp); diff --git a/test/shim_test/io_config.h b/test/shim_test/io_config.h index 27478ef1..1d1351db 100644 --- a/test/shim_test/io_config.h +++ b/test/shim_test/io_config.h @@ -208,7 +208,7 @@ int verify_output(int8_t* buf, const std::string &wrk_path) ss >> key >> str_val; ss.clear(); golden_output_files.push_back(wrk_path + "golden_" + str_val + ".bin"); - dump_output_files.push_back(wrk_path + "dump_" + str_val + ".bin"); + dump_output_files.push_back("/tmp/dump_" + str_val + "." 
+ std::to_string(getpid()) + ".bin"); getline(myfile, line); ss.str(line); @@ -239,12 +239,15 @@ int verify_output(int8_t* buf, const std::string &wrk_path) int ret = 0; for (int i = 0; i < num_outputs; i++) { - std::cout << "Examing output: " << golden_output_files[i] << std::endl; ret = comp_buf_strides(buf + output_ddr_addr[i], golden_output_files[i], dump_output_files[i], output_shapes[i], output_strides[i]); if (ret) { std::cout << "Examing failed, ret " << ret << std::endl; + std::cout << "Examing output: " << dump_output_files[i] << std::endl; break; + } else { + if (std::remove(dump_output_files[i].c_str())) + std::cout << "Failed to remove " << dump_output_files[i] << std::endl; } } diff --git a/test/shim_test/io_param.h b/test/shim_test/io_param.h index b86446a6..452c1076 100644 --- a/test/shim_test/io_param.h +++ b/test/shim_test/io_param.h @@ -14,6 +14,9 @@ struct io_test_parameter { #define IO_TEST_NOOP_RUN 1 #define IO_TEST_BAD_RUN 2 int type; +#define IO_TEST_IOCTL_WAIT 0 +#define IO_TEST_POLL_WAIT 1 + int wait; bool debug; }; diff --git a/test/shim_test/io_test.cpp b/test/shim_test/io_test.cpp index 747e8be0..74df0346 100644 --- a/test/shim_test/io_test.cpp +++ b/test/shim_test/io_test.cpp @@ -19,10 +19,11 @@ namespace { io_test_parameter io_test_parameters; void -io_test_parameter_init(int perf, int type, bool debug = false) +io_test_parameter_init(int perf, int type, int wait, bool debug = false) { io_test_parameters.perf = perf; io_test_parameters.type = type; + io_test_parameters.wait = wait; io_test_parameters.debug = debug; } @@ -30,7 +31,7 @@ io_test_bo_set alloc_and_init_bo_set(device* dev, const std::string& local_data_path) { io_test_bo_set boset{dev, local_data_path}; - auto bos = boset.get_bos(); + auto& bos = boset.get_bos(); if (io_test_parameters.type == IO_TEST_NOOP_RUN) { // Preparing no-op kernel's special control code @@ -82,7 +83,15 @@ io_test_init_runlist_cmd(bo* cmd_bo, std::vector& cmd_bos) } } -#define IO_TEST_TIMEOUT 5000 /* millisecond */ +void io_test_cmd_wait(hwqueue_handle *hwq, std::shared_ptr bo) +{ + if (io_test_parameters.wait == IO_TEST_POLL_WAIT) { + while(!hwq->poll_command(bo->get())); + } else { + hwq->wait_command(bo->get(), 0); + } +} + void io_test_cmd_submit_and_wait_latency( hwqueue_handle *hwq, @@ -96,9 +105,10 @@ io_test_cmd_submit_and_wait_latency( while (completed < total_cmd_submission) { for (auto& cmd : cmdlist_bos) { hwq->submit_command(std::get<0>(cmd).get()->get()); - hwq->wait_command(std::get<0>(cmd).get()->get(), IO_TEST_TIMEOUT); + io_test_cmd_wait(hwq, std::get<0>(cmd)); if (std::get<1>(cmd)->state != ERT_CMD_STATE_COMPLETED) throw std::runtime_error("Command error"); + std::get<1>(cmd)->state = ERT_CMD_STATE_NEW; completed++; if (completed >= total_cmd_submission) break; @@ -125,9 +135,10 @@ io_test_cmd_submit_and_wait_thruput( } while (completed < issued) { - hwq->wait_command(std::get<0>(cmdlist_bos[wait_idx]).get()->get(), IO_TEST_TIMEOUT); + io_test_cmd_wait(hwq, std::get<0>(cmdlist_bos[wait_idx])); if (std::get<1>(cmdlist_bos[wait_idx])->state != ERT_CMD_STATE_COMPLETED) throw std::runtime_error("Command error"); + std::get<1>(cmdlist_bos[wait_idx])->state = ERT_CMD_STATE_NEW; completed++; if (issued < total_cmd_submission) { @@ -235,28 +246,63 @@ io_test(device::id_type id, device* dev, int total_hwq_submit, int num_cmdlist, void TEST_io(device::id_type id, std::shared_ptr sdev, arg_type& arg) { - io_test_parameter_init(IO_TEST_NO_PERF, static_cast(arg[0])); + unsigned int run_type = static_cast(arg[0]); + + 
io_test_parameter_init(IO_TEST_NO_PERF, run_type, IO_TEST_IOCTL_WAIT); io_test(id, sdev.get(), 1, 1, arg[1]); } void TEST_io_latency(device::id_type id, std::shared_ptr sdev, arg_type& arg) { - io_test_parameter_init(IO_TEST_LATENCY_PERF, static_cast(arg[0])); - io_test(id, sdev.get(), 1000, 1, 1); + unsigned int run_type = static_cast(arg[0]); + unsigned int wait_type = static_cast(arg[1]); + unsigned int total = static_cast(arg[2]); + + io_test_parameter_init(IO_TEST_LATENCY_PERF, run_type, wait_type); + io_test(id, sdev.get(), total, 1, 1); } void TEST_io_throughput(device::id_type id, std::shared_ptr sdev, arg_type& arg) { + unsigned int run_type = static_cast(arg[0]); + unsigned int wait_type = static_cast(arg[1]); + unsigned int total = static_cast(arg[2]); + + io_test_parameter_init(IO_TEST_THRUPUT_PERF, run_type, wait_type); + io_test(id, sdev.get(), total, 8, 1); +} + +void +TEST_io_runlist_latency(device::id_type id, std::shared_ptr sdev, arg_type& arg) +{ + unsigned int run_type = static_cast(arg[0]); + unsigned int wait_type = static_cast(arg[1]); + unsigned int total = static_cast(arg[2]); + const size_t max_cmd_per_list = 24; + + io_test_parameter_init(IO_TEST_LATENCY_PERF, run_type, wait_type); + for (int cmds_per_list = 1; cmds_per_list <=32; cmds_per_list *=2) { + if (cmds_per_list > max_cmd_per_list) + cmds_per_list = max_cmd_per_list; + int total_hwq_submit = total / cmds_per_list; + io_test(id, sdev.get(), total_hwq_submit, 1, cmds_per_list); + } +} + +void +TEST_io_runlist_throughput(device::id_type id, std::shared_ptr sdev, arg_type& arg) +{ + unsigned int run_type = static_cast(arg[0]); + unsigned int wait_type = static_cast(arg[1]); + unsigned int total_commands = static_cast(arg[2]); int num_bo_set = 256; - int total_commands = 32000; const size_t max_cmd_per_list = 24; - io_test_parameter_init(IO_TEST_THRUPUT_PERF, static_cast(arg[0])); + io_test_parameter_init(IO_TEST_THRUPUT_PERF, run_type, wait_type); - int cmds_per_list; - for (cmds_per_list = 1; cmds_per_list <= 32; cmds_per_list *= 2) { + for (int cmds_per_list = 1; cmds_per_list <= 32; cmds_per_list *= 2) { if (cmds_per_list > max_cmd_per_list) cmds_per_list = max_cmd_per_list; int num_cmdlist = num_bo_set / cmds_per_list; diff --git a/test/shim_test/shim_test.cpp b/test/shim_test/shim_test.cpp index 140d240c..b1c38aea 100644 --- a/test/shim_test/shim_test.cpp +++ b/test/shim_test/shim_test.cpp @@ -30,6 +30,8 @@ void TEST_export_import_bo(device::id_type, std::shared_ptr, arg_type&); void TEST_io(device::id_type, std::shared_ptr, arg_type&); void TEST_io_latency(device::id_type, std::shared_ptr, arg_type&); void TEST_io_throughput(device::id_type, std::shared_ptr, arg_type&); +void TEST_io_runlist_latency(device::id_type, std::shared_ptr, arg_type&); +void TEST_io_runlist_throughput(device::id_type, std::shared_ptr, arg_type&); void TEST_noop_io_with_dup_bo(device::id_type, std::shared_ptr, arg_type&); void TEST_shim_umq_vadd(device::id_type, std::shared_ptr, arg_type&); void TEST_shim_umq_memtiles(device::id_type, std::shared_ptr, arg_type&); @@ -519,10 +521,10 @@ std::vector test_list { TEST_POSITIVE, dev_filter_is_aie2, TEST_io, { IO_TEST_NORMAL_RUN, 1 } }, test_case{ "measure no-op kernel latency", - TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NOOP_RUN } + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NOOP_RUN, IO_TEST_IOCTL_WAIT, 32000 } }, test_case{ "measure real kernel latency", - TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NORMAL_RUN } + 
TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NORMAL_RUN, IO_TEST_IOCTL_WAIT, 32000 } }, test_case{ "create and free debug bo", TEST_POSITIVE, dev_filter_is_aie2, TEST_create_free_debug_bo, { 0x1000 } @@ -533,8 +535,8 @@ std::vector test_list { test_case{ "multi-command io test real kernel good run", TEST_POSITIVE, dev_filter_is_aie2, TEST_io, { IO_TEST_NORMAL_RUN, 3 } }, - test_case{ "measure no-op kernel throughput listed command", - TEST_POSITIVE, dev_filter_is_aie2, TEST_io_throughput, { IO_TEST_NOOP_RUN } + test_case{ "measure no-op kernel throughput chained command", + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_throughput, { IO_TEST_NOOP_RUN, IO_TEST_IOCTL_WAIT, 32000 } }, test_case{ "npu3 shim vadd", TEST_POSITIVE, dev_filter_is_aie4, TEST_shim_umq_vadd, {} @@ -560,9 +562,27 @@ std::vector test_list { //test_case{ "Cmd fencing (device side)", // TEST_POSITIVE, dev_filter_is_aie2, TEST_cmd_fence_device, {} //}, - //test_case{ "io test no op with duplicated BOs", - // TEST_POSITIVE, dev_filter_is_aie2, TEST_noop_io_with_dup_bo, {} - //}, + test_case{ "io test no op with duplicated BOs", + TEST_POSITIVE, dev_filter_is_aie2, TEST_noop_io_with_dup_bo, {} + }, + test_case{ "measure no-op kernel latency chained command", + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_latency, { IO_TEST_NOOP_RUN, IO_TEST_IOCTL_WAIT, 32000 } + }, + test_case{ "measure no-op kernel throuput", + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_throughput, { IO_TEST_NOOP_RUN, IO_TEST_IOCTL_WAIT, 32000 } + }, + test_case{ "measure no-op kernel latency (polling)", + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 } + }, + test_case{ "measure no-op kernel throuput (polling)", + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_throughput, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 } + }, + test_case{ "measure no-op kernel latency chained command (polling)", + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_latency, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 } + }, + test_case{ "measure no-op kernel throughput chained command (polling)", + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_throughput, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 } + }, }; } // namespace diff --git a/test/shim_test/speed.h b/test/shim_test/speed.h index 89d41fae..79702816 100644 --- a/test/shim_test/speed.h +++ b/test/shim_test/speed.h @@ -7,6 +7,7 @@ #include using clk = std::chrono::high_resolution_clock; +using ms_t = std::chrono::milliseconds; using us_t = std::chrono::microseconds; using ns_t = std::chrono::nanoseconds; diff --git a/tools/bins/17f0_20/validate.xclbin b/tools/bins/17f0_20/validate.xclbin new file mode 100644 index 00000000..9c66f31b Binary files /dev/null and b/tools/bins/17f0_20/validate.xclbin differ diff --git a/tools/info.json b/tools/info.json index c8f73197..c55558d5 100644 --- a/tools/info.json +++ b/tools/info.json @@ -1,21 +1,21 @@ { "copyright": "Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
All rights reserved.", "xrt" : { - "version": "202420.2.18.101", + "version": "202420.2.18.134", "os_rel": "22.04" }, "firmwares": [ { "device": "npu1", - "url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/1502_00/npu.sbin.1.4.2.313", + "url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/1502_00/npu.sbin.1.4.2.329", "pci_device_id": "1502", "pci_revision_id": "00", - "version": "1.4.2.313", + "version": "1.4.2.329", "fw_name": "npu.sbin" }, { "device": "npu2", - "url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/17f0_00/npu.sbin.0.7.22.185", + "url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/17f0_00/npu.sbin.0.7.22.185", "pci_device_id": "17f0", "pci_revision_id": "00", "version": "0.7.22.185", @@ -23,26 +23,18 @@ }, { "device": "npu4", - "url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/17f0_10/npu.sbin.0.7.30.20", + "url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/17f0_10/npu.sbin.0.7.35.35", "pci_device_id": "17f0", "pci_revision_id": "10", - "version": "0.7.30.20", + "version": "0.7.35.35", "fw_name": "npu.sbin" }, { "device": "npu5", - "url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/17f0_11/npu.sbin.0.7.30.101", + "url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/17f0_11/npu.sbin.0.7.35.139", "pci_device_id": "17f0", "pci_revision_id": "11", - "version": "0.7.30.101", - "fw_name": "npu.sbin" - }, - { - "device": "npu6", - "url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/17f0_20/npu.sbin.0.7.30.20", - "pci_device_id": "17f0", - "pci_revision_id": "20", - "version": "0.7.30.20", + "version": "0.7.35.139", "fw_name": "npu.sbin" } ] diff --git a/xrt b/xrt index 476f42f4..64d03f56 160000 --- a/xrt +++ b/xrt @@ -1 +1 @@ -Subproject commit 476f42f419bbc5d1545aded3627f03c1c2f1336e +Subproject commit 64d03f567db628c9107b6fcf5d362668d1834567
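
A possible invocation of the two perf helper scripts added by this patch, sketched only from their usage() text and option parsing above; the application name and the first event pattern are illustrative placeholders, not names defined by the patch:

# Record NPU perf events system-wide while running an application (run as root).
# The trace is post-processed into perf.converted.out in the current directory.
sudo ./npu_perf_trace.sh -l /opt/xilinx/xrt/lib ./my_npu_app

# Report average/largest/smallest time from event1 to event2 over trace entries 100..200.
# The second pattern is taken from the analyze script's usage() text; the first is only
# an assumed example of a matching sdt_xrt probe.
./npu_perf_analyze.sh -f perf.converted.out -r 100:200 \
    "sdt_xrt:ioctl_entry: \(.+\) arg1=DRM_IOCTL_AMDXDNA_EXEC_CMD" \
    "sdt_xrt:ioctl_exit: \(.+\) arg1=DRM_IOCTL_AMDXDNA_WAIT_CMD"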