diff --git a/README.md b/README.md index 231ecb7e..13a8c4b0 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ This repository is for the AMD XDNA™️ Driver (amdxdna.ko) for Linux®️ and - [System Requirements](#system-requirements) - [Linux compilation and installation](#linux-compilation-and-installation) - [Clone](#clone) + - [Build](#build) - [Test](#test) - [Q&A](#qa) @@ -88,7 +89,7 @@ cd /build cd xrt/build ./build.sh -noert -noalveo # To adapt according to your OS & version -sudo apt reinstall ./Release/xrt_202410.2.17.0_23.10-amd64-xrt.deb ./Release/xrt_202410.2.17.0_23.10-amd64-xbflash2.deb +sudo apt reinstall ./Release/xrt_202410.2.17.0_23.10-amd64-xrt.deb cd ../../build # Start XDNA driver release build diff --git a/WHENCE b/WHENCE index 46fe77ea..800b3567 100644 --- a/WHENCE +++ b/WHENCE @@ -11,5 +11,6 @@ File: tools/bins/1502_00/validate.xclbin tools/bins/17f0_10/validate.xclbin tools/bins/17f0_11/validate.xclbin + tools/bins/17f0_20/validate.xclbin Licence: Redistributable. See LICENSE.amdnpu for details. diff --git a/src/driver/CMakeLists.txt b/src/driver/CMakeLists.txt index 1687d274..ad33fb76 100644 --- a/src/driver/CMakeLists.txt +++ b/src/driver/CMakeLists.txt @@ -92,6 +92,8 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/dkms.conf set(amdxdna_drv_tools ${CMAKE_CURRENT_SOURCE_DIR}/tools/dkms_driver.sh + ${CMAKE_CURRENT_SOURCE_DIR}/tools/npu_perf_trace.sh + ${CMAKE_CURRENT_SOURCE_DIR}/tools/npu_perf_analyze.sh ) install(FILES ${amdxdna_drv_tools} PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE diff --git a/src/driver/amdxdna/aie2_ctx.c b/src/driver/amdxdna/aie2_ctx.c index c630f048..1cab054c 100644 --- a/src/driver/amdxdna/aie2_ctx.c +++ b/src/driver/amdxdna/aie2_ctx.c @@ -47,12 +47,6 @@ aie2_hwctx_get_job(struct amdxdna_hwctx *hwctx, u64 seq) { int idx; - /* Special sequence number for oldest fence if exist */ - if (seq == AMDXDNA_INVALID_CMD_HANDLE) { - idx = get_job_idx(hwctx->submitted); - goto out; - } - if (seq >= hwctx->submitted) return ERR_PTR(-EINVAL); @@ -60,8 +54,6 @@ aie2_hwctx_get_job(struct amdxdna_hwctx *hwctx, u64 seq) return NULL; idx = get_job_idx(seq); - -out: return hwctx->priv->pending[idx]; } @@ -230,8 +222,8 @@ aie2_sched_notify(struct amdxdna_sched_job *job) struct dma_fence *fence = job->fence; job->hwctx->completed++; + trace_xdna_job(&job->base, job->hwctx->name, "signaling fence", job->seq); dma_fence_signal(fence); - trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", job->seq); dma_fence_put(fence); mmput(job->mm); amdxdna_job_put(job); @@ -257,7 +249,7 @@ aie2_sched_resp_handler(void *handle, const u32 *data, size_t size) } status = *data; - XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status); + XDNA_DBG(job->hwctx->client->xdna, "Response status 0x%x", status); if (status == AIE2_STATUS_SUCCESS) amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED); else @@ -284,7 +276,7 @@ aie2_sched_nocmd_resp_handler(void *handle, const u32 *data, size_t size) } status = *data; - XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status); + XDNA_DBG(job->hwctx->client->xdna, "Response status 0x%x", status); out: aie2_sched_notify(job); @@ -540,6 +532,7 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) struct drm_gpu_scheduler *sched; struct amdxdna_hwctx_priv *priv; struct amdxdna_gem_obj *heap; + unsigned int wq_flags; int i, ret; priv = kzalloc(sizeof(*hwctx->priv), GFP_KERNEL); @@ -587,12 +580,21 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) sched = &priv->sched; 
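	/*
	 * Note on the submit workqueue introduced just below: it is created
	 * with __WQ_ORDERED and max_active == 1, so jobs handed to the DRM
	 * scheduler are processed strictly in submission order.  WQ_UNBOUND
	 * is added only outside turbo mode; in turbo mode the queue stays
	 * bound, presumably to keep submission work on the local CPU.
	 */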
mutex_init(&priv->io_lock); - ret = drm_sched_init(sched, &sched_ops, NULL, DRM_SCHED_PRIORITY_COUNT, + + wq_flags = __WQ_ORDERED; + if (!aie2_pm_is_turbo(xdna->dev_handle)) + wq_flags |= WQ_UNBOUND; + priv->submit_wq = alloc_workqueue(hwctx->name, wq_flags, 1); + if (!priv->submit_wq) { + XDNA_ERR(xdna, "Failed to alloc submit wq"); + goto free_cmd_bufs; + } + ret = drm_sched_init(sched, &sched_ops, priv->submit_wq, DRM_SCHED_PRIORITY_COUNT, HWCTX_MAX_CMDS, 0, MAX_SCHEDULE_TIMEOUT, NULL, NULL, hwctx->name, xdna->ddev.dev); if (ret) { XDNA_ERR(xdna, "Failed to init DRM scheduler. ret %d", ret); - goto free_cmd_bufs; + goto free_wq; } ret = drm_sched_entity_init(&priv->entity, DRM_SCHED_PRIORITY_NORMAL, @@ -645,6 +647,8 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) drm_sched_entity_destroy(&priv->entity); free_sched: drm_sched_fini(&priv->sched); +free_wq: + destroy_workqueue(priv->submit_wq); free_cmd_bufs: for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) { if (!priv->cmd_buf[i]) @@ -681,6 +685,7 @@ void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx) aie2_hwctx_wait_for_idle(hwctx); drm_sched_entity_destroy(&hwctx->priv->entity); drm_sched_fini(&hwctx->priv->sched); + destroy_workqueue(hwctx->priv->submit_wq); for (idx = 0; idx < HWCTX_MAX_CMDS; idx++) { job = hwctx->priv->pending[idx]; @@ -928,7 +933,7 @@ int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, job->out_fence = dma_fence_get(&job->base.s_fence->finished); retry: - ret = drm_gem_lock_reservations(job->bos, job->bo_cnt, &acquire_ctx); + ret = amdxdna_lock_objects(job, &acquire_ctx); if (ret) { XDNA_WARN(xdna, "Failed to reverve fence, ret %d", ret); goto put_fence; @@ -937,7 +942,7 @@ int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, for (i = 0; i < job->bo_cnt; i++) { abo = to_xdna_obj(job->bos[i]); if (abo->mem.map_invalid) { - drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx); + amdxdna_unlock_objects(job, &acquire_ctx); if (!timeout) { timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); @@ -955,19 +960,27 @@ int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, ret = dma_resv_reserve_fences(job->bos[i]->resv, 1); if (ret) { XDNA_WARN(xdna, "Failed to reserve fences %d", ret); - drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx); + amdxdna_unlock_objects(job, &acquire_ctx); goto put_fence; } } for (i = 0; i < job->bo_cnt; i++) dma_resv_add_fence(job->bos[i]->resv, job->out_fence, DMA_RESV_USAGE_WRITE); - drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx); + amdxdna_unlock_objects(job, &acquire_ctx); +again: mutex_lock(&hwctx->priv->io_lock); ret = aie2_hwctx_add_job(hwctx, job); if (ret) { mutex_unlock(&hwctx->priv->io_lock); + + if (ret == -EAGAIN) { + // Waiting for the first pending cmd to complete before trying again. 
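			// With a ring of HWCTX_MAX_CMDS pending slots, the
			// oldest in-flight command carries sequence number
			// hwctx->submitted - HWCTX_MAX_CMDS; once that command
			// signals, a slot frees up and the add is retried.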
+ int res = aie2_cmd_wait(hwctx, hwctx->submitted - HWCTX_MAX_CMDS, 0); + if (!res) + goto again; + } goto signal_fence; } diff --git a/src/driver/amdxdna/aie2_debugfs.c b/src/driver/amdxdna/aie2_debugfs.c index ea5105f7..4023e6b2 100644 --- a/src/driver/amdxdna/aie2_debugfs.c +++ b/src/driver/amdxdna/aie2_debugfs.c @@ -77,71 +77,6 @@ static int aie2_dbgfs_entry_release(struct inode *inode, struct file *file) #define file_to_ndev_rw(file) \ (((struct seq_file *)(file)->private_data)->private) -static ssize_t -aie2_dbgfs_clock_write(struct amdxdna_dev_hdl *ndev, struct clock *clock, - const char __user *ptr, size_t len, loff_t *off) -{ - u32 val; - int ret; - - ret = kstrtouint_from_user(ptr, len, 10, &val); - if (ret) { - XDNA_ERR(ndev->xdna, "Invalid input value: %d", val); - return ret; - } - - clock->dbg_freq_mhz = val; - if (!clock->dbg_freq_mhz) { - XDNA_INFO(ndev->xdna, "Auto %s", clock->name); - return 0; - } - - ret = aie2_smu_set_clock_freq(ndev, clock, val); - if (ret) { - clock->dbg_freq_mhz = 0; - XDNA_ERR(ndev->xdna, "Set %s ret %d, use auto clock", clock->name, ret); - return ret; - } - - return len; -} - -static ssize_t aie2_dbgfs_mpnpu_clock_write(struct file *file, const char __user *ptr, - size_t len, loff_t *off) -{ - struct amdxdna_dev_hdl *ndev = file_to_ndev_rw(file); - - return aie2_dbgfs_clock_write(ndev, &ndev->smu.mp_npu_clock, ptr, len, off); -} - -static int aie2_dbgfs_mpnpu_clock_show(struct seq_file *m, void *unused) -{ - struct amdxdna_dev_hdl *ndev = m->private; - - seq_printf(m, "%d\n", aie2_smu_get_mpnpu_clock_freq(ndev)); - return 0; -} - -AIE2_DBGFS_FOPS(npuclock, aie2_dbgfs_mpnpu_clock_show, aie2_dbgfs_mpnpu_clock_write); - -static ssize_t aie2_dbgfs_hclock_write(struct file *file, const char __user *ptr, - size_t len, loff_t *off) -{ - struct amdxdna_dev_hdl *ndev = file_to_ndev_rw(file); - - return aie2_dbgfs_clock_write(ndev, &ndev->smu.h_clock, ptr, len, off); -} - -static int aie2_dbgfs_hclock_show(struct seq_file *m, void *unused) -{ - struct amdxdna_dev_hdl *ndev = m->private; - - seq_printf(m, "%d\n", aie2_smu_get_hclock_freq(ndev)); - return 0; -} - -AIE2_DBGFS_FOPS(hclock, aie2_dbgfs_hclock_show, aie2_dbgfs_hclock_write); - static ssize_t aie2_pasid_write(struct file *file, const char __user *ptr, size_t len, loff_t *off) { @@ -291,7 +226,7 @@ static ssize_t aie2_dpm_level_set(struct file *file, const char __user *ptr, return ret; } - ret = aie2_smu_set_dpm_level(ndev, val, true); + ret = aie2_smu_set_dpm_level(ndev, val); if (ret) { XDNA_ERR(ndev->xdna, "Setting dpm_level:%d failed, ret: %d", val, ret); return ret; @@ -302,8 +237,24 @@ static ssize_t aie2_dpm_level_set(struct file *file, const char __user *ptr, static int aie2_dpm_level_get(struct seq_file *m, void *unused) { struct amdxdna_dev_hdl *ndev = m->private; + const struct dpm_clk *dpm_table; + u32 num_dpm_levels; + int dpm_level; + int i; - seq_printf(m, "%d\n", aie2_smu_get_dpm_level(ndev)); + dpm_table = SMU_DPM_TABLE_ENTRY(ndev, 0); + dpm_level = aie2_smu_get_dpm_level(ndev); + num_dpm_levels = SMU_DPM_MAX(ndev); + for (i = 0; i <= num_dpm_levels; i++) { + u32 npuclk = dpm_table[i].npuclk; + u32 hclk = dpm_table[i].hclk; + + if (dpm_level == i) + seq_printf(m, " [%d,%d] ", npuclk, hclk); + else + seq_printf(m, " %d,%d ", npuclk, hclk); + } + seq_puts(m, "\n"); return 0; } @@ -511,8 +462,6 @@ seq_printf(m, "%ld:%s\n", _name, #_name) drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_WAIT_CMD); drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_GET_INFO); 
drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_SET_STATE); - drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL); - drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_SUBMIT_WAIT); drm_ioctl_id_seq_print(DRM_IOCTL_GEM_CLOSE); drm_ioctl_id_seq_print(DRM_IOCTL_PRIME_HANDLE_TO_FD); @@ -609,8 +558,6 @@ const struct { umode_t mode; } aie2_dbgfs_files[] = { AIE2_DBGFS_FILE(nputest, 0600), - AIE2_DBGFS_FILE(hclock, 0600), - AIE2_DBGFS_FILE(npuclock, 0600), AIE2_DBGFS_FILE(pasid, 0600), AIE2_DBGFS_FILE(state, 0600), AIE2_DBGFS_FILE(powerstate, 0600), diff --git a/src/driver/amdxdna/aie2_message.c b/src/driver/amdxdna/aie2_message.c index 66b3299a..e1ccd9af 100644 --- a/src/driver/amdxdna/aie2_message.c +++ b/src/driver/amdxdna/aie2_message.c @@ -16,8 +16,13 @@ #define DECLARE_AIE2_MSG(name, op) \ DECLARE_XDNA_MSG_COMMON(name, op, MAX_AIE2_STATUS_CODE) -static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev, - struct xdna_mailbox_msg *msg) +#define aie2_send_mgmt_msg_wait(ndev, msg) \ + aie2_send_mgmt_msg_wait_offset(ndev, msg, 0) + +static int +aie2_send_mgmt_msg_wait_offset(struct amdxdna_dev_hdl *ndev, + struct xdna_mailbox_msg *msg, + u32 offset) { struct amdxdna_dev *xdna = ndev->xdna; struct xdna_notify *hdl = msg->handle; @@ -34,7 +39,7 @@ static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev, ndev->mgmt_chann = NULL; } - if (!ret && *hdl->data != AIE2_STATUS_SUCCESS) { + if (!ret && hdl->data[offset] != AIE2_STATUS_SUCCESS) { XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x", msg->opcode, *hdl->data); ret = -EINVAL; @@ -95,18 +100,9 @@ int aie2_check_protocol_version(struct amdxdna_dev_hdl *ndev) return ret; } - if (resp.major != ndev->priv->protocol_major) { - XDNA_ERR(xdna, "Incompatible firmware protocol version major %d minor %d", - resp.major, resp.minor); - return -EINVAL; - } - - /* - * Greater protocol minor version means new messages/status/emun are - * added into the firmware interface protocol. - */ - if (resp.minor < ndev->priv->protocol_minor) { - XDNA_ERR(xdna, "Firmware minor version smaller than supported"); + ret = aie2_check_protocol(ndev, resp.major, resp.minor); + if (ret) { + XDNA_ERR(xdna, "Failed check protocol %d.%d", resp.major, resp.minor); return -EINVAL; } @@ -114,36 +110,6 @@ int aie2_check_protocol_version(struct amdxdna_dev_hdl *ndev) } #ifdef AMDXDNA_DEVEL -/* TODO: Delete this. 
move status to the first word of struct get_telemetry_resp */ -static int aie2_send_mgmt_msg_wait_for_telemetry(struct amdxdna_dev_hdl *ndev, - struct xdna_mailbox_msg *msg) -{ - struct amdxdna_dev *xdna = ndev->xdna; - struct xdna_notify *hdl = msg->handle; - struct get_telemetry_resp *resp; - int ret; - - if (!ndev->mgmt_chann) - return -ENODEV; - - drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); - ret = xdna_send_msg_wait(xdna, ndev->mgmt_chann, msg); - if (ret == -ETIME) { - xdna_mailbox_stop_channel(ndev->mgmt_chann); - xdna_mailbox_destroy_channel(ndev->mgmt_chann); - ndev->mgmt_chann = NULL; - } - - resp = (struct get_telemetry_resp *)hdl->data; - if (!ret && resp->status != AIE2_STATUS_SUCCESS) { - XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x", - msg->opcode, resp->status); - ret = -EINVAL; - } - - return ret; -} - int aie2_get_telemetry(struct amdxdna_dev_hdl *ndev, u32 type, dma_addr_t addr, u32 size) { DECLARE_AIE2_MSG(get_telemetry, MSG_OP_GET_TELEMETRY); @@ -159,7 +125,7 @@ int aie2_get_telemetry(struct amdxdna_dev_hdl *ndev, u32 type, dma_addr_t addr, req.buf_size = size; req.type = type; - ret = aie2_send_mgmt_msg_wait_for_telemetry(ndev, &msg); + ret = aie2_send_mgmt_msg_wait_offset(ndev, &msg, XDNA_STATUS_OFFSET(get_telemetry)); if (ret) { XDNA_ERR(xdna, "Failed to get telemetry, ret %d", ret); return ret; @@ -258,6 +224,7 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct { DECLARE_AIE2_MSG(create_ctx, MSG_OP_CREATE_CONTEXT); struct amdxdna_dev *xdna = ndev->xdna; + enum xdna_mailbox_channel_type type; struct xdna_mailbox_chann_res x2i; struct xdna_mailbox_chann_res i2x; struct cq_pair *cq_pair; @@ -296,8 +263,12 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct } intr_reg = i2x.mb_head_ptr_reg + 4; + if (aie2_pm_is_turbo(ndev)) + type = MB_CHANNEL_USER_POLL; + else + type = MB_CHANNEL_USER_NORMAL; hwctx->priv->mbox_chann = xdna_mailbox_create_channel(ndev->mbox, &x2i, &i2x, - intr_reg, ret); + intr_reg, ret, type); if (!hwctx->priv->mbox_chann) { XDNA_ERR(xdna, "not able to create channel"); ret = -EINVAL; diff --git a/src/driver/amdxdna/aie2_msg_priv.h b/src/driver/amdxdna/aie2_msg_priv.h index 663a6084..2d18ef63 100644 --- a/src/driver/amdxdna/aie2_msg_priv.h +++ b/src/driver/amdxdna/aie2_msg_priv.h @@ -186,7 +186,6 @@ struct exec_dpu_req { u32 inst_prop_cnt; u32 cu_idx; u32 payload[35]; - } __packed; struct exec_dpu_preempt_req { diff --git a/src/driver/amdxdna/aie2_pci.c b/src/driver/amdxdna/aie2_pci.c index 05bfe03e..2825710c 100644 --- a/src/driver/amdxdna/aie2_pci.c +++ b/src/driver/amdxdna/aie2_pci.c @@ -21,15 +21,31 @@ #include "aie2_internal.h" #endif -int aie2_max_col = XRS_MAX_COL; -module_param(aie2_max_col, int, 0600); +uint aie2_max_col = XRS_MAX_COL; +module_param(aie2_max_col, uint, 0600); MODULE_PARM_DESC(aie2_max_col, "Maximum column could be used"); +uint aie2_control_flags; +module_param(aie2_control_flags, uint, 0400); +MODULE_PARM_DESC(aie2_control_flags, + " Bit " __stringify(AIE2_BIT_BYPASS_POWER_SWITCH) ": Bypass power on/off," + " Bit " __stringify(AIE2_BIT_BYPASS_SET_FREQ) ": Bypass set freq," + " Bit " __stringify(AIE2_BIT_BYPASS_FW_LOAD) ": Bypass FW loading"); + /* * The management mailbox channel is allocated by firmware. * The related register and ring buffer information is on SRAM BAR. * This struct is the register layout. + * + * Mgmt channel info query flow: + * 1. Poll alive pointer register until it is non zero + * 2. 
The alive pointer pointing to Mgmt Mbox Info on SRAM bar + * 4. Read x2i_* and i2x_* + * 3. If magic number MGMT_MBOX_MAGIC not presented, done; + * Otherwise, read msi_id, major, minor etc.. */ +#define MGMT_MBOX_MAGIC 0x55504e5f /* _NPU */ +#define MAGIC_OFFSET offsetof(struct mgmt_mbox_chann_info, magic[0]) struct mgmt_mbox_chann_info { u32 x2i_tail; u32 x2i_head; @@ -39,8 +55,45 @@ struct mgmt_mbox_chann_info { u32 i2x_head; u32 i2x_buf; u32 i2x_buf_sz; + u32 magic; + u32 msi_id; + u32 prot_major; + u32 prot_minor; + u32 rsvd[4]; }; +int aie2_check_protocol(struct amdxdna_dev_hdl *ndev, u32 fw_major, u32 fw_minor) +{ + struct amdxdna_dev *xdna = ndev->xdna; + + /* + * The driver supported mailbox behavior is defined by + * ndev->priv->protocol_major and protocol_minor. + * + * When major different, it means incompatible behavior. + * When only minor different, the greater minor means more opcode etc. + * + * Thus, + * 1. driver and fw major must be the same + * 2. driver minor must smaller than or equal to fw minor + */ + if (ndev->priv->protocol_major != fw_major) { + XDNA_ERR(xdna, "Incompatible firmware protocol major %d minor %d", + fw_major, fw_minor); + return -EINVAL; + } + + /* + * Greater protocol minor version means new messages/status/emun are + * added into the firmware interface protocol. + */ + if (ndev->priv->protocol_minor > fw_minor) { + XDNA_ERR(xdna, "Firmware minor version smaller than supported"); + return -EINVAL; + } + return 0; +} + static inline void aie2_dump_chann_info_debug(struct amdxdna_dev_hdl *ndev) { struct amdxdna_dev *xdna = ndev->xdna; @@ -54,6 +107,11 @@ static inline void aie2_dump_chann_info_debug(struct amdxdna_dev_hdl *ndev) XDNA_DBG(xdna, "x2i ringbuf 0x%x", ndev->mgmt_x2i.rb_start_addr); XDNA_DBG(xdna, "x2i rsize 0x%x", ndev->mgmt_x2i.rb_size); XDNA_DBG(xdna, "x2i chann index 0x%x", ndev->mgmt_chan_idx); + if (!ndev->mgmt_prot_major) + return; + + XDNA_DBG(xdna, "mailbox protocol major 0x%x", ndev->mgmt_prot_major); + XDNA_DBG(xdna, "mailbox protocol minor 0x%x", ndev->mgmt_prot_minor); } static int aie2_get_mgmt_chann_info(struct amdxdna_dev_hdl *ndev) @@ -96,14 +154,25 @@ static int aie2_get_mgmt_chann_info(struct amdxdna_dev_hdl *ndev) x2i->mb_tail_ptr_reg = AIE2_MBOX_OFF(ndev, info_regs.x2i_tail); x2i->rb_start_addr = AIE2_SRAM_OFF(ndev, info_regs.x2i_buf); x2i->rb_size = info_regs.x2i_buf_sz; - ndev->mgmt_chan_idx = CHANN_INDEX(ndev, x2i->rb_start_addr); + if (info_regs.magic != MGMT_MBOX_MAGIC) { + ndev->mgmt_chan_idx = CHANN_INDEX(ndev, x2i->rb_start_addr); + goto done; + } + + ndev->mgmt_chan_idx = info_regs.msi_id; + ndev->mgmt_prot_major = info_regs.prot_major; + ndev->mgmt_prot_minor = info_regs.prot_minor; + if (aie2_check_protocol(ndev, ndev->mgmt_prot_major, ndev->mgmt_prot_minor)) + ret = -EINVAL; + +done: aie2_dump_chann_info_debug(ndev); /* Must clear address at FW_ALIVE_OFF */ writel(0, SRAM_GET_ADDR(ndev, FW_ALIVE_OFF)); - return 0; + return ret; } static int aie2_runtime_cfg(struct amdxdna_dev_hdl *ndev) @@ -165,10 +234,12 @@ static int aie2_mgmt_fw_init(struct amdxdna_dev_hdl *ndev) { int ret; - ret = aie2_check_protocol_version(ndev); - if (ret) { - XDNA_ERR(ndev->xdna, "Check header hash failed"); - return ret; + if (!ndev->mgmt_prot_major) { + ret = aie2_check_protocol_version(ndev); + if (ret) { + XDNA_ERR(ndev->xdna, "Check protocol version failed"); + return ret; + } } ret = aie2_runtime_cfg(ndev); @@ -242,7 +313,7 @@ static int aie2_set_dpm_level(void *cb_arg, u32 dpm_level) xdna = hwctx->client->xdna; - ret = 
aie2_smu_set_dpm_level(xdna->dev_handle, dpm_level, true); + ret = aie2_smu_set_dpm_level(xdna->dev_handle, dpm_level); if (ret) XDNA_ERR(xdna, "set dpm level failed, ret %d", ret); @@ -297,8 +368,10 @@ static void aie2_hw_stop(struct amdxdna_dev *xdna) xdna_mailbox_stop_channel(ndev->mgmt_chann); xdna_mailbox_destroy_channel(ndev->mgmt_chann); ndev->mgmt_chann = NULL; - xdna_mailbox_destroy(ndev->mbox); - ndev->mbox = NULL; + if (ndev->mbox) { + xdna_mailbox_destroy(ndev->mbox); + ndev->mbox = NULL; + } aie2_psp_stop(ndev->psp_hdl); aie2_smu_stop(ndev); pci_clear_master(pdev); @@ -334,7 +407,7 @@ static int aie2_hw_start(struct amdxdna_dev *xdna) ret = aie2_get_mgmt_chann_info(ndev); if (ret) { - XDNA_ERR(xdna, "firmware is not alive"); + XDNA_ERR(xdna, "firmware mgmt info ret %d", ret); goto stop_psp; } @@ -362,7 +435,7 @@ static int aie2_hw_start(struct amdxdna_dev *xdna) &ndev->mgmt_x2i, &ndev->mgmt_i2x, xdna_mailbox_intr_reg, - mgmt_mb_irq); + mgmt_mb_irq, MB_CHANNEL_MGMT); if (!ndev->mgmt_chann) { XDNA_ERR(xdna, "failed to create management mailbox channel"); ret = -EINVAL; @@ -411,6 +484,7 @@ static int aie2_init(struct amdxdna_dev *xdna) void __iomem * const *tbl; int i, bars, nvec, ret; + XDNA_DBG(xdna, "Control flags 0x%x", aie2_control_flags); ndev = devm_kzalloc(&pdev->dev, sizeof(*ndev), GFP_KERNEL); if (!ndev) return -ENOMEM; @@ -510,6 +584,7 @@ static int aie2_init(struct amdxdna_dev *xdna) aie2_smu_setup(ndev); ndev->pw_mode = POWER_MODE_DEFAULT; + ndev->clk_gate_enabled = true; ret = aie2_hw_start(xdna); if (ret) { XDNA_ERR(xdna, "start npu failed, ret %d", ret); @@ -523,6 +598,7 @@ static int aie2_init(struct amdxdna_dev *xdna) } ndev->total_col = min(aie2_max_col, ndev->metadata.cols); + xrs_cfg.max_dpm_level = SMU_DPM_MAX(ndev); xrs_cfg.clk_list.num_levels = ndev->priv->smu_npu_dpm_levels; xrs_cfg.clk_list.cu_clk_list = ndev->priv->smu_npu_dpm_clk_table; xrs_cfg.sys_eff_factor = 1; @@ -722,7 +798,7 @@ static int aie2_get_firmware_version(struct amdxdna_client *client, static int aie2_get_power_mode(struct amdxdna_client *client, struct amdxdna_drm_get_info *args) { - struct amdxdna_drm_get_power_mode mode; + struct amdxdna_drm_get_power_mode mode = {}; struct amdxdna_dev *xdna = client->xdna; struct amdxdna_dev_hdl *ndev; @@ -919,9 +995,8 @@ static int aie2_set_power_mode(struct amdxdna_client *client, struct amdxdna_drm return -EFAULT; } - /* Interpret the given buf->power_mode into the correct power mode*/ power_mode = power_state.power_mode; - if (power_mode > POWER_MODE_HIGH) { + if (power_mode > POWER_MODE_TURBO) { XDNA_ERR(xdna, "Invalid power mode %d", power_mode); return -EINVAL; } diff --git a/src/driver/amdxdna/aie2_pci.h b/src/driver/amdxdna/aie2_pci.h index b00e93ea..51cf66c9 100644 --- a/src/driver/amdxdna/aie2_pci.h +++ b/src/driver/amdxdna/aie2_pci.h @@ -67,15 +67,10 @@ pci_resource_len(NDEV2PDEV(_ndev), (_ndev)->xdna->dev_info->mbox_bar); \ }) -#define SMU_MPNPUCLK_FREQ_MAX(ndev) \ - ((ndev)->priv->smu_mpnpuclk_freq_max) -#define SMU_HCLK_FREQ_MAX(ndev) \ - ((ndev)->priv->smu_hclk_freq_max) #define SMU_DPM_MAX(ndev) \ - ((ndev)->priv->smu_dpm_max) - -#define SMU_NPU_DPM_TABLE_ENTRY(ndev, level) \ - (&ndev->priv->smu_npu_dpm_clk_table[level]) + ((ndev)->smu.num_dpm_levels - 1) +#define SMU_DPM_TABLE_ENTRY(ndev, level) \ + (&(ndev)->smu.dpm_table[level]) enum aie2_smu_reg_idx { SMU_CMD_REG = 0, @@ -154,12 +149,11 @@ struct clock { char name[16]; u32 max_freq_mhz; u32 freq_mhz; -#if defined(CONFIG_DEBUG_FS) - u32 dbg_freq_mhz; -#endif }; struct smu { + 
const struct dpm_clk *dpm_table; + u32 num_dpm_levels; struct clock mp_npu_clock; struct clock h_clock; u32 curr_dpm_level; @@ -199,6 +193,7 @@ struct amdxdna_hwctx_priv { u32 num_pending; struct amdxdna_gem_obj *cmd_buf[HWCTX_MAX_CMDS]; + struct workqueue_struct *submit_wq; }; struct async_events; @@ -214,6 +209,8 @@ struct amdxdna_dev_hdl { struct xdna_mailbox_chann_res mgmt_x2i; struct xdna_mailbox_chann_res mgmt_i2x; u32 mgmt_chan_idx; + u32 mgmt_prot_major; + u32 mgmt_prot_minor; u32 total_col; u32 smu_curr_dpm_level; @@ -221,6 +218,7 @@ struct amdxdna_dev_hdl { struct aie_metadata metadata; struct smu smu; enum amdxdna_power_mode_type pw_mode; + bool clk_gate_enabled; /* Mailbox and the management channel */ struct mailbox *mbox; @@ -265,10 +263,6 @@ struct amdxdna_dev_priv { struct aie2_bar_off_pair psp_regs_off[PSP_MAX_REGS]; struct aie2_bar_off_pair smu_regs_off[SMU_MAX_REGS]; struct rt_config_clk_gating clk_gating; - u32 smu_mpnpuclk_freq_max; - u32 smu_hclk_freq_max; - /* npu1: 0, not support dpm; npu2+: support dpm up to 7 */ - u32 smu_dpm_max; u32 smu_rev; const struct dpm_clk *smu_npu_dpm_clk_table; u32 smu_npu_dpm_levels; @@ -278,13 +272,17 @@ struct amdxdna_dev_priv { }; /* aie2_pci.c */ +#define AIE2_BIT_BYPASS_POWER_SWITCH 0 /* NOSYS */ +#define AIE2_BIT_BYPASS_SET_FREQ 1 +#define AIE2_BIT_BYPASS_FW_LOAD 2 /* NOSYS */ +extern uint aie2_control_flags; extern const struct amdxdna_dev_ops aie2_ops; +int aie2_check_protocol(struct amdxdna_dev_hdl *ndev, u32 fw_major, u32 fw_minor); /* aie2_smu.c */ void aie2_smu_setup(struct amdxdna_dev_hdl *ndev); int aie2_smu_start(struct amdxdna_dev_hdl *ndev); void aie2_smu_stop(struct amdxdna_dev_hdl *ndev); -int aie2_smu_set_clock_freq(struct amdxdna_dev_hdl *ndev, struct clock *clock, u32 freq_mhz); char *aie2_smu_get_mpnpu_clock_name(struct amdxdna_dev_hdl *ndev); char *aie2_smu_get_hclock_name(struct amdxdna_dev_hdl *ndev); int aie2_smu_get_mpnpu_clock_freq(struct amdxdna_dev_hdl *ndev); @@ -293,8 +291,7 @@ int aie2_smu_set_power_on(struct amdxdna_dev_hdl *ndev); int aie2_smu_set_power_off(struct amdxdna_dev_hdl *ndev); int aie2_smu_get_power_state(struct amdxdna_dev_hdl *ndev); int aie2_smu_get_dpm_level(struct amdxdna_dev_hdl *ndev); -int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level, bool cache); -void aie2_smu_prepare_s0i3(struct amdxdna_dev_hdl *ndev); +int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level); /* aie2_psp.c */ struct psp_device *aie2m_psp_create(struct device *dev, struct psp_config *conf); @@ -365,6 +362,7 @@ void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map); /* aie2_pm.c */ int aie2_pm_start(struct amdxdna_dev_hdl *ndev); void aie2_pm_stop(struct amdxdna_dev_hdl *ndev); +bool aie2_pm_is_turbo(struct amdxdna_dev_hdl *ndev); int aie2_pm_set_mode(struct amdxdna_dev_hdl *ndev, enum amdxdna_power_mode_type target); #endif /* _AIE2_PCI_H_ */ diff --git a/src/driver/amdxdna/aie2_pm.c b/src/driver/amdxdna/aie2_pm.c index 64468c72..0a71e5b2 100644 --- a/src/driver/amdxdna/aie2_pm.c +++ b/src/driver/amdxdna/aie2_pm.c @@ -5,13 +5,19 @@ #include "aie2_pci.h" -static int aie2_pm_clock_gating(struct amdxdna_dev_hdl *ndev, bool enable) +static int aie2_pm_clock_gating(struct amdxdna_dev_hdl *ndev, + enum amdxdna_power_mode_type target) { const struct rt_config_clk_gating *config; + bool enable; u32 value; int ret; int i; + enable = (target != POWER_MODE_TURBO && target != POWER_MODE_HIGH); + if (enable == ndev->clk_gate_enabled) + return 0; + config = 
&ndev->priv->clk_gating; if (enable) value = config->value_enable; @@ -30,9 +36,40 @@ static int aie2_pm_clock_gating(struct amdxdna_dev_hdl *ndev, bool enable) } } + if (!ret) + ndev->clk_gate_enabled = enable; + return ret; } +bool aie2_pm_is_turbo(struct amdxdna_dev_hdl *ndev) +{ + return ndev->pw_mode == POWER_MODE_TURBO; +} + +static int aie2_pm_check_turbo(struct amdxdna_dev_hdl *ndev, + enum amdxdna_power_mode_type prev, + enum amdxdna_power_mode_type next) +{ + struct amdxdna_dev *xdna = ndev->xdna; + struct amdxdna_client *client; + + if (prev != POWER_MODE_TURBO && next != POWER_MODE_TURBO) + return 0; + + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + list_for_each_entry(client, &xdna->client_list, node) { + bool empty; + + mutex_lock(&client->hwctx_lock); + empty = idr_is_empty(&client->hwctx_idr); + mutex_unlock(&client->hwctx_lock); + if (!empty) + return -EBUSY; + } + return 0; +} + int aie2_pm_set_mode(struct amdxdna_dev_hdl *ndev, enum amdxdna_power_mode_type target) { struct amdxdna_dev *xdna = ndev->xdna; @@ -44,23 +81,29 @@ int aie2_pm_set_mode(struct amdxdna_dev_hdl *ndev, enum amdxdna_power_mode_type if (target == POWER_MODE_LOW || target == POWER_MODE_MEDIUM) return -EOPNOTSUPP; - XDNA_DBG(xdna, "Changing power mode from %d to %d", ndev->pw_mode, target); - /* Set resource solver power property to the user choice */ + ret = aie2_pm_check_turbo(ndev, ndev->pw_mode, target); + if (ret) { + XDNA_WARN(xdna, "Change Turbo mode failed"); + return ret; + } - /* Set power level within the device */ + XDNA_DBG(xdna, "Changing power mode from %d to %d", ndev->pw_mode, target); - /* - * Other mode -> POWER_MODE_HIGH: Turn off clock gating - * POWER_MODE_HIGH -> Other mode: Turn on clock gating - * Otherwise, no change + /* TODO: + *switch (ndev->pw_mode) { + *case POWER_MODE_LOW: + * Set to low DPM level + *case POWER_MODE_MEDIUM: + * Set to medium DPM level + *case POWER_MODE_HIGH: + *case POWER_MODE_TURBO: + * Set to highest DPM level + *default: + * Let driver decides DPM level + *} */ - if (target == POWER_MODE_HIGH) { - XDNA_DBG(xdna, "Clock gating turning off"); - ret = aie2_pm_clock_gating(ndev, false); - } else if (ndev->pw_mode == POWER_MODE_HIGH) { - XDNA_DBG(xdna, "Clock gating turning on"); - ret = aie2_pm_clock_gating(ndev, true); - } + + ret = aie2_pm_clock_gating(ndev, target); if (ret) { XDNA_ERR(xdna, "Failed to config clock gating"); return ret; @@ -73,21 +116,10 @@ int aie2_pm_set_mode(struct amdxdna_dev_hdl *ndev, enum amdxdna_power_mode_type int aie2_pm_start(struct amdxdna_dev_hdl *ndev) { - /* - * TODO: should only skip POWER_MODE_DEFAULT. 
- * Let's make it right after full DPM support is ready - */ - if (ndev->pw_mode != POWER_MODE_HIGH) - return 0; - - return aie2_pm_clock_gating(ndev, false); + return aie2_pm_clock_gating(ndev, ndev->pw_mode); } void aie2_pm_stop(struct amdxdna_dev_hdl *ndev) { - if (ndev->pw_mode != POWER_MODE_HIGH) - return; - - /* Clock gating must be turned ON before suspend firmware */ - aie2_pm_clock_gating(ndev, true); + aie2_pm_clock_gating(ndev, POWER_MODE_DEFAULT); } diff --git a/src/driver/amdxdna/aie2_smu.c b/src/driver/amdxdna/aie2_smu.c index 3675b4e5..c01a44ba 100644 --- a/src/driver/amdxdna/aie2_smu.c +++ b/src/driver/amdxdna/aie2_smu.c @@ -10,12 +10,21 @@ /* SMU commands */ #define AIE2_SMU_POWER_ON 0x3 #define AIE2_SMU_POWER_OFF 0x4 +/* For SMU v0 */ #define AIE2_SMU_SET_MPNPUCLK_FREQ 0x5 #define AIE2_SMU_SET_HCLK_FREQ 0x6 +/* For SMU v1 */ #define AIE2_SMU_SET_SOFT_DPMLEVEL 0x7 #define AIE2_SMU_SET_HARD_DPMLEVEL 0x8 -static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd, u32 reg_arg) +/* This is a hack for NPU1 device */ +const struct dpm_clk npu1_hack_dpm_clk_table[] = { + {400, 800}, + {600, 1024}, +}; + +static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd, + u32 reg_arg, u32 *out) { u32 resp; int ret; @@ -35,6 +44,9 @@ static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd, u32 reg_arg) return ret; } + if (out) + *out = readl(SMU_REG(ndev, SMU_OUT_REG)); + if (resp != SMU_RESULT_OK) { XDNA_ERR(ndev->xdna, "SMU cmd %d failed, 0x%x", reg_cmd, resp); return -EINVAL; @@ -43,26 +55,12 @@ static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd, u32 reg_arg) return 0; } -static int aie2_smu_update_clock_freq(struct amdxdna_dev_hdl *ndev, u32 cmd, - struct clock *clock, u32 freq_mhz) -{ - int ret; - - ret = aie2_smu_exec(ndev, cmd, freq_mhz); - if (ret) - return ret; - - clock->freq_mhz = freq_mhz; - - return 0; -} - /* * Depending on the current running frequency and debugfs setting, * aie2_smu_set_clock_freq() might or might not update freqency. 
*/ -int aie2_smu_set_clock_freq(struct amdxdna_dev_hdl *ndev, - struct clock *clock, u32 freq_mhz) +static int aie2_smu_set_clock_freq(struct amdxdna_dev_hdl *ndev, + struct clock *clock, u32 freq_mhz) { u32 smu_cmd; int ret; @@ -82,18 +80,11 @@ int aie2_smu_set_clock_freq(struct amdxdna_dev_hdl *ndev, if (freq_mhz == clock->freq_mhz) return 0; -#if defined(CONFIG_DEBUG_FS) - /* If freq is set by debugfs, respect it until debugfs write freq 0 */ - if (clock->dbg_freq_mhz && freq_mhz != clock->dbg_freq_mhz) { - XDNA_DBG(ndev->xdna, "%s debug freq %d, ignore target freq %d", - clock->name, clock->dbg_freq_mhz, freq_mhz); - return 0; - } -#endif - ret = aie2_smu_update_clock_freq(ndev, smu_cmd, clock, freq_mhz); + ret = aie2_smu_exec(ndev, smu_cmd, freq_mhz, NULL); if (ret) return ret; + clock->freq_mhz = freq_mhz; XDNA_DBG(ndev->xdna, "Set %s = %d mhz", clock->name, clock->freq_mhz); return 0; } @@ -120,23 +111,23 @@ char *aie2_smu_get_hclock_name(struct amdxdna_dev_hdl *ndev) static int aie2_smu_set_dpm_level_v0(struct amdxdna_dev_hdl *ndev, u32 dpm_level) { - int ret; - const struct dpm_clk *dpm_entry = SMU_NPU_DPM_TABLE_ENTRY(ndev, dpm_level); + const struct dpm_clk *dpm_entry = SMU_DPM_TABLE_ENTRY(ndev, dpm_level); struct clock *clk; + int ret; clk = &ndev->smu.mp_npu_clock; - ret = aie2_smu_set_clock_freq(ndev, clk, dpm_entry->npuclk); if (ret) { - XDNA_ERR(ndev->xdna, "setting npu clk failed for dpm level %d, ret: %d", dpm_level, ret); + XDNA_ERR(ndev->xdna, "setting npu clk failed for dpm level %d, ret: %d", + dpm_level, ret); return ret; } clk = &ndev->smu.h_clock; - ret = aie2_smu_set_clock_freq(ndev, clk, dpm_entry->hclk); if (ret) { - XDNA_ERR(ndev->xdna, "setting hclk failed for dpm level %d, ret: %d", dpm_level, ret); + XDNA_ERR(ndev->xdna, "setting hclk failed for dpm level %d, ret: %d", + dpm_level, ret); return ret; } @@ -147,16 +138,19 @@ static int aie2_smu_set_dpm_level_v1(struct amdxdna_dev_hdl *ndev, u32 dpm_level { int ret; - ret = aie2_smu_exec(ndev, AIE2_SMU_SET_HARD_DPMLEVEL, dpm_level); + ret = aie2_smu_exec(ndev, AIE2_SMU_SET_HARD_DPMLEVEL, dpm_level, NULL); if (!ret) XDNA_INFO_ONCE(ndev->xdna, "Set hard dpm level = %d", dpm_level); else return ret; - ret = aie2_smu_exec(ndev, AIE2_SMU_SET_SOFT_DPMLEVEL, dpm_level); + ret = aie2_smu_exec(ndev, AIE2_SMU_SET_SOFT_DPMLEVEL, dpm_level, NULL); if (!ret) XDNA_INFO_ONCE(ndev->xdna, "Set soft dpm level = %d", dpm_level); + ndev->smu.mp_npu_clock.freq_mhz = SMU_DPM_TABLE_ENTRY(ndev, dpm_level)->npuclk; + ndev->smu.h_clock.freq_mhz = SMU_DPM_TABLE_ENTRY(ndev, dpm_level)->hclk; + return ret; } @@ -165,11 +159,16 @@ int aie2_smu_get_dpm_level(struct amdxdna_dev_hdl *ndev) return ndev->smu.curr_dpm_level; } -int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level, bool cache) +int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level) { int ret; - if (dpm_level < 0 || dpm_level > SMU_DPM_MAX(ndev)) + if (aie2_control_flags & BIT(AIE2_BIT_BYPASS_SET_FREQ)) { + XDNA_DBG(ndev->xdna, "Bypassed set dpm level"); + return 0; + } + + if (dpm_level > SMU_DPM_MAX(ndev)) return -EINVAL; if (!ndev->priv->smu_rev) @@ -177,8 +176,10 @@ int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level, bool cac else ret = aie2_smu_set_dpm_level_v1(ndev, dpm_level); - if (!ret & cache) + if (!ret) { ndev->smu.curr_dpm_level = dpm_level; + XDNA_DBG(ndev->xdna, "The dpm level is set to %d", dpm_level); + } return ret; } @@ -187,7 +188,7 @@ int aie2_smu_set_power_on(struct amdxdna_dev_hdl *ndev) { int ret; - ret = 
aie2_smu_exec(ndev, AIE2_SMU_POWER_ON, 0); + ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_ON, 0, NULL); if (ret) return ret; @@ -199,7 +200,7 @@ int aie2_smu_set_power_off(struct amdxdna_dev_hdl *ndev) { int ret; - ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_OFF, 0); + ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_OFF, 0, NULL); if (ret) return ret; @@ -215,7 +216,6 @@ int aie2_smu_get_power_state(struct amdxdna_dev_hdl *ndev) int aie2_smu_start(struct amdxdna_dev_hdl *ndev) { struct smu *smu = &ndev->smu; - u32 freq_mhz; int ret; ret = aie2_smu_set_power_on(ndev); @@ -224,63 +224,23 @@ int aie2_smu_start(struct amdxdna_dev_hdl *ndev) return ret; } - freq_mhz = smu->mp_npu_clock.freq_mhz; - ret = aie2_smu_update_clock_freq(ndev, AIE2_SMU_SET_MPNPUCLK_FREQ, - &smu->mp_npu_clock, freq_mhz); + ret = aie2_smu_set_dpm_level(ndev, smu->curr_dpm_level); if (ret) { - XDNA_ERR(ndev->xdna, "Set mpnpu clk freq failed, ret %d", ret); + XDNA_ERR(ndev->xdna, "Set dpm level failed, ret %d", ret); return ret; } - XDNA_INFO_ONCE(ndev->xdna, "Set %s = %d mhz", smu->mp_npu_clock.name, freq_mhz); - - freq_mhz = smu->h_clock.freq_mhz; - ret = aie2_smu_update_clock_freq(ndev, AIE2_SMU_SET_HCLK_FREQ, - &smu->h_clock, freq_mhz); - if (ret) { - XDNA_ERR(ndev->xdna, "Set hclk freq failed, ret %d", ret); - return ret; - } - XDNA_INFO_ONCE(ndev->xdna, "Set %s = %d mhz", smu->h_clock.name, freq_mhz); - - if (SMU_DPM_MAX(ndev) > 0) { - ret = aie2_smu_set_dpm_level(ndev, smu->curr_dpm_level, true); - if (ret) { - XDNA_ERR(ndev->xdna, "Set dpm level failed, ret %d", ret); - return ret; - } - } return 0; } -void aie2_smu_prepare_s0i3(struct amdxdna_dev_hdl *ndev) -{ - u32 freq_mhz; - int ret; - - freq_mhz = 400; - ret = aie2_smu_exec(ndev, AIE2_SMU_SET_MPNPUCLK_FREQ, freq_mhz); - if (ret) - XDNA_ERR(ndev->xdna, "Set mpnpu clk freq %d mhz failed, ret %d", freq_mhz, ret); - - freq_mhz = 800; - ret = aie2_smu_exec(ndev, AIE2_SMU_SET_HCLK_FREQ, freq_mhz); - if (ret) - XDNA_ERR(ndev->xdna, "Set hclk freq %d mhz failed, ret %d", freq_mhz, ret); - - if (SMU_DPM_MAX(ndev) > 0) { - ret = aie2_smu_set_dpm_level(ndev, 0, false); - if (ret) - XDNA_ERR(ndev->xdna, "Set dpm level 0 failed, ret %d", ret); - } -} - void aie2_smu_stop(struct amdxdna_dev_hdl *ndev) { int ret; /* Minimize clocks/dpm level prior to power off */ - aie2_smu_prepare_s0i3(ndev); + ret = aie2_smu_set_dpm_level(ndev, 0); + if (ret) + XDNA_WARN(ndev->xdna, "Set dpm level 0 failed, ret %d", ret); ret = aie2_smu_set_power_off(ndev); if (ret) @@ -292,13 +252,26 @@ void aie2_smu_setup(struct amdxdna_dev_hdl *ndev) struct smu *smu = &ndev->smu; snprintf(smu->mp_npu_clock.name, sizeof(smu->mp_npu_clock.name), "MP-NPU Clock"); - smu->mp_npu_clock.max_freq_mhz = SMU_MPNPUCLK_FREQ_MAX(ndev); - snprintf(smu->h_clock.name, sizeof(smu->h_clock.name), "H Clock"); - smu->h_clock.max_freq_mhz = SMU_HCLK_FREQ_MAX(ndev); - - /* The first time SMU start, it will use below clock frequency */ - smu->mp_npu_clock.freq_mhz = smu->mp_npu_clock.max_freq_mhz; - smu->h_clock.freq_mhz = smu->h_clock.max_freq_mhz; + smu->dpm_table = ndev->priv->smu_npu_dpm_clk_table; + smu->num_dpm_levels = ndev->priv->smu_npu_dpm_levels; smu->curr_dpm_level = SMU_DPM_MAX(ndev); + + if (!ndev->priv->smu_rev) { + u32 npuclk_freq; + u32 out; + + /* This is a hack for special NPU1 device */ + npuclk_freq = SMU_DPM_TABLE_ENTRY(ndev, SMU_DPM_MAX(ndev))->npuclk; + aie2_smu_exec(ndev, AIE2_SMU_SET_MPNPUCLK_FREQ, npuclk_freq, &out); + if (npuclk_freq != out) { + XDNA_DBG(ndev->xdna, "Use small DPM table"); + smu->dpm_table = 
npu1_hack_dpm_clk_table; + smu->num_dpm_levels = ARRAY_SIZE(npu1_hack_dpm_clk_table); + smu->curr_dpm_level = SMU_DPM_MAX(ndev); + } + } + + smu->mp_npu_clock.max_freq_mhz = SMU_DPM_TABLE_ENTRY(ndev, SMU_DPM_MAX(ndev))->npuclk; + smu->h_clock.max_freq_mhz = SMU_DPM_TABLE_ENTRY(ndev, SMU_DPM_MAX(ndev))->hclk; } diff --git a/src/driver/amdxdna/aie2_solver.c b/src/driver/amdxdna/aie2_solver.c index 289f2ec7..38de5a9d 100644 --- a/src/driver/amdxdna/aie2_solver.c +++ b/src/driver/amdxdna/aie2_solver.c @@ -95,6 +95,19 @@ static int sanity_check(struct solver_state *xrs, struct alloc_requests *req) return 0; } +static bool is_valid_qos_dpm_params(struct aie_qos *rqos) +{ + /* + * gops is retrieved from the xmodel, so it's always set + * fps and latency are the configurable params from the application + */ + if (rqos->gops > 0 && (rqos->fps > 0 || rqos->latency > 0)) { + return true; + } + + return false; +} + static u32 find_dpm_level(struct solver_state *xrs, struct alloc_requests *req) { struct cdo_parts *cdop = &req->cdo; @@ -103,8 +116,9 @@ static u32 find_dpm_level(struct solver_state *xrs, struct alloc_requests *req) struct solver_node *node; u32 cu_clk_freq, dpm_level; - if (cdop->ncols > xrs->cfg.total_col) - return -EINVAL; + /* If no QoS parameters are passed, set it to the max DPM level */ + if (!is_valid_qos_dpm_params(rqos)) + return xrs->cfg.max_dpm_level; /* * We can find at least one CDOs groups that meet the diff --git a/src/driver/amdxdna/aie2_solver.h b/src/driver/amdxdna/aie2_solver.h index 19fd4b87..98b16380 100644 --- a/src/driver/amdxdna/aie2_solver.h +++ b/src/driver/amdxdna/aie2_solver.h @@ -91,6 +91,7 @@ struct init_config { u32 total_col; u32 sys_eff_factor; /* system efficiency factor */ u32 latency_adj; /* latency adjustment in ms */ + u32 max_dpm_level; /* Max dpm level in the system */ struct clk_list_info clk_list; /* List of frequencies available in system */ struct device *dev; struct xrs_action_ops *actions; diff --git a/src/driver/amdxdna/amdxdna_ctx.c b/src/driver/amdxdna/amdxdna_ctx.c index ba263879..9bb219f7 100644 --- a/src/driver/amdxdna/amdxdna_ctx.c +++ b/src/driver/amdxdna/amdxdna_ctx.c @@ -333,7 +333,7 @@ amdxdna_arg_bos_lookup(struct amdxdna_client *client, abo = to_xdna_obj(gobj); mutex_lock(&abo->lock); - if (abo->pinned) { + if (abo->flags & BO_SUBMIT_PINNED) { mutex_unlock(&abo->lock); job->bos[i] = gobj; continue; @@ -345,7 +345,7 @@ amdxdna_arg_bos_lookup(struct amdxdna_client *client, drm_gem_object_put(gobj); goto put_arg_bos; } - abo->pinned = true; + abo->flags |= BO_SUBMIT_PINNED; mutex_unlock(&abo->lock); job->bos[i] = gobj; @@ -375,6 +375,77 @@ void amdxdna_job_put(struct amdxdna_sched_job *job) kref_put(&job->refcnt, amdxdna_sched_job_release); } +int amdxdna_lock_objects(struct amdxdna_sched_job *job, struct ww_acquire_ctx *ctx) +{ + struct amdxdna_dev *xdna = job->hwctx->client->xdna; + struct amdxdna_gem_obj *abo; + int contended = -1, i, ret; + + ww_acquire_init(ctx, &reservation_ww_class); + +retry: + if (contended != -1) { + ret = dma_resv_lock_slow_interruptible(job->bos[contended]->resv, ctx); + if (ret) { + ww_acquire_fini(ctx); + return ret; + } + abo->flags |= BO_SUBMIT_LOCKED; + } + + for (i = 0; i < job->bo_cnt; i++) { + abo = to_xdna_obj(job->bos[i]); + if (abo->flags & BO_SUBMIT_LOCKED) + continue; + + ret = dma_resv_lock_interruptible(job->bos[i]->resv, ctx); + if (ret) { + int j; + + for (j = 0; j < i; j++) { + abo = to_xdna_obj(job->bos[j]); + dma_resv_unlock(job->bos[j]->resv); + abo->flags &= ~BO_SUBMIT_LOCKED; + } + 
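			/*
			 * Everything locked in this pass has now been dropped.
			 * The checks below finish the standard ww-mutex backoff:
			 * drop the slow-path lock if it was taken for a BO not
			 * yet reached, and on -EDEADLK remember the contended BO
			 * so the retry pass re-acquires it first with
			 * dma_resv_lock_slow_interruptible() before relocking
			 * the rest.
			 */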
+ if (contended != -1 && contended >= i) + dma_resv_unlock(job->bos[contended]->resv); + + if (ret == -EDEADLK) { + contended = i; + goto retry; + } + + ww_acquire_fini(ctx); + + XDNA_ERR(xdna, "Lock BO failed, ret %d", ret); + return ret; + } + abo->flags |= BO_SUBMIT_LOCKED; + } + + ww_acquire_done(ctx); + + return 0; +} + +void amdxdna_unlock_objects(struct amdxdna_sched_job *job, struct ww_acquire_ctx *ctx) +{ + struct amdxdna_gem_obj *abo; + int i; + + for (i = 0; i < job->bo_cnt; i++) { + abo = to_xdna_obj(job->bos[i]); + if (!(abo->flags & BO_SUBMIT_LOCKED)) + continue; + + dma_resv_unlock(job->bos[i]->resv); + abo->flags &= ~BO_SUBMIT_LOCKED; + } + + ww_acquire_fini(ctx); +} + int amdxdna_cmd_submit(struct amdxdna_client *client, u32 opcode, u32 cmd_bo_hdl, u32 *arg_bo_hdls, u32 arg_bo_cnt, u32 hwctx_hdl, u64 *seq) diff --git a/src/driver/amdxdna/amdxdna_ctx.h b/src/driver/amdxdna/amdxdna_ctx.h index c1d7ba17..6ccaa45a 100644 --- a/src/driver/amdxdna/amdxdna_ctx.h +++ b/src/driver/amdxdna/amdxdna_ctx.h @@ -228,6 +228,8 @@ void amdxdna_hwctx_remove_all(struct amdxdna_client *client); void amdxdna_hwctx_suspend(struct amdxdna_client *client); void amdxdna_hwctx_resume(struct amdxdna_client *client); +int amdxdna_lock_objects(struct amdxdna_sched_job *job, struct ww_acquire_ctx *ctx); +void amdxdna_unlock_objects(struct amdxdna_sched_job *job, struct ww_acquire_ctx *ctx); int amdxdna_cmd_submit(struct amdxdna_client *client, u32 opcode, u32 cmd_bo_hdls, u32 *arg_bo_hdls, u32 arg_bo_cnt, u32 hwctx_hdl, u64 *seq); diff --git a/src/driver/amdxdna/amdxdna_devel.c b/src/driver/amdxdna/amdxdna_devel.c index c4dd3ee5..3f6698a0 100644 --- a/src/driver/amdxdna/amdxdna_devel.c +++ b/src/driver/amdxdna/amdxdna_devel.c @@ -171,6 +171,9 @@ void amdxdna_bo_dma_unmap(struct amdxdna_gem_obj *abo) struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev); XDNA_DBG(xdna, "BO type %d dma_addr 0x%llx", abo->type, abo->mem.dma_addr); + if (is_import_bo(abo)) + return; + drm_gem_shmem_put_pages(&abo->base); } #else diff --git a/src/driver/amdxdna/amdxdna_drm.c b/src/driver/amdxdna/amdxdna_drm.c index e44062c5..b080d1ea 100644 --- a/src/driver/amdxdna/amdxdna_drm.c +++ b/src/driver/amdxdna/amdxdna_drm.c @@ -117,8 +117,17 @@ static int amdxdna_flush(struct file *f, fl_owner_t id) struct drm_file *filp = f->private_data; struct amdxdna_client *client = filp->driver_priv; struct amdxdna_dev *xdna = client->xdna; + pid_t pid = task_tgid_nr(current); int idx; + /* When current PID not equals to Client PID, this is a flush() + * triggered by closing a child process. If this is the case, flush() is + * just a no-op. The process which open() device should finally flush() + * and close() device. 
+ */ + if (pid != client->pid) + return 0; + XDNA_DBG(xdna, "PID %d flushing...", client->pid); if (!drm_dev_enter(&xdna->ddev, &idx)) return 0; @@ -229,7 +238,7 @@ const struct drm_driver amdxdna_drm_drv = { /* For shmem object create */ .gem_create_object = amdxdna_gem_create_object_cb, #ifdef AMDXDNA_SHMEM - .gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table, + .gem_prime_import = amdxdna_gem_prime_import, #else .gem_prime_import_sg_table = drm_gem_dma_prime_import_sg_table, #endif diff --git a/src/driver/amdxdna/amdxdna_gem.c b/src/driver/amdxdna/amdxdna_gem.c index 6ef81a9b..eb25f5e6 100644 --- a/src/driver/amdxdna/amdxdna_gem.c +++ b/src/driver/amdxdna/amdxdna_gem.c @@ -60,48 +60,6 @@ amdxdna_gem_insert_node_locked(struct amdxdna_gem_obj *abo, bool use_vmap) return 0; } -static void amdxdna_gem_obj_free(struct drm_gem_object *gobj) -{ - struct amdxdna_dev *xdna = to_xdna_dev(gobj->dev); - struct amdxdna_gem_obj *abo = to_xdna_obj(gobj); - struct iosys_map map = IOSYS_MAP_INIT_VADDR(abo->mem.kva); - - XDNA_DBG(xdna, "BO type %d xdna_addr 0x%llx", abo->type, abo->mem.dev_addr); - if (abo->pinned) - amdxdna_gem_unpin(abo); - - flush_work(&abo->hmm_unreg_work); - if (abo->type == AMDXDNA_BO_DEV) { - mutex_lock(&abo->client->mm_lock); - drm_mm_remove_node(&abo->mm_node); - mutex_unlock(&abo->client->mm_lock); - - vunmap(abo->mem.kva); - drm_gem_object_put(to_gobj(abo->dev_heap)); - drm_gem_object_release(gobj); - mutex_destroy(&abo->lock); - kfree(abo); - return; - } - - if (abo->type == AMDXDNA_BO_DEV_HEAP) - drm_mm_takedown(&abo->mm); - -#ifdef AMDXDNA_DEVEL - if (abo->type == AMDXDNA_BO_CMD) - amdxdna_mem_unmap(xdna, &abo->mem); - else if (iommu_mode == AMDXDNA_IOMMU_NO_PASID) - amdxdna_bo_dma_unmap(abo); -#endif - drm_gem_vunmap_unlocked(gobj, &map); - mutex_destroy(&abo->lock); - drm_gem_shmem_free(&abo->base); -} - -static const struct drm_gem_object_funcs amdxdna_gem_dev_obj_funcs = { - .free = amdxdna_gem_obj_free, -}; - static bool amdxdna_hmm_invalidate(struct mmu_interval_notifier *mni, const struct mmu_notifier_range *range, unsigned long cur_seq) @@ -136,8 +94,11 @@ static void amdxdna_hmm_unregister(struct amdxdna_gem_obj *abo) if (!xdna->dev_info->ops->hmm_invalidate) return; - if (!abo->mem.pfns) + mutex_lock(&abo->lock); + if (!abo->mem.pfns) { + mutex_unlock(&abo->lock); return; + } mmu_interval_notifier_remove(&abo->mem.notifier); kvfree(abo->mem.pfns); @@ -145,6 +106,8 @@ static void amdxdna_hmm_unregister(struct amdxdna_gem_obj *abo) if (is_import_bo(abo) && vma->vm_file && vma->vm_file->f_mapping) mapping_clear_unevictable(vma->vm_file->f_mapping); + + mutex_unlock(&abo->lock); } static int amdxdna_hmm_register(struct amdxdna_gem_obj *abo, @@ -159,14 +122,19 @@ static int amdxdna_hmm_register(struct amdxdna_gem_obj *abo, if (!xdna->dev_info->ops->hmm_invalidate) return 0; - if (abo->mem.pfns) - return -EEXIST; + mutex_lock(&abo->lock); + if (abo->mem.pfns) { + ret = -EEXIST; + goto out_unlock; + } nr_pages = (PAGE_ALIGN(addr + len) - (addr & PAGE_MASK)) >> PAGE_SHIFT; abo->mem.pfns = kvcalloc(nr_pages, sizeof(unsigned long), GFP_KERNEL); - if (!abo->mem.pfns) - return -ENOMEM; + if (!abo->mem.pfns) { + ret = -ENOMEM; + goto out_unlock; + } ret = mmu_interval_notifier_insert_locked(&abo->mem.notifier, current->mm, @@ -175,65 +143,122 @@ static int amdxdna_hmm_register(struct amdxdna_gem_obj *abo, &amdxdna_hmm_ops); if (ret) { XDNA_ERR(xdna, "Insert mmu notifier failed, ret %d", ret); - kvfree(abo->mem.pfns); - abo->mem.pfns = NULL; - return ret; + goto 
free_pfns; } abo->mem.userptr = addr; abo->mem.vma = vma; if (is_import_bo(abo) && vma->vm_file && vma->vm_file->f_mapping) mapping_set_unevictable(vma->vm_file->f_mapping); + mutex_unlock(&abo->lock); + return 0; + +free_pfns: + kvfree(abo->mem.pfns); + abo->mem.pfns = NULL; +out_unlock: + mutex_unlock(&abo->lock); + return ret; } -static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data) +static void amdxdna_gem_obj_free(struct drm_gem_object *gobj) { - if (pte_none(ptep_get(pte))) - return -EINVAL; + struct amdxdna_dev *xdna = to_xdna_dev(gobj->dev); + struct amdxdna_gem_obj *abo = to_xdna_obj(gobj); + struct iosys_map map = IOSYS_MAP_INIT_VADDR(abo->mem.kva); - *(bool *)data = true; - return 0; + XDNA_DBG(xdna, "BO type %d xdna_addr 0x%llx", abo->type, abo->mem.dev_addr); + if (abo->flags & BO_SUBMIT_PINNED) + amdxdna_gem_unpin(abo); + + amdxdna_hmm_unregister(abo); + flush_work(&abo->hmm_unreg_work); + if (abo->type == AMDXDNA_BO_DEV) { + mutex_lock(&abo->client->mm_lock); + drm_mm_remove_node(&abo->mm_node); + mutex_unlock(&abo->client->mm_lock); + + vunmap(abo->mem.kva); + drm_gem_object_put(to_gobj(abo->dev_heap)); + drm_gem_object_release(gobj); + mutex_destroy(&abo->lock); + kfree(abo); + return; + } + + if (abo->type == AMDXDNA_BO_DEV_HEAP) + drm_mm_takedown(&abo->mm); + +#ifdef AMDXDNA_DEVEL + if (abo->type == AMDXDNA_BO_CMD) + amdxdna_mem_unmap(xdna, &abo->mem); + else if (iommu_mode == AMDXDNA_IOMMU_NO_PASID) + amdxdna_bo_dma_unmap(abo); +#endif + drm_gem_vunmap_unlocked(gobj, &map); + mutex_destroy(&abo->lock); + drm_gem_shmem_free(&abo->base); } +static const struct drm_gem_object_funcs amdxdna_gem_dev_obj_funcs = { + .free = amdxdna_gem_obj_free, +}; + static int amdxdna_insert_pages(struct amdxdna_gem_obj *abo, struct vm_area_struct *vma) { - unsigned long num_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - struct sg_dma_page_iter sg_iter; - bool has_mapped_page = false; + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev); + unsigned long num_pages = vma_pages(vma); unsigned long offset = 0; int ret; if (!is_import_bo(abo)) { + ret = drm_gem_shmem_mmap(&abo->base, vma); + if (ret) { + XDNA_ERR(xdna, "Failed shmem mmap %d", ret); + return ret; + } + /* The buffer is based on memory pages. Fix the flag. 
*/ vm_flags_mod(vma, VM_MIXEDMAP, VM_PFNMAP); - return vm_insert_pages(vma, vma->vm_start, abo->base.pages, - &num_pages); - } + ret = vm_insert_pages(vma, vma->vm_start, abo->base.pages, + &num_pages); + if (ret) { + XDNA_ERR(xdna, "Failed insert pages %d", ret); + vma->vm_ops->close(vma); + return ret; + } - ret = apply_to_page_range(vma->vm_mm, vma->vm_start, num_pages, - is_mapped_fn, &has_mapped_page); - if (!ret) return 0; + } - if (has_mapped_page) - return -EBUSY; + vma->vm_private_data = NULL; + vma->vm_ops = NULL; + ret = dma_buf_mmap(to_gobj(abo)->dma_buf, vma, 0); + if (ret) { + XDNA_ERR(xdna, "Failed to mmap dma buf %d", ret); + return ret; + } - for_each_sgtable_dma_page(abo->base.sgt, &sg_iter, 0) { - dma_addr_t addr = sg_page_iter_dma_address(&sg_iter); - unsigned long pfn; + do { + vm_fault_t fault_ret; - pfn = PFN_DOWN(dma_to_phys(to_gobj(abo)->dev->dev, addr)); - ret = io_remap_pfn_range(vma, vma->vm_start + offset, pfn, - PAGE_SIZE, vma->vm_page_prot); - if (ret) - break; + fault_ret = handle_mm_fault(vma, vma->vm_start+offset, + FAULT_FLAG_WRITE, NULL); + if (fault_ret & VM_FAULT_ERROR) { + vma->vm_ops->close(vma); + XDNA_ERR(xdna, "Fault in page failed"); + return -EFAULT; + } offset += PAGE_SIZE; - } + } while (--num_pages); - return ret; + /* Drop the reference drm_gem_mmap_obj() acquired.*/ + drm_gem_object_put(to_gobj(abo)); + + return 0; } static int amdxdna_gem_obj_mmap(struct drm_gem_object *gobj, @@ -250,16 +275,10 @@ static int amdxdna_gem_obj_mmap(struct drm_gem_object *gobj, if (ret) return ret; - ret = drm_gem_shmem_mmap(&abo->base, vma); - if (ret) { - XDNA_ERR(xdna, "failed shmem mmap %d", ret); - goto hmm_unreg; - } - ret = amdxdna_insert_pages(abo, vma); if (ret) { XDNA_ERR(xdna, "Failed insert pages, ret %d", ret); - goto close_vma; + goto hmm_unreg; } XDNA_DBG(xdna, "BO map_offset 0x%llx type %d userptr 0x%llx size 0x%lx", @@ -267,8 +286,6 @@ static int amdxdna_gem_obj_mmap(struct drm_gem_object *gobj, abo->mem.userptr, gobj->size); return 0; -close_vma: - vma->vm_ops->close(vma); hmm_unreg: amdxdna_hmm_unregister(abo); return ret; @@ -354,7 +371,6 @@ amdxdna_gem_create_obj(struct drm_device *dev, size_t size) if (!abo) return ERR_PTR(-ENOMEM); - abo->pinned = false; abo->assigned_hwctx = AMDXDNA_INVALID_CTX_HANDLE; mutex_init(&abo->lock); INIT_WORK(&abo->hmm_unreg_work, amdxdna_hmm_unreg_work); @@ -381,6 +397,60 @@ amdxdna_gem_create_object_cb(struct drm_device *dev, size_t size) return to_gobj(abo); } +struct drm_gem_object * +amdxdna_gem_prime_import(struct drm_device *dev, struct dma_buf *dma_buf) +{ + struct dma_buf_attachment *attach; + struct drm_gem_object *gobj; + struct sg_table *sgt; + int ret; + + attach = dma_buf_attach(dma_buf, dev->dev); + if (IS_ERR(attach)) + return ERR_CAST(attach); + + get_dma_buf(dma_buf); + + sgt = dma_buf_map_attachment_unlocked(attach, DMA_BIDIRECTIONAL); + if (IS_ERR(sgt)) { + ret = PTR_ERR(sgt); + goto fail_detach; + } + + gobj = drm_gem_shmem_prime_import_sg_table(dev, attach, sgt); + if (IS_ERR(gobj)) { + ret = PTR_ERR(gobj); + goto fail_unmap; + } + + gobj->import_attach = attach; + gobj->resv = dma_buf->resv; + +#ifdef AMDXDNA_DEVEL + if (iommu_mode == AMDXDNA_IOMMU_NO_PASID) { + struct amdxdna_gem_obj *abo; + + abo = to_xdna_obj(gobj); + ret = amdxdna_bo_dma_map(abo); + if (ret) { + drm_gem_object_put(gobj); + goto fail_unmap; + } + abo->mem.dev_addr = abo->mem.dma_addr; + } +#endif + + return gobj; + +fail_unmap: + dma_buf_unmap_attachment_unlocked(attach, sgt, DMA_BIDIRECTIONAL); +fail_detach: + 
dma_buf_detach(dma_buf, attach); + dma_buf_put(dma_buf); + + return ERR_PTR(ret); +} + static struct amdxdna_gem_obj * amdxdna_drm_alloc_shmem(struct drm_device *dev, struct amdxdna_drm_create_bo *args, diff --git a/src/driver/amdxdna/amdxdna_gem.h b/src/driver/amdxdna/amdxdna_gem.h index 3429a3ee..24a61608 100644 --- a/src/driver/amdxdna/amdxdna_gem.h +++ b/src/driver/amdxdna/amdxdna_gem.h @@ -27,11 +27,13 @@ struct amdxdna_mem { #endif }; +#define BO_SUBMIT_PINNED BIT(0) +#define BO_SUBMIT_LOCKED BIT(1) struct amdxdna_gem_obj { struct drm_gem_shmem_object base; struct amdxdna_client *client; u8 type; - bool pinned; + u64 flags; struct mutex lock; /* Protects: pinned, assigned_hwctx */ struct amdxdna_mem mem; struct work_struct hmm_unreg_work; @@ -60,6 +62,8 @@ static inline void amdxdna_gem_put_obj(struct amdxdna_gem_obj *abo) struct drm_gem_object * amdxdna_gem_create_object_cb(struct drm_device *dev, size_t size); +struct drm_gem_object * +amdxdna_gem_prime_import(struct drm_device *dev, struct dma_buf *dma_buf); struct amdxdna_gem_obj * amdxdna_drm_alloc_dev_bo(struct drm_device *dev, struct amdxdna_drm_create_bo *args, diff --git a/src/driver/amdxdna/amdxdna_mailbox.c b/src/driver/amdxdna/amdxdna_mailbox.c index 2f58fd3e..409ab26f 100644 --- a/src/driver/amdxdna/amdxdna_mailbox.c +++ b/src/driver/amdxdna/amdxdna_mailbox.c @@ -52,8 +52,11 @@ #ifdef AMDXDNA_DEVEL int mailbox_polling; -module_param(mailbox_polling, int, 0644); -MODULE_PARM_DESC(mailbox_polling, "0:interrupt(default); >0:poll interval in ms; <0: busy poll"); +module_param(mailbox_polling, int, 0444); +MODULE_PARM_DESC(mailbox_polling, "<=0:interrupt(default); >0:poll interval in ms; <0: busy poll"); +#define MB_DEFAULT_NO_POLL (mailbox_polling <= 0) +#define MB_PERIODIC_POLL (mailbox_polling > 0) +#define MB_FORCE_USER_POLL (mailbox_polling < 0) #define MB_TIMER_JIFF msecs_to_jiffies(mailbox_polling) #endif @@ -70,29 +73,39 @@ struct mailbox { /* protect channel list */ struct mutex mbox_lock; struct list_head chann_list; -#ifdef AMDXDNA_DEVEL + struct list_head poll_chann_list; struct task_struct *polld; struct wait_queue_head poll_wait; bool sent_msg; /* For polld */ -#endif - #if defined(CONFIG_DEBUG_FS) struct list_head res_records; #endif /* CONFIG_DEBUG_FS */ +}; +#if defined(CONFIG_DEBUG_FS) +struct mailbox_res_record { + enum xdna_mailbox_channel_type type; + struct list_head re_entry; + struct xdna_mailbox_chann_res re_x2i; + struct xdna_mailbox_chann_res re_i2x; + int re_irq; + int active; }; +#endif /* CONFIG_DEBUG_FS */ struct mailbox_channel { struct mailbox *mb; #if defined(CONFIG_DEBUG_FS) - struct list_head chann_entry; + struct mailbox_res_record *record; #endif + struct list_head chann_entry; struct xdna_mailbox_chann_res res[CHAN_RES_NUM]; int msix_irq; + u32 x2i_tail; u32 iohub_int_addr; + enum xdna_mailbox_channel_type type; struct idr chan_idr; spinlock_t chan_idr_lock; /* protect idr operations */ - u32 x2i_tail; /* Received msg related fields */ struct workqueue_struct *work_q; @@ -134,15 +147,6 @@ struct mailbox_msg { struct mailbox_pkg pkg; }; -#if defined(CONFIG_DEBUG_FS) -struct mailbox_res_record { - struct list_head re_entry; - struct xdna_mailbox_chann_res re_x2i; - struct xdna_mailbox_chann_res re_i2x; - int re_irq; -}; -#endif /* CONFIG_DEBUG_FS */ - static void mailbox_reg_write(struct mailbox_channel *mb_chann, u32 mbox_reg, u32 data) { struct xdna_mailbox_res *mb_res = &mb_chann->mb->res; @@ -162,17 +166,23 @@ static u32 mailbox_reg_read(struct mailbox_channel *mb_chann, u32 mbox_reg) 
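/*
 * Poll the I2X tail-pointer register until firmware publishes a non-zero
 * value, then sanity-check it: the tail must fall inside the ring buffer
 * and be 4-byte aligned.  Returns 0 with *val set on success, -ETIMEDOUT
 * if the register stays zero, or -EINVAL for a corrupt tail value.
 */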
static int mailbox_tail_read_non_zero(struct mailbox_channel *mb_chann, u32 *val) { u32 mbox_reg = mb_chann->res[CHAN_RES_I2X].mb_tail_ptr_reg; + u32 ringbuf_size = mb_chann->res[CHAN_RES_I2X].rb_size; struct xdna_mailbox_res *mb_res = &mb_chann->mb->res; u64 ringbuf_addr = mb_res->mbox_base + mbox_reg; - int ret, value; + int ret, tail; - /* Poll till value is not zero */ - ret = readx_poll_timeout(ioread32, (void *)ringbuf_addr, value, - value, 1 /* us */, 100); + /* Poll till tail is not zero */ + ret = readx_poll_timeout(ioread32, (void *)ringbuf_addr, tail, + tail, 0 /* tight-loops */, 100 /* us timeout */); if (ret < 0) return ret; - *val = value; + if (unlikely(tail > ringbuf_size || !IS_ALIGNED(tail, 4))) { + MB_WARN_ONCE(mb_chann, "Invalid tail 0x%x", tail); + return -EINVAL; + } + + *val = tail; return 0; } @@ -349,6 +359,12 @@ mailbox_get_resp(struct mailbox_channel *mb_chann, struct xdna_msg_header *heade return ret; } +/* + * mailbox_get_msg() is the key function to get message from ring buffer. + * If it returns 0, means 1 message was consumed. + * If it returns -ENOENT, means ring buffer is emtpy. + * If it returns other value, means ERROR. + */ static inline int mailbox_get_msg(struct mailbox_channel *mb_chann) { struct xdna_msg_header header; @@ -359,19 +375,15 @@ static inline int mailbox_get_msg(struct mailbox_channel *mb_chann) u64 read_addr; int ret; - if (mailbox_tail_read_non_zero(mb_chann, &tail)) { + ret = mailbox_tail_read_non_zero(mb_chann, &tail); + if (ret) { MB_WARN_ONCE(mb_chann, "Zero tail too long"); - return -EINVAL; + return ret; } head = mb_chann->i2x_head; ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_I2X); start_addr = mb_chann->res[CHAN_RES_I2X].rb_start_addr; - if (unlikely(tail > ringbuf_size || !IS_ALIGNED(tail, 4))) { - MB_WARN_ONCE(mb_chann, "Invalid tail 0x%x", tail); - return -EINVAL; - } - /* ringbuf empty */ if (head == tail) return -ENOENT; @@ -389,8 +401,17 @@ static inline int mailbox_get_msg(struct mailbox_channel *mb_chann) head, tail); return -EINVAL; } - mailbox_set_headptr(mb_chann, 0); - return 0; + + /* Read from beginning of ringbuf */ + head = 0; + ret = mailbox_tail_read_non_zero(mb_chann, &tail); + if (ret) { + MB_WARN_ONCE(mb_chann, "Hit tombstone, re-read tail failed"); + return -EINVAL; + } + /* Re-peek size of the message */ + read_addr = mb_chann->mb->res.ringbuf_base + start_addr; + header.total_size = ioread32((void *)read_addr); } if (unlikely(!header.total_size || !IS_ALIGNED(header.total_size, 4))) { @@ -416,20 +437,64 @@ static inline int mailbox_get_msg(struct mailbox_channel *mb_chann) /* After update head, it can equal to ringbuf_size. This is expected. */ trace_mbox_set_head(MAILBOX_NAME, mb_chann->msix_irq, header.opcode, header.id); - return ret; } +static void mailbox_rx_worker(struct work_struct *rx_work) +{ + struct mailbox_channel *mb_chann; + int ret; + + mb_chann = container_of(rx_work, struct mailbox_channel, rx_work); + trace_mbox_rx_worker(MAILBOX_NAME, mb_chann->msix_irq); + + if (READ_ONCE(mb_chann->bad_state)) { + MB_ERR(mb_chann, "Channel in bad state, work aborted"); + return; + } + + while (1) { + /* + * If return is 0, keep consuming next message, until there is + * no messages or an error happened. + */ + ret = mailbox_get_msg(mb_chann); + if (ret == -ENOENT) + break; + + /* Other error means device doesn't look good, disable irq. 
*/ + if (unlikely(ret)) { + MB_ERR(mb_chann, "Unexpected ret %d, disable irq", ret); + WRITE_ONCE(mb_chann->bad_state, true); + disable_irq(mb_chann->msix_irq); + break; + } + } +} + static irqreturn_t mailbox_irq_handler(int irq, void *p) { struct mailbox_channel *mb_chann = p; + u32 iohub; + int i; trace_mbox_irq_handle(MAILBOX_NAME, irq); - /* Schedule a rx_work to call the callback functions */ - queue_work(mb_chann->work_q, &mb_chann->rx_work); + if (mb_chann->type == MB_CHANNEL_USER_POLL) + return IRQ_HANDLED; /* Clear IOHUB register */ mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0); + /* Schedule a rx_work to call the callback functions */ + queue_work(mb_chann->work_q, &mb_chann->rx_work); + for (i = 0; i < 4; i++) { + iohub = mailbox_reg_read(mb_chann, mb_chann->iohub_int_addr); + if (iohub) + goto race; + } + return IRQ_HANDLED; +race: + mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0); + queue_work(mb_chann->work_q, &mb_chann->rx_work); return IRQ_HANDLED; } @@ -441,13 +506,12 @@ static void mailbox_timer(struct timer_list *t) /* The timer mimic interrupt. It is good to reuse irq routine */ tail = mailbox_get_tailptr(mb_chann, CHAN_RES_I2X); - if (tail) { - MB_DBG(mb_chann, "Mimic interrupt..."); + if (tail) mailbox_irq_handler(0, mb_chann); - } mod_timer(&mb_chann->timer, jiffies + MB_TIMER_JIFF); } +#endif static void mailbox_polld_handle_chann(struct mailbox_channel *mb_chann) { @@ -501,7 +565,10 @@ static bool mailbox_polld_event(struct mailbox *mb) struct mailbox_channel *mb_chann; mutex_lock(&mb->mbox_lock); - list_for_each_entry(mb_chann, &mb->chann_list, chann_entry) { + list_for_each_entry(mb_chann, &mb->poll_chann_list, chann_entry) { + if (mb_chann->type == MB_CHANNEL_MGMT) + break; + if (mailbox_channel_no_msg(mb_chann)) continue; @@ -530,13 +597,11 @@ static int mailbox_polld(void *data) continue; mutex_lock(&mb->mbox_lock); - if (unlikely(list_empty(&mb->chann_list))) { - mutex_unlock(&mb->mbox_lock); - continue; - } - chann_all_empty = true; - list_for_each_entry(mb_chann, &mb->chann_list, chann_entry) { + list_for_each_entry(mb_chann, &mb->poll_chann_list, chann_entry) { + if (mb_chann->type == MB_CHANNEL_MGMT) + break; + if (mailbox_channel_no_msg(mb_chann)) continue; @@ -558,38 +623,6 @@ static int mailbox_polld(void *data) return 0; } -#endif - -static void mailbox_rx_worker(struct work_struct *rx_work) -{ - struct mailbox_channel *mb_chann; - int ret; - - mb_chann = container_of(rx_work, struct mailbox_channel, rx_work); - - if (READ_ONCE(mb_chann->bad_state)) { - MB_ERR(mb_chann, "Channel in bad state, work aborted"); - return; - } - - while (1) { - /* - * If return is 0, keep consuming next message, until there is - * no messages or an error happened. - */ - ret = mailbox_get_msg(mb_chann); - if (ret == -ENOENT) - break; - - /* Other error means device doesn't look good, disable irq. 
*/ - if (unlikely(ret)) { - MB_ERR(mb_chann, "Unexpected ret %d, disable irq", ret); - WRITE_ONCE(mb_chann->bad_state, true); - disable_irq(mb_chann->msix_irq); - break; - } - } -} int xdna_mailbox_send_msg(struct mailbox_channel *mb_chann, const struct xdna_mailbox_msg *msg, u64 tx_timeout) @@ -656,10 +689,8 @@ int xdna_mailbox_send_msg(struct mailbox_channel *mb_chann, goto release_id; } -#ifdef AMDXDNA_DEVEL - if (mb_chann->mb->polld) + if (mb_chann->type == MB_CHANNEL_USER_POLL) mailbox_polld_wakeup(mb_chann->mb); -#endif return 0; release_id: @@ -670,42 +701,75 @@ int xdna_mailbox_send_msg(struct mailbox_channel *mb_chann, } #if defined(CONFIG_DEBUG_FS) +static struct mailbox_res_record * +xdna_mailbox_get_record(struct mailbox *mb, int mb_irq, + const struct xdna_mailbox_chann_res *x2i, + const struct xdna_mailbox_chann_res *i2x, + enum xdna_mailbox_channel_type type) +{ + struct mailbox_res_record *record; + int record_found = 0; + + mutex_lock(&mb->mbox_lock); + list_for_each_entry(record, &mb->res_records, re_entry) { + if (record->re_irq != mb_irq) + continue; + + record_found = 1; + break; + } + + if (record_found) { + record->type = type; + goto found; + } + + record = kzalloc(sizeof(*record), GFP_KERNEL); + if (!record) + goto out; + list_add_tail(&record->re_entry, &mb->res_records); + record->re_irq = mb_irq; + +found: + record->type = type; + memcpy(&record->re_x2i, x2i, sizeof(*x2i)); + memcpy(&record->re_i2x, i2x, sizeof(*i2x)); +out: + mutex_unlock(&mb->mbox_lock); + return record; +} + int xdna_mailbox_info_show(struct mailbox *mb, struct seq_file *m) { - static const char ring_fmt[] = "%4d %3s %5d 0x%08x 0x%04x "; + static const char ring_fmt[] = "%4d %3s %5d %4d 0x%08x 0x%04x "; static const char mbox_fmt[] = "0x%08x 0x%08x 0x%04x 0x%04x\n"; struct mailbox_res_record *record; - struct mailbox_channel *chann; /* If below two puts changed, make sure update fmt[] as well */ - seq_puts(m, "mbox dir alive ring addr size "); + seq_puts(m, "mbox dir alive type ring addr size "); seq_puts(m, "head ptr tail ptr head val tail val\n"); #define xdna_mbox_dump_queue(_dir, _act) \ - { \ - u32 head_ptr, tail_ptr, head_val, tail_val; \ - u32 rb_start, rb_size; \ - u32 mbox_irq; \ - mbox_irq = record->re_irq; \ - rb_start = record->re_##_dir.rb_start_addr; \ - rb_size = record->re_##_dir.rb_size; \ - head_ptr = record->re_##_dir.mb_head_ptr_reg; \ - tail_ptr = record->re_##_dir.mb_tail_ptr_reg; \ - head_val = ioread32((void *)(mb->res.mbox_base + head_ptr)); \ - tail_val = ioread32((void *)(mb->res.mbox_base + tail_ptr)); \ - seq_printf(m, ring_fmt, mbox_irq, #_dir, _act, rb_start, rb_size); \ - seq_printf(m, mbox_fmt, head_ptr, tail_ptr, head_val, tail_val); \ - } +{ \ + u32 head_ptr, tail_ptr, head_val, tail_val; \ + u32 rb_start, rb_size; \ + u32 mbox_irq; \ + u32 type; \ + type = record->type; \ + mbox_irq = record->re_irq; \ + rb_start = record->re_##_dir.rb_start_addr; \ + rb_size = record->re_##_dir.rb_size; \ + head_ptr = record->re_##_dir.mb_head_ptr_reg; \ + tail_ptr = record->re_##_dir.mb_tail_ptr_reg; \ + head_val = ioread32((void *)(mb->res.mbox_base + head_ptr)); \ + tail_val = ioread32((void *)(mb->res.mbox_base + tail_ptr)); \ + seq_printf(m, ring_fmt, mbox_irq, #_dir, _act, type, rb_start, rb_size); \ + seq_printf(m, mbox_fmt, head_ptr, tail_ptr, head_val, tail_val); \ +} mutex_lock(&mb->mbox_lock); list_for_each_entry(record, &mb->res_records, re_entry) { - int active = 0; - - list_for_each_entry(chann, &mb->chann_list, chann_entry) { - if (record->re_irq == 
chann->msix_irq) - active = 1; - } - xdna_mbox_dump_queue(x2i, active); - xdna_mbox_dump_queue(i2x, active); + xdna_mbox_dump_queue(x2i, record->active); + xdna_mbox_dump_queue(i2x, record->active); } mutex_unlock(&mb->mbox_lock); @@ -747,42 +811,17 @@ struct mailbox_channel * xdna_mailbox_create_channel(struct mailbox *mb, const struct xdna_mailbox_chann_res *x2i, const struct xdna_mailbox_chann_res *i2x, - u32 iohub_int_addr, - int mb_irq) + u32 iohub_int_addr, int mb_irq, + enum xdna_mailbox_channel_type type) { struct mailbox_channel *mb_chann; int ret; #if defined(CONFIG_DEBUG_FS) struct mailbox_res_record *record; - int record_found = 0; - - mutex_lock(&mb->mbox_lock); - list_for_each_entry(record, &mb->res_records, re_entry) { - if (record->re_irq != mb_irq) - continue; - - record_found = 1; - break; - } - - if (record_found) - goto skip_record; - - record = kzalloc(sizeof(*record), GFP_KERNEL); - if (!record) { - mutex_unlock(&mb->mbox_lock); - return NULL; - } - - memcpy(&record->re_x2i, x2i, sizeof(*x2i)); - memcpy(&record->re_i2x, i2x, sizeof(*i2x)); - record->re_irq = mb_irq; - /* Record will be released when mailbox device destroy*/ - list_add_tail(&record->re_entry, &mb->res_records); - -skip_record: - mutex_unlock(&mb->mbox_lock); + record = xdna_mailbox_get_record(mb, mb_irq, x2i, i2x, type); + if (!record) + return NULL; #endif /* CONFIG_DEBUG_FS */ if (!is_power_of_2(x2i->rb_size) || !is_power_of_2(i2x->rb_size)) { @@ -795,6 +834,11 @@ xdna_mailbox_create_channel(struct mailbox *mb, return NULL; mb_chann->mb = mb; + mb_chann->type = type; +#ifdef AMDXDNA_DEVEL + if (type != MB_CHANNEL_MGMT && MB_FORCE_USER_POLL) + mb_chann->type = MB_CHANNEL_USER_POLL; +#endif mb_chann->msix_irq = mb_irq; mb_chann->iohub_int_addr = iohub_int_addr; memcpy(&mb_chann->res[CHAN_RES_X2I], x2i, sizeof(*x2i)); @@ -804,20 +848,17 @@ xdna_mailbox_create_channel(struct mailbox *mb, idr_init(&mb_chann->chan_idr); mb_chann->x2i_tail = mailbox_get_tailptr(mb_chann, CHAN_RES_X2I); mb_chann->i2x_head = mailbox_get_headptr(mb_chann, CHAN_RES_I2X); -#ifdef AMDXDNA_DEVEL - if (mb->polld) - goto skip_irq; -#endif + mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0); INIT_WORK(&mb_chann->rx_work, mailbox_rx_worker); - mb_chann->work_q = create_singlethread_workqueue(MAILBOX_NAME); + mb_chann->work_q = alloc_ordered_workqueue(MAILBOX_NAME, 0); if (!mb_chann->work_q) { MB_ERR(mb_chann, "Create workqueue failed"); goto free_and_out; } #ifdef AMDXDNA_DEVEL - if (mailbox_polling > 0) { + if (MB_PERIODIC_POLL) { /* Poll response every few ms. 
Good for bring up a new device */ timer_setup(&mb_chann->timer, mailbox_timer, 0); @@ -839,10 +880,18 @@ xdna_mailbox_create_channel(struct mailbox *mb, #endif mb_chann->bad_state = false; mutex_lock(&mb->mbox_lock); - list_add(&mb_chann->chann_entry, &mb->chann_list); + if (mb_chann->type == MB_CHANNEL_USER_POLL) + list_add_tail(&mb_chann->chann_entry, &mb->poll_chann_list); + else + list_add_tail(&mb_chann->chann_entry, &mb->chann_list); +#if defined(CONFIG_DEBUG_FS) + mb_chann->record = record; + record->active = 1; +#endif mutex_unlock(&mb->mbox_lock); - MB_DBG(mb_chann, "Mailbox channel created (irq: %d)", mb_chann->msix_irq); + MB_DBG(mb_chann, "Mailbox channel created type %d (irq: %d)", + mb_chann->type, mb_chann->msix_irq); return mb_chann; destroy_wq: @@ -859,13 +908,13 @@ int xdna_mailbox_destroy_channel(struct mailbox_channel *mb_chann) mutex_lock(&mb_chann->mb->mbox_lock); list_del(&mb_chann->chann_entry); +#if defined(CONFIG_DEBUG_FS) + mb_chann->record->active = 0; +#endif mutex_unlock(&mb_chann->mb->mbox_lock); #ifdef AMDXDNA_DEVEL - if (mb_chann->mb->polld) - goto free_msg; - - if (mailbox_polling > 0) + if (MB_PERIODIC_POLL) goto destroy_wq; #endif free_irq(mb_chann->msix_irq, mb_chann); @@ -876,13 +925,11 @@ int xdna_mailbox_destroy_channel(struct mailbox_channel *mb_chann) destroy_workqueue(mb_chann->work_q); /* We can clean up and release resources */ -#ifdef AMDXDNA_DEVEL -free_msg: -#endif idr_for_each(&mb_chann->chan_idr, mailbox_release_msg, mb_chann); idr_destroy(&mb_chann->chan_idr); - MB_DBG(mb_chann, "Mailbox channel destroyed, irq: %d", mb_chann->msix_irq); + MB_DBG(mb_chann, "Mailbox channel destroyed type %d irq: %d", + mb_chann->type, mb_chann->msix_irq); kfree(mb_chann); return 0; } @@ -893,10 +940,7 @@ void xdna_mailbox_stop_channel(struct mailbox_channel *mb_chann) return; #ifdef AMDXDNA_DEVEL - if (mb_chann->mb->polld) - return; - - if (mailbox_polling > 0) { + if (MB_PERIODIC_POLL) { timer_delete_sync(&mb_chann->timer); goto skip_irq; } @@ -927,11 +971,13 @@ struct mailbox *xdna_mailbox_create(struct device *dev, mutex_init(&mb->mbox_lock); INIT_LIST_HEAD(&mb->chann_list); -#ifdef AMDXDNA_DEVEL - if (mailbox_polling >= 0) - goto skip_polld; + INIT_LIST_HEAD(&mb->poll_chann_list); - /* Launch per device busy polling kthread */ + /* + * The polld kthread will only wakeup and handle those + * MB_CHANNEL_USER_POLL channels. If no thing to do, polld should + * just sleep. It is a per device kthread. 
+ */ mb->polld = kthread_run(mailbox_polld, mb, MAILBOX_NAME); if (IS_ERR(mb->polld)) { dev_err(mb->dev, "Failed to create polld ret %ld", PTR_ERR(mb->polld)); @@ -940,8 +986,6 @@ struct mailbox *xdna_mailbox_create(struct device *dev, } init_waitqueue_head(&mb->poll_wait); mb->sent_msg = false; -skip_polld: -#endif #if defined(CONFIG_DEBUG_FS) INIT_LIST_HEAD(&mb->res_records); @@ -965,18 +1009,11 @@ void xdna_mailbox_destroy(struct mailbox *mb) } done_release_record: #endif /* CONFIG_DEBUG_FS */ -#ifdef AMDXDNA_DEVEL - if (mailbox_polling >= 0) - goto skip_polld; - dev_dbg(mb->dev, "Stopping polld"); (void)kthread_stop(mb->polld); -skip_polld: -#endif mutex_lock(&mb->mbox_lock); - if (!list_empty(&mb->chann_list)) - WARN_ON("Channel not destroy"); + WARN_ONCE(!list_empty(&mb->chann_list), "Channel not destroy"); mutex_unlock(&mb->mbox_lock); mutex_destroy(&mb->mbox_lock); diff --git a/src/driver/amdxdna/amdxdna_mailbox.h b/src/driver/amdxdna/amdxdna_mailbox.h index 2e114644..8ac677d9 100644 --- a/src/driver/amdxdna/amdxdna_mailbox.h +++ b/src/driver/amdxdna/amdxdna_mailbox.h @@ -80,6 +80,13 @@ struct mailbox *xdna_mailbox_create(struct device *dev, */ void xdna_mailbox_destroy(struct mailbox *mailbox); +enum xdna_mailbox_channel_type { + MB_CHANNEL_MGMT = 0, + MB_CHANNEL_USER_NORMAL, + MB_CHANNEL_USER_POLL, + MB_CHANNEL_MAX_TYPE, +}; + /* * xdna_mailbox_create_channel() -- Create a mailbox channel instance * @@ -88,6 +95,7 @@ void xdna_mailbox_destroy(struct mailbox *mailbox); * @i2x: firmware to host mailbox resources * @xdna_mailbox_intr_reg: register addr of MSI-X interrupt * @mb_irq: Linux IRQ number associated with mailbox MSI-X interrupt vector index + * @type: Type of channel * * Return: If success, return a handle of mailbox channel. Otherwise, return NULL. */ @@ -96,7 +104,7 @@ xdna_mailbox_create_channel(struct mailbox *mailbox, const struct xdna_mailbox_chann_res *x2i, const struct xdna_mailbox_chann_res *i2x, u32 xdna_mailbox_intr_reg, - int mb_irq); + int mb_irq, enum xdna_mailbox_channel_type type); /* * xdna_mailbox_destroy_channel() -- destroy mailbox channel diff --git a/src/driver/amdxdna/amdxdna_mailbox_helper.h b/src/driver/amdxdna/amdxdna_mailbox_helper.h index 20c1fe7b..e1c3f16f 100644 --- a/src/driver/amdxdna/amdxdna_mailbox_helper.h +++ b/src/driver/amdxdna/amdxdna_mailbox_helper.h @@ -37,6 +37,8 @@ struct xdna_notify { .notify_cb = xdna_msg_cb, \ } +#define XDNA_STATUS_OFFSET(name) (offsetof(struct name##_resp, status) / sizeof(u32)) + int xdna_msg_cb(void *handle, const u32 *data, size_t size); int xdna_send_msg_wait(struct amdxdna_dev *xdna, struct mailbox_channel *chann, struct xdna_mailbox_msg *msg); diff --git a/src/driver/amdxdna/amdxdna_pci_drv.c b/src/driver/amdxdna/amdxdna_pci_drv.c index 9f6f7fed..fab4c295 100644 --- a/src/driver/amdxdna/amdxdna_pci_drv.c +++ b/src/driver/amdxdna/amdxdna_pci_drv.c @@ -13,7 +13,9 @@ #include "amdxdna_pci_drv.h" #include "amdxdna_sysfs.h" -#define AMDXDNA_AUTOSUSPEND_DELAY 5000 /* miliseconds */ +int autosuspend_ms = -1; +module_param(autosuspend_ms, int, 0644); +MODULE_PARM_DESC(autosuspend_ms, "runtime suspend delay in miliseconds. 
< 0: prevent it"); /* * There are platforms which share the same PCI device ID @@ -100,7 +102,7 @@ static int amdxdna_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto failed_dev_fini; } - pm_runtime_set_autosuspend_delay(dev, AMDXDNA_AUTOSUSPEND_DELAY); + pm_runtime_set_autosuspend_delay(dev, autosuspend_ms); pm_runtime_use_autosuspend(dev); pm_runtime_allow(dev); diff --git a/src/driver/amdxdna/amdxdna_tdr.c b/src/driver/amdxdna/amdxdna_tdr.c index f5640b3c..716fe198 100644 --- a/src/driver/amdxdna/amdxdna_tdr.c +++ b/src/driver/amdxdna/amdxdna_tdr.c @@ -6,8 +6,8 @@ #include "amdxdna_drm.h" #include "amdxdna_tdr.h" -int timeout_in_sec = 2; -module_param(timeout_in_sec, int, 0644); +uint timeout_in_sec = 2; +module_param(timeout_in_sec, uint, 0644); MODULE_PARM_DESC(timeout_in_sec, "Seconds to timeout and recovery, default 2; 0 - No TDR"); #define TDR_TIMEOUT_JIFF msecs_to_jiffies(timeout_in_sec * 1000) diff --git a/src/driver/amdxdna/amdxdna_trace.h b/src/driver/amdxdna/amdxdna_trace.h index 6d73c823..4620d2e4 100644 --- a/src/driver/amdxdna/amdxdna_trace.h +++ b/src/driver/amdxdna/amdxdna_trace.h @@ -130,6 +130,11 @@ DEFINE_EVENT(xdna_mbox_name_id, mbox_irq_handle, TP_ARGS(name, irq) ); +DEFINE_EVENT(xdna_mbox_name_id, mbox_rx_worker, + TP_PROTO(char *name, int irq), + TP_ARGS(name, irq) +); + DEFINE_EVENT(xdna_mbox_name_id, mbox_poll_handle, TP_PROTO(char *name, int irq), TP_ARGS(name, irq) diff --git a/src/driver/amdxdna/npu1_regs.c b/src/driver/amdxdna/npu1_regs.c index 3c442021..11a1e3fd 100644 --- a/src/driver/amdxdna/npu1_regs.c +++ b/src/driver/amdxdna/npu1_regs.c @@ -51,9 +51,6 @@ #define NPU1_RT_CFG_VAL_DEBUG_BO_DEFAULT 0 #define NPU1_RT_CFG_VAL_DEBUG_BO_LARGE 1 -#define NPU1_MPNPUCLK_FREQ_MAX 847 -#define NPU1_HCLK_FREQ_MAX 1600 - /*fill in the dpm clock frequencies */ const struct dpm_clk npu1_dpm_clk_table[] = { {400, 800}, @@ -109,9 +106,6 @@ const struct amdxdna_dev_priv npu1_dev_priv = { .value_enable = NPU1_RT_CFG_VAL_CLK_GATING_ON, .value_disable = NPU1_RT_CFG_VAL_CLK_GATING_OFF, }, - .smu_mpnpuclk_freq_max = NPU1_MPNPUCLK_FREQ_MAX, - .smu_hclk_freq_max = NPU1_HCLK_FREQ_MAX, - .smu_dpm_max = 7, .smu_rev = SMU_REVISION_V0, .smu_npu_dpm_clk_table = npu1_dpm_clk_table, .smu_npu_dpm_levels = ARRAY_SIZE(npu1_dpm_clk_table), diff --git a/src/driver/amdxdna/npu2_regs.c b/src/driver/amdxdna/npu2_regs.c index f84c726e..3a10be1e 100644 --- a/src/driver/amdxdna/npu2_regs.c +++ b/src/driver/amdxdna/npu2_regs.c @@ -3,159 +3,19 @@ * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. 
*/ -#include "drm_local/amdxdna_accel.h" -#include "aie2_pci.h" +#include "npu4_family.h" -/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */ -#define MPNPU_PUB_SEC_INTR 0x3010060 -#define MPNPU_PUB_PWRMGMT_INTR 0x3010064 -#define MPNPU_PUB_SCRATCH0 0x301006C -#define MPNPU_PUB_SCRATCH1 0x3010070 -#define MPNPU_PUB_SCRATCH2 0x3010074 -#define MPNPU_PUB_SCRATCH3 0x3010078 -#define MPNPU_PUB_SCRATCH4 0x301007C -#define MPNPU_PUB_SCRATCH5 0x3010080 -#define MPNPU_PUB_SCRATCH6 0x3010084 -#define MPNPU_PUB_SCRATCH7 0x3010088 -#define MPNPU_PUB_SCRATCH8 0x301008C -#define MPNPU_PUB_SCRATCH9 0x3010090 -#define MPNPU_PUB_SCRATCH10 0x3010094 -#define MPNPU_PUB_SCRATCH11 0x3010098 -#define MPNPU_PUB_SCRATCH12 0x301009C -#define MPNPU_PUB_SCRATCH13 0x30100A0 -#define MPNPU_PUB_SCRATCH14 0x30100A4 -#define MPNPU_PUB_SCRATCH15 0x30100A8 -#define MP0_C2PMSG_73 0x3810A24 -#define MP0_C2PMSG_123 0x3810AEC - -#define MP1_C2PMSG_0 0x3B10900 -#define MP1_C2PMSG_60 0x3B109F0 -#define MP1_C2PMSG_61 0x3B109F4 - -#define MPNPU_SRAM_X2I_MAILBOX_0 0x3600000 -#define MPNPU_SRAM_X2I_MAILBOX_15 0x361E000 -#define MPNPU_SRAM_X2I_MAILBOX_31 0x363E000 -#define MPNPU_SRAM_I2X_MAILBOX_31 0x363F000 - -#define MMNPU_APERTURE0_BASE 0x3000000 -#define MMNPU_APERTURE1_BASE 0x3600000 -#define MMNPU_APERTURE3_BASE 0x3810000 -#define MMNPU_APERTURE4_BASE 0x3B10000 - -/* PCIe BAR Index for NPU2 */ -#define NPU2_REG_BAR_INDEX 0 -#define NPU2_MBOX_BAR_INDEX 0 -#define NPU2_PSP_BAR_INDEX 4 -#define NPU2_SMU_BAR_INDEX 5 -#define NPU2_SRAM_BAR_INDEX 2 -/* Associated BARs and Apertures */ -#define NPU2_REG_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU2_MBOX_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU2_PSP_BAR_BASE MMNPU_APERTURE3_BASE -#define NPU2_SMU_BAR_BASE MMNPU_APERTURE4_BASE -#define NPU2_SRAM_BAR_BASE MMNPU_APERTURE1_BASE - -#define NPU2_RT_CFG_TYPE_CLK_GATING 1 -#define NPU2_RT_CFG_TYPE_HCLK_GATING 2 -#define NPU2_RT_CFG_TYPE_PWR_GATING 3 -#define NPU2_RT_CFG_TYPE_L1IMU_GATING 4 -#define NPU2_RT_CFG_TYPE_PDI_LOAD 5 -#define NPU2_RT_CFG_TYPE_DEBUG_BO 10 - -#define NPU2_RT_CFG_VAL_CLK_GATING_OFF 0 -#define NPU2_RT_CFG_VAL_CLK_GATING_ON 1 - -#define NPU2_RT_CFG_VAL_PDI_LOAD_MGMT 0 -#define NPU2_RT_CFG_VAL_PDI_LOAD_APP 1 - -#define NPU2_RT_CFG_VAL_DEBUG_BO_DEFAULT 0 -#define NPU2_RT_CFG_VAL_DEBUG_BO_LARGE 1 - -#define NPU2_MPNPUCLK_FREQ_MAX 1267 -#define NPU2_HCLK_FREQ_MAX 1800 - -const struct dpm_clk npu2_dpm_clk_table[DPM_LEVEL_MAX] = { - {396, 792}, - {600, 1056}, - {792, 1152}, - {975, 1267}, - {975, 1267}, - {1056, 1408}, - {1152, 1584}, - {1267, 1800} -}; - -const struct rt_config npu2_rt_cfg[] = { - {NPU2_RT_CFG_TYPE_PDI_LOAD, NPU2_RT_CFG_VAL_PDI_LOAD_APP}, - {NPU2_RT_CFG_TYPE_DEBUG_BO, NPU2_RT_CFG_VAL_DEBUG_BO_LARGE}, -}; - -const u32 npu2_clk_gating_types[] = { - NPU2_RT_CFG_TYPE_CLK_GATING, - NPU2_RT_CFG_TYPE_HCLK_GATING, - NPU2_RT_CFG_TYPE_PWR_GATING, - NPU2_RT_CFG_TYPE_L1IMU_GATING, -}; +/* NPU2 is the prototype of NPU4. It will be obsoleted in near future. 
*/ const struct amdxdna_dev_priv npu2_dev_priv = { .fw_path = "amdnpu/17f0_00/npu.sbin", .protocol_major = 0x6, .protocol_minor = 0x6, - .rt_config = npu2_rt_cfg, - .num_rt_cfg = ARRAY_SIZE(npu2_rt_cfg), - .col_align = COL_ALIGN_NATURE, - .mbox_dev_addr = NPU2_MBOX_BAR_BASE, - .mbox_size = 0, /* Use BAR size */ - .sram_dev_addr = NPU2_SRAM_BAR_BASE, - .sram_offs = { - DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU2_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), - DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU2_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), - }, - .psp_regs_off = { - DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU2_PSP, MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU2_REG, MPNPU_PUB_SCRATCH3), - DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU2_REG, MPNPU_PUB_SCRATCH4), - DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU2_REG, MPNPU_PUB_SCRATCH9), - DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU2_PSP, MP0_C2PMSG_73), - DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU2_PSP, MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU2_REG, MPNPU_PUB_SCRATCH3), - }, - .smu_regs_off = { - DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU2_SMU, MP1_C2PMSG_0), - DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU2_SMU, MP1_C2PMSG_60), - DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU2_SMU, MMNPU_APERTURE4_BASE), - DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU2_SMU, MP1_C2PMSG_61), - DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU2_SMU, MP1_C2PMSG_60), - }, - .clk_gating = { - .types = npu2_clk_gating_types, - .num_types = ARRAY_SIZE(npu2_clk_gating_types), - .value_enable = NPU2_RT_CFG_VAL_CLK_GATING_ON, - .value_disable = NPU2_RT_CFG_VAL_CLK_GATING_OFF, - }, - .smu_mpnpuclk_freq_max = NPU2_MPNPUCLK_FREQ_MAX, - .smu_hclk_freq_max = NPU2_HCLK_FREQ_MAX, - .smu_dpm_max = 7, - .smu_rev = SMU_REVISION_V1, - .smu_npu_dpm_clk_table = npu2_dpm_clk_table, - .smu_npu_dpm_levels = ARRAY_SIZE(npu2_dpm_clk_table), -#ifdef AMDXDNA_DEVEL - .priv_load_cfg = {NPU2_RT_CFG_TYPE_PDI_LOAD, NPU2_RT_CFG_VAL_PDI_LOAD_MGMT}, -#endif + NPU4_COMMON_DEV_PRIV, }; const struct amdxdna_dev_info dev_npu2_info = { - .reg_bar = NPU2_REG_BAR_INDEX, - .mbox_bar = NPU2_MBOX_BAR_INDEX, - .sram_bar = NPU2_SRAM_BAR_INDEX, - .psp_bar = NPU2_PSP_BAR_INDEX, - .smu_bar = NPU2_SMU_BAR_INDEX, - .first_col = 0, - .dev_mem_buf_shift = 15, /* 32 KiB aligned */ - .dev_mem_base = AIE2_DEVM_BASE, - .dev_mem_size = AIE2_DEVM_SIZE, .vbnv = "RyzenAI-npu2", - .device_type = AMDXDNA_DEV_TYPE_KMQ, .dev_priv = &npu2_dev_priv, - .ops = &aie2_ops, /* NPU2 can share NPU1's callback */ + NPU4_COMMON_DEV_INFO, }; diff --git a/src/driver/amdxdna/npu4_family.h b/src/driver/amdxdna/npu4_family.h new file mode 100644 index 00000000..9da6d971 --- /dev/null +++ b/src/driver/amdxdna/npu4_family.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024, Advanced Micro Devices, Inc. 
+ */ + +#ifndef _NPU4_FAMILY_H_ +#define _NPU4_FAMILY_H_ + +#include "drm_local/amdxdna_accel.h" +#include "aie2_pci.h" + +/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */ +#define MPNPU_PUB_SEC_INTR 0x3010060 +#define MPNPU_PUB_PWRMGMT_INTR 0x3010064 +#define MPNPU_PUB_SCRATCH0 0x301006C +#define MPNPU_PUB_SCRATCH1 0x3010070 +#define MPNPU_PUB_SCRATCH2 0x3010074 +#define MPNPU_PUB_SCRATCH3 0x3010078 +#define MPNPU_PUB_SCRATCH4 0x301007C +#define MPNPU_PUB_SCRATCH5 0x3010080 +#define MPNPU_PUB_SCRATCH6 0x3010084 +#define MPNPU_PUB_SCRATCH7 0x3010088 +#define MPNPU_PUB_SCRATCH8 0x301008C +#define MPNPU_PUB_SCRATCH9 0x3010090 +#define MPNPU_PUB_SCRATCH10 0x3010094 +#define MPNPU_PUB_SCRATCH11 0x3010098 +#define MPNPU_PUB_SCRATCH12 0x301009C +#define MPNPU_PUB_SCRATCH13 0x30100A0 +#define MPNPU_PUB_SCRATCH14 0x30100A4 +#define MPNPU_PUB_SCRATCH15 0x30100A8 +#define MP0_C2PMSG_73 0x3810A24 +#define MP0_C2PMSG_123 0x3810AEC + +#define MP1_C2PMSG_0 0x3B10900 +#define MP1_C2PMSG_60 0x3B109F0 +#define MP1_C2PMSG_61 0x3B109F4 + +#define MPNPU_SRAM_X2I_MAILBOX_0 0x3600000 +#define MPNPU_SRAM_X2I_MAILBOX_15 0x361E000 +#define MPNPU_SRAM_X2I_MAILBOX_31 0x363E000 +#define MPNPU_SRAM_I2X_MAILBOX_31 0x363F000 + +#define MMNPU_APERTURE0_BASE 0x3000000 +#define MMNPU_APERTURE1_BASE 0x3600000 +#define MMNPU_APERTURE3_BASE 0x3810000 +#define MMNPU_APERTURE4_BASE 0x3B10000 + +/* PCIe BAR Index for NPU4 */ +#define NPU4_REG_BAR_INDEX 0 +#define NPU4_MBOX_BAR_INDEX 0 +#define NPU4_PSP_BAR_INDEX 4 +#define NPU4_SMU_BAR_INDEX 5 +#define NPU4_SRAM_BAR_INDEX 2 +/* Associated BARs and Apertures */ +#define NPU4_REG_BAR_BASE MMNPU_APERTURE0_BASE +#define NPU4_MBOX_BAR_BASE MMNPU_APERTURE0_BASE +#define NPU4_PSP_BAR_BASE MMNPU_APERTURE3_BASE +#define NPU4_SMU_BAR_BASE MMNPU_APERTURE4_BASE +#define NPU4_SRAM_BAR_BASE MMNPU_APERTURE1_BASE + +#define NPU4_RT_CFG_TYPE_CLK_GATING 1 +#define NPU4_RT_CFG_TYPE_HCLK_GATING 2 +#define NPU4_RT_CFG_TYPE_PWR_GATING 3 +#define NPU4_RT_CFG_TYPE_L1IMU_GATING 4 +#define NPU4_RT_CFG_TYPE_PDI_LOAD 5 +#define NPU4_RT_CFG_TYPE_DEBUG_BO 10 + +#define NPU4_RT_CFG_VAL_CLK_GATING_OFF 0 +#define NPU4_RT_CFG_VAL_CLK_GATING_ON 1 + +#define NPU4_RT_CFG_VAL_PDI_LOAD_MGMT 0 +#define NPU4_RT_CFG_VAL_PDI_LOAD_APP 1 + +#define NPU4_RT_CFG_VAL_DEBUG_BO_DEFAULT 0 +#define NPU4_RT_CFG_VAL_DEBUG_BO_LARGE 1 + +#define NPU4_INIT_RT_CFG_NUM 2 +#define NPU4_CLK_GATING_CFG_NUM 4 + +extern const struct dpm_clk npu4_dpm_clk_table[DPM_LEVEL_MAX]; +extern const struct rt_config npu4_rt_cfg[NPU4_INIT_RT_CFG_NUM]; +extern const u32 npu4_clk_gating_types[NPU4_CLK_GATING_CFG_NUM]; + +#define NPU4_COMMON_DEV_PRIV \ + .rt_config = npu4_rt_cfg, \ + .num_rt_cfg = ARRAY_SIZE(npu4_rt_cfg), \ + .priv_load_cfg = {NPU4_RT_CFG_TYPE_PDI_LOAD, NPU4_RT_CFG_VAL_PDI_LOAD_MGMT}, \ + .col_align = COL_ALIGN_NATURE, \ + .mbox_dev_addr = NPU4_MBOX_BAR_BASE, \ + .mbox_size = 0, /* Use BAR size */ \ + .sram_dev_addr = NPU4_SRAM_BAR_BASE, \ + .sram_offs = { \ + DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), \ + DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), \ + }, \ + .psp_regs_off = { \ + DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU4_PSP, MP0_C2PMSG_123), \ + DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU4_REG, MPNPU_PUB_SCRATCH3), \ + DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU4_REG, MPNPU_PUB_SCRATCH4), \ + DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU4_REG, MPNPU_PUB_SCRATCH9), \ + DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU4_PSP, MP0_C2PMSG_73), \ + DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU4_PSP, 
MP0_C2PMSG_123), \ + DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU4_REG, MPNPU_PUB_SCRATCH3), \ + }, \ + .smu_regs_off = { \ + DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU4_SMU, MP1_C2PMSG_0), \ + DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU4_SMU, MP1_C2PMSG_60), \ + DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU4_SMU, MMNPU_APERTURE4_BASE), \ + DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU4_SMU, MP1_C2PMSG_61), \ + DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU4_SMU, MP1_C2PMSG_60), \ + }, \ + .clk_gating = { \ + .types = npu4_clk_gating_types, \ + .num_types = ARRAY_SIZE(npu4_clk_gating_types), \ + .value_enable = NPU4_RT_CFG_VAL_CLK_GATING_ON, \ + .value_disable = NPU4_RT_CFG_VAL_CLK_GATING_OFF, \ + }, \ + .smu_rev = SMU_REVISION_V1, \ + .smu_npu_dpm_clk_table = npu4_dpm_clk_table, \ + .smu_npu_dpm_levels = ARRAY_SIZE(npu4_dpm_clk_table) + +#define NPU4_COMMON_DEV_INFO \ + .reg_bar = NPU4_REG_BAR_INDEX, \ + .mbox_bar = NPU4_MBOX_BAR_INDEX, \ + .sram_bar = NPU4_SRAM_BAR_INDEX, \ + .psp_bar = NPU4_PSP_BAR_INDEX, \ + .smu_bar = NPU4_SMU_BAR_INDEX, \ + .first_col = 0, \ + .dev_mem_buf_shift = 15, /* 32 KiB aligned */ \ + .dev_mem_base = AIE2_DEVM_BASE, \ + .dev_mem_size = AIE2_DEVM_SIZE, \ + .device_type = AMDXDNA_DEV_TYPE_KMQ, \ + .ops = &aie2_ops + +#endif /* _NPU4_FAMILY_H_ */ diff --git a/src/driver/amdxdna/npu4_regs.c b/src/driver/amdxdna/npu4_regs.c index b86958e3..50d0bb10 100644 --- a/src/driver/amdxdna/npu4_regs.c +++ b/src/driver/amdxdna/npu4_regs.c @@ -3,76 +3,7 @@ * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. */ -#include "drm_local/amdxdna_accel.h" -#include "aie2_pci.h" - -/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */ -#define MPNPU_PUB_SEC_INTR 0x3010060 -#define MPNPU_PUB_PWRMGMT_INTR 0x3010064 -#define MPNPU_PUB_SCRATCH0 0x301006C -#define MPNPU_PUB_SCRATCH1 0x3010070 -#define MPNPU_PUB_SCRATCH2 0x3010074 -#define MPNPU_PUB_SCRATCH3 0x3010078 -#define MPNPU_PUB_SCRATCH4 0x301007C -#define MPNPU_PUB_SCRATCH5 0x3010080 -#define MPNPU_PUB_SCRATCH6 0x3010084 -#define MPNPU_PUB_SCRATCH7 0x3010088 -#define MPNPU_PUB_SCRATCH8 0x301008C -#define MPNPU_PUB_SCRATCH9 0x3010090 -#define MPNPU_PUB_SCRATCH10 0x3010094 -#define MPNPU_PUB_SCRATCH11 0x3010098 -#define MPNPU_PUB_SCRATCH12 0x301009C -#define MPNPU_PUB_SCRATCH13 0x30100A0 -#define MPNPU_PUB_SCRATCH14 0x30100A4 -#define MPNPU_PUB_SCRATCH15 0x30100A8 -#define MP0_C2PMSG_73 0x3810A24 -#define MP0_C2PMSG_123 0x3810AEC - -#define MP1_C2PMSG_0 0x3B10900 -#define MP1_C2PMSG_60 0x3B109F0 -#define MP1_C2PMSG_61 0x3B109F4 - -#define MPNPU_SRAM_X2I_MAILBOX_0 0x3600000 -#define MPNPU_SRAM_X2I_MAILBOX_15 0x361E000 -#define MPNPU_SRAM_X2I_MAILBOX_31 0x363E000 -#define MPNPU_SRAM_I2X_MAILBOX_31 0x363F000 - -#define MMNPU_APERTURE0_BASE 0x3000000 -#define MMNPU_APERTURE1_BASE 0x3600000 -#define MMNPU_APERTURE3_BASE 0x3810000 -#define MMNPU_APERTURE4_BASE 0x3B10000 - -/* PCIe BAR Index for NPU4 */ -#define NPU4_REG_BAR_INDEX 0 -#define NPU4_MBOX_BAR_INDEX 0 -#define NPU4_PSP_BAR_INDEX 4 -#define NPU4_SMU_BAR_INDEX 5 -#define NPU4_SRAM_BAR_INDEX 2 -/* Associated BARs and Apertures */ -#define NPU4_REG_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU4_MBOX_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU4_PSP_BAR_BASE MMNPU_APERTURE3_BASE -#define NPU4_SMU_BAR_BASE MMNPU_APERTURE4_BASE -#define NPU4_SRAM_BAR_BASE MMNPU_APERTURE1_BASE - -#define NPU4_RT_CFG_TYPE_CLK_GATING 1 -#define NPU4_RT_CFG_TYPE_HCLK_GATING 2 -#define NPU4_RT_CFG_TYPE_PWR_GATING 3 -#define NPU4_RT_CFG_TYPE_L1IMU_GATING 4 -#define NPU4_RT_CFG_TYPE_PDI_LOAD 5 -#define NPU4_RT_CFG_TYPE_DEBUG_BO 10 - -#define 
NPU4_RT_CFG_VAL_CLK_GATING_OFF 0 -#define NPU4_RT_CFG_VAL_CLK_GATING_ON 1 - -#define NPU4_RT_CFG_VAL_PDI_LOAD_MGMT 0 -#define NPU4_RT_CFG_VAL_PDI_LOAD_APP 1 - -#define NPU4_RT_CFG_VAL_DEBUG_BO_DEFAULT 0 -#define NPU4_RT_CFG_VAL_DEBUG_BO_LARGE 1 - -#define NPU4_MPNPUCLK_FREQ_MAX 1267 -#define NPU4_HCLK_FREQ_MAX 1800 +#include "npu4_family.h" const struct dpm_clk npu4_dpm_clk_table[DPM_LEVEL_MAX] = { {396, 792}, @@ -85,12 +16,12 @@ const struct dpm_clk npu4_dpm_clk_table[DPM_LEVEL_MAX] = { {1267, 1800} }; -const struct rt_config npu4_rt_cfg[] = { +const struct rt_config npu4_rt_cfg[NPU4_INIT_RT_CFG_NUM] = { {NPU4_RT_CFG_TYPE_PDI_LOAD, NPU4_RT_CFG_VAL_PDI_LOAD_APP}, {NPU4_RT_CFG_TYPE_DEBUG_BO, NPU4_RT_CFG_VAL_DEBUG_BO_LARGE}, }; -const u32 npu4_clk_gating_types[] = { +const u32 npu4_clk_gating_types[NPU4_CLK_GATING_CFG_NUM] = { NPU4_RT_CFG_TYPE_CLK_GATING, NPU4_RT_CFG_TYPE_HCLK_GATING, NPU4_RT_CFG_TYPE_PWR_GATING, @@ -101,61 +32,11 @@ const struct amdxdna_dev_priv npu4_dev_priv = { .fw_path = "amdnpu/17f0_10/npu.sbin", .protocol_major = 0x6, .protocol_minor = 0x6, - .rt_config = npu4_rt_cfg, - .num_rt_cfg = ARRAY_SIZE(npu4_rt_cfg), - .col_align = COL_ALIGN_NATURE, - .mbox_dev_addr = NPU4_MBOX_BAR_BASE, - .mbox_size = 0, /* Use BAR size */ - .sram_dev_addr = NPU4_SRAM_BAR_BASE, - .sram_offs = { - DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), - DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), - }, - .psp_regs_off = { - DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU4_PSP, MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU4_REG, MPNPU_PUB_SCRATCH3), - DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU4_REG, MPNPU_PUB_SCRATCH4), - DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU4_REG, MPNPU_PUB_SCRATCH9), - DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU4_PSP, MP0_C2PMSG_73), - DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU4_PSP, MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU4_REG, MPNPU_PUB_SCRATCH3), - }, - .smu_regs_off = { - DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU4_SMU, MP1_C2PMSG_0), - DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU4_SMU, MP1_C2PMSG_60), - DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU4_SMU, MMNPU_APERTURE4_BASE), - DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU4_SMU, MP1_C2PMSG_61), - DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU4_SMU, MP1_C2PMSG_60), - }, - .clk_gating = { - .types = npu4_clk_gating_types, - .num_types = ARRAY_SIZE(npu4_clk_gating_types), - .value_enable = NPU4_RT_CFG_VAL_CLK_GATING_ON, - .value_disable = NPU4_RT_CFG_VAL_CLK_GATING_OFF, - }, - .smu_mpnpuclk_freq_max = NPU4_MPNPUCLK_FREQ_MAX, - .smu_hclk_freq_max = NPU4_HCLK_FREQ_MAX, - .smu_dpm_max = 7, - .smu_rev = SMU_REVISION_V1, - .smu_npu_dpm_clk_table = npu4_dpm_clk_table, - .smu_npu_dpm_levels = ARRAY_SIZE(npu4_dpm_clk_table), -#ifdef AMDXDNA_DEVEL - .priv_load_cfg = {NPU4_RT_CFG_TYPE_PDI_LOAD, NPU4_RT_CFG_VAL_PDI_LOAD_MGMT}, -#endif + NPU4_COMMON_DEV_PRIV, }; const struct amdxdna_dev_info dev_npu4_info = { - .reg_bar = NPU4_REG_BAR_INDEX, - .mbox_bar = NPU4_MBOX_BAR_INDEX, - .sram_bar = NPU4_SRAM_BAR_INDEX, - .psp_bar = NPU4_PSP_BAR_INDEX, - .smu_bar = NPU4_SMU_BAR_INDEX, - .first_col = 0, - .dev_mem_buf_shift = 15, /* 32 KiB aligned */ - .dev_mem_base = AIE2_DEVM_BASE, - .dev_mem_size = AIE2_DEVM_SIZE, .vbnv = "RyzenAI-npu4", - .device_type = AMDXDNA_DEV_TYPE_KMQ, .dev_priv = &npu4_dev_priv, - .ops = &aie2_ops, /* NPU4 can share NPU1's callback */ + NPU4_COMMON_DEV_INFO, }; diff --git a/src/driver/amdxdna/npu5_regs.c b/src/driver/amdxdna/npu5_regs.c index ed7d81df..7f0050d1 100644 --- a/src/driver/amdxdna/npu5_regs.c +++ 
b/src/driver/amdxdna/npu5_regs.c @@ -3,159 +3,17 @@ * Copyright (C) 2024, Advanced Micro Devices, Inc. */ -#include "drm_local/amdxdna_accel.h" -#include "aie2_pci.h" - -/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */ -#define MPNPU_PUB_SEC_INTR 0x3010060 -#define MPNPU_PUB_PWRMGMT_INTR 0x3010064 -#define MPNPU_PUB_SCRATCH0 0x301006C -#define MPNPU_PUB_SCRATCH1 0x3010070 -#define MPNPU_PUB_SCRATCH2 0x3010074 -#define MPNPU_PUB_SCRATCH3 0x3010078 -#define MPNPU_PUB_SCRATCH4 0x301007C -#define MPNPU_PUB_SCRATCH5 0x3010080 -#define MPNPU_PUB_SCRATCH6 0x3010084 -#define MPNPU_PUB_SCRATCH7 0x3010088 -#define MPNPU_PUB_SCRATCH8 0x301008C -#define MPNPU_PUB_SCRATCH9 0x3010090 -#define MPNPU_PUB_SCRATCH10 0x3010094 -#define MPNPU_PUB_SCRATCH11 0x3010098 -#define MPNPU_PUB_SCRATCH12 0x301009C -#define MPNPU_PUB_SCRATCH13 0x30100A0 -#define MPNPU_PUB_SCRATCH14 0x30100A4 -#define MPNPU_PUB_SCRATCH15 0x30100A8 -#define MP0_C2PMSG_73 0x3810A24 -#define MP0_C2PMSG_123 0x3810AEC - -#define MP1_C2PMSG_0 0x3B10900 -#define MP1_C2PMSG_60 0x3B109F0 -#define MP1_C2PMSG_61 0x3B109F4 - -#define MPNPU_SRAM_X2I_MAILBOX_0 0x3600000 -#define MPNPU_SRAM_X2I_MAILBOX_15 0x361E000 -#define MPNPU_SRAM_X2I_MAILBOX_31 0x363E000 -#define MPNPU_SRAM_I2X_MAILBOX_31 0x363F000 - -#define MMNPU_APERTURE0_BASE 0x3000000 -#define MMNPU_APERTURE1_BASE 0x3600000 -#define MMNPU_APERTURE3_BASE 0x3810000 -#define MMNPU_APERTURE4_BASE 0x3B10000 - -/* PCIe BAR Index for NPU5 */ -#define NPU5_REG_BAR_INDEX 0 -#define NPU5_MBOX_BAR_INDEX 0 -#define NPU5_PSP_BAR_INDEX 4 -#define NPU5_SMU_BAR_INDEX 5 -#define NPU5_SRAM_BAR_INDEX 2 -/* Associated BARs and Apertures */ -#define NPU5_REG_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU5_MBOX_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU5_PSP_BAR_BASE MMNPU_APERTURE3_BASE -#define NPU5_SMU_BAR_BASE MMNPU_APERTURE4_BASE -#define NPU5_SRAM_BAR_BASE MMNPU_APERTURE1_BASE - -#define NPU5_RT_CFG_TYPE_CLK_GATING 1 -#define NPU5_RT_CFG_TYPE_HCLK_GATING 2 -#define NPU5_RT_CFG_TYPE_PWR_GATING 3 -#define NPU5_RT_CFG_TYPE_L1IMU_GATING 4 -#define NPU5_RT_CFG_TYPE_PDI_LOAD 5 -#define NPU5_RT_CFG_TYPE_DEBUG_BO 10 - -#define NPU5_RT_CFG_VAL_CLK_GATING_OFF 0 -#define NPU5_RT_CFG_VAL_CLK_GATING_ON 1 - -#define NPU5_RT_CFG_VAL_PDI_LOAD_MGMT 0 -#define NPU5_RT_CFG_VAL_PDI_LOAD_APP 1 - -#define NPU5_RT_CFG_VAL_DEBUG_BO_DEFAULT 0 -#define NPU5_RT_CFG_VAL_DEBUG_BO_LARGE 1 - -#define NPU5_MPNPUCLK_FREQ_MAX 1267 -#define NPU5_HCLK_FREQ_MAX 1800 - -const struct dpm_clk npu5_dpm_clk_table[DPM_LEVEL_MAX] = { - {396, 792}, - {600, 1056}, - {792, 1152}, - {975, 1267}, - {975, 1267}, - {1056, 1408}, - {1152, 1584}, - {1267, 1800} -}; - -const struct rt_config npu5_rt_cfg[] = { - {NPU5_RT_CFG_TYPE_PDI_LOAD, NPU5_RT_CFG_VAL_PDI_LOAD_APP}, - {NPU5_RT_CFG_TYPE_DEBUG_BO, NPU5_RT_CFG_VAL_DEBUG_BO_LARGE}, -}; - -const u32 npu5_clk_gating_types[] = { - NPU5_RT_CFG_TYPE_CLK_GATING, - NPU5_RT_CFG_TYPE_HCLK_GATING, - NPU5_RT_CFG_TYPE_PWR_GATING, - NPU5_RT_CFG_TYPE_L1IMU_GATING, -}; +#include "npu4_family.h" const struct amdxdna_dev_priv npu5_dev_priv = { .fw_path = "amdnpu/17f0_11/npu.sbin", .protocol_major = 0x6, .protocol_minor = 0x6, - .rt_config = npu5_rt_cfg, - .num_rt_cfg = ARRAY_SIZE(npu5_rt_cfg), - .col_align = COL_ALIGN_NATURE, - .mbox_dev_addr = NPU5_MBOX_BAR_BASE, - .mbox_size = 0, /* Use BAR size */ - .sram_dev_addr = NPU5_SRAM_BAR_BASE, - .sram_offs = { - DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU5_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), - DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU5_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), - }, - 
.psp_regs_off = { - DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU5_PSP, MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU5_REG, MPNPU_PUB_SCRATCH3), - DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU5_REG, MPNPU_PUB_SCRATCH4), - DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU5_REG, MPNPU_PUB_SCRATCH9), - DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU5_PSP, MP0_C2PMSG_73), - DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU5_PSP, MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU5_REG, MPNPU_PUB_SCRATCH3), - }, - .smu_regs_off = { - DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU5_SMU, MP1_C2PMSG_0), - DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU5_SMU, MP1_C2PMSG_60), - DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU5_SMU, MMNPU_APERTURE4_BASE), - DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU5_SMU, MP1_C2PMSG_61), - DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU5_SMU, MP1_C2PMSG_60), - }, - .clk_gating = { - .types = npu5_clk_gating_types, - .num_types = ARRAY_SIZE(npu5_clk_gating_types), - .value_enable = NPU5_RT_CFG_VAL_CLK_GATING_ON, - .value_disable = NPU5_RT_CFG_VAL_CLK_GATING_OFF, - }, - .smu_mpnpuclk_freq_max = NPU5_MPNPUCLK_FREQ_MAX, - .smu_hclk_freq_max = NPU5_HCLK_FREQ_MAX, - .smu_dpm_max = 7, - .smu_rev = SMU_REVISION_V1, - .smu_npu_dpm_clk_table = npu5_dpm_clk_table, - .smu_npu_dpm_levels = ARRAY_SIZE(npu5_dpm_clk_table), -#ifdef AMDXDNA_DEVEL - .priv_load_cfg = {NPU5_RT_CFG_TYPE_PDI_LOAD, NPU5_RT_CFG_VAL_PDI_LOAD_MGMT}, -#endif + NPU4_COMMON_DEV_PRIV, }; const struct amdxdna_dev_info dev_npu5_info = { - .reg_bar = NPU5_REG_BAR_INDEX, - .mbox_bar = NPU5_MBOX_BAR_INDEX, - .sram_bar = NPU5_SRAM_BAR_INDEX, - .psp_bar = NPU5_PSP_BAR_INDEX, - .smu_bar = NPU5_SMU_BAR_INDEX, - .first_col = 0, - .dev_mem_buf_shift = 15, /* 32 KiB aligned */ - .dev_mem_base = AIE2_DEVM_BASE, - .dev_mem_size = AIE2_DEVM_SIZE, .vbnv = "RyzenAI-npu5", - .device_type = AMDXDNA_DEV_TYPE_KMQ, .dev_priv = &npu5_dev_priv, - .ops = &aie2_ops, /* NPU5 can share NPU1's callback */ + NPU4_COMMON_DEV_INFO, }; diff --git a/src/driver/amdxdna/npu6_regs.c b/src/driver/amdxdna/npu6_regs.c index f418896a..efa01321 100644 --- a/src/driver/amdxdna/npu6_regs.c +++ b/src/driver/amdxdna/npu6_regs.c @@ -3,159 +3,17 @@ * Copyright (C) 2024, Advanced Micro Devices, Inc. 
*/ -#include "drm_local/amdxdna_accel.h" -#include "aie2_pci.h" - -/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */ -#define MPNPU_PUB_SEC_INTR 0x3010060 -#define MPNPU_PUB_PWRMGMT_INTR 0x3010064 -#define MPNPU_PUB_SCRATCH0 0x301006C -#define MPNPU_PUB_SCRATCH1 0x3010070 -#define MPNPU_PUB_SCRATCH2 0x3010074 -#define MPNPU_PUB_SCRATCH3 0x3010078 -#define MPNPU_PUB_SCRATCH4 0x301007C -#define MPNPU_PUB_SCRATCH5 0x3010080 -#define MPNPU_PUB_SCRATCH6 0x3010084 -#define MPNPU_PUB_SCRATCH7 0x3010088 -#define MPNPU_PUB_SCRATCH8 0x301008C -#define MPNPU_PUB_SCRATCH9 0x3010090 -#define MPNPU_PUB_SCRATCH10 0x3010094 -#define MPNPU_PUB_SCRATCH11 0x3010098 -#define MPNPU_PUB_SCRATCH12 0x301009C -#define MPNPU_PUB_SCRATCH13 0x30100A0 -#define MPNPU_PUB_SCRATCH14 0x30100A4 -#define MPNPU_PUB_SCRATCH15 0x30100A8 -#define MP0_C2PMSG_73 0x3810A24 -#define MP0_C2PMSG_123 0x3810AEC - -#define MP1_C2PMSG_0 0x3B10900 -#define MP1_C2PMSG_60 0x3B109F0 -#define MP1_C2PMSG_61 0x3B109F4 - -#define MPNPU_SRAM_X2I_MAILBOX_0 0x3600000 -#define MPNPU_SRAM_X2I_MAILBOX_15 0x361E000 -#define MPNPU_SRAM_X2I_MAILBOX_31 0x363E000 -#define MPNPU_SRAM_I2X_MAILBOX_31 0x363F000 - -#define MMNPU_APERTURE0_BASE 0x3000000 -#define MMNPU_APERTURE1_BASE 0x3600000 -#define MMNPU_APERTURE3_BASE 0x3810000 -#define MMNPU_APERTURE4_BASE 0x3B10000 - -/* PCIe BAR Index for NPU6 */ -#define NPU6_REG_BAR_INDEX 0 -#define NPU6_MBOX_BAR_INDEX 0 -#define NPU6_PSP_BAR_INDEX 4 -#define NPU6_SMU_BAR_INDEX 5 -#define NPU6_SRAM_BAR_INDEX 2 -/* Associated BARs and Apertures */ -#define NPU6_REG_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU6_MBOX_BAR_BASE MMNPU_APERTURE0_BASE -#define NPU6_PSP_BAR_BASE MMNPU_APERTURE3_BASE -#define NPU6_SMU_BAR_BASE MMNPU_APERTURE4_BASE -#define NPU6_SRAM_BAR_BASE MMNPU_APERTURE1_BASE - -#define NPU6_RT_CFG_TYPE_CLK_GATING 1 -#define NPU6_RT_CFG_TYPE_HCLK_GATING 2 -#define NPU6_RT_CFG_TYPE_PWR_GATING 3 -#define NPU6_RT_CFG_TYPE_L1IMU_GATING 4 -#define NPU6_RT_CFG_TYPE_PDI_LOAD 5 -#define NPU6_RT_CFG_TYPE_DEBUG_BO 10 - -#define NPU6_RT_CFG_VAL_CLK_GATING_OFF 0 -#define NPU6_RT_CFG_VAL_CLK_GATING_ON 1 - -#define NPU6_RT_CFG_VAL_PDI_LOAD_MGMT 0 -#define NPU6_RT_CFG_VAL_PDI_LOAD_APP 1 - -#define NPU6_RT_CFG_VAL_DEBUG_BO_DEFAULT 0 -#define NPU6_RT_CFG_VAL_DEBUG_BO_LARGE 1 - -#define NPU6_MPNPUCLK_FREQ_MAX 1267 -#define NPU6_HCLK_FREQ_MAX 1800 - -const struct dpm_clk npu6_dpm_clk_table[DPM_LEVEL_MAX] = { - {396, 792}, - {600, 1056}, - {792, 1152}, - {975, 1267}, - {975, 1267}, - {1056, 1408}, - {1152, 1584}, - {1267, 1800} -}; - -const struct rt_config npu6_rt_cfg[] = { - {NPU6_RT_CFG_TYPE_PDI_LOAD, NPU6_RT_CFG_VAL_PDI_LOAD_APP}, - {NPU6_RT_CFG_TYPE_DEBUG_BO, NPU6_RT_CFG_VAL_DEBUG_BO_LARGE}, -}; - -const u32 npu6_clk_gating_types[] = { - NPU6_RT_CFG_TYPE_CLK_GATING, - NPU6_RT_CFG_TYPE_HCLK_GATING, - NPU6_RT_CFG_TYPE_PWR_GATING, - NPU6_RT_CFG_TYPE_L1IMU_GATING, -}; +#include "npu4_family.h" const struct amdxdna_dev_priv npu6_dev_priv = { - .fw_path = "amdnpu/17f0_20/npu.sbin", + .fw_path = "amdnpu/17f0_10/npu.sbin", .protocol_major = 0x6, .protocol_minor = 0x6, - .rt_config = npu6_rt_cfg, - .num_rt_cfg = ARRAY_SIZE(npu6_rt_cfg), - .col_align = COL_ALIGN_NATURE, - .mbox_dev_addr = NPU6_MBOX_BAR_BASE, - .mbox_size = 0, /* Use BAR size */ - .sram_dev_addr = NPU6_SRAM_BAR_BASE, - .sram_offs = { - DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU6_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), - DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU6_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), - }, - .psp_regs_off = { - DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU6_PSP, 
MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU6_REG, MPNPU_PUB_SCRATCH3), - DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU6_REG, MPNPU_PUB_SCRATCH4), - DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU6_REG, MPNPU_PUB_SCRATCH9), - DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU6_PSP, MP0_C2PMSG_73), - DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU6_PSP, MP0_C2PMSG_123), - DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU6_REG, MPNPU_PUB_SCRATCH3), - }, - .smu_regs_off = { - DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU6_SMU, MP1_C2PMSG_0), - DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU6_SMU, MP1_C2PMSG_60), - DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU6_SMU, MMNPU_APERTURE4_BASE), - DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU6_SMU, MP1_C2PMSG_61), - DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU6_SMU, MP1_C2PMSG_60), - }, - .clk_gating = { - .types = npu6_clk_gating_types, - .num_types = ARRAY_SIZE(npu6_clk_gating_types), - .value_enable = NPU6_RT_CFG_VAL_CLK_GATING_ON, - .value_disable = NPU6_RT_CFG_VAL_CLK_GATING_OFF, - }, - .smu_mpnpuclk_freq_max = NPU6_MPNPUCLK_FREQ_MAX, - .smu_hclk_freq_max = NPU6_HCLK_FREQ_MAX, - .smu_dpm_max = 7, - .smu_rev = SMU_REVISION_V1, - .smu_npu_dpm_clk_table = npu6_dpm_clk_table, - .smu_npu_dpm_levels = ARRAY_SIZE(npu6_dpm_clk_table), -#ifdef AMDXDNA_DEVEL - .priv_load_cfg = {NPU6_RT_CFG_TYPE_PDI_LOAD, NPU6_RT_CFG_VAL_PDI_LOAD_MGMT}, -#endif + NPU4_COMMON_DEV_PRIV, }; const struct amdxdna_dev_info dev_npu6_info = { - .reg_bar = NPU6_REG_BAR_INDEX, - .mbox_bar = NPU6_MBOX_BAR_INDEX, - .sram_bar = NPU6_SRAM_BAR_INDEX, - .psp_bar = NPU6_PSP_BAR_INDEX, - .smu_bar = NPU6_SMU_BAR_INDEX, - .first_col = 0, - .dev_mem_buf_shift = 15, /* 32 KiB aligned */ - .dev_mem_base = AIE2_DEVM_BASE, - .dev_mem_size = AIE2_DEVM_SIZE, .vbnv = "RyzenAI-npu6", - .device_type = AMDXDNA_DEV_TYPE_KMQ, .dev_priv = &npu6_dev_priv, - .ops = &aie2_ops, + NPU4_COMMON_DEV_INFO, }; diff --git a/src/driver/doc/amdnpu.rst b/src/driver/doc/amdnpu.rst new file mode 100644 index 00000000..7ea04261 --- /dev/null +++ b/src/driver/doc/amdnpu.rst @@ -0,0 +1,277 @@ +.. SPDX-License-Identifier: GPL-2.0-only + +========= + AMD NPU +========= + +:Copyright: |copy| 2024 Advanced Micro Devices, Inc. +:Author: Sonal Santan + +Overview +======== + +AMD NPU (Neural Processing Unit) is a multi-user AI inference accelerator +integrated into AMD client APU. NPU enables efficient execution of Machine +Learning applications like CNN, LLM, etc. NPU is based on +`AMD XDNA Architecture`_. NPU is managed by **amdxdna** driver. + + +Hardware Description +==================== + +AMD NPU consists of the following hardware components: + +AMD XDNA Array +-------------- + +AMD XDNA Array comprises of 2D array of compute and memory tiles built with +`AMD AI Engine Technology`_. Each column has 4 rows of compute tiles and 1 +row of memory tile. Each compute tile contains a VLIW processor with its own +dedicated program and data memory. The memory tile acts as L2 memory. The 2D +array can be partitioned at a column boundary creating a spatially isolated +partition which can be bound to a workload context. + +Each column also has dedicated DMA engines to move data between host DDR and +memory tile. + +AMD Phoenix and AMD Hawk Point client NPU have a 4x5 topology, i.e., 4 rows of +compute tiles arranged into 5 columns. AMD Strix Point client APU have 4x8 +topology, i.e., 4 rows of compute tiles arranged into 8 columns. + +Shared L2 Memory +................ + +The single row of memory tiles create a pool of software managed on chip L2 +memory. DMA engines are used to move data between host DDR and memory tiles. 
+AMD Phoenix and AMD Hawk Point NPUs have a total of 2560 KB of L2 memory. +AMD Strix Point NPU has a total of 4096 KB of L2 memory. + +Microcontroller +--------------- + +A microcontroller runs NPU Firmware which is responsible for command processing, +XDNA Array partition setup, XDNA Array configuration, workload context +management and workload orchestration. + +NPU Firmware uses a dedicated instance of an isolated non-privileged context +called ERT to service each workload context. ERT is also used to execute user +provided ``ctrlcode`` associated with the workload context. + +NPU Firmware uses a single isolated privileged context called MERT to service +management commands from the amdxdna driver. + +Mailboxes +......... + +The microcontroller and amdxdna driver use a privileged channel for management +tasks like setting up of contexts, telemetry, query, error handling, setting up +user channel, etc. As mentioned before, privileged channel requests are +serviced by MERT. The privileged channel is bound to a single mailbox. + +The microcontroller and amdxdna driver use a dedicated user channel per +workload context. The user channel is primarily used for submitting work to +the NPU. As mentioned before, a user channel requests are serviced by an +instance of ERT. Each user channel is bound to its own dedicated mailbox. + +PCIe EP +------- + +NPU is visible to the x86 as a PCIe device with multiple BARs and some MSI-X interrupt +vectors. NPU uses a dedicated high bandwidth SoC level fabric for reading +writing into host memory. Each instance of ERT gets its own dedicated MSI-X +interrupt. MERT gets a single instance of MSI-X interrupt. + +The number of PCIe BARs varies depending on the specific device. +Based on their functions, PCIe BARs can generally be categorized into the +following types. + +* PSP BAR: Expose the AMD PSP (Platform Security Processor) function +* SMU BAR: Expose the AMD SMU (System Management Unit) function +* SRAM BAR: Expose ring buffers for the mailbox +* Mailbox BAR: Expose the mailbox control registers (head, tail and ISR registers etc.) +* Public Register BAR: Expose public registers + +On specific devices, the above-mentioned BAR type might be combined into a single physical PCIe BAR. +Or a module might require two physical PCIe BARs to be fully functional. +For example, + +* On AMD Phoenix device, PSP, SMU, Public Register BARs are on PCIe BAR index 0. +* On AMD Strix Point device, Mailbox and Public Register BARs are on PCIe BAR index 0. + The PSP has some registers in PCIe BAR index 0 (Public Register BAR) and PCIe BAR index 4 (PSP BAR). + +Process Isolation Hardware +-------------------------- + +As explained before, XDNA Array can be dynamically divided into isolated +spatial partitions, each of which may have one or more columns. The spatial +partition is setup by programming the column isolation registers by the +microcontroller. Each spatial partition is associated with a PASID which is +also programmed by the microcontroller. Hence multiple spatial partitions in +the NPU can make concurrent host access protected by PASID. + +The NPU FW itself uses microcontroller MMU enforced isolated contexts for +servicing user and privileged channel requests. + + +Mixed Spatial and Temporal Scheduling +===================================== + +AMD XDNA architecture supports mixed spatial and temporal (time sharing) +scheduling of 2D array. This means that spatial partitions may be setup and +torn down dynamically to accommodate various workloads. 
A *spatial* partition +may be *exclusively* bound to one workload context while another partition may +be *temporarily* bound to more than one workload contexts. The microcontroller +updates the PASID for a temporarily shared partition to match the context that +has been bound to the partition at any moment. + +Resource Solver +--------------- + +The Resource Solver component of the amdxdna driver manages the allocation +of 2D array among various workloads. Every workload describes the number +of columns required to run the NPU binary in its metadata. The Resource Solver +component uses hints passed by the workload and its own heuristics to +decide 2D array (re)partition strategy and mapping of workloads for spatial and +temporal sharing of columns. The FW enforces the context-to-column(s) resource +binding decisions made by the Resource Solver. + +AMD Phoenix and AMD Hawk Point client NPU can support 6 concurrent workload +contexts. AMD Strix Point can support 16 concurrent workload contexts. + + +Application Binaries +==================== + +A NPU application workload is comprised of two separate binaries which are +generated by the NPU compiler. + +1. AMD XDNA Array overlay, which is used to configure a NPU spatial partition. + The overlay contains instructions for setting up the stream switch + configuration and ELF for the compute tiles. The overlay is loaded on the + spatial partition bound to the workload by the associated ERT instance. + Refer to the + `Versal Adaptive SoC AIE-ML Architecture Manual (AM020)`_ for more details. + +2. ``ctrlcode``, used for orchestrating the overlay loaded on the spatial + partition. ``ctrlcode`` is executed by the ERT running in protected mode on + the microcontroller in the context of the workload. ``ctrlcode`` is made up + of a sequence of opcodes named ``XAie_TxnOpcode``. Refer to the + `AI Engine Run Time`_ for more details. + + +Special Host Buffers +==================== + +Per-context Instruction Buffer +------------------------------ + +Every workload context uses a host resident 64 MB buffer which is memory +mapped into the ERT instance created to service the workload. The ``ctrlcode`` +used by the workload is copied into this special memory. This buffer is +protected by PASID like all other input/output buffers used by that workload. +Instruction buffer is also mapped into the user space of the workload. + +Global Privileged Buffer +------------------------ + +In addition, the driver also allocates a single buffer for maintenance tasks +like recording errors from MERT. This global buffer uses the global IOMMU +domain and is only accessible by MERT. + + +High-level Use Flow +=================== + +Here are the steps to run a workload on AMD NPU: + +1. Compile the workload into an overlay and a ``ctrlcode`` binary. +2. Userspace opens a context in the driver and provides the overlay. +3. The driver checks with the Resource Solver for provisioning a set of columns + for the workload. +4. The driver then asks MERT to create a context on the device with the desired + columns. +5. MERT then creates an instance of ERT. MERT also maps the Instruction Buffer + into ERT memory. +6. The userspace then copies the ``ctrlcode`` to the Instruction Buffer. +7. Userspace then creates a command buffer with pointers to input, output, and + instruction buffer; it then submits command buffer with the driver and goes + to sleep waiting for completion. +8. The driver sends the command over the Mailbox to ERT. +9. 
ERT *executes* the ``ctrlcode`` in the instruction buffer. +10. Execution of the ``ctrlcode`` kicks off DMAs to and from the host DDR while + AMD XDNA Array is running. +11. When ERT reaches end of ``ctrlcode``, it raises an MSI-X to send completion + signal to the driver which then wakes up the waiting workload. + + +Boot Flow +========= + +amdxdna driver uses PSP to securely load signed NPU FW and kick off the boot +of the NPU microcontroller. amdxdna driver then waits for the alive signal in +a special location on BAR 0. The NPU is switched off during SoC suspend and +turned on after resume where the NPU FW is reloaded, and the handshake is +performed again. + + +Userspace components +==================== + +Compiler +-------- + +Peano is an LLVM based open-source compiler for AMD XDNA Array compute tile +available at: +https://github.com/Xilinx/llvm-aie + +The open-source IREE compiler supports graph compilation of ML models for AMD +NPU and uses Peano underneath. It is available at: +https://github.com/nod-ai/iree-amd-aie + +Usermode Driver (UMD) +--------------------- + +The open-source XRT runtime stack interfaces with amdxdna kernel driver. XRT +can be found at: +https://github.com/Xilinx/XRT + +The open-source XRT shim for NPU is can be found at: +https://github.com/amd/xdna-driver + + +DMA Operation +============= + +DMA operation instructions are encoded in the ``ctrlcode`` as +``XAIE_IO_BLOCKWRITE`` opcode. When ERT executes ``XAIE_IO_BLOCKWRITE``, DMA +operations between host DDR and L2 memory are effected. + + +Error Handling +============== + +When MERT detects an error in AMD XDNA Array, it pauses execution for that +workload context and sends an asynchronous message to the driver over the +privileged channel. The driver then sends a buffer pointer to MERT to capture +the register states for the partition bound to faulting workload context. The +driver then decodes the error by reading the contents of the buffer pointer. + + +Telemetry +========= + +MERT can report various kinds of telemetry information like the following: +* L1 interrupt counter +* DMA counter +* Deep Sleep counter +* etc. + + +References +========== + +- `AMD XDNA Architecture `_ +- `AMD AI Engine Technology `_ +- `Peano `_ +- `Versal Adaptive SoC AIE-ML Architecture Manual (AM020) `_ +- `AI Engine Run Time `_ diff --git a/src/driver/tools/npu_perf_analyze.sh b/src/driver/tools/npu_perf_analyze.sh new file mode 100755 index 00000000..d32f2f3c --- /dev/null +++ b/src/driver/tools/npu_perf_analyze.sh @@ -0,0 +1,171 @@ +#!/usr/bin/bash + +# SPDX-License-Identifier: Apache-2.0 +# Copyright (C) 2024, Advanced Micro Devices, Inc. + +usage() +{ + cat << USAGE_END +Usage: $0 [options] event1_pattern event2_pattern +Options: + -file/-f: Trace log file for parsing + -range/-r: [entry_index_begin:entry_index_end), e.g.: 100:200 +Parsing trace log file to find time interval from event1 to event2. +event pattern examples: + "sdt_xrt:ioctl_exit: \(.+\) arg1=DRM_IOCTL_AMDXDNA_WAIT_CMD" +USAGE_END +} + +read_timestamps() +{ + timestamps=() + + while IFS= read -r line; do + if [ "$line" != "" ]; then + timestamps+=($(("10#${line}"))) + fi + done <<< `egrep "$1" ${perf_out_file} | awk '{print $4}' | tr -d '.' 
| tr -d ':'` + echo ${timestamps[@]} +} + +if [ "$#" -eq 0 ]; then + usage + exit 1 +fi + +range_start=-1 +range_end=-1 +event1="" +event2="" +perf_out_file="perf.converted.out" +while [ $# -gt 0 ]; do + case "$1" in + -range | -r) + st=$(echo $2 | cut -d':' -f1) + end=$(echo $2 | cut -d':' -f2) + if [ "${st}" != "" ]; then + if [[ "${st}" =~ ^[0-9]+$ ]]; then + range_start=$(("10#${st}")) + else + echo Invalid range start: ${st} + exit 1 + fi + fi + if [ "${end}" != "" ]; then + if [[ "${end}" =~ ^[0-9]+$ ]]; then + range_end=$(("10#${end}")) + else + echo Invalid range end: ${end} + exit 1 + fi + fi + shift + ;; + -file | -f) + perf_out_file=$2 + shift + ;; + *) + break + ;; + esac + shift +done +event1=$1 +event2=$2 + +if [ ! -f ${perf_out_file} ]; then + echo "${perf_out_file} is not found" + exit 1 +else + echo "Parsing ${perf_out_file}..." +fi + +event1_ts=($(read_timestamps "${event1}")) +event1_ts_num=${#event1_ts[@]} +if [ ${event1_ts_num} -eq 0 ]; then + echo No events found for ${event1} + exit 1 +fi +echo "${event1_ts_num} events for: '${event1}'" + +event2_ts=($(read_timestamps "${event2}")) +event2_ts_num=${#event2_ts[@]} +if [ ${event2_ts_num} -eq 0 ]; then + echo No events found for ${event2} + exit 1 +fi +echo "${event2_ts_num} events for: '${event2}'" + +# Caculate time difference between two events +diffs_event1=() +diffs_event2=() +diffs=() +i1=0 +i2=0 +while [ 1 ]; do + while [[ ${i2} -lt ${event2_ts_num} && ${event2_ts[i2]} -lt ${event1_ts[i1]} ]]; do + (( i2++ )) + done + if [ ${i2} -eq ${event2_ts_num} ]; then + break + fi + + while [[ ${i1} -lt ${event1_ts_num} && ${event1_ts[i1]} -lt ${event2_ts[i2]} ]]; do + (( i1++ )) + done + if [ ${i1} -eq ${event1_ts_num} ]; then + break + fi + + + (( i1-- )) + diffs_event1+=( $((event1_ts[i1])) ) + diffs_event2+=( $((event2_ts[i2])) ) + diffs+=( $((event2_ts[i2] - event1_ts[i1])) ) + (( i1++ )) + (( i2++ )) +done +#echo ${event1_ts[@]} > /tmp/e1 +#echo ${event2_ts[@]} > /tmp/e2 +#echo ${diffs[@]} > /tmp/diffs + + +# Data mining within specified range + +if [ ${range_start} -eq -1 ]; then + range_start=0 +fi +if [ ${range_end} -eq -1 ]; then + range_end=${#diffs[@]} +fi +if [ ${range_end} -eq ${range_start} ]; then + echo Range start and end are the same + exit 1 +elif [ ${range_end} -lt ${range_start} ]; then + echo Range start after end + exit 1 +fi + +total=0 +largest=${diffs[${range_start}]} +largest_idx=${range_start} +smallest=${diffs[${range_start}]} +smallest_idx=${range_start} +for (( i=${range_start}; i<${range_end}; i++ )); do + total=$(( total + diffs[i] )) + if [[ ${largest} -lt ${diffs[i]} ]]; then + largest=${diffs[i]} + largest_idx=${i} + fi + if [[ ${smallest} -gt ${diffs[i]} ]]; then + smallest=${diffs[i]} + smallest_idx=${i} + fi +done + +# Output result +total_events=$(( range_end - range_start )) +echo Average over ${total_events} events: $(( total / total_events ))ns +echo Largest: ${largest}ns@${largest_idx}: event1=${diffs_event1[largest_idx]}, event2=${diffs_event2[largest_idx]} +echo Smallest: ${smallest}ns@${smallest_idx}: event1=${diffs_event1[smallest_idx]}, event2=${diffs_event2[smallest_idx]} diff --git a/src/driver/tools/npu_perf_trace.sh b/src/driver/tools/npu_perf_trace.sh new file mode 100755 index 00000000..97892757 --- /dev/null +++ b/src/driver/tools/npu_perf_trace.sh @@ -0,0 +1,136 @@ +#! /bin/bash -- + +# SPDX-License-Identifier: Apache-2.0 +# Copyright (C) 2024, Advanced Micro Devices, Inc. 
+ +#set -eu + +bold=$(tput bold) +normal=$(tput sgr0) +red=$(tput setaf 1) +yellow=$(tput setaf 3) +blue=$(tput setaf 4) + +trace_info() +{ + what=$1 + echo -e "[INFO]: $what" +} + +trace_warn() +{ + what=$1 + echo -e "[${yellow}WARNING${normal}]: $what" +} + +trace_error() +{ + what=$1 + echo -e "[${red}ERROR${normal}]: $what" 1>&2 + exit 1 +} + +add_sdt_xrt() +{ + perf list | grep sdt_xrt > /dev/null && sdt_pre_enabled=1 + if [[ $sdt_pre_enabled == 1 ]]; then + remove_sdt_xrt + #trace_warn "XRT SDT had beed added. Skip..." + #return + fi + + # Add XRT SDT events + perf buildid-cache --add $xrt_libs + # Convert SDT events to trace points + perf probe --add=sdt_xrt:* &> /dev/null + + trace_info "XRT SDT is added" +} + +remove_sdt_xrt() +{ + #if [[ $sdt_pre_enabled == 1 ]]; then + # trace_warn "XRT SDT was pre added. Skip..." + # return + #fi + + # Delete SDT trace points + perf probe --del=sdt_xrt:* &> /dev/null + # Remove XRT STD events + perf buildid-cache --remove $xrt_libs + trace_info "XRT SDT is removed" +} + +## -------- trace flow start -------- +if [ "$EUID" -ne 0 ]; then + trace_error "Please run as root" +fi + +# Global variables +sdt_pre_enabled=0 +xrt_lib_prefix="/opt/xilinx/xrt/lib" +while [ $# -gt 0 ]; do + case "$1" in + -libdir | -l) + xrt_lib_prefix=$2 + shift + ;; + *) + break + ;; + esac + shift +done +accel_debugfs="/sys/kernel/debug/accel" +xrt_libs="${xrt_lib_prefix}/libxrt_coreutil.so,${xrt_lib_prefix}/libxrt_driver_xdna.so" +perf_record_args="-e amdxdna_trace:* " +perf_record_args+="-e sdt_xrt:* " +exec_cmd="" + +perf --version > /dev/null + +# Argument parsing +exec_cmd=$@ +if [[ -z "$exec_cmd" ]]; then + trace_error "Please put execute application at the end" +fi + +dev="" +ioctl_sed_expr="" +for dir in $(ls $accel_debugfs); do + accel_fs_name=$(cat ${accel_debugfs}/$dir/name) + driver_name=$(echo $accel_fs_name | awk '{print $1}') + if [[ ! "$driver_name" =~ "amdxdna" ]]; then + continue + fi + + if [[ ! -f ${accel_debugfs}/$dir/ioctl_id ]]; then + trace_error "${accel_debugfs}/$dir/ioctl_id not exist. amdxdna driver too old?" 
+ fi + + dev=$(echo $accel_fs_name | awk -F'[ =]' '{print $3}') + ioctl_sed_expr=$(awk -F ':' '{print "s/"$1"/"$2"/g"}' ${accel_debugfs}/$dir/ioctl_id) +done + +if [[ -z "$dev" ]]; then + trace_error "No device found" +fi + +trace_info "Found NPU device $dev at ${accel_debugfs}" + +add_sdt_xrt + +command="perf record $perf_record_args -a $exec_cmd" +trace_info "$command" +eval $command + +tmp_file=/tmp/perf.out +# convert timestamp from second to microsecond to avoid floating numbers +#perf script | awk '{ $4=$4*1000000; print }' > ${tmp_file} +perf script --reltime --ns > ${tmp_file} +# replace IOCTL cmd number to name +sed "$ioctl_sed_expr" "${tmp_file}" > perf.converted.out +rm -rf ${tmp_file} + +remove_sdt_xrt +## -------- trace flow end -------- diff --git a/src/include/uapi/drm_local/amdxdna_accel.h b/src/include/uapi/drm_local/amdxdna_accel.h index 134ef87b..fe41f6ee 100644 --- a/src/include/uapi/drm_local/amdxdna_accel.h +++ b/src/include/uapi/drm_local/amdxdna_accel.h @@ -17,7 +17,6 @@ extern "C" { #define AMDXDNA_DRIVER_MAJOR 1 #define AMDXDNA_DRIVER_MINOR 0 -#define AMDXDNA_INVALID_CMD_HANDLE (~0UL) #define AMDXDNA_INVALID_ADDR (~0UL) #define AMDXDNA_INVALID_CTX_HANDLE 0 #define AMDXDNA_INVALID_BO_HANDLE 0 @@ -49,8 +48,6 @@ enum amdxdna_drm_ioctl_id { DRM_AMDXDNA_WAIT_CMD, DRM_AMDXDNA_GET_INFO, DRM_AMDXDNA_SET_STATE, - DRM_AMDXDNA_SUBMIT_WAIT, - DRM_AMDXDNA_SUBMIT_SIGNAL, DRM_AMDXDNA_NUM_IOCTLS }; @@ -273,8 +270,6 @@ struct amdxdna_drm_exec_cmd { * @seq: sequence number of the command returned by execute command. * * Wait a command specified by seq to be completed. - * Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot - * to submit a new command. */ struct amdxdna_drm_wait_cmd { __u32 hwctx; @@ -461,6 +456,7 @@ enum amdxdna_power_mode_type { POWER_MODE_LOW, /**< Set frequency to lowest DPM */ POWER_MODE_MEDIUM, /**< Set frequency to medium DPM */ POWER_MODE_HIGH, /**< Set frequency to highest DPM */ + POWER_MODE_TURBO, /**< More power, more performance */ }; /** @@ -542,20 +538,6 @@ struct amdxdna_drm_set_state { __u64 buffer; /* in */ }; - -/** - * struct amdxdna_drm_syncobjs - Signal or wait on array of DRM timelined sync objects. - * @handles: Array of handles of sync objects. - * @points: Array of time points for each sync objects. - * @count: Number of elements in the above array. 
- */ -struct amdxdna_drm_syncobjs { - __u64 handles; /* in */ - __u64 points; /* in */ - __u32 count; /* in */ - __u32 pad; -}; - #define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \ DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \ struct amdxdna_drm_create_hwctx) @@ -596,14 +578,6 @@ struct amdxdna_drm_syncobjs { DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \ struct amdxdna_drm_set_state) -#define DRM_IOCTL_AMDXDNA_SUBMIT_WAIT \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_WAIT, \ - struct amdxdna_drm_syncobjs) - -#define DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL \ - DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_SIGNAL, \ - struct amdxdna_drm_syncobjs) - #if defined(__cplusplus) } /* extern c end */ #endif diff --git a/src/shim/bo.cpp b/src/shim/bo.cpp index 4fdeebe5..36142f63 100644 --- a/src/shim/bo.cpp +++ b/src/shim/bo.cpp @@ -230,7 +230,6 @@ alloc_bo() amdxdna_drm_get_bo_info bo_info = {}; get_drm_bo_info(m_pdev, boh, &bo_info); m_bo = std::make_unique(*this, bo_info); - m_pdev.insert_hdl_mapping(boh, reinterpret_cast(this)); } void @@ -248,7 +247,6 @@ void bo:: free_bo() { - m_pdev.remove_hdl_mapping(get_drm_bo_handle()); m_bo.reset(); } diff --git a/src/shim/device.cpp b/src/shim/device.cpp index 278298d5..cef74159 100644 --- a/src/shim/device.cpp +++ b/src/shim/device.cpp @@ -262,7 +262,7 @@ struct partition_info for (uint32_t i = 0; i < data_size; i++) { const auto& entry = data[i]; - xrt_core::query::aie_partition_info::data new_entry; + xrt_core::query::aie_partition_info::data new_entry{}; new_entry.metadata.id = std::to_string(entry.context_id); new_entry.metadata.xclbin_uuid = "N/A"; new_entry.start_col = entry.start_col; diff --git a/src/shim/fence.cpp b/src/shim/fence.cpp index b67ce0ef..26cb428f 100644 --- a/src/shim/fence.cpp +++ b/src/shim/fence.cpp @@ -107,28 +107,35 @@ wait_syncobj_available(const shim_xdna::pdev& dev, } void -submit_wait_syncobjs(const shim_xdna::pdev& dev, +submit_wait_syncobjs(const shim_xdna::pdev& dev, const shim_xdna::hw_ctx *ctx, const uint32_t* sobj_hdls, const uint64_t* points, uint32_t num) { wait_syncobj_available(dev, sobj_hdls, points, num); - amdxdna_drm_syncobjs swsobj = { - .handles = reinterpret_cast(sobj_hdls), - .points = reinterpret_cast(points), - .count = num, + amdxdna_drm_exec_cmd ecmd = { + .hwctx = ctx->get_slotidx(), + .type = AMDXDNA_CMD_SUBMIT_DEPENDENCY, + .cmd_handles = reinterpret_cast(sobj_hdls), + .args = reinterpret_cast(points), + .cmd_count = num, + .arg_count = num, }; - dev.ioctl(DRM_IOCTL_AMDXDNA_SUBMIT_WAIT, &swsobj); + dev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); } void -submit_signal_syncobj(const shim_xdna::pdev& dev, uint32_t sobj_hdl, uint64_t point) +submit_signal_syncobj(const shim_xdna::pdev& dev, const shim_xdna::hw_ctx *ctx, + uint32_t sobj_hdl, uint64_t point) { - amdxdna_drm_syncobjs sssobj = { - .handles = reinterpret_cast(&sobj_hdl), - .points = reinterpret_cast(&point), - .count = 1, + amdxdna_drm_exec_cmd ecmd = { + .hwctx = ctx->get_slotidx(), + .type = AMDXDNA_CMD_SUBMIT_SIGNAL, + .cmd_handles = reinterpret_cast(&sobj_hdl), + .args = reinterpret_cast(&point), + .cmd_count = 1, + .arg_count = 1, }; - dev.ioctl(DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL, &sssobj); + dev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); } } @@ -199,25 +206,15 @@ clone() const return std::make_unique(*this); } -void +uint64_t fence:: -wait(bool async) const +wait_next_state() const { std::lock_guard guard(m_lock); - auto st = m_state; - if (st != initial_state && m_signaled) + if (m_state != initial_state && m_signaled) 
shim_err(-EINVAL, "Can't wait on fence that has been signaled before."); - - st++; - shim_debug("%s for command fence %d@%ld", - async ? "Submitting wait" : "Waiting", m_syncobj_hdl, st); - if (async) - submit_wait_syncobjs(m_pdev, &m_syncobj_hdl, &st, 1); - else - wait_syncobj_done(m_pdev, m_syncobj_hdl, st); - - m_state = st; + return ++m_state; } // Timeout value is ignored for now. @@ -225,57 +222,54 @@ void fence:: wait(uint32_t timeout_ms) const { - wait(false); + auto st = signal_next_state(); + shim_debug("Waiting for command fence %d@%ld", m_syncobj_hdl, st); + wait_syncobj_done(m_pdev, m_syncobj_hdl, st); } void fence:: -submit_wait() const +submit_wait(const hw_ctx *ctx) const { - wait(true); + auto st = signal_next_state(); + shim_debug("Submitting wait for command fence %d@%ld", m_syncobj_hdl, st); + submit_wait_syncobjs(m_pdev, ctx, &m_syncobj_hdl, &st, 1); } -void +uint64_t fence:: -signal(bool async) const +signal_next_state() const { std::lock_guard guard(m_lock); - auto st = m_state; - if (st != initial_state && !m_signaled) + if (m_state != initial_state && !m_signaled) shim_err(-EINVAL, "Can't signal fence that has been waited before."); - - if (st == initial_state) + if (m_state == initial_state) m_signaled = true; - - st++; - shim_debug("%s command fence %d@%ld", - async ? "Submitting signal" : "Signaling", m_syncobj_hdl, st); - if (async) - submit_signal_syncobj(m_pdev, m_syncobj_hdl, st); - else - signal_syncobj(m_pdev, m_syncobj_hdl, st); - - m_state = st; + return ++m_state; } void fence:: signal() const { - signal(false); + auto st = signal_next_state(); + shim_debug("Signaling command fence %d@%ld", m_syncobj_hdl, st); + signal_syncobj(m_pdev, m_syncobj_hdl, st); } void fence:: -submit_signal() const +submit_signal(const hw_ctx *ctx) const { - signal(true); + auto st = signal_next_state(); + shim_debug("Submitting signal command fence %d@%ld", m_syncobj_hdl, st); + submit_signal_syncobj(m_pdev, ctx, m_syncobj_hdl, st); } void fence:: -submit_wait(const pdev& dev, const std::vector& fences) +submit_wait(const pdev& dev, const hw_ctx *ctx, const std::vector& fences) { constexpr int max_fences = 1024; uint32_t hdls[max_fences]; @@ -287,12 +281,13 @@ submit_wait(const pdev& dev, const std::vector& fences) for (auto f : fences) { auto fh = static_cast(f); - std::lock_guard guard(fh->m_lock); + auto st = fh->wait_next_state(); + shim_debug("Waiting for command fence %d@%ld", fh->m_syncobj_hdl, st); hdls[i] = fh->m_syncobj_hdl; - pts[i] = ++fh->m_state; + pts[i] = st; i++; } - submit_wait_syncobjs(dev, hdls, pts, i); + submit_wait_syncobjs(dev, ctx, hdls, pts, i); } } // shim_xdna diff --git a/src/shim/fence.h b/src/shim/fence.h index fe3ff295..1b6cdbca 100644 --- a/src/shim/fence.h +++ b/src/shim/fence.h @@ -4,6 +4,7 @@ #ifndef _FENCE_XDNA_H_ #define _FENCE_XDNA_H_ +#include "hwctx.h" #include "device.h" #include "shared.h" @@ -41,20 +42,20 @@ class fence : public xrt_core::fence_handle public: void - submit_wait() const; + submit_wait(const hw_ctx*) const; static void - submit_wait(const pdev& dev, const std::vector& fences); + submit_wait(const pdev& dev, const hw_ctx*, const std::vector& fences); void - submit_signal() const; + submit_signal(const hw_ctx*) const; private: - void - wait(bool async) const; + uint64_t + wait_next_state() const; - void - signal(bool async) const; + uint64_t + signal_next_state() const; const pdev& m_pdev; const std::unique_ptr m_import; diff --git a/src/shim/hwq.cpp b/src/shim/hwq.cpp index 2bda0db1..14a31a52 100644 --- 
a/src/shim/hwq.cpp +++ b/src/shim/hwq.cpp @@ -5,6 +5,7 @@ #include "hwq.h" #include "fence.h" #include "shim_debug.h" +#include "core/common/trace.h" namespace { @@ -82,60 +83,29 @@ void hw_q:: submit_command(xrt_core::buffer_handle *cmd) { - auto pkt = get_chained_command_pkt(cmd); - if (!m_pdev.is_force_unchained_command() || !pkt) { - issue_command(cmd); - return; - } - - // HACK: Forcibly unchain commands, to be removed later. - // - // Forcibly unchain commands and send to driver one by one. - auto payload = get_ert_cmd_chain_data(pkt); - for (size_t i = 0; i < payload->command_count; i++) { - auto boh = reinterpret_cast( - m_pdev.lookup_hdl_mapping(static_cast(payload->data[i]))); - issue_command(boh); - } + issue_command(cmd); } int hw_q:: -wait_command(xrt_core::buffer_handle *cmd, uint32_t timeout_ms) const +poll_command(xrt_core::buffer_handle *cmd) const { - auto pkt = get_chained_command_pkt(cmd); - if (!m_pdev.is_force_unchained_command() || !pkt) - return wait_cmd(m_pdev, m_hwctx, cmd, timeout_ms); - - // HACK: handling forcibly unchained commands, to be removed later. - // - // Wait for the last unchained command. - auto payload = get_ert_cmd_chain_data(pkt); - auto last_boh = reinterpret_cast( - m_pdev.lookup_hdl_mapping(static_cast(payload->data[payload->command_count-1]))); - auto ret = wait_cmd(m_pdev, m_hwctx, last_boh, timeout_ms); - if (ret != 1) - return ret; - - // Check the state of the last command. - auto cmdpkt = reinterpret_cast(last_boh->map(xrt_core::buffer_handle::map_type::read)); - if (cmdpkt->state == ERT_CMD_STATE_COMPLETED) { - pkt->state = ERT_CMD_STATE_COMPLETED; + auto cmdpkt = reinterpret_cast(cmd->map(xrt_core::buffer_handle::map_type::write)); + + if (cmdpkt->state >= ERT_CMD_STATE_COMPLETED) { + XRT_TRACE_POINT_LOG(poll_command_done); return 1; } + return 0; +} - // Find out the first command failed. 
- for (int i = 0; i < payload->command_count; i++) { - auto boh = reinterpret_cast( - m_pdev.lookup_hdl_mapping(static_cast(payload->data[i]))); - cmdpkt = reinterpret_cast(boh->map(xrt_core::buffer_handle::map_type::read)); - if (cmdpkt->state != ERT_CMD_STATE_COMPLETED) { - pkt->state = cmdpkt->state; - payload->error_index = i; - break; - } - } - return 1; +int +hw_q:: +wait_command(xrt_core::buffer_handle *cmd, uint32_t timeout_ms) const +{ + if (poll_command(cmd)) + return 1; + return wait_cmd(m_pdev, m_hwctx, cmd, timeout_ms); } void @@ -143,14 +113,14 @@ hw_q:: submit_wait(const xrt_core::fence_handle* f) { auto fh = static_cast(f); - fh->submit_wait(); + fh->submit_wait(m_hwctx); } void hw_q:: submit_wait(const std::vector& fences) { - fence::submit_wait(m_pdev, fences); + fence::submit_wait(m_pdev, m_hwctx, fences); } void @@ -158,7 +128,7 @@ hw_q:: submit_signal(const xrt_core::fence_handle* f) { auto fh = static_cast(f); - fh->submit_signal(); + fh->submit_signal(m_hwctx); } } // shim_xdna diff --git a/src/shim/hwq.h b/src/shim/hwq.h index ce2c1c83..afb9ca97 100644 --- a/src/shim/hwq.h +++ b/src/shim/hwq.h @@ -20,6 +20,9 @@ class hw_q : public xrt_core::hwqueue_handle void submit_command(xrt_core::buffer_handle *) override; + int + poll_command(xrt_core::buffer_handle *) const override; + int wait_command(xrt_core::buffer_handle *, uint32_t timeout_ms) const override; diff --git a/src/shim/kmq/device.cpp b/src/shim/kmq/device.cpp index ec703d7c..1479a3c5 100644 --- a/src/shim/kmq/device.cpp +++ b/src/shim/kmq/device.cpp @@ -6,21 +6,12 @@ #include "hwctx.h" #include "drm_local/amdxdna_accel.h" -namespace { - -// Device memory heap needs to be within one 64MB page. The maximum size is 64MB. -const size_t dev_mem_size = (64 << 20); - -} - namespace shim_xdna { device_kmq:: device_kmq(const pdev& pdev, handle_type shim_handle, id_type device_id) : device(pdev, shim_handle, device_id) { - // Alloc and register device memory w/ driver. 
- m_dev_heap_bo = std::make_unique(*this, dev_mem_size, AMDXDNA_BO_DEV_HEAP); shim_debug("Created KMQ device (%s) ...", get_pdev().m_sysfs_name.c_str()); } diff --git a/src/shim/kmq/device.h b/src/shim/kmq/device.h index 8fa76362..768aee14 100644 --- a/src/shim/kmq/device.h +++ b/src/shim/kmq/device.h @@ -26,9 +26,6 @@ class device_kmq : public device { std::unique_ptr import_bo(xrt_core::shared_handle::export_handle ehdl) const override; - -private: - std::unique_ptr m_dev_heap_bo; }; } // namespace shim_xdna diff --git a/src/shim/kmq/hwq.cpp b/src/shim/kmq/hwq.cpp index b785aac6..ebb292dd 100644 --- a/src/shim/kmq/hwq.cpp +++ b/src/shim/kmq/hwq.cpp @@ -31,30 +31,13 @@ issue_command(xrt_core::buffer_handle *cmd_bo) amdxdna_drm_exec_cmd ecmd = { .hwctx = m_hwctx->get_slotidx(), + .type = AMDXDNA_CMD_SUBMIT_EXEC_BUF, .cmd_handles = cmd_bo_hdl, .args = reinterpret_cast(arg_bo_hdls), .cmd_count = 1, .arg_count = static_cast(boh->get_arg_bo_handles(arg_bo_hdls, max_arg_bos)), }; - - int ret = EAGAIN; - while (ret == EAGAIN) { - try { - m_pdev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); - ret = 0; - } - catch (const xrt_core::system_error& ex) { - ret = ex.get_code(); - if (ret != EAGAIN) - throw; - amdxdna_drm_wait_cmd wcmd = { - .hwctx = ecmd.hwctx, - .timeout = 0, // Infinite waiting - .seq = AMDXDNA_INVALID_CMD_HANDLE, // Wait for free slot - }; - m_pdev.ioctl(DRM_IOCTL_AMDXDNA_WAIT_CMD, &wcmd); - } - } + m_pdev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); auto id = ecmd.seq; boh->set_cmd_id(id); diff --git a/src/shim/kmq/pcidev.cpp b/src/shim/kmq/pcidev.cpp index 26a3ae8f..0d271ce5 100644 --- a/src/shim/kmq/pcidev.cpp +++ b/src/shim/kmq/pcidev.cpp @@ -1,9 +1,17 @@ // SPDX-License-Identifier: Apache-2.0 // Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. +#include "bo.h" #include "device.h" #include "pcidev.h" +namespace { + +// Device memory heap needs to be within one 64MB page. The maximum size is 64MB. +const size_t dev_mem_size = (64 << 20); + +} + namespace shim_xdna { pdev_kmq:: @@ -23,7 +31,24 @@ std::shared_ptr pdev_kmq:: create_device(xrt_core::device::handle_type handle, xrt_core::device::id_type id) const { - return std::make_shared(*this, handle, id); + auto dev = std::make_shared(*this, handle, id); + try { + // Alloc device memory on first device creation. + // No locking is needed since driver will ensure only one heap BO is created. 
+ if (m_dev_heap_bo == nullptr) + m_dev_heap_bo = std::make_unique(*dev, dev_mem_size, AMDXDNA_BO_DEV_HEAP); + } catch (const xrt_core::system_error& ex) { + if (ex.get_code() != EBUSY) + throw; + } + return dev; +} + +void +pdev_kmq:: +on_last_close() const +{ + m_dev_heap_bo.reset(); } } // namespace shim_xdna diff --git a/src/shim/kmq/pcidev.h b/src/shim/kmq/pcidev.h index 03ded1ec..65585924 100644 --- a/src/shim/kmq/pcidev.h +++ b/src/shim/kmq/pcidev.h @@ -17,6 +17,13 @@ class pdev_kmq : public pdev std::shared_ptr create_device(xrt_core::device::handle_type handle, xrt_core::device::id_type id) const override; + +private: + // Create on first device creation and removed right before device is closed + mutable std::unique_ptr m_dev_heap_bo; + + virtual void + on_last_close() const override; }; } // namespace shim_xdna diff --git a/src/shim/pcidev.cpp b/src/shim/pcidev.cpp index 5d66e372..faa089a4 100644 --- a/src/shim/pcidev.cpp +++ b/src/shim/pcidev.cpp @@ -6,7 +6,6 @@ #include "pcidrv.h" #include "shim_debug.h" #include "drm_local/amdxdna_accel.h" -#include "core/common/config_reader.h" #include "core/common/trace.h" namespace { @@ -35,10 +34,6 @@ namespace { return "DRM_IOCTL_AMDXDNA_GET_INFO"; case DRM_IOCTL_AMDXDNA_SET_STATE: return "DRM_IOCTL_AMDXDNA_SET_STATE"; - case DRM_IOCTL_AMDXDNA_SUBMIT_WAIT: - return "DRM_IOCTL_AMDXDNA_SUBMIT_WAIT"; - case DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL: - return "DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL"; case DRM_IOCTL_GEM_CLOSE: return "DRM_IOCTL_GEM_CLOSE"; case DRM_IOCTL_PRIME_HANDLE_TO_FD: @@ -71,10 +66,6 @@ namespace shim_xdna { pdev:: pdev(std::shared_ptr driver, std::string sysfs_name) : xrt_core::pci::dev(driver, std::move(sysfs_name)) - // Default of force_unchained_command should be false once command - // chaining is natively supported by driver/firmware. - , m_force_unchained_command(xrt_core::config::detail::get_bool_value( - "Debug.force_unchained_command", false)) { m_is_ready = true; // We're always ready. } @@ -111,6 +102,8 @@ open() const m_dev_fd = fd; } ++m_dev_users; + + on_first_open(); } void @@ -122,6 +115,8 @@ close() const --m_dev_users; if (m_dev_users == 0) { + on_last_close(); + // Stop new users of the fd from other threads. fd = m_dev_fd; m_dev_fd = -1; @@ -158,12 +153,5 @@ munmap(void* addr, size_t len) const ::munmap(addr, len); } -bool -pdev:: -is_force_unchained_command() const -{ - return m_force_unchained_command; -} - } // namespace shim_xdna diff --git a/src/shim/pcidev.h b/src/shim/pcidev.h index 0d487518..da0cdeda 100644 --- a/src/shim/pcidev.h +++ b/src/shim/pcidev.h @@ -43,37 +43,15 @@ class pdev : public xrt_core::pci::dev void close() const; - bool - is_force_unchained_command() const; - - // Below routines are for managing drm_bo_hdl -> buffer_handle* mapping. - // This is only a temporary hack for supporting forcibly unchained runlist. 
- void - insert_hdl_mapping(uint32_t hdl, uint64_t ptr) const - { - const std::lock_guard lock(m_lock); - m_hdl_map[hdl] = ptr; - } - void - remove_hdl_mapping(uint32_t hdl) const - { - const std::lock_guard lock(m_lock); - m_hdl_map.erase(hdl); - } - uint64_t - lookup_hdl_mapping(uint32_t hdl) const - { - const std::lock_guard lock(m_lock); - return m_hdl_map[hdl]; - } - private: + virtual void + on_first_open() const {} + virtual void + on_last_close() const {} + mutable int m_dev_fd = -1; mutable int m_dev_users = 0; mutable std::mutex m_lock; - const bool m_force_unchained_command = true; - // Mark it as mutable since pdev does not look at what is saved in this map - mutable std::map m_hdl_map; }; } // namespace shim_xdna diff --git a/src/shim/umq/host_queue.h b/src/shim/umq/host_queue.h index 14cb41e0..fe8dc8bc 100644 --- a/src/shim/umq/host_queue.h +++ b/src/shim/umq/host_queue.h @@ -1,60 +1,9 @@ -/* (c) Copyright 2014 - 2022 Xilinx, Inc. All rights reserved. - - This file contains confidential and proprietary information - of Xilinx, Inc. and is protected under U.S. and - international copyright and other intellectual property - laws. - - DISCLAIMER - This disclaimer is not a license and does not grant any - rights to the materials distributed herewith. Except as - otherwise provided in a valid license issued to you by - Xilinx, and to the maximum extent permitted by applicable - law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND - WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES - AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING - BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- - INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and - (2) Xilinx shall not be liable (whether in contract or tort, - including negligence, or under any other theory of - liability) for any loss or damage of any kind or nature - related to, arising under or in connection with these - materials, including for any direct, or any indirect, - special, incidental, or consequential loss or damage - (including loss of data, profits, goodwill, or any type of - loss or damage suffered as a result of any action brought - by a third party) even if such damage or loss was - reasonably foreseeable or Xilinx had been advised of the - possibility of the same. - - CRITICAL APPLICATIONS - Xilinx products are not designed or intended to be fail- - safe, or for use in any application requiring fail-safe - performance, such as life-support or safety devices or - systems, Class III medical devices, nuclear facilities, - applications related to the deployment of airbags, or any - other applications that could lead to death, personal - injury, or severe property or environmental damage - (individually and collectively, "Critical - Applications"). Customer assumes the sole risk and - liability of any use of Xilinx products in Critical - Applications, subject only to applicable laws and - regulations governing limitations on product liability. - - THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS - PART OF THIS FILE AT ALL TIMES. */ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #ifndef _HOST_QUEUE_H_ #define _HOST_QUEUE_H_ -#include -#include - -#define SHIM_USER_EVENT_0_ID 0xb6 -#define DOORBELL_EVENT_ID SHIM_USER_EVENT_0_ID - -#define PDI_TABLE_SIZE 64 - #define HSA_PKT_SUCCESS (0) /* * 32-bit return code in completion of HSA pkt back to host. @@ -62,7 +11,8 @@ * will check them on all devices/platforms. 
* HSA specific error code will be on high 28 bits. */ -enum hsa_cmd_state { // ert_cmd_state essentially +enum hsa_cmd_state +{ // ert_cmd_state essentially HSA_CMD_STATE_NEW = 1, HSA_CMD_STATE_QUEUED = 2, HSA_CMD_STATE_RUNNING = 3, @@ -83,46 +33,14 @@ enum hsa_cmd_state { // ert_cmd_state essentially #define HSA_INVALID_OPCODE HSA_ERR(column_index_rel * 100 + 3) #define HSA_INVALID_PKT HSA_ERR(4) #define HSA_INVALID_PAGE HSA_ERR(column_index_rel * 100 + 5) +#define HSA_INDIRECT_PKT_NUM 6 -typedef enum +enum host_queue_packet_opcode { HOST_QUEUE_PACKET_EXEC_BUF = 1, HOST_QUEUE_PACKET_TEST = 2, HOST_QUEUE_PACKET_EXIT = 3, -} -host_queue_packet_opcode_t; - -/* - * cu_config contains cu <-> pdi mapping info - * - * due to memory footprint limitation, the pdi info (host address) is not saved in CERT - * if num_mappings == 1, then pdi_info_host_addr contains the host addr of the pdi - * if num_mappings > 1, then pdi_info_host_addr contains the host addr of a table, in which - * the host addr of all the pdi are saved. - * - * note: both cu_index and pdi_index should be start from 0 - * e.g mapping[0] = 0, mapping[1] = 1, mapping[2] = 0, - * means, - * cu0 <-> pdi0 - * cu1 <-> pdi1 - * cu2 <-> pdi0 - * there are 3 mappings, and 2 pdi in pdi_info_host_addr table - */ -typedef struct -{ - uint32_t num_mappings; - uint32_t pdi_info_host_addr_low; - uint32_t pdi_info_host_addr_high; - uint8_t mapping[PDI_TABLE_SIZE]; -} -config_cu_t; - -#define INVALID_PDI_ID (0xFF) - -/* - * Maximum number of exec buf args in 4B - */ -#define EXEC_BUF_ARGS_MAX_LEN (20) +}; /* * hsa pkt payload of exec_buf @@ -134,7 +52,7 @@ config_cu_t; * args contains the info of input/output frame, parameter of network * etc, which are all transparent to CERT */ -typedef struct +struct exec_buf { uint16_t cu_index; uint16_t reserved0; @@ -144,48 +62,28 @@ typedef struct uint16_t reserved1; uint32_t args_host_addr_low; uint32_t args_host_addr_high; -} -exec_buf_t; - +}; -typedef struct +struct host_queue_header { uint64_t read_index; - - uint32_t reserved; - - //! @note Queue capacity, must be a power of two. - uint32_t capacity; - - /* - * NOTE!!! - * Due to the cache is not cache coherence between host and device. We have - * to flush the cache of the host queue. - * - * Most importantly, the read_index has to be in different cache line - * (64Bytes in linux) than the write_index. Because the read_index might be - * flushed from a different context from kernel driver that is monitoring - * the completed message. While at the same time, the write_index might be - * being flushed from UMQ. - */ - //Note: temporary disable padding because FW has not been fully changed yet. - //uint64_t padding[6]; - + struct + { + uint16_t major; + uint16_t minor; + } + version; + uint32_t capacity; //Queue capacity, must be a power of two. uint64_t write_index; - uint64_t data_address; - - // TODO Ready signal? -} -host_queue_header_t; +}; -typedef enum +enum host_queue_packet_type { HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC = 0, HOST_QUEUE_PACKET_TYPE_INVALID = 1, -} -host_queue_packet_type_t; +}; /* * 8 Bytes common header of hsa pkt used in CERT. 
@@ -199,7 +97,7 @@ host_queue_packet_type_t; * for 'indirect', 'count' is used to calc the number of indirect pkt entry, * see below */ -typedef struct +struct common_header { union { struct { @@ -214,29 +112,24 @@ typedef struct uint16_t count; uint8_t distribute; uint8_t indirect; -} -common_header_t; +}; -typedef struct +struct xrt_packet_header { - common_header_t common_header; + struct common_header common_header; uint64_t completion_signal; -} -xrt_packet_header_t; +}; /* * format of indirect pkt. multiple-indirect-level is supported * there is vendor specific header (common header plus completion_signal) in 1st indirect level * there is common header in all the remaining indirect levels */ -typedef struct +struct host_indirect_packet_entry { - uint16_t column_index; - uint16_t reserved; uint32_t host_addr_low; uint32_t host_addr_high; -} -host_indirect_packet_entry_t; +}; /* * hsa pkt format -- 64Bytes fixed length @@ -245,23 +138,23 @@ host_indirect_packet_entry_t; * xrt_packet_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 24 (sizeof(exec_buf_t)) + * count: 24 (sizeof(struct exec_buf)) * distribute: 0 * indirect: 0 * completion_signal: xxx * data: - * exec_buf_t + * struct exec_buf * * case 2 -- indirect config_cu * xrt_packet_header: * type: 0 (vendor specific) * opcode: 0 (config_cu) - * count: 12 (1 * sizeof(host_indirect_packet_entry_t)) + * count: 12 (1 * sizeof(struct host_indirect_packet_entry)) * distribute: 0 * indirect: 1 // common header of indirect * completion_signal: xxx * data: - * host_indirect_packet_entry_t: + * struct host_indirect_packet_entry: * column_index: index of lead uc * host_addr*: host addr of next level * common_header: @@ -270,123 +163,97 @@ host_indirect_packet_entry_t; * count: 72 (config_cu with 16 entries)) //10 entry config_cu can fit in direct pkt * indirect: 0 // common header of direct * payload: - * config_cu_t: 16 entries of mapping table + * struct config_cu: 16 entries of mapping table * * case 3 -- indirect exec_buf on 4 column partition * xrt_packet_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 48 (4 *sizeof(host_indirect_packet_entry_t)) + * count: 48 (6 * sizeof(struct host_indirect_packet_entry)) * distribute: 1 * indirect: 1 // common header of indirect * completion_signal: xxx * data: - * host_indirect_packet_entry_t: - * column_index: index of lead uc + * struct host_indirect_packet_entry: * host_addr*: host addr of next level * common_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 24 (sizeof(exec_buf_t)) + * count: 24 (sizeof(struct exec_buf)) * indirect: 0 // common header of direct * payload: - * exec_buf_t - * host_indirect_packet_entry_t: - * column_index: index of slave1 + * struct exec_buf + * struct host_indirect_packet_entry: * host_addr*: host addr of next level * common_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 24 (sizeof(exec_buf_t)) + * count: 24 (struct sizeof(exec_buf)) * indirect: 0 // common header of direct * payload: - * exec_buf_t - * host_indirect_packet_entry_t: + * struct exec_buf + * struct host_indirect_packet_entry: * slave2,3,etc... 
* * case 4 -- indirect exec_buf on 8 column partition * xrt_packet_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 12 (sizeof(host_indirect_packet_entry_t)) + * count: 12 (sizeof(struct host_indirect_packet_entry)) * distribute: 1 * indirect: 1 // common_header of level-1 indirect * completion_signal: xxx * data: - * host_indirect_packet_entry_t: - * column_index: index of lead uc + * struct host_indirect_packet_entry: * host_addr*: host addr of next level * common_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 12*8 (12 * sizeof(host_indirect_packet_entry_t)) + * count: 12*8 (12 * sizeof(struct host_indirect_packet_entry)) * distribute: 1 * indirect: 1 // common header of level-2 indirect * indirect_payload: - * host_indirect_packet_entry_t: - * column_index: index of lead uc + * struct host_indirect_packet_entry: * host_addr*: host addr of next level * common_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 24 (sizeof(exec_buf_t)) + * count: 24 (sizeof(struct exec_buf)) * distribute: 1 * indirect: 0 // common_header of direct * payload: - * exec_buf_t - * host_indirect_packet_entry_t: - * column_index: index of slave1 + * struct exec_buf + * struct host_indirect_packet_entry: * host_addr*: host addr of next level * common_header: * type: 0 (vendor specific) * opcode: 1 (exec_buf) - * count: 24 (sizeof(exec_buf_t)) + * count: 24 (sizeof(struct exec_buf)) * distribute: 1 * indirect: 0 // common_header of direct * payload: - * exec_buf_t - * host_indirect_packet_entry_t: + * struct exec_buf + * struct host_indirect_packet_entry: * slave2,3,etc... */ -typedef struct +struct host_queue_packet { - xrt_packet_header_t xrt_header; + struct xrt_packet_header xrt_header; uint32_t data[12]; -} -host_queue_packet_t; +}; /* * xrt pkt with random length. 
*/ -typedef struct +struct xrt_packet { - xrt_packet_header_t xrt_header; + struct xrt_packet_header xrt_header; uint64_t xrt_payload_host_addr; -} -xrt_packet_t; - -#define XRT_PKT_TYPE(p) ((p)->xrt_header.common_header.type) -#define XRT_PKT_OPCODE(p) ((p)->xrt_header.common_header.opcode) -#define XRT_PKT_LEN(p) ((p)->xrt_header.common_header.count) -#define XRT_PKT_DISTRIBUTE(p) ((p)->xrt_header.common_header.distribute) -#define XRT_PKT_INDIRECT(p) ((p)->xrt_header.common_header.indirect) -#define XRT_PKT_COMPLETION(p) ((p)->xrt_header.completion_signal) -#define XRT_PKT_PAYLOAD(p) ((p)->xrt_payload_host_addr) - -#define ADDR_HIGH(x) ((x) >> 32) -#define ADDR_LOW(x) ((x) & 0xFFFFFFFF) -#define MOD_POW2(x, y) ((x) & ((y) - 1)) +}; -typedef struct +struct host_queue { uint64_t address; -} -host_queue_t; - -void host_queue_init(host_queue_t *queue, uint64_t address); - -xrt_packet_t *host_queue_pop(host_queue_t *queue, bool block); - -void host_queue_finish_packet(host_queue_t *queue, xrt_packet_t *packet, uint32_t completion); +}; #endif diff --git a/src/shim/umq/hwq.cpp b/src/shim/umq/hwq.cpp index cbdfde5e..5c41fa47 100644 --- a/src/shim/umq/hwq.cpp +++ b/src/shim/umq/hwq.cpp @@ -22,13 +22,13 @@ clflush_data(void *data, int len) } inline void -mark_slot_invalid(volatile host_queue_packet_t *pkt) +mark_slot_invalid(volatile struct host_queue_packet *pkt) { pkt->xrt_header.common_header.type = HOST_QUEUE_PACKET_TYPE_INVALID; } inline void -mark_slot_valid(volatile host_queue_packet_t *pkt) +mark_slot_valid(volatile struct host_queue_packet *pkt) { /* Issue mfence instruction to make sure all writes to the slot before is done */ std::atomic_thread_fence(std::memory_order::memory_order_seq_cst); @@ -38,7 +38,7 @@ mark_slot_valid(volatile host_queue_packet_t *pkt) } inline bool -is_slot_valid(volatile host_queue_packet_t *pkt) +is_slot_valid(volatile struct host_queue_packet *pkt) { return pkt->xrt_header.common_header.type == HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC; } @@ -47,34 +47,63 @@ is_slot_valid(volatile host_queue_packet_t *pkt) namespace shim_xdna { +void +hw_q_umq:: +init_indirect_buf(volatile struct host_indirect_data *indirect_buf, int size) +{ + for (int i = 0; i < size; i++) { + indirect_buf[i].header.type = HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC; + indirect_buf[i].header.opcode = HOST_QUEUE_PACKET_EXEC_BUF; + indirect_buf[i].header.count = sizeof(struct exec_buf); + indirect_buf[i].header.distribute = 1; + indirect_buf[i].header.indirect = 0; + } +} + hw_q_umq:: hw_q_umq(const device& dev, size_t nslots) : hw_q(dev) { #ifdef UMQ_HELLO_TEST const size_t header_sz = 8192; // Hard code to 2 pages const size_t queue_sz = 0; + const size_t indirect_sz = 0; #else - const size_t header_sz = sizeof(host_queue_header_t); - const size_t queue_sz = sizeof(host_queue_packet_t) * nslots; + // + // host queue layout: + // host_queue_header_t + // host_queue_packet_t [nslots] + // indirect [4 * indirect_buffer * nslots] + const size_t header_sz = sizeof(struct host_queue_header); + const size_t queue_sz = sizeof(struct host_queue_packet) * nslots; + const size_t indirect_sz = (sizeof(struct host_indirect_data) * HSA_INDIRECT_PKT_NUM) * nslots; #endif - const size_t umq_sz = header_sz + queue_sz; + const size_t umq_sz = header_sz + queue_sz + indirect_sz; + shim_debug("umq sz %ld", umq_sz); m_umq_bo = const_cast(dev).alloc_bo(umq_sz, XCL_BO_FLAGS_EXECBUF); m_umq_bo_buf = m_umq_bo->map(bo::map_type::write); - m_umq_hdr = reinterpret_cast(m_umq_bo_buf); - m_umq_pkt = reinterpret_cast + 
m_umq_hdr = reinterpret_cast(m_umq_bo_buf); + m_umq_pkt = reinterpret_cast ((char *)m_umq_bo_buf + header_sz); + m_umq_indirect_buf = reinterpret_cast + ((char *)m_umq_bo_buf + header_sz + queue_sz); // set all mapped memory to 0 std::memset(m_umq_bo_buf, 0, umq_sz); - for (int i = 0; i < nslots; i++) + // init slots and indirect buf + for (int i = 0; i < nslots; i++) { mark_slot_invalid(&m_umq_pkt[i]); + init_indirect_buf(&m_umq_indirect_buf[i * HSA_INDIRECT_PKT_NUM], HSA_INDIRECT_PKT_NUM); + } m_umq_hdr->capacity = nslots; // data_address starts after header m_umq_hdr->data_address = m_umq_bo->get_properties().paddr + header_sz; + // indirect buf starts after queue + m_indirect_paddr = m_umq_hdr->data_address + queue_sz; + // this is the bo handler defined in parent class m_queue_boh = static_cast(m_umq_bo.get())->get_drm_bo_handle(); @@ -98,11 +127,11 @@ map_doorbell(uint32_t doorbell_offset) m_pdev.mmap(0, sizeof(uint32_t), PROT_WRITE, MAP_SHARED, doorbell_offset)); } -volatile host_queue_header_t * +volatile struct host_queue_header * hw_q_umq:: get_header_ptr() const { - return reinterpret_cast(m_umq_bo_buf); + return reinterpret_cast(m_umq_bo_buf); } void @@ -129,9 +158,33 @@ dump() const shim_debug("\tdistribute:\t%u", pkt->xrt_header.common_header.distribute); shim_debug("\tindirect:\t%u", pkt->xrt_header.common_header.indirect); shim_debug("\tcomplete addr:\t%p", pkt->xrt_header.completion_signal); - for (int j = 0; j < sizeof(pkt->data) / sizeof(pkt->data[0]); j++) - shim_debug("\tdata[%d]:\t0x%08x", j, pkt->data[j]); + if (pkt->xrt_header.common_header.indirect == 0) { + volatile struct exec_buf *ebp = + reinterpret_cast(pkt->data); + + shim_debug("\tcu_index:\t%d", ebp->cu_index); + shim_debug("\tdpu: [0x%x 0x%x]", + ebp->dpu_control_code_host_addr_high, + ebp->dpu_control_code_host_addr_low); + } else { + volatile struct host_indirect_packet_entry *hp = + reinterpret_cast(pkt->data); + + for (int i = 0; i < HSA_INDIRECT_PKT_NUM; i++, hp++) { + shim_debug("\thost addr: [0x%x 0x%x]", hp->host_addr_high, hp->host_addr_low); + + volatile struct host_indirect_data *data = + reinterpret_cast(m_umq_indirect_buf); + shim_debug("\t\th:distribute:\t%d", data[i].header.distribute); + shim_debug("\t\th:indirect:\t%d", data[i].header.indirect); + shim_debug("\t\tp:cu_index:\t%d", data[i].payload.cu_index); + shim_debug("\t\tp:dpu: [0x%x 0x%x]", + data[i].payload.dpu_control_code_host_addr_high, + data[i].payload.dpu_control_code_host_addr_low); + } + } } + shim_debug("dump finished\r\n"); } void @@ -139,7 +192,7 @@ hw_q_umq:: dump_raw() const { auto d = reinterpret_cast(m_umq_pkt); - auto sz = get_header_ptr()->capacity * sizeof(host_queue_packet_t) / sizeof(uint32_t); + auto sz = get_header_ptr()->capacity * sizeof(struct host_queue_packet) / sizeof(uint32_t); shim_debug("Dumping raw UMQ queue slot data @%p, len=%ld WORDs:", m_umq_pkt, sz); for (int i = 0; i < sz; i++) shim_debug("0x%08x", d[i]); @@ -172,11 +225,18 @@ reserve_slot() return cur_slot; } -volatile host_queue_packet_t * +int +hw_q_umq:: +get_pkt_idx(uint64_t index) +{ + return index & (get_header_ptr()->capacity - 1); +} + +volatile struct host_queue_packet * hw_q_umq:: -get_slot(uint64_t index) +get_pkt(uint64_t index) { - auto pkt = &m_umq_pkt[index & (get_header_ptr()->capacity - 1)]; + auto pkt = &m_umq_pkt[get_pkt_idx(index)]; if (is_slot_valid(pkt)) { shim_err(EINVAL, "Slot is ready before use! 
index=0x%lx", index); dump(); @@ -188,26 +248,101 @@ uint64_t hw_q_umq:: issue_exec_buf(uint16_t cu_idx, ert_dpu_data *dpu, uint64_t comp) { - auto idx = reserve_slot(); - auto pkt = get_slot(idx); + auto slot_idx = reserve_slot(); + auto pkt = get_pkt(slot_idx); + size_t pkt_size; + + if (get_ert_dpu_data_next(dpu)) + pkt_size = fill_indirect_exec_buf(slot_idx, cu_idx, pkt, dpu); + else + pkt_size = fill_direct_exec_buf(cu_idx, pkt, dpu); + auto hdr = &pkt->xrt_header; hdr->common_header.opcode = HOST_QUEUE_PACKET_EXEC_BUF; - hdr->common_header.distribute = 0; - hdr->common_header.indirect = 0; hdr->completion_signal = comp; - exec_buf_t payload = {}; - payload.cu_index = cu_idx; - payload.dpu_control_code_host_addr_low = static_cast(dpu->instruction_buffer); - payload.dpu_control_code_host_addr_high = static_cast(dpu->instruction_buffer >> 32); + fill_slot_and_send(pkt, pkt_size); - fill_slot_and_send(pkt, &payload, sizeof(payload)); - return idx; + return slot_idx; +} + +size_t +hw_q_umq:: +fill_indirect_exec_buf(uint64_t slot_idx, uint16_t cu_idx, + volatile struct host_queue_packet *pkt, + ert_dpu_data *dpu) { + auto pkt_size = (dpu->chained + 1) * sizeof(struct host_indirect_packet_entry); + + if (dpu->chained + 1 >= HSA_INDIRECT_PKT_NUM) + shim_err(EINVAL, "unsupported indirect number %d, valid number <= %d", + dpu->chained + 1, HSA_INDIRECT_PKT_NUM); + + if (pkt_size > sizeof(pkt->data)) + shim_err(EINVAL, "dpu pkt_size=0x%lx > pkt_data max size=%x%lx", + pkt_size, sizeof(pkt->data)); + + // no need to memset to zero, all buffer will be set + volatile struct host_indirect_packet_entry *hp = + reinterpret_cast(pkt->data); + + for (int i = 0; dpu; i++, hp++, dpu = get_ert_dpu_data_next(dpu)) { + auto data_size = sizeof(struct host_indirect_data) * HSA_INDIRECT_PKT_NUM; + auto prefix_off = get_pkt_idx(slot_idx) * data_size; + auto prefix_idx = get_pkt_idx(slot_idx) * HSA_INDIRECT_PKT_NUM; + auto buf_paddr = m_indirect_paddr + prefix_off + + sizeof(struct host_indirect_data) * i; + + hp->host_addr_low = static_cast(buf_paddr); + hp->host_addr_high = static_cast(buf_paddr >> 32); + + auto cebp = &m_umq_indirect_buf[prefix_idx + i]; + // do not zero this buffer, the cebp->header is pre-set + // set every cebp->payload field in case of garbage data + cebp->payload.cu_index = cu_idx; + cebp->payload.dpu_control_code_host_addr_low = + static_cast(dpu->instruction_buffer); + cebp->payload.dpu_control_code_host_addr_high = + static_cast(dpu->instruction_buffer >> 32); + cebp->payload.args_len = 0; + cebp->payload.args_host_addr_low = 0; + cebp->payload.args_host_addr_high = 0; + } + + auto hdr = &pkt->xrt_header; + hdr->common_header.distribute = 1; + hdr->common_header.indirect = 1; + + return pkt_size; +} + +size_t +hw_q_umq:: +fill_direct_exec_buf(uint16_t cu_idx, volatile struct host_queue_packet *pkt, + ert_dpu_data *dpu) { + auto pkt_size = sizeof(struct exec_buf); + if (pkt_size > sizeof(pkt->data)) + shim_err(EINVAL, "dpu pkt_size=0x%lx > pkt_data max size=%x%lx", + pkt_size, sizeof(pkt->data)); + + // zero this buffer + auto data = const_cast(pkt->data); + std::memset(data, 0, pkt_size); + // set correct dpu control code + volatile struct exec_buf *ebp = reinterpret_cast(pkt->data); + ebp->cu_index = cu_idx; + ebp->dpu_control_code_host_addr_low = static_cast(dpu->instruction_buffer); + ebp->dpu_control_code_host_addr_high = static_cast(dpu->instruction_buffer >> 32); + + auto hdr = &pkt->xrt_header; + hdr->common_header.distribute = 0; + hdr->common_header.indirect = 0; + + return 
pkt_size; } void hw_q_umq:: -fill_slot_and_send(volatile host_queue_packet_t *pkt, void *payload, size_t size) +fill_slot_and_send(volatile struct host_queue_packet *pkt, size_t size) { if (size > sizeof(pkt->data)) shim_err(EINVAL, "HSA packet payload too big, size=0x%lx", size); @@ -215,10 +350,11 @@ fill_slot_and_send(volatile host_queue_packet_t *pkt, void *payload, size_t size auto hdr = &pkt->xrt_header; hdr->common_header.count = size; - auto data = const_cast(pkt->data); - std::memcpy(data, payload, size); /* must flush data to make cache coherence */ - clflush_data((void *)data, size); + clflush_data((void *)(pkt->data), size); + + //comment this out, debug only + //dump(); /* Always done as last step. */ mark_slot_valid(pkt); @@ -247,9 +383,9 @@ issue_command(xrt_core::buffer_handle *cmd_bo) } if (get_ert_dpu_data_next(dpu_data)) - shim_err(EOPNOTSUPP, "chained dpu data is not supported yet"); + shim_debug("this is a multi-column dpu request."); - // Completion signal area has to be a full WORD + // Completion signal area has to be a full WORD, we utilze the command_bo uint64_t comp = boh->get_properties().paddr + offsetof(ert_start_kernel_cmd, header); auto id = issue_exec_buf(ffs(cmd->cu_mask) - 1, dpu_data, comp); diff --git a/src/shim/umq/hwq.h b/src/shim/umq/hwq.h index 1d484569..8c8cc707 100644 --- a/src/shim/umq/hwq.h +++ b/src/shim/umq/hwq.h @@ -30,14 +30,22 @@ class hw_q_umq : public hw_q void bind_hwctx(const hw_ctx *ctx); - volatile host_queue_header_t * + volatile struct host_queue_header * get_header_ptr() const; private: + + struct host_indirect_data { + struct common_header header; + struct exec_buf payload; + }; + std::unique_ptr m_umq_bo; void *m_umq_bo_buf; - volatile host_queue_header_t *m_umq_hdr = nullptr; - volatile host_queue_packet_t *m_umq_pkt = nullptr; + volatile struct host_queue_header *m_umq_hdr = nullptr; + volatile struct host_queue_packet *m_umq_pkt = nullptr; + volatile struct host_indirect_data *m_umq_indirect_buf = nullptr; + uint64_t m_indirect_paddr; volatile uint32_t *m_mapped_doorbell = nullptr; @@ -46,11 +54,25 @@ class hw_q_umq : public hw_q uint64_t reserve_slot(); - volatile host_queue_packet_t * - get_slot(uint64_t index); + int + get_pkt_idx(uint64_t index); + + volatile struct host_queue_packet * + get_pkt(uint64_t index); + + void + init_indirect_buf(volatile struct host_indirect_data *indirect_buf, int size); + + size_t + fill_direct_exec_buf(uint16_t cu_idx, + volatile struct host_queue_packet *pkt, ert_dpu_data *dpu); + + size_t + fill_indirect_exec_buf(uint64_t idx, uint16_t cu_idx, + volatile struct host_queue_packet *pkt, ert_dpu_data *dpu); void - fill_slot_and_send(volatile host_queue_packet_t *pkt, void *payload, size_t size); + fill_slot_and_send(volatile struct host_queue_packet *pkt, size_t size); uint64_t issue_exec_buf(uint16_t cu_idx, ert_dpu_data *dpu_data, uint64_t comp); diff --git a/test/shim_test/io_config.h b/test/shim_test/io_config.h index 27478ef1..1d1351db 100644 --- a/test/shim_test/io_config.h +++ b/test/shim_test/io_config.h @@ -208,7 +208,7 @@ int verify_output(int8_t* buf, const std::string &wrk_path) ss >> key >> str_val; ss.clear(); golden_output_files.push_back(wrk_path + "golden_" + str_val + ".bin"); - dump_output_files.push_back(wrk_path + "dump_" + str_val + ".bin"); + dump_output_files.push_back("/tmp/dump_" + str_val + "." 
+ std::to_string(getpid()) + ".bin"); getline(myfile, line); ss.str(line); @@ -239,12 +239,15 @@ int verify_output(int8_t* buf, const std::string &wrk_path) int ret = 0; for (int i = 0; i < num_outputs; i++) { - std::cout << "Examing output: " << golden_output_files[i] << std::endl; ret = comp_buf_strides(buf + output_ddr_addr[i], golden_output_files[i], dump_output_files[i], output_shapes[i], output_strides[i]); if (ret) { std::cout << "Examing failed, ret " << ret << std::endl; + std::cout << "Examing output: " << dump_output_files[i] << std::endl; break; + } else { + if (std::remove(dump_output_files[i].c_str())) + std::cout << "Failed to remove " << dump_output_files[i] << std::endl; } } diff --git a/test/shim_test/io_param.h b/test/shim_test/io_param.h index b86446a6..452c1076 100644 --- a/test/shim_test/io_param.h +++ b/test/shim_test/io_param.h @@ -14,6 +14,9 @@ struct io_test_parameter { #define IO_TEST_NOOP_RUN 1 #define IO_TEST_BAD_RUN 2 int type; +#define IO_TEST_IOCTL_WAIT 0 +#define IO_TEST_POLL_WAIT 1 + int wait; bool debug; }; diff --git a/test/shim_test/io_test.cpp b/test/shim_test/io_test.cpp index 747e8be0..74df0346 100644 --- a/test/shim_test/io_test.cpp +++ b/test/shim_test/io_test.cpp @@ -19,10 +19,11 @@ namespace { io_test_parameter io_test_parameters; void -io_test_parameter_init(int perf, int type, bool debug = false) +io_test_parameter_init(int perf, int type, int wait, bool debug = false) { io_test_parameters.perf = perf; io_test_parameters.type = type; + io_test_parameters.wait = wait; io_test_parameters.debug = debug; } @@ -30,7 +31,7 @@ io_test_bo_set alloc_and_init_bo_set(device* dev, const std::string& local_data_path) { io_test_bo_set boset{dev, local_data_path}; - auto bos = boset.get_bos(); + auto& bos = boset.get_bos(); if (io_test_parameters.type == IO_TEST_NOOP_RUN) { // Preparing no-op kernel's special control code @@ -82,7 +83,15 @@ io_test_init_runlist_cmd(bo* cmd_bo, std::vector& cmd_bos) } } -#define IO_TEST_TIMEOUT 5000 /* millisecond */ +void io_test_cmd_wait(hwqueue_handle *hwq, std::shared_ptr bo) +{ + if (io_test_parameters.wait == IO_TEST_POLL_WAIT) { + while(!hwq->poll_command(bo->get())); + } else { + hwq->wait_command(bo->get(), 0); + } +} + void io_test_cmd_submit_and_wait_latency( hwqueue_handle *hwq, @@ -96,9 +105,10 @@ io_test_cmd_submit_and_wait_latency( while (completed < total_cmd_submission) { for (auto& cmd : cmdlist_bos) { hwq->submit_command(std::get<0>(cmd).get()->get()); - hwq->wait_command(std::get<0>(cmd).get()->get(), IO_TEST_TIMEOUT); + io_test_cmd_wait(hwq, std::get<0>(cmd)); if (std::get<1>(cmd)->state != ERT_CMD_STATE_COMPLETED) throw std::runtime_error("Command error"); + std::get<1>(cmd)->state = ERT_CMD_STATE_NEW; completed++; if (completed >= total_cmd_submission) break; @@ -125,9 +135,10 @@ io_test_cmd_submit_and_wait_thruput( } while (completed < issued) { - hwq->wait_command(std::get<0>(cmdlist_bos[wait_idx]).get()->get(), IO_TEST_TIMEOUT); + io_test_cmd_wait(hwq, std::get<0>(cmdlist_bos[wait_idx])); if (std::get<1>(cmdlist_bos[wait_idx])->state != ERT_CMD_STATE_COMPLETED) throw std::runtime_error("Command error"); + std::get<1>(cmdlist_bos[wait_idx])->state = ERT_CMD_STATE_NEW; completed++; if (issued < total_cmd_submission) { @@ -235,28 +246,63 @@ io_test(device::id_type id, device* dev, int total_hwq_submit, int num_cmdlist, void TEST_io(device::id_type id, std::shared_ptr sdev, arg_type& arg) { - io_test_parameter_init(IO_TEST_NO_PERF, static_cast(arg[0])); + unsigned int run_type = static_cast(arg[0]); + + 
io_test_parameter_init(IO_TEST_NO_PERF, run_type, IO_TEST_IOCTL_WAIT); io_test(id, sdev.get(), 1, 1, arg[1]); } void TEST_io_latency(device::id_type id, std::shared_ptr sdev, arg_type& arg) { - io_test_parameter_init(IO_TEST_LATENCY_PERF, static_cast(arg[0])); - io_test(id, sdev.get(), 1000, 1, 1); + unsigned int run_type = static_cast(arg[0]); + unsigned int wait_type = static_cast(arg[1]); + unsigned int total = static_cast(arg[2]); + + io_test_parameter_init(IO_TEST_LATENCY_PERF, run_type, wait_type); + io_test(id, sdev.get(), total, 1, 1); } void TEST_io_throughput(device::id_type id, std::shared_ptr sdev, arg_type& arg) { + unsigned int run_type = static_cast(arg[0]); + unsigned int wait_type = static_cast(arg[1]); + unsigned int total = static_cast(arg[2]); + + io_test_parameter_init(IO_TEST_THRUPUT_PERF, run_type, wait_type); + io_test(id, sdev.get(), total, 8, 1); +} + +void +TEST_io_runlist_latency(device::id_type id, std::shared_ptr sdev, arg_type& arg) +{ + unsigned int run_type = static_cast(arg[0]); + unsigned int wait_type = static_cast(arg[1]); + unsigned int total = static_cast(arg[2]); + const size_t max_cmd_per_list = 24; + + io_test_parameter_init(IO_TEST_LATENCY_PERF, run_type, wait_type); + for (int cmds_per_list = 1; cmds_per_list <=32; cmds_per_list *=2) { + if (cmds_per_list > max_cmd_per_list) + cmds_per_list = max_cmd_per_list; + int total_hwq_submit = total / cmds_per_list; + io_test(id, sdev.get(), total_hwq_submit, 1, cmds_per_list); + } +} + +void +TEST_io_runlist_throughput(device::id_type id, std::shared_ptr sdev, arg_type& arg) +{ + unsigned int run_type = static_cast(arg[0]); + unsigned int wait_type = static_cast(arg[1]); + unsigned int total_commands = static_cast(arg[2]); int num_bo_set = 256; - int total_commands = 32000; const size_t max_cmd_per_list = 24; - io_test_parameter_init(IO_TEST_THRUPUT_PERF, static_cast(arg[0])); + io_test_parameter_init(IO_TEST_THRUPUT_PERF, run_type, wait_type); - int cmds_per_list; - for (cmds_per_list = 1; cmds_per_list <= 32; cmds_per_list *= 2) { + for (int cmds_per_list = 1; cmds_per_list <= 32; cmds_per_list *= 2) { if (cmds_per_list > max_cmd_per_list) cmds_per_list = max_cmd_per_list; int num_cmdlist = num_bo_set / cmds_per_list; diff --git a/test/shim_test/shim_test.cpp b/test/shim_test/shim_test.cpp index 140d240c..b1c38aea 100644 --- a/test/shim_test/shim_test.cpp +++ b/test/shim_test/shim_test.cpp @@ -30,6 +30,8 @@ void TEST_export_import_bo(device::id_type, std::shared_ptr, arg_type&); void TEST_io(device::id_type, std::shared_ptr, arg_type&); void TEST_io_latency(device::id_type, std::shared_ptr, arg_type&); void TEST_io_throughput(device::id_type, std::shared_ptr, arg_type&); +void TEST_io_runlist_latency(device::id_type, std::shared_ptr, arg_type&); +void TEST_io_runlist_throughput(device::id_type, std::shared_ptr, arg_type&); void TEST_noop_io_with_dup_bo(device::id_type, std::shared_ptr, arg_type&); void TEST_shim_umq_vadd(device::id_type, std::shared_ptr, arg_type&); void TEST_shim_umq_memtiles(device::id_type, std::shared_ptr, arg_type&); @@ -519,10 +521,10 @@ std::vector test_list { TEST_POSITIVE, dev_filter_is_aie2, TEST_io, { IO_TEST_NORMAL_RUN, 1 } }, test_case{ "measure no-op kernel latency", - TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NOOP_RUN } + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NOOP_RUN, IO_TEST_IOCTL_WAIT, 32000 } }, test_case{ "measure real kernel latency", - TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NORMAL_RUN } + 
TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NORMAL_RUN, IO_TEST_IOCTL_WAIT, 32000 } }, test_case{ "create and free debug bo", TEST_POSITIVE, dev_filter_is_aie2, TEST_create_free_debug_bo, { 0x1000 } @@ -533,8 +535,8 @@ std::vector test_list { test_case{ "multi-command io test real kernel good run", TEST_POSITIVE, dev_filter_is_aie2, TEST_io, { IO_TEST_NORMAL_RUN, 3 } }, - test_case{ "measure no-op kernel throughput listed command", - TEST_POSITIVE, dev_filter_is_aie2, TEST_io_throughput, { IO_TEST_NOOP_RUN } + test_case{ "measure no-op kernel throughput chained command", + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_throughput, { IO_TEST_NOOP_RUN, IO_TEST_IOCTL_WAIT, 32000 } }, test_case{ "npu3 shim vadd", TEST_POSITIVE, dev_filter_is_aie4, TEST_shim_umq_vadd, {} @@ -560,9 +562,27 @@ std::vector test_list { //test_case{ "Cmd fencing (device side)", // TEST_POSITIVE, dev_filter_is_aie2, TEST_cmd_fence_device, {} //}, - //test_case{ "io test no op with duplicated BOs", - // TEST_POSITIVE, dev_filter_is_aie2, TEST_noop_io_with_dup_bo, {} - //}, + test_case{ "io test no op with duplicated BOs", + TEST_POSITIVE, dev_filter_is_aie2, TEST_noop_io_with_dup_bo, {} + }, + test_case{ "measure no-op kernel latency chained command", + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_latency, { IO_TEST_NOOP_RUN, IO_TEST_IOCTL_WAIT, 32000 } + }, + test_case{ "measure no-op kernel throuput", + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_throughput, { IO_TEST_NOOP_RUN, IO_TEST_IOCTL_WAIT, 32000 } + }, + test_case{ "measure no-op kernel latency (polling)", + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 } + }, + test_case{ "measure no-op kernel throuput (polling)", + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_throughput, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 } + }, + test_case{ "measure no-op kernel latency chained command (polling)", + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_latency, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 } + }, + test_case{ "measure no-op kernel throughput chained command (polling)", + TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_throughput, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 } + }, }; } // namespace diff --git a/test/shim_test/speed.h b/test/shim_test/speed.h index 89d41fae..79702816 100644 --- a/test/shim_test/speed.h +++ b/test/shim_test/speed.h @@ -7,6 +7,7 @@ #include using clk = std::chrono::high_resolution_clock; +using ms_t = std::chrono::milliseconds; using us_t = std::chrono::microseconds; using ns_t = std::chrono::nanoseconds; diff --git a/tools/bins/17f0_20/validate.xclbin b/tools/bins/17f0_20/validate.xclbin new file mode 100644 index 00000000..9c66f31b Binary files /dev/null and b/tools/bins/17f0_20/validate.xclbin differ diff --git a/tools/info.json b/tools/info.json index c8f73197..c55558d5 100644 --- a/tools/info.json +++ b/tools/info.json @@ -1,21 +1,21 @@ { "copyright": "Copyright (C) 2023-2024 Advanced Micro Devices, Inc. 
All rights reserved.", "xrt" : { - "version": "202420.2.18.101", + "version": "202420.2.18.134", "os_rel": "22.04" }, "firmwares": [ { "device": "npu1", - "url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/1502_00/npu.sbin.1.4.2.313", + "url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/1502_00/npu.sbin.1.4.2.329", "pci_device_id": "1502", "pci_revision_id": "00", - "version": "1.4.2.313", + "version": "1.4.2.329", "fw_name": "npu.sbin" }, { "device": "npu2", - "url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/17f0_00/npu.sbin.0.7.22.185", + "url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/17f0_00/npu.sbin.0.7.22.185", "pci_device_id": "17f0", "pci_revision_id": "00", "version": "0.7.22.185", @@ -23,26 +23,18 @@ }, { "device": "npu4", - "url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/17f0_10/npu.sbin.0.7.30.20", + "url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/17f0_10/npu.sbin.0.7.35.35", "pci_device_id": "17f0", "pci_revision_id": "10", - "version": "0.7.30.20", + "version": "0.7.35.35", "fw_name": "npu.sbin" }, { "device": "npu5", - "url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/17f0_11/npu.sbin.0.7.30.101", + "url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/17f0_11/npu.sbin.0.7.35.139", "pci_device_id": "17f0", "pci_revision_id": "11", - "version": "0.7.30.101", - "fw_name": "npu.sbin" - }, - { - "device": "npu6", - "url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/17f0_20/npu.sbin.0.7.30.20", - "pci_device_id": "17f0", - "pci_revision_id": "20", - "version": "0.7.30.20", + "version": "0.7.35.139", "fw_name": "npu.sbin" } ] diff --git a/xrt b/xrt index 476f42f4..64d03f56 160000 --- a/xrt +++ b/xrt @@ -1 +1 @@ -Subproject commit 476f42f419bbc5d1545aded3627f03c1c2f1336e +Subproject commit 64d03f567db628c9107b6fcf5d362668d1834567
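
A possible invocation of the two perf helper scripts added by this patch, sketched only from their usage() text and option parsing above; the application name and the first event pattern are illustrative placeholders, not names defined by the patch:

# Record NPU perf events system-wide while running an application (run as root).
# The trace is post-processed into perf.converted.out in the current directory.
sudo ./npu_perf_trace.sh -l /opt/xilinx/xrt/lib ./my_npu_app

# Report average/largest/smallest time from event1 to event2 over trace entries 100..200.
# The second pattern is taken from the analyze script's usage() text; the first is only
# an assumed example of a matching sdt_xrt probe.
./npu_perf_analyze.sh -f perf.converted.out -r 100:200 \
    "sdt_xrt:ioctl_entry: \(.+\) arg1=DRM_IOCTL_AMDXDNA_EXEC_CMD" \
    "sdt_xrt:ioctl_exit: \(.+\) arg1=DRM_IOCTL_AMDXDNA_WAIT_CMD"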