From ed3147d896380b0f20f2a96f0e5a71f7841437a8 Mon Sep 17 00:00:00 2001
From: Lizhi Hou <36547078+houlz0507@users.noreply.github.com>
Date: Wed, 21 Aug 2024 15:33:19 -0700
Subject: [PATCH 01/44] fix bo import and support dup bo in submit list (#211)

---
 src/driver/amdxdna/aie2_ctx.c    |  8 ++--
 src/driver/amdxdna/amdxdna_ctx.c | 75 +++++++++++++++++++++++++++++++-
 src/driver/amdxdna/amdxdna_ctx.h |  2 +
 src/driver/amdxdna/amdxdna_drm.c |  2 +-
 src/driver/amdxdna/amdxdna_gem.c | 43 +++++++++++++++++-
 src/driver/amdxdna/amdxdna_gem.h |  6 ++-
 test/shim_test/shim_test.cpp     |  6 +--
 7 files changed, 129 insertions(+), 13 deletions(-)

diff --git a/src/driver/amdxdna/aie2_ctx.c b/src/driver/amdxdna/aie2_ctx.c
index c630f048..9b56da72 100644
--- a/src/driver/amdxdna/aie2_ctx.c
+++ b/src/driver/amdxdna/aie2_ctx.c
@@ -928,7 +928,7 @@ int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
 	job->out_fence = dma_fence_get(&job->base.s_fence->finished);
 
 retry:
-	ret = drm_gem_lock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
+	ret = amdxdna_lock_objects(job, &acquire_ctx);
 	if (ret) {
 		XDNA_WARN(xdna, "Failed to reverve fence, ret %d", ret);
 		goto put_fence;
@@ -937,7 +937,7 @@ int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
 	for (i = 0; i < job->bo_cnt; i++) {
 		abo = to_xdna_obj(job->bos[i]);
 		if (abo->mem.map_invalid) {
-			drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
+			amdxdna_unlock_objects(job, &acquire_ctx);
 			if (!timeout) {
 				timeout = jiffies +
 					msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
@@ -955,14 +955,14 @@ int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
 		ret = dma_resv_reserve_fences(job->bos[i]->resv, 1);
 		if (ret) {
 			XDNA_WARN(xdna, "Failed to reserve fences %d", ret);
-			drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
+			amdxdna_unlock_objects(job, &acquire_ctx);
 			goto put_fence;
 		}
 	}
 
 	for (i = 0; i < job->bo_cnt; i++)
 		dma_resv_add_fence(job->bos[i]->resv, job->out_fence, DMA_RESV_USAGE_WRITE);
-	drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx);
+	amdxdna_unlock_objects(job, &acquire_ctx);
 
 	mutex_lock(&hwctx->priv->io_lock);
 	ret = aie2_hwctx_add_job(hwctx, job);
diff --git a/src/driver/amdxdna/amdxdna_ctx.c b/src/driver/amdxdna/amdxdna_ctx.c
index ba263879..9bb219f7 100644
--- a/src/driver/amdxdna/amdxdna_ctx.c
+++ b/src/driver/amdxdna/amdxdna_ctx.c
@@ -333,7 +333,7 @@ amdxdna_arg_bos_lookup(struct amdxdna_client *client,
 		abo = to_xdna_obj(gobj);
 
 		mutex_lock(&abo->lock);
-		if (abo->pinned) {
+		if (abo->flags & BO_SUBMIT_PINNED) {
 			mutex_unlock(&abo->lock);
 			job->bos[i] = gobj;
 			continue;
@@ -345,7 +345,7 @@ amdxdna_arg_bos_lookup(struct amdxdna_client *client,
 			drm_gem_object_put(gobj);
 			goto put_arg_bos;
 		}
-		abo->pinned = true;
+		abo->flags |= BO_SUBMIT_PINNED;
 		mutex_unlock(&abo->lock);
 
 		job->bos[i] = gobj;
@@ -375,6 +375,108 @@ void amdxdna_job_put(struct amdxdna_sched_job *job)
 	kref_put(&job->refcnt, amdxdna_sched_job_release);
 }
 
+/*
+ * Return true when the reservation of job->bos[idx] is already used by an
+ * earlier entry of job->bos[]. The submit list may name the same BO more than
+ * once, and each reservation must be locked and unlocked exactly once.
+ */
+static bool amdxdna_resv_is_dup(struct amdxdna_sched_job *job, int idx)
+{
+	int i;
+
+	for (i = 0; i < idx; i++) {
+		if (job->bos[i]->resv == job->bos[idx]->resv)
+			return true;
+	}
+
+	return false;
+}
+
+/* Unlock each distinct reservation among the first cnt entries of job->bos[]. */
+static void amdxdna_unlock_resvs(struct amdxdna_sched_job *job, int cnt)
+{
+	int i;
+
+	for (i = 0; i < cnt; i++) {
+		if (!amdxdna_resv_is_dup(job, i))
+			dma_resv_unlock(job->bos[i]->resv);
+	}
+}
+
+/**
+ * amdxdna_lock_objects - lock the reservations of all BOs of a job
+ * @job: job whose bos[] array (bo_cnt entries) gets locked
+ * @ctx: caller-provided acquire context; initialized here
+ *
+ * job->bos[] may contain the same BO several times. Locking a reservation that
+ * @ctx already owns returns -EALREADY, which is treated as success. Ownership
+ * is tracked through @ctx rather than a per-BO flag, because a flag stored in
+ * the BO would also be seen by other jobs that use the same BO.
+ *
+ * On -EDEADLK every lock is dropped, the contended reservation is taken on the
+ * slow path and the whole list is tried again.
+ *
+ * Return: 0 with all reservations locked (release them with
+ * amdxdna_unlock_objects()), or a negative errno with no lock held and @ctx
+ * finalized.
+ */
+int amdxdna_lock_objects(struct amdxdna_sched_job *job, struct ww_acquire_ctx *ctx)
+{
+	struct amdxdna_dev *xdna = job->hwctx->client->xdna;
+	int contended = -1, i, ret;
+
+	ww_acquire_init(ctx, &reservation_ww_class);
+
+retry:
+	if (contended != -1) {
+		ret = dma_resv_lock_slow_interruptible(job->bos[contended]->resv, ctx);
+		if (ret) {
+			ww_acquire_fini(ctx);
+			return ret;
+		}
+	}
+
+	for (i = 0; i < job->bo_cnt; i++) {
+		ret = dma_resv_lock_interruptible(job->bos[i]->resv, ctx);
+		/* -EALREADY: duplicated BO, or the one taken on the slow path */
+		if (!ret || ret == -EALREADY)
+			continue;
+
+		amdxdna_unlock_resvs(job, i);
+		/*
+		 * A contended entry is always the first user of its reservation,
+		 * so at or past i it was not released by the call above.
+		 */
+		if (contended != -1 && contended >= i)
+			dma_resv_unlock(job->bos[contended]->resv);
+
+		if (ret == -EDEADLK) {
+			contended = i;
+			goto retry;
+		}
+
+		ww_acquire_fini(ctx);
+
+		XDNA_ERR(xdna, "Lock BO failed, ret %d", ret);
+		return ret;
+	}
+
+	ww_acquire_done(ctx);
+
+	return 0;
+}
+
+/**
+ * amdxdna_unlock_objects - drop the locks taken by amdxdna_lock_objects()
+ * @job: job whose BO reservations are locked
+ * @ctx: acquire context that was used to lock them; finalized here
+ */
+void amdxdna_unlock_objects(struct amdxdna_sched_job *job, struct ww_acquire_ctx *ctx)
+{
+	amdxdna_unlock_resvs(job, job->bo_cnt);
+	ww_acquire_fini(ctx);
+}
+
 int amdxdna_cmd_submit(struct amdxdna_client *client, u32 opcode,
 		       u32 cmd_bo_hdl, u32 *arg_bo_hdls, u32 arg_bo_cnt,
 		       u32 hwctx_hdl, u64 *seq)
diff --git a/src/driver/amdxdna/amdxdna_ctx.h b/src/driver/amdxdna/amdxdna_ctx.h
index c1d7ba17..6ccaa45a 100644
--- a/src/driver/amdxdna/amdxdna_ctx.h
+++ b/src/driver/amdxdna/amdxdna_ctx.h
@@ -228,6 +228,8 @@ void amdxdna_hwctx_remove_all(struct amdxdna_client *client);
 void amdxdna_hwctx_suspend(struct amdxdna_client *client);
 void amdxdna_hwctx_resume(struct amdxdna_client *client);
 
+int amdxdna_lock_objects(struct amdxdna_sched_job *job, struct ww_acquire_ctx *ctx);
+void amdxdna_unlock_objects(struct amdxdna_sched_job *job, struct ww_acquire_ctx *ctx);
 int amdxdna_cmd_submit(struct amdxdna_client *client, u32 opcode,
 		       u32 cmd_bo_hdls, u32 *arg_bo_hdls, u32 arg_bo_cnt,
 		       u32 hwctx_hdl, u64 *seq);
diff --git a/src/driver/amdxdna/amdxdna_drm.c b/src/driver/amdxdna/amdxdna_drm.c
index e44062c5..179fb608 100644
--- a/src/driver/amdxdna/amdxdna_drm.c
+++ b/src/driver/amdxdna/amdxdna_drm.c
@@ -229,7 +229,7 @@ const struct drm_driver amdxdna_drm_drv = {
 	/* For shmem object create */
 	.gem_create_object = amdxdna_gem_create_object_cb,
 #ifdef AMDXDNA_SHMEM
-	.gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table,
+	.gem_prime_import = amdxdna_gem_prime_import,
 #else
 	.gem_prime_import_sg_table = drm_gem_dma_prime_import_sg_table,
 #endif
diff --git a/src/driver/amdxdna/amdxdna_gem.c b/src/driver/amdxdna/amdxdna_gem.c
index 6ef81a9b..ebae8b30 100644
--- a/src/driver/amdxdna/amdxdna_gem.c
+++ b/src/driver/amdxdna/amdxdna_gem.c
@@ -67,7 +67,7 @@ static void amdxdna_gem_obj_free(struct drm_gem_object *gobj)
 	struct iosys_map map = IOSYS_MAP_INIT_VADDR(abo->mem.kva);
 
 	XDNA_DBG(xdna, "BO type %d xdna_addr 0x%llx", abo->type, abo->mem.dev_addr);
-	if (abo->pinned)
+	if (abo->flags & BO_SUBMIT_PINNED)
 		amdxdna_gem_unpin(abo);
 
 	flush_work(&abo->hmm_unreg_work);
@@ -354,7 +354,6 @@ amdxdna_gem_create_obj(struct drm_device *dev, size_t size)
 	if (!abo)
 		return ERR_PTR(-ENOMEM);
 
-	abo->pinned = false;
 	abo->assigned_hwctx = AMDXDNA_INVALID_CTX_HANDLE;
 	mutex_init(&abo->lock);
 	INIT_WORK(&abo->hmm_unreg_work, amdxdna_hmm_unreg_work);
@@ -381,6 +380,53 @@ amdxdna_gem_create_object_cb(struct drm_device *dev, size_t size)
 	return to_gobj(abo);
 }
 
+/*
+ * Import a dma-buf as a GEM object: attach dev->dev to it, map the attachment
+ * and wrap the resulting sg table with drm_gem_shmem_prime_import_sg_table().
+ * The new object keeps the attachment and shares the dma-buf's reservation
+ * object. Returns the GEM object, or an ERR_PTR() after undoing all steps.
+ */
+struct drm_gem_object *
+amdxdna_gem_prime_import(struct drm_device *dev, struct dma_buf *dma_buf)
+{
+	struct dma_buf_attachment *att;
+	struct drm_gem_object *obj;
+	struct sg_table *table;
+	int err;
+
+	att = dma_buf_attach(dma_buf, dev->dev);
+	if (IS_ERR(att))
+		return ERR_CAST(att);
+
+	/* Reference dropped again on the error path below. */
+	get_dma_buf(dma_buf);
+
+	table = dma_buf_map_attachment_unlocked(att, DMA_BIDIRECTIONAL);
+	if (IS_ERR(table)) {
+		err = PTR_ERR(table);
+		goto detach;
+	}
+
+	obj = drm_gem_shmem_prime_import_sg_table(dev, att, table);
+	if (IS_ERR(obj)) {
+		err = PTR_ERR(obj);
+		goto unmap;
+	}
+
+	obj->resv = dma_buf->resv;
+	obj->import_attach = att;
+
+	return obj;
+
+unmap:
+	dma_buf_unmap_attachment_unlocked(att, table, DMA_BIDIRECTIONAL);
+detach:
+	dma_buf_detach(dma_buf, att);
+	dma_buf_put(dma_buf);
+
+	return ERR_PTR(err);
+}
+
 static struct amdxdna_gem_obj *
 amdxdna_drm_alloc_shmem(struct drm_device *dev,
 			struct amdxdna_drm_create_bo *args,
diff --git a/src/driver/amdxdna/amdxdna_gem.h b/src/driver/amdxdna/amdxdna_gem.h
index 3429a3ee..24a61608 100644
--- a/src/driver/amdxdna/amdxdna_gem.h
+++ b/src/driver/amdxdna/amdxdna_gem.h
@@ -27,11 +27,13 @@ struct amdxdna_mem {
 #endif
 };
 
+#define BO_SUBMIT_PINNED	BIT(0)
+#define BO_SUBMIT_LOCKED	BIT(1)
 struct amdxdna_gem_obj {
 	struct drm_gem_shmem_object	base;
 	struct amdxdna_client		*client;
 	u8				type;
-	bool				pinned;
+	u64				flags;
 	struct mutex			lock; /* Protects: pinned, assigned_hwctx */
 	struct amdxdna_mem		mem;
 	struct work_struct		hmm_unreg_work;
@@ -60,6 +62,8 @@ static inline void amdxdna_gem_put_obj(struct amdxdna_gem_obj *abo)
 
 struct drm_gem_object *
 amdxdna_gem_create_object_cb(struct drm_device *dev, size_t size);
+struct drm_gem_object *
+amdxdna_gem_prime_import(struct drm_device *dev, struct dma_buf *dma_buf);
 struct amdxdna_gem_obj *
 amdxdna_drm_alloc_dev_bo(struct drm_device *dev,
 			 struct amdxdna_drm_create_bo *args,
diff --git a/test/shim_test/shim_test.cpp b/test/shim_test/shim_test.cpp
index 140d240c..8057b2f7 100644
--- a/test/shim_test/shim_test.cpp
+++ b/test/shim_test/shim_test.cpp
@@ -560,9 +560,9 @@ std::vector<test_case> test_list {
   //test_case{ "Cmd fencing (device side)",
   //  TEST_POSITIVE, dev_filter_is_aie2, TEST_cmd_fence_device, {}
   //},
-  //test_case{ "io test no op with duplicated BOs",
-  //  TEST_POSITIVE, dev_filter_is_aie2, TEST_noop_io_with_dup_bo, {}
-  //},
+  test_case{ "io test no op with duplicated BOs",
+    TEST_POSITIVE, dev_filter_is_aie2, TEST_noop_io_with_dup_bo, {}
+  },
 };
 
 } // namespace

From 62e2cba7ed0f3cc68cba8e0ea82c7fb876892a91 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Wed, 21 Aug 2024 17:47:09 -0700
Subject: [PATCH 02/44] avoid crash in remove() when auto suspend does not work
 (#215)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/amdxdna/aie2_pci.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/driver/amdxdna/aie2_pci.c b/src/driver/amdxdna/aie2_pci.c
index 05bfe03e..68ba6624 100644
--- a/src/driver/amdxdna/aie2_pci.c
+++ b/src/driver/amdxdna/aie2_pci.c
@@ -297,8 +297,10 @@ static void aie2_hw_stop(struct amdxdna_dev *xdna)
 	xdna_mailbox_stop_channel(ndev->mgmt_chann);
 	xdna_mailbox_destroy_channel(ndev->mgmt_chann);
 	ndev->mgmt_chann = NULL;
-	xdna_mailbox_destroy(ndev->mbox);
-	ndev->mbox = NULL;
+	if (ndev->mbox) {
+		xdna_mailbox_destroy(ndev->mbox);
+		ndev->mbox = NULL;
+	}
 	aie2_psp_stop(ndev->psp_hdl);
 	aie2_smu_stop(ndev);
 	pci_clear_master(pdev);

From 5771d8a2dc0828095fd6e195239e4e4c37214787 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Thu, 22 Aug 2024 10:31:02 -0700
Subject: [PATCH 03/44] share npu4 to npu2, 5, 6 (#216)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/amdxdna/npu2_regs.c   | 148 +------------------------------
 src/driver/amdxdna/npu4_family.h | 141 +++++++++++++++++++++++++++++
 src/driver/amdxdna/npu4_regs.c   | 129 ++-------------------------
 src/driver/amdxdna/npu5_regs.c   | 148 +------------------------------
 src/driver/amdxdna/npu6_regs.c   | 148 +------------------------------
 5 files changed, 156 insertions(+), 558 deletions(-)
 create mode 100644 src/driver/amdxdna/npu4_family.h

diff --git a/src/driver/amdxdna/npu2_regs.c b/src/driver/amdxdna/npu2_regs.c
index f84c726e..3a10be1e 100644
--- a/src/driver/amdxdna/npu2_regs.c
+++ b/src/driver/amdxdna/npu2_regs.c
@@ -3,159 +3,19 @@
  * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
  */
 
-#include "drm_local/amdxdna_accel.h"
-#include "aie2_pci.h"
+#include "npu4_family.h"
 
-/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */
-#define MPNPU_PUB_SEC_INTR             0x3010060
-#define MPNPU_PUB_PWRMGMT_INTR         0x3010064
-#define MPNPU_PUB_SCRATCH0             0x301006C
-#define MPNPU_PUB_SCRATCH1             0x3010070
-#define MPNPU_PUB_SCRATCH2             0x3010074
-#define MPNPU_PUB_SCRATCH3             0x3010078
-#define MPNPU_PUB_SCRATCH4             0x301007C
-#define MPNPU_PUB_SCRATCH5             0x3010080
-#define MPNPU_PUB_SCRATCH6             0x3010084
-#define MPNPU_PUB_SCRATCH7             0x3010088
-#define MPNPU_PUB_SCRATCH8             0x301008C
-#define MPNPU_PUB_SCRATCH9             0x3010090
-#define MPNPU_PUB_SCRATCH10            0x3010094
-#define MPNPU_PUB_SCRATCH11            0x3010098
-#define MPNPU_PUB_SCRATCH12            0x301009C
-#define MPNPU_PUB_SCRATCH13            0x30100A0
-#define MPNPU_PUB_SCRATCH14            0x30100A4
-#define MPNPU_PUB_SCRATCH15            0x30100A8
-#define MP0_C2PMSG_73                  0x3810A24
-#define MP0_C2PMSG_123                 0x3810AEC
-
-#define MP1_C2PMSG_0                   0x3B10900
-#define MP1_C2PMSG_60                  0x3B109F0
-#define MP1_C2PMSG_61                  0x3B109F4
-
-#define MPNPU_SRAM_X2I_MAILBOX_0       0x3600000
-#define MPNPU_SRAM_X2I_MAILBOX_15      0x361E000
-#define MPNPU_SRAM_X2I_MAILBOX_31      0x363E000
-#define MPNPU_SRAM_I2X_MAILBOX_31      0x363F000
-
-#define MMNPU_APERTURE0_BASE           0x3000000
-#define MMNPU_APERTURE1_BASE           0x3600000
-#define MMNPU_APERTURE3_BASE           0x3810000
-#define MMNPU_APERTURE4_BASE           0x3B10000
-
-/* PCIe BAR Index for NPU2 */
-#define NPU2_REG_BAR_INDEX	0
-#define NPU2_MBOX_BAR_INDEX	0
-#define NPU2_PSP_BAR_INDEX	4
-#define NPU2_SMU_BAR_INDEX	5
-#define NPU2_SRAM_BAR_INDEX	2
-/* Associated BARs and Apertures */
-#define NPU2_REG_BAR_BASE	MMNPU_APERTURE0_BASE
-#define NPU2_MBOX_BAR_BASE	MMNPU_APERTURE0_BASE
-#define NPU2_PSP_BAR_BASE	MMNPU_APERTURE3_BASE
-#define NPU2_SMU_BAR_BASE	MMNPU_APERTURE4_BASE
-#define NPU2_SRAM_BAR_BASE	MMNPU_APERTURE1_BASE
-
-#define NPU2_RT_CFG_TYPE_CLK_GATING   1
-#define NPU2_RT_CFG_TYPE_HCLK_GATING  2
-#define NPU2_RT_CFG_TYPE_PWR_GATING   3
-#define NPU2_RT_CFG_TYPE_L1IMU_GATING 4
-#define NPU2_RT_CFG_TYPE_PDI_LOAD     5
-#define NPU2_RT_CFG_TYPE_DEBUG_BO     10
-
-#define NPU2_RT_CFG_VAL_CLK_GATING_OFF 0
-#define NPU2_RT_CFG_VAL_CLK_GATING_ON 1
-
-#define NPU2_RT_CFG_VAL_PDI_LOAD_MGMT 0
-#define NPU2_RT_CFG_VAL_PDI_LOAD_APP 1
-
-#define NPU2_RT_CFG_VAL_DEBUG_BO_DEFAULT 0
-#define NPU2_RT_CFG_VAL_DEBUG_BO_LARGE   1
-
-#define NPU2_MPNPUCLK_FREQ_MAX  1267
-#define NPU2_HCLK_FREQ_MAX      1800
-
-const struct dpm_clk npu2_dpm_clk_table[DPM_LEVEL_MAX] = {
-	{396, 792},
-	{600, 1056},
-	{792, 1152},
-	{975, 1267},
-	{975, 1267},
-	{1056, 1408},
-	{1152, 1584},
-	{1267, 1800}
-};
-
-const struct rt_config npu2_rt_cfg[] = {
-	{NPU2_RT_CFG_TYPE_PDI_LOAD, NPU2_RT_CFG_VAL_PDI_LOAD_APP},
-	{NPU2_RT_CFG_TYPE_DEBUG_BO, NPU2_RT_CFG_VAL_DEBUG_BO_LARGE},
-};
-
-const u32 npu2_clk_gating_types[] = {
-	NPU2_RT_CFG_TYPE_CLK_GATING,
-	NPU2_RT_CFG_TYPE_HCLK_GATING,
-	NPU2_RT_CFG_TYPE_PWR_GATING,
-	NPU2_RT_CFG_TYPE_L1IMU_GATING,
-};
+/* NPU2 is the prototype of NPU4. It will be obsoleted in near future. */
 
 const struct amdxdna_dev_priv npu2_dev_priv = {
 	.fw_path        = "amdnpu/17f0_00/npu.sbin",
 	.protocol_major = 0x6,
 	.protocol_minor = 0x6,
-	.rt_config	= npu2_rt_cfg,
-	.num_rt_cfg	= ARRAY_SIZE(npu2_rt_cfg),
-	.col_align	= COL_ALIGN_NATURE,
-	.mbox_dev_addr  = NPU2_MBOX_BAR_BASE,
-	.mbox_size      = 0, /* Use BAR size */
-	.sram_dev_addr  = NPU2_SRAM_BAR_BASE,
-	.sram_offs      = {
-		DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU2_SRAM, MPNPU_SRAM_X2I_MAILBOX_0),
-		DEFINE_BAR_OFFSET(FW_ALIVE_OFF,   NPU2_SRAM, MPNPU_SRAM_X2I_MAILBOX_15),
-	},
-	.psp_regs_off   = {
-		DEFINE_BAR_OFFSET(PSP_CMD_REG,    NPU2_PSP, MP0_C2PMSG_123),
-		DEFINE_BAR_OFFSET(PSP_ARG0_REG,   NPU2_REG, MPNPU_PUB_SCRATCH3),
-		DEFINE_BAR_OFFSET(PSP_ARG1_REG,   NPU2_REG, MPNPU_PUB_SCRATCH4),
-		DEFINE_BAR_OFFSET(PSP_ARG2_REG,   NPU2_REG, MPNPU_PUB_SCRATCH9),
-		DEFINE_BAR_OFFSET(PSP_INTR_REG,   NPU2_PSP, MP0_C2PMSG_73),
-		DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU2_PSP, MP0_C2PMSG_123),
-		DEFINE_BAR_OFFSET(PSP_RESP_REG,   NPU2_REG, MPNPU_PUB_SCRATCH3),
-	},
-	.smu_regs_off   = {
-		DEFINE_BAR_OFFSET(SMU_CMD_REG,  NPU2_SMU, MP1_C2PMSG_0),
-		DEFINE_BAR_OFFSET(SMU_ARG_REG,  NPU2_SMU, MP1_C2PMSG_60),
-		DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU2_SMU, MMNPU_APERTURE4_BASE),
-		DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU2_SMU, MP1_C2PMSG_61),
-		DEFINE_BAR_OFFSET(SMU_OUT_REG,  NPU2_SMU, MP1_C2PMSG_60),
-	},
-	.clk_gating = {
-		.types = npu2_clk_gating_types,
-		.num_types = ARRAY_SIZE(npu2_clk_gating_types),
-		.value_enable = NPU2_RT_CFG_VAL_CLK_GATING_ON,
-		.value_disable = NPU2_RT_CFG_VAL_CLK_GATING_OFF,
-	},
-	.smu_mpnpuclk_freq_max = NPU2_MPNPUCLK_FREQ_MAX,
-	.smu_hclk_freq_max     = NPU2_HCLK_FREQ_MAX,
-	.smu_dpm_max           = 7,
-	.smu_rev = SMU_REVISION_V1,
-	.smu_npu_dpm_clk_table = npu2_dpm_clk_table,
-	.smu_npu_dpm_levels = ARRAY_SIZE(npu2_dpm_clk_table),
-#ifdef AMDXDNA_DEVEL
-	.priv_load_cfg = {NPU2_RT_CFG_TYPE_PDI_LOAD, NPU2_RT_CFG_VAL_PDI_LOAD_MGMT},
-#endif
+	NPU4_COMMON_DEV_PRIV,
 };
 
 const struct amdxdna_dev_info dev_npu2_info = {
-	.reg_bar           = NPU2_REG_BAR_INDEX,
-	.mbox_bar          = NPU2_MBOX_BAR_INDEX,
-	.sram_bar          = NPU2_SRAM_BAR_INDEX,
-	.psp_bar           = NPU2_PSP_BAR_INDEX,
-	.smu_bar           = NPU2_SMU_BAR_INDEX,
-	.first_col         = 0,
-	.dev_mem_buf_shift = 15, /* 32 KiB aligned */
-	.dev_mem_base      = AIE2_DEVM_BASE,
-	.dev_mem_size      = AIE2_DEVM_SIZE,
 	.vbnv              = "RyzenAI-npu2",
-	.device_type       = AMDXDNA_DEV_TYPE_KMQ,
 	.dev_priv          = &npu2_dev_priv,
-	.ops               = &aie2_ops, /* NPU2 can share NPU1's callback */
+	NPU4_COMMON_DEV_INFO,
 };
diff --git a/src/driver/amdxdna/npu4_family.h b/src/driver/amdxdna/npu4_family.h
new file mode 100644
index 00000000..2fc82e20
--- /dev/null
+++ b/src/driver/amdxdna/npu4_family.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ */
+
+#ifndef _NPU4_FAMILY_H_
+#define _NPU4_FAMILY_H_
+
+#include "drm_local/amdxdna_accel.h"
+#include "aie2_pci.h"
+
+/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */
+#define MPNPU_PUB_SEC_INTR             0x3010060
+#define MPNPU_PUB_PWRMGMT_INTR         0x3010064
+#define MPNPU_PUB_SCRATCH0             0x301006C
+#define MPNPU_PUB_SCRATCH1             0x3010070
+#define MPNPU_PUB_SCRATCH2             0x3010074
+#define MPNPU_PUB_SCRATCH3             0x3010078
+#define MPNPU_PUB_SCRATCH4             0x301007C
+#define MPNPU_PUB_SCRATCH5             0x3010080
+#define MPNPU_PUB_SCRATCH6             0x3010084
+#define MPNPU_PUB_SCRATCH7             0x3010088
+#define MPNPU_PUB_SCRATCH8             0x301008C
+#define MPNPU_PUB_SCRATCH9             0x3010090
+#define MPNPU_PUB_SCRATCH10            0x3010094
+#define MPNPU_PUB_SCRATCH11            0x3010098
+#define MPNPU_PUB_SCRATCH12            0x301009C
+#define MPNPU_PUB_SCRATCH13            0x30100A0
+#define MPNPU_PUB_SCRATCH14            0x30100A4
+#define MPNPU_PUB_SCRATCH15            0x30100A8
+#define MP0_C2PMSG_73                  0x3810A24
+#define MP0_C2PMSG_123                 0x3810AEC
+
+#define MP1_C2PMSG_0                   0x3B10900
+#define MP1_C2PMSG_60                  0x3B109F0
+#define MP1_C2PMSG_61                  0x3B109F4
+
+#define MPNPU_SRAM_X2I_MAILBOX_0       0x3600000
+#define MPNPU_SRAM_X2I_MAILBOX_15      0x361E000
+#define MPNPU_SRAM_X2I_MAILBOX_31      0x363E000
+#define MPNPU_SRAM_I2X_MAILBOX_31      0x363F000
+
+#define MMNPU_APERTURE0_BASE           0x3000000
+#define MMNPU_APERTURE1_BASE           0x3600000
+#define MMNPU_APERTURE3_BASE           0x3810000
+#define MMNPU_APERTURE4_BASE           0x3B10000
+
+/* PCIe BAR Index for NPU4 */
+#define NPU4_REG_BAR_INDEX	0
+#define NPU4_MBOX_BAR_INDEX	0
+#define NPU4_PSP_BAR_INDEX	4
+#define NPU4_SMU_BAR_INDEX	5
+#define NPU4_SRAM_BAR_INDEX	2
+/* Associated BARs and Apertures */
+#define NPU4_REG_BAR_BASE	MMNPU_APERTURE0_BASE
+#define NPU4_MBOX_BAR_BASE	MMNPU_APERTURE0_BASE
+#define NPU4_PSP_BAR_BASE	MMNPU_APERTURE3_BASE
+#define NPU4_SMU_BAR_BASE	MMNPU_APERTURE4_BASE
+#define NPU4_SRAM_BAR_BASE	MMNPU_APERTURE1_BASE
+
+#define NPU4_RT_CFG_TYPE_CLK_GATING   1
+#define NPU4_RT_CFG_TYPE_HCLK_GATING  2
+#define NPU4_RT_CFG_TYPE_PWR_GATING   3
+#define NPU4_RT_CFG_TYPE_L1IMU_GATING 4
+#define NPU4_RT_CFG_TYPE_PDI_LOAD     5
+#define NPU4_RT_CFG_TYPE_DEBUG_BO     10
+
+#define NPU4_RT_CFG_VAL_CLK_GATING_OFF 0
+#define NPU4_RT_CFG_VAL_CLK_GATING_ON 1
+
+#define NPU4_RT_CFG_VAL_PDI_LOAD_MGMT 0
+#define NPU4_RT_CFG_VAL_PDI_LOAD_APP 1
+
+#define NPU4_RT_CFG_VAL_DEBUG_BO_DEFAULT 0
+#define NPU4_RT_CFG_VAL_DEBUG_BO_LARGE   1
+
+#define NPU4_MPNPUCLK_FREQ_MAX  1267
+#define NPU4_HCLK_FREQ_MAX      1800
+
+#define NPU4_INIT_RT_CFG_NUM	2
+#define NPU4_CLK_GATING_CFG_NUM 4
+
+extern const struct dpm_clk npu4_dpm_clk_table[DPM_LEVEL_MAX];
+extern const struct rt_config npu4_rt_cfg[NPU4_INIT_RT_CFG_NUM];
+extern const u32 npu4_clk_gating_types[NPU4_CLK_GATING_CFG_NUM];
+
+#define NPU4_COMMON_DEV_PRIV \
+	.rt_config	= npu4_rt_cfg,								\
+	.num_rt_cfg	= ARRAY_SIZE(npu4_rt_cfg),						\
+	.priv_load_cfg = {NPU4_RT_CFG_TYPE_PDI_LOAD, NPU4_RT_CFG_VAL_PDI_LOAD_MGMT},		\
+	.col_align	= COL_ALIGN_NATURE,							\
+	.mbox_dev_addr  = NPU4_MBOX_BAR_BASE,							\
+	.mbox_size      = 0, /* Use BAR size */							\
+	.sram_dev_addr  = NPU4_SRAM_BAR_BASE,							\
+	.sram_offs      = {									\
+		DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_0),		\
+		DEFINE_BAR_OFFSET(FW_ALIVE_OFF,   NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_15),	\
+	},											\
+	.psp_regs_off   = {									\
+		DEFINE_BAR_OFFSET(PSP_CMD_REG,    NPU4_PSP, MP0_C2PMSG_123),			\
+		DEFINE_BAR_OFFSET(PSP_ARG0_REG,   NPU4_REG, MPNPU_PUB_SCRATCH3),		\
+		DEFINE_BAR_OFFSET(PSP_ARG1_REG,   NPU4_REG, MPNPU_PUB_SCRATCH4),		\
+		DEFINE_BAR_OFFSET(PSP_ARG2_REG,   NPU4_REG, MPNPU_PUB_SCRATCH9),		\
+		DEFINE_BAR_OFFSET(PSP_INTR_REG,   NPU4_PSP, MP0_C2PMSG_73),			\
+		DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU4_PSP, MP0_C2PMSG_123),			\
+		DEFINE_BAR_OFFSET(PSP_RESP_REG,   NPU4_REG, MPNPU_PUB_SCRATCH3),		\
+	},											\
+	.smu_regs_off   = {									\
+		DEFINE_BAR_OFFSET(SMU_CMD_REG,  NPU4_SMU, MP1_C2PMSG_0),			\
+		DEFINE_BAR_OFFSET(SMU_ARG_REG,  NPU4_SMU, MP1_C2PMSG_60),			\
+		DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU4_SMU, MMNPU_APERTURE4_BASE),		\
+		DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU4_SMU, MP1_C2PMSG_61),			\
+		DEFINE_BAR_OFFSET(SMU_OUT_REG,  NPU4_SMU, MP1_C2PMSG_60),			\
+	},											\
+	.clk_gating = {										\
+		.types = npu4_clk_gating_types,							\
+		.num_types = ARRAY_SIZE(npu4_clk_gating_types),					\
+		.value_enable = NPU4_RT_CFG_VAL_CLK_GATING_ON,					\
+		.value_disable = NPU4_RT_CFG_VAL_CLK_GATING_OFF,				\
+	},											\
+	.smu_mpnpuclk_freq_max = NPU4_MPNPUCLK_FREQ_MAX,					\
+	.smu_hclk_freq_max     = NPU4_HCLK_FREQ_MAX,						\
+	.smu_dpm_max           = 7,								\
+	.smu_rev = SMU_REVISION_V1,								\
+	.smu_npu_dpm_clk_table = npu4_dpm_clk_table,						\
+	.smu_npu_dpm_levels = ARRAY_SIZE(npu4_dpm_clk_table)
+
+#define NPU4_COMMON_DEV_INFO \
+	.reg_bar           = NPU4_REG_BAR_INDEX,						\
+	.mbox_bar          = NPU4_MBOX_BAR_INDEX,						\
+	.sram_bar          = NPU4_SRAM_BAR_INDEX,						\
+	.psp_bar           = NPU4_PSP_BAR_INDEX,						\
+	.smu_bar           = NPU4_SMU_BAR_INDEX,						\
+	.first_col         = 0,									\
+	.dev_mem_buf_shift = 15, /* 32 KiB aligned */						\
+	.dev_mem_base      = AIE2_DEVM_BASE,							\
+	.dev_mem_size      = AIE2_DEVM_SIZE,							\
+	.device_type       = AMDXDNA_DEV_TYPE_KMQ,						\
+	.ops               = &aie2_ops
+
+#endif /* _NPU4_FAMILY_H_ */
diff --git a/src/driver/amdxdna/npu4_regs.c b/src/driver/amdxdna/npu4_regs.c
index b86958e3..50d0bb10 100644
--- a/src/driver/amdxdna/npu4_regs.c
+++ b/src/driver/amdxdna/npu4_regs.c
@@ -3,76 +3,7 @@
  * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
  */
 
-#include "drm_local/amdxdna_accel.h"
-#include "aie2_pci.h"
-
-/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */
-#define MPNPU_PUB_SEC_INTR             0x3010060
-#define MPNPU_PUB_PWRMGMT_INTR         0x3010064
-#define MPNPU_PUB_SCRATCH0             0x301006C
-#define MPNPU_PUB_SCRATCH1             0x3010070
-#define MPNPU_PUB_SCRATCH2             0x3010074
-#define MPNPU_PUB_SCRATCH3             0x3010078
-#define MPNPU_PUB_SCRATCH4             0x301007C
-#define MPNPU_PUB_SCRATCH5             0x3010080
-#define MPNPU_PUB_SCRATCH6             0x3010084
-#define MPNPU_PUB_SCRATCH7             0x3010088
-#define MPNPU_PUB_SCRATCH8             0x301008C
-#define MPNPU_PUB_SCRATCH9             0x3010090
-#define MPNPU_PUB_SCRATCH10            0x3010094
-#define MPNPU_PUB_SCRATCH11            0x3010098
-#define MPNPU_PUB_SCRATCH12            0x301009C
-#define MPNPU_PUB_SCRATCH13            0x30100A0
-#define MPNPU_PUB_SCRATCH14            0x30100A4
-#define MPNPU_PUB_SCRATCH15            0x30100A8
-#define MP0_C2PMSG_73                  0x3810A24
-#define MP0_C2PMSG_123                 0x3810AEC
-
-#define MP1_C2PMSG_0                   0x3B10900
-#define MP1_C2PMSG_60                  0x3B109F0
-#define MP1_C2PMSG_61                  0x3B109F4
-
-#define MPNPU_SRAM_X2I_MAILBOX_0       0x3600000
-#define MPNPU_SRAM_X2I_MAILBOX_15      0x361E000
-#define MPNPU_SRAM_X2I_MAILBOX_31      0x363E000
-#define MPNPU_SRAM_I2X_MAILBOX_31      0x363F000
-
-#define MMNPU_APERTURE0_BASE           0x3000000
-#define MMNPU_APERTURE1_BASE           0x3600000
-#define MMNPU_APERTURE3_BASE           0x3810000
-#define MMNPU_APERTURE4_BASE           0x3B10000
-
-/* PCIe BAR Index for NPU4 */
-#define NPU4_REG_BAR_INDEX	0
-#define NPU4_MBOX_BAR_INDEX	0
-#define NPU4_PSP_BAR_INDEX	4
-#define NPU4_SMU_BAR_INDEX	5
-#define NPU4_SRAM_BAR_INDEX	2
-/* Associated BARs and Apertures */
-#define NPU4_REG_BAR_BASE	MMNPU_APERTURE0_BASE
-#define NPU4_MBOX_BAR_BASE	MMNPU_APERTURE0_BASE
-#define NPU4_PSP_BAR_BASE	MMNPU_APERTURE3_BASE
-#define NPU4_SMU_BAR_BASE	MMNPU_APERTURE4_BASE
-#define NPU4_SRAM_BAR_BASE	MMNPU_APERTURE1_BASE
-
-#define NPU4_RT_CFG_TYPE_CLK_GATING   1
-#define NPU4_RT_CFG_TYPE_HCLK_GATING  2
-#define NPU4_RT_CFG_TYPE_PWR_GATING   3
-#define NPU4_RT_CFG_TYPE_L1IMU_GATING 4
-#define NPU4_RT_CFG_TYPE_PDI_LOAD     5
-#define NPU4_RT_CFG_TYPE_DEBUG_BO     10
-
-#define NPU4_RT_CFG_VAL_CLK_GATING_OFF 0
-#define NPU4_RT_CFG_VAL_CLK_GATING_ON 1
-
-#define NPU4_RT_CFG_VAL_PDI_LOAD_MGMT 0
-#define NPU4_RT_CFG_VAL_PDI_LOAD_APP 1
-
-#define NPU4_RT_CFG_VAL_DEBUG_BO_DEFAULT 0
-#define NPU4_RT_CFG_VAL_DEBUG_BO_LARGE   1
-
-#define NPU4_MPNPUCLK_FREQ_MAX  1267
-#define NPU4_HCLK_FREQ_MAX      1800
+#include "npu4_family.h"
 
 const struct dpm_clk npu4_dpm_clk_table[DPM_LEVEL_MAX] = {
 	{396, 792},
@@ -85,12 +16,12 @@ const struct dpm_clk npu4_dpm_clk_table[DPM_LEVEL_MAX] = {
 	{1267, 1800}
 };
 
-const struct rt_config npu4_rt_cfg[] = {
+const struct rt_config npu4_rt_cfg[NPU4_INIT_RT_CFG_NUM] = {
 	{NPU4_RT_CFG_TYPE_PDI_LOAD, NPU4_RT_CFG_VAL_PDI_LOAD_APP},
 	{NPU4_RT_CFG_TYPE_DEBUG_BO, NPU4_RT_CFG_VAL_DEBUG_BO_LARGE},
 };
 
-const u32 npu4_clk_gating_types[] = {
+const u32 npu4_clk_gating_types[NPU4_CLK_GATING_CFG_NUM] = {
 	NPU4_RT_CFG_TYPE_CLK_GATING,
 	NPU4_RT_CFG_TYPE_HCLK_GATING,
 	NPU4_RT_CFG_TYPE_PWR_GATING,
@@ -101,61 +32,11 @@ const struct amdxdna_dev_priv npu4_dev_priv = {
 	.fw_path        = "amdnpu/17f0_10/npu.sbin",
 	.protocol_major = 0x6,
 	.protocol_minor = 0x6,
-	.rt_config	= npu4_rt_cfg,
-	.num_rt_cfg	= ARRAY_SIZE(npu4_rt_cfg),
-	.col_align	= COL_ALIGN_NATURE,
-	.mbox_dev_addr  = NPU4_MBOX_BAR_BASE,
-	.mbox_size      = 0, /* Use BAR size */
-	.sram_dev_addr  = NPU4_SRAM_BAR_BASE,
-	.sram_offs      = {
-		DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_0),
-		DEFINE_BAR_OFFSET(FW_ALIVE_OFF,   NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_15),
-	},
-	.psp_regs_off   = {
-		DEFINE_BAR_OFFSET(PSP_CMD_REG,    NPU4_PSP, MP0_C2PMSG_123),
-		DEFINE_BAR_OFFSET(PSP_ARG0_REG,   NPU4_REG, MPNPU_PUB_SCRATCH3),
-		DEFINE_BAR_OFFSET(PSP_ARG1_REG,   NPU4_REG, MPNPU_PUB_SCRATCH4),
-		DEFINE_BAR_OFFSET(PSP_ARG2_REG,   NPU4_REG, MPNPU_PUB_SCRATCH9),
-		DEFINE_BAR_OFFSET(PSP_INTR_REG,   NPU4_PSP, MP0_C2PMSG_73),
-		DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU4_PSP, MP0_C2PMSG_123),
-		DEFINE_BAR_OFFSET(PSP_RESP_REG,   NPU4_REG, MPNPU_PUB_SCRATCH3),
-	},
-	.smu_regs_off   = {
-		DEFINE_BAR_OFFSET(SMU_CMD_REG,  NPU4_SMU, MP1_C2PMSG_0),
-		DEFINE_BAR_OFFSET(SMU_ARG_REG,  NPU4_SMU, MP1_C2PMSG_60),
-		DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU4_SMU, MMNPU_APERTURE4_BASE),
-		DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU4_SMU, MP1_C2PMSG_61),
-		DEFINE_BAR_OFFSET(SMU_OUT_REG,  NPU4_SMU, MP1_C2PMSG_60),
-	},
-	.clk_gating = {
-		.types = npu4_clk_gating_types,
-		.num_types = ARRAY_SIZE(npu4_clk_gating_types),
-		.value_enable = NPU4_RT_CFG_VAL_CLK_GATING_ON,
-		.value_disable = NPU4_RT_CFG_VAL_CLK_GATING_OFF,
-	},
-	.smu_mpnpuclk_freq_max = NPU4_MPNPUCLK_FREQ_MAX,
-	.smu_hclk_freq_max     = NPU4_HCLK_FREQ_MAX,
-	.smu_dpm_max           = 7,
-	.smu_rev = SMU_REVISION_V1,
-	.smu_npu_dpm_clk_table = npu4_dpm_clk_table,
-	.smu_npu_dpm_levels = ARRAY_SIZE(npu4_dpm_clk_table),
-#ifdef AMDXDNA_DEVEL
-	.priv_load_cfg = {NPU4_RT_CFG_TYPE_PDI_LOAD, NPU4_RT_CFG_VAL_PDI_LOAD_MGMT},
-#endif
+	NPU4_COMMON_DEV_PRIV,
 };
 
 const struct amdxdna_dev_info dev_npu4_info = {
-	.reg_bar           = NPU4_REG_BAR_INDEX,
-	.mbox_bar          = NPU4_MBOX_BAR_INDEX,
-	.sram_bar          = NPU4_SRAM_BAR_INDEX,
-	.psp_bar           = NPU4_PSP_BAR_INDEX,
-	.smu_bar           = NPU4_SMU_BAR_INDEX,
-	.first_col         = 0,
-	.dev_mem_buf_shift = 15, /* 32 KiB aligned */
-	.dev_mem_base      = AIE2_DEVM_BASE,
-	.dev_mem_size      = AIE2_DEVM_SIZE,
 	.vbnv              = "RyzenAI-npu4",
-	.device_type       = AMDXDNA_DEV_TYPE_KMQ,
 	.dev_priv          = &npu4_dev_priv,
-	.ops               = &aie2_ops, /* NPU4 can share NPU1's callback */
+	NPU4_COMMON_DEV_INFO,
 };
diff --git a/src/driver/amdxdna/npu5_regs.c b/src/driver/amdxdna/npu5_regs.c
index ed7d81df..7f0050d1 100644
--- a/src/driver/amdxdna/npu5_regs.c
+++ b/src/driver/amdxdna/npu5_regs.c
@@ -3,159 +3,17 @@
  * Copyright (C) 2024, Advanced Micro Devices, Inc.
  */
 
-#include "drm_local/amdxdna_accel.h"
-#include "aie2_pci.h"
-
-/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */
-#define MPNPU_PUB_SEC_INTR             0x3010060
-#define MPNPU_PUB_PWRMGMT_INTR         0x3010064
-#define MPNPU_PUB_SCRATCH0             0x301006C
-#define MPNPU_PUB_SCRATCH1             0x3010070
-#define MPNPU_PUB_SCRATCH2             0x3010074
-#define MPNPU_PUB_SCRATCH3             0x3010078
-#define MPNPU_PUB_SCRATCH4             0x301007C
-#define MPNPU_PUB_SCRATCH5             0x3010080
-#define MPNPU_PUB_SCRATCH6             0x3010084
-#define MPNPU_PUB_SCRATCH7             0x3010088
-#define MPNPU_PUB_SCRATCH8             0x301008C
-#define MPNPU_PUB_SCRATCH9             0x3010090
-#define MPNPU_PUB_SCRATCH10            0x3010094
-#define MPNPU_PUB_SCRATCH11            0x3010098
-#define MPNPU_PUB_SCRATCH12            0x301009C
-#define MPNPU_PUB_SCRATCH13            0x30100A0
-#define MPNPU_PUB_SCRATCH14            0x30100A4
-#define MPNPU_PUB_SCRATCH15            0x30100A8
-#define MP0_C2PMSG_73                  0x3810A24
-#define MP0_C2PMSG_123                 0x3810AEC
-
-#define MP1_C2PMSG_0                   0x3B10900
-#define MP1_C2PMSG_60                  0x3B109F0
-#define MP1_C2PMSG_61                  0x3B109F4
-
-#define MPNPU_SRAM_X2I_MAILBOX_0       0x3600000
-#define MPNPU_SRAM_X2I_MAILBOX_15      0x361E000
-#define MPNPU_SRAM_X2I_MAILBOX_31      0x363E000
-#define MPNPU_SRAM_I2X_MAILBOX_31      0x363F000
-
-#define MMNPU_APERTURE0_BASE           0x3000000
-#define MMNPU_APERTURE1_BASE           0x3600000
-#define MMNPU_APERTURE3_BASE           0x3810000
-#define MMNPU_APERTURE4_BASE           0x3B10000
-
-/* PCIe BAR Index for NPU5 */
-#define NPU5_REG_BAR_INDEX	0
-#define NPU5_MBOX_BAR_INDEX	0
-#define NPU5_PSP_BAR_INDEX	4
-#define NPU5_SMU_BAR_INDEX	5
-#define NPU5_SRAM_BAR_INDEX	2
-/* Associated BARs and Apertures */
-#define NPU5_REG_BAR_BASE	MMNPU_APERTURE0_BASE
-#define NPU5_MBOX_BAR_BASE	MMNPU_APERTURE0_BASE
-#define NPU5_PSP_BAR_BASE	MMNPU_APERTURE3_BASE
-#define NPU5_SMU_BAR_BASE	MMNPU_APERTURE4_BASE
-#define NPU5_SRAM_BAR_BASE	MMNPU_APERTURE1_BASE
-
-#define NPU5_RT_CFG_TYPE_CLK_GATING   1
-#define NPU5_RT_CFG_TYPE_HCLK_GATING  2
-#define NPU5_RT_CFG_TYPE_PWR_GATING   3
-#define NPU5_RT_CFG_TYPE_L1IMU_GATING 4
-#define NPU5_RT_CFG_TYPE_PDI_LOAD     5
-#define NPU5_RT_CFG_TYPE_DEBUG_BO     10
-
-#define NPU5_RT_CFG_VAL_CLK_GATING_OFF 0
-#define NPU5_RT_CFG_VAL_CLK_GATING_ON 1
-
-#define NPU5_RT_CFG_VAL_PDI_LOAD_MGMT 0
-#define NPU5_RT_CFG_VAL_PDI_LOAD_APP 1
-
-#define NPU5_RT_CFG_VAL_DEBUG_BO_DEFAULT 0
-#define NPU5_RT_CFG_VAL_DEBUG_BO_LARGE   1
-
-#define NPU5_MPNPUCLK_FREQ_MAX  1267
-#define NPU5_HCLK_FREQ_MAX      1800
-
-const struct dpm_clk npu5_dpm_clk_table[DPM_LEVEL_MAX] = {
-	{396, 792},
-	{600, 1056},
-	{792, 1152},
-	{975, 1267},
-	{975, 1267},
-	{1056, 1408},
-	{1152, 1584},
-	{1267, 1800}
-};
-
-const struct rt_config npu5_rt_cfg[] = {
-	{NPU5_RT_CFG_TYPE_PDI_LOAD, NPU5_RT_CFG_VAL_PDI_LOAD_APP},
-	{NPU5_RT_CFG_TYPE_DEBUG_BO, NPU5_RT_CFG_VAL_DEBUG_BO_LARGE},
-};
-
-const u32 npu5_clk_gating_types[] = {
-	NPU5_RT_CFG_TYPE_CLK_GATING,
-	NPU5_RT_CFG_TYPE_HCLK_GATING,
-	NPU5_RT_CFG_TYPE_PWR_GATING,
-	NPU5_RT_CFG_TYPE_L1IMU_GATING,
-};
+#include "npu4_family.h"
 
 const struct amdxdna_dev_priv npu5_dev_priv = {
 	.fw_path        = "amdnpu/17f0_11/npu.sbin",
 	.protocol_major = 0x6,
 	.protocol_minor = 0x6,
-	.rt_config	= npu5_rt_cfg,
-	.num_rt_cfg	= ARRAY_SIZE(npu5_rt_cfg),
-	.col_align	= COL_ALIGN_NATURE,
-	.mbox_dev_addr  = NPU5_MBOX_BAR_BASE,
-	.mbox_size      = 0, /* Use BAR size */
-	.sram_dev_addr  = NPU5_SRAM_BAR_BASE,
-	.sram_offs      = {
-		DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU5_SRAM, MPNPU_SRAM_X2I_MAILBOX_0),
-		DEFINE_BAR_OFFSET(FW_ALIVE_OFF,   NPU5_SRAM, MPNPU_SRAM_X2I_MAILBOX_15),
-	},
-	.psp_regs_off   = {
-		DEFINE_BAR_OFFSET(PSP_CMD_REG,    NPU5_PSP, MP0_C2PMSG_123),
-		DEFINE_BAR_OFFSET(PSP_ARG0_REG,   NPU5_REG, MPNPU_PUB_SCRATCH3),
-		DEFINE_BAR_OFFSET(PSP_ARG1_REG,   NPU5_REG, MPNPU_PUB_SCRATCH4),
-		DEFINE_BAR_OFFSET(PSP_ARG2_REG,   NPU5_REG, MPNPU_PUB_SCRATCH9),
-		DEFINE_BAR_OFFSET(PSP_INTR_REG,   NPU5_PSP, MP0_C2PMSG_73),
-		DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU5_PSP, MP0_C2PMSG_123),
-		DEFINE_BAR_OFFSET(PSP_RESP_REG,   NPU5_REG, MPNPU_PUB_SCRATCH3),
-	},
-	.smu_regs_off   = {
-		DEFINE_BAR_OFFSET(SMU_CMD_REG,  NPU5_SMU, MP1_C2PMSG_0),
-		DEFINE_BAR_OFFSET(SMU_ARG_REG,  NPU5_SMU, MP1_C2PMSG_60),
-		DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU5_SMU, MMNPU_APERTURE4_BASE),
-		DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU5_SMU, MP1_C2PMSG_61),
-		DEFINE_BAR_OFFSET(SMU_OUT_REG,  NPU5_SMU, MP1_C2PMSG_60),
-	},
-	.clk_gating = {
-		.types = npu5_clk_gating_types,
-		.num_types = ARRAY_SIZE(npu5_clk_gating_types),
-		.value_enable = NPU5_RT_CFG_VAL_CLK_GATING_ON,
-		.value_disable = NPU5_RT_CFG_VAL_CLK_GATING_OFF,
-	},
-	.smu_mpnpuclk_freq_max = NPU5_MPNPUCLK_FREQ_MAX,
-	.smu_hclk_freq_max     = NPU5_HCLK_FREQ_MAX,
-	.smu_dpm_max           = 7,
-	.smu_rev = SMU_REVISION_V1,
-	.smu_npu_dpm_clk_table = npu5_dpm_clk_table,
-	.smu_npu_dpm_levels = ARRAY_SIZE(npu5_dpm_clk_table),
-#ifdef AMDXDNA_DEVEL
-	.priv_load_cfg = {NPU5_RT_CFG_TYPE_PDI_LOAD, NPU5_RT_CFG_VAL_PDI_LOAD_MGMT},
-#endif
+	NPU4_COMMON_DEV_PRIV,
 };
 
 const struct amdxdna_dev_info dev_npu5_info = {
-	.reg_bar           = NPU5_REG_BAR_INDEX,
-	.mbox_bar          = NPU5_MBOX_BAR_INDEX,
-	.sram_bar          = NPU5_SRAM_BAR_INDEX,
-	.psp_bar           = NPU5_PSP_BAR_INDEX,
-	.smu_bar           = NPU5_SMU_BAR_INDEX,
-	.first_col         = 0,
-	.dev_mem_buf_shift = 15, /* 32 KiB aligned */
-	.dev_mem_base      = AIE2_DEVM_BASE,
-	.dev_mem_size      = AIE2_DEVM_SIZE,
 	.vbnv              = "RyzenAI-npu5",
-	.device_type       = AMDXDNA_DEV_TYPE_KMQ,
 	.dev_priv          = &npu5_dev_priv,
-	.ops               = &aie2_ops, /* NPU5 can share NPU1's callback */
+	NPU4_COMMON_DEV_INFO,
 };
diff --git a/src/driver/amdxdna/npu6_regs.c b/src/driver/amdxdna/npu6_regs.c
index f418896a..c780259a 100644
--- a/src/driver/amdxdna/npu6_regs.c
+++ b/src/driver/amdxdna/npu6_regs.c
@@ -3,159 +3,17 @@
  * Copyright (C) 2024, Advanced Micro Devices, Inc.
  */
 
-#include "drm_local/amdxdna_accel.h"
-#include "aie2_pci.h"
-
-/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */
-#define MPNPU_PUB_SEC_INTR             0x3010060
-#define MPNPU_PUB_PWRMGMT_INTR         0x3010064
-#define MPNPU_PUB_SCRATCH0             0x301006C
-#define MPNPU_PUB_SCRATCH1             0x3010070
-#define MPNPU_PUB_SCRATCH2             0x3010074
-#define MPNPU_PUB_SCRATCH3             0x3010078
-#define MPNPU_PUB_SCRATCH4             0x301007C
-#define MPNPU_PUB_SCRATCH5             0x3010080
-#define MPNPU_PUB_SCRATCH6             0x3010084
-#define MPNPU_PUB_SCRATCH7             0x3010088
-#define MPNPU_PUB_SCRATCH8             0x301008C
-#define MPNPU_PUB_SCRATCH9             0x3010090
-#define MPNPU_PUB_SCRATCH10            0x3010094
-#define MPNPU_PUB_SCRATCH11            0x3010098
-#define MPNPU_PUB_SCRATCH12            0x301009C
-#define MPNPU_PUB_SCRATCH13            0x30100A0
-#define MPNPU_PUB_SCRATCH14            0x30100A4
-#define MPNPU_PUB_SCRATCH15            0x30100A8
-#define MP0_C2PMSG_73                  0x3810A24
-#define MP0_C2PMSG_123                 0x3810AEC
-
-#define MP1_C2PMSG_0                   0x3B10900
-#define MP1_C2PMSG_60                  0x3B109F0
-#define MP1_C2PMSG_61                  0x3B109F4
-
-#define MPNPU_SRAM_X2I_MAILBOX_0       0x3600000
-#define MPNPU_SRAM_X2I_MAILBOX_15      0x361E000
-#define MPNPU_SRAM_X2I_MAILBOX_31      0x363E000
-#define MPNPU_SRAM_I2X_MAILBOX_31      0x363F000
-
-#define MMNPU_APERTURE0_BASE           0x3000000
-#define MMNPU_APERTURE1_BASE           0x3600000
-#define MMNPU_APERTURE3_BASE           0x3810000
-#define MMNPU_APERTURE4_BASE           0x3B10000
-
-/* PCIe BAR Index for NPU6 */
-#define NPU6_REG_BAR_INDEX	0
-#define NPU6_MBOX_BAR_INDEX	0
-#define NPU6_PSP_BAR_INDEX	4
-#define NPU6_SMU_BAR_INDEX	5
-#define NPU6_SRAM_BAR_INDEX	2
-/* Associated BARs and Apertures */
-#define NPU6_REG_BAR_BASE	MMNPU_APERTURE0_BASE
-#define NPU6_MBOX_BAR_BASE	MMNPU_APERTURE0_BASE
-#define NPU6_PSP_BAR_BASE	MMNPU_APERTURE3_BASE
-#define NPU6_SMU_BAR_BASE	MMNPU_APERTURE4_BASE
-#define NPU6_SRAM_BAR_BASE	MMNPU_APERTURE1_BASE
-
-#define NPU6_RT_CFG_TYPE_CLK_GATING   1
-#define NPU6_RT_CFG_TYPE_HCLK_GATING  2
-#define NPU6_RT_CFG_TYPE_PWR_GATING   3
-#define NPU6_RT_CFG_TYPE_L1IMU_GATING 4
-#define NPU6_RT_CFG_TYPE_PDI_LOAD     5
-#define NPU6_RT_CFG_TYPE_DEBUG_BO     10
-
-#define NPU6_RT_CFG_VAL_CLK_GATING_OFF 0
-#define NPU6_RT_CFG_VAL_CLK_GATING_ON 1
-
-#define NPU6_RT_CFG_VAL_PDI_LOAD_MGMT 0
-#define NPU6_RT_CFG_VAL_PDI_LOAD_APP 1
-
-#define NPU6_RT_CFG_VAL_DEBUG_BO_DEFAULT 0
-#define NPU6_RT_CFG_VAL_DEBUG_BO_LARGE   1
-
-#define NPU6_MPNPUCLK_FREQ_MAX  1267
-#define NPU6_HCLK_FREQ_MAX      1800
-
-const struct dpm_clk npu6_dpm_clk_table[DPM_LEVEL_MAX] = {
-	{396, 792},
-	{600, 1056},
-	{792, 1152},
-	{975, 1267},
-	{975, 1267},
-	{1056, 1408},
-	{1152, 1584},
-	{1267, 1800}
-};
-
-const struct rt_config npu6_rt_cfg[] = {
-	{NPU6_RT_CFG_TYPE_PDI_LOAD, NPU6_RT_CFG_VAL_PDI_LOAD_APP},
-	{NPU6_RT_CFG_TYPE_DEBUG_BO, NPU6_RT_CFG_VAL_DEBUG_BO_LARGE},
-};
-
-const u32 npu6_clk_gating_types[] = {
-	NPU6_RT_CFG_TYPE_CLK_GATING,
-	NPU6_RT_CFG_TYPE_HCLK_GATING,
-	NPU6_RT_CFG_TYPE_PWR_GATING,
-	NPU6_RT_CFG_TYPE_L1IMU_GATING,
-};
+#include "npu4_family.h"
 
 const struct amdxdna_dev_priv npu6_dev_priv = {
 	.fw_path        = "amdnpu/17f0_20/npu.sbin",
 	.protocol_major = 0x6,
 	.protocol_minor = 0x6,
-	.rt_config	= npu6_rt_cfg,
-	.num_rt_cfg	= ARRAY_SIZE(npu6_rt_cfg),
-	.col_align	= COL_ALIGN_NATURE,
-	.mbox_dev_addr  = NPU6_MBOX_BAR_BASE,
-	.mbox_size      = 0, /* Use BAR size */
-	.sram_dev_addr  = NPU6_SRAM_BAR_BASE,
-	.sram_offs      = {
-		DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU6_SRAM, MPNPU_SRAM_X2I_MAILBOX_0),
-		DEFINE_BAR_OFFSET(FW_ALIVE_OFF,   NPU6_SRAM, MPNPU_SRAM_X2I_MAILBOX_15),
-	},
-	.psp_regs_off   = {
-		DEFINE_BAR_OFFSET(PSP_CMD_REG,    NPU6_PSP, MP0_C2PMSG_123),
-		DEFINE_BAR_OFFSET(PSP_ARG0_REG,   NPU6_REG, MPNPU_PUB_SCRATCH3),
-		DEFINE_BAR_OFFSET(PSP_ARG1_REG,   NPU6_REG, MPNPU_PUB_SCRATCH4),
-		DEFINE_BAR_OFFSET(PSP_ARG2_REG,   NPU6_REG, MPNPU_PUB_SCRATCH9),
-		DEFINE_BAR_OFFSET(PSP_INTR_REG,   NPU6_PSP, MP0_C2PMSG_73),
-		DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU6_PSP, MP0_C2PMSG_123),
-		DEFINE_BAR_OFFSET(PSP_RESP_REG,   NPU6_REG, MPNPU_PUB_SCRATCH3),
-	},
-	.smu_regs_off   = {
-		DEFINE_BAR_OFFSET(SMU_CMD_REG,  NPU6_SMU, MP1_C2PMSG_0),
-		DEFINE_BAR_OFFSET(SMU_ARG_REG,  NPU6_SMU, MP1_C2PMSG_60),
-		DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU6_SMU, MMNPU_APERTURE4_BASE),
-		DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU6_SMU, MP1_C2PMSG_61),
-		DEFINE_BAR_OFFSET(SMU_OUT_REG,  NPU6_SMU, MP1_C2PMSG_60),
-	},
-	.clk_gating = {
-		.types = npu6_clk_gating_types,
-		.num_types = ARRAY_SIZE(npu6_clk_gating_types),
-		.value_enable = NPU6_RT_CFG_VAL_CLK_GATING_ON,
-		.value_disable = NPU6_RT_CFG_VAL_CLK_GATING_OFF,
-	},
-	.smu_mpnpuclk_freq_max = NPU6_MPNPUCLK_FREQ_MAX,
-	.smu_hclk_freq_max     = NPU6_HCLK_FREQ_MAX,
-	.smu_dpm_max           = 7,
-	.smu_rev = SMU_REVISION_V1,
-	.smu_npu_dpm_clk_table = npu6_dpm_clk_table,
-	.smu_npu_dpm_levels = ARRAY_SIZE(npu6_dpm_clk_table),
-#ifdef AMDXDNA_DEVEL
-	.priv_load_cfg = {NPU6_RT_CFG_TYPE_PDI_LOAD, NPU6_RT_CFG_VAL_PDI_LOAD_MGMT},
-#endif
+	NPU4_COMMON_DEV_PRIV,
 };
 
 const struct amdxdna_dev_info dev_npu6_info = {
-	.reg_bar           = NPU6_REG_BAR_INDEX,
-	.mbox_bar          = NPU6_MBOX_BAR_INDEX,
-	.sram_bar          = NPU6_SRAM_BAR_INDEX,
-	.psp_bar           = NPU6_PSP_BAR_INDEX,
-	.smu_bar           = NPU6_SMU_BAR_INDEX,
-	.first_col         = 0,
-	.dev_mem_buf_shift = 15, /* 32 KiB aligned */
-	.dev_mem_base      = AIE2_DEVM_BASE,
-	.dev_mem_size      = AIE2_DEVM_SIZE,
 	.vbnv              = "RyzenAI-npu6",
-	.device_type       = AMDXDNA_DEV_TYPE_KMQ,
 	.dev_priv          = &npu6_dev_priv,
-	.ops               = &aie2_ops,
+	NPU4_COMMON_DEV_INFO,
 };

From be2b8bfe7facf97a143da6a6f6a25f4962a63172 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Thu, 22 Aug 2024 11:15:18 -0700
Subject: [PATCH 04/44] Once job->fence is signaled, drm will clean up
 job->base. Thus the tracepoint must come before that to avoid a race. (#218)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/amdxdna/aie2_ctx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/driver/amdxdna/aie2_ctx.c b/src/driver/amdxdna/aie2_ctx.c
index 9b56da72..568ccb18 100644
--- a/src/driver/amdxdna/aie2_ctx.c
+++ b/src/driver/amdxdna/aie2_ctx.c
@@ -230,8 +230,8 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
 	struct dma_fence *fence = job->fence;
 
 	job->hwctx->completed++;
+	trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", job->seq);
 	dma_fence_signal(fence);
-	trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", job->seq);
 	dma_fence_put(fence);
 	mmput(job->mm);
 	amdxdna_job_put(job);

From 9337513c9edaf5313159990bb584d1755aee36bb Mon Sep 17 00:00:00 2001
From: Lizhi Hou <36547078+houlz0507@users.noreply.github.com>
Date: Thu, 22 Aug 2024 12:51:54 -0700
Subject: [PATCH 05/44] fix shimtest import export test failure when NPU driver
 is loaded with iommu_mode=1 (#217)

---
 src/driver/amdxdna/amdxdna_devel.c |  3 +++
 src/driver/amdxdna/amdxdna_gem.c   | 14 ++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/src/driver/amdxdna/amdxdna_devel.c b/src/driver/amdxdna/amdxdna_devel.c
index c4dd3ee5..3f6698a0 100644
--- a/src/driver/amdxdna/amdxdna_devel.c
+++ b/src/driver/amdxdna/amdxdna_devel.c
@@ -171,6 +171,9 @@ void amdxdna_bo_dma_unmap(struct amdxdna_gem_obj *abo)
 	struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
 
 	XDNA_DBG(xdna, "BO type %d dma_addr 0x%llx", abo->type, abo->mem.dma_addr);
+	if (is_import_bo(abo))
+		return;
+
 	drm_gem_shmem_put_pages(&abo->base);
 }
 #else
diff --git a/src/driver/amdxdna/amdxdna_gem.c b/src/driver/amdxdna/amdxdna_gem.c
index ebae8b30..76f14d90 100644
--- a/src/driver/amdxdna/amdxdna_gem.c
+++ b/src/driver/amdxdna/amdxdna_gem.c
@@ -409,6 +409,20 @@ amdxdna_gem_prime_import(struct drm_device *dev, struct dma_buf *dma_buf)
 	gobj->import_attach = attach;
 	gobj->resv = dma_buf->resv;
 
+#ifdef AMDXDNA_DEVEL
+	if (iommu_mode == AMDXDNA_IOMMU_NO_PASID) {
+		struct amdxdna_gem_obj *abo;
+
+		abo = to_xdna_obj(gobj);
+		ret = amdxdna_bo_dma_map(abo);
+		if (ret) {
+			drm_gem_object_put(gobj);
+			goto fail_unmap;
+		}
+		abo->mem.dev_addr = abo->mem.dma_addr;
+	}
+#endif
+
 	return gobj;
 
 fail_unmap:

From 27457c9c257073ec9166632c6c43f32d32ca4908 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Thu, 22 Aug 2024 17:45:45 -0700
Subject: [PATCH 06/44] npu6 and npu4 share the same firmware binary (#219)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/amdxdna/npu6_regs.c | 2 +-
 tools/info.json                | 8 --------
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/src/driver/amdxdna/npu6_regs.c b/src/driver/amdxdna/npu6_regs.c
index c780259a..efa01321 100644
--- a/src/driver/amdxdna/npu6_regs.c
+++ b/src/driver/amdxdna/npu6_regs.c
@@ -6,7 +6,7 @@
 #include "npu4_family.h"
 
 const struct amdxdna_dev_priv npu6_dev_priv = {
-	.fw_path        = "amdnpu/17f0_20/npu.sbin",
+	.fw_path        = "amdnpu/17f0_10/npu.sbin",
 	.protocol_major = 0x6,
 	.protocol_minor = 0x6,
 	NPU4_COMMON_DEV_PRIV,
diff --git a/tools/info.json b/tools/info.json
index c8f73197..2fb248d8 100644
--- a/tools/info.json
+++ b/tools/info.json
@@ -36,14 +36,6 @@
 			"pci_revision_id": "11",
 			"version": "0.7.30.101",
 			"fw_name": "npu.sbin"
-		},
-		{
-			"device": "npu6",
-			"url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/17f0_20/npu.sbin.0.7.30.20",
-			"pci_device_id": "17f0",
-			"pci_revision_id": "20",
-			"version": "0.7.30.20",
-			"fw_name": "npu.sbin"
 		}
 	]
 }

From 5228267cb4345c4cf8c2bd1fd0f1786e46237db1 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Fri, 23 Aug 2024 09:49:17 -0700
Subject: [PATCH 07/44] add noop kernel runlist latency test (#221)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 test/shim_test/io_test.cpp   | 12 ++++++++++++
 test/shim_test/shim_test.cpp |  4 ++++
 2 files changed, 16 insertions(+)

diff --git a/test/shim_test/io_test.cpp b/test/shim_test/io_test.cpp
index 747e8be0..b7517174 100644
--- a/test/shim_test/io_test.cpp
+++ b/test/shim_test/io_test.cpp
@@ -246,6 +246,18 @@ TEST_io_latency(device::id_type id, std::shared_ptr<device> sdev, arg_type& arg)
   io_test(id, sdev.get(), 1000, 1, 1);
 }
 
+void
+TEST_io_runlist_latency(device::id_type id, std::shared_ptr<device> sdev, arg_type& arg)
+{
+  io_test_parameter_init(IO_TEST_LATENCY_PERF, static_cast<unsigned int>(arg[0]));
+  io_test(id, sdev.get(), 32000, 1,  1);
+  io_test(id, sdev.get(), 16000, 1,  2);
+  io_test(id, sdev.get(),  8000, 1,  4);
+  io_test(id, sdev.get(),  4000, 1,  8);
+  io_test(id, sdev.get(),  2000, 1, 16);
+  io_test(id, sdev.get(),  1333, 1, 24);
+}
+
 void
 TEST_io_throughput(device::id_type id, std::shared_ptr<device> sdev, arg_type& arg)
 {
diff --git a/test/shim_test/shim_test.cpp b/test/shim_test/shim_test.cpp
index 8057b2f7..28444b14 100644
--- a/test/shim_test/shim_test.cpp
+++ b/test/shim_test/shim_test.cpp
@@ -29,6 +29,7 @@ using arg_type = const std::vector<uint64_t>;
 void TEST_export_import_bo(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_io(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_io_latency(device::id_type, std::shared_ptr<device>, arg_type&);
+void TEST_io_runlist_latency(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_io_throughput(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_noop_io_with_dup_bo(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_shim_umq_vadd(device::id_type, std::shared_ptr<device>, arg_type&);
@@ -563,6 +564,9 @@ std::vector<test_case> test_list {
   test_case{ "io test no op with duplicated BOs",
     TEST_POSITIVE, dev_filter_is_aie2, TEST_noop_io_with_dup_bo, {}
   },
+  test_case{ "io test no-op kernel latency listed command",
+    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_latency, { IO_TEST_NOOP_RUN }
+  },
 };
 
 } // namespace

From 44dd97c6c2d658b59054a060a1417e2dd85e3541 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Fri, 23 Aug 2024 13:04:42 -0700
Subject: [PATCH 08/44] support new mgmt channel info struct format (#220)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/amdxdna/aie2_message.c | 15 ++----
 src/driver/amdxdna/aie2_pci.c     | 78 ++++++++++++++++++++++++++++---
 src/driver/amdxdna/aie2_pci.h     |  3 ++
 3 files changed, 77 insertions(+), 19 deletions(-)

diff --git a/src/driver/amdxdna/aie2_message.c b/src/driver/amdxdna/aie2_message.c
index 66b3299a..fd83674f 100644
--- a/src/driver/amdxdna/aie2_message.c
+++ b/src/driver/amdxdna/aie2_message.c
@@ -95,18 +95,9 @@ int aie2_check_protocol_version(struct amdxdna_dev_hdl *ndev)
 		return ret;
 	}
 
-	if (resp.major != ndev->priv->protocol_major) {
-		XDNA_ERR(xdna, "Incompatible firmware protocol version major %d minor %d",
-			 resp.major, resp.minor);
-		return -EINVAL;
-	}
-
-	/*
-	 * Greater protocol minor version means new messages/status/emun are
-	 * added into the firmware interface protocol.
-	 */
-	if (resp.minor < ndev->priv->protocol_minor) {
-		XDNA_ERR(xdna, "Firmware minor version smaller than supported");
+	ret = aie2_check_protocol(ndev, resp.major, resp.minor);
+	if (ret) {
+		XDNA_ERR(xdna, "Failed check protocol %d.%d", resp.major, resp.minor);
 		return -EINVAL;
 	}
 
diff --git a/src/driver/amdxdna/aie2_pci.c b/src/driver/amdxdna/aie2_pci.c
index 68ba6624..bf221560 100644
--- a/src/driver/amdxdna/aie2_pci.c
+++ b/src/driver/amdxdna/aie2_pci.c
@@ -29,7 +29,16 @@ MODULE_PARM_DESC(aie2_max_col, "Maximum column could be used");
  * The management mailbox channel is allocated by firmware.
  * The related register and ring buffer information is on SRAM BAR.
  * This struct is the register layout.
+ *
+ * Mgmt channel info query flow:
+ * 1. Poll alive pointer register until it is non zero
+ * 2. The alive pointer points to Mgmt Mbox Info on SRAM bar
+ * 3. Read x2i_* and i2x_*
+ * 4. If magic number MGMT_MBOX_MAGIC is not present, done;
+ *    otherwise, read msi_id, major, minor etc.
  */
+#define MGMT_MBOX_MAGIC 0x55504e5f /* _NPU */
+#define MAGIC_OFFSET offsetof(struct mgmt_mbox_chann_info, magic)
 struct mgmt_mbox_chann_info {
 	u32	x2i_tail;
 	u32	x2i_head;
@@ -39,8 +48,45 @@ struct mgmt_mbox_chann_info {
 	u32	i2x_head;
 	u32	i2x_buf;
 	u32	i2x_buf_sz;
+	u32	magic;
+	u32	msi_id;
+	u32	prot_major;
+	u32	prot_minor;
+	u32	rsvd[4];
 };
 
+int aie2_check_protocol(struct amdxdna_dev_hdl *ndev, u32 fw_major, u32 fw_minor)
+{
+	struct amdxdna_dev *xdna = ndev->xdna;
+
+	/*
+	 * Validate the firmware mailbox protocol version against the one the
+	 * driver supports (ndev->priv->protocol_major and protocol_minor).
+	 *
+	 * A different major means incompatible behavior.
+	 * When only the minor differs, the greater minor means more opcodes etc.
+	 *
+	 * Thus, return 0 only when both hold, otherwise -EINVAL:
+	 * 1. driver and fw major are the same
+	 * 2. driver minor is smaller than or equal to fw minor
+	 */
+	if (ndev->priv->protocol_major != fw_major) {
+		XDNA_ERR(xdna, "Incompatible firmware protocol major %d minor %d",
+			 fw_major, fw_minor);
+		return -EINVAL;
+	}
+
+	/*
+	 * A greater protocol minor version means new messages/status/enums
+	 * were added to the firmware interface protocol.
+	 */
+	if (ndev->priv->protocol_minor > fw_minor) {
+		XDNA_ERR(xdna, "Firmware minor version smaller than supported");
+		return -EINVAL;
+	}
+	return 0;
+}
+
 static inline void aie2_dump_chann_info_debug(struct amdxdna_dev_hdl *ndev)
 {
 	struct amdxdna_dev *xdna = ndev->xdna;
@@ -54,6 +100,11 @@ static inline void aie2_dump_chann_info_debug(struct amdxdna_dev_hdl *ndev)
 	XDNA_DBG(xdna, "x2i ringbuf 0x%x", ndev->mgmt_x2i.rb_start_addr);
 	XDNA_DBG(xdna, "x2i rsize   0x%x", ndev->mgmt_x2i.rb_size);
 	XDNA_DBG(xdna, "x2i chann index 0x%x", ndev->mgmt_chan_idx);
+	if (!ndev->mgmt_prot_major)
+		return;
+
+	XDNA_DBG(xdna, "mailbox protocol major 0x%x", ndev->mgmt_prot_major);
+	XDNA_DBG(xdna, "mailbox protocol minor 0x%x", ndev->mgmt_prot_minor);
 }
 
 static int aie2_get_mgmt_chann_info(struct amdxdna_dev_hdl *ndev)
@@ -96,14 +147,25 @@ static int aie2_get_mgmt_chann_info(struct amdxdna_dev_hdl *ndev)
 	x2i->mb_tail_ptr_reg = AIE2_MBOX_OFF(ndev, info_regs.x2i_tail);
 	x2i->rb_start_addr   = AIE2_SRAM_OFF(ndev, info_regs.x2i_buf);
 	x2i->rb_size         = info_regs.x2i_buf_sz;
-	ndev->mgmt_chan_idx  = CHANN_INDEX(ndev, x2i->rb_start_addr);
 
+	if (info_regs.magic != MGMT_MBOX_MAGIC) {
+		ndev->mgmt_chan_idx = CHANN_INDEX(ndev, x2i->rb_start_addr);
+		goto done;
+	}
+
+	ndev->mgmt_chan_idx  = info_regs.msi_id;
+	ndev->mgmt_prot_major = info_regs.prot_major;
+	ndev->mgmt_prot_minor = info_regs.prot_minor;
+	if (aie2_check_protocol(ndev, ndev->mgmt_prot_major, ndev->mgmt_prot_minor))
+		ret = -EINVAL;
+
+done:
 	aie2_dump_chann_info_debug(ndev);
 
 	/* Must clear address at FW_ALIVE_OFF */
 	writel(0, SRAM_GET_ADDR(ndev, FW_ALIVE_OFF));
 
-	return 0;
+	return ret;
 }
 
 static int aie2_runtime_cfg(struct amdxdna_dev_hdl *ndev)
@@ -165,10 +227,12 @@ static int aie2_mgmt_fw_init(struct amdxdna_dev_hdl *ndev)
 {
 	int ret;
 
-	ret = aie2_check_protocol_version(ndev);
-	if (ret) {
-		XDNA_ERR(ndev->xdna, "Check header hash failed");
-		return ret;
+	if (!ndev->mgmt_prot_major) {
+		ret = aie2_check_protocol_version(ndev);
+		if (ret) {
+			XDNA_ERR(ndev->xdna, "Check protocol version failed");
+			return ret;
+		}
 	}
 
 	ret = aie2_runtime_cfg(ndev);
@@ -336,7 +400,7 @@ static int aie2_hw_start(struct amdxdna_dev *xdna)
 
 	ret = aie2_get_mgmt_chann_info(ndev);
 	if (ret) {
-		XDNA_ERR(xdna, "firmware is not alive");
+		XDNA_ERR(xdna, "firmware mgmt info ret %d", ret);
 		goto stop_psp;
 	}
 
diff --git a/src/driver/amdxdna/aie2_pci.h b/src/driver/amdxdna/aie2_pci.h
index b00e93ea..03e4cfc2 100644
--- a/src/driver/amdxdna/aie2_pci.h
+++ b/src/driver/amdxdna/aie2_pci.h
@@ -214,6 +214,8 @@ struct amdxdna_dev_hdl {
 	struct xdna_mailbox_chann_res	mgmt_x2i;
 	struct xdna_mailbox_chann_res	mgmt_i2x;
 	u32				mgmt_chan_idx;
+	u32				mgmt_prot_major;
+	u32				mgmt_prot_minor;
 
 	u32				total_col;
 	u32				smu_curr_dpm_level;
@@ -279,6 +281,7 @@ struct amdxdna_dev_priv {
 
 /* aie2_pci.c */
 extern const struct amdxdna_dev_ops aie2_ops;
+int aie2_check_protocol(struct amdxdna_dev_hdl *ndev, u32 fw_major, u32 fw_minor);
 
 /* aie2_smu.c */
 void aie2_smu_setup(struct amdxdna_dev_hdl *ndev);

From b09ec43d602978a46c0f26a86e74a4ed4480ac31 Mon Sep 17 00:00:00 2001
From: VENKATA NARENDRA KUMAR GUTTA
 <78980180+vengutta18@users.noreply.github.com>
Date: Mon, 26 Aug 2024 18:07:50 -0700
Subject: [PATCH 09/44] Fix no QoS parameters case (#225)

When no QoS parameters are passed,
set the DPM level to max, since this is
the default DPM level.
Also, no need to cache the dpm levels, since
the solver will always set a valid DPM level.

Signed-off-by: Venkata Narendra Kumar Gutta <vengutta@amd.com>
---
 src/driver/amdxdna/aie2_debugfs.c |  2 +-
 src/driver/amdxdna/aie2_pci.c     |  3 ++-
 src/driver/amdxdna/aie2_pci.h     |  2 +-
 src/driver/amdxdna/aie2_smu.c     | 10 ++++++----
 src/driver/amdxdna/aie2_solver.c  | 13 +++++++++++--
 src/driver/amdxdna/aie2_solver.h  |  1 +
 6 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/driver/amdxdna/aie2_debugfs.c b/src/driver/amdxdna/aie2_debugfs.c
index ea5105f7..1272efbe 100644
--- a/src/driver/amdxdna/aie2_debugfs.c
+++ b/src/driver/amdxdna/aie2_debugfs.c
@@ -291,7 +291,7 @@ static ssize_t aie2_dpm_level_set(struct file *file, const char __user *ptr,
 		return ret;
 	}
 
-	ret = aie2_smu_set_dpm_level(ndev, val, true);
+	ret = aie2_smu_set_dpm_level(ndev, val);
 	if (ret) {
 		XDNA_ERR(ndev->xdna, "Setting dpm_level:%d failed, ret: %d", val, ret);
 		return ret;
diff --git a/src/driver/amdxdna/aie2_pci.c b/src/driver/amdxdna/aie2_pci.c
index bf221560..9454b707 100644
--- a/src/driver/amdxdna/aie2_pci.c
+++ b/src/driver/amdxdna/aie2_pci.c
@@ -306,7 +306,7 @@ static int aie2_set_dpm_level(void *cb_arg, u32 dpm_level)
 
 	xdna = hwctx->client->xdna;
 
-	ret = aie2_smu_set_dpm_level(xdna->dev_handle, dpm_level, true);
+	ret = aie2_smu_set_dpm_level(xdna->dev_handle, dpm_level);
 	if (ret)
 		XDNA_ERR(xdna, "set dpm level failed, ret %d", ret);
 
@@ -589,6 +589,7 @@ static int aie2_init(struct amdxdna_dev *xdna)
 	}
 	ndev->total_col = min(aie2_max_col, ndev->metadata.cols);
 
+	xrs_cfg.max_dpm_level = SMU_DPM_MAX(ndev);
 	xrs_cfg.clk_list.num_levels = ndev->priv->smu_npu_dpm_levels;
 	xrs_cfg.clk_list.cu_clk_list = ndev->priv->smu_npu_dpm_clk_table;
 	xrs_cfg.sys_eff_factor = 1;
diff --git a/src/driver/amdxdna/aie2_pci.h b/src/driver/amdxdna/aie2_pci.h
index 03e4cfc2..6ee80555 100644
--- a/src/driver/amdxdna/aie2_pci.h
+++ b/src/driver/amdxdna/aie2_pci.h
@@ -296,7 +296,7 @@ int aie2_smu_set_power_on(struct amdxdna_dev_hdl *ndev);
 int aie2_smu_set_power_off(struct amdxdna_dev_hdl *ndev);
 int aie2_smu_get_power_state(struct amdxdna_dev_hdl *ndev);
 int aie2_smu_get_dpm_level(struct amdxdna_dev_hdl *ndev);
-int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level, bool cache);
+int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level);
 void aie2_smu_prepare_s0i3(struct amdxdna_dev_hdl *ndev);
 
 /* aie2_psp.c */
diff --git a/src/driver/amdxdna/aie2_smu.c b/src/driver/amdxdna/aie2_smu.c
index 3675b4e5..c17c672e 100644
--- a/src/driver/amdxdna/aie2_smu.c
+++ b/src/driver/amdxdna/aie2_smu.c
@@ -165,7 +165,7 @@ int aie2_smu_get_dpm_level(struct amdxdna_dev_hdl *ndev)
 	return ndev->smu.curr_dpm_level;
 }
 
-int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level, bool cache)
+int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level)
 {
 	int ret;
 
@@ -177,8 +177,10 @@ int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level, bool cac
 	else
 		ret = aie2_smu_set_dpm_level_v1(ndev, dpm_level);
 
-	if (!ret & cache)
+	if (!ret) {
 		ndev->smu.curr_dpm_level = dpm_level;
+		XDNA_DBG(ndev->xdna, "The dpm level is set to %d", dpm_level);
+	}
 
 	return ret;
 }
@@ -243,7 +245,7 @@ int aie2_smu_start(struct amdxdna_dev_hdl *ndev)
 	XDNA_INFO_ONCE(ndev->xdna, "Set %s = %d mhz", smu->h_clock.name, freq_mhz);
 
 	if (SMU_DPM_MAX(ndev) > 0) {
-		ret = aie2_smu_set_dpm_level(ndev, smu->curr_dpm_level, true);
+		ret = aie2_smu_set_dpm_level(ndev, smu->curr_dpm_level);
 		if (ret) {
 			XDNA_ERR(ndev->xdna, "Set dpm level failed, ret %d", ret);
 			return ret;
@@ -269,7 +271,7 @@ void aie2_smu_prepare_s0i3(struct amdxdna_dev_hdl *ndev)
 		XDNA_ERR(ndev->xdna, "Set hclk freq %d mhz failed, ret %d", freq_mhz, ret);
 
 	if (SMU_DPM_MAX(ndev) > 0) {
-		ret = aie2_smu_set_dpm_level(ndev, 0, false);
+		ret = aie2_smu_set_dpm_level(ndev, 0);
 		if (ret)
 			XDNA_ERR(ndev->xdna, "Set dpm level 0 failed, ret %d", ret);
 	}
diff --git a/src/driver/amdxdna/aie2_solver.c b/src/driver/amdxdna/aie2_solver.c
index 289f2ec7..5682c6cc 100644
--- a/src/driver/amdxdna/aie2_solver.c
+++ b/src/driver/amdxdna/aie2_solver.c
@@ -95,6 +95,14 @@ static int sanity_check(struct solver_state *xrs, struct alloc_requests *req)
 	return 0;
 }
 
+static bool is_valid_qos_dpm_params(struct aie_qos *rqos)
+{
+	if (rqos->gops > 0 || rqos->fps > 0 ||  rqos->latency > 0)
+		return true;
+
+	return false;
+}
+
 static u32 find_dpm_level(struct solver_state *xrs, struct alloc_requests *req)
 {
 	struct cdo_parts *cdop = &req->cdo;
@@ -103,8 +111,9 @@ static u32 find_dpm_level(struct solver_state *xrs, struct alloc_requests *req)
 	struct solver_node *node;
 	u32 cu_clk_freq, dpm_level;
 
-	if (cdop->ncols > xrs->cfg.total_col)
-		return -EINVAL;
+	/* If no QoS parameters are passed, set it to the max DPM level */
+	if (!is_valid_qos_dpm_params(rqos))
+		return xrs->cfg.max_dpm_level;
 
         /*
          * We can find at least one CDOs groups that meet the
diff --git a/src/driver/amdxdna/aie2_solver.h b/src/driver/amdxdna/aie2_solver.h
index 19fd4b87..98b16380 100644
--- a/src/driver/amdxdna/aie2_solver.h
+++ b/src/driver/amdxdna/aie2_solver.h
@@ -91,6 +91,7 @@ struct init_config {
 	u32			total_col;
 	u32			sys_eff_factor; /* system efficiency factor */
 	u32			latency_adj;    /* latency adjustment in ms */
+	u32			max_dpm_level;	/* Max dpm level in the system */
 	struct clk_list_info	clk_list;       /* List of frequencies available in system */
 	struct device		*dev;
 	struct xrs_action_ops	*actions;

From f4bed65b7d9361e0cf8b17cea528fa614719af98 Mon Sep 17 00:00:00 2001
From: VENKATA NARENDRA KUMAR GUTTA
 <78980180+vengutta18@users.noreply.github.com>
Date: Tue, 27 Aug 2024 13:13:21 -0700
Subject: [PATCH 10/44] Update valid qos dpm params check (#228)

Signed-off-by: Venkata Narendra Kumar Gutta <vengutta@amd.com>
---
 src/driver/amdxdna/aie2_solver.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/driver/amdxdna/aie2_solver.c b/src/driver/amdxdna/aie2_solver.c
index 5682c6cc..38de5a9d 100644
--- a/src/driver/amdxdna/aie2_solver.c
+++ b/src/driver/amdxdna/aie2_solver.c
@@ -97,8 +97,13 @@ static int sanity_check(struct solver_state *xrs, struct alloc_requests *req)
 
 static bool is_valid_qos_dpm_params(struct aie_qos *rqos)
 {
-	if (rqos->gops > 0 || rqos->fps > 0 ||  rqos->latency > 0)
+	/*
+	 * gops is retrieved from the xmodel, so it's always set
+	 * fps and latency are the configurable params from the application
+	 */
+	if (rqos->gops > 0 && (rqos->fps > 0 ||  rqos->latency > 0)) {
 		return true;
+	}
 
 	return false;
 }

From 3404736c604b33139670834841f743d1a9efd502 Mon Sep 17 00:00:00 2001
From: Lizhi Hou <36547078+houlz0507@users.noreply.github.com>
Date: Tue, 27 Aug 2024 13:59:32 -0700
Subject: [PATCH 11/44] fix insert pages for imported bo (#229)

---
 src/driver/amdxdna/amdxdna_gem.c | 58 ++++++++++++++------------------
 1 file changed, 26 insertions(+), 32 deletions(-)

diff --git a/src/driver/amdxdna/amdxdna_gem.c b/src/driver/amdxdna/amdxdna_gem.c
index 76f14d90..fea3aa06 100644
--- a/src/driver/amdxdna/amdxdna_gem.c
+++ b/src/driver/amdxdna/amdxdna_gem.c
@@ -187,51 +187,53 @@ static int amdxdna_hmm_register(struct amdxdna_gem_obj *abo,
 	return 0;
 }
 
-static int is_mapped_fn(pte_t *pte, unsigned long addr, void *data)
-{
-	if (pte_none(ptep_get(pte)))
-		return -EINVAL;
-
-	*(bool *)data = true;
-	return 0;
-}
-
 static int amdxdna_insert_pages(struct amdxdna_gem_obj *abo,
 				struct vm_area_struct *vma)
 {
-	unsigned long num_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
+	unsigned long num_pages = vma_pages(vma);
 	struct sg_dma_page_iter sg_iter;
-	bool has_mapped_page = false;
 	unsigned long offset = 0;
 	int ret;
 
 	if (!is_import_bo(abo)) {
+		ret = drm_gem_shmem_mmap(&abo->base, vma);
+		if (ret) {
+			XDNA_ERR(xdna, "Failed shmem mmap %d", ret);
+			return ret;
+		}
+
 		/* The buffer is based on memory pages. Fix the flag. */
 		vm_flags_mod(vma, VM_MIXEDMAP, VM_PFNMAP);
-		return vm_insert_pages(vma, vma->vm_start, abo->base.pages,
-				       &num_pages);
-	}
+		ret = vm_insert_pages(vma, vma->vm_start, abo->base.pages,
+				      &num_pages);
+		if (ret) {
+			XDNA_ERR(xdna, "Failed insert pages %d", ret);
+			vma->vm_ops->close(vma);
+			return ret;
+		}
 
-	ret = apply_to_page_range(vma->vm_mm, vma->vm_start, num_pages,
-				  is_mapped_fn, &has_mapped_page);
-	if (!ret)
 		return 0;
+	}
 
-	if (has_mapped_page)
-		return -EBUSY;
 
 	for_each_sgtable_dma_page(abo->base.sgt, &sg_iter, 0) {
 		dma_addr_t addr = sg_page_iter_dma_address(&sg_iter);
 		unsigned long pfn;
 
-		pfn = PFN_DOWN(dma_to_phys(to_gobj(abo)->dev->dev, addr));
-		ret = io_remap_pfn_range(vma, vma->vm_start + offset, pfn,
-					 PAGE_SIZE, vma->vm_page_prot);
-		if (ret)
+		pfn = __phys_to_pfn(dma_to_phys(to_gobj(abo)->dev->dev, addr));
+		ret = vm_insert_page(vma, vma->vm_start + offset, pfn_to_page(pfn));
+		if (ret) {
+			XDNA_ERR(xdna, "Failed insert page %dff", ret);
 			break;
+		}
 
+		if (!(--num_pages))
+			break;
 		offset += PAGE_SIZE;
 	}
+	vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
+	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 
 	return ret;
 }
@@ -250,16 +252,10 @@ static int amdxdna_gem_obj_mmap(struct drm_gem_object *gobj,
 	if (ret)
 		return ret;
 
-	ret = drm_gem_shmem_mmap(&abo->base, vma);
-	if (ret) {
-		XDNA_ERR(xdna, "failed shmem mmap %d", ret);
-		goto hmm_unreg;
-	}
-
 	ret = amdxdna_insert_pages(abo, vma);
 	if (ret) {
 		XDNA_ERR(xdna, "Failed insert pages, ret %d", ret);
-		goto close_vma;
+		goto hmm_unreg;
 	}
 
 	XDNA_DBG(xdna, "BO map_offset 0x%llx type %d userptr 0x%llx size 0x%lx",
@@ -267,8 +263,6 @@ static int amdxdna_gem_obj_mmap(struct drm_gem_object *gobj,
 		 abo->mem.userptr, gobj->size);
 	return 0;
 
-close_vma:
-	vma->vm_ops->close(vma);
 hmm_unreg:
 	amdxdna_hmm_unregister(abo);
 	return ret;

From d1a2c8d40db49f996fd6852c2c202d6289f09a2f Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Tue, 27 Aug 2024 14:56:26 -0700
Subject: [PATCH 12/44] add exec buf throughput test (#230)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 test/shim_test/io_test.cpp   | 7 +++++++
 test/shim_test/shim_test.cpp | 4 ++++
 2 files changed, 11 insertions(+)

diff --git a/test/shim_test/io_test.cpp b/test/shim_test/io_test.cpp
index b7517174..61b5c5b6 100644
--- a/test/shim_test/io_test.cpp
+++ b/test/shim_test/io_test.cpp
@@ -258,6 +258,13 @@ TEST_io_runlist_latency(device::id_type id, std::shared_ptr<device> sdev, arg_ty
   io_test(id, sdev.get(),  1333, 1, 24);
 }
 
+void
+TEST_io_e_throughput(device::id_type id, std::shared_ptr<device> sdev, arg_type& arg)
+{
+  io_test_parameter_init(IO_TEST_THRUPUT_PERF, static_cast<unsigned int>(arg[0]));
+  io_test(id, sdev.get(), 32000, 8, 1);
+}
+
 void
 TEST_io_throughput(device::id_type id, std::shared_ptr<device> sdev, arg_type& arg)
 {
diff --git a/test/shim_test/shim_test.cpp b/test/shim_test/shim_test.cpp
index 28444b14..fa4887fd 100644
--- a/test/shim_test/shim_test.cpp
+++ b/test/shim_test/shim_test.cpp
@@ -30,6 +30,7 @@ void TEST_export_import_bo(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_io(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_io_latency(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_io_runlist_latency(device::id_type, std::shared_ptr<device>, arg_type&);
+void TEST_io_e_throughput(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_io_throughput(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_noop_io_with_dup_bo(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_shim_umq_vadd(device::id_type, std::shared_ptr<device>, arg_type&);
@@ -567,6 +568,9 @@ std::vector<test_case> test_list {
   test_case{ "io test no-op kernel latency listed command",
     TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_latency, { IO_TEST_NOOP_RUN }
   },
+  test_case{ "measure no-op kernel throuput",
+    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_e_throughput, { IO_TEST_NOOP_RUN }
+  },
 };
 
 } // namespace

From 4bb5966256971031795e20783857d36c18bc10e9 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Tue, 27 Aug 2024 15:40:01 -0700
Subject: [PATCH 13/44] iohub register race condition fix (#227)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/amdxdna/amdxdna_mailbox.c | 105 ++++++++++++++++-----------
 1 file changed, 61 insertions(+), 44 deletions(-)

diff --git a/src/driver/amdxdna/amdxdna_mailbox.c b/src/driver/amdxdna/amdxdna_mailbox.c
index 2f58fd3e..a27cbd17 100644
--- a/src/driver/amdxdna/amdxdna_mailbox.c
+++ b/src/driver/amdxdna/amdxdna_mailbox.c
@@ -82,10 +82,20 @@ struct mailbox {
 
 };
 
+#if defined(CONFIG_DEBUG_FS)
+struct mailbox_res_record {
+	struct list_head		re_entry;
+	struct xdna_mailbox_chann_res	re_x2i;
+	struct xdna_mailbox_chann_res	re_i2x;
+	int				re_irq;
+};
+#endif /* CONFIG_DEBUG_FS */
+
 struct mailbox_channel {
 	struct mailbox			*mb;
 #if defined(CONFIG_DEBUG_FS)
 	struct list_head		chann_entry;
+	struct mailbox_res_record	*record;
 #endif
 	struct xdna_mailbox_chann_res	res[CHAN_RES_NUM];
 	int				msix_irq;
@@ -134,15 +144,6 @@ struct mailbox_msg {
 	struct mailbox_pkg	pkg;
 };
 
-#if defined(CONFIG_DEBUG_FS)
-struct mailbox_res_record {
-	struct list_head		re_entry;
-	struct xdna_mailbox_chann_res	re_x2i;
-	struct xdna_mailbox_chann_res	re_i2x;
-	int				re_irq;
-};
-#endif /* CONFIG_DEBUG_FS */
-
 static void mailbox_reg_write(struct mailbox_channel *mb_chann, u32 mbox_reg, u32 data)
 {
 	struct xdna_mailbox_res *mb_res = &mb_chann->mb->res;
@@ -390,7 +391,8 @@ static inline int mailbox_get_msg(struct mailbox_channel *mb_chann)
 			return -EINVAL;
 		}
 		mailbox_set_headptr(mb_chann, 0);
-		return 0;
+		ret = 0;
+		goto done;
 	}
 
 	if (unlikely(!header.total_size || !IS_ALIGNED(header.total_size, 4))) {
@@ -417,19 +419,62 @@ static inline int mailbox_get_msg(struct mailbox_channel *mb_chann)
 	trace_mbox_set_head(MAILBOX_NAME, mb_chann->msix_irq,
 			    header.opcode, header.id);
 
+done:
 	return ret;
 }
 
+static void mailbox_rx_worker(struct work_struct *rx_work)
+{
+	struct mailbox_channel *mb_chann;
+	int ret;
+
+	mb_chann = container_of(rx_work, struct mailbox_channel, rx_work);
+
+	if (READ_ONCE(mb_chann->bad_state)) {
+		MB_ERR(mb_chann, "Channel in bad state, work aborted");
+		return;
+	}
+
+	while (1) {
+		/*
+		 * If return is 0, keep consuming next message, until there is
+		 * no messages or an error happened.
+		 */
+		ret = mailbox_get_msg(mb_chann);
+		if (ret == -ENOENT)
+			break;
+
+		/* Other error means device doesn't look good, disable irq. */
+		if (unlikely(ret)) {
+			MB_ERR(mb_chann, "Unexpected ret %d, disable irq", ret);
+			WRITE_ONCE(mb_chann->bad_state, true);
+			disable_irq(mb_chann->msix_irq);
+			break;
+		}
+	}
+}
+
 static irqreturn_t mailbox_irq_handler(int irq, void *p)
 {
 	struct mailbox_channel *mb_chann = p;
+	u32 iohub;
+	int i;
 
 	trace_mbox_irq_handle(MAILBOX_NAME, irq);
-	/* Schedule a rx_work to call the callback functions */
-	queue_work(mb_chann->work_q, &mb_chann->rx_work);
 	/* Clear IOHUB register */
 	mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0);
+	/* Schedule a rx_work to call the callback functions */
+	queue_work(mb_chann->work_q, &mb_chann->rx_work);
+	for (i = 0; i < 4; i++) {
+		iohub = mailbox_reg_read(mb_chann, mb_chann->iohub_int_addr);
+		if (iohub)
+			goto race;
+	}
 
+	return IRQ_HANDLED;
+race:
+	mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0);
+	queue_work(mb_chann->work_q, &mb_chann->rx_work);
 	return IRQ_HANDLED;
 }
 
@@ -560,37 +605,6 @@ static int mailbox_polld(void *data)
 }
 #endif
 
-static void mailbox_rx_worker(struct work_struct *rx_work)
-{
-	struct mailbox_channel *mb_chann;
-	int ret;
-
-	mb_chann = container_of(rx_work, struct mailbox_channel, rx_work);
-
-	if (READ_ONCE(mb_chann->bad_state)) {
-		MB_ERR(mb_chann, "Channel in bad state, work aborted");
-		return;
-	}
-
-	while (1) {
-		/*
-		 * If return is 0, keep consuming next message, until there is
-		 * no messages or an error happened.
-		 */
-		ret = mailbox_get_msg(mb_chann);
-		if (ret == -ENOENT)
-			break;
-
-		/* Other error means device doesn't look good, disable irq. */
-		if (unlikely(ret)) {
-			MB_ERR(mb_chann, "Unexpected ret %d, disable irq", ret);
-			WRITE_ONCE(mb_chann->bad_state, true);
-			disable_irq(mb_chann->msix_irq);
-			break;
-		}
-	}
-}
-
 int xdna_mailbox_send_msg(struct mailbox_channel *mb_chann,
 			  const struct xdna_mailbox_msg *msg, u64 tx_timeout)
 {
@@ -810,7 +824,7 @@ xdna_mailbox_create_channel(struct mailbox *mb,
 #endif
 
 	INIT_WORK(&mb_chann->rx_work, mailbox_rx_worker);
-	mb_chann->work_q = create_singlethread_workqueue(MAILBOX_NAME);
+	mb_chann->work_q = alloc_ordered_workqueue(MAILBOX_NAME, 0);
 	if (!mb_chann->work_q) {
 		MB_ERR(mb_chann, "Create workqueue failed");
 		goto free_and_out;
@@ -842,6 +856,9 @@ xdna_mailbox_create_channel(struct mailbox *mb,
 	list_add(&mb_chann->chann_entry, &mb->chann_list);
 	mutex_unlock(&mb->mbox_lock);
 
+#if defined(CONFIG_DEBUG_FS)
+	mb_chann->record = record;
+#endif
 	MB_DBG(mb_chann, "Mailbox channel created (irq: %d)", mb_chann->msix_irq);
 	return mb_chann;
 

From da67fd1df19394d9238e1e055416bffb39e9a7df Mon Sep 17 00:00:00 2001
From: Lizhi Hou <36547078+houlz0507@users.noreply.github.com>
Date: Wed, 28 Aug 2024 13:44:54 -0700
Subject: [PATCH 14/44] fault in import bo pages instead of insert pages
 directly (#231)

---
 src/driver/amdxdna/amdxdna_gem.c | 35 ++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/src/driver/amdxdna/amdxdna_gem.c b/src/driver/amdxdna/amdxdna_gem.c
index fea3aa06..d96ac27c 100644
--- a/src/driver/amdxdna/amdxdna_gem.c
+++ b/src/driver/amdxdna/amdxdna_gem.c
@@ -192,7 +192,6 @@ static int amdxdna_insert_pages(struct amdxdna_gem_obj *abo,
 {
 	struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev);
 	unsigned long num_pages = vma_pages(vma);
-	struct sg_dma_page_iter sg_iter;
 	unsigned long offset = 0;
 	int ret;
 
@@ -216,26 +215,32 @@ static int amdxdna_insert_pages(struct amdxdna_gem_obj *abo,
 		return 0;
 	}
 
+	vma->vm_private_data = NULL;
+	vma->vm_ops = NULL;
+	ret = dma_buf_mmap(to_gobj(abo)->dma_buf, vma, 0);
+	if (ret) {
+		XDNA_ERR(xdna, "Failed to mmap dma buf %d", ret);
+		return ret;
+	}
 
-	for_each_sgtable_dma_page(abo->base.sgt, &sg_iter, 0) {
-		dma_addr_t addr = sg_page_iter_dma_address(&sg_iter);
-		unsigned long pfn;
+	do {
+		vm_fault_t fault_ret;
 
-		pfn = __phys_to_pfn(dma_to_phys(to_gobj(abo)->dev->dev, addr));
-		ret = vm_insert_page(vma, vma->vm_start + offset, pfn_to_page(pfn));
-		if (ret) {
-			XDNA_ERR(xdna, "Failed insert page %dff", ret);
-			break;
+		fault_ret = handle_mm_fault(vma, vma->vm_start+offset,
+					    FAULT_FLAG_WRITE, NULL);
+		if (fault_ret & VM_FAULT_ERROR) {
+			vma->vm_ops->close(vma);
+			XDNA_ERR(xdna, "Fault in page failed"); 
+			return -EFAULT;
 		}
 
-		if (!(--num_pages))
-			break;
 		offset += PAGE_SIZE;
-	}
-	vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);
-	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+	} while (--num_pages);
 
-	return ret;
+	/* Drop the reference drm_gem_mmap_obj() acquired.*/
+	drm_gem_object_put(to_gobj(abo));
+
+	return 0;
 }
 
 static int amdxdna_gem_obj_mmap(struct drm_gem_object *gobj,

From d2a430ed288b3f3bfcc47aa2d0707a741958a69d Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Wed, 28 Aug 2024 15:18:07 -0700
Subject: [PATCH 15/44] fix noop kernel was using wrong size issue (#232)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 test/shim_test/io_test.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/shim_test/io_test.cpp b/test/shim_test/io_test.cpp
index 61b5c5b6..8ff5e6d7 100644
--- a/test/shim_test/io_test.cpp
+++ b/test/shim_test/io_test.cpp
@@ -30,7 +30,7 @@ io_test_bo_set
 alloc_and_init_bo_set(device* dev, const std::string& local_data_path)
 {
   io_test_bo_set boset{dev, local_data_path};
-  auto bos = boset.get_bos();
+  auto& bos = boset.get_bos();
 
   if (io_test_parameters.type == IO_TEST_NOOP_RUN) {
     // Preparing no-op kernel's special control code

From 822ca7e6edc8e8a9bf36ecab8484d63cfe94fb28 Mon Sep 17 00:00:00 2001
From: amd-akshatah <159081599+amd-akshatah@users.noreply.github.com>
Date: Wed, 28 Aug 2024 16:18:15 -0700
Subject: [PATCH 16/44] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 231ecb7e..16089869 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ This repository is for the AMD XDNA™️ Driver (amdxdna.ko) for Linux®️ and
 - [System Requirements](#system-requirements)
 - [Linux compilation and installation](#linux-compilation-and-installation)
 - [Clone](#clone)
+  
 - [Build](#build)
 - [Test](#test)
 - [Q&A](#qa)

From e6038ab85b5517ad584b1ac218f8bc387be7f1e7 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Thu, 29 Aug 2024 15:15:12 -0700
Subject: [PATCH 17/44] user control autosuspend delay and disable autosuspend
 by default for now (#235)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/amdxdna/amdxdna_pci_drv.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/driver/amdxdna/amdxdna_pci_drv.c b/src/driver/amdxdna/amdxdna_pci_drv.c
index 9f6f7fed..fab4c295 100644
--- a/src/driver/amdxdna/amdxdna_pci_drv.c
+++ b/src/driver/amdxdna/amdxdna_pci_drv.c
@@ -13,7 +13,9 @@
 #include "amdxdna_pci_drv.h"
 #include "amdxdna_sysfs.h"
 
-#define AMDXDNA_AUTOSUSPEND_DELAY	5000 /* miliseconds */
+int autosuspend_ms = -1;
+module_param(autosuspend_ms, int, 0644);
+MODULE_PARM_DESC(autosuspend_ms, "runtime suspend delay in miliseconds. < 0: prevent it");
 
 /*
  *  There are platforms which share the same PCI device ID
@@ -100,7 +102,7 @@ static int amdxdna_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto failed_dev_fini;
 	}
 
-	pm_runtime_set_autosuspend_delay(dev, AMDXDNA_AUTOSUSPEND_DELAY);
+	pm_runtime_set_autosuspend_delay(dev, autosuspend_ms);
 	pm_runtime_use_autosuspend(dev);
 	pm_runtime_allow(dev);
 

From 1c15f15aa594090bd81e30684290c7e9452d9485 Mon Sep 17 00:00:00 2001
From: Max Zhen <40219623+maxzhen@users.noreply.github.com>
Date: Thu, 29 Aug 2024 17:02:25 -0700
Subject: [PATCH 18/44] firmware repo location has moved (#236)

Signed-off-by: Max Zhen <max.zhen@amd.com>
---
 tools/info.json | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/info.json b/tools/info.json
index 2fb248d8..821dceff 100644
--- a/tools/info.json
+++ b/tools/info.json
@@ -7,7 +7,7 @@
 	"firmwares": [
 		{
 			"device": "npu1",
-			"url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/1502_00/npu.sbin.1.4.2.313",
+			"url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/1502_00/npu.sbin.1.4.2.313",
 			"pci_device_id": "1502",
 			"pci_revision_id": "00",
 			"version": "1.4.2.313",
@@ -15,7 +15,7 @@
 		},
 		{
 			"device": "npu2",
-			"url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/17f0_00/npu.sbin.0.7.22.185",
+			"url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/17f0_00/npu.sbin.0.7.22.185",
 			"pci_device_id": "17f0",
 			"pci_revision_id": "00",
 			"version": "0.7.22.185",
@@ -23,7 +23,7 @@
 		},
 		{
 			"device": "npu4",
-			"url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/17f0_10/npu.sbin.0.7.30.20",
+			"url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/17f0_10/npu.sbin.0.7.30.20",
 			"pci_device_id": "17f0",
 			"pci_revision_id": "10",
 			"version": "0.7.30.20",
@@ -31,7 +31,7 @@
 		},
 		{
 			"device": "npu5",
-			"url": "https://gitlab.freedesktop.org/drm/firmware/-/raw/amd-ipu-staging/amdnpu/17f0_11/npu.sbin.0.7.30.101",
+			"url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/17f0_11/npu.sbin.0.7.30.101",
 			"pci_device_id": "17f0",
 			"pci_revision_id": "11",
 			"version": "0.7.30.101",

From a484a2940bfbd0ad6693917fea1eafd5dcd387fa Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Fri, 30 Aug 2024 16:39:20 -0700
Subject: [PATCH 19/44] timeout_in_sec should be uint (#237)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/amdxdna/amdxdna_tdr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/driver/amdxdna/amdxdna_tdr.c b/src/driver/amdxdna/amdxdna_tdr.c
index f5640b3c..716fe198 100644
--- a/src/driver/amdxdna/amdxdna_tdr.c
+++ b/src/driver/amdxdna/amdxdna_tdr.c
@@ -6,8 +6,8 @@
 #include "amdxdna_drm.h"
 #include "amdxdna_tdr.h"
 
-int timeout_in_sec = 2;
-module_param(timeout_in_sec, int, 0644);
+uint timeout_in_sec = 2;
+module_param(timeout_in_sec, uint, 0644);
 MODULE_PARM_DESC(timeout_in_sec, "Seconds to timeout and recovery, default 2; 0 - No TDR");
 
 #define TDR_TIMEOUT_JIFF msecs_to_jiffies(timeout_in_sec * 1000)

From b209ee9e9e20ac5edfc675ac6e49a1bd642661f0 Mon Sep 17 00:00:00 2001
From: Max Zhen <40219623+maxzhen@users.noreply.github.com>
Date: Tue, 3 Sep 2024 12:57:05 -0700
Subject: [PATCH 20/44] Add performance scripts (#238)

Signed-off-by: Max Zhen <max.zhen@amd.com>
---
 src/driver/CMakeLists.txt            |   2 +
 src/driver/amdxdna/aie2_ctx.c        |   6 +-
 src/driver/amdxdna/amdxdna_mailbox.c |   1 +
 src/driver/amdxdna/amdxdna_trace.h   |   5 +
 src/driver/tools/npu_perf_analyze.sh | 134 +++++++++++++++++++++++++++
 src/driver/tools/npu_perf_trace.sh   | 124 +++++++++++++++++++++++++
 6 files changed, 269 insertions(+), 3 deletions(-)
 create mode 100755 src/driver/tools/npu_perf_analyze.sh
 create mode 100755 src/driver/tools/npu_perf_trace.sh

diff --git a/src/driver/CMakeLists.txt b/src/driver/CMakeLists.txt
index 1687d274..ad33fb76 100644
--- a/src/driver/CMakeLists.txt
+++ b/src/driver/CMakeLists.txt
@@ -92,6 +92,8 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/dkms.conf
 
 set(amdxdna_drv_tools
   ${CMAKE_CURRENT_SOURCE_DIR}/tools/dkms_driver.sh
+  ${CMAKE_CURRENT_SOURCE_DIR}/tools/npu_perf_trace.sh
+  ${CMAKE_CURRENT_SOURCE_DIR}/tools/npu_perf_analyze.sh
   )
 install(FILES ${amdxdna_drv_tools}
   PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE
diff --git a/src/driver/amdxdna/aie2_ctx.c b/src/driver/amdxdna/aie2_ctx.c
index 568ccb18..5bbc0514 100644
--- a/src/driver/amdxdna/aie2_ctx.c
+++ b/src/driver/amdxdna/aie2_ctx.c
@@ -230,7 +230,7 @@ aie2_sched_notify(struct amdxdna_sched_job *job)
 	struct dma_fence *fence = job->fence;
 
 	job->hwctx->completed++;
-	trace_xdna_job(&job->base, job->hwctx->name, "signale fence", job->seq);
+	trace_xdna_job(&job->base, job->hwctx->name, "signaling fence", job->seq);
 	dma_fence_signal(fence);
 	dma_fence_put(fence);
 	mmput(job->mm);
@@ -257,7 +257,7 @@ aie2_sched_resp_handler(void *handle, const u32 *data, size_t size)
 	}
 
 	status = *data;
-	XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status);
+	XDNA_DBG(job->hwctx->client->xdna, "Response status 0x%x", status);
 	if (status == AIE2_STATUS_SUCCESS)
 		amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED);
 	else
@@ -284,7 +284,7 @@ aie2_sched_nocmd_resp_handler(void *handle, const u32 *data, size_t size)
 	}
 
 	status = *data;
-	XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status);
+	XDNA_DBG(job->hwctx->client->xdna, "Response status 0x%x", status);
 
 out:
 	aie2_sched_notify(job);
diff --git a/src/driver/amdxdna/amdxdna_mailbox.c b/src/driver/amdxdna/amdxdna_mailbox.c
index a27cbd17..8b152792 100644
--- a/src/driver/amdxdna/amdxdna_mailbox.c
+++ b/src/driver/amdxdna/amdxdna_mailbox.c
@@ -429,6 +429,7 @@ static void mailbox_rx_worker(struct work_struct *rx_work)
 	int ret;
 
 	mb_chann = container_of(rx_work, struct mailbox_channel, rx_work);
+	trace_mbox_rx_worker(MAILBOX_NAME, mb_chann->msix_irq);
 
 	if (READ_ONCE(mb_chann->bad_state)) {
 		MB_ERR(mb_chann, "Channel in bad state, work aborted");
diff --git a/src/driver/amdxdna/amdxdna_trace.h b/src/driver/amdxdna/amdxdna_trace.h
index 6d73c823..4620d2e4 100644
--- a/src/driver/amdxdna/amdxdna_trace.h
+++ b/src/driver/amdxdna/amdxdna_trace.h
@@ -130,6 +130,11 @@ DEFINE_EVENT(xdna_mbox_name_id, mbox_irq_handle,
 	     TP_ARGS(name, irq)
 );
 
+DEFINE_EVENT(xdna_mbox_name_id, mbox_rx_worker,
+	     TP_PROTO(char *name, int irq),
+	     TP_ARGS(name, irq)
+);
+
 DEFINE_EVENT(xdna_mbox_name_id, mbox_poll_handle,
 	     TP_PROTO(char *name, int irq),
 	     TP_ARGS(name, irq)
diff --git a/src/driver/tools/npu_perf_analyze.sh b/src/driver/tools/npu_perf_analyze.sh
new file mode 100755
index 00000000..514d1e10
--- /dev/null
+++ b/src/driver/tools/npu_perf_analyze.sh
@@ -0,0 +1,134 @@
+#!/usr/bin/bash
+
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+perf_out_file="perf.converted.out"
+
+usage()
+{
+	echo "$0 [entry_index_begin:entry_index_end] event1_pattern event2_pattern"
+	echo "Calculate time from event1 to event2 within [entry_index_begin,entry_index_end)"
+	echo "event pattern examples:"
+	echo "    sdt_xrt:ioctl_exit: \(.+\) arg1=DRM_IOCTL_AMDXDNA_WAIT_CMD"
+}
+
+read_timestamps()
+{
+	timestamps=()
+
+	while IFS= read -r line; do
+		if [ "$line" != "" ]; then
+			timestamps+=($(("10#${line}")))
+		fi
+	done <<< `egrep "$1" ${perf_out_file} | awk '{print $4}' | tr -d '.' | tr -d ':'`
+	echo ${timestamps[@]}
+}
+
+range_start=0
+range_end=0
+event1=""
+event2=""
+if [ "$#" -eq 2 ]; then
+	event1=$1
+	event2=$2
+elif [ "$#" -eq 3 ]; then
+	st=$(echo $1 | cut -d':' -f1)
+	end=$(echo $1 | cut -d':' -f2)
+	if [ "${st}" != "" ]; then
+		range_start=$(("10#${st}"))
+	fi
+	if [ "${end}" != "" ]; then
+		range_end=$(("10#${end}"))
+	fi
+	event1=$2
+	event2=$3
+else
+	usage
+	exit 1
+fi
+
+if [ ! -f ${perf_out_file} ]; then
+	echo "${perf_out_file} is not found"
+	exit 1
+else
+	echo "Parsing ${perf_out_file}..."
+fi
+
+event1_ts=($(read_timestamps "${event1}"))
+event1_ts_num=${#event1_ts[@]}
+echo "${event1_ts_num} events for: '${event1}'"
+
+event2_ts=($(read_timestamps "${event2}"))
+event2_ts_num=${#event2_ts[@]}
+echo "${event2_ts_num} events for: '${event2}'"
+
+# Sanity check collected data
+if [ ${event1_ts_num} -eq 0 ]; then
+	echo No events found for ${event1}
+	exit 1
+fi
+if [ ${event2_ts_num} -eq 0 ]; then
+	echo No events found for ${event2}
+	exit 1
+fi
+# Find first event2 entry index which comes after first event1
+event2_index_base=-1
+for (( i=0; i<${event2_ts_num}; i++ )); do
+	if ! [[ ${event2_ts[i]} -lt ${event1_ts[0]} ]]; then
+		event2_index_base=${i}
+		break
+	fi
+done
+if [ ${event2_index_base} -eq -1 ]; then
+	echo No ${event2} is after ${event1}
+	exit 1
+fi
+
+# Calculate time difference between two events
+diffs=()
+for (( i=0; i<${event1_ts_num}; i++ )); do
+	i2=$(( i+${event2_index_base} ))
+	if ! [ ${i2} -lt ${event2_ts_num} ]; then
+		break
+	fi
+	diffs+=( $((event2_ts[i2] - event1_ts[i])) )
+done
+#echo ${diffs[@]}
+
+
+# Data mining within specified range
+
+if [ ${range_end} -eq 0 ]; then
+	range_end=${#diffs[@]}
+fi
+if [ ${range_end} -eq ${range_start} ]; then
+	echo Range start and end are the same
+	exit 1
+elif [ ${range_end} -lt ${range_start} ]; then
+	echo Range start after end
+	exit 1
+fi
+
+total=0
+largest=${diffs[${range_start}]}
+largest_idx=${range_start}
+smallest=${diffs[${range_start}]}
+smallest_idx=${range_start}
+for (( i=${range_start}; i<${range_end}; i++ )); do
+	total=$(( total + diffs[i] ))
+	if [[ ${largest} -lt ${diffs[i]} ]]; then
+		largest=${diffs[i]}
+		largest_idx=${i}
+	fi
+	if [[ ${smallest} -gt ${diffs[i]} ]]; then
+		smallest=${diffs[i]}
+		smallest_idx=${i}
+	fi
+done
+
+# Output result
+total_events=$(( range_end - range_start ))
+echo Average over ${total_events} events: $(( total / total_events ))us
+echo Largest:  ${largest}us@${largest_idx}
+echo Smallest: ${smallest}us@${smallest_idx}
diff --git a/src/driver/tools/npu_perf_trace.sh b/src/driver/tools/npu_perf_trace.sh
new file mode 100755
index 00000000..d9a09717
--- /dev/null
+++ b/src/driver/tools/npu_perf_trace.sh
@@ -0,0 +1,124 @@
+#! /bin/bash --
+
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+
+set -eu
+
+bold=$(tput bold)
+normal=$(tput sgr0)
+red=$(tput setaf 1)
+yellow=$(tput setaf 3)
+blue=$(tput setaf 4)
+
+trace_info()
+{
+	what=$1
+	echo -e "[INFO]: $what"
+}
+
+trace_warn()
+{
+	what=$1
+	echo -e "[${yellow}WARNING${normal}]: $what"
+}
+
+trace_error()
+{
+	what=$1
+	echo -e "[${red}ERROR${normal}]: $what" 1>&2
+	exit 1
+}
+
+add_sdt_xrt()
+{
+	perf list | grep sdt_xrt > /dev/null && sdt_pre_enabled=1
+	if [[ $sdt_pre_enabled == 1 ]]; then
+		remove_sdt_xrt
+		#trace_warn "XRT SDT had beed added. Skip..."
+		#return
+	fi
+
+	# Add XRT SDT events
+	perf buildid-cache --add $xrt_libs
+	# Convert SDT events to trace points
+	perf probe --add=sdt_xrt:* &> /dev/null
+
+	trace_info "XRT SDT is added"
+}
+
+remove_sdt_xrt()
+{
+	#if [[ $sdt_pre_enabled == 1 ]]; then
+	#	trace_warn "XRT SDT was pre added. Skip..."
+	#	return
+	#fi
+
+	# Delete SDT trace points
+	perf probe --del=sdt_xrt:* &> /dev/null
+	# Remove XRT SDT events
+	perf buildid-cache --remove $xrt_libs
+	trace_info "XRT SDT is removed"
+}
+
+## -------- trace flow start --------
+if [ "$EUID" -ne 0 ]; then
+	trace_error "Please run as root"
+fi
+
+# Global variables
+sdt_pre_enabled=0
+xrt_lib_prefix="/opt/xilinx/xrt/lib"
+accel_debugfs="/sys/kernel/debug/accel"
+xrt_libs="${xrt_lib_prefix}/libxrt_coreutil.so,${xrt_lib_prefix}/libxrt_driver_xdna.so"
+perf_record_args="-e amdxdna_trace:* "
+perf_record_args+="-e sdt_xrt:* "
+exec_cmd=""
+
+perf --version > /dev/null
+
+# Argument parsing
+exec_cmd=$@
+if [[ -z "$exec_cmd" ]]; then
+	trace_error "Please put execute application at the end"
+fi
+
+dev=""
+ioctl_sed_expr=""
+for dir in $(ls $accel_debugfs); do
+	accel_fs_name=$(cat ${accel_debugfs}/$dir/name)
+	driver_name=$(echo $accel_fs_name | awk '{print $1}')
+	if [[ ! "$driver_name" =~ "amdxdna" ]]; then
+		continue
+	fi
+
+	if [[ ! -f ${accel_debugfs}/$dir/ioctl_id ]]; then
+		trace_error "${accel_debugfs}/$dir/ioctl_id not exist. amdxdna driver too old?"
+	fi
+
+	dev=$(echo $accel_fs_name | awk -F'[ =]' '{print $3}')
+	ioctl_sed_expr=$(awk -F ':' '{print "s/"$1"/"$2"/g"}' ${accel_debugfs}/$dir/ioctl_id)
+done
+
+if [[ -z "$dev" ]]; then
+	trace_error "No device found"
+fi
+
+trace_info "Found NPU device $dev at ${accel_debugfs}"
+
+add_sdt_xrt
+
+command="perf record $perf_record_args -a $exec_cmd"
+trace_info "$command"
+eval $command
+
+tmp_file=/tmp/perf.out
+# convert timestamp from second to microsecond to avoid floating numbers
+#perf script | awk '{ $4=$4*1000000; print }' > ${tmp_file}
+perf script --reltime > ${tmp_file}
+# replace IOCTL cmd number to name
+sed "$ioctl_sed_expr" "${tmp_file}" > perf.converted.out
+rm -rf ${tmp_file}
+
+remove_sdt_xrt
+## -------- trace flow end --------

From 47804562ec894f954ca6189cc9ad24aad3d4435f Mon Sep 17 00:00:00 2001
From: Max Zhen <40219623+maxzhen@users.noreply.github.com>
Date: Tue, 3 Sep 2024 14:26:19 -0700
Subject: [PATCH 21/44] update npu1 fw (#239)

Signed-off-by: Max Zhen <max.zhen@amd.com>
---
 tools/info.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/info.json b/tools/info.json
index 821dceff..f6f46174 100644
--- a/tools/info.json
+++ b/tools/info.json
@@ -7,10 +7,10 @@
 	"firmwares": [
 		{
 			"device": "npu1",
-			"url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/1502_00/npu.sbin.1.4.2.313",
+			"url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/1502_00/npu.sbin.1.4.2.329",
 			"pci_device_id": "1502",
 			"pci_revision_id": "00",
-			"version": "1.4.2.313",
+			"version": "1.4.2.329",
 			"fw_name": "npu.sbin"
 		},
 		{

From 305e7f0e911b72d2c40a265121da3579ebbd3a36 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Wed, 4 Sep 2024 16:24:44 -0700
Subject: [PATCH 22/44] remove eu, looks like Vitis AI python env doesn't like
 it (#240)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/tools/npu_perf_trace.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/driver/tools/npu_perf_trace.sh b/src/driver/tools/npu_perf_trace.sh
index d9a09717..08bb18da 100755
--- a/src/driver/tools/npu_perf_trace.sh
+++ b/src/driver/tools/npu_perf_trace.sh
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 
-set -eu
+#set -eu
 
 bold=$(tput bold)
 normal=$(tput sgr0)

From 4a17aa5f4a2eabfe38cc47056c2cdfffb38f2fc4 Mon Sep 17 00:00:00 2001
From: Max Zhen <40219623+maxzhen@users.noreply.github.com>
Date: Thu, 5 Sep 2024 12:43:17 -0700
Subject: [PATCH 23/44] CR-1210833 hw_ctx_create_with_priority fail with
 DRM_IOCTL_AMDXDNA_CREATE_BO exception (#242)

Signed-off-by: Max Zhen <max.zhen@amd.com>
---
 src/shim/kmq/device.cpp |  9 ---------
 src/shim/kmq/device.h   |  3 ---
 src/shim/kmq/pcidev.cpp | 27 ++++++++++++++++++++++++++-
 src/shim/kmq/pcidev.h   |  7 +++++++
 src/shim/pcidev.cpp     |  8 ++++++--
 src/shim/pcidev.h       | 13 +++++++++----
 6 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/src/shim/kmq/device.cpp b/src/shim/kmq/device.cpp
index ec703d7c..1479a3c5 100644
--- a/src/shim/kmq/device.cpp
+++ b/src/shim/kmq/device.cpp
@@ -6,21 +6,12 @@
 #include "hwctx.h"
 #include "drm_local/amdxdna_accel.h"
 
-namespace {
-
-// Device memory heap needs to be within one 64MB page. The maximum size is 64MB.
-const size_t dev_mem_size = (64 << 20);
-
-}
-
 namespace shim_xdna {
 
 device_kmq::
 device_kmq(const pdev& pdev, handle_type shim_handle, id_type device_id)
 : device(pdev, shim_handle, device_id)
 {
-  // Alloc and register device memory w/ driver.
-  m_dev_heap_bo = std::make_unique<bo_kmq>(*this, dev_mem_size, AMDXDNA_BO_DEV_HEAP);
   shim_debug("Created KMQ device (%s) ...", get_pdev().m_sysfs_name.c_str());
 }
 
diff --git a/src/shim/kmq/device.h b/src/shim/kmq/device.h
index 8fa76362..768aee14 100644
--- a/src/shim/kmq/device.h
+++ b/src/shim/kmq/device.h
@@ -26,9 +26,6 @@ class device_kmq : public device {
 
   std::unique_ptr<xrt_core::buffer_handle>
   import_bo(xrt_core::shared_handle::export_handle ehdl) const override;
-
-private:
-  std::unique_ptr<xrt_core::buffer_handle> m_dev_heap_bo;
 };
 
 } // namespace shim_xdna
diff --git a/src/shim/kmq/pcidev.cpp b/src/shim/kmq/pcidev.cpp
index 26a3ae8f..0d271ce5 100644
--- a/src/shim/kmq/pcidev.cpp
+++ b/src/shim/kmq/pcidev.cpp
@@ -1,9 +1,17 @@
 // SPDX-License-Identifier: Apache-2.0
 // Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved.
 
+#include "bo.h"
 #include "device.h"
 #include "pcidev.h"
 
+namespace {
+
+// Device memory heap needs to be within one 64MB page. The maximum size is 64MB.
+const size_t dev_mem_size = (64 << 20);
+
+}
+
 namespace shim_xdna {
 
 pdev_kmq::
@@ -23,7 +31,24 @@ std::shared_ptr<xrt_core::device>
 pdev_kmq::
 create_device(xrt_core::device::handle_type handle, xrt_core::device::id_type id) const
 {
-  return std::make_shared<device_kmq>(*this, handle, id);
+  auto dev = std::make_shared<device_kmq>(*this, handle, id);
+  try {
+    // Alloc device memory on first device creation.
+    // No locking is needed since driver will ensure only one heap BO is created.
+    if (m_dev_heap_bo == nullptr)
+      m_dev_heap_bo = std::make_unique<bo_kmq>(*dev, dev_mem_size, AMDXDNA_BO_DEV_HEAP);
+  } catch (const xrt_core::system_error& ex) {
+    if (ex.get_code() != EBUSY)
+      throw;
+  }
+  return dev;
+}
+
+void
+pdev_kmq::
+on_last_close() const
+{
+  m_dev_heap_bo.reset();
 }
 
 } // namespace shim_xdna
diff --git a/src/shim/kmq/pcidev.h b/src/shim/kmq/pcidev.h
index 03ded1ec..65585924 100644
--- a/src/shim/kmq/pcidev.h
+++ b/src/shim/kmq/pcidev.h
@@ -17,6 +17,13 @@ class pdev_kmq : public pdev
  
   std::shared_ptr<xrt_core::device>
   create_device(xrt_core::device::handle_type handle, xrt_core::device::id_type id) const override;
+
+private:
+  // Create on first device creation and removed right before device is closed
+  mutable std::unique_ptr<xrt_core::buffer_handle> m_dev_heap_bo;
+
+  virtual void
+  on_last_close() const override;
 };
 
 } // namespace shim_xdna
diff --git a/src/shim/pcidev.cpp b/src/shim/pcidev.cpp
index 5d66e372..7ea27527 100644
--- a/src/shim/pcidev.cpp
+++ b/src/shim/pcidev.cpp
@@ -99,7 +99,7 @@ pdev::
 open() const
 {
   int fd;
-  const std::lock_guard<std::mutex> lock(m_lock);
+  const std::lock_guard<std::recursive_mutex> lock(m_lock);
 
   if (m_dev_users == 0) {
     fd = xrt_core::pci::dev::open("", O_RDWR);
@@ -111,6 +111,8 @@ open() const
     m_dev_fd = fd;
   }
   ++m_dev_users;
+
+  on_first_open();
 }
 
 void
@@ -118,10 +120,12 @@ pdev::
 close() const
 {
   int fd;
-  const std::lock_guard<std::mutex> lock(m_lock);
+  const std::lock_guard<std::recursive_mutex> lock(m_lock);
 
   --m_dev_users;
   if (m_dev_users == 0) {
+    on_last_close();
+
     // Stop new users of the fd from other threads.
     fd = m_dev_fd;
     m_dev_fd = -1;
diff --git a/src/shim/pcidev.h b/src/shim/pcidev.h
index 0d487518..9a770da6 100644
--- a/src/shim/pcidev.h
+++ b/src/shim/pcidev.h
@@ -51,26 +51,31 @@ class pdev : public xrt_core::pci::dev
   void
   insert_hdl_mapping(uint32_t hdl, uint64_t ptr) const
   {
-    const std::lock_guard<std::mutex> lock(m_lock);
+    const std::lock_guard<std::recursive_mutex> lock(m_lock);
     m_hdl_map[hdl] = ptr;
   }
   void
   remove_hdl_mapping(uint32_t hdl) const
   {
-    const std::lock_guard<std::mutex> lock(m_lock);
+    const std::lock_guard<std::recursive_mutex> lock(m_lock);
     m_hdl_map.erase(hdl);
   }
   uint64_t
   lookup_hdl_mapping(uint32_t hdl) const
   {
-    const std::lock_guard<std::mutex> lock(m_lock);
+    const std::lock_guard<std::recursive_mutex> lock(m_lock);
     return m_hdl_map[hdl];
   }
 
 private:
+  virtual void
+  on_first_open() const {}
+  virtual void
+  on_last_close() const {}
+
   mutable int m_dev_fd = -1;
   mutable int m_dev_users = 0;
-  mutable std::mutex m_lock;
+  mutable std::recursive_mutex m_lock;
   const bool m_force_unchained_command = true;
   // Mark it as mutable since pdev does not look at what is saved in this map
   mutable std::map<uint32_t, uint64_t> m_hdl_map;

From 03f1eb56a3c008e116fee0255555925d2a7b7a02 Mon Sep 17 00:00:00 2001
From: Lizhi Hou <36547078+houlz0507@users.noreply.github.com>
Date: Thu, 5 Sep 2024 12:44:03 -0700
Subject: [PATCH 24/44] CR-1210981 System hang while using 4 threads to create
 BO at the same time (#243)

---
 src/driver/amdxdna/amdxdna_gem.c | 118 ++++++++++++++++++-------------
 1 file changed, 68 insertions(+), 50 deletions(-)

diff --git a/src/driver/amdxdna/amdxdna_gem.c b/src/driver/amdxdna/amdxdna_gem.c
index d96ac27c..eb25f5e6 100644
--- a/src/driver/amdxdna/amdxdna_gem.c
+++ b/src/driver/amdxdna/amdxdna_gem.c
@@ -60,48 +60,6 @@ amdxdna_gem_insert_node_locked(struct amdxdna_gem_obj *abo, bool use_vmap)
 	return 0;
 }
 
-static void amdxdna_gem_obj_free(struct drm_gem_object *gobj)
-{
-	struct amdxdna_dev *xdna = to_xdna_dev(gobj->dev);
-	struct amdxdna_gem_obj *abo = to_xdna_obj(gobj);
-	struct iosys_map map = IOSYS_MAP_INIT_VADDR(abo->mem.kva);
-
-	XDNA_DBG(xdna, "BO type %d xdna_addr 0x%llx", abo->type, abo->mem.dev_addr);
-	if (abo->flags & BO_SUBMIT_PINNED)
-		amdxdna_gem_unpin(abo);
-
-	flush_work(&abo->hmm_unreg_work);
-	if (abo->type == AMDXDNA_BO_DEV) {
-		mutex_lock(&abo->client->mm_lock);
-		drm_mm_remove_node(&abo->mm_node);
-		mutex_unlock(&abo->client->mm_lock);
-
-		vunmap(abo->mem.kva);
-		drm_gem_object_put(to_gobj(abo->dev_heap));
-		drm_gem_object_release(gobj);
-		mutex_destroy(&abo->lock);
-		kfree(abo);
-		return;
-	}
-
-	if (abo->type == AMDXDNA_BO_DEV_HEAP)
-		drm_mm_takedown(&abo->mm);
-
-#ifdef AMDXDNA_DEVEL
-	if (abo->type == AMDXDNA_BO_CMD)
-		amdxdna_mem_unmap(xdna, &abo->mem);
-	else if (iommu_mode == AMDXDNA_IOMMU_NO_PASID)
-		amdxdna_bo_dma_unmap(abo);
-#endif
-	drm_gem_vunmap_unlocked(gobj, &map);
-	mutex_destroy(&abo->lock);
-	drm_gem_shmem_free(&abo->base);
-}
-
-static const struct drm_gem_object_funcs amdxdna_gem_dev_obj_funcs = {
-	.free = amdxdna_gem_obj_free,
-};
-
 static bool amdxdna_hmm_invalidate(struct mmu_interval_notifier *mni,
 				   const struct mmu_notifier_range *range,
 				   unsigned long cur_seq)
@@ -136,8 +94,11 @@ static void amdxdna_hmm_unregister(struct amdxdna_gem_obj *abo)
 	if (!xdna->dev_info->ops->hmm_invalidate)
 		return;
 
-	if (!abo->mem.pfns)
+	mutex_lock(&abo->lock);
+	if (!abo->mem.pfns) {
+		mutex_unlock(&abo->lock);
 		return;
+	}
 
 	mmu_interval_notifier_remove(&abo->mem.notifier);
 	kvfree(abo->mem.pfns);
@@ -145,6 +106,8 @@ static void amdxdna_hmm_unregister(struct amdxdna_gem_obj *abo)
 
 	if (is_import_bo(abo) && vma->vm_file && vma->vm_file->f_mapping)
 		mapping_clear_unevictable(vma->vm_file->f_mapping);
+
+	mutex_unlock(&abo->lock);
 }
 
 static int amdxdna_hmm_register(struct amdxdna_gem_obj *abo,
@@ -159,14 +122,19 @@ static int amdxdna_hmm_register(struct amdxdna_gem_obj *abo,
 	if (!xdna->dev_info->ops->hmm_invalidate)
 		return 0;
 
-	if (abo->mem.pfns)
-		return -EEXIST;
+	mutex_lock(&abo->lock);
+	if (abo->mem.pfns) {
+		ret = -EEXIST;
+		goto out_unlock;
+	}
 
 	nr_pages = (PAGE_ALIGN(addr + len) - (addr & PAGE_MASK)) >> PAGE_SHIFT;
 	abo->mem.pfns = kvcalloc(nr_pages, sizeof(unsigned long),
 				 GFP_KERNEL);
-	if (!abo->mem.pfns)
-		return -ENOMEM;
+	if (!abo->mem.pfns) {
+		ret = -ENOMEM;
+		goto out_unlock;
+	}
 
 	ret = mmu_interval_notifier_insert_locked(&abo->mem.notifier,
 						  current->mm,
@@ -175,18 +143,68 @@ static int amdxdna_hmm_register(struct amdxdna_gem_obj *abo,
 						  &amdxdna_hmm_ops);
 	if (ret) {
 		XDNA_ERR(xdna, "Insert mmu notifier failed, ret %d", ret);
-		kvfree(abo->mem.pfns);
-		abo->mem.pfns = NULL;
-		return ret;
+		goto free_pfns;
 	}
 	abo->mem.userptr = addr;
 	abo->mem.vma = vma;
 	if (is_import_bo(abo) && vma->vm_file && vma->vm_file->f_mapping)
 		mapping_set_unevictable(vma->vm_file->f_mapping);
 
+	mutex_unlock(&abo->lock);
+
 	return 0;
+
+free_pfns:
+	kvfree(abo->mem.pfns);
+	abo->mem.pfns = NULL;
+out_unlock:
+	mutex_unlock(&abo->lock);
+	return ret;
+}
+
+static void amdxdna_gem_obj_free(struct drm_gem_object *gobj)
+{
+	struct amdxdna_dev *xdna = to_xdna_dev(gobj->dev);
+	struct amdxdna_gem_obj *abo = to_xdna_obj(gobj);
+	struct iosys_map map = IOSYS_MAP_INIT_VADDR(abo->mem.kva);
+
+	XDNA_DBG(xdna, "BO type %d xdna_addr 0x%llx", abo->type, abo->mem.dev_addr);
+	if (abo->flags & BO_SUBMIT_PINNED)
+		amdxdna_gem_unpin(abo);
+
+	amdxdna_hmm_unregister(abo);
+	flush_work(&abo->hmm_unreg_work);
+	if (abo->type == AMDXDNA_BO_DEV) {
+		mutex_lock(&abo->client->mm_lock);
+		drm_mm_remove_node(&abo->mm_node);
+		mutex_unlock(&abo->client->mm_lock);
+
+		vunmap(abo->mem.kva);
+		drm_gem_object_put(to_gobj(abo->dev_heap));
+		drm_gem_object_release(gobj);
+		mutex_destroy(&abo->lock);
+		kfree(abo);
+		return;
+	}
+
+	if (abo->type == AMDXDNA_BO_DEV_HEAP)
+		drm_mm_takedown(&abo->mm);
+
+#ifdef AMDXDNA_DEVEL
+	if (abo->type == AMDXDNA_BO_CMD)
+		amdxdna_mem_unmap(xdna, &abo->mem);
+	else if (iommu_mode == AMDXDNA_IOMMU_NO_PASID)
+		amdxdna_bo_dma_unmap(abo);
+#endif
+	drm_gem_vunmap_unlocked(gobj, &map);
+	mutex_destroy(&abo->lock);
+	drm_gem_shmem_free(&abo->base);
 }
 
+static const struct drm_gem_object_funcs amdxdna_gem_dev_obj_funcs = {
+	.free = amdxdna_gem_obj_free,
+};
+
 static int amdxdna_insert_pages(struct amdxdna_gem_obj *abo,
 				struct vm_area_struct *vma)
 {

From 85fffa042830acb4ec0d4d0ff13758ff26057a22 Mon Sep 17 00:00:00 2001
From: Sonal Santan <sonal.santan@amd.com>
Date: Thu, 5 Sep 2024 15:13:36 -0700
Subject: [PATCH 25/44] Add a document describing AMD NPU workings

Signed-off-by: Sonal Santan <sonal.santan@amd.com>
---
 src/driver/doc/amdnpu.rst | 249 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 249 insertions(+)
 create mode 100644 src/driver/doc/amdnpu.rst

diff --git a/src/driver/doc/amdnpu.rst b/src/driver/doc/amdnpu.rst
new file mode 100644
index 00000000..38b7629b
--- /dev/null
+++ b/src/driver/doc/amdnpu.rst
@@ -0,0 +1,249 @@
+.. SPDX-License-Identifier: GPL-2.0-only
+
+=========
+ AMD NPU
+=========
+
+:Copyright: |copy| 2024 Advanced Micro Devices, Inc.
+:Author: Sonal Santan <sonal.santan@amd.com>
+
+Overview
+========
+
+AMD NPU (Neural Processing Unit) is a multi-user AI inference accelerator
+integrated into AMD client APU. NPU enables efficient execution of Machine
+Learning applications like CNN, LLM, etc. NPU is based on
+`AMD XDNA Architecture`_. NPU is managed by **amdxdna** driver.
+
+
+Hardware Description
+====================
+
+AMD NPU consists of the following hardware components:
+
+AMD XDNA Array
+--------------
+
+AMD XDNA Array comprises a 2D array of compute and memory tiles built with
+`AMD AI Engine Technology`_. Each column has 4 rows of compute tiles and 1
+row of memory tile. Each compute tile contains a VLIW processor with its own
+dedicated program and data memory. The memory tile acts as L2 memory. The 2D
+array can be partitioned at a column boundary creating a spatially isolated
+partition which can be bound to a workload context.
+
+Each column also has dedicated DMA engines to move data between host DDR and
+memory tile.
+
+AMD Phoenix and AMD Hawk Point client NPU have a 4x5 topology, i.e., 4 rows of
+compute tiles arranged into 5 columns. AMD Strix Point client APU has a 4x8
+topology, i.e., 4 rows of compute tiles arranged into 8 columns.
+
+Shared L2 Memory
+................
+
+The single row of memory tiles creates a pool of software-managed on-chip L2
+memory. DMA engines are used to move data between host DDR and memory tiles.
+AMD Phoenix and AMD Hawk Point NPUs have a total of 2560 KB of L2 memory.
+AMD Strix Point NPU has a total of 4096 KB of L2 memory.
+
+Microcontroller
+---------------
+
+A microcontroller runs NPU Firmware which is responsible for command processing,
+XDNA Array partition setup, XDNA Array configuration, workload context
+management and workload orchestration.
+
+NPU Firmware uses a dedicated instance of isolated non-privileged context
+called ERT to service each workload context. ERT is also used to execute
+user provided ``ctrlcode`` associated with the workload context.
+
+NPU Firmware uses a single isolated privileged context called MERT to service
+management commands from the amdxdna driver.
+
+Mailboxes
+.........
+
+The microcontroller and amdxdna driver use a privileged channel for management
+tasks like setting up of contexts, telemetry, query, error handling, setting up
+user channel, etc. As mentioned before, privileged channel requests are
+serviced by MERT.
+
+The microcontroller and amdxdna driver use a dedicated user channel per
+workload context. The user channel is primarily used for submitting work to
+the NPU. As mentioned before, a user channel requests are serviced by an
+instance of ERT.
+
+PCIe EP
+-------
+
+NPU is visible to the x86 as a PCIe device with 3 BARS and an MSI-X interrupt
+vector. NPU uses a dedicated high bandwidth SoC level fabric for reading
+writing into host memory.
+
+TODO, briefly describe the BARs
+
+Process Isolation Hardware
+--------------------------
+
+As explained before, XDNA Array can be dynamically divided into isolated
+spatial partitions, each of which may have one or more columns. The spatial
+partition is setup by programming the column isolation registers by the
+microcontroller. Each spatial partition is associated with a PASID which is
+also programmed by the microcontroller. Hence multiple spatial partitions in
+the NPU can make concurrent host access protected by PASID.
+
+The NPU FW itself uses microcontroller MMU enforced isolated contexts for
+servicing user and privileged channel requests.
+
+
+Mixed Spatial and Temporal Scheduling
+=====================================
+
+AMD XDNA architecture supports mixed spatial and temporal (time sharing)
+scheduling of 2D array. This means that spatial partitions may be setup and
+torn down dynamically to accommodate various workloads. A *spatial* partition
+may be *exclusively* bound to one workload context while another partition may
+be *temporarily* bound to more than one workload context. The microcontroller
+updates the PASID for a temporarily shared partition to match the context that
+has been bound to the partition at any moment.
+
+Resource Solver
+---------------
+
+The Resource Solver component of the amdxdna driver manages the allocation
+of 2D array among various workloads. Every workload describes the number
+of columns required to run the NPU binary in its metadata. The Resource Solver
+component uses hints passed by the workload and its own heuristics to
+decide 2D array (re)partition strategy and mapping of workloads for spatial and
+temporal sharing of columns. The FW enforces the context-to-column(s) resource
+binding decisions made by the Resource Solver.
+
+AMD Phoenix and AMD Hawk Point client NPU can support 6 concurrent user
+contexts. AMD Strix Point can support 16 concurrent user contexts.
+
+
+Application Binaries
+====================
+
+A NPU application workload is comprised of two separate binaries which are
+generated by the NPU compiler.
+
+1. AMD XDNA Array overlay, which is used to configure a NPU spatial partition.
+   The overlay contains instructions for setting up the stream switch
+   configuration and ELF for the compute tiles. The overlay is loaded by the
+   microcontroller on the spatial partition when the workload context becomes
+   active. Refer to the
+   `Versal Adaptive SoC AIE-ML Architecture Manual (AM020)`_ for more details.
+
+2. ``ctrlcode``, used for orchestrating the overlay loaded. ctrlcode is
+   executed by the ERT running in protected mode on the microcontroller in the
+   context of the workload. ctrlcode is made up as a sequence of
+   ``XAie_TxnOpcode``. Refer to the `AI Engine Run Time`_ for more details.
+
+
+Special Host Buffers
+====================
+
+Per-context Instruction Buffer
+------------------------------
+
+Every workload context uses a host resident 64 MB buffer which is memory
+mapped into the ERT instance created to service the workload. The ``ctrlcode``
+used by the workload is copied into this special memory. This buffer is
+protected by PASID like all other input/output buffers used by that workload.
+Instruction buffer is also mapped into the user space of the workload.
+
+Global Privileged Buffer
+------------------------
+
+In addition, the driver also allocates a single buffer for maintenance tasks
+like recording errors from MERT. This global buffer uses the global IOMMU
+domain and is only accessible by MERT.
+
+
+High-level Use Flow
+===================
+
+Here are the steps to run a workload on AMD NPU:
+
+1.  Compile the workload into an overlay and a ctrlcode binary.
+2.  Userspace opens a context in the driver and provides the overlay.
+3.  The driver checks with the Resource Solver for provisioning a set of columns
+    for the workload.
+4.  The driver then asks MERT to create a context on the device with the desired
+    columns.
+5.  MERT then creates an instance of ERT. MERT also maps the Instruction Buffer
+    into ERT memory.
+6.  The userspace then copies the ctrlcode to the Instruction Buffer.
+7.  Userspace then creates a command buffer with pointers to input, output, and
+    instruction buffer; it then submits command buffer with the driver and goes
+    to sleep waiting for completion.
+8.  The driver sends the command over the Mailbox to ERT.
+9.  ERT *executes* the ``ctrlcode`` in the instruction buffer.
+10. Execution of the ``ctrlcode`` kicks off DMAs to and from the host DDR while
+    AMD XDNA Array is running.
+11. When ERT reaches end of ctrlcode, it raises an MSI-X to send completion
+    signal to the driver which then wakes up the waiting workload.
+
+
+Boot Flow
+=========
+
+amdxdna driver uses PSP to securely load signed NPU FW and kick off the boot
+of the NPU microcontroller. amdxdna driver then waits for the alive signal in
+a special location on BAR 0. The NPU is switched off during SoC suspend and
+turned on after resume where the NPU FW is reloaded, and the handshake is
+performed again.
+
+
+Userspace components
+====================
+
+Compiler
+--------
+
+Peano is an LLVM based open-source compiler for AMD XDNA Array compute tile
+available at:
+https://github.com/Xilinx/llvm-aie
+
+The open-source IREE compiler supports graph compilation of ML models for AMD
+NPU and uses Peano underneath. It is available at:
+https://github.com/nod-ai/iree-amd-aie
+
+Usermode Driver (UMD)
+---------------------
+
+The open-source XRT runtime stack interfaces with amdxdna kernel driver. XRT
+can be found at:
+https://github.com/Xilinx/XRT
+
+
+DMA Operation
+=============
+
+DMA operation instructions are encoded in the ctrlcode. When ERT executes the
+ctrlcode DMA operations are effected.
+
+
+Error Handling
+==============
+
+When MERT detects an error in AMD XDNA Array,
+TODO, briefly describe backtracking
+
+
+Telemetry
+=========
+
+MERT can report various kinds of telemetry information like
+TODO, list a few
+
+
+References
+==========
+
+- `AMD XDNA Architecture <https://www.amd.com/en/technologies/xdna.html>`_
+- `AMD AI Engine Technology <https://www.xilinx.com/products/technology/ai-engine.html>`_
+- `Peano <https://github.com/Xilinx/llvm-aie>`_
+- `Versal Adaptive SoC AIE-ML Architecture Manual (AM020) <https://docs.amd.com/r/en-US/am020-versal-aie-ml>`_
+- `AI Engine Run Time <https://github.com/Xilinx/aie-rt/tree/release/main_aig>`_

From 55a7d50d63acc2182f5da5c55afcce7d1f476cf8 Mon Sep 17 00:00:00 2001
From: Sonal Santan <sonal.santan@amd.com>
Date: Thu, 5 Sep 2024 21:52:28 -0700
Subject: [PATCH 26/44] Minor changes to incorporate feedback

Signed-off-by: Sonal Santan <sonal.santan@amd.com>
---
 src/driver/doc/amdnpu.rst | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/driver/doc/amdnpu.rst b/src/driver/doc/amdnpu.rst
index 38b7629b..b373e6da 100644
--- a/src/driver/doc/amdnpu.rst
+++ b/src/driver/doc/amdnpu.rst
@@ -53,9 +53,9 @@ A microcontroller runs NPU Firmware which is responsible for command processing,
 XDNA Array partition setup, XDNA Array configuration, workload context
 management and workload orchestration.
 
-NPU Firmware uses a dedicated instance of isolated non-privileged context
-called ERT to service each workload context. ERT is also used to execute
-user provided ``ctrlcode`` associated with the workload context.
+NPU Firmware uses a dedicated instance of an isolated non-privileged context
+called ERT to service each workload context. ERT is also used to execute user
+provided ``ctrlcode`` associated with the workload context.
 
 NPU Firmware uses a single isolated privileged context called MERT to service
 management commands from the amdxdna driver.
@@ -66,19 +66,20 @@ Mailboxes
 The microcontroller and amdxdna driver use a privileged channel for management
 tasks like setting up of contexts, telemetry, query, error handling, setting up
 user channel, etc. As mentioned before, privileged channel requests are
-serviced by MERT.
+serviced by MERT. The privileged channel is bound to a single mailbox.
 
 The microcontroller and amdxdna driver use a dedicated user channel per
 workload context. The user channel is primarily used for submitting work to
 the NPU. As mentioned before, a user channel requests are serviced by an
-instance of ERT.
+instance of ERT. Each user channel is bound to its own dedicated mailbox.
 
 PCIe EP
 -------
 
 NPU is visible to the x86 as a PCIe device with 3 BARS and an MSI-X interrupt
 vector. NPU uses a dedicated high bandwidth SoC level fabric for reading
-writing into host memory.
+writing into host memory. Each instance of ERT gets its own dedicated MSI-X
+interrupt. MERT gets a single instance of MSI-X interrupt.
 
 TODO, briefly describe the BARs
 
@@ -130,9 +131,9 @@ generated by the NPU compiler.
 
 1. AMD XDNA Array overlay, which is used to configure a NPU spatial partition.
    The overlay contains instructions for setting up the stream switch
-   configuration and ELF for the compute tiles. The overlay is loaded by the
-   microcontroller on the spatial partition when the workload context becomes
-   active. Refer to the
+   configuration and ELF for the compute tiles. The overlay is loaded on the
+   spatial partition bound to the workload by the associated ERT instance.
+   Refer to the
    `Versal Adaptive SoC AIE-ML Architecture Manual (AM020)`_ for more details.
 
 2. ``ctrlcode``, used for orchestrating the overlay loaded. ctrlcode is
@@ -217,6 +218,9 @@ The open-source XRT runtime stack interfaces with amdxdna kernel driver. XRT
 can be found at:
 https://github.com/Xilinx/XRT
 
+The open-source XRT shim for NPU can be found at:
+https://github.com/amd/xdna-driver
+
 
 DMA Operation
 =============

From 6f1b34179a95e00bafe9a6d34517917d35ad2c1f Mon Sep 17 00:00:00 2001
From: Sonal Santan <sonal.santan@amd.com>
Date: Fri, 6 Sep 2024 08:23:28 -0700
Subject: [PATCH 27/44] Elaborate on error handling and other minor changes

Signed-off-by: Sonal Santan <sonal.santan@amd.com>
---
 src/driver/doc/amdnpu.rst | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/driver/doc/amdnpu.rst b/src/driver/doc/amdnpu.rst
index b373e6da..b4b5dc2f 100644
--- a/src/driver/doc/amdnpu.rst
+++ b/src/driver/doc/amdnpu.rst
@@ -119,8 +119,8 @@ decide 2D array (re)partition strategy and mapping of workloads for spatial and
 temporal sharing of columns. The FW enforces the context-to-column(s) resource
 binding decisions made by the Resource Solver.
 
-AMD Phoenix and AMD Hawk Point client NPU can support 6 concurrent user
-contexts. AMD Strix Point can support 16 concurrent user contexts.
+AMD Phoenix and AMD Hawk Point client NPU can support 6 concurrent workload
+contexts. AMD Strix Point can support 16 concurrent workload contexts.
 
 
 Application Binaries
@@ -136,10 +136,11 @@ generated by the NPU compiler.
    Refer to the
    `Versal Adaptive SoC AIE-ML Architecture Manual (AM020)`_ for more details.
 
-2. ``ctrlcode``, used for orchestrating the overlay loaded. ctrlcode is
-   executed by the ERT running in protected mode on the microcontroller in the
-   context of the workload. ctrlcode is made up as a sequence of
-   ``XAie_TxnOpcode``. Refer to the `AI Engine Run Time`_ for more details.
+2. ``ctrlcode``, used for orchestrating the overlay loaded on the spatial
+   partition. ``ctrlcode`` is executed by the ERT running in protected mode on
+   the microcontroller in the context of the workload. ``ctrlcode`` is made up
+   of a sequence of opcodes named ``XAie_TxnOpcode``. Refer to the
+   `AI Engine Run Time`_ for more details.
 
 
 Special Host Buffers
@@ -167,7 +168,7 @@ High-level Use Flow
 
 Here are the steps to run a workload on AMD NPU:
 
-1.  Compile the workload into an overlay and a ctrlcode binary.
+1.  Compile the workload into an overlay and a ``ctrlcode`` binary.
 2.  Userspace opens a context in the driver and provides the overlay.
 3.  The driver checks with the Resource Solver for provisioning a set of columns
     for the workload.
@@ -175,7 +176,7 @@ Here are the steps to run a workload on AMD NPU:
     columns.
 5.  MERT then creates an instance of ERT. MERT also maps the Instruction Buffer
     into ERT memory.
-6.  The userspace then copies the ctrlcode to the Instruction Buffer.
+6.  The userspace then copies the ``ctrlcode`` to the Instruction Buffer.
 7.  Userspace then creates a command buffer with pointers to input, output, and
     instruction buffer; it then submits command buffer with the driver and goes
     to sleep waiting for completion.
@@ -183,7 +184,7 @@ Here are the steps to run a workload on AMD NPU:
 9.  ERT *executes* the ``ctrlcode`` in the instruction buffer.
 10. Execution of the ``ctrlcode`` kicks off DMAs to and from the host DDR while
     AMD XDNA Array is running.
-11. When ERT reaches end of ctrlcode, it raises an MSI-X to send completion
+11. When ERT reaches end of ``ctrlcode``, it raises an MSI-X to send completion
     signal to the driver which then wakes up the waiting workload.
 
 
@@ -225,22 +226,26 @@ https://github.com/amd/xdna-driver
 DMA Operation
 =============
 
-DMA operation instructions are encoded in the ctrlcode. When ERT executes the
-ctrlcode DMA operations are effected.
+DMA operation instructions are encoded in the ``ctrlcode`` as
+``XAIE_IO_BLOCKWRITE`` opcode. When ERT executes ``XAIE_IO_BLOCKWRITE``, DMA
+operations between host DDR and L2 memory are effected.
 
 
 Error Handling
 ==============
 
-When MERT detects an error in AMD XDNA Array,
-TODO, briefly describe backtracking
+When MERT detects an error in AMD XDNA Array, it pauses execution for that
+workload context and sends an asynchronous message to the driver over the
+privileged channel. The driver then sends a buffer pointer to MERT to capture
+the register states for the partition bound to faulting workload context. The
+driver then decodes the error by reading the contents of the buffer pointer.
 
 
 Telemetry
 =========
 
 MERT can report various kinds of telemetry information like
-TODO, list a few
+TODO, list the key ones
 
 
 References

From 1be64666b739fe28ac0a7035476ba15adbf7386f Mon Sep 17 00:00:00 2001
From: Max Zhen <40219623+maxzhen@users.noreply.github.com>
Date: Sun, 8 Sep 2024 15:17:32 -0700
Subject: [PATCH 28/44] improve npu_perf_analyze.sh (#246)

Signed-off-by: Max Zhen <max.zhen@amd.com>
---
 src/driver/tools/npu_perf_analyze.sh | 118 ++++++++++++++++-----------
 1 file changed, 69 insertions(+), 49 deletions(-)

diff --git a/src/driver/tools/npu_perf_analyze.sh b/src/driver/tools/npu_perf_analyze.sh
index 514d1e10..a96cf915 100755
--- a/src/driver/tools/npu_perf_analyze.sh
+++ b/src/driver/tools/npu_perf_analyze.sh
@@ -3,14 +3,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # Copyright (C) 2024, Advanced Micro Devices, Inc.
 
-perf_out_file="perf.converted.out"
-
 usage()
 {
-	echo "$0 [entry_index_begin:entry_index_end] event1_pattern event2_pattern"
-	echo "Calculate time from event1 to event2 within [entry_index_begin,entry_index_end)"
-	echo "event pattern examples:"
-	echo "    sdt_xrt:ioctl_exit: \(.+\) arg1=DRM_IOCTL_AMDXDNA_WAIT_CMD"
+  cat << USAGE_END
+Usage: $0 [options] event1_pattern event2_pattern
+Options:
+  -file/-f: Trace log file for parsing
+  -range/-r: [entry_index_begin:entry_index_end), e.g.: 100:200
+Parsing trace log file to find time interval from event1 to event2.
+event pattern examples:
+  "sdt_xrt:ioctl_exit: \(.+\) arg1=DRM_IOCTL_AMDXDNA_WAIT_CMD"
+USAGE_END
 }
 
 read_timestamps()
@@ -25,29 +28,51 @@ read_timestamps()
 	echo ${timestamps[@]}
 }
 
-range_start=0
-range_end=0
-event1=""
-event2=""
-if [ "$#" -eq 2 ]; then
-	event1=$1
-	event2=$2
-elif [ "$#" -eq 3 ]; then
-	st=$(echo $1 | cut -d':' -f1)
-	end=$(echo $1 | cut -d':' -f2)
-	if [ "${st}" != "" ]; then
-		range_start=$(("10#${st}"))
-	fi
-	if [ "${end}" != "" ]; then
-		range_end=$(("10#${end}"))
-	fi
-	event1=$2
-	event2=$3
-else
+if [ "$#" -eq 0 ]; then
 	usage
 	exit 1
 fi
 
+range_start=-1
+range_end=-1
+event1=""
+event2=""
+perf_out_file="perf.converted.out"
+while [ $# -gt 0 ]; do
+	case "$1" in
+		-range | -r)
+			st=$(echo $2 | cut -d':' -f1)
+			end=$(echo $2 | cut -d':' -f2)
+			if [ "${st}" != "" ]; then
+				if [[ "${st}" =~ ^[0-9]+$ ]]; then
+					range_start=$(("10#${st}"))
+				else
+					echo Invalid range start: ${st}
+					exit 1
+				fi
+			fi
+			if [ "${end}" != "" ]; then
+				if [[ "${end}" =~ ^[0-9]+$ ]]; then
+					range_end=$(("10#${end}"))
+				else
+					echo Invalid range end: ${end}
+					exit 1
+				fi
+			fi
+			shift
+			;;
+		-file | -f)
+			perf_out_file=$2
+			shift
+			;;
+		*)
+			break
+	esac
+	shift
+done
+event1=$1
+event2=$2
+
 if [ ! -f ${perf_out_file} ]; then
 	echo "${perf_out_file} is not found"
 	exit 1
@@ -57,49 +82,44 @@ fi
 
 event1_ts=($(read_timestamps "${event1}"))
 event1_ts_num=${#event1_ts[@]}
-echo "${event1_ts_num} events for: '${event1}'"
-
-event2_ts=($(read_timestamps "${event2}"))
-event2_ts_num=${#event2_ts[@]}
-echo "${event2_ts_num} events for: '${event2}'"
-
-# Sanity check collected data
 if [ ${event1_ts_num} -eq 0 ]; then
 	echo No events found for ${event1}
 	exit 1
 fi
+echo "${event1_ts_num} events for: '${event1}'"
+
+event2_ts=($(read_timestamps "${event2}"))
+event2_ts_num=${#event2_ts[@]}
 if [ ${event2_ts_num} -eq 0 ]; then
 	echo No events found for ${event2}
 	exit 1
 fi
-# Find first event2 entry index which comes after first event1
-event2_index_base=-1
-for (( i=0; i<${event2_ts_num}; i++ )); do
-	if ! [[ ${event2_ts[i]} -lt ${event1_ts[0]} ]]; then
-		event2_index_base=${i}
-		break
-	fi
-done
-if [ ${event2_index_base} -eq -1 ]; then
-	echo No ${event2} is after ${event1}
-	exit 1
-fi
+echo "${event2_ts_num} events for: '${event2}'"
 
 # Caculate time difference between two events
 diffs=()
-for (( i=0; i<${event1_ts_num}; i++ )); do
-	i2=$(( i+${event2_index_base} ))
-	if ! [ ${i2} -lt ${event2_ts_num} ]; then
+i1=0
+i2=0
+while [ ${i1} -lt ${event1_ts_num} ]; do
+	while [[ ${i2} -lt ${event2_ts_num} && ${event2_ts[i2]} -lt ${event1_ts[i1]} ]]; do
+		(( i2++ ))
+	done
+	if [ ${i2} -eq ${event2_ts_num} ]; then
 		break
 	fi
-	diffs+=( $((event2_ts[i2] - event1_ts[i])) )
+	diffs+=( $((event2_ts[i2] - event1_ts[i1])) )
+	(( i1++ ))
+	(( i2++ ))
 done
 #echo ${diffs[@]}
 
 
 # Data mining within specified range
 
-if [ ${range_end} -eq 0 ]; then
+if [ ${range_start} -eq -1 ]; then
+	range_start=0
+fi
+if [ ${range_end} -eq -1 ]; then
 	range_end=${#diffs[@]}
 fi
 if [ ${range_end} -eq ${range_start} ]; then

From 182163a9a10a2c19901121cd014ec8b0123b93e5 Mon Sep 17 00:00:00 2001
From: David Zhang <50243230+xdavidz@users.noreply.github.com>
Date: Mon, 9 Sep 2024 09:27:53 -0700
Subject: [PATCH 29/44] indirect dpu pkt handling with device firmware
 host_queue changes (#241)

* indirect dpu pkt handling with device firmware host_queue changes
* remove the mailbox log

---------

Signed-off-by: David Zhang <yidong.zhang@amd.com>
---
 src/driver/amdxdna/amdxdna_mailbox.c |   4 +-
 src/shim/umq/host_queue.h            | 249 +++++++--------------------
 src/shim/umq/hwq.cpp                 | 203 ++++++++++++++++++----
 src/shim/umq/hwq.h                   |  34 +++-
 4 files changed, 257 insertions(+), 233 deletions(-)

diff --git a/src/driver/amdxdna/amdxdna_mailbox.c b/src/driver/amdxdna/amdxdna_mailbox.c
index 8b152792..f8cd8cc3 100644
--- a/src/driver/amdxdna/amdxdna_mailbox.c
+++ b/src/driver/amdxdna/amdxdna_mailbox.c
@@ -487,10 +487,8 @@ static void mailbox_timer(struct timer_list *t)
 
 	/* The timer mimic interrupt. It is good to reuse irq routine */
 	tail = mailbox_get_tailptr(mb_chann, CHAN_RES_I2X);
-	if (tail) {
-		MB_DBG(mb_chann, "Mimic interrupt...");
+	if (tail)
 		mailbox_irq_handler(0, mb_chann);
-	}
 
 	mod_timer(&mb_chann->timer, jiffies + MB_TIMER_JIFF);
 }
diff --git a/src/shim/umq/host_queue.h b/src/shim/umq/host_queue.h
index 14cb41e0..fe8dc8bc 100644
--- a/src/shim/umq/host_queue.h
+++ b/src/shim/umq/host_queue.h
@@ -1,60 +1,9 @@
-/*  (c) Copyright 2014 - 2022 Xilinx, Inc. All rights reserved.
-   
-    This file contains confidential and proprietary information
-    of Xilinx, Inc. and is protected under U.S. and
-    international copyright and other intellectual property
-    laws.
-   
-    DISCLAIMER
-    This disclaimer is not a license and does not grant any
-    rights to the materials distributed herewith. Except as
-    otherwise provided in a valid license issued to you by
-    Xilinx, and to the maximum extent permitted by applicable
-    law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND
-    WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES
-    AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING
-    BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-
-    INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and
-    (2) Xilinx shall not be liable (whether in contract or tort,
-    including negligence, or under any other theory of
-    liability) for any loss or damage of any kind or nature
-    related to, arising under or in connection with these
-    materials, including for any direct, or any indirect,
-    special, incidental, or consequential loss or damage
-    (including loss of data, profits, goodwill, or any type of
-    loss or damage suffered as a result of any action brought
-    by a third party) even if such damage or loss was
-    reasonably foreseeable or Xilinx had been advised of the
-    possibility of the same.
-   
-    CRITICAL APPLICATIONS
-    Xilinx products are not designed or intended to be fail-
-    safe, or for use in any application requiring fail-safe
-    performance, such as life-support or safety devices or
-    systems, Class III medical devices, nuclear facilities,
-    applications related to the deployment of airbags, or any
-    other applications that could lead to death, personal
-    injury, or severe property or environmental damage
-    (individually and collectively, "Critical
-    Applications"). Customer assumes the sole risk and
-    liability of any use of Xilinx products in Critical
-    Applications, subject only to applicable laws and
-    regulations governing limitations on product liability.
-   
-    THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS
-    PART OF THIS FILE AT ALL TIMES.                       */
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 
 #ifndef _HOST_QUEUE_H_
 #define _HOST_QUEUE_H_
 
-#include <stdbool.h>
-#include <stdint.h>
-
-#define SHIM_USER_EVENT_0_ID 0xb6
-#define DOORBELL_EVENT_ID SHIM_USER_EVENT_0_ID
-
-#define PDI_TABLE_SIZE 64
-
 #define HSA_PKT_SUCCESS (0)
 /*
  * 32-bit return code in completion of HSA pkt back to host.
@@ -62,7 +11,8 @@
  * will check them on all devices/platforms.
  * HSA specific error code will be on high 28 bits.
  */ 
-enum hsa_cmd_state { // ert_cmd_state essentially
+enum hsa_cmd_state
+{ // ert_cmd_state essentially
   HSA_CMD_STATE_NEW = 1,
   HSA_CMD_STATE_QUEUED = 2,
   HSA_CMD_STATE_RUNNING = 3,
@@ -83,46 +33,14 @@ enum hsa_cmd_state { // ert_cmd_state essentially
 #define HSA_INVALID_OPCODE        HSA_ERR(column_index_rel * 100 + 3)
 #define HSA_INVALID_PKT           HSA_ERR(4)
 #define HSA_INVALID_PAGE          HSA_ERR(column_index_rel * 100 + 5)
+#define HSA_INDIRECT_PKT_NUM      6
 
-typedef enum     
+enum host_queue_packet_opcode
 {            
   HOST_QUEUE_PACKET_EXEC_BUF = 1,
   HOST_QUEUE_PACKET_TEST = 2,
   HOST_QUEUE_PACKET_EXIT = 3,
-}            
-host_queue_packet_opcode_t;
-
-/*
- * cu_config contains cu <-> pdi mapping info
- *
- * due to memory footprint limitation, the pdi info (host address) is not saved in CERT
- * if num_mappings == 1, then pdi_info_host_addr contains the host addr of the pdi
- * if num_mappings > 1, then pdi_info_host_addr contains the host addr of a table, in which
- * the host addr of all the pdi are saved.
- *
- * note: both cu_index and pdi_index should be start from 0
- * e.g mapping[0] = 0, mapping[1] = 1, mapping[2] = 0,
- * means,
- * cu0 <-> pdi0
- * cu1 <-> pdi1
- * cu2 <-> pdi0
- * there are 3 mappings, and 2 pdi in pdi_info_host_addr table 
- */
-typedef struct
-{
-  uint32_t num_mappings;
-  uint32_t pdi_info_host_addr_low;
-  uint32_t pdi_info_host_addr_high;
-  uint8_t mapping[PDI_TABLE_SIZE];
-}
-config_cu_t;
-
-#define INVALID_PDI_ID (0xFF)
-
-/*
- * Maximum number of exec buf args in 4B
- */ 
-#define EXEC_BUF_ARGS_MAX_LEN (20)
+};            
 
 /*
  * hsa pkt payload of exec_buf
@@ -134,7 +52,7 @@ config_cu_t;
  * args contains the info of input/output frame, parameter of network
  * etc, which are all transparent to CERT 
  */ 
-typedef struct
+struct exec_buf
 {
   uint16_t cu_index;
   uint16_t reserved0;
@@ -144,48 +62,28 @@ typedef struct
   uint16_t reserved1;
   uint32_t args_host_addr_low;
   uint32_t args_host_addr_high;
-}
-exec_buf_t;
-
+};
 
-typedef struct
+struct host_queue_header
 {
   uint64_t read_index;
-  
-  uint32_t reserved;
-  
-  //! @note Queue capacity, must be a power of two.
-  uint32_t capacity;
-
-  /*
-   * NOTE!!!
-   *  Due to the cache is not cache coherence between host and device.  We have
-   *  to flush the cache of the host queue.
-   *
-   *  Most importantly, the read_index has to be in different cache line
-   *  (64Bytes in linux) than the write_index. Because the read_index might be
-   *  flushed from a different context from kernel driver that is monitoring
-   *  the completed message. While at the same time, the write_index might be 
-   *  being flushed from UMQ.
-   */ 
-  //Note: temporary disable padding because FW has not been fully changed yet.
-  //uint64_t padding[6];
-
+  struct
+  {
+    uint16_t major;
+    uint16_t minor;
+  }
+  version;
+  uint32_t capacity; //Queue capacity, must be a power of two.
   uint64_t write_index;
-  
   uint64_t data_address;
-  
-  // TODO Ready signal?
-}
-host_queue_header_t;
+};
 
 
-typedef enum     
+enum host_queue_packet_type
 {            
   HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC = 0,
   HOST_QUEUE_PACKET_TYPE_INVALID = 1,
-}            
-host_queue_packet_type_t;
+}; 
 
 /*
  * 8 Bytes common header of hsa pkt used in CERT.
@@ -199,7 +97,7 @@ host_queue_packet_type_t;
  * for 'indirect', 'count' is used to calc the number of indirect pkt entry,
  * see below
  */ 
-typedef struct
+struct common_header
 {
   union {
     struct {
@@ -214,29 +112,24 @@ typedef struct
   uint16_t count;
   uint8_t distribute;
   uint8_t indirect;
-}
-common_header_t;
+};
 
-typedef struct
+struct xrt_packet_header
 {
-  common_header_t common_header;	
+  struct common_header common_header;	
   uint64_t completion_signal;
-}
-xrt_packet_header_t;
+};
 
 /*
  * format of indirect pkt. multiple-indirect-level is supported
  * there is vendor specific header (common header plus completion_signal) in 1st indirect level
  * there is common header in all the remaining indirect levels
  */ 
-typedef struct
+struct host_indirect_packet_entry
 {
-  uint16_t column_index;
-  uint16_t reserved;
   uint32_t host_addr_low;
   uint32_t host_addr_high;
-}
-host_indirect_packet_entry_t;
+};
 
 /*
  * hsa pkt format -- 64Bytes fixed length
@@ -245,23 +138,23 @@ host_indirect_packet_entry_t;
  * xrt_packet_header:
  *   type: 0 (vendor specific)
  *   opcode: 1 (exec_buf)
- *   count: 24 (sizeof(exec_buf_t))
+ *   count: 24 (sizeof(struct exec_buf))
  *   distribute: 0
  *   indirect: 0
  *   completion_signal: xxx
  * data:
- *   exec_buf_t
+ *   struct exec_buf
  *
  * case 2 -- indirect config_cu 
  * xrt_packet_header:
  *   type: 0 (vendor specific)
  *   opcode: 0 (config_cu)
- *   count: 12 (1 * sizeof(host_indirect_packet_entry_t))
+ *   count: 12 (1 * sizeof(struct host_indirect_packet_entry))
  *   distribute: 0
  *   indirect: 1 // common header of indirect
  *   completion_signal: xxx
  * data:
- *   host_indirect_packet_entry_t:
+ *   struct host_indirect_packet_entry:
  *     column_index: index of lead uc
  *     host_addr*: host addr of next level
  *       common_header:
@@ -270,123 +163,97 @@ host_indirect_packet_entry_t;
  *         count: 72 (config_cu with 16 entries)) //10 entry config_cu can fit in direct pkt
  *         indirect: 0 // common header of direct
  *       payload:
- *         config_cu_t: 16 entries of mapping table
+ *         struct config_cu: 16 entries of mapping table
  *
  * case 3 -- indirect exec_buf on 4 column partition
  * xrt_packet_header:
  *   type: 0 (vendor specific)
  *   opcode: 1 (exec_buf)
- *   count: 48 (4 *sizeof(host_indirect_packet_entry_t))
+ *   count: 48 (6 * sizeof(struct host_indirect_packet_entry))
  *   distribute: 1
  *   indirect: 1 // common header of indirect
  *   completion_signal: xxx
  * data:
- *   host_indirect_packet_entry_t:
- *     column_index: index of lead uc
+ *   struct host_indirect_packet_entry:
  *     host_addr*: host addr of next level
  *       common_header:
  *         type: 0 (vendor specific)
  *         opcode: 1 (exec_buf)
- *         count: 24 (sizeof(exec_buf_t))
+ *         count: 24 (sizeof(struct exec_buf))
  *         indirect: 0 // common header of direct
  *       payload:
- *          exec_buf_t 
- *   host_indirect_packet_entry_t:
- *     column_index: index of slave1
+ *          struct exec_buf 
+ *   struct host_indirect_packet_entry:
  *     host_addr*: host addr of next level
  *       common_header:
  *          type: 0 (vendor specific)
  *          opcode: 1 (exec_buf)
- *          count: 24 (sizeof(exec_buf_t))
+ *          count: 24 (sizeof(struct exec_buf))
  *          indirect: 0 // common header of direct
  *       payload:
- *          exec_buf_t
- *   host_indirect_packet_entry_t:
+ *          struct exec_buf
+ *   struct host_indirect_packet_entry:
  *     slave2,3,etc...
  *
  * case 4 -- indirect exec_buf on 8 column partition 
  * xrt_packet_header:
  *   type: 0 (vendor specific)
  *   opcode: 1 (exec_buf)
- *   count: 12 (sizeof(host_indirect_packet_entry_t))
+ *   count: 12 (sizeof(struct host_indirect_packet_entry))
  *   distribute: 1
  *   indirect: 1 // common_header of level-1 indirect
  *   completion_signal: xxx
  * data:
- *   host_indirect_packet_entry_t:
- *     column_index: index of lead uc
+ *   struct host_indirect_packet_entry:
  *     host_addr*: host addr of next level
  *       common_header:
  *         type: 0 (vendor specific)
  *         opcode: 1 (exec_buf)
- *         count: 12*8 (12 * sizeof(host_indirect_packet_entry_t))
+ *         count: 12*8 (12 * sizeof(struct host_indirect_packet_entry))
  *         distribute: 1
  *         indirect: 1 // common header of level-2 indirect
  *       indirect_payload: 
- *         host_indirect_packet_entry_t:
- *           column_index: index of lead uc
+ *         struct host_indirect_packet_entry:
  *           host_addr*: host addr of next level
  *             common_header:
  *               type: 0 (vendor specific)
  *               opcode: 1 (exec_buf)
- *               count: 24 (sizeof(exec_buf_t))
+ *               count: 24 (sizeof(struct exec_buf))
  *               distribute: 1
  *               indirect: 0  // common_header of direct
  *             payload: 
- *               exec_buf_t
- *         host_indirect_packet_entry_t:
- *           column_index: index of slave1
+ *               struct exec_buf
+ *         struct host_indirect_packet_entry:
  *           host_addr*: host addr of next level
  *             common_header:
  *               type: 0 (vendor specific)
  *               opcode: 1 (exec_buf)
- *               count: 24 (sizeof(exec_buf_t))
+ *               count: 24 (sizeof(struct exec_buf))
  *               distribute: 1
  *               indirect: 0 // common_header of direct
  *             payload: 
- *               exec_buf_t
- *         host_indirect_packet_entry_t:
+ *               struct exec_buf
+ *         struct host_indirect_packet_entry:
  *           slave2,3,etc...
  */ 
-typedef struct
+struct host_queue_packet
 {
-  xrt_packet_header_t xrt_header;	
+  struct xrt_packet_header xrt_header;	
   uint32_t data[12];
-}
-host_queue_packet_t;
+};
 
 /*
  * xrt pkt with random length.
  */ 
-typedef struct
+struct xrt_packet
 {
-  xrt_packet_header_t xrt_header;	
+  struct xrt_packet_header xrt_header;	
   uint64_t xrt_payload_host_addr;
-}
-xrt_packet_t;
-
-#define XRT_PKT_TYPE(p) ((p)->xrt_header.common_header.type)
-#define XRT_PKT_OPCODE(p) ((p)->xrt_header.common_header.opcode)
-#define XRT_PKT_LEN(p) ((p)->xrt_header.common_header.count)
-#define XRT_PKT_DISTRIBUTE(p) ((p)->xrt_header.common_header.distribute)
-#define XRT_PKT_INDIRECT(p) ((p)->xrt_header.common_header.indirect)
-#define XRT_PKT_COMPLETION(p) ((p)->xrt_header.completion_signal)
-#define XRT_PKT_PAYLOAD(p) ((p)->xrt_payload_host_addr)
-
-#define ADDR_HIGH(x)        ((x) >> 32)
-#define ADDR_LOW(x)         ((x) & 0xFFFFFFFF)
-#define MOD_POW2(x, y)      ((x) & ((y) - 1)) 
+};
 
-typedef struct
+struct host_queue
 {
   uint64_t address;
-}
-host_queue_t;
-
-void host_queue_init(host_queue_t *queue, uint64_t address);
-
-xrt_packet_t *host_queue_pop(host_queue_t *queue, bool block);
-
-void host_queue_finish_packet(host_queue_t *queue, xrt_packet_t *packet, uint32_t completion);
+};
 
 #endif
diff --git a/src/shim/umq/hwq.cpp b/src/shim/umq/hwq.cpp
index cbdfde5e..5e03b243 100644
--- a/src/shim/umq/hwq.cpp
+++ b/src/shim/umq/hwq.cpp
@@ -22,13 +22,13 @@ clflush_data(void *data, int len)
 }
 
 inline void
-mark_slot_invalid(volatile host_queue_packet_t *pkt)
+mark_slot_invalid(volatile struct host_queue_packet *pkt)
 {
   pkt->xrt_header.common_header.type = HOST_QUEUE_PACKET_TYPE_INVALID;
 }
 
 inline void
-mark_slot_valid(volatile host_queue_packet_t *pkt)
+mark_slot_valid(volatile struct host_queue_packet *pkt)
 {
   /* Issue mfence instruction to make sure all writes to the slot before is done */
   std::atomic_thread_fence(std::memory_order::memory_order_seq_cst);
@@ -38,7 +38,7 @@ mark_slot_valid(volatile host_queue_packet_t *pkt)
 }
 
 inline bool
-is_slot_valid(volatile host_queue_packet_t *pkt)
+is_slot_valid(volatile struct host_queue_packet *pkt)
 {
   return pkt->xrt_header.common_header.type == HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC;
 }
@@ -47,34 +47,63 @@ is_slot_valid(volatile host_queue_packet_t *pkt)
 
 namespace shim_xdna {
 
+void
+hw_q_umq::
+init_indirect_buf(volatile struct host_indirect_data *indirect_buf, int size)
+{
+  for (int i = 0; i < size; i++) {
+    indirect_buf[i].header.type = HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC;
+    indirect_buf[i].header.opcode = HOST_QUEUE_PACKET_EXEC_BUF;
+    indirect_buf[i].header.count = sizeof(struct exec_buf);
+    indirect_buf[i].header.distribute = 1;
+    indirect_buf[i].header.indirect = 0;
+  }
+}
+
 hw_q_umq::
 hw_q_umq(const device& dev, size_t nslots) : hw_q(dev)
 {
 #ifdef UMQ_HELLO_TEST
   const size_t header_sz = 8192; // Hard code to 2 pages
   const size_t queue_sz = 0;
+  const size_t indirect_sz = 0;
 #else
-  const size_t header_sz = sizeof(host_queue_header_t);
-  const size_t queue_sz = sizeof(host_queue_packet_t) * nslots;
+  //
+  // host queue layout:
+  //   struct host_queue_header
+  //   struct host_queue_packet [nslots]
+  //   struct host_indirect_data [HSA_INDIRECT_PKT_NUM * nslots]
+  const size_t header_sz = sizeof(struct host_queue_header);
+  const size_t queue_sz = sizeof(struct host_queue_packet) * nslots;
+  const size_t indirect_sz = (sizeof(struct host_indirect_data) * HSA_INDIRECT_PKT_NUM) * nslots;
 #endif
-  const size_t umq_sz = header_sz + queue_sz;
+  const size_t umq_sz = header_sz + queue_sz + indirect_sz;
+  shim_debug("umq sz %ld", umq_sz);
 
   m_umq_bo = const_cast<device &>(dev).alloc_bo(umq_sz, XCL_BO_FLAGS_EXECBUF);
   m_umq_bo_buf = m_umq_bo->map(bo::map_type::write);
-  m_umq_hdr = reinterpret_cast<volatile host_queue_header_t *>(m_umq_bo_buf);
-  m_umq_pkt = reinterpret_cast<volatile host_queue_packet_t *>
+  m_umq_hdr = reinterpret_cast<volatile struct host_queue_header *>(m_umq_bo_buf);
+  m_umq_pkt = reinterpret_cast<volatile struct host_queue_packet *>
     ((char *)m_umq_bo_buf + header_sz);
+  m_umq_indirect_buf = reinterpret_cast<volatile struct host_indirect_data *>
+    ((char *)m_umq_bo_buf + header_sz + queue_sz);
 
   // set all mapped memory to 0 
   std::memset(m_umq_bo_buf, 0, umq_sz);
   
-  for (int i = 0; i < nslots; i++)
+  // init slots and indirect buf
+  for (int i = 0; i < nslots; i++) {
     mark_slot_invalid(&m_umq_pkt[i]);
+    init_indirect_buf(&m_umq_indirect_buf[i * HSA_INDIRECT_PKT_NUM], HSA_INDIRECT_PKT_NUM);
+  }
 
   m_umq_hdr->capacity = nslots;
   // data_address starts after header
   m_umq_hdr->data_address = m_umq_bo->get_properties().paddr + header_sz;
 
+  // indirect buf starts after queue
+  m_indirect_paddr = m_umq_hdr->data_address + queue_sz;
+
   // this is the bo handler defined in parent class
   m_queue_boh = static_cast<bo*>(m_umq_bo.get())->get_drm_bo_handle();
 
@@ -98,11 +127,11 @@ map_doorbell(uint32_t doorbell_offset)
     m_pdev.mmap(0, sizeof(uint32_t), PROT_WRITE, MAP_SHARED, doorbell_offset));
 }
 
-volatile host_queue_header_t *
+volatile struct host_queue_header *
 hw_q_umq::
 get_header_ptr() const
 {
-  return reinterpret_cast<volatile host_queue_header_t *>(m_umq_bo_buf);
+  return reinterpret_cast<volatile struct host_queue_header *>(m_umq_bo_buf);
 }
 
 void
@@ -129,9 +158,33 @@ dump() const
     shim_debug("\tdistribute:\t%u", pkt->xrt_header.common_header.distribute);
     shim_debug("\tindirect:\t%u", pkt->xrt_header.common_header.indirect);
     shim_debug("\tcomplete addr:\t%p", pkt->xrt_header.completion_signal);
-    for (int j = 0; j < sizeof(pkt->data) / sizeof(pkt->data[0]); j++)
-      shim_debug("\tdata[%d]:\t0x%08x", j, pkt->data[j]);
+    if (pkt->xrt_header.common_header.indirect == 0) {
+      volatile struct exec_buf *ebp =
+        reinterpret_cast<volatile struct exec_buf *>(pkt->data);
+
+      shim_debug("\tcu_index:\t%d", ebp->cu_index);
+      shim_debug("\tdpu: [0x%x 0x%x]",
+        ebp->dpu_control_code_host_addr_high,
+        ebp->dpu_control_code_host_addr_low);
+    } else {
+      volatile struct host_indirect_packet_entry *hp =
+        reinterpret_cast<volatile struct host_indirect_packet_entry *>(pkt->data);
+
+      for (int i = 0; i < HSA_INDIRECT_PKT_NUM; i++, hp++) {
+        shim_debug("\thost addr: [0x%x 0x%x]", hp->host_addr_high, hp->host_addr_low);
+
+	volatile struct host_indirect_data *data =
+	  reinterpret_cast<volatile struct host_indirect_data *>(m_umq_indirect_buf);
+	shim_debug("\t\th:distribute:\t%d", data[i].header.distribute);
+	shim_debug("\t\th:indirect:\t%d", data[i].header.indirect);
+	shim_debug("\t\tp:cu_index:\t%d", data[i].payload.cu_index);
+	shim_debug("\t\tp:dpu: [0x%x 0x%x]",
+          data[i].payload.dpu_control_code_host_addr_high,
+          data[i].payload.dpu_control_code_host_addr_low);
+      }
+    }
   }
+  shim_debug("dump finished\r\n");
 }
 
 void
@@ -139,7 +192,7 @@ hw_q_umq::
 dump_raw() const
 {
   auto d = reinterpret_cast<volatile uint32_t *>(m_umq_pkt);
-  auto sz = get_header_ptr()->capacity * sizeof(host_queue_packet_t) / sizeof(uint32_t);
+  auto sz = get_header_ptr()->capacity * sizeof(struct host_queue_packet) / sizeof(uint32_t);
   shim_debug("Dumping raw UMQ queue slot data @%p, len=%ld WORDs:", m_umq_pkt, sz);
   for (int i = 0; i < sz; i++)
     shim_debug("0x%08x", d[i]);
@@ -172,11 +225,18 @@ reserve_slot()
   return cur_slot;
 }
 
-volatile host_queue_packet_t *
+int
+hw_q_umq::
+get_pkt_idx(uint64_t index)
+{
+  return index & (get_header_ptr()->capacity - 1);
+}
+
+volatile struct host_queue_packet *
 hw_q_umq::
-get_slot(uint64_t index)
+get_pkt(uint64_t index)
 {
-  auto pkt = &m_umq_pkt[index & (get_header_ptr()->capacity - 1)];
+  auto pkt = &m_umq_pkt[get_pkt_idx(index)];
   if (is_slot_valid(pkt)) {
     shim_err(EINVAL, "Slot is ready before use! index=0x%lx", index);
     dump();
@@ -188,26 +248,102 @@ uint64_t
 hw_q_umq::
 issue_exec_buf(uint16_t cu_idx, ert_dpu_data *dpu, uint64_t comp)
 {
-  auto idx = reserve_slot();
-  auto pkt = get_slot(idx);
+  auto slot_idx = reserve_slot();
+  auto pkt = get_pkt(slot_idx);
+  size_t pkt_size;
+
+  if (get_ert_dpu_data_next(dpu))
+    pkt_size = fill_indirect_exec_buf(slot_idx, cu_idx, pkt, dpu);
+  else
+    pkt_size = fill_direct_exec_buf(cu_idx, pkt, dpu); 
+
   auto hdr = &pkt->xrt_header;
   hdr->common_header.opcode = HOST_QUEUE_PACKET_EXEC_BUF;
-  hdr->common_header.distribute = 0;
-  hdr->common_header.indirect = 0;
   hdr->completion_signal = comp;
 
-  exec_buf_t payload = {};
-  payload.cu_index = cu_idx;
-  payload.dpu_control_code_host_addr_low = static_cast<uint32_t>(dpu->instruction_buffer);
-  payload.dpu_control_code_host_addr_high = static_cast<uint32_t>(dpu->instruction_buffer >> 32);
+  fill_slot_and_send(pkt, pkt_size);
 
-  fill_slot_and_send(pkt, &payload, sizeof(payload));
-  return idx;
+  return slot_idx;
+}
+
+size_t
+hw_q_umq::
+fill_indirect_exec_buf(uint64_t slot_idx, uint16_t cu_idx,
+                        volatile struct host_queue_packet *pkt,
+                        ert_dpu_data *dpu) {
+  auto pkt_size = (dpu->chained + 1) * sizeof(struct host_indirect_packet_entry);
+
+  if (dpu->chained + 1 >= HSA_INDIRECT_PKT_NUM)
+    shim_err(EINVAL, "unsupported indirect number %d, valid number <= %d",
+      dpu->chained + 1, HSA_INDIRECT_PKT_NUM);
+
+  if (pkt_size > sizeof(pkt->data))
+    shim_err(EINVAL, "dpu pkt_size=0x%lx > pkt_data max size=%x%lx",
+      pkt_size, sizeof(pkt->data));
+
+  // no need to memset to zero, all buffer will be set
+  volatile struct host_indirect_packet_entry *hp =
+    reinterpret_cast<volatile struct host_indirect_packet_entry *>(pkt->data);
+
+  for (int i = 0; dpu && dpu->chained >= 0;
+    i++, hp++, dpu = get_ert_dpu_data_next(dpu)) {
+    auto data_size = sizeof(struct host_indirect_data) * HSA_INDIRECT_PKT_NUM;
+    auto prefix_off = get_pkt_idx(slot_idx) * data_size;
+    auto prefix_idx = get_pkt_idx(slot_idx) * HSA_INDIRECT_PKT_NUM;
+    auto buf_paddr = m_indirect_paddr + prefix_off +
+       sizeof(struct host_indirect_data) * i;
+
+    hp->host_addr_low = static_cast<uint32_t>(buf_paddr);
+    hp->host_addr_high = static_cast<uint32_t>(buf_paddr >> 32);
+
+    auto cebp = &m_umq_indirect_buf[prefix_idx + i];
+    // do not zero this buffer, the cebp->header is pre-set 
+    // set every cebp->payload field in case of garbage data
+    cebp->payload.cu_index = cu_idx;
+    cebp->payload.dpu_control_code_host_addr_low =
+      static_cast<uint32_t>(dpu->instruction_buffer);
+    cebp->payload.dpu_control_code_host_addr_high =
+      static_cast<uint32_t>(dpu->instruction_buffer >> 32);
+    cebp->payload.args_len = 0;
+    cebp->payload.args_host_addr_low = 0;
+    cebp->payload.args_host_addr_high = 0;
+  }
+
+  auto hdr = &pkt->xrt_header;
+  hdr->common_header.distribute = 1;
+  hdr->common_header.indirect = 1;
+
+  return pkt_size;
+}
+
+size_t
+hw_q_umq::
+fill_direct_exec_buf(uint16_t cu_idx, volatile struct host_queue_packet *pkt,
+                     ert_dpu_data *dpu) {
+  auto pkt_size = sizeof(struct exec_buf);
+  if (pkt_size > sizeof(pkt->data))
+    shim_err(EINVAL, "dpu pkt_size=0x%lx > pkt_data max size=%x%lx",
+      pkt_size, sizeof(pkt->data));
+  
+  // zero this buffer
+  auto data = const_cast<uint32_t *>(pkt->data);
+  std::memset(data, 0, pkt_size);
+  // set correct dpu control code
+  volatile struct exec_buf *ebp = reinterpret_cast<volatile struct exec_buf *>(pkt->data);
+  ebp->cu_index = cu_idx;
+  ebp->dpu_control_code_host_addr_low = static_cast<uint32_t>(dpu->instruction_buffer);
+  ebp->dpu_control_code_host_addr_high = static_cast<uint32_t>(dpu->instruction_buffer >> 32);
+
+  auto hdr = &pkt->xrt_header;
+  hdr->common_header.distribute = 0;
+  hdr->common_header.indirect = 0;
+
+  return pkt_size;
 }
 
 void
 hw_q_umq::
-fill_slot_and_send(volatile host_queue_packet_t *pkt, void *payload, size_t size)
+fill_slot_and_send(volatile struct host_queue_packet *pkt, size_t size)
 {
   if (size > sizeof(pkt->data))
     shim_err(EINVAL, "HSA packet payload too big, size=0x%lx", size);
@@ -215,10 +351,11 @@ fill_slot_and_send(volatile host_queue_packet_t *pkt, void *payload, size_t size
   auto hdr = &pkt->xrt_header;
   hdr->common_header.count = size;
 
-  auto data = const_cast<uint32_t *>(pkt->data);
-  std::memcpy(data, payload, size);
   /* must flush data to make cache coherence */
-  clflush_data((void *)data, size);
+  clflush_data((void *)(pkt->data), size);
+
+  //comment this out, debug only
+  //dump();
 
   /* Always done as last step. */
   mark_slot_valid(pkt);
@@ -247,9 +384,9 @@ issue_command(xrt_core::buffer_handle *cmd_bo)
   }
 
   if (get_ert_dpu_data_next(dpu_data))
-    shim_err(EOPNOTSUPP, "chained dpu data is not supported yet");
+    shim_debug("this is a multi-column dpu request.");
 
-  // Completion signal area has to be a full WORD
+  // Completion signal area has to be a full WORD, we utilize the command_bo
   uint64_t comp = boh->get_properties().paddr + offsetof(ert_start_kernel_cmd, header);
 
   auto id = issue_exec_buf(ffs(cmd->cu_mask) - 1, dpu_data, comp);
diff --git a/src/shim/umq/hwq.h b/src/shim/umq/hwq.h
index 1d484569..8c8cc707 100644
--- a/src/shim/umq/hwq.h
+++ b/src/shim/umq/hwq.h
@@ -30,14 +30,22 @@ class hw_q_umq : public hw_q
   void
   bind_hwctx(const hw_ctx *ctx);
 
-  volatile host_queue_header_t *
+  volatile struct host_queue_header *
   get_header_ptr() const;
 
 private:
+
+  struct host_indirect_data {
+    struct common_header	header;
+    struct exec_buf		payload;
+  };
+
   std::unique_ptr<xrt_core::buffer_handle> m_umq_bo;
   void *m_umq_bo_buf;
-  volatile host_queue_header_t *m_umq_hdr = nullptr;
-  volatile host_queue_packet_t *m_umq_pkt = nullptr;
+  volatile struct host_queue_header *m_umq_hdr = nullptr;
+  volatile struct host_queue_packet *m_umq_pkt = nullptr;
+  volatile struct host_indirect_data *m_umq_indirect_buf = nullptr;
+  uint64_t m_indirect_paddr;
 
   volatile uint32_t *m_mapped_doorbell = nullptr;
 
@@ -46,11 +54,25 @@ class hw_q_umq : public hw_q
   uint64_t
   reserve_slot();
 
-  volatile host_queue_packet_t *
-  get_slot(uint64_t index);
+  int
+  get_pkt_idx(uint64_t index);
+
+  volatile struct host_queue_packet *
+  get_pkt(uint64_t index);
+
+  void
+  init_indirect_buf(volatile struct host_indirect_data *indirect_buf, int size);
+
+  size_t
+  fill_direct_exec_buf(uint16_t cu_idx,
+    volatile struct host_queue_packet *pkt, ert_dpu_data *dpu);
+
+  size_t 
+  fill_indirect_exec_buf(uint64_t idx, uint16_t cu_idx,
+    volatile struct host_queue_packet *pkt, ert_dpu_data *dpu);
 
   void
-  fill_slot_and_send(volatile host_queue_packet_t *pkt, void *payload, size_t size);
+  fill_slot_and_send(volatile struct host_queue_packet *pkt, size_t size);
 
   uint64_t
   issue_exec_buf(uint16_t cu_idx, ert_dpu_data *dpu_data, uint64_t comp);

From 79c1313439ac4aa84641e8b42dad61f640e03b58 Mon Sep 17 00:00:00 2001
From: Max Zhen <40219623+maxzhen@users.noreply.github.com>
Date: Mon, 9 Sep 2024 19:52:00 -0700
Subject: [PATCH 30/44] allow npu_perf_trace.sh to trace shim_test (#247)

Signed-off-by: Max Zhen <max.zhen@amd.com>
---
 src/driver/tools/npu_perf_analyze.sh |  1 +
 src/driver/tools/npu_perf_trace.sh   | 12 ++++++++++++
 test/shim_test/io_config.h           |  6 ++++--
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/driver/tools/npu_perf_analyze.sh b/src/driver/tools/npu_perf_analyze.sh
index a96cf915..8941fdbb 100755
--- a/src/driver/tools/npu_perf_analyze.sh
+++ b/src/driver/tools/npu_perf_analyze.sh
@@ -67,6 +67,7 @@ while [ $# -gt 0 ]; do
 			;;
 		*)
 			break
+			;;
 	esac
 	shift
 done
diff --git a/src/driver/tools/npu_perf_trace.sh b/src/driver/tools/npu_perf_trace.sh
index 08bb18da..1cdc2bdc 100755
--- a/src/driver/tools/npu_perf_trace.sh
+++ b/src/driver/tools/npu_perf_trace.sh
@@ -69,6 +69,18 @@ fi
 # Global variables
 sdt_pre_enabled=0
 xrt_lib_prefix="/opt/xilinx/xrt/lib"
+while [ $# -gt 0 ]; do
+	case "$1" in
+		-libdir | -l)
+			xrt_lib_prefix=$2
+			shift
+			;;
+		*)
+			break
+			;;
+	esac
+	shift
+done
 accel_debugfs="/sys/kernel/debug/accel"
 xrt_libs="${xrt_lib_prefix}/libxrt_coreutil.so,${xrt_lib_prefix}/libxrt_driver_xdna.so"
 perf_record_args="-e amdxdna_trace:* "
diff --git a/test/shim_test/io_config.h b/test/shim_test/io_config.h
index 27478ef1..915022ef 100644
--- a/test/shim_test/io_config.h
+++ b/test/shim_test/io_config.h
@@ -208,7 +208,7 @@ int verify_output(int8_t* buf, const std::string &wrk_path)
     ss >> key >> str_val;
     ss.clear();
     golden_output_files.push_back(wrk_path + "golden_" + str_val + ".bin");
-    dump_output_files.push_back(wrk_path + "dump_" + str_val + ".bin");
+    dump_output_files.push_back("/tmp/dump_" + str_val + "." + std::to_string(getpid()) + ".bin");
 
     getline(myfile, line);
     ss.str(line);
@@ -239,12 +239,14 @@ int verify_output(int8_t* buf, const std::string &wrk_path)
 
   int ret = 0;
   for (int i = 0; i < num_outputs; i++) {
-    std::cout << "Examing output: " << golden_output_files[i] << std::endl;
     ret = comp_buf_strides(buf + output_ddr_addr[i], golden_output_files[i],
                            dump_output_files[i], output_shapes[i], output_strides[i]);
     if (ret) {
         std::cout << "Examing failed, ret " << ret << std::endl;
+        std::cout << "Examing output: " << dump_output_files[i] << std::endl;
         break;
+    } else {
+        std::remove(dump_output_files[i].c_str());
     }
   }
 

From 2bac8eb3f1ffe9d2815ad468bb107dbf395c4913 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Tue, 10 Sep 2024 12:12:52 -0700
Subject: [PATCH 31/44] update BARs and Telemetry (#248)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/doc/amdnpu.rst | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/src/driver/doc/amdnpu.rst b/src/driver/doc/amdnpu.rst
index b4b5dc2f..7ea04261 100644
--- a/src/driver/doc/amdnpu.rst
+++ b/src/driver/doc/amdnpu.rst
@@ -76,12 +76,28 @@ instance of ERT. Each user channel is bound to its own dedicated mailbox.
 PCIe EP
 -------
 
-NPU is visible to the x86 as a PCIe device with 3 BARS and an MSI-X interrupt
-vector. NPU uses a dedicated high bandwidth SoC level fabric for reading
+NPU is visible to the x86 as a PCIe device with multiple BARs and some MSI-X interrupt
+vectors. NPU uses a dedicated high bandwidth SoC level fabric for reading or
 writing into host memory. Each instance of ERT gets its own dedicated MSI-X
 interrupt. MERT gets a single instance of MSI-X interrupt.
 
-TODO, briefly describe the BARs
+The number of PCIe BARs varies depending on the specific device.
+Based on their functions, PCIe BARs can generally be categorized into the
+following types.
+
+* PSP BAR: Expose the AMD PSP (Platform Security Processor) function
+* SMU BAR: Expose the AMD SMU (System Management Unit) function
+* SRAM BAR: Expose ring buffers for the mailbox
+* Mailbox BAR: Expose the mailbox control registers (head, tail and ISR registers etc.)
+* Public Register BAR: Expose public registers
+
+On specific devices, the above-mentioned BAR types might be combined into a single physical PCIe BAR.
+Or a module might require two physical PCIe BARs to be fully functional.
+For example,
+
+* On AMD Phoenix device, PSP, SMU, Public Register BARs are on PCIe BAR index 0.
+* On AMD Strix Point device, Mailbox and Public Register BARs are on PCIe BAR index 0.
+  The PSP has some registers in PCIe BAR index 0 (Public Register BAR) and PCIe BAR index 4 (PSP BAR).
 
 Process Isolation Hardware
 --------------------------
@@ -244,8 +260,11 @@ driver then decodes the error by reading the contents of the buffer pointer.
 Telemetry
 =========
 
-MERT can report various kinds of telemetry information like
-TODO, list the key ones
+MERT can report various kinds of telemetry information, including:
+
+* L1 interrupt counter
+* DMA counter
+* Deep Sleep counter, etc.
 
 
 References

From dfd9ad5333b5b07b26cf60113a3c44b9acc6abb8 Mon Sep 17 00:00:00 2001
From: Max Zhen <40219623+maxzhen@users.noreply.github.com>
Date: Tue, 10 Sep 2024 12:59:06 -0700
Subject: [PATCH 32/44] fix coverity issues (#249)

Signed-off-by: Max Zhen <max.zhen@amd.com>
---
 src/driver/amdxdna/aie2_pci.c | 2 +-
 src/driver/amdxdna/aie2_smu.c | 2 +-
 src/shim/umq/hwq.cpp          | 3 +--
 test/shim_test/io_config.h    | 3 ++-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/driver/amdxdna/aie2_pci.c b/src/driver/amdxdna/aie2_pci.c
index 9454b707..ae5c85e9 100644
--- a/src/driver/amdxdna/aie2_pci.c
+++ b/src/driver/amdxdna/aie2_pci.c
@@ -789,7 +789,7 @@ static int aie2_get_firmware_version(struct amdxdna_client *client,
 static int aie2_get_power_mode(struct amdxdna_client *client,
 			       struct amdxdna_drm_get_info *args)
 {
-	struct amdxdna_drm_get_power_mode mode;
+	struct amdxdna_drm_get_power_mode mode = {};
 	struct amdxdna_dev *xdna = client->xdna;
 	struct amdxdna_dev_hdl *ndev;
 
diff --git a/src/driver/amdxdna/aie2_smu.c b/src/driver/amdxdna/aie2_smu.c
index c17c672e..abe9a1f9 100644
--- a/src/driver/amdxdna/aie2_smu.c
+++ b/src/driver/amdxdna/aie2_smu.c
@@ -169,7 +169,7 @@ int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level)
 {
 	int ret;
 
-	if (dpm_level < 0 || dpm_level > SMU_DPM_MAX(ndev))
+	if (dpm_level > SMU_DPM_MAX(ndev))
 		return -EINVAL;
 
 	if (!ndev->priv->smu_rev)
diff --git a/src/shim/umq/hwq.cpp b/src/shim/umq/hwq.cpp
index 5e03b243..5c41fa47 100644
--- a/src/shim/umq/hwq.cpp
+++ b/src/shim/umq/hwq.cpp
@@ -285,8 +285,7 @@ fill_indirect_exec_buf(uint64_t slot_idx, uint16_t cu_idx,
   volatile struct host_indirect_packet_entry *hp =
     reinterpret_cast<volatile struct host_indirect_packet_entry *>(pkt->data);
 
-  for (int i = 0; dpu && dpu->chained >= 0;
-    i++, hp++, dpu = get_ert_dpu_data_next(dpu)) {
+  for (int i = 0; dpu; i++, hp++, dpu = get_ert_dpu_data_next(dpu)) {
     auto data_size = sizeof(struct host_indirect_data) * HSA_INDIRECT_PKT_NUM;
     auto prefix_off = get_pkt_idx(slot_idx) * data_size;
     auto prefix_idx = get_pkt_idx(slot_idx) * HSA_INDIRECT_PKT_NUM;
diff --git a/test/shim_test/io_config.h b/test/shim_test/io_config.h
index 915022ef..1d1351db 100644
--- a/test/shim_test/io_config.h
+++ b/test/shim_test/io_config.h
@@ -246,7 +246,8 @@ int verify_output(int8_t* buf, const std::string &wrk_path)
         std::cout << "Examing output: " << dump_output_files[i] << std::endl;
         break;
     } else {
-        std::remove(dump_output_files[i].c_str());
+        if (std::remove(dump_output_files[i].c_str()))
+            std::cout << "Failed to remove " << dump_output_files[i] << std::endl;
     }
   }
 

From da7c4b489158bdd3032bad28c3ead12196ec9bce Mon Sep 17 00:00:00 2001
From: Max Zhen <40219623+maxzhen@users.noreply.github.com>
Date: Wed, 11 Sep 2024 14:53:49 -0700
Subject: [PATCH 33/44] update firmware for npu4/5/6 (#251)

Signed-off-by: Max Zhen <max.zhen@amd.com>
---
 tools/info.json | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/info.json b/tools/info.json
index f6f46174..cfe90782 100644
--- a/tools/info.json
+++ b/tools/info.json
@@ -23,18 +23,18 @@
 		},
 		{
 			"device": "npu4",
-			"url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/17f0_10/npu.sbin.0.7.30.20",
+			"url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/17f0_10/npu.sbin.0.7.35.35",
 			"pci_device_id": "17f0",
 			"pci_revision_id": "10",
-			"version": "0.7.30.20",
+			"version": "0.7.35.35",
 			"fw_name": "npu.sbin"
 		},
 		{
 			"device": "npu5",
-			"url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/17f0_11/npu.sbin.0.7.30.101",
+			"url": "https://gitlab.com/kernel-firmware/drm-firmware/-/raw/amd-ipu-staging/amdnpu/17f0_11/npu.sbin.0.7.35.139",
 			"pci_device_id": "17f0",
 			"pci_revision_id": "11",
-			"version": "0.7.30.101",
+			"version": "0.7.35.139",
 			"fw_name": "npu.sbin"
 		}
 	]

From eddd92c0f61592c576a500f16efa24eb23667c23 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Wed, 11 Sep 2024 14:59:00 -0700
Subject: [PATCH 34/44] Support turbo mode for better latency but burn more
 power on CPU & NPU (#250)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/amdxdna/aie2_message.c          |   7 +-
 src/driver/amdxdna/aie2_msg_priv.h         |   1 -
 src/driver/amdxdna/aie2_pci.c              |   6 +-
 src/driver/amdxdna/aie2_pci.h              |   2 +
 src/driver/amdxdna/aie2_pm.c               |  88 ++++---
 src/driver/amdxdna/amdxdna_mailbox.c       | 273 +++++++++++----------
 src/driver/amdxdna/amdxdna_mailbox.h       |  10 +-
 src/include/uapi/drm_local/amdxdna_accel.h |   1 +
 src/shim/hwq.cpp                           |  11 +
 src/shim/hwq.h                             |   3 +
 test/shim_test/io_param.h                  |   3 +
 test/shim_test/io_test.cpp                 |  73 ++++--
 test/shim_test/shim_test.cpp               |  28 ++-
 test/shim_test/speed.h                     |   1 +
 tools/info.json                            |   2 +-
 xrt                                        |   2 +-
 16 files changed, 318 insertions(+), 193 deletions(-)

diff --git a/src/driver/amdxdna/aie2_message.c b/src/driver/amdxdna/aie2_message.c
index fd83674f..0efc68d4 100644
--- a/src/driver/amdxdna/aie2_message.c
+++ b/src/driver/amdxdna/aie2_message.c
@@ -249,6 +249,7 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct
 {
 	DECLARE_AIE2_MSG(create_ctx, MSG_OP_CREATE_CONTEXT);
 	struct amdxdna_dev *xdna = ndev->xdna;
+	enum xdna_mailbox_channel_type type;
 	struct xdna_mailbox_chann_res x2i;
 	struct xdna_mailbox_chann_res i2x;
 	struct cq_pair *cq_pair;
@@ -287,8 +288,12 @@ int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwct
 	}
 
 	intr_reg = i2x.mb_head_ptr_reg + 4;
+	if (aie2_pm_is_turbo(ndev))
+		type = MB_CHANNEL_USER_POLL;
+	else
+		type = MB_CHANNEL_USER_NORMAL;
 	hwctx->priv->mbox_chann = xdna_mailbox_create_channel(ndev->mbox, &x2i, &i2x,
-							      intr_reg, ret);
+							      intr_reg, ret, type);
 	if (!hwctx->priv->mbox_chann) {
 		XDNA_ERR(xdna, "not able to create channel");
 		ret = -EINVAL;
diff --git a/src/driver/amdxdna/aie2_msg_priv.h b/src/driver/amdxdna/aie2_msg_priv.h
index 663a6084..2d18ef63 100644
--- a/src/driver/amdxdna/aie2_msg_priv.h
+++ b/src/driver/amdxdna/aie2_msg_priv.h
@@ -186,7 +186,6 @@ struct exec_dpu_req {
 	u32     inst_prop_cnt;
 	u32     cu_idx;
 	u32	payload[35];
-
 } __packed;
 
 struct exec_dpu_preempt_req {
diff --git a/src/driver/amdxdna/aie2_pci.c b/src/driver/amdxdna/aie2_pci.c
index ae5c85e9..3f95844d 100644
--- a/src/driver/amdxdna/aie2_pci.c
+++ b/src/driver/amdxdna/aie2_pci.c
@@ -428,7 +428,7 @@ static int aie2_hw_start(struct amdxdna_dev *xdna)
 						       &ndev->mgmt_x2i,
 						       &ndev->mgmt_i2x,
 						       xdna_mailbox_intr_reg,
-						       mgmt_mb_irq);
+						       mgmt_mb_irq, MB_CHANNEL_MGMT);
 	if (!ndev->mgmt_chann) {
 		XDNA_ERR(xdna, "failed to create management mailbox channel");
 		ret = -EINVAL;
@@ -576,6 +576,7 @@ static int aie2_init(struct amdxdna_dev *xdna)
 	aie2_smu_setup(ndev);
 
 	ndev->pw_mode = POWER_MODE_DEFAULT;
+	ndev->clk_gate_enabled = true;
 	ret = aie2_hw_start(xdna);
 	if (ret) {
 		XDNA_ERR(xdna, "start npu failed, ret %d", ret);
@@ -986,9 +987,8 @@ static int aie2_set_power_mode(struct amdxdna_client *client, struct amdxdna_drm
 		return -EFAULT;
 	}
 
-	/* Interpret the given buf->power_mode into the correct power mode*/
 	power_mode = power_state.power_mode;
-	if (power_mode > POWER_MODE_HIGH) {
+	if (power_mode > POWER_MODE_TURBO) {
 		XDNA_ERR(xdna, "Invalid power mode %d", power_mode);
 		return -EINVAL;
 	}
diff --git a/src/driver/amdxdna/aie2_pci.h b/src/driver/amdxdna/aie2_pci.h
index 6ee80555..f0bd4a5a 100644
--- a/src/driver/amdxdna/aie2_pci.h
+++ b/src/driver/amdxdna/aie2_pci.h
@@ -223,6 +223,7 @@ struct amdxdna_dev_hdl {
 	struct aie_metadata		metadata;
 	struct smu			smu;
 	enum amdxdna_power_mode_type	pw_mode;
+	bool				clk_gate_enabled;
 
 	/* Mailbox and the management channel */
 	struct mailbox			*mbox;
@@ -368,6 +369,7 @@ void aie2_stop_ctx_by_col_map(struct amdxdna_client *client, u32 col_map);
 /* aie2_pm.c */
 int aie2_pm_start(struct amdxdna_dev_hdl *ndev);
 void aie2_pm_stop(struct amdxdna_dev_hdl *ndev);
+bool aie2_pm_is_turbo(struct amdxdna_dev_hdl *ndev);
 int aie2_pm_set_mode(struct amdxdna_dev_hdl *ndev, enum amdxdna_power_mode_type target);
 
 #endif /* _AIE2_PCI_H_ */
diff --git a/src/driver/amdxdna/aie2_pm.c b/src/driver/amdxdna/aie2_pm.c
index 64468c72..0a71e5b2 100644
--- a/src/driver/amdxdna/aie2_pm.c
+++ b/src/driver/amdxdna/aie2_pm.c
@@ -5,13 +5,19 @@
 
 #include "aie2_pci.h"
 
-static int aie2_pm_clock_gating(struct amdxdna_dev_hdl *ndev, bool enable)
+static int aie2_pm_clock_gating(struct amdxdna_dev_hdl *ndev,
+				enum amdxdna_power_mode_type target)
 {
 	const struct rt_config_clk_gating *config;
+	bool enable;
 	u32 value;
 	int ret;
 	int i;
 
+	enable = (target != POWER_MODE_TURBO && target != POWER_MODE_HIGH);
+	if (enable == ndev->clk_gate_enabled)
+		return 0;
+
 	config = &ndev->priv->clk_gating;
 	if (enable)
 		value = config->value_enable;
@@ -30,9 +36,40 @@ static int aie2_pm_clock_gating(struct amdxdna_dev_hdl *ndev, bool enable)
 		}
 	}
 
+	if (!ret)
+		ndev->clk_gate_enabled = enable;
+
 	return ret;
 }
 
+bool aie2_pm_is_turbo(struct amdxdna_dev_hdl *ndev)
+{
+	return ndev->pw_mode == POWER_MODE_TURBO;
+}
+
+static int aie2_pm_check_turbo(struct amdxdna_dev_hdl *ndev,
+			       enum amdxdna_power_mode_type prev,
+			       enum amdxdna_power_mode_type next)
+{
+	struct amdxdna_dev *xdna = ndev->xdna;
+	struct amdxdna_client *client;
+
+	if (prev != POWER_MODE_TURBO && next != POWER_MODE_TURBO)
+		return 0;
+
+	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
+	list_for_each_entry(client, &xdna->client_list, node) {
+		bool empty;
+
+		mutex_lock(&client->hwctx_lock);
+		empty = idr_is_empty(&client->hwctx_idr);
+		mutex_unlock(&client->hwctx_lock);
+		if (!empty)
+			return -EBUSY;
+	}
+	return 0;
+}
+
 int aie2_pm_set_mode(struct amdxdna_dev_hdl *ndev, enum amdxdna_power_mode_type target)
 {
 	struct amdxdna_dev *xdna = ndev->xdna;
@@ -44,23 +81,29 @@ int aie2_pm_set_mode(struct amdxdna_dev_hdl *ndev, enum amdxdna_power_mode_type
 	if (target == POWER_MODE_LOW || target == POWER_MODE_MEDIUM)
 		return -EOPNOTSUPP;
 
-	XDNA_DBG(xdna, "Changing power mode from %d to %d", ndev->pw_mode, target);
-	/* Set resource solver power property to the user choice */
+	ret = aie2_pm_check_turbo(ndev, ndev->pw_mode, target);
+	if (ret) {
+		XDNA_WARN(xdna, "Change Turbo mode failed");
+		return ret;
+	}
 
-	/* Set power level within the device */
+	XDNA_DBG(xdna, "Changing power mode from %d to %d", ndev->pw_mode, target);
 
-	/*
-	 * Other mode -> POWER_MODE_HIGH: Turn off clock gating
-	 * POWER_MODE_HIGH -> Other mode: Turn on clock gating
-	 * Otherwise, no change
+	/* TODO:
+	 *switch (ndev->pw_mode) {
+	 *case POWER_MODE_LOW:
+	 *	Set to low DPM level
+	 *case POWER_MODE_MEDIUM:
+	 *	Set to medium DPM level
+	 *case POWER_MODE_HIGH:
+	 *case POWER_MODE_TURBO:
+	 *	Set to highest DPM level
+	 *default:
+	 *	Let driver decide DPM level
+	 *}
 	 */
-	if (target == POWER_MODE_HIGH) {
-		XDNA_DBG(xdna, "Clock gating turning off");
-		ret = aie2_pm_clock_gating(ndev, false);
-	} else if (ndev->pw_mode == POWER_MODE_HIGH) {
-		XDNA_DBG(xdna, "Clock gating turning on");
-		ret = aie2_pm_clock_gating(ndev, true);
-	}
+
+	ret = aie2_pm_clock_gating(ndev, target);
 	if (ret) {
 		XDNA_ERR(xdna, "Failed to config clock gating");
 		return ret;
@@ -73,21 +116,10 @@ int aie2_pm_set_mode(struct amdxdna_dev_hdl *ndev, enum amdxdna_power_mode_type
 
 int aie2_pm_start(struct amdxdna_dev_hdl *ndev)
 {
-	/*
-	 * TODO: should only skip POWER_MODE_DEFAULT.
-	 * Let's make it right after full DPM support is ready
-	 */
-	if (ndev->pw_mode != POWER_MODE_HIGH)
-		return 0;
-
-	return aie2_pm_clock_gating(ndev, false);
+	return aie2_pm_clock_gating(ndev, ndev->pw_mode);
 }
 
 void aie2_pm_stop(struct amdxdna_dev_hdl *ndev)
 {
-	if (ndev->pw_mode != POWER_MODE_HIGH)
-		return;
-
-	/* Clock gating must be turned ON before suspend firmware */
-	aie2_pm_clock_gating(ndev, true);
+	aie2_pm_clock_gating(ndev, POWER_MODE_DEFAULT);
 }
diff --git a/src/driver/amdxdna/amdxdna_mailbox.c b/src/driver/amdxdna/amdxdna_mailbox.c
index f8cd8cc3..409ab26f 100644
--- a/src/driver/amdxdna/amdxdna_mailbox.c
+++ b/src/driver/amdxdna/amdxdna_mailbox.c
@@ -52,8 +52,11 @@
 
 #ifdef AMDXDNA_DEVEL
 int mailbox_polling;
-module_param(mailbox_polling, int, 0644);
-MODULE_PARM_DESC(mailbox_polling, "0:interrupt(default); >0:poll interval in ms; <0: busy poll");
+module_param(mailbox_polling, int, 0444);
+MODULE_PARM_DESC(mailbox_polling, "<=0:interrupt(default); >0:poll interval in ms; <0: busy poll");
+#define MB_DEFAULT_NO_POLL (mailbox_polling <= 0)
+#define MB_PERIODIC_POLL   (mailbox_polling > 0)
+#define MB_FORCE_USER_POLL   (mailbox_polling < 0)
 
 #define MB_TIMER_JIFF msecs_to_jiffies(mailbox_polling)
 #endif
@@ -70,39 +73,39 @@ struct mailbox {
 	/* protect channel list */
 	struct mutex		mbox_lock;
 	struct list_head        chann_list;
-#ifdef AMDXDNA_DEVEL
+	struct list_head        poll_chann_list;
 	struct task_struct	*polld;
 	struct wait_queue_head	poll_wait;
 	bool			sent_msg; /* For polld */
-#endif
-
 #if defined(CONFIG_DEBUG_FS)
 	struct list_head        res_records;
 #endif /* CONFIG_DEBUG_FS */
-
 };
 
 #if defined(CONFIG_DEBUG_FS)
 struct mailbox_res_record {
+	enum xdna_mailbox_channel_type	type;
 	struct list_head		re_entry;
 	struct xdna_mailbox_chann_res	re_x2i;
 	struct xdna_mailbox_chann_res	re_i2x;
 	int				re_irq;
+	int				active;
 };
 #endif /* CONFIG_DEBUG_FS */
 
 struct mailbox_channel {
 	struct mailbox			*mb;
 #if defined(CONFIG_DEBUG_FS)
-	struct list_head		chann_entry;
 	struct mailbox_res_record	*record;
 #endif
+	struct list_head		chann_entry;
 	struct xdna_mailbox_chann_res	res[CHAN_RES_NUM];
 	int				msix_irq;
+	u32				x2i_tail;
 	u32				iohub_int_addr;
+	enum xdna_mailbox_channel_type	type;
 	struct idr			chan_idr;
 	spinlock_t			chan_idr_lock; /* protect idr operations */
-	u32				x2i_tail;
 
 	/* Received msg related fields */
 	struct workqueue_struct		*work_q;
@@ -163,17 +166,23 @@ static u32 mailbox_reg_read(struct mailbox_channel *mb_chann, u32 mbox_reg)
 static int mailbox_tail_read_non_zero(struct mailbox_channel *mb_chann, u32 *val)
 {
 	u32 mbox_reg = mb_chann->res[CHAN_RES_I2X].mb_tail_ptr_reg;
+	u32 ringbuf_size = mb_chann->res[CHAN_RES_I2X].rb_size;
 	struct xdna_mailbox_res *mb_res = &mb_chann->mb->res;
 	u64 ringbuf_addr = mb_res->mbox_base + mbox_reg;
-	int ret, value;
+	int ret, tail;
 
-	/* Poll till value is not zero */
-	ret = readx_poll_timeout(ioread32, (void *)ringbuf_addr, value,
-				 value, 1 /* us */, 100);
+	/* Poll till tail is not zero */
+	ret = readx_poll_timeout(ioread32, (void *)ringbuf_addr, tail,
+				 tail, 0 /* tight-loops */, 100 /* us timeout */);
 	if (ret < 0)
 		return ret;
 
-	*val = value;
+	if (unlikely(tail > ringbuf_size || !IS_ALIGNED(tail, 4))) {
+		MB_WARN_ONCE(mb_chann, "Invalid tail 0x%x", tail);
+		return -EINVAL;
+	}
+
+	*val = tail;
 	return 0;
 }
 
@@ -350,6 +359,12 @@ mailbox_get_resp(struct mailbox_channel *mb_chann, struct xdna_msg_header *heade
 	return ret;
 }
 
+/*
+ * mailbox_get_msg() is the key function to get message from ring buffer.
+ * If it returns 0, means 1 message was consumed.
+ * If it returns -ENOENT, means ring buffer is empty.
+ * If it returns other value, means ERROR.
+ */
 static inline int mailbox_get_msg(struct mailbox_channel *mb_chann)
 {
 	struct xdna_msg_header header;
@@ -360,19 +375,15 @@ static inline int mailbox_get_msg(struct mailbox_channel *mb_chann)
 	u64 read_addr;
 	int ret;
 
-	if (mailbox_tail_read_non_zero(mb_chann, &tail)) {
+	ret = mailbox_tail_read_non_zero(mb_chann, &tail);
+	if (ret) {
 		MB_WARN_ONCE(mb_chann, "Zero tail too long");
-		return -EINVAL;
+		return ret;
 	}
 	head = mb_chann->i2x_head;
 	ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_I2X);
 	start_addr = mb_chann->res[CHAN_RES_I2X].rb_start_addr;
 
-	if (unlikely(tail > ringbuf_size || !IS_ALIGNED(tail, 4))) {
-		MB_WARN_ONCE(mb_chann, "Invalid tail 0x%x", tail);
-		return -EINVAL;
-	}
-
 	/* ringbuf empty */
 	if (head == tail)
 		return -ENOENT;
@@ -390,9 +401,17 @@ static inline int mailbox_get_msg(struct mailbox_channel *mb_chann)
 				     head, tail);
 			return -EINVAL;
 		}
-		mailbox_set_headptr(mb_chann, 0);
-		ret = 0;
-		goto done;
+
+		/* Read from beginning of ringbuf */
+		head = 0;
+		ret = mailbox_tail_read_non_zero(mb_chann, &tail);
+		if (ret) {
+			MB_WARN_ONCE(mb_chann, "Hit tombstone, re-read tail failed");
+			return -EINVAL;
+		}
+		/* Re-peek size of the message */
+		read_addr = mb_chann->mb->res.ringbuf_base + start_addr;
+		header.total_size = ioread32((void *)read_addr);
 	}
 
 	if (unlikely(!header.total_size || !IS_ALIGNED(header.total_size, 4))) {
@@ -418,8 +437,6 @@ static inline int mailbox_get_msg(struct mailbox_channel *mb_chann)
 	/* After update head, it can equal to ringbuf_size. This is expected. */
 	trace_mbox_set_head(MAILBOX_NAME, mb_chann->msix_irq,
 			    header.opcode, header.id);
-
-done:
 	return ret;
 }
 
@@ -462,6 +479,8 @@ static irqreturn_t mailbox_irq_handler(int irq, void *p)
 	int i;
 
 	trace_mbox_irq_handle(MAILBOX_NAME, irq);
+	if (mb_chann->type == MB_CHANNEL_USER_POLL)
+		return IRQ_HANDLED;
 	/* Clear IOHUB register */
 	mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0);
 	/* Schedule a rx_work to call the callback functions */
@@ -492,6 +511,7 @@ static void mailbox_timer(struct timer_list *t)
 
 	mod_timer(&mb_chann->timer, jiffies + MB_TIMER_JIFF);
 }
+#endif
 
 static void mailbox_polld_handle_chann(struct mailbox_channel *mb_chann)
 {
@@ -545,7 +565,10 @@ static bool mailbox_polld_event(struct mailbox *mb)
 	struct mailbox_channel *mb_chann;
 
 	mutex_lock(&mb->mbox_lock);
-	list_for_each_entry(mb_chann, &mb->chann_list, chann_entry) {
+	list_for_each_entry(mb_chann, &mb->poll_chann_list, chann_entry) {
+		if (mb_chann->type == MB_CHANNEL_MGMT)
+			break;
+
 		if (mailbox_channel_no_msg(mb_chann))
 			continue;
 
@@ -574,13 +597,11 @@ static int mailbox_polld(void *data)
 			continue;
 
 		mutex_lock(&mb->mbox_lock);
-		if (unlikely(list_empty(&mb->chann_list))) {
-			mutex_unlock(&mb->mbox_lock);
-			continue;
-		}
-
 		chann_all_empty = true;
-		list_for_each_entry(mb_chann, &mb->chann_list, chann_entry) {
+		list_for_each_entry(mb_chann, &mb->poll_chann_list, chann_entry) {
+			if (mb_chann->type == MB_CHANNEL_MGMT)
+				break;
+
 			if (mailbox_channel_no_msg(mb_chann))
 				continue;
 
@@ -602,7 +623,6 @@ static int mailbox_polld(void *data)
 
 	return 0;
 }
-#endif
 
 int xdna_mailbox_send_msg(struct mailbox_channel *mb_chann,
 			  const struct xdna_mailbox_msg *msg, u64 tx_timeout)
@@ -669,10 +689,8 @@ int xdna_mailbox_send_msg(struct mailbox_channel *mb_chann,
 		goto release_id;
 	}
 
-#ifdef AMDXDNA_DEVEL
-	if (mb_chann->mb->polld)
+	if (mb_chann->type == MB_CHANNEL_USER_POLL)
 		mailbox_polld_wakeup(mb_chann->mb);
-#endif
 	return 0;
 
 release_id:
@@ -683,42 +701,75 @@ int xdna_mailbox_send_msg(struct mailbox_channel *mb_chann,
 }
 
 #if defined(CONFIG_DEBUG_FS)
+static struct mailbox_res_record *
+xdna_mailbox_get_record(struct mailbox *mb, int mb_irq,
+			const struct xdna_mailbox_chann_res *x2i,
+			const struct xdna_mailbox_chann_res *i2x,
+			enum xdna_mailbox_channel_type type)
+{
+	struct mailbox_res_record *record;
+	int record_found = 0;
+
+	mutex_lock(&mb->mbox_lock);
+	list_for_each_entry(record, &mb->res_records, re_entry) {
+		if (record->re_irq != mb_irq)
+			continue;
+
+		record_found = 1;
+		break;
+	}
+
+	if (record_found) {
+		record->type = type;
+		goto found;
+	}
+
+	record = kzalloc(sizeof(*record), GFP_KERNEL);
+	if (!record)
+		goto out;
+	list_add_tail(&record->re_entry, &mb->res_records);
+	record->re_irq = mb_irq;
+
+found:
+	record->type = type;
+	memcpy(&record->re_x2i, x2i, sizeof(*x2i));
+	memcpy(&record->re_i2x, i2x, sizeof(*i2x));
+out:
+	mutex_unlock(&mb->mbox_lock);
+	return record;
+}
+
 int xdna_mailbox_info_show(struct mailbox *mb, struct seq_file *m)
 {
-	static const char ring_fmt[] = "%4d  %3s  %5d  0x%08x  0x%04x  ";
+	static const char ring_fmt[] = "%4d  %3s  %5d  %4d  0x%08x  0x%04x  ";
 	static const char mbox_fmt[] = "0x%08x  0x%08x  0x%04x    0x%04x\n";
 	struct mailbox_res_record *record;
-	struct mailbox_channel *chann;
 
 	/* If below two puts changed, make sure update fmt[] as well */
-	seq_puts(m, "mbox  dir  alive  ring addr   size    ");
+	seq_puts(m, "mbox  dir  alive  type  ring addr   size    ");
 	seq_puts(m, "head ptr    tail ptr    head val  tail val\n");
 
 #define xdna_mbox_dump_queue(_dir, _act) \
-	{ \
-		u32 head_ptr, tail_ptr, head_val, tail_val; \
-		u32 rb_start, rb_size; \
-		u32 mbox_irq; \
-		mbox_irq = record->re_irq; \
-		rb_start = record->re_##_dir.rb_start_addr; \
-		rb_size = record->re_##_dir.rb_size; \
-		head_ptr = record->re_##_dir.mb_head_ptr_reg; \
-		tail_ptr = record->re_##_dir.mb_tail_ptr_reg; \
-		head_val = ioread32((void *)(mb->res.mbox_base + head_ptr)); \
-		tail_val = ioread32((void *)(mb->res.mbox_base + tail_ptr)); \
-		seq_printf(m, ring_fmt, mbox_irq, #_dir, _act, rb_start, rb_size); \
-		seq_printf(m, mbox_fmt, head_ptr, tail_ptr, head_val, tail_val); \
-	}
+{ \
+	u32 head_ptr, tail_ptr, head_val, tail_val; \
+	u32 rb_start, rb_size; \
+	u32 mbox_irq; \
+	u32 type; \
+	type = record->type; \
+	mbox_irq = record->re_irq; \
+	rb_start = record->re_##_dir.rb_start_addr; \
+	rb_size = record->re_##_dir.rb_size; \
+	head_ptr = record->re_##_dir.mb_head_ptr_reg; \
+	tail_ptr = record->re_##_dir.mb_tail_ptr_reg; \
+	head_val = ioread32((void *)(mb->res.mbox_base + head_ptr)); \
+	tail_val = ioread32((void *)(mb->res.mbox_base + tail_ptr)); \
+	seq_printf(m, ring_fmt, mbox_irq, #_dir, _act, type, rb_start, rb_size); \
+	seq_printf(m, mbox_fmt, head_ptr, tail_ptr, head_val, tail_val); \
+}
 	mutex_lock(&mb->mbox_lock);
 	list_for_each_entry(record, &mb->res_records, re_entry) {
-		int active = 0;
-
-		list_for_each_entry(chann, &mb->chann_list, chann_entry) {
-			if (record->re_irq == chann->msix_irq)
-				active = 1;
-		}
-		xdna_mbox_dump_queue(x2i, active);
-		xdna_mbox_dump_queue(i2x, active);
+		xdna_mbox_dump_queue(x2i, record->active);
+		xdna_mbox_dump_queue(i2x, record->active);
 	}
 	mutex_unlock(&mb->mbox_lock);
 
@@ -760,42 +811,17 @@ struct mailbox_channel *
 xdna_mailbox_create_channel(struct mailbox *mb,
 			    const struct xdna_mailbox_chann_res *x2i,
 			    const struct xdna_mailbox_chann_res *i2x,
-			    u32 iohub_int_addr,
-			    int mb_irq)
+			    u32 iohub_int_addr, int mb_irq,
+			    enum xdna_mailbox_channel_type type)
 {
 	struct mailbox_channel *mb_chann;
 	int ret;
 #if defined(CONFIG_DEBUG_FS)
 	struct mailbox_res_record *record;
-	int record_found = 0;
-
-	mutex_lock(&mb->mbox_lock);
-	list_for_each_entry(record, &mb->res_records, re_entry) {
-		if (record->re_irq != mb_irq)
-			continue;
-
-		record_found = 1;
-		break;
-	}
-
-	if (record_found)
-		goto skip_record;
-
-	record = kzalloc(sizeof(*record), GFP_KERNEL);
-	if (!record) {
-		mutex_unlock(&mb->mbox_lock);
-		return NULL;
-	}
-
-	memcpy(&record->re_x2i, x2i, sizeof(*x2i));
-	memcpy(&record->re_i2x, i2x, sizeof(*i2x));
-	record->re_irq = mb_irq;
-
 	/* Record will be released when mailbox device destroy*/
-	list_add_tail(&record->re_entry, &mb->res_records);
-
-skip_record:
-	mutex_unlock(&mb->mbox_lock);
+	record = xdna_mailbox_get_record(mb, mb_irq, x2i, i2x, type);
+	if (!record)
+		return NULL;
 #endif /* CONFIG_DEBUG_FS */
 
 	if (!is_power_of_2(x2i->rb_size) || !is_power_of_2(i2x->rb_size)) {
@@ -808,6 +834,11 @@ xdna_mailbox_create_channel(struct mailbox *mb,
 		return NULL;
 
 	mb_chann->mb = mb;
+	mb_chann->type = type;
+#ifdef AMDXDNA_DEVEL
+	if (type != MB_CHANNEL_MGMT && MB_FORCE_USER_POLL)
+		mb_chann->type = MB_CHANNEL_USER_POLL;
+#endif
 	mb_chann->msix_irq = mb_irq;
 	mb_chann->iohub_int_addr = iohub_int_addr;
 	memcpy(&mb_chann->res[CHAN_RES_X2I], x2i, sizeof(*x2i));
@@ -817,10 +848,7 @@ xdna_mailbox_create_channel(struct mailbox *mb,
 	idr_init(&mb_chann->chan_idr);
 	mb_chann->x2i_tail = mailbox_get_tailptr(mb_chann, CHAN_RES_X2I);
 	mb_chann->i2x_head = mailbox_get_headptr(mb_chann, CHAN_RES_I2X);
-#ifdef AMDXDNA_DEVEL
-	if (mb->polld)
-		goto skip_irq;
-#endif
+	mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0);
 
 	INIT_WORK(&mb_chann->rx_work, mailbox_rx_worker);
 	mb_chann->work_q = alloc_ordered_workqueue(MAILBOX_NAME, 0);
@@ -830,7 +858,7 @@ xdna_mailbox_create_channel(struct mailbox *mb,
 	}
 
 #ifdef AMDXDNA_DEVEL
-	if (mailbox_polling > 0) {
+	if (MB_PERIODIC_POLL) {
 		/* Poll response every few ms. Good for bring up a new device */
 		timer_setup(&mb_chann->timer, mailbox_timer, 0);
 
@@ -852,13 +880,18 @@ xdna_mailbox_create_channel(struct mailbox *mb,
 #endif
 	mb_chann->bad_state = false;
 	mutex_lock(&mb->mbox_lock);
-	list_add(&mb_chann->chann_entry, &mb->chann_list);
-	mutex_unlock(&mb->mbox_lock);
-
+	if (mb_chann->type == MB_CHANNEL_USER_POLL)
+		list_add_tail(&mb_chann->chann_entry, &mb->poll_chann_list);
+	else
+		list_add_tail(&mb_chann->chann_entry, &mb->chann_list);
 #if defined(CONFIG_DEBUG_FS)
 	mb_chann->record = record;
+	record->active = 1;
 #endif
-	MB_DBG(mb_chann, "Mailbox channel created (irq: %d)", mb_chann->msix_irq);
+	mutex_unlock(&mb->mbox_lock);
+
+	MB_DBG(mb_chann, "Mailbox channel created type %d (irq: %d)",
+	       mb_chann->type, mb_chann->msix_irq);
 	return mb_chann;
 
 destroy_wq:
@@ -875,13 +908,13 @@ int xdna_mailbox_destroy_channel(struct mailbox_channel *mb_chann)
 
 	mutex_lock(&mb_chann->mb->mbox_lock);
 	list_del(&mb_chann->chann_entry);
+#if defined(CONFIG_DEBUG_FS)
+	mb_chann->record->active = 0;
+#endif
 	mutex_unlock(&mb_chann->mb->mbox_lock);
 
 #ifdef AMDXDNA_DEVEL
-	if (mb_chann->mb->polld)
-		goto free_msg;
-
-	if (mailbox_polling > 0)
+	if (MB_PERIODIC_POLL)
 		goto destroy_wq;
 #endif
 	free_irq(mb_chann->msix_irq, mb_chann);
@@ -892,13 +925,11 @@ int xdna_mailbox_destroy_channel(struct mailbox_channel *mb_chann)
 	destroy_workqueue(mb_chann->work_q);
 	/* We can clean up and release resources */
 
-#ifdef AMDXDNA_DEVEL
-free_msg:
-#endif
 	idr_for_each(&mb_chann->chan_idr, mailbox_release_msg, mb_chann);
 	idr_destroy(&mb_chann->chan_idr);
 
-	MB_DBG(mb_chann, "Mailbox channel destroyed, irq: %d", mb_chann->msix_irq);
+	MB_DBG(mb_chann, "Mailbox channel destroyed type %d irq: %d",
+	       mb_chann->type, mb_chann->msix_irq);
 	kfree(mb_chann);
 	return 0;
 }
@@ -909,10 +940,7 @@ void xdna_mailbox_stop_channel(struct mailbox_channel *mb_chann)
 		return;
 
 #ifdef AMDXDNA_DEVEL
-	if (mb_chann->mb->polld)
-		return;
-
-	if (mailbox_polling > 0) {
+	if (MB_PERIODIC_POLL) {
 		timer_delete_sync(&mb_chann->timer);
 		goto skip_irq;
 	}
@@ -943,11 +971,13 @@ struct mailbox *xdna_mailbox_create(struct device *dev,
 
 	mutex_init(&mb->mbox_lock);
 	INIT_LIST_HEAD(&mb->chann_list);
-#ifdef AMDXDNA_DEVEL
-	if (mailbox_polling >= 0)
-		goto skip_polld;
+	INIT_LIST_HEAD(&mb->poll_chann_list);
 
-	/* Launch per device busy polling kthread */
+	/*
+	 * The polld kthread will only wakeup and handle those
+	 * MB_CHANNEL_USER_POLL channels. If there is nothing to do, polld should
+	 * just sleep. It is a per device kthread.
+	 */
 	mb->polld = kthread_run(mailbox_polld, mb, MAILBOX_NAME);
 	if (IS_ERR(mb->polld)) {
 		dev_err(mb->dev, "Failed to create polld ret %ld", PTR_ERR(mb->polld));
@@ -956,8 +986,6 @@ struct mailbox *xdna_mailbox_create(struct device *dev,
 	}
 	init_waitqueue_head(&mb->poll_wait);
 	mb->sent_msg = false;
-skip_polld:
-#endif
 
 #if defined(CONFIG_DEBUG_FS)
 	INIT_LIST_HEAD(&mb->res_records);
@@ -981,18 +1009,11 @@ void xdna_mailbox_destroy(struct mailbox *mb)
 	}
 done_release_record:
 #endif /* CONFIG_DEBUG_FS */
-#ifdef AMDXDNA_DEVEL
-	if (mailbox_polling >= 0)
-		goto skip_polld;
-
 	dev_dbg(mb->dev, "Stopping polld");
 	(void)kthread_stop(mb->polld);
-skip_polld:
-#endif
 
 	mutex_lock(&mb->mbox_lock);
-	if (!list_empty(&mb->chann_list))
-		WARN_ON("Channel not destroy");
+	WARN_ONCE(!list_empty(&mb->chann_list), "Channel not destroy");
 	mutex_unlock(&mb->mbox_lock);
 
 	mutex_destroy(&mb->mbox_lock);
diff --git a/src/driver/amdxdna/amdxdna_mailbox.h b/src/driver/amdxdna/amdxdna_mailbox.h
index 2e114644..8ac677d9 100644
--- a/src/driver/amdxdna/amdxdna_mailbox.h
+++ b/src/driver/amdxdna/amdxdna_mailbox.h
@@ -80,6 +80,13 @@ struct mailbox *xdna_mailbox_create(struct device *dev,
  */
 void xdna_mailbox_destroy(struct mailbox *mailbox);
 
+enum xdna_mailbox_channel_type {
+	MB_CHANNEL_MGMT = 0,
+	MB_CHANNEL_USER_NORMAL,
+	MB_CHANNEL_USER_POLL,
+	MB_CHANNEL_MAX_TYPE,
+};
+
 /*
  * xdna_mailbox_create_channel() -- Create a mailbox channel instance
  *
@@ -88,6 +95,7 @@ void xdna_mailbox_destroy(struct mailbox *mailbox);
  * @i2x: firmware to host mailbox resources
  * @xdna_mailbox_intr_reg: register addr of MSI-X interrupt
  * @mb_irq: Linux IRQ number associated with mailbox MSI-X interrupt vector index
+ * @type: Type of channel
  *
  * Return: If success, return a handle of mailbox channel. Otherwise, return NULL.
  */
@@ -96,7 +104,7 @@ xdna_mailbox_create_channel(struct mailbox *mailbox,
 			    const struct xdna_mailbox_chann_res *x2i,
 			    const struct xdna_mailbox_chann_res *i2x,
 			    u32 xdna_mailbox_intr_reg,
-			    int mb_irq);
+			    int mb_irq, enum xdna_mailbox_channel_type type);
 
 /*
  * xdna_mailbox_destroy_channel() -- destroy mailbox channel
diff --git a/src/include/uapi/drm_local/amdxdna_accel.h b/src/include/uapi/drm_local/amdxdna_accel.h
index 134ef87b..a9d0146a 100644
--- a/src/include/uapi/drm_local/amdxdna_accel.h
+++ b/src/include/uapi/drm_local/amdxdna_accel.h
@@ -461,6 +461,7 @@ enum amdxdna_power_mode_type {
 	POWER_MODE_LOW,     /**< Set frequency to lowest DPM */
 	POWER_MODE_MEDIUM,  /**< Set frequency to medium DPM */
 	POWER_MODE_HIGH,    /**< Set frequency to highest DPM */
+	POWER_MODE_TURBO,   /**< More power, more performance */
 };
 
 /**
diff --git a/src/shim/hwq.cpp b/src/shim/hwq.cpp
index 2bda0db1..9a4c6b39 100644
--- a/src/shim/hwq.cpp
+++ b/src/shim/hwq.cpp
@@ -99,10 +99,21 @@ submit_command(xrt_core::buffer_handle *cmd)
   }
 }
 
+int
+hw_q::
+poll_command(xrt_core::buffer_handle *cmd) const
+{
+  auto cmdpkt = reinterpret_cast<ert_packet *>(cmd->map(xrt_core::buffer_handle::map_type::write));
+  return (cmdpkt->state >= ERT_CMD_STATE_COMPLETED) ? 1 : 0;
+}
+
 int
 hw_q::
 wait_command(xrt_core::buffer_handle *cmd, uint32_t timeout_ms) const
 {
+  if (poll_command(cmd))
+      return 1;
+
   auto pkt = get_chained_command_pkt(cmd);
   if (!m_pdev.is_force_unchained_command() || !pkt)
     return wait_cmd(m_pdev, m_hwctx, cmd, timeout_ms);
diff --git a/src/shim/hwq.h b/src/shim/hwq.h
index ce2c1c83..afb9ca97 100644
--- a/src/shim/hwq.h
+++ b/src/shim/hwq.h
@@ -20,6 +20,9 @@ class hw_q : public xrt_core::hwqueue_handle
   void
   submit_command(xrt_core::buffer_handle *) override;
 
+  int
+  poll_command(xrt_core::buffer_handle *) const override;
+
   int
   wait_command(xrt_core::buffer_handle *, uint32_t timeout_ms) const override;
 
diff --git a/test/shim_test/io_param.h b/test/shim_test/io_param.h
index b86446a6..452c1076 100644
--- a/test/shim_test/io_param.h
+++ b/test/shim_test/io_param.h
@@ -14,6 +14,9 @@ struct io_test_parameter {
 #define IO_TEST_NOOP_RUN      1
 #define IO_TEST_BAD_RUN       2
   int type;
+#define IO_TEST_IOCTL_WAIT    0
+#define IO_TEST_POLL_WAIT     1
+  int wait;
   bool debug;
 };
 
diff --git a/test/shim_test/io_test.cpp b/test/shim_test/io_test.cpp
index 8ff5e6d7..74df0346 100644
--- a/test/shim_test/io_test.cpp
+++ b/test/shim_test/io_test.cpp
@@ -19,10 +19,11 @@ namespace {
 io_test_parameter io_test_parameters;
 
 void
-io_test_parameter_init(int perf, int type, bool debug = false)
+io_test_parameter_init(int perf, int type, int wait, bool debug = false)
 {
   io_test_parameters.perf = perf;
   io_test_parameters.type = type;
+  io_test_parameters.wait = wait;
   io_test_parameters.debug = debug;
 }
 
@@ -82,7 +83,15 @@ io_test_init_runlist_cmd(bo* cmd_bo, std::vector<bo*>& cmd_bos)
   }
 }
 
-#define IO_TEST_TIMEOUT 5000 /* millisecond */
+void io_test_cmd_wait(hwqueue_handle *hwq, std::shared_ptr<bo> bo)
+{
+    if (io_test_parameters.wait == IO_TEST_POLL_WAIT) {
+        while(!hwq->poll_command(bo->get()));
+    } else {
+        hwq->wait_command(bo->get(), 0);
+    }
+}
+
 void
 io_test_cmd_submit_and_wait_latency(
   hwqueue_handle *hwq,
@@ -96,9 +105,10 @@ io_test_cmd_submit_and_wait_latency(
   while (completed < total_cmd_submission) {
     for (auto& cmd : cmdlist_bos) {
         hwq->submit_command(std::get<0>(cmd).get()->get());
-        hwq->wait_command(std::get<0>(cmd).get()->get(), IO_TEST_TIMEOUT);
+        io_test_cmd_wait(hwq, std::get<0>(cmd));
         if (std::get<1>(cmd)->state != ERT_CMD_STATE_COMPLETED)
           throw std::runtime_error("Command error");
+        std::get<1>(cmd)->state = ERT_CMD_STATE_NEW;
         completed++;
         if (completed >= total_cmd_submission)
           break;
@@ -125,9 +135,10 @@ io_test_cmd_submit_and_wait_thruput(
   }
 
   while (completed < issued) {
-    hwq->wait_command(std::get<0>(cmdlist_bos[wait_idx]).get()->get(), IO_TEST_TIMEOUT);
+    io_test_cmd_wait(hwq, std::get<0>(cmdlist_bos[wait_idx]));
     if (std::get<1>(cmdlist_bos[wait_idx])->state != ERT_CMD_STATE_COMPLETED)
       throw std::runtime_error("Command error");
+    std::get<1>(cmdlist_bos[wait_idx])->state = ERT_CMD_STATE_NEW;
     completed++;
 
     if (issued < total_cmd_submission) {
@@ -235,47 +246,63 @@ io_test(device::id_type id, device* dev, int total_hwq_submit, int num_cmdlist,
 void
 TEST_io(device::id_type id, std::shared_ptr<device> sdev, arg_type& arg)
 {
-  io_test_parameter_init(IO_TEST_NO_PERF, static_cast<unsigned int>(arg[0]));
+  unsigned int run_type = static_cast<unsigned int>(arg[0]);
+
+  io_test_parameter_init(IO_TEST_NO_PERF, run_type, IO_TEST_IOCTL_WAIT);
   io_test(id, sdev.get(), 1, 1, arg[1]);
 }
 
 void
 TEST_io_latency(device::id_type id, std::shared_ptr<device> sdev, arg_type& arg)
 {
-  io_test_parameter_init(IO_TEST_LATENCY_PERF, static_cast<unsigned int>(arg[0]));
-  io_test(id, sdev.get(), 1000, 1, 1);
+  unsigned int run_type = static_cast<unsigned int>(arg[0]);
+  unsigned int wait_type = static_cast<unsigned int>(arg[1]);
+  unsigned int total = static_cast<unsigned int>(arg[2]);
+
+  io_test_parameter_init(IO_TEST_LATENCY_PERF, run_type, wait_type);
+  io_test(id, sdev.get(), total, 1, 1);
 }
 
 void
-TEST_io_runlist_latency(device::id_type id, std::shared_ptr<device> sdev, arg_type& arg)
+TEST_io_throughput(device::id_type id, std::shared_ptr<device> sdev, arg_type& arg)
 {
-  io_test_parameter_init(IO_TEST_LATENCY_PERF, static_cast<unsigned int>(arg[0]));
-  io_test(id, sdev.get(), 32000, 1,  1);
-  io_test(id, sdev.get(), 16000, 1,  2);
-  io_test(id, sdev.get(),  8000, 1,  4);
-  io_test(id, sdev.get(),  4000, 1,  8);
-  io_test(id, sdev.get(),  2000, 1, 16);
-  io_test(id, sdev.get(),  1333, 1, 24);
+  unsigned int run_type = static_cast<unsigned int>(arg[0]);
+  unsigned int wait_type = static_cast<unsigned int>(arg[1]);
+  unsigned int total = static_cast<unsigned int>(arg[2]);
+
+  io_test_parameter_init(IO_TEST_THRUPUT_PERF, run_type, wait_type);
+  io_test(id, sdev.get(), total, 8, 1);
 }
 
 void
-TEST_io_e_throughput(device::id_type id, std::shared_ptr<device> sdev, arg_type& arg)
+TEST_io_runlist_latency(device::id_type id, std::shared_ptr<device> sdev, arg_type& arg)
 {
-  io_test_parameter_init(IO_TEST_THRUPUT_PERF, static_cast<unsigned int>(arg[0]));
-  io_test(id, sdev.get(), 32000, 8, 1);
+  unsigned int run_type = static_cast<unsigned int>(arg[0]);
+  unsigned int wait_type = static_cast<unsigned int>(arg[1]);
+  unsigned int total = static_cast<unsigned int>(arg[2]);
+  const size_t max_cmd_per_list = 24;
+
+  io_test_parameter_init(IO_TEST_LATENCY_PERF, run_type, wait_type);
+  for (int cmds_per_list = 1; cmds_per_list <=32; cmds_per_list *=2) {
+    if (cmds_per_list > max_cmd_per_list)
+      cmds_per_list = max_cmd_per_list;
+    int total_hwq_submit = total / cmds_per_list;
+    io_test(id, sdev.get(), total_hwq_submit, 1, cmds_per_list);
+  }
 }
 
 void
-TEST_io_throughput(device::id_type id, std::shared_ptr<device> sdev, arg_type& arg)
+TEST_io_runlist_throughput(device::id_type id, std::shared_ptr<device> sdev, arg_type& arg)
 {
+  unsigned int run_type = static_cast<unsigned int>(arg[0]);
+  unsigned int wait_type = static_cast<unsigned int>(arg[1]);
+  unsigned int total_commands = static_cast<unsigned int>(arg[2]);
   int num_bo_set = 256;
-  int total_commands = 32000;
   const size_t max_cmd_per_list = 24;
 
-  io_test_parameter_init(IO_TEST_THRUPUT_PERF, static_cast<unsigned int>(arg[0]));
+  io_test_parameter_init(IO_TEST_THRUPUT_PERF, run_type, wait_type);
 
-  int cmds_per_list;
-  for (cmds_per_list = 1; cmds_per_list <= 32; cmds_per_list *= 2) {
+  for (int cmds_per_list = 1; cmds_per_list <= 32; cmds_per_list *= 2) {
     if (cmds_per_list > max_cmd_per_list)
       cmds_per_list = max_cmd_per_list;
     int num_cmdlist = num_bo_set / cmds_per_list;
diff --git a/test/shim_test/shim_test.cpp b/test/shim_test/shim_test.cpp
index fa4887fd..5ccbbdcd 100644
--- a/test/shim_test/shim_test.cpp
+++ b/test/shim_test/shim_test.cpp
@@ -29,9 +29,9 @@ using arg_type = const std::vector<uint64_t>;
 void TEST_export_import_bo(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_io(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_io_latency(device::id_type, std::shared_ptr<device>, arg_type&);
-void TEST_io_runlist_latency(device::id_type, std::shared_ptr<device>, arg_type&);
-void TEST_io_e_throughput(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_io_throughput(device::id_type, std::shared_ptr<device>, arg_type&);
+void TEST_io_runlist_latency(device::id_type, std::shared_ptr<device>, arg_type&);
+void TEST_io_runlist_throughput(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_noop_io_with_dup_bo(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_shim_umq_vadd(device::id_type, std::shared_ptr<device>, arg_type&);
 void TEST_shim_umq_memtiles(device::id_type, std::shared_ptr<device>, arg_type&);
@@ -521,10 +521,10 @@ std::vector<test_case> test_list {
     TEST_POSITIVE, dev_filter_is_aie2, TEST_io, { IO_TEST_NORMAL_RUN, 1 }
   },
   test_case{ "measure no-op kernel latency",
-    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NOOP_RUN }
+    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NOOP_RUN, IO_TEST_IOCTL_WAIT, 32000 }
   },
   test_case{ "measure real kernel latency",
-    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NORMAL_RUN }
+    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NORMAL_RUN, IO_TEST_IOCTL_WAIT, 32000 }
   },
   test_case{ "create and free debug bo",
     TEST_POSITIVE, dev_filter_is_aie2, TEST_create_free_debug_bo, { 0x1000 }
@@ -536,7 +536,7 @@ std::vector<test_case> test_list {
     TEST_POSITIVE, dev_filter_is_aie2, TEST_io, { IO_TEST_NORMAL_RUN, 3 }
   },
   test_case{ "measure no-op kernel throughput listed command",
-    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_throughput, { IO_TEST_NOOP_RUN }
+    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_throughput, { IO_TEST_NOOP_RUN, IO_TEST_IOCTL_WAIT, 32000 }
   },
   test_case{ "npu3 shim vadd",
     TEST_POSITIVE, dev_filter_is_aie4, TEST_shim_umq_vadd, {}
@@ -565,11 +565,23 @@ std::vector<test_case> test_list {
   test_case{ "io test no op with duplicated BOs",
     TEST_POSITIVE, dev_filter_is_aie2, TEST_noop_io_with_dup_bo, {}
   },
-  test_case{ "io test no-op kernel latency listed command",
-    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_latency, { IO_TEST_NOOP_RUN }
+  test_case{ "measure no-op kernel latency listed command",
+    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_latency, { IO_TEST_NOOP_RUN, IO_TEST_IOCTL_WAIT, 32000 }
   },
   test_case{ "measure no-op kernel throuput",
-    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_e_throughput, { IO_TEST_NOOP_RUN }
+    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_throughput, { IO_TEST_NOOP_RUN, IO_TEST_IOCTL_WAIT, 32000 }
+  },
+  test_case{ "measure no-op kernel latency (polling)",
+    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_latency, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 }
+  },
+  test_case{ "measure no-op kernel throuput (polling)",
+    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_throughput, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 }
+  },
+  test_case{ "measure no-op kernel latency listed command (polling)",
+    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_latency, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 }
+  },
+  test_case{ "measure no-op kernel throughput listed command (polling)",
+    TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_throughput, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 }
   },
 };
 
diff --git a/test/shim_test/speed.h b/test/shim_test/speed.h
index 89d41fae..79702816 100644
--- a/test/shim_test/speed.h
+++ b/test/shim_test/speed.h
@@ -7,6 +7,7 @@
 #include <chrono>
 
 using clk = std::chrono::high_resolution_clock;
+using ms_t = std::chrono::milliseconds;
 using us_t = std::chrono::microseconds;
 using ns_t = std::chrono::nanoseconds;
 
diff --git a/tools/info.json b/tools/info.json
index cfe90782..c55558d5 100644
--- a/tools/info.json
+++ b/tools/info.json
@@ -1,7 +1,7 @@
 {
 	"copyright": "Copyright (C) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.",
 	"xrt" : {
-		"version": "202420.2.18.101",
+		"version": "202420.2.18.134",
 		"os_rel": "22.04"
 	},
 	"firmwares": [
diff --git a/xrt b/xrt
index 476f42f4..64d03f56 160000
--- a/xrt
+++ b/xrt
@@ -1 +1 @@
-Subproject commit 476f42f419bbc5d1545aded3627f03c1c2f1336e
+Subproject commit 64d03f567db628c9107b6fcf5d362668d1834567

From fba0bd9dbcabb4390efe037404fc9477a4e02e01 Mon Sep 17 00:00:00 2001
From: Max Zhen <40219623+maxzhen@users.noreply.github.com>
Date: Wed, 11 Sep 2024 21:06:04 -0700
Subject: [PATCH 35/44] add missing trace event (#252)

Signed-off-by: Max Zhen <max.zhen@amd.com>
---
 src/driver/tools/npu_perf_analyze.sh | 16 ++++++++++++++--
 src/shim/hwq.cpp                     |  8 +++++++-
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/driver/tools/npu_perf_analyze.sh b/src/driver/tools/npu_perf_analyze.sh
index 8941fdbb..a78dc41f 100755
--- a/src/driver/tools/npu_perf_analyze.sh
+++ b/src/driver/tools/npu_perf_analyze.sh
@@ -101,18 +101,30 @@ echo "${event2_ts_num} events for: '${event2}'"
 diffs=()
 i1=0
 i2=0
-while [ ${i1} -lt ${event1_ts_num} ]; do
+while [ 1 ]; do
 	while [[ ${i2} -lt ${event2_ts_num} && ${event2_ts[i2]} -lt ${event1_ts[i1]} ]]; do
 		(( i2++ ))
 	done
 	if [ ${i2} -eq ${event2_ts_num} ]; then
 		break
 	fi
+
+	while [[ ${i1} -lt ${event1_ts_num} && ${event1_ts[i1]} -lt ${event2_ts[i2]} ]]; do
+		(( i1++ ))
+	done
+	if [ ${i1} -eq ${event1_ts_num} ]; then
+		break
+	fi
+
+
+	(( i1-- ))
 	diffs+=( $((event2_ts[i2] - event1_ts[i1])) )
 	(( i1++ ))
 	(( i2++ ))
 done
-#echo ${diffs[@]}
+#echo ${event1_ts[@]} > /tmp/e1
+#echo ${event2_ts[@]} > /tmp/e2
+#echo ${diffs[@]} > /tmp/diffs
 
 
 # Data mining within specified range
diff --git a/src/shim/hwq.cpp b/src/shim/hwq.cpp
index 9a4c6b39..f754cb55 100644
--- a/src/shim/hwq.cpp
+++ b/src/shim/hwq.cpp
@@ -5,6 +5,7 @@
 #include "hwq.h"
 #include "fence.h"
 #include "shim_debug.h"
+#include "core/common/trace.h"
 
 namespace {
 
@@ -104,7 +105,12 @@ hw_q::
 poll_command(xrt_core::buffer_handle *cmd) const
 {
   auto cmdpkt = reinterpret_cast<ert_packet *>(cmd->map(xrt_core::buffer_handle::map_type::write));
-  return (cmdpkt->state >= ERT_CMD_STATE_COMPLETED) ? 1 : 0;
+
+  if (cmdpkt->state >= ERT_CMD_STATE_COMPLETED) {
+    XRT_TRACE_POINT_LOG(poll_command_done);
+    return 1;
+  }
+  return 0;
 }
 
 int

From b9024615fe982e41fc92fc1c9393ffe62341a1a6 Mon Sep 17 00:00:00 2001
From: Max Zhen <40219623+maxzhen@users.noreply.github.com>
Date: Fri, 13 Sep 2024 11:29:10 -0700
Subject: [PATCH 36/44] fix turbo mode (#255)

Signed-off-by: Max Zhen <max.zhen@amd.com>
---
 src/driver/amdxdna/aie2_ctx.c        | 18 ++++++++++++++++--
 src/driver/amdxdna/aie2_pci.h        |  1 +
 src/driver/tools/npu_perf_analyze.sh | 10 +++++++---
 src/driver/tools/npu_perf_trace.sh   |  2 +-
 4 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/src/driver/amdxdna/aie2_ctx.c b/src/driver/amdxdna/aie2_ctx.c
index 5bbc0514..3343aea2 100644
--- a/src/driver/amdxdna/aie2_ctx.c
+++ b/src/driver/amdxdna/aie2_ctx.c
@@ -540,6 +540,7 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
 	struct drm_gpu_scheduler *sched;
 	struct amdxdna_hwctx_priv *priv;
 	struct amdxdna_gem_obj *heap;
+	unsigned int wq_flags;
 	int i, ret;
 
 	priv = kzalloc(sizeof(*hwctx->priv), GFP_KERNEL);
@@ -587,12 +588,22 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
 
 	sched = &priv->sched;
 	mutex_init(&priv->io_lock);
-	ret = drm_sched_init(sched, &sched_ops, NULL, DRM_SCHED_PRIORITY_COUNT,
+
+	wq_flags = __WQ_ORDERED;
+	if (!aie2_pm_is_turbo(xdna->dev_handle))
+		wq_flags |= WQ_UNBOUND;
+	priv->submit_wq = alloc_workqueue(hwctx->name, wq_flags, 1);
+	if (!priv->submit_wq) {
+		XDNA_ERR(xdna, "Failed to alloc submit wq");
+		ret = -ENOMEM;
+		goto free_cmd_bufs;
+	}
+	ret = drm_sched_init(sched, &sched_ops, priv->submit_wq, DRM_SCHED_PRIORITY_COUNT,
 			     HWCTX_MAX_CMDS, 0, MAX_SCHEDULE_TIMEOUT,
 			     NULL, NULL, hwctx->name, xdna->ddev.dev);
 	if (ret) {
 		XDNA_ERR(xdna, "Failed to init DRM scheduler. ret %d", ret);
-		goto free_cmd_bufs;
+		goto free_wq;
 	}
 
 	ret = drm_sched_entity_init(&priv->entity, DRM_SCHED_PRIORITY_NORMAL,
@@ -645,6 +656,8 @@ int aie2_hwctx_init(struct amdxdna_hwctx *hwctx)
 	drm_sched_entity_destroy(&priv->entity);
 free_sched:
 	drm_sched_fini(&priv->sched);
+free_wq:
+	destroy_workqueue(priv->submit_wq);
 free_cmd_bufs:
 	for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) {
 		if (!priv->cmd_buf[i])
@@ -681,6 +694,7 @@ void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx)
 	aie2_hwctx_wait_for_idle(hwctx);
 	drm_sched_entity_destroy(&hwctx->priv->entity);
 	drm_sched_fini(&hwctx->priv->sched);
+	destroy_workqueue(hwctx->priv->submit_wq);
 
 	for (idx = 0; idx < HWCTX_MAX_CMDS; idx++) {
 		job = hwctx->priv->pending[idx];
diff --git a/src/driver/amdxdna/aie2_pci.h b/src/driver/amdxdna/aie2_pci.h
index f0bd4a5a..fd62cbe2 100644
--- a/src/driver/amdxdna/aie2_pci.h
+++ b/src/driver/amdxdna/aie2_pci.h
@@ -199,6 +199,7 @@ struct amdxdna_hwctx_priv {
 	u32				num_pending;
 
 	struct amdxdna_gem_obj		*cmd_buf[HWCTX_MAX_CMDS];
+	struct workqueue_struct		*submit_wq;
 };
 
 struct async_events;
diff --git a/src/driver/tools/npu_perf_analyze.sh b/src/driver/tools/npu_perf_analyze.sh
index a78dc41f..d32f2f3c 100755
--- a/src/driver/tools/npu_perf_analyze.sh
+++ b/src/driver/tools/npu_perf_analyze.sh
@@ -98,6 +98,8 @@ fi
 echo "${event2_ts_num} events for: '${event2}'"
 
 # Caculate time difference between two events
+diffs_event1=()
+diffs_event2=()
 diffs=()
 i1=0
 i2=0
@@ -118,6 +120,8 @@ while [ 1 ]; do
 
 
 	(( i1-- ))
+	diffs_event1+=( $((event1_ts[i1])) )
+	diffs_event2+=( $((event2_ts[i2])) )
 	diffs+=( $((event2_ts[i2] - event1_ts[i1])) )
 	(( i1++ ))
 	(( i2++ ))
@@ -162,6 +166,6 @@ done
 
 # Output result
 total_events=$(( range_end - range_start ))
-echo Average over ${total_events} events: $(( total / total_events ))us
-echo Largest:  ${largest}us@${largest_idx}
-echo Smallest: ${smallest}us@${smallest_idx}
+echo Average over ${total_events} events: $(( total / total_events ))ns
+echo Largest:  ${largest}ns@${largest_idx}: event1=${diffs_event1[largest_idx]}, event2=${diffs_event2[largest_idx]}
+echo Smallest: ${smallest}ns@${smallest_idx}: event1=${diffs_event1[smallest_idx]}, event2=${diffs_event2[smallest_idx]}
diff --git a/src/driver/tools/npu_perf_trace.sh b/src/driver/tools/npu_perf_trace.sh
index 1cdc2bdc..97892757 100755
--- a/src/driver/tools/npu_perf_trace.sh
+++ b/src/driver/tools/npu_perf_trace.sh
@@ -127,7 +127,7 @@ eval $command
 tmp_file=/tmp/perf.out
 # convert timestamp from second to microsecond to avoid floating numbers
 #perf script | awk '{ $4=$4*1000000; print }' > ${tmp_file}
-perf script --reltime > ${tmp_file}
+perf script --reltime --ns > ${tmp_file}
 # replace IOCTL cmd number to name
 sed "$ioctl_sed_expr" "${tmp_file}" > perf.converted.out
 rm -rf ${tmp_file}

From b326815f53209262e6866d7c2d147d8f2b46614e Mon Sep 17 00:00:00 2001
From: Max Zhen <40219623+maxzhen@users.noreply.github.com>
Date: Fri, 13 Sep 2024 11:39:29 -0700
Subject: [PATCH 37/44] remove forcibly unchaining runlist hack (#254)

Signed-off-by: Max Zhen <max.zhen@amd.com>
---
 src/shim/bo.cpp     |  2 --
 src/shim/hwq.cpp    | 51 ++-------------------------------------------
 src/shim/pcidev.cpp | 16 ++------------
 src/shim/pcidev.h   | 29 +-------------------------
 4 files changed, 5 insertions(+), 93 deletions(-)

diff --git a/src/shim/bo.cpp b/src/shim/bo.cpp
index 4fdeebe5..36142f63 100644
--- a/src/shim/bo.cpp
+++ b/src/shim/bo.cpp
@@ -230,7 +230,6 @@ alloc_bo()
   amdxdna_drm_get_bo_info bo_info = {};
   get_drm_bo_info(m_pdev, boh, &bo_info);
   m_bo = std::make_unique<bo::drm_bo>(*this, bo_info);
-  m_pdev.insert_hdl_mapping(boh, reinterpret_cast<uint64_t>(this));
 }
 
 void
@@ -248,7 +247,6 @@ void
 bo::
 free_bo()
 {
-  m_pdev.remove_hdl_mapping(get_drm_bo_handle());
   m_bo.reset();
 }
 
diff --git a/src/shim/hwq.cpp b/src/shim/hwq.cpp
index f754cb55..6debbab5 100644
--- a/src/shim/hwq.cpp
+++ b/src/shim/hwq.cpp
@@ -83,21 +83,7 @@ void
 hw_q::
 submit_command(xrt_core::buffer_handle *cmd)
 {
-  auto pkt = get_chained_command_pkt(cmd);
-  if (!m_pdev.is_force_unchained_command() || !pkt) {
-    issue_command(cmd);
-    return;
-  }
-
-  // HACK: Forcibly unchain commands, to be removed later.
-  //
-  // Forcibly unchain commands and send to driver one by one.
-  auto payload = get_ert_cmd_chain_data(pkt);
-  for (size_t i = 0; i < payload->command_count; i++) {
-    auto boh = reinterpret_cast<xrt_core::buffer_handle*>(
-      m_pdev.lookup_hdl_mapping(static_cast<uint32_t>(payload->data[i])));
-    issue_command(boh);
-  }
+  issue_command(cmd);
 }
 
 int
@@ -119,40 +105,7 @@ wait_command(xrt_core::buffer_handle *cmd, uint32_t timeout_ms) const
 {
   if (poll_command(cmd))
       return 1;
-
-  auto pkt = get_chained_command_pkt(cmd);
-  if (!m_pdev.is_force_unchained_command() || !pkt)
-    return wait_cmd(m_pdev, m_hwctx, cmd, timeout_ms);
-
-  // HACK: handling forcibly unchained commands, to be removed later.
-  //
-  // Wait for the last unchained command.
-  auto payload = get_ert_cmd_chain_data(pkt);
-  auto last_boh = reinterpret_cast<xrt_core::buffer_handle*>(
-    m_pdev.lookup_hdl_mapping(static_cast<uint32_t>(payload->data[payload->command_count-1])));
-  auto ret = wait_cmd(m_pdev, m_hwctx, last_boh, timeout_ms);
-  if (ret != 1)
-    return ret;
-
-  // Check the state of the last command.
-  auto cmdpkt = reinterpret_cast<ert_packet *>(last_boh->map(xrt_core::buffer_handle::map_type::read));
-  if (cmdpkt->state == ERT_CMD_STATE_COMPLETED) {
-    pkt->state = ERT_CMD_STATE_COMPLETED;
-    return 1;
-  }
-
-  // Find out the first command failed.
-  for (int i = 0; i < payload->command_count; i++) {
-    auto boh = reinterpret_cast<xrt_core::buffer_handle*>(
-      m_pdev.lookup_hdl_mapping(static_cast<uint32_t>(payload->data[i])));
-    cmdpkt = reinterpret_cast<ert_packet *>(boh->map(xrt_core::buffer_handle::map_type::read));
-    if (cmdpkt->state != ERT_CMD_STATE_COMPLETED) {
-      pkt->state = cmdpkt->state;
-      payload->error_index = i;
-      break;
-    }
-  }
-  return 1;
+  return wait_cmd(m_pdev, m_hwctx, cmd, timeout_ms);
 }
 
 void
diff --git a/src/shim/pcidev.cpp b/src/shim/pcidev.cpp
index 7ea27527..4b9f17e6 100644
--- a/src/shim/pcidev.cpp
+++ b/src/shim/pcidev.cpp
@@ -6,7 +6,6 @@
 #include "pcidrv.h"
 #include "shim_debug.h"
 #include "drm_local/amdxdna_accel.h"
-#include "core/common/config_reader.h"
 #include "core/common/trace.h"
 
 namespace {
@@ -71,10 +70,6 @@ namespace shim_xdna {
 pdev::
 pdev(std::shared_ptr<const drv> driver, std::string sysfs_name)
   : xrt_core::pci::dev(driver, std::move(sysfs_name))
-  // Default of force_unchained_command should be false once command
-  // chaining is natively supported by driver/firmware.
-  , m_force_unchained_command(xrt_core::config::detail::get_bool_value(
-    "Debug.force_unchained_command", false))
 {
   m_is_ready = true; // We're always ready.
 }
@@ -99,7 +94,7 @@ pdev::
 open() const
 {
   int fd;
-  const std::lock_guard<std::recursive_mutex> lock(m_lock);
+  const std::lock_guard<std::mutex> lock(m_lock);
 
   if (m_dev_users == 0) {
     fd = xrt_core::pci::dev::open("", O_RDWR);
@@ -120,7 +115,7 @@ pdev::
 close() const
 {
   int fd;
-  const std::lock_guard<std::recursive_mutex> lock(m_lock);
+  const std::lock_guard<std::mutex> lock(m_lock);
 
   --m_dev_users;
   if (m_dev_users == 0) {
@@ -162,12 +157,5 @@ munmap(void* addr, size_t len) const
   ::munmap(addr, len);
 }
 
-bool
-pdev::
-is_force_unchained_command() const
-{
-  return m_force_unchained_command;
-}
-
 } // namespace shim_xdna
 
diff --git a/src/shim/pcidev.h b/src/shim/pcidev.h
index 9a770da6..da0cdeda 100644
--- a/src/shim/pcidev.h
+++ b/src/shim/pcidev.h
@@ -43,30 +43,6 @@ class pdev : public xrt_core::pci::dev
   void
   close() const;
 
-  bool
-  is_force_unchained_command() const;
-
-  // Below routines are for managing drm_bo_hdl -> buffer_handle* mapping.
-  // This is only a temporary hack for supporting forcibly unchained runlist.
-  void
-  insert_hdl_mapping(uint32_t hdl, uint64_t ptr) const
-  {
-    const std::lock_guard<std::recursive_mutex> lock(m_lock);
-    m_hdl_map[hdl] = ptr;
-  }
-  void
-  remove_hdl_mapping(uint32_t hdl) const
-  {
-    const std::lock_guard<std::recursive_mutex> lock(m_lock);
-    m_hdl_map.erase(hdl);
-  }
-  uint64_t
-  lookup_hdl_mapping(uint32_t hdl) const
-  {
-    const std::lock_guard<std::recursive_mutex> lock(m_lock);
-    return m_hdl_map[hdl];
-  }
-
 private:
   virtual void
   on_first_open() const {}
@@ -75,10 +51,7 @@ class pdev : public xrt_core::pci::dev
 
   mutable int m_dev_fd = -1;
   mutable int m_dev_users = 0;
-  mutable std::recursive_mutex m_lock;
-  const bool m_force_unchained_command = true;
-  // Mark it as mutable since pdev does not look at what is saved in this map
-  mutable std::map<uint32_t, uint64_t> m_hdl_map;
+  mutable std::mutex m_lock;
 };
 
 } // namespace shim_xdna

From f158f2e5ab27ce21ed7f488823d4fe33dc8597b3 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Fri, 13 Sep 2024 12:16:43 -0700
Subject: [PATCH 38/44] Add aie2_control_flags and some bug fix (#253)

* add aie2_control_flags; fix incorrect NPU1 clock; fix NPU4 being unable to configure the DPM level
* more cleanup

---------

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/amdxdna/aie2_debugfs.c |  85 ++++-------------
 src/driver/amdxdna/aie2_pci.c     |  12 ++-
 src/driver/amdxdna/aie2_pci.h     |  26 ++----
 src/driver/amdxdna/aie2_smu.c     | 149 ++++++++++++------------------
 src/driver/amdxdna/npu1_regs.c    |   6 --
 src/driver/amdxdna/npu4_family.h  |   6 --
 6 files changed, 96 insertions(+), 188 deletions(-)

diff --git a/src/driver/amdxdna/aie2_debugfs.c b/src/driver/amdxdna/aie2_debugfs.c
index 1272efbe..4511ea45 100644
--- a/src/driver/amdxdna/aie2_debugfs.c
+++ b/src/driver/amdxdna/aie2_debugfs.c
@@ -77,71 +77,6 @@ static int aie2_dbgfs_entry_release(struct inode *inode, struct file *file)
 #define file_to_ndev_rw(file) \
 	(((struct seq_file *)(file)->private_data)->private)
 
-static ssize_t
-aie2_dbgfs_clock_write(struct amdxdna_dev_hdl *ndev, struct clock *clock,
-		       const char __user *ptr, size_t len, loff_t *off)
-{
-	u32 val;
-	int ret;
-
-	ret = kstrtouint_from_user(ptr, len, 10, &val);
-	if (ret) {
-		XDNA_ERR(ndev->xdna, "Invalid input value: %d", val);
-		return ret;
-	}
-
-	clock->dbg_freq_mhz = val;
-	if (!clock->dbg_freq_mhz) {
-		XDNA_INFO(ndev->xdna, "Auto %s", clock->name);
-		return 0;
-	}
-
-	ret = aie2_smu_set_clock_freq(ndev, clock, val);
-	if (ret) {
-		clock->dbg_freq_mhz = 0;
-		XDNA_ERR(ndev->xdna, "Set %s ret %d, use auto clock", clock->name, ret);
-		return ret;
-	}
-
-	return len;
-}
-
-static ssize_t aie2_dbgfs_mpnpu_clock_write(struct file *file, const char __user *ptr,
-					    size_t len, loff_t *off)
-{
-	struct amdxdna_dev_hdl *ndev = file_to_ndev_rw(file);
-
-	return aie2_dbgfs_clock_write(ndev, &ndev->smu.mp_npu_clock, ptr, len, off);
-}
-
-static int aie2_dbgfs_mpnpu_clock_show(struct seq_file *m, void *unused)
-{
-	struct amdxdna_dev_hdl *ndev = m->private;
-
-	seq_printf(m, "%d\n", aie2_smu_get_mpnpu_clock_freq(ndev));
-	return 0;
-}
-
-AIE2_DBGFS_FOPS(npuclock, aie2_dbgfs_mpnpu_clock_show, aie2_dbgfs_mpnpu_clock_write);
-
-static ssize_t aie2_dbgfs_hclock_write(struct file *file, const char __user *ptr,
-				       size_t len, loff_t *off)
-{
-	struct amdxdna_dev_hdl *ndev = file_to_ndev_rw(file);
-
-	return aie2_dbgfs_clock_write(ndev, &ndev->smu.h_clock, ptr, len, off);
-}
-
-static int aie2_dbgfs_hclock_show(struct seq_file *m, void *unused)
-{
-	struct amdxdna_dev_hdl *ndev = m->private;
-
-	seq_printf(m, "%d\n", aie2_smu_get_hclock_freq(ndev));
-	return 0;
-}
-
-AIE2_DBGFS_FOPS(hclock, aie2_dbgfs_hclock_show, aie2_dbgfs_hclock_write);
-
 static ssize_t aie2_pasid_write(struct file *file, const char __user *ptr,
 				size_t len, loff_t *off)
 {
@@ -302,8 +237,24 @@ static ssize_t aie2_dpm_level_set(struct file *file, const char __user *ptr,
 static int aie2_dpm_level_get(struct seq_file *m, void *unused)
 {
 	struct amdxdna_dev_hdl *ndev = m->private;
+	const struct dpm_clk *dpm_table;
+	u32 num_dpm_levels;
+	int dpm_level;
+	int i;
 
-	seq_printf(m, "%d\n", aie2_smu_get_dpm_level(ndev));
+	dpm_table = SMU_DPM_TABLE_ENTRY(ndev, 0);
+	dpm_level = aie2_smu_get_dpm_level(ndev);
+	num_dpm_levels = SMU_DPM_MAX(ndev);
+	for (i = 0; i <= num_dpm_levels; i++) {
+		u32 npuclk = dpm_table[i].npuclk;
+		u32 hclk = dpm_table[i].hclk;
+
+		if (dpm_level == i)
+			seq_printf(m, " [%d,%d] ", npuclk, hclk);
+		else
+			seq_printf(m, " %d,%d ", npuclk, hclk);
+	}
+	seq_puts(m, "\n");
 	return 0;
 }
 
@@ -609,8 +560,6 @@ const struct {
 	umode_t mode;
 } aie2_dbgfs_files[] = {
 	AIE2_DBGFS_FILE(nputest, 0600),
-	AIE2_DBGFS_FILE(hclock, 0600),
-	AIE2_DBGFS_FILE(npuclock, 0600),
 	AIE2_DBGFS_FILE(pasid, 0600),
 	AIE2_DBGFS_FILE(state, 0600),
 	AIE2_DBGFS_FILE(powerstate, 0600),
diff --git a/src/driver/amdxdna/aie2_pci.c b/src/driver/amdxdna/aie2_pci.c
index 3f95844d..2825710c 100644
--- a/src/driver/amdxdna/aie2_pci.c
+++ b/src/driver/amdxdna/aie2_pci.c
@@ -21,10 +21,17 @@
 #include "aie2_internal.h"
 #endif
 
-int aie2_max_col = XRS_MAX_COL;
-module_param(aie2_max_col, int, 0600);
+uint aie2_max_col = XRS_MAX_COL;
+module_param(aie2_max_col, uint, 0600);
 MODULE_PARM_DESC(aie2_max_col, "Maximum column could be used");
 
+uint aie2_control_flags;
+module_param(aie2_control_flags, uint, 0400);
+MODULE_PARM_DESC(aie2_control_flags,
+		 " Bit " __stringify(AIE2_BIT_BYPASS_POWER_SWITCH) ": Bypass power on/off,"
+		 " Bit " __stringify(AIE2_BIT_BYPASS_SET_FREQ) ": Bypass set freq,"
+		 " Bit " __stringify(AIE2_BIT_BYPASS_FW_LOAD) ": Bypass FW loading");
+
 /*
  * The management mailbox channel is allocated by firmware.
  * The related register and ring buffer information is on SRAM BAR.
@@ -477,6 +484,7 @@ static int aie2_init(struct amdxdna_dev *xdna)
 	void __iomem * const *tbl;
 	int i, bars, nvec, ret;
 
+	XDNA_DBG(xdna, "Control flags 0x%x", aie2_control_flags);
 	ndev = devm_kzalloc(&pdev->dev, sizeof(*ndev), GFP_KERNEL);
 	if (!ndev)
 		return -ENOMEM;
diff --git a/src/driver/amdxdna/aie2_pci.h b/src/driver/amdxdna/aie2_pci.h
index fd62cbe2..51cf66c9 100644
--- a/src/driver/amdxdna/aie2_pci.h
+++ b/src/driver/amdxdna/aie2_pci.h
@@ -67,15 +67,10 @@
 	pci_resource_len(NDEV2PDEV(_ndev), (_ndev)->xdna->dev_info->mbox_bar); \
 })
 
-#define SMU_MPNPUCLK_FREQ_MAX(ndev) \
-	((ndev)->priv->smu_mpnpuclk_freq_max)
-#define SMU_HCLK_FREQ_MAX(ndev) \
-	((ndev)->priv->smu_hclk_freq_max)
 #define SMU_DPM_MAX(ndev) \
-	((ndev)->priv->smu_dpm_max)
-
-#define SMU_NPU_DPM_TABLE_ENTRY(ndev, level) \
-	(&ndev->priv->smu_npu_dpm_clk_table[level])
+	((ndev)->smu.num_dpm_levels - 1)
+#define SMU_DPM_TABLE_ENTRY(ndev, level) \
+	(&(ndev)->smu.dpm_table[level])
 
 enum aie2_smu_reg_idx {
 	SMU_CMD_REG = 0,
@@ -154,12 +149,11 @@ struct clock {
 	char name[16];
 	u32 max_freq_mhz;
 	u32 freq_mhz;
-#if defined(CONFIG_DEBUG_FS)
-	u32 dbg_freq_mhz;
-#endif
 };
 
 struct smu {
+	const struct dpm_clk	*dpm_table;
+	u32			num_dpm_levels;
 	struct clock		mp_npu_clock;
 	struct clock		h_clock;
 	u32			curr_dpm_level;
@@ -269,10 +263,6 @@ struct amdxdna_dev_priv {
 	struct aie2_bar_off_pair	psp_regs_off[PSP_MAX_REGS];
 	struct aie2_bar_off_pair	smu_regs_off[SMU_MAX_REGS];
 	struct rt_config_clk_gating	clk_gating;
-	u32				smu_mpnpuclk_freq_max;
-	u32				smu_hclk_freq_max;
-	/* npu1: 0, not support dpm; npu2+: support dpm up to 7 */
-	u32				smu_dpm_max;
 	u32				smu_rev;
 	const struct dpm_clk		*smu_npu_dpm_clk_table;
 	u32				smu_npu_dpm_levels;
@@ -282,6 +272,10 @@ struct amdxdna_dev_priv {
 };
 
 /* aie2_pci.c */
+#define AIE2_BIT_BYPASS_POWER_SWITCH	0 /* NOSYS */
+#define AIE2_BIT_BYPASS_SET_FREQ	1
+#define AIE2_BIT_BYPASS_FW_LOAD		2 /* NOSYS */
+extern uint aie2_control_flags;
 extern const struct amdxdna_dev_ops aie2_ops;
 int aie2_check_protocol(struct amdxdna_dev_hdl *ndev, u32 fw_major, u32 fw_minor);
 
@@ -289,7 +283,6 @@ int aie2_check_protocol(struct amdxdna_dev_hdl *ndev, u32 fw_major, u32 fw_minor
 void aie2_smu_setup(struct amdxdna_dev_hdl *ndev);
 int aie2_smu_start(struct amdxdna_dev_hdl *ndev);
 void aie2_smu_stop(struct amdxdna_dev_hdl *ndev);
-int aie2_smu_set_clock_freq(struct amdxdna_dev_hdl *ndev, struct clock *clock, u32 freq_mhz);
 char *aie2_smu_get_mpnpu_clock_name(struct amdxdna_dev_hdl *ndev);
 char *aie2_smu_get_hclock_name(struct amdxdna_dev_hdl *ndev);
 int aie2_smu_get_mpnpu_clock_freq(struct amdxdna_dev_hdl *ndev);
@@ -299,7 +292,6 @@ int aie2_smu_set_power_off(struct amdxdna_dev_hdl *ndev);
 int aie2_smu_get_power_state(struct amdxdna_dev_hdl *ndev);
 int aie2_smu_get_dpm_level(struct amdxdna_dev_hdl *ndev);
 int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level);
-void aie2_smu_prepare_s0i3(struct amdxdna_dev_hdl *ndev);
 
 /* aie2_psp.c */
 struct psp_device *aie2m_psp_create(struct device *dev, struct psp_config *conf);
diff --git a/src/driver/amdxdna/aie2_smu.c b/src/driver/amdxdna/aie2_smu.c
index abe9a1f9..dce72de0 100644
--- a/src/driver/amdxdna/aie2_smu.c
+++ b/src/driver/amdxdna/aie2_smu.c
@@ -10,12 +10,21 @@
 /* SMU commands */
 #define AIE2_SMU_POWER_ON		0x3
 #define AIE2_SMU_POWER_OFF		0x4
+/* For SMU v0 */
 #define AIE2_SMU_SET_MPNPUCLK_FREQ	0x5
 #define AIE2_SMU_SET_HCLK_FREQ		0x6
+/* For SMU v1 */
 #define AIE2_SMU_SET_SOFT_DPMLEVEL	0x7
 #define AIE2_SMU_SET_HARD_DPMLEVEL	0x8
 
-static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd, u32 reg_arg)
+/* This is a hack for NPU1 device */
+const struct dpm_clk npu1_hack_dpm_clk_table[] = {
+	{400, 800},
+	{600, 1024},
+};
+
+static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd,
+			 u32 reg_arg, u32 *out)
 {
 	u32 resp;
 	int ret;
@@ -35,6 +44,9 @@ static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd, u32 reg_arg)
 		return ret;
 	}
 
+	if (out)
+		*out = readl(SMU_REG(ndev, SMU_OUT_REG));
+
 	if (resp != SMU_RESULT_OK) {
 		XDNA_ERR(ndev->xdna, "SMU cmd %d failed, 0x%x", reg_cmd, resp);
 		return -EINVAL;
@@ -43,26 +55,12 @@ static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd, u32 reg_arg)
 	return 0;
 }
 
-static int aie2_smu_update_clock_freq(struct amdxdna_dev_hdl *ndev, u32 cmd,
-				      struct clock *clock, u32 freq_mhz)
-{
-	int ret;
-
-	ret = aie2_smu_exec(ndev, cmd, freq_mhz);
-	if (ret)
-		return ret;
-
-	clock->freq_mhz = freq_mhz;
-
-	return 0;
-}
-
 /*
  * Depending on the current running frequency and debugfs setting,
  * aie2_smu_set_clock_freq() might or might not update freqency.
  */
-int aie2_smu_set_clock_freq(struct amdxdna_dev_hdl *ndev,
-			    struct clock *clock, u32 freq_mhz)
+static int aie2_smu_set_clock_freq(struct amdxdna_dev_hdl *ndev,
+				   struct clock *clock, u32 freq_mhz)
 {
 	u32 smu_cmd;
 	int ret;
@@ -82,18 +80,11 @@ int aie2_smu_set_clock_freq(struct amdxdna_dev_hdl *ndev,
 	if (freq_mhz == clock->freq_mhz)
 		return 0;
 
-#if defined(CONFIG_DEBUG_FS)
-	/* If freq is set by debugfs, respect it until debugfs write freq 0 */
-	if (clock->dbg_freq_mhz && freq_mhz != clock->dbg_freq_mhz) {
-		XDNA_DBG(ndev->xdna, "%s debug freq %d, ignore target freq %d",
-			 clock->name, clock->dbg_freq_mhz, freq_mhz);
-		return 0;
-	}
-#endif
-	ret = aie2_smu_update_clock_freq(ndev, smu_cmd, clock, freq_mhz);
+	ret = aie2_smu_exec(ndev, smu_cmd, freq_mhz, NULL);
 	if (ret)
 		return ret;
 
+	clock->freq_mhz = freq_mhz;
 	XDNA_DBG(ndev->xdna, "Set %s = %d mhz", clock->name, clock->freq_mhz);
 	return 0;
 }
@@ -120,23 +111,23 @@ char *aie2_smu_get_hclock_name(struct amdxdna_dev_hdl *ndev)
 
 static int aie2_smu_set_dpm_level_v0(struct amdxdna_dev_hdl *ndev, u32 dpm_level)
 {
-	int ret;
-	const struct dpm_clk *dpm_entry = SMU_NPU_DPM_TABLE_ENTRY(ndev, dpm_level);
+	const struct dpm_clk *dpm_entry = SMU_DPM_TABLE_ENTRY(ndev, dpm_level);
 	struct clock *clk;
+	int ret;
 
 	clk = &ndev->smu.mp_npu_clock;
-
 	ret = aie2_smu_set_clock_freq(ndev, clk, dpm_entry->npuclk);
 	if (ret) {
-		XDNA_ERR(ndev->xdna, "setting npu clk failed for dpm level %d, ret: %d", dpm_level, ret);
+		XDNA_ERR(ndev->xdna, "setting npu clk failed for dpm level %d, ret: %d",
+			 dpm_level, ret);
 		return ret;
 	}
 
 	clk = &ndev->smu.h_clock;
-
 	ret = aie2_smu_set_clock_freq(ndev, clk, dpm_entry->hclk);
 	if (ret) {
-		XDNA_ERR(ndev->xdna, "setting hclk failed for dpm level %d, ret: %d", dpm_level, ret);
+		XDNA_ERR(ndev->xdna, "setting hclk failed for dpm level %d, ret: %d",
+			 dpm_level, ret);
 		return ret;
 	}
 
@@ -147,16 +138,19 @@ static int aie2_smu_set_dpm_level_v1(struct amdxdna_dev_hdl *ndev, u32 dpm_level
 {
 	int ret;
 
-	ret = aie2_smu_exec(ndev, AIE2_SMU_SET_HARD_DPMLEVEL, dpm_level);
+	ret = aie2_smu_exec(ndev, AIE2_SMU_SET_HARD_DPMLEVEL, dpm_level, NULL);
 	if (!ret)
 		XDNA_INFO_ONCE(ndev->xdna, "Set hard dpm level = %d", dpm_level);
 	else
 		return ret;
 
-	ret = aie2_smu_exec(ndev, AIE2_SMU_SET_SOFT_DPMLEVEL, dpm_level);
+	ret = aie2_smu_exec(ndev, AIE2_SMU_SET_SOFT_DPMLEVEL, dpm_level, NULL);
 	if (!ret)
 		XDNA_INFO_ONCE(ndev->xdna, "Set soft dpm level = %d", dpm_level);
 
+	ndev->smu.mp_npu_clock.freq_mhz = SMU_DPM_TABLE_ENTRY(ndev, dpm_level)->npuclk;
+	ndev->smu.h_clock.freq_mhz = SMU_DPM_TABLE_ENTRY(ndev, dpm_level)->hclk;
+
 	return ret;
 }
 
@@ -169,6 +163,11 @@ int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level)
 {
 	int ret;
 
+	if (aie2_control_flags && BIT(AIE2_BIT_BYPASS_SET_FREQ)) {
+		XDNA_DBG(ndev->xdna, "Bypassed set dpm level");
+		return 0;
+	}
+
 	if (dpm_level > SMU_DPM_MAX(ndev))
 		return -EINVAL;
 
@@ -189,7 +188,7 @@ int aie2_smu_set_power_on(struct amdxdna_dev_hdl *ndev)
 {
 	int ret;
 
-	ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_ON, 0);
+	ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_ON, 0, NULL);
 	if (ret)
 		return ret;
 
@@ -201,7 +200,7 @@ int aie2_smu_set_power_off(struct amdxdna_dev_hdl *ndev)
 {
 	int ret;
 
-	ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_OFF, 0);
+	ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_OFF, 0, NULL);
 	if (ret)
 		return ret;
 
@@ -217,7 +216,6 @@ int aie2_smu_get_power_state(struct amdxdna_dev_hdl *ndev)
 int aie2_smu_start(struct amdxdna_dev_hdl *ndev)
 {
 	struct smu *smu = &ndev->smu;
-	u32 freq_mhz;
 	int ret;
 
 	ret = aie2_smu_set_power_on(ndev);
@@ -226,63 +224,23 @@ int aie2_smu_start(struct amdxdna_dev_hdl *ndev)
 		return ret;
 	}
 
-	freq_mhz = smu->mp_npu_clock.freq_mhz;
-	ret = aie2_smu_update_clock_freq(ndev, AIE2_SMU_SET_MPNPUCLK_FREQ,
-					 &smu->mp_npu_clock, freq_mhz);
+	ret = aie2_smu_set_dpm_level(ndev, smu->curr_dpm_level);
 	if (ret) {
-		XDNA_ERR(ndev->xdna, "Set mpnpu clk freq failed, ret %d", ret);
+		XDNA_ERR(ndev->xdna, "Set dpm level failed, ret %d", ret);
 		return ret;
 	}
-	XDNA_INFO_ONCE(ndev->xdna, "Set %s = %d mhz", smu->mp_npu_clock.name, freq_mhz);
-
-	freq_mhz = smu->h_clock.freq_mhz;
-	ret = aie2_smu_update_clock_freq(ndev, AIE2_SMU_SET_HCLK_FREQ,
-					 &smu->h_clock, freq_mhz);
-	if (ret) {
-		XDNA_ERR(ndev->xdna, "Set hclk freq failed, ret %d", ret);
-		return ret;
-	}
-	XDNA_INFO_ONCE(ndev->xdna, "Set %s = %d mhz", smu->h_clock.name, freq_mhz);
-
-	if (SMU_DPM_MAX(ndev) > 0) {
-		ret = aie2_smu_set_dpm_level(ndev, smu->curr_dpm_level);
-		if (ret) {
-			XDNA_ERR(ndev->xdna, "Set dpm level failed, ret %d", ret);
-			return ret;
-		}
-	}
 
 	return 0;
 }
 
-void aie2_smu_prepare_s0i3(struct amdxdna_dev_hdl *ndev)
-{
-	u32 freq_mhz;
-	int ret;
-
-	freq_mhz = 400;
-	ret = aie2_smu_exec(ndev, AIE2_SMU_SET_MPNPUCLK_FREQ, freq_mhz);
-	if (ret)
-		XDNA_ERR(ndev->xdna, "Set mpnpu clk freq %d mhz failed, ret %d", freq_mhz, ret);
-
-	freq_mhz = 800;
-	ret = aie2_smu_exec(ndev, AIE2_SMU_SET_HCLK_FREQ, freq_mhz);
-	if (ret)
-		XDNA_ERR(ndev->xdna, "Set hclk freq %d mhz failed, ret %d", freq_mhz, ret);
-
-	if (SMU_DPM_MAX(ndev) > 0) {
-		ret = aie2_smu_set_dpm_level(ndev, 0);
-		if (ret)
-			XDNA_ERR(ndev->xdna, "Set dpm level 0 failed, ret %d", ret);
-	}
-}
-
 void aie2_smu_stop(struct amdxdna_dev_hdl *ndev)
 {
 	int ret;
 
 	/* Minimize clocks/dpm level prior to power off */
-	aie2_smu_prepare_s0i3(ndev);
+	ret = aie2_smu_set_dpm_level(ndev, 0);
+	if (ret)
+		XDNA_WARN(ndev->xdna, "Set dpm level 0 failed, ret %d", ret);
 
 	ret = aie2_smu_set_power_off(ndev);
 	if (ret)
@@ -294,13 +252,26 @@ void aie2_smu_setup(struct amdxdna_dev_hdl *ndev)
 	struct smu *smu = &ndev->smu;
 
 	snprintf(smu->mp_npu_clock.name, sizeof(smu->mp_npu_clock.name), "MP-NPU Clock");
-	smu->mp_npu_clock.max_freq_mhz = SMU_MPNPUCLK_FREQ_MAX(ndev);
-
 	snprintf(smu->h_clock.name, sizeof(smu->h_clock.name), "H Clock");
-	smu->h_clock.max_freq_mhz = SMU_HCLK_FREQ_MAX(ndev);
-
-	/* The first time SMU start, it will use below clock frequency */
-	smu->mp_npu_clock.freq_mhz = smu->mp_npu_clock.max_freq_mhz;
-	smu->h_clock.freq_mhz = smu->h_clock.max_freq_mhz;
+	smu->dpm_table = ndev->priv->smu_npu_dpm_clk_table;
+	smu->num_dpm_levels = ndev->priv->smu_npu_dpm_levels;
 	smu->curr_dpm_level = SMU_DPM_MAX(ndev);
+
+	if (!ndev->priv->smu_rev) {
+		u32 npuclk_freq;
+		u32 out = 0; /* stays 0 if the SMU cmd fails before the readback */
+
+		/* This is a hack for special NPU1 device */
+		npuclk_freq = SMU_DPM_TABLE_ENTRY(ndev, SMU_DPM_MAX(ndev))->npuclk;
+		aie2_smu_exec(ndev, AIE2_SMU_SET_MPNPUCLK_FREQ, npuclk_freq, &out);
+		if (npuclk_freq != out) {
+			XDNA_DBG(ndev->xdna, "Use small DPM table");
+			smu->dpm_table = npu1_hack_dpm_clk_table;
+			smu->num_dpm_levels = ARRAY_SIZE(npu1_hack_dpm_clk_table);
+			smu->curr_dpm_level = SMU_DPM_MAX(ndev);
+		}
+	}
+
+	smu->mp_npu_clock.max_freq_mhz = SMU_DPM_TABLE_ENTRY(ndev, SMU_DPM_MAX(ndev))->npuclk;
+	smu->h_clock.max_freq_mhz = SMU_DPM_TABLE_ENTRY(ndev, SMU_DPM_MAX(ndev))->hclk;
 }
diff --git a/src/driver/amdxdna/npu1_regs.c b/src/driver/amdxdna/npu1_regs.c
index 3c442021..11a1e3fd 100644
--- a/src/driver/amdxdna/npu1_regs.c
+++ b/src/driver/amdxdna/npu1_regs.c
@@ -51,9 +51,6 @@
 #define NPU1_RT_CFG_VAL_DEBUG_BO_DEFAULT 0
 #define NPU1_RT_CFG_VAL_DEBUG_BO_LARGE   1
 
-#define NPU1_MPNPUCLK_FREQ_MAX  847
-#define NPU1_HCLK_FREQ_MAX      1600
-
 /*fill in the dpm clock frequencies */
 const struct dpm_clk npu1_dpm_clk_table[] = {
 	{400, 800},
@@ -109,9 +106,6 @@ const struct amdxdna_dev_priv npu1_dev_priv = {
 		.value_enable = NPU1_RT_CFG_VAL_CLK_GATING_ON,
 		.value_disable = NPU1_RT_CFG_VAL_CLK_GATING_OFF,
 	},
-	.smu_mpnpuclk_freq_max = NPU1_MPNPUCLK_FREQ_MAX,
-	.smu_hclk_freq_max     = NPU1_HCLK_FREQ_MAX,
-	.smu_dpm_max           = 7,
 	.smu_rev = SMU_REVISION_V0,
 	.smu_npu_dpm_clk_table = npu1_dpm_clk_table,
 	.smu_npu_dpm_levels = ARRAY_SIZE(npu1_dpm_clk_table),
diff --git a/src/driver/amdxdna/npu4_family.h b/src/driver/amdxdna/npu4_family.h
index 2fc82e20..9da6d971 100644
--- a/src/driver/amdxdna/npu4_family.h
+++ b/src/driver/amdxdna/npu4_family.h
@@ -74,9 +74,6 @@
 #define NPU4_RT_CFG_VAL_DEBUG_BO_DEFAULT 0
 #define NPU4_RT_CFG_VAL_DEBUG_BO_LARGE   1
 
-#define NPU4_MPNPUCLK_FREQ_MAX  1267
-#define NPU4_HCLK_FREQ_MAX      1800
-
 #define NPU4_INIT_RT_CFG_NUM	2
 #define NPU4_CLK_GATING_CFG_NUM 4
 
@@ -118,9 +115,6 @@ extern const u32 npu4_clk_gating_types[NPU4_CLK_GATING_CFG_NUM];
 		.value_enable = NPU4_RT_CFG_VAL_CLK_GATING_ON,					\
 		.value_disable = NPU4_RT_CFG_VAL_CLK_GATING_OFF,				\
 	},											\
-	.smu_mpnpuclk_freq_max = NPU4_MPNPUCLK_FREQ_MAX,					\
-	.smu_hclk_freq_max     = NPU4_HCLK_FREQ_MAX,						\
-	.smu_dpm_max           = 7,								\
 	.smu_rev = SMU_REVISION_V1,								\
 	.smu_npu_dpm_clk_table = npu4_dpm_clk_table,						\
 	.smu_npu_dpm_levels = ARRAY_SIZE(npu4_dpm_clk_table)

From 136c6d56a5dad97bb747bc7f60ce11ce96296748 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Fri, 13 Sep 2024 18:16:28 -0700
Subject: [PATCH 39/44] support response with status not at first word (#256)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/amdxdna/aie2_message.c           | 43 +++++----------------
 src/driver/amdxdna/amdxdna_mailbox_helper.h |  2 +
 2 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/src/driver/amdxdna/aie2_message.c b/src/driver/amdxdna/aie2_message.c
index 0efc68d4..e1ccd9af 100644
--- a/src/driver/amdxdna/aie2_message.c
+++ b/src/driver/amdxdna/aie2_message.c
@@ -16,8 +16,13 @@
 #define DECLARE_AIE2_MSG(name, op) \
 	DECLARE_XDNA_MSG_COMMON(name, op, MAX_AIE2_STATUS_CODE)
 
-static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev,
-				   struct xdna_mailbox_msg *msg)
+#define aie2_send_mgmt_msg_wait(ndev, msg) \
+	aie2_send_mgmt_msg_wait_offset(ndev, msg, 0)
+
+static int
+aie2_send_mgmt_msg_wait_offset(struct amdxdna_dev_hdl *ndev,
+			       struct xdna_mailbox_msg *msg,
+			       u32 offset)
 {
 	struct amdxdna_dev *xdna = ndev->xdna;
 	struct xdna_notify *hdl = msg->handle;
@@ -34,7 +39,7 @@ static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev,
 		ndev->mgmt_chann = NULL;
 	}
 
-	if (!ret && *hdl->data != AIE2_STATUS_SUCCESS) {
+	if (!ret && hdl->data[offset] != AIE2_STATUS_SUCCESS) {
 		XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x",
-			 msg->opcode, *hdl->data);
+			 msg->opcode, hdl->data[offset]);
 		ret = -EINVAL;
@@ -105,36 +110,6 @@ int aie2_check_protocol_version(struct amdxdna_dev_hdl *ndev)
 }
 
 #ifdef AMDXDNA_DEVEL
-/* TODO: Delete this. move status to the first word of struct get_telemetry_resp */
-static int aie2_send_mgmt_msg_wait_for_telemetry(struct amdxdna_dev_hdl *ndev,
-						 struct xdna_mailbox_msg *msg)
-{
-	struct amdxdna_dev *xdna = ndev->xdna;
-	struct xdna_notify *hdl = msg->handle;
-	struct get_telemetry_resp *resp;
-	int ret;
-
-	if (!ndev->mgmt_chann)
-		return -ENODEV;
-
-	drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock));
-	ret = xdna_send_msg_wait(xdna, ndev->mgmt_chann, msg);
-	if (ret == -ETIME) {
-		xdna_mailbox_stop_channel(ndev->mgmt_chann);
-		xdna_mailbox_destroy_channel(ndev->mgmt_chann);
-		ndev->mgmt_chann = NULL;
-	}
-
-	resp = (struct get_telemetry_resp *)hdl->data;
-	if (!ret && resp->status != AIE2_STATUS_SUCCESS) {
-		XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x",
-			 msg->opcode, resp->status);
-		ret = -EINVAL;
-	}
-
-	return ret;
-}
-
 int aie2_get_telemetry(struct amdxdna_dev_hdl *ndev, u32 type, dma_addr_t addr, u32 size)
 {
 	DECLARE_AIE2_MSG(get_telemetry, MSG_OP_GET_TELEMETRY);
@@ -150,7 +125,7 @@ int aie2_get_telemetry(struct amdxdna_dev_hdl *ndev, u32 type, dma_addr_t addr,
 	req.buf_size = size;
 	req.type = type;
 
-	ret = aie2_send_mgmt_msg_wait_for_telemetry(ndev, &msg);
+	ret = aie2_send_mgmt_msg_wait_offset(ndev, &msg, XDNA_STATUS_OFFSET(get_telemetry));
 	if (ret) {
 		XDNA_ERR(xdna, "Failed to get telemetry, ret %d", ret);
 		return ret;
diff --git a/src/driver/amdxdna/amdxdna_mailbox_helper.h b/src/driver/amdxdna/amdxdna_mailbox_helper.h
index 20c1fe7b..e1c3f16f 100644
--- a/src/driver/amdxdna/amdxdna_mailbox_helper.h
+++ b/src/driver/amdxdna/amdxdna_mailbox_helper.h
@@ -37,6 +37,8 @@ struct xdna_notify {
 		.notify_cb = xdna_msg_cb,			\
 	}
 
+#define XDNA_STATUS_OFFSET(name) (offsetof(struct name##_resp, status) / sizeof(u32))
+
 int xdna_msg_cb(void *handle, const u32 *data, size_t size);
 int xdna_send_msg_wait(struct amdxdna_dev *xdna, struct mailbox_channel *chann,
 		       struct xdna_mailbox_msg *msg);

From ca8ecbf43bb62ff6e44185e5d6a34846e3ea1a7e Mon Sep 17 00:00:00 2001
From: Max Zhen <40219623+maxzhen@users.noreply.github.com>
Date: Mon, 16 Sep 2024 10:03:15 -0700
Subject: [PATCH 40/44] should use DRM_IOCTL_AMDXDNA_EXEC_CMD IOCTL for
 submitting wait and signal (#257)

* fix shim
* fix driver

---------

Signed-off-by: Max Zhen <max.zhen@amd.com>
---
 src/driver/amdxdna/aie2_ctx.c              | 16 ++--
 src/driver/amdxdna/aie2_debugfs.c          |  2 -
 src/include/uapi/drm_local/amdxdna_accel.h | 25 ------
 src/shim/device.cpp                        |  2 +-
 src/shim/fence.cpp                         | 99 ++++++++++------------
 src/shim/fence.h                           | 15 ++--
 src/shim/hwq.cpp                           |  6 +-
 src/shim/kmq/hwq.cpp                       | 21 +----
 src/shim/pcidev.cpp                        |  4 -
 test/shim_test/shim_test.cpp               |  8 +-
 10 files changed, 73 insertions(+), 125 deletions(-)

diff --git a/src/driver/amdxdna/aie2_ctx.c b/src/driver/amdxdna/aie2_ctx.c
index 3343aea2..1cab054c 100644
--- a/src/driver/amdxdna/aie2_ctx.c
+++ b/src/driver/amdxdna/aie2_ctx.c
@@ -47,12 +47,6 @@ aie2_hwctx_get_job(struct amdxdna_hwctx *hwctx, u64 seq)
 {
 	int idx;
 
-	/* Special sequence number for oldest fence if exist */
-	if (seq == AMDXDNA_INVALID_CMD_HANDLE) {
-		idx = get_job_idx(hwctx->submitted);
-		goto out;
-	}
-
 	if (seq >= hwctx->submitted)
 		return ERR_PTR(-EINVAL);
 
@@ -60,8 +54,6 @@ aie2_hwctx_get_job(struct amdxdna_hwctx *hwctx, u64 seq)
 		return NULL;
 
 	idx = get_job_idx(seq);
-
-out:
 	return hwctx->priv->pending[idx];
 }
 
@@ -977,10 +969,18 @@ int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job,
 		dma_resv_add_fence(job->bos[i]->resv, job->out_fence, DMA_RESV_USAGE_WRITE);
 	amdxdna_unlock_objects(job, &acquire_ctx);
 
+again:
 	mutex_lock(&hwctx->priv->io_lock);
 	ret = aie2_hwctx_add_job(hwctx, job);
 	if (ret) {
 		mutex_unlock(&hwctx->priv->io_lock);
+
+		if (ret == -EAGAIN) {
+			/* Wait for the first pending cmd to complete before trying again. */
+			int res = aie2_cmd_wait(hwctx, hwctx->submitted - HWCTX_MAX_CMDS, 0);
+			if (!res)
+				goto again;
+		}
 		goto signal_fence;
 	}
 
diff --git a/src/driver/amdxdna/aie2_debugfs.c b/src/driver/amdxdna/aie2_debugfs.c
index 4511ea45..4023e6b2 100644
--- a/src/driver/amdxdna/aie2_debugfs.c
+++ b/src/driver/amdxdna/aie2_debugfs.c
@@ -462,8 +462,6 @@ seq_printf(m, "%ld:%s\n", _name, #_name)
 	drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_WAIT_CMD);
 	drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_GET_INFO);
 	drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_SET_STATE);
-	drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL);
-	drm_ioctl_id_seq_print(DRM_IOCTL_AMDXDNA_SUBMIT_WAIT);
 
 	drm_ioctl_id_seq_print(DRM_IOCTL_GEM_CLOSE);
 	drm_ioctl_id_seq_print(DRM_IOCTL_PRIME_HANDLE_TO_FD);
diff --git a/src/include/uapi/drm_local/amdxdna_accel.h b/src/include/uapi/drm_local/amdxdna_accel.h
index a9d0146a..a3af52ba 100644
--- a/src/include/uapi/drm_local/amdxdna_accel.h
+++ b/src/include/uapi/drm_local/amdxdna_accel.h
@@ -17,7 +17,6 @@ extern "C" {
 #define AMDXDNA_DRIVER_MAJOR		1
 #define AMDXDNA_DRIVER_MINOR		0
 
-#define AMDXDNA_INVALID_CMD_HANDLE	(~0UL)
 #define AMDXDNA_INVALID_ADDR		(~0UL)
 #define AMDXDNA_INVALID_CTX_HANDLE	0
 #define AMDXDNA_INVALID_BO_HANDLE	0
@@ -49,8 +48,6 @@ enum amdxdna_drm_ioctl_id {
 	DRM_AMDXDNA_WAIT_CMD,
 	DRM_AMDXDNA_GET_INFO,
 	DRM_AMDXDNA_SET_STATE,
-	DRM_AMDXDNA_SUBMIT_WAIT,
-	DRM_AMDXDNA_SUBMIT_SIGNAL,
 	DRM_AMDXDNA_NUM_IOCTLS
 };
 
@@ -543,20 +540,6 @@ struct amdxdna_drm_set_state {
 	__u64 buffer; /* in */
 };
 
-
-/**
- * struct amdxdna_drm_syncobjs - Signal or wait on array of DRM timelined sync objects.
- * @handles: Array of handles of sync objects.
- * @points: Array of time points for each sync objects.
- * @count: Number of elements in the above array.
- */
-struct amdxdna_drm_syncobjs {
-	__u64 handles; /* in */
-	__u64 points; /* in */
-	__u32 count; /* in */
-	__u32 pad;
-};
-
 #define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \
 	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \
 		 struct amdxdna_drm_create_hwctx)
@@ -597,14 +580,6 @@ struct amdxdna_drm_syncobjs {
 	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \
 		 struct amdxdna_drm_set_state)
 
-#define DRM_IOCTL_AMDXDNA_SUBMIT_WAIT \
-	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_WAIT, \
-		 struct amdxdna_drm_syncobjs)
-
-#define DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL \
-	DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_SIGNAL, \
-		 struct amdxdna_drm_syncobjs)
-
 #if defined(__cplusplus)
 } /* extern c end */
 #endif
diff --git a/src/shim/device.cpp b/src/shim/device.cpp
index 278298d5..cef74159 100644
--- a/src/shim/device.cpp
+++ b/src/shim/device.cpp
@@ -262,7 +262,7 @@ struct partition_info
     for (uint32_t i = 0; i < data_size; i++) {
       const auto& entry = data[i];
 
-      xrt_core::query::aie_partition_info::data new_entry;
+      xrt_core::query::aie_partition_info::data new_entry{};
       new_entry.metadata.id = std::to_string(entry.context_id);
       new_entry.metadata.xclbin_uuid = "N/A";
       new_entry.start_col = entry.start_col;
diff --git a/src/shim/fence.cpp b/src/shim/fence.cpp
index b67ce0ef..26cb428f 100644
--- a/src/shim/fence.cpp
+++ b/src/shim/fence.cpp
@@ -107,28 +107,35 @@ wait_syncobj_available(const shim_xdna::pdev& dev,
 }
 
 void
-submit_wait_syncobjs(const shim_xdna::pdev& dev,
+submit_wait_syncobjs(const shim_xdna::pdev& dev, const shim_xdna::hw_ctx *ctx,
   const uint32_t* sobj_hdls, const uint64_t* points, uint32_t num)
 {
   wait_syncobj_available(dev, sobj_hdls, points, num);
 
-  amdxdna_drm_syncobjs swsobj = {
-    .handles = reinterpret_cast<uintptr_t>(sobj_hdls),
-    .points = reinterpret_cast<uintptr_t>(points),
-    .count = num,
+  amdxdna_drm_exec_cmd ecmd = {
+    .hwctx = ctx->get_slotidx(),
+    .type = AMDXDNA_CMD_SUBMIT_DEPENDENCY,
+    .cmd_handles = reinterpret_cast<uintptr_t>(sobj_hdls),
+    .args = reinterpret_cast<uintptr_t>(points),
+    .cmd_count = num,
+    .arg_count = num,
   };
-  dev.ioctl(DRM_IOCTL_AMDXDNA_SUBMIT_WAIT, &swsobj);
+  dev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd);
 }
 
 void
-submit_signal_syncobj(const shim_xdna::pdev& dev, uint32_t sobj_hdl, uint64_t point)
+submit_signal_syncobj(const shim_xdna::pdev& dev, const shim_xdna::hw_ctx *ctx,
+  uint32_t sobj_hdl, uint64_t point)
 {
-  amdxdna_drm_syncobjs sssobj = {
-    .handles = reinterpret_cast<uintptr_t>(&sobj_hdl),
-    .points = reinterpret_cast<uintptr_t>(&point),
-    .count = 1,
+  amdxdna_drm_exec_cmd ecmd = {
+    .hwctx = ctx->get_slotidx(),
+    .type = AMDXDNA_CMD_SUBMIT_SIGNAL,
+    .cmd_handles = reinterpret_cast<uintptr_t>(&sobj_hdl),
+    .args = reinterpret_cast<uintptr_t>(&point),
+    .cmd_count = 1,
+    .arg_count = 1,
   };
-  dev.ioctl(DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL, &sssobj);
+  dev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd);
 }
 
 }
@@ -199,25 +206,15 @@ clone() const
   return std::make_unique<fence>(*this);
 }
 
-void
+uint64_t
 fence::
-wait(bool async) const
+wait_next_state() const
 {
   std::lock_guard<std::mutex> guard(m_lock);
-  auto st = m_state;
 
-  if (st != initial_state && m_signaled)
+  if (m_state != initial_state && m_signaled)
     shim_err(-EINVAL, "Can't wait on fence that has been signaled before.");
-
-  st++;
-  shim_debug("%s for command fence %d@%ld",
-    async ? "Submitting wait" : "Waiting", m_syncobj_hdl, st);
-  if (async)
-    submit_wait_syncobjs(m_pdev, &m_syncobj_hdl, &st, 1);
-  else
-    wait_syncobj_done(m_pdev, m_syncobj_hdl, st);
-
-  m_state = st;
+  return ++m_state;
 }
 
 // Timeout value is ignored for now.
@@ -225,57 +222,54 @@ void
 fence::
 wait(uint32_t timeout_ms) const
 {
-  wait(false);
+  auto st = wait_next_state();
+  shim_debug("Waiting for command fence %d@%ld", m_syncobj_hdl, st);
+  wait_syncobj_done(m_pdev, m_syncobj_hdl, st);
 }
 
 void
 fence::
-submit_wait() const
+submit_wait(const hw_ctx *ctx) const
 {
-  wait(true);
+  auto st = wait_next_state();
+  shim_debug("Submitting wait for command fence %d@%ld", m_syncobj_hdl, st);
+  submit_wait_syncobjs(m_pdev, ctx, &m_syncobj_hdl, &st, 1);
 }
 
-void
+uint64_t
 fence::
-signal(bool async) const
+signal_next_state() const
 {
   std::lock_guard<std::mutex> guard(m_lock);
-  auto st = m_state;
 
-  if (st != initial_state && !m_signaled)
+  if (m_state != initial_state && !m_signaled)
     shim_err(-EINVAL, "Can't signal fence that has been waited before.");
-
-  if (st == initial_state)
+  if (m_state == initial_state)
     m_signaled = true;
-
-  st++;
-  shim_debug("%s command fence %d@%ld",
-    async ? "Submitting signal" : "Signaling", m_syncobj_hdl, st);
-  if (async)
-    submit_signal_syncobj(m_pdev, m_syncobj_hdl, st);
-  else
-    signal_syncobj(m_pdev, m_syncobj_hdl, st);
-
-  m_state = st;
+  return ++m_state;
 }
 
 void
 fence::
 signal() const
 {
-  signal(false);
+  auto st = signal_next_state();
+  shim_debug("Signaling command fence %d@%ld", m_syncobj_hdl, st);
+  signal_syncobj(m_pdev, m_syncobj_hdl, st);
 }
 
 void
 fence::
-submit_signal() const
+submit_signal(const hw_ctx *ctx) const
 {
-  signal(true);
+  auto st = signal_next_state();
+  shim_debug("Submitting signal command fence %d@%ld", m_syncobj_hdl, st);
+  submit_signal_syncobj(m_pdev, ctx, m_syncobj_hdl, st);
 }
 
 void
 fence::
-submit_wait(const pdev& dev, const std::vector<xrt_core::fence_handle*>& fences)
+submit_wait(const pdev& dev, const hw_ctx *ctx, const std::vector<xrt_core::fence_handle*>& fences)
 {
   constexpr int max_fences = 1024;
   uint32_t hdls[max_fences];
@@ -287,12 +281,13 @@ submit_wait(const pdev& dev, const std::vector<xrt_core::fence_handle*>& fences)
 
   for (auto f : fences) {
     auto fh = static_cast<const fence*>(f);
-    std::lock_guard<std::mutex> guard(fh->m_lock);
+    auto st = fh->wait_next_state();
+    shim_debug("Waiting for command fence %d@%ld", fh->m_syncobj_hdl, st);
     hdls[i] = fh->m_syncobj_hdl;
-    pts[i] = ++fh->m_state;
+    pts[i] = st;
     i++;
   }
-  submit_wait_syncobjs(dev, hdls, pts, i);
+  submit_wait_syncobjs(dev, ctx, hdls, pts, i);
 }
 
 } // shim_xdna
diff --git a/src/shim/fence.h b/src/shim/fence.h
index fe3ff295..1b6cdbca 100644
--- a/src/shim/fence.h
+++ b/src/shim/fence.h
@@ -4,6 +4,7 @@
 #ifndef _FENCE_XDNA_H_
 #define _FENCE_XDNA_H_
 
+#include "hwctx.h"
 #include "device.h"
 #include "shared.h"
 
@@ -41,20 +42,20 @@ class fence : public xrt_core::fence_handle
 
 public:
   void
-  submit_wait() const;
+  submit_wait(const hw_ctx*) const;
 
   static void
-  submit_wait(const pdev& dev, const std::vector<xrt_core::fence_handle*>& fences);
+  submit_wait(const pdev& dev, const hw_ctx*, const std::vector<xrt_core::fence_handle*>& fences);
 
   void
-  submit_signal() const;
+  submit_signal(const hw_ctx*) const;
 
 private:
-  void
-  wait(bool async) const;
+  uint64_t
+  wait_next_state() const;
 
-  void
-  signal(bool async) const;
+  uint64_t
+  signal_next_state() const;
 
   const pdev& m_pdev;
   const std::unique_ptr<xrt_core::shared_handle> m_import;
diff --git a/src/shim/hwq.cpp b/src/shim/hwq.cpp
index 6debbab5..14a31a52 100644
--- a/src/shim/hwq.cpp
+++ b/src/shim/hwq.cpp
@@ -113,14 +113,14 @@ hw_q::
 submit_wait(const xrt_core::fence_handle* f)
 {
   auto fh = static_cast<const fence*>(f);
-  fh->submit_wait();
+  fh->submit_wait(m_hwctx);
 }
 
 void
 hw_q::
 submit_wait(const std::vector<xrt_core::fence_handle*>& fences)
 {
-  fence::submit_wait(m_pdev, fences);
+  fence::submit_wait(m_pdev, m_hwctx, fences);
 }
 
 void
@@ -128,7 +128,7 @@ hw_q::
 submit_signal(const xrt_core::fence_handle* f)
 {
   auto fh = static_cast<const fence*>(f);
-  fh->submit_signal();
+  fh->submit_signal(m_hwctx);
 }
 
 } // shim_xdna
diff --git a/src/shim/kmq/hwq.cpp b/src/shim/kmq/hwq.cpp
index b785aac6..ebb292dd 100644
--- a/src/shim/kmq/hwq.cpp
+++ b/src/shim/kmq/hwq.cpp
@@ -31,30 +31,13 @@ issue_command(xrt_core::buffer_handle *cmd_bo)
 
   amdxdna_drm_exec_cmd ecmd = {
     .hwctx = m_hwctx->get_slotidx(),
+    .type = AMDXDNA_CMD_SUBMIT_EXEC_BUF,
     .cmd_handles = cmd_bo_hdl,
     .args = reinterpret_cast<uintptr_t>(arg_bo_hdls),
     .cmd_count = 1,
     .arg_count = static_cast<uint32_t>(boh->get_arg_bo_handles(arg_bo_hdls, max_arg_bos)),
   };
-
-  int ret = EAGAIN;
-  while (ret == EAGAIN) {
-    try {
-      m_pdev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd);
-      ret = 0;
-    }
-    catch (const xrt_core::system_error& ex) {
-      ret = ex.get_code();
-      if (ret != EAGAIN)
-        throw;
-      amdxdna_drm_wait_cmd wcmd = {
-        .hwctx = ecmd.hwctx,
-        .timeout = 0, // Infinite waiting
-        .seq = AMDXDNA_INVALID_CMD_HANDLE, // Wait for free slot
-      };
-      m_pdev.ioctl(DRM_IOCTL_AMDXDNA_WAIT_CMD, &wcmd);
-    }
-  }
+  m_pdev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd);
 
   auto id = ecmd.seq;
   boh->set_cmd_id(id);
diff --git a/src/shim/pcidev.cpp b/src/shim/pcidev.cpp
index 4b9f17e6..faa089a4 100644
--- a/src/shim/pcidev.cpp
+++ b/src/shim/pcidev.cpp
@@ -34,10 +34,6 @@ namespace {
       return "DRM_IOCTL_AMDXDNA_GET_INFO";
     case DRM_IOCTL_AMDXDNA_SET_STATE:
       return "DRM_IOCTL_AMDXDNA_SET_STATE";
-    case DRM_IOCTL_AMDXDNA_SUBMIT_WAIT:
-      return "DRM_IOCTL_AMDXDNA_SUBMIT_WAIT";
-    case DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL:
-      return "DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL";
     case DRM_IOCTL_GEM_CLOSE:
       return "DRM_IOCTL_GEM_CLOSE";
     case DRM_IOCTL_PRIME_HANDLE_TO_FD:
diff --git a/test/shim_test/shim_test.cpp b/test/shim_test/shim_test.cpp
index 5ccbbdcd..b1c38aea 100644
--- a/test/shim_test/shim_test.cpp
+++ b/test/shim_test/shim_test.cpp
@@ -535,7 +535,7 @@ std::vector<test_case> test_list {
   test_case{ "multi-command io test real kernel good run",
     TEST_POSITIVE, dev_filter_is_aie2, TEST_io, { IO_TEST_NORMAL_RUN, 3 }
   },
-  test_case{ "measure no-op kernel throughput listed command",
+  test_case{ "measure no-op kernel throughput chained command",
     TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_throughput, { IO_TEST_NOOP_RUN, IO_TEST_IOCTL_WAIT, 32000 }
   },
   test_case{ "npu3 shim vadd",
@@ -565,7 +565,7 @@ std::vector<test_case> test_list {
   test_case{ "io test no op with duplicated BOs",
     TEST_POSITIVE, dev_filter_is_aie2, TEST_noop_io_with_dup_bo, {}
   },
-  test_case{ "measure no-op kernel latency listed command",
+  test_case{ "measure no-op kernel latency chained command",
     TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_latency, { IO_TEST_NOOP_RUN, IO_TEST_IOCTL_WAIT, 32000 }
   },
   test_case{ "measure no-op kernel throuput",
@@ -577,10 +577,10 @@ std::vector<test_case> test_list {
   test_case{ "measure no-op kernel throuput (polling)",
     TEST_POSITIVE, dev_filter_is_aie2, TEST_io_throughput, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 }
   },
-  test_case{ "measure no-op kernel latency listed command (polling)",
+  test_case{ "measure no-op kernel latency chained command (polling)",
     TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_latency, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 }
   },
-  test_case{ "measure no-op kernel throughput listed command (polling)",
+  test_case{ "measure no-op kernel throughput chained command (polling)",
     TEST_POSITIVE, dev_filter_is_aie2, TEST_io_runlist_throughput, { IO_TEST_NOOP_RUN, IO_TEST_POLL_WAIT, 32000 }
   },
 };

From 2ee0fd63eba134aec9d1e7cae4c5c56438dedcb0 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Mon, 16 Sep 2024 10:46:44 -0700
Subject: [PATCH 41/44] fix CID-468941 (#258)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/amdxdna/aie2_smu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/driver/amdxdna/aie2_smu.c b/src/driver/amdxdna/aie2_smu.c
index dce72de0..c01a44ba 100644
--- a/src/driver/amdxdna/aie2_smu.c
+++ b/src/driver/amdxdna/aie2_smu.c
@@ -163,7 +163,7 @@ int aie2_smu_set_dpm_level(struct amdxdna_dev_hdl *ndev, u32 dpm_level)
 {
 	int ret;
 
-	if (aie2_control_flags && BIT(AIE2_BIT_BYPASS_SET_FREQ)) {
+	if (aie2_control_flags & BIT(AIE2_BIT_BYPASS_SET_FREQ)) {
 		XDNA_DBG(ndev->xdna, "Bypassed set dpm level");
 		return 0;
 	}

From 6de1490549ae3831c0e425604c0c9e5e50eeb746 Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Mon, 16 Sep 2024 13:00:43 -0700
Subject: [PATCH 42/44] add validate.xclbin for npu6 (#259)

* add validate.xclbin for npu6
* update license for npu6 validate.xclbin

---------

Signed-off-by: Min Ma <min.ma@amd.com>
---
 WHENCE                             |   1 +
 tools/bins/17f0_20/validate.xclbin | Bin 0 -> 72417 bytes
 2 files changed, 1 insertion(+)
 create mode 100644 tools/bins/17f0_20/validate.xclbin

diff --git a/WHENCE b/WHENCE
index 46fe77ea..800b3567 100644
--- a/WHENCE
+++ b/WHENCE
@@ -11,5 +11,6 @@ File:
 	tools/bins/1502_00/validate.xclbin
 	tools/bins/17f0_10/validate.xclbin
 	tools/bins/17f0_11/validate.xclbin
+	tools/bins/17f0_20/validate.xclbin
 
 Licence: Redistributable. See LICENSE.amdnpu for details.
diff --git a/tools/bins/17f0_20/validate.xclbin b/tools/bins/17f0_20/validate.xclbin
new file mode 100644
index 0000000000000000000000000000000000000000..9c66f31bf69dfc5256365dcdf02dfe4d02a82042
GIT binary patch
literal 72417
zcmeHwOKc=bdS1_HSCU7Nu`75Ddu;=WvJb-UkgONUCegi84>=EFn%#6aduD)PBb`;1
zBx|RtvNo%#o7_eNI_hY^2fr?c4~AjDfQ}NR!}?|zHsGT#1}p)-`Jm$-Y>dAz9+{Du
z5mmuQaR$3p4R%IG#{b9vi1_1?k^g_u9d!Dm=Dn|1zqu0l!*~AOKe>nBU;gkNUhe68
zafAOpd-vVn`OZE0Oy}`>(I50j7vZZ$*dNcrdRR9fdZhpN|6Tu&+k5}*zyIz#2Y=T8
z!SDX_GRW^38NJzXco}y3)5$cAqoH{6N55edK=L>7H;jkjG#Mv@<m^&E)#biW8NaO`
zl&=3n;k!`R-zq)Vw@2#1U%V@G{w6N3qF!$u;`l1;%=&|#c<{aV<nzB$6|#6%PCgR<
z|MK_b&BI6X`nPczN8PWYv$*&Cax#sF`{Kd<WBL65{6Jp+p1f54|NWoJtm${<^@o)b
zGUMMQ@~Y+XcX1o_<FK2IrfD+ZRTTK8D)$#nSw4{E<c;zFpZ;5U^M#V163@MNv!;_J
zzQdBAJbkV>l5knMU&wp>FUmcC_UJInK|;3+VtYrRlmez~cllq~FpGPqsaBwSM|YO}
zqZi@P{y|vZJve&tS!3_X6K_<apv3o11z+Pv_};DHYu*UoZ&dKLZiMeU6@2X*;rng{
z-{y_*{bmK<){XG}Rt4Yojqv?;1>gM};A4|6wC86}KRS5);k7XCmYRRon~|^kI*W*Y
zsmd~6m!nzGX?AR#rkT<(st+PZ=4rgV>NN4_H1X&(@i0wX#G~_huS}B2(`jCHns{`Y
zc$n6BbUyEeLBf@|b(+t0npd4B9;P)OozHtd<BxcBns{`YWp$cY{%JfqpZ6T|_#+;j
zCLWz89-Sr}(;AP?*Y|iN@KARA5syw2k4_U0(;80a^Ingwq(`TTN2iHLr-_GYjYsG6
z{%_!qKjP78;?Zg1(P`pgTI13Ayyp<iD}QfllkRyujKWuOI_W2)+U}d$F#2hd)^=-+
z+WWO(e<ad%q{q>;dyf2i?d|(DF{FpXcpCMhX+*HzL|k6Qqn-efQz5{0k|^zk{T@JH
z5$7lx#z<p=gd0_Fs4HbfS)NgOFP?PM{&*^AtfA0ZJc`q38u!A^C979hKkH+(OL8Qr
zs9A3|)|>0Z+#TS))oiqyTfCk2hjBQWM#C|(50g=F7+nU9?V!=xZEx?^+W|oZw`9Sn
zMc`L_B0Y}MDKH=<dDZuj(`g({XK5U!$#7EJ{ql{hGae^el!5Vk34jYRLuXwM4qyHF
z$D(?ruL^q=#>m84eRHj`z1D2*Htz2>T0oEpS%Rk8$;LQMetNQTA#^%9*?9i>!O_Wv
z6kAx|Y}^m)JN3rNMt=lZ4F)KIS>0CqWaG2`w2wPk@?--mi<6Dn=ye*6$8is(Mi*P{
z_2gs&BnNA2{n2<f4Nv=ncoKvc>*ue-csN@x&X|D;|3XwIW`!3C;5dcEg#-`#KaYbt
zZ)Q^<3)WF7=aylwpL!uAu5~G5U>Ogh>1mP<13M8YkwR1HG#tdQ;(@4m61|GoF#Q})
z@N^obXYn+UobVp`B!zS>;t<7WlQ_jXA!H>Xde`|RIN5leeg##W_s)Bxr1RBm+&tMB
zM!oadWCn@uJ%hN<<G~;aUneQ}zDTFgi$oEyr3##Be1?7^+neeaPxl^&&p!P4;OUc4
zG?2?N`(Ss_MeQt%79rF->c)Z;)tsYA9AfsE#uK&_6#6PoNAaM5nwr8B3aXHQ7(V*q
zApH37!$UkBCOuX-j4*dkai2z`GiKB;K2kqq(v*a+`@Jd6OslDohEqwbq9f`IVm#@9
zGyzX@ly(2-qRzdiPmZ5GeT-77+Q|TF!);z`qh6;Ru3MwgX)DQ>@7K^SX2W<?lt+I=
z>!-K~e>RH-{nI{#AuX-2k0gn5L|#F96SRB5?0pnSPERK>i_|X~1UOG7)6mGw@(NH&
zh_8}<??<M@up6+5Dw2IAs`hS775aC`W-?b=x`Fsgt4ZnIOj<&ucCsjTp+s6&C9ZOF
z%rw$Q8AD|)-Y=$%rIa3Lqe=g46!(JuXj*$)X@f=vnxpR7h3+X=g$rP<m|qrhfnTr8
zi$k$Av$JV`0J5|BMXkBsXsmDU2wg-I_Vl<HceY#g&Q2$4b{f5%UTdq@?sU34JMn3I
zv)zg7&8>E`y+!WL5&QGF_kPgW5u-N8prF3F+uGi3)q@|Scf&5=r}i5j{K3U=5GW!Z
z)*6@%1o5bw^!lT-hqc-CbnSla4}S9A1CE4ocN)-YKCFo`h(kv0C+`KpgKW5vbDW1Y
zHN^?UsNe4apixtxoZSR!@PC#Jk3siC%$$e^m=qy_nDsC}5ECt?HdGN~!lSc9H41+X
z%y0tou!d^JXA!Ov5ZKWq;`B7?#uEj}ydkzuCc22u$dIvBJ*<7Qe{`S;5OfNtQlUei
zLXQiD9&8k=f6(t4lt@N0QbAiUe%0^BX3k{RnTTzlEE6~tk_Q_?FiLK6K5#lnUaO)H
z*sp>p?Vk6iP%S27Q26MpQSy4Ec~uoEMk!8cgqM9m2=v-dr}4%1?nQU|)kXXM=5+V;
z^sLd?+}>S-#%ZKJ=ctuy7)*4NG*%c3gKr>b4G(K#+E5Fm=#7y$=nj(ZS0APE&w%gn
zlb@qW&4zo)5EGvG#R%Jd4{Mmc8UmDK@*bfK4U+_B1++yEYx|FmABB65KLumJJVWEZ
zd|2D8*Voxq>Y|=ZwgBC*BYhyV6<=cH4+a>@XJ}myYhAR;!RVFl80+0+xO+aBhz+P+
z(dWi#d>W@|-1|I9zxp^$X5;4=+74%fX@5KrJtV-yCVzsiS_@zvX6U8UbQafW?|@)7
z21_#I{PmvPoI6UBY0^yw4{Oml>`qhc)}2TFks&GYg$coIp!Fi?&jl%(e5F8m)E~z%
z0+_?cQ4i$Mnm!Ts;Vd18!i*M+gr~Exv9+yL`y?6dV`9T5`N{JFHKM|FIyi!clt_nR
zbkPsbxM%z5^UoeVKG?4XNryJ+VU4-9047SYDaz_LOkDE!pono2Utj_|B0U<Bk(FbE
z&_>B9UWgA;A9V%75W>(Xi@8_FiXHRL>=bPy?4j>ztXZzfHVB0vlJv4n>VplbysK%n
zxmcr3q0#NFZ<j{5+qs~WMwxYW*%x)EWnav_I@VWBt44KhiO}29s5D9Qx~S-|?4kEJ
zucgnTTy9NTS^NLlwc9_;Te0isDkvbmk_xIwIXAcU)jWSmx_zBu1fSnqMG?_kcUsrd
z?$Ye<4(yC>EULFM>+1b#N$oG!URY=>d47pl{~l|z%MUm!QDdI5)D5^`f2g+!0<m7l
zEGXY32)JTblP&BJh{?^dn7o?X0|9pj2$#*L)zE0vT3nl{iseQ@Ua#0DxM7{x3{Z@6
z2SCggv7Pdd_4;2-akzLuzn4=rQyyF~l9BspJ!0L@8!Y<+&T5}#IVyvbDOFldE9G+E
znAo8^oAf|}N^VtpK&vl3d$&Um+gdt@BBk$#=H^IXl@ONtw;~}Tw-E$+Q)RF*QbeW3
zq-o*^o#_aR{P{HDxzt)=abHv(rqpBXpcbaj!G&l;n9oCHKj;5Nm|=8rrSdpWfMszw
z<xX1>lio-$=Kq)kmgZY*P|xsAVVw8Edi_2Z8MR`^!o5v;#VX~lGqxqoS{{6K@X^zQ
z5%+E*z7jA%^vWeKj+NMGgb6gt1(UQZtp!(SHQpXzHvh0zT%jTRNGzI0NjGdYagO7#
z(c;b`)*_RsL?9PJB157O^9m>cB_wdMO_(I!jG(PH2zp*`UYQ`nu9(ymmUP(*{#tQe
zCmUuDyBfJ*VN0;Kz&1qOTccSHuu7NZAcj+{5-9Qr>oF)AkfE#yZ-S%^E8FJWE+u2^
z%*^X|K&ZP|!7#Fk`5DF;){zk8wA58{8v3VLYt6Hbb^^TZ4%Vh=@FQpSc!d<rBLJq*
zN`U{+uz-MMF8?rm_Udnof3eJ;qX5`SD*;XK7O;SSn!0q>oz9^F>jygxw<Uy8-%SJf
zWJki9RMP;F7=z6sIpl*4L=79~?g>L|>o_|Cb`^B}g`T)@utRBSE3A4b4K(_&k=b1=
zhTdw)V2cVnR7<dA<pDRhA}dSAm*Xo-Hm3)zEZNGENyC-Ttcu(0dQOHhf7zC-X<FFA
zT5)B`R+fyfURIWDH764<ziu)o!}4L-maMh1WUJMsvCUB!%vP&QY!k0mmuegHCYB7d
z_hnnM_R5m2EZNGE&DktnS+ZNOWZ2DIwk6wKS+bQSTUoM|C9}Nxy2TxTyc1crCEHqA
zvXv!US+bQSTdpO;+tp=Tvh9^6TUoM|C0kjt<ytZv_gJ<iyT7tzD@(SrWGhRyL`x=J
z1zpFM`J9K%572(QP1)k3kvMc`Jr;O}CnJp$vW1h83s1=UpNzygzgIYCSv?%74re+~
z$)bu|I;Kn#AZ#ZIsL85`hW95M3r|Xu1PI$ng7cW{?MeZ{c9LM6k}aH4ZQiaFz>t%K
zn;(>QDB>C$XAd^``GY)e_FzMPZO^+08`{MITp<XrK=^FpCv>6k=DnaMKfs57o9GND
zIt|}X0iNP<$&r-E4YW6@payRpbbg>+N(J~TKxQbvQp`Ysx6G_v(3zCNpgl?<S9sS{
zVA9nRJ#GZTuarP8z1;-Txs=GsoJmQqR(ZAA1L<Wde_(iCAnub9za2Y1?@t0P9rX$S
zB*2^FfX@|4MUPFA{6Kl83ToOl70Ck0+Tj)?lR2o8kLaZ;$lX-QOw;p4;C5EU!a{^W
zEqWF82mFQyAFfZ5Sqgs;TBbqm@rz?oNxij2M>w($I0#~<zWo4BCVAhex3VJp&ySeZ
z#P>Y9*b7yR+S7H*HkG*@1<0483bqvdgrk&9;<)FbY~tPLSz$>{Kwm~eyO$tb=h2vb
z`T>j9FPi+ONRtr=k3hMzA{G!I9hpU@(HTKMftSr3;{YyrGN#dv$Hzt~L9Z03(x)aK
zUU3vNbBQ0p8JQ|qqCLy!$BIS5bEXu<^!J&SpSUO%r6k4t|D~w7hV~3sF(J>qz*mgc
zgm8Vdha4Q~GYSir5o{-4zC^C-jLIdM>W}kVxo}S=Xjt6DHJb&uF0wyV9F|Fxnb)$Q
zChU`TS|)s!Ra};39?PV|GU=~Om7=$@+*z6SRVH1P87{rDF7I>^R`3XBap^^mUIm9<
znLjVO^D2Au%A9$v4GIp$Jp73z>%Pv=q;oaX7uq6kWtlrR)d-cFx=`C&e*7kU{p#nl
znY7HUcq=+Rjhbt5dwYAWeSd3f?S6OX6t_`tD~{`}Ru|60q;g46cpX6He<*|9tnW0U
zW}~^*X>D(=wRbx8wP-uqT8rYHt+>-_)}t+W1Cy1iQB8eU8PZD!TK{P@n8YlgY-!Q0
zTFIDcArw9$)SX@%2gAXOexyfk<#5fgLGnDf=!SyIK-(UeN?n`qbf8cnJDy;K5dILW
z)?S!gW$|}573U&TS^P#5uBB8>z@-hpnWqfDnN=|SM5pW|Qo;e2Nx#8?!GrNdVqQI_
z2*Oz$=MK_aTt-=-J-E@j=Z@oy=cYq9j-dv<k|^a(4#T5#YbV{w85pEvIaZ*}6Ilyt
z@arc(BIRhY-%npCA3_;u&V8;egV*Bcpf+}um%0pLfm`XY&Up4sa;nLAnXzHIyi?tS
zQd8r-5TG;&MWO1(EPUf>tPq+~j_D;2^N@-%sR~CM;$X|i{B%B}5}4&Da`Fk5g61PG
z$udWJ)dnCQyU^-xE<d<8v0k^`NO2W;&c#)1*moo@`1Lb%^lcPZQ7l?q(yxF?``rkP
z4hO&G0)w+cgF-DZ(aOySR(TbB4?pE+w{qLLyziK!O|eVT6QV)T)!JAQR(HvQHw6><
z;-0A~G3=a}AZPQQoLKl$g|ju`<~XQlwaDCTK21CLRwC6XYn*w&vs*Z$Yc~i`E|d(@
zSzau`h^a5>wYo}Ef$l7$PdO#X>8Y+Yq-Tb_L<Aps(V(zq{8AH(s;|#~UIk_-@ib(E
z5rj(d<d3nwDOIoZP!*kY?C`2Tg%4;i$&E0zXrOF9FlB2`QxUmd^>WW_>{|ZV`4Vnk
zrPlR^29=BvEIrG#P#oQ|>CItzdXvrV%<3~Sv2%%~8u7o)#YL8yc9xWno~-N@m@!E(
zc`@_#Jk8A0ZPLthu%NZ7$;))2TD&p#)t0F@qn@(f%<*KUH#WS?faOiFtT#hwOVyiE
zPf>5>@qVSblIGM9Brd|Jip`$)FgCq?Pk79ois%6Oe>z?Oa~OX>|Cq0zF<++{mq4c(
zmq4c(mw;*h>39i9ihGThSDhvvohBZgCLX3W9-Ys7!tlpDo#s`iiASf2hiQ#R=kuQB
z_#+;j=5w9qRi}xEX^ltc^PbQ6BOaY59-U@co#vH)8jsHBy^c*mJUUH0I!!z}O*p1C
z9-Xi6@kr34<5UojP7{w#6OYytuR5Q2bZiRJqtnEr)5N3G#KW|%U+42)$EF}2ohBZg
zCLWz89;P)OozHtSt_5zzkNNsX{QAdy{fzlK&3bj3_3AY1Wtx9l9!QFNjh9!QCLWz8
z9-Ss0rZpa&&wIk~$2^_pRi}wZr-_GYjYsG6p5^!>9-ZcMo#s`iiHB*8N9XgN&-f!A
zohBZgW?7x)m46zK&gZ>ud&HyD#G}*1qtk?ATI13A`W}x2J-Qtdk4_VhP7{yT6R$d-
zcy!w%JvvQ1I!!z}O*~BN`gK0<b=xBzohBZgCLWz89;P)OozHu--4YjnS==uHA+m_>
z+VR1z9p{C?&k9jq(mvi?F%H^kAMaA$(2F%*2*8Uw7GJgE>Y~<c?CjK=t>z|v7_4if
zaX)|0!Ray<KZDtD6i&|jr;KRO#O7ozO<w02ECl!0h@bKr%LKSPlD86$IrJkBa8Uxc
zERR`l2(R-Dx&lmaw@uWT<mC?xjLavK6#x>PJum>sCj-17_DM1d`2fU_lH!8|Vj4u3
zvVK+MOF<rMByz?Hz#C)Z$M`R(Qtd^jVfMD;O|6qAQLl>yLl}?5J3ZmRZnq{js(}cv
zWXI<r{G%c!PLhU*SHgEa>M_uY=b#c{WmvM#6nB^eMRLy0m!^12G?@Y&v6E;h2SAc;
zidO%T2lJ2on1AFg|1($=NrVg#6-cF4rNUZFDb2sPV3%^Be&h;hsjO5gyF0ejwhUkF
zeks}?+u-&7EK(fOEjwc@g>VRYc7_+$tVXxEZ{}(tb3DjA3!?Exj9|kC4Px;oyN0Lq
zs6K#}LatXu_FMFbV_1koL@I)-OLm4w2OoYaV%r=ig(nEvBzg?6>He`kj>Gmv8}97~
zb^MpcY@f6rD1+{c8`*PI??Q7>DISra`+0S%kA&e?#!SMtNAh}o4;Y^%uVq}Dg>|!^
zEUKGjmQgo*{Gz&9W*K$Ux-6=jWp20b#sYm)(#z7ftlO>cyzblAx2)T(@4W8Y*SD<O
zt?#_<+t+t<fxgM@vh*$MCU^FBp4WZ*`j&OO^_|zv)a~{$S+`r?dEK|K@74l+v&LoV
zTh{H?cV74H>s!|C)^}d_?dx0C%^K|OJg=LntF|?B!hm__bGY4&dmo5UBi<u@PMY3O
zNBm0$D|v!>21f7U^y16<OO$eq`@LbbcOKzg1=k4(uqleK_eTW6Jv^ZDnVJ<CZ>4IJ
z=5WZUemO*=1i_{-2Ld~x3o988n-ilFf-I`V0@5rI#&%*7W&vrHK&ajYAng(e=eku8
zOz)htHUOw*=GG<Hzlts!FKxOk3&7N+geXh7%zqW4EaWo#Rfw{Txxh9BR)Vm#mokXd
z<V|G^Rty=l8RvG@(qEcZ8FS+8;zmN0WjwFH0K&$9xe~J_E09ICSU}3EEH@VmNLiQV
z_F@6KoBjgAuA<A9RhMM}m@QL6l%-tezY0+n^1S{c>nO{(-1tj%*xE}OM5^Vc{RNsd
zWK3br?JsSsjLS{g<VHf2WjwFH0K&$9xe~KkDv(9BSU}3EEH@VmNLiQV_F@6KoBpzS
zHC=96by*gGsY?k_mU5Z@Dnwbx^ZJXdqb%cc<1f`=YcFLGsg|4em#u<~$@<*>vSpQV
zxhb36NQknG=k*sr*!VA3Vm3<!vZxjdNLiKT=3)UU>$2QlEFgE&U$(EN%WbPJ%K|WU
zDIv;IF7sc7C<}RBf01>RWn6Cjr8;cwr3@m~^0pY17mJ2x$46=p^y9;Wr()%CHtZ*J
zxBl0oqcofh5Pdc5*5#UJkev0qI1r{@5*&PV7#<$8&C2t2;+24S0aIA~s5K4Wk90(g
zYl<J9lVf~2Cf++xxSd&w7aD_jbT-AiixGlmb}kX&4KHi>p2xATi^uYz2rLlejR!z`
z@dfe`j1_#7NgVM~?>E>XOYb+<JByh7{Nv+mF-fLOCbx0FJSK7IOEF2|mYM7>V)AKO
zlN6PQCaHUqNmjl*CR-Vk6mFTxo*$F!m#VpXGAWAPS#xb?Op?+vlZ8GBBlUjG+9ye}
zvrqEjH{2&TGbU-l%1jpeq>V}0C$G-r-S)|?j7jmTTJ8^JFOCX*(#E9hlLnJa7W<@C
zlXu%Ew=*Vb;L4gT^hr09Yy{3eX=PH#;?DZy{ftQ-F(@-x=#!WRIQk@Qqq9%);WykT
zcQPh-EKC;qq>V}0C$G-r-S$a*=A#Ene(_V$WuaHv*p$7}V6)mQt-8G1UfD1vJtVN)
zWDEV$%_i-ZvtL@-yxV@+$frF>R@p4{OiZ^OJ(Eq)*)#d@8}6BneBy&-mCZumw6Q7s
z=GECGggL!i?i{v}PkoTAvRUYxHa2D7G}x^6O+Kt>^N#ywKKW_bW<Q0#>1LC<aQ00r
zn|IYWvF<v0tX7%h!J<{>v^%x0E|2%b(y}<?5vm$Qui~ev`ea5P=QCR=Ps?D%xM<}k
zDz?o@P;s^4h#cRFbF*Z#N}^($(M829U*uTi_*R@73buf<)fW}pS`;c~`C`Q)XK}@l
z_1?bRQaHuAC+sbJDb7lEdkMeXil^)Ft^njbKgql}O>rm@uQ_}8#{aDJP<~N%hCsfR
zM?A^d9mG*u0*MEwldw0m<&Mj_eSBBPQ<&3!w^Uv{!AITVENkhRJY`wR?8Tj#q`vT0
z8PfQW=Pf_x$x6uL*W-*OUee7wR>@9l{;@C3`(u-z_s5nw@6W7@>2h6V+(lfbKAC=r
z_ln>TTgIYoYvdZKMy8S6#dFrpdReDt(C@y&9QM|0`|&I6^6evRGaD>UKYjo&G>@cD
z1fE1t-gT6V9TxZ~jej<aM_qhs370@{yaO!9Y1EC?_9D0CP;5_hr^!c2Djq%>VQfoi
zC0I{&C(^o=LH<nt26W_r+OnO!5>UQf=KvD6xdfDNe>s4JU9|#<ZC*$&eBJL&&moSS
zJTaXq0m&^>>d#S=m>iUV^6d@>kl1M`0p;5V4j{2|w*rY>Wjjw|C0qiMJhA;bY7#5U
z5>UQebO6bfS_vv&eYv2-!p925#f%He)+i)f>|!n`TdT;Y+Ab(tv&d(zE+|{OkWicm
zR-n*Bs&lZCSkwfo0&7g3st}H*%juR)h-ymXsKX{eH5aKsyv0v^kH~Ld{*?dyA^*E~
z?}z{E2lu}F?!EVR>hfNE9D^jkr}>4<x|RQl@0a?coLj*7KY?SvRr~%Qzdn8OU;q3+
z{7Zg)#X9~G{&;QUkH3HRNB`w-$&~#6VE^bS_+D^Nv1>lczYr?qF21?=(SQ0^tdI5T
zA17>ZBXkz)jC=R~ex59*{`K$g{DZ%PtRLWySM&F^`W+(^-*DxhGns{Mn2x^w`u|by
zn2Cyy!oG*}zyGs8`TE^=@D*0@4+UT3P2RUk_j{%L=h?kUmJYp}$z2KS7pl+&+6KS*
z`fGL1$A64GUjOMWwi}85```Z)ApXy|A^v}H3;4g-d;Gz{lko81*|Vq5!u?0bkHY82
zkDeW0`+KqTIO_3z-<#TDg!hIR8!v?SIF1z0;f5}n&eHg?c;km>r|O4?pT?IT(1V7N
ztDml}zg`kRYj`p0W3G9ykJiwx*Bj9myi}?mo*wsy@$+dk9OH3gtJ!GpH0yNrRr@Fw
z?dw@GG>dRH`I&aNSv#2lQFC2fK3Lz};qLySa5yHuoI0RK#8b>_U%U|2>uOzBKpzdF
z>1mP<Kl|XxXP~I)DX<>0`Y*bJ5Bej8GTuDxHqlR7+iiHi>(%SMI#}rJZ0$5Q>%AS0
z?fYD50s`nb@tsJ|V`1LAKE*q4JS6TX@+DJmHXL4t9XOE@uSoFV>FFt!;w*L1y3dl&
zg-<Nu*=!zCPMQ=_F2JYBIKiCYaxTVAig6rW4)B>Gdw&6t_gE{=#3zZ=6TQa{Lk{=Y
zIp9OJsdf0_VYn|`T_R7q{}c<kVMyOW^!o|ox%MD8fTE*L<;@efrLRxx7ic=7ERO?U
zt&VQ73h91xZWS*vRhgqGibu&5DN{2&KFT!H+AayQKMwI3Db9fisb*5dW<Mo>r&TE)
z9EG2L`0UAtk7bdO{$>=iQc!H;>1;feM=_)@feJez(=_2h^hZbG-tn`?;oc{Y4xR`!
z72ztsK_EhHInme9u5vh3kT~c$dTx?5nRS%HF}@1y`ct@wmFrC9<w$;}O10Yv86{KM
zQYa2ZNj7TlW}#h|LXebZNiy>YYAl%`*|)AhP;<!yaV1shnDZpmS~5Xgx?hQ)_L2$W
zy8B85Z7!K0TK_8%w6$b{I38Y!pzS3SL<@Z-g6{hfG)|Lo495cE8)Y@lUD4>t_!LHU
z3x*}i);cV0Zx$vD%}y63jqx2r#fXy<>3Yo^-^EOT!@Vq!U=D*Jo-V}nF}-ZOkpo*X
zU(E}QOqy;JJRAA02QF=7Hmn*IL95rVT~M_c+*P|LxU{EL;yuBo^}9B>v{jzqJd0UW
zob1S+;PUf#AH55aUSIN_;4aB~g1aQ|3GR}-72IF)R&amG3z=MB@}A%>$$NskB<~6C
zlDrk%U-DLPf5{7`t}l5{a6u&37$r;V3GR}-C%8-UR&amGTfzM$FQ~e{<UPS%lJ^96
zN!}A&6tn2v3hpm?E4aVp#l-ablJ^96N!}COC3#P9m*lPB{*t$X`%7L7+}D@9C%8-U
zp5QLYdxE<pZw2?4ycOJE@?xoReaU-*yCm-k?vlJGxJ&X@aDT~L!TlxA^Uxm40WpKJ
zm`YD@m*hRcU6S_%cS+s~?k{;OxWDAZ8SRSbQT>9otT;ca7PH)j6!B=n&a6K`gc)<o
zH4-&sTdw!vJ-e{B6(B{ghYUyd1_6ymy#z0}vOr|DJP4VHL1vu314<R*D;&zuV`8&D
z%ac*ZL|!4_7*Ej@BP_kqSakp~jTnUFp~K*q9rvwfqt)Ev0^5ui<_;HzEo;F7xl@~`
zf^fwr(sXqQ#7IeA^*ta@<=YmVk{a@q)Eil6JWjei69a|6L?QY5kN6#2iGWdaSgJ5X
zo-5&@YMHOZ+KGq^b)rK<oru^_Czaq(Cs}l;6B!=rM8=2GB~KV3>Ld>lg;Jx7t@e6y
zB7;P&^+)jAAByvj!J_y{fMJWgND$PCVUBPi!9%>&2<p6<O@S;}N2P492(p*1!?R{Q
zoUIpMCxLnV3mXcd8N+F9ZE>)w6xxB;{Q>+#4n)O;=v8o)c@`VNucq-3?-%+h=~~1g
zn6L!DI$`uHsN%eL-Ww&IuV&-sNf!U=WCH=PPRbFmpqDWYk&csqZ@!9wm(a#kD=o%a
z(ubnWAWOAUcqP$<3z62ClSNv}QBzZF_z;wA)~ZS~(2VjZo<%6qaf$GHDi2FkIiVww
z>X%)p24`g_G60cqoyuN=s6*U0QFWGrY3OQIq#&bK86N1`HNM1IT@}orz_W;4V2j2h
z-XvuXL@6r}Uyu;r*a?O>5bLNr;`JTw3voRJ(<-k3rCt)rm^y}3`XgFwB|kGpP)-#$
z#I8gYV%yx1D)iwnT|}1{YOWwHhLE`-T|%UGanZJBUt=qAm783q5l%J?Ju*^@2W2d!
z^jHYQ@)k`__s~L<({B)8V)#HA72B;Bc5v=A>lboVnZc72(5o>Ojjg%fXsB2E=h1|t
zP>eI%+pT(MrxP_hjowbLwbg5PI^CU}__V#*?!@)xR=e5e0;w=!ejfMU4;nkcVRRXA
zzzOP`yRGfrRz3K!e5H)ny)aMC7xUEf6Zv93n~NaJc=6k4o>I5ZbvJ1@TD`gN`kMw$
zEtTh&h>$kiFJj(ow+fgWn{jtOX0B2cwMA50k!XM!k(=Apr|}-(EKO$O#j)dUKtp`W
zzj_@Y{1a5?z*MPti(lGd^W4vOEgw9=smmMG?pi)rYxiG1Si!5J6U8hmFCo(FFPS~T
zU6S_%cS+t8+$DJ{xWDAB;Qo>qGP%CwJ;7a)_XKxI-V@v<c`LZT<gMWTk{3)}U-F*d
zf=Ek`^8|NE-V@v<c`LZT<gMWTk{486U-F*dF3Ed>yCm-kE{a+7ZUy(3ycOJE@}7Iz
z!lqe>_XKxI-V@v<c~5Yc<gMWTlDC5UOWt!&+a-BVaF^sg!CjK~1b0c^3hpm?E4aVp
zIjyhkj8?&2lJ^96N!}COC3#P9m*lPB{*t$XS4HQVReJ7eyCm-k?vlJGxJ&Y$;4aBq
z!Tlv~1^1V{^L0S!bkXWmS@ocX=Tgem>^#9;&CV0t)$FX`{>{z`?%(W0Prm-d&J)}v
zc~5Yc<UPS%lDC5UOWq3ZFL@!8>r37f+$DKWaF^sg!CjKKg8NI}3hpm?!PNC7?+Gr5
zw9Lpo!CjK~1b0c^3hpm?E4aVp1y$FVyeGIz@}A%>$$Nr}Vivty!Tlv~1^1V{=UPbE
zGz;;b;4aB~g1aQ|3GR}-72IF)R&amGd#;6ClJ^96N!}COC3#P9m*lPB{*t$X`%B()
zE##8CC%8-Up5QLYdxE<pZw2?4ycOJE@}6rUm*hRcU6S_%cS+t8+$DJ{xWDAB;Qo?#
zE+ZG5Dr?fp+24(xMqG2|??%}+Dz8Sfucz_y`q%v<Xc695$x^t6liwTjTQP{vcsIUv
zT6{O&s(gP&@Ks!55q?-YDHF~MZZR$~&+b9UzjCpNairuHV-EvcCH$|9gTON_;&dl%
zg#P26g!xB1Z~oD6=YKZjrbjwrA~Ydbga5q^vpP;CojGy{xc(o|e;Z>5gDgi(2`Gz)
z0avktm1ET<#3^P&wa0g4)0MG#Smvtn9kC0%Fh(*v$fCMg=635|#R|5|oMviS`d-Bf
zw#%G7OUj(mzuk7eiWO{^IW3BmIW64n%6t_oc#+JnW6v=`Sj7hxQ%N(Zsu}jJE<@PC
zgF2S8%BMc&PBbbDK;Zkl078qj00dsk3m{bK0+73jM<*P}7s8$8{oE3}b`>8u3xG6^
z!}bLi7OMN=FiVnN;=U43TlmV)WSq|#XiF_JUd0EVetDJtq5>D+O@9&L2mE!piVu8^
z{sK*MXJarM(rIKq-*fbrRea#9^%oVm_-^{kDn76|x#0VR>adTmubeA?04y9|R`G$0
zldF02F%^aQD&va+xtsp7iVy75U$~>ciU~}w$(&7&N})Lm@zm;;uyHcT1BX-fIjdOJ
zpZEDvPmQ|FsRSAm3sEC#CEwjIFI=vO#~?oGWrr>^S%{`)V^T(!FqjnEMT?mfvM`ur
z<;!ETMbnjiw_Ikj5KGO?q*#Sqjzzd>P+N?HW!0pRg~22>yF4b_8Iz>6%w!?Tp)H_U
z9(2apT**(Rxl(gxV728knTN&WC|_o>&?jw7%05XxuEylu_Q_RD;PpbEv}sZr9yS7{
z$*SS8Hdj&X&idpkCh&TpPuc<+%05Xm;_Q=rSTQ_z+b3}xNRJ`nWKh}k6h=xLo3g<g
zY*rhrmCd^yDOWLp>0@NQFj|V;s8W!tJnomX(XxG38ZA|PccbMhCa}<Ep>Nu>Df=cB
z<{V6{+PvewnNNOJF@dM+tC+w89lt~c&?_!UgsE8^nwO`T`pNti(kdqKGGYRAf90Ex
z1B`drw|l<vS1Qggr1Ps2*B2+(S8;r6`Ic2Be5i~VENo$Ez9)gWvEl#{yDuf6d`rav
zWQ*6DZ&5gaR&jjkn@)swUd8bhk)$DxU%@!O;L7POQ3P*%B6sBR(E+-p@i7P-H}cDx
U>^qwlA`BrcuqAZqhfntZKLmTtX8-^I

literal 0
HcmV?d00001


From c4a5a6d3a284bf708253c938bbe9bd390848c62c Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Wed, 18 Sep 2024 14:13:20 -0700
Subject: [PATCH 43/44] document fix (#260)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 README.md                                  | 2 +-
 src/include/uapi/drm_local/amdxdna_accel.h | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 231ecb7e..840d98cd 100644
--- a/README.md
+++ b/README.md
@@ -88,7 +88,7 @@ cd <root-of-source-tree>/build
 cd xrt/build
 ./build.sh -noert -noalveo
 # To adapt according to your OS & version
-sudo apt reinstall ./Release/xrt_202410.2.17.0_23.10-amd64-xrt.deb ./Release/xrt_202410.2.17.0_23.10-amd64-xbflash2.deb
+sudo apt reinstall ./Release/xrt_202410.2.17.0_23.10-amd64-xrt.deb
 cd ../../build
 
 # Start XDNA driver release build
diff --git a/src/include/uapi/drm_local/amdxdna_accel.h b/src/include/uapi/drm_local/amdxdna_accel.h
index a3af52ba..fe41f6ee 100644
--- a/src/include/uapi/drm_local/amdxdna_accel.h
+++ b/src/include/uapi/drm_local/amdxdna_accel.h
@@ -270,8 +270,6 @@ struct amdxdna_drm_exec_cmd {
  * @seq: sequence number of the command returned by execute command.
  *
  * Wait a command specified by seq to be completed.
- * Using AMDXDNA_INVALID_CMD_HANDLE as seq means wait till there is a free slot
- * to submit a new command.
  */
 struct amdxdna_drm_wait_cmd {
 	__u32 hwctx;

From 867e2f9f5f21596ec642e9ca5321de7a0863c14d Mon Sep 17 00:00:00 2001
From: Min Ma <min.ma@amd.com>
Date: Thu, 19 Sep 2024 11:31:08 -0700
Subject: [PATCH 44/44] ignore child process flush (#261)

Signed-off-by: Min Ma <min.ma@amd.com>
---
 src/driver/amdxdna/amdxdna_drm.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/driver/amdxdna/amdxdna_drm.c b/src/driver/amdxdna/amdxdna_drm.c
index 179fb608..b080d1ea 100644
--- a/src/driver/amdxdna/amdxdna_drm.c
+++ b/src/driver/amdxdna/amdxdna_drm.c
@@ -117,8 +117,17 @@ static int amdxdna_flush(struct file *f, fl_owner_t id)
 	struct drm_file *filp = f->private_data;
 	struct amdxdna_client *client = filp->driver_priv;
 	struct amdxdna_dev *xdna = client->xdna;
+	pid_t pid = task_tgid_nr(current);
 	int idx;
 
+	/* When the current PID is not equal to the client PID, this is a
+	 * flush() triggered by a child process closing. In that case, flush()
+	 * is just a no-op. The process which open()ed the device should
+	 * finally flush() and close() it.
+	 */
+	if (pid != client->pid)
+		return 0;
+
 	XDNA_DBG(xdna, "PID %d flushing...", client->pid);
 	if (!drm_dev_enter(&xdna->ddev, &idx))
 		return 0;