Skip to content

Commit

Permalink
hw/nvme: reimplement flush to allow cancellation
Browse files Browse the repository at this point in the history
Prior to this patch, a broadcast flush would result in submitting
multiple "fire and forget" aios (no reference saved to the aiocbs
returned from the blk_aio_flush calls).

Fix this by issuing the flushes one after another, keeping a reference to
the aiocb that is currently in flight so that the flush can be cancelled.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
  • Loading branch information
birkelund committed Jun 29, 2021
1 parent 3276dde commit 38f4ac6
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 83 deletions.
204 changes: 124 additions & 80 deletions hw/nvme/ctrl.c
Original file line number Diff line number Diff line change
Expand Up @@ -1788,22 +1788,19 @@ static inline bool nvme_is_write(NvmeRequest *req)
rw->opcode == NVME_CMD_WRITE_ZEROES;
}

/*
 * get_aio_context hook shared by the NVMe AIOCB info tables.
 *
 * The acb argument is ignored; the result is whatever
 * qemu_get_aio_context() yields.
 */
static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
{
    AioContext *ctx = qemu_get_aio_context();

    return ctx;
}

static void nvme_misc_cb(void *opaque, int ret)
{
NvmeRequest *req = opaque;
NvmeNamespace *ns = req->ns;

BlockBackend *blk = ns->blkconf.blk;
BlockAcctCookie *acct = &req->acct;
BlockAcctStats *stats = blk_get_stats(blk);

trace_pci_nvme_misc_cb(nvme_cid(req), blk_name(blk));
trace_pci_nvme_misc_cb(nvme_cid(req));

if (ret) {
block_acct_failed(stats, acct);
nvme_aio_err(req, ret);
} else {
block_acct_done(stats, acct);
}

nvme_enqueue_req_completion(nvme_cq(req), req);
Expand Down Expand Up @@ -1919,41 +1916,6 @@ static void nvme_aio_format_cb(void *opaque, int ret)
nvme_enqueue_req_completion(nvme_cq(req), req);
}

/*
 * Context passed as the opaque argument to nvme_aio_flush_cb(), which
 * releases it with g_free() once the flush has completed.
 */
struct nvme_aio_flush_ctx {
/* request this flush belongs to; its opaque field holds the flush counter */
NvmeRequest *req;
/* namespace whose block backend (blkconf.blk) is being flushed */
NvmeNamespace *ns;
/* accounting cookie handed to block_acct_done()/block_acct_failed() */
BlockAcctCookie acct;
};

/*
 * Completion callback for a single flush.
 *
 * opaque is a struct nvme_aio_flush_ctx; it is freed here. The owning
 * request keeps a count of outstanding flushes in req->opaque, and the
 * request is queued for completion once that count drops to zero.
 */
static void nvme_aio_flush_cb(void *opaque, int ret)
{
    struct nvme_aio_flush_ctx *flush = opaque;
    NvmeRequest *req = flush->req;
    uintptr_t *pending = (uintptr_t *)&req->opaque;
    BlockBackend *blk = flush->ns->blkconf.blk;
    BlockAcctStats *stats = blk_get_stats(blk);

    trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk));

    if (ret) {
        block_acct_failed(stats, &flush->acct);
        nvme_aio_err(req, ret);
    } else {
        block_acct_done(stats, &flush->acct);
    }

    /* the counter lives in the request, so it stays valid after the free */
    g_free(flush);
    *pending -= 1;

    if (*pending == 0) {
        nvme_enqueue_req_completion(nvme_cq(req), req);
    }
}

static void nvme_verify_cb(void *opaque, int ret)
{
NvmeBounceContext *ctx = opaque;
Expand Down Expand Up @@ -2868,57 +2830,139 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
return NVME_NO_COMPLETE;
}

static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
{
uint32_t nsid = le32_to_cpu(req->cmd.nsid);
uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
uint16_t status;
struct nvme_aio_flush_ctx *ctx;
/*
 * State of one Flush command; the namespaces are flushed one at a time so
 * that the flush currently in flight can be cancelled.
 */
typedef struct NvmeFlushAIOCB {
/*
 * Embedded generic AIOCB; nvme_flush_cancel() recovers the container from
 * it and nvme_flush_bh() invokes common.cb on completion.
 * NOTE(review): presumably has to stay the first member, since the
 * qemu_aio_get() result is used directly as an NvmeFlushAIOCB - confirm.
 */
BlockAIOCB common;
/* handle of the blk_aio_flush() currently in flight, NULL when none */
BlockAIOCB *aiocb;
/* request that issued the flush */
NvmeRequest *req;
/* bottom half running nvme_flush_bh() */
QEMUBH *bh;
/* 0, or the negative error (including -ECANCELED) that stops the flush */
int ret;

/* namespace to flush next; reset to NULL once its flush has been issued */
NvmeNamespace *ns;
/* id of the namespace most recently selected (0 before the first one) */
uint32_t nsid;
/* true when the command addressed NVME_NSID_BROADCAST */
bool broadcast;
} NvmeFlushAIOCB;

trace_pci_nvme_flush(nvme_cid(req), nsid);
static void nvme_flush_cancel(BlockAIOCB *acb)
{
NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);

if (nsid != NVME_NSID_BROADCAST) {
req->ns = nvme_ns(n, nsid);
if (unlikely(!req->ns)) {
return NVME_INVALID_FIELD | NVME_DNR;
}
iocb->ret = -ECANCELED;

block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
BLOCK_ACCT_FLUSH);
req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_misc_cb, req);
return NVME_NO_COMPLETE;
if (iocb->aiocb) {
blk_aio_cancel_async(iocb->aiocb);
}
}

/* 1-initialize; see comment in nvme_dsm */
*num_flushes = 1;
/* AIOCB description handed to qemu_aio_get() by nvme_flush(). */
static const AIOCBInfo nvme_flush_aiocb_info = {
/* allocation size: the full NvmeFlushAIOCB, not just the embedded BlockAIOCB */
.aiocb_size = sizeof(NvmeFlushAIOCB),
/* records -ECANCELED and cancels the in-flight per-namespace flush, if any */
.cancel_async = nvme_flush_cancel,
.get_aio_context = nvme_get_aio_context,
};

for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
ns = nvme_ns(n, i);
if (!ns) {
continue;
}
static void nvme_flush_ns_cb(void *opaque, int ret)
{
NvmeFlushAIOCB *iocb = opaque;
NvmeNamespace *ns = iocb->ns;

ctx = g_new(struct nvme_aio_flush_ctx, 1);
ctx->req = req;
ctx->ns = ns;
if (ret < 0) {
iocb->ret = ret;
goto out;
} else if (iocb->ret < 0) {
goto out;
}

(*num_flushes)++;
if (ns) {
trace_pci_nvme_flush_ns(iocb->nsid);

block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0,
BLOCK_ACCT_FLUSH);
blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);
iocb->ns = NULL;
iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
return;
}

/* account for the 1-initialization */
(*num_flushes)--;
out:
iocb->aiocb = NULL;
qemu_bh_schedule(iocb->bh);
}

if (*num_flushes) {
status = NVME_NO_COMPLETE;
} else {
status = req->status;
/*
 * Bottom half driving a Flush command.
 *
 * Each run either starts the flush of one namespace, via
 * nvme_flush_ns_cb(), or finishes the command. For a broadcast flush the
 * next namespace is the first existing one with an id above iocb->nsid.
 * Otherwise iocb->ns is whatever was left there: the namespace picked by
 * nvme_flush() on the first run, NULL afterwards.
 *
 * The command finishes when an error or cancellation has been recorded in
 * iocb->ret, or when no namespace is left. The bottom half is then deleted,
 * the completion callback is invoked with iocb->ret and the AIOCB reference
 * is dropped.
 */
static void nvme_flush_bh(void *opaque)
{
    NvmeFlushAIOCB *iocb = opaque;
    NvmeCtrl *n = nvme_ctrl(iocb->req);

    if (iocb->ret >= 0) {
        if (iocb->broadcast) {
            int next = iocb->nsid + 1;

            while (next <= NVME_MAX_NAMESPACES) {
                iocb->ns = nvme_ns(n, next);
                if (iocb->ns) {
                    iocb->nsid = next;
                    break;
                }
                next++;
            }
        }

        if (iocb->ns) {
            /* kick off the flush of the selected namespace */
            nvme_flush_ns_cb(iocb, 0);
            return;
        }
    }

    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;

    iocb->common.cb(iocb->common.opaque, iocb->ret);

    qemu_aio_unref(iocb);
}

/*
 * Handle the Flush command.
 *
 * Sets up an NvmeFlushAIOCB (completion callback: nvme_misc_cb with req as
 * opaque) and schedules nvme_flush_bh() to do the work. A broadcast NSID
 * starts with no namespace selected and lets the bottom half pick them;
 * any other NSID must be valid and refer to an existing namespace.
 *
 * Returns NVME_NO_COMPLETE once the bottom half has been scheduled,
 * NVME_INVALID_NSID | NVME_DNR when nvme_nsid_valid() rejects the NSID, or
 * NVME_INVALID_FIELD | NVME_DNR when nvme_ns() finds no namespace for it.
 * On the error paths the bottom half and the AIOCB are released again.
 */
static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeFlushAIOCB *iocb;
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    uint16_t status;

    iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);

    iocb->req = req;
    iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
    iocb->ret = 0;
    /*
     * No flush is in flight yet. nvme_flush_cancel() tests this pointer, so
     * it must not be left indeterminate in case the request is cancelled
     * before the bottom half has issued the first blk_aio_flush().
     */
    iocb->aiocb = NULL;
    iocb->ns = NULL;
    iocb->nsid = 0;
    iocb->broadcast = (nsid == NVME_NSID_BROADCAST);

    if (!iocb->broadcast) {
        if (!nvme_nsid_valid(n, nsid)) {
            status = NVME_INVALID_NSID | NVME_DNR;
            goto out;
        }

        iocb->ns = nvme_ns(n, nsid);
        if (!iocb->ns) {
            status = NVME_INVALID_FIELD | NVME_DNR;
            goto out;
        }

        iocb->nsid = nsid;
    }

    /* publish the AIOCB on the request so that it can be cancelled */
    req->aiocb = &iocb->common;
    qemu_bh_schedule(iocb->bh);

    return NVME_NO_COMPLETE;

out:
    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;
    qemu_aio_unref(iocb);

    return status;
}

Expand Down
2 changes: 2 additions & 0 deletions hw/nvme/nvme.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
#define NVME_MAX_NAMESPACES 256
#define NVME_EUI64_DEFAULT ((uint64_t)0x5254000000000000)

QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_NSID_BROADCAST - 1);

typedef struct NvmeCtrl NvmeCtrl;
typedef struct NvmeNamespace NvmeNamespace;

Expand Down
6 changes: 3 additions & 3 deletions hw/nvme/trace-events
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,16 @@ pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d"
pci_nvme_map_sgl(uint8_t typ, uint64_t len) "type 0x%"PRIx8" len %"PRIu64""
pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" nsid 0x%"PRIx32" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
pci_nvme_flush(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32""
pci_nvme_flush_ns(uint32_t nsid) "nsid 0x%"PRIx32""
pci_nvme_format(uint16_t cid, uint32_t nsid, uint8_t lbaf, uint8_t mset, uint8_t pi, uint8_t pil) "cid %"PRIu16" nsid %"PRIu32" lbaf %"PRIu8" mset %"PRIu8" pi %"PRIu8" pil %"PRIu8""
pci_nvme_format_ns(uint16_t cid, uint32_t nsid, uint8_t lbaf, uint8_t mset, uint8_t pi, uint8_t pil) "cid %"PRIu16" nsid %"PRIu32" lbaf %"PRIu8" mset %"PRIu8" pi %"PRIu8" pil %"PRIu8""
pci_nvme_format_cb(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32""
pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_misc_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_misc_cb(uint16_t cid) "cid %"PRIu16""
pci_nvme_dif_rw(uint8_t pract, uint8_t prinfo) "pract 0x%"PRIx8" prinfo 0x%"PRIx8""
pci_nvme_dif_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_dif_rw_mdata_in_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
Expand Down

0 comments on commit 38f4ac6

Please sign in to comment.