
hw/block/nvme: add broadcast nsid support for flush command

Message ID: 20210125204231.254925-1-its@irrelevant.dk
State: New
Series: hw/block/nvme: add broadcast nsid support for flush command

Commit Message

Klaus Jensen Jan. 25, 2021, 8:42 p.m. UTC
From: Gollu Appalanaidu <anaidu.gollu@samsung.com>

Add support for using the broadcast nsid to issue a flush on all
namespaces through a single command.

Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
---
 include/block/nvme.h  |   8 +++
 hw/block/nvme.c       | 123 +++++++++++++++++++++++++++++++++++++++---
 hw/block/trace-events |   2 +
 3 files changed, 126 insertions(+), 7 deletions(-)
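
In outline, the broadcast path added by this patch fans one asynchronous
flush out to every attached namespace and uses a reference count stored
in req->opaque to defer completion until the last callback has run. A
simplified sketch (tracing and block accounting omitted; see the full
diff below for the complete version):

    /*
     * Simplified sketch of the broadcast flush path added by this
     * patch (error handling, tracing and accounting omitted).
     */
    static uint16_t nvme_flush_broadcast(NvmeCtrl *n, NvmeRequest *req)
    {
        uintptr_t *num_flushes = (uintptr_t *)&req->opaque;

        /* 1-initialize so no callback can complete the request early */
        *num_flushes = 1;

        for (int i = 1; i <= n->num_namespaces; i++) {
            NvmeNamespace *ns = nvme_ns(n, i);
            if (!ns) {
                continue;
            }

            struct nvme_aio_flush_ctx *ctx =
                g_new(struct nvme_aio_flush_ctx, 1);
            ctx->req = req;
            ctx->ns = ns;

            (*num_flushes)++;
            blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);
        }

        /* drop the extra reference; complete now if nothing was issued */
        (*num_flushes)--;

        return *num_flushes ? NVME_NO_COMPLETE : req->status;
    }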

Comments

Stefan Hajnoczi Jan. 26, 2021, 4:20 p.m. UTC | #1
On Mon, Jan 25, 2021 at 09:42:31PM +0100, Klaus Jensen wrote:
> From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> 
> Add support for using the broadcast nsid to issue a flush on all
> namespaces through a single command.
> 
> Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
> ---
>  include/block/nvme.h  |   8 +++
>  hw/block/nvme.c       | 123 +++++++++++++++++++++++++++++++++++++++---
>  hw/block/trace-events |   2 +
>  3 files changed, 126 insertions(+), 7 deletions(-)

Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
Klaus Jensen Feb. 8, 2021, 5:48 p.m. UTC | #2
On Jan 25 21:42, Klaus Jensen wrote:
> From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> 
> Add support for using the broadcast nsid to issue a flush on all
> namespaces through a single command.
> 

Gentle ping on this.
Keith Busch Feb. 8, 2021, 6:59 p.m. UTC | #3
On Mon, Jan 25, 2021 at 09:42:31PM +0100, Klaus Jensen wrote:
> @@ -1644,10 +1679,56 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
>  
>  static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
>  {
> -    block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
> -                     BLOCK_ACCT_FLUSH);
> -    req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req);
> -    return NVME_NO_COMPLETE;
> +    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
> +    uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
> +    uint16_t status;
> +    struct nvme_aio_flush_ctx *ctx;
> +    NvmeNamespace *ns;
> +
> +    trace_pci_nvme_flush(nvme_cid(req), nsid);
> +
> +    if (nsid != NVME_NSID_BROADCAST) {
> +        req->ns = nvme_ns(n, nsid);
> +        if (unlikely(!req->ns)) {
> +            return NVME_INVALID_FIELD | NVME_DNR;
> +        }
> +
> +        block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
> +                         BLOCK_ACCT_FLUSH);
> +        req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req);
> +        return NVME_NO_COMPLETE;
> +    }
> +
> +    /* 1-initialize; see comment in nvme_dsm */
> +    *num_flushes = 1;
> +
> +    for (int i = 1; i <= n->num_namespaces; i++) {
> +        ns = nvme_ns(n, i);
> +        if (!ns) {
> +            continue;
> +        }
> +
> +        ctx = g_new(struct nvme_aio_flush_ctx, 1);
> +        ctx->req = req;
> +        ctx->ns = ns;
> +
> +        (*num_flushes)++;
> +
> +        block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0,
> +                         BLOCK_ACCT_FLUSH);
> +        req->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);
> +    }

Overwriting req->aiocb with the most recent flush request doesn't seem
right.

This whole implementation would be much simpler with the synchronous
blk_flush() routine instead of the AIO equivalent. This is not really a
performant feature, so I don't think it's critical to get these
operations happening in parallel. What do you think?
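
For illustration, a minimal sketch of what that synchronous variant
might look like (hypothetical, not part of the patch; it assumes the
existing QEMU helpers blk_flush() and nvme_ns() and the NVME_SUCCESS /
NVME_INTERNAL_DEV_ERROR status codes):

    /*
     * Hypothetical synchronous broadcast flush: flush every attached
     * namespace with blk_flush() and complete the command inline.
     */
    static uint16_t nvme_flush_broadcast_sync(NvmeCtrl *n, NvmeRequest *req)
    {
        uint16_t status = NVME_SUCCESS;

        for (int i = 1; i <= n->num_namespaces; i++) {
            NvmeNamespace *ns = nvme_ns(n, i);
            if (!ns) {
                continue;
            }

            /* blk_flush() blocks until the flush has completed */
            if (blk_flush(ns->blkconf.blk) < 0) {
                status = NVME_INTERNAL_DEV_ERROR;
            }
        }

        return status;
    }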
Klaus Jensen Feb. 8, 2021, 7:08 p.m. UTC | #4
On Feb  9 03:59, Keith Busch wrote:
> On Mon, Jan 25, 2021 at 09:42:31PM +0100, Klaus Jensen wrote:
> > @@ -1644,10 +1679,56 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
> >  
> >  static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
> >  {
> > -    block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
> > -                     BLOCK_ACCT_FLUSH);
> > -    req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req);
> > -    return NVME_NO_COMPLETE;
> > +    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
> > +    uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
> > +    uint16_t status;
> > +    struct nvme_aio_flush_ctx *ctx;
> > +    NvmeNamespace *ns;
> > +
> > +    trace_pci_nvme_flush(nvme_cid(req), nsid);
> > +
> > +    if (nsid != NVME_NSID_BROADCAST) {
> > +        req->ns = nvme_ns(n, nsid);
> > +        if (unlikely(!req->ns)) {
> > +            return NVME_INVALID_FIELD | NVME_DNR;
> > +        }
> > +
> > +        block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
> > +                         BLOCK_ACCT_FLUSH);
> > +        req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req);
> > +        return NVME_NO_COMPLETE;
> > +    }
> > +
> > +    /* 1-initialize; see comment in nvme_dsm */
> > +    *num_flushes = 1;
> > +
> > +    for (int i = 1; i <= n->num_namespaces; i++) {
> > +        ns = nvme_ns(n, i);
> > +        if (!ns) {
> > +            continue;
> > +        }
> > +
> > +        ctx = g_new(struct nvme_aio_flush_ctx, 1);
> > +        ctx->req = req;
> > +        ctx->ns = ns;
> > +
> > +        (*num_flushes)++;
> > +
> > +        block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0,
> > +                         BLOCK_ACCT_FLUSH);
> > +        req->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);
> > +    }
> 
> Overwriting req->aiocb with the most recent flush request doesn't seem
> right.
> 

It doesn't really matter if this[1] patch is merged, but it is wrong;
to align with the other multi-aio commands, this should simply ignore
the returned aiocb.
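
Concretely, the loop body would then just not store the return value
(a sketch only, same context as the hunk quoted above):

    block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0,
                     BLOCK_ACCT_FLUSH);
    /* don't keep the aiocb; completion is tracked via num_flushes */
    blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);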

> This whole implementation would be much simpler with the synchronous
> blk_flush() routine instead of the AIO equivalent. This is not really a
> performant feature, so I don't think it's critical to get these
> operations happening in parallel. What do you think?

It would definitely be simpler, but I believe that if there is a lot to
flush, then we won't just block the nvme device. We are holding the Big
QEMU Lock and will block most other devices as well.


  [1]: hw/block/nvme: drain namespaces on sq deletion
Keith Busch Feb. 10, 2021, 3:32 a.m. UTC | #5
On Mon, Feb 08, 2021 at 08:08:17PM +0100, Klaus Jensen wrote:
> On Feb  9 03:59, Keith Busch wrote:
> > This whole implementation would be much simpler with the synchronous
> > blk_flush() routine instead of the AIO equivalent. This is not really a
> > performant feature, so I don't think it's critical to get these
> > operations happening in parallel. What do you think?
> 
> It would definitely be simpler, but I believe that if there is a lot to
> flush, then we won't just block the nvme device. We are holding the Big
> QEMU Lock and will block most other devices as well.

Hm, I feel like you may have told me this same explanation for a
different patch. :) Okay, I'm convinced: this is the way.
Klaus Jensen Feb. 10, 2021, 6:42 a.m. UTC | #6
On Feb 10 12:32, Keith Busch wrote:
> On Mon, Feb 08, 2021 at 08:08:17PM +0100, Klaus Jensen wrote:
> > On Feb  9 03:59, Keith Busch wrote:
> > > This whole implementation would be much simpler with the synchronous
> > > blk_flush() routine instead of the AIO equivalent. This is not really a
> > > performant feature, so I don't think it's critical to get these
> > > operations happening in parallel. What do you think?
> > 
> > It would definitely be simpler, but I believe that if there is a lot to
> > flush, then we won't just block the nvme device. We are holding the Big
> > QEMU Lock and will block most other devices as well.
> 
> Hm, I feel like you may have told me this same explanation for a
> different patch. :) Okay, I'm convinced: this is the way.
> 

Is that an Acked-by? ;)

And yes, I might have used that argument for Copy, can't remember ;)
Keith Busch Feb. 11, 2021, 4:43 p.m. UTC | #7
On Wed, Feb 10, 2021 at 07:42:17AM +0100, Klaus Jensen wrote:
> 
> Is that an Acked-by? ;)

Yep,

Acked-by: Keith Busch <kbusch@kernel.org>
Klaus Jensen Feb. 11, 2021, 6:33 p.m. UTC | #8
On Jan 25 21:42, Klaus Jensen wrote:
> From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
> 
> Add support for using the broadcast nsid to issue a flush on all
> namespaces through a single command.
> 

Applied to nvme-next.

Patch

diff --git a/include/block/nvme.h b/include/block/nvme.h
index e4b918064df9..0e2e94c7ebbb 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1024,6 +1024,14 @@  enum NvmeIdCtrlOncs {
     NVME_ONCS_TIMESTAMP     = 1 << 6,
 };
 
+enum NvmeIdctrlVwc {
+    NVME_VWC_PRESENT                    = 1 << 0,
+    NVME_VWC_NSID_BROADCAST_NO_SUPPORT  = 0 << 1,
+    NVME_VWC_NSID_BROADCAST_RESERVED    = 1 << 1,
+    NVME_VWC_NSID_BROADCAST_CTRL_SPEC   = 2 << 1,
+    NVME_VWC_NSID_BROADCAST_SUPPORT     = 3 << 1,
+};
+
 enum NvmeIdCtrlFrmw {
     NVME_FRMW_SLOT1_RO = 1 << 0,
 };
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 98d84fe26644..b5ce8ed7c785 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1406,6 +1406,41 @@  static void nvme_rw_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+struct nvme_aio_flush_ctx {
+    NvmeRequest     *req;
+    NvmeNamespace   *ns;
+    BlockAcctCookie acct;
+};
+
+static void nvme_aio_flush_cb(void *opaque, int ret)
+{
+    struct nvme_aio_flush_ctx *ctx = opaque;
+    NvmeRequest *req = ctx->req;
+    uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
+
+    BlockBackend *blk = ctx->ns->blkconf.blk;
+    BlockAcctCookie *acct = &ctx->acct;
+    BlockAcctStats *stats = blk_get_stats(blk);
+
+    trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk));
+
+    if (!ret) {
+        block_acct_done(stats, acct);
+    } else {
+        block_acct_failed(stats, acct);
+        nvme_aio_err(req, ret);
+    }
+
+    (*num_flushes)--;
+    g_free(ctx);
+
+    if (*num_flushes) {
+        return;
+    }
+
+    nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
 static void nvme_aio_discard_cb(void *opaque, int ret)
 {
     NvmeRequest *req = opaque;
@@ -1644,10 +1679,56 @@  static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
 
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
-    block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
-                     BLOCK_ACCT_FLUSH);
-    req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req);
-    return NVME_NO_COMPLETE;
+    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
+    uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
+    uint16_t status;
+    struct nvme_aio_flush_ctx *ctx;
+    NvmeNamespace *ns;
+
+    trace_pci_nvme_flush(nvme_cid(req), nsid);
+
+    if (nsid != NVME_NSID_BROADCAST) {
+        req->ns = nvme_ns(n, nsid);
+        if (unlikely(!req->ns)) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+        block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
+                         BLOCK_ACCT_FLUSH);
+        req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req);
+        return NVME_NO_COMPLETE;
+    }
+
+    /* 1-initialize; see comment in nvme_dsm */
+    *num_flushes = 1;
+
+    for (int i = 1; i <= n->num_namespaces; i++) {
+        ns = nvme_ns(n, i);
+        if (!ns) {
+            continue;
+        }
+
+        ctx = g_new(struct nvme_aio_flush_ctx, 1);
+        ctx->req = req;
+        ctx->ns = ns;
+
+        (*num_flushes)++;
+
+        block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0,
+                         BLOCK_ACCT_FLUSH);
+        req->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);
+    }
+
+    /* account for the 1-initialization */
+    (*num_flushes)--;
+
+    if (*num_flushes) {
+        status = NVME_NO_COMPLETE;
+    } else {
+        status = req->status;
+    }
+
+    return status;
 }
 
 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
@@ -2344,6 +2425,29 @@  static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
         return NVME_INVALID_NSID | NVME_DNR;
     }
 
+    /*
+     * In the base NVM command set, Flush may apply to all namespaces
+     * (indicated by NSID being set to 0xFFFFFFFF). But if that feature is used
+     * along with TP 4056 (Namespace Types), it may be pretty screwed up.
+     *
+     * If NSID is indeed set to 0xFFFFFFFF, we simply cannot associate the
+     * opcode with a specific command since we cannot determine a unique I/O
+     * command set. Opcode 0x0 could have any other meaning than something
+     * equivalent to flushing and say it DOES have completely different
+     * semantics in some other command set - does an NSID of 0xFFFFFFFF then
+     * mean "for all namespaces, apply whatever command set specific command
+     * that uses the 0x0 opcode?" Or does it mean "for all namespaces, apply
+     * whatever command that uses the 0x0 opcode if, and only if, it allows
+     * NSID to be 0xFFFFFFFF"?
+     *
+     * Anyway (and luckily), for now, we do not care about this since the
+     * device only supports namespace types that includes the NVM Flush command
+     * (NVM and Zoned), so always do an NVM Flush.
+     */
+    if (req->cmd.opcode == NVME_CMD_FLUSH) {
+        return nvme_flush(n, req);
+    }
+
     req->ns = nvme_ns(n, nsid);
     if (unlikely(!req->ns)) {
         return NVME_INVALID_FIELD | NVME_DNR;
@@ -2355,8 +2459,6 @@  static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
     }
 
     switch (req->cmd.opcode) {
-    case NVME_CMD_FLUSH:
-        return nvme_flush(n, req);
     case NVME_CMD_WRITE_ZEROES:
         return nvme_write_zeroes(n, req);
     case NVME_CMD_ZONE_APPEND:
@@ -4450,7 +4552,14 @@  static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
                            NVME_ONCS_FEATURES | NVME_ONCS_DSM |
                            NVME_ONCS_COMPARE);
 
-    id->vwc = (0x2 << 1) | 0x1;
+    /*
+     * NOTE: If this device ever supports a command set that does NOT use 0x0
+     * as a Flush-equivalent operation, support for the broadcast NSID in Flush
+     * should probably be removed.
+     *
+     * See comment in nvme_io_cmd.
+     */
+    id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
                            NVME_CTRL_SGLS_BITBUCKET);
 
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 8e249ea910aa..ce2a35c34ea6 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -43,6 +43,8 @@  pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opna
 pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
 pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
 pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
+pci_nvme_flush(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32""
+pci_nvme_aio_flush_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
 pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
 pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32""
 pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""