[v3,2/2] hw/block/nvme: add the dataset management command

Message ID 20201021221736.100779-3-its@irrelevant.dk
State New
Series hw/block/nvme: dulbe and dsm support

Commit Message

Klaus Jensen Oct. 21, 2020, 10:17 p.m. UTC
From: Klaus Jensen <k.jensen@samsung.com>

Add support for the Dataset Management command and the Deallocate
attribute. Deallocation results in discards being sent to the underlying
block device. Whether or not the blocks are actually deallocated is
affected by the same factors as Write Zeroes (see previous commit).

     format | discard | dsm (512b) | dsm (4kb) | dsm (64kb)
    --------------------------------------------------------
      qcow2 | ignore  |     n      |     n     |     n
      qcow2 | unmap   |     n      |     n     |     y
      raw   | ignore  |     n      |     n     |     n
      raw   | unmap   |     n      |     y     |     y

Again, a raw format and 4kb LBAs are preferable.

See NVM Express 1.3d, Section 6.7 ("Dataset Management command").

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.h |  2 +
 hw/block/nvme.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 100 insertions(+), 1 deletion(-)
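
For orientation, each entry in the buffer consumed by the command is the
spec's 16-byte range definition. A sketch of the layout (quoted from memory
of QEMU's existing include/block/nvme.h; not part of this diff):

    typedef struct NvmeDsmRange {
        uint32_t cattr; /* context attributes */
        uint32_t nlb;   /* number of logical blocks in the range */
        uint64_t slba;  /* starting LBA of the range */
    } NvmeDsmRange;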

Comments

Keith Busch Oct. 22, 2020, 12:59 a.m. UTC | #1
On Thu, Oct 22, 2020 at 12:17:36AM +0200, Klaus Jensen wrote:
> +static void nvme_aio_discard_cb(void *opaque, int ret)
> +{
> +    NvmeRequest *req = opaque;
> +    int *discards = req->opaque;
> +
> +    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
> +
> +    if (ret) {
> +        req->status = NVME_INTERNAL_DEV_ERROR;
> +        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
> +                               req->status);
> +    }
> +
> +    if (discards && --(*discards) > 0) {
> +        return;
> +    }
> +
> +    g_free(req->opaque);
> +    req->opaque = NULL;
> +
> +    nvme_enqueue_req_completion(nvme_cq(req), req);
> +}
> +
> +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
> +{
> +    NvmeNamespace *ns = req->ns;
> +    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> +    NvmeDsmRange *range = NULL;
> +    int *discards = NULL;
> +
> +    uint32_t attr = le32_to_cpu(dsm->attributes);
> +    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
> +
> +    uint16_t status = NVME_SUCCESS;
> +
> +    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
> +
> +    if (attr & NVME_DSMGMT_AD) {
> +        int64_t offset;
> +        size_t len;
> +
> +        range = g_new(NvmeDsmRange, nr);
> +
> +        status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
> +                          DMA_DIRECTION_TO_DEVICE, req);
> +        if (status) {
> +            goto out;
> +        }
> +
> +        discards = g_new0(int, 1);
> +        req->opaque = discards;

I think you need to initialize discards to 1 so that this function is
holding a reference on it while the asynchronous part is running.
Otherwise, the callback may see 'discards' at 0 and free it while we're
trying to discard the next range.
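
Something like this, I think (a sketch of the intent only):

    discards = g_new0(int, 1);
    req->opaque = discards;

    /* hold a reference for the submitting context; completing callbacks
     * then cannot see the count drop to zero (and free it) while we are
     * still issuing discards for the remaining ranges */
    *discards = 1;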

> +
> +        for (int i = 0; i < nr; i++) {
> +            uint64_t slba = le64_to_cpu(range[i].slba);
> +            uint32_t nlb = le32_to_cpu(range[i].nlb);
> +
> +            if (nvme_check_bounds(n, ns, slba, nlb)) {
> +                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
> +                                                     ns->id_ns.nsze);
> +                continue;
> +            }
> +
> +            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
> +                                          nlb);
> +
> +            offset = nvme_l2b(ns, slba);
> +            len = nvme_l2b(ns, nlb);
> +
> +            while (len) {
> +                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
> +
> +                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
> +                                 nvme_aio_discard_cb, req);
> +
> +                (*discards)++;

The increment ought to be before the aio call so that the _cb can't see
the value before it's accounted for. 
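
E.g. (sketch):

    /* account for the discard before it can possibly complete */
    (*discards)++;

    blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
                     nvme_aio_discard_cb, req);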

> +
> +                offset += bytes;
> +                len -= bytes;
> +            }
> +        }
> +
> +        if (*discards) {
> +            status = NVME_NO_COMPLETE;
> +        } else {
> +            g_free(discards);
> +            req->opaque = NULL;
> +        }
> +    }
> +
> +out:
> +    g_free(range);
> +
> +    return status;
> +}
> +
>  static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
>  {
>      block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
> @@ -1088,6 +1183,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
>      case NVME_CMD_WRITE:
>      case NVME_CMD_READ:
>          return nvme_rw(n, req);
> +    case NVME_CMD_DSM:
> +        return nvme_dsm(n, req);
>      default:
>          trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
>          return NVME_INVALID_OPCODE | NVME_DNR;
> @@ -2810,7 +2907,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
>      id->cqes = (0x4 << 4) | 0x4;
>      id->nn = cpu_to_le32(n->num_namespaces);
>      id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
> -                           NVME_ONCS_FEATURES);
> +                           NVME_ONCS_FEATURES | NVME_ONCS_DSM);

I think we should set ID_NS.NPDG and NPDA since there really is a
preferred granularity and alignment.
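
Something along these lines could work in the namespace init path (just a
sketch; the NPDG/NPDA identify fields are real, the plumbing shown here is
illustrative):

    BlockDriverInfo bdi;
    uint32_t npdg = ns->blkconf.discard_granularity /
                    ns->blkconf.logical_block_size;

    /* prefer the image format cluster size (e.g. qcow2) if it is larger
     * than the configured discard granularity */
    if (bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi) >= 0 &&
        bdi.cluster_size > ns->blkconf.discard_granularity) {
        npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
    }

    /* NPDG and NPDA are 0's based values */
    id_ns->npdg = id_ns->npda = cpu_to_le16(npdg - 1);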

>  
>      id->vwc = 0x1;
>      id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
> -- 
> 2.28.0
>
Klaus Jensen Oct. 22, 2020, 7:34 a.m. UTC | #2
On Oct 21 17:59, Keith Busch wrote:
> On Thu, Oct 22, 2020 at 12:17:36AM +0200, Klaus Jensen wrote:
> > +static void nvme_aio_discard_cb(void *opaque, int ret)
> > +{
> > +    NvmeRequest *req = opaque;
> > +    int *discards = req->opaque;
> > +
> > +    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
> > +
> > +    if (ret) {
> > +        req->status = NVME_INTERNAL_DEV_ERROR;
> > +        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
> > +                               req->status);
> > +    }
> > +
> > +    if (discards && --(*discards) > 0) {
> > +        return;
> > +    }
> > +
> > +    g_free(req->opaque);
> > +    req->opaque = NULL;
> > +
> > +    nvme_enqueue_req_completion(nvme_cq(req), req);
> > +}
> > +
> > +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
> > +{
> > +    NvmeNamespace *ns = req->ns;
> > +    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> > +    NvmeDsmRange *range = NULL;
> > +    int *discards = NULL;
> > +
> > +    uint32_t attr = le32_to_cpu(dsm->attributes);
> > +    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
> > +
> > +    uint16_t status = NVME_SUCCESS;
> > +
> > +    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
> > +
> > +    if (attr & NVME_DSMGMT_AD) {
> > +        int64_t offset;
> > +        size_t len;
> > +
> > +        range = g_new(NvmeDsmRange, nr);
> > +
> > +        status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
> > +                          DMA_DIRECTION_TO_DEVICE, req);
> > +        if (status) {
> > +            goto out;
> > +        }
> > +
> > +        discards = g_new0(int, 1);
> > +        req->opaque = discards;
> 
> I think you need to initialize discards to 1 so that this function is
> holding a reference on it while the asynchronous part is running.
> Otherwise, the callback may see 'discards' at 0 and free it while we're
> trying to discard the next range.
> 

Yes - you are right. The callback might actually get called immediately
in some cases.

> > +
> > +        for (int i = 0; i < nr; i++) {
> > +            uint64_t slba = le64_to_cpu(range[i].slba);
> > +            uint32_t nlb = le32_to_cpu(range[i].nlb);
> > +
> > +            if (nvme_check_bounds(n, ns, slba, nlb)) {
> > +                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
> > +                                                     ns->id_ns.nsze);
> > +                continue;
> > +            }
> > +
> > +            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
> > +                                          nlb);
> > +
> > +            offset = nvme_l2b(ns, slba);
> > +            len = nvme_l2b(ns, nlb);
> > +
> > +            while (len) {
> > +                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
> > +
> > +                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
> > +                                 nvme_aio_discard_cb, req);
> > +
> > +                (*discards)++;
> 
> The increment ought to be before the aio call so that the _cb can't see
> the value before it's accounted for. 
> 

Right. Same issue as above.

> > +
> > +                offset += bytes;
> > +                len -= bytes;
> > +            }
> > +        }
> > +
> > +        if (*discards) {

And then we decrement here before checking for 0.

Thanks, looks better now.
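
I.e., at the end of nvme_dsm, something like this (a sketch, not
necessarily the exact v4 hunk):

    /* drop the reference held by the submitting context; if any discard
     * callbacks are still outstanding they will complete the request */
    if (--(*discards)) {
        status = NVME_NO_COMPLETE;
    } else {
        g_free(discards);
        req->opaque = NULL;
    }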

> > +            status = NVME_NO_COMPLETE;
> > +        } else {
> > +            g_free(discards);
> > +            req->opaque = NULL;
> > +        }
> > +    }
> > +
> > +out:
> > +    g_free(range);
> > +
> > +    return status;
> > +}
> > +
> >  static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
> >  {
> >      block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
> > @@ -1088,6 +1183,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
> >      case NVME_CMD_WRITE:
> >      case NVME_CMD_READ:
> >          return nvme_rw(n, req);
> > +    case NVME_CMD_DSM:
> > +        return nvme_dsm(n, req);
> >      default:
> >          trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
> >          return NVME_INVALID_OPCODE | NVME_DNR;
> > @@ -2810,7 +2907,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
> >      id->cqes = (0x4 << 4) | 0x4;
> >      id->nn = cpu_to_le32(n->num_namespaces);
> >      id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
> > -                           NVME_ONCS_FEATURES);
> > +                           NVME_ONCS_FEATURES | NVME_ONCS_DSM);
> 
> I think we should set ID_NS.NPDG and NPDA since there really is a
> preferred granularity and alignment.
> 

Yeah. I had some issues with doing this reliably. But I think I nailed
it, see my v4.

> >  
> >      id->vwc = 0x1;
> >      id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
> > -- 
> > 2.28.0
> >

Patch

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index e080a2318a50..574333caa3f9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -28,6 +28,7 @@  typedef struct NvmeRequest {
     struct NvmeNamespace    *ns;
     BlockAIOCB              *aiocb;
     uint16_t                status;
+    void                    *opaque;
     NvmeCqe                 cqe;
     NvmeCmd                 cmd;
     BlockAcctCookie         acct;
@@ -60,6 +61,7 @@  static inline const char *nvme_io_opc_str(uint8_t opc)
     case NVME_CMD_WRITE:            return "NVME_NVM_CMD_WRITE";
     case NVME_CMD_READ:             return "NVME_NVM_CMD_READ";
     case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
+    case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
     default:                        return "NVME_NVM_CMD_UNKNOWN";
     }
 }
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1758cfed965c..a6dd8ae8e220 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -959,6 +959,101 @@  static void nvme_rw_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+static void nvme_aio_discard_cb(void *opaque, int ret)
+{
+    NvmeRequest *req = opaque;
+    int *discards = req->opaque;
+
+    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
+
+    if (ret) {
+        req->status = NVME_INTERNAL_DEV_ERROR;
+        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
+                               req->status);
+    }
+
+    if (discards && --(*discards) > 0) {
+        return;
+    }
+
+    g_free(req->opaque);
+    req->opaque = NULL;
+
+    nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
+static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeNamespace *ns = req->ns;
+    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
+    NvmeDsmRange *range = NULL;
+    int *discards = NULL;
+
+    uint32_t attr = le32_to_cpu(dsm->attributes);
+    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
+
+    uint16_t status = NVME_SUCCESS;
+
+    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
+
+    if (attr & NVME_DSMGMT_AD) {
+        int64_t offset;
+        size_t len;
+
+        range = g_new(NvmeDsmRange, nr);
+
+        status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
+                          DMA_DIRECTION_TO_DEVICE, req);
+        if (status) {
+            goto out;
+        }
+
+        discards = g_new0(int, 1);
+        req->opaque = discards;
+
+        for (int i = 0; i < nr; i++) {
+            uint64_t slba = le64_to_cpu(range[i].slba);
+            uint32_t nlb = le32_to_cpu(range[i].nlb);
+
+            if (nvme_check_bounds(n, ns, slba, nlb)) {
+                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
+                                                     ns->id_ns.nsze);
+                continue;
+            }
+
+            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
+                                          nlb);
+
+            offset = nvme_l2b(ns, slba);
+            len = nvme_l2b(ns, nlb);
+
+            while (len) {
+                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
+
+                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
+                                 nvme_aio_discard_cb, req);
+
+                (*discards)++;
+
+                offset += bytes;
+                len -= bytes;
+            }
+        }
+
+        if (*discards) {
+            status = NVME_NO_COMPLETE;
+        } else {
+            g_free(discards);
+            req->opaque = NULL;
+        }
+    }
+
+out:
+    g_free(range);
+
+    return status;
+}
+
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
     block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
@@ -1088,6 +1183,8 @@  static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
     case NVME_CMD_WRITE:
     case NVME_CMD_READ:
         return nvme_rw(n, req);
+    case NVME_CMD_DSM:
+        return nvme_dsm(n, req);
     default:
         trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
@@ -2810,7 +2907,7 @@  static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->cqes = (0x4 << 4) | 0x4;
     id->nn = cpu_to_le32(n->num_namespaces);
     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
-                           NVME_ONCS_FEATURES);
+                           NVME_ONCS_FEATURES | NVME_ONCS_DSM);
 
     id->vwc = 0x1;
     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |