[v3,2/2] hw/block/nvme: add the dataset management command

Message ID 20201021221736.100779-3-its@irrelevant.dk
State New
Series hw/block/nvme: dulbe and dsm support

Commit Message

Klaus Jensen Oct. 21, 2020, 10:17 p.m. UTC
From: Klaus Jensen <k.jensen@samsung.com>

Add support for the Dataset Management command and the Deallocate
attribute. Deallocation results in discards being sent to the underlying
block device. Whether or not the blocks are actually deallocated is
affected by the same factors as Write Zeroes (see previous commit).

     format | discard | dsm (512b) | dsm (4kb) | dsm (64kb)
    --------------------------------------------------------
      qcow2 | ignore  |     n      |     n     |     n
      qcow2 | unmap   |     n      |     n     |     y
      raw   | ignore  |     n      |     n     |     n
      raw   | unmap   |     n      |     y     |     y

Again, a raw format and 4kb LBAs are preferable.

See NVM Express 1.3d, Section 6.7 ("Dataset Management command").

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.h |  2 +
 hw/block/nvme.c | 99 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 100 insertions(+), 1 deletion(-)
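
For orientation, each entry in the buffer consumed by the command is the
spec's 16-byte range definition. A sketch of the layout (quoted from memory
of QEMU's existing include/block/nvme.h; not part of this diff):

    typedef struct NvmeDsmRange {
        uint32_t cattr; /* context attributes */
        uint32_t nlb;   /* number of logical blocks in the range */
        uint64_t slba;  /* starting LBA of the range */
    } NvmeDsmRange;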

Comments

Keith Busch Oct. 22, 2020, 12:59 a.m. UTC | #1
On Thu, Oct 22, 2020 at 12:17:36AM +0200, Klaus Jensen wrote:
> +static void nvme_aio_discard_cb(void *opaque, int ret)
> +{
> +    NvmeRequest *req = opaque;
> +    int *discards = req->opaque;
> +
> +    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
> +
> +    if (ret) {
> +        req->status = NVME_INTERNAL_DEV_ERROR;
> +        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
> +                               req->status);
> +    }
> +
> +    if (discards && --(*discards) > 0) {
> +        return;
> +    }
> +
> +    g_free(req->opaque);
> +    req->opaque = NULL;
> +
> +    nvme_enqueue_req_completion(nvme_cq(req), req);
> +}
> +
> +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
> +{
> +    NvmeNamespace *ns = req->ns;
> +    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> +    NvmeDsmRange *range = NULL;
> +    int *discards = NULL;
> +
> +    uint32_t attr = le32_to_cpu(dsm->attributes);
> +    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
> +
> +    uint16_t status = NVME_SUCCESS;
> +
> +    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
> +
> +    if (attr & NVME_DSMGMT_AD) {
> +        int64_t offset;
> +        size_t len;
> +
> +        range = g_new(NvmeDsmRange, nr);
> +
> +        status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
> +                          DMA_DIRECTION_TO_DEVICE, req);
> +        if (status) {
> +            goto out;
> +        }
> +
> +        discards = g_new0(int, 1);
> +        req->opaque = discards;

I think you need to initialize discards to 1 so that this function is
holding a reference on it while the asynchronous part is running.
Otherwise, the callback may see 'discards' at 0 and free it while we're
trying to discard the next range.
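
Something like this, I think (a sketch of the intent only):

    discards = g_new0(int, 1);
    req->opaque = discards;

    /* hold a reference for the submitting context; completing callbacks
     * then cannot see the count drop to zero (and free it) while we are
     * still issuing discards for the remaining ranges */
    *discards = 1;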

> +
> +        for (int i = 0; i < nr; i++) {
> +            uint64_t slba = le64_to_cpu(range[i].slba);
> +            uint32_t nlb = le32_to_cpu(range[i].nlb);
> +
> +            if (nvme_check_bounds(n, ns, slba, nlb)) {
> +                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
> +                                                     ns->id_ns.nsze);
> +                continue;
> +            }
> +
> +            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
> +                                          nlb);
> +
> +            offset = nvme_l2b(ns, slba);
> +            len = nvme_l2b(ns, nlb);
> +
> +            while (len) {
> +                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
> +
> +                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
> +                                 nvme_aio_discard_cb, req);
> +
> +                (*discards)++;

The increment ought to be before the aio call so that the _cb can't see
the value before it's accounted for. 
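
E.g. (sketch):

    /* account for the discard before it can possibly complete */
    (*discards)++;

    blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
                     nvme_aio_discard_cb, req);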

> +
> +                offset += bytes;
> +                len -= bytes;
> +            }
> +        }
> +
> +        if (*discards) {
> +            status = NVME_NO_COMPLETE;
> +        } else {
> +            g_free(discards);
> +            req->opaque = NULL;
> +        }
> +    }
> +
> +out:
> +    g_free(range);
> +
> +    return status;
> +}
> +
>  static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
>  {
>      block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
> @@ -1088,6 +1183,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
>      case NVME_CMD_WRITE:
>      case NVME_CMD_READ:
>          return nvme_rw(n, req);
> +    case NVME_CMD_DSM:
> +        return nvme_dsm(n, req);
>      default:
>          trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
>          return NVME_INVALID_OPCODE | NVME_DNR;
> @@ -2810,7 +2907,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
>      id->cqes = (0x4 << 4) | 0x4;
>      id->nn = cpu_to_le32(n->num_namespaces);
>      id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
> -                           NVME_ONCS_FEATURES);
> +                           NVME_ONCS_FEATURES | NVME_ONCS_DSM);

I think we should set ID_NS.NPDG and NPDA since there really is a
preferred granularity and alignment.
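
Something along these lines could work in the namespace init path (just a
sketch; the NPDG/NPDA identify fields are real, the plumbing shown here is
illustrative):

    BlockDriverInfo bdi;
    uint32_t npdg = ns->blkconf.discard_granularity /
                    ns->blkconf.logical_block_size;

    /* prefer the image format cluster size (e.g. qcow2) if it is larger
     * than the configured discard granularity */
    if (bdrv_get_info(blk_bs(ns->blkconf.blk), &bdi) >= 0 &&
        bdi.cluster_size > ns->blkconf.discard_granularity) {
        npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
    }

    /* NPDG and NPDA are 0's based values */
    id_ns->npdg = id_ns->npda = cpu_to_le16(npdg - 1);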

>  
>      id->vwc = 0x1;
>      id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
> -- 
> 2.28.0
>
Klaus Jensen Oct. 22, 2020, 7:34 a.m. UTC | #2
On Oct 21 17:59, Keith Busch wrote:
> On Thu, Oct 22, 2020 at 12:17:36AM +0200, Klaus Jensen wrote:
> > +static void nvme_aio_discard_cb(void *opaque, int ret)
> > +{
> > +    NvmeRequest *req = opaque;
> > +    int *discards = req->opaque;
> > +
> > +    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
> > +
> > +    if (ret) {
> > +        req->status = NVME_INTERNAL_DEV_ERROR;
> > +        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
> > +                               req->status);
> > +    }
> > +
> > +    if (discards && --(*discards) > 0) {
> > +        return;
> > +    }
> > +
> > +    g_free(req->opaque);
> > +    req->opaque = NULL;
> > +
> > +    nvme_enqueue_req_completion(nvme_cq(req), req);
> > +}
> > +
> > +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
> > +{
> > +    NvmeNamespace *ns = req->ns;
> > +    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> > +    NvmeDsmRange *range = NULL;
> > +    int *discards = NULL;
> > +
> > +    uint32_t attr = le32_to_cpu(dsm->attributes);
> > +    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
> > +
> > +    uint16_t status = NVME_SUCCESS;
> > +
> > +    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
> > +
> > +    if (attr & NVME_DSMGMT_AD) {
> > +        int64_t offset;
> > +        size_t len;
> > +
> > +        range = g_new(NvmeDsmRange, nr);
> > +
> > +        status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
> > +                          DMA_DIRECTION_TO_DEVICE, req);
> > +        if (status) {
> > +            goto out;
> > +        }
> > +
> > +        discards = g_new0(int, 1);
> > +        req->opaque = discards;
> 
> I think you need to initialize discards to 1 so that this function is
> holding a reference on it while the asynchronous part is running.
> Otherwise, the callback may see 'discards' at 0 and free it while we're
> trying to discard the next range.
> 

Yes - you are right. The callback might actually get called immediately
in some cases.

> > +
> > +        for (int i = 0; i < nr; i++) {
> > +            uint64_t slba = le64_to_cpu(range[i].slba);
> > +            uint32_t nlb = le32_to_cpu(range[i].nlb);
> > +
> > +            if (nvme_check_bounds(n, ns, slba, nlb)) {
> > +                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
> > +                                                     ns->id_ns.nsze);
> > +                continue;
> > +            }
> > +
> > +            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
> > +                                          nlb);
> > +
> > +            offset = nvme_l2b(ns, slba);
> > +            len = nvme_l2b(ns, nlb);
> > +
> > +            while (len) {
> > +                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
> > +
> > +                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
> > +                                 nvme_aio_discard_cb, req);
> > +
> > +                (*discards)++;
> 
> The increment ought to be before the aio call so that the _cb can't see
> the value before it's accounted for. 
> 

Right. Same issue as above.

> > +
> > +                offset += bytes;
> > +                len -= bytes;
> > +            }
> > +        }
> > +
> > +        if (*discards) {

And then we decrement here before checking for 0.

Thanks, looks better now.
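
I.e., at the end of nvme_dsm, something like this (a sketch, not
necessarily the exact v4 hunk):

    /* drop the reference held by the submitting context; if any discard
     * callbacks are still outstanding they will complete the request */
    if (--(*discards)) {
        status = NVME_NO_COMPLETE;
    } else {
        g_free(discards);
        req->opaque = NULL;
    }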

> > +            status = NVME_NO_COMPLETE;
> > +        } else {
> > +            g_free(discards);
> > +            req->opaque = NULL;
> > +        }
> > +    }
> > +
> > +out:
> > +    g_free(range);
> > +
> > +    return status;
> > +}
> > +
> >  static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
> >  {
> >      block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
> > @@ -1088,6 +1183,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
> >      case NVME_CMD_WRITE:
> >      case NVME_CMD_READ:
> >          return nvme_rw(n, req);
> > +    case NVME_CMD_DSM:
> > +        return nvme_dsm(n, req);
> >      default:
> >          trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
> >          return NVME_INVALID_OPCODE | NVME_DNR;
> > @@ -2810,7 +2907,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
> >      id->cqes = (0x4 << 4) | 0x4;
> >      id->nn = cpu_to_le32(n->num_namespaces);
> >      id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
> > -                           NVME_ONCS_FEATURES);
> > +                           NVME_ONCS_FEATURES | NVME_ONCS_DSM);
> 
> I think we should set ID_NS.NPDG and NPDA since there really is a
> preferred granularity and alignment.
> 

Yeah. I had some issues with doing this reliably. But I think I nailed
it, see my v4.

> >  
> >      id->vwc = 0x1;
> >      id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
> > -- 
> > 2.28.0
> >

Patch

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index e080a2318a50..574333caa3f9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -28,6 +28,7 @@  typedef struct NvmeRequest {
     struct NvmeNamespace    *ns;
     BlockAIOCB              *aiocb;
     uint16_t                status;
+    void                    *opaque;
     NvmeCqe                 cqe;
     NvmeCmd                 cmd;
     BlockAcctCookie         acct;
@@ -60,6 +61,7 @@  static inline const char *nvme_io_opc_str(uint8_t opc)
     case NVME_CMD_WRITE:            return "NVME_NVM_CMD_WRITE";
     case NVME_CMD_READ:             return "NVME_NVM_CMD_READ";
     case NVME_CMD_WRITE_ZEROES:     return "NVME_NVM_CMD_WRITE_ZEROES";
+    case NVME_CMD_DSM:              return "NVME_NVM_CMD_DSM";
     default:                        return "NVME_NVM_CMD_UNKNOWN";
     }
 }
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1758cfed965c..a6dd8ae8e220 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -959,6 +959,101 @@  static void nvme_rw_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+static void nvme_aio_discard_cb(void *opaque, int ret)
+{
+    NvmeRequest *req = opaque;
+    int *discards = req->opaque;
+
+    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
+
+    if (ret) {
+        req->status = NVME_INTERNAL_DEV_ERROR;
+        trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret),
+                               req->status);
+    }
+
+    if (discards && --(*discards) > 0) {
+        return;
+    }
+
+    g_free(req->opaque);
+    req->opaque = NULL;
+
+    nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
+static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeNamespace *ns = req->ns;
+    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
+    NvmeDsmRange *range = NULL;
+    int *discards = NULL;
+
+    uint32_t attr = le32_to_cpu(dsm->attributes);
+    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
+
+    uint16_t status = NVME_SUCCESS;
+
+    trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
+
+    if (attr & NVME_DSMGMT_AD) {
+        int64_t offset;
+        size_t len;
+
+        range = g_new(NvmeDsmRange, nr);
+
+        status = nvme_dma(n, (uint8_t *)range, nr * sizeof(NvmeDsmRange),
+                          DMA_DIRECTION_TO_DEVICE, req);
+        if (status) {
+            goto out;
+        }
+
+        discards = g_new0(int, 1);
+        req->opaque = discards;
+
+        for (int i = 0; i < nr; i++) {
+            uint64_t slba = le64_to_cpu(range[i].slba);
+            uint32_t nlb = le32_to_cpu(range[i].nlb);
+
+            if (nvme_check_bounds(n, ns, slba, nlb)) {
+                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
+                                                     ns->id_ns.nsze);
+                continue;
+            }
+
+            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
+                                          nlb);
+
+            offset = nvme_l2b(ns, slba);
+            len = nvme_l2b(ns, nlb);
+
+            while (len) {
+                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
+
+                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
+                                 nvme_aio_discard_cb, req);
+
+                (*discards)++;
+
+                offset += bytes;
+                len -= bytes;
+            }
+        }
+
+        if (*discards) {
+            status = NVME_NO_COMPLETE;
+        } else {
+            g_free(discards);
+            req->opaque = NULL;
+        }
+    }
+
+out:
+    g_free(range);
+
+    return status;
+}
+
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
     block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
@@ -1088,6 +1183,8 @@  static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
     case NVME_CMD_WRITE:
     case NVME_CMD_READ:
         return nvme_rw(n, req);
+    case NVME_CMD_DSM:
+        return nvme_dsm(n, req);
     default:
         trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
@@ -2810,7 +2907,7 @@  static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     id->cqes = (0x4 << 4) | 0x4;
     id->nn = cpu_to_le32(n->num_namespaces);
     id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
-                           NVME_ONCS_FEATURES);
+                           NVME_ONCS_FEATURES | NVME_ONCS_DSM);
 
     id->vwc = 0x1;
     id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |