
nvme: Implement Write Zeroes

Message ID 20170505090044.28754-1-hch@lst.de
State New

Commit Message

Christoph Hellwig May 5, 2017, 9 a.m. UTC
Signed-off-by: Keith Busch <keith.busch@intel.com>
[hch: ported over from qemu-nvme.git to mainline]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 hw/block/nvme.c | 26 ++++++++++++++++++++++++++
 hw/block/nvme.h |  1 +
 2 files changed, 27 insertions(+)

Comments

Paolo Bonzini May 5, 2017, 9:30 a.m. UTC | #1
On 05/05/2017 11:00, Christoph Hellwig wrote:
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> [hch: ported over from qemu-nvme.git to mainline]
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  hw/block/nvme.c | 26 ++++++++++++++++++++++++++
>  hw/block/nvme.h |  1 +
>  2 files changed, 27 insertions(+)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index ae303d44e5..3f4d2bf2ba 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -227,6 +227,29 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
>      return NVME_NO_COMPLETE;
>  }
>  
> +static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
> +    NvmeRequest *req)
> +{
> +    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
> +    const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
> +    const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
> +    uint64_t slba = le64_to_cpu(rw->slba);
> +    uint32_t nlb  = le16_to_cpu(rw->nlb) + 1;
> +    uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
> +    uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);
> +
> +    if (slba + nlb > ns->id_ns.nsze) {
> +        return NVME_LBA_RANGE | NVME_DNR;
> +    }
> +
> +    req->has_sg = false;
> +    block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
> +                     BLOCK_ACCT_WRITE);
> +    req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, aio_slba, aio_nlb, 0,
> +                                        nvme_rw_cb, req);

Hi Christoph,

could you pass BDRV_REQ_MAY_UNMAP for the flags here if the deallocate
bit (dword 12 bit 25) is set?

Thanks,

Paolo

> +    return NVME_NO_COMPLETE;
> +}
> +
>  static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
>      NvmeRequest *req)
>  {
> @@ -279,6 +302,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>      switch (cmd->opcode) {
>      case NVME_CMD_FLUSH:
>          return nvme_flush(n, ns, cmd, req);
> +    case NVME_CMD_WRITE_ZEROS:
> +        return nvme_write_zeros(n, ns, cmd, req);
>      case NVME_CMD_WRITE:
>      case NVME_CMD_READ:
>          return nvme_rw(n, ns, cmd, req);
> @@ -895,6 +920,7 @@ static int nvme_init(PCIDevice *pci_dev)
>      id->sqes = (0x6 << 4) | 0x6;
>      id->cqes = (0x4 << 4) | 0x4;
>      id->nn = cpu_to_le32(n->num_namespaces);
> +    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS);
>      id->psd[0].mp = cpu_to_le16(0x9c4);
>      id->psd[0].enlat = cpu_to_le32(0x10);
>      id->psd[0].exlat = cpu_to_le32(0x4);
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 8fb0c10756..a0d15649f9 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -179,6 +179,7 @@ enum NvmeIoCommands {
>      NVME_CMD_READ               = 0x02,
>      NVME_CMD_WRITE_UNCOR        = 0x04,
>      NVME_CMD_COMPARE            = 0x05,
> +    NVME_CMD_WRITE_ZEROS        = 0x08,
>      NVME_CMD_DSM                = 0x09,
>  };
>  
>
Christoph Hellwig May 5, 2017, 9:51 a.m. UTC | #2
On Fri, May 05, 2017 at 11:30:11AM +0200, Paolo Bonzini wrote:
> could you pass BDRV_REQ_MAY_UNMAP for the flags here if the deallocate
> bit (dword 12 bit 25) is set?

In fact we should do that unconditionally.  The deallocate bit is new
in 1.3 (which we don't claim to support) and forces deallocation, but
NVMe already allows for this behavior without the flag (we in fact
explicitly clarified this in an ECN for 1.2.1).
Paolo Bonzini May 5, 2017, 10:03 a.m. UTC | #3
On 05/05/2017 11:51, Christoph Hellwig wrote:
>> could you pass BDRV_REQ_MAY_UNMAP for the flags here if the deallocate
>> bit (dword 12 bit 25) is set?
> In fact we should do that unconditionally.  The deallocate bit is new
> in 1.3 (which we don't claim to support) and forces deallocation, but
> NVMe already allows for this behavior without the flag (we in fact
> explicitly clarified this in an ECN for 1.2.1).

While that's allowed and it makes sense indeed on SSDs, for QEMU's
typical usage it can lead to fragmentation and worse performance.  On
extent-based file systems, write zeroes without deallocate can be
implemented very efficiently with FALLOC_FL_ZERO_RANGE, and there's no
reason not to do it.
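
In host-side terms, a simplified sketch of that mapping (illustrative only,
not QEMU's actual raw-posix code) for a raw file backend using fallocate(2):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdbool.h>

/* Zero a byte range of an image file, optionally allowing deallocation. */
static int zero_range(int fd, off_t offset, off_t len, bool may_unmap)
{
    if (may_unmap) {
        /* Deallocation allowed: punch a hole; reads of the range return zeroes. */
        return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                         offset, len);
    }
    /* Keep the blocks allocated and just zero them (e.g. XFS, ext4). */
    return fallocate(fd, FALLOC_FL_ZERO_RANGE, offset, len);
}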

Is there anything backwards-incompatible in 1.3 that would make it hard
to just bump the supported version?

Paolo
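
A minimal sketch of the conditional variant being discussed, written against
nvme_write_zeros() from the patch below; it assumes the Deallocate bit
(command dword 12, bit 25) corresponds to bit 9 of the NvmeRwCmd control
field (dword 12 bits 31:16), and is illustrative only, not part of the
posted patch:

    uint16_t control = le16_to_cpu(rw->control);
    /* Dword 12 bit 25 (Deallocate) sits in bit 9 of the 16-bit control field. */
    BdrvRequestFlags flags = (control & (1 << 9)) ? BDRV_REQ_MAY_UNMAP : 0;

    req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, aio_slba, aio_nlb, flags,
                                       nvme_rw_cb, req);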
Christoph Hellwig May 5, 2017, 10:10 a.m. UTC | #4
On Fri, May 05, 2017 at 12:03:40PM +0200, Paolo Bonzini wrote:
> While that's allowed and it makes sense indeed on SSDs, for QEMU's
> typical usage it can lead to fragmentation and worse performance.  On
> extent-based file systems, write zeroes without deallocate can be
> implemented very efficiently with FALLOC_FL_ZERO_RANGE, and there's no
> reason not to do it.

Ok..

> Is there anything backwards-incompatible in 1.3 that would make it hard
> to just bump the supported version?

1.2.1 requires a valid NQN in Identify Controller, and 1.3 requires
the new Identify CNS 03h for controller identification.  Otherwise
it's mostly fine tuning corner conditions for which I'd have to audit
the code.

Patch

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index ae303d44e5..3f4d2bf2ba 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -227,6 +227,29 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     return NVME_NO_COMPLETE;
 }
 
+static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
+    NvmeRequest *req)
+{
+    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
+    const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+    const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
+    uint64_t slba = le64_to_cpu(rw->slba);
+    uint32_t nlb  = le16_to_cpu(rw->nlb) + 1;
+    uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
+    uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);
+
+    if (slba + nlb > ns->id_ns.nsze) {
+        return NVME_LBA_RANGE | NVME_DNR;
+    }
+
+    req->has_sg = false;
+    block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
+                     BLOCK_ACCT_WRITE);
+    req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, aio_slba, aio_nlb, 0,
+                                        nvme_rw_cb, req);
+    return NVME_NO_COMPLETE;
+}
+
 static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     NvmeRequest *req)
 {
@@ -279,6 +302,8 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     switch (cmd->opcode) {
     case NVME_CMD_FLUSH:
         return nvme_flush(n, ns, cmd, req);
+    case NVME_CMD_WRITE_ZEROS:
+        return nvme_write_zeros(n, ns, cmd, req);
     case NVME_CMD_WRITE:
     case NVME_CMD_READ:
         return nvme_rw(n, ns, cmd, req);
@@ -895,6 +920,7 @@ static int nvme_init(PCIDevice *pci_dev)
     id->sqes = (0x6 << 4) | 0x6;
     id->cqes = (0x4 << 4) | 0x4;
     id->nn = cpu_to_le32(n->num_namespaces);
+    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS);
     id->psd[0].mp = cpu_to_le16(0x9c4);
     id->psd[0].enlat = cpu_to_le32(0x10);
     id->psd[0].exlat = cpu_to_le32(0x4);
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 8fb0c10756..a0d15649f9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -179,6 +179,7 @@ enum NvmeIoCommands {
     NVME_CMD_READ               = 0x02,
     NVME_CMD_WRITE_UNCOR        = 0x04,
     NVME_CMD_COMPARE            = 0x05,
+    NVME_CMD_WRITE_ZEROS        = 0x08,
     NVME_CMD_DSM                = 0x09,
 };