
[v5,09/14] hw/block/nvme: Support Zoned Namespace Command Set

Message ID 20200928023528.15260-10-dmitry.fomichev@wdc.com
State New
Series hw/block/nvme: Support Namespace Types and Zoned Namespace Command Set

Commit Message

Dmitry Fomichev Sept. 28, 2020, 2:35 a.m. UTC
The emulation code has been changed to advertise NVM Command Set when
"zoned" device property is not set (default) and Zoned Namespace
Command Set otherwise.

Handlers for three new NVMe commands introduced in Zoned Namespace
Command Set specification are added, namely for Zone Management
Receive, Zone Management Send and Zone Append.

Device initialization code has been extended to create a proper
configuration for zoned operation using device properties.

Read/Write command handler is modified to only allow writes at the
write pointer if the namespace is zoned. For Zone Append command,
writes implicitly happen at the write pointer and the starting write
pointer value is returned as the result of the command. Write Zeroes
handler is modified to add zoned checks that are identical to those
done as a part of Write flow.

The code to support Zone Descriptor Extensions is not included in
this commit and ZDES 0 is always reported. A later commit in this
series will add ZDE support.

This commit doesn't yet include checks for active and open zone
limits. It is assumed that there are no limits on either active or
open zones.

Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Matias Bjorling <matias.bjorling@wdc.com>
Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Signed-off-by: Adam Manzanares <adam.manzanares@wdc.com>
Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
---
 block/nvme.c         |   2 +-
 hw/block/nvme-ns.c   | 185 ++++++++-
 hw/block/nvme-ns.h   |   6 +-
 hw/block/nvme.c      | 872 +++++++++++++++++++++++++++++++++++++++++--
 include/block/nvme.h |   6 +-
 5 files changed, 1033 insertions(+), 38 deletions(-)

Comments

Klaus Jensen Sept. 28, 2020, 6:44 a.m. UTC | #1
On Sep 28 11:35, Dmitry Fomichev wrote:
> The emulation code has been changed to advertise NVM Command Set when
> "zoned" device property is not set (default) and Zoned Namespace
> Command Set otherwise.
> 
> Handlers for three new NVMe commands introduced in Zoned Namespace
> Command Set specification are added, namely for Zone Management
> Receive, Zone Management Send and Zone Append.
> 
> Device initialization code has been extended to create a proper
> configuration for zoned operation using device properties.
> 
> Read/Write command handler is modified to only allow writes at the
> write pointer if the namespace is zoned. For Zone Append command,
> writes implicitly happen at the write pointer and the starting write
> pointer value is returned as the result of the command. Write Zeroes
> handler is modified to add zoned checks that are identical to those
> done as a part of Write flow.
> 
> The code to support Zone Descriptor Extensions is not included in
> this commit and ZDES 0 is always reported. A later commit in this
> series will add ZDE support.
> 
> This commit doesn't yet include checks for active and open zone
> limits. It is assumed that there are no limits on either active or
> open zones.
> 
> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
> Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
> Signed-off-by: Matias Bjorling <matias.bjorling@wdc.com>
> Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
> Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> Signed-off-by: Adam Manzanares <adam.manzanares@wdc.com>
> Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> ---
>  block/nvme.c         |   2 +-
>  hw/block/nvme-ns.c   | 185 ++++++++-
>  hw/block/nvme-ns.h   |   6 +-
>  hw/block/nvme.c      | 872 +++++++++++++++++++++++++++++++++++++++++--
>  include/block/nvme.h |   6 +-
>  5 files changed, 1033 insertions(+), 38 deletions(-)
> 
> diff --git a/block/nvme.c b/block/nvme.c
> index 05485fdd11..7a513c9a17 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
> @@ -1040,18 +1318,468 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
>          goto invalid;
>      }
>  
> +    if (ns->params.zoned) {
> +        zone_idx = nvme_zone_idx(ns, slba);
> +        assert(zone_idx < ns->num_zones);
> +        zone = &ns->zone_array[zone_idx];
> +
> +        if (is_write) {
> +            status = nvme_check_zone_write(zone, slba, nlb);
> +            if (status != NVME_SUCCESS) {
> +                trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
> +                goto invalid;
> +            }
> +
> +            assert(nvme_wp_is_valid(zone));
> +            if (append) {
> +                if (unlikely(slba != zone->d.zslba)) {
> +                    trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
> +                    status = NVME_ZONE_INVALID_WRITE | NVME_DNR;
> +                    goto invalid;
> +                }
> +                if (data_size > (n->page_size << n->zasl)) {
> +                    trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
> +                    status = NVME_INVALID_FIELD | NVME_DNR;
> +                    goto invalid;
> +                }
> +                slba = zone->w_ptr;
> +            } else if (unlikely(slba != zone->w_ptr)) {
> +                trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
> +                                                   zone->w_ptr);
> +                status = NVME_ZONE_INVALID_WRITE | NVME_DNR;
> +                goto invalid;
> +            }
> +            req->fill_ofs = -1LL;
> +        } else {
> +            status = nvme_check_zone_read(ns, zone, slba, nlb);
> +            if (status != NVME_SUCCESS) {
> +                trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
> +                goto invalid;
> +            }
> +
> +            if (slba + nlb > zone->w_ptr) {
> +                /*
> +                 * All or some data is read above the WP. Need to
> +                 * fill out the buffer area that has no backing data
> +                 * with a predefined data pattern (zeros by default)
> +                 */
> +                if (slba >= zone->w_ptr) {
> +                    req->fill_ofs = 0;
> +                } else {
> +                    req->fill_ofs = nvme_l2b(ns, zone->w_ptr - slba);
> +                }
> +                req->fill_len = nvme_l2b(ns,
> +                    nvme_zone_rd_boundary(ns, zone) - slba);

OK then. Next edge case.

Now what happens if the read crosses into a partially written zone and
reads above the write pointer in that zone?
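
To make it concrete (the numbers are only an illustration, assuming 8-LBA
zones and cross_zone_read enabled):

  zone 0: zslba = 0, w_ptr = 6
  zone 1: zslba = 8, w_ptr = 10

  read slba = 4, nlb = 8  -> covers LBAs 4..11

fill_ofs and fill_len are computed against zone 0 only, so the fill covers
LBAs 6..7, while LBAs 10..11 - above zone 1's write pointer - come back as
whatever the backend read returns, not the fill pattern.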
Klaus Jensen Sept. 28, 2020, 10:42 a.m. UTC | #2
On Sep 28 11:35, Dmitry Fomichev wrote:
> The emulation code has been changed to advertise NVM Command Set when
> "zoned" device property is not set (default) and Zoned Namespace
> Command Set otherwise.
> 
> Handlers for three new NVMe commands introduced in Zoned Namespace
> Command Set specification are added, namely for Zone Management
> Receive, Zone Management Send and Zone Append.
> 
> Device initialization code has been extended to create a proper
> configuration for zoned operation using device properties.
> 
> Read/Write command handler is modified to only allow writes at the
> write pointer if the namespace is zoned. For Zone Append command,
> writes implicitly happen at the write pointer and the starting write
> pointer value is returned as the result of the command. Write Zeroes
> handler is modified to add zoned checks that are identical to those
> done as a part of Write flow.
> 
> The code to support Zone Descriptor Extensions is not included in
> this commit and ZDES 0 is always reported. A later commit in this
> series will add ZDE support.
> 
> This commit doesn't yet include checks for active and open zone
> limits. It is assumed that there are no limits on either active or
> open zones.
> 

I think the fill_pattern feature stands separate, so it would be nice to
extract that to a patch on its own.

> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
> Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
> Signed-off-by: Matias Bjorling <matias.bjorling@wdc.com>
> Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
> Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> Signed-off-by: Adam Manzanares <adam.manzanares@wdc.com>
> Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> ---
>  block/nvme.c         |   2 +-
>  hw/block/nvme-ns.c   | 185 ++++++++-
>  hw/block/nvme-ns.h   |   6 +-
>  hw/block/nvme.c      | 872 +++++++++++++++++++++++++++++++++++++++++--
>  include/block/nvme.h |   6 +-
>  5 files changed, 1033 insertions(+), 38 deletions(-)
> 
> diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> index 04172f083e..daa13546c4 100644
> --- a/hw/block/nvme-ns.h
> +++ b/hw/block/nvme-ns.h
> @@ -38,7 +38,6 @@ typedef struct NvmeZoneList {
>  
>  typedef struct NvmeNamespaceParams {
>      uint32_t nsid;
> -    uint8_t  csi;
>      bool     attached;
>      QemuUUID uuid;
>  
> @@ -52,6 +51,7 @@ typedef struct NvmeNamespace {
>      DeviceState  parent_obj;
>      BlockConf    blkconf;
>      int32_t      bootindex;
> +    uint8_t      csi;
>      int64_t      size;
>      NvmeIdNs     id_ns;

This should be squashed into the namespace types patch.

> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 63ad03d6d6..38e25a4d1f 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -54,6 +54,7 @@
>  #include "qemu/osdep.h"
>  #include "qemu/units.h"
>  #include "qemu/error-report.h"
> +#include "crypto/random.h"

I think this is not used until the offline/read-only zones injection
patch, right?

> +static bool nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
> +                                      bool failed)
> +{
> +    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
> +    NvmeZone *zone;
> +    uint64_t slba, start_wp = req->cqe.result64;
> +    uint32_t nlb, zone_idx;
> +    uint8_t zs;
> +
> +    if (rw->opcode != NVME_CMD_WRITE &&
> +        rw->opcode != NVME_CMD_ZONE_APPEND &&
> +        rw->opcode != NVME_CMD_WRITE_ZEROES) {
> +        return false;
> +    }
> +
> +    slba = le64_to_cpu(rw->slba);
> +    nlb = le16_to_cpu(rw->nlb) + 1;
> +    zone_idx = nvme_zone_idx(ns, slba);
> +    assert(zone_idx < ns->num_zones);
> +    zone = &ns->zone_array[zone_idx];
> +
> +    if (!failed && zone->w_ptr < start_wp + nlb) {
> +        /*
> +         * A preceding queued write to the zone has failed,
> +         * now this write is not at the WP, fail it too.
> +         */
> +        failed = true;
> +    }
> +
> +    if (failed) {
> +        if (zone->w_ptr > start_wp) {
> +            zone->w_ptr = start_wp;
> +        }

It is possible (though unlikely) that you already posted the CQE for the
write that moved the WP to w_ptr - and now you are reverting it.  This
looks like a recipe for data corruption to me.

Take this example. I use append, because if you have multiple regular
writes in queue you're screwed anyway.

  w_ptr = 0, d.wp = 0
  append 1 lba  -> w_ptr = 1, start_wp = 0, issues aio A
  append 2 lbas -> w_ptr = 3, start_wp = 1, issues aio B

  aio B success -> d.wp = 2 (since you are adding nlb),

Now, I totally do the same. Even though the zone descriptor write
pointer gets "out of sync", it will be reconciled in the absence of
failures and it's fair to define that the host cannot expect a consistent
view of the write pointer without quiescing I/O.

The problem is if a write then fails:

  aio A fails   -> w_ptr > start_wp (3 > 1), so you revert to w_ptr = 1

That looks bad to me. I don't think this is ever reconciled? If another
append then comes in:

  append 1 lba -> w_ptr = 2, start_wp = 1, issues aio C and overwrites
                                           the second append from before.
  aio C success -> d.wp = 3 (but it should be 2)

> @@ -1513,11 +2267,16 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
>  static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
>  {
>      NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
> +    NvmeIdCtrlZoned id = {};
>  
>      trace_pci_nvme_identify_ctrl_csi(c->csi);
>  
>      if (c->csi == NVME_CSI_NVM) {
>          return nvme_rpt_empty_id_struct(n, req);
> +    } else if (c->csi == NVME_CSI_ZONED) {
> +        id.zasl = n->zasl;

I don't think it should overwrite the zasl value specified by the user.
If the user specified 0, then it should return 0 for zasl here.
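
Something along these lines is what I would expect (rough, untested sketch;
params.zasl is a hypothetical field holding the user-specified value, and the
nvme_dma() return is assumed from the surrounding code):

    } else if (c->csi == NVME_CSI_ZONED) {
        /* report exactly what the user configured, 0 if nothing was set */
        id.zasl = n->params.zasl;
        return nvme_dma(n, (uint8_t *)&id, sizeof(id),
                        DMA_DIRECTION_FROM_DEVICE, req);
    }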

> @@ -2310,16 +3086,28 @@ static int nvme_start_ctrl(NvmeCtrl *n)
>              continue;
>          }
>          ns->params.attached = false;
> -        switch (ns->params.csi) {
> +        switch (ns->csi) {
>          case NVME_CSI_NVM:
>              if (NVME_CC_CSS(n->bar.cc) == CSS_NVM_ONLY ||
>                  NVME_CC_CSS(n->bar.cc) == CSS_CSI) {
>                  ns->params.attached = true;
>              }
>              break;
> +        case NVME_CSI_ZONED:
> +            if (NVME_CC_CSS(n->bar.cc) == CSS_CSI) {
> +                ns->params.attached = true;
> +            }
> +            break;
>          }
>      }
>  
> +    if (!n->zasl_bs) {
> +        assert(n->params.mdts);

A value of 0 for MDTS is perfectly valid.
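
I.e. something like this (sketch only, based on ZASL == 0 meaning "same limit
as MDTS" in the spec; the actual fallback in your code may look different):

    if (!n->zasl_bs) {
        /*
         * MDTS == 0 means no transfer size limit; let ZASL follow it
         * instead of asserting. (The append-size check would then need
         * to treat a zero limit as "no limit" as well.)
         */
        n->zasl = n->params.mdts;
    }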

> @@ -2382,10 +3170,11 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
>                  case CSS_NVM_ONLY:
>                      trace_pci_nvme_css_nvm_cset_selected_by_host(data &
>                                                                   0xffffffff);
> -                    break;
> +                break;

Spurious misaligned break here.
Klaus Jensen Sept. 30, 2020, 5:20 a.m. UTC | #3
On Sep 28 12:42, Klaus Jensen wrote:
> On Sep 28 11:35, Dmitry Fomichev wrote:
> > The emulation code has been changed to advertise NVM Command Set when
> > "zoned" device property is not set (default) and Zoned Namespace
> > Command Set otherwise.
> > 
> > Handlers for three new NVMe commands introduced in Zoned Namespace
> > Command Set specification are added, namely for Zone Management
> > Receive, Zone Management Send and Zone Append.
> > 
> > Device initialization code has been extended to create a proper
> > configuration for zoned operation using device properties.
> > 
> > Read/Write command handler is modified to only allow writes at the
> > write pointer if the namespace is zoned. For Zone Append command,
> > writes implicitly happen at the write pointer and the starting write
> > pointer value is returned as the result of the command. Write Zeroes
> > handler is modified to add zoned checks that are identical to those
> > done as a part of Write flow.
> > 
> > The code to support Zone Descriptor Extensions is not included in
> > this commit and ZDES 0 is always reported. A later commit in this
> > series will add ZDE support.
> > 
> > This commit doesn't yet include checks for active and open zone
> > limits. It is assumed that there are no limits on either active or
> > open zones.
> > 
> 
> I think the fill_pattern feature stands separate, so it would be nice to
> extract that to a patch on its own.
> 

Please disregard this.

Since the fill_pattern feature is tightly bound to reading in zones, it
doesn't really make sense to extract it.
Klaus Jensen Sept. 30, 2020, 5:59 a.m. UTC | #4
On Sep 28 11:35, Dmitry Fomichev wrote:
> The emulation code has been changed to advertise NVM Command Set when
> "zoned" device property is not set (default) and Zoned Namespace
> Command Set otherwise.
> 
> Handlers for three new NVMe commands introduced in Zoned Namespace
> Command Set specification are added, namely for Zone Management
> Receive, Zone Management Send and Zone Append.
> 
> Device initialization code has been extended to create a proper
> configuration for zoned operation using device properties.
> 
> Read/Write command handler is modified to only allow writes at the
> write pointer if the namespace is zoned. For Zone Append command,
> writes implicitly happen at the write pointer and the starting write
> pointer value is returned as the result of the command. Write Zeroes
> handler is modified to add zoned checks that are identical to those
> done as a part of Write flow.
> 
> The code to support Zone Descriptor Extensions is not included in
> this commit and ZDES 0 is always reported. A later commit in this
> series will add ZDE support.
> 
> This commit doesn't yet include checks for active and open zone
> limits. It is assumed that there are no limits on either active or
> open zones.
> 
> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
> Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
> Signed-off-by: Matias Bjorling <matias.bjorling@wdc.com>
> Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
> Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> Signed-off-by: Adam Manzanares <adam.manzanares@wdc.com>
> Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> ---
>  block/nvme.c         |   2 +-
>  hw/block/nvme-ns.c   | 185 ++++++++-
>  hw/block/nvme-ns.h   |   6 +-
>  hw/block/nvme.c      | 872 +++++++++++++++++++++++++++++++++++++++++--
>  include/block/nvme.h |   6 +-
>  5 files changed, 1033 insertions(+), 38 deletions(-)
> 
> diff --git a/block/nvme.c b/block/nvme.c
> index 05485fdd11..7a513c9a17 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
> +static int nvme_calc_zone_geometry(NvmeNamespace *ns, Error **errp)
> +{
> +    uint64_t zone_size, zone_cap;
> +    uint32_t nz, lbasz = ns->blkconf.logical_block_size;
> +
> +    if (ns->params.zone_size_mb) {
> +        zone_size = ns->params.zone_size_mb;
> +    } else {
> +        zone_size = NVME_DEFAULT_ZONE_SIZE;
> +    }
> +    if (ns->params.zone_capacity_mb) {
> +        zone_cap = ns->params.zone_capacity_mb;
> +    } else {
> +        zone_cap = zone_size;
> +    }

I think a check that zone_capacity_mb is less than or equal to
zone_size_mb is missing earlier?

> +static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace *ns, int lba_index,
> +                              Error **errp)
> +{
> +    NvmeIdNsZoned *id_ns_z;
> +
> +    if (n->params.fill_pattern == 0) {
> +        ns->id_ns.dlfeat |= 0x01;
> +    } else if (n->params.fill_pattern == 0xff) {
> +        ns->id_ns.dlfeat |= 0x02;
> +    }
> +
> +    if (nvme_calc_zone_geometry(ns, errp) != 0) {
> +        return -1;
> +    }
> +
> +    nvme_init_zone_meta(ns);
> +
> +    id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
> +
> +    /* MAR/MOR are zeroes-based, 0xffffffff means no limit */
> +    id_ns_z->mar = 0xffffffff;
> +    id_ns_z->mor = 0xffffffff;
> +    id_ns_z->zoc = 0;
> +    id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
> +
> +    id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
> +    id_ns_z->lbafe[lba_index].zdes = 0; /* FIXME make helper */
> +
> +    ns->csi = NVME_CSI_ZONED;
> +    ns->id_ns.ncap = cpu_to_le64(ns->zone_capacity * ns->num_zones);
> +    ns->id_ns.nuse = ns->id_ns.ncap;
> +    ns->id_ns.nsze = ns->id_ns.ncap;
> +

NSZE should be in terms of ZSZE. We *can* report NCAP < NSZE if zcap !=
zsze, but that requires bit 1 set in NSFEAT and proper reporting of
NUSE.
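
Roughly what I mean, ignoring the NSFEAT/NUSE details for the zcap != zsze
case (sketch, not tested):

    ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size);
    ns->id_ns.ncap = cpu_to_le64(ns->num_zones * ns->zone_capacity);
    ns->id_ns.nuse = ns->id_ns.ncap;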

> @@ -133,6 +304,14 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
>  static Property nvme_ns_props[] = {
>      DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
>      DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
> +
> +    DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false),
> +    DEFINE_PROP_UINT64("zone_size", NvmeNamespace, params.zone_size_mb,
> +                       NVME_DEFAULT_ZONE_SIZE),
> +    DEFINE_PROP_UINT64("zone_capacity", NvmeNamespace,
> +                       params.zone_capacity_mb, 0),

There is a nice DEFINE_PROP_SIZE that handles sizes in a nice way (e.g.
1G, 1M).
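
For example (sketch; assumes the params are switched to byte values, with
NVME_DEFAULT_ZONE_SIZE redefined in bytes - the *_bs field names are made up):

    DEFINE_PROP_SIZE("zone_size", NvmeNamespace, params.zone_size_bs,
                     NVME_DEFAULT_ZONE_SIZE),
    DEFINE_PROP_SIZE("zone_capacity", NvmeNamespace,
                     params.zone_capacity_bs, 0),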
Niklas Cassel Sept. 30, 2020, 2:50 p.m. UTC | #5
On Mon, Sep 28, 2020 at 11:35:23AM +0900, Dmitry Fomichev wrote:
> The emulation code has been changed to advertise NVM Command Set when
> "zoned" device property is not set (default) and Zoned Namespace
> Command Set otherwise.
> 
> Handlers for three new NVMe commands introduced in Zoned Namespace
> Command Set specification are added, namely for Zone Management
> Receive, Zone Management Send and Zone Append.
> 
> Device initialization code has been extended to create a proper
> configuration for zoned operation using device properties.
> 
> Read/Write command handler is modified to only allow writes at the
> write pointer if the namespace is zoned. For Zone Append command,
> writes implicitly happen at the write pointer and the starting write
> pointer value is returned as the result of the command. Write Zeroes
> handler is modified to add zoned checks that are identical to those
> done as a part of Write flow.
> 
> The code to support Zone Descriptor Extensions is not included in
> this commit and ZDES 0 is always reported. A later commit in this
> series will add ZDE support.
> 
> This commit doesn't yet include checks for active and open zone
> limits. It is assumed that there are no limits on either active or
> open zones.
> 
> Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
> Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
> Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
> Signed-off-by: Matias Bjorling <matias.bjorling@wdc.com>
> Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
> Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> Signed-off-by: Adam Manzanares <adam.manzanares@wdc.com>
> Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> ---
>  block/nvme.c         |   2 +-
>  hw/block/nvme-ns.c   | 185 ++++++++-
>  hw/block/nvme-ns.h   |   6 +-
>  hw/block/nvme.c      | 872 +++++++++++++++++++++++++++++++++++++++++--
>  include/block/nvme.h |   6 +-
>  5 files changed, 1033 insertions(+), 38 deletions(-)
> 
> diff --git a/block/nvme.c b/block/nvme.c
> index 05485fdd11..7a513c9a17 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
> @@ -333,7 +333,7 @@ static inline int nvme_translate_error(const NvmeCqe *c)
>  {
>      uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
>      if (status) {
> -        trace_nvme_error(le32_to_cpu(c->result),
> +        trace_nvme_error(le32_to_cpu(c->result32),
>                           le16_to_cpu(c->sq_head),
>                           le16_to_cpu(c->sq_id),
>                           le16_to_cpu(c->cid),
> diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> index 31b7f986c3..6d9dc9205b 100644
> --- a/hw/block/nvme-ns.c
> +++ b/hw/block/nvme-ns.c
> @@ -33,14 +33,14 @@ static void nvme_ns_init(NvmeNamespace *ns)
>      NvmeIdNs *id_ns = &ns->id_ns;
>  
>      if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
> -        ns->id_ns.dlfeat = 0x9;
> +        ns->id_ns.dlfeat = 0x8;

You seem to change something that is NVM namespace specific here - why?
If it is indeed needed, I assume that this change should be in a separate
patch.

>      }
>  
>      id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
>  
>      id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
>  
> -    ns->params.csi = NVME_CSI_NVM;
> +    ns->csi = NVME_CSI_NVM;
>      qemu_uuid_generate(&ns->params.uuid); /* TODO make UUIDs persistent */
>  
>      /* no thin provisioning */
> @@ -73,7 +73,162 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
>      }
>  
>      lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
> -    ns->id_ns.lbaf[lba_index].ds = 31 - clz32(n->conf.logical_block_size);
> +    ns->id_ns.lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);
> +
> +    return 0;
> +}
> +
> +/*
> + * Add a zone to the tail of a zone list.
> + */
> +void nvme_add_zone_tail(NvmeNamespace *ns, NvmeZoneList *zl, NvmeZone *zone)
> +{
> +    uint32_t idx = (uint32_t)(zone - ns->zone_array);
> +
> +    assert(nvme_zone_not_in_list(zone));
> +
> +    if (!zl->size) {
> +        zl->head = zl->tail = idx;
> +        zone->next = zone->prev = NVME_ZONE_LIST_NIL;
> +    } else {
> +        ns->zone_array[zl->tail].next = idx;
> +        zone->prev = zl->tail;
> +        zone->next = NVME_ZONE_LIST_NIL;
> +        zl->tail = idx;
> +    }
> +    zl->size++;
> +}
> +
> +/*
> + * Remove a zone from a zone list. The zone must be linked in the list.
> + */
> +void nvme_remove_zone(NvmeNamespace *ns, NvmeZoneList *zl, NvmeZone *zone)
> +{
> +    uint32_t idx = (uint32_t)(zone - ns->zone_array);
> +
> +    assert(!nvme_zone_not_in_list(zone));
> +
> +    --zl->size;
> +    if (zl->size == 0) {
> +        zl->head = NVME_ZONE_LIST_NIL;
> +        zl->tail = NVME_ZONE_LIST_NIL;
> +    } else if (idx == zl->head) {
> +        zl->head = zone->next;
> +        ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL;
> +    } else if (idx == zl->tail) {
> +        zl->tail = zone->prev;
> +        ns->zone_array[zl->tail].next = NVME_ZONE_LIST_NIL;
> +    } else {
> +        ns->zone_array[zone->next].prev = zone->prev;
> +        ns->zone_array[zone->prev].next = zone->next;
> +    }
> +
> +    zone->prev = zone->next = 0;
> +}
> +
> +static int nvme_calc_zone_geometry(NvmeNamespace *ns, Error **errp)
> +{
> +    uint64_t zone_size, zone_cap;
> +    uint32_t nz, lbasz = ns->blkconf.logical_block_size;
> +
> +    if (ns->params.zone_size_mb) {
> +        zone_size = ns->params.zone_size_mb;
> +    } else {
> +        zone_size = NVME_DEFAULT_ZONE_SIZE;
> +    }
> +    if (ns->params.zone_capacity_mb) {
> +        zone_cap = ns->params.zone_capacity_mb;
> +    } else {
> +        zone_cap = zone_size;
> +    }
> +    ns->zone_size = zone_size * MiB / lbasz;
> +    ns->zone_capacity = zone_cap * MiB / lbasz;
> +    if (ns->zone_capacity > ns->zone_size) {
> +        error_setg(errp, "zone capacity exceeds zone size");
> +        return -1;
> +    }
> +
> +    nz = DIV_ROUND_UP(ns->size / lbasz, ns->zone_size);
> +    ns->num_zones = nz;
> +    ns->zone_array_size = sizeof(NvmeZone) * nz;
> +    ns->zone_size_log2 = 0;
> +    if (is_power_of_2(ns->zone_size)) {
> +        ns->zone_size_log2 = 63 - clz64(ns->zone_size);
> +    }
> +
> +    return 0;
> +}
> +
> +static void nvme_init_zone_meta(NvmeNamespace *ns)
> +{
> +    uint64_t start = 0, zone_size = ns->zone_size;
> +    uint64_t capacity = ns->num_zones * zone_size;
> +    NvmeZone *zone;
> +    int i;
> +
> +    ns->zone_array = g_malloc0(ns->zone_array_size);
> +    ns->exp_open_zones = g_malloc0(sizeof(NvmeZoneList));
> +    ns->imp_open_zones = g_malloc0(sizeof(NvmeZoneList));
> +    ns->closed_zones = g_malloc0(sizeof(NvmeZoneList));
> +    ns->full_zones = g_malloc0(sizeof(NvmeZoneList));
> +
> +    nvme_init_zone_list(ns->exp_open_zones);
> +    nvme_init_zone_list(ns->imp_open_zones);
> +    nvme_init_zone_list(ns->closed_zones);
> +    nvme_init_zone_list(ns->full_zones);
> +
> +    zone = ns->zone_array;
> +    for (i = 0; i < ns->num_zones; i++, zone++) {
> +        if (start + zone_size > capacity) {
> +            zone_size = capacity - start;
> +        }
> +        zone->d.zt = NVME_ZONE_TYPE_SEQ_WRITE;
> +        nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
> +        zone->d.za = 0;
> +        zone->d.zcap = ns->zone_capacity;
> +        zone->d.zslba = start;
> +        zone->d.wp = start;
> +        zone->w_ptr = start;
> +        zone->prev = 0;
> +        zone->next = 0;
> +        start += zone_size;
> +    }
> +}
> +
> +static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace *ns, int lba_index,
> +                              Error **errp)
> +{
> +    NvmeIdNsZoned *id_ns_z;
> +
> +    if (n->params.fill_pattern == 0) {
> +        ns->id_ns.dlfeat |= 0x01;
> +    } else if (n->params.fill_pattern == 0xff) {
> +        ns->id_ns.dlfeat |= 0x02;
> +    }
> +
> +    if (nvme_calc_zone_geometry(ns, errp) != 0) {
> +        return -1;
> +    }
> +
> +    nvme_init_zone_meta(ns);
> +
> +    id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
> +
> +    /* MAR/MOR are zeroes-based, 0xffffffff means no limit */
> +    id_ns_z->mar = 0xffffffff;
> +    id_ns_z->mor = 0xffffffff;
> +    id_ns_z->zoc = 0;
> +    id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
> +
> +    id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
> +    id_ns_z->lbafe[lba_index].zdes = 0; /* FIXME make helper */
> +
> +    ns->csi = NVME_CSI_ZONED;
> +    ns->id_ns.ncap = cpu_to_le64(ns->zone_capacity * ns->num_zones);
> +    ns->id_ns.nuse = ns->id_ns.ncap;
> +    ns->id_ns.nsze = ns->id_ns.ncap;
> +
> +    ns->id_ns_zoned = id_ns_z;
>  
>      return 0;
>  }
> @@ -103,6 +258,12 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
>          return -1;
>      }
>  
> +    if (ns->params.zoned) {
> +        if (nvme_zoned_init_ns(n, ns, 0, errp) != 0) {
> +            return -1;
> +        }
> +    }
> +
>      return 0;
>  }
>  
> @@ -116,6 +277,16 @@ void nvme_ns_flush(NvmeNamespace *ns)
>      blk_flush(ns->blkconf.blk);
>  }
>  
> +void nvme_ns_cleanup(NvmeNamespace *ns)
> +{
> +    g_free(ns->id_ns_zoned);
> +    g_free(ns->zone_array);
> +    g_free(ns->exp_open_zones);
> +    g_free(ns->imp_open_zones);
> +    g_free(ns->closed_zones);
> +    g_free(ns->full_zones);
> +}
> +
>  static void nvme_ns_realize(DeviceState *dev, Error **errp)
>  {
>      NvmeNamespace *ns = NVME_NS(dev);
> @@ -133,6 +304,14 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
>  static Property nvme_ns_props[] = {
>      DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
>      DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
> +
> +    DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false),
> +    DEFINE_PROP_UINT64("zone_size", NvmeNamespace, params.zone_size_mb,
> +                       NVME_DEFAULT_ZONE_SIZE),
> +    DEFINE_PROP_UINT64("zone_capacity", NvmeNamespace,
> +                       params.zone_capacity_mb, 0),
> +    DEFINE_PROP_BOOL("cross_zone_read", NvmeNamespace,
> +                      params.cross_zone_read, false),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> index 04172f083e..daa13546c4 100644
> --- a/hw/block/nvme-ns.h
> +++ b/hw/block/nvme-ns.h
> @@ -38,7 +38,6 @@ typedef struct NvmeZoneList {
>  
>  typedef struct NvmeNamespaceParams {
>      uint32_t nsid;
> -    uint8_t  csi;
>      bool     attached;
>      QemuUUID uuid;
>  
> @@ -52,6 +51,7 @@ typedef struct NvmeNamespace {
>      DeviceState  parent_obj;
>      BlockConf    blkconf;
>      int32_t      bootindex;
> +    uint8_t      csi;
>      int64_t      size;
>      NvmeIdNs     id_ns;
>  
> @@ -107,6 +107,7 @@ typedef struct NvmeCtrl NvmeCtrl;
>  int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
>  void nvme_ns_drain(NvmeNamespace *ns);
>  void nvme_ns_flush(NvmeNamespace *ns);
> +void nvme_ns_cleanup(NvmeNamespace *ns);
>  
>  static inline uint8_t nvme_get_zone_state(NvmeZone *zone)
>  {
> @@ -188,4 +189,7 @@ static inline NvmeZone *nvme_next_zone_in_list(NvmeNamespace *ns, NvmeZone *z,
>      return &ns->zone_array[z->next];
>  }
>  
> +void nvme_add_zone_tail(NvmeNamespace *ns, NvmeZoneList *zl, NvmeZone *zone);
> +void nvme_remove_zone(NvmeNamespace *ns, NvmeZoneList *zl, NvmeZone *zone);
> +
>  #endif /* NVME_NS_H */
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 63ad03d6d6..38e25a4d1f 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -54,6 +54,7 @@
>  #include "qemu/osdep.h"
>  #include "qemu/units.h"
>  #include "qemu/error-report.h"
> +#include "crypto/random.h"
>  #include "hw/block/block.h"
>  #include "hw/pci/msix.h"
>  #include "hw/pci/pci.h"
> @@ -127,6 +128,46 @@ static uint16_t nvme_sqid(NvmeRequest *req)
>      return le16_to_cpu(req->sq->sqid);
>  }
>  
> +static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
> +                                   uint8_t state)
> +{
> +    if (!nvme_zone_not_in_list(zone)) {
> +        switch (nvme_get_zone_state(zone)) {
> +        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
> +            nvme_remove_zone(ns, ns->exp_open_zones, zone);
> +            break;
> +        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
> +            nvme_remove_zone(ns, ns->imp_open_zones, zone);
> +            break;
> +        case NVME_ZONE_STATE_CLOSED:
> +            nvme_remove_zone(ns, ns->closed_zones, zone);
> +            break;
> +        case NVME_ZONE_STATE_FULL:
> +            nvme_remove_zone(ns, ns->full_zones, zone);
> +        }
> +   }
> +
> +    nvme_set_zone_state(zone, state);
> +
> +    switch (state) {
> +    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
> +        nvme_add_zone_tail(ns, ns->exp_open_zones, zone);
> +        break;
> +    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
> +        nvme_add_zone_tail(ns, ns->imp_open_zones, zone);
> +        break;
> +    case NVME_ZONE_STATE_CLOSED:
> +        nvme_add_zone_tail(ns, ns->closed_zones, zone);
> +        break;
> +    case NVME_ZONE_STATE_FULL:
> +        nvme_add_zone_tail(ns, ns->full_zones, zone);
> +    case NVME_ZONE_STATE_READ_ONLY:
> +        break;
> +    default:
> +        zone->d.za = 0;
> +    }
> +}
> +
>  static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
>  {
>      hwaddr low = n->ctrl_mem.addr;
> @@ -813,7 +854,7 @@ static void nvme_process_aers(void *opaque)
>  
>          req = n->aer_reqs[n->outstanding_aers];
>  
> -        result = (NvmeAerResult *) &req->cqe.result;
> +        result = (NvmeAerResult *) &req->cqe.result32;
>          result->event_type = event->result.event_type;
>          result->event_info = event->result.event_info;
>          result->log_page = event->result.log_page;
> @@ -882,6 +923,200 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
>      return NVME_SUCCESS;
>  }
>  
> +static void nvme_fill_data(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t offset,
> +                           uint32_t max_len, uint8_t pattern)
> +{
> +    ScatterGatherEntry *entry;
> +    uint32_t len, ent_len;
> +
> +    if (qsg->nsg > 0) {
> +        entry = qsg->sg;
> +        len = qsg->size;
> +        if (max_len) {
> +            len = MIN(len, max_len);
> +        }
> +        for (; len > 0; len -= ent_len) {
> +            ent_len = MIN(len, entry->len);
> +            if (offset > ent_len) {
> +                offset -= ent_len;
> +            } else if (offset != 0) {
> +                dma_memory_set(qsg->as, entry->base + offset,
> +                               pattern, ent_len - offset);
> +                offset = 0;
> +            } else {
> +                dma_memory_set(qsg->as, entry->base, pattern, ent_len);
> +            }
> +            entry++;
> +        }
> +    } else if (iov->iov) {
> +        len = iov_size(iov->iov, iov->niov);
> +        if (max_len) {
> +            len = MIN(len, max_len);
> +        }
> +        qemu_iovec_memset(iov, offset, pattern, len - offset);
> +    }
> +}
> +
> +static uint16_t nvme_check_zone_write(NvmeZone *zone, uint64_t slba,
> +                                      uint32_t nlb)
> +{
> +    uint16_t status;
> +
> +    if (unlikely((slba + nlb) > nvme_zone_wr_boundary(zone))) {
> +        return NVME_ZONE_BOUNDARY_ERROR;
> +    }
> +
> +    switch (nvme_get_zone_state(zone)) {
> +    case NVME_ZONE_STATE_EMPTY:
> +    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
> +    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
> +    case NVME_ZONE_STATE_CLOSED:
> +        status = NVME_SUCCESS;
> +        break;
> +    case NVME_ZONE_STATE_FULL:
> +        status = NVME_ZONE_FULL;
> +        break;
> +    case NVME_ZONE_STATE_OFFLINE:
> +        status = NVME_ZONE_OFFLINE;
> +        break;
> +    case NVME_ZONE_STATE_READ_ONLY:
> +        status = NVME_ZONE_READ_ONLY;
> +        break;
> +    default:
> +        assert(false);
> +    }
> +    return status;
> +}
> +
> +static uint16_t nvme_check_zone_read(NvmeNamespace *ns, NvmeZone *zone,
> +                                     uint64_t slba, uint32_t nlb)
> +{
> +    uint64_t lba = slba, count;
> +    uint16_t status;
> +    uint8_t zs;
> +
> +    do {
> +        if (!ns->params.cross_zone_read &&
> +            (lba + nlb > nvme_zone_rd_boundary(ns, zone))) {
> +            return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
> +        }
> +
> +        zs = nvme_get_zone_state(zone);
> +        switch (zs) {
> +        case NVME_ZONE_STATE_EMPTY:
> +        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
> +        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
> +        case NVME_ZONE_STATE_FULL:
> +        case NVME_ZONE_STATE_CLOSED:
> +        case NVME_ZONE_STATE_READ_ONLY:
> +            status = NVME_SUCCESS;
> +            break;
> +        case NVME_ZONE_STATE_OFFLINE:
> +            status = NVME_ZONE_OFFLINE | NVME_DNR;
> +            break;
> +        default:
> +            assert(false);
> +        }
> +        if (status != NVME_SUCCESS) {
> +            break;
> +        }
> +
> +        if (lba + nlb > nvme_zone_rd_boundary(ns, zone)) {
> +            count = nvme_zone_rd_boundary(ns, zone) - lba;
> +        } else {
> +            count = nlb;
> +        }
> +
> +        lba += count;
> +        nlb -= count;
> +        zone++;
> +    } while (nlb);
> +
> +    return status;
> +}
> +
> +static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
> +{
> +    return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
> +                                    slba / ns->zone_size;
> +}
> +
> +static bool nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
> +                                      bool failed)
> +{
> +    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
> +    NvmeZone *zone;
> +    uint64_t slba, start_wp = req->cqe.result64;
> +    uint32_t nlb, zone_idx;
> +    uint8_t zs;
> +
> +    if (rw->opcode != NVME_CMD_WRITE &&
> +        rw->opcode != NVME_CMD_ZONE_APPEND &&
> +        rw->opcode != NVME_CMD_WRITE_ZEROES) {
> +        return false;
> +    }
> +
> +    slba = le64_to_cpu(rw->slba);
> +    nlb = le16_to_cpu(rw->nlb) + 1;
> +    zone_idx = nvme_zone_idx(ns, slba);
> +    assert(zone_idx < ns->num_zones);
> +    zone = &ns->zone_array[zone_idx];
> +
> +    if (!failed && zone->w_ptr < start_wp + nlb) {
> +        /*
> +         * A preceding queued write to the zone has failed,
> +         * now this write is not at the WP, fail it too.
> +         */
> +        failed = true;
> +    }
> +
> +    if (failed) {
> +        if (zone->w_ptr > start_wp) {
> +            zone->w_ptr = start_wp;
> +        }
> +        req->cqe.result64 = 0;
> +    } else if (zone->w_ptr == nvme_zone_wr_boundary(zone)) {
> +        zs = nvme_get_zone_state(zone);
> +        switch (zs) {
> +        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
> +        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
> +        case NVME_ZONE_STATE_CLOSED:
> +        case NVME_ZONE_STATE_EMPTY:
> +            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
> +            /* fall through */
> +        case NVME_ZONE_STATE_FULL:
> +            break;
> +        default:
> +            assert(false);
> +        }
> +        zone->d.wp = zone->w_ptr;
> +    } else {
> +        zone->d.wp += nlb;
> +    }
> +
> +    return failed;
> +}
> +
> +static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
> +                                     uint32_t nlb)
> +{
> +    uint64_t result = zone->w_ptr;
> +    uint8_t zs;
> +
> +    zone->w_ptr += nlb;
> +
> +    if (zone->w_ptr < nvme_zone_wr_boundary(zone)) {
> +        zs = nvme_get_zone_state(zone);
> +        switch (zs) {
> +        case NVME_ZONE_STATE_EMPTY:
> +        case NVME_ZONE_STATE_CLOSED:
> +            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
> +        }
> +    }
> +
> +    return result;
> +}
> +
>  static void nvme_rw_cb(void *opaque, int ret)
>  {
>      NvmeRequest *req = opaque;
> @@ -896,10 +1131,27 @@ static void nvme_rw_cb(void *opaque, int ret)
>      trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
>  
>      if (!ret) {
> -        block_acct_done(stats, acct);
> +        if (ns->params.zoned) {
> +            if (nvme_finalize_zoned_write(ns, req, false)) {
> +                ret = EIO;
> +                block_acct_failed(stats, acct);
> +                req->status = NVME_ZONE_INVALID_WRITE;
> +            } else if (req->fill_ofs >= 0) {
> +                nvme_fill_data(&req->qsg, &req->iov, req->fill_ofs,
> +                               req->fill_len,
> +                               nvme_ctrl(req)->params.fill_pattern);
> +            }
> +        }
> +        if (!ret) {
> +            block_acct_done(stats, acct);
> +        }
>      } else {
>          uint16_t status;
>  
> +        if (ns->params.zoned) {
> +            nvme_finalize_zoned_write(ns, req, true);
> +        }
> +
>          block_acct_failed(stats, acct);
>  
>          switch (req->cmd.opcode) {
> @@ -953,6 +1205,7 @@ static uint16_t nvme_do_aio(BlockBackend *blk, int64_t offset, size_t len,
>          break;
>  
>      case NVME_CMD_WRITE:
> +    case NVME_CMD_ZONE_APPEND:
>          is_write = true;
>  
>          /* fallthrough */
> @@ -997,8 +1250,10 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
>      NvmeNamespace *ns = req->ns;
>      uint64_t slba = le64_to_cpu(rw->slba);
>      uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
> +    NvmeZone *zone = NULL;
>      uint64_t offset = nvme_l2b(ns, slba);
>      uint32_t count = nvme_l2b(ns, nlb);
> +    uint32_t zone_idx;
>      uint16_t status;
>  
>      trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb);
> @@ -1009,20 +1264,43 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
>          return status;
>      }
>  
> +    if (ns->params.zoned) {
> +        zone_idx = nvme_zone_idx(ns, slba);
> +        assert(zone_idx < ns->num_zones);
> +        zone = &ns->zone_array[zone_idx];
> +
> +        status = nvme_check_zone_write(zone, slba, nlb);
> +        if (status != NVME_SUCCESS) {
> +            trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
> +            return status | NVME_DNR;
> +        }
> +
> +        assert(nvme_wp_is_valid(zone));
> +        if (unlikely(slba != zone->w_ptr)) {
> +            trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
> +                                               zone->w_ptr);
> +            return NVME_ZONE_INVALID_WRITE | NVME_DNR;
> +        }
> +
> +        req->cqe.result64 = nvme_advance_zone_wp(ns, zone, nlb);
> +    }
> +
>      return nvme_do_aio(ns->blkconf.blk, offset, count, req);
>  }
>  
> -static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
> +static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req, bool append)
>  {
>      NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
>      NvmeNamespace *ns = req->ns;
> -    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
> +    uint32_t nlb  = le32_to_cpu(rw->nlb) + 1;
>      uint64_t slba = le64_to_cpu(rw->slba);
> -
>      uint64_t data_size = nvme_l2b(ns, nlb);
> -    uint64_t data_offset = nvme_l2b(ns, slba);
> -    enum BlockAcctType acct = req->cmd.opcode == NVME_CMD_WRITE ?
> -        BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
> +    uint64_t data_offset;
> +
> +    NvmeZone *zone = NULL;
> +    uint32_t zone_idx = 0;
> +    bool is_write = rw->opcode == NVME_CMD_WRITE || append;
> +    enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
>      uint16_t status;
>  
>      trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode),
> @@ -1040,18 +1318,468 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
>          goto invalid;
>      }
>  
> +    if (ns->params.zoned) {
> +        zone_idx = nvme_zone_idx(ns, slba);
> +        assert(zone_idx < ns->num_zones);
> +        zone = &ns->zone_array[zone_idx];
> +
> +        if (is_write) {
> +            status = nvme_check_zone_write(zone, slba, nlb);
> +            if (status != NVME_SUCCESS) {
> +                trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
> +                goto invalid;
> +            }
> +
> +            assert(nvme_wp_is_valid(zone));
> +            if (append) {
> +                if (unlikely(slba != zone->d.zslba)) {
> +                    trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
> +                    status = NVME_ZONE_INVALID_WRITE | NVME_DNR;
> +                    goto invalid;
> +                }
> +                if (data_size > (n->page_size << n->zasl)) {
> +                    trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
> +                    status = NVME_INVALID_FIELD | NVME_DNR;
> +                    goto invalid;
> +                }
> +                slba = zone->w_ptr;
> +            } else if (unlikely(slba != zone->w_ptr)) {
> +                trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
> +                                                   zone->w_ptr);
> +                status = NVME_ZONE_INVALID_WRITE | NVME_DNR;
> +                goto invalid;
> +            }
> +            req->fill_ofs = -1LL;
> +        } else {
> +            status = nvme_check_zone_read(ns, zone, slba, nlb);
> +            if (status != NVME_SUCCESS) {
> +                trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
> +                goto invalid;
> +            }
> +
> +            if (slba + nlb > zone->w_ptr) {
> +                /*
> +                 * All or some data is read above the WP. Need to
> +                 * fill out the buffer area that has no backing data
> +                 * with a predefined data pattern (zeros by default)
> +                 */
> +                if (slba >= zone->w_ptr) {
> +                    req->fill_ofs = 0;
> +                } else {
> +                    req->fill_ofs = nvme_l2b(ns, zone->w_ptr - slba);
> +                }
> +                req->fill_len = nvme_l2b(ns,
> +                    nvme_zone_rd_boundary(ns, zone) - slba);
> +            } else {
> +                req->fill_ofs = -1LL;
> +            }
> +        }
> +    } else if (append) {
> +        trace_pci_nvme_err_invalid_opc(rw->opcode);
> +        status = NVME_INVALID_OPCODE | NVME_DNR;
> +        goto invalid;
> +    }
> +
>      status = nvme_map_dptr(n, data_size, req);
>      if (status) {
>          goto invalid;
>      }
>  
> +    if (ns->params.zoned) {
> +        if (unlikely(req->fill_ofs == 0 &&
> +            slba + nlb <= nvme_zone_rd_boundary(ns, zone))) {
> +            /* No backend I/O necessary, only need to fill the buffer */
> +            nvme_fill_data(&req->qsg, &req->iov, 0, 0, n->params.fill_pattern);
> +            req->status = NVME_SUCCESS;
> +            return NVME_SUCCESS;
> +        }
> +        if (is_write) {
> +            req->cqe.result64 = nvme_advance_zone_wp(ns, zone, nlb);
> +        }
> +    }
> +
> +    data_offset = nvme_l2b(ns, slba);
> +
>      return nvme_do_aio(ns->blkconf.blk, data_offset, data_size, req);
>  
>  invalid:
>      block_acct_invalid(blk_get_stats(ns->blkconf.blk), acct);
> +    return status | NVME_DNR;
> +}
> +
> +static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
> +                                            uint64_t *slba, uint32_t *zone_idx)
> +{
> +    uint32_t dw10 = le32_to_cpu(c->cdw10);
> +    uint32_t dw11 = le32_to_cpu(c->cdw11);
> +
> +    if (!ns->params.zoned) {
> +        trace_pci_nvme_err_invalid_opc(c->opcode);
> +        return NVME_INVALID_OPCODE | NVME_DNR;
> +    }
> +
> +    *slba = ((uint64_t)dw11) << 32 | dw10;
> +    if (unlikely(*slba >= ns->id_ns.nsze)) {
> +        trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
> +        *slba = 0;
> +        return NVME_LBA_RANGE | NVME_DNR;
> +    }
> +
> +    *zone_idx = nvme_zone_idx(ns, *slba);
> +    assert(*zone_idx < ns->num_zones);
> +
> +    return NVME_SUCCESS;
> +}
> +
> +static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
> +                               uint8_t state)
> +{
> +    switch (state) {
> +    case NVME_ZONE_STATE_EMPTY:
> +    case NVME_ZONE_STATE_CLOSED:
> +    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
> +        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
> +        /* fall through */
> +    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
> +        return NVME_SUCCESS;
> +    }
> +
> +    return NVME_ZONE_INVAL_TRANSITION;
> +}
> +
> +static bool nvme_cond_open_all(uint8_t state)
> +{
> +    return state == NVME_ZONE_STATE_CLOSED;
> +}
> +
> +static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
> +                                uint8_t state)
> +{
> +    switch (state) {
> +    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
> +    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
> +        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
> +        /* fall through */
> +    case NVME_ZONE_STATE_CLOSED:
> +        return NVME_SUCCESS;
> +    }
> +
> +    return NVME_ZONE_INVAL_TRANSITION;
> +}
> +
> +static bool nvme_cond_close_all(uint8_t state)
> +{
> +    return state == NVME_ZONE_STATE_IMPLICITLY_OPEN ||
> +           state == NVME_ZONE_STATE_EXPLICITLY_OPEN;
> +}
> +
> +static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
> +                                 uint8_t state)
> +{
> +    switch (state) {
> +    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
> +    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
> +    case NVME_ZONE_STATE_CLOSED:
> +    case NVME_ZONE_STATE_EMPTY:
> +        zone->w_ptr = nvme_zone_wr_boundary(zone);
> +        zone->d.wp = zone->w_ptr;
> +        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
> +        /* fall through */
> +    case NVME_ZONE_STATE_FULL:
> +        return NVME_SUCCESS;
> +    }
> +
> +    return NVME_ZONE_INVAL_TRANSITION;
> +}
> +
> +static bool nvme_cond_finish_all(uint8_t state)
> +{
> +    return state == NVME_ZONE_STATE_IMPLICITLY_OPEN ||
> +           state == NVME_ZONE_STATE_EXPLICITLY_OPEN ||
> +           state == NVME_ZONE_STATE_CLOSED;
> +}
> +
> +static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
> +                                uint8_t state)
> +{
> +    switch (state) {
> +    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
> +    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
> +    case NVME_ZONE_STATE_CLOSED:
> +    case NVME_ZONE_STATE_FULL:
> +        zone->w_ptr = zone->d.zslba;
> +        zone->d.wp = zone->w_ptr;
> +        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
> +        /* fall through */
> +    case NVME_ZONE_STATE_EMPTY:
> +        return NVME_SUCCESS;
> +    }
> +
> +    return NVME_ZONE_INVAL_TRANSITION;
> +}
> +
> +static bool nvme_cond_reset_all(uint8_t state)
> +{
> +    return state == NVME_ZONE_STATE_IMPLICITLY_OPEN ||
> +           state == NVME_ZONE_STATE_EXPLICITLY_OPEN ||
> +           state == NVME_ZONE_STATE_CLOSED ||
> +           state == NVME_ZONE_STATE_FULL;
> +}
> +
> +static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
> +                                  uint8_t state)
> +{
> +    switch (state) {
> +    case NVME_ZONE_STATE_READ_ONLY:
> +        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
> +        /* fall through */
> +    case NVME_ZONE_STATE_OFFLINE:
> +        return NVME_SUCCESS;
> +    }
> +
> +    return NVME_ZONE_INVAL_TRANSITION;
> +}
> +
> +static bool nvme_cond_offline_all(uint8_t state)
> +{
> +    return state == NVME_ZONE_STATE_READ_ONLY;
> +}
> +
> +typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *,
> +                                 uint8_t);
> +typedef bool (*need_to_proc_zone_t)(uint8_t);
> +
> +static uint16_t name_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
> +                                uint8_t state, bool all,
> +                                op_handler_t op_hndlr,
> +                                need_to_proc_zone_t proc_zone)
> +{
> +    int i;
> +    uint16_t status = 0;
> +
> +    if (!all) {
> +        status = op_hndlr(ns, zone, state);
> +    } else {
> +        for (i = 0; i < ns->num_zones; i++, zone++) {
> +            state = nvme_get_zone_state(zone);
> +            if (proc_zone(state)) {
> +                status = op_hndlr(ns, zone, state);
> +                if (status != NVME_SUCCESS) {
> +                    break;
> +                }
> +            }
> +        }
> +    }
> +
>      return status;
>  }
>  
> +static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
> +{
> +    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
> +    NvmeNamespace *ns = req->ns;
> +    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
> +    uint64_t slba = 0;
> +    uint32_t zone_idx = 0;
> +    uint16_t status;
> +    uint8_t action, state;
> +    bool all;
> +    NvmeZone *zone;
> +
> +    action = dw13 & 0xff;
> +    all = dw13 & 0x100;
> +
> +    req->status = NVME_SUCCESS;
> +
> +    if (!all) {
> +        status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
> +        if (status) {
> +            return status;
> +        }
> +    }
> +
> +    zone = &ns->zone_array[zone_idx];
> +    if (slba != zone->d.zslba) {
> +        trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
> +        return NVME_INVALID_FIELD | NVME_DNR;
> +    }
> +    state = nvme_get_zone_state(zone);
> +
> +    switch (action) {
> +
> +    case NVME_ZONE_ACTION_OPEN:
> +        trace_pci_nvme_open_zone(slba, zone_idx, all);
> +        status = name_do_zone_op(ns, zone, state, all,
> +                                 nvme_open_zone, nvme_cond_open_all);
> +        break;
> +
> +    case NVME_ZONE_ACTION_CLOSE:
> +        trace_pci_nvme_close_zone(slba, zone_idx, all);
> +        status = name_do_zone_op(ns, zone, state, all,
> +                                 nvme_close_zone, nvme_cond_close_all);
> +        break;
> +
> +    case NVME_ZONE_ACTION_FINISH:
> +        trace_pci_nvme_finish_zone(slba, zone_idx, all);
> +        status = name_do_zone_op(ns, zone, state, all,
> +                                 nvme_finish_zone, nvme_cond_finish_all);
> +        break;
> +
> +    case NVME_ZONE_ACTION_RESET:
> +        trace_pci_nvme_reset_zone(slba, zone_idx, all);
> +        status = name_do_zone_op(ns, zone, state, all,
> +                                 nvme_reset_zone, nvme_cond_reset_all);
> +        break;
> +
> +    case NVME_ZONE_ACTION_OFFLINE:
> +        trace_pci_nvme_offline_zone(slba, zone_idx, all);
> +        status = name_do_zone_op(ns, zone, state, all,
> +                                 nvme_offline_zone, nvme_cond_offline_all);
> +        break;
> +
> +    case NVME_ZONE_ACTION_SET_ZD_EXT:
> +        trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
> +        return NVME_INVALID_FIELD | NVME_DNR;
> +        break;
> +
> +    default:
> +        trace_pci_nvme_err_invalid_mgmt_action(action);
> +        status = NVME_INVALID_FIELD;
> +    }
> +
> +    if (status == NVME_ZONE_INVAL_TRANSITION) {
> +        trace_pci_nvme_err_invalid_zone_state_transition(state, action, slba,
> +                                                         zone->d.za);
> +    }
> +    if (status) {
> +        status |= NVME_DNR;
> +    }
> +
> +    return status;
> +}
> +
> +static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
> +{
> +    int zs = nvme_get_zone_state(zl);
> +
> +    switch (zafs) {
> +    case NVME_ZONE_REPORT_ALL:
> +        return true;
> +    case NVME_ZONE_REPORT_EMPTY:
> +        return zs == NVME_ZONE_STATE_EMPTY;
> +    case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
> +        return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
> +    case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
> +        return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
> +    case NVME_ZONE_REPORT_CLOSED:
> +        return zs == NVME_ZONE_STATE_CLOSED;
> +    case NVME_ZONE_REPORT_FULL:
> +        return zs == NVME_ZONE_STATE_FULL;
> +    case NVME_ZONE_REPORT_READ_ONLY:
> +        return zs == NVME_ZONE_STATE_READ_ONLY;
> +    case NVME_ZONE_REPORT_OFFLINE:
> +        return zs == NVME_ZONE_STATE_OFFLINE;
> +    default:
> +        return false;
> +    }
> +}
> +
> +static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
> +{
> +    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
> +    NvmeNamespace *ns = req->ns;
> +    /* cdw12 is zero-based number of dwords to return. Convert to bytes */
> +    uint32_t len = (le32_to_cpu(cmd->cdw12) + 1) << 2;
> +    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
> +    uint32_t zone_idx, zra, zrasf, partial;
> +    uint64_t max_zones, nr_zones = 0;
> +    uint16_t ret;
> +    uint64_t slba;
> +    NvmeZoneDescr *z;
> +    NvmeZone *zs;
> +    NvmeZoneReportHeader *header;
> +    void *buf, *buf_p;
> +    size_t zone_entry_sz;
> +
> +    req->status = NVME_SUCCESS;
> +
> +    ret = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    if (len < sizeof(NvmeZoneReportHeader)) {
> +        return NVME_INVALID_FIELD | NVME_DNR;
> +    }
> +
> +    zra = dw13 & 0xff;
> +    if (!(zra == NVME_ZONE_REPORT || zra == NVME_ZONE_REPORT_EXTENDED)) {
> +        return NVME_INVALID_FIELD | NVME_DNR;
> +    }
> +
> +    if (zra == NVME_ZONE_REPORT_EXTENDED) {
> +        return NVME_INVALID_FIELD | NVME_DNR;
> +    }
> +
> +    zrasf = (dw13 >> 8) & 0xff;
> +    if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
> +        return NVME_INVALID_FIELD | NVME_DNR;
> +    }
> +
> +    partial = (dw13 >> 16) & 0x01;
> +
> +    zone_entry_sz = sizeof(NvmeZoneDescr);
> +
> +    max_zones = (len - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
> +    buf = g_malloc0(len);
> +
> +    header = (NvmeZoneReportHeader *)buf;
> +    buf_p = buf + sizeof(NvmeZoneReportHeader);
> +
> +    while (zone_idx < ns->num_zones && nr_zones < max_zones) {
> +        zs = &ns->zone_array[zone_idx];
> +
> +        if (!nvme_zone_matches_filter(zrasf, zs)) {
> +            zone_idx++;
> +            continue;
> +        }
> +
> +        z = (NvmeZoneDescr *)buf_p;
> +        buf_p += sizeof(NvmeZoneDescr);
> +        nr_zones++;
> +
> +        z->zt = zs->d.zt;
> +        z->zs = zs->d.zs;
> +        z->zcap = cpu_to_le64(zs->d.zcap);
> +        z->zslba = cpu_to_le64(zs->d.zslba);
> +        z->za = zs->d.za;
> +
> +        if (nvme_wp_is_valid(zs)) {
> +            z->wp = cpu_to_le64(zs->d.wp);
> +        } else {
> +            z->wp = cpu_to_le64(~0ULL);
> +        }
> +
> +        zone_idx++;
> +    }
> +
> +    if (!partial) {
> +        for (; zone_idx < ns->num_zones; zone_idx++) {
> +            zs = &ns->zone_array[zone_idx];
> +            if (nvme_zone_matches_filter(zrasf, zs)) {
> +                nr_zones++;
> +            }
> +        }
> +    }
> +    header->nr_zones = cpu_to_le64(nr_zones);
> +
> +    ret = nvme_dma(n, (uint8_t *)buf, len, DMA_DIRECTION_FROM_DEVICE, req);
> +
> +    g_free(buf);
> +
> +    return ret;
> +}
> +
>  static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
>  {
>      uint32_t nsid = le32_to_cpu(req->cmd.nsid);
> @@ -1073,9 +1801,15 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)

While you did make sure that we don't expose zone mgmt send/recv/zone append
in the cmd_effects log when CC.CSS != CSS_CSI, we also need to make sure we
return Invalid Command Opcode for any of those three commands, if a user tries
to use them anyway (while CC.CSS != CSI).
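
Something along these lines near the top of nvme_io_cmd() could do it
(untested sketch, only using the opcode and CC.CSS definitions already in
this series):

switch (req->cmd.opcode) {
case NVME_CMD_ZONE_APPEND:
case NVME_CMD_ZONE_MGMT_SEND:
case NVME_CMD_ZONE_MGMT_RECV:
    if (NVME_CC_CSS(n->bar.cc) != CSS_CSI) {
        trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
        return NVME_INVALID_OPCODE | NVME_DNR;
    }
    break;
default:
    break;
}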

>          return nvme_flush(n, req);
>      case NVME_CMD_WRITE_ZEROES:
>          return nvme_write_zeroes(n, req);
> +    case NVME_CMD_ZONE_APPEND:
> +        return nvme_rw(n, req, true);
>      case NVME_CMD_WRITE:
>      case NVME_CMD_READ:
> -        return nvme_rw(n, req);
> +        return nvme_rw(n, req, false);
> +    case NVME_CMD_ZONE_MGMT_SEND:
> +        return nvme_zone_mgmt_send(n, req);
> +    case NVME_CMD_ZONE_MGMT_RECV:
> +        return nvme_zone_mgmt_recv(n, req);
>      default:
>          trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
>          return NVME_INVALID_OPCODE | NVME_DNR;
> @@ -1301,7 +2035,7 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
>                      DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
> -static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len,
> +static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
>                                   uint64_t off, NvmeRequest *req)
>  {
>      NvmeEffectsLog cmd_eff_log = {};
> @@ -1326,11 +2060,20 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len,
>      acs[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFFECTS_CSUPP;
>      acs[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFFECTS_CSUPP;
>  
> -    iocs[NVME_CMD_FLUSH] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
> -    iocs[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFFECTS_CSUPP |
> -                                  NVME_CMD_EFFECTS_LBCC;
> -    iocs[NVME_CMD_WRITE] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
> -    iocs[NVME_CMD_READ] = NVME_CMD_EFFECTS_CSUPP;
> +    if (NVME_CC_CSS(n->bar.cc) != CSS_ADMIN_ONLY) {
> +        iocs[NVME_CMD_FLUSH] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
> +        iocs[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFFECTS_CSUPP |
> +                                      NVME_CMD_EFFECTS_LBCC;
> +        iocs[NVME_CMD_WRITE] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
> +        iocs[NVME_CMD_READ] = NVME_CMD_EFFECTS_CSUPP;
> +    }
> +
> +    if (csi == NVME_CSI_ZONED && NVME_CC_CSS(n->bar.cc) == CSS_CSI) {
> +        iocs[NVME_CMD_ZONE_APPEND] = NVME_CMD_EFFECTS_CSUPP |
> +                                     NVME_CMD_EFFECTS_LBCC;
> +        iocs[NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFFECTS_CSUPP;
> +        iocs[NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFFECTS_CSUPP;
> +    }
>  
>      trans_len = MIN(sizeof(cmd_eff_log) - off, buf_len);
>  
> @@ -1349,6 +2092,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
>      uint8_t  lid = dw10 & 0xff;
>      uint8_t  lsp = (dw10 >> 8) & 0xf;
>      uint8_t  rae = (dw10 >> 15) & 0x1;
> +    uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
>      uint32_t numdl, numdu;
>      uint64_t off, lpol, lpou;
>      size_t   len;
> @@ -1382,7 +2126,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
>      case NVME_LOG_FW_SLOT_INFO:
>          return nvme_fw_log_info(n, len, off, req);
>      case NVME_LOG_CMD_EFFECTS:
> -        return nvme_cmd_effects(n, len, off, req);
> +        return nvme_cmd_effects(n, csi, len, off, req);
>      default:
>          trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
>          return NVME_INVALID_FIELD | NVME_DNR;
> @@ -1502,6 +2246,16 @@ static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
>      return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
> +static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns)
> +{
> +    switch (ns->csi) {
> +    case NVME_CSI_NVM:
> +    case NVME_CSI_ZONED:
> +        return true;
> +    }
> +    return false;
> +}
> +
>  static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
>  {
>      trace_pci_nvme_identify_ctrl();
> @@ -1513,11 +2267,16 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
>  static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
>  {
>      NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
> +    NvmeIdCtrlZoned id = {};
>  
>      trace_pci_nvme_identify_ctrl_csi(c->csi);
>  
>      if (c->csi == NVME_CSI_NVM) {
>          return nvme_rpt_empty_id_struct(n, req);
> +    } else if (c->csi == NVME_CSI_ZONED) {
> +        id.zasl = n->zasl;
> +        return nvme_dma(n, (uint8_t *)&id, sizeof(id),
> +                        DMA_DIRECTION_FROM_DEVICE, req);

Please read my comment on nvme_identify_nslist_csi() before reading
this comment.

At least for this function, the specification is clear:

"If the host requests a data structure for an I/O Command Set that the
controller does not support, the controller shall abort the command with
a status of Invalid Field in Command."

The controller supports an I/O command set if the corresponding Command Set
bit is set in the data structure returned by nvme_identify_cmd_set(),
so here we should do something like:

} else if (c->csi == NVME_CSI_ZONED && ctrl_has_zns_namespaces()) {
	...
}

>      }
>  
>      return NVME_INVALID_FIELD | NVME_DNR;
> @@ -1545,8 +2304,12 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req,
>          return nvme_rpt_empty_id_struct(n, req);
>      }
>  
> -    return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs),
> -                    DMA_DIRECTION_FROM_DEVICE, req);
> +    if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
> +        return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs),
> +                        DMA_DIRECTION_FROM_DEVICE, req);
> +    }
> +
> +    return NVME_INVALID_CMD_SET | NVME_DNR;
>  }
>  
>  static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
> @@ -1571,8 +2334,11 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
>          return nvme_rpt_empty_id_struct(n, req);
>      }
>  
> -    if (c->csi == NVME_CSI_NVM) {
> +    if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
>          return nvme_rpt_empty_id_struct(n, req);
> +    } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
> +        return nvme_dma(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
> +                        DMA_DIRECTION_FROM_DEVICE, req);
>      }
>  
>      return NVME_INVALID_FIELD | NVME_DNR;
> @@ -1634,7 +2400,7 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
>  
>      trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
>  
> -    if (c->csi != NVME_CSI_NVM) {
> +    if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {

When reading the specification for CNS 07h, I think that it is not clear
how this should behave...

I'm thinking of the case when c->csi == NVME_CSI_ZONED
while our QEMU model only has NVM namespaces.

Either we should return an empty list (1),
or we should return Invalid Field in Command (2).

If we decide to go with (2),
then we should probably take the code you have written in nvme_identify_cmd_set():

+    for (i = 1; i <= n->num_namespaces; i++) {
+        ns = nvme_ns(n, i);
+        if (ns && ns->params.zoned) {
+            NVME_SET_CSI(*list, NVME_CSI_ZONED);
+            break;
+        }
+    }

And move it into a ctrl_has_zns_namespaces() helper function,
and then do something like:
if (!(c->csi == NVME_CSI_NVM ||
      (ctrl_has_zns_namespaces() && c->csi == NVME_CSI_ZONED))) {
	return NVME_INVALID_FIELD | NVME_DNR;
}
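
For completeness, the helper itself could be a straight extraction of the
loop above (sketch only):

static bool ctrl_has_zns_namespaces(NvmeCtrl *n)
{
    NvmeNamespace *ns;
    int i;

    for (i = 1; i <= n->num_namespaces; i++) {
        ns = nvme_ns(n, i);
        if (ns && ns->params.zoned) {
            return true;
        }
    }

    return false;
}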


>          return NVME_INVALID_FIELD | NVME_DNR;
>      }
>  
> @@ -1643,7 +2409,7 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
>          if (!ns) {
>              continue;
>          }
> -        if (ns->params.nsid < min_nsid) {
> +        if (ns->params.nsid < min_nsid || c->csi != ns->csi) {
>              continue;
>          }
>          if (only_active && !ns->params.attached) {
> @@ -1696,19 +2462,29 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
>      desc->nidt = NVME_NIDT_CSI;
>      desc->nidl = NVME_NIDL_CSI;
>      list_ptr += sizeof(*desc);
> -    *(uint8_t *)list_ptr = NVME_CSI_NVM;
> +    *(uint8_t *)list_ptr = ns->csi;
>  
>      return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
>  static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
>  {
> +    NvmeNamespace *ns;
>      uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
>      static const int data_len = sizeof(list);
> +    int i;
>  
>      trace_pci_nvme_identify_cmd_set();
>  
>      NVME_SET_CSI(*list, NVME_CSI_NVM);
> +    for (i = 1; i <= n->num_namespaces; i++) {
> +        ns = nvme_ns(n, i);
> +        if (ns && ns->params.zoned) {
> +            NVME_SET_CSI(*list, NVME_CSI_ZONED);
> +            break;
> +        }
> +    }
> +
>      return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
> @@ -1751,7 +2527,7 @@ static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
>  {
>      uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
>  
> -    req->cqe.result = 1;
> +    req->cqe.result32 = 1;
>      if (nvme_check_sqid(n, sqid)) {
>          return NVME_INVALID_FIELD | NVME_DNR;
>      }
> @@ -1932,7 +2708,7 @@ defaults:
>      }
>  
>  out:
> -    req->cqe.result = cpu_to_le32(result);
> +    req->cqe.result32 = cpu_to_le32(result);
>      return NVME_SUCCESS;
>  }
>  
> @@ -2057,8 +2833,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
>                                      ((dw11 >> 16) & 0xFFFF) + 1,
>                                      n->params.max_ioqpairs,
>                                      n->params.max_ioqpairs);
> -        req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
> -                                      ((n->params.max_ioqpairs - 1) << 16));
> +        req->cqe.result32 = cpu_to_le32((n->params.max_ioqpairs - 1) |
> +                                        ((n->params.max_ioqpairs - 1) << 16));
>          break;
>      case NVME_ASYNCHRONOUS_EVENT_CONF:
>          n->features.async_config = dw11;
> @@ -2310,16 +3086,28 @@ static int nvme_start_ctrl(NvmeCtrl *n)
>              continue;
>          }
>          ns->params.attached = false;
> -        switch (ns->params.csi) {
> +        switch (ns->csi) {
>          case NVME_CSI_NVM:
>              if (NVME_CC_CSS(n->bar.cc) == CSS_NVM_ONLY ||
>                  NVME_CC_CSS(n->bar.cc) == CSS_CSI) {
>                  ns->params.attached = true;
>              }
>              break;
> +        case NVME_CSI_ZONED:
> +            if (NVME_CC_CSS(n->bar.cc) == CSS_CSI) {
> +                ns->params.attached = true;
> +            }
> +            break;
>          }
>      }

Like I wrote in my review comment in the patch that added support for the new
allocated CNS values, I prefer if we remove this for-loop completely, and
simply set attached = true in nvme_ns_setup()/nvme_ns_init() instead.

(I was considering if we should set attach = true in nvme_zoned_init_ns(),
but because nvme_ns_setup()/nvme_ns_init() is called for all namespaces,
including ZNS namespaces, I don't think that any additional code in
nvme_zoned_init_ns() is warranted.)

>  
> +    if (!n->zasl_bs) {
> +        assert(n->params.mdts);
> +        n->zasl = n->params.mdts;
> +    } else {
> +        n->zasl = 31 - clz32(n->zasl_bs / n->page_size);
> +    }
> +
>      nvme_set_timestamp(n, 0ULL);
>  
>      QTAILQ_INIT(&n->aer_queue);
> @@ -2382,10 +3170,11 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
>                  case CSS_NVM_ONLY:
>                      trace_pci_nvme_css_nvm_cset_selected_by_host(data &
>                                                                   0xffffffff);
> -                    break;
> +                break;
>                  case CSS_CSI:
>                      NVME_SET_CC_CSS(n->bar.cc, CSS_CSI);
> -                    trace_pci_nvme_css_all_csets_sel_by_host(data & 0xffffffff);
> +                    trace_pci_nvme_css_all_csets_sel_by_host(data &
> +                                                             0xffffffff);
>                      break;
>                  case CSS_ADMIN_ONLY:
>                      break;
> @@ -2780,6 +3569,12 @@ static void nvme_init_state(NvmeCtrl *n)
>      n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
>      n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
>      n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
> +
> +    if (!n->params.zasl_kb) {
> +        n->zasl_bs = n->params.mdts ? 0 : NVME_DEFAULT_MAX_ZA_SIZE * KiB;
> +    } else {
> +        n->zasl_bs = n->params.zasl_kb * KiB;
> +    }
>  }
>  
>  int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> @@ -2985,8 +3780,9 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
>      NVME_CAP_SET_CQR(n->bar.cap, 1);
>      NVME_CAP_SET_TO(n->bar.cap, 0xf);
>      /*
> -     * The device now always supports NS Types, but all commands
> -     * that support CSI field will only handle NVM Command Set.
> +     * The device now always supports NS Types, even when "zoned" property
> +     * is set to zero. If this is the case, all commands that support CSI
> +     * field only handle NVM Command Set.
>       */
>      NVME_CAP_SET_CSS(n->bar.cap, (CAP_CSS_NVM | CAP_CSS_CSI_SUPP));
>      NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
> @@ -3033,9 +3829,21 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
>  static void nvme_exit(PCIDevice *pci_dev)
>  {
>      NvmeCtrl *n = NVME(pci_dev);
> +    NvmeNamespace *ns;
> +    int i;
>  
>      nvme_clear_ctrl(n);
> +
> +    for (i = 1; i <= n->num_namespaces; i++) {
> +        ns = nvme_ns(n, i);
> +        if (!ns) {
> +            continue;
> +        }
> +
> +        nvme_ns_cleanup(ns);
> +    }
>      g_free(n->namespaces);
> +
>      g_free(n->cq);
>      g_free(n->sq);
>      g_free(n->aer_reqs);
> @@ -3063,6 +3871,8 @@ static Property nvme_props[] = {
>      DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
>      DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
>      DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
> +    DEFINE_PROP_UINT8("fill_pattern", NvmeCtrl, params.fill_pattern, 0),
> +    DEFINE_PROP_UINT32("zone_append_size_limit", NvmeCtrl, params.zasl_kb, 0),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index a7126e123f..628c665728 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -651,8 +651,10 @@ typedef struct QEMU_PACKED NvmeAerResult {
>  } NvmeAerResult;
>  
>  typedef struct QEMU_PACKED NvmeCqe {
> -    uint32_t    result;
> -    uint32_t    rsvd;
> +    union {
> +        uint64_t     result64;
> +        uint32_t     result32;
> +    };
>      uint16_t    sq_head;
>      uint16_t    sq_id;
>      uint16_t    cid;
> -- 
> 2.21.0
>
Niklas Cassel Sept. 30, 2020, 3:12 p.m. UTC | #6
On Mon, Sep 28, 2020 at 11:35:23AM +0900, Dmitry Fomichev wrote:

(snip)

> @@ -1326,11 +2060,20 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len,
>      acs[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFFECTS_CSUPP;
>      acs[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFFECTS_CSUPP;
>  
> -    iocs[NVME_CMD_FLUSH] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
> -    iocs[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFFECTS_CSUPP |
> -                                  NVME_CMD_EFFECTS_LBCC;
> -    iocs[NVME_CMD_WRITE] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
> -    iocs[NVME_CMD_READ] = NVME_CMD_EFFECTS_CSUPP;
> +    if (NVME_CC_CSS(n->bar.cc) != CSS_ADMIN_ONLY) {
> +        iocs[NVME_CMD_FLUSH] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
> +        iocs[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFFECTS_CSUPP |
> +                                      NVME_CMD_EFFECTS_LBCC;
> +        iocs[NVME_CMD_WRITE] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
> +        iocs[NVME_CMD_READ] = NVME_CMD_EFFECTS_CSUPP;
> +    }
> +
> +    if (csi == NVME_CSI_ZONED && NVME_CC_CSS(n->bar.cc) == CSS_CSI) {

Actually, instead of naming the helper function ctrl_has_zns_namespaces(),
a better name might be ctrl_has_zns_support(),

since this is what is used to set the bit in nvme_identify_cmd_set().

Then, I think that this should be:

if (ctrl_has_zns_support() && csi == NVME_CSI_ZONED && NVME_CC_CSS(n->bar.cc) == CSS_CSI) {


> +        iocs[NVME_CMD_ZONE_APPEND] = NVME_CMD_EFFECTS_CSUPP |
> +                                     NVME_CMD_EFFECTS_LBCC;
> +        iocs[NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFFECTS_CSUPP;
> +        iocs[NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFFECTS_CSUPP;
> +    }
>  
>      trans_len = MIN(sizeof(cmd_eff_log) - off, buf_len);
>
Klaus Jensen Sept. 30, 2020, 6:23 p.m. UTC | #7
On Sep 30 14:50, Niklas Cassel wrote:
> On Mon, Sep 28, 2020 at 11:35:23AM +0900, Dmitry Fomichev wrote:

(snip)

> > diff --git a/block/nvme.c b/block/nvme.c
> > index 05485fdd11..7a513c9a17 100644
> > --- a/block/nvme.c
> > +++ b/block/nvme.c
> > @@ -333,7 +333,7 @@ static inline int nvme_translate_error(const NvmeCqe *c)
> >  {
> >      uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
> >      if (status) {
> > -        trace_nvme_error(le32_to_cpu(c->result),
> > +        trace_nvme_error(le32_to_cpu(c->result32),
> >                           le16_to_cpu(c->sq_head),
> >                           le16_to_cpu(c->sq_id),
> >                           le16_to_cpu(c->cid),
> > diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> > index 31b7f986c3..6d9dc9205b 100644
> > --- a/hw/block/nvme-ns.c
> > +++ b/hw/block/nvme-ns.c
> > @@ -33,14 +33,14 @@ static void nvme_ns_init(NvmeNamespace *ns)
> >      NvmeIdNs *id_ns = &ns->id_ns;
> >  
> >      if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
> > -        ns->id_ns.dlfeat = 0x9;
> > +        ns->id_ns.dlfeat = 0x8;
> 
> You seem to change something that is NVM namespace specific here, why?
> If it is indeed needed, I assume that this change should be in a separate
> patch.
> 

Stood out to me as well - and I thought it was sound enough, but now I'm
not so sure.

DLFEAT is set to 0x8, which only signifies that Deallocate in Write
Zeroes is supported. Previously, it would also signify that returned
values would be 0x00 (DLFEAT 0x8 | 0x1). But since Dmitry added the
fill_pattern parameter...


> > +static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace *ns, int lba_index,
> > +                              Error **errp)
> > +{
> > +    NvmeIdNsZoned *id_ns_z;
> > +
> > +    if (n->params.fill_pattern == 0) {
> > +        ns->id_ns.dlfeat |= 0x01;
> > +    } else if (n->params.fill_pattern == 0xff) {
> > +        ns->id_ns.dlfeat |= 0x02;
> > +    }

... then, when initialized, we look at the fill_pattern and set DLFEAT
accordingly instead.

But since fill_pattern only works for ZNS namespaces, I think dlfeat
should still be 0x9 for NVM namespaces. For NVM namespaces, since
neither DULBE nor DSM is supported, there is really only Write Zeroes
that can explicitly "deallocate" a block, and since that *will* write
zeroes no matter if DEAC is set or not, a 0x00 pattern is guaranteed.
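
In other words, leave nvme_ns_init() as it is today (0x9 when BDRV_O_UNMAP
is set) and only adjust the low bits for zoned namespaces, along the lines
of (rough sketch, not tested):

/* in nvme_zoned_init_ns(), after the generic nvme_ns_init() defaults */
ns->id_ns.dlfeat &= ~0x7;
if (n->params.fill_pattern == 0) {
    ns->id_ns.dlfeat |= 0x01;
} else if (n->params.fill_pattern == 0xff) {
    ns->id_ns.dlfeat |= 0x02;
}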
Dmitry Fomichev Oct. 4, 2020, 11:48 p.m. UTC | #8
On Wed, 2020-09-30 at 07:59 +0200, Klaus Jensen wrote:
> On Sep 28 11:35, Dmitry Fomichev wrote:

(snip)

> > +static int nvme_calc_zone_geometry(NvmeNamespace *ns, Error **errp)
> > +{
> > +    uint64_t zone_size, zone_cap;
> > +    uint32_t nz, lbasz = ns->blkconf.logical_block_size;
> > +
> > +    if (ns->params.zone_size_mb) {
> > +        zone_size = ns->params.zone_size_mb;
> > +    } else {
> > +        zone_size = NVME_DEFAULT_ZONE_SIZE;
> > +    }
> > +    if (ns->params.zone_capacity_mb) {
> > +        zone_cap = ns->params.zone_capacity_mb;
> > +    } else {
> > +        zone_cap = zone_size;
> > +    }
> 
> I think a check that zone_capacity_mb is less than or equal to
> zone_size_mb is missing earlier?

The check is right below, but I now think it is better to
compare byte sizes rather than numbers of LBAs. There are also
checks missing for zone_size >= lbasz and zone_cap >= lbasz that
I am adding.
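
E.g. something like this in nvme_calc_zone_geometry() (sketch; assumes
zone_size and zone_cap have already been converted to bytes):

if (zone_cap > zone_size) {
    error_setg(errp, "zone capacity %"PRIu64"B exceeds zone size %"PRIu64"B",
               zone_cap, zone_size);
    return -1;
}
if (zone_size < lbasz) {
    error_setg(errp, "zone size %"PRIu64"B too small, must be at least %"PRIu32"B",
               zone_size, lbasz);
    return -1;
}
if (zone_cap < lbasz) {
    error_setg(errp, "zone capacity %"PRIu64"B too small, must be at least %"PRIu32"B",
               zone_cap, lbasz);
    return -1;
}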

> 
> > +static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace *ns, int lba_index,
> > +                              Error **errp)
> > +{
> > +    NvmeIdNsZoned *id_ns_z;
> > +
> > +    if (n->params.fill_pattern == 0) {
> > +        ns->id_ns.dlfeat |= 0x01;
> > +    } else if (n->params.fill_pattern == 0xff) {
> > +        ns->id_ns.dlfeat |= 0x02;
> > +    }
> > +
> > +    if (nvme_calc_zone_geometry(ns, errp) != 0) {
> > +        return -1;
> > +    }
> > +
> > +    nvme_init_zone_meta(ns);
> > +
> > +    id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
> > +
> > +    /* MAR/MOR are zeroes-based, 0xffffffff means no limit */
> > +    id_ns_z->mar = 0xffffffff;
> > +    id_ns_z->mor = 0xffffffff;
> > +    id_ns_z->zoc = 0;
> > +    id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
> > +
> > +    id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
> > +    id_ns_z->lbafe[lba_index].zdes = 0; /* FIXME make helper */
> > +
> > +    ns->csi = NVME_CSI_ZONED;
> > +    ns->id_ns.ncap = cpu_to_le64(ns->zone_capacity * ns->num_zones);
> > +    ns->id_ns.nuse = ns->id_ns.ncap;
> > +    ns->id_ns.nsze = ns->id_ns.ncap;
> > +
> 
> NSZE should be in terms of ZSZE. We *can* report NCAP < NSZE if zcap !=
> zsze, but that requires bit 1 set in NSFEAT and proper reporting of
> NUSE.

Ok, will correct. I think it used to be this way, but got messed up
during multiple transformations of this code.
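
I.e. roughly (sketch):

ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size);
ns->id_ns.ncap = cpu_to_le64(ns->num_zones * ns->zone_capacity);
ns->id_ns.nuse = ns->id_ns.ncap;

and, if we ever want to report NCAP < NSZE, additionally set NSFEAT bit 1
and track NUSE properly.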

> 
> > @@ -133,6 +304,14 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
> >  static Property nvme_ns_props[] = {
> >      DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
> >      DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
> > +
> > +    DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false),
> > +    DEFINE_PROP_UINT64("zone_size", NvmeNamespace, params.zone_size_mb,
> > +                       NVME_DEFAULT_ZONE_SIZE),
> > +    DEFINE_PROP_UINT64("zone_capacity", NvmeNamespace,
> > +                       params.zone_capacity_mb, 0),
> 
> There is a nice DEFINE_PROP_SIZE that handles sizes in a nice way (i.e.
> 1G, 1M).

It would be nice to use these types; will add them. _SIZE32 sounds like
a good candidate to use for ZASL.
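
Roughly like this (sketch; the *_bs field names are just placeholders for
byte-sized parameters replacing the current *_mb/_kb ones):

DEFINE_PROP_SIZE("zone_size", NvmeNamespace, params.zone_size_bs,
                 NVME_DEFAULT_ZONE_SIZE),
DEFINE_PROP_SIZE("zone_capacity", NvmeNamespace, params.zone_cap_bs, 0),
DEFINE_PROP_SIZE32("zone_append_size_limit", NvmeCtrl, params.zasl_bs, 0),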

Thank you for your valuable feedback!
>
Dmitry Fomichev Oct. 4, 2020, 11:57 p.m. UTC | #9
On Wed, 2020-09-30 at 14:50 +0000, Niklas Cassel wrote:
> On Mon, Sep 28, 2020 at 11:35:23AM +0900, Dmitry Fomichev wrote:

(snip)

> > diff --git a/block/nvme.c b/block/nvme.c
> > index 05485fdd11..7a513c9a17 100644
> > --- a/block/nvme.c
> > +++ b/block/nvme.c
> > @@ -333,7 +333,7 @@ static inline int nvme_translate_error(const NvmeCqe *c)
> >  {
> >      uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
> >      if (status) {
> > -        trace_nvme_error(le32_to_cpu(c->result),
> > +        trace_nvme_error(le32_to_cpu(c->result32),
> >                           le16_to_cpu(c->sq_head),
> >                           le16_to_cpu(c->sq_id),
> >                           le16_to_cpu(c->cid),
> > diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> > index 31b7f986c3..6d9dc9205b 100644
> > --- a/hw/block/nvme-ns.c
> > +++ b/hw/block/nvme-ns.c
> > @@ -33,14 +33,14 @@ static void nvme_ns_init(NvmeNamespace *ns)
> >      NvmeIdNs *id_ns = &ns->id_ns;
> >  
> >      if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
> > -        ns->id_ns.dlfeat = 0x9;
> > +        ns->id_ns.dlfeat = 0x8;
> 
> You seem to change something that is NVM namespace specific here, why?
> If it is indeed needed, I assume that this change should be in a separate
> patch.
> 

OK, this needs to be done in nvme_zoned_init_ns(). Thanks

> >      }
> >  
> >      id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
> >      uint16_t status;

<snip>

> >  
> > +    header->nr_zones = cpu_to_le64(nr_zones);
> > +
> > +    ret = nvme_dma(n, (uint8_t *)buf, len, DMA_DIRECTION_FROM_DEVICE, req);
> > +
> > +    g_free(buf);
> > +
> > +    return ret;
> > +}
> > +
> >  static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
> >  {
> >      uint32_t nsid = le32_to_cpu(req->cmd.nsid);
> > @@ -1073,9 +1801,15 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
> 
> While you did make sure that we don't expose zone mgmt send/recv/zone append
> in the cmd_effects log when CC.CSS != CSS_CSI, we also need to make sure we
> return Invalid Command Opcode for any of those three commands, if a user tries
> to use them anyway (while CC.CSS != CSI).
> 

Yes, good catch. Only the commands that are marked as supported in the
Commands Supported and Effects log page are allowed to be executed. I am
making some changes to ensure this.
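
The rough shape I have in mind (sketch; n->iocs[] is a hypothetical
per-controller copy of the I/O command support bits, filled in when the
controller is started):

static uint16_t nvme_check_iocs(NvmeCtrl *n, NvmeRequest *req)
{
    if (!(n->iocs[req->cmd.opcode] & NVME_CMD_EFFECTS_CSUPP)) {
        trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
        return NVME_INVALID_OPCODE | NVME_DNR;
    }

    return NVME_SUCCESS;
}

called from nvme_io_cmd() before dispatching the opcode.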

> >          return nvme_flush(n, req);
> >      case NVME_CMD_WRITE_ZEROES:
> >          return nvme_write_zeroes(n, req);
> > +    case NVME_CMD_ZONE_APPEND:
> > +        return nvme_rw(n, req, true);
> >      case NVME_CMD_WRITE:
> >      case NVME_CMD_READ:
> > -        return nvme_rw(n, req);
> > +        return nvme_rw(n, req, false);
> > +    case NVME_CMD_ZONE_MGMT_SEND:
> > +        return nvme_zone_mgmt_send(n, req);
> > +    case NVME_CMD_ZONE_MGMT_RECV:
> > +        return nvme_zone_mgmt_recv(n, req);
> >      default:
> >          trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
> >          return NVME_INVALID_OPCODE | NVME_DNR;
> > @@ -1301,7 +2035,7 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
> >                      DMA_DIRECTION_FROM_DEVICE, req);
> >  }
> >  
> > -static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len,
> > +static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
> >                                   uint64_t off, NvmeRequest *req)
> >  {
> >      NvmeEffectsLog cmd_eff_log = {};
> > @@ -1326,11 +2060,20 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len,
> >      acs[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFFECTS_CSUPP;
> >      acs[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFFECTS_CSUPP;
> >  
> > -    iocs[NVME_CMD_FLUSH] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
> > -    iocs[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFFECTS_CSUPP |
> > -                                  NVME_CMD_EFFECTS_LBCC;
> > -    iocs[NVME_CMD_WRITE] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
> > -    iocs[NVME_CMD_READ] = NVME_CMD_EFFECTS_CSUPP;
> > +    if (NVME_CC_CSS(n->bar.cc) != CSS_ADMIN_ONLY) {
> > +        iocs[NVME_CMD_FLUSH] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
> > +        iocs[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFFECTS_CSUPP |
> > +                                      NVME_CMD_EFFECTS_LBCC;
> > +        iocs[NVME_CMD_WRITE] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
> > +        iocs[NVME_CMD_READ] = NVME_CMD_EFFECTS_CSUPP;
> > +    }
> > +
> > +    if (csi == NVME_CSI_ZONED && NVME_CC_CSS(n->bar.cc) == CSS_CSI) {
> > +        iocs[NVME_CMD_ZONE_APPEND] = NVME_CMD_EFFECTS_CSUPP |
> > +                                     NVME_CMD_EFFECTS_LBCC;
> > +        iocs[NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFFECTS_CSUPP;
> > +        iocs[NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFFECTS_CSUPP;
> > +    }

I think the above needs to be changed to only allow admin commands if this
log request arrives with an unrecognized CSI. Some command sets may not
support some or any NVM I/O commands.
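
Something with this shape, perhaps (sketch; the two helpers are hypothetical
and would just contain the assignments quoted above):

switch (csi) {
case NVME_CSI_NVM:
    nvme_set_nvm_cmd_effects(iocs);
    break;
case NVME_CSI_ZONED:
    nvme_set_nvm_cmd_effects(iocs);
    nvme_set_zoned_cmd_effects(iocs);
    break;
default:
    /* unrecognized CSI: leave iocs[] zeroed, only admin commands reported */
    break;
}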

> >  
> >      trans_len = MIN(sizeof(cmd_eff_log) - off, buf_len);
> >  
> > @@ -1349,6 +2092,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
> >      uint8_t  lid = dw10 & 0xff;
> >      uint8_t  lsp = (dw10 >> 8) & 0xf;
> >      uint8_t  rae = (dw10 >> 15) & 0x1;
> > +    uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
> >      uint32_t numdl, numdu;
> >      uint64_t off, lpol, lpou;
> >      size_t   len;
> > @@ -1382,7 +2126,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
> >      case NVME_LOG_FW_SLOT_INFO:
> >          return nvme_fw_log_info(n, len, off, req);
> >      case NVME_LOG_CMD_EFFECTS:
> > -        return nvme_cmd_effects(n, len, off, req);
> > +        return nvme_cmd_effects(n, csi, len, off, req);
> >      default:
> >          trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
> >          return NVME_INVALID_FIELD | NVME_DNR;
> > @@ -1502,6 +2246,16 @@ static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
> >      return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req);
> >  }
> >  
> > +static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns)
> > +{
> > +    switch (ns->csi) {
> > +    case NVME_CSI_NVM:
> > +    case NVME_CSI_ZONED:
> > +        return true;
> > +    }
> > +    return false;
> > +}
> > +
> >  static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
> >  {
> >      trace_pci_nvme_identify_ctrl();
> > @@ -1513,11 +2267,16 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
> >  static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
> >  {
> >      NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
> > +    NvmeIdCtrlZoned id = {};
> >  
> >      trace_pci_nvme_identify_ctrl_csi(c->csi);
> >  
> >      if (c->csi == NVME_CSI_NVM) {
> >          return nvme_rpt_empty_id_struct(n, req);
> > +    } else if (c->csi == NVME_CSI_ZONED) {
> > +        id.zasl = n->zasl;
> > +        return nvme_dma(n, (uint8_t *)&id, sizeof(id),
> > +                        DMA_DIRECTION_FROM_DEVICE, req);
> 
> Please read my comment on nvme_identify_nslist_csi() before reading
> this comment.
> 
> At least for this function, the specification is clear:
> 
> "If the host requests a data structure for an I/O Command Set that the
> controller does not support, the controller shall abort the command with
> a status of Invalid Field in Command."
> 
> The controller supports an I/O command set if the corresponding Command Set
> bit is set in the data structure returned by nvme_identify_cmd_set(),
> so here we should do something like:
> 
> } else if (c->csi == NVME_CSI_ZONED && ctrl_has_zns_namespaces()) {
> 	...
> }
> 

With this commit, the controller supports the ZNS command set regardless of
the number of attached ZNS namespaces. That number could be zero, but the
controller still supports the command set. I think it would be better not to
make the behavior of this command depend on whether any ZNS namespaces have
been added or not.

> >      }
> >  
> >      return NVME_INVALID_FIELD | NVME_DNR;
> > @@ -1545,8 +2304,12 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req,
> >          return nvme_rpt_empty_id_struct(n, req);
> >      }
> >  
> > -    return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs),
> > -                    DMA_DIRECTION_FROM_DEVICE, req);
> > +    if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
> > +        return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs),
> > +                        DMA_DIRECTION_FROM_DEVICE, req);
> > +    }
> > +
> > +    return NVME_INVALID_CMD_SET | NVME_DNR;
> >  }
> >  
> >  static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
> > @@ -1571,8 +2334,11 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
> >          return nvme_rpt_empty_id_struct(n, req);
> >      }
> >  
> > -    if (c->csi == NVME_CSI_NVM) {
> > +    if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
> >          return nvme_rpt_empty_id_struct(n, req);
> > +    } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
> > +        return nvme_dma(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
> > +                        DMA_DIRECTION_FROM_DEVICE, req);
> >      }
> >  
> >      return NVME_INVALID_FIELD | NVME_DNR;
> > @@ -1634,7 +2400,7 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
> >  
> >      trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
> >  
> > -    if (c->csi != NVME_CSI_NVM) {
> > +    if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
> 
> When reading the specification for CNS 07h, I think that it is not clear
> how this should behave...
> 
> I'm thinking of the case when c->csi == NVME_CSI_ZONED
> while our QEMU model only has NVM namespaces.
> 

I think simply returning an empty list is fine in this case. The loop
that follows will not add any nsids to the list, and this is what the host
is going to receive.

> Either we should return an empty list (1),
> or we should return Invalid Field in Command (2).
> 
> If we decide to go with (2),
> then we should probably take the code you have written in nvme_identify_cmd_set():
> 
> +    for (i = 1; i <= n->num_namespaces; i++) {
> +        ns = nvme_ns(n, i);
> +        if (ns && ns->params.zoned) {
> +            NVME_SET_CSI(*list, NVME_CSI_ZONED);
> +            break;
> +        }
> +    }
> 
> And move it into a ctrl_has_zns_namespaces() helper function,
> and then do something like:
> if (!(c->csi == NVME_CSI_NVM ||
>       (ctrl_has_zns_namespaces() && c->csi == NVME_CSI_ZONED))) {
> 	return NVME_INVALID_FIELD | NVME_DNR;
> }
> 
> 


> >          return NVME_INVALID_FIELD | NVME_DNR;
> >      }
> >  
> > @@ -1643,7 +2409,7 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
> >          if (!ns) {
> >              continue;
> >          }
> > -        if (ns->params.nsid < min_nsid) {
> > +        if (ns->params.nsid < min_nsid || c->csi != ns->csi) {
> >              continue;
> >          }
> >          if (only_active && !ns->params.attached) {
> > @@ -1696,19 +2462,29 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
> >      desc->nidt = NVME_NIDT_CSI;
> >      desc->nidl = NVME_NIDL_CSI;
> >      list_ptr += sizeof(*desc);
> > -    *(uint8_t *)list_ptr = NVME_CSI_NVM;
> > +    *(uint8_t *)list_ptr = ns->csi;
> >  
> >      return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
> >  }
> >  
> >  static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
> >  {
> > +    NvmeNamespace *ns;
> >      uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
> >      static const int data_len = sizeof(list);
> > +    int i;
> >  
> >      trace_pci_nvme_identify_cmd_set();
> >  
> >      NVME_SET_CSI(*list, NVME_CSI_NVM);
> > +    for (i = 1; i <= n->num_namespaces; i++) {
> > +        ns = nvme_ns(n, i);
> > +        if (ns && ns->params.zoned) {
> > +            NVME_SET_CSI(*list, NVME_CSI_ZONED);
> > +            break;
> > +        }
> > +    }
> > +
> >      return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
> >  }
> >  
> > @@ -1751,7 +2527,7 @@ static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
> >  {
> >      uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
> >  
> > -    req->cqe.result = 1;
> > +    req->cqe.result32 = 1;
> >      if (nvme_check_sqid(n, sqid)) {
> >          return NVME_INVALID_FIELD | NVME_DNR;
> >      }
> > @@ -1932,7 +2708,7 @@ defaults:
> >      }
> >  
> >  out:
> > -    req->cqe.result = cpu_to_le32(result);
> > +    req->cqe.result32 = cpu_to_le32(result);
> >      return NVME_SUCCESS;
> >  }
> >  
> > @@ -2057,8 +2833,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
> >                                      ((dw11 >> 16) & 0xFFFF) + 1,
> >                                      n->params.max_ioqpairs,
> >                                      n->params.max_ioqpairs);
> > -        req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
> > -                                      ((n->params.max_ioqpairs - 1) << 16));
> > +        req->cqe.result32 = cpu_to_le32((n->params.max_ioqpairs - 1) |
> > +                                        ((n->params.max_ioqpairs - 1) << 16));
> >          break;
> >      case NVME_ASYNCHRONOUS_EVENT_CONF:
> >          n->features.async_config = dw11;
> > @@ -2310,16 +3086,28 @@ static int nvme_start_ctrl(NvmeCtrl *n)
> >              continue;
> >          }
> >          ns->params.attached = false;
> > -        switch (ns->params.csi) {
> > +        switch (ns->csi) {
> >          case NVME_CSI_NVM:
> >              if (NVME_CC_CSS(n->bar.cc) == CSS_NVM_ONLY ||
> >                  NVME_CC_CSS(n->bar.cc) == CSS_CSI) {
> >                  ns->params.attached = true;
> >              }
> >              break;
> > +        case NVME_CSI_ZONED:
> > +            if (NVME_CC_CSS(n->bar.cc) == CSS_CSI) {
> > +                ns->params.attached = true;
> > +            }
> > +            break;
> >          }
> >      }
> 
> Like I wrote in my review comment in the patch that added support for the new
> allocated CNS values, I prefer if we remove this for-loop completely, and
> simply set attached = true in nvme_ns_setup()/nvme_ns_init() instead.
> 
> (I was considering if we should set attach = true in nvme_zoned_init_ns(),
> but because nvme_ns_setup()/nvme_ns_init() is called for all namespaces,
> including ZNS namespaces, I don't think that any additional code in
> nvme_zoned_init_ns() is warranted.)

I think the CC.CSS value is not available during namespace setup, and if we
set the attached flag in nvme_zoned_ns_setup(), zoned namespaces may end up
being attached even if the NVM Only command set is selected. So keeping this
loop seems like a good idea.

> 
> >  
> > +    if (!n->zasl_bs) {
> > +        assert(n->params.mdts);
> > +        n->zasl = n->params.mdts;
> > +    } else {
> > +        n->zasl = 31 - clz32(n->zasl_bs / n->page_size);
> > +    }
> > +
> >      nvme_set_timestamp(n, 0ULL);
> >  
> >      QTAILQ_INIT(&n->aer_queue);
> > @@ -2382,10 +3170,11 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
> >                  case CSS_NVM_ONLY:
> >                      trace_pci_nvme_css_nvm_cset_selected_by_host(data &
> >                                                                   0xffffffff);
> > -                    break;
> > +                break;
> >                  case CSS_CSI:
> >                      NVME_SET_CC_CSS(n->bar.cc, CSS_CSI);
> > -                    trace_pci_nvme_css_all_csets_sel_by_host(data & 0xffffffff);
> > +                    trace_pci_nvme_css_all_csets_sel_by_host(data &
> > +                                                             0xffffffff);
> >                      break;
> >                  case CSS_ADMIN_ONLY:
> >                      break;
> > @@ -2780,6 +3569,12 @@ static void nvme_init_state(NvmeCtrl *n)
> >      n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
> >      n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> >      n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
> > +
> > +    if (!n->params.zasl_kb) {
> > +        n->zasl_bs = n->params.mdts ? 0 : NVME_DEFAULT_MAX_ZA_SIZE * KiB;
> > +    } else {
> > +        n->zasl_bs = n->params.zasl_kb * KiB;
> > +    }
> >  }
> >  
> >  int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> > @@ -2985,8 +3780,9 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
> >      NVME_CAP_SET_CQR(n->bar.cap, 1);
> >      NVME_CAP_SET_TO(n->bar.cap, 0xf);
> >      /*
> > -     * The device now always supports NS Types, but all commands
> > -     * that support CSI field will only handle NVM Command Set.
> > +     * The device now always supports NS Types, even when "zoned" property
> > +     * is set to zero. If this is the case, all commands that support CSI
> > +     * field only handle NVM Command Set.
> >       */
> >      NVME_CAP_SET_CSS(n->bar.cap, (CAP_CSS_NVM | CAP_CSS_CSI_SUPP));
> >      NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
> > @@ -3033,9 +3829,21 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
> >  static void nvme_exit(PCIDevice *pci_dev)
> >  {
> >      NvmeCtrl *n = NVME(pci_dev);
> > +    NvmeNamespace *ns;
> > +    int i;
> >  
> >      nvme_clear_ctrl(n);
> > +
> > +    for (i = 1; i <= n->num_namespaces; i++) {
> > +        ns = nvme_ns(n, i);
> > +        if (!ns) {
> > +            continue;
> > +        }
> > +
> > +        nvme_ns_cleanup(ns);
> > +    }
> >      g_free(n->namespaces);
> > +
> >      g_free(n->cq);
> >      g_free(n->sq);
> >      g_free(n->aer_reqs);
> > @@ -3063,6 +3871,8 @@ static Property nvme_props[] = {
> >      DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
> >      DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
> >      DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
> > +    DEFINE_PROP_UINT8("fill_pattern", NvmeCtrl, params.fill_pattern, 0),
> > +    DEFINE_PROP_UINT32("zone_append_size_limit", NvmeCtrl, params.zasl_kb, 0),
> >      DEFINE_PROP_END_OF_LIST(),
> >  };
> >  
> > diff --git a/include/block/nvme.h b/include/block/nvme.h
> > index a7126e123f..628c665728 100644
> > --- a/include/block/nvme.h
> > +++ b/include/block/nvme.h
> > @@ -651,8 +651,10 @@ typedef struct QEMU_PACKED NvmeAerResult {
> >  } NvmeAerResult;
> >  
> >  typedef struct QEMU_PACKED NvmeCqe {
> > -    uint32_t    result;
> > -    uint32_t    rsvd;
> > +    union {
> > +        uint64_t     result64;
> > +        uint32_t     result32;
> > +    };
> >      uint16_t    sq_head;
> >      uint16_t    sq_id;
> >      uint16_t    cid;
> > -- 
> > 2.21.0
Dmitry Fomichev Oct. 5, 2020, 12:53 a.m. UTC | #10
> -----Original Message-----
> From: Klaus Jensen <its@irrelevant.dk>
> Sent: Monday, September 28, 2020 6:43 AM
> To: Dmitry Fomichev <Dmitry.Fomichev@wdc.com>
> Cc: Keith Busch <kbusch@kernel.org>; Klaus Jensen
> <k.jensen@samsung.com>; Kevin Wolf <kwolf@redhat.com>; Philippe
> Mathieu-Daudé <philmd@redhat.com>; Maxim Levitsky
> <mlevitsk@redhat.com>; Fam Zheng <fam@euphon.net>; Niklas Cassel
> <Niklas.Cassel@wdc.com>; Damien Le Moal <Damien.LeMoal@wdc.com>;
> qemu-block@nongnu.org; qemu-devel@nongnu.org; Alistair Francis
> <Alistair.Francis@wdc.com>; Matias Bjorling <Matias.Bjorling@wdc.com>
> Subject: Re: [PATCH v5 09/14] hw/block/nvme: Support Zoned Namespace
> Command Set
> 
> On Sep 28 11:35, Dmitry Fomichev wrote:

(snip)
> 
> I think the fill_pattern feature stands separate, so it would be nice to
> extract that to a patch on its own.
> 
(snip)

> > diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> > index 04172f083e..daa13546c4 100644
> > --- a/hw/block/nvme-ns.h
> > +++ b/hw/block/nvme-ns.h
> > @@ -38,7 +38,6 @@ typedef struct NvmeZoneList {
> >
> >  typedef struct NvmeNamespaceParams {
> >      uint32_t nsid;
> > -    uint8_t  csi;
> >      bool     attached;
> >      QemuUUID uuid;
> >
> > @@ -52,6 +51,7 @@ typedef struct NvmeNamespace {
> >      DeviceState  parent_obj;
> >      BlockConf    blkconf;
> >      int32_t      bootindex;
> > +    uint8_t      csi;
> >      int64_t      size;
> >      NvmeIdNs     id_ns;
> 
> This should be squashed into the namespace types patch.
> 

Yes, thanks.

> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 63ad03d6d6..38e25a4d1f 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -54,6 +54,7 @@
> >  #include "qemu/osdep.h"
> >  #include "qemu/units.h"
> >  #include "qemu/error-report.h"
> > +#include "crypto/random.h"
> 
> I think this is not used until the offline/read-only zones injection
> patch, right?
> 

Indeed, will move.

> > +static bool nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
> > +                                      bool failed)
> > +{
> > +    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
> > +    NvmeZone *zone;
> > +    uint64_t slba, start_wp = req->cqe.result64;
> > +    uint32_t nlb, zone_idx;
> > +    uint8_t zs;
> > +
> > +    if (rw->opcode != NVME_CMD_WRITE &&
> > +        rw->opcode != NVME_CMD_ZONE_APPEND &&
> > +        rw->opcode != NVME_CMD_WRITE_ZEROES) {
> > +        return false;
> > +    }
> > +
> > +    slba = le64_to_cpu(rw->slba);
> > +    nlb = le16_to_cpu(rw->nlb) + 1;
> > +    zone_idx = nvme_zone_idx(ns, slba);
> > +    assert(zone_idx < ns->num_zones);
> > +    zone = &ns->zone_array[zone_idx];
> > +
> > +    if (!failed && zone->w_ptr < start_wp + nlb) {
> > +        /*
> > +         * A preceding queued write to the zone has failed,
> > +         * now this write is not at the WP, fail it too.
> > +         */
> > +        failed = true;
> > +    }
> > +
> > +    if (failed) {
> > +        if (zone->w_ptr > start_wp) {
> > +            zone->w_ptr = start_wp;
> > +        }
> 
> It is possible (though unlikely) that you already posted the CQE for the
> write that moved the WP to w_ptr - and now you are reverting it.  This
> looks like a recipe for data corruption to me.
> 
> Take this example. I use append, because if you have multiple regular
> writes in queue you're screwed anyway.
> 
>   w_ptr = 0, d.wp = 0
>   append 1 lba  -> w_ptr = 1, start_wp = 0, issues aio A
>   append 2 lbas -> w_ptr = 3, start_wp = 1, issues aio B
> 
>   aio B success -> d.wp = 2 (since you are adding nlb),
> 
> Now, I totally do the same. Even though the zone descriptor write
> pointer gets "out of sync", it will be reconciled in the absence of
> failures and its fair to define that the host cannot expect a consistent
> view of the write pointer without quescing I/O.
> 
> The problem is if a write then fails:
> 
>   aio A fails   -> w_ptr > start_wp (3 > 1), so you revert to w_ptr = 1
> 
> That looks bad to me. I dont think this is ever reconciled? If another
> append then comes in:
> 
>   append 1 lba -> w_ptr = 2, start_wp = 1, issues aio C and overwrites
>                                            the second append from before.
>   aio C success -> d.wp = 3 (but it should be 2)
> 

Right, need to sync w_ptr and d.wp here. Good find!
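
Something along these lines in the failed branch of
nvme_finalize_zoned_write(), maybe (untested sketch, using the names from
this patch):

    if (failed) {
        if (zone->w_ptr > start_wp) {
            zone->w_ptr = start_wp;
            zone->d.wp = start_wp;  /* keep the descriptor WP in sync */
        }
        req->cqe.result64 = 0;
    }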

> > @@ -1513,11 +2267,16 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl
> *n, NvmeRequest *req)
> >  static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
> >  {
> >      NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
> > +    NvmeIdCtrlZoned id = {};
> >
> >      trace_pci_nvme_identify_ctrl_csi(c->csi);
> >
> >      if (c->csi == NVME_CSI_NVM) {
> >          return nvme_rpt_empty_id_struct(n, req);
> > +    } else if (c->csi == NVME_CSI_ZONED) {
> > +        id.zasl = n->zasl;
> 
> I dont think it should overwrite the zasl value specified by the user.
> If the user specified 0, then it should return 0 for zasl here.

Not sure if I get this. The value of n->zasl is calculated based on the
setting given by the user (or on the default).

> 
> > @@ -2310,16 +3086,28 @@ static int nvme_start_ctrl(NvmeCtrl *n)
> >              continue;
> >          }
> >          ns->params.attached = false;
> > -        switch (ns->params.csi) {
> > +        switch (ns->csi) {
> >          case NVME_CSI_NVM:
> >              if (NVME_CC_CSS(n->bar.cc) == CSS_NVM_ONLY ||
> >                  NVME_CC_CSS(n->bar.cc) == CSS_CSI) {
> >                  ns->params.attached = true;
> >              }
> >              break;
> > +        case NVME_CSI_ZONED:
> > +            if (NVME_CC_CSS(n->bar.cc) == CSS_CSI) {
> > +                ns->params.attached = true;
> > +            }
> > +            break;
> >          }
> >      }
> >
> > +    if (!n->zasl_bs) {
> > +        assert(n->params.mdts);
> 
> A value of 0 for MDTS is perfectly valid.

Ok, need to remove this assert.
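
With the assert gone, the zasl setup in nvme_start_ctrl() would then be
roughly (sketch):

    if (!n->zasl_bs) {
        /* mdts == 0 is valid and simply means "no limit" */
        n->zasl = n->params.mdts;
    } else {
        n->zasl = 31 - clz32(n->zasl_bs / n->page_size);
    }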
 
> 
> > @@ -2382,10 +3170,11 @@ static void nvme_write_bar(NvmeCtrl *n,
> hwaddr offset, uint64_t data,
> >                  case CSS_NVM_ONLY:
> >                      trace_pci_nvme_css_nvm_cset_selected_by_host(data &
> >                                                                   0xffffffff);
> > -                    break;
> > +                break;
> 
> Spurious misaligned break here.

Nice catch! It's misaligned by 4, so checkpatch doesn't complain about it :)
Niklas Cassel Oct. 5, 2020, 11:41 a.m. UTC | #11
On Sun, Oct 04, 2020 at 11:57:07PM +0000, Dmitry Fomichev wrote:
> On Wed, 2020-09-30 at 14:50 +0000, Niklas Cassel wrote:
> > On Mon, Sep 28, 2020 at 11:35:23AM +0900, Dmitry Fomichev wrote:
> > > The emulation code has been changed to advertise NVM Command Set when
> > > "zoned" device property is not set (default) and Zoned Namespace
> > > Command Set otherwise.
> > > 
> > > Handlers for three new NVMe commands introduced in Zoned Namespace
> > > Command Set specification are added, namely for Zone Management
> > > Receive, Zone Management Send and Zone Append.
> > > 
> > > Device initialization code has been extended to create a proper
> > > configuration for zoned operation using device properties.
> > > 
> > > Read/Write command handler is modified to only allow writes at the
> > > write pointer if the namespace is zoned. For Zone Append command,
> > > writes implicitly happen at the write pointer and the starting write
> > > pointer value is returned as the result of the command. Write Zeroes
> > > handler is modified to add zoned checks that are identical to those
> > > done as a part of Write flow.
> > > 
> > > The code to support for Zone Descriptor Extensions is not included in
> > > this commit and ZDES 0 is always reported. A later commit in this
> > > series will add ZDE support.
> > > 
> > > This commit doesn't yet include checks for active and open zone
> > > limits. It is assumed that there are no limits on either active or
> > > open zones.
> > > 
> > > Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
> > > Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> > > Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
> > > Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
> > > Signed-off-by: Matias Bjorling <matias.bjorling@wdc.com>
> > > Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
> > > Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> > > Signed-off-by: Adam Manzanares <adam.manzanares@wdc.com>
> > > Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> > > ---
> > >  block/nvme.c         |   2 +-
> > >  hw/block/nvme-ns.c   | 185 ++++++++-
> > >  hw/block/nvme-ns.h   |   6 +-
> > >  hw/block/nvme.c      | 872 +++++++++++++++++++++++++++++++++++++++++--
> > >  include/block/nvme.h |   6 +-
> > >  5 files changed, 1033 insertions(+), 38 deletions(-)
> > > 
> > > diff --git a/block/nvme.c b/block/nvme.c
> > > index 05485fdd11..7a513c9a17 100644
> > > --- a/block/nvme.c
> > > +++ b/block/nvme.c

(snip)

> > 
> > Please read my comment on nvme_identify_nslist_csi() before reading
> > this comment.
> > 
> > At least for this function, the specification is clear:
> > 
> > "If the host requests a data structure for an I/O Command Set that the
> > controller does not support, the controller shall abort the command with
> > a status of Invalid Field in Command."
> > 
> > If the controller supports the I/O command set == if the Command Set bit
> > is set in the data struct returned by the nvme_identify_cmd_set(),
> > so here we should do something like:
> > 
> > } else if (->csi == NVME_CSI_ZONED && ctrl_has_zns_namespaces()) {
> > 	...
> > }
> > 
> 
> With this commit, the controller supports ZNS command set regardless of
> the number of attached ZNS namespaces. It could be zero, but the controller
> still supports it. I think it would be better not to change the behavior
> of this command to depend on whether there are any ZNS namespaces added
> or not.

Ok, always having ZNS Command Set support, regardless of whether a user
defines a zoned namespace on the QEMU command line, does simplify things.

But then in nvme_identify_cmd_set(), you need to call
NVME_SET_CSI(*list, NVME_CSI_ZONED) unconditionally.

(Right now you loop through all namespaces, and only set the support bit
if you find a zoned namespace.)

> > Like I wrote in my review comment in the patch that added support for the new
> > allocated CNS values, I prefer if we remove this for-loop completely, and
> > simply set attached = true in nvme_ns_setup()/nvme_ns_init() instead.
> > 
> > (I was considering if we should set attach = true in nvme_zoned_init_ns(),
> > but because nvme_ns_setup()/nvme_ns_init() is called for all namespaces,
> > including ZNS namespaces, I don't think that any additional code in
> > nvme_zoned_init_ns() is warranted.)
> 
> I think CC.CSS value is not available during namespace setup and if we
> assign active flag in nvme_zoned_ns_setup(), zoned namespaces may end up
> being active even if NVM Only command set is selected. So keeping this loop
> seems like a good idea.

It is true that CC.CSS is not yet available during namespace setup,
but since the controller itself will never detach namespaces based on
CC.CSS, why are we dependent on CC.CSS being available?

Sure, once someone implements namespace management, they will need
to read whether a certain namespace is attached or detached from some
persistent state, perhaps in the zone meta-data file, and set the
attached boolean in nvme_ns_init() accordingly, but I still don't see
any dependence on CC.CSS even when namespace management is implemented.



Kind regards,
Niklas
Dmitry Fomichev Oct. 5, 2020, 11:08 p.m. UTC | #12
> -----Original Message-----
> From: Niklas Cassel <Niklas.Cassel@wdc.com>
> Sent: Monday, October 5, 2020 7:41 AM
> To: Dmitry Fomichev <Dmitry.Fomichev@wdc.com>
> Cc: Alistair Francis <Alistair.Francis@wdc.com>; qemu-devel@nongnu.org;
> Damien Le Moal <Damien.LeMoal@wdc.com>; fam@euphon.net; Matias
> Bjorling <Matias.Bjorling@wdc.com>; qemu-block@nongnu.org;
> kwolf@redhat.com; mlevitsk@redhat.com; k.jensen@samsung.com;
> kbusch@kernel.org; philmd@redhat.com
> Subject: Re: [PATCH v5 09/14] hw/block/nvme: Support Zoned Namespace
> Command Set
> 
> On Sun, Oct 04, 2020 at 11:57:07PM +0000, Dmitry Fomichev wrote:
> > On Wed, 2020-09-30 at 14:50 +0000, Niklas Cassel wrote:
> > > On Mon, Sep 28, 2020 at 11:35:23AM +0900, Dmitry Fomichev wrote:
> > > > The emulation code has been changed to advertise NVM Command Set
> when
> > > > "zoned" device property is not set (default) and Zoned Namespace
> > > > Command Set otherwise.
> > > >
> > > > Handlers for three new NVMe commands introduced in Zoned
> Namespace
> > > > Command Set specification are added, namely for Zone Management
> > > > Receive, Zone Management Send and Zone Append.
> > > >
> > > > Device initialization code has been extended to create a proper
> > > > configuration for zoned operation using device properties.
> > > >
> > > > Read/Write command handler is modified to only allow writes at the
> > > > write pointer if the namespace is zoned. For Zone Append command,
> > > > writes implicitly happen at the write pointer and the starting write
> > > > pointer value is returned as the result of the command. Write Zeroes
> > > > handler is modified to add zoned checks that are identical to those
> > > > done as a part of Write flow.
> > > >
> > > > The code to support for Zone Descriptor Extensions is not included in
> > > > this commit and ZDES 0 is always reported. A later commit in this
> > > > series will add ZDE support.
> > > >
> > > > This commit doesn't yet include checks for active and open zone
> > > > limits. It is assumed that there are no limits on either active or
> > > > open zones.
> > > >
> > > > Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
> > > > Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
> > > > Signed-off-by: Ajay Joshi <ajay.joshi@wdc.com>
> > > > Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
> > > > Signed-off-by: Matias Bjorling <matias.bjorling@wdc.com>
> > > > Signed-off-by: Aravind Ramesh <aravind.ramesh@wdc.com>
> > > > Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> > > > Signed-off-by: Adam Manzanares <adam.manzanares@wdc.com>
> > > > Signed-off-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> > > > ---
> > > >  block/nvme.c         |   2 +-
> > > >  hw/block/nvme-ns.c   | 185 ++++++++-
> > > >  hw/block/nvme-ns.h   |   6 +-
> > > >  hw/block/nvme.c      | 872
> +++++++++++++++++++++++++++++++++++++++++--
> > > >  include/block/nvme.h |   6 +-
> > > >  5 files changed, 1033 insertions(+), 38 deletions(-)
> > > >
> > > > diff --git a/block/nvme.c b/block/nvme.c
> > > > index 05485fdd11..7a513c9a17 100644
> > > > --- a/block/nvme.c
> > > > +++ b/block/nvme.c
> 
> (snip)
> 
> > >
> > > Please read my comment on nvme_identify_nslist_csi() before reading
> > > this comment.
> > >
> > > At least for this function, the specification is clear:
> > >
> > > "If the host requests a data structure for an I/O Command Set that the
> > > controller does not support, the controller shall abort the command with
> > > a status of Invalid Field in Command."
> > >
> > > If the controller supports the I/O command set == if the Command Set bit
> > > is set in the data struct returned by the nvme_identify_cmd_set(),
> > > so here we should do something like:
> > >
> > > } else if (->csi == NVME_CSI_ZONED && ctrl_has_zns_namespaces()) {
> > > 	...
> > > }
> > >
> >
> > With this commit, the controller supports ZNS command set regardless of
> > the number of attached ZNS namespaces. It could be zero, but the
> controller
> > still supports it. I think it would be better not to change the behavior
> > of this command to depend on whether there are any ZNS namespaces
> added
> > or not.
> 
> Ok, always having ZNS Command Set support, regardless if a user defines
> a zoned namespace on the QEMU command line or not, does simplify things.
> 
> But then in nvme_identify_cmd_set(), you need to call
> NVME_SET_CSI(*list, NVME_CSI_ZONED) unconditionally.
> 

Perhaps,
NVME_SET_CSI(*list, NVME_CSI_NVM)
NVME_SET_CSI(*list, NVME_CSI_ZONED)

since this is a vector...
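
i.e. nvme_identify_cmd_set() could then simply become (sketch, reusing the
helpers already in this patch):

static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
{
    uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};

    trace_pci_nvme_identify_cmd_set();

    /* both command sets are always advertised by the controller */
    NVME_SET_CSI(*list, NVME_CSI_NVM);
    NVME_SET_CSI(*list, NVME_CSI_ZONED);

    return nvme_dma(n, list, sizeof(list), DMA_DIRECTION_FROM_DEVICE, req);
}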

> (Right now you loop though all namespaces, and only set the support bit
> if you find a zoned namespace.)
> 
> > > Like I wrote in my review comment in the patch that added support for
> the new
> > > allocated CNS values, I prefer if we remove this for-loop completely, and
> > > simply set attached = true in nvme_ns_setup()/nvme_ns_init() instead.
> > >
> > > (I was considering if we should set attach = true in
> nvme_zoned_init_ns(),
> > > but because nvme_ns_setup()/nvme_ns_init() is called for all
> namespaces,
> > > including ZNS namespaces, I don't think that any additional code in
> > > nvme_zoned_init_ns() is warranted.)
> >
> > I think CC.CSS value is not available during namespace setup and if we
> > assign active flag in nvme_zoned_ns_setup(), zoned namespaces may end
> up
> > being active even if NVM Only command set is selected. So keeping this
> loop
> > seems like a good idea.
> 
> It is true that CC.CSS is not yet available during namespace setup,
> but since the controller itself will never detach namespaces based on
> CC.CSS, why are we dependant on CC.CSS being available?
> 
> Sure, once someone implements namespace management, they will need
> to read if a certain namespace is attached or detached from some
> persistent state, perhaps in the zone meta-data file, and set
> attached boolean in nvme_ns_init() accordingly, but I still don't see
> any dependance on CC.CSS even when namespace management is
> implemented.
> 

Ok, thanks for the clarification. I think it would be best to add an "attached"
property to the namespace code instead of hardcoding it to true. The default,
of course, will be true, but it will be possible to set it to false to test how
everything works with inactive namespaces. And yes, no dependence on CC.CSS.
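
Something like this in nvme_ns_props[], perhaps (sketch):

    DEFINE_PROP_BOOL("attached", NvmeNamespace, params.attached, true),

That would allow something like "-device nvme-ns,...,attached=false" on the
command line to exercise the inactive-namespace paths.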

> 
> 
> Kind regards,
> Niklas

Patch

diff --git a/block/nvme.c b/block/nvme.c
index 05485fdd11..7a513c9a17 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -333,7 +333,7 @@  static inline int nvme_translate_error(const NvmeCqe *c)
 {
     uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
     if (status) {
-        trace_nvme_error(le32_to_cpu(c->result),
+        trace_nvme_error(le32_to_cpu(c->result32),
                          le16_to_cpu(c->sq_head),
                          le16_to_cpu(c->sq_id),
                          le16_to_cpu(c->cid),
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 31b7f986c3..6d9dc9205b 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -33,14 +33,14 @@  static void nvme_ns_init(NvmeNamespace *ns)
     NvmeIdNs *id_ns = &ns->id_ns;
 
     if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
-        ns->id_ns.dlfeat = 0x9;
+        ns->id_ns.dlfeat = 0x8;
     }
 
     id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
 
     id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
 
-    ns->params.csi = NVME_CSI_NVM;
+    ns->csi = NVME_CSI_NVM;
     qemu_uuid_generate(&ns->params.uuid); /* TODO make UUIDs persistent */
 
     /* no thin provisioning */
@@ -73,7 +73,162 @@  static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
     }
 
     lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
-    ns->id_ns.lbaf[lba_index].ds = 31 - clz32(n->conf.logical_block_size);
+    ns->id_ns.lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);
+
+    return 0;
+}
+
+/*
+ * Add a zone to the tail of a zone list.
+ */
+void nvme_add_zone_tail(NvmeNamespace *ns, NvmeZoneList *zl, NvmeZone *zone)
+{
+    uint32_t idx = (uint32_t)(zone - ns->zone_array);
+
+    assert(nvme_zone_not_in_list(zone));
+
+    if (!zl->size) {
+        zl->head = zl->tail = idx;
+        zone->next = zone->prev = NVME_ZONE_LIST_NIL;
+    } else {
+        ns->zone_array[zl->tail].next = idx;
+        zone->prev = zl->tail;
+        zone->next = NVME_ZONE_LIST_NIL;
+        zl->tail = idx;
+    }
+    zl->size++;
+}
+
+/*
+ * Remove a zone from a zone list. The zone must be linked in the list.
+ */
+void nvme_remove_zone(NvmeNamespace *ns, NvmeZoneList *zl, NvmeZone *zone)
+{
+    uint32_t idx = (uint32_t)(zone - ns->zone_array);
+
+    assert(!nvme_zone_not_in_list(zone));
+
+    --zl->size;
+    if (zl->size == 0) {
+        zl->head = NVME_ZONE_LIST_NIL;
+        zl->tail = NVME_ZONE_LIST_NIL;
+    } else if (idx == zl->head) {
+        zl->head = zone->next;
+        ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL;
+    } else if (idx == zl->tail) {
+        zl->tail = zone->prev;
+        ns->zone_array[zl->tail].next = NVME_ZONE_LIST_NIL;
+    } else {
+        ns->zone_array[zone->next].prev = zone->prev;
+        ns->zone_array[zone->prev].next = zone->next;
+    }
+
+    zone->prev = zone->next = 0;
+}
+
+static int nvme_calc_zone_geometry(NvmeNamespace *ns, Error **errp)
+{
+    uint64_t zone_size, zone_cap;
+    uint32_t nz, lbasz = ns->blkconf.logical_block_size;
+
+    if (ns->params.zone_size_mb) {
+        zone_size = ns->params.zone_size_mb;
+    } else {
+        zone_size = NVME_DEFAULT_ZONE_SIZE;
+    }
+    if (ns->params.zone_capacity_mb) {
+        zone_cap = ns->params.zone_capacity_mb;
+    } else {
+        zone_cap = zone_size;
+    }
+    ns->zone_size = zone_size * MiB / lbasz;
+    ns->zone_capacity = zone_cap * MiB / lbasz;
+    if (ns->zone_capacity > ns->zone_size) {
+        error_setg(errp, "zone capacity exceeds zone size");
+        return -1;
+    }
+
+    nz = DIV_ROUND_UP(ns->size / lbasz, ns->zone_size);
+    ns->num_zones = nz;
+    ns->zone_array_size = sizeof(NvmeZone) * nz;
+    ns->zone_size_log2 = 0;
+    if (is_power_of_2(ns->zone_size)) {
+        ns->zone_size_log2 = 63 - clz64(ns->zone_size);
+    }
+
+    return 0;
+}
+
+static void nvme_init_zone_meta(NvmeNamespace *ns)
+{
+    uint64_t start = 0, zone_size = ns->zone_size;
+    uint64_t capacity = ns->num_zones * zone_size;
+    NvmeZone *zone;
+    int i;
+
+    ns->zone_array = g_malloc0(ns->zone_array_size);
+    ns->exp_open_zones = g_malloc0(sizeof(NvmeZoneList));
+    ns->imp_open_zones = g_malloc0(sizeof(NvmeZoneList));
+    ns->closed_zones = g_malloc0(sizeof(NvmeZoneList));
+    ns->full_zones = g_malloc0(sizeof(NvmeZoneList));
+
+    nvme_init_zone_list(ns->exp_open_zones);
+    nvme_init_zone_list(ns->imp_open_zones);
+    nvme_init_zone_list(ns->closed_zones);
+    nvme_init_zone_list(ns->full_zones);
+
+    zone = ns->zone_array;
+    for (i = 0; i < ns->num_zones; i++, zone++) {
+        if (start + zone_size > capacity) {
+            zone_size = capacity - start;
+        }
+        zone->d.zt = NVME_ZONE_TYPE_SEQ_WRITE;
+        nvme_set_zone_state(zone, NVME_ZONE_STATE_EMPTY);
+        zone->d.za = 0;
+        zone->d.zcap = ns->zone_capacity;
+        zone->d.zslba = start;
+        zone->d.wp = start;
+        zone->w_ptr = start;
+        zone->prev = 0;
+        zone->next = 0;
+        start += zone_size;
+    }
+}
+
+static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace *ns, int lba_index,
+                              Error **errp)
+{
+    NvmeIdNsZoned *id_ns_z;
+
+    if (n->params.fill_pattern == 0) {
+        ns->id_ns.dlfeat |= 0x01;
+    } else if (n->params.fill_pattern == 0xff) {
+        ns->id_ns.dlfeat |= 0x02;
+    }
+
+    if (nvme_calc_zone_geometry(ns, errp) != 0) {
+        return -1;
+    }
+
+    nvme_init_zone_meta(ns);
+
+    id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
+
+    /* MAR/MOR are zeroes-based, 0xffffffff means no limit */
+    id_ns_z->mar = 0xffffffff;
+    id_ns_z->mor = 0xffffffff;
+    id_ns_z->zoc = 0;
+    id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
+
+    id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
+    id_ns_z->lbafe[lba_index].zdes = 0; /* FIXME make helper */
+
+    ns->csi = NVME_CSI_ZONED;
+    ns->id_ns.ncap = cpu_to_le64(ns->zone_capacity * ns->num_zones);
+    ns->id_ns.nuse = ns->id_ns.ncap;
+    ns->id_ns.nsze = ns->id_ns.ncap;
+
+    ns->id_ns_zoned = id_ns_z;
 
     return 0;
 }
@@ -103,6 +258,12 @@  int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
         return -1;
     }
 
+    if (ns->params.zoned) {
+        if (nvme_zoned_init_ns(n, ns, 0, errp) != 0) {
+            return -1;
+        }
+    }
+
     return 0;
 }
 
@@ -116,6 +277,16 @@  void nvme_ns_flush(NvmeNamespace *ns)
     blk_flush(ns->blkconf.blk);
 }
 
+void nvme_ns_cleanup(NvmeNamespace *ns)
+{
+    g_free(ns->id_ns_zoned);
+    g_free(ns->zone_array);
+    g_free(ns->exp_open_zones);
+    g_free(ns->imp_open_zones);
+    g_free(ns->closed_zones);
+    g_free(ns->full_zones);
+}
+
 static void nvme_ns_realize(DeviceState *dev, Error **errp)
 {
     NvmeNamespace *ns = NVME_NS(dev);
@@ -133,6 +304,14 @@  static void nvme_ns_realize(DeviceState *dev, Error **errp)
 static Property nvme_ns_props[] = {
     DEFINE_BLOCK_PROPERTIES(NvmeNamespace, blkconf),
     DEFINE_PROP_UINT32("nsid", NvmeNamespace, params.nsid, 0),
+
+    DEFINE_PROP_BOOL("zoned", NvmeNamespace, params.zoned, false),
+    DEFINE_PROP_UINT64("zone_size", NvmeNamespace, params.zone_size_mb,
+                       NVME_DEFAULT_ZONE_SIZE),
+    DEFINE_PROP_UINT64("zone_capacity", NvmeNamespace,
+                       params.zone_capacity_mb, 0),
+    DEFINE_PROP_BOOL("cross_zone_read", NvmeNamespace,
+                      params.cross_zone_read, false),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 04172f083e..daa13546c4 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -38,7 +38,6 @@  typedef struct NvmeZoneList {
 
 typedef struct NvmeNamespaceParams {
     uint32_t nsid;
-    uint8_t  csi;
     bool     attached;
     QemuUUID uuid;
 
@@ -52,6 +51,7 @@  typedef struct NvmeNamespace {
     DeviceState  parent_obj;
     BlockConf    blkconf;
     int32_t      bootindex;
+    uint8_t      csi;
     int64_t      size;
     NvmeIdNs     id_ns;
 
@@ -107,6 +107,7 @@  typedef struct NvmeCtrl NvmeCtrl;
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_flush(NvmeNamespace *ns);
+void nvme_ns_cleanup(NvmeNamespace *ns);
 
 static inline uint8_t nvme_get_zone_state(NvmeZone *zone)
 {
@@ -188,4 +189,7 @@  static inline NvmeZone *nvme_next_zone_in_list(NvmeNamespace *ns, NvmeZone *z,
     return &ns->zone_array[z->next];
 }
 
+void nvme_add_zone_tail(NvmeNamespace *ns, NvmeZoneList *zl, NvmeZone *zone);
+void nvme_remove_zone(NvmeNamespace *ns, NvmeZoneList *zl, NvmeZone *zone);
+
 #endif /* NVME_NS_H */
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 63ad03d6d6..38e25a4d1f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -54,6 +54,7 @@ 
 #include "qemu/osdep.h"
 #include "qemu/units.h"
 #include "qemu/error-report.h"
+#include "crypto/random.h"
 #include "hw/block/block.h"
 #include "hw/pci/msix.h"
 #include "hw/pci/pci.h"
@@ -127,6 +128,46 @@  static uint16_t nvme_sqid(NvmeRequest *req)
     return le16_to_cpu(req->sq->sqid);
 }
 
+static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
+                                   uint8_t state)
+{
+    if (!nvme_zone_not_in_list(zone)) {
+        switch (nvme_get_zone_state(zone)) {
+        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+            nvme_remove_zone(ns, ns->exp_open_zones, zone);
+            break;
+        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+            nvme_remove_zone(ns, ns->imp_open_zones, zone);
+            break;
+        case NVME_ZONE_STATE_CLOSED:
+            nvme_remove_zone(ns, ns->closed_zones, zone);
+            break;
+        case NVME_ZONE_STATE_FULL:
+            nvme_remove_zone(ns, ns->full_zones, zone);
+        }
+   }
+
+    nvme_set_zone_state(zone, state);
+
+    switch (state) {
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+        nvme_add_zone_tail(ns, ns->exp_open_zones, zone);
+        break;
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        nvme_add_zone_tail(ns, ns->imp_open_zones, zone);
+        break;
+    case NVME_ZONE_STATE_CLOSED:
+        nvme_add_zone_tail(ns, ns->closed_zones, zone);
+        break;
+    case NVME_ZONE_STATE_FULL:
+        nvme_add_zone_tail(ns, ns->full_zones, zone);
+    case NVME_ZONE_STATE_READ_ONLY:
+        break;
+    default:
+        zone->d.za = 0;
+    }
+}
+
 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 {
     hwaddr low = n->ctrl_mem.addr;
@@ -813,7 +854,7 @@  static void nvme_process_aers(void *opaque)
 
         req = n->aer_reqs[n->outstanding_aers];
 
-        result = (NvmeAerResult *) &req->cqe.result;
+        result = (NvmeAerResult *) &req->cqe.result32;
         result->event_type = event->result.event_type;
         result->event_info = event->result.event_info;
         result->log_page = event->result.log_page;
@@ -882,6 +923,200 @@  static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
     return NVME_SUCCESS;
 }
 
+static void nvme_fill_data(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t offset,
+                           uint32_t max_len, uint8_t pattern)
+{
+    ScatterGatherEntry *entry;
+    uint32_t len, ent_len;
+
+    if (qsg->nsg > 0) {
+        entry = qsg->sg;
+        len = qsg->size;
+        if (max_len) {
+            len = MIN(len, max_len);
+        }
+        for (; len > 0; len -= ent_len) {
+            ent_len = MIN(len, entry->len);
+            if (offset > ent_len) {
+                offset -= ent_len;
+            } else if (offset != 0) {
+                dma_memory_set(qsg->as, entry->base + offset,
+                               pattern, ent_len - offset);
+                offset = 0;
+            } else {
+                dma_memory_set(qsg->as, entry->base, pattern, ent_len);
+            }
+            entry++;
+        }
+    } else if (iov->iov) {
+        len = iov_size(iov->iov, iov->niov);
+        if (max_len) {
+            len = MIN(len, max_len);
+        }
+        qemu_iovec_memset(iov, offset, pattern, len - offset);
+    }
+}
+
+static uint16_t nvme_check_zone_write(NvmeZone *zone, uint64_t slba,
+                                      uint32_t nlb)
+{
+    uint16_t status;
+
+    if (unlikely((slba + nlb) > nvme_zone_wr_boundary(zone))) {
+        return NVME_ZONE_BOUNDARY_ERROR;
+    }
+
+    switch (nvme_get_zone_state(zone)) {
+    case NVME_ZONE_STATE_EMPTY:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+    case NVME_ZONE_STATE_CLOSED:
+        status = NVME_SUCCESS;
+        break;
+    case NVME_ZONE_STATE_FULL:
+        status = NVME_ZONE_FULL;
+        break;
+    case NVME_ZONE_STATE_OFFLINE:
+        status = NVME_ZONE_OFFLINE;
+        break;
+    case NVME_ZONE_STATE_READ_ONLY:
+        status = NVME_ZONE_READ_ONLY;
+        break;
+    default:
+        assert(false);
+    }
+    return status;
+}
+
+static uint16_t nvme_check_zone_read(NvmeNamespace *ns, NvmeZone *zone,
+                                     uint64_t slba, uint32_t nlb)
+{
+    uint64_t lba = slba, count;
+    uint16_t status;
+    uint8_t zs;
+
+    do {
+        if (!ns->params.cross_zone_read &&
+            (lba + nlb > nvme_zone_rd_boundary(ns, zone))) {
+            return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
+        }
+
+        zs = nvme_get_zone_state(zone);
+        switch (zs) {
+        case NVME_ZONE_STATE_EMPTY:
+        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+        case NVME_ZONE_STATE_FULL:
+        case NVME_ZONE_STATE_CLOSED:
+        case NVME_ZONE_STATE_READ_ONLY:
+            status = NVME_SUCCESS;
+            break;
+        case NVME_ZONE_STATE_OFFLINE:
+            status = NVME_ZONE_OFFLINE | NVME_DNR;
+            break;
+        default:
+            assert(false);
+        }
+        if (status != NVME_SUCCESS) {
+            break;
+        }
+
+        if (lba + nlb > nvme_zone_rd_boundary(ns, zone)) {
+            count = nvme_zone_rd_boundary(ns, zone) - lba;
+        } else {
+            count = nlb;
+        }
+
+        lba += count;
+        nlb -= count;
+        zone++;
+    } while (nlb);
+
+    return status;
+}
+
+static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
+{
+    return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
+                                    slba / ns->zone_size;
+}
+
+static bool nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req,
+                                      bool failed)
+{
+    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+    NvmeZone *zone;
+    uint64_t slba, start_wp = req->cqe.result64;
+    uint32_t nlb, zone_idx;
+    uint8_t zs;
+
+    if (rw->opcode != NVME_CMD_WRITE &&
+        rw->opcode != NVME_CMD_ZONE_APPEND &&
+        rw->opcode != NVME_CMD_WRITE_ZEROES) {
+        return false;
+    }
+
+    slba = le64_to_cpu(rw->slba);
+    nlb = le16_to_cpu(rw->nlb) + 1;
+    zone_idx = nvme_zone_idx(ns, slba);
+    assert(zone_idx < ns->num_zones);
+    zone = &ns->zone_array[zone_idx];
+
+    if (!failed && zone->w_ptr < start_wp + nlb) {
+        /*
+         * A preceding queued write to the zone has failed,
+         * now this write is not at the WP, fail it too.
+         */
+        failed = true;
+    }
+
+    if (failed) {
+        if (zone->w_ptr > start_wp) {
+            zone->w_ptr = start_wp;
+        }
+        req->cqe.result64 = 0;
+    } else if (zone->w_ptr == nvme_zone_wr_boundary(zone)) {
+        zs = nvme_get_zone_state(zone);
+        switch (zs) {
+        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+        case NVME_ZONE_STATE_CLOSED:
+        case NVME_ZONE_STATE_EMPTY:
+            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
+            /* fall through */
+        case NVME_ZONE_STATE_FULL:
+            break;
+        default:
+            assert(false);
+        }
+        zone->d.wp = zone->w_ptr;
+    } else {
+        zone->d.wp += nlb;
+    }
+
+    return failed;
+}
+
+static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
+                                     uint32_t nlb)
+{
+    uint64_t result = zone->w_ptr;
+    uint8_t zs;
+
+    zone->w_ptr += nlb;
+
+    if (zone->w_ptr < nvme_zone_wr_boundary(zone)) {
+        zs = nvme_get_zone_state(zone);
+        switch (zs) {
+        case NVME_ZONE_STATE_EMPTY:
+        case NVME_ZONE_STATE_CLOSED:
+            nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
+        }
+    }
+
+    return result;
+}
+
 static void nvme_rw_cb(void *opaque, int ret)
 {
     NvmeRequest *req = opaque;
@@ -896,10 +1131,27 @@  static void nvme_rw_cb(void *opaque, int ret)
     trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
 
     if (!ret) {
-        block_acct_done(stats, acct);
+        if (ns->params.zoned) {
+            if (nvme_finalize_zoned_write(ns, req, false)) {
+                ret = EIO;
+                block_acct_failed(stats, acct);
+                req->status = NVME_ZONE_INVALID_WRITE;
+            } else if (req->fill_ofs >= 0) {
+                nvme_fill_data(&req->qsg, &req->iov, req->fill_ofs,
+                               req->fill_len,
+                               nvme_ctrl(req)->params.fill_pattern);
+            }
+        }
+        if (!ret) {
+            block_acct_done(stats, acct);
+        }
     } else {
         uint16_t status;
 
+        if (ns->params.zoned) {
+            nvme_finalize_zoned_write(ns, req, true);
+        }
+
         block_acct_failed(stats, acct);
 
         switch (req->cmd.opcode) {
@@ -953,6 +1205,7 @@  static uint16_t nvme_do_aio(BlockBackend *blk, int64_t offset, size_t len,
         break;
 
     case NVME_CMD_WRITE:
+    case NVME_CMD_ZONE_APPEND:
         is_write = true;
 
         /* fallthrough */
@@ -997,8 +1250,10 @@  static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
     NvmeNamespace *ns = req->ns;
     uint64_t slba = le64_to_cpu(rw->slba);
     uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
+    NvmeZone *zone = NULL;
     uint64_t offset = nvme_l2b(ns, slba);
     uint32_t count = nvme_l2b(ns, nlb);
+    uint32_t zone_idx;
     uint16_t status;
 
     trace_pci_nvme_write_zeroes(nvme_cid(req), nvme_nsid(ns), slba, nlb);
@@ -1009,20 +1264,43 @@  static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
         return status;
     }
 
+    if (ns->params.zoned) {
+        zone_idx = nvme_zone_idx(ns, slba);
+        assert(zone_idx < ns->num_zones);
+        zone = &ns->zone_array[zone_idx];
+
+        status = nvme_check_zone_write(zone, slba, nlb);
+        if (status != NVME_SUCCESS) {
+            trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
+            return status | NVME_DNR;
+        }
+
+        assert(nvme_wp_is_valid(zone));
+        if (unlikely(slba != zone->w_ptr)) {
+            trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
+                                               zone->w_ptr);
+            return NVME_ZONE_INVALID_WRITE | NVME_DNR;
+        }
+
+        req->cqe.result64 = nvme_advance_zone_wp(ns, zone, nlb);
+    }
+
     return nvme_do_aio(ns->blkconf.blk, offset, count, req);
 }
 
-static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req, bool append)
 {
     NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
     NvmeNamespace *ns = req->ns;
-    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
+    uint32_t nlb  = le32_to_cpu(rw->nlb) + 1;
     uint64_t slba = le64_to_cpu(rw->slba);
-
     uint64_t data_size = nvme_l2b(ns, nlb);
-    uint64_t data_offset = nvme_l2b(ns, slba);
-    enum BlockAcctType acct = req->cmd.opcode == NVME_CMD_WRITE ?
-        BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
+    uint64_t data_offset;
+
+    NvmeZone *zone = NULL;
+    uint32_t zone_idx = 0;
+    bool is_write = rw->opcode == NVME_CMD_WRITE || append;
+    enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
     uint16_t status;
 
     trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode),
@@ -1040,18 +1318,468 @@  static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
         goto invalid;
     }
 
+    if (ns->params.zoned) {
+        zone_idx = nvme_zone_idx(ns, slba);
+        assert(zone_idx < ns->num_zones);
+        zone = &ns->zone_array[zone_idx];
+
+        if (is_write) {
+            status = nvme_check_zone_write(zone, slba, nlb);
+            if (status != NVME_SUCCESS) {
+                trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
+                goto invalid;
+            }
+
+            assert(nvme_wp_is_valid(zone));
+            if (append) {
+                if (unlikely(slba != zone->d.zslba)) {
+                    trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
+                    status = NVME_ZONE_INVALID_WRITE | NVME_DNR;
+                    goto invalid;
+                }
+                if (data_size > (n->page_size << n->zasl)) {
+                    trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
+                    status = NVME_INVALID_FIELD | NVME_DNR;
+                    goto invalid;
+                }
+                slba = zone->w_ptr;
+            } else if (unlikely(slba != zone->w_ptr)) {
+                trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
+                                                   zone->w_ptr);
+                status = NVME_ZONE_INVALID_WRITE | NVME_DNR;
+                goto invalid;
+            }
+            req->fill_ofs = -1LL;
+        } else {
+            status = nvme_check_zone_read(ns, zone, slba, nlb);
+            if (status != NVME_SUCCESS) {
+                trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
+                goto invalid;
+            }
+
+            if (slba + nlb > zone->w_ptr) {
+                /*
+                 * All or some data is read above the WP. Need to
+                 * fill out the buffer area that has no backing data
+                 * with a predefined data pattern (zeros by default)
+                 */
+                if (slba >= zone->w_ptr) {
+                    req->fill_ofs = 0;
+                } else {
+                    req->fill_ofs = nvme_l2b(ns, zone->w_ptr - slba);
+                }
+                req->fill_len = nvme_l2b(ns,
+                    nvme_zone_rd_boundary(ns, zone) - slba);
+            } else {
+                req->fill_ofs = -1LL;
+            }
+        }
+    } else if (append) {
+        trace_pci_nvme_err_invalid_opc(rw->opcode);
+        status = NVME_INVALID_OPCODE | NVME_DNR;
+        goto invalid;
+    }
+
     status = nvme_map_dptr(n, data_size, req);
     if (status) {
         goto invalid;
     }
 
+    if (ns->params.zoned) {
+        if (unlikely(req->fill_ofs == 0 &&
+            slba + nlb <= nvme_zone_rd_boundary(ns, zone))) {
+            /* No backend I/O necessary, only need to fill the buffer */
+            nvme_fill_data(&req->qsg, &req->iov, 0, 0, n->params.fill_pattern);
+            req->status = NVME_SUCCESS;
+            return NVME_SUCCESS;
+        }
+        if (is_write) {
+            req->cqe.result64 = nvme_advance_zone_wp(ns, zone, nlb);
+        }
+    }
+
+    data_offset = nvme_l2b(ns, slba);
+
     return nvme_do_aio(ns->blkconf.blk, data_offset, data_size, req);
 
 invalid:
     block_acct_invalid(blk_get_stats(ns->blkconf.blk), acct);
+    return status | NVME_DNR;
+}
+
+static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
+                                            uint64_t *slba, uint32_t *zone_idx)
+{
+    uint32_t dw10 = le32_to_cpu(c->cdw10);
+    uint32_t dw11 = le32_to_cpu(c->cdw11);
+
+    if (!ns->params.zoned) {
+        trace_pci_nvme_err_invalid_opc(c->opcode);
+        return NVME_INVALID_OPCODE | NVME_DNR;
+    }
+
+    *slba = ((uint64_t)dw11) << 32 | dw10;
+    if (unlikely(*slba >= ns->id_ns.nsze)) {
+        trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
+        *slba = 0;
+        return NVME_LBA_RANGE | NVME_DNR;
+    }
+
+    *zone_idx = nvme_zone_idx(ns, *slba);
+    assert(*zone_idx < ns->num_zones);
+
+    return NVME_SUCCESS;
+}
+
+static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
+                               uint8_t state)
+{
+    switch (state) {
+    case NVME_ZONE_STATE_EMPTY:
+    case NVME_ZONE_STATE_CLOSED:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
+        /* fall through */
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+        return NVME_SUCCESS;
+    }
+
+    return NVME_ZONE_INVAL_TRANSITION;
+}
+
+static bool nvme_cond_open_all(uint8_t state)
+{
+    return state == NVME_ZONE_STATE_CLOSED;
+}
+
+static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
+                                uint8_t state)
+{
+    switch (state) {
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
+        /* fall through */
+    case NVME_ZONE_STATE_CLOSED:
+        return NVME_SUCCESS;
+    }
+
+    return NVME_ZONE_INVAL_TRANSITION;
+}
+
+static bool nvme_cond_close_all(uint8_t state)
+{
+    return state == NVME_ZONE_STATE_IMPLICITLY_OPEN ||
+           state == NVME_ZONE_STATE_EXPLICITLY_OPEN;
+}
+
+static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
+                                 uint8_t state)
+{
+    switch (state) {
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+    case NVME_ZONE_STATE_CLOSED:
+    case NVME_ZONE_STATE_EMPTY:
+        zone->w_ptr = nvme_zone_wr_boundary(zone);
+        zone->d.wp = zone->w_ptr;
+        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
+        /* fall through */
+    case NVME_ZONE_STATE_FULL:
+        return NVME_SUCCESS;
+    }
+
+    return NVME_ZONE_INVAL_TRANSITION;
+}
+
+static bool nvme_cond_finish_all(uint8_t state)
+{
+    return state == NVME_ZONE_STATE_IMPLICITLY_OPEN ||
+           state == NVME_ZONE_STATE_EXPLICITLY_OPEN ||
+           state == NVME_ZONE_STATE_CLOSED;
+}
+
+static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
+                                uint8_t state)
+{
+    switch (state) {
+    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+    case NVME_ZONE_STATE_CLOSED:
+    case NVME_ZONE_STATE_FULL:
+        zone->w_ptr = zone->d.zslba;
+        zone->d.wp = zone->w_ptr;
+        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
+        /* fall through */
+    case NVME_ZONE_STATE_EMPTY:
+        return NVME_SUCCESS;
+    }
+
+    return NVME_ZONE_INVAL_TRANSITION;
+}
+
+static bool nvme_cond_reset_all(uint8_t state)
+{
+    return state == NVME_ZONE_STATE_IMPLICITLY_OPEN ||
+           state == NVME_ZONE_STATE_EXPLICITLY_OPEN ||
+           state == NVME_ZONE_STATE_CLOSED ||
+           state == NVME_ZONE_STATE_FULL;
+}
+
+static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
+                                  uint8_t state)
+{
+    switch (state) {
+    case NVME_ZONE_STATE_READ_ONLY:
+        nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
+        /* fall through */
+    case NVME_ZONE_STATE_OFFLINE:
+        return NVME_SUCCESS;
+    }
+
+    return NVME_ZONE_INVAL_TRANSITION;
+}
+
+static bool nvme_cond_offline_all(uint8_t state)
+{
+    return state == NVME_ZONE_STATE_READ_ONLY;
+}
+
+typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *,
+                                 uint8_t);
+typedef bool (*need_to_proc_zone_t)(uint8_t);
+
+static uint16_t name_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
+                                uint8_t state, bool all,
+                                op_handler_t op_hndlr,
+                                need_to_proc_zone_t proc_zone)
+{
+    int i;
+    uint16_t status = 0;
+
+    if (!all) {
+        status = op_hndlr(ns, zone, state);
+    } else {
+        for (i = 0; i < ns->num_zones; i++, zone++) {
+            state = nvme_get_zone_state(zone);
+            if (proc_zone(state)) {
+                status = op_hndlr(ns, zone, state);
+                if (status != NVME_SUCCESS) {
+                    break;
+                }
+            }
+        }
+    }
+
     return status;
 }
 
+static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
+    NvmeNamespace *ns = req->ns;
+    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
+    uint64_t slba = 0;
+    uint32_t zone_idx = 0;
+    uint16_t status;
+    uint8_t action, state;
+    bool all;
+    NvmeZone *zone;
+
+    action = dw13 & 0xff;
+    all = dw13 & 0x100;
+
+    req->status = NVME_SUCCESS;
+
+    if (!all) {
+        status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
+        if (status) {
+            return status;
+        }
+    }
+
+    zone = &ns->zone_array[zone_idx];
+    if (slba != zone->d.zslba) {
+        trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+    state = nvme_get_zone_state(zone);
+
+    switch (action) {
+
+    case NVME_ZONE_ACTION_OPEN:
+        trace_pci_nvme_open_zone(slba, zone_idx, all);
+        status = name_do_zone_op(ns, zone, state, all,
+                                 nvme_open_zone, nvme_cond_open_all);
+        break;
+
+    case NVME_ZONE_ACTION_CLOSE:
+        trace_pci_nvme_close_zone(slba, zone_idx, all);
+        status = name_do_zone_op(ns, zone, state, all,
+                                 nvme_close_zone, nvme_cond_close_all);
+        break;
+
+    case NVME_ZONE_ACTION_FINISH:
+        trace_pci_nvme_finish_zone(slba, zone_idx, all);
+        status = name_do_zone_op(ns, zone, state, all,
+                                 nvme_finish_zone, nvme_cond_finish_all);
+        break;
+
+    case NVME_ZONE_ACTION_RESET:
+        trace_pci_nvme_reset_zone(slba, zone_idx, all);
+        status = name_do_zone_op(ns, zone, state, all,
+                                 nvme_reset_zone, nvme_cond_reset_all);
+        break;
+
+    case NVME_ZONE_ACTION_OFFLINE:
+        trace_pci_nvme_offline_zone(slba, zone_idx, all);
+        status = name_do_zone_op(ns, zone, state, all,
+                                 nvme_offline_zone, nvme_cond_offline_all);
+        break;
+
+    case NVME_ZONE_ACTION_SET_ZD_EXT:
+        trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
+        return NVME_INVALID_FIELD | NVME_DNR;
+        break;
+
+    default:
+        trace_pci_nvme_err_invalid_mgmt_action(action);
+        status = NVME_INVALID_FIELD;
+    }
+
+    if (status == NVME_ZONE_INVAL_TRANSITION) {
+        trace_pci_nvme_err_invalid_zone_state_transition(state, action, slba,
+                                                         zone->d.za);
+    }
+    if (status) {
+        status |= NVME_DNR;
+    }
+
+    return status;
+}
+
+static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
+{
+    int zs = nvme_get_zone_state(zl);
+
+    switch (zafs) {
+    case NVME_ZONE_REPORT_ALL:
+        return true;
+    case NVME_ZONE_REPORT_EMPTY:
+        return zs == NVME_ZONE_STATE_EMPTY;
+    case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
+        return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
+    case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
+        return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
+    case NVME_ZONE_REPORT_CLOSED:
+        return zs == NVME_ZONE_STATE_CLOSED;
+    case NVME_ZONE_REPORT_FULL:
+        return zs == NVME_ZONE_STATE_FULL;
+    case NVME_ZONE_REPORT_READ_ONLY:
+        return zs == NVME_ZONE_STATE_READ_ONLY;
+    case NVME_ZONE_REPORT_OFFLINE:
+        return zs == NVME_ZONE_STATE_OFFLINE;
+    default:
+        return false;
+    }
+}
+
+static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
+    NvmeNamespace *ns = req->ns;
+    /* cdw12 is zero-based number of dwords to return. Convert to bytes */
+    uint32_t len = (le32_to_cpu(cmd->cdw12) + 1) << 2;
+    uint32_t dw13 = le32_to_cpu(cmd->cdw13);
+    uint32_t zone_idx, zra, zrasf, partial;
+    uint64_t max_zones, nr_zones = 0;
+    uint16_t ret;
+    uint64_t slba;
+    NvmeZoneDescr *z;
+    NvmeZone *zs;
+    NvmeZoneReportHeader *header;
+    void *buf, *buf_p;
+    size_t zone_entry_sz;
+
+    req->status = NVME_SUCCESS;
+
+    ret = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
+    if (ret) {
+        return ret;
+    }
+
+    if (len < sizeof(NvmeZoneReportHeader)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    zra = dw13 & 0xff;
+    if (!(zra == NVME_ZONE_REPORT || zra == NVME_ZONE_REPORT_EXTENDED)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    if (zra == NVME_ZONE_REPORT_EXTENDED) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    zrasf = (dw13 >> 8) & 0xff;
+    if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    partial = (dw13 >> 16) & 0x01;
+
+    zone_entry_sz = sizeof(NvmeZoneDescr);
+
+    max_zones = (len - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
+    buf = g_malloc0(len);
+
+    header = (NvmeZoneReportHeader *)buf;
+    buf_p = buf + sizeof(NvmeZoneReportHeader);
+
+    while (zone_idx < ns->num_zones && nr_zones < max_zones) {
+        zs = &ns->zone_array[zone_idx];
+
+        if (!nvme_zone_matches_filter(zrasf, zs)) {
+            zone_idx++;
+            continue;
+        }
+
+        z = (NvmeZoneDescr *)buf_p;
+        buf_p += sizeof(NvmeZoneDescr);
+        nr_zones++;
+
+        z->zt = zs->d.zt;
+        z->zs = zs->d.zs;
+        z->zcap = cpu_to_le64(zs->d.zcap);
+        z->zslba = cpu_to_le64(zs->d.zslba);
+        z->za = zs->d.za;
+
+        if (nvme_wp_is_valid(zs)) {
+            z->wp = cpu_to_le64(zs->d.wp);
+        } else {
+            z->wp = cpu_to_le64(~0ULL);
+        }
+
+        zone_idx++;
+    }
+
+    if (!partial) {
+        for (; zone_idx < ns->num_zones; zone_idx++) {
+            zs = &ns->zone_array[zone_idx];
+            if (nvme_zone_matches_filter(zrasf, zs)) {
+                nr_zones++;
+            }
+        }
+    }
+    header->nr_zones = cpu_to_le64(nr_zones);
+
+    ret = nvme_dma(n, (uint8_t *)buf, len, DMA_DIRECTION_FROM_DEVICE, req);
+
+    g_free(buf);
+
+    return ret;
+}
+
 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
@@ -1073,9 +1801,15 @@  static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
         return nvme_flush(n, req);
     case NVME_CMD_WRITE_ZEROES:
         return nvme_write_zeroes(n, req);
+    case NVME_CMD_ZONE_APPEND:
+        return nvme_rw(n, req, true);
     case NVME_CMD_WRITE:
     case NVME_CMD_READ:
-        return nvme_rw(n, req);
+        return nvme_rw(n, req, false);
+    case NVME_CMD_ZONE_MGMT_SEND:
+        return nvme_zone_mgmt_send(n, req);
+    case NVME_CMD_ZONE_MGMT_RECV:
+        return nvme_zone_mgmt_recv(n, req);
     default:
         trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
@@ -1301,7 +2035,7 @@  static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                     DMA_DIRECTION_FROM_DEVICE, req);
 }
 
-static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len,
+static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
                                  uint64_t off, NvmeRequest *req)
 {
     NvmeEffectsLog cmd_eff_log = {};
@@ -1326,11 +2060,20 @@  static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint32_t buf_len,
     acs[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFFECTS_CSUPP;
     acs[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFFECTS_CSUPP;
 
-    iocs[NVME_CMD_FLUSH] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
-    iocs[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFFECTS_CSUPP |
-                                  NVME_CMD_EFFECTS_LBCC;
-    iocs[NVME_CMD_WRITE] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
-    iocs[NVME_CMD_READ] = NVME_CMD_EFFECTS_CSUPP;
+    if (NVME_CC_CSS(n->bar.cc) != CSS_ADMIN_ONLY) {
+        iocs[NVME_CMD_FLUSH] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
+        iocs[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFFECTS_CSUPP |
+                                      NVME_CMD_EFFECTS_LBCC;
+        iocs[NVME_CMD_WRITE] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC;
+        iocs[NVME_CMD_READ] = NVME_CMD_EFFECTS_CSUPP;
+    }
+
+    if (csi == NVME_CSI_ZONED && NVME_CC_CSS(n->bar.cc) == CSS_CSI) {
+        iocs[NVME_CMD_ZONE_APPEND] = NVME_CMD_EFFECTS_CSUPP |
+                                     NVME_CMD_EFFECTS_LBCC;
+        iocs[NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFFECTS_CSUPP;
+        iocs[NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFFECTS_CSUPP;
+    }
 
     trans_len = MIN(sizeof(cmd_eff_log) - off, buf_len);
 
@@ -1349,6 +2092,7 @@  static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
     uint8_t  lid = dw10 & 0xff;
     uint8_t  lsp = (dw10 >> 8) & 0xf;
     uint8_t  rae = (dw10 >> 15) & 0x1;
+    uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
     uint32_t numdl, numdu;
     uint64_t off, lpol, lpou;
     size_t   len;
@@ -1382,7 +2126,7 @@  static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
     case NVME_LOG_FW_SLOT_INFO:
         return nvme_fw_log_info(n, len, off, req);
     case NVME_LOG_CMD_EFFECTS:
-        return nvme_cmd_effects(n, len, off, req);
+        return nvme_cmd_effects(n, csi, len, off, req);
     default:
         trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
         return NVME_INVALID_FIELD | NVME_DNR;
@@ -1502,6 +2246,16 @@  static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
     return nvme_dma(n, id, sizeof(id), DMA_DIRECTION_FROM_DEVICE, req);
 }
 
+static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns)
+{
+    switch (ns->csi) {
+    case NVME_CSI_NVM:
+    case NVME_CSI_ZONED:
+        return true;
+    }
+    return false;
+}
+
 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
 {
     trace_pci_nvme_identify_ctrl();
@@ -1513,11 +2267,16 @@  static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
+    NvmeIdCtrlZoned id = {};
 
     trace_pci_nvme_identify_ctrl_csi(c->csi);
 
     if (c->csi == NVME_CSI_NVM) {
         return nvme_rpt_empty_id_struct(n, req);
+    } else if (c->csi == NVME_CSI_ZONED) {
+        id.zasl = n->zasl;
+        return nvme_dma(n, (uint8_t *)&id, sizeof(id),
+                        DMA_DIRECTION_FROM_DEVICE, req);
     }
 
     return NVME_INVALID_FIELD | NVME_DNR;
@@ -1545,8 +2304,12 @@  static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req,
         return nvme_rpt_empty_id_struct(n, req);
     }
 
-    return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs),
-                    DMA_DIRECTION_FROM_DEVICE, req);
+    if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
+        return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs),
+                        DMA_DIRECTION_FROM_DEVICE, req);
+    }
+
+    return NVME_INVALID_CMD_SET | NVME_DNR;
 }
 
 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
@@ -1571,8 +2334,11 @@  static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
         return nvme_rpt_empty_id_struct(n, req);
     }
 
-    if (c->csi == NVME_CSI_NVM) {
+    if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
         return nvme_rpt_empty_id_struct(n, req);
+    } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
+        return nvme_dma(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
+                        DMA_DIRECTION_FROM_DEVICE, req);
     }
 
     return NVME_INVALID_FIELD | NVME_DNR;
@@ -1634,7 +2400,7 @@  static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
 
     trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
 
-    if (c->csi != NVME_CSI_NVM) {
+    if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
@@ -1643,7 +2409,7 @@  static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
         if (!ns) {
             continue;
         }
-        if (ns->params.nsid < min_nsid) {
+        if (ns->params.nsid < min_nsid || c->csi != ns->csi) {
             continue;
         }
         if (only_active && !ns->params.attached) {
@@ -1696,19 +2462,29 @@  static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
     desc->nidt = NVME_NIDT_CSI;
     desc->nidl = NVME_NIDL_CSI;
     list_ptr += sizeof(*desc);
-    *(uint8_t *)list_ptr = NVME_CSI_NVM;
+    *(uint8_t *)list_ptr = ns->csi;
 
     return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
 }
 
 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
 {
+    NvmeNamespace *ns;
     uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
     static const int data_len = sizeof(list);
+    int i;
 
     trace_pci_nvme_identify_cmd_set();
 
     NVME_SET_CSI(*list, NVME_CSI_NVM);
+    for (i = 1; i <= n->num_namespaces; i++) {
+        ns = nvme_ns(n, i);
+        if (ns && ns->params.zoned) {
+            NVME_SET_CSI(*list, NVME_CSI_ZONED);
+            break;
+        }
+    }
+
     return nvme_dma(n, list, data_len, DMA_DIRECTION_FROM_DEVICE, req);
 }
 
@@ -1751,7 +2527,7 @@  static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
 {
     uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
 
-    req->cqe.result = 1;
+    req->cqe.result32 = 1;
     if (nvme_check_sqid(n, sqid)) {
         return NVME_INVALID_FIELD | NVME_DNR;
     }
@@ -1932,7 +2708,7 @@  defaults:
     }
 
 out:
-    req->cqe.result = cpu_to_le32(result);
+    req->cqe.result32 = cpu_to_le32(result);
     return NVME_SUCCESS;
 }
 
@@ -2057,8 +2833,8 @@  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
                                     ((dw11 >> 16) & 0xFFFF) + 1,
                                     n->params.max_ioqpairs,
                                     n->params.max_ioqpairs);
-        req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
-                                      ((n->params.max_ioqpairs - 1) << 16));
+        req->cqe.result32 = cpu_to_le32((n->params.max_ioqpairs - 1) |
+                                        ((n->params.max_ioqpairs - 1) << 16));
         break;
     case NVME_ASYNCHRONOUS_EVENT_CONF:
         n->features.async_config = dw11;
@@ -2310,16 +3086,28 @@  static int nvme_start_ctrl(NvmeCtrl *n)
             continue;
         }
         ns->params.attached = false;
-        switch (ns->params.csi) {
+        switch (ns->csi) {
         case NVME_CSI_NVM:
             if (NVME_CC_CSS(n->bar.cc) == CSS_NVM_ONLY ||
                 NVME_CC_CSS(n->bar.cc) == CSS_CSI) {
                 ns->params.attached = true;
             }
             break;
+        case NVME_CSI_ZONED:
+            if (NVME_CC_CSS(n->bar.cc) == CSS_CSI) {
+                ns->params.attached = true;
+            }
+            break;
         }
     }
 
+    if (!n->zasl_bs) {
+        assert(n->params.mdts);
+        n->zasl = n->params.mdts;
+    } else {
+        n->zasl = 31 - clz32(n->zasl_bs / n->page_size);
+    }
+
     nvme_set_timestamp(n, 0ULL);
 
     QTAILQ_INIT(&n->aer_queue);
@@ -2382,10 +3170,11 @@  static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
                 case CSS_NVM_ONLY:
                     trace_pci_nvme_css_nvm_cset_selected_by_host(data &
                                                                  0xffffffff);
                     break;
                 case CSS_CSI:
                     NVME_SET_CC_CSS(n->bar.cc, CSS_CSI);
-                    trace_pci_nvme_css_all_csets_sel_by_host(data & 0xffffffff);
+                    trace_pci_nvme_css_all_csets_sel_by_host(data &
+                                                             0xffffffff);
                     break;
                 case CSS_ADMIN_ONLY:
                     break;
@@ -2780,6 +3569,12 @@  static void nvme_init_state(NvmeCtrl *n)
     n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
     n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
     n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
+
+    if (!n->params.zasl_kb) {
+        n->zasl_bs = n->params.mdts ? 0 : NVME_DEFAULT_MAX_ZA_SIZE * KiB;
+    } else {
+        n->zasl_bs = n->params.zasl_kb * KiB;
+    }
 }
 
 int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
@@ -2985,8 +3780,9 @@  static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
     NVME_CAP_SET_CQR(n->bar.cap, 1);
     NVME_CAP_SET_TO(n->bar.cap, 0xf);
     /*
-     * The device now always supports NS Types, but all commands
-     * that support CSI field will only handle NVM Command Set.
+     * The device now always supports NS Types, even when the "zoned"
+     * device property is not set. In that case, all commands that support
+     * the CSI field handle only the NVM Command Set.
      */
     NVME_CAP_SET_CSS(n->bar.cap, (CAP_CSS_NVM | CAP_CSS_CSI_SUPP));
     NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
@@ -3033,9 +3829,21 @@  static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 static void nvme_exit(PCIDevice *pci_dev)
 {
     NvmeCtrl *n = NVME(pci_dev);
+    NvmeNamespace *ns;
+    int i;
 
     nvme_clear_ctrl(n);
+
+    for (i = 1; i <= n->num_namespaces; i++) {
+        ns = nvme_ns(n, i);
+        if (!ns) {
+            continue;
+        }
+
+        nvme_ns_cleanup(ns);
+    }
     g_free(n->namespaces);
+
     g_free(n->cq);
     g_free(n->sq);
     g_free(n->aer_reqs);
@@ -3063,6 +3871,8 @@  static Property nvme_props[] = {
     DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
     DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
     DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
+    DEFINE_PROP_UINT8("fill_pattern", NvmeCtrl, params.fill_pattern, 0),
+    DEFINE_PROP_UINT32("zone_append_size_limit", NvmeCtrl, params.zasl_kb, 0),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/block/nvme.h b/include/block/nvme.h
index a7126e123f..628c665728 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -651,8 +651,10 @@  typedef struct QEMU_PACKED NvmeAerResult {
 } NvmeAerResult;
 
 typedef struct QEMU_PACKED NvmeCqe {
-    uint32_t    result;
-    uint32_t    rsvd;
+    union {
+        uint64_t     result64;
+        uint32_t     result32;
+    };
     uint16_t    sq_head;
     uint16_t    sq_id;
     uint16_t    cid;