diff mbox series

[2/2] virtio-blk: add zoned storage emulation for zoned devices

Message ID 20220910065057.35017-2-faithilikerun@gmail.com
State New
Headers show
Series [1/2] include: import virtio_blk headers from linux with zoned device support | expand

Commit Message

Sam Li Sept. 10, 2022, 6:50 a.m. UTC
This patch extends virtio-blk emulation to handle zoned device commands
by calling the new block layer APIs to perform zoned device I/O on
behalf of the guest. It supports Report Zone, four zone oparations (open,
close, finish, reset), and Append Zone.

The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
support zoned block devices. Regular block devices(conventional zones)
will not be set.

The guest os having zoned device support can use blkzone(8) to test those
commands. Furthermore, using zonefs to test zone append write is also
supported.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 hw/block/virtio-blk.c | 326 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 326 insertions(+)

Comments

Stefan Hajnoczi Sept. 19, 2022, 9:25 p.m. UTC | #1
On Sat, Sep 10, 2022 at 02:50:57PM +0800, Sam Li wrote:
> This patch extends virtio-blk emulation to handle zoned device commands
> by calling the new block layer APIs to perform zoned device I/O on
> behalf of the guest. It supports Report Zone, four zone oparations (open,
> close, finish, reset), and Append Zone.
> 
> The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> support zoned block devices. Regular block devices(conventional zones)
> will not be set.
> 
> The guest os having zoned device support can use blkzone(8) to test those
> commands. Furthermore, using zonefs to test zone append write is also
> supported.
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  hw/block/virtio-blk.c | 326 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 326 insertions(+)
> 
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index e9ba752f6b..3ef74c01db 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -46,6 +46,8 @@ static const VirtIOFeature feature_sizes[] = {
>       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
>      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
>       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> +     .end = endof(struct virtio_blk_config, zoned)},
>      {}
>  };
>  
> @@ -614,6 +616,273 @@ err:
>      return err_status;
>  }
>  
> +typedef struct ZoneCmdData {
> +    VirtIOBlockReq *req;
> +    union {
> +        struct {
> +            unsigned int nr_zones;
> +            BlockZoneDescriptor *zones;
> +        } ZoneReportData;
> +        struct {
> +            int64_t append_sector;
> +        } ZoneAppendData;

Field names should be lowercase:

  struct {
      unsigned int nr_zones;
      BlockZoneDescriptor *zones;
  } zone_report_data;
  struct {
      int64_t append_sector;
  } zone_append_data;

> +    };
> +} ZoneCmdData;
> +
> +/*
> + * check zone_model: error checking before issuing requests. If all checks

Maybe rename it to check_zoned_request()? It does more than check the
model.

> + * passed, return true.
> + * append: true if only zone append request issued.
> + */
> +static bool check_zone_model(VirtIOBlock *s, int64_t sector, int64_t nr_sector,
> +                             bool append, uint8_t *status) {
> +    BlockDriverState *bs = blk_bs(s->blk);
> +    BlockZoneDescriptor *zone = &bs->bl.zones[sector / bs->bl.zone_sectors];

Inputs from the guest driver are untrusted and must be validated before
using them. sector could have any value here, including invalid values.
Please check that sector is less than the device capacity and also that
it is positive.

> +    int64_t max_append_sector = bs->bl.max_append_sectors;
> +
> +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> +        *status = VIRTIO_BLK_S_UNSUPP;
> +        return false;
> +    }
> +
> +    if (zone->cond == BLK_ZS_OFFLINE) {
> +        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        return false;
> +    }
> +
> +    if (append) {
> +        if ((zone->type != BLK_ZT_SWR) || (zone->cond == BLK_ZS_RDONLY) ||
> +            (sector + nr_sector > (*(zone + 1)).start)) {
> +            /* the end sector of the request exceeds to next zone */
> +            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            return false;
> +        }
> +
> +        if (nr_sector > max_append_sector) {
> +            if (max_append_sector == 0) {
> +                *status = VIRTIO_BLK_S_UNSUPP;
> +            } else {
> +                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            }
> +            return false;
> +        }
> +    }
> +    return true;
> +}
> +
> +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> +{
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> +    struct iovec *in_iov = req->elem.in_sg;
> +    unsigned in_num = req->elem.in_num;
> +    int64_t zrp_size, nz, n, j = 0;
> +    int8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    nz = data->ZoneReportData.nr_zones;
> +    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
> +            .nr_zones = cpu_to_le64(nz),
> +    };
> +
> +    zrp_size = sizeof(struct virtio_blk_zone_report)
> +               + sizeof(struct virtio_blk_zone_descriptor) * nz;
> +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
> +    if (n != sizeof(zrp_hdr)) {
> +        virtio_error(vdev, "Driver provided intput buffer that is too small!");
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +    for (size_t i = sizeof(zrp_hdr); i < zrp_size; i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
> +        struct virtio_blk_zone_descriptor desc =
> +                (struct virtio_blk_zone_descriptor) {
> +                        .z_start = cpu_to_le64(data->ZoneReportData.zones[j].start),
> +                        .z_cap = cpu_to_le64(data->ZoneReportData.zones[j].cap),
> +                        .z_wp = cpu_to_le64(data->ZoneReportData.zones[j].wp),

If the QEMU block layer uses byte constants it will be necessary to
convert the above fields to sectors. I think the code is correct right
now because the QEMU block layer patches still use sectors, but using
bytes would be consistent with the QEMU block layer conventions.

> +                        .z_type = data->ZoneReportData.zones[j].type,
> +                        .z_state = data->ZoneReportData.zones[j].cond,

The QEMU type and cond field constants might not match the virtio-blk
constants. Please convert them explicitly (e.g. with a switch
statement in a helper function) so there is no assumption about the
values of the constants.

> +                };
> +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));

This is O(n^2) time complexity since it's called from the loop, but
nevermind for now... Maybe add a TODO comment so anyone trying to
optimize this code will immediately see the expensive part.

> +        if (n != sizeof(desc)) {
> +            virtio_error(vdev, "Driver provided input buffer "
> +                               "for descriptors that is too small!");
> +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            goto out;
> +        }
> +    }
> +    goto out;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    g_free(data->ZoneReportData.zones);
> +    g_free(data);
> +}
> +
> +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req) {
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    unsigned int nr_zones;
> +    ZoneCmdData *data;
> +    int64_t zone_size, offset;
> +    uint8_t err_status;
> +
> +    if (req->in_len <= sizeof(struct virtio_blk_inhdr) +
> +                       sizeof(struct virtio_blk_zone_report)) {
> +        virtio_error(vdev, "in buffer too small for zone report");
> +        return -1;
> +    }
> +
> +    /* start byte offset of the zone report */
> +    offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> +    if (!check_zone_model(s, offset / 512, 0, false, &err_status)) {
> +        goto out;
> +    }
> +
> +    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
> +                sizeof(struct virtio_blk_zone_report)) /
> +               sizeof(struct virtio_blk_zone_descriptor);
> +
> +    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
> +    data = g_malloc(sizeof(ZoneCmdData));
> +    data->req = req;
> +    data->ZoneReportData.nr_zones = nr_zones;
> +    data->ZoneReportData.zones = g_malloc(zone_size),

nr_zones = 0 and zones = NULL is possible when in_len is sizeof(struct
virtio_blk_inhdr) + sizeof(struct virtio_blk_zone_report) + 1. Maybe the
code handles it okay without dereferencing a NULL pointer, but it would
be safer to change the check above like this:

  if (req->in_len < sizeof(struct virtio_blk_inhdr) +
                    sizeof(struct virtio_blk_zone_report) +
                    sizeof(struct virtio_blk_zone_descriptor)) {
      virtio_error(vdev, "in buffer too small for zone report");
      return -1;
  }

> +
> +    blk_aio_zone_report(s->blk, offset, &data->ZoneReportData.nr_zones,
> +                        data->ZoneReportData.zones,
> +                        virtio_blk_zone_report_complete, data);
> +    return 0;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
> +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret) {
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;

Missing ret < 0 error handling.

> +
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    g_free(data);
> +}
> +
> +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) {
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    BlockDriverState *bs = blk_bs(s->blk);
> +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> +    uint64_t len;
> +    uint32_t type;
> +    uint8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    if (!check_zone_model(s, offset / 512, 0, false, &err_status)) {
> +        goto out;
> +    }
> +
> +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> +    data->req = req;
> +
> +    type = virtio_ldl_p(vdev, &req->out.type);
> +    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
> +        /* Entire drive capacity */
> +        offset = 0;
> +        blk_get_geometry(s->blk, &len);
> +        len *= 512;
> +    } else {
> +        len = bs->bl.zone_sectors * 512;

Is this correct when accessing the last zone of the device?

> +    }
> +
> +    blk_aio_zone_mgmt(s->blk, op, offset, len,
> +                      virtio_blk_zone_mgmt_complete, data);
> +
> +    return 0;
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
> +static void virtio_blk_zone_append_complete(void *opaque, int ret) {
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> +    int64_t append_sector, n;
> +    struct iovec *out_iov = req->elem.out_sg;
> +    unsigned out_num = req->elem.out_num;
> +    uint8_t err_status = VIRTIO_BLK_S_OK;

Error handling code for the ret < 0 case is missing.

> +
> +    append_sector = data->ZoneAppendData.append_sector;
> +    int write_granularity = s->conf.conf.logical_block_size;
> +    if ((append_sector * 512 % write_granularity) != 0) {
> +        err_status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> +        goto out;
> +    }

This looks like input validation. Why is it performed after the I/O
request has completed?

I guess the intent is to check the value that the guest driver provided,
not the value produced by the device after the I/O request completed?

> +    n = iov_to_buf(out_iov, out_num, 0, &append_sector, sizeof(append_sector));

Please use virtio_stq_p() on append_sector first to ensure that the
endianness is correct.

> +    if (n != sizeof(append_sector)) {
> +        virtio_error(vdev, "Driver provided input buffer less than size of "
> +                     "append_sector");
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +    goto out;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    g_free(data);
> +}
> +
> +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req) {

The return value is not used. Please change the return type to void.

> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    uint64_t niov = req->elem.out_num;
> +    struct iovec *out_iov = req->elem.out_sg;
> +    uint8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> +    int64_t len = 0;
> +    for (int i = 1; i < niov; ++i) {
> +        len += out_iov[i].iov_len;
> +    }

Why is the first iovec skipped?

> +
> +    if (!check_zone_model(s, offset / 512, len / 512, true, &err_status)) {
> +        goto out;
> +    }
> +
> +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> +    data->req = req;
> +    data->ZoneAppendData.append_sector = offset;
> +    qemu_iovec_init_external(&req->qiov, &out_iov[1], niov-1);
> +    blk_aio_zone_append(s->blk, &data->ZoneAppendData.append_sector, &req->qiov, 0,
> +                        virtio_blk_zone_append_complete, data);
> +
> +    return 0;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
>  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>  {
>      uint32_t type;
> @@ -700,6 +969,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>      case VIRTIO_BLK_T_FLUSH:
>          virtio_blk_handle_flush(req, mrb);
>          break;
> +    case VIRTIO_BLK_T_ZONE_REPORT:
> +        virtio_blk_handle_zone_report(req);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_OPEN:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_CLOSE:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_FINISH:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_RESET:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_RESET_ALL:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET_ALL);
> +        break;
>      case VIRTIO_BLK_T_SCSI_CMD:
>          virtio_blk_handle_scsi(req);
>          break;
> @@ -718,6 +1005,9 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>          virtio_blk_free_request(req);
>          break;
>      }
> +   case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
> +       virtio_blk_handle_zone_append(req);
> +       break;
>      /*
>       * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
>       * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
> @@ -917,6 +1207,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
>  {
>      VirtIOBlock *s = VIRTIO_BLK(vdev);
>      BlockConf *conf = &s->conf.conf;
> +    BlockDriverState *state = blk_bs(s->blk);

Usually the variable is called "bs". Please use that name here and
elsewhere in the patch.

>      struct virtio_blk_config blkcfg;
>      uint64_t capacity;
>      int64_t length;
> @@ -976,6 +1267,31 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
>          blkcfg.write_zeroes_may_unmap = 1;
>          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
>      }
> +#ifdef CONFIG_BLKZONED

Is this ifdef appropriate? I think bs->bl.zoned should always be
available, even when <blkzoned.h> is missing (e.g. non-Linux system). In
the future there may be an emulated zoned BlockDriver. virtio-blk
should be able to use the emulated zoned BlockDriver even on non-Linux
systems.

I think CONFIG_BLKZONED should be limited to block/file-posix.c.

> +    if (state->bl.zoned != BLK_Z_NONE) {
> +        switch (state->bl.zoned) {
> +        case BLK_Z_HM:
> +            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
> +            virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
> +                         state->bl.zone_sectors);
> +            virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
> +                         state->bl.max_active_zones);
> +            virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
> +                         state->bl.max_open_zones);
> +            virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
> +            virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
> +                         state->bl.max_append_sectors);
> +            break;
> +        case BLK_Z_HA:
> +            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;

The block limits aren't relevant to host-aware zoned devices?

> +            break;
> +        default:
> +            blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
> +            virtio_error(vdev, "Invalid zoned model %x \n", (int)state->bl.zoned);
> +            break;
> +        }
> +    }
> +#endif
>      memcpy(config, &blkcfg, s->config_size);
>  }
>  
> @@ -995,6 +1311,7 @@ static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
>                                          Error **errp)
>  {
>      VirtIOBlock *s = VIRTIO_BLK(vdev);
> +    BlockDriverState *state = blk_bs(s->blk);
>  
>      /* Firstly sync all virtio-blk possible supported features */
>      features |= s->host_features;
> @@ -1003,6 +1320,12 @@ static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
>      virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
>      virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
>      virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
> +    if (state->bl.zoned != BLK_Z_NONE) {
> +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
> +        if (state->bl.zoned == BLK_Z_HM) {
> +            virtio_clear_feature(&features, VIRTIO_BLK_F_DISCARD);

Why is features modified here but s->host_features is modified in the
line above?

> +        }
> +    }

This detects VIRTIO_BLK_F_ZONED based on the BlockDriverState...

>      if (virtio_has_feature(features, VIRTIO_F_VERSION_1)) {
>          if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_SCSI)) {
>              error_setg(errp, "Please set scsi=off for virtio-blk devices in order to use virtio 1.0");
> @@ -1286,6 +1609,9 @@ static Property virtio_blk_properties[] = {
>  #ifdef __linux__
>      DEFINE_PROP_BIT64("scsi", VirtIOBlock, host_features,
>                        VIRTIO_BLK_F_SCSI, false),
> +#endif
> +#ifdef CONFIG_BLKZONED
> +    DEFINE_PROP_BIT64("zoned", VirtIOBlock, host_features, VIRTIO_BLK_F_ZONED, true),
>  #endif

...but this allows users to enable/disable the flag from the
command-line.

I think DEFINE_PROP_BIT64() can be removed, but please check that the
config space size is still correct. It may be necessary to move the
bs->bl.zoned check into virtio_blk_device_realize() and set
the VIRTIO_BLK_F_ZONED s->host_features bit there instead. That will
allow the s->config_size calculation to work correctly.
Sam Li Sept. 28, 2022, 7:55 p.m. UTC | #2
Stefan Hajnoczi <stefanha@redhat.com> 于2022年9月20日周二 05:31写道:
>
> On Sat, Sep 10, 2022 at 02:50:57PM +0800, Sam Li wrote:
> > This patch extends virtio-blk emulation to handle zoned device commands
> > by calling the new block layer APIs to perform zoned device I/O on
> > behalf of the guest. It supports Report Zone, four zone oparations (open,
> > close, finish, reset), and Append Zone.
> >
> > The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> > support zoned block devices. Regular block devices(conventional zones)
> > will not be set.
> >
> > The guest os having zoned device support can use blkzone(8) to test those
> > commands. Furthermore, using zonefs to test zone append write is also
> > supported.
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >  hw/block/virtio-blk.c | 326 ++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 326 insertions(+)
> >
> > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> > index e9ba752f6b..3ef74c01db 100644
> > --- a/hw/block/virtio-blk.c
> > +++ b/hw/block/virtio-blk.c
> > @@ -46,6 +46,8 @@ static const VirtIOFeature feature_sizes[] = {
> >       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
> >      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
> >       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> > +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> > +     .end = endof(struct virtio_blk_config, zoned)},
> >      {}
> >  };
> >
> > @@ -614,6 +616,273 @@ err:
> >      return err_status;
> >  }
> >
> > +typedef struct ZoneCmdData {
> > +    VirtIOBlockReq *req;
> > +    union {
> > +        struct {
> > +            unsigned int nr_zones;
> > +            BlockZoneDescriptor *zones;
> > +        } ZoneReportData;
> > +        struct {
> > +            int64_t append_sector;
> > +        } ZoneAppendData;
>
> Field names should be lowercase:
>
>   struct {
>       unsigned int nr_zones;
>       BlockZoneDescriptor *zones;
>   } zone_report_data;
>   struct {
>       int64_t append_sector;
>   } zone_append_data;
>
> > +    };
> > +} ZoneCmdData;
> > +
> > +/*
> > + * check zone_model: error checking before issuing requests. If all checks
>
> Maybe rename it to check_zoned_request()? It does more than check the
> model.
>
> > + * passed, return true.
> > + * append: true if only zone append request issued.
> > + */
> > +static bool check_zone_model(VirtIOBlock *s, int64_t sector, int64_t nr_sector,
> > +                             bool append, uint8_t *status) {
> > +    BlockDriverState *bs = blk_bs(s->blk);
> > +    BlockZoneDescriptor *zone = &bs->bl.zones[sector / bs->bl.zone_sectors];
>
> Inputs from the guest driver are untrusted and must be validated before
> using them. sector could have any value here, including invalid values.
> Please check that sector is less than the device capacity and also that
> it is positive.
>
> > +    int64_t max_append_sector = bs->bl.max_append_sectors;
> > +
> > +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> > +        *status = VIRTIO_BLK_S_UNSUPP;
> > +        return false;
> > +    }
> > +
> > +    if (zone->cond == BLK_ZS_OFFLINE) {
> > +        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        return false;
> > +    }
> > +
> > +    if (append) {
> > +        if ((zone->type != BLK_ZT_SWR) || (zone->cond == BLK_ZS_RDONLY) ||
> > +            (sector + nr_sector > (*(zone + 1)).start)) {
> > +            /* the end sector of the request exceeds to next zone */
> > +            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +            return false;
> > +        }
> > +
> > +        if (nr_sector > max_append_sector) {
> > +            if (max_append_sector == 0) {
> > +                *status = VIRTIO_BLK_S_UNSUPP;
> > +            } else {
> > +                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +            }
> > +            return false;
> > +        }
> > +    }
> > +    return true;
> > +}
> > +
> > +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> > +{
> > +    ZoneCmdData *data = opaque;
> > +    VirtIOBlockReq *req = data->req;
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> > +    struct iovec *in_iov = req->elem.in_sg;
> > +    unsigned in_num = req->elem.in_num;
> > +    int64_t zrp_size, nz, n, j = 0;
> > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    nz = data->ZoneReportData.nr_zones;
> > +    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
> > +            .nr_zones = cpu_to_le64(nz),
> > +    };
> > +
> > +    zrp_size = sizeof(struct virtio_blk_zone_report)
> > +               + sizeof(struct virtio_blk_zone_descriptor) * nz;
> > +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
> > +    if (n != sizeof(zrp_hdr)) {
> > +        virtio_error(vdev, "Driver provided intput buffer that is too small!");
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +    for (size_t i = sizeof(zrp_hdr); i < zrp_size; i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
> > +        struct virtio_blk_zone_descriptor desc =
> > +                (struct virtio_blk_zone_descriptor) {
> > +                        .z_start = cpu_to_le64(data->ZoneReportData.zones[j].start),
> > +                        .z_cap = cpu_to_le64(data->ZoneReportData.zones[j].cap),
> > +                        .z_wp = cpu_to_le64(data->ZoneReportData.zones[j].wp),
>
> If the QEMU block layer uses byte constants it will be necessary to
> convert the above fields to sectors. I think the code is correct right
> now because the QEMU block layer patches still use sectors, but using
> bytes would be consistent with the QEMU block layer conventions.
>
> > +                        .z_type = data->ZoneReportData.zones[j].type,
> > +                        .z_state = data->ZoneReportData.zones[j].cond,
>
> The QEMU type and cond field constants might not match the virtio-blk
> constants. Please convert them explicitly (e.g. with a switch
> statement in a helper function) so there is no assumption about the
> values of the constants.
>
> > +                };
> > +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
>
> This is O(n^2) time complexity since it's called from the loop, but
> nevermind for now... Maybe add a TODO comment so anyone trying to
> optimize this code will immediately see the expensive part.
>
> > +        if (n != sizeof(desc)) {
> > +            virtio_error(vdev, "Driver provided input buffer "
> > +                               "for descriptors that is too small!");
> > +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +            goto out;
> > +        }
> > +    }
> > +    goto out;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    g_free(data->ZoneReportData.zones);
> > +    g_free(data);
> > +}
> > +
> > +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req) {
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    unsigned int nr_zones;
> > +    ZoneCmdData *data;
> > +    int64_t zone_size, offset;
> > +    uint8_t err_status;
> > +
> > +    if (req->in_len <= sizeof(struct virtio_blk_inhdr) +
> > +                       sizeof(struct virtio_blk_zone_report)) {
> > +        virtio_error(vdev, "in buffer too small for zone report");
> > +        return -1;
> > +    }
> > +
> > +    /* start byte offset of the zone report */
> > +    offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > +    if (!check_zone_model(s, offset / 512, 0, false, &err_status)) {
> > +        goto out;
> > +    }
> > +
> > +    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
> > +                sizeof(struct virtio_blk_zone_report)) /
> > +               sizeof(struct virtio_blk_zone_descriptor);
> > +
> > +    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
> > +    data = g_malloc(sizeof(ZoneCmdData));
> > +    data->req = req;
> > +    data->ZoneReportData.nr_zones = nr_zones;
> > +    data->ZoneReportData.zones = g_malloc(zone_size),
>
> nr_zones = 0 and zones = NULL is possible when in_len is sizeof(struct
> virtio_blk_inhdr) + sizeof(struct virtio_blk_zone_report) + 1. Maybe the
> code handles it okay without dereferencing a NULL pointer, but it would
> be safer to change the check above like this:
>
>   if (req->in_len < sizeof(struct virtio_blk_inhdr) +
>                     sizeof(struct virtio_blk_zone_report) +
>                     sizeof(struct virtio_blk_zone_descriptor)) {
>       virtio_error(vdev, "in buffer too small for zone report");
>       return -1;
>   }
>
> > +
> > +    blk_aio_zone_report(s->blk, offset, &data->ZoneReportData.nr_zones,
> > +                        data->ZoneReportData.zones,
> > +                        virtio_blk_zone_report_complete, data);
> > +    return 0;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> > +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret) {
> > +    ZoneCmdData *data = opaque;
> > +    VirtIOBlockReq *req = data->req;
> > +    VirtIOBlock *s = req->dev;
>
> Missing ret < 0 error handling.
>
> > +
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    g_free(data);
> > +}
> > +
> > +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) {
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    BlockDriverState *bs = blk_bs(s->blk);
> > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > +    uint64_t len;
> > +    uint32_t type;
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    if (!check_zone_model(s, offset / 512, 0, false, &err_status)) {
> > +        goto out;
> > +    }
> > +
> > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > +    data->req = req;
> > +
> > +    type = virtio_ldl_p(vdev, &req->out.type);
> > +    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
> > +        /* Entire drive capacity */
> > +        offset = 0;
> > +        blk_get_geometry(s->blk, &len);
> > +        len *= 512;
> > +    } else {
> > +        len = bs->bl.zone_sectors * 512;
>
> Is this correct when accessing the last zone of the device?
>
> > +    }
> > +
> > +    blk_aio_zone_mgmt(s->blk, op, offset, len,
> > +                      virtio_blk_zone_mgmt_complete, data);
> > +
> > +    return 0;
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> > +static void virtio_blk_zone_append_complete(void *opaque, int ret) {
> > +    ZoneCmdData *data = opaque;
> > +    VirtIOBlockReq *req = data->req;
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> > +    int64_t append_sector, n;
> > +    struct iovec *out_iov = req->elem.out_sg;
> > +    unsigned out_num = req->elem.out_num;
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
>
> Error handling code for the ret < 0 case is missing.
>
> > +
> > +    append_sector = data->ZoneAppendData.append_sector;
> > +    int write_granularity = s->conf.conf.logical_block_size;
> > +    if ((append_sector * 512 % write_granularity) != 0) {
> > +        err_status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> > +        goto out;
> > +    }
>
> This looks like input validation. Why is it performed after the I/O
> request has completed?

UNALIGNED_WP should be checked in both input and output validation. By
checking if the starting/ending sector of the request data is aligned
with write_granularity(value of physical block size), the device sets
the UNALIGNED_WP bit and completes the request. According to spec
here:

+VIRTIO_BLK_S_ZONE_UNALIGNED_WP is set by the device when the request received
+from the driver attempts to perform a write to an SWR zone and at least one of
+the following conditions is met:
+
+\begin{itemize}
+\item the starting sector of the request is not equal to the current value of
+    the zone write pointer.
+
+\item the ending sector of the request data multiplied by 512 is not a multiple
+    of the value reported by the device in the field \field{write_granularity}
+    in the device configuration space.
+\end{itemize}

>
> I guess the intent is to check the value that the guest driver provided,
> not the value produced by the device after the I/O request completed?
>
> > +    n = iov_to_buf(out_iov, out_num, 0, &append_sector, sizeof(append_sector));
>
> Please use virtio_stq_p() on append_sector first to ensure that the
> endianness is correct.
>
> > +    if (n != sizeof(append_sector)) {
> > +        virtio_error(vdev, "Driver provided input buffer less than size of "
> > +                     "append_sector");
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +    goto out;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    g_free(data);
> > +}
> > +
> > +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req) {
>
> The return value is not used. Please change the return type to void.

The return type should be int actually. Otherwise, QEMU will terminate
when the zone_append request is issued from the guest. It is the one
cause that failed some of the zonefs tests. After coredump, backtrace
indicates address_space_unmap: Assertion `mr ! = NULL` failed rooted
from virtio_blk_zone_append_complete(). I haven't figured out exactly
why but my guess is virtio_blk_zone_op_complete() needs the return
value of virtio_blk_zone_op() ......

>
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    uint64_t niov = req->elem.out_num;
> > +    struct iovec *out_iov = req->elem.out_sg;
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
> > +    int64_t len = 0;
> > +    for (int i = 1; i < niov; ++i) {
> > +        len += out_iov[i].iov_len;
> > +    }
>
> Why is the first iovec skipped?

Because the first iovec is normally headers, and the rest is buffer
that out_iov needs.

>
> > +
> > +    if (!check_zone_model(s, offset / 512, len / 512, true, &err_status)) {
> > +        goto out;
> > +    }
> > +
> > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > +    data->req = req;
> > +    data->ZoneAppendData.append_sector = offset;
> > +    qemu_iovec_init_external(&req->qiov, &out_iov[1], niov-1);
> > +    blk_aio_zone_append(s->blk, &data->ZoneAppendData.append_sector, &req->qiov, 0,
> > +                        virtio_blk_zone_append_complete, data);
> > +
> > +    return 0;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> >  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> >  {
> >      uint32_t type;
> > @@ -700,6 +969,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> >      case VIRTIO_BLK_T_FLUSH:
> >          virtio_blk_handle_flush(req, mrb);
> >          break;
> > +    case VIRTIO_BLK_T_ZONE_REPORT:
> > +        virtio_blk_handle_zone_report(req);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_OPEN:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_CLOSE:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_FINISH:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_RESET:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_RESET_ALL:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET_ALL);
> > +        break;
> >      case VIRTIO_BLK_T_SCSI_CMD:
> >          virtio_blk_handle_scsi(req);
> >          break;
> > @@ -718,6 +1005,9 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> >          virtio_blk_free_request(req);
> >          break;
> >      }
> > +   case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
> > +       virtio_blk_handle_zone_append(req);
> > +       break;
> >      /*
> >       * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
> >       * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
> > @@ -917,6 +1207,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> >  {
> >      VirtIOBlock *s = VIRTIO_BLK(vdev);
> >      BlockConf *conf = &s->conf.conf;
> > +    BlockDriverState *state = blk_bs(s->blk);
>
> Usually the variable is called "bs". Please use that name here and
> elsewhere in the patch.
>
> >      struct virtio_blk_config blkcfg;
> >      uint64_t capacity;
> >      int64_t length;
> > @@ -976,6 +1267,31 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> >          blkcfg.write_zeroes_may_unmap = 1;
> >          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
> >      }
> > +#ifdef CONFIG_BLKZONED
>
> Is this ifdef appropriate? I think bs->bl.zoned should always be
> available, even when <blkzoned.h> is missing (e.g. non-Linux system). In
> the future there may be an emulated zoned BlockDriver. virtio-blk
> should be able to use the emulated zoned BlockDriver even on non-Linux
> systems.
>
Ok, you are right.

> I think CONFIG_BLKZONED should be limited to block/file-posix.c.

I'm not sure. There may be some places where virtio-blk needs this
config. Like in transforming blk_zone_descriptor to
virtio_blk_zone_descriptor, it needs to use Linux's constants to
ensure nothing go wrong here.

>
> > +    if (state->bl.zoned != BLK_Z_NONE) {
> > +        switch (state->bl.zoned) {
> > +        case BLK_Z_HM:
> > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
> > +            virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,

> > +                         state->bl.zone_sectors);
> > +            virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
> > +                         state->bl.max_active_zones);
> > +            virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
> > +                         state->bl.max_open_zones);
> > +            virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
> > +            virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
> > +                         state->bl.max_append_sectors);
> > +            break;
> > +        case BLK_Z_HA:
> > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
>
> The block limits aren't relevant to host-aware zoned devices?

Yes, the HA devices are seen as non-zoned device and the driver just
ignore all other fields in zoned.

+\item if the driver that can not support zoned devices reads
+    VIRTIO_BLK_Z_HA from the \field{model} field of \field{zoned}, the driver
+    MAY handle the device as a non-zoned device. In this case, the
+    driver SHOULD ignore all other fields in \field{zoned}.
+\end{itemize}

>
> > +            break;
> > +        default:
> > +            blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
> > +            virtio_error(vdev, "Invalid zoned model %x \n", (int)state->bl.zoned);
> > +            break;
> > +        }
> > +    }
> > +#endif
> >      memcpy(config, &blkcfg, s->config_size);
> >  }
> >
> > @@ -995,6 +1311,7 @@ static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
> >                                          Error **errp)
> >  {
> >      VirtIOBlock *s = VIRTIO_BLK(vdev);
> > +    BlockDriverState *state = blk_bs(s->blk);
> >
> >      /* Firstly sync all virtio-blk possible supported features */
> >      features |= s->host_features;
> > @@ -1003,6 +1320,12 @@ static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
> >      virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
> >      virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
> >      virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
> > +    if (state->bl.zoned != BLK_Z_NONE) {
> > +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
> > +        if (state->bl.zoned == BLK_Z_HM) {
> > +            virtio_clear_feature(&features, VIRTIO_BLK_F_DISCARD);
>
> Why is features modified here but s->host_features is modified in the
> line above?

Because F_DISCARD should not be offered by the driver when the device
offers F_ZONED with the HM model.

>
> > +        }
> > +    }
>
> This detects VIRTIO_BLK_F_ZONED based on the BlockDriverState...
(This part can be removed.)

>
> >      if (virtio_has_feature(features, VIRTIO_F_VERSION_1)) {
> >          if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_SCSI)) {
> >              error_setg(errp, "Please set scsi=off for virtio-blk devices in order to use virtio 1.0");
> > @@ -1286,6 +1609,9 @@ static Property virtio_blk_properties[] = {
> >  #ifdef __linux__
> >      DEFINE_PROP_BIT64("scsi", VirtIOBlock, host_features,
> >                        VIRTIO_BLK_F_SCSI, false),
> > +#endif
> > +#ifdef CONFIG_BLKZONED
> > +    DEFINE_PROP_BIT64("zoned", VirtIOBlock, host_features, VIRTIO_BLK_F_ZONED, true),
> >  #endif
>
> ...but this allows users to enable/disable the flag from the
> command-line.
>
> I think DEFINE_PROP_BIT64() can be removed, but please check that the
> config space size is still correct. It may be necessary to move the
> bs->bl.zoned check into virtio_blk_device_realize() and set
> the VIRTIO_BLK_F_ZONED s->host_features bit there instead. That will
> allow the s->config_size calculation to work correctly.

Ok, it works. Thanks!

Additionally, the DEFINE_PROP_BIT here is to declare that the supports
zones. Then the virtio-blk driver will not need to look at that
feature. So the former part detecting the F_ZONED feature based on
BlockDriverState can be removed. If removing this macro, then the
virio-blk driver must set the F_ZONED feature by itself.
Damien Le Moal Sept. 28, 2022, 10:33 p.m. UTC | #3
On 2022/09/29 4:55, Sam Li wrote:
> Stefan Hajnoczi <stefanha@redhat.com> 于2022年9月20日周二 05:31写道:
[...]
>>> +static void virtio_blk_zone_append_complete(void *opaque, int ret) {
>>> +    ZoneCmdData *data = opaque;
>>> +    VirtIOBlockReq *req = data->req;
>>> +    VirtIOBlock *s = req->dev;
>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
>>> +    int64_t append_sector, n;
>>> +    struct iovec *out_iov = req->elem.out_sg;
>>> +    unsigned out_num = req->elem.out_num;
>>> +    uint8_t err_status = VIRTIO_BLK_S_OK;
>>
>> Error handling code for the ret < 0 case is missing.
>>
>>> +
>>> +    append_sector = data->ZoneAppendData.append_sector;
>>> +    int write_granularity = s->conf.conf.logical_block_size;
>>> +    if ((append_sector * 512 % write_granularity) != 0) {
>>> +        err_status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
>>> +        goto out;
>>> +    }
>>
>> This looks like input validation. Why is it performed after the I/O
>> request has completed?
> 
> UNALIGNED_WP should be checked in both input and output validation. By
> checking if the starting/ending sector of the request data is aligned
> with write_granularity(value of physical block size), the device sets
> the UNALIGNED_WP bit and completes the request. According to spec
> here:

No, you only need input validation. You should not do this in the completion
path, simply because it will NEVER trigger. If it does, then the IO was totally
botched (by the host kernel or the drive itself) and then you are going to be in
a lot more troubles than not reporting an error... So I do not think this is
useful at all.

> +VIRTIO_BLK_S_ZONE_UNALIGNED_WP is set by the device when the request received
> +from the driver attempts to perform a write to an SWR zone and at least one of
> +the following conditions is met:

The text is clear here: this is for a write command, not a zone append command.
So your emulation of zone append cannot return an unaligned wp error. That would
is undefined in the specs.

> +
> +\begin{itemize}
> +\item the starting sector of the request is not equal to the current value of
> +    the zone write pointer.
> +
> +\item the ending sector of the request data multiplied by 512 is not a multiple
> +    of the value reported by the device in the field \field{write_granularity}
> +    in the device configuration space.
> +\end{itemize}
> 
>>
>> I guess the intent is to check the value that the guest driver provided,
>> not the value produced by the device after the I/O request completed?
>>
>>> +    n = iov_to_buf(out_iov, out_num, 0, &append_sector, sizeof(append_sector));
>>
>> Please use virtio_stq_p() on append_sector first to ensure that the
>> endianness is correct.
>>
>>> +    if (n != sizeof(append_sector)) {
>>> +        virtio_error(vdev, "Driver provided input buffer less than size of "
>>> +                     "append_sector");
>>> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
>>> +        goto out;
>>> +    }
>>> +    goto out;
>>> +
>>> +out:
>>> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
>>> +    virtio_blk_req_complete(req, err_status);
>>> +    virtio_blk_free_request(req);
>>> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
>>> +    g_free(data);
>>> +}
>>> +
>>> +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req) {
>>
>> The return value is not used. Please change the return type to void.
> 
> The return type should be int actually. Otherwise, QEMU will terminate
> when the zone_append request is issued from the guest. It is the one
> cause that failed some of the zonefs tests. After coredump, backtrace
> indicates address_space_unmap: Assertion `mr ! = NULL` failed rooted
> from virtio_blk_zone_append_complete(). I haven't figured out exactly
> why but my guess is virtio_blk_zone_op_complete() needs the return
> value of virtio_blk_zone_op() ......
> 
>>
>>> +    VirtIOBlock *s = req->dev;
>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
>>> +    uint64_t niov = req->elem.out_num;
>>> +    struct iovec *out_iov = req->elem.out_sg;
>>> +    uint8_t err_status = VIRTIO_BLK_S_OK;
>>> +
>>> +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
>>> +    int64_t len = 0;
>>> +    for (int i = 1; i < niov; ++i) {
>>> +        len += out_iov[i].iov_len;
>>> +    }
>>
>> Why is the first iovec skipped?
> 
> Because the first iovec is normally headers, and the rest is buffer
> that out_iov needs.
> 
>>
>>> +
>>> +    if (!check_zone_model(s, offset / 512, len / 512, true, &err_status)) {
>>> +        goto out;
>>> +    }
>>> +
>>> +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
>>> +    data->req = req;
>>> +    data->ZoneAppendData.append_sector = offset;
>>> +    qemu_iovec_init_external(&req->qiov, &out_iov[1], niov-1);
>>> +    blk_aio_zone_append(s->blk, &data->ZoneAppendData.append_sector, &req->qiov, 0,
>>> +                        virtio_blk_zone_append_complete, data);
>>> +
>>> +    return 0;
>>> +
>>> +out:
>>> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
>>> +    virtio_blk_req_complete(req, err_status);
>>> +    virtio_blk_free_request(req);
>>> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
>>> +    return err_status;
>>> +}
>>> +
>>>  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>>>  {
>>>      uint32_t type;
>>> @@ -700,6 +969,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>>>      case VIRTIO_BLK_T_FLUSH:
>>>          virtio_blk_handle_flush(req, mrb);
>>>          break;
>>> +    case VIRTIO_BLK_T_ZONE_REPORT:
>>> +        virtio_blk_handle_zone_report(req);
>>> +        break;
>>> +    case VIRTIO_BLK_T_ZONE_OPEN:
>>> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
>>> +        break;
>>> +    case VIRTIO_BLK_T_ZONE_CLOSE:
>>> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
>>> +        break;
>>> +    case VIRTIO_BLK_T_ZONE_FINISH:
>>> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
>>> +        break;
>>> +    case VIRTIO_BLK_T_ZONE_RESET:
>>> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
>>> +        break;
>>> +    case VIRTIO_BLK_T_ZONE_RESET_ALL:
>>> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET_ALL);
>>> +        break;
>>>      case VIRTIO_BLK_T_SCSI_CMD:
>>>          virtio_blk_handle_scsi(req);
>>>          break;
>>> @@ -718,6 +1005,9 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>>>          virtio_blk_free_request(req);
>>>          break;
>>>      }
>>> +   case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
>>> +       virtio_blk_handle_zone_append(req);
>>> +       break;
>>>      /*
>>>       * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
>>>       * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
>>> @@ -917,6 +1207,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
>>>  {
>>>      VirtIOBlock *s = VIRTIO_BLK(vdev);
>>>      BlockConf *conf = &s->conf.conf;
>>> +    BlockDriverState *state = blk_bs(s->blk);
>>
>> Usually the variable is called "bs". Please use that name here and
>> elsewhere in the patch.
>>
>>>      struct virtio_blk_config blkcfg;
>>>      uint64_t capacity;
>>>      int64_t length;
>>> @@ -976,6 +1267,31 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
>>>          blkcfg.write_zeroes_may_unmap = 1;
>>>          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
>>>      }
>>> +#ifdef CONFIG_BLKZONED
>>
>> Is this ifdef appropriate? I think bs->bl.zoned should always be
>> available, even when <blkzoned.h> is missing (e.g. non-Linux system). In
>> the future there may be an emulated zoned BlockDriver. virtio-blk
>> should be able to use the emulated zoned BlockDriver even on non-Linux
>> systems.
>>
> Ok, you are right.
> 
>> I think CONFIG_BLKZONED should be limited to block/file-posix.c.
> 
> I'm not sure. There may be some places where virtio-blk needs this
> config. Like in transforming blk_zone_descriptor to
> virtio_blk_zone_descriptor, it needs to use Linux's constants to
> ensure nothing go wrong here.
> 
>>
>>> +    if (state->bl.zoned != BLK_Z_NONE) {
>>> +        switch (state->bl.zoned) {
>>> +        case BLK_Z_HM:
>>> +            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
>>> +            virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
> 
>>> +                         state->bl.zone_sectors);
>>> +            virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
>>> +                         state->bl.max_active_zones);
>>> +            virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
>>> +                         state->bl.max_open_zones);
>>> +            virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
>>> +            virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
>>> +                         state->bl.max_append_sectors);
>>> +            break;
>>> +        case BLK_Z_HA:
>>> +            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
>>
>> The block limits aren't relevant to host-aware zoned devices?
> 
> Yes, the HA devices are seen as non-zoned device and the driver just
> ignore all other fields in zoned.

This is incorrect. HA devices may be used as regular block devices (not zoned)
by the guest, but that is the guest decision to make. qemu handling of the
device should still have the proper zone information for the drive and be ready
to accept any zone operation for it.

> 
> +\item if the driver that can not support zoned devices reads
> +    VIRTIO_BLK_Z_HA from the \field{model} field of \field{zoned}, the driver
> +    MAY handle the device as a non-zoned device. In this case, the
> +    driver SHOULD ignore all other fields in \field{zoned}.
> +\end{itemize}

The driver here refers to the guest virtio driver. In the spec, qemu zone block
emulation/handling is called "the device". Confusing :)

> 
>>
>>> +            break;
>>> +        default:
>>> +            blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
>>> +            virtio_error(vdev, "Invalid zoned model %x \n", (int)state->bl.zoned);
>>> +            break;
>>> +        }
>>> +    }
>>> +#endif
>>>      memcpy(config, &blkcfg, s->config_size);
>>>  }
>>>
>>> @@ -995,6 +1311,7 @@ static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
>>>                                          Error **errp)
>>>  {
>>>      VirtIOBlock *s = VIRTIO_BLK(vdev);
>>> +    BlockDriverState *state = blk_bs(s->blk);
>>>
>>>      /* Firstly sync all virtio-blk possible supported features */
>>>      features |= s->host_features;
>>> @@ -1003,6 +1320,12 @@ static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
>>>      virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
>>>      virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
>>>      virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
>>> +    if (state->bl.zoned != BLK_Z_NONE) {
>>> +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
>>> +        if (state->bl.zoned == BLK_Z_HM) {
>>> +            virtio_clear_feature(&features, VIRTIO_BLK_F_DISCARD);
>>
>> Why is features modified here but s->host_features is modified in the
>> line above?
> 
> Because F_DISCARD should not be offered by the driver when the device
> offers F_ZONED with the HM model.
> 
>>
>>> +        }
>>> +    }
>>
>> This detects VIRTIO_BLK_F_ZONED based on the BlockDriverState...
> (This part can be removed.)
> 
>>
>>>      if (virtio_has_feature(features, VIRTIO_F_VERSION_1)) {
>>>          if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_SCSI)) {
>>>              error_setg(errp, "Please set scsi=off for virtio-blk devices in order to use virtio 1.0");
>>> @@ -1286,6 +1609,9 @@ static Property virtio_blk_properties[] = {
>>>  #ifdef __linux__
>>>      DEFINE_PROP_BIT64("scsi", VirtIOBlock, host_features,
>>>                        VIRTIO_BLK_F_SCSI, false),
>>> +#endif
>>> +#ifdef CONFIG_BLKZONED
>>> +    DEFINE_PROP_BIT64("zoned", VirtIOBlock, host_features, VIRTIO_BLK_F_ZONED, true),
>>>  #endif
>>
>> ...but this allows users to enable/disable the flag from the
>> command-line.
>>
>> I think DEFINE_PROP_BIT64() can be removed, but please check that the
>> config space size is still correct. It may be necessary to move the
>> bs->bl.zoned check into virtio_blk_device_realize() and set
>> the VIRTIO_BLK_F_ZONED s->host_features bit there instead. That will
>> allow the s->config_size calculation to work correctly.
> 
> Ok, it works. Thanks!
> 
> Additionally, the DEFINE_PROP_BIT here is to declare that the supports
> zones. Then the virtio-blk driver will not need to look at that
> feature. So the former part detecting the F_ZONED feature based on
> BlockDriverState can be removed. If removing this macro, then the
> virio-blk driver must set the F_ZONED feature by itself.
diff mbox series

Patch

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index e9ba752f6b..3ef74c01db 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -46,6 +46,8 @@  static const VirtIOFeature feature_sizes[] = {
      .end = endof(struct virtio_blk_config, discard_sector_alignment)},
     {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
      .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
+    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
+     .end = endof(struct virtio_blk_config, zoned)},
     {}
 };
 
@@ -614,6 +616,273 @@  err:
     return err_status;
 }
 
+typedef struct ZoneCmdData {
+    VirtIOBlockReq *req;
+    union {
+        struct {
+            unsigned int nr_zones;
+            BlockZoneDescriptor *zones;
+        } ZoneReportData;
+        struct {
+            int64_t append_sector;
+        } ZoneAppendData;
+    };
+} ZoneCmdData;
+
+/*
+ * check zone_model: error checking before issuing requests. If all checks
+ * passed, return true.
+ * append: true if only zone append request issued.
+ */
+static bool check_zone_model(VirtIOBlock *s, int64_t sector, int64_t nr_sector,
+                             bool append, uint8_t *status) {
+    BlockDriverState *bs = blk_bs(s->blk);
+    BlockZoneDescriptor *zone = &bs->bl.zones[sector / bs->bl.zone_sectors];
+    int64_t max_append_sector = bs->bl.max_append_sectors;
+
+    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
+        *status = VIRTIO_BLK_S_UNSUPP;
+        return false;
+    }
+
+    if (zone->cond == BLK_ZS_OFFLINE) {
+        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        return false;
+    }
+
+    if (append) {
+        if ((zone->type != BLK_ZT_SWR) || (zone->cond == BLK_ZS_RDONLY) ||
+            (sector + nr_sector > (*(zone + 1)).start)) {
+            /* the end sector of the request exceeds to next zone */
+            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            return false;
+        }
+
+        if (nr_sector > max_append_sector) {
+            if (max_append_sector == 0) {
+                *status = VIRTIO_BLK_S_UNSUPP;
+            } else {
+                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            }
+            return false;
+        }
+    }
+    return true;
+}
+
+static void virtio_blk_zone_report_complete(void *opaque, int ret)
+{
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+    struct iovec *in_iov = req->elem.in_sg;
+    unsigned in_num = req->elem.in_num;
+    int64_t zrp_size, nz, n, j = 0;
+    int8_t err_status = VIRTIO_BLK_S_OK;
+
+    nz = data->ZoneReportData.nr_zones;
+    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
+            .nr_zones = cpu_to_le64(nz),
+    };
+
+    zrp_size = sizeof(struct virtio_blk_zone_report)
+               + sizeof(struct virtio_blk_zone_descriptor) * nz;
+    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
+    if (n != sizeof(zrp_hdr)) {
+        virtio_error(vdev, "Driver provided intput buffer that is too small!");
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    for (size_t i = sizeof(zrp_hdr); i < zrp_size; i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
+        struct virtio_blk_zone_descriptor desc =
+                (struct virtio_blk_zone_descriptor) {
+                        .z_start = cpu_to_le64(data->ZoneReportData.zones[j].start),
+                        .z_cap = cpu_to_le64(data->ZoneReportData.zones[j].cap),
+                        .z_wp = cpu_to_le64(data->ZoneReportData.zones[j].wp),
+                        .z_type = data->ZoneReportData.zones[j].type,
+                        .z_state = data->ZoneReportData.zones[j].cond,
+                };
+        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
+        if (n != sizeof(desc)) {
+            virtio_error(vdev, "Driver provided input buffer "
+                               "for descriptors that is too small!");
+            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            goto out;
+        }
+    }
+    goto out;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data->ZoneReportData.zones);
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_report(VirtIOBlockReq *req) {
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    unsigned int nr_zones;
+    ZoneCmdData *data;
+    int64_t zone_size, offset;
+    uint8_t err_status;
+
+    if (req->in_len <= sizeof(struct virtio_blk_inhdr) +
+                       sizeof(struct virtio_blk_zone_report)) {
+        virtio_error(vdev, "in buffer too small for zone report");
+        return -1;
+    }
+
+    /* start byte offset of the zone report */
+    offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
+    if (!check_zone_model(s, offset / 512, 0, false, &err_status)) {
+        goto out;
+    }
+
+    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
+                sizeof(struct virtio_blk_zone_report)) /
+               sizeof(struct virtio_blk_zone_descriptor);
+
+    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
+    data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+    data->ZoneReportData.nr_zones = nr_zones;
+    data->ZoneReportData.zones = g_malloc(zone_size),
+
+    blk_aio_zone_report(s->blk, offset, &data->ZoneReportData.nr_zones,
+                        data->ZoneReportData.zones,
+                        virtio_blk_zone_report_complete, data);
+    return 0;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
+static void virtio_blk_zone_mgmt_complete(void *opaque, int ret) {
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op) {
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    BlockDriverState *bs = blk_bs(s->blk);
+    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
+    uint64_t len;
+    uint32_t type;
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (!check_zone_model(s, offset / 512, 0, false, &err_status)) {
+        goto out;
+    }
+
+    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+
+    type = virtio_ldl_p(vdev, &req->out.type);
+    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
+        /* Entire drive capacity */
+        offset = 0;
+        blk_get_geometry(s->blk, &len);
+        len *= 512;
+    } else {
+        len = bs->bl.zone_sectors * 512;
+    }
+
+    blk_aio_zone_mgmt(s->blk, op, offset, len,
+                      virtio_blk_zone_mgmt_complete, data);
+
+    return 0;
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
+static void virtio_blk_zone_append_complete(void *opaque, int ret) {
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+    int64_t append_sector, n;
+    struct iovec *out_iov = req->elem.out_sg;
+    unsigned out_num = req->elem.out_num;
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    append_sector = data->ZoneAppendData.append_sector;
+    int write_granularity = s->conf.conf.logical_block_size;
+    if ((append_sector * 512 % write_granularity) != 0) {
+        err_status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
+        goto out;
+    }
+    n = iov_to_buf(out_iov, out_num, 0, &append_sector, sizeof(append_sector));
+    if (n != sizeof(append_sector)) {
+        virtio_error(vdev, "Driver provided input buffer less than size of "
+                     "append_sector");
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+    goto out;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_append(VirtIOBlockReq *req) {
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    uint64_t niov = req->elem.out_num;
+    struct iovec *out_iov = req->elem.out_sg;
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) * 512;
+    int64_t len = 0;
+    for (int i = 1; i < niov; ++i) {
+        len += out_iov[i].iov_len;
+    }
+
+    if (!check_zone_model(s, offset / 512, len / 512, true, &err_status)) {
+        goto out;
+    }
+
+    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+    data->ZoneAppendData.append_sector = offset;
+    qemu_iovec_init_external(&req->qiov, &out_iov[1], niov-1);
+    blk_aio_zone_append(s->blk, &data->ZoneAppendData.append_sector, &req->qiov, 0,
+                        virtio_blk_zone_append_complete, data);
+
+    return 0;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
 static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 {
     uint32_t type;
@@ -700,6 +969,24 @@  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
     case VIRTIO_BLK_T_FLUSH:
         virtio_blk_handle_flush(req, mrb);
         break;
+    case VIRTIO_BLK_T_ZONE_REPORT:
+        virtio_blk_handle_zone_report(req);
+        break;
+    case VIRTIO_BLK_T_ZONE_OPEN:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
+        break;
+    case VIRTIO_BLK_T_ZONE_CLOSE:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
+        break;
+    case VIRTIO_BLK_T_ZONE_FINISH:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
+        break;
+    case VIRTIO_BLK_T_ZONE_RESET:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
+        break;
+    case VIRTIO_BLK_T_ZONE_RESET_ALL:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET_ALL);
+        break;
     case VIRTIO_BLK_T_SCSI_CMD:
         virtio_blk_handle_scsi(req);
         break;
@@ -718,6 +1005,9 @@  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         virtio_blk_free_request(req);
         break;
     }
+   case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
+       virtio_blk_handle_zone_append(req);
+       break;
     /*
      * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
      * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
@@ -917,6 +1207,7 @@  static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
     BlockConf *conf = &s->conf.conf;
+    BlockDriverState *state = blk_bs(s->blk);
     struct virtio_blk_config blkcfg;
     uint64_t capacity;
     int64_t length;
@@ -976,6 +1267,31 @@  static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
         blkcfg.write_zeroes_may_unmap = 1;
         virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
     }
+#ifdef CONFIG_BLKZONED
+    if (state->bl.zoned != BLK_Z_NONE) {
+        switch (state->bl.zoned) {
+        case BLK_Z_HM:
+            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
+            virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
+                         state->bl.zone_sectors);
+            virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
+                         state->bl.max_active_zones);
+            virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
+                         state->bl.max_open_zones);
+            virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
+            virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
+                         state->bl.max_append_sectors);
+            break;
+        case BLK_Z_HA:
+            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
+            break;
+        default:
+            blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
+            virtio_error(vdev, "Invalid zoned model %x \n", (int)state->bl.zoned);
+            break;
+        }
+    }
+#endif
     memcpy(config, &blkcfg, s->config_size);
 }
 
@@ -995,6 +1311,7 @@  static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
                                         Error **errp)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
+    BlockDriverState *state = blk_bs(s->blk);
 
     /* Firstly sync all virtio-blk possible supported features */
     features |= s->host_features;
@@ -1003,6 +1320,12 @@  static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
     virtio_add_feature(&features, VIRTIO_BLK_F_GEOMETRY);
     virtio_add_feature(&features, VIRTIO_BLK_F_TOPOLOGY);
     virtio_add_feature(&features, VIRTIO_BLK_F_BLK_SIZE);
+    if (state->bl.zoned != BLK_Z_NONE) {
+        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
+        if (state->bl.zoned == BLK_Z_HM) {
+            virtio_clear_feature(&features, VIRTIO_BLK_F_DISCARD);
+        }
+    }
     if (virtio_has_feature(features, VIRTIO_F_VERSION_1)) {
         if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_SCSI)) {
             error_setg(errp, "Please set scsi=off for virtio-blk devices in order to use virtio 1.0");
@@ -1286,6 +1609,9 @@  static Property virtio_blk_properties[] = {
 #ifdef __linux__
     DEFINE_PROP_BIT64("scsi", VirtIOBlock, host_features,
                       VIRTIO_BLK_F_SCSI, false),
+#endif
+#ifdef CONFIG_BLKZONED
+    DEFINE_PROP_BIT64("zoned", VirtIOBlock, host_features, VIRTIO_BLK_F_ZONED, true),
 #endif
     DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
                     true),