diff mbox series

[RFC,v4,2/3] virtio-blk: add zoned storage emulation for zoned devices

Message ID 20221030093242.208839-3-faithilikerun@gmail.com
State New
Headers show
Series Add zoned storage emulation to virtio-blk driver | expand

Commit Message

Sam Li Oct. 30, 2022, 9:32 a.m. UTC
This patch extends virtio-blk emulation to handle zoned device commands
by calling the new block layer APIs to perform zoned device I/O on
behalf of the guest. It supports Report Zone, four zone oparations (open,
close, finish, reset), and Append Zone.

The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
support zoned block devices. Regular block devices(conventional zones)
will not be set.

The guest os can use blktests, fio to test those commands on zoned devices.
Furthermore, using zonefs to test zone append write is also supported.

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 hw/block/virtio-blk-common.c |   2 +
 hw/block/virtio-blk.c        | 387 +++++++++++++++++++++++++++++++++++
 2 files changed, 389 insertions(+)

Comments

Stefan Hajnoczi Nov. 2, 2022, 8:58 p.m. UTC | #1
On Sun, Oct 30, 2022 at 05:32:41AM -0400, Sam Li wrote:
> This patch extends virtio-blk emulation to handle zoned device commands
> by calling the new block layer APIs to perform zoned device I/O on
> behalf of the guest. It supports Report Zone, four zone oparations (open,
> close, finish, reset), and Append Zone.
> 
> The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> support zoned block devices. Regular block devices(conventional zones)
> will not be set.
> 
> The guest os can use blktests, fio to test those commands on zoned devices.
> Furthermore, using zonefs to test zone append write is also supported.
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  hw/block/virtio-blk-common.c |   2 +
>  hw/block/virtio-blk.c        | 387 +++++++++++++++++++++++++++++++++++
>  2 files changed, 389 insertions(+)
> 
> diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
> index ac52d7c176..e2f8e2f6da 100644
> --- a/hw/block/virtio-blk-common.c
> +++ b/hw/block/virtio-blk-common.c
> @@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
>       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
>      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
>       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> +     .end = endof(struct virtio_blk_config, zoned)},
>      {}
>  };
>  
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index 8131ec2dbc..4f3625840a 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -26,6 +26,9 @@
>  #include "hw/virtio/virtio-blk.h"
>  #include "dataplane/virtio-blk.h"
>  #include "scsi/constants.h"
> +#if defined(CONFIG_BLKZONED)
> +#include <linux/blkzoned.h>
> +#endif

virtio-blk.c should use QEMU block layer APIs instead of Linux-specific
APIs. Can this include be removed?

>  #ifdef __linux__
>  # include <scsi/sg.h>
>  #endif
> @@ -592,6 +595,332 @@ err:
>      return err_status;
>  }
>  
> +typedef struct ZoneCmdData {
> +    VirtIOBlockReq *req;
> +    union {
> +        struct {
> +            unsigned int nr_zones;
> +            BlockZoneDescriptor *zones;
> +        } zone_report_data;
> +        struct {
> +            int64_t offset;
> +        } zone_append_data;
> +    };
> +} ZoneCmdData;
> +
> +/*
> + * check zoned_request: error checking before issuing requests. If all checks
> + * passed, return true.
> + * append: true if only zone append requests issued.
> + */
> +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
> +                             bool append, uint8_t *status) {
> +    BlockDriverState *bs = blk_bs(s->blk);
> +    int index = offset / bs->bl.zone_size;
> +
> +    if (offset < 0 || len < 0 || offset > (bs->total_sectors << BDRV_SECTOR_BITS) - len) {

Missing len > (bs->total_sectors << BDRV_SECTOR_BITS) check before
subtracting len.

> +        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        return false;
> +    }
> +
> +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> +        *status = VIRTIO_BLK_S_UNSUPP;
> +        return false;
> +    }
> +
> +    if (append) {
> +        if ((offset % bs->bl.write_granularity) != 0) {
> +            *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> +            return false;
> +        }
> +
> +        if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) {
> +            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            return false;
> +        }
> +
> +        if (len / 512 > bs->bl.max_append_sectors) {
> +            if (bs->bl.max_append_sectors == 0) {
> +                *status = VIRTIO_BLK_S_UNSUPP;
> +            } else {
> +                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +            }
> +            return false;
> +        }
> +    }
> +    return true;
> +}
> +
> +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> +{
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> +    struct iovec *in_iov = req->elem.in_sg;
> +    unsigned in_num = req->elem.in_num;
> +    int64_t zrp_size, n, j = 0;
> +    int64_t nz = data->zone_report_data.nr_zones;
> +    int8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    if (ret) {
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
> +            .nr_zones = cpu_to_le64(nz),
> +    };
> +    zrp_size = sizeof(struct virtio_blk_zone_report)
> +               + sizeof(struct virtio_blk_zone_descriptor) * nz;
> +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
> +    if (n != sizeof(zrp_hdr)) {
> +        virtio_error(vdev, "Driver provided intput buffer that is too small!");
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +    for (size_t i = sizeof(zrp_hdr); i < zrp_size;
> +        i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
> +        struct virtio_blk_zone_descriptor desc =
> +            (struct virtio_blk_zone_descriptor) {
> +                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start
> +                    >> BDRV_SECTOR_BITS),
> +                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
> +                    >> BDRV_SECTOR_BITS),
> +                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
> +                    >> BDRV_SECTOR_BITS),
> +        };
> +
> +        switch (data->zone_report_data.zones[j].type) {
> +        case BLK_ZT_CONV:
> +            desc.z_type = VIRTIO_BLK_ZT_CONV;
> +            break;
> +        case BLK_ZT_SWR:
> +            desc.z_type = VIRTIO_BLK_ZT_SWR;
> +            break;
> +        case BLK_ZT_SWP:
> +            desc.z_type = VIRTIO_BLK_ZT_SWP;
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }
> +
> +        switch (data->zone_report_data.zones[j].state) {
> +        case BLK_ZS_RDONLY:
> +            desc.z_state = VIRTIO_BLK_ZS_RDONLY;
> +            break;
> +        case BLK_ZS_OFFLINE:
> +            desc.z_state = VIRTIO_BLK_ZS_OFFLINE;
> +            break;
> +        case BLK_ZS_EMPTY:
> +            desc.z_state = VIRTIO_BLK_ZS_EMPTY;
> +            break;
> +        case BLK_ZS_CLOSED:
> +            desc.z_state = VIRTIO_BLK_ZS_CLOSED;
> +            break;
> +        case BLK_ZS_FULL:
> +            desc.z_state = VIRTIO_BLK_ZS_FULL;
> +            break;
> +        case BLK_ZS_EOPEN:
> +            desc.z_state = VIRTIO_BLK_ZS_EOPEN;
> +            break;
> +        case BLK_ZS_IOPEN:
> +            desc.z_state = VIRTIO_BLK_ZS_IOPEN;
> +            break;
> +        case BLK_ZS_NOT_WP:
> +            desc.z_state = VIRTIO_BLK_ZS_NOT_WP;
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }
> +
> +        /* TODO: it takes O(n^2) time complexity. Optimizations required. */
> +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
> +        if (n != sizeof(desc)) {
> +            virtio_error(vdev, "Driver provided input buffer "
> +                               "for descriptors that is too small!");
> +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        }
> +    }
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    g_free(data->zone_report_data.zones);
> +    g_free(data);
> +}
> +
> +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req)
> +{
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    unsigned int nr_zones;
> +    ZoneCmdData *data;
> +    int64_t zone_size, offset;
> +    uint8_t err_status;
> +
> +    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
> +            sizeof(struct virtio_blk_zone_report) +
> +            sizeof(struct virtio_blk_zone_descriptor)) {
> +        virtio_error(vdev, "in buffer too small for zone report");
> +        return -1;
> +    }
> +
> +    /* start byte offset of the zone report */
> +    offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
> +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> +        goto out;
> +    }
> +    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
> +                sizeof(struct virtio_blk_zone_report)) /
> +               sizeof(struct virtio_blk_zone_descriptor);
> +
> +    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
> +    data = g_malloc(sizeof(ZoneCmdData));
> +    data->req = req;
> +    data->zone_report_data.nr_zones = nr_zones;
> +    data->zone_report_data.zones = g_malloc(zone_size),
> +
> +    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
> +                        data->zone_report_data.zones,
> +                        virtio_blk_zone_report_complete, data);
> +    return 0;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
> +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
> +{
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;
> +    int8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    if (ret) {
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +    }
> +
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    g_free(data);
> +}
> +
> +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
> +{
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    BlockDriverState *bs = blk_bs(s->blk);
> +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
> +    uint64_t len;
> +    uint64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> +    uint8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    uint32_t type = virtio_ldl_p(vdev, &req->out.type);
> +    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
> +        /* Entire drive capacity */
> +        offset = 0;
> +        len = capacity;
> +    } else {
> +        if (bs->bl.zone_size > capacity - offset) {
> +            /* The zoned device allows the last smaller zone. */
> +            len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
> +        } else {
> +            len = bs->bl.zone_size;
> +        }
> +    }
> +
> +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {

Why is the len argument 0? It wouldn't hurt to have the [offset,
offset+len) range checked.

> +        goto out;
> +    }
> +
> +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> +    data->req = req;

Is ZoneCmdData needed? If req is passed as the argument to
virtio_blk_zone_mgmt_complete() then that should be enough (and g_free()
isn't required).

> +    blk_aio_zone_mgmt(s->blk, op, offset, len,
> +                      virtio_blk_zone_mgmt_complete, data);
> +
> +    return 0;
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
> +static void virtio_blk_zone_append_complete(void *opaque, int ret)
> +{
> +    ZoneCmdData *data = opaque;
> +    VirtIOBlockReq *req = data->req;
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> +    int64_t append_sector, n;
> +    struct iovec *out_iov = req->elem.out_sg;
> +    unsigned out_num = req->elem.out_num;
> +    uint8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    if (ret) {
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +    virtio_stl_p(vdev, &append_sector,
> +                 data->zone_append_data.offset >> BDRV_SECTOR_BITS);
> +    n = iov_to_buf(out_iov, out_num, 0, &append_sector, sizeof(append_sector));

The append sector must be written to the first 8 bytes of
req->elem.in_sg/req->elem.in_num.

> +    if (n != sizeof(append_sector)) {
> +        virtio_error(vdev, "Driver provided input buffer less than size of "
> +                           "append_sector");
> +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> +        goto out;
> +    }
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    g_free(data);
> +}
> +
> +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
> +                                         struct iovec *out_iov,
> +                                         uint64_t niov) {
> +    VirtIOBlock *s = req->dev;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +    uint8_t err_status = VIRTIO_BLK_S_OK;
> +
> +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
> +    int64_t len = iov_size(out_iov, niov);
> +
> +    if (!check_zoned_request(s, offset, len, true, &err_status)) {
> +        goto out;
> +    }
> +
> +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> +    data->req = req;
> +    data->zone_append_data.offset = offset;
> +    qemu_iovec_init_external(&req->qiov, out_iov, niov);
> +    blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
> +                        virtio_blk_zone_append_complete, data);
> +    return 0;
> +
> +out:
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    virtio_blk_req_complete(req, err_status);
> +    virtio_blk_free_request(req);
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +    return err_status;
> +}
> +
>  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>  {
>      uint32_t type;
> @@ -678,6 +1007,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>      case VIRTIO_BLK_T_FLUSH:
>          virtio_blk_handle_flush(req, mrb);
>          break;
> +    case VIRTIO_BLK_T_ZONE_REPORT:
> +        virtio_blk_handle_zone_report(req);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_OPEN:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_CLOSE:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_FINISH:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_RESET:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> +        break;
> +    case VIRTIO_BLK_T_ZONE_RESET_ALL:
> +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> +        break;
>      case VIRTIO_BLK_T_SCSI_CMD:
>          virtio_blk_handle_scsi(req);
>          break;
> @@ -696,6 +1043,13 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>          virtio_blk_free_request(req);
>          break;
>      }
> +    case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
> +        /*
> +         * It is not safe to access req->elem.out_sg directly because it
> +         * may be modified by virtio_blk_handle_request().
> +         */
> +        virtio_blk_handle_zone_append(req, out_iov, out_num);
> +        break;
>      /*
>       * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
>       * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
> @@ -895,6 +1249,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
>  {
>      VirtIOBlock *s = VIRTIO_BLK(vdev);
>      BlockConf *conf = &s->conf.conf;
> +    BlockDriverState *bs = blk_bs(s->blk);
>      struct virtio_blk_config blkcfg;
>      uint64_t capacity;
>      int64_t length;
> @@ -954,6 +1309,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
>          blkcfg.write_zeroes_may_unmap = 1;
>          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
>      }
> +    if (bs->bl.zoned != BLK_Z_NONE) {
> +        switch (bs->bl.zoned) {
> +        case BLK_Z_HM:
> +            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
> +            break;
> +        case BLK_Z_HA:
> +            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
> +            break;
> +        default:
> +            g_assert_not_reached();
> +        }
> +
> +        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
> +                     bs->bl.zone_size / 512);
> +        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
> +                     bs->bl.max_active_zones);
> +        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
> +                     bs->bl.max_open_zones);
> +        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
> +        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
> +                     bs->bl.max_append_sectors);
> +    } else {
> +        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
> +    }
>      memcpy(config, &blkcfg, s->config_size);
>  }
>  
> @@ -1118,6 +1497,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>      VirtIOBlock *s = VIRTIO_BLK(dev);
>      VirtIOBlkConf *conf = &s->conf;
> +    BlockDriverState *bs = blk_bs(conf->conf.blk);
>      Error *err = NULL;
>      unsigned i;
>  
> @@ -1163,6 +1543,13 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>          return;
>      }
>  
> +    if (bs->bl.zoned != BLK_Z_NONE) {
> +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
> +        if (bs->bl.zoned == BLK_Z_HM) {
> +            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
> +        }
> +    }
> +
>      if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
>          (!conf->max_discard_sectors ||
>           conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
> -- 
> 2.38.1
>
Sam Li Nov. 4, 2022, 1:22 a.m. UTC | #2
Stefan Hajnoczi <stefanha@redhat.com> 于2022年11月3日周四 04:58写道:
>
> On Sun, Oct 30, 2022 at 05:32:41AM -0400, Sam Li wrote:
> > This patch extends virtio-blk emulation to handle zoned device commands
> > by calling the new block layer APIs to perform zoned device I/O on
> > behalf of the guest. It supports Report Zone, four zone oparations (open,
> > close, finish, reset), and Append Zone.
> >
> > The VIRTIO_BLK_F_ZONED feature bit will only be set if the host does
> > support zoned block devices. Regular block devices(conventional zones)
> > will not be set.
> >
> > The guest os can use blktests, fio to test those commands on zoned devices.
> > Furthermore, using zonefs to test zone append write is also supported.
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >  hw/block/virtio-blk-common.c |   2 +
> >  hw/block/virtio-blk.c        | 387 +++++++++++++++++++++++++++++++++++
> >  2 files changed, 389 insertions(+)
> >
> > diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
> > index ac52d7c176..e2f8e2f6da 100644
> > --- a/hw/block/virtio-blk-common.c
> > +++ b/hw/block/virtio-blk-common.c
> > @@ -29,6 +29,8 @@ static const VirtIOFeature feature_sizes[] = {
> >       .end = endof(struct virtio_blk_config, discard_sector_alignment)},
> >      {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
> >       .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
> > +    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
> > +     .end = endof(struct virtio_blk_config, zoned)},
> >      {}
> >  };
> >
> > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> > index 8131ec2dbc..4f3625840a 100644
> > --- a/hw/block/virtio-blk.c
> > +++ b/hw/block/virtio-blk.c
> > @@ -26,6 +26,9 @@
> >  #include "hw/virtio/virtio-blk.h"
> >  #include "dataplane/virtio-blk.h"
> >  #include "scsi/constants.h"
> > +#if defined(CONFIG_BLKZONED)
> > +#include <linux/blkzoned.h>
> > +#endif
>
> virtio-blk.c should use QEMU block layer APIs instead of Linux-specific
> APIs. Can this include be removed?

Yes, missed removing it.

>
> >  #ifdef __linux__
> >  # include <scsi/sg.h>
> >  #endif
> > @@ -592,6 +595,332 @@ err:
> >      return err_status;
> >  }
> >
> > +typedef struct ZoneCmdData {
> > +    VirtIOBlockReq *req;
> > +    union {
> > +        struct {
> > +            unsigned int nr_zones;
> > +            BlockZoneDescriptor *zones;
> > +        } zone_report_data;
> > +        struct {
> > +            int64_t offset;
> > +        } zone_append_data;
> > +    };
> > +} ZoneCmdData;
> > +
> > +/*
> > + * check zoned_request: error checking before issuing requests. If all checks
> > + * passed, return true.
> > + * append: true if only zone append requests issued.
> > + */
> > +static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
> > +                             bool append, uint8_t *status) {
> > +    BlockDriverState *bs = blk_bs(s->blk);
> > +    int index = offset / bs->bl.zone_size;
> > +
> > +    if (offset < 0 || len < 0 || offset > (bs->total_sectors << BDRV_SECTOR_BITS) - len) {
>
> Missing len > (bs->total_sectors << BDRV_SECTOR_BITS) check before
> subtracting len.
>
> > +        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        return false;
> > +    }
> > +
> > +    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
> > +        *status = VIRTIO_BLK_S_UNSUPP;
> > +        return false;
> > +    }
> > +
> > +    if (append) {
> > +        if ((offset % bs->bl.write_granularity) != 0) {
> > +            *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
> > +            return false;
> > +        }
> > +
> > +        if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) {
> > +            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +            return false;
> > +        }
> > +
> > +        if (len / 512 > bs->bl.max_append_sectors) {
> > +            if (bs->bl.max_append_sectors == 0) {
> > +                *status = VIRTIO_BLK_S_UNSUPP;
> > +            } else {
> > +                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +            }
> > +            return false;
> > +        }
> > +    }
> > +    return true;
> > +}
> > +
> > +static void virtio_blk_zone_report_complete(void *opaque, int ret)
> > +{
> > +    ZoneCmdData *data = opaque;
> > +    VirtIOBlockReq *req = data->req;
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> > +    struct iovec *in_iov = req->elem.in_sg;
> > +    unsigned in_num = req->elem.in_num;
> > +    int64_t zrp_size, n, j = 0;
> > +    int64_t nz = data->zone_report_data.nr_zones;
> > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    if (ret) {
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
> > +            .nr_zones = cpu_to_le64(nz),
> > +    };
> > +    zrp_size = sizeof(struct virtio_blk_zone_report)
> > +               + sizeof(struct virtio_blk_zone_descriptor) * nz;
> > +    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
> > +    if (n != sizeof(zrp_hdr)) {
> > +        virtio_error(vdev, "Driver provided intput buffer that is too small!");
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +    for (size_t i = sizeof(zrp_hdr); i < zrp_size;
> > +        i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
> > +        struct virtio_blk_zone_descriptor desc =
> > +            (struct virtio_blk_zone_descriptor) {
> > +                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start
> > +                    >> BDRV_SECTOR_BITS),
> > +                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
> > +                    >> BDRV_SECTOR_BITS),
> > +                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
> > +                    >> BDRV_SECTOR_BITS),
> > +        };
> > +
> > +        switch (data->zone_report_data.zones[j].type) {
> > +        case BLK_ZT_CONV:
> > +            desc.z_type = VIRTIO_BLK_ZT_CONV;
> > +            break;
> > +        case BLK_ZT_SWR:
> > +            desc.z_type = VIRTIO_BLK_ZT_SWR;
> > +            break;
> > +        case BLK_ZT_SWP:
> > +            desc.z_type = VIRTIO_BLK_ZT_SWP;
> > +            break;
> > +        default:
> > +            g_assert_not_reached();
> > +        }
> > +
> > +        switch (data->zone_report_data.zones[j].state) {
> > +        case BLK_ZS_RDONLY:
> > +            desc.z_state = VIRTIO_BLK_ZS_RDONLY;
> > +            break;
> > +        case BLK_ZS_OFFLINE:
> > +            desc.z_state = VIRTIO_BLK_ZS_OFFLINE;
> > +            break;
> > +        case BLK_ZS_EMPTY:
> > +            desc.z_state = VIRTIO_BLK_ZS_EMPTY;
> > +            break;
> > +        case BLK_ZS_CLOSED:
> > +            desc.z_state = VIRTIO_BLK_ZS_CLOSED;
> > +            break;
> > +        case BLK_ZS_FULL:
> > +            desc.z_state = VIRTIO_BLK_ZS_FULL;
> > +            break;
> > +        case BLK_ZS_EOPEN:
> > +            desc.z_state = VIRTIO_BLK_ZS_EOPEN;
> > +            break;
> > +        case BLK_ZS_IOPEN:
> > +            desc.z_state = VIRTIO_BLK_ZS_IOPEN;
> > +            break;
> > +        case BLK_ZS_NOT_WP:
> > +            desc.z_state = VIRTIO_BLK_ZS_NOT_WP;
> > +            break;
> > +        default:
> > +            g_assert_not_reached();
> > +        }
> > +
> > +        /* TODO: it takes O(n^2) time complexity. Optimizations required. */
> > +        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
> > +        if (n != sizeof(desc)) {
> > +            virtio_error(vdev, "Driver provided input buffer "
> > +                               "for descriptors that is too small!");
> > +            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        }
> > +    }
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    g_free(data->zone_report_data.zones);
> > +    g_free(data);
> > +}
> > +
> > +static int virtio_blk_handle_zone_report(VirtIOBlockReq *req)
> > +{
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    unsigned int nr_zones;
> > +    ZoneCmdData *data;
> > +    int64_t zone_size, offset;
> > +    uint8_t err_status;
> > +
> > +    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
> > +            sizeof(struct virtio_blk_zone_report) +
> > +            sizeof(struct virtio_blk_zone_descriptor)) {
> > +        virtio_error(vdev, "in buffer too small for zone report");
> > +        return -1;
> > +    }
> > +
> > +    /* start byte offset of the zone report */
> > +    offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
> > +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
> > +        goto out;
> > +    }
> > +    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
> > +                sizeof(struct virtio_blk_zone_report)) /
> > +               sizeof(struct virtio_blk_zone_descriptor);
> > +
> > +    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
> > +    data = g_malloc(sizeof(ZoneCmdData));
> > +    data->req = req;
> > +    data->zone_report_data.nr_zones = nr_zones;
> > +    data->zone_report_data.zones = g_malloc(zone_size),
> > +
> > +    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
> > +                        data->zone_report_data.zones,
> > +                        virtio_blk_zone_report_complete, data);
> > +    return 0;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> > +static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
> > +{
> > +    ZoneCmdData *data = opaque;
> > +    VirtIOBlockReq *req = data->req;
> > +    VirtIOBlock *s = req->dev;
> > +    int8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    if (ret) {
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +    }
> > +
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    g_free(data);
> > +}
> > +
> > +static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
> > +{
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    BlockDriverState *bs = blk_bs(s->blk);
> > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
> > +    uint64_t len;
> > +    uint64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    uint32_t type = virtio_ldl_p(vdev, &req->out.type);
> > +    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
> > +        /* Entire drive capacity */
> > +        offset = 0;
> > +        len = capacity;
> > +    } else {
> > +        if (bs->bl.zone_size > capacity - offset) {
> > +            /* The zoned device allows the last smaller zone. */
> > +            len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
> > +        } else {
> > +            len = bs->bl.zone_size;
> > +        }
> > +    }
> > +
> > +    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
>
> Why is the len argument 0? It wouldn't hurt to have the [offset,
> offset+len) range checked.
>
> > +        goto out;
> > +    }
> > +
> > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > +    data->req = req;
>
> Is ZoneCmdData needed? If req is passed as the argument to
> virtio_blk_zone_mgmt_complete() then that should be enough (and g_free()
> isn't required).

No, it can be dropped in this case.

>
> > +    blk_aio_zone_mgmt(s->blk, op, offset, len,
> > +                      virtio_blk_zone_mgmt_complete, data);
> > +
> > +    return 0;
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> > +static void virtio_blk_zone_append_complete(void *opaque, int ret)
> > +{
> > +    ZoneCmdData *data = opaque;
> > +    VirtIOBlockReq *req = data->req;
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
> > +    int64_t append_sector, n;
> > +    struct iovec *out_iov = req->elem.out_sg;
> > +    unsigned out_num = req->elem.out_num;
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    if (ret) {
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +    virtio_stl_p(vdev, &append_sector,
> > +                 data->zone_append_data.offset >> BDRV_SECTOR_BITS);
> > +    n = iov_to_buf(out_iov, out_num, 0, &append_sector, sizeof(append_sector));
>
> The append sector must be written to the first 8 bytes of
> req->elem.in_sg/req->elem.in_num.

Thanks for the reminder!

>
> > +    if (n != sizeof(append_sector)) {
> > +        virtio_error(vdev, "Driver provided input buffer less than size of "
> > +                           "append_sector");
> > +        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
> > +        goto out;
> > +    }
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    g_free(data);
> > +}
> > +
> > +static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
> > +                                         struct iovec *out_iov,
> > +                                         uint64_t niov) {
> > +    VirtIOBlock *s = req->dev;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +    uint8_t err_status = VIRTIO_BLK_S_OK;
> > +
> > +    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
> > +    int64_t len = iov_size(out_iov, niov);
> > +
> > +    if (!check_zoned_request(s, offset, len, true, &err_status)) {
> > +        goto out;
> > +    }
> > +
> > +    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
> > +    data->req = req;
> > +    data->zone_append_data.offset = offset;
> > +    qemu_iovec_init_external(&req->qiov, out_iov, niov);
> > +    blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
> > +                        virtio_blk_zone_append_complete, data);
> > +    return 0;
> > +
> > +out:
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    virtio_blk_req_complete(req, err_status);
> > +    virtio_blk_free_request(req);
> > +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> > +    return err_status;
> > +}
> > +
> >  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> >  {
> >      uint32_t type;
> > @@ -678,6 +1007,24 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> >      case VIRTIO_BLK_T_FLUSH:
> >          virtio_blk_handle_flush(req, mrb);
> >          break;
> > +    case VIRTIO_BLK_T_ZONE_REPORT:
> > +        virtio_blk_handle_zone_report(req);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_OPEN:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_CLOSE:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_FINISH:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_RESET:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> > +        break;
> > +    case VIRTIO_BLK_T_ZONE_RESET_ALL:
> > +        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
> > +        break;
> >      case VIRTIO_BLK_T_SCSI_CMD:
> >          virtio_blk_handle_scsi(req);
> >          break;
> > @@ -696,6 +1043,13 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> >          virtio_blk_free_request(req);
> >          break;
> >      }
> > +    case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
> > +        /*
> > +         * It is not safe to access req->elem.out_sg directly because it
> > +         * may be modified by virtio_blk_handle_request().
> > +         */
> > +        virtio_blk_handle_zone_append(req, out_iov, out_num);
> > +        break;
> >      /*
> >       * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
> >       * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
> > @@ -895,6 +1249,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> >  {
> >      VirtIOBlock *s = VIRTIO_BLK(vdev);
> >      BlockConf *conf = &s->conf.conf;
> > +    BlockDriverState *bs = blk_bs(s->blk);
> >      struct virtio_blk_config blkcfg;
> >      uint64_t capacity;
> >      int64_t length;
> > @@ -954,6 +1309,30 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> >          blkcfg.write_zeroes_may_unmap = 1;
> >          virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
> >      }
> > +    if (bs->bl.zoned != BLK_Z_NONE) {
> > +        switch (bs->bl.zoned) {
> > +        case BLK_Z_HM:
> > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
> > +            break;
> > +        case BLK_Z_HA:
> > +            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
> > +            break;
> > +        default:
> > +            g_assert_not_reached();
> > +        }
> > +
> > +        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
> > +                     bs->bl.zone_size / 512);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
> > +                     bs->bl.max_active_zones);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
> > +                     bs->bl.max_open_zones);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
> > +        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
> > +                     bs->bl.max_append_sectors);
> > +    } else {
> > +        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
> > +    }
> >      memcpy(config, &blkcfg, s->config_size);
> >  }
> >
> > @@ -1118,6 +1497,7 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
> >      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> >      VirtIOBlock *s = VIRTIO_BLK(dev);
> >      VirtIOBlkConf *conf = &s->conf;
> > +    BlockDriverState *bs = blk_bs(conf->conf.blk);
> >      Error *err = NULL;
> >      unsigned i;
> >
> > @@ -1163,6 +1543,13 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
> >          return;
> >      }
> >
> > +    if (bs->bl.zoned != BLK_Z_NONE) {
> > +        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
> > +        if (bs->bl.zoned == BLK_Z_HM) {
> > +            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
> > +        }
> > +    }
> > +
> >      if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
> >          (!conf->max_discard_sectors ||
> >           conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {
> > --
> > 2.38.1
> >
diff mbox series

Patch

diff --git a/hw/block/virtio-blk-common.c b/hw/block/virtio-blk-common.c
index ac52d7c176..e2f8e2f6da 100644
--- a/hw/block/virtio-blk-common.c
+++ b/hw/block/virtio-blk-common.c
@@ -29,6 +29,8 @@  static const VirtIOFeature feature_sizes[] = {
      .end = endof(struct virtio_blk_config, discard_sector_alignment)},
     {.flags = 1ULL << VIRTIO_BLK_F_WRITE_ZEROES,
      .end = endof(struct virtio_blk_config, write_zeroes_may_unmap)},
+    {.flags = 1ULL << VIRTIO_BLK_F_ZONED,
+     .end = endof(struct virtio_blk_config, zoned)},
     {}
 };
 
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 8131ec2dbc..4f3625840a 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -26,6 +26,9 @@ 
 #include "hw/virtio/virtio-blk.h"
 #include "dataplane/virtio-blk.h"
 #include "scsi/constants.h"
+#if defined(CONFIG_BLKZONED)
+#include <linux/blkzoned.h>
+#endif
 #ifdef __linux__
 # include <scsi/sg.h>
 #endif
@@ -592,6 +595,332 @@  err:
     return err_status;
 }
 
+typedef struct ZoneCmdData {
+    VirtIOBlockReq *req;
+    union {
+        struct {
+            unsigned int nr_zones;
+            BlockZoneDescriptor *zones;
+        } zone_report_data;
+        struct {
+            int64_t offset;
+        } zone_append_data;
+    };
+} ZoneCmdData;
+
+/*
+ * check zoned_request: error checking before issuing requests. If all checks
+ * passed, return true.
+ * append: true if only zone append requests issued.
+ */
+static bool check_zoned_request(VirtIOBlock *s, int64_t offset, int64_t len,
+                             bool append, uint8_t *status) {
+    BlockDriverState *bs = blk_bs(s->blk);
+    int index = offset / bs->bl.zone_size;
+
+    if (offset < 0 || len < 0 || offset > (bs->total_sectors << BDRV_SECTOR_BITS) - len) {
+        *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        return false;
+    }
+
+    if (!virtio_has_feature(s->host_features, VIRTIO_BLK_F_ZONED)) {
+        *status = VIRTIO_BLK_S_UNSUPP;
+        return false;
+    }
+
+    if (append) {
+        if ((offset % bs->bl.write_granularity) != 0) {
+            *status = VIRTIO_BLK_S_ZONE_UNALIGNED_WP;
+            return false;
+        }
+
+        if (BDRV_ZT_IS_CONV(bs->bl.wps->wp[index])) {
+            *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            return false;
+        }
+
+        if (len / 512 > bs->bl.max_append_sectors) {
+            if (bs->bl.max_append_sectors == 0) {
+                *status = VIRTIO_BLK_S_UNSUPP;
+            } else {
+                *status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+            }
+            return false;
+        }
+    }
+    return true;
+}
+
+static void virtio_blk_zone_report_complete(void *opaque, int ret)
+{
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+    struct iovec *in_iov = req->elem.in_sg;
+    unsigned in_num = req->elem.in_num;
+    int64_t zrp_size, n, j = 0;
+    int64_t nz = data->zone_report_data.nr_zones;
+    int8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    struct virtio_blk_zone_report zrp_hdr = (struct virtio_blk_zone_report) {
+            .nr_zones = cpu_to_le64(nz),
+    };
+    zrp_size = sizeof(struct virtio_blk_zone_report)
+               + sizeof(struct virtio_blk_zone_descriptor) * nz;
+    n = iov_from_buf(in_iov, in_num, 0, &zrp_hdr, sizeof(zrp_hdr));
+    if (n != sizeof(zrp_hdr)) {
+        virtio_error(vdev, "Driver provided intput buffer that is too small!");
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    for (size_t i = sizeof(zrp_hdr); i < zrp_size;
+        i += sizeof(struct virtio_blk_zone_descriptor), ++j) {
+        struct virtio_blk_zone_descriptor desc =
+            (struct virtio_blk_zone_descriptor) {
+                .z_start = cpu_to_le64(data->zone_report_data.zones[j].start
+                    >> BDRV_SECTOR_BITS),
+                .z_cap = cpu_to_le64(data->zone_report_data.zones[j].cap
+                    >> BDRV_SECTOR_BITS),
+                .z_wp = cpu_to_le64(data->zone_report_data.zones[j].wp
+                    >> BDRV_SECTOR_BITS),
+        };
+
+        switch (data->zone_report_data.zones[j].type) {
+        case BLK_ZT_CONV:
+            desc.z_type = VIRTIO_BLK_ZT_CONV;
+            break;
+        case BLK_ZT_SWR:
+            desc.z_type = VIRTIO_BLK_ZT_SWR;
+            break;
+        case BLK_ZT_SWP:
+            desc.z_type = VIRTIO_BLK_ZT_SWP;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        switch (data->zone_report_data.zones[j].state) {
+        case BLK_ZS_RDONLY:
+            desc.z_state = VIRTIO_BLK_ZS_RDONLY;
+            break;
+        case BLK_ZS_OFFLINE:
+            desc.z_state = VIRTIO_BLK_ZS_OFFLINE;
+            break;
+        case BLK_ZS_EMPTY:
+            desc.z_state = VIRTIO_BLK_ZS_EMPTY;
+            break;
+        case BLK_ZS_CLOSED:
+            desc.z_state = VIRTIO_BLK_ZS_CLOSED;
+            break;
+        case BLK_ZS_FULL:
+            desc.z_state = VIRTIO_BLK_ZS_FULL;
+            break;
+        case BLK_ZS_EOPEN:
+            desc.z_state = VIRTIO_BLK_ZS_EOPEN;
+            break;
+        case BLK_ZS_IOPEN:
+            desc.z_state = VIRTIO_BLK_ZS_IOPEN;
+            break;
+        case BLK_ZS_NOT_WP:
+            desc.z_state = VIRTIO_BLK_ZS_NOT_WP;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        /* TODO: it takes O(n^2) time complexity. Optimizations required. */
+        n = iov_from_buf(in_iov, in_num, i, &desc, sizeof(desc));
+        if (n != sizeof(desc)) {
+            virtio_error(vdev, "Driver provided input buffer "
+                               "for descriptors that is too small!");
+            err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        }
+    }
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data->zone_report_data.zones);
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_report(VirtIOBlockReq *req)
+{
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    unsigned int nr_zones;
+    ZoneCmdData *data;
+    int64_t zone_size, offset;
+    uint8_t err_status;
+
+    if (req->in_len < sizeof(struct virtio_blk_inhdr) +
+            sizeof(struct virtio_blk_zone_report) +
+            sizeof(struct virtio_blk_zone_descriptor)) {
+        virtio_error(vdev, "in buffer too small for zone report");
+        return -1;
+    }
+
+    /* start byte offset of the zone report */
+    offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
+    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
+        goto out;
+    }
+    nr_zones = (req->in_len - sizeof(struct virtio_blk_inhdr) -
+                sizeof(struct virtio_blk_zone_report)) /
+               sizeof(struct virtio_blk_zone_descriptor);
+
+    zone_size = sizeof(BlockZoneDescriptor) * nr_zones;
+    data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+    data->zone_report_data.nr_zones = nr_zones;
+    data->zone_report_data.zones = g_malloc(zone_size),
+
+    blk_aio_zone_report(s->blk, offset, &data->zone_report_data.nr_zones,
+                        data->zone_report_data.zones,
+                        virtio_blk_zone_report_complete, data);
+    return 0;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
+static void virtio_blk_zone_mgmt_complete(void *opaque, int ret)
+{
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    int8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+    }
+
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_mgmt(VirtIOBlockReq *req, BlockZoneOp op)
+{
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    BlockDriverState *bs = blk_bs(s->blk);
+    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
+    uint64_t len;
+    uint64_t capacity = bs->total_sectors << BDRV_SECTOR_BITS;
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    uint32_t type = virtio_ldl_p(vdev, &req->out.type);
+    if (type == VIRTIO_BLK_T_ZONE_RESET_ALL) {
+        /* Entire drive capacity */
+        offset = 0;
+        len = capacity;
+    } else {
+        if (bs->bl.zone_size > capacity - offset) {
+            /* The zoned device allows the last smaller zone. */
+            len = capacity - bs->bl.zone_size * (bs->bl.nr_zones - 1);
+        } else {
+            len = bs->bl.zone_size;
+        }
+    }
+
+    if (!check_zoned_request(s, offset, 0, false, &err_status)) {
+        goto out;
+    }
+
+    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+    blk_aio_zone_mgmt(s->blk, op, offset, len,
+                      virtio_blk_zone_mgmt_complete, data);
+
+    return 0;
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
+static void virtio_blk_zone_append_complete(void *opaque, int ret)
+{
+    ZoneCmdData *data = opaque;
+    VirtIOBlockReq *req = data->req;
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
+    int64_t append_sector, n;
+    struct iovec *out_iov = req->elem.out_sg;
+    unsigned out_num = req->elem.out_num;
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    if (ret) {
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+    virtio_stl_p(vdev, &append_sector,
+                 data->zone_append_data.offset >> BDRV_SECTOR_BITS);
+    n = iov_to_buf(out_iov, out_num, 0, &append_sector, sizeof(append_sector));
+    if (n != sizeof(append_sector)) {
+        virtio_error(vdev, "Driver provided input buffer less than size of "
+                           "append_sector");
+        err_status = VIRTIO_BLK_S_ZONE_INVALID_CMD;
+        goto out;
+    }
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    g_free(data);
+}
+
+static int virtio_blk_handle_zone_append(VirtIOBlockReq *req,
+                                         struct iovec *out_iov,
+                                         uint64_t niov) {
+    VirtIOBlock *s = req->dev;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+    uint8_t err_status = VIRTIO_BLK_S_OK;
+
+    int64_t offset = virtio_ldq_p(vdev, &req->out.sector) << BDRV_SECTOR_BITS;
+    int64_t len = iov_size(out_iov, niov);
+
+    if (!check_zoned_request(s, offset, len, true, &err_status)) {
+        goto out;
+    }
+
+    ZoneCmdData *data = g_malloc(sizeof(ZoneCmdData));
+    data->req = req;
+    data->zone_append_data.offset = offset;
+    qemu_iovec_init_external(&req->qiov, out_iov, niov);
+    blk_aio_zone_append(s->blk, &data->zone_append_data.offset, &req->qiov, 0,
+                        virtio_blk_zone_append_complete, data);
+    return 0;
+
+out:
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    virtio_blk_req_complete(req, err_status);
+    virtio_blk_free_request(req);
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+    return err_status;
+}
+
 static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 {
     uint32_t type;
@@ -678,6 +1007,24 @@  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
     case VIRTIO_BLK_T_FLUSH:
         virtio_blk_handle_flush(req, mrb);
         break;
+    case VIRTIO_BLK_T_ZONE_REPORT:
+        virtio_blk_handle_zone_report(req);
+        break;
+    case VIRTIO_BLK_T_ZONE_OPEN:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_OPEN);
+        break;
+    case VIRTIO_BLK_T_ZONE_CLOSE:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_CLOSE);
+        break;
+    case VIRTIO_BLK_T_ZONE_FINISH:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_FINISH);
+        break;
+    case VIRTIO_BLK_T_ZONE_RESET:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
+        break;
+    case VIRTIO_BLK_T_ZONE_RESET_ALL:
+        virtio_blk_handle_zone_mgmt(req, BLK_ZO_RESET);
+        break;
     case VIRTIO_BLK_T_SCSI_CMD:
         virtio_blk_handle_scsi(req);
         break;
@@ -696,6 +1043,13 @@  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         virtio_blk_free_request(req);
         break;
     }
+    case VIRTIO_BLK_T_ZONE_APPEND & ~VIRTIO_BLK_T_OUT:
+        /*
+         * It is not safe to access req->elem.out_sg directly because it
+         * may be modified by virtio_blk_handle_request().
+         */
+        virtio_blk_handle_zone_append(req, out_iov, out_num);
+        break;
     /*
      * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
      * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
@@ -895,6 +1249,7 @@  static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
     BlockConf *conf = &s->conf.conf;
+    BlockDriverState *bs = blk_bs(s->blk);
     struct virtio_blk_config blkcfg;
     uint64_t capacity;
     int64_t length;
@@ -954,6 +1309,30 @@  static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
         blkcfg.write_zeroes_may_unmap = 1;
         virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
     }
+    if (bs->bl.zoned != BLK_Z_NONE) {
+        switch (bs->bl.zoned) {
+        case BLK_Z_HM:
+            blkcfg.zoned.model = VIRTIO_BLK_Z_HM;
+            break;
+        case BLK_Z_HA:
+            blkcfg.zoned.model = VIRTIO_BLK_Z_HA;
+            break;
+        default:
+            g_assert_not_reached();
+        }
+
+        virtio_stl_p(vdev, &blkcfg.zoned.zone_sectors,
+                     bs->bl.zone_size / 512);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_active_zones,
+                     bs->bl.max_active_zones);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_open_zones,
+                     bs->bl.max_open_zones);
+        virtio_stl_p(vdev, &blkcfg.zoned.write_granularity, blk_size);
+        virtio_stl_p(vdev, &blkcfg.zoned.max_append_sectors,
+                     bs->bl.max_append_sectors);
+    } else {
+        blkcfg.zoned.model = VIRTIO_BLK_Z_NONE;
+    }
     memcpy(config, &blkcfg, s->config_size);
 }
 
@@ -1118,6 +1497,7 @@  static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtIOBlock *s = VIRTIO_BLK(dev);
     VirtIOBlkConf *conf = &s->conf;
+    BlockDriverState *bs = blk_bs(conf->conf.blk);
     Error *err = NULL;
     unsigned i;
 
@@ -1163,6 +1543,13 @@  static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 
+    if (bs->bl.zoned != BLK_Z_NONE) {
+        virtio_add_feature(&s->host_features, VIRTIO_BLK_F_ZONED);
+        if (bs->bl.zoned == BLK_Z_HM) {
+            virtio_clear_feature(&s->host_features, VIRTIO_BLK_F_DISCARD);
+        }
+    }
+
     if (virtio_has_feature(s->host_features, VIRTIO_BLK_F_DISCARD) &&
         (!conf->max_discard_sectors ||
          conf->max_discard_sectors > BDRV_REQUEST_MAX_SECTORS)) {