diff mbox series

[v2,3/5] virtio-blk: add DISCARD and WRITE ZEROES features

Message ID 20190131151914.164903-4-sgarzare@redhat.com
State New
Headers show
Series virtio-blk: add DISCARD and WRITE ZEROES features | expand

Commit Message

Stefano Garzarella Jan. 31, 2019, 3:19 p.m. UTC
This patch adds the support of DISCARD and WRITE ZEROES commands,
that have been introduced in the virtio-blk protocol to have
better performance when using SSD backend.

We support only one segment per request since multiple segments
are not widely used and there are no userspace APIs that allow
applications to submit multiple segments in a single call.

Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
---
 hw/block/virtio-blk.c          | 173 +++++++++++++++++++++++++++++++++
 include/hw/virtio/virtio-blk.h |   2 +
 2 files changed, 175 insertions(+)

Comments

Michael S. Tsirkin Jan. 31, 2019, 4:04 p.m. UTC | #1
On Thu, Jan 31, 2019 at 04:19:12PM +0100, Stefano Garzarella wrote:
> This patch adds the support of DISCARD and WRITE ZEROES commands,
> that have been introduced in the virtio-blk protocol to have
> better performance when using SSD backend.
> 
> We support only one segment per request since multiple segments
> are not widely used and there are no userspace APIs that allow
> applications to submit multiple segments in a single call.
> 
> Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>


This does not seem to match the spec clarifications
that Stefan Hajnoczi has posted on
the virtio TC.

I think this can go any of the following ways:

- we agree that a request should have at most one segment
	no padding allowed
- we agree that a request should have at most one segment
	we require padding to 512 bytes
- we agree that a request should have at most one segment
	we also support padding to 512 bytes
- we agree that a request should have at most one segment
	we also support arbitrary padding
- we agree that a request can have any # of segments

I would also need your feedback on whether all this
is a material change to 1.1 public review according to the oasis definition:

	"Material Change" is any change to the content of a Work Product that
	that would require a compliant application or implementation to be
	modified or rewritten in order to remain compliant or which adds new
	features or otherwise expands the scope of the work product.


> ---
>  hw/block/virtio-blk.c          | 173 +++++++++++++++++++++++++++++++++
>  include/hw/virtio/virtio-blk.h |   2 +
>  2 files changed, 175 insertions(+)
> 
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index 542ec52536..34ee676895 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -147,6 +147,30 @@ out:
>      aio_context_release(blk_get_aio_context(s->conf.conf.blk));
>  }
>  
> +static void virtio_blk_discard_wzeroes_complete(void *opaque, int ret)
> +{
> +    VirtIOBlockReq *req = opaque;
> +    VirtIOBlock *s = req->dev;
> +    bool is_wzeroes = (virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type) &
> +                       ~VIRTIO_BLK_T_BARRIER) == VIRTIO_BLK_T_WRITE_ZEROES;
> +
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    if (ret) {
> +        if (virtio_blk_handle_rw_error(req, -ret, 0, is_wzeroes)) {
> +            goto out;
> +        }
> +    }
> +
> +    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
> +    if (is_wzeroes) {
> +        block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
> +    }
> +    virtio_blk_free_request(req);
> +
> +out:
> +    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> +}
> +
>  #ifdef __linux__
>  
>  typedef struct {
> @@ -480,6 +504,82 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
>      return true;
>  }
>  
> +static uint8_t virtio_blk_handle_dwz(VirtIOBlockReq *req, bool is_wzeroes,
> +    struct virtio_blk_discard_write_zeroes *dwz_hdr)
> +{
> +    VirtIOBlock *s = req->dev;
> +    uint64_t sector;
> +    uint32_t num_sectors, flags;
> +    uint8_t err_status;
> +    int bytes;
> +
> +    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &dwz_hdr->sector);
> +    num_sectors = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &dwz_hdr->num_sectors);
> +    flags = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &dwz_hdr->flags);
> +
> +    /*
> +     * dwz_max_sectors is at most BDRV_REQUEST_MAX_SECTORS, this check
> +     * make us sure that "num_sectors << BDRV_SECTOR_BITS" can fit in
> +     * the integer variable.
> +     */
> +    if (unlikely(num_sectors > s->conf.dwz_max_sectors)) {
> +        err_status = VIRTIO_BLK_S_IOERR;
> +        goto err;
> +    }
> +
> +    bytes = num_sectors << BDRV_SECTOR_BITS;
> +
> +    if (unlikely(!virtio_blk_sect_range_ok(req->dev, sector, bytes))) {
> +        err_status = VIRTIO_BLK_S_IOERR;
> +        goto err;
> +    }
> +
> +    /*
> +     * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for discard
> +     * and write zeroes commands if any unknown flag is set.
> +     */
> +    if (unlikely(flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
> +        err_status = VIRTIO_BLK_S_UNSUPP;
> +        goto err;
> +    }
> +
> +    if (is_wzeroes) { /* VIRTIO_BLK_T_WRITE_ZEROES */
> +        int blk_aio_flags = 0;
> +
> +        if (s->conf.wz_may_unmap &&
> +            flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
> +            blk_aio_flags |= BDRV_REQ_MAY_UNMAP;
> +        }
> +
> +        block_acct_start(blk_get_stats(req->dev->blk), &req->acct, bytes,
> +                         BLOCK_ACCT_WRITE);
> +
> +        blk_aio_pwrite_zeroes(req->dev->blk, sector << BDRV_SECTOR_BITS,
> +                              bytes, blk_aio_flags,
> +                              virtio_blk_discard_wzeroes_complete, req);
> +    } else { /* VIRTIO_BLK_T_DISCARD */
> +        /*
> +         * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for
> +         * discard commands if the unmap flag is set.
> +         */
> +        if (unlikely(flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
> +            err_status = VIRTIO_BLK_S_UNSUPP;
> +            goto err;
> +        }
> +
> +        blk_aio_pdiscard(req->dev->blk, sector << BDRV_SECTOR_BITS, bytes,
> +                         virtio_blk_discard_wzeroes_complete, req);
> +    }
> +
> +    return VIRTIO_BLK_S_OK;
> +
> +err:
> +    if (is_wzeroes) {
> +        block_acct_invalid(blk_get_stats(req->dev->blk), BLOCK_ACCT_WRITE);
> +    }
> +    return err_status;
> +}
> +
>  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>  {
>      uint32_t type;
> @@ -586,6 +686,45 @@ static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>          virtio_blk_free_request(req);
>          break;
>      }
> +    /*
> +     * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
> +     * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
> +     * so we must mask it for these requests, then we will check if it is set.
> +     */
> +    case VIRTIO_BLK_T_DISCARD & ~VIRTIO_BLK_T_OUT:
> +    case VIRTIO_BLK_T_WRITE_ZEROES & ~VIRTIO_BLK_T_OUT:
> +    {
> +        struct virtio_blk_discard_write_zeroes dwz_hdr;
> +        size_t out_len = iov_size(out_iov, out_num);
> +        bool is_wzeroes = (type & ~VIRTIO_BLK_T_BARRIER) ==
> +                          VIRTIO_BLK_T_WRITE_ZEROES;
> +        uint8_t err_status;
> +
> +        /*
> +         * Unsupported if VIRTIO_BLK_T_OUT is not set or the request contains
> +         * more than one segment.
> +         */
> +        if (unlikely(!(type & VIRTIO_BLK_T_OUT) ||
> +                     out_len > sizeof(dwz_hdr))) {
> +            virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
> +            virtio_blk_free_request(req);
> +            return 0;
> +        }
> +
> +        if (unlikely(iov_to_buf(out_iov, out_num, 0, &dwz_hdr,
> +                                sizeof(dwz_hdr)) != sizeof(dwz_hdr))) {
> +            virtio_error(vdev, "virtio-blk discard/wzeroes header too short");
> +            return -1;
> +        }
> +
> +        err_status = virtio_blk_handle_dwz(req, is_wzeroes, &dwz_hdr);
> +        if (err_status != VIRTIO_BLK_S_OK) {
> +            virtio_blk_req_complete(req, err_status);
> +            virtio_blk_free_request(req);
> +        }
> +
> +        break;
> +    }
>      default:
>          virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
>          virtio_blk_free_request(req);
> @@ -765,6 +904,22 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
>      blkcfg.alignment_offset = 0;
>      blkcfg.wce = blk_enable_write_cache(s->blk);
>      virtio_stw_p(vdev, &blkcfg.num_queues, s->conf.num_queues);
> +    if (s->conf.discard_wzeroes) {
> +        virtio_stl_p(vdev, &blkcfg.max_discard_sectors,
> +                     s->conf.dwz_max_sectors);
> +        virtio_stl_p(vdev, &blkcfg.discard_sector_alignment,
> +                     blk_size >> BDRV_SECTOR_BITS);
> +        virtio_stl_p(vdev, &blkcfg.max_write_zeroes_sectors,
> +                     s->conf.dwz_max_sectors);
> +        blkcfg.write_zeroes_may_unmap = s->conf.wz_may_unmap;
> +        /*
> +         * We support only one segment per request since multiple segments
> +         * are not widely used and there are no userspace APIs that allow
> +         * applications to submit multiple segments in a single call.
> +         */
> +        virtio_stl_p(vdev, &blkcfg.max_discard_seg, 1);
> +        virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
> +    }
>      memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
>  }
>  
> @@ -811,6 +966,10 @@ static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
>      if (s->conf.num_queues > 1) {
>          virtio_add_feature(&features, VIRTIO_BLK_F_MQ);
>      }
> +    if (s->conf.discard_wzeroes) {
> +        virtio_add_feature(&features, VIRTIO_BLK_F_DISCARD);
> +        virtio_add_feature(&features, VIRTIO_BLK_F_WRITE_ZEROES);
> +    }
>  
>      return features;
>  }
> @@ -956,6 +1115,16 @@ static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
>          return;
>      }
>  
> +    if (conf->discard_wzeroes) {
> +        if (!conf->dwz_max_sectors ||
> +            conf->dwz_max_sectors > BDRV_REQUEST_MAX_SECTORS) {
> +            error_setg(errp, "invalid dwz-max-sectors property (%" PRIu32 "), "
> +                       "must be between 1 and %lu",
> +                       conf->dwz_max_sectors, BDRV_REQUEST_MAX_SECTORS);
> +            return;
> +        }
> +    }
> +
>      virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
>                  sizeof(struct virtio_blk_config));
>  
> @@ -1028,6 +1197,10 @@ static Property virtio_blk_properties[] = {
>                       IOThread *),
>      DEFINE_PROP_BIT("discard-wzeroes", VirtIOBlock, conf.discard_wzeroes, 0,
>                       true),
> +    DEFINE_PROP_UINT32("discard-wzeroes-max-sectors", VirtIOBlock,
> +                       conf.dwz_max_sectors, BDRV_REQUEST_MAX_SECTORS),
> +    DEFINE_PROP_BIT("wzeroes-may-unmap", VirtIOBlock, conf.wz_may_unmap, 0,
> +                    true),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
> index c336afb4cd..4e9d4434ff 100644
> --- a/include/hw/virtio/virtio-blk.h
> +++ b/include/hw/virtio/virtio-blk.h
> @@ -41,6 +41,8 @@ struct VirtIOBlkConf
>      uint16_t num_queues;
>      uint16_t queue_size;
>      uint32_t discard_wzeroes;
> +    uint32_t dwz_max_sectors;
> +    uint32_t wz_may_unmap;
>  };
>  
>  struct VirtIOBlockDataPlane;
> -- 
> 2.20.1
Stefano Garzarella Jan. 31, 2019, 5:01 p.m. UTC | #2
On Thu, Jan 31, 2019 at 11:04:10AM -0500, Michael S. Tsirkin wrote:
> On Thu, Jan 31, 2019 at 04:19:12PM +0100, Stefano Garzarella wrote:
> > This patch adds the support of DISCARD and WRITE ZEROES commands,
> > that have been introduced in the virtio-blk protocol to have
> > better performance when using SSD backend.
> > 
> > We support only one segment per request since multiple segments
> > are not widely used and there are no userspace APIs that allow
> > applications to submit multiple segments in a single call.
> > 
> > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> 
> 
> This does not seem to match the spec clarifications
> that Stefan Hajnoczi has posted on
> the virtio TC.
> 
> I think this can go any of the following ways:
> 
> - we agree that a request should have at most one segment
> 	no padding allowed
> - we agree that a request should have at most one segment
> 	we require padding to 512 bytes
> - we agree that a request should have at most one segment
> 	we also support padding to 512 bytes
> - we agree that a request should have at most one segment
> 	we also support arbitrary padding
> - we agree that a request can have any # of segments

Hi Michael,
reading the latest patch [1] sent by Stefan, I supposed that the padding
is not allowed, but if we need to support it, I'll fix the implementation.

About the number of segments, I followed the description of
max_discard_sectors [2] and the implementation provided by SPDK's
virtio-blk driver and vhost-user-blk device backend. They also only
support one segment per request, properly setting "max_discard_seg" and
"max_write_zeroes_seg", so if I understood correctly, the specification
leave to the device the freedom to support one or more segments.

> 
> I would also need your feedback on whether all this
> is a material change to 1.1 public review according to the oasis definition:
> 
> 	"Material Change" is any change to the content of a Work Product that
> 	that would require a compliant application or implementation to be
> 	modified or rewritten in order to remain compliant or which adds new
> 	features or otherwise expands the scope of the work product.
> 

IMHO and if I understood correctly, maybe only the second way (require
padding to 512 bytes) should be a "Material Change" because we need to
modify all the implementations (Linux driver, SPDK, vhost).

The other ways should be already supported, because all the
implementations set the status right behind the last byte (regardless of
padding).


#1: https://lists.oasis-open.org/archives/virtio-dev/201901/msg00135.html
#2: https://github.com/oasis-tcs/virtio-spec/blob/master/content.tex#L3876


Thanks,
Stefano
Michael S. Tsirkin Jan. 31, 2019, 5:13 p.m. UTC | #3
On Thu, Jan 31, 2019 at 06:01:34PM +0100, Stefano Garzarella wrote:
> On Thu, Jan 31, 2019 at 11:04:10AM -0500, Michael S. Tsirkin wrote:
> > On Thu, Jan 31, 2019 at 04:19:12PM +0100, Stefano Garzarella wrote:
> > > This patch adds the support of DISCARD and WRITE ZEROES commands,
> > > that have been introduced in the virtio-blk protocol to have
> > > better performance when using SSD backend.
> > > 
> > > We support only one segment per request since multiple segments
> > > are not widely used and there are no userspace APIs that allow
> > > applications to submit multiple segments in a single call.
> > > 
> > > Signed-off-by: Stefano Garzarella <sgarzare@redhat.com>
> > 
> > 
> > This does not seem to match the spec clarifications
> > that Stefan Hajnoczi has posted on
> > the virtio TC.
> > 
> > I think this can go any of the following ways:
> > 
> > - we agree that a request should have at most one segment
> > 	no padding allowed
> > - we agree that a request should have at most one segment
> > 	we require padding to 512 bytes
> > - we agree that a request should have at most one segment
> > 	we also support padding to 512 bytes
> > - we agree that a request should have at most one segment
> > 	we also support arbitrary padding
> > - we agree that a request can have any # of segments
> 
> Hi Michael,
> reading the latest patch [1] sent by Stefan, I supposed that the padding
> is not allowed, but if we need to support it, I'll fix the implementation.
> 
> About the number of segments, I followed the description of
> max_discard_sectors [2] and the implementation provided by SPDK's
> virtio-blk driver and vhost-user-blk device backend. They also only
> support one segment per request, properly setting "max_discard_seg" and
> "max_write_zeroes_seg", so if I understood correctly, the specification
> leave to the device the freedom to support one or more segments.


Oh I missed the fact that you set the config space values to 1.
I confused it with # of sectors.


Now it looks right to me, so pls ignore my comments.

Reviewed-by: Michael S. Tsirkin <mst@redhat.com>



> > 
> > I would also need your feedback on whether all this
> > is a material change to 1.1 public review according to the oasis definition:
> > 
> > 	"Material Change" is any change to the content of a Work Product that
> > 	that would require a compliant application or implementation to be
> > 	modified or rewritten in order to remain compliant or which adds new
> > 	features or otherwise expands the scope of the work product.
> > 
> 
> IMHO and if I understood correctly, maybe only the second way (require
> padding to 512 bytes) should be a "Material Change" because we need to
> modify all the implementations (Linux driver, SPDK, vhost).
> 
> The other ways should be already supported, because all the
> implementations set the status right behind the last byte (regardless of
> padding).
> 
> 
> #1: https://lists.oasis-open.org/archives/virtio-dev/201901/msg00135.html
> #2: https://github.com/oasis-tcs/virtio-spec/blob/master/content.tex#L3876
> 
> 
> Thanks,
> Stefano
Stefan Hajnoczi Feb. 1, 2019, 4:58 a.m. UTC | #4
On Thu, Jan 31, 2019 at 04:19:12PM +0100, Stefano Garzarella wrote:
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index 542ec52536..34ee676895 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -147,6 +147,30 @@ out:
>      aio_context_release(blk_get_aio_context(s->conf.conf.blk));
>  }
>  
> +static void virtio_blk_discard_wzeroes_complete(void *opaque, int ret)
> +{
> +    VirtIOBlockReq *req = opaque;
> +    VirtIOBlock *s = req->dev;
> +    bool is_wzeroes = (virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type) &

s/req->dev/s/

> +                       ~VIRTIO_BLK_T_BARRIER) == VIRTIO_BLK_T_WRITE_ZEROES;
> +
> +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> +    if (ret) {
> +        if (virtio_blk_handle_rw_error(req, -ret, 0, is_wzeroes)) {

The third argument is bool, please use false instead of 0.

> +            goto out;
> +        }
> +    }
> +
> +    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
> +    if (is_wzeroes) {
> +        block_acct_done(blk_get_stats(req->dev->blk), &req->acct);

s/req->dev->blk/s->blk/

> +static uint8_t virtio_blk_handle_dwz(VirtIOBlockReq *req, bool is_wzeroes,
> +    struct virtio_blk_discard_write_zeroes *dwz_hdr)
> +{
> +    VirtIOBlock *s = req->dev;
> +    uint64_t sector;
> +    uint32_t num_sectors, flags;
> +    uint8_t err_status;
> +    int bytes;
> +
> +    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &dwz_hdr->sector);

Here and throughout the rest of the function:

  VirtIODevice *vdev = VIRTIO_DEVICE(s);

s/VIRTIO_DEVICE(req->dev)/vdev/

and then to clean up the remaining instances:

s/req->dev/s/

> +    if (is_wzeroes) { /* VIRTIO_BLK_T_WRITE_ZEROES */
> +        int blk_aio_flags = 0;
> +
> +        if (s->conf.wz_may_unmap &&

The inconsistent naming is a bit messy and could confuse readers:
write_zeroes vs wzeroes vs wz

The VIRTIO spec and QEMU code uses write_zeroes, please stick to that
even though it is longer.

s/is_wzeroes/is_write_zeroes/
s/wz_map_unmap/write_zeroes_may_unmap/
s/virtio_blk_discard_wzeroes_complete/virtio_blk_discard_write_zeroes_complete/

This is a question of style and a local dwz_hdr variable does make the
code easier to read, so I'm not totally against shortening the name, but
please consistently use the long form in user-visible options, struct
field names, and function names because these things have a large scope.

> @@ -765,6 +904,22 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
>      blkcfg.alignment_offset = 0;
>      blkcfg.wce = blk_enable_write_cache(s->blk);
>      virtio_stw_p(vdev, &blkcfg.num_queues, s->conf.num_queues);
> +    if (s->conf.discard_wzeroes) {
> +        virtio_stl_p(vdev, &blkcfg.max_discard_sectors,
> +                     s->conf.dwz_max_sectors);
> +        virtio_stl_p(vdev, &blkcfg.discard_sector_alignment,
> +                     blk_size >> BDRV_SECTOR_BITS);
> +        virtio_stl_p(vdev, &blkcfg.max_write_zeroes_sectors,
> +                     s->conf.dwz_max_sectors);
> +        blkcfg.write_zeroes_may_unmap = s->conf.wz_may_unmap;

Does this need to be an option since MAY_UNMAP is advisory anyway?

Another way of asking: what happens in the worst case if the guest sends
MAY_UNMAP but the QEMU block device doesn't support unmap?

Dropping this option would make the user interface simpler (no need to
worry about the flag) and the implementation too.
Stefano Garzarella Feb. 1, 2019, 9:54 a.m. UTC | #5
On Fri, Feb 01, 2019 at 12:58:31PM +0800, Stefan Hajnoczi wrote:
> On Thu, Jan 31, 2019 at 04:19:12PM +0100, Stefano Garzarella wrote:
> > diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> > index 542ec52536..34ee676895 100644
> > --- a/hw/block/virtio-blk.c
> > +++ b/hw/block/virtio-blk.c
> > @@ -147,6 +147,30 @@ out:
> >      aio_context_release(blk_get_aio_context(s->conf.conf.blk));
> >  }
> >  
> > +static void virtio_blk_discard_wzeroes_complete(void *opaque, int ret)
> > +{
> > +    VirtIOBlockReq *req = opaque;
> > +    VirtIOBlock *s = req->dev;
> > +    bool is_wzeroes = (virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type) &
> 
> s/req->dev/s/
> 
> > +                       ~VIRTIO_BLK_T_BARRIER) == VIRTIO_BLK_T_WRITE_ZEROES;
> > +
> > +    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
> > +    if (ret) {
> > +        if (virtio_blk_handle_rw_error(req, -ret, 0, is_wzeroes)) {
> 
> The third argument is bool, please use false instead of 0.
> 
> > +            goto out;
> > +        }
> > +    }
> > +
> > +    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
> > +    if (is_wzeroes) {
> > +        block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
> 
> s/req->dev->blk/s->blk/
> 
> > +static uint8_t virtio_blk_handle_dwz(VirtIOBlockReq *req, bool is_wzeroes,
> > +    struct virtio_blk_discard_write_zeroes *dwz_hdr)
> > +{
> > +    VirtIOBlock *s = req->dev;
> > +    uint64_t sector;
> > +    uint32_t num_sectors, flags;
> > +    uint8_t err_status;
> > +    int bytes;
> > +
> > +    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &dwz_hdr->sector);
> 
> Here and throughout the rest of the function:
> 
>   VirtIODevice *vdev = VIRTIO_DEVICE(s);
> 
> s/VIRTIO_DEVICE(req->dev)/vdev/
> 
> and then to clean up the remaining instances:
> 
> s/req->dev/s/
> 

Thanks! I'll follow all of these suggestions.

> > +    if (is_wzeroes) { /* VIRTIO_BLK_T_WRITE_ZEROES */
> > +        int blk_aio_flags = 0;
> > +
> > +        if (s->conf.wz_may_unmap &&
> 
> The inconsistent naming is a bit messy and could confuse readers:
> write_zeroes vs wzeroes vs wz
> 
> The VIRTIO spec and QEMU code uses write_zeroes, please stick to that
> even though it is longer.
> 
> s/is_wzeroes/is_write_zeroes/
> s/wz_map_unmap/write_zeroes_may_unmap/
> s/virtio_blk_discard_wzeroes_complete/virtio_blk_discard_write_zeroes_complete/
> 
> This is a question of style and a local dwz_hdr variable does make the
> code easier to read, so I'm not totally against shortening the name, but
> please consistently use the long form in user-visible options, struct
> field names, and function names because these things have a large scope.
> 

Thanks! I'll change all the name.

> > @@ -765,6 +904,22 @@ static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
> >      blkcfg.alignment_offset = 0;
> >      blkcfg.wce = blk_enable_write_cache(s->blk);
> >      virtio_stw_p(vdev, &blkcfg.num_queues, s->conf.num_queues);
> > +    if (s->conf.discard_wzeroes) {
> > +        virtio_stl_p(vdev, &blkcfg.max_discard_sectors,
> > +                     s->conf.dwz_max_sectors);
> > +        virtio_stl_p(vdev, &blkcfg.discard_sector_alignment,
> > +                     blk_size >> BDRV_SECTOR_BITS);
> > +        virtio_stl_p(vdev, &blkcfg.max_write_zeroes_sectors,
> > +                     s->conf.dwz_max_sectors);
> > +        blkcfg.write_zeroes_may_unmap = s->conf.wz_may_unmap;
> 
> Does this need to be an option since MAY_UNMAP is advisory anyway?
> 
> Another way of asking: what happens in the worst case if the guest sends
> MAY_UNMAP but the QEMU block device doesn't support unmap?
> 
> Dropping this option would make the user interface simpler (no need to
> worry about the flag) and the implementation too.

Make sense, I'll drop this option.

Only a question about options: I used a single option "dwz_max_sectors"
for both "max_discard_sectors" and "max_write_zeroes_sectors".
Since I'll include two options to enable/disable discard and
write_zeroes features, do you think make sense to split this
configurable option in two?

Thanks,
Stefano
Stefan Hajnoczi Feb. 1, 2019, 10:08 a.m. UTC | #6
On Fri, Feb 01, 2019 at 10:54:30AM +0100, Stefano Garzarella wrote:
> On Fri, Feb 01, 2019 at 12:58:31PM +0800, Stefan Hajnoczi wrote:
> > On Thu, Jan 31, 2019 at 04:19:12PM +0100, Stefano Garzarella wrote:
> Only a question about options: I used a single option "dwz_max_sectors"
> for both "max_discard_sectors" and "max_write_zeroes_sectors".
> Since I'll include two options to enable/disable discard and
> write_zeroes features, do you think make sense to split this
> configurable option in two?

Yes, please.  Eventually a user will want full control so it makes sense
to expose what virtio offers instead of coming up with higher-level
options.

Stefan
diff mbox series

Patch

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 542ec52536..34ee676895 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -147,6 +147,30 @@  out:
     aio_context_release(blk_get_aio_context(s->conf.conf.blk));
 }
 
+static void virtio_blk_discard_wzeroes_complete(void *opaque, int ret)
+{
+    VirtIOBlockReq *req = opaque;
+    VirtIOBlock *s = req->dev;
+    bool is_wzeroes = (virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type) &
+                       ~VIRTIO_BLK_T_BARRIER) == VIRTIO_BLK_T_WRITE_ZEROES;
+
+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+    if (ret) {
+        if (virtio_blk_handle_rw_error(req, -ret, 0, is_wzeroes)) {
+            goto out;
+        }
+    }
+
+    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+    if (is_wzeroes) {
+        block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
+    }
+    virtio_blk_free_request(req);
+
+out:
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
+}
+
 #ifdef __linux__
 
 typedef struct {
@@ -480,6 +504,82 @@  static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
     return true;
 }
 
+static uint8_t virtio_blk_handle_dwz(VirtIOBlockReq *req, bool is_wzeroes,
+    struct virtio_blk_discard_write_zeroes *dwz_hdr)
+{
+    VirtIOBlock *s = req->dev;
+    uint64_t sector;
+    uint32_t num_sectors, flags;
+    uint8_t err_status;
+    int bytes;
+
+    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &dwz_hdr->sector);
+    num_sectors = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &dwz_hdr->num_sectors);
+    flags = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &dwz_hdr->flags);
+
+    /*
+     * dwz_max_sectors is at most BDRV_REQUEST_MAX_SECTORS, this check
+     * make us sure that "num_sectors << BDRV_SECTOR_BITS" can fit in
+     * the integer variable.
+     */
+    if (unlikely(num_sectors > s->conf.dwz_max_sectors)) {
+        err_status = VIRTIO_BLK_S_IOERR;
+        goto err;
+    }
+
+    bytes = num_sectors << BDRV_SECTOR_BITS;
+
+    if (unlikely(!virtio_blk_sect_range_ok(req->dev, sector, bytes))) {
+        err_status = VIRTIO_BLK_S_IOERR;
+        goto err;
+    }
+
+    /*
+     * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for discard
+     * and write zeroes commands if any unknown flag is set.
+     */
+    if (unlikely(flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
+        err_status = VIRTIO_BLK_S_UNSUPP;
+        goto err;
+    }
+
+    if (is_wzeroes) { /* VIRTIO_BLK_T_WRITE_ZEROES */
+        int blk_aio_flags = 0;
+
+        if (s->conf.wz_may_unmap &&
+            flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
+            blk_aio_flags |= BDRV_REQ_MAY_UNMAP;
+        }
+
+        block_acct_start(blk_get_stats(req->dev->blk), &req->acct, bytes,
+                         BLOCK_ACCT_WRITE);
+
+        blk_aio_pwrite_zeroes(req->dev->blk, sector << BDRV_SECTOR_BITS,
+                              bytes, blk_aio_flags,
+                              virtio_blk_discard_wzeroes_complete, req);
+    } else { /* VIRTIO_BLK_T_DISCARD */
+        /*
+         * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for
+         * discard commands if the unmap flag is set.
+         */
+        if (unlikely(flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
+            err_status = VIRTIO_BLK_S_UNSUPP;
+            goto err;
+        }
+
+        blk_aio_pdiscard(req->dev->blk, sector << BDRV_SECTOR_BITS, bytes,
+                         virtio_blk_discard_wzeroes_complete, req);
+    }
+
+    return VIRTIO_BLK_S_OK;
+
+err:
+    if (is_wzeroes) {
+        block_acct_invalid(blk_get_stats(req->dev->blk), BLOCK_ACCT_WRITE);
+    }
+    return err_status;
+}
+
 static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 {
     uint32_t type;
@@ -586,6 +686,45 @@  static int virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         virtio_blk_free_request(req);
         break;
     }
+    /*
+     * VIRTIO_BLK_T_DISCARD and VIRTIO_BLK_T_WRITE_ZEROES are defined with
+     * VIRTIO_BLK_T_OUT flag set. We masked this flag in the switch statement,
+     * so we must mask it for these requests, then we will check if it is set.
+     */
+    case VIRTIO_BLK_T_DISCARD & ~VIRTIO_BLK_T_OUT:
+    case VIRTIO_BLK_T_WRITE_ZEROES & ~VIRTIO_BLK_T_OUT:
+    {
+        struct virtio_blk_discard_write_zeroes dwz_hdr;
+        size_t out_len = iov_size(out_iov, out_num);
+        bool is_wzeroes = (type & ~VIRTIO_BLK_T_BARRIER) ==
+                          VIRTIO_BLK_T_WRITE_ZEROES;
+        uint8_t err_status;
+
+        /*
+         * Unsupported if VIRTIO_BLK_T_OUT is not set or the request contains
+         * more than one segment.
+         */
+        if (unlikely(!(type & VIRTIO_BLK_T_OUT) ||
+                     out_len > sizeof(dwz_hdr))) {
+            virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
+            virtio_blk_free_request(req);
+            return 0;
+        }
+
+        if (unlikely(iov_to_buf(out_iov, out_num, 0, &dwz_hdr,
+                                sizeof(dwz_hdr)) != sizeof(dwz_hdr))) {
+            virtio_error(vdev, "virtio-blk discard/wzeroes header too short");
+            return -1;
+        }
+
+        err_status = virtio_blk_handle_dwz(req, is_wzeroes, &dwz_hdr);
+        if (err_status != VIRTIO_BLK_S_OK) {
+            virtio_blk_req_complete(req, err_status);
+            virtio_blk_free_request(req);
+        }
+
+        break;
+    }
     default:
         virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
         virtio_blk_free_request(req);
@@ -765,6 +904,22 @@  static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
     blkcfg.alignment_offset = 0;
     blkcfg.wce = blk_enable_write_cache(s->blk);
     virtio_stw_p(vdev, &blkcfg.num_queues, s->conf.num_queues);
+    if (s->conf.discard_wzeroes) {
+        virtio_stl_p(vdev, &blkcfg.max_discard_sectors,
+                     s->conf.dwz_max_sectors);
+        virtio_stl_p(vdev, &blkcfg.discard_sector_alignment,
+                     blk_size >> BDRV_SECTOR_BITS);
+        virtio_stl_p(vdev, &blkcfg.max_write_zeroes_sectors,
+                     s->conf.dwz_max_sectors);
+        blkcfg.write_zeroes_may_unmap = s->conf.wz_may_unmap;
+        /*
+         * We support only one segment per request since multiple segments
+         * are not widely used and there are no userspace APIs that allow
+         * applications to submit multiple segments in a single call.
+         */
+        virtio_stl_p(vdev, &blkcfg.max_discard_seg, 1);
+        virtio_stl_p(vdev, &blkcfg.max_write_zeroes_seg, 1);
+    }
     memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
 }
 
@@ -811,6 +966,10 @@  static uint64_t virtio_blk_get_features(VirtIODevice *vdev, uint64_t features,
     if (s->conf.num_queues > 1) {
         virtio_add_feature(&features, VIRTIO_BLK_F_MQ);
     }
+    if (s->conf.discard_wzeroes) {
+        virtio_add_feature(&features, VIRTIO_BLK_F_DISCARD);
+        virtio_add_feature(&features, VIRTIO_BLK_F_WRITE_ZEROES);
+    }
 
     return features;
 }
@@ -956,6 +1115,16 @@  static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
         return;
     }
 
+    if (conf->discard_wzeroes) {
+        if (!conf->dwz_max_sectors ||
+            conf->dwz_max_sectors > BDRV_REQUEST_MAX_SECTORS) {
+            error_setg(errp, "invalid dwz-max-sectors property (%" PRIu32 "), "
+                       "must be between 1 and %lu",
+                       conf->dwz_max_sectors, BDRV_REQUEST_MAX_SECTORS);
+            return;
+        }
+    }
+
     virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
                 sizeof(struct virtio_blk_config));
 
@@ -1028,6 +1197,10 @@  static Property virtio_blk_properties[] = {
                      IOThread *),
     DEFINE_PROP_BIT("discard-wzeroes", VirtIOBlock, conf.discard_wzeroes, 0,
                      true),
+    DEFINE_PROP_UINT32("discard-wzeroes-max-sectors", VirtIOBlock,
+                       conf.dwz_max_sectors, BDRV_REQUEST_MAX_SECTORS),
+    DEFINE_PROP_BIT("wzeroes-may-unmap", VirtIOBlock, conf.wz_may_unmap, 0,
+                    true),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index c336afb4cd..4e9d4434ff 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -41,6 +41,8 @@  struct VirtIOBlkConf
     uint16_t num_queues;
     uint16_t queue_size;
     uint32_t discard_wzeroes;
+    uint32_t dwz_max_sectors;
+    uint32_t wz_may_unmap;
 };
 
 struct VirtIOBlockDataPlane;