Patchwork [v3,2/6] block: add .bdrv_co_write_zeroes() interface

login
register
mail settings
Submitter Stefan Hajnoczi
Date Dec. 21, 2011, 4 p.m.
Message ID <1324483240-31726-3-git-send-email-stefanha@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/132680/
State New
Headers show

Comments

Stefan Hajnoczi - Dec. 21, 2011, 4 p.m.
The ability to zero regions of an image file is a useful primitive for
higher-level features such as image streaming or zero write detection.

Image formats may support an optimized metadata representation instead
of writing zeroes into the image file.  This allows zero writes to be
potentially faster than regular write operations and also preserve
sparseness of the image file.

The .bdrv_co_write_zeroes() interface should be implemented by block
drivers that wish to provide efficient zeroing.

Note that this operation is different from the discard operation, which
may leave the contents of the region indeterminate.  That means
discarded blocks are not guaranteed to contain zeroes and may contain
junk data instead.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 block.c      |   50 ++++++++++++++++++++++++++++++++++++++++++++------
 block.h      |    7 +++++++
 block_int.h  |    8 ++++++++
 trace-events |    1 +
 4 files changed, 60 insertions(+), 6 deletions(-)
Christoph Hellwig - Dec. 21, 2011, 4:50 p.m.
On Wed, Dec 21, 2011 at 04:00:36PM +0000, Stefan Hajnoczi wrote:
> The ability to zero regions of an image file is a useful primitive for
> higher-level features such as image streaming or zero write detection.
> 
> Image formats may support an optimized metadata representation instead
> of writing zeroes into the image file.  This allows zero writes to be
> potentially faster than regular write operations and also preserve
> sparseness of the image file.
> 
> The .bdrv_co_write_zeroes() interface should be implemented by block
> drivers that wish to provide efficient zeroing.
> 
> Note that this operation is different from the discard operation, which
> may leave the contents of the region indeterminate.  That means
> discarded blocks are not guaranteed to contain zeroes and may contain
> junk data instead.

Most real life discard operations zero the data, and both the ATA and SCSI
spec allow the device to set a bit which guarantees this behaviour.  I think
we also should make these one interface, and if the caller needs it to
actually zero out the discarded blocks it should check if the discard
implementation guarantees that.
Stefan Hajnoczi - Dec. 22, 2011, 7:54 a.m.
On Wed, Dec 21, 2011 at 05:50:32PM +0100, Christoph Hellwig wrote:
> On Wed, Dec 21, 2011 at 04:00:36PM +0000, Stefan Hajnoczi wrote:
> > The ability to zero regions of an image file is a useful primitive for
> > higher-level features such as image streaming or zero write detection.
> > 
> > Image formats may support an optimized metadata representation instead
> > of writing zeroes into the image file.  This allows zero writes to be
> > potentially faster than regular write operations and also preserve
> > sparseness of the image file.
> > 
> > The .bdrv_co_write_zeroes() interface should be implemented by block
> > drivers that wish to provide efficient zeroing.
> > 
> > Note that this operation is different from the discard operation, which
> > may leave the contents of the region indeterminate.  That means
> > discarded blocks are not guaranteed to contain zeroes and may contain
> > junk data instead.
> 
> Most real life discard operations zero the data, and both the ATA and SCSI
> spec allow the device to set a bit which guarantees this behaviour.  I think
> we also should make these one interface, and if the caller needs it to
> actually zero out the discarded blocks it should check if the discard
> implementation guarantees that.

Okay, I see how that could work but still need to look into the details
of how to combine the two and check the zero/indeterminate bit coming
from ATA/SCSI.

Stefan

Patch

diff --git a/block.c b/block.c
index 3f072f6..5ebbd4d 100644
--- a/block.c
+++ b/block.c
@@ -64,7 +64,7 @@  static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
 static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, bool write_zeroes);
 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                                int64_t sector_num,
                                                QEMUIOVector *qiov,
@@ -1291,7 +1291,7 @@  static void coroutine_fn bdrv_rw_co_entry(void *opaque)
                                      rwco->nb_sectors, rwco->qiov);
     } else {
         rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num,
-                                      rwco->nb_sectors, rwco->qiov);
+                                      rwco->nb_sectors, rwco->qiov, false);
     }
 }
 
@@ -1608,11 +1608,37 @@  int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov);
 }
 
+static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors)
+{
+    BlockDriver *drv = bs->drv;
+    QEMUIOVector qiov;
+    struct iovec iov;
+    int ret;
+
+    /* First try the efficient write zeroes operation */
+    if (drv->bdrv_co_write_zeroes) {
+        return drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
+    }
+
+    /* Fall back to bounce buffer if write zeroes is unsupported */
+    iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
+    iov.iov_base = qemu_blockalign(bs, iov.iov_len);
+    memset(iov.iov_base, 0, iov.iov_len);
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
+
+    qemu_vfree(iov.iov_base);
+    return ret;
+}
+
 /*
  * Handle a write request in coroutine context
  */
 static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
-    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+    bool write_zeroes)
 {
     BlockDriver *drv = bs->drv;
     BdrvTrackedRequest req;
@@ -1639,7 +1665,11 @@  static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
 
     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
 
-    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+    if (write_zeroes) {
+        ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
+    } else {
+        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+    }
 
     if (bs->dirty_bitmap) {
         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
@@ -1659,7 +1689,15 @@  int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
 {
     trace_bdrv_co_writev(bs, sector_num, nb_sectors);
 
-    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov);
+    return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, false);
+}
+
+int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
+                                      int64_t sector_num, int nb_sectors)
+{
+    trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
+
+    return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, true);
 }
 
 /**
@@ -3143,7 +3181,7 @@  static void coroutine_fn bdrv_co_do_rw(void *opaque)
             acb->req.nb_sectors, acb->req.qiov);
     } else {
         acb->req.error = bdrv_co_do_writev(bs, acb->req.sector,
-            acb->req.nb_sectors, acb->req.qiov);
+            acb->req.nb_sectors, acb->req.qiov, false);
     }
 
     acb->bh = qemu_bh_new(bdrv_co_em_bh, acb);
diff --git a/block.h b/block.h
index 3bd4398..51b90c7 100644
--- a/block.h
+++ b/block.h
@@ -144,6 +144,13 @@  int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
+/*
+ * Efficiently zero a region of the disk image.  Note that this is a regular
+ * I/O request like read or write and should have a reasonable size.  This
+ * function is not suitable for zeroing the entire image in a single request.
+ */
+int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors);
 int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, int *pnum);
 int bdrv_truncate(BlockDriverState *bs, int64_t offset);
diff --git a/block_int.h b/block_int.h
index 311bd2a..5362180 100644
--- a/block_int.h
+++ b/block_int.h
@@ -101,6 +101,14 @@  struct BlockDriver {
         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
     int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+    /*
+     * Efficiently zero a region of the disk image.  Typically an image format
+     * would use a compact metadata representation to implement this.  This
+     * function pointer may be NULL and .bdrv_co_writev() will be called
+     * instead.
+     */
+    int coroutine_fn (*bdrv_co_write_zeroes)(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors);
     int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors);
     int coroutine_fn (*bdrv_co_is_allocated)(BlockDriverState *bs,
diff --git a/trace-events b/trace-events
index 514849a..fd2d7d9 100644
--- a/trace-events
+++ b/trace-events
@@ -66,6 +66,7 @@  bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs
 bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
 bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
+bdrv_co_write_zeroes(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"
 bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"