Patchwork [5/8] qed: detect zero writes and skip them when writing to an unallocated cluster

login
register
mail settings
Submitter Stefan Hajnoczi
Date April 27, 2011, 1:27 p.m.
Message ID <1303910855-28999-6-git-send-email-stefanha@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/93047/
State New
Headers show

Comments

Stefan Hajnoczi - April 27, 2011, 1:27 p.m.
From: Anthony Liguori <aliguori@us.ibm.com>

A value of 1 is used to indicate that a cluster contains all zeros.  Update the
code to detect zero writes only when a flag is set on the AIOCB.  For now, the
flag is set only on copy-on-read based write requests, to avoid polluting the
cache on ordinary writes in the zero-copy case.

After this patch, we can stream an image file from a backing file without
fully expanding the image.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 block/qed.c |  124 ++++++++++++++++++++++++++++++++++++++++++++++++++---------
 block/qed.h |    1 +
 2 files changed, 107 insertions(+), 18 deletions(-)

Patch

diff --git a/block/qed.c b/block/qed.c
index 56150c3..2c155d9 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -33,6 +33,13 @@  static AIOPool qed_aio_pool = {
     .cancel             = qed_aio_cancel,
 };
 
+static BlockDriverAIOCB *qed_aio_writev_check(BlockDriverState *bs,
+                                              int64_t sector_num,
+                                              QEMUIOVector *qiov,
+                                              int nb_sectors,
+                                              BlockDriverCompletionFunc *cb,
+                                              void *opaque);
+
 static int bdrv_qed_probe(const uint8_t *buf, int buf_size,
                           const char *filename)
 {
@@ -871,9 +878,8 @@  static void qed_aio_write_l1_update(void *opaque, int ret)
 /**
  * Update L2 table with new cluster offsets and write them out
  */
-static void qed_aio_write_l2_update(void *opaque, int ret)
+static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
 {
-    QEDAIOCB *acb = opaque;
     BDRVQEDState *s = acb_to_s(acb);
     bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1;
     int index;
@@ -889,7 +895,7 @@  static void qed_aio_write_l2_update(void *opaque, int ret)
 
     index = qed_l2_index(s, acb->cur_pos);
     qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters,
-                         acb->cur_cluster);
+                         offset);
 
     if (need_alloc) {
         /* Write out the whole new L2 table */
@@ -906,6 +912,51 @@  err:
     qed_aio_complete(acb, ret);
 }
 
+static void qed_aio_write_l2_update_cb(void *opaque, int ret)
+{
+    QEDAIOCB *acb = opaque;
+    qed_aio_write_l2_update(acb, ret, acb->cur_cluster);
+}
+
+/**
+ * Determine if we have a zero write to a block of clusters
+ *
+ * We validate that the write is aligned to a cluster boundary, and that it's
+ * a multiple of cluster size with all zeros.
+ */
+static bool qed_is_zero_write(QEDAIOCB *acb)
+{
+    BDRVQEDState *s = acb_to_s(acb);
+    int i;
+
+    if (!qed_offset_is_cluster_aligned(s, acb->cur_pos)) {
+        return false;
+    }
+
+    if (!qed_offset_is_cluster_aligned(s, acb->cur_qiov.size)) {
+        return false;
+    }
+
+    for (i = 0; i < acb->cur_qiov.niov; i++) {
+        struct iovec *iov = &acb->cur_qiov.iov[i];
+        uint64_t *v;
+        int j;
+
+        if ((iov->iov_len & 0x07)) {
+            return false;
+        }
+
+        v = iov->iov_base;
+        for (j = 0; j < iov->iov_len; j += sizeof(v[0])) {
+            if (v[j >> 3]) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
 /**
  * Flush new data clusters before updating the L2 table
  *
@@ -920,7 +971,7 @@  static void qed_aio_write_flush_before_l2_update(void *opaque, int ret)
     QEDAIOCB *acb = opaque;
     BDRVQEDState *s = acb_to_s(acb);
 
-    if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update, opaque)) {
+    if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) {
         qed_aio_complete(acb, -EIO);
     }
 }
@@ -950,7 +1001,7 @@  static void qed_aio_write_main(void *opaque, int ret)
         if (s->bs->backing_hd) {
             next_fn = qed_aio_write_flush_before_l2_update;
         } else {
-            next_fn = qed_aio_write_l2_update;
+            next_fn = qed_aio_write_l2_update_cb;
         }
     }
 
@@ -1016,6 +1067,18 @@  static bool qed_should_set_need_check(BDRVQEDState *s)
     return !(s->header.features & QED_F_NEED_CHECK);
 }
 
+static void qed_aio_write_zero_cluster(void *opaque, int ret)
+{
+    QEDAIOCB *acb = opaque;
+
+    if (ret) {
+        qed_aio_complete(acb, ret);
+        return;
+    }
+
+    qed_aio_write_l2_update(acb, 0, 1);
+}
+
 /**
  * Write new data cluster
  *
@@ -1027,6 +1090,7 @@  static bool qed_should_set_need_check(BDRVQEDState *s)
 static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
 {
     BDRVQEDState *s = acb_to_s(acb);
+    BlockDriverCompletionFunc *cb;
 
     /* Freeze this request if another allocating write is in progress */
     if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) {
@@ -1041,11 +1105,18 @@  static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
     acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters);
     qemu_iovec_copy(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len);
 
+    cb = qed_aio_write_prefill;
+
+    /* Zero write detection */
+    if (acb->check_zero_write && qed_is_zero_write(acb)) {
+        cb = qed_aio_write_zero_cluster;
+    }
+
     if (qed_should_set_need_check(s)) {
         s->header.features |= QED_F_NEED_CHECK;
-        qed_write_header(s, qed_aio_write_prefill, acb);
+        qed_write_header(s, cb, acb);
     } else {
-        qed_aio_write_prefill(acb, 0);
+        cb(acb, 0);
     }
 }
 
@@ -1116,11 +1187,11 @@  static void qed_copy_on_read_cb(void *opaque, int ret)
     BDRVQEDState *s = acb_to_s(acb);
     BlockDriverAIOCB *cor_acb;
 
-    cor_acb = bdrv_aio_writev(s->bs,
-                              acb->cur_pos / BDRV_SECTOR_SIZE,
-                              &acb->cur_qiov,
-                              acb->cur_qiov.size / BDRV_SECTOR_SIZE,
-                              qed_aio_next_io, acb);
+    cor_acb = qed_aio_writev_check(s->bs,
+                                   acb->cur_pos / BDRV_SECTOR_SIZE,
+                                   &acb->cur_qiov,
+                                   acb->cur_qiov.size / BDRV_SECTOR_SIZE,
+                                   qed_aio_next_io, acb);
     if (!cor_acb) {
         qed_aio_complete(acb, -EIO);
     }
@@ -1226,7 +1297,8 @@  static QEDAIOCB *qed_aio_setup(BlockDriverState *bs,
                                int64_t sector_num,
                                QEMUIOVector *qiov, int nb_sectors,
                                BlockDriverCompletionFunc *cb,
-                               void *opaque, bool is_write)
+                               void *opaque, bool is_write,
+                               bool check_zero_write)
 {
     QEDAIOCB *acb = qemu_aio_get(&qed_aio_pool, bs, cb, opaque);
 
@@ -1235,6 +1307,7 @@  static QEDAIOCB *qed_aio_setup(BlockDriverState *bs,
 
     acb->is_write = is_write;
     acb->finished = NULL;
+    acb->check_zero_write = check_zero_write;
     acb->qiov = qiov;
     acb->qiov_offset = 0;
     acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE;
@@ -1249,12 +1322,13 @@  static BlockDriverAIOCB *bdrv_qed_aio_setup(BlockDriverState *bs,
                                             int64_t sector_num,
                                             QEMUIOVector *qiov, int nb_sectors,
                                             BlockDriverCompletionFunc *cb,
-                                            void *opaque, bool is_write)
+                                            void *opaque, bool is_write,
+                                            bool check_zero_write)
 {
     QEDAIOCB *acb;
 
     acb = qed_aio_setup(bs, sector_num, qiov, nb_sectors,
-                        cb, opaque, is_write);
+                        cb, opaque, is_write, check_zero_write);
     /* Start request */
     qed_aio_next_io(acb, 0);
 
@@ -1268,7 +1342,7 @@  static BlockDriverAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs,
                                             void *opaque)
 {
     return bdrv_qed_aio_setup(bs, sector_num, qiov, nb_sectors,
-                              cb, opaque, false);
+                              cb, opaque, false, false);
 }
 
 static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
@@ -1278,7 +1352,21 @@  static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
                                              void *opaque)
 {
     return bdrv_qed_aio_setup(bs, sector_num, qiov, nb_sectors,
-                              cb, opaque, true);
+                              cb, opaque, true, false);
+}
+
+/**
+ * Perform a write with a zero-check.
+ */
+static BlockDriverAIOCB *qed_aio_writev_check(BlockDriverState *bs,
+                                              int64_t sector_num,
+                                              QEMUIOVector *qiov,
+                                              int nb_sectors,
+                                              BlockDriverCompletionFunc *cb,
+                                              void *opaque)
+{
+    return bdrv_qed_aio_setup(bs, sector_num, qiov, nb_sectors,
+                              cb, opaque, true, true);
 }
 
 typedef struct QEDStreamData {
@@ -1405,7 +1493,7 @@  static BlockDriverAIOCB *bdrv_qed_aio_stream(BlockDriverState *bs,
 
     acb = qed_aio_setup(bs, sector_num, qiov,
                         cluster_size / BDRV_SECTOR_SIZE,
-                        qed_aio_stream_cb, stream_data, false);
+                        qed_aio_stream_cb, stream_data, false, false);
     stream_data->acb = acb;
 
     qed_find_cluster(s, &acb->request, acb->cur_pos,
diff --git a/block/qed.h b/block/qed.h
index 845a80e..8e9e415 100644
--- a/block/qed.h
+++ b/block/qed.h
@@ -135,6 +135,7 @@  typedef struct QEDAIOCB {
     bool is_write;                  /* false - read, true - write */
     bool *finished;                 /* signal for cancel completion */
     uint64_t end_pos;               /* request end on block device, in bytes */
+    bool check_zero_write;          /* true - check blocks for zero write */
 
     /* User scatter-gather list */
     QEMUIOVector *qiov;