Patchwork [17/26] FVD: add impl of bdrv_flush() and bdrv_aio_flush()

login
register
mail settings
Submitter Chunqiang Tang
Date Feb. 25, 2011, 10:37 p.m.
Message ID <1298673486-3573-17-git-send-email-ctang@us.ibm.com>
Download mbox | patch
Permalink /patch/84587/
State New
Headers show

Comments

Chunqiang Tang - Feb. 25, 2011, 10:37 p.m.
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.

This patch adds FVD's implementation of the bdrv_flush() and bdrv_aio_flush()
interfaces.

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 block/fvd-flush.c       |  176 +++++++++++++++++++++++++++++++++++++-
 block/fvd-journal-buf.c |  218 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 390 insertions(+), 4 deletions(-)

Patch

diff --git a/block/fvd-flush.c b/block/fvd-flush.c
index 34bd5cb..6658d27 100644
--- a/block/fvd-flush.c
+++ b/block/fvd-flush.c
@@ -1,5 +1,5 @@ 
 /*
- * QEMU Fast Virtual Disk Format bdrv_flush() and bdrv_aio_flush()
+ * QEMU Fast Virtual Disk Format Misc Functions of BlockDriver Interface
  *
  * Copyright IBM, Corp. 2010
  *
@@ -11,14 +11,182 @@ 
  *
  */
 
+static void aio_wrapper_bh(void *opaque);
+static int bjnl_sync_flush(BlockDriverState * bs);
+static bool bjnl_clean_buf_on_aio_flush(BlockDriverState *bs,
+                              BlockDriverCompletionFunc * cb,
+                              void *opaque, BlockDriverAIOCB **p_acb);
+static BlockDriverAIOCB *fvd_aio_flush_start(BlockDriverState * bs,
+                              BlockDriverCompletionFunc * cb,
+                              void *opaque, FvdAIOCB *parent_acb);
+
+static int fvd_flush(BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+    int ret;
+
+    QDEBUG("fvd_flush() invoked\n");
+
+    if (s->metadata_err_prohibit_write) {
+        return -EIO;
+    }
+
+    if (!s->fvd_metadata->enable_write_cache) {
+        /* No need to flush since it uses O_DSYNC. */
+        return 0;
+    }
+
+    if (s->use_bjnl) {
+        return bjnl_sync_flush(bs);
+    }
+
+    /* Simply flush for unbuffered journal update. */
+    if ((ret = bdrv_flush(s->fvd_data))) {
+        return ret;
+    }
+    if (s->fvd_metadata == s->fvd_data) {
+        return 0;
+    }
+    return bdrv_flush(s->fvd_metadata);
+}
+
 static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
                                        BlockDriverCompletionFunc * cb,
                                        void *opaque)
 {
-    return NULL;
+    BDRVFvdState *s = bs->opaque;
+    BlockDriverAIOCB * pacb;
+    FvdAIOCB  *acb;
+
+    QDEBUG("fvd_aio_flush() invoked\n");
+
+    if (s->metadata_err_prohibit_write) {
+        return NULL;
+    }
+
+    if (!s->fvd_data->enable_write_cache) {
+        /* No need to flush since it uses O_DSYNC. Use a QEMUBH to invoke the
+         * callback. */
+
+        if (!(acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque))) {
+            return NULL;
+        }
+
+        acb->type = OP_WRAPPER;
+        acb->cancel_in_progress = false;
+        acb->wrapper.bh = qemu_bh_new(aio_wrapper_bh, acb);
+        qemu_bh_schedule(acb->wrapper.bh);
+        return &acb->common;
+    }
+
+    if (!s->use_bjnl) {
+        QDEBUG("FLUSH: start now for unbuffered journal update");
+        return fvd_aio_flush_start(bs, cb, opaque, NULL);
+    }
+
+    if (bjnl_clean_buf_on_aio_flush(bs, cb, opaque, &pacb)) {
+        /* Waiting for the journal buffer to be cleaned first. */
+        return pacb;
+    }
+
+    /* No buffered journal data. Start flush now. */
+    QDEBUG("FLUSH: start now as no buffered journal data");
+    return fvd_aio_flush_start(bs, cb, opaque, NULL);
+}
+
+static inline void finish_flush(FvdAIOCB * acb)
+{
+    QDEBUG("FLUSH: acb%llu-%p  finish_flush ret=%d\n",
+           acb->uuid, acb, acb->flush.ret);
+    acb->common.cb(acb->common.opaque, acb->flush.ret);
+    my_qemu_aio_release(acb);
 }
 
-static int fvd_flush(BlockDriverState * bs)
+static void flush_data_cb(void *opaque, int ret)
 {
-    return -ENOTSUP;
+    FvdAIOCB *acb = opaque;
+
+    if (acb->cancel_in_progress) {
+        return;
+    }
+
+    QDEBUG("FLUSH: acb%llu-%p  flush_data_cb ret=%d\n", acb->uuid, acb, ret);
+
+    if (acb->flush.ret == 0) {
+        acb->flush.ret = ret;
+    }
+
+    acb->flush.data_acb = NULL;
+    acb->flush.num_finished++;
+    if (acb->flush.num_finished == 2) {
+        finish_flush(acb);
+    }
+}
+
+static void flush_metadata_cb(void *opaque, int ret)
+{
+    FvdAIOCB *acb = opaque;
+
+    if (acb->cancel_in_progress) {
+        return;
+    }
+
+    QDEBUG("FLUSH: acb%llu-%p  flush_metadata_cb ret=%d\n",
+           acb->uuid, acb, ret);
+
+    if (acb->flush.ret == 0) {
+        acb->flush.ret = ret;
+    }
+
+    acb->flush.metadata_acb = NULL;
+    acb->flush.num_finished++;
+    if (acb->flush.num_finished == 2) {
+        finish_flush(acb);
+    }
+}
+
+static BlockDriverAIOCB *fvd_aio_flush_start(BlockDriverState * bs,
+                                       BlockDriverCompletionFunc * cb,
+                                       void *opaque, FvdAIOCB *parent_acb)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdAIOCB  *acb;
+
+    if (s->fvd_data == s->fvd_metadata) {
+        if (parent_acb) {
+            QDEBUG("FLUSH: acb%llu-%p  started.\n",parent_acb->uuid,parent_acb);
+        }
+        return bdrv_aio_flush(s->fvd_metadata, cb, opaque);
+    }
+
+    acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+    COPY_UUID(acb, parent_acb); /* UUID helps debugging. */
+
+    /* fvd_data and fvd_metadata are different. Need to flush both. The order
+     * is not important. If (cache != writethrough && bitmap_updated), a flush
+     * on fvd_data must have already been performed by write_journal_buf(). */
+
+    acb->type = OP_FLUSH;
+    acb->cancel_in_progress = false;
+    acb->flush.num_finished = 0;
+    acb->flush.ret = 0;
+    acb->flush.data_acb = bdrv_aio_flush(s->fvd_data, flush_data_cb, acb);
+    if (!acb->flush.data_acb) {
+        my_qemu_aio_release(acb);
+        return NULL;
+    }
+
+    acb->flush.metadata_acb = bdrv_aio_flush(s->fvd_metadata,
+                                             flush_metadata_cb, acb);
+    if (!acb->flush.metadata_acb) {
+        bdrv_aio_cancel(acb->flush.data_acb);
+        my_qemu_aio_release(acb);
+        return NULL;
+    }
+
+    QDEBUG("FLUSH: acb%llu-%p  started.\n", acb->uuid, acb);
+    return &acb->common;
 }
diff --git a/block/fvd-journal-buf.c b/block/fvd-journal-buf.c
index b4077ce..e99a585 100644
--- a/block/fvd-journal-buf.c
+++ b/block/fvd-journal-buf.c
@@ -23,6 +23,48 @@ 
 static inline int bjnl_write_buf(FvdAIOCB *acb);
 static void bjnl_send_current_buf_to_write_queue(BlockDriverState *bs);
 
+/* Return false if no buffered journal data. Invoked by fvd_aio_flush(). */
+static bool bjnl_clean_buf_on_aio_flush(BlockDriverState *bs,
+                              BlockDriverCompletionFunc * cb,
+                              void *opaque, BlockDriverAIOCB **p_acb)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdAIOCB *acb;
+
+    if (!s->bjnl.buf || s->bjnl.buf_used == 0) {
+        /* The current journal buffer is empty. */
+
+        if (QTAILQ_EMPTY(&s->bjnl.queued_bufs)) {
+            return false; /* Indicate no previously buffered journal data. */
+        }
+    } else {
+        QDEBUG("JOURNAL: bjnl_clean_buf_on_aio_flush invoke "
+               "bjnl_send_current_buf_to_write_queue\n");
+        bjnl_send_current_buf_to_write_queue(bs);
+    }
+
+    /* Append an acb at the tail of bjnl.queued_bufs to invoke the aio_flush
+     * callback after all previous pending journal writes finish. See
+     * bjnl_write_next_buf() -> bjnl_write_buf(). */
+
+    acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        *p_acb = NULL; /* Indicate failure. */
+        return true;
+    }
+
+    acb->type = OP_BJNL_FLUSH;
+    acb->cancel_in_progress = false;
+    acb->jcb.iov.iov_base = NULL; /* Indicate no data. */
+    acb->jcb.hd_acb = NULL;
+    acb->jcb.bitmap_updated = false;
+    QTAILQ_INSERT_TAIL(&s->bjnl.queued_bufs, acb, jcb.bjnl_next_queued_buf);
+    *p_acb = &acb->common;
+
+    QDEBUG("JOURNAL: inserted OP_BJNL_FLUSH acb%llu-%p\n", acb->uuid, acb);
+    return true;
+}
+
 static inline void bjnl_finish_write_buf(FvdAIOCB *acb, int ret)
 {
     ASSERT (acb->type == OP_BJNL_BUF_WRITE);
@@ -65,6 +107,30 @@  static inline void bjnl_aio_flush_cb(void *opaque, int ret)
     my_qemu_aio_release(acb);
 }
 
+/* This acb is inserted by bjnl_clean_buf_on_aio_flush() on behalf of a pending
+ * bdrv_aio_flush(). */
+static inline void bjnl_handle_aio_flush(FvdAIOCB *acb)
+{
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    QTAILQ_REMOVE(&s->bjnl.queued_bufs, acb, jcb.bjnl_next_queued_buf);
+
+    if (!s->metadata_err_prohibit_write) {
+        /* Buffered data have been written to journal. Now start flush. */
+        QDEBUG("JOURNAL: bjnl_start_flush for acb%llu-%p\n", acb->uuid, acb);
+        acb->jcb.hd_acb = fvd_aio_flush_start(bs, bjnl_aio_flush_cb, acb, acb);
+        if (acb->jcb.hd_acb) {
+            return;
+        }
+    }
+
+    QDEBUG("JOURNAL: bjnl_handle_aio_flush err acb%llu-%p\n", acb->uuid, acb);
+    /* Failed. Invoke aio_flush callback. */
+    acb->common.cb(acb->common.opaque, -EIO);
+    my_qemu_aio_release(acb);
+}
+
 static inline void bjnl_write_buf_cb(void *opaque, int ret)
 {
     FvdAIOCB *acb = (FvdAIOCB *) opaque;
@@ -153,6 +219,14 @@  static inline int bjnl_write_buf(FvdAIOCB *acb)
 
     QDEBUG("JOURNAL: bjnl_write_buf acb%llu-%p\n", acb->uuid, acb);
 
+    if (acb->type == OP_BJNL_FLUSH) {
+        bjnl_handle_aio_flush(acb);
+
+        /* Return -1 to tell bjnl_write_next_buf() to move on to the next
+         * buffer write as no buffered journal data are being written. */
+        return -1;
+    }
+
     if (!acb->jcb.bitmap_updated) {
         return bjnl_write_buf_start(acb);
     }
@@ -313,6 +387,150 @@  static void bjnl_clean_buf_timer_cb(BlockDriverState * bs)
     s->bjnl.timer_scheduled = false;
 }
 
+/* Perform a synchronous flush. Invoked by fvd_close() and fvd_flush(). */
+static int bjnl_sync_flush(BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdAIOCB *acb, *a;
+    int ret = 0;
+    size_t buf_size;
+    uint8_t *p, *buf = NULL;
+    bool bitmap_updated = false;
+    int nb_sectors;
+    int64_t journal_sec;
+
+    /* Calculate the total buffered metadata updates. Check the current buffer
+     * first. */
+    if (!s->bjnl.buf) {
+        buf_size = 0;
+    } else if (s->bjnl.buf_used == 0) {
+        buf_size = 0;
+    } else {
+        if (s->bjnl.buf_used < s->bjnl.buf_size) {
+            /* Mark the end of the buffer as EMPTY_JRECORD. */
+            *((uint32_t*)(s->bjnl.buf + s->bjnl.buf_used)) = EMPTY_JRECORD;
+        }
+        buf_size = s->bjnl.buf_used = ROUND_UP(s->bjnl.buf_used, 512);
+        bitmap_updated = s->bjnl.buf_contains_bitmap_update;
+    }
+
+    /* Go through the queued buffers. */
+    acb = QTAILQ_FIRST(&s->bjnl.queued_bufs);
+    if (acb) {
+        if (acb->jcb.hd_acb) {
+            /* The first acb is the ongoing operation. Cancel and re-do it
+             * synchronously below. */
+            QDEBUG("JOURNAL: bjnl_sync_flush cancel ongoing buf_write "
+                   "acb%llu-%p\n", acb->uuid, acb);
+            bdrv_aio_cancel(acb->jcb.hd_acb);
+        }
+
+        /* Calculate buf_size. */
+        while (acb) {
+            if (acb->type == OP_BJNL_BUF_WRITE) {
+                buf_size += acb->jcb.iov.iov_len;
+                if (acb->jcb.bitmap_updated) {
+                    bitmap_updated = true;
+                }
+            }
+            acb = QTAILQ_NEXT(acb, jcb.bjnl_next_queued_buf);
+        }
+    }
+
+    if (buf_size == 0) {
+        QDEBUG("JOURNAL: bjnl_sync_flush no_data\n");
+        goto done; /* No buffered metadata updates. */
+    }
+
+    if (bitmap_updated) {
+        /* Need a flush to ensure the correct semantics of copy-on-write in
+         * the event of a host crash. */
+        QDEBUG("JOURNAL: bjnl_sync_flush bitmap_updated flush_fvd_data\n");
+        if ((ret = bdrv_flush(s->fvd_data))) {
+            goto cleanup;
+        }
+    }
+
+    /* Allocate journal sectors. */
+    ASSERT(buf_size % 512 == 0);
+    nb_sectors = buf_size / 512;
+    if (s->next_journal_sector + nb_sectors > s->journal_size) {
+        QDEBUG("JOURNAL: bjnl_sync_flush recycle_journal\n");
+        ret = recycle_journal(bs);
+        /* Journal recycle writes out the entire bitmap and table. Therefore,
+         * there is no need to write buffered metadata updates to journal. */
+        goto done;
+    }
+    journal_sec = s->next_journal_sector;
+    s->next_journal_sector += nb_sectors;
+
+    /* Copy all metadata updates into one buffer. */
+    p = buf = my_qemu_blockalign(s->fvd_metadata, buf_size);
+    acb = QTAILQ_FIRST(&s->bjnl.queued_bufs);
+    while (acb) {
+        if (acb->type == OP_BJNL_BUF_WRITE) {
+            QDEBUG("JOURNAL: bjnl_sync_flush takes care buf_write acb%llu-%p\n",
+                   acb->uuid, acb);
+            ASSERT(acb->jcb.iov.iov_len > 0);
+            memcpy(p, acb->jcb.iov.iov_base, acb->jcb.iov.iov_len);
+            PRINT_JRECORDS(p, acb->jcb.iov.iov_len);
+            p += acb->jcb.iov.iov_len;
+        }
+        acb = QTAILQ_NEXT(acb, jcb.bjnl_next_queued_buf);
+    }
+
+    if (s->bjnl.buf && s->bjnl.buf_used > 0) {
+        /* Copy the current buffer. */
+        memcpy(p, s->bjnl.buf, s->bjnl.buf_used);
+        PRINT_JRECORDS(p, s->bjnl.buf_used);
+    }
+
+    /* Write all metadata updates synchronously. */
+    QDEBUG("JOURNAL: bjnl_sync_flush write_buffer\n");
+    if ((ret=bdrv_write(s->fvd_metadata, s->journal_offset + journal_sec,
+                        buf, nb_sectors)) < 0) {
+        goto cleanup;
+    }
+
+done:
+    /* Flush finally. */
+    QDEBUG("JOURNAL: bjnl_sync_flush do final flush\n");
+    if (s->fvd_data != s->fvd_metadata) {
+        if ((ret = bdrv_flush(s->fvd_data)) != 0) {
+            goto cleanup;
+        }
+    }
+    ret = bdrv_flush(s->fvd_metadata);
+
+cleanup:
+    if (buf) {
+        my_qemu_vfree(buf);
+    }
+    if (s->bjnl.buf) {
+        my_qemu_vfree (s->bjnl.buf);
+        s->bjnl.buf = NULL;
+    }
+
+    acb = QTAILQ_FIRST(&s->bjnl.queued_bufs);
+    QTAILQ_INIT(&s->bjnl.queued_bufs);
+    while (acb) {
+        if (acb->type == OP_BJNL_BUF_WRITE) {
+            my_qemu_vfree(acb->jcb.iov.iov_base);
+        } else {
+            ASSERT(acb->type == OP_BJNL_FLUSH);
+            /* Invoke the callback for bdrv_aio_flush(). */
+            QDEBUG("JOURNAL: aio_flush acb%llu-%p finished by sync_flush\n",
+                   acb->uuid, acb);
+            acb->common.cb(acb->common.opaque, ret);
+        }
+        a = acb;
+        acb = QTAILQ_NEXT(acb, jcb.bjnl_next_queued_buf);
+        my_qemu_aio_release(a);
+    }
+
+    return ret;
+}
+
 #ifdef ENABLE_QDEBUG
 static void print_jrecords(const uint8_t *sector, size_t len)
 {