Patchwork [11/26] FVD: add impl of interface bdrv_aio_writev()

login
register
mail settings
Submitter Chunqiang Tang
Date Feb. 25, 2011, 10:37 p.m.
Message ID <1298673486-3573-11-git-send-email-ctang@us.ibm.com>
Download mbox | patch
Permalink /patch/84620/
State New
Headers show

Comments

Chunqiang Tang - Feb. 25, 2011, 10:37 p.m.
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.

This patch adds FVD's implementation of the bdrv_aio_writev() interface. It
supports copy-on-write in FVD.

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 block/fvd-bitmap.c  |  150 ++++++++++++++++
 block/fvd-journal.c |    4 +
 block/fvd-store.c   |   20 +++
 block/fvd-write.c   |  468 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 block/fvd.c         |    4 +-
 block/fvd.h         |    1 +
 6 files changed, 645 insertions(+), 2 deletions(-)
 create mode 100644 block/fvd-bitmap.c
 create mode 100644 block/fvd-store.c

Patch

diff --git a/block/fvd-bitmap.c b/block/fvd-bitmap.c
new file mode 100644
index 0000000..7e96201
--- /dev/null
+++ b/block/fvd-bitmap.c
@@ -0,0 +1,150 @@ 
+/*
+ * QEMU Fast Virtual Disk Format Utility Functions for Bitmap
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/*
+ * Look up the stale_bitmap bit of the block that contains 'sector_num'.
+ *
+ * Returns true only when the sector lies below s->base_img_sectors and the
+ * bit of its block is clear; returns false when the bit is set or when the
+ * sector is at or beyond s->base_img_sectors.
+ */
+static inline bool stale_bitmap_show_sector_in_base_img(int64_t sector_num,
+                                                        const BDRVFvdState * s)
+{
+    if (sector_num >= s->base_img_sectors) {
+        return false;
+    }
+
+    /* One bit per block, eight blocks per bitmap byte. */
+    const int64_t blk = sector_num / s->block_size;
+    const uint8_t bit = (uint8_t) (0x01 << (blk % 8));
+
+    return (s->stale_bitmap[blk / 8] & bit) == 0;
+}
+
+/*
+ * Look up the fresh_bitmap bit of the block that contains 'sector_num'.
+ *
+ * Returns true only when the sector lies below s->base_img_sectors and the
+ * bit of its block is clear; returns false when the bit is set or when the
+ * sector is at or beyond s->base_img_sectors.
+ */
+static inline bool fresh_bitmap_show_sector_in_base_img(int64_t sector_num,
+                                                        const BDRVFvdState * s)
+{
+    if (sector_num >= s->base_img_sectors) {
+        return false;
+    }
+
+    /* One bit per block, eight blocks per bitmap byte. */
+    const int64_t blk = sector_num / s->block_size;
+    const uint8_t bit = (uint8_t) (0x01 << (blk % 8));
+
+    return (s->fresh_bitmap[blk / 8] & bit) == 0;
+}
+
+/*
+ * Set the fresh_bitmap bit of every block touched by the sector range
+ * [sector_num, sector_num + nb_sectors), after clamping the end of the range
+ * to s->base_img_sectors. Does nothing when the range starts at or beyond
+ * s->base_img_sectors. A bitmap byte is only stored when its bit was clear.
+ */
+static inline void update_fresh_bitmap(int64_t sector_num, int nb_sectors,
+                                       const BDRVFvdState * s)
+{
+    if (sector_num >= s->base_img_sectors) {
+        return;
+    }
+
+    int64_t range_end = sector_num + nb_sectors;
+    if (range_end > s->base_img_sectors) {
+        range_end = s->base_img_sectors;
+    }
+
+    const int64_t last_blk = (range_end - 1) / s->block_size;
+    int64_t blk;
+
+    for (blk = sector_num / s->block_size; blk <= last_blk; blk++) {
+        const int64_t byte_idx = blk / 8;
+        const uint8_t bit = (uint8_t) (0x01 << (blk % 8));
+
+        if ((s->fresh_bitmap[byte_idx] & bit) == 0) {
+            s->fresh_bitmap[byte_idx] |= bit;
+        }
+    }
+}
+
+/*
+ * Same lookup as the stale/fresh variants, but against a caller-supplied
+ * bitmap buffer whose byte 0 corresponds to byte 'bitmap_offset' of the
+ * per-block bitmap (the byte index is block_num / 8 - bitmap_offset).
+ *
+ * Returns true only when the sector lies below s->base_img_sectors and the
+ * bit of its block is clear.
+ */
+static inline bool bitmap_show_sector_in_base_img(int64_t sector_num,
+                                                  const BDRVFvdState * s,
+                                                  int bitmap_offset,
+                                                  uint8_t * bitmap)
+{
+    if (sector_num >= s->base_img_sectors) {
+        return false;
+    }
+
+    const int64_t blk = sector_num / s->block_size;
+    const int64_t byte_idx = blk / 8 - bitmap_offset;
+    const uint8_t bit = (uint8_t) (0x01 << (blk % 8));
+
+    return (bitmap[byte_idx] & bit) == 0;
+}
+
+/*
+ * Report whether any block touched by the request described by 'acb' still
+ * has its stale_bitmap bit clear.
+ *
+ * The request range [acb->sector_num, acb->sector_num + acb->nb_sectors) is
+ * clamped to s->base_img_sectors before it is converted to blocks.
+ *
+ * Returns true as soon as one clear bit is found, false otherwise. A request
+ * that starts at or beyond s->base_img_sectors never needs an update.
+ */
+static inline bool stale_bitmap_need_update(FvdAIOCB * acb)
+{
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    /* Same guard as update_fresh_bitmap_and_check_stale_bitmap(). Without
+     * it, a request that starts past an unaligned end of the base image
+     * would test the bit of the last base-image block, and with
+     * base_img_sectors == 0 the expression (end - 1) / block_size below
+     * evaluates to 0 and the bitmap would be read for block 0. */
+    if (acb->sector_num >= s->base_img_sectors) {
+        return false;
+    }
+
+    int64_t end = acb->sector_num + acb->nb_sectors;
+
+    if (end > s->base_img_sectors) {
+        end = s->base_img_sectors;
+    }
+    int64_t block_end = (end - 1) / s->block_size;
+    int64_t block_num = acb->sector_num / s->block_size;
+
+    for (; block_num <= block_end; block_num++) {
+        int64_t bitmap_byte_offset = block_num / 8;
+        uint8_t bitmap_bit_offset = block_num % 8;
+        uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+        uint8_t b = s->stale_bitmap[bitmap_byte_offset];
+        if (!(b & mask)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/*
+ * For every block touched by the request described by 'acb' whose
+ * stale_bitmap bit is still clear, make sure the matching fresh_bitmap bit
+ * is set.
+ *
+ * The request range is clamped to s->base_img_sectors first; a request that
+ * starts at or beyond s->base_img_sectors changes nothing.
+ *
+ * Returns true if stale_bitmap needs an update, i.e. at least one touched
+ * block had a clear stale_bitmap bit.
+ */
+static bool update_fresh_bitmap_and_check_stale_bitmap(FvdAIOCB * acb)
+{
+    BDRVFvdState *s = acb->common.bs->opaque;
+
+    if (acb->sector_num >= s->base_img_sectors) {
+        return false;
+    }
+
+    int64_t range_end = acb->sector_num + acb->nb_sectors;
+    if (range_end > s->base_img_sectors) {
+        range_end = s->base_img_sectors;
+    }
+
+    const int64_t last_blk = (range_end - 1) / s->block_size;
+    bool stale_is_behind = false;
+    int64_t blk;
+
+    for (blk = acb->sector_num / s->block_size; blk <= last_blk; blk++) {
+        const int64_t byte_idx = blk / 8;
+        const uint8_t bit = (uint8_t) (0x01 << (blk % 8));
+
+        /* A bit that is set in stale_bitmap must already be set in
+         * fresh_bitmap, so only blocks with a clear stale bit matter. */
+        if ((s->stale_bitmap[byte_idx] & bit) == 0) {
+            stale_is_behind = true;
+            if ((s->fresh_bitmap[byte_idx] & bit) == 0) {
+                s->fresh_bitmap[byte_idx] |= bit;
+            }
+        }
+    }
+
+    return stale_is_behind;
+}
diff --git a/block/fvd-journal.c b/block/fvd-journal.c
index 5ba34bd..2edfc70 100644
--- a/block/fvd-journal.c
+++ b/block/fvd-journal.c
@@ -28,6 +28,10 @@  static int init_journal(int read_only, BlockDriverState * bs,
     return -ENOTSUP;
 }
 
+/*
+ * Placeholder added by this patch: the body is empty, so neither 'acb' nor
+ * 'update_bitmap' is used and nothing is written.
+ *
+ * NOTE(review): write_data_cb() in fvd-write.c calls this instead of
+ * finish_write() when metadata must be updated, so with this empty body the
+ * request's completion callback is not invoked on that path -- presumably a
+ * later patch in the series fills this in; confirm.
+ */
+static void write_metadata_to_journal(struct FvdAIOCB *acb, bool update_bitmap)
+{
+}
+
 void fvd_emulate_host_crash(bool cond)
 {
     emulate_host_crash = cond;
diff --git a/block/fvd-store.c b/block/fvd-store.c
new file mode 100644
index 0000000..85e45d4
--- /dev/null
+++ b/block/fvd-store.c
@@ -0,0 +1,20 @@ 
+/*
+ * QEMU Fast Virtual Disk Format Store Data in Compact Image
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/*
+ * Placeholder added by this patch: ignores all of its arguments and always
+ * returns NULL.
+ *
+ * NOTE(review): the callers in fvd-write.c treat a NULL return as a failed
+ * submission, so with this stub every write is reported as failed --
+ * presumably a later patch in the series provides the real implementation;
+ * confirm.
+ */
+static inline BlockDriverAIOCB *store_data(int soft_write,
+                FvdAIOCB * parent_acb, BlockDriverState * bs,
+                int64_t sector_num, QEMUIOVector * orig_qiov, int nb_sectors,
+                BlockDriverCompletionFunc * cb, void *opaque)
+{
+    return NULL;
+}
diff --git a/block/fvd-write.c b/block/fvd-write.c
index a736a37..f0580d4 100644
--- a/block/fvd-write.c
+++ b/block/fvd-write.c
@@ -11,11 +11,477 @@ 
  *
  */
 
+/* Forward declarations of functions that are used in this file before their
+ * definitions appear. */
+static void write_metadata_to_journal(struct FvdAIOCB *acb, bool update_bitmap);
+static int do_aio_write(struct FvdAIOCB *acb);
+static void restart_dependent_writes(struct FvdAIOCB *acb);
+static void free_write_resource(struct FvdAIOCB *acb);
+static inline BlockDriverAIOCB *store_data(int soft_write,
+                FvdAIOCB * parent_acb, BlockDriverState * bs,
+                int64_t sector_num, QEMUIOVector * orig_qiov, int nb_sectors,
+                BlockDriverCompletionFunc * cb, void *opaque);
+
+/*
+ * Resize the FVD data file (s->fvd_data) to
+ * s->data_offset * 512 + s->virtual_disk_size bytes and record in
+ * s->data_region_prepared that this has been done.
+ *
+ * NOTE(review): the result of bdrv_truncate() is not examined, so the flag
+ * is set even when the resize did not succeed -- confirm this is intended.
+ */
+static inline void init_data_region(BDRVFvdState * s)
+{
+    const int64_t data_file_bytes =
+        s->data_offset * 512 + s->virtual_disk_size;
+
+    (void) bdrv_truncate(s->fvd_data, data_file_bytes);
+    s->data_region_prepared = true;
+}
+
+/*
+ * FVD's asynchronous vectored write entry point.
+ *
+ * Requests that need no bitmap update (prefetching has finished, the request
+ * starts at or beyond the base image, or stale_bitmap shows no touched block
+ * in the base image) are handed straight to store_data() and its result is
+ * returned. All other requests get an FvdAIOCB and go through
+ * do_aio_write(); &acb->common is returned for those.
+ *
+ * Returns NULL when s->metadata_err_prohibit_write is set, when no FvdAIOCB
+ * could be obtained, or when do_aio_write() fails.
+ */
 static BlockDriverAIOCB *fvd_aio_writev(BlockDriverState * bs,
                                         int64_t sector_num,
                                         QEMUIOVector * qiov, int nb_sectors,
                                         BlockDriverCompletionFunc * cb,
                                         void *opaque)
 {
-    return NULL;
+    BDRVFvdState *s = bs->opaque;
+    FvdAIOCB *acb;
+
+    TRACE_REQUEST(true, sector_num, nb_sectors);
+
+    if (s->metadata_err_prohibit_write) {
+        return NULL;
+    }
+
+    /* Size the data file once; init_data_region() sets the flag. */
+    if (!s->data_region_prepared) {
+        init_data_region(s);
+    }
+
+    if (s->prefetch_state == PREFETCH_STATE_FINISHED
+        || sector_num >= s->base_img_sectors) {
+        /* This is an  efficient case. See Section 3.3.5 of the FVD-cow paper.
+         * This also covers the case of no base image. */
+        return store_data(false, NULL, bs, sector_num, qiov,
+                          nb_sectors, cb, opaque);
+    }
+
+    /* Check if all requested sectors are in the FVD data file. One sector
+     * per block is probed, from the block holding the first sector to the
+     * block holding the last sector of the request. */
+    int64_t sec = ROUND_DOWN(sector_num, s->block_size);
+    int64_t sec_in_last_block = ROUND_DOWN(sector_num + nb_sectors - 1,
+                                           s->block_size);
+    do {
+        if (stale_bitmap_show_sector_in_base_img(sec, s)) {
+            goto slow_path;
+        }
+        sec += s->block_size;
+    } while (sec <= sec_in_last_block);
+
+    /* This is the fast path, as all requested data are in the FVD data file
+     * and no need to update the bitmap. */
+    return store_data(false, NULL, bs, sector_num, qiov,
+                      nb_sectors, cb, opaque);
+
+slow_path:
+    /* At least one touched block is still marked as being in the base image
+     * by stale_bitmap, so the request needs its own FvdAIOCB. */
+    acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+
+    /* Reset all per-request state. A NULL le_prev is what
+     * free_write_resource() tests to tell that the acb is not on a list, and
+     * NULL cow_buf / iov_base mean there is nothing to free. */
+    acb->type = OP_WRITE;
+    acb->cancel_in_progress = false;
+    acb->sector_num = sector_num;
+    acb->nb_sectors = nb_sectors;
+    acb->write.ret = 0;
+    acb->write.update_table = false;
+    acb->write.qiov = qiov;
+    acb->write.hd_acb = NULL;
+    acb->write.cow_buf = NULL;
+    acb->copy_lock.next.le_prev = NULL;
+    acb->write.next_write_lock.le_prev = NULL;
+    acb->write.next_dependent_write.le_prev = NULL;
+    acb->jcb.iov.iov_base = NULL;
+    acb->jcb.hd_acb = NULL;
+    acb->jcb.ujnl_next_wait4_recycle.le_prev = NULL;
+    QLIST_INIT(&acb->copy_lock.dependent_writes);
+
+    QDEBUG("WRITE: acb%llu-%p  start  sector_num=%" PRId64 " nb_sectors=%d\n",
+           acb->uuid, acb, acb->sector_num, acb->nb_sectors);
+
+    /* do_aio_write() returns 0 both when I/O was submitted and when the
+     * request was parked behind a conflicting copy lock. */
+    if (do_aio_write(acb) < 0) {
+        my_qemu_aio_release(acb);
+        return NULL;
+    }
+#ifdef FVD_DEBUG
+    /* Debug-only counter; free_write_resource() decrements it. */
+    pending_local_writes++;
+#endif
+    return &acb->common;
+}
+
+/*
+ * Tear down a write request: take it off the write-lock and copy-lock lists
+ * if it is still on them, restart the writes that were waiting on its copy
+ * lock, free its copy-on-write and journal buffers if allocated, and release
+ * the acb itself.
+ */
+static void free_write_resource(FvdAIOCB * acb)
+{
+    /* A non-NULL le_prev means the acb is still linked into that list. */
+    if (acb->write.next_write_lock.le_prev) {
+        QLIST_REMOVE(acb, write.next_write_lock);
+    }
+    if (acb->copy_lock.next.le_prev) {
+        QLIST_REMOVE(acb, copy_lock.next);
+        /* Must run before the acb is released: it walks
+         * acb->copy_lock.dependent_writes. */
+        restart_dependent_writes(acb);
+    }
+    if (acb->write.cow_buf) {
+        my_qemu_vfree(acb->write.cow_buf);
+    }
+    if (acb->jcb.iov.iov_base != NULL) {
+        my_qemu_vfree(acb->jcb.iov.iov_base);
+    }
+
+    my_qemu_aio_release(acb);
+
+#ifdef FVD_DEBUG
+    /* Balances the increment done in fvd_aio_writev(). */
+    pending_local_writes--;
+#endif
+}
+
+/*
+ * Complete a write request: report 'ret' to the caller through the
+ * completion callback stored in acb->common, then free everything the
+ * request holds (including the acb itself) via free_write_resource().
+ * 'acb' must not be used after this returns.
+ */
+static inline void finish_write(FvdAIOCB * acb, int ret)
+{
+    QDEBUG("WRITE: acb%llu-%p  completely_finished ret=%d\n", acb->uuid, acb,
+           ret);
+    acb->common.cb(acb->common.opaque, ret);
+    free_write_resource(acb);
+}
+
+/*
+ * Completion callback handed to store_data() for the data write of a
+ * request ('opaque' is the FvdAIOCB).
+ *
+ * Does nothing while the request is being cancelled. On error the request is
+ * finished with 'ret'. On success it decides whether metadata has to be
+ * written to the journal (bitmap update and/or table update) or whether the
+ * request can be finished right away.
+ */
+static void write_data_cb(void *opaque, int ret)
+{
+    FvdAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    if (acb->cancel_in_progress) {
+        return;
+    }
+
+    acb->write.ret = ret;
+    acb->write.hd_acb = NULL;
+
+    if (ret != 0) {
+        QDEBUG("WRITE: acb%llu-%p  write_data_cb error ret=%d\n",
+               acb->uuid, acb, ret);
+        finish_write(acb, ret);
+        return;
+    }
+
+    QDEBUG("WRITE: acb%llu-%p  write_data_cb\n", acb->uuid, acb);
+
+    /* Figure out whether to update metadata or not. */
+    if (s->fresh_bitmap == s->stale_bitmap) {
+        /* Neither copy_on_read nor prefetching is enabled. Cannot update
+         * fresh_bitmap until the on-disk metadata is updated. */
+        if (stale_bitmap_need_update(acb)) {
+            write_metadata_to_journal(acb, true);
+        } else if (acb->write.update_table) {
+            write_metadata_to_journal(acb, false);
+        } else {
+            finish_write(acb, ret);     /* No need to update metadata. */
+        }
+
+        /* In this branch the locks stay held; free_write_resource()
+         * releases them when the request is finished. */
+        return;
+    }
+
+    /* stale_bitmap and fresh_bitmap are different. Update fresh_bitmap now
+     * and stale_bitmap will be updated after on-disk metadata are updated. */
+    bool bitmap_need_update = update_fresh_bitmap_and_check_stale_bitmap(acb);
+
+    /* Release lock on data now since fresh_bitmap has been updated. The
+     * le_prev fields are cleared (here and in restart_dependent_writes())
+     * so that free_write_resource() does not unlink the acb a second
+     * time. */
+    QLIST_REMOVE(acb, write.next_write_lock);
+    acb->write.next_write_lock.le_prev = NULL;
+    if (acb->copy_lock.next.le_prev) {
+        QLIST_REMOVE(acb, copy_lock.next);
+        restart_dependent_writes(acb);
+    }
+
+    if (bitmap_need_update) {
+        write_metadata_to_journal(acb, true);
+    } else if (acb->write.update_table) {
+        write_metadata_to_journal(acb, false);
+    } else {
+        finish_write(acb, ret);
+    }
+}
+
+/*
+ * Completion callback for the base-image read that do_aio_write() issues
+ * for copy-on-write ('opaque' is the FvdAIOCB).
+ *
+ * Does nothing while the request is being cancelled. A failed read finishes
+ * the request with 'ret'. Otherwise the merged vector prepared by
+ * do_aio_write() (acb->write.cow_qiov, starting at
+ * acb->write.cow_start_sector) is submitted through store_data(); if that
+ * submission returns NULL the request is finished with -EIO.
+ */
+static void read_backing_for_copy_on_write_cb(void *opaque, int ret)
+{
+    FvdAIOCB *acb = opaque;
+
+    if (acb->cancel_in_progress) {
+        return;
+    }
+
+    if (ret != 0) {
+        QDEBUG("WRITE: acb%llu-%p  read_backing with error "
+               "ret=%d\n", acb->uuid, acb, ret);
+        finish_write(acb, ret);
+        return;
+    }
+
+    QDEBUG("WRITE: acb%llu-%p  "
+           "finish_read_from_backing_and_start_write_data\n",
+           acb->uuid, acb);
+
+    QEMUIOVector *merged_qiov = acb->write.cow_qiov;
+
+    acb->write.hd_acb = store_data(false, acb, acb->common.bs,
+                                   acb->write.cow_start_sector,
+                                   merged_qiov,
+                                   merged_qiov->size / 512,
+                                   write_data_cb, acb);
+    if (acb->write.hd_acb == NULL) {
+        finish_write(acb, -EIO);
+    }
+}
+
+/*
+ * Start (or park) the write request described by 'acb'.
+ *
+ * If the request overlaps a range on s->copy_locks it is queued on that
+ * lock holder's dependent_writes list and 0 is returned without submitting
+ * any I/O; restart_dependent_writes() calls this function again later.
+ *
+ * Otherwise fresh_bitmap decides whether the partially written first and/or
+ * last block must first be read from the base image (cases 1-3) or whether
+ * the data can be written directly (case 4). In cases 1-3 a single buffer
+ * (acb->write.cow_buf) holds the data read from the base image followed by
+ * the two QEMUIOVector headers and their iovec arrays, and the acb is put on
+ * s->copy_locks.
+ *
+ * Returns 0 when the request was parked or its I/O was submitted (the acb is
+ * then also on s->write_locks), or -EIO when the submission returned NULL;
+ * in the latter case cow_buf has been freed but the acb has not been
+ * released.
+ */
+static int do_aio_write(FvdAIOCB * acb)
+{
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    /* Calculate the data region that needs to be locked. */
+    const int64_t sector_end = acb->sector_num + acb->nb_sectors;
+    const int64_t block_begin = ROUND_DOWN(acb->sector_num, s->block_size);
+    int64_t block_end = ROUND_UP(sector_end, s->block_size);
+
+    /* Check for conflicting copy-on-reads. */
+    FvdAIOCB *old;
+    QLIST_FOREACH(old, &s->copy_locks, copy_lock.next) {
+        if (old->copy_lock.end > acb->sector_num &&
+            sector_end > old->copy_lock.begin) {
+            QLIST_INSERT_HEAD(&old->copy_lock.dependent_writes, acb,
+                              write.next_dependent_write);
+            QDEBUG("WRITE: acb%llu-%p  put_on_hold_due_to_data_conflict "
+                   "with %s acb%llu-%p\n", acb->uuid, acb,
+                   old->type == OP_WRITE ? "write" : "copy_on_read",
+                   old->uuid, old);
+            /* Parked, not failed: no I/O has been submitted yet. */
+            return 0;
+        }
+    }
+
+    /* No conflict. check if this write updates partial blocks and need to
+     * read those blocks from the base image and merge with this write. */
+    int read_first_block, read_last_block;
+    if (acb->sector_num % s->block_size == 0) {
+        read_first_block = false;
+    } else if (fresh_bitmap_show_sector_in_base_img(acb->sector_num, s)) {
+        read_first_block = true;
+    } else {
+        read_first_block = false;
+    }
+
+    if (sector_end % s->block_size == 0) {
+        read_last_block = false;
+    } else if (fresh_bitmap_show_sector_in_base_img(sector_end, s)) {
+        read_last_block = true;
+    } else {
+        read_last_block = false;
+    }
+
+    if (read_first_block) {
+        if (read_last_block) {
+            /* Case 1: Read all the blocks involved from the base image. */
+            const QEMUIOVector *old_qiov = acb->write.qiov;
+            if (block_end > s->base_img_sectors) {
+                block_end = s->base_img_sectors;
+            }
+
+            /* Buffer layout: data | read qiov + 1 iovec |
+             * write qiov + (niov + 2) iovecs. */
+            int buf_size = (block_end - block_begin) * 512
+                + 2 * sizeof(QEMUIOVector)
+                + sizeof(struct iovec) * (old_qiov->niov + 3);
+            buf_size = ROUND_UP(buf_size, 512);
+            acb->write.cow_buf = my_qemu_blockalign(bs->backing_hd, buf_size);
+
+            /* For reading from the base image. */
+            QEMUIOVector *read_qiov = (QEMUIOVector *) (acb->write.cow_buf +
+                                            (block_end - block_begin) * 512);
+            read_qiov->iov = (struct iovec *)(read_qiov + 1);
+            read_qiov->nalloc = -1;
+            read_qiov->niov = 1;
+            read_qiov->iov[0].iov_base = acb->write.cow_buf;
+            read_qiov->iov[0].iov_len = read_qiov->size =
+                (block_end - block_begin) * 512;
+
+            /* For writing to the FVD data file. */
+            QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
+            write_qiov->iov = (struct iovec *)(write_qiov + 1);
+            write_qiov->nalloc = -1;
+            write_qiov->niov = old_qiov->niov + 2;
+            write_qiov->size = read_qiov->size;
+
+            /* The first entry is for data read from the base image. */
+            write_qiov->iov[0].iov_base = acb->write.cow_buf;
+            write_qiov->iov[0].iov_len = (acb->sector_num - block_begin) * 512;
+            memcpy(&write_qiov->iov[1], old_qiov->iov,
+                   sizeof(struct iovec) * old_qiov->niov);
+
+            /* The last entry is for data read from the base image. */
+            const int last = old_qiov->niov + 1;
+            write_qiov->iov[last].iov_base = acb->write.cow_buf
+                                        + (sector_end - block_begin) * 512;
+            write_qiov->iov[last].iov_len = (block_end - sector_end) * 512;
+            acb->write.cow_qiov = write_qiov;
+            acb->write.cow_start_sector = block_begin;
+
+            acb->write.hd_acb = bdrv_aio_readv(bs->backing_hd, block_begin,
+                                    read_qiov, block_end - block_begin,
+                                    read_backing_for_copy_on_write_cb, acb);
+            if (!acb->write.hd_acb) {
+                goto fail;
+            }
+
+            acb->copy_lock.begin = block_begin;
+            acb->copy_lock.end = block_end;
+            QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
+            QDEBUG("WRITE: acb%llu-%p  "
+                   "read_first_last_partial_blocks_from_backing  sector_num=%"
+                   PRId64 " nb_sectors=%d\n", acb->uuid, acb, block_begin,
+                   (int)(block_end - block_begin));
+        } else {
+            /* Case 2: Read the first block from the base image. Only the
+             * sectors in front of the request are read. */
+            int nb = acb->sector_num - block_begin;
+            const QEMUIOVector *old_qiov = acb->write.qiov;
+
+            /* Space for data and metadata. */
+            int buf_size = nb * 512 + 2 * sizeof(QEMUIOVector)
+                                + sizeof(struct iovec) * (old_qiov->niov + 2);
+            buf_size = ROUND_UP(buf_size, 512);
+            acb->write.cow_buf = my_qemu_blockalign(bs->backing_hd, buf_size);
+
+            /* For reading from the base image. */
+            QEMUIOVector *read_qiov =
+                (QEMUIOVector *) (acb->write.cow_buf + nb * 512);
+            read_qiov->iov = (struct iovec *)(read_qiov + 1);
+            read_qiov->nalloc = -1;
+            read_qiov->niov = 1;
+            read_qiov->iov[0].iov_base = acb->write.cow_buf;
+            read_qiov->iov[0].iov_len = read_qiov->size = nb * 512;
+
+            /* For writing to the FVD data file. */
+            QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
+            write_qiov->iov = (struct iovec *)(write_qiov + 1);
+            write_qiov->nalloc = -1;
+            write_qiov->niov = old_qiov->niov + 1;
+            write_qiov->size = old_qiov->size + read_qiov->size;
+
+            /* The first entry is added for data read from the base image. */
+            write_qiov->iov[0].iov_base = acb->write.cow_buf;
+            write_qiov->iov[0].iov_len = read_qiov->size;
+            memcpy(&write_qiov->iov[1], old_qiov->iov,
+                   sizeof(struct iovec) * old_qiov->niov);
+            acb->write.cow_qiov = write_qiov;
+            acb->write.cow_start_sector = block_begin;
+
+            acb->write.hd_acb = bdrv_aio_readv(bs->backing_hd,
+                                    block_begin, read_qiov, nb,
+                                    read_backing_for_copy_on_write_cb, acb);
+            if (!acb->write.hd_acb) {
+                goto fail;
+            }
+
+            acb->copy_lock.begin = block_begin;
+            acb->copy_lock.end = block_begin + s->block_size;
+            QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
+            QDEBUG("WRITE: acb%llu-%p  read_first_partial_block_from_backing  "
+                   "sector_num=%" PRId64 " nb_sectors=%d\n",
+                   acb->uuid, acb, block_begin, nb);
+        }
+    } else {
+        if (read_last_block) {
+            /* Case 3: Read the last block from the base image. Only the
+             * sectors behind the request are read, and never past the end
+             * of the base image. */
+            int nb;
+            if (block_end < s->base_img_sectors) {
+                nb = block_end - sector_end;
+            } else {
+                nb = s->base_img_sectors - sector_end;
+            }
+            const QEMUIOVector *old_qiov = acb->write.qiov;
+
+            /* Space for data and metadata. */
+            int buf_size = nb * 512 + 2 * sizeof(QEMUIOVector)
+                                + sizeof(struct iovec) * (old_qiov->niov + 2);
+            buf_size = ROUND_UP(buf_size, 512);
+            acb->write.cow_buf = my_qemu_blockalign(bs->backing_hd, buf_size);
+
+            /* For reading from the base image. */
+            QEMUIOVector *read_qiov = (QEMUIOVector *) (acb->write.cow_buf
+                                                        + nb * 512);
+            read_qiov->iov = (struct iovec *)(read_qiov + 1);
+            read_qiov->nalloc = -1;
+            read_qiov->niov = 1;
+            read_qiov->iov[0].iov_base = acb->write.cow_buf;
+            read_qiov->iov[0].iov_len = read_qiov->size = nb * 512;
+
+            /* For writing to the FVD data file. */
+            QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
+            write_qiov->iov = (struct iovec *)(write_qiov + 1);
+            write_qiov->nalloc = -1;
+            write_qiov->niov = old_qiov->niov + 1;
+            write_qiov->size = old_qiov->size + read_qiov->size;
+            memcpy(write_qiov->iov, old_qiov->iov,
+                   sizeof(struct iovec) * old_qiov->niov);
+
+            /* The last appended entry is for data read from the base image. */
+            write_qiov->iov[old_qiov->niov].iov_base = acb->write.cow_buf;
+            write_qiov->iov[old_qiov->niov].iov_len = read_qiov->size;
+            acb->write.cow_qiov = write_qiov;
+            acb->write.cow_start_sector = acb->sector_num;
+
+            acb->write.hd_acb = bdrv_aio_readv(bs->backing_hd,
+                                    sector_end, read_qiov, nb,
+                                    read_backing_for_copy_on_write_cb, acb);
+            if (!acb->write.hd_acb) {
+                goto fail;
+            }
+
+            acb->copy_lock.end = block_end;
+            acb->copy_lock.begin = block_end - s->block_size;
+            QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
+            QDEBUG("WRITE: acb%llu-%p  read_last_partial_block_from_backing  "
+                   "sector_num=%" PRId64 " nb_sectors=%d\n",
+                   acb->uuid, acb, sector_end, nb);
+        } else {
+            /* Case 4: Can write directly and no need to merge with data from
+             * the base image. */
+            QDEBUG("WRITE: acb%llu-%p  "
+                   "write_fvd_without_read_partial_block_from_backing\n",
+                   acb->uuid, acb);
+            acb->write.hd_acb = store_data(false, acb, bs, acb->sector_num,
+                                           acb->write.qiov, acb->nb_sectors,
+                                           write_data_cb, acb);
+            if (!acb->write.hd_acb) {
+                goto fail;
+            }
+        }
+    }
+
+    /* Reached only after I/O has been submitted successfully. */
+    QLIST_INSERT_HEAD(&s->write_locks, acb, write.next_write_lock);
+    return 0;
+
+fail:
+    /* Only the buffer is freed here; releasing the acb is left to the
+     * caller. NOTE(review): cow_buf is not reset to NULL after the free. */
+    if (acb->write.cow_buf) {
+        my_qemu_vfree(acb->write.cow_buf);
+    }
+    return -EIO;
+}
+
+/*
+ * Re-submit every write that was put on hold behind the copy lock of 'acb'.
+ *
+ * Marks 'acb' as no longer being on the copy-lock list (the caller has
+ * already unlinked it), then walks acb->copy_lock.dependent_writes and calls
+ * do_aio_write() on each waiting request. A request whose restart fails is
+ * completed with the error code returned by do_aio_write() and released.
+ */
+static void restart_dependent_writes(FvdAIOCB * acb)
+{
+    acb->copy_lock.next.le_prev = NULL;
+    FvdAIOCB *req = acb->copy_lock.dependent_writes.lh_first;
+
+    while (req) {
+        /* Keep a copy of 'next' as it may be changed in do_aio_write(). */
+        FvdAIOCB *next = req->write.next_dependent_write.le_next;
+
+        /* Indicate that this write is no longer on any dependent list. This
+         * helps fvd_aio_cancel_read() work properly. */
+        req->write.next_dependent_write.le_prev = NULL;
+
+        if (acb->type == OP_WRITE) {
+            QDEBUG("WRITE: acb%llu-%p  finished_and_restart_conflict_write "
+                   "acb%llu-%p\n", acb->uuid, acb, req->uuid, req);
+        } else {
+            QDEBUG("READ: copy_on_read acb%llu-%p  "
+                   "finished_and_restart_conflict_write acb%llu-%p\n",
+                   acb->uuid, acb, req->uuid, req);
+        }
+
+        /* Report the errno value do_aio_write() actually returned instead
+         * of a hard-coded -1. */
+        const int ret = do_aio_write(req);
+        if (ret < 0) {
+            QDEBUG("WRITE: acb%llu-%p  finished with error ret=%d\n",
+                   req->uuid, req, ret);
+            req->common.cb(req->common.opaque, ret);
+            my_qemu_aio_release(req);
+#ifdef FVD_DEBUG
+            /* The request was counted in fvd_aio_writev() when it was put
+             * on hold and never reaches free_write_resource(), so balance
+             * the debug counter here. */
+            pending_local_writes--;
+#endif
+        }
+
+        req = next;
+    }
 }
diff --git a/block/fvd.c b/block/fvd.c
index e41f419..5b3dcac 100644
--- a/block/fvd.c
+++ b/block/fvd.c
@@ -27,11 +27,13 @@ 
  * function optimization. */
 #include "block/fvd-debug.c"
 #include "block/fvd-flush.c"
+#include "block/fvd-bitmap.c"
 #include "block/fvd-misc.c"
 #include "block/fvd-create.c"
 #include "block/fvd-open.c"
-#include "block/fvd-read.c"
 #include "block/fvd-write.c"
+#include "block/fvd-read.c"
+#include "block/fvd-store.c"
 #include "block/fvd-journal.c"
 #include "block/fvd-prefetch.c"
 #include "block/fvd-update.c"
diff --git a/block/fvd.h b/block/fvd.h
index 9847e7f..34ea2b4 100644
--- a/block/fvd.h
+++ b/block/fvd.h
@@ -432,6 +432,7 @@  typedef struct FvdAIOCB {
 #endif
 } FvdAIOCB;
 
+/* AIOCB pool that fvd_aio_writev() passes to my_qemu_aio_get() to obtain an
+ * FvdAIOCB for a write request. */
+static AIOPool fvd_aio_pool;
 static BlockDriver bdrv_fvd;
 static QEMUOptionParameter fvd_create_options[];
 static QEMUOptionParameter fvd_update_options[];