Patchwork [18/26] FVD: add support for base image prefetching

login
register
mail settings
Submitter Chunqiang Tang
Date Feb. 25, 2011, 10:37 p.m.
Message ID <1298673486-3573-18-git-send-email-ctang@us.ibm.com>
Download mbox | patch
Permalink /patch/84606/
State New
Headers show

Comments

Chunqiang Tang - Feb. 25, 2011, 10:37 p.m.
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.

This patch adds adaptive prefetching of the base image to FVD.  FVD supports
both copy-on-write and copy-on-read of the base image. Adaptive prefetching is
similar to copy-on-read, except that it is initiated by the FVD driver rather
than triggered by the VM's read requests. FVD's prefetching is conservative:
if it detects resource contention, it backs off and temporarily pauses
prefetching.

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 block/fvd-prefetch.c |  600 +++++++++++++++++++++++++++++++++++++++++++++++++-
 block/fvd-read.c     |    1 +
 qemu-io-sim.c        |   13 +
 3 files changed, 613 insertions(+), 1 deletions(-)

Patch

diff --git a/block/fvd-prefetch.c b/block/fvd-prefetch.c
index 5844aa7..b8be98c 100644
--- a/block/fvd-prefetch.c
+++ b/block/fvd-prefetch.c
@@ -11,7 +11,605 @@ 
  *
  */
 
+static void prefetch_read_cb(void *opaque, int ret);
+static void resume_prefetch(BlockDriverState * bs);
+static void do_next_prefetch_read(BlockDriverState * bs, int64_t current_time);
+
 void fvd_init_prefetch(void *opaque)
 {
-    /* To be implemented. */
+    BlockDriverState *bs = opaque;
+    BDRVFvdState *s = bs->opaque;
+    FvdAIOCB *acb;
+    int i;
+
+    QDEBUG("Start prefetching\n");
+
+    if (!s->data_region_prepared) {
+        init_data_region(s);
+    }
+
+    s->prefetch_acb = my_qemu_malloc(sizeof(FvdAIOCB *)*s->num_prefetch_slots);
+
+    for (i = 0; i < s->num_prefetch_slots; i++) {
+        acb = my_qemu_aio_get(&fvd_aio_pool, bs, prefetch_null_cb, NULL);
+        s->prefetch_acb[i] = acb;
+        if (!acb) {
+            int j;
+            for (j = 0; j < i; j++) {
+                my_qemu_aio_release(s->prefetch_acb[j]);
+                s->prefetch_acb[j] = NULL;
+            }
+
+            my_qemu_free(s->prefetch_acb);
+            s->prefetch_acb = NULL;
+            fprintf(stderr, "No acb and cannot start prefetching.\n");
+            return;
+        }
+
+        acb->type = OP_COPY;
+        acb->cancel_in_progress = false;
+    }
+
+    s->prefetch_state = PREFETCH_STATE_RUNNING;
+
+    for (i = 0; i < s->num_prefetch_slots; i++) {
+        acb = s->prefetch_acb[i];
+        acb->copy.buffered_sector_begin = acb->copy.buffered_sector_end = 0;
+        QLIST_INIT(&acb->copy_lock.dependent_writes);
+        acb->copy_lock.next.le_prev = NULL;
+        acb->copy.hd_acb = NULL;
+        acb->sector_num = 0;
+        acb->nb_sectors = 0;
+        acb->copy.iov.iov_len = s->sectors_per_prefetch * 512;
+        acb->copy.buf = acb->copy.iov.iov_base =
+            my_qemu_blockalign(bs->backing_hd, acb->copy.iov.iov_len);
+        qemu_iovec_init_external(&acb->copy.qiov, &acb->copy.iov, 1);
+    }
+
+    if (s->prefetch_timer) {
+        qemu_free_timer(s->prefetch_timer);
+        s->prefetch_timer =
+            qemu_new_timer(rt_clock, (QEMUTimerCB *) resume_prefetch, bs);
+    }
+
+    s->pause_prefetch_requested = false;
+    s->unclaimed_prefetch_region_start = 0;
+    s->prefetch_read_throughput = -1;   /* Indicate not initialized. */
+    s->prefetch_write_throughput = -1;  /* Indicate not initialized. */
+    s->prefetch_read_time = 0;
+    s->prefetch_write_time = 0;
+    s->prefetch_data_read = 0;
+    s->prefetch_data_written = 0;
+    s->next_prefetch_read_slot = 0;
+    s->num_filled_prefetch_slots = 0;
+    s->prefetch_read_active = false;
+
+    do_next_prefetch_read(bs, qemu_get_clock(rt_clock));
+}
+
+/*
+ * Arm s->prefetch_timer to fire after a random delay of at least 1 ms; the
+ * random part is scaled by s->prefetch_throttle_time. The caller is
+ * responsible for having stopped all prefetch reads and writes.
+ */
+static void pause_prefetch(BDRVFvdState * s)
+{
+    int64_t delay_ms;
+
+    delay_ms = 1 + (int64_t) ((rand() / ((double)RAND_MAX))
+                              * s->prefetch_throttle_time);
+    QDEBUG("Pause prefetch for %" PRId64 " milliseconds\n", delay_ms);
+
+    /* Expiry of the timer leads to resume_prefetch(). */
+    qemu_mod_timer(s->prefetch_timer, qemu_get_clock(rt_clock) + delay_ms);
+}
+
+/*
+ * Return true when no block of the base image is still reported as residing
+ * in the base image, i.e. every relevant bit of s->fresh_bitmap is 1.
+ *
+ * The bitmap is first scanned 64 bits at a time; the blocks not covered by a
+ * whole 64-bit word are then checked one by one.
+ */
+static bool all_data_prefetched(BDRVFvdState *s)
+{
+    const uint64_t num_words =
+        s->base_img_sectors / s->block_size / sizeof(uint64_t) / 8;
+    const uint64_t *words = (uint64_t*)s->fresh_bitmap;
+    uint64_t idx;
+    uint64_t sector;
+
+    /* Fast path: a word with any 0 bit means some block is not yet copied. */
+    for (idx = 0; idx < num_words; idx++) {
+        if (words[idx] != UINT64_C(0xFFFFFFFFFFFFFFFF)) {
+            return false;
+        }
+    }
+
+    /* Tail: walk the remaining blocks, one block_size step at a time. */
+    for (sector = num_words * sizeof(uint64_t) * 8 * s->block_size;
+         sector < s->base_img_sectors; sector += s->block_size) {
+        if (fresh_bitmap_show_sector_in_base_img(sector, s)) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/*
+ * Stop prefetching and release its resources (slot buffers, control blocks
+ * and the timer), then record the resulting state and flush the metadata.
+ *
+ * final_state is the requested end state. PREFETCH_STATE_FINISHED is only
+ * honoured when all_data_prefetched() confirms it; otherwise the state
+ * becomes PREFETCH_STATE_DISABLED. Must only be called when no read is
+ * active and no slot is filled.
+ */
+static void terminate_prefetch(BlockDriverState * bs, int final_state)
+{
+    BDRVFvdState *s = bs->opaque;
+    bool all_prefetched;
+    int slot;
+
+    ASSERT(!s->prefetch_read_active && s->num_filled_prefetch_slots == 0);
+
+    if (s->prefetch_acb) {
+        for (slot = 0; slot < s->num_prefetch_slots; slot++) {
+            FvdAIOCB *cb = s->prefetch_acb[slot];
+
+            my_qemu_vfree(cb->copy.buf);
+            my_qemu_aio_release(cb);
+            s->prefetch_acb[slot] = NULL;
+        }
+    }
+    my_qemu_free(s->prefetch_acb);
+    s->prefetch_acb = NULL;
+
+    if (s->prefetch_timer) {
+        qemu_del_timer(s->prefetch_timer);
+        qemu_free_timer(s->prefetch_timer);
+        s->prefetch_timer = NULL;
+    }
+
+    if (final_state != PREFETCH_STATE_FINISHED) {
+        s->prefetch_state = final_state;
+    } else if (all_data_prefetched(s)) {
+        /* Everything has been copied, so copy-on-read is no longer needed. */
+        s->prefetch_state = PREFETCH_STATE_FINISHED;
+        s->copy_on_read = false;
+    } else {
+        s->prefetch_state = PREFETCH_STATE_DISABLED;
+    }
+
+    all_prefetched = (s->prefetch_state == PREFETCH_STATE_FINISHED);
+    flush_metadata_to_disk(bs, false/*journal*/, all_prefetched);
+    if (all_prefetched) {
+        QDEBUG("FVD prefetching finished successfully.\n");
+    } else {
+        QDEBUG("FVD prefetching disabled.\n");
+    }
+}
+/*
+ * Start an asynchronous read of the next region of the base image into the
+ * slot s->next_prefetch_read_slot.
+ *
+ * The search starts at s->unclaimed_prefetch_region_start and advances in
+ * windows of at most s->sectors_per_prefetch sectors until
+ * find_region_in_base_img() reports a region (presumably narrowing
+ * [begin, end) to data still held only by the base image -- confirm against
+ * its definition). If the end of the base image is reached instead, no read
+ * is issued, and prefetching is terminated when no filled slot is pending.
+ *
+ * current_time is stored as the start time of the read; prefetch_read_cb()
+ * is the completion callback. If the read cannot be started, the state is
+ * set to PREFETCH_STATE_DISABLED and prefetching is terminated right away
+ * when no filled slot is pending.
+ *
+ * Must not be called while a read is active, while all slots are filled, or
+ * while a pause is requested (see the ASSERT).
+ */
+static void do_next_prefetch_read(BlockDriverState * bs, int64_t current_time)
+{
+    FvdAIOCB *acb;
+    BDRVFvdState *s = bs->opaque;
+    int64_t begin, end;
+
+    ASSERT(!s->prefetch_read_active
+           && s->num_filled_prefetch_slots < s->num_prefetch_slots
+           && !s->pause_prefetch_requested);
+
+    /* Find the next region to prefetch. */
+    begin = s->unclaimed_prefetch_region_start;
+    while (1) {
+        /* Past the end of the base image there is nothing left to read.
+         * With no filled slot pending, terminate; terminate_prefetch()
+         * checks the bitmap to decide whether prefetching truly finished. */
+
+        if (begin >= s->base_img_sectors) {
+            s->unclaimed_prefetch_region_start = s->base_img_sectors;
+            if (s->num_filled_prefetch_slots == 0) {
+                terminate_prefetch(bs, PREFETCH_STATE_FINISHED);
+            }
+            return;
+        }
+        end = begin + s->sectors_per_prefetch;
+        if (end > s->base_img_sectors) {
+            end = s->base_img_sectors;
+        }
+        if (find_region_in_base_img(s, &begin, &end)) {
+            break;
+        }
+        begin = end;
+    }
+
+    ASSERT(begin % s->block_size == 0 && (end % s->block_size == 0
+           || end == s->base_img_sectors));
+
+    /* Describe the read in the slot's control block, then submit it. */
+    acb = s->prefetch_acb[s->next_prefetch_read_slot];
+    acb->copy.buffered_sector_begin = acb->sector_num = begin;
+    acb->copy.buffered_sector_end = s->unclaimed_prefetch_region_start = end;
+    acb->nb_sectors = end - begin;
+    acb->copy.qiov.size = acb->copy.iov.iov_len = acb->nb_sectors * 512;
+    acb->copy.iov.iov_base = acb->copy.buf;
+    acb->copy.last_prefetch_op_start_time = current_time;
+    acb->copy.hd_acb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
+                                      &acb->copy.qiov, acb->nb_sectors,
+                                      prefetch_read_cb, acb);
+
+    /* A NULL handle means the read could not be started. */
+    if (acb->copy.hd_acb == NULL) {
+        QDEBUG("PREFETCH: error when starting read for sector_num=%" PRId64
+               " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+        s->prefetch_state = PREFETCH_STATE_DISABLED;
+        if (s->num_filled_prefetch_slots == 0) {
+            terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+        }
+    } else {
+        s->prefetch_read_active = true;
+        QDEBUG("PREFETCH: start read for sector_num=%" PRId64
+               " nb_sectors=%d total_prefetched_bytes=%" PRId64 "\n",
+               acb->sector_num, acb->nb_sectors, s->total_prefetch_data);
+#ifdef FVD_DEBUG
+        s->total_prefetch_data += acb->copy.iov.iov_len;
+#endif
+    }
+}
+/*
+ * Completion callback for a write of prefetched data started with
+ * store_data().
+ *
+ * opaque is the slot's FvdAIOCB; ret is the write result (0 on success).
+ *
+ * On success the written sectors are recorded with update_fresh_bitmap().
+ * In either case the slot's entry is removed from the copy-lock list and
+ * restart_dependent_writes() is called. After a successful write the write
+ * throughput statistics are updated, which may request a pause. The callback
+ * then writes any remaining data of the same slot, or moves on to the next
+ * filled slot, and finally restarts the reader if it is idle and no pause
+ * was requested.
+ *
+ * On failure (ret != 0, or a follow-up write cannot be started) the state is
+ * set to PREFETCH_STATE_DISABLED, the filled-slot count is reset to 0, and
+ * prefetching is terminated unless a read is still active.
+ *
+ * NOTE(review): the follow-up write within the same slot does not refresh
+ * last_prefetch_op_start_time, unlike the next-slot path further down, so
+ * its measured write_time also covers the preceding write -- confirm that
+ * this is intended.
+ */
+static void prefetch_write_cb(void *opaque, int ret)
+{
+    FvdAIOCB *acb = (FvdAIOCB *) opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+    int64_t begin, end;
+    const int64_t current_time = qemu_get_clock(rt_clock);
+
+    /* Do nothing if this control block is flagged cancel_in_progress. */
+    if (acb->cancel_in_progress) {
+        return;
+    }
+
+    ASSERT(acb->nb_sectors > 0 && s->num_filled_prefetch_slots > 0);
+
+    if (ret == 0) {
+        /* No need to update the on-disk bitmap or the stale bitmap.
+         * See Section 3.3.4 of the FVD-cow paper. */
+        update_fresh_bitmap(acb->sector_num, acb->nb_sectors, s);
+    }
+
+    /* Take this write off the copy-lock list; done on success and failure. */
+    QLIST_REMOVE(acb, copy_lock.next);
+    restart_dependent_writes(acb);
+    acb->copy.hd_acb = NULL;
+    QLIST_INIT(&acb->copy_lock.dependent_writes);
+
+    /* A failed write drops all filled slots and disables prefetching. */
+    if (ret != 0) {
+        QDEBUG("PREFETCH: finished write with error for sector_num=%" PRId64
+               " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+        s->num_filled_prefetch_slots = 0;
+        s->prefetch_state = PREFETCH_STATE_DISABLED;
+        if (!s->prefetch_read_active) {
+            terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+        }
+        return;
+    }
+
+    /* Account for the time and the bytes of this write. */
+    const int64_t write_time =
+        current_time - acb->copy.last_prefetch_op_start_time;
+    s->prefetch_write_time += write_time;
+    s->prefetch_data_written += acb->nb_sectors * 512;
+
+    QDEBUG("PREFETCH: write_finished  sector_num=%" PRId64
+           " nb_sectors=%d  write_time=%"PRId64" (ms)\n", acb->sector_num,
+           acb->nb_sectors, write_time);
+
+    /* Calculate throughput and determine if it needs to pause prefetching due
+     * to low throughput. The check only runs when a timer exists, throttling
+     * is enabled, no pause is pending, and enough write time has accumulated
+     * for a measurement. */
+    if (s->prefetch_timer && s->prefetch_throttle_time > 0
+        && !s->pause_prefetch_requested
+        && s->prefetch_write_time > s->prefetch_write_throughput_measure_time) {
+        const double this_round_throughput =
+            s->prefetch_data_written / (double)s->prefetch_write_time;
+        if (s->prefetch_write_throughput < 0) {
+            /* Previously not initialized. */
+            s->prefetch_write_throughput = this_round_throughput;
+        } else {
+            s->prefetch_write_throughput =
+                PREFETCH_PERF_CALC_ALPHA * s->prefetch_write_throughput +
+                (1 - PREFETCH_PERF_CALC_ALPHA) * this_round_throughput;
+        }
+        if (s->prefetch_write_throughput < s->prefetch_min_write_throughput) {
+            QDEBUG("PREFETCH: slow_write  this_write=%"PRId64" (ms)  "
+                   "this_write_throughput=%.3lf (MB/s)   "
+                   "avg_write_throughput=%.3lf (MB/s)\n",
+                   write_time, this_round_throughput / 1048576 * 1000,
+                   s->prefetch_write_throughput / 1048576 * 1000);
+
+            /* Make a randomized decision to pause prefetching. This avoids
+             * pausing all contending FVD drivers. See Section 3.4.2 of the
+             * FVD-cow paper. */
+            if (rand() > (RAND_MAX / 2)) {
+                QDEBUG("PREFETCH: pause requested.\n");
+                s->pause_prefetch_requested = true;
+            } else {
+                QDEBUG("PREFETCH: continue due to 50%% probability, despite "
+                       "slow write.\n");
+                s->prefetch_write_throughput = -1; /*Indicate not initialized*/
+            }
+        } else {
+            QDEBUG("PREFETCH: this_write_throughput=%.3lf (MB/s)   "
+                   "avg_write_throughput=%.3lf (MB/s)\n",
+                   this_round_throughput / 1048576 * 1000,
+                   s->prefetch_write_throughput / 1048576 * 1000);
+        }
+
+        /* Preparing for measuring the next round of throughput. */
+        s->prefetch_data_written = 0;
+        s->prefetch_write_time = 0;
+    }
+
+    /* Find in this prefetch slot the next section of prefetched but
+     * not-yet-written data. */
+    begin = acb->sector_num + acb->nb_sectors;
+    if (begin < acb->copy.buffered_sector_end) {
+        end = acb->copy.buffered_sector_end;
+        if (find_region_in_base_img(s, &begin, &end)) {
+            acb->sector_num = begin;
+            acb->nb_sectors = end - begin;
+            acb->copy.iov.iov_base = acb->copy.buf +
+                (begin - acb->copy.buffered_sector_begin) * 512;
+            acb->copy.qiov.size = acb->copy.iov.iov_len = acb->nb_sectors * 512;
+            QDEBUG("PREFETCH: write_data  sector_num=%" PRId64
+                   " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+            acb->copy.hd_acb = store_data(true, acb, bs, acb->sector_num,
+                                          &acb->copy.qiov, acb->nb_sectors,
+                                          prefetch_write_cb, acb);
+            if (acb->copy.hd_acb == NULL) {
+                QDEBUG("PREFETCH: error in starting bdrv_aio_writev().\n");
+                s->num_filled_prefetch_slots = 0;
+                s->prefetch_state = PREFETCH_STATE_DISABLED;
+                if (!s->prefetch_read_active) {
+                    terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+                }
+            } else {
+                acb->copy_lock.begin = begin;
+                acb->copy_lock.end = end;
+                QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
+            }
+
+            return;
+        }
+    }
+
+    /* Nothing left to write from this slot; it no longer counts as filled. */
+    s->num_filled_prefetch_slots--;
+
+    /* After an earlier error, tear down once all activity has drained. */
+    if (s->prefetch_state == PREFETCH_STATE_DISABLED) {
+        if (s->num_filled_prefetch_slots == 0 && !s->prefetch_read_active) {
+            terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+        }
+        return;
+    }
+
+    if (begin >= s->base_img_sectors) {
+        /* Prefetching finished. */
+        ASSERT(s->num_filled_prefetch_slots == 0 && !s->prefetch_read_active);
+        terminate_prefetch(bs, PREFETCH_STATE_FINISHED);
+        return;
+    }
+
+    /* A requested pause starts once no slot is filled and no read runs. */
+    if (s->pause_prefetch_requested) {
+        if (s->num_filled_prefetch_slots == 0) {
+            if (!s->prefetch_read_active) {
+                pause_prefetch(s);
+            } else {
+                QDEBUG("PREFETCH: wait for the read operation to finish in "
+                       "order to pause prefetch.\n");
+            }
+            return;
+        }
+    }
+
+    /* Write out data in the next prefetched slot. The oldest filled slot is
+     * num_filled_prefetch_slots positions behind next_prefetch_read_slot,
+     * wrapping around the slot array. */
+    while (s->num_filled_prefetch_slots > 0) {
+        int k = s->next_prefetch_read_slot - s->num_filled_prefetch_slots;
+        if (k < 0) {
+            k += s->num_prefetch_slots;
+        }
+        acb = s->prefetch_acb[k];
+
+        int64_t begin = acb->copy.buffered_sector_begin;
+        int64_t end = acb->copy.buffered_sector_end;
+        if (find_region_in_base_img(s, &begin, &end)) {
+            acb->copy.last_prefetch_op_start_time = current_time;
+            acb->sector_num = begin;
+            acb->nb_sectors = end - begin;
+            acb->copy.iov.iov_base =
+                acb->copy.buf + (begin - acb->copy.buffered_sector_begin) * 512;
+            acb->copy.qiov.size = acb->copy.iov.iov_len = acb->nb_sectors * 512;
+            QDEBUG("PREFETCH: writes data: sector_num=%" PRId64
+                   " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+            acb->copy.hd_acb = store_data(true, acb, bs, acb->sector_num,
+                                          &acb->copy.qiov, acb->nb_sectors,
+                                          prefetch_write_cb, acb);
+
+            if (acb->copy.hd_acb == NULL) {
+                QDEBUG("PREFETCH: error cannot get a control block to write "
+                       "a prefetched block.\n");
+                s->prefetch_state = PREFETCH_STATE_DISABLED;
+                s->num_filled_prefetch_slots = 0;
+                if (!s->prefetch_read_active) {
+                    terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+                }
+                return;
+            }
+
+            acb->copy_lock.begin = begin;
+            acb->copy_lock.end = end;
+            QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
+            break;
+        } else {
+            QDEBUG("PREFETCH: discard prefetched data as they have been "
+                   "covered: sector_num=%" PRId64 " nb_sectors=%d\n",
+                   acb->sector_num, acb->nb_sectors);
+            s->num_filled_prefetch_slots--;
+        }
+    }
+
+    /* If the reader was stopped due to lack of slots, start the reader. */
+    if (!s->prefetch_read_active && !s->pause_prefetch_requested) {
+        do_next_prefetch_read(bs, current_time);
+    }
+}
+/*
+ * Completion callback for a base-image read issued by
+ * do_next_prefetch_read().
+ *
+ * opaque is the slot's FvdAIOCB; ret is the read result (0 on success).
+ *
+ * After a successful read the read throughput statistics are updated, which
+ * may request a pause. The filled slot is then either queued behind the
+ * write that is already in flight or, when no slot is filled, written out
+ * immediately with store_data(). Finally the next read is issued unless all
+ * slots are filled or a pause was requested.
+ *
+ * A read error, or a failure to start the write, sets the state to
+ * PREFETCH_STATE_DISABLED; prefetching is terminated right away when no
+ * filled slot remains.
+ */
+static void prefetch_read_cb(void *opaque, int ret)
+{
+    FvdAIOCB *acb = (FvdAIOCB *) opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    /* Do nothing if this control block is flagged cancel_in_progress. */
+    if (acb->cancel_in_progress) {
+        return;
+    }
+
+    ASSERT(s->prefetch_read_active && s->num_filled_prefetch_slots >= 0
+           && s->num_filled_prefetch_slots < s->num_prefetch_slots);
+
+    s->prefetch_read_active = false;
+    acb->copy.hd_acb = NULL;
+
+    /* Prefetching was disabled while this read was in flight. */
+    if (s->prefetch_state == PREFETCH_STATE_DISABLED) {
+        if (s->num_filled_prefetch_slots == 0) {
+            terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+        }
+        return;
+    }
+
+    if (ret != 0) {
+        QDEBUG("PREFETCH: read_error  sector_num=%" PRId64 " nb_sectors=%d.\n",
+               acb->sector_num, acb->nb_sectors);
+        s->prefetch_state = PREFETCH_STATE_DISABLED;
+        if (s->num_filled_prefetch_slots == 0) {
+            terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+        }
+        return;
+    }
+
+    /* Account for the time and the bytes of this read. */
+    const int64_t current_time = qemu_get_clock(rt_clock);
+    const int64_t read_time = current_time -
+        acb->copy.last_prefetch_op_start_time;
+    s->prefetch_read_time += read_time;
+    s->prefetch_data_read += acb->nb_sectors * 512;
+
+    QDEBUG("PREFETCH: read_finished  sector_num=%" PRId64
+           " nb_sectors=%d  read_time=%"PRId64" (ms)\n", acb->sector_num,
+           acb->nb_sectors, read_time);
+
+    /* Calculate throughput and determine if it needs to pause prefetching due
+     * to low throughput. The check only runs when a timer exists, throttling
+     * is enabled, no pause is pending, and enough read time has accumulated
+     * for a measurement. */
+    if (s->prefetch_timer && s->prefetch_throttle_time > 0
+        && !s->pause_prefetch_requested
+        && s->prefetch_read_time > s->prefetch_read_throughput_measure_time) {
+        const double this_round_throughput =
+            s->prefetch_data_read / (double)s->prefetch_read_time;
+        if (s->prefetch_read_throughput < 0) {
+            /* Previously not initialized. */
+            s->prefetch_read_throughput = this_round_throughput;
+        } else {
+            s->prefetch_read_throughput = PREFETCH_PERF_CALC_ALPHA *
+                s->prefetch_read_throughput +
+                (1 - PREFETCH_PERF_CALC_ALPHA) * this_round_throughput;
+        }
+        if (s->prefetch_read_throughput < s->prefetch_min_read_throughput) {
+            QDEBUG("PREFETCH: slow_read read_time=%"PRId64" (ms)   "
+                   "this_read_throughput=%.3lf (MB/s) "
+                   "avg_read_throughput=%.3lf (MB/s)\n",
+                   read_time, this_round_throughput / 1048576 * 1000,
+                   s->prefetch_read_throughput / 1048576 * 1000);
+
+            /* Make a randomized decision to pause prefetching. This avoids
+             * pausing all contending FVD drivers. See Section 3.4.2 of the
+             * FVD-cow paper. */
+            if (rand() > (RAND_MAX / 2)) {
+                QDEBUG("PREFETCH: pause requested.\n");
+                s->pause_prefetch_requested = true;
+            } else {
+                QDEBUG("PREFETCH: continue due to 50%% probability, "
+                       "despite slow read.\n");
+                s->prefetch_read_throughput = -1;  /*Indicate not initialized*/
+            }
+        } else {
+            QDEBUG("PREFETCH: this_read_throughput=%.3lf (MB/s)    "
+                   "avg_read_throughput=%.3lf (MB/s)\n",
+                   this_round_throughput / 1048576 * 1000,
+                   s->prefetch_read_throughput / 1048576 * 1000);
+        }
+
+        /* Preparing for measuring the next round of throughput. */
+        s->prefetch_data_read = 0;
+        s->prefetch_read_time = 0;
+    }
+
+    /* Hand the freshly read slot over to the writer. */
+    if (s->num_filled_prefetch_slots > 0) {
+        /* There is one ongoing write for prefetched data. This slot will be
+         * written out later. */
+        s->num_filled_prefetch_slots++;
+        s->next_prefetch_read_slot++;
+        if (s->next_prefetch_read_slot >= s->num_prefetch_slots) {
+            s->next_prefetch_read_slot = 0;
+        }
+    } else {
+        /* The writer is not active. Start the writer. */
+        int64_t begin = acb->copy.buffered_sector_begin;
+        int64_t end = acb->copy.buffered_sector_end;
+        if (find_region_in_base_img(s, &begin, &end)) {
+            acb->copy.last_prefetch_op_start_time = current_time;
+            acb->sector_num = begin;
+            acb->nb_sectors = end - begin;
+            acb->copy.iov.iov_base =
+                acb->copy.buf + (begin - acb->copy.buffered_sector_begin) * 512;
+            acb->copy.qiov.size = acb->copy.iov.iov_len = acb->nb_sectors * 512;
+            QDEBUG("PREFETCH: writes_data sector_num=%" PRId64
+                   " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+            acb->copy.hd_acb = store_data(true, acb, bs, acb->sector_num,
+                                          &acb->copy.qiov, acb->nb_sectors,
+                                          prefetch_write_cb, acb);
+
+            if (acb->copy.hd_acb == NULL) {
+                QDEBUG("PREFETCH: error cannot get control block to write a "
+                       "prefetched block.\n");
+                s->prefetch_state = PREFETCH_STATE_DISABLED;
+                if (s->num_filled_prefetch_slots == 0) {
+                    terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+                }
+                return;
+            }
+
+            acb->copy_lock.begin = begin;
+            acb->copy_lock.end = end;
+            QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
+            s->num_filled_prefetch_slots++;
+            s->next_prefetch_read_slot++;
+            if (s->next_prefetch_read_slot >= s->num_prefetch_slots) {
+                s->next_prefetch_read_slot = 0;
+            }
+        } else {
+            /* The current prefetch slot will be reused to prefetch the next
+             * bunch of data. */
+            QDEBUG("PREFETCH: discard prefetched data as they have been "
+                   "covered: sector_num=%" PRId64 " nb_sectors=%d\n",
+                   acb->sector_num, acb->nb_sectors);
+        }
+    }
+
+    /* Decide whether to read ahead, wait for a free slot, or pause. */
+    if (s->num_filled_prefetch_slots >= s->num_prefetch_slots) {
+        QDEBUG("PREFETCH: halt read because no slot is available.\n");
+    } else {
+        if (s->pause_prefetch_requested) {
+            if (s->num_filled_prefetch_slots == 0) {
+                pause_prefetch(s);
+            }
+        } else {
+            do_next_prefetch_read(bs, current_time);
+        }
+    }
+}
+
+/*
+ * Timer callback that restarts prefetching after a pause. It does nothing
+ * unless the state is still PREFETCH_STATE_RUNNING; otherwise it clears the
+ * pause request, discards the throughput measurements and issues the next
+ * base-image read.
+ */
+static void resume_prefetch(BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    if (s->prefetch_state == PREFETCH_STATE_RUNNING) {
+        ASSERT(s->num_filled_prefetch_slots == 0 && !s->prefetch_read_active);
+        QDEBUG("PREFETCH: resume.\n");
+
+        /* Start measuring from scratch after the pause. */
+        s->prefetch_data_read = 0;
+        s->prefetch_data_written = 0;
+        s->prefetch_read_time = 0;
+        s->prefetch_write_time = 0;
+        s->prefetch_read_throughput = -1;   /* Indicate not initialized. */
+        s->prefetch_write_throughput = -1;  /* Indicate not initialized. */
+        s->pause_prefetch_requested = false;
+
+        do_next_prefetch_read(bs, qemu_get_clock(rt_clock));
+    }
+}
+
+static void prefetch_null_cb(void *opaque, int ret)
+{
+    /* Placeholder completion callback for the prefetch control blocks
+     * obtained in fvd_init_prefetch(). Nothing to do: it is never expected
+     * to be invoked (hence the ASSERT) and is only needed to distinguish
+     * copy-on-read from prefetch. */
+    ASSERT(false);
 }
diff --git a/block/fvd-read.c b/block/fvd-read.c
index cd041e5..675af9e 100644
--- a/block/fvd-read.c
+++ b/block/fvd-read.c
@@ -11,6 +11,7 @@ 
  *
  */
 
+static void prefetch_null_cb(void *opaque, int ret);
 static void read_backing_for_copy_on_read_cb(void *opaque, int ret);
 static void read_fvd_cb(void *opaque, int ret);
 static inline void calc_read_region(BDRVFvdState * s, int64_t sector_num,
diff --git a/qemu-io-sim.c b/qemu-io-sim.c
index 923c1b8..d420fdb 100644
--- a/qemu-io-sim.c
+++ b/qemu-io-sim.c
@@ -77,6 +77,17 @@  wrote 1024/1024 bytes at offset 65536
 *=============================================================================*/
 
 #include "block/blksim.h"
+#include "block/fvd-ext.h"
+
+/*
+ * Handler for the "prefetch" sub-command dispatched by sim_f(): start FVD
+ * base-image prefetching on the currently open image (the global bs).
+ *
+ * Prints a message and does nothing when no image/driver is available or
+ * when the image's format name does not start with "fvd".
+ */
+static void sim_start_prefetch(void)
+{
+    /* strncmp() returns 0 on a match, so a non-zero result means that the
+     * image is not an FVD image and therefore cannot be prefetched. */
+    if (!bs || !bs->drv || !bs->drv->format_name
+        || strncmp(bs->drv->format_name, "fvd", 3) != 0) {
+        printf("This image does not support prefetching.\n");
+        return;
+    }
+    fvd_init_prefetch(bs);
+    printf("Prefetching started\n");
+}
 
 static void sim_help(void)
 {
@@ -101,6 +112,8 @@  static int sim_f(int argc, char **argv)
 
     if (strcmp(argv[1], "list") == 0) {
         blksim_list_tasks();
+    } else if (strcmp(argv[1], "prefetch") == 0) {
+        sim_start_prefetch();
     } else if (strcmp(argv[1], "all") == 0) {
         blksim_set_disk_io_return_code(ret);
         int n = blksim_run_all_tasks();