Patchwork [2/5] Fast Virtual Disk (FVD) Proposal Part 2

login
register
mail settings
Submitter Chunqiang Tang
Date Jan. 19, 2011, 10:04 p.m.
Message ID <1295474688-6219-2-git-send-email-ctang@us.ibm.com>
Download mbox | patch
Permalink /patch/79610/
State New
Headers show

Comments

Chunqiang Tang - Jan. 19, 2011, 10:04 p.m.
Part 2 of the block device driver for the proposed FVD image format.
Multiple patches are used in order to manage the size of each patch.
This patch includes the new testing tools developed together with FVD.

See the related discussions at
http://lists.gnu.org/archive/html/qemu-devel/2011-01/msg00426.html .

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 block/blksim.c   |  688 ++++++++++++++++++++++++++++++++++++++++++++++
 block/blksim.h   |   30 ++
 qemu-io-sim.c    |  107 ++++++++
 qemu-test.c      |  794 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 qemu-tool-time.c |   88 ++++++
 test-fvd.sh      |  120 ++++++++
 test-qcow2.sh    |   75 +++++
 7 files changed, 1902 insertions(+), 0 deletions(-)
 create mode 100644 block/blksim.c
 create mode 100644 block/blksim.h
 create mode 100644 qemu-io-sim.c
 create mode 100644 qemu-test.c
 create mode 100644 qemu-tool-time.c
 create mode 100755 test-fvd.sh
 create mode 100755 test-qcow2.sh

Patch

diff --git a/block/blksim.c b/block/blksim.c
new file mode 100644
index 0000000..35d918f
--- /dev/null
+++ b/block/blksim.c
@@ -0,0 +1,688 @@ 
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this module implements a simulated block device
+ *  driver "blksim". It works with qemu-io and qemu-test to perform testing,
+ *  allowing the order of disk I/O and callback activities to be changed to
+ *  test rare race conditions. See qemu-test.c, qemu-io.c, and qemu-io-sim.c.
+ *============================================================================*/
+
+#include <sys/vfs.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <execinfo.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include "block_int.h"
+#include "osdep.h"
+#include "qemu-option.h"
+#include "qemu-timer.h"
+#include "block.h"
+#include "qemu-queue.h"
+#include "qemu-common.h"
+#include "block/blksim.h"
+#include "block/fvd-ext.h"
+
+/* Kind of work item kept in the simulator's task list. */
+typedef enum {
+    /* Used only by the list sentinel 'head'. */
+    SIM_NULL,
+    /* Queued disk read, created by sim_aio_readv(). */
+    SIM_READ,
+    /* Queued disk write, created by sim_aio_writev(). */
+    SIM_WRITE,
+    /* Queued flush, created by sim_aio_flush(). */
+    SIM_FLUSH,
+    /* The three *_CALLBACK values are set by insert_aio_callback() once the
+     * corresponding operation has been executed and only the completion
+     * callback remains to be delivered. */
+    SIM_READ_CALLBACK,
+    SIM_WRITE_CALLBACK,
+    SIM_FLUSH_CALLBACK,
+    /* Timer object created by sim_new_timer(). */
+    SIM_TIMER
+} sim_op_t;
+
+/* Forward declaration: sim_aio_pool refers to it before it is defined. */
+static void sim_aio_cancel (BlockDriverAIOCB * acb);
+/* Next identifier handed out by insert_in_list(). */
+static int64_t sim_uuid = 0;
+/* Simulated clock; sim_task_by_acb() advances it to the time of the task
+ * being executed. */
+static int64_t current_time = 0;
+/* Exclusive upper bound of the random delay added by insert_in_list();
+ * a value <= 0 disables randomization. */
+static int64_t rand_time = 0;
+/* When non-zero, a line is printed for every task that gets queued. */
+static int interactive_print = FALSE;
+struct SimAIOCB;
+
+/*
+ * Note: disk_io_return_code, sim_set_disk_io_return_code(), and insert_task()
+ * work together to ensure that multiple subrequests triggered by the same
+ * outermost request either succeed together or fail together. This behavior
+ * is required by qemu-test.  Here is one example of the problems caused by
+ * departing from this behavior.  Consider a write request that generates
+ * two subrequests, w1 and w2. If w1 succeeds but w2 fails, the data will not
+ * be written into qemu-test's "truth image" but the part of the data handled
+ * by w1 will be written into qemu-test's "test image". As a result, their
+ * contents diverge and automated testing cannot continue.
+ */
+static int disk_io_return_code = 0;
+
+/* Per-image driver state (BlockDriverState.opaque). */
+typedef struct BDRVSimState {
+    int fd;                     /* Descriptor opened by sim_open(). */
+} BDRVSimState;
+
+/* One entry of the simulator's task list: a queued I/O request, a pending
+ * completion callback, or a timer. */
+typedef struct SimAIOCB {
+    BlockDriverAIOCB common;    /* Generic AIOCB part: bs, cb and opaque. */
+    int64_t uuid;               /* Assigned by insert_in_list(). */
+    sim_op_t op;                /* What executing this entry will do. */
+    int64_t sector_num;         /* First sector of a read/write request. */
+    QEMUIOVector *qiov;         /* Data buffers; NULL for flush requests. */
+    int nb_sectors;             /* Length of a read/write request. */
+    int ret;                    /* Value passed to the completion callback;
+                                 * copied from disk_io_return_code when the
+                                 * request is queued. */
+    int64_t time;               /* Simulated time used to order the list. */
+    struct SimAIOCB *next;      /* List links. prev == NULL marks an entry */
+    struct SimAIOCB *prev;      /* that is currently not in the list. */
+
+} SimAIOCB;
+
+/* Pool from which all SimAIOCB objects (requests and timers) are taken. */
+static AIOPool sim_aio_pool = {
+    .aiocb_size = sizeof (SimAIOCB),
+    .cancel = sim_aio_cancel,
+};
+
+/* Sentinel of the circular, doubly linked task list. An empty list has the
+ * sentinel pointing at itself. Its time is the largest int64_t value, which
+ * lets the sorted search in insert_in_list() stop at the sentinel. */
+static SimAIOCB head = {
+    .uuid = -1,
+    .time = (int64_t) (9223372036854775807ULL),
+    .op = SIM_NULL,
+    .next = &head,
+    .prev = &head,
+};
+
+/* Debug a specific task. In the default configuration CHECK_TASK() expands
+ * to nothing; switch the condition below to get a function that prints a
+ * message for one hard-coded uuid, which is a convenient place for a
+ * breakpoint. */
+#if 1
+# define CHECK_TASK(acb) do { } while (0)
+#else
+static inline void CHECK_TASK (int64_t uuid)
+{
+    if (uuid == 19LL) {
+        printf ("CHECK_TASK pause for task %" PRId64 "\n", uuid);
+    }
+}
+#endif
+
+/*
+ * Synchronously read or write 'nb_sectors' 512-byte sectors, starting at
+ * sector 'sector_num', on the file descriptor backing 'bs'. 'do_read' selects
+ * the direction; 'buf' must hold nb_sectors * 512 bytes.
+ *
+ * do_io() should never fail. A failure indicates a bug in the upper layer
+ * block device driver, or failure in the real hardware. On failure a
+ * diagnostic is printed and the process waits for input on stdin so that a
+ * debugger can be attached.
+ *
+ * Returns 0 on success and a negative errno value on failure (-EIO when a
+ * transfer makes no progress, e.g. a read beyond the end of the file).
+ */
+static int do_io (BlockDriverState * bs, int64_t sector_num, uint8_t * buf,
+                  int nb_sectors, int do_read)
+{
+    BDRVSimState *s = bs->opaque;
+    /* Widen before multiplying so that a large request cannot overflow int. */
+    size_t size = (size_t) nb_sectors * 512;
+    ssize_t ret;
+    int err;
+
+    if (lseek (s->fd, sector_num * 512, SEEK_SET) < 0) {
+        /* Save errno first: fprintf() and fgetc() may overwrite it. */
+        err = errno;
+        fprintf (stderr, "Error: lseek %s sector_num=%" PRId64 ". "
+                 "Pause process %d for debugging...\n",
+                 bs->filename, sector_num, getpid ());
+        fgetc (stdin);
+        /* Never transfer data at an unknown file offset. */
+        return -err;
+    }
+
+    while (size > 0) {
+        if (do_read) {
+            ret = read (s->fd, buf, size);
+        } else {
+            ret = write (s->fd, buf, size);
+        }
+
+        if (ret > 0) {
+            size -= ret;
+            buf += ret;
+            continue;
+        }
+        if (ret < 0 && errno == EINTR) {
+            continue;           /* Interrupted by a signal: just retry. */
+        }
+
+        /* Either a real error, or no progress at all. Returning here also
+         * prevents spinning forever on a read at end-of-file. */
+        err = (ret < 0) ? errno : EIO;
+        if (ret == 0 && do_read) {
+            fprintf (stderr,
+                     "Error: read beyond the size of %s sector_num=%" PRId64
+                     " nb_sectors=%d. Pause process %d for debugging...\n",
+                     bs->filename, sector_num, nb_sectors, getpid ());
+        } else {
+            fprintf (stderr, "Error: %s %s sector_num=%" PRId64
+                     " nb_sectors=%d. Pause process %d for debugging...\n",
+                     do_read ? "READ" : "WRITE", bs->filename, sector_num,
+                     nb_sectors, getpid ());
+        }
+        fgetc (stdin);
+        return -err;
+    }
+
+    return 0;
+}
+
+/* bdrv_read handler: synchronous read, carried out by do_io(). */
+static int sim_read (BlockDriverState * bs, int64_t sector_num, uint8_t * buf,
+                     int nb_sectors)
+{
+    const int reading = TRUE;
+
+    return do_io (bs, sector_num, buf, nb_sectors, reading);
+}
+
+/* bdrv_write handler: synchronous write, carried out by do_io(). do_io()
+ * takes a non-const buffer because it is shared with the read path, hence
+ * the cast; the data is not modified when writing. */
+static int sim_write (BlockDriverState * bs, int64_t sector_num,
+                      const uint8_t * buf, int nb_sectors)
+{
+    uint8_t *data = (uint8_t *) buf;
+
+    return do_io (bs, sector_num, data, nb_sectors, FALSE);
+}
+
+/*
+ * Give 'acb' a fresh uuid and link it into the task list.
+ *
+ * Without delay randomization (rand_time <= 0) the entry's time is reset to 0
+ * and it is appended at the tail, so the list is in insertion order.
+ * Otherwise a random delay in [0, rand_time) is added to a non-negative
+ * acb->time and the entry is placed so that the list stays sorted by
+ * ascending time; among entries with the same time the position is chosen
+ * at random.
+ *
+ * The caller must have set acb->time and must make sure that 'acb' is not
+ * currently linked into the list.
+ */
+static void insert_in_list (SimAIOCB * acb)
+{
+    int64_t new_id = sim_uuid++;
+    CHECK_TASK (new_id);
+    acb->uuid = new_id;
+
+    if (rand_time <= 0) {
+        /* Working with qemu-io.c and not doing delay randomization.
+         * Insert it to the tail. */
+        acb->time = 0;
+        acb->prev = head.prev;
+        acb->next = &head;
+        head.prev->next = acb;
+        head.prev = acb;
+        return;
+    }
+
+    if (acb->time >= 0) {
+        /* Introduce a random delay to better trigger rare race conditions. */
+        acb->time += random () % rand_time;
+    }
+
+    /* Find the position to insert. The list is sorted in ascending time.
+     * The search stops at the sentinel at the latest because head.time is
+     * the largest int64_t value. */
+    SimAIOCB *p = head.next;
+    while (1) {
+        if (p->time > acb->time) {
+            break;
+        }
+        /* Equal time: flip a coin so that the relative order of simultaneous
+         * tasks varies from run to run. */
+        if (p->time == acb->time && (random () % 2 == 0)) {
+            break;
+        }
+        p = p->next;
+    }
+
+    /* Insert acb before p. */
+    acb->next = p;
+    acb->prev = p->prev;
+    p->prev->next = acb;
+    p->prev = acb;
+}
+
+/* Debug problems related to reusing task objects. Problem already solved.
+ * By default the my_qemu_aio_* names are plain aliases of the QEMU pool
+ * functions. The disabled variant below adds tracing and asserts that an
+ * object obtained from the pool is not still linked into the task list. */
+#if 1
+# define my_qemu_aio_get qemu_aio_get
+# define my_qemu_aio_release qemu_aio_release
+
+#else
+/* Return 'acb' if it is currently linked into the task list, else NULL. */
+static SimAIOCB *search_task_list (SimAIOCB * acb)
+{
+    SimAIOCB *p;
+    for (p = head.next; p != &head; p = p->next) {
+        if (p == acb) {
+            return p;
+        }
+    }
+
+    return NULL;
+}
+
+/* NOTE(review): the 'pool' argument is ignored; sim_aio_pool is always
+ * used. All callers in this file pass &sim_aio_pool anyway. */
+static inline void *my_qemu_aio_get (AIOPool * pool, BlockDriverState * bs,
+                                     BlockDriverCompletionFunc * cb,
+                                     void *opaque)
+{
+    SimAIOCB *acb = (SimAIOCB *) qemu_aio_get (&sim_aio_pool, bs, cb, opaque);
+    QDEBUG ("SIM: qemu_aio_get reuse old task%" PRId64 "\n", acb->uuid);
+    ASSERT (!search_task_list (acb));
+    return acb;
+}
+
+static inline void my_qemu_aio_release (SimAIOCB * acb)
+{
+    QDEBUG ("SIM: qemu_aio_release task%" PRId64 "\n", acb->uuid);
+    qemu_aio_release (acb);
+}
+#endif
+
+/*
+ * Queue a new asynchronous request of kind 'op' (SIM_READ, SIM_WRITE or
+ * SIM_FLUSH) for 'bs'. Nothing is executed here; the request runs when it is
+ * later selected through sim_task_by_uuid() or sim_all_tasks().
+ *
+ * The current disk_io_return_code is recorded in the request so that all
+ * subrequests queued while it is in effect complete with the same result.
+ *
+ * Returns the generic AIOCB of the new request, or NULL if no control block
+ * could be obtained.
+ */
+static BlockDriverAIOCB *insert_task (int op, BlockDriverState * bs,
+                                      int64_t sector_num, QEMUIOVector * qiov,
+                                      int nb_sectors,
+                                      BlockDriverCompletionFunc * cb,
+                                      void *opaque)
+{
+    SimAIOCB *acb = my_qemu_aio_get (&sim_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+
+    acb->op = op;
+    acb->sector_num = sector_num;
+    acb->qiov = qiov;
+    acb->nb_sectors = nb_sectors;
+    acb->ret = disk_io_return_code;
+    acb->time = current_time;
+    insert_in_list (acb);
+
+    if (interactive_print) {
+        switch (op) {
+        case SIM_READ:
+            printf ("Added READ uuid=%" PRId64 "  filename=%s  sector_num=%"
+                    PRId64 "  nb_sectors=%d\n", acb->uuid,
+                    acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+            break;
+        case SIM_WRITE:
+            printf ("Added WRITE uuid=%" PRId64 "  filename=%s  sector_num=%"
+                    PRId64 "  nb_sectors=%d\n", acb->uuid,
+                    acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+            break;
+        case SIM_FLUSH:
+            /* sim_aio_flush() queues SIM_FLUSH through this function, so a
+             * flush is a legitimate op and must not end up in the error
+             * branch below. */
+            printf ("Added FLUSH uuid=%" PRId64 "  filename=%s\n",
+                    acb->uuid, acb->common.bs->filename);
+            break;
+        default:
+            fprintf (stderr, "Unknown op %d\n", op);
+            exit (1);
+        }
+    }
+
+    return &acb->common;
+}
+
+/*
+ * Re-queue an executed request so that its completion callback becomes a
+ * separate, independently schedulable task. The entry is stamped with the
+ * current simulated time, linked into the list again (which also gives it a
+ * new uuid), and its op is switched to the matching *_CALLBACK value.
+ * Any op other than READ, WRITE or FLUSH terminates the process.
+ */
+static void insert_aio_callback (SimAIOCB * acb)
+{
+    acb->time = current_time;
+    insert_in_list (acb);
+
+    switch (acb->op) {
+    case SIM_FLUSH:
+        acb->op = SIM_FLUSH_CALLBACK;
+        if (interactive_print) {
+            printf ("Added FLUSH_CALLBACK uuid=%" PRId64 "  filename=%s\n",
+                    acb->uuid, acb->common.bs->filename);
+        }
+        break;
+
+    case SIM_READ:
+        acb->op = SIM_READ_CALLBACK;
+        if (interactive_print) {
+            printf ("Added READ_CALLBACK uuid=%" PRId64
+                    "  filename=%s  sector_num=%" PRId64 "  nb_sectors=%d\n",
+                    acb->uuid, acb->common.bs->filename, acb->sector_num,
+                    acb->nb_sectors);
+        }
+        break;
+
+    case SIM_WRITE:
+        acb->op = SIM_WRITE_CALLBACK;
+        if (interactive_print) {
+            printf ("Added WRITE_CALLBACK uuid=%" PRId64
+                    "  filename=%s  sector_num=%" PRId64 "  nb_sectors=%d\n",
+                    acb->uuid, acb->common.bs->filename, acb->sector_num,
+                    acb->nb_sectors);
+        }
+        break;
+
+    default:
+        fprintf (stderr, "Wrong op %d\n", acb->op);
+        exit (1);
+    }
+}
+
+/*
+ * Print one line per entry of the task list to stdout, in list order.
+ *
+ * Flush requests, flush callbacks and timers can all be in the list (they
+ * are queued by sim_aio_flush(), insert_aio_callback() and sim_mod_timer()),
+ * so they are listed as well. Timers have no BlockDriverState, therefore no
+ * file name is printed for them. An op value that cannot occur in the list
+ * terminates the process.
+ */
+void sim_list_tasks (void)
+{
+    SimAIOCB *acb;
+
+    for (acb = head.next; acb != &head; acb = acb->next) {
+        const char *label;
+
+        switch (acb->op) {
+        case SIM_READ:
+            label = "READ";
+            break;
+        case SIM_WRITE:
+            label = "WRITE";
+            break;
+        case SIM_READ_CALLBACK:
+            label = "CALLBACK READ";
+            break;
+        case SIM_WRITE_CALLBACK:
+            label = "CALLBACK WRITE";
+            break;
+        case SIM_FLUSH:
+            printf ("uuid=%" PRId64 "  %-14s file=%s\n", acb->uuid, "FLUSH",
+                    acb->common.bs->filename);
+            continue;
+        case SIM_FLUSH_CALLBACK:
+            printf ("uuid=%" PRId64 "  %-14s file=%s\n", acb->uuid,
+                    "CALLBACK FLUSH", acb->common.bs->filename);
+            continue;
+        case SIM_TIMER:
+            printf ("uuid=%" PRId64 "  %-14s time=%" PRId64 "\n", acb->uuid,
+                    "TIMER", acb->time);
+            continue;
+        default:
+            fprintf (stderr, "Wrong OP %d\n", acb->op);
+            exit (1);
+        }
+
+        /* Read/write requests and their callbacks share one layout.
+         * sector_num is an int64_t, hence PRId64. */
+        printf ("uuid=%" PRId64 "  %-14s file=%s  sector_num=%" PRId64
+                "  nb_sectors=%d\n", acb->uuid, label,
+                acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+    }
+}
+
+/* Deliver the completion callback of a finished request, passing the result
+ * recorded in acb->ret. The assertion checks that no forced return code is
+ * active (disk_io_return_code == 0) while callbacks are being delivered. */
+static inline void sim_callback (SimAIOCB * acb)
+{
+    ASSERT (disk_io_return_code == 0);
+    FVD_DEBUG_ACB (acb->common.opaque);
+    acb->common.cb (acb->common.opaque, acb->ret);
+}
+
+/* Return the simulator's current notion of time. */
+int64_t sim_get_time (void)
+{
+    const int64_t now = current_time;
+
+    return now;
+}
+
+/*
+ * Create a simulated timer that will invoke 'cb' with 'opaque' when it is
+ * executed. The timer is not armed: it only enters the task list once
+ * sim_mod_timer() is called on it.
+ *
+ * Returns an opaque handle for the other sim_*_timer() functions, or NULL if
+ * no control block could be obtained (checked the same way as in
+ * insert_task()).
+ */
+void *sim_new_timer (void *cb, void *opaque)
+{
+    SimAIOCB *acb = my_qemu_aio_get (&sim_aio_pool, NULL, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+
+    acb->op = SIM_TIMER;
+    acb->prev = NULL;           /* Mark it as not in the list. */
+    return acb;
+}
+
+/*
+ * Arm (or re-arm) the timer 'ts' to fire at 'expire_time'. A timer that is
+ * already queued is taken out of the task list first and then inserted again
+ * by insert_in_list().
+ */
+void sim_mod_timer (void *ts, int64_t expire_time)
+{
+    SimAIOCB *timer = ts;
+    const int queued = (timer->prev != NULL);
+
+    if (queued) {
+        /* Unlink the stale entry before queueing it again. */
+        SimAIOCB *before = timer->prev;
+        SimAIOCB *after = timer->next;
+
+        after->prev = before;
+        before->next = after;
+    }
+
+    timer->time = expire_time;
+    insert_in_list (timer);
+}
+
+/*
+ * Destroy the timer 'ts' and return its control block to the pool.
+ *
+ * A timer that is still armed is unlinked from the task list first;
+ * releasing a linked entry would leave its neighbours pointing at freed
+ * memory.
+ */
+void sim_free_timer (void *ts)
+{
+    SimAIOCB *acb = ts;
+
+    CHECK_TASK (acb->uuid);
+    if (acb->prev) {
+        /* Still in the list (prev != NULL): remove it. */
+        acb->next->prev = acb->prev;
+        acb->prev->next = acb->next;
+        acb->prev = NULL;
+    }
+    my_qemu_aio_release (acb);
+}
+
+/*
+ * Disarm the timer 'ts': take it out of the task list if it is queued.
+ * The timer object stays valid and can be armed again with sim_mod_timer().
+ */
+void sim_del_timer (void *ts)
+{
+    SimAIOCB *timer = ts;
+
+    CHECK_TASK (timer->uuid);
+
+    if (timer->prev == NULL) {
+        return;                 /* Not queued: nothing to do. */
+    }
+
+    /* Splice the neighbours together ... */
+    timer->prev->next = timer->next;
+    timer->next->prev = timer->prev;
+
+    /* ... and mark the timer as not in the list. */
+    timer->prev = NULL;
+}
+
+/* Set the result that insert_task() records in every request queued from now
+ * on. A non-zero value makes those requests skip the real disk I/O and
+ * complete with that value (see sim_task_by_acb()); 0 restores normal
+ * operation. */
+void sim_set_disk_io_return_code (int ret)
+{
+    disk_io_return_code = ret;
+}
+
+/*
+ * Carry out the disk transfer of a SIM_READ ('do_read' != 0) or SIM_WRITE
+ * request. A request with a single iovec is transferred in place; otherwise
+ * the data goes through a temporary contiguous bounce buffer. Any I/O error
+ * terminates the process, since it indicates a bug rather than a condition
+ * the test can recover from.
+ */
+static void sim_transfer (SimAIOCB * acb, int do_read)
+{
+    BlockDriverState *bs = acb->common.bs;
+    uint8_t *bounce = NULL;
+    uint8_t *buf;
+    int ret;
+
+    if (acb->qiov->niov == 1) {
+        buf = acb->qiov->iov->iov_base;
+    } else {
+        bounce = qemu_blockalign (bs, acb->qiov->size);
+        if (!do_read) {
+            qemu_iovec_to_buffer (acb->qiov, bounce);
+        }
+        buf = bounce;
+    }
+
+    if (do_read) {
+        ret = sim_read (bs, acb->sector_num, buf, acb->nb_sectors);
+    } else {
+        ret = sim_write (bs, acb->sector_num, buf, acb->nb_sectors);
+    }
+    if (ret != 0) {
+        /* sector_num is an int64_t, so it must be printed with PRId64. */
+        fprintf (stderr, "Error in %s %s sector_num=%" PRId64
+                 " nb_sectors=%d\n", do_read ? "reading" : "writing",
+                 bs->filename, acb->sector_num, acb->nb_sectors);
+        exit (1);
+    }
+
+    if (bounce) {
+        if (do_read) {
+            qemu_iovec_from_buffer (acb->qiov, bounce, acb->qiov->size);
+        }
+        qemu_vfree (bounce);
+    }
+}
+
+/*
+ * Execute one entry of the task list.
+ *
+ * The entry is unlinked and the simulated clock is advanced to its time.
+ * Then, depending on acb->op:
+ *   - SIM_TIMER: the timer callback is invoked;
+ *   - SIM_READ / SIM_WRITE: the disk transfer is performed, unless the
+ *     request was queued with a non-zero return code, and the completion
+ *     callback is queued as a new task;
+ *   - SIM_FLUSH: no real flush is done; only the callback is queued;
+ *   - SIM_*_CALLBACK: the completion callback is delivered and the control
+ *     block is released.
+ * Any other op terminates the process.
+ */
+static void sim_task_by_acb (SimAIOCB * acb)
+{
+    CHECK_TASK (acb->uuid);
+
+    /* Remove it from the list. */
+    acb->next->prev = acb->prev;
+    acb->prev->next = acb->next;
+    acb->prev = NULL;        /* Indicate that it is no longer in the list. */
+
+    if (acb->time > current_time) {
+        current_time = acb->time;
+    }
+
+    switch (acb->op) {
+    case SIM_TIMER:
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64 " TIMER \n",
+                acb->uuid, acb->time);
+
+        FVD_DEBUG_ACB (acb->common.opaque);
+        ((QEMUTimerCB *) acb->common.cb) (acb->common.opaque);
+        /* Do not touch acb any more: the callback is free to call
+         * sim_mod_timer() or sim_free_timer() on this timer. */
+        break;
+
+    case SIM_READ:
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64
+                " READ sector_num=%" PRId64 " nb_sectors=%d\n",
+                acb->uuid, acb->time, acb->sector_num, acb->nb_sectors);
+
+        if (acb->ret == 0) {
+            sim_transfer (acb, TRUE);
+        }
+        insert_aio_callback (acb);
+        break;
+
+    case SIM_WRITE:
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64
+                " WRITE sector_num=%" PRId64 " nb_sectors=%d\n",
+                acb->uuid, acb->time, acb->sector_num, acb->nb_sectors);
+
+        if (acb->ret == 0) {
+            sim_transfer (acb, FALSE);
+        }
+        insert_aio_callback (acb);
+        break;
+
+    case SIM_FLUSH:
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64 " FLUSH\n",
+                acb->uuid, acb->time);
+        /* Real flushing (fdatasync) is skipped to speed up simulation. */
+        insert_aio_callback (acb);
+        break;
+
+    case SIM_READ_CALLBACK:
+    case SIM_WRITE_CALLBACK:
+    case SIM_FLUSH_CALLBACK:
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64 " CALLBACK\n",
+                acb->uuid, acb->time);
+        sim_callback (acb);
+        CHECK_TASK (acb->uuid);
+        my_qemu_aio_release (acb);
+        break;
+
+    default:
+        fprintf (stderr, "Unknown op %d\n", acb->op);
+        exit (1);
+    }
+}
+
+/*
+ * Execute the queued task whose identifier is 'uuid'.
+ * Returns 0 if the task was found and executed, -1 if no such task exists.
+ */
+int sim_task_by_uuid (int64_t uuid)
+{
+    SimAIOCB *cur = head.next;
+
+    /* Walk the list until the task or the sentinel is reached. */
+    while (cur != &head && cur->uuid != uuid) {
+        cur = cur->next;
+    }
+
+    if (cur == &head) {
+        return -1;
+    }
+
+    sim_task_by_acb (cur);
+    return 0;
+}
+
+/*
+ * Execute tasks from the front of the list until the list is empty. Tasks
+ * queued while others execute (e.g. completion callbacks) are executed as
+ * well. Returns the number of tasks executed.
+ */
+int sim_all_tasks (void)
+{
+    int executed = 0;
+    SimAIOCB *first;
+
+    /* Always restart from head.next: executing a task changes the list. */
+    for (first = head.next; first != &head; first = head.next) {
+        sim_task_by_acb (first);
+        executed++;
+    }
+
+    return executed;
+}
+
+/* bdrv_aio_readv handler: queue the read as a SIM_READ task. */
+static BlockDriverAIOCB *sim_aio_readv (BlockDriverState * bs,
+                                        int64_t sector_num,
+                                        QEMUIOVector * qiov,
+                                        int nb_sectors,
+                                        BlockDriverCompletionFunc * cb,
+                                        void *opaque)
+{
+    BlockDriverAIOCB *task;
+
+    task = insert_task (SIM_READ, bs, sector_num, qiov, nb_sectors, cb,
+                        opaque);
+    return task;
+}
+
+/* bdrv_aio_writev handler: queue the write as a SIM_WRITE task. */
+static BlockDriverAIOCB *sim_aio_writev (BlockDriverState * bs,
+                                         int64_t sector_num,
+                                         QEMUIOVector * qiov,
+                                         int nb_sectors,
+                                         BlockDriverCompletionFunc * cb,
+                                         void *opaque)
+{
+    BlockDriverAIOCB *task;
+
+    task = insert_task (SIM_WRITE, bs, sector_num, qiov, nb_sectors, cb,
+                        opaque);
+    return task;
+}
+
+/* bdrv_aio_flush handler: queue a SIM_FLUSH task. A flush carries no sector
+ * range and no data buffers. */
+static BlockDriverAIOCB *sim_aio_flush (BlockDriverState * bs,
+                                        BlockDriverCompletionFunc * cb,
+                                        void *opaque)
+{
+    const int64_t no_sector = 0;
+    const int no_length = 0;
+
+    return insert_task (SIM_FLUSH, bs, no_sector, NULL, no_length, cb, opaque);
+}
+
+/*
+ * AIOPool cancel handler: drop a queued request without delivering its
+ * callback. The request is unlinked from the task list and its control block
+ * is released. Cancelling a request that is not in the list is treated as a
+ * bug and trips an assertion.
+ */
+static void sim_aio_cancel (BlockDriverAIOCB * blockacb)
+{
+    SimAIOCB *acb = container_of (blockacb, SimAIOCB, common);
+
+    CHECK_TASK (acb->uuid);
+
+    if (!acb->prev) {
+        ASSERT (FALSE);        /* Cancel a task not in the list. */
+        return;
+    }
+
+    acb->prev->next = acb->next;
+    acb->next->prev = acb->prev;
+    acb->prev = NULL;
+    my_qemu_aio_release (acb);
+}
+
+/* bdrv_probe handler. The image content is not inspected: every image is
+ * claimed with a fixed score that is higher than RAW's, so that the image
+ * will be opened using the 'sim' format. */
+static int sim_probe (const uint8_t * buf, int buf_size, const char *filename)
+{
+    const int score_above_raw = 2;
+
+    return score_above_raw;
+}
+
+/*
+ * bdrv_file_open handler: open the host file behind a blksim image.
+ *
+ * 'filename' may carry an optional "blksim:" protocol prefix, which is
+ * stripped. BDRV_O_RDWR selects read-write access, BDRV_O_NOCACHE maps to
+ * O_DIRECT, and unless BDRV_O_CACHE_WB is given the file is opened O_DSYNC.
+ * The image size is taken from the current file length.
+ *
+ * Returns 0 on success, or a negative errno value if the file cannot be
+ * opened.
+ */
+static int sim_open (BlockDriverState * bs, const char *filename,
+                     int bdrv_flags)
+{
+    BDRVSimState *s = bs->opaque;
+    int open_flags = O_BINARY | O_LARGEFILE;
+
+    if ((bdrv_flags & BDRV_O_RDWR)) {
+        open_flags |= O_RDWR;
+    } else {
+        open_flags |= O_RDONLY;
+    }
+
+    if ((bdrv_flags & BDRV_O_NOCACHE)) {
+        open_flags |= O_DIRECT;
+    } else if (!(bdrv_flags & BDRV_O_CACHE_WB)) {
+        open_flags |= O_DSYNC;
+    }
+
+    /* Parse the "blksim:" prefix */
+    if (!strncmp(filename, "blksim:", strlen("blksim:"))) {
+        filename += strlen("blksim:");
+    }
+
+    s->fd = open (filename, open_flags);
+    if (s->fd < 0) {
+        /* Report the real reason instead of a bare -1 (which would read as
+         * -EPERM to callers that interpret the result as an errno). */
+        return -errno;
+    }
+
+    int64_t len = lseek (s->fd, 0, SEEK_END);
+    if (len >= 0) {
+        bs->total_sectors = len / 512;
+    } else {
+        bs->total_sectors = 0;
+    }
+
+    bs->growable = 1;
+    return 0;
+}
+
+/* bdrv_close handler: close the descriptor opened by sim_open(). */
+static void sim_close (BlockDriverState * bs)
+{
+    BDRVSimState *s = bs->opaque;
+    close (s->fd);
+}
+
+/*
+ * bdrv_flush handler. Deliberately a no-op that always reports success:
+ * the real fdatasync() on the backing file is skipped to speed up
+ * simulation.
+ */
+static int sim_flush (BlockDriverState * bs)
+{
+    const int success = 0;
+
+    return success;
+}
+
+/*
+ * bdrv_has_zero_init handler: report whether newly allocated space reads
+ * back as zeros. Returns 0 for block and character devices and 1 otherwise.
+ *
+ * The open descriptor is examined rather than bs->filename: sim_open()
+ * strips a "blksim:" prefix before opening the file, so a name that still
+ * carries the prefix is not a path stat() could resolve.
+ */
+static int sim_has_zero_init (BlockDriverState * bs)
+{
+    BDRVSimState *s = bs->opaque;
+    struct stat buf;
+
+    if (fstat (s->fd, &buf) != 0) {
+        fprintf (stderr, "Failed to fstat() %s\n", bs->filename);
+        exit (1);
+    }
+
+    if (S_ISBLK (buf.st_mode) || S_ISCHR (buf.st_mode)) {
+        return 0;
+    }
+
+    return 1;
+}
+
+/*
+ * bdrv_truncate handler: set the length of the backing file to 'offset'
+ * bytes. Returns 0 on success or a negative errno value on failure, like
+ * the other handlers of this driver, instead of ftruncate()'s bare -1.
+ */
+static int sim_truncate (BlockDriverState * bs, int64_t offset)
+{
+    BDRVSimState *s = bs->opaque;
+
+    if (ftruncate (s->fd, offset) < 0) {
+        return -errno;
+    }
+    return 0;
+}
+
+/* Driver description registered by enable_block_sim(). "blksim" is used both
+ * as the format name and as the protocol name (the "blksim:" file name
+ * prefix handled in sim_open()). */
+BlockDriver bdrv_sim = {
+    .format_name = "blksim",
+    .protocol_name = "blksim",
+    .instance_size = sizeof (BDRVSimState),
+    .bdrv_probe = sim_probe,
+    .bdrv_file_open = sim_open,
+    .bdrv_close = sim_close,
+    .bdrv_flush = sim_flush,
+    /* Synchronous I/O goes straight to the file ... */
+    .bdrv_read = sim_read,
+    .bdrv_write = sim_write,
+    /* ... while asynchronous I/O is queued in the task list. */
+    .bdrv_aio_readv = sim_aio_readv,
+    .bdrv_aio_writev = sim_aio_writev,
+    .bdrv_aio_flush = sim_aio_flush,
+    .bdrv_has_zero_init = sim_has_zero_init,
+    .bdrv_truncate = sim_truncate,
+};
+
+/*
+ * Turn on block device simulation. The "blksim" driver is registered unless
+ * it already is, and the simulator options are (re)set:
+ *   print       non-zero to print a line for every task that is queued;
+ *   _rand_time  bound of the random delay added to tasks; a value <= 0
+ *               disables delay randomization.
+ */
+void enable_block_sim (int print, int64_t _rand_time)
+{
+    rand_time = _rand_time;
+    interactive_print = print;
+
+    if (bdrv_find_format ("blksim") == NULL) {
+        bdrv_register (&bdrv_sim);
+    }
+}
diff --git a/block/blksim.h b/block/blksim.h
new file mode 100644
index 0000000..7afca98
--- /dev/null
+++ b/block/blksim.h
@@ -0,0 +1,30 @@ 
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this is the header of the simulated block device
+ *  driver "blksim".
+ *============================================================================*/
+
+/* Identifiers containing a double underscore are reserved for the
+ * implementation, so the guard uses a plain name. */
+#ifndef BLOCK_BLKSIM_H
+#define BLOCK_BLKSIM_H
+
+/* int64_t is used in the prototypes below; include it here so that the
+ * header is self-contained. */
+#include <stdint.h>
+
+/* Register the "blksim" driver (if needed) and set the simulator options:
+ * 'print' enables a message per queued task, '_rand_time' bounds the random
+ * delay added to tasks (<= 0 disables it). */
+void enable_block_sim (int print, int64_t _rand_time);
+
+/* Print all queued tasks to stdout. */
+void sim_list_tasks (void);
+
+/* Execute the task with the given uuid. Returns 0 on success, -1 if there is
+ * no such task. */
+int sim_task_by_uuid (int64_t uuid);
+
+/* Execute tasks until the task list is empty. Returns the number of tasks
+ * executed. */
+int sim_all_tasks (void);
+
+/* Return the current simulated time. */
+int64_t sim_get_time (void);
+
+/* Simulated timers. 'ts' is the handle returned by sim_new_timer(). */
+void *sim_new_timer (void *cb, void *opaque);
+void sim_mod_timer (void *ts, int64_t expire_time);
+void sim_free_timer (void *ts);
+void sim_del_timer (void *ts);
+
+/* Set the result recorded in requests queued from now on (0 = success). */
+void sim_set_disk_io_return_code (int ret);
+
+#endif /* BLOCK_BLKSIM_H */
diff --git a/qemu-io-sim.c b/qemu-io-sim.c
new file mode 100644
index 0000000..1e7a2aa
--- /dev/null
+++ b/qemu-io-sim.c
@@ -0,0 +1,107 @@ 
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ * qemu-io-sim works with qemu-io to perform simulated testing. The 'sim'
+ * command allows the user to control the order of disk I/O and callback
+ * activities in order to test rare race conditions. Note that once 'sim
+ * enable' is done, it can only test aio_read and aio_write. See
+ * block/blksim.c for the simulated block device driver.
+ *============================================================================*/
+
+#include "block/blksim.h"
+
+void fvd_init_prefetch (BlockDriverState * bs);
+
+/*
+ * Handler of "sim prefetch": start FVD prefetching on the open image.
+ * Prefetching exists only in the FVD driver, so every image whose format
+ * name does not start with "fvd" is rejected with a message.
+ */
+static void sim_start_prefetch (void)
+{
+    /* The test must reject images that are NOT fvd; comparing with
+     * !strncmp() would refuse exactly the images that can prefetch. */
+    if (!bs->drv->format_name
+        || strncmp (bs->drv->format_name, "fvd", 3) != 0) {
+        printf ("This image does not support prefetching.\n");
+        return;
+    }
+    fvd_init_prefetch (bs);
+    printf ("Prefetching started\n");
+}
+
+/* Print the usage of the "sim" command to stdout. */
+static void sim_help (void)
+{
+    printf ("\n"
+            " sim enable\t\tenable simulation\n"
+            " sim list\t\tlist all simulation tasks\n"
+            " sim <#task> [#ret]\trun a simulation task, optionally using #ret as the return value of a read/write operation\n"
+            " sim all [#ret]\t\trun all tasks, optionally using #ret as the return value of read/write tasks\n"
+            " sim prefetch\t\tstart prefetching\n");
+}
+
+/*
+ * Implementation of the qemu-io "sim" command.
+ *
+ *   argv[1]  sub-command: "enable", "list", "prefetch", "all", or the uuid
+ *            of a single task to run;
+ *   argv[2]  optional return code to force on read/write tasks for the
+ *            duration of this command (default 0).
+ *
+ * Always returns 0; problems are reported with a message.
+ */
+static int sim_f (int argc, char **argv)
+{
+    int ret = 0;
+
+    if (argc == 3) {
+        ret = atoi (argv[2]);
+    }
+    else if (argc != 2) {
+        sim_help ();
+        return 0;
+    }
+
+    if (strcmp (argv[1], "enable") == 0) {
+        if (bs) {
+            printf ("Please close the image first. \"sim enable\" must be done before the\n"
+                    "image is opened so that the image is opened with simulation support.\n");
+        }
+        else {
+            enable_block_sim(1/*print*/, 0 /*no random time*/);
+            printf ("Block device simulation is enabled.\n");
+        }
+        return 0;
+    }
+
+    if (!bs) {
+        fprintf(stderr, "no file open, try 'help open'\n");
+        return 0;
+    }
+
+    if (!bdrv_find_format("blksim")) {
+        printf ("\"sim enable\" must be done before invoking any other sim commands.\n");
+        return 0;
+    }
+
+    if (strcmp (argv[1], "list") == 0) {
+        sim_list_tasks ();
+    }
+    else if (strcmp (argv[1], "prefetch") == 0) {
+        sim_start_prefetch ();
+    }
+    else if (strcmp (argv[1], "all") == 0) {
+        sim_set_disk_io_return_code (ret);
+        int n = sim_all_tasks ();
+        sim_set_disk_io_return_code (0);
+        printf ("Executed %d tasks.\n", n);
+    }
+    else {
+        /* Anything else must be a task uuid. Parse it strictly: atoll()
+         * would silently turn a mistyped sub-command into task 0. */
+        char *end;
+        long long uuid = strtoll (argv[1], &end, 10);
+
+        if (end == argv[1] || *end != '\0') {
+            printf ("Unknown sim command: %s\n", argv[1]);
+            sim_help ();
+            return 0;
+        }
+
+        sim_set_disk_io_return_code (ret);
+        if (sim_task_by_uuid (uuid) != 0) {
+            printf ("Task %lld not found.\n", uuid);
+        }
+        sim_set_disk_io_return_code (0);
+    }
+
+    return 0;
+}
+
+/* qemu-io command table entry for "sim" (alias "s"), handled by sim_f().
+ * One or two arguments are accepted: the sub-command or task uuid and an
+ * optional return code. */
+static const cmdinfo_t sim_cmd = {
+    .name = "sim",
+    .altname = "s",
+    .cfunc = sim_f,
+    .argmin = 1,
+    .argmax = 2,
+    .args = "",
+    .oneline = "use simulation to control the order of disk I/Os and callbacks",
+    /* NOTE(review): presumably lets the command run with no image open, which
+     * "sim enable" requires - confirm against qemu-io's cmd handling. */
+    .flags = CMD_NOFILE_OK,
+    .help = sim_help,
+};
diff --git a/qemu-test.c b/qemu-test.c
new file mode 100644
index 0000000..12aefa3
--- /dev/null
+++ b/qemu-test.c
@@ -0,0 +1,794 @@ 
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *        Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this module implements a fully automated testing tool
+ *  for block device drivers. It works with block/blksim.c.
+ *=============================================================================
+ */
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <getopt.h>
+
+#include "qemu-timer.h"
+#include "qemu-common.h"
+#include "block_int.h"
+#include "block/fvd-ext.h"
+#include "block/blksim.h"
+
+/* Print "<file>:<line> --- " followed by a printf-style message to stderr,
+ * then terminate the process with exit(-1). */
+#define die(format,...) \
+    do { \
+        fprintf (stderr, "%s:%d --- ", __FILE__, __LINE__); \
+        fprintf (stderr, format, ##__VA_ARGS__); \
+        exit (-1);\
+    } while(0)
+
+/* Kind of request a tester issues. OP_NULL is never submitted; the code
+ * treats it as a fatal error. */
+typedef enum { OP_NULL = 0, OP_READ, OP_WRITE, OP_FLUSH } op_type_t;
+/* Printable names, indexed by op_type_t. */
+const char *op_type_str[] = { "NULL ", "READ ", "WRITE", "FLUSH" };
+
+/* State of a full-image comparison, carried from one asynchronous read of
+ * the test image to the next (see compare_full_images_cb). */
+typedef struct CompareFullCB {
+    QEMUIOVector qiov;      /* I/O vector wrapping the single 'iov' below. */
+    struct iovec iov;       /* Buffer receiving data from the test image. */
+    int64_t sector_num;     /* First sector of the chunk being compared. */
+    int nb_sectors;         /* Number of sectors in the current chunk. */
+    int max_nb_sectors;     /* Upper bound on the chunk size, in sectors. */
+    uint8_t *truth_buf;     /* Buffer receiving data from the truth file. */
+} CompareFullCB;
+
+/* One "tester": a slot that has at most one random request in flight. */
+typedef struct RandomIO {
+    QEMUIOVector qiov;      /* Scatter/gather list over test_buf. */
+    int64_t sector_num;     /* First sector of the current request. */
+    int nb_sectors;         /* Length of the current request, in sectors. */
+    uint8_t *truth_buf;     /* Data read from the truth file for verification. */
+    uint8_t *test_buf;      /* Data written to / read from the test image. */
+    op_type_t type;         /* Kind of the current request. */
+    int tester;             /* Index of this slot in testers[]. */
+    int64_t uuid;           /* Id of the current request, from test_uuid. */
+    int allow_cancel;       /* Whether the request may be randomly cancelled. */
+    BlockDriverAIOCB *acb;  /* Not referenced by the code visible here. */
+} RandomIO;
+
+/* Program name printed by usage(); set from argv[0] in main(). */
+static char *progname;
+/* The image under test, opened in perform_test(). */
+static BlockDriverState *bs;
+/* File descriptor of the truth file, opened in perform_test(). */
+static int fd;
+/* Length of the test image in 512-byte sectors. */
+static int64_t total_sectors;
+/* Maximum request size. Given in bytes on the command line and converted to
+ * sectors in perform_test(). */
+static int64_t io_size = 262144;
+/* If set, every successful write is followed by a read-back and compare. */
+static int verify_write = TRUE;
+/* Number of testers, i.e., the size of testers[]. */
+static int parallel = 1;
+/* Maximum number of iovec elements in one request. */
+static int max_iov = 10;
+/* Number of requests to start in total, and how many were started so far. */
+static int64_t round = 10;
+static int64_t finished_round = 0;
+/* Array of 'parallel' testers, allocated in perform_test(). */
+static RandomIO *testers = NULL;
+/* Probabilities of injecting a failure, cancelling a request, and issuing a
+ * flush instead of a read/write. */
+static double fail_prob = 0;
+static double cancel_prob = 0;
+static double flush_prob = 0;
+/* Passed to enable_block_sim() in main(). */
+static int64_t rand_time = 1000;
+/* Next request id handed out by perform_next_io(). */
+static int64_t test_uuid = 0;
+/* Selects how qemu_bh_schedule() queues a bottom half. */
+static int instant_qemubh = FALSE;
+
+static void rand_io_cb (void *opaque, int ret);
+static void perform_next_io (RandomIO * r);
+
+/* Return the simulator's current time; the 'clock' argument is ignored. */
+int64_t qemu_get_clock (QEMUClock * clock)
+{
+    return sim_get_time ();
+}
+
+/* Forward the timer update to the simulator. */
+void qemu_mod_timer (QEMUTimer * ts, int64_t expire_time)
+{
+    sim_mod_timer (ts, expire_time);
+}
+
+/* Create a simulator timer for cb/opaque; the 'clock' argument is ignored. */
+QEMUTimer *qemu_new_timer (QEMUClock * clock, QEMUTimerCB * cb, void *opaque)
+{
+    return sim_new_timer (cb, opaque);
+}
+
+/* Release a timer through the simulator. */
+void qemu_free_timer (QEMUTimer * ts)
+{
+    sim_free_timer (ts);
+}
+
+/* Remove a pending timer through the simulator. */
+void qemu_del_timer (QEMUTimer * ts)
+{
+    sim_del_timer (ts);
+}
+
+/* In this tool a bottom half is represented by a simulator timer. */
+QEMUBH *qemu_bh_new (QEMUBHFunc * cb, void *opaque)
+{
+    return sim_new_timer (cb, opaque);
+}
+
+/* Stub: does nothing and always returns 0. */
+int qemu_bh_poll (void)
+{
+    return 0;
+}
+
+/* Schedule a bottom half as a simulator timer. With instant_qemubh set the
+ * expiry is -1 so that it runs next; otherwise it is queued at the current
+ * simulated time. */
+void qemu_bh_schedule (QEMUBH * bh)
+{
+    if (instant_qemubh) {
+        sim_mod_timer (bh, -1);        /* Run this bh next. */
+    } else {
+        sim_mod_timer (bh, sim_get_time ());
+    }
+}
+
+/* Cancel a scheduled bottom half by removing its simulator timer. */
+void qemu_bh_cancel (QEMUBH * bh)
+{
+    sim_del_timer (bh);
+}
+
+/* Release a bottom half by freeing its simulator timer. */
+void qemu_bh_delete (QEMUBH * bh)
+{
+    sim_free_timer (bh);
+}
+
+/* Print the command line synopsis and terminate with exit status 1.
+ * The list mirrors the long options accepted by main(). */
+static void usage (void)
+{
+    printf ("%s [--help]\n"
+            "\t--truth=<truth_img>\n"
+            "\t--test=<img_to_test>\n"
+            "\t[--format=<test_img_fmt>]\n"
+            "\t[--seed=<#d>]\n"
+            "\t[--round=<#d>]\n"
+            "\t[--rand_time=<#d>]\n"
+            "\t[--instant_qemubh=<true|false>]\n"
+            "\t[--fail_prob=<#f>]\n"
+            "\t[--cancel_prob=<#f>]\n"
+            "\t[--flush_prob=<#f>]\n"
+            "\t[--io_size=<#d>]\n"
+            "\t[--verify_write=<true|false>]\n"
+            "\t[--parallel=<#d>]\n"
+            "\t[--max_iov=<#d>]\n"
+            "\t[--compare_before=<true|false>]\n"
+            "\t[--compare_after=<true|false>]\n" "\n", progname);
+    exit (1);
+}
+
+/*
+ * Read from or write to the truth file at the given sector range.
+ *
+ * buf        - data buffer of at least nb_sectors * 512 bytes
+ * sector_num - first 512-byte sector
+ * nb_sectors - number of sectors to transfer
+ * do_read    - non-zero to read into buf, zero to write from buf
+ *
+ * Short transfers are continued and EINTR is retried. Any other error, or a
+ * transfer that makes no progress, terminates the process through die().
+ * Returns 0 on success.
+ */
+static int truth_io (void *buf, int64_t sector_num, int nb_sectors, int do_read)
+{
+    off_t offset = sector_num * 512;
+    size_t size = (size_t) nb_sectors * 512;
+
+    while (size > 0) {
+        ssize_t r;
+        if (do_read) {
+            r = pread (fd, buf, size, offset);
+        } else {
+            r = pwrite (fd, buf, size, offset);
+        }
+        if (r > 0) {
+            size -= r;
+            offset += r;
+            buf = (void *) (((char *) buf) + r);
+        } else if (r == 0) {
+            /* End of file (or no progress): looping again would spin
+             * forever, so treat it as an error. */
+            die ("Unexpected zero-length I/O on the truth file.\n");
+            return -1;
+        } else if (errno != EINTR) {
+            perror ("io");
+            die ("I/O error on the truth file.\n");
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Compare two buffers sector by sector.
+ *
+ * truth_buf, test_buf - buffers of nb_sectors * 512 bytes each
+ * sector_num          - sector number of the first sector, used in messages
+ * nb_sectors          - number of sectors to compare
+ *
+ * On the first differing sector, the sector is reported, every byte pair is
+ * logged through QDEBUG (differing bytes are marked with "***"), the process
+ * waits for a character on stdin so a debugger can be attached, and -1 is
+ * returned. Returns 0 if all sectors match.
+ */
+static int verify (uint8_t * truth_buf, uint8_t * test_buf,
+                   int64_t sector_num, int nb_sectors)
+{
+    int i;
+    for (i = 0; i < nb_sectors; i++) {
+        /* Widen before multiplying so the byte offset is computed in 64 bits. */
+        int64_t offset = (int64_t) i * 512;
+        if (memcmp (&truth_buf[offset], &test_buf[offset], 512) != 0) {
+            int j;
+            /* int64_t needs PRId64; "%lld" mismatches where it is 'long'. */
+            printf ("Sector %" PRId64 " differs\n", sector_num + i);
+            QDEBUG ("Sector %" PRId64 " differs\n", sector_num + i);
+            for (j = 0; j < 512; j++) {
+                if (truth_buf[offset + j] == test_buf[offset + j]) {
+                    QDEBUG ("%02d: %02X  %02X\n", j, truth_buf[offset + j],
+                            test_buf[offset + j]);
+                } else {
+                    QDEBUG ("%02d: %02X  %02X   ***\n", j,
+                            truth_buf[offset + j], test_buf[offset + j]);
+                }
+            }
+
+            fprintf (stderr, "Pause process %d for debugging...\n", getpid ());
+            fgetc (stdin);
+
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Completion callback of one chunk read issued by the full-image comparison.
+ *
+ * opaque - the CompareFullCB describing the chunk
+ * ret    - result of the read; non-zero means it failed
+ *
+ * A failed read is resubmitted unchanged. A successful read is compared with
+ * the same range of the truth file, after which the next chunk is read or,
+ * once total_sectors is reached, the comparison state is released.
+ */
+static void compare_full_images_cb (void *opaque, int ret)
+{
+    CompareFullCB *cf = opaque;
+
+    if (ret) {
+        /* Failed. Retry the operation. A refused resubmission would stall
+         * the comparison silently, so check it like every other submit. */
+        if (!bdrv_aio_readv (bs, cf->sector_num, &cf->qiov, cf->nb_sectors,
+                             compare_full_images_cb, cf)) {
+            die ("bdrv_aio_readv\n");
+        }
+        return;
+    }
+
+    truth_io (cf->truth_buf, cf->sector_num, cf->nb_sectors, TRUE);
+    verify (cf->truth_buf, cf->iov.iov_base, cf->sector_num, cf->nb_sectors);
+
+    cf->sector_num += cf->nb_sectors;
+    if (cf->sector_num >= total_sectors) {
+        /* Finished. */
+        free (cf->truth_buf);
+        qemu_vfree (cf->iov.iov_base);
+        qemu_free (cf);
+        return;
+    }
+
+    /* Read more data to compare. The last chunk may be shorter. */
+    if (cf->sector_num + cf->max_nb_sectors > total_sectors) {
+        cf->nb_sectors = total_sectors - cf->sector_num;
+    } else {
+        cf->nb_sectors = cf->max_nb_sectors;
+    }
+    cf->iov.iov_len = cf->nb_sectors * 512;
+    qemu_iovec_init_external (&cf->qiov, &cf->iov, 1);
+    if (!bdrv_aio_readv (bs, cf->sector_num, &cf->qiov,
+                         cf->nb_sectors, compare_full_images_cb, cf)) {
+        die ("bdrv_aio_readv\n");
+    }
+}
+
+/*
+ * Compare the whole test image with the truth file in chunks of at most
+ * 1 MiB. The first read is submitted here; compare_full_images_cb() chains
+ * the rest while sim_all_tasks() runs. For "fvd" images copy-on-read is
+ * switched off during the scan and restored afterwards.
+ *
+ * Always returns 0; mismatches are reported from verify().
+ */
+static int compare_full_images (void)
+{
+    const int chunk_sectors = 1048576L / 512;
+    int saved_copy_on_read = FALSE;
+    CompareFullCB *cf;
+
+    printf ("Performing a full comparison of the truth image and "
+            "the test image...\n");
+
+    if (strncmp (bs->drv->format_name, "fvd", 3) == 0) {
+        /* Disable copy-on-read when scanning through the entire image. */
+        saved_copy_on_read = fvd_get_copy_on_read (bs);
+        fvd_set_copy_on_read (bs, FALSE);
+    }
+
+    /* Set up the state shared by all chunk reads. */
+    cf = qemu_malloc (sizeof (CompareFullCB));
+    cf->sector_num = 0;
+    cf->max_nb_sectors = chunk_sectors;
+    cf->nb_sectors = MIN (cf->max_nb_sectors, total_sectors);
+    if (posix_memalign ((void **) &cf->truth_buf, 512,
+                        cf->max_nb_sectors * 512) != 0) {
+        die ("posix_memalign");
+    }
+    cf->iov.iov_base = qemu_blockalign (bs, cf->max_nb_sectors * 512);
+    cf->iov.iov_len = cf->nb_sectors * 512;
+    qemu_iovec_init_external (&cf->qiov, &cf->iov, 1);
+
+    /* Kick off the first chunk, then let the simulator drive the rest. */
+    if (bdrv_aio_readv (bs, cf->sector_num, &cf->qiov, cf->nb_sectors,
+                        compare_full_images_cb, cf) == NULL) {
+        die ("bdrv_aio_readv\n");
+    }
+
+    sim_all_tasks ();
+
+    if (strncmp (bs->drv->format_name, "fvd", 3) == 0) {
+        fvd_set_copy_on_read (bs, saved_copy_on_read);
+    }
+
+    return 0;
+}
+
+/* Build a non-negative 64-bit pseudo-random number from two consecutive
+ * random() results: the first one shifted left by 32 bits, OR-ed with the
+ * second one. */
+static inline int64_t rand64 (void)
+{
+    const int64_t hi = random ();
+    const int64_t lo = random ();
+    const int64_t v = (hi << 32) | lo;
+
+    if (v < 0) {
+        return -v;
+    }
+    return v;
+}
+
+/*
+ * Tell whether the request prepared in 'r' overlaps the sector range of any
+ * other tester. Flushing testers are skipped, and two reads never conflict.
+ * Returns 1 on conflict and 0 otherwise.
+ */
+static int check_conflict (RandomIO * r)
+{
+    const int64_t r_begin = r->sector_num;
+    const int64_t r_end = r->sector_num + r->nb_sectors;
+    int idx;
+
+    for (idx = 0; idx < parallel; idx++) {
+        RandomIO *other = &testers[idx];
+        int64_t o_begin, o_end;
+
+        if (other == r || other->type == OP_FLUSH) {
+            continue;
+        }
+        if (r->type == OP_READ && other->type == OP_READ) {
+            continue;
+        }
+
+        o_begin = other->sector_num;
+        o_end = other->sector_num + other->nb_sectors;
+
+        /* The other range starts inside r's range. */
+        if (r_begin <= o_begin && o_begin < r_end) {
+            return 1;
+        }
+        /* r's range starts inside the other range. */
+        if (o_begin <= r_begin && r_begin < o_end) {
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+/* Return FALSE if the submitted request is cancelled. */
+/*
+ * Submit the request described by 'r' to the test image.
+ *
+ * With probability fail_prob the simulator is told to fail the disk I/O with
+ * -EIO; the return code is reset to 0 right after the submission. If the
+ * request allows it, it is then cancelled with probability cancel_prob.
+ * A refused submission terminates the process through die().
+ *
+ * Return FALSE if the submitted request is cancelled, TRUE otherwise.
+ */
+static int submit_rand_io (RandomIO * r)
+{
+    BlockDriverAIOCB *acb = NULL;
+
+    QDEBUG ("TESTER %03d:  %s  test%" PRIX64 " sector_num=%" PRId64
+            " nb_sectors=%d niov=%d\n", r->tester, op_type_str[r->type],
+            r->uuid, r->sector_num, r->nb_sectors, r->qiov.niov);
+    printf ("TESTER %03d:  %s  sector_num=%" PRId64 " nb_sectors=%d niov=%d\n",
+            r->tester, op_type_str[r->type], r->sector_num, r->nb_sectors,
+            r->qiov.niov);
+
+    /* Decide whether this request is going to fail. random() is only
+     * consulted when failure injection is enabled. */
+    int ret;
+    if (fail_prob <= 0) {
+        ret = 0;
+    } else if (random () / (double) RAND_MAX <= fail_prob) {
+        ret = -EIO;
+    } else {
+        ret = 0;
+    }
+
+    /* This affects whether this request will fail or not. */
+    sim_set_disk_io_return_code (ret);
+
+    switch (r->type) {
+    case OP_READ:
+        if (!(acb = bdrv_aio_readv (bs, r->sector_num, &r->qiov, r->nb_sectors,
+                             rand_io_cb, r))) {
+            die ("bdrv_aio_readv\n");
+        }
+        break;
+    case OP_WRITE:
+        if (!(acb = bdrv_aio_writev (bs, r->sector_num, &r->qiov, r->nb_sectors,
+                              rand_io_cb, r))) {
+            die ("bdrv_aio_writev\n");
+        }
+        break;
+    case OP_FLUSH:
+        if (!(acb = bdrv_aio_flush (bs, rand_io_cb, r))) {
+            die ("bdrv_aio_flush\n");
+        }
+        break;
+    case OP_NULL:
+        die ("OP_NULL");
+        break;
+    }
+
+    sim_set_disk_io_return_code (0);        /* Reset to no failure state. */
+
+    /* Randomly cancel the request that was just submitted. */
+    if (r->allow_cancel && cancel_prob > 0 &&
+                random () / (double) RAND_MAX <= cancel_prob) {
+        QDEBUG ("TESTER %03d:  cancel %s test%" PRIX64 " sector_num=%" PRId64
+                " nb_sectors=%d niov=%d\n", r->tester, op_type_str[r->type],
+                r->uuid, r->sector_num, r->nb_sectors, r->qiov.niov);
+        printf ("TESTER %03d:  cancel %s sector_num=%" PRId64
+                " nb_sectors=%d niov=%d\n", r->tester, op_type_str[r->type],
+                r->sector_num, r->nb_sectors, r->qiov.niov);
+        bdrv_aio_cancel (acb);
+        return FALSE;
+    } else {
+        return TRUE;
+    }
+}
+
+/*
+ * Fill in 'r' with a random READ or WRITE request:
+ *   1. pick the type;
+ *   2. pick a sector range, either anywhere in the image or close to the
+ *      range of another tester, until it conflicts with no other tester;
+ *   3. for a WRITE, generate the data in test_buf;
+ *   4. split test_buf into a random number of iovec elements (at most
+ *      max_iov).
+ * NOTE(review): step 4 writes up to max_iov entries into r->qiov.iov, so
+ * max_iov must be at least 1 and match the allocated array size.
+ */
+static void prepare_read_write (RandomIO * r)
+{
+    /* Do a READ or WRITE? */
+    if (random () % 2) {
+        r->type = OP_READ;
+    } else {
+        r->type = OP_WRITE;
+    }
+
+    /* Find the next region to perform io. */
+    do {
+        if (parallel <= 1 || (random () % 2 == 0)) {
+            /* Perform a random I/O. */
+            r->sector_num = rand64 () % total_sectors;
+        } else {
+            /* Perform an I/O next to a currently ongoing I/O. */
+            int id;
+            do {
+                id = random () % parallel;
+            } while (id == r->tester);
+
+            /* Start within 2 * io_size sectors of the other tester's start,
+             * clamped to the image. */
+            RandomIO *p = &testers[id];
+            r->sector_num =
+                p->sector_num + 2 * io_size - rand64 () % (4 * io_size);
+            if (r->sector_num < 0) {
+                r->sector_num = 0;
+            } else if (r->sector_num >= total_sectors) {
+                r->sector_num = total_sectors - 1;
+            }
+        }
+
+        /* Between 1 and io_size sectors, truncated at the end of the image. */
+        r->nb_sectors = 1 + rand64 () % io_size;
+        if (r->sector_num + r->nb_sectors > total_sectors) {
+            r->nb_sectors = total_sectors - r->sector_num;
+        }
+    } while (check_conflict (r));
+
+    if (r->type == OP_WRITE) {
+        /* Fill test_buf with random data. */
+        int i, j;
+        for (i = 0; i < r->nb_sectors; i++) {
+            const uint64_t TEST_MAGIC = 0x0123456789ABCDEFULL;
+            /* The first 8 bytes of the sector store the uuid of this
+             * request. The next 8 bytes store a magic number. Both are
+             * converted with cpu_to_be64s(). This info helps debugging. */
+            uint64_t *p = (uint64_t *) & r->test_buf[i * 512];
+            *p = r->uuid;
+            cpu_to_be64s (p);
+            p++;
+            *p = TEST_MAGIC;
+            cpu_to_be64s (p);
+
+            /* The rest of the sector are filled with random data. */
+            uint32_t *q = (uint32_t *) (p + 1);
+            int n = (512 - 2 * sizeof (uint64_t)) / sizeof (uint32_t);
+            for (j = 0; j < n; j++) {
+                *q++ = random ();
+            }
+        }
+    }
+
+    /* Determine the number of iov. */
+    int niov = 0;
+    uint8_t *p = r->test_buf;
+    int left = r->nb_sectors;
+    do {
+        if (niov == max_iov - 1) {
+            /* Last available element: it takes all remaining sectors. */
+            r->qiov.iov[niov].iov_len = left * 512;
+            r->qiov.iov[niov].iov_base = p;
+            niov++;
+            break;
+        }
+
+        /* Otherwise give this element between 1 and 'left' sectors. */
+        int nb = 1 + random () % left;
+        r->qiov.iov[niov].iov_len = nb * 512;
+        r->qiov.iov[niov].iov_base = p;
+        p += r->qiov.iov[niov].iov_len;
+        left -= nb;
+        niov++;
+    } while (left > 0);
+
+    qemu_iovec_init_external (&r->qiov, r->qiov.iov, niov);
+}
+
+/*
+ * Start the next request on tester 'r', unless the configured number of
+ * rounds has already been started. A request that gets cancelled right after
+ * its submission is replaced by a freshly generated one (with a new uuid)
+ * until one stays in flight.
+ */
+static void perform_next_io (RandomIO * r)
+{
+    if (finished_round >= round) {
+        return;
+    }
+
+    finished_round++;
+    r->allow_cancel = TRUE;
+
+    for (;;) {
+        r->uuid = test_uuid++;
+
+        /* Either a flush, with probability flush_prob, or a read/write. */
+        if (flush_prob > 0 && random () / (double) RAND_MAX < flush_prob) {
+            r->type = OP_FLUSH;
+        } else {
+            prepare_read_write (r);
+        }
+
+        if (submit_rand_io (r)) {
+            break;
+        }
+    }
+}
+
+/*
+ * Completion callback of a request submitted by submit_rand_io().
+ *
+ * opaque - the RandomIO tester that owns the request
+ * ret    - result of the request; non-zero means it failed
+ *
+ * A failure while failure injection is enabled resubmits the same request.
+ * A failure with fail_prob <= 0 is unexpected: it is reported, the process
+ * waits on stdin, and processing then continues as for a success.
+ * On success, a READ is verified against the truth file, a WRITE is applied
+ * to the truth file (and optionally read back for verification), and the
+ * tester moves on to its next request.
+ */
+static void rand_io_cb (void *opaque, int ret)
+{
+    RandomIO *r = opaque;
+
+    if (ret) {
+        if (fail_prob <= 0) {
+            fprintf (stderr, "Request %s sector_num=%" PRId64
+                     " nb_sectors=%d failed while fail_prob=0. "
+                     "Pause for debugging...\n",
+                     op_type_str[r->type], r->sector_num, r->nb_sectors);
+            fgetc (stdin);
+        } else {
+            /* Failed. Retry the operation. */
+            QDEBUG ("TESTER %03d:  retry %s  test%" PRIX64 " sector_num=%"
+                    PRId64 " nb_sectors=%d niov=%d\n",
+                    r->tester, op_type_str[r->type], r->uuid,
+                    r->sector_num, r->nb_sectors, r->qiov.niov);
+            /* If the retry itself got cancelled, give up on this request
+             * and start a new one. */
+            if (!submit_rand_io (r)) {
+                perform_next_io (r);
+            }
+            return;
+        }
+    } else {
+        QDEBUG ("TESTER %03d:  finished %s  test%" PRIX64 " sector_num=%"PRId64
+                " nb_sectors=%d niov=%d\n", r->tester, op_type_str[r->type],
+                r->uuid, r->sector_num, r->nb_sectors, r->qiov.niov);
+    }
+
+    switch (r->type) {
+    case OP_FLUSH:
+        perform_next_io (r);
+        return;
+
+    case OP_READ:
+        /* Fetch the expected data and compare it with what was read. */
+        truth_io (r->truth_buf, r->sector_num, r->nb_sectors, TRUE);
+        verify (r->truth_buf, r->test_buf, r->sector_num, r->nb_sectors);
+        perform_next_io (r);
+        return;
+
+    case OP_WRITE:
+        /* Mirror the completed write into the truth file. */
+        truth_io (r->test_buf, r->sector_num, r->nb_sectors, FALSE);
+        if (verify_write) {
+            /* Perform a read for the same data. */
+            r->type = OP_READ;
+
+            /* To verify the write, this read cannot be cancelled. */
+            r->allow_cancel = FALSE;
+            /* Read back into one contiguous element covering the request. */
+            r->qiov.niov = 1;
+            r->qiov.iov[0].iov_len = r->qiov.size;
+            memset (r->test_buf, 0xA5, r->qiov.size); /* Fill in garbage. */
+            submit_rand_io (r);
+        } else {
+            perform_next_io (r);
+        }
+        return;
+
+    case OP_NULL:
+        die ("OP_NULL");
+        return;
+    }
+}
+
+/*
+ * Parse a boolean command line value.
+ *
+ * arg - the string to parse; must be exactly "true" or "false"
+ *
+ * Returns TRUE or FALSE. Any other string prints a message and ends the
+ * program through usage().
+ */
+static int read_bool (const char *arg)
+{
+    int val = TRUE;
+    /* Examine the argument that was passed in, not the getopt global. */
+    if (strcmp (arg, "true") == 0) {
+        val = TRUE;
+    } else if (strcmp (arg, "false") == 0) {
+        val = FALSE;
+    } else {
+        printf ("%s is neither 'true' nor 'false'\n", arg);
+        usage ();
+    }
+
+    return val;
+}
+
+
+/*
+ * Run one complete test.
+ *
+ * truth_file     - path of the raw file holding the expected content
+ * test_file      - name of the image to test, as passed to bdrv_open()
+ * format         - name of the block driver to use, or NULL to pass no
+ *                  driver to bdrv_open()
+ * compare_before - compare both images in full before the random I/Os
+ * compare_after  - compare both images in full after the random I/Os
+ *
+ * The function opens both images, derives total_sectors and the request
+ * size limit, starts 'parallel' testers, lets the simulator run until no
+ * task is left, and finally releases everything. Fatal conditions end the
+ * process through die().
+ */
+static void perform_test(const char *truth_file, const char *test_file,
+                         const char *format, int compare_before,
+                         int compare_after)
+{
+    int flags, i;
+
+    bs = bdrv_new ("hda");
+    if (!bs) {
+        die ("bdrv_new failed\n");
+    }
+
+    BlockDriver *drv = NULL;
+    if (format) {
+        drv = bdrv_find_format (format);
+        if (!drv) {
+            die ("Found no driver for format '%s'.\n", format);
+        }
+    }
+
+    flags = BDRV_O_RDWR | BDRV_O_CACHE_WB;
+
+    if (bdrv_open (bs, test_file, flags, drv) < 0) {
+        die ("Failed to open '%s'\n", test_file);
+    }
+
+    fd = open (truth_file, O_RDWR | O_LARGEFILE, 0);
+    if (fd < 0) {
+        perror ("open");
+        die ("Failed to open '%s'\n", truth_file);
+    }
+
+    /* The truth file must be at least as long as the test image. */
+    int64_t l0 = lseek (fd, 0, SEEK_END);
+    int64_t l1 = bdrv_getlength (bs);
+    if (l0 < 0 || l1 < 0 || l0 < l1) {
+        die ("Mismatch: truth image %s length %" PRId64 ", test image %s "
+             "length %" PRId64 "\n", truth_file, l0, test_file, l1);
+    }
+
+    total_sectors = l1 / 512;
+    if (total_sectors <= 1) {
+        die ("Total sectors: %" PRId64 "\n", total_sectors);
+    }
+
+    /* Convert io_size from bytes to sectors and keep it between one sector
+     * and half of the image. */
+    io_size /= 512;
+    if (io_size <= 0) {
+        io_size = 1;
+    } else if (io_size > total_sectors / 2) {
+        io_size = total_sectors / 2;
+    }
+
+    if (compare_before) {
+        if (compare_full_images ()) {
+            die ("The original two files do not match.\n");
+        }
+    }
+
+    if (round > 0) {
+        /* Create testers. */
+        testers = qemu_malloc (sizeof (RandomIO) * parallel);
+        for (i = 0; i < parallel; i++) {
+            RandomIO *r = &testers[i];
+            r->test_buf = qemu_blockalign (bs, io_size * 512);
+            if (posix_memalign ((void **) &r->truth_buf, 512, io_size * 512)) {
+                die ("posix_memalign");
+            }
+            r->qiov.iov = qemu_malloc (sizeof (struct iovec) * max_iov);
+            r->sector_num = 0;
+            r->nb_sectors = 0;
+            r->type = OP_READ;
+            r->tester = i;
+        }
+        for (i = 0; i < parallel; i++) {
+            perform_next_io (&testers[i]);
+        }
+    }
+
+    sim_all_tasks ();        /* Run tests. */
+
+    if (round > 0) {
+        /* Check the final result, then destroy the testers. */
+        if (compare_after) {
+            if (compare_full_images ()) {
+                die ("The two files do not match after I/O operations.\n");
+            }
+        }
+
+        for (i = 0; i < parallel; i++) {
+            RandomIO *r = &testers[i];
+            qemu_vfree (r->test_buf);
+            free (r->truth_buf);
+            qemu_free (r->qiov.iov);
+        }
+        qemu_free (testers);
+    }
+
+    printf ("Test process %d finished successfully\n", getpid ());
+
+    /* Remember the format before the image is deleted. */
+    int fvd = (strncmp (bs->drv->format_name, "fvd", 3) == 0);
+    bdrv_delete (bs);
+    if (fvd) {
+        fvd_check_memory_usage ();
+    }
+    close (fd);
+}
+
+/*
+ * Entry point: parse the command line, seed the random number generator,
+ * enable the block device simulator, and run the test.
+ *
+ * --truth and --test are mandatory. Out-of-range values of --parallel and
+ * --max_iov are raised to 1. Returns 0 on success; invalid usage ends the
+ * program through usage().
+ */
+int main (int argc, char **argv)
+{
+    int c;
+    const char *truth_file = NULL;
+    const char *test_file = NULL;
+    const char *format = NULL;
+    int compare_before = FALSE;
+    int compare_after = TRUE;
+    unsigned int seed = 0;
+
+    const struct option lopt[] = {
+        {"help", 0, 0, 'h'},
+        {"seed", 1, 0, 'd'},
+        {"truth", 1, 0, 'b'},
+        {"test", 1, 0, 't'},
+        {"format", 1, 0, 'f'},
+        {"rand_time", 1, 0, 'n'},
+        {"fail_prob", 1, 0, 'u'},
+        {"cancel_prob", 1, 0, 'c'},
+        {"flush_prob", 1, 0, 'w'},
+        {"round", 1, 0, 'r'},
+        {"parallel", 1, 0, 'p'},
+        {"compare_before", 1, 0, 'm'},
+        {"verify_write", 1, 0, 'v'},
+        {"compare_after", 1, 0, 'a'},
+        {"max_iov", 1, 0, 'i'},
+        {"io_size", 1, 0, 's'},
+        {"instant_qemubh", 1, 0, 'q'},
+        {NULL, 0, NULL, 0}
+    };
+
+    progname = basename (argv[0]);
+
+    /* Every long option has a short form; "n:" and "w:" are listed so that
+     * -n and -w work like the others. */
+    while ((c = getopt_long (argc, argv, "hc:u:p:q:i:f:d:b:t:r:m:v:a:s:n:w:",
+                             lopt, NULL)) != -1) {
+        switch (c) {
+        case 'h':
+            usage ();
+            return 0;
+
+        case 'q':
+            instant_qemubh = read_bool (optarg);
+            break;
+
+        case 'w':
+            flush_prob = atof (optarg);
+            break;
+
+        case 'c':
+            cancel_prob = atof (optarg);
+            break;
+
+        case 'u':
+            fail_prob = atof (optarg);
+            break;
+
+        case 'n':
+            rand_time = atoll (optarg);
+            break;
+
+        case 'i':
+            max_iov = atoi (optarg);
+            break;
+
+        case 'p':
+            parallel = atoi (optarg);
+            break;
+
+        case 'v':
+            verify_write = read_bool (optarg);
+            break;
+
+        case 'm':
+            compare_before = read_bool (optarg);
+            break;
+
+        case 'a':
+            compare_after = read_bool (optarg);
+            break;
+
+        case 'd':
+            /* srandom() takes an unsigned int; reduce explicitly instead of
+             * narrowing a long long into a signed int. */
+            seed = (unsigned int) atoll (optarg);
+            break;
+
+        case 'f':
+            format = optarg;
+            break;
+
+        case 'b':
+            truth_file = optarg;
+            break;
+
+        case 't':
+            test_file = optarg;
+            break;
+
+        case 's':
+            io_size = atoll (optarg);
+            break;
+
+        case 'r':
+            round = atoll (optarg);
+            break;
+
+        default:
+            usage ();
+            return 1;
+        }
+    }
+
+    if (!truth_file || !test_file) {
+        usage ();
+        return 1;
+    }
+
+    if (parallel <= 0) {
+        parallel = 1;
+    }
+    /* Each tester's iovec array has max_iov entries and at least one of them
+     * is always filled in, so a smaller value would overrun the array. */
+    if (max_iov <= 0) {
+        max_iov = 1;
+    }
+    srandom (seed);
+    rt_clock = (QEMUClock *) - 1; /* Convince FVD this is not in a qemu-tool. */
+    enable_block_sim (FALSE /*no print */ , rand_time);
+    fvd_enable_host_crash_test ();
+    bdrv_init ();
+    perform_test (truth_file, test_file, format, compare_before, compare_after);
+    return 0;
+}
diff --git a/qemu-tool-time.c b/qemu-tool-time.c
new file mode 100644
index 0000000..4aa2466
--- /dev/null
+++ b/qemu-tool-time.c
@@ -0,0 +1,88 @@ 
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this module implements the qemu-tool functions that
+ *  are related to time. In the simulation mode (see block/blksim.c), these
+ *  functions are implemented differently in qemu-test.c because they have to
+ *  work with the simulation engine block/blksim.c
+ *============================================================================*/
+
+#include "qemu-timer.h"
+#include "sysemu.h"
+
+/* A bottom half in qemu-tool: just the callback and its argument. */
+struct QEMUBH {
+    QEMUBHFunc *cb;     /* Function to invoke. */
+    void *opaque;       /* Argument passed to cb. */
+};
+
+#if 1
+/* Return the current time of day in milliseconds (nanoseconds divided by
+ * 1000000); the 'clock' argument is ignored. */
+int64_t qemu_get_clock (QEMUClock * clock)
+{
+    qemu_timeval tv;
+    qemu_gettimeofday (&tv);
+    return (tv.tv_sec * 1000000000LL + (tv.tv_usec * 1000)) / 1000000;
+}
+#endif
+
+/* Allocate a bottom half that remembers cb and opaque. The caller releases
+ * it with qemu_bh_delete(). */
+QEMUBH *qemu_bh_new (QEMUBHFunc * cb, void *opaque)
+{
+    QEMUBH *bh;
+
+    bh = qemu_malloc (sizeof (*bh));
+    bh->cb = cb;
+    bh->opaque = opaque;
+
+    return bh;
+}
+
+/* Stub: nothing is ever queued (see qemu_bh_schedule), so always return 0. */
+int qemu_bh_poll (void)
+{
+    return 0;
+}
+
+/* "Scheduling" a bottom half in qemu-tool runs its callback immediately,
+ * in the caller's context. */
+void qemu_bh_schedule (QEMUBH * bh)
+{
+    bh->cb (bh->opaque);
+}
+
+/* No-op: qemu_bh_schedule() runs the callback at once, so nothing is ever
+ * pending. */
+void qemu_bh_cancel (QEMUBH * bh)
+{
+}
+
+/* Free a bottom half allocated by qemu_bh_new(). */
+void qemu_bh_delete (QEMUBH * bh)
+{
+    qemu_free (bh);
+}
+
+/* Timers are not supported in qemu-tool: report the call and exit(1). */
+void qemu_mod_timer (QEMUTimer * ts, int64_t expire_time)
+{
+    fprintf (stderr, "qemu_mod_timer() should not be invoked in qemu-tool\n");
+    exit (1);
+}
+
+/* Timers are not supported in qemu-tool: report the call and exit(1).
+ * The return statement is never reached. */
+QEMUTimer *qemu_new_timer (QEMUClock * clock, QEMUTimerCB * cb, void *opaque)
+{
+    fprintf (stderr, "qemu_new_timer() should not be invoked in qemu-tool\n");
+    exit (1);
+    return NULL;
+}
+
+/* Timers are not supported in qemu-tool: report the call and exit(1). */
+void qemu_free_timer (QEMUTimer * ts)
+{
+    fprintf (stderr, "qemu_free_timer() should not be invoked in qemu-tool\n");
+    exit (1);
+}
+
+/* Timers are not supported in qemu-tool: report the call and exit(1). */
+void qemu_del_timer (QEMUTimer * ts)
+{
+    fprintf (stderr, "qemu_del_timer() should not be invoked in qemu-tool\n");
+    exit (1);
+}
diff --git a/test-fvd.sh b/test-fvd.sh
new file mode 100755
index 0000000..adf4e1f
--- /dev/null
+++ b/test-fvd.sh
@@ -0,0 +1,120 @@ 
+#!/bin/bash
+#
+# Stress test for the FVD image format: repeatedly creates a truth image and
+# an FVD test image on a tmpfs and runs qemu-test against them with many
+# combinations of image options.
+
+# Mounting the tmpfs below requires root.
+if [ $USER != "root" ]; then
+    echo "This command must be run by root in order to mount tmpfs."
+    exit 1
+fi
+
+# The tools are expected in the current directory.
+QEMU_DIR=.
+QEMU_IMG=$QEMU_DIR/qemu-img
+QEMU_TEST=$QEMU_DIR/qemu-test
+
+if [ ! -e $QEMU_IMG ]; then
+    echo "$QEMU_IMG does not exist."
+    exit 1;
+fi
+
+if [ ! -e $QEMU_TEST ]; then
+    echo "$QEMU_TEST does not exist."
+    exit 1;
+fi
+
+# Locations of the test images and of the log of executed commands.
+DATA_DIR=/var/ramdisk
+TRUTH_IMG=$DATA_DIR/truth.raw
+TEST_IMG=$DATA_DIR/test.fvd
+TEST_BASE=$DATA_DIR/zero-500M.raw
+TEST_IMG_DATA=$DATA_DIR/test.dat
+CMD_LOG=/tmp/test-fvd.log
+
+# Mount a tmpfs at $DATA_DIR unless something is already mounted there.
+mount | grep $DATA_DIR > /dev/null
+if [ $? -ne 0 ]; then
+    echo "Create tmpfs at $DATA_DIR to store testing images."
+    if [ ! -e $DATA_DIR ]; then mkdir -p $DATA_DIR ; fi
+    mount -t tmpfs none $DATA_DIR -o size=4G
+    if [ $? -ne 0 ]; then exit 1; fi
+fi
+
+# Test parameters. G1 is 1 GiB and MAX_MEM is 512 MiB, both in bytes.
+G1=1073741824
+MAX_MEM=536870912
+MAX_ROUND=1000000
+MAX_IO_SIZE=100000000
+fail_prob=0.1
+cancel_prob=0.1
+flush_prob=0.01
+seed=$RANDOM$RANDOM
+count=0
+
+function invoke() {
+    echo "$*" >> $CMD_LOG
+    sync
+    $*
+    ret=$?
+    if [ $ret -ne 0 ]; then
+        echo "$Exit with error code $ret: $*"
+        exit $ret;
+    fi
+}
+
+# Start with an empty command log.
+/bin/rm -f $CMD_LOG
+touch $CMD_LOG
+
+# NOTE(review): "[ -t ]" has a single non-empty argument and therefore looks
+# like it is always true, i.e., the test repeats until a command fails or the
+# script is interrupted -- confirm this is intended.
+while [ -t ]; do
+    # Iterate over all combinations of the image creation options.
+    for compact_image in on off ; do
+    for prefetch_delay in 1 0; do
+    for copy_on_read in on off; do
+    for block_size in 7680 512 1024 15872 65536 65024 1048576 1048064; do
+    for chunk_mult in 5 1 2 3 7 9 12 16 33 99 ; do
+    for base_img in ""  "-b $TEST_BASE"; do
+        chunk_size=$[$block_size * $chunk_mult]
+        large_io_size=$[$chunk_size * 5]
+        if [ $large_io_size -gt $MAX_IO_SIZE ]; then large_io_size=$MAX_IO_SIZE; fi
+    for io_size in $large_io_size 1048576 ; do
+    for use_data_file in "" "data_file=$TEST_IMG_DATA," ; do
+
+        # FVD image is about 1G
+        img_size=$[(1073741824 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+
+        # base image is about 500MB
+        base_size=$[(536870912 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+
+        count=$[$count + 1]
+        echo "Round $count" >> $CMD_LOG
+
+        # Recreate the truth and base images as files of the chosen sizes
+        # (dd with count=0 and seek only sets the file length).
+        invoke "/bin/rm -rf $TRUTH_IMG $TEST_IMG $TEST_BASE $TEST_IMG_DATA"
+        invoke "dd if=/dev/zero of=$TRUTH_IMG count=0 bs=1 seek=$img_size"
+        invoke "dd if=/dev/zero of=$TEST_BASE count=0 bs=1 seek=$base_size"
+        if [ ! -z $use_data_file ]; then invoke "touch $TEST_IMG_DATA"; fi
+
+        # Journal size derived from the number of chunks one I/O can touch.
+        mixed_records_per_journal_sector=121
+        journal_size=$[(((($io_size / $chunk_size ) + 1 ) / $mixed_records_per_journal_sector ) + 1) * 512 * 100]
+
+        invoke "$QEMU_IMG create -f fvd $base_img -o${use_data_file}data_file_fmt=blksim,compact_image=$compact_image,copy_on_read=$copy_on_read,block_size=$block_size,chunk_size=$chunk_size,journal_size=$journal_size,prefetch_start_delay=$prefetch_delay $TEST_IMG $img_size"
+        # This update is run directly, so it is neither logged nor checked.
+        if [ $prefetch_delay -eq 1 ]; then $QEMU_IMG update $TEST_IMG prefetch_over_threshold_throttle_time=0; fi
+
+        # Bound io_size * parallel by MAX_MEM (at most 1000 testers), then
+        # pick a random number of testers below that bound.
+        mem=$[$io_size * 1000]
+        if [ $mem -gt $MAX_MEM ]; then
+            parallel=$[$MAX_MEM / $io_size]
+        else
+            parallel=1000
+        fi
+        parallel=$[${RANDOM}${RANDOM} % $parallel]
+
+        # Enough rounds to move about 10 GiB, capped at MAX_ROUND.
+        round=$[$G1 * 10 / $io_size]
+        if [ $round -gt $MAX_ROUND ]; then round=$MAX_ROUND; fi
+
+        b3=$[$round * 2 / 3]
+        [ $b3 -eq 0 ] && b3=1
+        # Run qemu-test three times on the same images; from the second run
+        # on, the images are compared in full before the random I/Os.
+        for rep in 0 1 2 ; do
+            if [ $rep -eq 0 ]; then
+                compare_before=false
+            else
+                compare_before=true
+            fi
+            r=$[${RANDOM}${RANDOM} % $b3]
+            seed=$[$seed + 1]
+            invoke "$QEMU_TEST --truth=$TRUTH_IMG --format=fvd --test="blksim:$TEST_IMG" --verify_write=true --parallel=$parallel --io_size=$io_size --fail_prob=$fail_prob --cancel_prob=$cancel_prob --flush_prob=$flush_prob --compare_after=true --round=$r --compare_before=$compare_before --seed=$seed"
+        done
+
+        /bin/rm -rf /tmp/fvd.log*
+done; done; done; done; done; done; done; done; done
diff --git a/test-qcow2.sh b/test-qcow2.sh
new file mode 100755
index 0000000..1b6a39b
--- /dev/null
+++ b/test-qcow2.sh
@@ -0,0 +1,75 @@ 
+#!/bin/bash
+#
+# Stress test for the QCOW2 image format: repeatedly creates a truth image
+# and a QCOW2 test image on a tmpfs and runs qemu-test against them with
+# different cluster sizes.
+
+# Mounting the tmpfs below requires root.
+if [ $USER != "root" ]; then
+    echo "This command must be run by root in order to mount tmpfs."
+    exit 1
+fi
+
+# The tools are expected in the current directory.
+QEMU_DIR=.
+QEMU_IMG=$QEMU_DIR/qemu-img
+QEMU_TEST=$QEMU_DIR/qemu-test
+
+if [ ! -e $QEMU_IMG ]; then
+    echo "$QEMU_IMG does not exist."
+    exit 1;
+fi
+
+if [ ! -e $QEMU_TEST ]; then
+    echo "$QEMU_TEST does not exist."
+    exit 1;
+fi
+
+# Locations of the test images and of the log of executed commands.
+DATA_DIR=/var/ramdisk
+TRUTH_IMG=$DATA_DIR/truth.raw
+TEST_IMG=$DATA_DIR/test.qcow2
+TEST_BASE=$DATA_DIR/zero-500M.raw
+CMD_LOG=/tmp/test-qcow2.log
+
+# Mount a tmpfs at $DATA_DIR unless something is already mounted there.
+mount | grep $DATA_DIR > /dev/null
+if [ $? -ne 0 ]; then
+    echo "Create tmpfs at $DATA_DIR to store testing images."
+    if [ ! -e $DATA_DIR ]; then mkdir -p $DATA_DIR ; fi
+    mount -t tmpfs none $DATA_DIR -o size=4G
+    if [ $? -ne 0 ]; then exit 1; fi
+fi
+
+# Test parameters passed to qemu-test; failures and cancellations are off.
+parallel=100
+round=100000
+fail_prob=0
+cancel_prob=0
+instant_qemubh=true
+seed=$RANDOM$RANDOM
+count=0
+
+# Log a command line to $CMD_LOG and run it. A failure is reported but does
+# not stop the script.
+function invoke() {
+    echo "$*" >> $CMD_LOG
+    $*
+    # Capture the status right away: inside the "if" below, $? would be the
+    # status of the "[" test itself.
+    local ret=$?
+    if [ $ret -ne 0 ]; then
+        echo "Exit with error code $ret: $*"
+    fi
+}
+
+# Start with an empty command log.
+/bin/rm -f $CMD_LOG
+touch $CMD_LOG
+
+# NOTE(review): "[ -t ]" has a single non-empty argument and therefore looks
+# like it is always true, i.e., the test repeats until it is interrupted --
+# confirm this is intended.
+while [ -t ]; do
+for cluster_size in 65536 7680 512 1024 15872 65024 1048576 1048064; do
+for io_size in 10485760 ; do
+    count=$[$count + 1]
+    echo "Round $count" >> $CMD_LOG
+
+    # QCOW2 image is about 1G
+    img_size=$[(1073741824 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+
+    # base image is about 500MB
+    base_size=$[(536870912 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+
+    # Recreate the truth and base images as files of the chosen sizes
+    # (dd with count=0 and seek only sets the file length), then create the
+    # QCOW2 image on top of the base image.
+    invoke "/bin/rm -rf $TRUTH_IMG $TEST_IMG $TEST_BASE"
+    invoke "dd if=/dev/zero of=$TRUTH_IMG count=0 bs=1 seek=$img_size"
+    invoke "dd if=/dev/zero of=$TEST_BASE count=0 bs=1 seek=$base_size"
+    invoke "$QEMU_IMG create -f qcow2 -ocluster_size=$cluster_size -b $TEST_BASE $TEST_IMG $img_size"
+
+    invoke "$QEMU_TEST --seed=$seed --truth=$TRUTH_IMG --format=qcow2 --test="blksim:$TEST_IMG" --verify_write=true --compare_before=false --compare_after=true --round=$round --parallel=$parallel --io_size=$io_size --fail_prob=$fail_prob --cancel_prob=$cancel_prob --instant_qemubh=$instant_qemubh"
+
+    # Use a different seed for the next run.
+    seed=$[$seed + 1]
+done; done; done