Patchwork [01/26] FVD: add simulated block driver 'blksim'

login
register
mail settings
Submitter Chunqiang Tang
Date Feb. 25, 2011, 10:37 p.m.
Message ID <1298673486-3573-1-git-send-email-ctang@us.ibm.com>
Download mbox | patch
Permalink /patch/84612/
State New
Headers show

Comments

Chunqiang Tang - Feb. 25, 2011, 10:37 p.m.
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.

This patch adds the 'blksim' block device driver, which is a tool to
facilitate testing and debugging. blksim operates on a RAW image, but it uses
neither AIO nor posix threads to perform actual I/Os.  blksim functions like an
event-driven disk simulator, and allows a block device driver developer to
fully control the order of disk I/Os, the order of callbacks, and the return
code of every I/O operation. The purpose is to extensively test a block device
driver under failures and race conditions.  Bugs found by blksim under rare
race conditions are guaranteed to be precisely reproducible.

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 Makefile.objs  |    1 +
 block/blksim.c |  757 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 block/blksim.h |   35 +++
 3 files changed, 793 insertions(+), 0 deletions(-)
 create mode 100644 block/blksim.c
 create mode 100644 block/blksim.h

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 9e98a66..264aab3 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -23,6 +23,7 @@  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-nested-y += blksim.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
diff --git a/block/blksim.c b/block/blksim.c
new file mode 100644
index 0000000..5c7ef43
--- /dev/null
+++ b/block/blksim.c
@@ -0,0 +1,757 @@ 
+/*
+ * QEMU Simulated Block Device to Facilitate Testing and Debugging
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "block_int.h"
+#include "osdep.h"
+#include "qemu-option.h"
+#include "qemu-timer.h"
+#include "block.h"
+#include "qemu-queue.h"
+#include "qemu-common.h"
+#include "block/blksim.h"
+
+#if 1  /* 1: QDEBUG() tracing is compiled out; change to 0 to trace via printf */
+# define QDEBUG(format,...) do {} while (0)
+#else
+# define QDEBUG printf
+#endif
+
+typedef enum
+{
+    SIM_NULL,               /* op of the list sentinel 'head' */
+    SIM_READ,               /* queued read; the file I/O happens when it runs */
+    SIM_WRITE,              /* queued write; the file I/O happens when it runs */
+    SIM_FLUSH,              /* queued flush */
+    SIM_READ_CALLBACK,      /* read already executed; completion callback pending */
+    SIM_WRITE_CALLBACK,     /* write already executed; completion callback pending */
+    SIM_FLUSH_CALLBACK,     /* flush already executed; completion callback pending */
+    SIM_TIMER               /* timer object created by blksim_new_timer() */
+} sim_op_t;
+
+static void sim_aio_cancel(BlockDriverAIOCB * acb);
+static int64_t sim_uuid = 0;          /* next uuid handed out by insert_in_list() */
+static int64_t current_time = 0;      /* simulated clock, advanced by run_task_by_acb() */
+static int64_t rand_time = 0;         /* > 0 enables random delays; set by init_blksim() */
+static int interactive_print = true;  /* print queued/executed operations on stdout */
+static int blksim_invoked = false;    /* becomes true once blksim_open() was called */
+static bool instant_qemubh = true;    /* consulted by blksim_bh_schedule() */
+struct SimAIOCB;
+
+/*
+ * Note: disk_io_return_code, blksim_set_disk_io_return_code(), and
+ * insert_task() work together to ensure that multiple subrequests triggered
+ * by the same outermost request either succeed together or fail together.
+ * This behavior is required by qemu-test.  Here is one example of the
+ * problems caused by departing from this behavior.  Consider a write request
+ * that generates two subrequests, w1 and w2. If w1 succeeds but w2 fails, the
+ * data will not be written into qemu-test's "truth image" but the part of the
+ * data handled by w1 will be written into qemu-test's "test image". As a
+ * result, their contents diverge and automated testing cannot continue.
+ */
+static int disk_io_return_code = 0;
+
+typedef struct BDRVSimState
+{
+    int fd;     /* descriptor of the image file, opened by blksim_open() */
+} BDRVSimState;
+
+typedef struct SimAIOCB
+{
+    BlockDriverAIOCB common;    /* generic part: bs, cb and opaque are used here */
+    int64_t uuid;               /* reassigned by insert_in_list() on every (re)queue */
+    sim_op_t op;                /* what running this task does */
+    int64_t sector_num;         /* first sector of a read/write request */
+    QEMUIOVector *qiov;         /* data vector of a read/write; NULL for a flush */
+    int nb_sectors;             /* length of a read/write request in sectors */
+    int ret;                    /* result passed to the completion callback */
+    int64_t time;               /* simulated time at which the task is due */
+    struct SimAIOCB *next;      /* task list link */
+    struct SimAIOCB *prev;      /* task list link; NULL means "not in the list" */
+
+} SimAIOCB;
+
+static AIOPool sim_aio_pool = {
+    .aiocb_size = sizeof(SimAIOCB),     /* requests and timers are both SimAIOCB objects */
+    .cancel = sim_aio_cancel,           /* cancel hook: unlinks and releases the task */
+};
+
+static SimAIOCB head = {                /* sentinel of the circular, doubly linked task list */
+    .uuid = -1,
+    .time = (int64_t) (9223372036854775807ULL),  /* largest int64_t value */
+    .op = SIM_NULL,
+    .next = &head,                      /* empty list: the sentinel points at itself */
+    .prev = &head,
+};
+
+/* Debug a specific task. Change "#if 0" to "#if 1" to get a CHECK_TASK()
+ * function that prints a message whenever the uuid of interest (19 in the
+ * code below) is seen; otherwise CHECK_TASK() expands to an empty statement. */
+#if 0
+static inline void CHECK_TASK(int64_t uuid)
+{
+    if (uuid == 19LL) {
+        printf("CHECK_TASK pause for task %" PRId64 "\n", uuid);
+    }
+}
+#else
+#  define CHECK_TASK(acb) do { } while (0)
+#endif
+
+/*
+ * Synchronously transfer 'nb_sectors' 512-byte sectors between 'buf' and the
+ * image file, starting at sector 'sector_num'.  A non-zero 'do_read' reads
+ * from the file into 'buf'; zero writes 'buf' to the file.
+ *
+ * do_io() should never fail. A failure indicates a bug in the upper layer
+ * block device driver, or failure in the real hardware.  Every error (failed
+ * seek, read beyond the end of the file, I/O error other than EINTR) is
+ * therefore reported on stderr and the process is aborted.
+ *
+ * Always returns 0.
+ */
+static int do_io(BlockDriverState * bs, int64_t sector_num, uint8_t * buf,
+                 int nb_sectors, int do_read)
+{
+    BDRVSimState *s = bs->opaque;
+    const size_t total = (size_t) nb_sectors * 512;
+    size_t size = total;
+    uint8_t *new_buf, *p;
+    ssize_t ret;
+
+    if (interactive_print) {
+        printf ("Do %s %s sector_num=%"PRId64" nb_sectors=%d\n",
+                do_read ? "READ" : "WRITE", bs->filename,
+                sector_num, nb_sectors);
+    }
+
+    /* Test the result of lseek() directly: it is a file offset, and storing
+     * it in an int would make valid offsets of 2 GiB and more look like
+     * errors.  A failed seek is fatal, because continuing would transfer
+     * data at whatever offset the descriptor currently has. */
+    if (lseek(s->fd, sector_num * 512, SEEK_SET) < 0) {
+        fprintf(stderr, "Error: lseek %s sector_num=%"PRId64"\n",
+                bs->filename, sector_num);
+        abort();
+    }
+
+    /* Buffer must be aligned for O_DIRECT. Use a bounce buffer if the
+     * caller's buffer is not. */
+    if ((bs->open_flags & BDRV_O_NOCACHE) &&
+        ((uintptr_t)buf & (unsigned)(bs->buffer_alignment - 1))) {
+        new_buf = p = qemu_blockalign(bs, total);
+        if (!do_read) {
+            memcpy (p, buf, total);
+        }
+    } else {
+        p = buf;
+        new_buf = NULL;
+    }
+
+    /* Short transfers and EINTR are retried until everything is done. */
+    while (size > 0) {
+        if (do_read) {
+            ret = read(s->fd, p, size);
+            if (ret == 0) {
+                fprintf(stderr, "Error: read beyond the end of %s: sector_num=%"
+                        PRId64" nb_sectors=%d\n",
+                        bs->filename, sector_num, nb_sectors);
+                abort();
+            }
+        } else {
+            ret = write(s->fd, p, size);
+        }
+
+        if (ret >= 0) {
+            size -= ret;
+            p += ret;
+        } else if (errno != EINTR) {
+            fprintf(stderr, "Error: %s %s sector_num=%"PRId64" nb_sectors=%d\n",
+                    do_read ? "READ" : "WRITE", bs->filename,
+                    sector_num, nb_sectors);
+            abort();
+        }
+    }
+
+    if (new_buf) {
+        if (do_read) {
+            /* Hand the data read into the bounce buffer back to the caller. */
+            memcpy (buf, new_buf, total);
+        }
+        qemu_vfree (new_buf);
+    }
+
+    return 0;
+}
+
+/* Synchronous read (.bdrv_read): forwards to do_io() in read mode. */
+static int blksim_read(BlockDriverState * bs, int64_t sector_num,
+                       uint8_t * buf, int nb_sectors)
+{
+    const int reading = true;
+
+    return do_io(bs, sector_num, buf, nb_sectors, reading);
+}
+
+/* Synchronous write (.bdrv_write): forwards to do_io() in write mode.
+ * do_io() takes a non-const buffer, hence the cast. */
+static int blksim_write(BlockDriverState * bs, int64_t sector_num,
+                        const uint8_t * buf, int nb_sectors)
+{
+    uint8_t *data = (uint8_t *) buf;
+    const int reading = false;
+
+    return do_io(bs, sector_num, data, nb_sectors, reading);
+}
+
+/*
+ * Give 'acb' a fresh uuid and link it into the task list.
+ *
+ * - rand_time <= 0 (working with qemu-io.c, no delay randomization): the
+ *   task's time is reset to 0 and it is appended at the tail.
+ * - rand_time > 0 and acb->time >= 0: a random delay is added to the task's
+ *   time to trigger rare race conditions, and the task is placed according to
+ *   the ascending time order of the list; ties are broken by a coin flip.
+ * - rand_time > 0 and acb->time < 0: the task goes to the very front.
+ */
+static void insert_in_list(SimAIOCB * acb)
+{
+    SimAIOCB *pos;      /* acb is linked in right before this entry */
+
+    acb->uuid = sim_uuid++;
+    CHECK_TASK(acb->uuid);
+
+    if (rand_time <= 0) {
+        acb->time = 0;
+        pos = &head;    /* before the sentinel == at the tail */
+    } else {
+        pos = head.next;
+
+        if (acb->time >= 0) {
+            acb->time += rand() % rand_time;
+
+            /* Walk forward until an entry that should run after acb. */
+            for (;; pos = pos->next) {
+                if (pos->time > acb->time ||
+                    (pos->time == acb->time && (rand() % 2 == 0))) {
+                    break;
+                }
+            }
+        }
+    }
+
+    acb->next = pos;
+    acb->prev = pos->prev;
+    pos->prev->next = acb;
+    pos->prev = acb;
+
+    QDEBUG("BLKSIM: insert task%" PRId64 " time=%" PRId64 "\n",
+           acb->uuid, acb->time);
+}
+
+/* Debug problems related to reusing task objects. With "#if 1" the my_*
+ * names are plain aliases of qemu_aio_get()/qemu_aio_release(). With "#if 0"
+ * they become wrappers that trace every get/release through QDEBUG and assert
+ * that an object handed out by the pool is not still linked in the task list.
+ * NOTE(review): ASSERT is not defined in this file -- confirm it is available
+ * before enabling the wrappers. */
+#if 1
+#  define my_qemu_aio_get qemu_aio_get
+#  define my_qemu_aio_release qemu_aio_release
+
+#else
+static SimAIOCB *search_task_list(SimAIOCB * acb)
+{
+    SimAIOCB *p;
+    for (p = head.next; p != &head; p = p->next) {
+        if (p == acb) {
+            return p;
+        }
+    }
+
+    return NULL;
+}
+
+static inline void *my_qemu_aio_get(AIOPool * pool, BlockDriverState * bs,
+                                    BlockDriverCompletionFunc * cb,
+                                    void *opaque)
+{
+    SimAIOCB *acb = (SimAIOCB *) qemu_aio_get(&sim_aio_pool, bs, cb, opaque);
+    QDEBUG("BLKSIM: qemu_aio_get reuse old task%" PRId64 "\n", acb->uuid);
+    ASSERT(!search_task_list(acb));
+    return acb;
+}
+
+static inline void my_qemu_aio_release(SimAIOCB * acb)
+{
+    QDEBUG("BLKSIM: qemu_aio_release task%" PRId64 "\n", acb->uuid);
+    qemu_aio_release(acb);
+}
+#endif
+
+/*
+ * Allocate a task for an asynchronous request and queue it.
+ *
+ * 'op' is SIM_READ, SIM_WRITE or SIM_FLUSH; 'sector_num', 'qiov' and
+ * 'nb_sectors' describe a read/write (a flush passes 0/NULL/0); 'cb' and
+ * 'opaque' are the completion callback and its argument.  The task records
+ * the current disk_io_return_code as the result it will later report.
+ *
+ * Returns the generic AIOCB of the new task, or NULL if none could be
+ * allocated.  An op other than the three above terminates the process when
+ * interactive printing is enabled.
+ */
+static BlockDriverAIOCB *insert_task(int op, BlockDriverState * bs,
+                                     int64_t sector_num, QEMUIOVector * qiov,
+                                     int nb_sectors,
+                                     BlockDriverCompletionFunc * cb,
+                                     void *opaque)
+{
+    SimAIOCB *acb = my_qemu_aio_get(&sim_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+
+    acb->op = op;
+    acb->sector_num = sector_num;
+    acb->qiov = qiov;
+    acb->nb_sectors = nb_sectors;
+    acb->ret = disk_io_return_code;
+    acb->time = current_time;
+    insert_in_list(acb);
+
+    if (interactive_print) {
+        switch (op) {
+        case SIM_READ:
+            printf("Queue READ uuid=%" PRId64 " filename=%s sector_num=%"
+                   PRId64 " nb_sectors=%d\n", acb->uuid,
+                   acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+            break;
+        case SIM_WRITE:
+            printf("Queue WRITE uuid=%" PRId64 " filename=%s sector_num=%"
+                   PRId64 " nb_sectors=%d\n", acb->uuid,
+                   acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+            break;
+        case SIM_FLUSH:
+            /* blksim_aio_flush() queues SIM_FLUSH through this function, so
+             * it must not be treated as an unknown op. */
+            printf("Queue FLUSH uuid=%" PRId64 " filename=%s\n",
+                   acb->uuid, acb->common.bs->filename);
+            break;
+        default:
+            fprintf(stderr, "Unknown op %d\n", op);
+            exit(1);
+        }
+    }
+
+    return &acb->common;
+}
+
+/*
+ * Requeue a request whose I/O step has just been executed so that its
+ * completion callback runs as a separate task: the task is stamped with the
+ * current simulated time, inserted into the list again, and its op is
+ * switched to the matching *_CALLBACK value.  Any op other than
+ * SIM_FLUSH/SIM_READ/SIM_WRITE terminates the process.
+ */
+static void insert_aio_callback(SimAIOCB * acb)
+{
+    acb->time = current_time;
+    insert_in_list(acb);
+
+    switch (acb->op) {
+    case SIM_FLUSH:
+        acb->op = SIM_FLUSH_CALLBACK;
+        if (interactive_print) {
+            printf("Queue FLUSH_CALLBACK uuid=%" PRId64 " filename=%s\n",
+                   acb->uuid, acb->common.bs->filename);
+        }
+        break;
+
+    case SIM_READ:
+        acb->op = SIM_READ_CALLBACK;
+        if (interactive_print) {
+            printf("Queue READ_CALLBACK uuid=%" PRId64
+                   " filename=%s sector_num=%" PRId64 " nb_sectors=%d\n",
+                   acb->uuid, acb->common.bs->filename, acb->sector_num,
+                   acb->nb_sectors);
+        }
+        break;
+
+    case SIM_WRITE:
+        acb->op = SIM_WRITE_CALLBACK;
+        if (interactive_print) {
+            printf("Queue WRITE_CALLBACK uuid=%" PRId64
+                   " filename=%s sector_num=%" PRId64 " nb_sectors=%d\n",
+                   acb->uuid, acb->common.bs->filename, acb->sector_num,
+                   acb->nb_sectors);
+        }
+        break;
+
+    default:
+        fprintf(stderr, "Wrong op %d\n", acb->op);
+        exit(1);
+    }
+}
+
+/*
+ * Print one line per queued task, in list order, on stdout.
+ *
+ * Reads and writes (and their pending callbacks) are shown with file, start
+ * sector and length; flushes with their file; timers with their due time
+ * (timers have no block device attached).  An op that should never be in the
+ * list terminates the process.
+ */
+void blksim_list_tasks(void)
+{
+    SimAIOCB *acb;
+
+    for (acb = head.next; acb != &head; acb = acb->next) {
+        const char *label;
+
+        switch (acb->op) {
+        case SIM_READ:
+            label = "READ";
+            break;
+        case SIM_WRITE:
+            label = "WRITE";
+            break;
+        case SIM_READ_CALLBACK:
+            label = "CALLBACK READ";
+            break;
+        case SIM_WRITE_CALLBACK:
+            label = "CALLBACK WRITE";
+            break;
+        case SIM_FLUSH:
+            printf("uuid=%" PRId64 "  %-14s file=%s\n", acb->uuid,
+                   "FLUSH", acb->common.bs->filename);
+            continue;
+        case SIM_FLUSH_CALLBACK:
+            printf("uuid=%" PRId64 "  %-14s file=%s\n", acb->uuid,
+                   "CALLBACK FLUSH", acb->common.bs->filename);
+            continue;
+        case SIM_TIMER:
+            /* Timers are created with a NULL bs; do not touch common.bs. */
+            printf("uuid=%" PRId64 "  %-14s time=%" PRId64 "\n", acb->uuid,
+                   "TIMER", acb->time);
+            continue;
+        default:
+            fprintf(stderr, "Wrong OP %d\n", acb->op);
+            exit(1);
+        }
+
+        /* sector_num is an int64_t, so it is printed with PRId64. */
+        printf("uuid=%" PRId64 "  %-14s file=%s  sector_num=%"
+               PRId64 "  nb_sectors=%d\n", acb->uuid, label,
+               acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+    }
+}
+
+/* Invoke the completion callback of 'acb' with its opaque argument and the
+ * result code stored in the task. */
+static inline void sim_callback(SimAIOCB * acb)
+{
+    void *arg = acb->common.opaque;
+    int result = acb->ret;
+
+    acb->common.cb(arg, result);
+}
+
+/* Return the simulator's current time. */
+int64_t blksim_get_time(void)
+{
+    const int64_t now = current_time;
+
+    return now;
+}
+
+/*
+ * Create a simulated timer that will call 'cb' with 'opaque' when it runs.
+ *
+ * The timer is a SimAIOCB taken from the AIO pool with no block device
+ * attached.  It is returned unqueued (prev == NULL); use blksim_mod_timer()
+ * to arm it.  Returns NULL if no object could be allocated, mirroring
+ * insert_task().
+ */
+void *blksim_new_timer(void *cb, void *opaque)
+{
+    SimAIOCB *acb = my_qemu_aio_get(&sim_aio_pool, NULL, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+
+    acb->op = SIM_TIMER;
+    /* Start with clean links: prev == NULL is the "not in the list" marker. */
+    acb->next = NULL;
+    acb->prev = NULL;
+    return acb;
+}
+
+/*
+ * (Re)arm timer 'ts' to expire at 'expire_time'.  If the timer is currently
+ * queued it is unlinked first; it is then inserted again through
+ * insert_in_list(), which also assigns it a new uuid.
+ */
+void blksim_mod_timer(void *ts, int64_t expire_time)
+{
+    SimAIOCB *timer = ts;
+    SimAIOCB *before = timer->prev;
+
+    if (before != NULL) {
+        /* Still queued: take it out of the list first. */
+        SimAIOCB *after = timer->next;
+
+        after->prev = before;
+        before->next = after;
+    }
+
+    timer->time = expire_time;
+    insert_in_list(timer);
+
+    if (!interactive_print) {
+        return;
+    }
+
+    printf("Queue TIMER uuid=%" PRId64 " expire_time=%" PRId64
+           " current_time=%" PRId64 "\n",
+           timer->uuid, expire_time, current_time);
+}
+
+/*
+ * Destroy timer 'ts' and give its object back to the AIO pool.
+ *
+ * A timer that is still queued is unlinked first; otherwise the task list
+ * would keep pointing at an object that the pool may hand out again.
+ */
+void blksim_free_timer(void *ts)
+{
+    SimAIOCB *acb = ts;
+
+    CHECK_TASK(acb->uuid);
+    if (acb->prev) {
+        acb->next->prev = acb->prev;
+        acb->prev->next = acb->next;
+        acb->prev = NULL;
+    }
+    my_qemu_aio_release(acb);
+}
+
+/* Disarm timer 'ts': unlink it from the task list if it is queued.  The
+ * timer object stays valid and can be armed again. */
+void blksim_del_timer(void *ts)
+{
+    SimAIOCB *timer = ts;
+
+    CHECK_TASK(timer->uuid);
+
+    if (timer->prev == NULL) {
+        /* Not queued, nothing to do. */
+        return;
+    }
+
+    timer->prev->next = timer->next;
+    timer->next->prev = timer->prev;
+    timer->prev = NULL;     /* mark as not in the list */
+}
+
+/* Schedule bottom half 'bh' (a blksim timer object): due at time -1 when
+ * instant bottom halves are enabled, otherwise at the current simulated
+ * time. */
+void blksim_bh_schedule(void *bh)
+{
+    const int64_t when = instant_qemubh ? -1 : current_time;
+
+    blksim_mod_timer(bh, when);
+}
+
+/* Select how blksim_bh_schedule() times bottom halves: 'instant' == true
+ * schedules them at time -1, false at the current simulated time. */
+void blksim_set_instant_qemubh(bool instant)
+{
+    instant_qemubh = instant;
+}
+
+/* Set the result code recorded in every request queued from now on.  The
+ * special value RETURN_CODE_FOR_NULL_ACB makes the aio entry points return
+ * NULL instead of queuing a request. */
+void blksim_set_disk_io_return_code(int ret)
+{
+    disk_io_return_code = ret;
+}
+
+/*
+ * Take task 'acb' off the list and execute it.
+ *
+ * The simulated clock is advanced to the task's time if that lies in the
+ * future.  What happens next depends on the op:
+ *   - SIM_TIMER: the timer callback is invoked.
+ *   - SIM_READ / SIM_WRITE / SIM_FLUSH: if the task's result code is 0 the
+ *     real file operation is performed; in any case the task is then queued
+ *     again as the corresponding *_CALLBACK task.
+ *   - *_CALLBACK: the completion callback is invoked and the task released.
+ * Any other op terminates the process.
+ */
+static void run_task_by_acb(SimAIOCB * acb)
+{
+    BlockDriverState *bs = acb->common.bs;
+
+    CHECK_TASK(acb->uuid);
+
+    /* Unlink the task; prev == NULL marks it as no longer in the list. */
+    acb->prev->next = acb->next;
+    acb->next->prev = acb->prev;
+    acb->prev = NULL;
+
+    if (current_time < acb->time) {
+        current_time = acb->time;
+    }
+
+    switch (acb->op) {
+    case SIM_TIMER:
+        QDEBUG("BLKSIM: execute task%" PRId64 " time=%" PRId64 " TIMER \n",
+               acb->uuid, acb->time);
+        ((QEMUTimerCB *) acb->common.cb) (acb->common.opaque);
+        return;
+
+    case SIM_READ:
+        QDEBUG("BLKSIM: execute task%" PRId64 " time=%" PRId64
+               " READ %s sector_num=%" PRId64 " nb_sectors=%d\n",
+               acb->uuid, acb->time, bs->filename, acb->sector_num,
+               acb->nb_sectors);
+
+        if (acb->ret == 0) {
+            QEMUIOVector *qiov = acb->qiov;
+            const bool single = (qiov->niov == 1);
+            /* A one-element vector is read in place; otherwise go through a
+             * temporary linear buffer. */
+            uint8_t *data = single ? qiov->iov->iov_base
+                                   : qemu_blockalign(bs, qiov->size);
+
+            if (blksim_read(bs, acb->sector_num, data, acb->nb_sectors) != 0) {
+                fprintf(stderr, "Error in reading %s sector_num=%"PRId64
+                        " nb_sectors=%d\n", bs->filename,
+                        acb->sector_num, acb->nb_sectors);
+                exit(1);
+            }
+
+            if (!single) {
+                qemu_iovec_from_buffer(qiov, data, qiov->size);
+                qemu_vfree(data);
+            }
+        }
+
+        insert_aio_callback(acb);
+        break;
+
+    case SIM_WRITE:
+        QDEBUG("BLKSIM: execute task%" PRId64 " time=%" PRId64
+               " WRITE %s sector_num=%" PRId64 " nb_sectors=%d\n",
+               acb->uuid, acb->time, bs->filename,
+               acb->sector_num, acb->nb_sectors);
+
+        if (acb->ret == 0) {
+            QEMUIOVector *qiov = acb->qiov;
+            const bool single = (qiov->niov == 1);
+            uint8_t *data = single ? qiov->iov->iov_base
+                                   : qemu_blockalign(bs, qiov->size);
+
+            if (!single) {
+                /* Gather the vector into the temporary linear buffer. */
+                qemu_iovec_to_buffer(qiov, data);
+            }
+
+            if (blksim_write(bs, acb->sector_num, data, acb->nb_sectors) != 0) {
+                fprintf(stderr, "Error in writing %s sector_num=%"PRId64
+                        " nb_sectors=%d\n", bs->filename,
+                        acb->sector_num, acb->nb_sectors);
+                exit(1);
+            }
+
+            if (!single) {
+                qemu_vfree(data);
+            }
+        }
+
+        insert_aio_callback(acb);
+        break;
+
+    case SIM_FLUSH:
+        QDEBUG("BLKSIM: execute task%" PRId64 " time=%" PRId64 " FLUSH %s\n",
+               acb->uuid, acb->time, bs->filename);
+        if (interactive_print) {
+            printf ("Do FLUSH %s\n", bs->filename);
+        }
+        if (acb->ret == 0) {
+            BDRVSimState *s = bs->opaque;
+            qemu_fdatasync (s->fd);
+        }
+        insert_aio_callback(acb);
+        break;
+
+    case SIM_WRITE_CALLBACK:
+    case SIM_READ_CALLBACK:
+    case SIM_FLUSH_CALLBACK:
+        QDEBUG("BLKSIM: execute task%" PRId64 " time=%" PRId64 " CALLBACK\n",
+               acb->uuid, acb->time);
+        sim_callback(acb);
+        CHECK_TASK(acb->uuid);
+        my_qemu_aio_release(acb);
+        break;
+
+    default:
+        fprintf(stderr, "Unknown op %d\n", acb->op);
+        exit(1);
+    }
+}
+
+/* Run the queued task whose uuid is 'uuid'.  Returns 0 if such a task was
+ * found and executed, -1 if no queued task has that uuid. */
+int blksim_run_task_by_uuid(int64_t uuid)
+{
+    SimAIOCB *task = head.next;
+
+    while (task != &head) {
+        if (task->uuid == uuid) {
+            run_task_by_acb(task);
+            return 0;
+        }
+        task = task->next;
+    }
+
+    return -1;
+}
+
+/* Keep running the task at the front of the list until the list is empty,
+ * including tasks queued while running.  Returns the number of tasks run. */
+int blksim_run_all_tasks(void)
+{
+    int executed;
+
+    for (executed = 0; head.next != &head; executed++) {
+        run_task_by_acb(head.next);
+    }
+
+    return executed;
+}
+
+/* Asynchronous read (.bdrv_aio_readv): queue a SIM_READ task.  Returns NULL
+ * without queuing anything while the configured return code is
+ * RETURN_CODE_FOR_NULL_ACB. */
+static BlockDriverAIOCB *blksim_aio_readv(BlockDriverState * bs,
+            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+            BlockDriverCompletionFunc * cb, void *opaque)
+{
+    if (disk_io_return_code != RETURN_CODE_FOR_NULL_ACB) {
+        return insert_task(SIM_READ, bs, sector_num, qiov, nb_sectors,
+                           cb, opaque);
+    }
+
+    return NULL;
+}
+
+/* Asynchronous write (.bdrv_aio_writev): queue a SIM_WRITE task.  Returns
+ * NULL without queuing anything while the configured return code is
+ * RETURN_CODE_FOR_NULL_ACB. */
+static BlockDriverAIOCB *blksim_aio_writev(BlockDriverState * bs,
+            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+            BlockDriverCompletionFunc * cb, void *opaque)
+{
+    if (disk_io_return_code != RETURN_CODE_FOR_NULL_ACB) {
+        return insert_task(SIM_WRITE, bs, sector_num, qiov, nb_sectors,
+                           cb, opaque);
+    }
+
+    return NULL;
+}
+
+/* Asynchronous flush (.bdrv_aio_flush): queue a SIM_FLUSH task, which carries
+ * no sector range and no data vector.  Returns NULL without queuing anything
+ * while the configured return code is RETURN_CODE_FOR_NULL_ACB. */
+static BlockDriverAIOCB *blksim_aio_flush(BlockDriverState * bs,
+            BlockDriverCompletionFunc * cb, void *opaque)
+{
+    if (disk_io_return_code != RETURN_CODE_FOR_NULL_ACB) {
+        return insert_task(SIM_FLUSH, bs, 0, NULL, 0, cb, opaque);
+    }
+
+    return NULL;
+}
+
+/*
+ * Cancel hook of sim_aio_pool: remove the request from the task list and
+ * release it, without invoking its completion callback.  Cancelling a
+ * request that is not in the list is treated as a bug: a message is printed
+ * and the process is aborted.
+ */
+static void sim_aio_cancel(BlockDriverAIOCB * blockacb)
+{
+    SimAIOCB *task = container_of(blockacb, SimAIOCB, common);
+
+    CHECK_TASK(task->uuid);
+    QDEBUG("BLKSIM: cancel task%" PRId64 "\n", task->uuid);
+
+    if (task->prev == NULL) {
+        fprintf(stderr, "Error: cancel a blksim task that does not exist: "
+                "uuid=%" PRId64 ". Halt process %d for debugging...\n",
+                task->uuid, getpid());
+        abort();
+    }
+
+    task->prev->next = task->next;
+    task->next->prev = task->prev;
+    task->prev = NULL;
+    my_qemu_aio_release(task);
+}
+
+/*
+ * Open the image file behind a blksim device (.bdrv_file_open).
+ *
+ * An optional "blksim:" prefix is stripped from 'filename'.  The file is
+ * opened read-write if BDRV_O_RDWR is set in 'bdrv_flags', read-only
+ * otherwise.  On success the device size is taken from the file size (0 if
+ * it cannot be determined), the device is marked growable, and the buffer
+ * alignment is set to 512.
+ *
+ * Returns 0 on success, -errno if the file cannot be opened.
+ */
+static int blksim_open(BlockDriverState * bs, const char *filename,
+                       int bdrv_flags)
+{
+    static const char prefix[] = "blksim:";
+    BDRVSimState *s = bs->opaque;
+    int open_flags = O_BINARY | O_LARGEFILE;
+    int64_t len;
+
+    /* Remember that blksim is in use, even if the open fails below. */
+    blksim_invoked = true;
+
+    open_flags |= (bdrv_flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY;
+
+    if (strncmp(filename, prefix, strlen(prefix)) == 0) {
+        filename += strlen(prefix);
+    }
+
+    s->fd = qemu_open(filename, open_flags);
+    if (s->fd < 0) {
+        return -errno;
+    }
+
+    len = lseek(s->fd, 0, SEEK_END);
+    bs->total_sectors = (len >= 0) ? len / 512 : 0;
+
+    bs->growable = 1;
+    bs->buffer_alignment = 512;
+    return 0;
+}
+
+/* Close the image file of the device (.bdrv_close). */
+static void blksim_close(BlockDriverState * bs)
+{
+    BDRVSimState *state = bs->opaque;
+    int fd = state->fd;
+
+    close(fd);
+}
+
+/*
+ * Synchronous flush (.bdrv_flush): sync the image file's data to stable
+ * storage.
+ *
+ * Returns 0 on success, or -errno if the sync fails, so that a failed flush
+ * is not reported to the caller as a success.
+ */
+static int blksim_flush(BlockDriverState * bs)
+{
+    BDRVSimState *s = bs->opaque;
+
+    if (interactive_print) {
+        printf ("Do FLUSH %s\n", bs->filename);
+    }
+
+    if (qemu_fdatasync (s->fd) < 0) {
+        return -errno;
+    }
+    return 0;
+}
+
+/*
+ * Tell whether a freshly created image reads back as zeros
+ * (.bdrv_has_zero_init): 0 for block and character devices, 1 for anything
+ * else.
+ *
+ * The open descriptor is examined with fstat() rather than stat() on
+ * bs->filename: blksim_open() strips an optional "blksim:" prefix only from
+ * its local copy of the name, so bs->filename is not guaranteed to be a
+ * usable path.  Terminates the process if the file cannot be examined.
+ */
+static int blksim_has_zero_init(BlockDriverState * bs)
+{
+    BDRVSimState *s = bs->opaque;
+    struct stat buf;
+
+    if (fstat(s->fd, &buf) != 0) {
+        fprintf(stderr, "Failed to stat() %s\n", bs->filename);
+        exit(1);
+    }
+
+    if (S_ISBLK(buf.st_mode) || S_ISCHR(buf.st_mode)) {
+        return 0;
+    }
+
+    return 1;
+}
+
+/*
+ * Resize the image file to 'offset' bytes (.bdrv_truncate).
+ *
+ * Returns 0 on success, or -errno on failure, matching the error convention
+ * used by blksim_open() instead of passing on ftruncate()'s bare -1.
+ */
+static int blksim_truncate(BlockDriverState * bs, int64_t offset)
+{
+    BDRVSimState *s = bs->opaque;
+
+    if (ftruncate(s->fd, offset) < 0) {
+        return -errno;
+    }
+    return 0;
+}
+
+static BlockDriver bdrv_blksim = {          /* driver table registered by bdrv_blksim_init() */
+    .format_name = "blksim",
+    .protocol_name = "blksim",              /* blksim_open() strips a "blksim:" filename prefix */
+    .instance_size = sizeof(BDRVSimState),
+    .bdrv_file_open = blksim_open,
+    .bdrv_close = blksim_close,
+    .bdrv_flush = blksim_flush,
+    .bdrv_read = blksim_read,               /* synchronous I/O goes straight to the file */
+    .bdrv_write = blksim_write,
+    .bdrv_aio_readv = blksim_aio_readv,     /* asynchronous I/O is queued as simulator tasks */
+    .bdrv_aio_writev = blksim_aio_writev,
+    .bdrv_aio_flush = blksim_aio_flush,
+    .bdrv_has_zero_init = blksim_has_zero_init,
+    .bdrv_truncate = blksim_truncate,
+};
+
+static void bdrv_blksim_init(void)
+{
+    bdrv_register(&bdrv_blksim);    /* hand the blksim driver table to the block layer */
+}
+
+block_init(bdrv_blksim_init);       /* NOTE(review): presumably runs the function above at block-layer init -- confirm */
+
+/*
+ * Configure the simulator.
+ *   print      - non-zero: print queued and executed operations on stdout.
+ *   _rand_time - > 0: randomize task times by a delay below this bound;
+ *                <= 0: run tasks in the order they were queued.
+ */
+void init_blksim(int print, int64_t _rand_time)
+{
+    rand_time = _rand_time;
+    interactive_print = print;
+}
+
+/*
+ * To work properly in the simulation mode, block device drivers that
+ * explicitly invoke qemu_aio_wait() should invoke blksim_qemu_aio_wait()
+ * instead if the block device is opened using blksim. Most block device
+ * drivers do not invoke qemu_aio_wait() and hence need not be concerned
+ * about this.
+ *
+ * Runs the task at the front of the list, if any. Returns true if a task was
+ * run, false if there were no more tasks to run.
+ */
+bool blksim_qemu_aio_wait(void)
+{
+    SimAIOCB *first = head.next;
+
+    if (first == &head) {
+        return false;
+    }
+
+    run_task_by_acb(first);
+    return true;
+}
+
+/* Return 1 if at least one task is queued, 0 if the task list is empty. */
+int blksim_has_task(void)
+{
+    if (head.next == &head) {
+        return 0;
+    }
+    return 1;
+}
+
+/* Return non-zero once blksim_open() has been called at least once. */
+int using_blksim(void)
+{
+    const int invoked = blksim_invoked;
+
+    return invoked;
+}
diff --git a/block/blksim.h b/block/blksim.h
new file mode 100644
index 0000000..5c9533d
--- /dev/null
+++ b/block/blksim.h
@@ -0,0 +1,35 @@ 
+/*
+ * QEMU Simulated Block Device to Facilitate Testing and Debugging
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ *    Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/* Identifiers containing a double underscore are reserved for the
+ * implementation, so the guard uses a plain project-style name. */
+#ifndef BLOCK_BLKSIM_H
+#define BLOCK_BLKSIM_H
+
+/* The prototypes below use bool and int64_t; include their headers so this
+ * file can be included on its own. */
+#include <stdbool.h>
+#include <stdint.h>
+
+/* Passing this value to blksim_set_disk_io_return_code() makes the aio
+ * entry points of blksim return a NULL AIOCB instead of queuing a request. */
+#define RETURN_CODE_FOR_NULL_ACB        (-9999)
+
+/* Configure printing of operations and the bound for random task delays
+ * (_rand_time <= 0 disables randomization). */
+void init_blksim (int print, int64_t _rand_time);
+/* Non-zero once a blksim device has been opened. */
+int using_blksim (void);
+/* Non-zero if at least one task is queued. */
+int blksim_has_task (void);
+/* Print all queued tasks on stdout. */
+void blksim_list_tasks (void);
+/* Run the queued task with the given uuid; 0 on success, -1 if not found. */
+int blksim_run_task_by_uuid (int64_t uuid);
+/* Run tasks until none is left; returns how many were run. */
+int blksim_run_all_tasks (void);
+/* Current simulated time. */
+int64_t blksim_get_time (void);
+/* Create an unarmed timer that calls cb(opaque) when it runs. */
+void *blksim_new_timer (void *cb, void *opaque);
+/* (Re)arm a timer for the given expiry time. */
+void blksim_mod_timer (void *ts, int64_t expire_time);
+/* Release a timer object. */
+void blksim_free_timer (void *ts);
+/* Disarm a timer; the object stays usable. */
+void blksim_del_timer (void *ts);
+/* Schedule a bottom half (a blksim timer object). */
+void blksim_bh_schedule (void *bh);
+/* Set the result code recorded in subsequently queued requests. */
+void blksim_set_disk_io_return_code (int ret);
+/* Run the first queued task; false if there was none. */
+bool blksim_qemu_aio_wait(void);
+/* Choose whether bottom halves are scheduled at time -1 (true) or at the
+ * current simulated time (false). */
+void blksim_set_instant_qemubh (bool instant);
+
+#endif