diff mbox

[2/3] FVD: Added the simulated 'blksim' driver

Message ID 1295648355-17359-2-git-send-email-ctang@us.ibm.com
State New
Headers show

Commit Message

Chunqiang Tang Jan. 21, 2011, 10:19 p.m. UTC
This patch is part of the Fast Virtual Disk (FVD) proposal. See the related
discussions at
http://lists.gnu.org/archive/html/qemu-devel/2011-01/msg00426.html.

This patch adds the 'blksim' block device driver, which is a tool to
facilitate testing and debugging. blksim operates on a RAW image, but it uses
neither AIO nor posix threads to perform I/Os. Instead, blksim functions like
an event-driven disk simulator, and allows a block device driver developer to
fully control the order of disk I/Os, the order of callbacks, and the return
code of every I/O operation. The purpose is to comprehensively test a block
device driver under failures and race conditions. Bugs found by blksim under
rare race conditions are guaranteed to be precisely reproducible, with no
dependency on thread timing etc., which makes debugging much easier.

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 Makefile.objs  |    1 +
 block/blksim.c |  752 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 block/blksim.h |   35 +++
 3 files changed, 788 insertions(+), 0 deletions(-)
 create mode 100644 block/blksim.c
 create mode 100644 block/blksim.h

Comments

Anthony Liguori Jan. 21, 2011, 10:49 p.m. UTC | #1
On 01/21/2011 04:19 PM, Chunqiang Tang wrote:
> This patch is part of the Fast Virtual Disk (FVD) proposal. See the related
> discussions at
> http://lists.gnu.org/archive/html/qemu-devel/2011-01/msg00426.html.
>
> This patch adds the 'blksim' block device driver, which is a tool to
> facilitate testing and debugging. blksim operates on a RAW image, but it uses
> neither AIO nor posix threads to perform I/Os. Instead, blksim functions like
> an event-driven disk simulator, and allows a block device driver developer to
> fully control the order of disk I/Os, the order of callbacks, and the return
> code of every I/O operation. The purpose is to comprehensively test a block
> device driver under failures and race conditions. Bugs found by blksim under
> rare race conditions are guaranteed to be precisely reproducible, with no
> dependency on thread timing etc., which makes debugging much easier.
>
> Signed-off-by: Chunqiang Tang<ctang@us.ibm.com>
> ---
>   Makefile.objs  |    1 +
>   block/blksim.c |  752 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   block/blksim.h |   35 +++
>   3 files changed, 788 insertions(+), 0 deletions(-)
>   create mode 100644 block/blksim.c
>   create mode 100644 block/blksim.h
>
> diff --git a/Makefile.objs b/Makefile.objs
> index c3e52c5..ce5cc8d 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -23,6 +23,7 @@ block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
>   block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>   block-nested-y += qed-check.o
>   block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
> +block-nested-y += blksim.o
>   block-nested-$(CONFIG_WIN32) += raw-win32.o
>   block-nested-$(CONFIG_POSIX) += raw-posix.o
>   block-nested-$(CONFIG_CURL) += curl.o
> diff --git a/block/blksim.c b/block/blksim.c
> new file mode 100644
> index 0000000..a92ba11
> --- /dev/null
> +++ b/block/blksim.c
> @@ -0,0 +1,752 @@
> +/*
> + * Copyright (c) 2010-2011 IBM
> + *
> + * Authors:
> + *         Chunqiang Tang<ctang@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +/*=============================================================================
> + *  A short description: this module implements a simulated block device
> + *  driver "blksim". It works with qemu-io and qemu-test to perform testing,
> + *  allowing changing the  order of disk I/O and callback activities to test
> + *  rare race conditions. See qemu-test.c, qemu-io.c, and qemu-io-sim.c.
> + *============================================================================*/
> +
> +#include<sys/vfs.h>
> +#include<sys/mman.h>
> +#include<pthread.h>
> +#include<execinfo.h>
> +#include<stdlib.h>
> +#include<sys/ioctl.h>
> +#include<stdint.h>
> +#include<stdio.h>
> +#include<inttypes.h>
>    

QEMU code shouldn't include headers like this.  It almost guarantees 
breaking portability.

> +#include "block_int.h"
> +#include "osdep.h"
> +#include "qemu-option.h"
> +#include "qemu-timer.h"
> +#include "block.h"
> +#include "qemu-queue.h"
> +#include "qemu-common.h"
> +#include "block/blksim.h"
> +
> +#ifndef TRUE
> +# define TRUE 1
> +#endif
> +
> +#ifndef FALSE
> +# define FALSE 0
> +#endif
>    

C99 introduces stdbool.h.  Those are the appropriate definitions to use for
booleans.

> +
> +#if 0
> +# define QDEBUG printf
> +#else
> +# define QDEBUG(format,...) do {} while (0)
> +#endif
> +
> +typedef enum {
> +    SIM_NULL,
> +    SIM_READ,
> +    SIM_WRITE,
> +    SIM_FLUSH,
> +    SIM_READ_CALLBACK,
> +    SIM_WRITE_CALLBACK,
> +    SIM_FLUSH_CALLBACK,
> +    SIM_TIMER
> +} sim_op_t;
>    

Breaks coding style (and the C standard).

> +static void sim_aio_cancel (BlockDriverAIOCB * acb);
> +static int64_t sim_uuid = 0;
> +static int64_t current_time = 0;
> +static int64_t rand_time = 0;
> +static int interactive_print = TRUE;
> +static int blksim_invoked = FALSE;
> +static int instant_qemubh = TRUE;
> +struct SimAIOCB;
> +
> +/*
> + * Note: disk_io_return_code, set_disk_io_return_code(), and insert_task() work
> + * together to ensure that multiple subrequests triggered by the same
> + * outtermost request either succeed together or fail together. This behavior
> + * is required by qemu-test.  Here is one example of problems caused by
> + * departuring from this behavior.  Consider a write request that generates
> + * two subrequests, w1 and w2. If w1 succeeds but w2 fails, the data will not
> + * be written into qemu-test's "truth image" but the part of the data handled
> + * by w1 will be written into qemu-test's "test image". As a result, their
> + * contents diverge can automated testing cannot continue.
> + */
> +static int disk_io_return_code = 0;
> +
> +typedef struct BDRVSimState {
> +    int fd;
> +} BDRVSimState;
> +
> +typedef struct SimAIOCB {
> +    BlockDriverAIOCB common;
> +    int64_t uuid;
> +    sim_op_t op;
> +    int64_t sector_num;
> +    QEMUIOVector *qiov;
> +    int nb_sectors;
> +    int ret;
> +    int64_t time;
> +    struct SimAIOCB *next;
> +    struct SimAIOCB *prev;
>    

Use qemu-queue instead of open coding data structures.

> +} SimAIOCB;
> +
> +static AIOPool sim_aio_pool = {
> +    .aiocb_size = sizeof (SimAIOCB),
> +    .cancel = sim_aio_cancel,
> +};
> +
> +static SimAIOCB head = {
> +    .uuid = -1,
> +    .time = (int64_t) (9223372036854775807ULL),
>    

This number has to mean something but I'll be damned if I know what it 
means.

> +    .op = SIM_NULL,
> +    .next =&head,
> +    .prev =&head,
> +};
> +
> +/* Debug a specific task.*/
> +#if 1
> +# define CHECK_TASK(acb) do { } while (0)
> +#else
> +static inline void CHECK_TASK (int64_t uuid)
> +{
> +    if (uuid == 19LL) {
> +        printf ("CHECK_TASK pause for task %" PRId64 "\n", uuid);
> +    }
> +}
> +#endif
>    

19LL?  Why is 19 significant?

> +/* do_io() should never fail. A failure indicates a bug in the upper layer
> + * block device driver, or failure in the real hardware. */
> +static int do_io (BlockDriverState * bs, int64_t sector_num, uint8_t * buf,
> +                  int nb_sectors, int do_read)
> +{
> +    BDRVSimState *s = bs->opaque;
> +    size_t size = nb_sectors * 512;
> +    int ret;
> +
> +    if (lseek (s->fd, sector_num * 512, SEEK_SET)<  0) {
> +        fprintf (stderr, "Error: lseek %s sector_num=%" PRId64 ". "
> +                 "Pause process %d for debugging...\n",
> +                 bs->filename, sector_num, getpid ());
> +        fgetc (stdin);
> +    }
> +
> +    while (size>  0) {
> +
> +        if (do_read) {
> +            ret = read (s->fd, buf, size);
> +            if (ret == 0) {
> +                fprintf (stderr,
> +                         "Error: read beyond the size of %s sector_num=%" PRId64
> +                         " nb_sectors=%d. Pause process %d for debugging...\n",
> +                         bs->filename, sector_num, nb_sectors, getpid ());
> +                fgetc (stdin);
> +            }
> +        } else {
> +            ret = write (s->fd, buf, size);
> +        }
> +
> +        if (ret>= 0) {
> +            size -= ret;
> +            buf += ret;
>    

You'll hit an infinite loop on EOF.

> +        } else if (errno != EINTR) {
> +            fprintf (stderr, "Error: %s %s sector_num=%" PRId64
> +                     " nb_sectors=%d. Pause process %d for debugging...\n",
> +                     do_read ? "READ" : "WRITE", bs->filename, sector_num,
> +                     nb_sectors, getpid ());
> +            fgetc (stdin);
> +            return -errno;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +static int blksim_read (BlockDriverState * bs, int64_t sector_num,
> +                        uint8_t * buf, int nb_sectors)
> +{
> +    return do_io (bs, sector_num, buf, nb_sectors, TRUE);
> +}
> +
> +static int blksim_write (BlockDriverState * bs, int64_t sector_num,
> +                      const uint8_t * buf, int nb_sectors)
> +{
> +    return do_io (bs, sector_num, (uint8_t *) buf, nb_sectors, FALSE);
> +}
> +
> +static void insert_in_list (SimAIOCB * acb)
> +{
> +    int64_t new_id = sim_uuid++;
> +    CHECK_TASK (new_id);
> +    acb->uuid = new_id;
> +
> +    if (rand_time<= 0) {
> +        /* Working with qemu-io.c and not doing delay randomization.
> +         * Insert it to the tail. */
> +        acb->time = 0;
> +        acb->prev = head.prev;
> +        acb->next =&head;
> +        head.prev->next = acb;
> +        head.prev = acb;
> +        return;
> +    }
> +
> +    SimAIOCB *p = head.next;
> +
> +    if (acb->time>= 0) {
> +        /* Introduce a random delay to better trigger rare race conditions. */
> +        acb->time += random () % rand_time;
> +
> +        /* Find the position to insert. The list is sorted in ascending time. */
> +        while (1) {
> +            if (p->time>  acb->time) {
> +                break;
> +            }
> +            if (p->time == acb->time&&  (random () % 2 == 0)) {
> +                break;
> +            }
> +            p = p->next;
> +        }
> +    }
> +
> +    /* Insert acb before p. */
> +    acb->next = p;
> +    acb->prev = p->prev;
> +    p->prev->next = acb;
> +    p->prev = acb;
> +}
> +
> +/* Debug problems related to reusing task objects. Problem already solved.*/
> +#if 1
> +# define my_qemu_aio_get qemu_aio_get
> +# define my_qemu_aio_release qemu_aio_release
> +
> +#else
> +static SimAIOCB *search_task_list (SimAIOCB * acb)
> +{
> +    SimAIOCB *p;
> +    for (p = head.next; p !=&head; p = p->next) {
> +        if (p == acb) {
> +            return p;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +static inline void *my_qemu_aio_get (AIOPool * pool, BlockDriverState * bs,
> +                                     BlockDriverCompletionFunc * cb,
> +                                     void *opaque)
> +{
> +    SimAIOCB *acb = (SimAIOCB *) qemu_aio_get (&sim_aio_pool, bs, cb, opaque);
> +    QDEBUG ("SIM: qemu_aio_get reuse old task%" PRId64 "\n", acb->uuid);
> +    ASSERT (!search_task_list (acb));
> +    return acb;
> +}
> +
> +static inline void my_qemu_aio_release (SimAIOCB * acb)
> +{
> +    QDEBUG ("SIM: qemu_aio_release task%" PRId64 "\n", acb->uuid);
> +    qemu_aio_release (acb);
> +}
> +#endif
> +
> +static BlockDriverAIOCB *insert_task (int op, BlockDriverState * bs,
> +                                      int64_t sector_num, QEMUIOVector * qiov,
> +                                      int nb_sectors,
> +                                      BlockDriverCompletionFunc * cb,
> +                                      void *opaque)
> +{
> +    SimAIOCB *acb = my_qemu_aio_get (&sim_aio_pool, bs, cb, opaque);
> +    if (!acb) {
> +        return NULL;
> +    }
> +
> +    acb->op = op;
> +    acb->sector_num = sector_num;
> +    acb->qiov = qiov;
> +    acb->nb_sectors = nb_sectors;
> +    acb->ret = disk_io_return_code;
> +    acb->time = current_time;
> +    insert_in_list (acb);
> +
> +    if (interactive_print) {
> +        if (op == SIM_READ) {
> +            printf ("Added READ uuid=%" PRId64 "  filename=%s  sector_num=%"
> +                    PRId64 "  nb_sectors=%d\n", acb->uuid,
> +                    acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
> +        } else if (op == SIM_WRITE) {
> +            printf ("Added WRITE uuid=%" PRId64 "  filename=%s  sector_num=%"
> +                    PRId64 "  nb_sectors=%d\n", acb->uuid,
> +                    acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
> +        } else {
> +            fprintf (stderr, "Unknown op %d\n", op);
> +            exit (1);
> +        }
> +    }
> +
> +    return&acb->common;
> +}
> +
> +static void insert_aio_callback (SimAIOCB * acb)
> +{
> +    acb->time = current_time;
> +    insert_in_list (acb);
> +
> +    if (acb->op == SIM_FLUSH) {
> +        acb->op = SIM_FLUSH_CALLBACK;
> +        if (interactive_print) {
> +            printf ("Added FLUSH_CALLBACK uuid=%" PRId64 "  filename=%s\n",
> +                    acb->uuid, acb->common.bs->filename);
> +        }
> +    } else if (acb->op == SIM_READ) {
> +        acb->op = SIM_READ_CALLBACK;
> +        if (interactive_print) {
> +            printf ("Added READ_CALLBACK uuid=%" PRId64
> +                    "  filename=%s  sector_num=%" PRId64 "  nb_sectors=%d\n",
> +                    acb->uuid, acb->common.bs->filename, acb->sector_num,
> +                    acb->nb_sectors);
> +        }
> +    } else if (acb->op == SIM_WRITE) {
> +        acb->op = SIM_WRITE_CALLBACK;
> +        if (interactive_print) {
> +            printf ("Added WRITE_CALLBACK uuid=%" PRId64
> +                    "  filename=%s  sector_num=%" PRId64 "  nb_sectors=%d\n",
> +                    acb->uuid, acb->common.bs->filename, acb->sector_num,
> +                    acb->nb_sectors);
> +        }
> +    } else {
> +        fprintf (stderr, "Wrong op %d\n", acb->op);
> +        exit (1);
> +    }
> +}
> +
> +void blksim_list_tasks (void)
> +{
> +    SimAIOCB *acb;
> +
> +    for (acb = head.next; acb !=&head; acb = acb->next) {
> +        if (acb->op == SIM_READ) {
> +            printf ("uuid=%" PRId64 "  READ           file=%s  sector_num=%"
> +                    PRIu64 "  nb_sectors=%d\n", acb->uuid,
> +                    acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
> +        } else if (acb->op == SIM_WRITE) {
> +            printf ("uuid=%" PRId64 "  WRITE          file=%s  sector_num=%"
> +                    PRIu64 "  nb_sectors=%d\n", acb->uuid,
> +                    acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
> +        } else if (acb->op == SIM_READ_CALLBACK) {
> +            printf ("uuid=%" PRId64 "  CALLBACK READ  file=%s  sector_num=%"
> +                    PRIu64 "  nb_sectors=%d\n", acb->uuid,
> +                    acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
> +        } else if (acb->op == SIM_WRITE_CALLBACK) {
> +            printf ("uuid=%" PRId64 "  CALLBACK WRITE file=%s  sector_num=%"
> +                    PRIu64 "  nb_sectors=%d\n", acb->uuid,
> +                    acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
> +        } else {
> +            fprintf (stderr, "Wrong OP %d\n", acb->op);
> +            exit (1);
> +        }
> +    }
> +}
> +
> +static inline void sim_callback (SimAIOCB * acb)
> +{
> +    acb->common.cb (acb->common.opaque, acb->ret);
> +}
> +
> +int64_t blksim_get_time (void)
> +{
> +    return current_time;
> +}
> +
> +void *blksim_new_timer (void *cb, void *opaque)
> +{
> +    SimAIOCB *acb = my_qemu_aio_get (&sim_aio_pool, NULL, cb, opaque);
> +    acb->op = SIM_TIMER;
> +    acb->prev = NULL;
> +    return acb;
> +}
> +
> +void blksim_mod_timer (void *ts, int64_t expire_time)
> +{
> +    SimAIOCB *acb = ts;
> +
> +    if (acb->prev) {
> +        /* Remove it first. */
> +        acb->next->prev = acb->prev;
> +        acb->prev->next = acb->next;
> +    }
> +    acb->time = expire_time;
> +    insert_in_list (acb);
> +
> +    if (interactive_print) {
> +        printf ("Added TIMER uuid=%" PRId64 "  expire_time=%"PRId64
> +                " current_time=%"PRId64"\n",
> +                acb->uuid, expire_time, current_time);
> +    }
> +}
> +
> +void blksim_free_timer (void *ts)
> +{
> +    SimAIOCB *acb = ts;
> +    CHECK_TASK (acb->uuid);
> +    my_qemu_aio_release (acb);
> +}
> +
> +void blksim_del_timer (void *ts)
> +{
> +    SimAIOCB *acb = ts;
> +
> +    CHECK_TASK (acb->uuid);
> +    if (acb->prev) {
> +        /* Remove it from the list. */
> +        acb->next->prev = acb->prev;
> +        acb->prev->next = acb->next;
> +
> +        /* Mark it as not in list. */
> +        acb->prev = NULL;
> +    }
> +}
> +
> +void blksim_bh_schedule (void *bh)
> +{
> +    if (instant_qemubh) {
> +        blksim_mod_timer (bh, -1);
> +    } else {
> +        blksim_mod_timer (bh, current_time);
> +    }
> +}
> +
> +void blksim_set_instant_qemubh (int instant)
> +{
> +    instant_qemubh = instant;
> +}
> +
> +void blksim_set_disk_io_return_code (int ret)
> +{
> +    disk_io_return_code = ret;
> +}
> +
> +static void run_task_by_acb (SimAIOCB * acb)
> +{
> +    CHECK_TASK (acb->uuid);
> +
> +    /* Remove it from the list. */
> +    acb->next->prev = acb->prev;
> +    acb->prev->next = acb->next;
> +    acb->prev = NULL;        /* Indicate that it is no longer in the list. */
> +
> +    if (acb->time>  current_time) {
> +        current_time = acb->time;
> +    }
> +
> +    if (acb->op == SIM_TIMER) {
> +        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64 " TIMER \n",
> +                acb->uuid, acb->time);
> +
> +        ((QEMUTimerCB *) acb->common.cb) (acb->common.opaque);
> +        return;
> +    }
> +
> +    BlockDriverState *bs = acb->common.bs;
> +
> +    if (acb->op == SIM_READ) {
> +        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64
> +                " READ %s sector_num=%" PRId64 " nb_sectors=%d\n",
> +                acb->uuid, acb->time, bs->filename, acb->sector_num,
> +                acb->nb_sectors);
> +
> +        if (acb->ret == 0) {
> +            if (acb->qiov->niov == 1) {
> +                if (blksim_read
> +                    (bs, acb->sector_num, acb->qiov->iov->iov_base,
> +                     acb->nb_sectors) != 0) {
> +                    fprintf (stderr, "Error in reading %s sector_num=%lld "
> +                             "nb_sectors=%d\n", acb->common.bs->filename,
> +                             acb->sector_num, acb->nb_sectors);
> +                    exit (1);
> +                }
> +            } else {
> +                uint8_t *buf=qemu_blockalign (acb->common.bs, acb->qiov->size);
> +                if (blksim_read (bs, acb->sector_num, buf,
> +                                 acb->nb_sectors) != 0) {
> +                    fprintf (stderr, "Error in reading %s sector_num=%lld "
> +                             "nb_sectors=%d\n", acb->common.bs->filename,
> +                             acb->sector_num, acb->nb_sectors);
> +                    exit (1);
> +                }
> +                qemu_iovec_from_buffer (acb->qiov, buf, acb->qiov->size);
> +                qemu_vfree (buf);
> +            }
> +        }
> +
> +        insert_aio_callback (acb);
> +    } else if (acb->op == SIM_WRITE) {
> +        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64
> +                " WRITE %s sector_num=%" PRId64 " nb_sectors=%d\n",
> +                acb->uuid, acb->time, bs->filename,
> +                acb->sector_num, acb->nb_sectors);
> +
> +        if (acb->ret == 0) {
> +            if (acb->qiov->niov == 1) {
> +                if (blksim_write (bs, acb->sector_num, acb->qiov->iov->iov_base,
> +                                  acb->nb_sectors) != 0) {
> +                    fprintf (stderr, "Error in writing %s sector_num=%lld "
> +                             "nb_sectors=%d\n", acb->common.bs->filename,
> +                             acb->sector_num, acb->nb_sectors);
> +                    exit (1);
> +                }
> +            } else {
> +                uint8_t *buf = qemu_blockalign (acb->common.bs,
> +                                                acb->qiov->size);
> +                qemu_iovec_to_buffer (acb->qiov, buf);
> +                if (blksim_write (bs, acb->sector_num, buf,
> +                                  acb->nb_sectors)!= 0) {
> +                    fprintf (stderr, "Error in writing %s sector_num=%lld "
> +                             "nb_sectors=%d\n", acb->common.bs->filename,
> +                             acb->sector_num, acb->nb_sectors);
> +                    exit (1);
> +                }
> +                qemu_vfree (buf);
> +            }
> +        }
> +
> +        insert_aio_callback (acb);
> +    } else if (acb->op == SIM_FLUSH) {
> +        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64 " FLUSH %s\n",
> +                acb->uuid, acb->time, bs->filename);
> +        /* Skip real flushing to speed up simulation:
> +         *         if (ret == 0) { * fdatasync (s->fd); } */
> +        insert_aio_callback (acb);
> +    } else if (acb->op == SIM_WRITE_CALLBACK || acb->op == SIM_READ_CALLBACK
> +               || acb->op == SIM_FLUSH_CALLBACK) {
> +        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64 " CALLBACK\n",
> +                acb->uuid, acb->time);
> +        sim_callback (acb);
> +        CHECK_TASK (acb->uuid);
> +        my_qemu_aio_release (acb);
> +    } else {
> +        fprintf (stderr, "Unknown op %d\n", acb->op);
> +        exit (1);
> +    }
> +}
> +
> +int blksim_run_task_by_uuid (int64_t uuid)
> +{
> +    SimAIOCB *acb;
> +
> +    for (acb = head.next; acb !=&head; acb = acb->next) {
> +        if (acb->uuid == uuid) {
> +            run_task_by_acb (acb);
> +            return 0;
> +        }
> +    }
> +
> +    return -1;
> +}
> +
> +int blksim_run_all_tasks (void)
> +{
> +    int n = 0;
> +
> +    while (1) {
> +        SimAIOCB *acb = head.next;
> +        if (acb ==&head) {
> +            return n; /* No more tasks.*/
> +        }
> +
> +        run_task_by_acb (acb);
> +        n++;
> +    }
> +}
> +
> +static BlockDriverAIOCB *blksim_aio_readv (BlockDriverState * bs,
> +                                        int64_t sector_num,
> +                                        QEMUIOVector * qiov,
> +                                        int nb_sectors,
> +                                        BlockDriverCompletionFunc * cb,
> +                                        void *opaque)
> +{
> +    return insert_task (SIM_READ, bs, sector_num, qiov, nb_sectors, cb, opaque);
> +}
> +
> +static BlockDriverAIOCB *blksim_aio_writev (BlockDriverState * bs,
> +                                         int64_t sector_num,
> +                                         QEMUIOVector * qiov,
> +                                         int nb_sectors,
> +                                         BlockDriverCompletionFunc * cb,
> +                                         void *opaque)
> +{
> +    return insert_task (SIM_WRITE, bs, sector_num, qiov, nb_sectors, cb,
> +                        opaque);
> +}
> +
> +static BlockDriverAIOCB *blksim_aio_flush (BlockDriverState * bs,
> +                                        BlockDriverCompletionFunc * cb,
> +                                        void *opaque)
> +{
> +    return insert_task (SIM_FLUSH, bs, 0, NULL, 0, cb, opaque);
> +}
> +
> +static void sim_aio_cancel (BlockDriverAIOCB * blockacb)
> +{
> +    SimAIOCB *acb = container_of (blockacb, SimAIOCB, common);
> +
> +    CHECK_TASK (acb->uuid);
> +    QDEBUG ("SIM: cancel task%" PRId64 "\n", acb->uuid);
> +
> +    if (acb->prev) {
> +        acb->next->prev = acb->prev;
> +        acb->prev->next = acb->next;
> +        acb->prev = NULL;
> +        my_qemu_aio_release (acb);
> +    } else {
> +        fprintf (stderr, "Error: cancel a blksim task that does not exist: "
> +                 "uuid=%"PRId64". Halt process %d for debugging...\n",
> +                 acb->uuid, getpid());
> +        fgetc (stdin);
> +        exit (1);
> +    }
> +}
> +
> +static int blksim_open (BlockDriverState * bs, const char *filename,
> +                     int bdrv_flags)
> +{
> +    BDRVSimState *s = bs->opaque;
> +    int open_flags = O_BINARY | O_LARGEFILE;
> +
> +    blksim_invoked  = TRUE;
> +
> +    if ((bdrv_flags&  BDRV_O_RDWR)) {
> +        open_flags |= O_RDWR;
> +    } else {
> +        open_flags |= O_RDONLY;
> +    }
> +
> +    if ((bdrv_flags&  BDRV_O_NOCACHE)) {
> +        open_flags |= O_DIRECT;
> +    } else if (!(bdrv_flags&  BDRV_O_CACHE_WB)) {
> +        open_flags |= O_DSYNC;
> +    }
> +
> +    /* Parse the "blksim:" prefix */
> +    if (!strncmp(filename, "blksim:", strlen("blksim:"))) {
> +        filename += strlen("blksim:");
> +    }
> +
> +    s->fd = open (filename, open_flags);
> +    if (s->fd<  0)
> +        return -1;
> +
> +    int64_t len = lseek (s->fd, 0, SEEK_END);
> +    if (len>= 0) {
> +        bs->total_sectors = len / 512;
> +    } else {
> +        bs->total_sectors = 0;
> +    }
> +
> +    bs->growable = 1;
> +    return 0;
> +}
> +
> +static void blksim_close (BlockDriverState * bs)
> +{
> +    BDRVSimState *s = bs->opaque;
> +    close (s->fd);
> +}
> +
> +static int blksim_flush (BlockDriverState * bs)
> +{
> +    /*
> +     * Skip real flushing to speed up simulation.
> +         * BDRVSimState *s = bs->opaque;
> +         * fdatasync (s->fd);
> +     */
> +    return 0;
> +}
> +
> +static int blksim_has_zero_init (BlockDriverState * bs)
> +{
> +    struct stat buf;
> +
> +    if (stat (bs->filename,&buf) != 0) {
> +        fprintf (stderr, "Failed to stat() %s\n", bs->filename);
> +        exit (1);
> +    }
> +
> +    if (S_ISBLK (buf.st_mode) || S_ISCHR (buf.st_mode)) {
> +        return 0;
> +    }
> +
> +    return 1;
> +}
> +
> +static int blksim_truncate (BlockDriverState * bs, int64_t offset)
> +{
> +    BDRVSimState *s = bs->opaque;
> +    return ftruncate (s->fd, offset);
> +}
> +
> +static BlockDriver bdrv_blksim = {
> +    .format_name = "blksim",
> +    .protocol_name = "blksim",
> +    .instance_size = sizeof (BDRVSimState),
> +    .bdrv_file_open = blksim_open,
> +    .bdrv_close = blksim_close,
> +    .bdrv_flush = blksim_flush,
> +    .bdrv_read = blksim_read,
> +    .bdrv_write = blksim_write,
> +    .bdrv_aio_readv = blksim_aio_readv,
> +    .bdrv_aio_writev = blksim_aio_writev,
> +    .bdrv_aio_flush = blksim_aio_flush,
> +    .bdrv_has_zero_init = blksim_has_zero_init,
> +    .bdrv_truncate = blksim_truncate,
> +};
> +
> +static void bdrv_blksim_init(void)
> +{
> +    bdrv_register(&bdrv_blksim);
> +}
> +block_init(bdrv_blksim_init);
> +
> +void init_blksim (int print, int64_t _rand_time)
> +{
> +    interactive_print = print;
> +    rand_time = _rand_time;
> +}
> +
> +/*
> + * To work properly in the simulation mode, block device drivers that
> + * explicitly invoke qemu_aio_wait() should invoke blksim_qemu_aio_wait() if
> + * the block device is openned using blksim. Most block device drivers do not
> + * invoke qemu_aio_wait() and hence should not be concerned about this.
> + */
> +int blksim_qemu_aio_wait (void)
> +{
> +    SimAIOCB *acb = head.next;
> +    if (acb ==&head) {
> +        return 0;
> +    }
> +    else {
> +        run_task_by_acb (acb);
> +        return 1;
> +    }
> +}
> +
> +int blksim_has_task (void)
> +{
> +    return head.next !=&head;
> +}
> +
> +int using_blksim (void)
> +{
> +    return blksim_invoked;
> +}
> diff --git a/block/blksim.h b/block/blksim.h
> new file mode 100644
> index 0000000..fa1e20d
> --- /dev/null
> +++ b/block/blksim.h
> @@ -0,0 +1,35 @@
> +/*
> + * Copyright (c) 2010-2011 IBM
> + *
> + * Authors:
> + *         Chunqiang Tang<ctang@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +/*=============================================================================
> + *  A short description: this is the header of the simulated block device
> + *  driver "blksim".
> + *============================================================================*/
> +
> +#ifndef __block_sim_h__
> +#define __block_sim_h__
>    

Coding style.

In general, I like the idea of the simulator but the coding style is off 
quite a bit.

Regards,

Anthony Liguori

> +void init_blksim (int print, int64_t _rand_time);
> +int using_blksim (void);
> +int blksim_has_task (void);
> +void blksim_list_tasks (void);
> +int blksim_run_task_by_uuid (int64_t uuid);
> +int blksim_run_all_tasks (void);
> +int64_t blksim_get_time (void);
> +void *blksim_new_timer (void *cb, void *opaque);
> +void blksim_mod_timer (void *ts, int64_t expire_time);
> +void blksim_free_timer (void *ts);
> +void blksim_del_timer (void *ts);
> +void blksim_bh_schedule (void *bh);
> +void blksim_set_disk_io_return_code (int ret);
> +int blksim_qemu_aio_wait(void);
> +void blksim_set_instant_qemubh (int instant /* TRUE or FALSE */);
> +
> +#endif
>
Chunqiang Tang Jan. 22, 2011, 3:09 a.m. UTC | #2
> Coding style.
> 
> In general, I like the idea of the simulator but the coding style is off 

> quite a bit.

Please be specific and I would be happy to take suggestions. The header 
issue should be easy to fix.
Andreas Färber Jan. 23, 2011, 3:26 p.m. UTC | #3
Am 21.01.2011 um 23:19 schrieb Chunqiang Tang:

> diff --git a/block/blksim.c b/block/blksim.c
> new file mode 100644
> index 0000000..a92ba11
> --- /dev/null
> +++ b/block/blksim.c

Some formal comments, since you're introducing a new file:

> @@ -0,0 +1,752 @@
> +/*

Headers usually start with a one-line summary, "QEMU simulated block  
driver" maybe?

> + * Copyright (c) 2010-2011 IBM
> + *
> + * Authors:
> + *         Chunqiang Tang <ctang@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + * See the COPYING file in the top-level directory.

Can you make this GPLv2-or-later to avoid future hassles?

> +#ifndef TRUE
> +# define TRUE 1
> +#endif
> +
> +#ifndef FALSE
> +# define FALSE 0
> +#endif

I don't think these two belong here.

stdbool.h defines true and false with identical values.
http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/stdbool.h.html

Not sure about TRUE and FALSE.
If we want them as local definitions, they should rather go to
qemu-common.h than to individual source files.

Regards,
Andreas
Anthony Liguori Jan. 23, 2011, 11:26 p.m. UTC | #4
On 01/21/2011 09:09 PM, Chunqiang Tang wrote:
>> Coding style.
>>
>> In general, I like the idea of the simulator but the coding style is off
>>      
>    
>> quite a bit.
>>      
> Please be specific and I would be happy to take suggestions. The header
> issue should be easy to fix.
>    

Read CODING_STYLE and go through your code.

The code pretty consistently doesn't follow it.

Regards,

Anthony Liguori
Chunqiang Tang Jan. 24, 2011, 3:07 p.m. UTC | #5
> Read CODING_STYLE and go through your code.


I went through CODING_STYLE. The white space issue in FVD had already been
fixed previously. FVD’s variable and type names are fine, and line width
is fine. The only remaining issue in FVD is '}' before 'else', which will
be fixed. CODING_STYLE does not require it, but I noticed from the examples
that function calls are written 'do_something()', while FVD uses
'do_something ()' (with a white space before '()'). Is this a hard
requirement that needs to be fixed? Is there anything else that is not
specified in CODING_STYLE but is adopted in QEMU by convention? I would
like to take all suggestions and fix the code style in one pass, rather
than doing it again and again.
Thanks.

CODING_STYLE:
    if (a == 5) {
    } else if (a == 6) {
    }

FVD: 
    if (a == 5) {
    } 
    else if (a == 6) {
    }
Chunqiang Tang Jan. 25, 2011, 4:54 p.m. UTC | #6
> Headers usually start with a one-line summary, "QEMU simulated block 
> driver" maybe?
> > + * Copyright (c) 2010-2011 IBM
> > + *
> > + * Authors:
> > + *         Chunqiang Tang <ctang@us.ibm.com>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.
> > + * See the COPYING file in the top-level directory.
> 
> Can you make this GPLv2-or-later to avoid future hassles?

Will do.

> > +#ifndef TRUE
> > +# define TRUE 1
> > +#endif
> > +
> > +#ifndef FALSE
> > +# define FALSE 0
> > +#endif
> 
> I don't think these two belong here.
> 
> stdbool.h defines true and false with identical values.
> http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/stdbool.h.html
> 
> Not sure about TRUE and FALSE.
> If we want them as local definitions, they should rather go to
> qemu-common.h than to individual source files.

You are right. "true" and "false" should be used instead, and 
qemu-common.h already includes stdbool.h.
diff mbox

Patch

diff --git a/Makefile.objs b/Makefile.objs
index c3e52c5..ce5cc8d 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -23,6 +23,7 @@  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-nested-y += blksim.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
diff --git a/block/blksim.c b/block/blksim.c
new file mode 100644
index 0000000..a92ba11
--- /dev/null
+++ b/block/blksim.c
@@ -0,0 +1,752 @@ 
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this module implements a simulated block device
+ *  driver "blksim". It works with qemu-io and qemu-test to perform testing,
+ *  allowing changing the  order of disk I/O and callback activities to test
+ *  rare race conditions. See qemu-test.c, qemu-io.c, and qemu-io-sim.c.
+ *============================================================================*/
+
+#include <sys/vfs.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <execinfo.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include "block_int.h"
+#include "osdep.h"
+#include "qemu-option.h"
+#include "qemu-timer.h"
+#include "block.h"
+#include "qemu-queue.h"
+#include "qemu-common.h"
+#include "block/blksim.h"
+
+/* Local boolean constants, defined only when no earlier header did so. */
+#ifndef TRUE
+# define TRUE 1
+#endif
+
+#ifndef FALSE
+# define FALSE 0
+#endif
+
+/* Debug tracing: change "#if 0" to "#if 1" to map QDEBUG() onto printf().
+ * In the default configuration QDEBUG() expands to an empty statement. */
+#if 0
+# define QDEBUG printf
+#else
+# define QDEBUG(format,...) do {} while (0)
+#endif
+
+/* Kind of a task queued in the simulator's task list. */
+typedef enum {
+    SIM_NULL,               /* op of the list sentinel 'head' */
+    SIM_READ,               /* pending disk read */
+    SIM_WRITE,              /* pending disk write */
+    SIM_FLUSH,              /* pending flush */
+    SIM_READ_CALLBACK,      /* read executed; completion callback pending */
+    SIM_WRITE_CALLBACK,     /* write executed; completion callback pending */
+    SIM_FLUSH_CALLBACK,     /* flush executed; completion callback pending */
+    SIM_TIMER               /* timer created by blksim_new_timer() */
+} sim_op_t;
+
+/* Forward declaration; referenced by sim_aio_pool below. */
+static void sim_aio_cancel (BlockDriverAIOCB * acb);
+/* Next uuid handed out by insert_in_list(). */
+static int64_t sim_uuid = 0;
+/* Simulated clock; advanced by run_task_by_acb() to the executed task's time.*/
+static int64_t current_time = 0;
+/* Exclusive upper bound of the random delay added by insert_in_list();
+ * a value <= 0 disables randomization. Set by init_blksim(). */
+static int64_t rand_time = 0;
+/* When non-zero, print a line each time a task is added. */
+static int interactive_print = TRUE;
+/* Set to TRUE by blksim_open(); reported by using_blksim(). */
+static int blksim_invoked = FALSE;
+/* When non-zero, blksim_bh_schedule() queues the task with time -1 instead
+ * of current_time. */
+static int instant_qemubh = TRUE;
+struct SimAIOCB;
+
+/*
+ * Note: disk_io_return_code, blksim_set_disk_io_return_code(), and
+ * insert_task() work together to ensure that multiple subrequests triggered
+ * by the same outermost request either succeed together or fail together.
+ * This behavior is required by qemu-test.  Here is one example of the
+ * problems caused by departing from this behavior.  Consider a write request
+ * that generates two subrequests, w1 and w2. If w1 succeeds but w2 fails, the
+ * data will not be written into qemu-test's "truth image" but the part of the
+ * data handled by w1 will be written into qemu-test's "test image". As a
+ * result, their contents diverge and automated testing cannot continue.
+ */
+static int disk_io_return_code = 0;
+
+/* Per-device driver state (BlockDriverState.opaque). */
+typedef struct BDRVSimState {
+    int fd;     /* descriptor of the image file opened by blksim_open() */
+} BDRVSimState;
+
+/* One simulated task: a disk request, its completion callback, or a timer.
+ * Tasks are kept in a circular doubly linked list anchored at 'head'. */
+typedef struct SimAIOCB {
+    BlockDriverAIOCB common;    /* generic AIOCB part: bs, cb, opaque */
+    int64_t uuid;               /* id assigned by insert_in_list() */
+    sim_op_t op;                /* what to do when the task is run */
+    int64_t sector_num;         /* first sector of a read/write request */
+    QEMUIOVector *qiov;         /* data buffers of a read/write request */
+    int nb_sectors;             /* number of sectors of a read/write request */
+    int ret;                    /* value passed to the completion callback */
+    int64_t time;               /* simulated time used to order the list */
+    struct SimAIOCB *next;      /* next task in the list */
+    struct SimAIOCB *prev;      /* previous task; NULL means "not in list" */
+
+} SimAIOCB;
+
+/* Pool from which all SimAIOCB objects (requests and timers) are obtained. */
+static AIOPool sim_aio_pool = {
+    .aiocb_size = sizeof (SimAIOCB),
+    .cancel = sim_aio_cancel,
+};
+
+/* Sentinel of the circular task list. An empty list has head.next == &head.
+ * Its time is the largest int64_t value, so the position scan in
+ * insert_in_list() stops at the sentinel for any smaller task time. */
+static SimAIOCB head = {
+    .uuid = -1,
+    .time = (int64_t) (9223372036854775807ULL),
+    .op = SIM_NULL,
+    .next = &head,
+    .prev = &head,
+};
+
+/* Debug a specific task. With "#if 1" CHECK_TASK() is a no-op; switch to
+ * "#if 0" to get a function that prints a message for one hard-coded uuid,
+ * which gives a convenient place for a debugger breakpoint. */
+#if 1
+# define CHECK_TASK(acb) do { } while (0)
+#else
+static inline void CHECK_TASK (int64_t uuid)
+{
+    if (uuid == 19LL) {
+        printf ("CHECK_TASK pause for task %" PRId64 "\n", uuid);
+    }
+}
+#endif
+
+/*
+ * Synchronously transfer nb_sectors 512-byte sectors, starting at sector
+ * sector_num, between buf and the image file.
+ *
+ * do_read selects the direction: non-zero reads into buf, zero writes buf.
+ * Short reads/writes and EINTR are retried until the request is complete.
+ *
+ * do_io() should never fail. A failure indicates a bug in the upper layer
+ * block device driver, or failure in the real hardware. On failure a message
+ * is printed and the process pauses on stdin so a debugger can be attached;
+ * afterwards a negative errno value is returned (-EIO for a read that hits
+ * end of file). Returns 0 on success.
+ */
+static int do_io(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
+                 int nb_sectors, int do_read)
+{
+    BDRVSimState *s = bs->opaque;
+    /* Widen before multiplying so that the byte count is not computed in
+     * int arithmetic. */
+    size_t size = (size_t) nb_sectors * 512;
+    ssize_t ret;
+    int saved_errno;
+
+    if (lseek(s->fd, sector_num * 512, SEEK_SET) < 0) {
+        /* Save errno first: fprintf()/fgetc() may overwrite it. */
+        saved_errno = errno;
+        fprintf(stderr, "Error: lseek %s sector_num=%" PRId64 ". "
+                "Pause process %d for debugging...\n",
+                bs->filename, sector_num, getpid());
+        fgetc(stdin);
+        /* Do not transfer data at whatever offset the file happens to be. */
+        return -saved_errno;
+    }
+
+    while (size > 0) {
+        if (do_read) {
+            ret = read(s->fd, buf, size);
+            if (ret == 0) {
+                fprintf(stderr,
+                        "Error: read beyond the size of %s sector_num=%" PRId64
+                        " nb_sectors=%d. Pause process %d for debugging...\n",
+                        bs->filename, sector_num, nb_sectors, getpid());
+                fgetc(stdin);
+                /* End of file never makes progress; retrying would loop
+                 * forever. */
+                return -EIO;
+            }
+        } else {
+            ret = write(s->fd, buf, size);
+        }
+
+        if (ret >= 0) {
+            size -= ret;
+            buf += ret;
+        } else if (errno != EINTR) {
+            saved_errno = errno;
+            fprintf(stderr, "Error: %s %s sector_num=%" PRId64
+                    " nb_sectors=%d. Pause process %d for debugging...\n",
+                    do_read ? "READ" : "WRITE", bs->filename, sector_num,
+                    nb_sectors, getpid());
+            fgetc(stdin);
+            return -saved_errno;
+        }
+    }
+
+    return 0;
+}
+
+/* Synchronous read entry point (.bdrv_read): reads nb_sectors sectors at
+ * sector_num into buf via do_io(). Returns do_io()'s result. */
+static int blksim_read (BlockDriverState * bs, int64_t sector_num, 
+                        uint8_t * buf, int nb_sectors)
+{
+    return do_io (bs, sector_num, buf, nb_sectors, TRUE);
+}
+
+/* Synchronous write entry point (.bdrv_write): writes nb_sectors sectors
+ * from buf at sector_num via do_io(). Returns do_io()'s result. */
+static int blksim_write (BlockDriverState * bs, int64_t sector_num,
+                      const uint8_t * buf, int nb_sectors)
+{
+    /* do_io() takes a non-const buffer because it also serves reads. */
+    return do_io (bs, sector_num, (uint8_t *) buf, nb_sectors, FALSE);
+}
+
+/* Link 'node' into the circular task list immediately in front of 'pos'. */
+static inline void sim_link_before(SimAIOCB *node, SimAIOCB *pos)
+{
+    node->next = pos;
+    node->prev = pos->prev;
+    pos->prev->next = node;
+    pos->prev = node;
+}
+
+/*
+ * Give acb a fresh uuid and queue it in the task list.
+ *
+ * Without delay randomization (rand_time <= 0) the task's time is reset to 0
+ * and it is appended at the tail. Otherwise a task with a non-negative time
+ * gets a random delay and is placed so that the list stays sorted by
+ * ascending time, with ties broken randomly; a task with a negative time is
+ * placed at the very front.
+ */
+static void insert_in_list(SimAIOCB *acb)
+{
+    SimAIOCB *pos;
+
+    acb->uuid = sim_uuid++;
+    CHECK_TASK(acb->uuid);
+
+    if (rand_time <= 0) {
+        /* Working with qemu-io.c and not doing delay randomization:
+         * plain FIFO order. */
+        acb->time = 0;
+        sim_link_before(acb, &head);
+        return;
+    }
+
+    pos = head.next;
+    if (acb->time >= 0) {
+        /* Introduce a random delay to better trigger rare race conditions. */
+        acb->time += random() % rand_time;
+
+        /* Skip every task that is due earlier; among tasks due at the same
+         * time, flip a coin for each one to decide whether to go past it. */
+        while (pos->time < acb->time
+               || (pos->time == acb->time && random() % 2 != 0)) {
+            pos = pos->next;
+        }
+    }
+
+    sim_link_before(acb, pos);
+}
+
+/* Debug problems related to reusing task objects. Problem already solved.
+ * With "#if 1" the my_qemu_aio_*() names are plain aliases of the QEMU
+ * functions. The "#else" branch is a debugging variant that checks that an
+ * object handed out by the pool is not still linked in the task list. */
+#if 1
+# define my_qemu_aio_get qemu_aio_get
+# define my_qemu_aio_release qemu_aio_release
+
+#else
+/* Return acb if it is currently linked in the task list, NULL otherwise. */
+static SimAIOCB *search_task_list (SimAIOCB * acb)
+{
+    SimAIOCB *p;
+    for (p = head.next; p != &head; p = p->next) {
+        if (p == acb) {
+            return p;
+        }
+    }
+
+    return NULL;
+}
+
+/* Debug wrapper of qemu_aio_get(). NOTE(review): the 'pool' argument is
+ * ignored and sim_aio_pool is always used. */
+static inline void *my_qemu_aio_get (AIOPool * pool, BlockDriverState * bs,
+                                     BlockDriverCompletionFunc * cb,
+                                     void *opaque)
+{
+    SimAIOCB *acb = (SimAIOCB *) qemu_aio_get (&sim_aio_pool, bs, cb, opaque);
+    QDEBUG ("SIM: qemu_aio_get reuse old task%" PRId64 "\n", acb->uuid);
+    ASSERT (!search_task_list (acb));
+    return acb;
+}
+
+/* Debug wrapper of qemu_aio_release() that traces the released task. */
+static inline void my_qemu_aio_release (SimAIOCB * acb)
+{
+    QDEBUG ("SIM: qemu_aio_release task%" PRId64 "\n", acb->uuid);
+    qemu_aio_release (acb);
+}
+#endif
+
+/*
+ * Create a task for a new disk request and queue it in the task list.
+ *
+ * op is SIM_READ, SIM_WRITE or SIM_FLUSH. sector_num, qiov and nb_sectors
+ * describe a read/write request (a flush passes 0/NULL/0). cb and opaque are
+ * the completion callback and its argument.
+ *
+ * The task's return code is taken from disk_io_return_code at this point, so
+ * all subrequests created while one value is set complete the same way.
+ *
+ * Returns the generic AIOCB of the new task, or NULL if none could be
+ * obtained. Exits the process if interactive printing is on and op is not
+ * one of the three request kinds.
+ */
+static BlockDriverAIOCB *insert_task(int op, BlockDriverState *bs,
+                                     int64_t sector_num, QEMUIOVector *qiov,
+                                     int nb_sectors,
+                                     BlockDriverCompletionFunc *cb,
+                                     void *opaque)
+{
+    SimAIOCB *acb = my_qemu_aio_get(&sim_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+
+    acb->op = op;
+    acb->sector_num = sector_num;
+    acb->qiov = qiov;
+    acb->nb_sectors = nb_sectors;
+    acb->ret = disk_io_return_code;
+    acb->time = current_time;
+    insert_in_list(acb);
+
+    if (interactive_print) {
+        switch (op) {
+        case SIM_READ:
+            printf("Added READ uuid=%" PRId64 "  filename=%s  sector_num=%"
+                   PRId64 "  nb_sectors=%d\n", acb->uuid,
+                   acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+            break;
+        case SIM_WRITE:
+            printf("Added WRITE uuid=%" PRId64 "  filename=%s  sector_num=%"
+                   PRId64 "  nb_sectors=%d\n", acb->uuid,
+                   acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+            break;
+        case SIM_FLUSH:
+            /* blksim_aio_flush() queues SIM_FLUSH tasks through this
+             * function, so a flush must not be treated as an unknown op. */
+            printf("Added FLUSH uuid=%" PRId64 "  filename=%s\n",
+                   acb->uuid, acb->common.bs->filename);
+            break;
+        default:
+            fprintf(stderr, "Unknown op %d\n", op);
+            exit(1);
+        }
+    }
+
+    return &acb->common;
+}
+
+/*
+ * Re-queue a request task whose disk operation has just been executed, so
+ * that its completion callback runs as a separate, later task.
+ *
+ * The task is stamped with the current simulated time, queued again (which
+ * gives it a new uuid), and its op is switched from SIM_READ / SIM_WRITE /
+ * SIM_FLUSH to the matching *_CALLBACK op. Any other op terminates the
+ * process.
+ */
+static void insert_aio_callback(SimAIOCB *acb)
+{
+    acb->time = current_time;
+    insert_in_list(acb);
+
+    switch (acb->op) {
+    case SIM_FLUSH:
+        acb->op = SIM_FLUSH_CALLBACK;
+        if (interactive_print) {
+            printf("Added FLUSH_CALLBACK uuid=%" PRId64 "  filename=%s\n",
+                   acb->uuid, acb->common.bs->filename);
+        }
+        break;
+
+    case SIM_READ:
+        acb->op = SIM_READ_CALLBACK;
+        if (interactive_print) {
+            printf("Added READ_CALLBACK uuid=%" PRId64
+                   "  filename=%s  sector_num=%" PRId64 "  nb_sectors=%d\n",
+                   acb->uuid, acb->common.bs->filename, acb->sector_num,
+                   acb->nb_sectors);
+        }
+        break;
+
+    case SIM_WRITE:
+        acb->op = SIM_WRITE_CALLBACK;
+        if (interactive_print) {
+            printf("Added WRITE_CALLBACK uuid=%" PRId64
+                   "  filename=%s  sector_num=%" PRId64 "  nb_sectors=%d\n",
+                   acb->uuid, acb->common.bs->filename, acb->sector_num,
+                   acb->nb_sectors);
+        }
+        break;
+
+    default:
+        fprintf(stderr, "Wrong op %d\n", acb->op);
+        exit(1);
+    }
+}
+
+/*
+ * Print every task currently queued in the task list, one line per task, in
+ * list order.
+ *
+ * All task kinds that can be queued are handled: disk requests, their
+ * completion callbacks, and timers. Timers are created without a block
+ * device, so no file name is printed for them. A task with any other op
+ * indicates a corrupted list and terminates the process.
+ */
+void blksim_list_tasks(void)
+{
+    SimAIOCB *acb;
+
+    for (acb = head.next; acb != &head; acb = acb->next) {
+        switch (acb->op) {
+        case SIM_READ:
+            printf("uuid=%" PRId64 "  READ           file=%s  sector_num=%"
+                   PRId64 "  nb_sectors=%d\n", acb->uuid,
+                   acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+            break;
+        case SIM_WRITE:
+            printf("uuid=%" PRId64 "  WRITE          file=%s  sector_num=%"
+                   PRId64 "  nb_sectors=%d\n", acb->uuid,
+                   acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+            break;
+        case SIM_FLUSH:
+            printf("uuid=%" PRId64 "  FLUSH          file=%s\n", acb->uuid,
+                   acb->common.bs->filename);
+            break;
+        case SIM_READ_CALLBACK:
+            printf("uuid=%" PRId64 "  CALLBACK READ  file=%s  sector_num=%"
+                   PRId64 "  nb_sectors=%d\n", acb->uuid,
+                   acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+            break;
+        case SIM_WRITE_CALLBACK:
+            printf("uuid=%" PRId64 "  CALLBACK WRITE file=%s  sector_num=%"
+                   PRId64 "  nb_sectors=%d\n", acb->uuid,
+                   acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+            break;
+        case SIM_FLUSH_CALLBACK:
+            printf("uuid=%" PRId64 "  CALLBACK FLUSH file=%s\n", acb->uuid,
+                   acb->common.bs->filename);
+            break;
+        case SIM_TIMER:
+            /* common.bs is NULL for timers; do not dereference it. */
+            printf("uuid=%" PRId64 "  TIMER          time=%" PRId64 "\n",
+                   acb->uuid, acb->time);
+            break;
+        default:
+            fprintf(stderr, "Wrong OP %d\n", acb->op);
+            exit(1);
+        }
+    }
+}
+
+/* Invoke the task's completion callback with its stored return code. */
+static inline void sim_callback (SimAIOCB * acb)
+{
+    acb->common.cb (acb->common.opaque, acb->ret);
+}
+
+/* Return the current simulated time. */
+int64_t blksim_get_time (void)
+{
+    return current_time;
+}
+
+/*
+ * Create a simulated timer whose expiry calls cb(opaque).
+ *
+ * The timer is not queued until blksim_mod_timer() (or blksim_bh_schedule())
+ * is called on it. Returns an opaque handle for the other blksim_*_timer()
+ * functions, or NULL if no task object could be obtained.
+ */
+void *blksim_new_timer(void *cb, void *opaque)
+{
+    SimAIOCB *acb = my_qemu_aio_get(&sim_aio_pool, NULL, cb, opaque);
+    if (!acb) {
+        /* Same handling as insert_task(). */
+        return NULL;
+    }
+
+    acb->op = SIM_TIMER;
+    /* prev == NULL marks the timer as "not in the task list". */
+    acb->prev = NULL;
+    acb->next = NULL;
+    return acb;
+}
+
+/*
+ * (Re)arm the timer ts so that it is queued with time expire_time.
+ *
+ * A timer that is already queued is unlinked first. Queuing goes through
+ * insert_in_list(), so the timer gets a new uuid on every call.
+ */
+void blksim_mod_timer(void *ts, int64_t expire_time)
+{
+    SimAIOCB *timer = ts;
+    int queued = (timer->prev != NULL);
+
+    if (queued) {
+        /* Take it out of its current position. */
+        timer->next->prev = timer->prev;
+        timer->prev->next = timer->next;
+    }
+
+    timer->time = expire_time;
+    insert_in_list(timer);
+
+    if (!interactive_print) {
+        return;
+    }
+    printf("Added TIMER uuid=%" PRId64 "  expire_time=%"PRId64
+           " current_time=%"PRId64"\n",
+           timer->uuid, expire_time, current_time);
+}
+
+/*
+ * Destroy the timer ts and return its object to the pool.
+ *
+ * A timer that is still queued is unlinked first; otherwise the task list
+ * would keep pointing at a released object.
+ */
+void blksim_free_timer(void *ts)
+{
+    SimAIOCB *acb = ts;
+
+    CHECK_TASK(acb->uuid);
+    if (acb->prev) {
+        acb->next->prev = acb->prev;
+        acb->prev->next = acb->next;
+        acb->prev = NULL;
+    }
+    my_qemu_aio_release(acb);
+}
+
+/* Disarm the timer ts: unlink it from the task list if it is queued.
+ * The timer object stays valid and can be armed again. */
+void blksim_del_timer(void *ts)
+{
+    SimAIOCB *timer = ts;
+
+    CHECK_TASK(timer->uuid);
+
+    if (timer->prev == NULL) {
+        /* Not queued; nothing to do. */
+        return;
+    }
+
+    timer->next->prev = timer->prev;
+    timer->prev->next = timer->next;
+    timer->prev = NULL;     /* Mark it as not in list. */
+}
+
+/* Schedule the simulated bottom half bh. It is queued as a timer with time
+ * -1 when instant_qemubh is set, and with the current simulated time
+ * otherwise. */
+void blksim_bh_schedule(void *bh)
+{
+    int64_t when = instant_qemubh ? -1 : current_time;
+
+    blksim_mod_timer(bh, when);
+}
+
+/* Select how blksim_bh_schedule() queues bottom halves: non-zero queues
+ * them with time -1, zero queues them with the current simulated time. */
+void blksim_set_instant_qemubh (int instant)
+{
+    instant_qemubh = instant;
+}
+
+/* Set the return code that insert_task() stores in every request created
+ * from now on. A non-zero value makes run_task_by_acb() skip the real disk
+ * access for those requests. */
+void blksim_set_disk_io_return_code (int ret)
+{
+    disk_io_return_code = ret;
+}
+
+/* Print a failed simulated disk access and terminate the process.
+ * 'action' is "reading" or "writing". */
+static void sim_io_failed(const char *action, SimAIOCB *acb)
+{
+    /* sector_num is int64_t, so it must be printed with PRId64. */
+    fprintf(stderr, "Error in %s %s sector_num=%" PRId64 " nb_sectors=%d\n",
+            action, acb->common.bs->filename, acb->sector_num,
+            acb->nb_sectors);
+    exit(1);
+}
+
+/* Carry out the disk read of a SIM_READ task, filling acb->qiov. */
+static void sim_execute_read(SimAIOCB *acb)
+{
+    BlockDriverState *bs = acb->common.bs;
+    uint8_t *buf;
+
+    if (acb->qiov->niov == 1) {
+        /* Single element: read straight into the caller's buffer. */
+        if (blksim_read(bs, acb->sector_num, acb->qiov->iov->iov_base,
+                        acb->nb_sectors) != 0) {
+            sim_io_failed("reading", acb);
+        }
+        return;
+    }
+
+    /* Several elements: read into a bounce buffer, then scatter it. */
+    buf = qemu_blockalign(bs, acb->qiov->size);
+    if (blksim_read(bs, acb->sector_num, buf, acb->nb_sectors) != 0) {
+        sim_io_failed("reading", acb);
+    }
+    qemu_iovec_from_buffer(acb->qiov, buf, acb->qiov->size);
+    qemu_vfree(buf);
+}
+
+/* Carry out the disk write of a SIM_WRITE task from acb->qiov. */
+static void sim_execute_write(SimAIOCB *acb)
+{
+    BlockDriverState *bs = acb->common.bs;
+    uint8_t *buf;
+
+    if (acb->qiov->niov == 1) {
+        /* Single element: write straight from the caller's buffer. */
+        if (blksim_write(bs, acb->sector_num, acb->qiov->iov->iov_base,
+                         acb->nb_sectors) != 0) {
+            sim_io_failed("writing", acb);
+        }
+        return;
+    }
+
+    /* Several elements: gather them into a bounce buffer and write that. */
+    buf = qemu_blockalign(bs, acb->qiov->size);
+    qemu_iovec_to_buffer(acb->qiov, buf);
+    if (blksim_write(bs, acb->sector_num, buf, acb->nb_sectors) != 0) {
+        sim_io_failed("writing", acb);
+    }
+    qemu_vfree(buf);
+}
+
+/*
+ * Execute one queued task.
+ *
+ * The task is unlinked from the task list and the simulated clock is
+ * advanced to the task's time if that is later. Then, depending on the op:
+ *  - SIM_TIMER: the timer callback is invoked;
+ *  - SIM_READ / SIM_WRITE: the disk access is performed (only if the task's
+ *    return code is 0) and the task is re-queued as a completion callback;
+ *  - SIM_FLUSH: nothing is flushed; the task is re-queued as a callback;
+ *  - *_CALLBACK: the completion callback is invoked and the task released.
+ * A failed disk access or an unknown op terminates the process.
+ */
+static void run_task_by_acb(SimAIOCB *acb)
+{
+    CHECK_TASK(acb->uuid);
+
+    /* Remove it from the list. */
+    acb->next->prev = acb->prev;
+    acb->prev->next = acb->next;
+    acb->prev = NULL;        /* Indicate that it is no longer in the list. */
+
+    if (acb->time > current_time) {
+        current_time = acb->time;
+    }
+
+    switch (acb->op) {
+    case SIM_TIMER:
+        QDEBUG("SIM: execute task%" PRId64 " time=%" PRId64 " TIMER \n",
+               acb->uuid, acb->time);
+        /* Timers store a QEMUTimerCB in the generic callback slot. The
+         * callback owns the timer; acb is not touched afterwards. */
+        ((QEMUTimerCB *) acb->common.cb)(acb->common.opaque);
+        break;
+
+    case SIM_READ:
+        QDEBUG("SIM: execute task%" PRId64 " time=%" PRId64
+               " READ %s sector_num=%" PRId64 " nb_sectors=%d\n",
+               acb->uuid, acb->time, acb->common.bs->filename,
+               acb->sector_num, acb->nb_sectors);
+        if (acb->ret == 0) {
+            sim_execute_read(acb);
+        }
+        insert_aio_callback(acb);
+        break;
+
+    case SIM_WRITE:
+        QDEBUG("SIM: execute task%" PRId64 " time=%" PRId64
+               " WRITE %s sector_num=%" PRId64 " nb_sectors=%d\n",
+               acb->uuid, acb->time, acb->common.bs->filename,
+               acb->sector_num, acb->nb_sectors);
+        if (acb->ret == 0) {
+            sim_execute_write(acb);
+        }
+        insert_aio_callback(acb);
+        break;
+
+    case SIM_FLUSH:
+        QDEBUG("SIM: execute task%" PRId64 " time=%" PRId64 " FLUSH %s\n",
+               acb->uuid, acb->time, acb->common.bs->filename);
+        /* Real flushing is skipped to speed up simulation. */
+        insert_aio_callback(acb);
+        break;
+
+    case SIM_READ_CALLBACK:
+    case SIM_WRITE_CALLBACK:
+    case SIM_FLUSH_CALLBACK:
+        QDEBUG("SIM: execute task%" PRId64 " time=%" PRId64 " CALLBACK\n",
+               acb->uuid, acb->time);
+        sim_callback(acb);
+        CHECK_TASK(acb->uuid);
+        my_qemu_aio_release(acb);
+        break;
+
+    default:
+        fprintf(stderr, "Unknown op %d\n", acb->op);
+        exit(1);
+    }
+}
+
+/* Execute the queued task whose id is uuid.
+ * Returns 0 if the task was found and run, -1 if no such task is queued. */
+int blksim_run_task_by_uuid(int64_t uuid)
+{
+    SimAIOCB *task = head.next;
+
+    /* Walk the list until the task is found or the sentinel is reached. */
+    while (task != &head && task->uuid != uuid) {
+        task = task->next;
+    }
+
+    if (task == &head) {
+        return -1;
+    }
+
+    run_task_by_acb(task);
+    return 0;
+}
+
+/* Execute tasks from the front of the task list until the list is empty,
+ * including tasks that get queued while running.
+ * Returns the number of tasks executed. */
+int blksim_run_all_tasks(void)
+{
+    int executed = 0;
+
+    while (head.next != &head) {
+        run_task_by_acb(head.next);
+        executed++;
+    }
+
+    return executed;
+}
+
+/* Asynchronous read entry point (.bdrv_aio_readv): queues a SIM_READ task.
+ * No disk access happens until the task is run. */
+static BlockDriverAIOCB *blksim_aio_readv (BlockDriverState * bs,
+                                        int64_t sector_num,
+                                        QEMUIOVector * qiov,
+                                        int nb_sectors,
+                                        BlockDriverCompletionFunc * cb,
+                                        void *opaque)
+{
+    return insert_task (SIM_READ, bs, sector_num, qiov, nb_sectors, cb, opaque);
+}
+
+/* Asynchronous write entry point (.bdrv_aio_writev): queues a SIM_WRITE
+ * task. No disk access happens until the task is run. */
+static BlockDriverAIOCB *blksim_aio_writev (BlockDriverState * bs,
+                                         int64_t sector_num,
+                                         QEMUIOVector * qiov,
+                                         int nb_sectors,
+                                         BlockDriverCompletionFunc * cb,
+                                         void *opaque)
+{
+    return insert_task (SIM_WRITE, bs, sector_num, qiov, nb_sectors, cb,
+                        opaque);
+}
+
+/* Asynchronous flush entry point (.bdrv_aio_flush): queues a SIM_FLUSH task
+ * with no sector range and no I/O vector. */
+static BlockDriverAIOCB *blksim_aio_flush (BlockDriverState * bs,
+                                        BlockDriverCompletionFunc * cb,
+                                        void *opaque)
+{
+    return insert_task (SIM_FLUSH, bs, 0, NULL, 0, cb, opaque);
+}
+
+/*
+ * Cancel callback of sim_aio_pool: unlink the task from the task list and
+ * release it. Its completion callback is not invoked.
+ *
+ * Cancelling a task that is not queued is treated as a bug in the caller:
+ * a message is printed, the process waits on stdin so a debugger can be
+ * attached, and then exits.
+ */
+static void sim_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    SimAIOCB *task = container_of(blockacb, SimAIOCB, common);
+
+    CHECK_TASK(task->uuid);
+    QDEBUG("SIM: cancel task%" PRId64 "\n", task->uuid);
+
+    if (task->prev == NULL) {
+        fprintf(stderr, "Error: cancel a blksim task that does not exist: "
+                "uuid=%"PRId64". Halt process %d for debugging...\n",
+                task->uuid, getpid());
+        fgetc(stdin);
+        exit(1);
+    }
+
+    task->next->prev = task->prev;
+    task->prev->next = task->next;
+    task->prev = NULL;
+    my_qemu_aio_release(task);
+}
+
+/*
+ * Open entry point (.bdrv_file_open): open the raw image file behind a
+ * blksim device.
+ *
+ * filename may carry a "blksim:" prefix, which is skipped. bdrv_flags
+ * selects read-only vs. read-write access and the caching mode.
+ * On success the device size is derived from the file size (0 if it cannot
+ * be determined) and the device is marked growable.
+ *
+ * Returns 0 on success or a negative errno value if the file cannot be
+ * opened.
+ */
+static int blksim_open(BlockDriverState *bs, const char *filename,
+                       int bdrv_flags)
+{
+    BDRVSimState *s = bs->opaque;
+    int open_flags = O_BINARY | O_LARGEFILE;
+    int64_t len;
+
+    /* Remember that blksim is in use; see using_blksim(). */
+    blksim_invoked = TRUE;
+
+    if ((bdrv_flags & BDRV_O_RDWR)) {
+        open_flags |= O_RDWR;
+    } else {
+        open_flags |= O_RDONLY;
+    }
+
+    if ((bdrv_flags & BDRV_O_NOCACHE)) {
+        open_flags |= O_DIRECT;
+    } else if (!(bdrv_flags & BDRV_O_CACHE_WB)) {
+        open_flags |= O_DSYNC;
+    }
+
+    /* Parse the "blksim:" prefix */
+    if (!strncmp(filename, "blksim:", strlen("blksim:"))) {
+        filename += strlen("blksim:");
+    }
+
+    s->fd = open(filename, open_flags);
+    if (s->fd < 0) {
+        /* Report the real cause instead of a bare -1. */
+        return -errno;
+    }
+
+    len = lseek(s->fd, 0, SEEK_END);
+    if (len >= 0) {
+        bs->total_sectors = len / 512;
+    } else {
+        bs->total_sectors = 0;
+    }
+
+    bs->growable = 1;
+    return 0;
+}
+
+/* Close entry point (.bdrv_close): close the image file descriptor. */
+static void blksim_close (BlockDriverState * bs)
+{
+    BDRVSimState *s = bs->opaque;
+    close (s->fd);
+}
+
+/* Synchronous flush entry point (.bdrv_flush): deliberately does nothing
+ * and always returns 0. */
+static int blksim_flush (BlockDriverState * bs)
+{
+    /*
+     * Skip real flushing to speed up simulation. A real implementation
+     * would be:
+     *     BDRVSimState *s = bs->opaque;
+     *     fdatasync (s->fd);
+     */
+    return 0;
+}
+
+/*
+ * .bdrv_has_zero_init: report whether a newly created image reads as zeros.
+ *
+ * Returns 0 if the image is a block or character device, 1 otherwise.
+ * Terminates the process if the file's status cannot be queried.
+ */
+static int blksim_has_zero_init(BlockDriverState *bs)
+{
+    BDRVSimState *s = bs->opaque;
+    struct stat buf;
+
+    /* Query the open descriptor rather than bs->filename: blksim_open()
+     * skips the "blksim:" prefix only in its local copy of the name, so the
+     * stored name may not be a valid path. */
+    if (fstat(s->fd, &buf) != 0) {
+        fprintf(stderr, "Failed to fstat() %s\n", bs->filename);
+        exit(1);
+    }
+
+    if (S_ISBLK(buf.st_mode) || S_ISCHR(buf.st_mode)) {
+        return 0;
+    }
+
+    return 1;
+}
+
+/*
+ * .bdrv_truncate: resize the image file to offset bytes.
+ * Returns 0 on success or a negative errno value on failure.
+ */
+static int blksim_truncate(BlockDriverState *bs, int64_t offset)
+{
+    BDRVSimState *s = bs->opaque;
+
+    if (ftruncate(s->fd, offset) < 0) {
+        /* ftruncate() itself only returns -1; pass on the real cause. */
+        return -errno;
+    }
+    return 0;
+}
+
+/* Driver description registered with the block layer. The same name is used
+ * as format name and as protocol name, i.e. as the "blksim:" file name
+ * prefix that blksim_open() strips. */
+static BlockDriver bdrv_blksim = {
+    .format_name = "blksim",
+    .protocol_name = "blksim",
+    .instance_size = sizeof (BDRVSimState),
+    .bdrv_file_open = blksim_open,
+    .bdrv_close = blksim_close,
+    .bdrv_flush = blksim_flush,
+    .bdrv_read = blksim_read,
+    .bdrv_write = blksim_write,
+    .bdrv_aio_readv = blksim_aio_readv,
+    .bdrv_aio_writev = blksim_aio_writev,
+    .bdrv_aio_flush = blksim_aio_flush,
+    .bdrv_has_zero_init = blksim_has_zero_init,
+    .bdrv_truncate = blksim_truncate,
+};
+
+/* Register the blksim driver; hooked in through block_init(). */
+static void bdrv_blksim_init(void)
+{
+    bdrv_register(&bdrv_blksim);
+}
+block_init(bdrv_blksim_init);
+
+/* Configure the simulator.
+ * print:      non-zero to print a line whenever a task is added.
+ * _rand_time: exclusive upper bound of the random delay added to each
+ *             queued task; a value <= 0 disables delay randomization. */
+void init_blksim (int print, int64_t _rand_time)
+{
+    interactive_print = print;
+    rand_time = _rand_time;
+}
+
+/*
+ * To work properly in the simulation mode, block device drivers that
+ * explicitly invoke qemu_aio_wait() should invoke blksim_qemu_aio_wait() if
+ * the block device is opened using blksim. Most block device drivers do not
+ * invoke qemu_aio_wait() and hence should not be concerned about this.
+ *
+ * Runs the task at the front of the task list, if there is one.
+ * Returns 1 if a task was run, 0 if the list was empty.
+ */
+int blksim_qemu_aio_wait(void)
+{
+    SimAIOCB *first = head.next;
+
+    if (first == &head) {
+        return 0;
+    }
+
+    run_task_by_acb(first);
+    return 1;
+}
+
+/* Return non-zero if at least one task is queued in the task list. */
+int blksim_has_task (void)
+{
+    return head.next != &head;
+}
+
+/* Return non-zero once blksim_open() has been called at least once. */
+int using_blksim (void)
+{
+    return blksim_invoked;
+}
diff --git a/block/blksim.h b/block/blksim.h
new file mode 100644
index 0000000..fa1e20d
--- /dev/null
+++ b/block/blksim.h
@@ -0,0 +1,35 @@ 
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this is the header of the simulated block device
+ *  driver "blksim".
+ *============================================================================*/
+
+/* Identifiers containing a double underscore are reserved for the C
+ * implementation, so the include guard uses a plain name. */
+#ifndef BLOCK_BLKSIM_H
+#define BLOCK_BLKSIM_H
+
+#include <stdint.h>     /* int64_t, so the header is self-contained */
+
+/* Configure the simulator: 'print' enables a message for every added task;
+ * '_rand_time' bounds the random delay added to tasks (<= 0 disables it). */
+void init_blksim (int print, int64_t _rand_time);
+
+/* Non-zero once a blksim device has been opened. */
+int using_blksim (void);
+
+/* Non-zero if at least one task is queued. */
+int blksim_has_task (void);
+
+/* Print all queued tasks. */
+void blksim_list_tasks (void);
+
+/* Run the queued task with the given uuid; 0 if found, -1 otherwise. */
+int blksim_run_task_by_uuid (int64_t uuid);
+
+/* Run tasks until none is left; returns the number of tasks run. */
+int blksim_run_all_tasks (void);
+
+/* Current simulated time. */
+int64_t blksim_get_time (void);
+
+/* Simulated timers: create, (re)arm, destroy, and disarm. 'ts' is the
+ * handle returned by blksim_new_timer(). */
+void *blksim_new_timer (void *cb, void *opaque);
+void blksim_mod_timer (void *ts, int64_t expire_time);
+void blksim_free_timer (void *ts);
+void blksim_del_timer (void *ts);
+
+/* Schedule a simulated bottom half (a handle from blksim_new_timer()). */
+void blksim_bh_schedule (void *bh);
+
+/* Set the return code given to subsequently created disk requests. */
+void blksim_set_disk_io_return_code (int ret);
+
+/* Replacement for qemu_aio_wait() in simulation mode: runs the first queued
+ * task; returns 1 if a task was run, 0 if none was queued. */
+int blksim_qemu_aio_wait(void);
+
+/* Choose whether bottom halves are queued with time -1 (non-zero) or with
+ * the current simulated time (zero). */
+void blksim_set_instant_qemubh (int instant /* TRUE or FALSE */);
+
+#endif /* BLOCK_BLKSIM_H */