Patchwork [09/19] Introduce event-tap.

login
register
mail settings
Submitter Yoshiaki Tamura
Date Jan. 19, 2011, 5:44 a.m.
Message ID <1295415904-11918-10-git-send-email-tamura.yoshiaki@lab.ntt.co.jp>
Download mbox | patch
Permalink /patch/79416/
State New
Headers show

Comments

Yoshiaki Tamura - Jan. 19, 2011, 5:44 a.m.
event-tap controls when to start an FT transaction, and provides proxy
functions to be called from net/block devices.  During an FT
transaction, it queues up net/block requests, and flushes them when the
transaction is completed.

Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
---
 Makefile.target |    1 +
 event-tap.c     |  847 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 event-tap.h     |   42 +++
 qemu-tool.c     |   24 ++
 trace-events    |    9 +
 5 files changed, 923 insertions(+), 0 deletions(-)
 create mode 100644 event-tap.c
 create mode 100644 event-tap.h
Kevin Wolf - Jan. 19, 2011, 9:38 a.m.
Am 19.01.2011 06:44, schrieb Yoshiaki Tamura:
> event-tap controls when to start an FT transaction, and provides proxy
> functions to be called from net/block devices.  During an FT
> transaction, it queues up net/block requests, and flushes them when the
> transaction is completed.
> 
> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>

One general comment: At first glance this seems to mix block and net
(and some other things) arbitrarily instead of having a section for
handling all block stuff, then network, etc.

Is there a specific reason for the order in which you put the functions?
If not, maybe reordering them might improve readability.

> ---
>  Makefile.target |    1 +
>  event-tap.c     |  847 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  event-tap.h     |   42 +++
>  qemu-tool.c     |   24 ++
>  trace-events    |    9 +
>  5 files changed, 923 insertions(+), 0 deletions(-)
>  create mode 100644 event-tap.c
>  create mode 100644 event-tap.h
> 
> diff --git a/Makefile.target b/Makefile.target
> index e15b1c4..f36cd75 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -199,6 +199,7 @@ obj-y += rwhandler.o
>  obj-$(CONFIG_KVM) += kvm.o kvm-all.o
>  obj-$(CONFIG_NO_KVM) += kvm-stub.o
>  LIBS+=-lz
> +obj-y += event-tap.o
>  
>  QEMU_CFLAGS += $(VNC_TLS_CFLAGS)
>  QEMU_CFLAGS += $(VNC_SASL_CFLAGS)
> diff --git a/event-tap.c b/event-tap.c
> new file mode 100644
> index 0000000..f492708
> --- /dev/null
> +++ b/event-tap.c

> @@ -0,0 +1,847 @@
> +/*
> + * Event Tap functions for QEMU
> + *
> + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu-common.h"
> +#include "qemu-error.h"
> +#include "block.h"
> +#include "block_int.h"
> +#include "ioport.h"
> +#include "osdep.h"
> +#include "sysemu.h"
> +#include "hw/hw.h"
> +#include "net.h"
> +#include "event-tap.h"
> +#include "trace.h"
> +
> +enum EVENT_TAP_STATE {
> +    EVENT_TAP_OFF,
> +    EVENT_TAP_ON,
> +    EVENT_TAP_FLUSH,
> +    EVENT_TAP_LOAD,
> +    EVENT_TAP_REPLAY,
> +};
> +
> +static enum EVENT_TAP_STATE event_tap_state = EVENT_TAP_OFF;
> +static BlockDriverAIOCB dummy_acb; /* we may need a pool for dummies */

Indeed, bdrv_aio_cancel will segfault this way.

If you use dummies instead of real ACBs, the only way to correctly
implement bdrv_aio_cancel is to wait for all in-flight AIOs
(qemu_aio_flush).

> +typedef struct EventTapIOport {
> +    uint32_t address;
> +    uint32_t data;
> +    int      index;
> +} EventTapIOport;
> +
> +#define MMIO_BUF_SIZE 8
> +
> +typedef struct EventTapMMIO {
> +    uint64_t address;
> +    uint8_t  buf[MMIO_BUF_SIZE];
> +    int      len;
> +} EventTapMMIO;
> +
> +typedef struct EventTapNetReq {
> +    char *device_name;
> +    int iovcnt;
> +    struct iovec *iov;
> +    int vlan_id;
> +    bool vlan_needed;
> +    bool async;
> +    NetPacketSent *sent_cb;
> +} EventTapNetReq;
> +
> +#define MAX_BLOCK_REQUEST 32
> +
> +typedef struct EventTapBlkReq {
> +    char *device_name;
> +    int num_reqs;
> +    int num_cbs;
> +    bool is_flush;
> +    BlockRequest reqs[MAX_BLOCK_REQUEST];
> +    BlockDriverCompletionFunc *cb[MAX_BLOCK_REQUEST];
> +    void *opaque[MAX_BLOCK_REQUEST];
> +} EventTapBlkReq;
> +
> +#define EVENT_TAP_IOPORT (1 << 0)
> +#define EVENT_TAP_MMIO   (1 << 1)
> +#define EVENT_TAP_NET    (1 << 2)
> +#define EVENT_TAP_BLK    (1 << 3)
> +
> +#define EVENT_TAP_TYPE_MASK (EVENT_TAP_NET - 1)
> +
> +typedef struct EventTapLog {
> +    int mode;
> +    union {
> +        EventTapIOport ioport;
> +        EventTapMMIO mmio;
> +    };
> +    union {
> +        EventTapNetReq net_req;
> +        EventTapBlkReq blk_req;
> +    };
> +    QTAILQ_ENTRY(EventTapLog) node;
> +} EventTapLog;
> +
> +static EventTapLog *last_event_tap;
> +
> +static QTAILQ_HEAD(, EventTapLog) event_list;
> +static QTAILQ_HEAD(, EventTapLog) event_pool;
> +
> +static int (*event_tap_cb)(void);
> +static QEMUBH *event_tap_bh;
> +static VMChangeStateEntry *vmstate;
> +
> +static void event_tap_bh_cb(void *p)
> +{
> +    if (event_tap_cb) {
> +        event_tap_cb();
> +    }
> +
> +    qemu_bh_delete(event_tap_bh);
> +    event_tap_bh = NULL;
> +}
> +
> +static void event_tap_schedule_bh(void)
> +{
> +    trace_event_tap_ignore_bh(!!event_tap_bh);
> +
> +    /* if bh is already set, we ignore it for now */
> +    if (event_tap_bh) {
> +        return;
> +    }
> +
> +    event_tap_bh = qemu_bh_new(event_tap_bh_cb, NULL);
> +    qemu_bh_schedule(event_tap_bh);
> +
> +    return ;
> +}
> +
> +static void event_tap_alloc_net_req(EventTapNetReq *net_req,
> +                                   VLANClientState *vc,
> +                                   const struct iovec *iov, int iovcnt,
> +                                   NetPacketSent *sent_cb, bool async)
> +{
> +    int i;
> +
> +    net_req->iovcnt = iovcnt;
> +    net_req->async = async;
> +    net_req->device_name = qemu_strdup(vc->name);
> +    net_req->sent_cb = sent_cb;
> +
> +    if (vc->vlan) {
> +        net_req->vlan_needed = 1;
> +        net_req->vlan_id = vc->vlan->id;
> +    } else {
> +        net_req->vlan_needed = 0;
> +    }
> +
> +    if (async) {
> +        net_req->iov = (struct iovec *)iov;
> +    } else {
> +        net_req->iov = qemu_malloc(sizeof(struct iovec) * iovcnt);
> +        for (i = 0; i < iovcnt; i++) {
> +            net_req->iov[i].iov_base = qemu_malloc(iov[i].iov_len);
> +            memcpy(net_req->iov[i].iov_base, iov[i].iov_base, iov[i].iov_len);
> +            net_req->iov[i].iov_len = iov[i].iov_len;
> +        }
> +    }
> +}
> +
> +static void event_tap_alloc_blk_req(EventTapBlkReq *blk_req,
> +                                    BlockDriverState *bs, BlockRequest *reqs,
> +                                    int num_reqs, BlockDriverCompletionFunc *cb,
> +                                    void *opaque, bool is_flush)
> +{
> +    int i;
> +
> +    blk_req->num_reqs = num_reqs;
> +    blk_req->num_cbs = num_reqs;
> +    blk_req->device_name = qemu_strdup(bs->device_name);
> +    blk_req->is_flush = is_flush;
> +
> +    for (i = 0; i < num_reqs; i++) {
> +        blk_req->reqs[i].sector = reqs[i].sector;
> +        blk_req->reqs[i].nb_sectors = reqs[i].nb_sectors;
> +        blk_req->reqs[i].qiov = reqs[i].qiov;
> +        blk_req->reqs[i].cb = cb;
> +        blk_req->reqs[i].opaque = opaque;
> +        blk_req->cb[i] = reqs[i].cb;
> +        blk_req->opaque[i] = reqs[i].opaque;
> +    }
> +}
> +
> +static void *event_tap_alloc_log(void)
> +{
> +    EventTapLog *log;
> +
> +    if (QTAILQ_EMPTY(&event_pool)) {
> +        log = qemu_mallocz(sizeof(EventTapLog));
> +    } else {
> +        log = QTAILQ_FIRST(&event_pool);
> +        QTAILQ_REMOVE(&event_pool, log, node);
> +    }
> +
> +    return log;
> +}
> +
> +static void event_tap_free_log(EventTapLog *log)
> +{
> +    int i, mode = log->mode & ~EVENT_TAP_TYPE_MASK;
> +
> +    if (mode == EVENT_TAP_NET) {
> +        EventTapNetReq *net_req = &log->net_req;
> +
> +        if (!net_req->async) {
> +            for (i = 0; i < net_req->iovcnt; i++) {
> +                qemu_free(net_req->iov[i].iov_base);
> +            }
> +            qemu_free(net_req->iov);
> +        } else if (event_tap_state >= EVENT_TAP_LOAD) {
> +            qemu_free(net_req->iov);
> +        }
> +
> +        qemu_free(net_req->device_name);
> +    } else if (mode == EVENT_TAP_BLK) {
> +        EventTapBlkReq *blk_req = &log->blk_req;
> +
> +        if (event_tap_state >= EVENT_TAP_LOAD && !blk_req->is_flush) {
> +            for (i = 0; i < blk_req->num_reqs; i++) {
> +                qemu_iovec_destroy(blk_req->reqs[i].qiov);
> +                qemu_free(blk_req->reqs[i].qiov);
> +            }
> +        }
> +
> +        qemu_free(blk_req->device_name);
> +    }
> +
> +    log->mode = 0;
> +
> +    /* return the log to event_pool */
> +    QTAILQ_INSERT_HEAD(&event_pool, log, node);
> +}
> +
> +static void event_tap_free_pool(void)
> +{
> +    EventTapLog *log, *next;
> +
> +    QTAILQ_FOREACH_SAFE(log, &event_pool, node, next) {
> +        QTAILQ_REMOVE(&event_pool, log, node);
> +        qemu_free(log);
> +    }
> +}
> +
> +static void event_tap_blk_cb(void *opaque, int ret)
> +{
> +    EventTapLog *log = container_of(opaque, EventTapLog, blk_req);
> +    EventTapBlkReq *blk_req = opaque;
> +    int i;
> +
> +    blk_req->num_cbs--;
> +
> +    /* all outstanding requests are flushed */
> +    if (blk_req->num_cbs == 0) {
> +        for (i = 0; i < blk_req->num_reqs; i++) {
> +            blk_req->cb[i](blk_req->opaque[i], ret);
> +        }
> +
> +        event_tap_free_log(log);
> +    }
> +}
> +
> +static void event_tap_packet(VLANClientState *vc, const struct iovec *iov,
> +                            int iovcnt, NetPacketSent *sent_cb, bool async)
> +{
> +    int empty;
> +    EventTapLog *log = last_event_tap;
> +
> +    if (!log) {
> +        trace_event_tap_no_event();
> +        log = event_tap_alloc_log();
> +    }
> +
> +    if (log->mode & ~EVENT_TAP_TYPE_MASK) {
> +        trace_event_tap_already_used(log->mode & ~EVENT_TAP_TYPE_MASK);
> +        return;
> +    }
> +
> +    log->mode |= EVENT_TAP_NET;
> +    event_tap_alloc_net_req(&log->net_req, vc, iov, iovcnt, sent_cb, async);
> +
> +    empty = QTAILQ_EMPTY(&event_list);
> +    QTAILQ_INSERT_TAIL(&event_list, log, node);
> +    last_event_tap = NULL;
> +
> +    if (empty) {
> +        event_tap_schedule_bh();
> +    }
> +}
> +
> +static void event_tap_bdrv(BlockDriverState *bs, BlockRequest *reqs,
> +                           int num_reqs, bool is_flush)
> +{
> +    EventTapLog *log = last_event_tap;
> +    int empty;
> +
> +    if (!log) {
> +        trace_event_tap_no_event();
> +        log = event_tap_alloc_log();
> +    }
> +
> +    if (log->mode & ~EVENT_TAP_TYPE_MASK) {
> +        trace_event_tap_already_used(log->mode & ~EVENT_TAP_TYPE_MASK);
> +        return;
> +    }
> +
> +    log->mode |= EVENT_TAP_BLK;
> +    event_tap_alloc_blk_req(&log->blk_req, bs, reqs, num_reqs,
> +                            event_tap_blk_cb, &log->blk_req, is_flush);
> +
> +    empty = QTAILQ_EMPTY(&event_list);
> +    QTAILQ_INSERT_TAIL(&event_list, log, node);
> +    last_event_tap = NULL;
> +
> +    if (empty) {
> +        event_tap_schedule_bh();
> +    }
> +}
> +
> +BlockDriverAIOCB *event_tap_bdrv_aio_writev(BlockDriverState *bs,
> +                                            int64_t sector_num,
> +                                            QEMUIOVector *iov,
> +                                            int nb_sectors,
> +                                            BlockDriverCompletionFunc *cb,
> +                                            void *opaque)
> +{
> +    BlockRequest req;
> +
> +    assert(event_tap_state == EVENT_TAP_ON);
> +
> +    req.sector = sector_num;
> +    req.nb_sectors = nb_sectors;
> +    req.qiov = iov;
> +    req.cb = cb;
> +    req.opaque = opaque;
> +    event_tap_bdrv(bs, &req, 1, 0);
> +
> +    /* return a dummy_acb pointer to prevent from failing */
> +    return &dummy_acb;
> +}
> +
> +BlockDriverAIOCB *event_tap_bdrv_aio_flush(BlockDriverState *bs,
> +                                           BlockDriverCompletionFunc *cb,
> +                                           void *opaque)
> +{
> +    BlockRequest req;
> +
> +    assert(event_tap_state == EVENT_TAP_ON);
> +
> +    memset(&req, 0, sizeof(req));
> +    req.cb = cb;
> +    req.opaque = opaque;
> +    event_tap_bdrv(bs, &req, 1, 1);
> +
> +    return &dummy_acb;
> +}
> +
> +void event_tap_send_packet(VLANClientState *vc, const uint8_t *buf, int size)
> +{
> +    struct iovec iov;
> +
> +    assert(event_tap_state == EVENT_TAP_ON);
> +
> +    iov.iov_base = (uint8_t *)buf;
> +    iov.iov_len = size;
> +    event_tap_packet(vc, &iov, 1, NULL, 0);
> +
> +    return;
> +}
> +ssize_t event_tap_sendv_packet_async(VLANClientState *vc,
> +                                     const struct iovec *iov,
> +                                     int iovcnt, NetPacketSent *sent_cb)
> +{
> +    assert(event_tap_state == EVENT_TAP_ON);
> +    event_tap_packet(vc, iov, iovcnt, sent_cb, 1);
> +    return 0;
> +}
> +
> +int event_tap_register(int (*cb)(void))
> +{
> +    if (event_tap_state != EVENT_TAP_OFF) {
> +        error_report("event-tap is already on");
> +        return -EINVAL;
> +    }
> +
> +    if (!cb || event_tap_cb) {
> +        error_report("can't set event_tap_cb");
> +        return -EINVAL;
> +    }
> +
> +    event_tap_cb = cb;
> +    event_tap_state = EVENT_TAP_ON;
> +
> +    return 0;
> +}
> +
> +void event_tap_unregister(void)
> +{
> +    if (event_tap_state == EVENT_TAP_OFF) {
> +        error_report("event-tap is already off");
> +        return;
> +    }
> +
> +    event_tap_state = EVENT_TAP_OFF;
> +    event_tap_cb = NULL;
> +
> +    event_tap_flush();
> +    event_tap_free_pool();
> +}
> +
> +int event_tap_is_on(void)
> +{
> +    return (event_tap_state == EVENT_TAP_ON);
> +}
> +
> +void event_tap_ioport(int index, uint32_t address, uint32_t data)
> +{
> +    if (event_tap_state != EVENT_TAP_ON) {
> +        return;
> +    }
> +
> +    if (!last_event_tap) {
> +        last_event_tap = event_tap_alloc_log();
> +    }
> +
> +    last_event_tap->mode = EVENT_TAP_IOPORT;
> +    last_event_tap->ioport.index = index;
> +    last_event_tap->ioport.address = address;
> +    last_event_tap->ioport.data = data;
> +}
> +
> +void event_tap_mmio(uint64_t address, uint8_t *buf, int len)
> +{
> +    if (event_tap_state != EVENT_TAP_ON || len > MMIO_BUF_SIZE) {
> +        return;
> +    }
> +
> +    if (!last_event_tap) {
> +        last_event_tap = event_tap_alloc_log();
> +    }
> +
> +    last_event_tap->mode = EVENT_TAP_MMIO;
> +    last_event_tap->mmio.address = address;
> +    last_event_tap->mmio.len = len;
> +    memcpy(last_event_tap->mmio.buf, buf, len);
> +}
> +
> +static void event_tap_net_flush(EventTapNetReq *net_req)
> +{
> +    VLANClientState *vc;
> +    ssize_t len;
> +
> +    if (net_req->vlan_needed) {
> +        vc = qemu_find_vlan_client_by_name(NULL, net_req->vlan_id,
> +                                           net_req->device_name);
> +    } else {
> +        vc = qemu_find_netdev(net_req->device_name);
> +    }
> +
> +    if (net_req->async) {
> +        len = qemu_sendv_packet_async(vc, net_req->iov, net_req->iovcnt,
> +                                      net_req->sent_cb);
> +        if (len) {
> +            net_req->sent_cb(vc, len);
> +        } else {
> +            /* packets are queued in the net layer */
> +            trace_event_tap_append_packet();
> +        }
> +    } else {
> +        qemu_send_packet(vc, net_req->iov[0].iov_base,
> +                         net_req->iov[0].iov_len);
> +    }
> +}
> +
> +static void event_tap_blk_flush(EventTapBlkReq *blk_req)
> +{
> +    BlockDriverState *bs;
> +
> +    bs = bdrv_find(blk_req->device_name);

Please store the BlockDriverState in blk_req. This code loops over all
block devices and does a string comparison - and that for each request.
You can also save the qemu_strdup() when creating the request.

In the few places where you really need the device name (might be the
case for load/save, I'm not sure), you can still get it from the
BlockDriverState.

> +
> +    if (blk_req->is_flush) {
> +        bdrv_aio_flush(bs, blk_req->reqs[0].cb, blk_req->reqs[0].opaque);

You need to handle errors. If bdrv_aio_flush returns NULL, call the
callback with -EIO.

> +        return;
> +    }
> +
> +    bdrv_aio_writev(bs, blk_req->reqs[0].sector, blk_req->reqs[0].qiov,
> +                    blk_req->reqs[0].nb_sectors, blk_req->reqs[0].cb,
> +                    blk_req->reqs[0].opaque);

Same here.

> +    bdrv_flush(bs);

This looks really strange. What is this supposed to do?

One point is that you call it immediately after bdrv_aio_writev, so you
get an fsync for which you don't know whether it includes the current
write request or not. Which data do you want to get flushed to the disk?

The other thing is that you introduce a bdrv_flush for each request,
basically forcing everyone to something very similar to writethrough
mode. I'm sure this will have a big impact on performance.

Additionally, error handling is missing.

Kevin
Yoshiaki Tamura - Jan. 19, 2011, 1:04 p.m.
2011/1/19 Kevin Wolf <kwolf@redhat.com>:
> Am 19.01.2011 06:44, schrieb Yoshiaki Tamura:
>> event-tap controls when to start an FT transaction, and provides proxy
>> functions to be called from net/block devices.  During an FT
>> transaction, it queues up net/block requests, and flushes them when the
>> transaction is completed.
>>
>> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
>> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
>
> One general comment: At first glance this seems to mix block and net
> (and some other things) arbitrarily instead of having a section for
> handling all block stuff, then network, etc.
>
> Is there a specific reason for the order in which you put the functions?
> If not, maybe reordering them might improve readability.

Thanks.  I'll rework that.

>
>> ---
>>  Makefile.target |    1 +
>>  event-tap.c     |  847 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  event-tap.h     |   42 +++
>>  qemu-tool.c     |   24 ++
>>  trace-events    |    9 +
>>  5 files changed, 923 insertions(+), 0 deletions(-)
>>  create mode 100644 event-tap.c
>>  create mode 100644 event-tap.h
>>
>> diff --git a/Makefile.target b/Makefile.target
>> index e15b1c4..f36cd75 100644
>> --- a/Makefile.target
>> +++ b/Makefile.target
>> @@ -199,6 +199,7 @@ obj-y += rwhandler.o
>>  obj-$(CONFIG_KVM) += kvm.o kvm-all.o
>>  obj-$(CONFIG_NO_KVM) += kvm-stub.o
>>  LIBS+=-lz
>> +obj-y += event-tap.o
>>
>>  QEMU_CFLAGS += $(VNC_TLS_CFLAGS)
>>  QEMU_CFLAGS += $(VNC_SASL_CFLAGS)
>> diff --git a/event-tap.c b/event-tap.c
>> new file mode 100644
>> index 0000000..f492708
>> --- /dev/null
>> +++ b/event-tap.c
>
>> @@ -0,0 +1,847 @@
>> +/*
>> + * Event Tap functions for QEMU
>> + *
>> + * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation.
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu-common.h"
>> +#include "qemu-error.h"
>> +#include "block.h"
>> +#include "block_int.h"
>> +#include "ioport.h"
>> +#include "osdep.h"
>> +#include "sysemu.h"
>> +#include "hw/hw.h"
>> +#include "net.h"
>> +#include "event-tap.h"
>> +#include "trace.h"
>> +
>> +enum EVENT_TAP_STATE {
>> +    EVENT_TAP_OFF,
>> +    EVENT_TAP_ON,
>> +    EVENT_TAP_FLUSH,
>> +    EVENT_TAP_LOAD,
>> +    EVENT_TAP_REPLAY,
>> +};
>> +
>> +static enum EVENT_TAP_STATE event_tap_state = EVENT_TAP_OFF;
>> +static BlockDriverAIOCB dummy_acb; /* we may need a pool for dummies */
>
> Indeed, bdrv_aio_cancel will segfault this way.
>
> If you use dummies instead of real ACBs, the only way to correctly
> implement bdrv_aio_cancel is to wait for all in-flight AIOs
> (qemu_aio_flush).

So I need to insert a new event_tap function into bdrv_aio_cancel
to do that.

>
>> +typedef struct EventTapIOport {
>> +    uint32_t address;
>> +    uint32_t data;
>> +    int      index;
>> +} EventTapIOport;
>> +
>> +#define MMIO_BUF_SIZE 8
>> +
>> +typedef struct EventTapMMIO {
>> +    uint64_t address;
>> +    uint8_t  buf[MMIO_BUF_SIZE];
>> +    int      len;
>> +} EventTapMMIO;
>> +
>> +typedef struct EventTapNetReq {
>> +    char *device_name;
>> +    int iovcnt;
>> +    struct iovec *iov;
>> +    int vlan_id;
>> +    bool vlan_needed;
>> +    bool async;
>> +    NetPacketSent *sent_cb;
>> +} EventTapNetReq;
>> +
>> +#define MAX_BLOCK_REQUEST 32
>> +
>> +typedef struct EventTapBlkReq {
>> +    char *device_name;
>> +    int num_reqs;
>> +    int num_cbs;
>> +    bool is_flush;
>> +    BlockRequest reqs[MAX_BLOCK_REQUEST];
>> +    BlockDriverCompletionFunc *cb[MAX_BLOCK_REQUEST];
>> +    void *opaque[MAX_BLOCK_REQUEST];
>> +} EventTapBlkReq;
>> +
>> +#define EVENT_TAP_IOPORT (1 << 0)
>> +#define EVENT_TAP_MMIO   (1 << 1)
>> +#define EVENT_TAP_NET    (1 << 2)
>> +#define EVENT_TAP_BLK    (1 << 3)
>> +
>> +#define EVENT_TAP_TYPE_MASK (EVENT_TAP_NET - 1)
>> +
>> +typedef struct EventTapLog {
>> +    int mode;
>> +    union {
>> +        EventTapIOport ioport;
>> +        EventTapMMIO mmio;
>> +    };
>> +    union {
>> +        EventTapNetReq net_req;
>> +        EventTapBlkReq blk_req;
>> +    };
>> +    QTAILQ_ENTRY(EventTapLog) node;
>> +} EventTapLog;
>> +
>> +static EventTapLog *last_event_tap;
>> +
>> +static QTAILQ_HEAD(, EventTapLog) event_list;
>> +static QTAILQ_HEAD(, EventTapLog) event_pool;
>> +
>> +static int (*event_tap_cb)(void);
>> +static QEMUBH *event_tap_bh;
>> +static VMChangeStateEntry *vmstate;
>> +
>> +static void event_tap_bh_cb(void *p)
>> +{
>> +    if (event_tap_cb) {
>> +        event_tap_cb();
>> +    }
>> +
>> +    qemu_bh_delete(event_tap_bh);
>> +    event_tap_bh = NULL;
>> +}
>> +
>> +static void event_tap_schedule_bh(void)
>> +{
>> +    trace_event_tap_ignore_bh(!!event_tap_bh);
>> +
>> +    /* if bh is already set, we ignore it for now */
>> +    if (event_tap_bh) {
>> +        return;
>> +    }
>> +
>> +    event_tap_bh = qemu_bh_new(event_tap_bh_cb, NULL);
>> +    qemu_bh_schedule(event_tap_bh);
>> +
>> +    return ;
>> +}
>> +
>> +static void event_tap_alloc_net_req(EventTapNetReq *net_req,
>> +                                   VLANClientState *vc,
>> +                                   const struct iovec *iov, int iovcnt,
>> +                                   NetPacketSent *sent_cb, bool async)
>> +{
>> +    int i;
>> +
>> +    net_req->iovcnt = iovcnt;
>> +    net_req->async = async;
>> +    net_req->device_name = qemu_strdup(vc->name);
>> +    net_req->sent_cb = sent_cb;
>> +
>> +    if (vc->vlan) {
>> +        net_req->vlan_needed = 1;
>> +        net_req->vlan_id = vc->vlan->id;
>> +    } else {
>> +        net_req->vlan_needed = 0;
>> +    }
>> +
>> +    if (async) {
>> +        net_req->iov = (struct iovec *)iov;
>> +    } else {
>> +        net_req->iov = qemu_malloc(sizeof(struct iovec) * iovcnt);
>> +        for (i = 0; i < iovcnt; i++) {
>> +            net_req->iov[i].iov_base = qemu_malloc(iov[i].iov_len);
>> +            memcpy(net_req->iov[i].iov_base, iov[i].iov_base, iov[i].iov_len);
>> +            net_req->iov[i].iov_len = iov[i].iov_len;
>> +        }
>> +    }
>> +}
>> +
>> +static void event_tap_alloc_blk_req(EventTapBlkReq *blk_req,
>> +                                    BlockDriverState *bs, BlockRequest *reqs,
>> +                                    int num_reqs, BlockDriverCompletionFunc *cb,
>> +                                    void *opaque, bool is_flush)
>> +{
>> +    int i;
>> +
>> +    blk_req->num_reqs = num_reqs;
>> +    blk_req->num_cbs = num_reqs;
>> +    blk_req->device_name = qemu_strdup(bs->device_name);
>> +    blk_req->is_flush = is_flush;
>> +
>> +    for (i = 0; i < num_reqs; i++) {
>> +        blk_req->reqs[i].sector = reqs[i].sector;
>> +        blk_req->reqs[i].nb_sectors = reqs[i].nb_sectors;
>> +        blk_req->reqs[i].qiov = reqs[i].qiov;
>> +        blk_req->reqs[i].cb = cb;
>> +        blk_req->reqs[i].opaque = opaque;
>> +        blk_req->cb[i] = reqs[i].cb;
>> +        blk_req->opaque[i] = reqs[i].opaque;
>> +    }
>> +}
>> +
>> +static void *event_tap_alloc_log(void)
>> +{
>> +    EventTapLog *log;
>> +
>> +    if (QTAILQ_EMPTY(&event_pool)) {
>> +        log = qemu_mallocz(sizeof(EventTapLog));
>> +    } else {
>> +        log = QTAILQ_FIRST(&event_pool);
>> +        QTAILQ_REMOVE(&event_pool, log, node);
>> +    }
>> +
>> +    return log;
>> +}
>> +
>> +static void event_tap_free_log(EventTapLog *log)
>> +{
>> +    int i, mode = log->mode & ~EVENT_TAP_TYPE_MASK;
>> +
>> +    if (mode == EVENT_TAP_NET) {
>> +        EventTapNetReq *net_req = &log->net_req;
>> +
>> +        if (!net_req->async) {
>> +            for (i = 0; i < net_req->iovcnt; i++) {
>> +                qemu_free(net_req->iov[i].iov_base);
>> +            }
>> +            qemu_free(net_req->iov);
>> +        } else if (event_tap_state >= EVENT_TAP_LOAD) {
>> +            qemu_free(net_req->iov);
>> +        }
>> +
>> +        qemu_free(net_req->device_name);
>> +    } else if (mode == EVENT_TAP_BLK) {
>> +        EventTapBlkReq *blk_req = &log->blk_req;
>> +
>> +        if (event_tap_state >= EVENT_TAP_LOAD && !blk_req->is_flush) {
>> +            for (i = 0; i < blk_req->num_reqs; i++) {
>> +                qemu_iovec_destroy(blk_req->reqs[i].qiov);
>> +                qemu_free(blk_req->reqs[i].qiov);
>> +            }
>> +        }
>> +
>> +        qemu_free(blk_req->device_name);
>> +    }
>> +
>> +    log->mode = 0;
>> +
>> +    /* return the log to event_pool */
>> +    QTAILQ_INSERT_HEAD(&event_pool, log, node);
>> +}
>> +
>> +static void event_tap_free_pool(void)
>> +{
>> +    EventTapLog *log, *next;
>> +
>> +    QTAILQ_FOREACH_SAFE(log, &event_pool, node, next) {
>> +        QTAILQ_REMOVE(&event_pool, log, node);
>> +        qemu_free(log);
>> +    }
>> +}
>> +
>> +static void event_tap_blk_cb(void *opaque, int ret)
>> +{
>> +    EventTapLog *log = container_of(opaque, EventTapLog, blk_req);
>> +    EventTapBlkReq *blk_req = opaque;
>> +    int i;
>> +
>> +    blk_req->num_cbs--;
>> +
>> +    /* all outstanding requests are flushed */
>> +    if (blk_req->num_cbs == 0) {
>> +        for (i = 0; i < blk_req->num_reqs; i++) {
>> +            blk_req->cb[i](blk_req->opaque[i], ret);
>> +        }
>> +
>> +        event_tap_free_log(log);
>> +    }
>> +}
>> +
>> +static void event_tap_packet(VLANClientState *vc, const struct iovec *iov,
>> +                            int iovcnt, NetPacketSent *sent_cb, bool async)
>> +{
>> +    int empty;
>> +    EventTapLog *log = last_event_tap;
>> +
>> +    if (!log) {
>> +        trace_event_tap_no_event();
>> +        log = event_tap_alloc_log();
>> +    }
>> +
>> +    if (log->mode & ~EVENT_TAP_TYPE_MASK) {
>> +        trace_event_tap_already_used(log->mode & ~EVENT_TAP_TYPE_MASK);
>> +        return;
>> +    }
>> +
>> +    log->mode |= EVENT_TAP_NET;
>> +    event_tap_alloc_net_req(&log->net_req, vc, iov, iovcnt, sent_cb, async);
>> +
>> +    empty = QTAILQ_EMPTY(&event_list);
>> +    QTAILQ_INSERT_TAIL(&event_list, log, node);
>> +    last_event_tap = NULL;
>> +
>> +    if (empty) {
>> +        event_tap_schedule_bh();
>> +    }
>> +}
>> +
>> +static void event_tap_bdrv(BlockDriverState *bs, BlockRequest *reqs,
>> +                           int num_reqs, bool is_flush)
>> +{
>> +    EventTapLog *log = last_event_tap;
>> +    int empty;
>> +
>> +    if (!log) {
>> +        trace_event_tap_no_event();
>> +        log = event_tap_alloc_log();
>> +    }
>> +
>> +    if (log->mode & ~EVENT_TAP_TYPE_MASK) {
>> +        trace_event_tap_already_used(log->mode & ~EVENT_TAP_TYPE_MASK);
>> +        return;
>> +    }
>> +
>> +    log->mode |= EVENT_TAP_BLK;
>> +    event_tap_alloc_blk_req(&log->blk_req, bs, reqs, num_reqs,
>> +                            event_tap_blk_cb, &log->blk_req, is_flush);
>> +
>> +    empty = QTAILQ_EMPTY(&event_list);
>> +    QTAILQ_INSERT_TAIL(&event_list, log, node);
>> +    last_event_tap = NULL;
>> +
>> +    if (empty) {
>> +        event_tap_schedule_bh();
>> +    }
>> +}
>> +
>> +BlockDriverAIOCB *event_tap_bdrv_aio_writev(BlockDriverState *bs,
>> +                                            int64_t sector_num,
>> +                                            QEMUIOVector *iov,
>> +                                            int nb_sectors,
>> +                                            BlockDriverCompletionFunc *cb,
>> +                                            void *opaque)
>> +{
>> +    BlockRequest req;
>> +
>> +    assert(event_tap_state == EVENT_TAP_ON);
>> +
>> +    req.sector = sector_num;
>> +    req.nb_sectors = nb_sectors;
>> +    req.qiov = iov;
>> +    req.cb = cb;
>> +    req.opaque = opaque;
>> +    event_tap_bdrv(bs, &req, 1, 0);
>> +
>> +    /* return a dummy_acb pointer to prevent from failing */
>> +    return &dummy_acb;
>> +}
>> +
>> +BlockDriverAIOCB *event_tap_bdrv_aio_flush(BlockDriverState *bs,
>> +                                           BlockDriverCompletionFunc *cb,
>> +                                           void *opaque)
>> +{
>> +    BlockRequest req;
>> +
>> +    assert(event_tap_state == EVENT_TAP_ON);
>> +
>> +    memset(&req, 0, sizeof(req));
>> +    req.cb = cb;
>> +    req.opaque = opaque;
>> +    event_tap_bdrv(bs, &req, 1, 1);
>> +
>> +    return &dummy_acb;
>> +}
>> +
>> +void event_tap_send_packet(VLANClientState *vc, const uint8_t *buf, int size)
>> +{
>> +    struct iovec iov;
>> +
>> +    assert(event_tap_state == EVENT_TAP_ON);
>> +
>> +    iov.iov_base = (uint8_t *)buf;
>> +    iov.iov_len = size;
>> +    event_tap_packet(vc, &iov, 1, NULL, 0);
>> +
>> +    return;
>> +}
>> +ssize_t event_tap_sendv_packet_async(VLANClientState *vc,
>> +                                     const struct iovec *iov,
>> +                                     int iovcnt, NetPacketSent *sent_cb)
>> +{
>> +    assert(event_tap_state == EVENT_TAP_ON);
>> +    event_tap_packet(vc, iov, iovcnt, sent_cb, 1);
>> +    return 0;
>> +}
>> +
>> +int event_tap_register(int (*cb)(void))
>> +{
>> +    if (event_tap_state != EVENT_TAP_OFF) {
>> +        error_report("event-tap is already on");
>> +        return -EINVAL;
>> +    }
>> +
>> +    if (!cb || event_tap_cb) {
>> +        error_report("can't set event_tap_cb");
>> +        return -EINVAL;
>> +    }
>> +
>> +    event_tap_cb = cb;
>> +    event_tap_state = EVENT_TAP_ON;
>> +
>> +    return 0;
>> +}
>> +
>> +void event_tap_unregister(void)
>> +{
>> +    if (event_tap_state == EVENT_TAP_OFF) {
>> +        error_report("event-tap is already off");
>> +        return;
>> +    }
>> +
>> +    event_tap_state = EVENT_TAP_OFF;
>> +    event_tap_cb = NULL;
>> +
>> +    event_tap_flush();
>> +    event_tap_free_pool();
>> +}
>> +
>> +int event_tap_is_on(void)
>> +{
>> +    return (event_tap_state == EVENT_TAP_ON);
>> +}
>> +
>> +void event_tap_ioport(int index, uint32_t address, uint32_t data)
>> +{
>> +    if (event_tap_state != EVENT_TAP_ON) {
>> +        return;
>> +    }
>> +
>> +    if (!last_event_tap) {
>> +        last_event_tap = event_tap_alloc_log();
>> +    }
>> +
>> +    last_event_tap->mode = EVENT_TAP_IOPORT;
>> +    last_event_tap->ioport.index = index;
>> +    last_event_tap->ioport.address = address;
>> +    last_event_tap->ioport.data = data;
>> +}
>> +
>> +void event_tap_mmio(uint64_t address, uint8_t *buf, int len)
>> +{
>> +    if (event_tap_state != EVENT_TAP_ON || len > MMIO_BUF_SIZE) {
>> +        return;
>> +    }
>> +
>> +    if (!last_event_tap) {
>> +        last_event_tap = event_tap_alloc_log();
>> +    }
>> +
>> +    last_event_tap->mode = EVENT_TAP_MMIO;
>> +    last_event_tap->mmio.address = address;
>> +    last_event_tap->mmio.len = len;
>> +    memcpy(last_event_tap->mmio.buf, buf, len);
>> +}
>> +
>> +static void event_tap_net_flush(EventTapNetReq *net_req)
>> +{
>> +    VLANClientState *vc;
>> +    ssize_t len;
>> +
>> +    if (net_req->vlan_needed) {
>> +        vc = qemu_find_vlan_client_by_name(NULL, net_req->vlan_id,
>> +                                           net_req->device_name);
>> +    } else {
>> +        vc = qemu_find_netdev(net_req->device_name);
>> +    }
>> +
>> +    if (net_req->async) {
>> +        len = qemu_sendv_packet_async(vc, net_req->iov, net_req->iovcnt,
>> +                                      net_req->sent_cb);
>> +        if (len) {
>> +            net_req->sent_cb(vc, len);
>> +        } else {
>> +            /* packets are queued in the net layer */
>> +            trace_event_tap_append_packet();
>> +        }
>> +    } else {
>> +        qemu_send_packet(vc, net_req->iov[0].iov_base,
>> +                         net_req->iov[0].iov_len);
>> +    }
>> +}
>> +
>> +static void event_tap_blk_flush(EventTapBlkReq *blk_req)
>> +{
>> +    BlockDriverState *bs;
>> +
>> +    bs = bdrv_find(blk_req->device_name);
>
> Please store the BlockDriverState in blk_req. This code loops over all
> block devices and does a string comparison - and that for each request.
> You can also save the qemu_strdup() when creating the request.
>
> In the few places where you really need the device name (might be the
> case for load/save, I'm not sure), you can still get it from the
> BlockDriverState.

I would do so for the primary side.  Although we haven't
implemented it yet, we want to replay block requests from the block
layer on the secondary side, and need the device name to restore
the BlockDriverState.

>
>> +
>> +    if (blk_req->is_flush) {
>> +        bdrv_aio_flush(bs, blk_req->reqs[0].cb, blk_req->reqs[0].opaque);
>
> You need to handle errors. If bdrv_aio_flush returns NULL, call the
> callback with -EIO.

I'll do so.

>
>> +        return;
>> +    }
>> +
>> +    bdrv_aio_writev(bs, blk_req->reqs[0].sector, blk_req->reqs[0].qiov,
>> +                    blk_req->reqs[0].nb_sectors, blk_req->reqs[0].cb,
>> +                    blk_req->reqs[0].opaque);
>
> Same here.
>
>> +    bdrv_flush(bs);
>
> This looks really strange. What is this supposed to do?
>
> One point is that you write it immediately after bdrv_aio_write, so you
> get an fsync for which you don't know if it includes the current write
> request or if it doesn't. Which data do you want to get flushed to the disk?

I was expecting to flush the aio request that was just initiated.
Am I misunderstanding the function?

> The other thing is that you introduce a bdrv_flush for each request,
> basically forcing everyone to something very similar to writethrough
> mode. I'm sure this will have a big impact on performance.

The reason is to avoid inversion of queued requests.  Although
processing one-by-one is heavy, wouldn't having requests flushed
to disk out of order break the disk image?

> Additionally, error handling is missing.

I looked at the code using bdrv_flush and realized some callers
don't handle errors, but scsi-disk.c does.  Should every caller
handle errors, or does it depend on the usage?

>
> Kevin
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Kevin Wolf - Jan. 19, 2011, 1:50 p.m.
Am 19.01.2011 14:04, schrieb Yoshiaki Tamura:
>>> +static void event_tap_blk_flush(EventTapBlkReq *blk_req)
>>> +{
>>> +    BlockDriverState *bs;
>>> +
>>> +    bs = bdrv_find(blk_req->device_name);
>>
>> Please store the BlockDriverState in blk_req. This code loops over all
>> block devices and does a string comparison - and that for each request.
>> You can also save the qemu_strdup() when creating the request.
>>
>> In the few places where you really need the device name (might be the
>> case for load/save, I'm not sure), you can still get it from the
>> BlockDriverState.
> 
> I would do so for the primary side.  Although we haven't
> implemented yet, we want to replay block requests from block
> layer on the secondary side, and need device name to restore
> BlockDriverState.

Hm, I see. I'm not happy about it, but I don't have a suggestion right
away how to avoid it.

>>
>>> +
>>> +    if (blk_req->is_flush) {
>>> +        bdrv_aio_flush(bs, blk_req->reqs[0].cb, blk_req->reqs[0].opaque);
>>
>> You need to handle errors. If bdrv_aio_flush returns NULL, call the
>> callback with -EIO.
> 
> I'll do so.
> 
>>
>>> +        return;
>>> +    }
>>> +
>>> +    bdrv_aio_writev(bs, blk_req->reqs[0].sector, blk_req->reqs[0].qiov,
>>> +                    blk_req->reqs[0].nb_sectors, blk_req->reqs[0].cb,
>>> +                    blk_req->reqs[0].opaque);
>>
>> Same here.
>>
>>> +    bdrv_flush(bs);
>>
>> This looks really strange. What is this supposed to do?
>>
>> One point is that you write it immediately after bdrv_aio_write, so you
>> get an fsync for which you don't know if it includes the current write
>> request or if it doesn't. Which data do you want to get flushed to the disk?
> 
> I was expecting to flush the aio request that was just initiated.
> Am I misunderstanding the function?

Seems so. The function names don't use really clear terminology either,
so you're not the first one to fall in this trap. Basically we have:

* qemu_aio_flush() waits for all AIO requests to complete. I think you
wanted to have exactly this, but only for a single block device. Such a
function doesn't exist yet.

* bdrv_flush() makes sure that all successfully completed requests are
written to disk (by calling fsync)

* bdrv_aio_flush() is the asynchronous version of bdrv_flush, i.e. run
the fsync in the thread pool

>> The other thing is that you introduce a bdrv_flush for each request,
>> basically forcing everyone to something very similar to writethrough
>> mode. I'm sure this will have a big impact on performance.
> 
> The reason is to avoid inversion of queued requests.  Although
> processing one-by-one is heavy, wouldn't having requests flushed
> to disk out of order break the disk image?

No, that's fine. If a guest issues two requests at the same time, they
may complete in any order. You just need to make sure that you don't
call the completion callback before the request really has completed.

I'm just starting to wonder if the guest won't timeout the requests if
they are queued for too long. Even more, with IDE, it can only handle
one request at a time, so not completing requests doesn't sound like a
good idea at all. In what intervals is the event-tap queue flushed?

On the other hand, if you complete before actually writing out, you
don't get timeouts, but you signal success to the guest when the request
could still fail. What would you do in this case? With a writeback cache
mode we're fine, we can just fail the next flush (until then nothing is
guaranteed to be on disk and order doesn't matter either), but with
cache=writethrough we're in serious trouble.

Have you thought about this problem? Maybe we end up having to flush the
event-tap queue for each single write in writethrough mode.

>> Additionally, error handling is missing.
> 
> I looked at the codes using bdrv_flush and realized some of them
> doesn't handle errors, but scsi-disk.c does.  Should everyone
> handle errors or depends on the usage?

I added the return code only recently, it was a void function
previously. Probably some error handling should be added to all of them.

Kevin
Yoshiaki Tamura - Jan. 20, 2011, 5:19 a.m.
2011/1/19 Kevin Wolf <kwolf@redhat.com>:
> Am 19.01.2011 14:04, schrieb Yoshiaki Tamura:
>>>> +static void event_tap_blk_flush(EventTapBlkReq *blk_req)
>>>> +{
>>>> +    BlockDriverState *bs;
>>>> +
>>>> +    bs = bdrv_find(blk_req->device_name);
>>>
>>> Please store the BlockDriverState in blk_req. This code loops over all
>>> block devices and does a string comparison - and that for each request.
>>> You can also save the qemu_strdup() when creating the request.
>>>
>>> In the few places where you really need the device name (might be the
>>> case for load/save, I'm not sure), you can still get it from the
>>> BlockDriverState.
>>
>> I would do so for the primary side.  Although we haven't
>> implemented yet, we want to replay block requests from block
>> layer on the secondary side, and need device name to restore
>> BlockDriverState.
>
> Hm, I see. I'm not happy about it, but I don't have a suggestion right
> away how to avoid it.
>
>>>
>>>> +
>>>> +    if (blk_req->is_flush) {
>>>> +        bdrv_aio_flush(bs, blk_req->reqs[0].cb, blk_req->reqs[0].opaque);
>>>
>>> You need to handle errors. If bdrv_aio_flush returns NULL, call the
>>> callback with -EIO.
>>
>> I'll do so.
>>
>>>
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    bdrv_aio_writev(bs, blk_req->reqs[0].sector, blk_req->reqs[0].qiov,
>>>> +                    blk_req->reqs[0].nb_sectors, blk_req->reqs[0].cb,
>>>> +                    blk_req->reqs[0].opaque);
>>>
>>> Same here.
>>>
>>>> +    bdrv_flush(bs);
>>>
>>> This looks really strange. What is this supposed to do?
>>>
>>> One point is that you write it immediately after bdrv_aio_write, so you
>>> get an fsync for which you don't know if it includes the current write
>>> request or if it doesn't. Which data do you want to get flushed to the disk?
>>
>> I was expecting to flush the aio request that was just initiated.
>> Am I misunderstanding the function?
>
> Seems so. The function names don't use really clear terminology either,
> so you're not the first one to fall in this trap. Basically we have:
>
> * qemu_aio_flush() waits for all AIO requests to complete. I think you
> wanted to have exactly this, but only for a single block device. Such a
> function doesn't exist yet.
>
> * bdrv_flush() makes sure that all successfully completed requests are
> written to disk (by calling fsync)
>
> * bdrv_aio_flush() is the asynchronous version of bdrv_flush, i.e. run
> the fsync in the thread pool

Then what I wanted to do is, call qemu_aio_flush first, then
bdrv_flush.  It should be like live migration.

>
>>> The other thing is that you introduce a bdrv_flush for each request,
>>> basically forcing everyone to something very similar to writethrough
>>> mode. I'm sure this will have a big impact on performance.
>>
>> The reason is to avoid inversion of queued requests.  Although
>> processing one-by-one is heavy, wouldn't having requests flushed
>> to disk out of order break the disk image?
>
> No, that's fine. If a guest issues two requests at the same time, they
> may complete in any order. You just need to make sure that you don't
> call the completion callback before the request really has completed.

We need to flush requests, meaning aio and fsync, before sending
the final state of the guests, to make sure we can switch to the
secondary safely.

> I'm just starting to wonder if the guest won't timeout the requests if
> they are queued for too long. Even more, with IDE, it can only handle
> one request at a time, so not completing requests doesn't sound like a
> good idea at all. In what intervals is the event-tap queue flushed?

The requests are flushed once each transaction completes.  So
it's not with specific intervals.

> On the other hand, if you complete before actually writing out, you
> don't get timeouts, but you signal success to the guest when the request
> could still fail. What would you do in this case? With a writeback cache
> mode we're fine, we can just fail the next flush (until then nothing is
> guaranteed to be on disk and order doesn't matter either), but with
> cache=writethrough we're in serious trouble.
>
> Have you thought about this problem? Maybe we end up having to flush the
> event-tap queue for each single write in writethrough mode.

Yes, and that's what I'm trying to do at this point.  I know that
performance matters a lot, but sacrificing reliability for
performance now isn't a good idea.  I first want to lay the
groundwork, and then focus on optimization.  Note that without the dirty
bitmap optimization, Kemari suffers a lot when sending RAM.
Anthony and I discussed taking this approach at KVM Forum.

>>> Additionally, error handling is missing.
>>
>> I looked at the codes using bdrv_flush and realized some of them
>> doesn't handle errors, but scsi-disk.c does.  Should everyone
>> handle errors or depends on the usage?
>
> I added the return code only recently, it was a void function
> previously. Probably some error handling should be added to all of them.

Ah:)  Glad to hear that.

Yoshi

>
> Kevin
>
>
Kevin Wolf - Jan. 20, 2011, 9:15 a.m.
Am 20.01.2011 06:19, schrieb Yoshiaki Tamura:
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    bdrv_aio_writev(bs, blk_req->reqs[0].sector, blk_req->reqs[0].qiov,
>>>>> +                    blk_req->reqs[0].nb_sectors, blk_req->reqs[0].cb,
>>>>> +                    blk_req->reqs[0].opaque);
>>>>
>>>> Same here.
>>>>
>>>>> +    bdrv_flush(bs);
>>>>
>>>> This looks really strange. What is this supposed to do?
>>>>
>>>> One point is that you write it immediately after bdrv_aio_write, so you
>>>> get an fsync for which you don't know if it includes the current write
>>>> request or if it doesn't. Which data do you want to get flushed to the disk?
>>>
>>> I was expecting to flush the aio request that was just initiated.
>>> Am I misunderstanding the function?
>>
>> Seems so. The function names don't use really clear terminology either,
>> so you're not the first one to fall in this trap. Basically we have:
>>
>> * qemu_aio_flush() waits for all AIO requests to complete. I think you
>> wanted to have exactly this, but only for a single block device. Such a
>> function doesn't exist yet.
>>
>> * bdrv_flush() makes sure that all successfully completed requests are
>> written to disk (by calling fsync)
>>
>> * bdrv_aio_flush() is the asynchronous version of bdrv_flush, i.e. run
>> the fsync in the thread pool
> 
> Then what I wanted to do is, call qemu_aio_flush first, then
> bdrv_flush.  It should be like live migration.

Okay, that makes sense. :-)

>>>> The other thing is that you introduce a bdrv_flush for each request,
>>>> basically forcing everyone to something very similar to writethrough
>>>> mode. I'm sure this will have a big impact on performance.
>>>
>>> The reason is to avoid inversion of queued requests.  Although
>>> processing one-by-one is heavy, wouldn't having requests flushed
>>> to disk out of order break the disk image?
>>
>> No, that's fine. If a guest issues two requests at the same time, they
>> may complete in any order. You just need to make sure that you don't
>> call the completion callback before the request really has completed.
> 
> We need to flush requests, meaning aio and fsync, before sending
> the final state of the guests, to make sure we can switch to the
> secondary safely.

In theory I think you could just re-submit the requests on the secondary
if they had not completed yet.

But you're right, let's keep things simple for the start.

>> I'm just starting to wonder if the guest won't timeout the requests if
>> they are queued for too long. Even more, with IDE, it can only handle
>> one request at a time, so not completing requests doesn't sound like a
>> good idea at all. In what intervals is the event-tap queue flushed?
> 
> The requests are flushed once each transaction completes.  So
> it's not with specific intervals.

Right. So when is a transaction completed? This is the time that a
single request will take.

>> On the other hand, if you complete before actually writing out, you
>> don't get timeouts, but you signal success to the guest when the request
>> could still fail. What would you do in this case? With a writeback cache
>> mode we're fine, we can just fail the next flush (until then nothing is
>> guaranteed to be on disk and order doesn't matter either), but with
>> cache=writethrough we're in serious trouble.
>>
>> Have you thought about this problem? Maybe we end up having to flush the
>> event-tap queue for each single write in writethrough mode.
> 
> Yes, and that's what I'm trying to do at this point.  

Oh, I must have missed that code. Which patch/function should I look at?

> I know that
> performance matters a lot, but sacrificing reliability over
> performance now isn't a good idea.  I first want to lay the
> ground, and then focus on optimization.  Note that without dirty
> bitmap optimization, Kemari suffers a lot in sending rams.
> Anthony and I discussed to take this approach at KVM Forum.

I agree, starting simple makes sense.

Kevin
Yoshiaki Tamura - Jan. 20, 2011, 10:39 a.m.
2011/1/20 Kevin Wolf <kwolf@redhat.com>:
> Am 20.01.2011 06:19, schrieb Yoshiaki Tamura:
>>>>>> +        return;
>>>>>> +    }
>>>>>> +
>>>>>> +    bdrv_aio_writev(bs, blk_req->reqs[0].sector, blk_req->reqs[0].qiov,
>>>>>> +                    blk_req->reqs[0].nb_sectors, blk_req->reqs[0].cb,
>>>>>> +                    blk_req->reqs[0].opaque);
>>>>>
>>>>> Same here.
>>>>>
>>>>>> +    bdrv_flush(bs);
>>>>>
>>>>> This looks really strange. What is this supposed to do?
>>>>>
>>>>> One point is that you write it immediately after bdrv_aio_write, so you
>>>>> get an fsync for which you don't know if it includes the current write
>>>>> request or if it doesn't. Which data do you want to get flushed to the disk?
>>>>
>>>> I was expecting to flush the aio request that was just initiated.
>>>> Am I misunderstanding the function?
>>>
>>> Seems so. The function names don't use really clear terminology either,
>>> so you're not the first one to fall in this trap. Basically we have:
>>>
>>> * qemu_aio_flush() waits for all AIO requests to complete. I think you
>>> wanted to have exactly this, but only for a single block device. Such a
>>> function doesn't exist yet.
>>>
>>> * bdrv_flush() makes sure that all successfully completed requests are
>>> written to disk (by calling fsync)
>>>
>>> * bdrv_aio_flush() is the asynchronous version of bdrv_flush, i.e. run
>>> the fsync in the thread pool
>>
>> Then what I wanted to do is, call qemu_aio_flush first, then
>> bdrv_flush.  It should be like live migration.
>
> Okay, that makes sense. :-)
>
>>>>> The other thing is that you introduce a bdrv_flush for each request,
>>>>> basically forcing everyone to something very similar to writethrough
>>>>> mode. I'm sure this will have a big impact on performance.
>>>>
>>>> The reason is to avoid inversion of queued requests.  Although
>>>> processing one-by-one is heavy, wouldn't having requests flushed
>>>> to disk out of order break the disk image?
>>>
>>> No, that's fine. If a guest issues two requests at the same time, they
>>> may complete in any order. You just need to make sure that you don't
>>> call the completion callback before the request really has completed.
>>
>> We need to flush requests, meaning aio and fsync, before sending
>> the final state of the guests, to make sure we can switch to the
>> secondary safely.
>
> In theory I think you could just re-submit the requests on the secondary
> if they had not completed yet.
>
> But you're right, let's keep things simple for the start.
>
>>> I'm just starting to wonder if the guest won't timeout the requests if
>>> they are queued for too long. Even more, with IDE, it can only handle
>>> one request at a time, so not completing requests doesn't sound like a
>>> good idea at all. In what intervals is the event-tap queue flushed?
>>
>> The requests are flushed once each transaction completes.  So
>> it's not with specific intervals.
>
> Right. So when is a transaction completed? This is the time that a
> single request will take.

The transaction is completed when the vm state is sent to the
secondary, and the primary receives the ack to it.  Please let me
know if the answer is too vague.  What I can tell is that it
can't be super fast.

>>> On the other hand, if you complete before actually writing out, you
>>> don't get timeouts, but you signal success to the guest when the request
>>> could still fail. What would you do in this case? With a writeback cache
>>> mode we're fine, we can just fail the next flush (until then nothing is
>>> guaranteed to be on disk and order doesn't matter either), but with
>>> cache=writethrough we're in serious trouble.
>>>
>>> Have you thought about this problem? Maybe we end up having to flush the
>>> event-tap queue for each single write in writethrough mode.
>>
>> Yes, and that's what I'm trying to do at this point.
>
> Oh, I must have missed that code. Which patch/function should I look at?

Maybe I misunderstood your question.  The device may see
timeouts.  If timeouts don't happen, the requests are flushed
one-by-one in writethrough because we're calling qemu_aio_flush
and bdrv_flush together.

Yoshi

>> I know that
>> performance matters a lot, but sacrificing reliability over
>> performance now isn't a good idea.  I first want to lay the
>> ground, and then focus on optimization.  Note that without dirty
>> bitmap optimization, Kemari suffers a lot in sending rams.
>> Anthony and I discussed to take this approach at KVM Forum.
>
> I agree, starting simple makes sense.
>
> Kevin
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Kevin Wolf - Jan. 20, 2011, 11:46 a.m.
Am 20.01.2011 11:39, schrieb Yoshiaki Tamura:
> 2011/1/20 Kevin Wolf <kwolf@redhat.com>:
>> Am 20.01.2011 06:19, schrieb Yoshiaki Tamura:
>>>>>>> +        return;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    bdrv_aio_writev(bs, blk_req->reqs[0].sector, blk_req->reqs[0].qiov,
>>>>>>> +                    blk_req->reqs[0].nb_sectors, blk_req->reqs[0].cb,
>>>>>>> +                    blk_req->reqs[0].opaque);
>>>>>>
>>>>>> Same here.
>>>>>>
>>>>>>> +    bdrv_flush(bs);
>>>>>>
>>>>>> This looks really strange. What is this supposed to do?
>>>>>>
>>>>>> One point is that you write it immediately after bdrv_aio_write, so you
>>>>>> get an fsync for which you don't know if it includes the current write
>>>>>> request or if it doesn't. Which data do you want to get flushed to the disk?
>>>>>
>>>>> I was expecting to flush the aio request that was just initiated.
>>>>> Am I misunderstanding the function?
>>>>
>>>> Seems so. The function names don't use really clear terminology either,
>>>> so you're not the first one to fall in this trap. Basically we have:
>>>>
>>>> * qemu_aio_flush() waits for all AIO requests to complete. I think you
>>>> wanted to have exactly this, but only for a single block device. Such a
>>>> function doesn't exist yet.
>>>>
>>>> * bdrv_flush() makes sure that all successfully completed requests are
>>>> written to disk (by calling fsync)
>>>>
>>>> * bdrv_aio_flush() is the asynchronous version of bdrv_flush, i.e. run
>>>> the fsync in the thread pool
>>>
>>> Then what I wanted to do is, call qemu_aio_flush first, then
>>> bdrv_flush.  It should be like live migration.
>>
>> Okay, that makes sense. :-)
>>
>>>>>> The other thing is that you introduce a bdrv_flush for each request,
>>>>>> basically forcing everyone to something very similar to writethrough
>>>>>> mode. I'm sure this will have a big impact on performance.
>>>>>
>>>>> The reason is to avoid inversion of queued requests.  Although
>>>>> processing one-by-one is heavy, wouldn't having requests flushed
>>>>> to disk out of order break the disk image?
>>>>
>>>> No, that's fine. If a guest issues two requests at the same time, they
>>>> may complete in any order. You just need to make sure that you don't
>>>> call the completion callback before the request really has completed.
>>>
>>> We need to flush requests, meaning aio and fsync, before sending
>>> the final state of the guests, to make sure we can switch to the
>>> secondary safely.
>>
>> In theory I think you could just re-submit the requests on the secondary
>> if they had not completed yet.
>>
>> But you're right, let's keep things simple for the start.
>>
>>>> I'm just starting to wonder if the guest won't timeout the requests if
>>>> they are queued for too long. Even more, with IDE, it can only handle
>>>> one request at a time, so not completing requests doesn't sound like a
>>>> good idea at all. In what intervals is the event-tap queue flushed?
>>>
>>> The requests are flushed once each transaction completes.  So
>>> it's not with specific intervals.
>>
>> Right. So when is a transaction completed? This is the time that a
>> single request will take.
> 
> The transaction is completed when the vm state is sent to the
> secondary, and the primary receives the ack to it.  Please let me
> know if the answer is too vague.  What I can tell is that it
> can't be super fast.
> 
>>>> On the other hand, if you complete before actually writing out, you
>>>> don't get timeouts, but you signal success to the guest when the request
>>>> could still fail. What would you do in this case? With a writeback cache
>>>> mode we're fine, we can just fail the next flush (until then nothing is
>>>> guaranteed to be on disk and order doesn't matter either), but with
>>>> cache=writethrough we're in serious trouble.
>>>>
>>>> Have you thought about this problem? Maybe we end up having to flush the
>>>> event-tap queue for each single write in writethrough mode.
>>>
>>> Yes, and that's what I'm trying to do at this point.
>>
>> Oh, I must have missed that code. Which patch/function should I look at?
> 
> Maybe I miss-answered to your question.  The device may receive
> timeouts.  

We should pay attention that the guest does not see timeouts. I'm not
expecting that I/O will be super fast, and as long as it is only a
performance problem we can live with it.

However, as soon as the guest gets timeouts it reports I/O errors and
eventually offlines the block device. At this point it's not a
performance problem any more, but also a correctness problem.

This is why I suggested that we flush the event-tap queue (i.e. complete
the transaction) immediately after an I/O request has been issued
instead of waiting for other events that would complete the transaction.

> If timeouts didn't happen, the requests are flushed
> one-by-one in writethrough because we're calling qemu_aio_flush
> and bdrv_flush together.

I think this is what we must do.

Kevin
Yoshiaki Tamura - Jan. 20, 2011, 1:50 p.m.
2011/1/20 Kevin Wolf <kwolf@redhat.com>:
> Am 20.01.2011 11:39, schrieb Yoshiaki Tamura:
>> 2011/1/20 Kevin Wolf <kwolf@redhat.com>:
>>> Am 20.01.2011 06:19, schrieb Yoshiaki Tamura:
>>>>>>>> +        return;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    bdrv_aio_writev(bs, blk_req->reqs[0].sector, blk_req->reqs[0].qiov,
>>>>>>>> +                    blk_req->reqs[0].nb_sectors, blk_req->reqs[0].cb,
>>>>>>>> +                    blk_req->reqs[0].opaque);
>>>>>>>
>>>>>>> Same here.
>>>>>>>
>>>>>>>> +    bdrv_flush(bs);
>>>>>>>
>>>>>>> This looks really strange. What is this supposed to do?
>>>>>>>
>>>>>>> One point is that you write it immediately after bdrv_aio_write, so you
>>>>>>> get an fsync for which you don't know if it includes the current write
>>>>>>> request or if it doesn't. Which data do you want to get flushed to the disk?
>>>>>>
>>>>>> I was expecting to flush the aio request that was just initiated.
>>>>>> Am I misunderstanding the function?
>>>>>
>>>>> Seems so. The function names don't use really clear terminology either,
>>>>> so you're not the first one to fall in this trap. Basically we have:
>>>>>
>>>>> * qemu_aio_flush() waits for all AIO requests to complete. I think you
>>>>> wanted to have exactly this, but only for a single block device. Such a
>>>>> function doesn't exist yet.
>>>>>
>>>>> * bdrv_flush() makes sure that all successfully completed requests are
>>>>> written to disk (by calling fsync)
>>>>>
>>>>> * bdrv_aio_flush() is the asynchronous version of bdrv_flush, i.e. run
>>>>> the fsync in the thread pool
>>>>
>>>> Then what I wanted to do is, call qemu_aio_flush first, then
>>>> bdrv_flush.  It should be like live migration.
>>>
>>> Okay, that makes sense. :-)
>>>
>>>>>>> The other thing is that you introduce a bdrv_flush for each request,
>>>>>>> basically forcing everyone to something very similar to writethrough
>>>>>>> mode. I'm sure this will have a big impact on performance.
>>>>>>
>>>>>> The reason is to avoid inversion of queued requests.  Although
>>>>>> processing one-by-one is heavy, wouldn't having requests flushed
>>>>>> to disk out of order break the disk image?
>>>>>
>>>>> No, that's fine. If a guest issues two requests at the same time, they
>>>>> may complete in any order. You just need to make sure that you don't
>>>>> call the completion callback before the request really has completed.
>>>>
>>>> We need to flush requests, meaning aio and fsync, before sending
>>>> the final state of the guests, to make sure we can switch to the
>>>> secondary safely.
>>>
>>> In theory I think you could just re-submit the requests on the secondary
>>> if they had not completed yet.
>>>
>>> But you're right, let's keep things simple for the start.
>>>
>>>>> I'm just starting to wonder if the guest won't timeout the requests if
>>>>> they are queued for too long. Even more, with IDE, it can only handle
>>>>> one request at a time, so not completing requests doesn't sound like a
>>>>> good idea at all. In what intervals is the event-tap queue flushed?
>>>>
>>>> The requests are flushed once each transaction completes.  So
>>>> it's not with specific intervals.
>>>
>>> Right. So when is a transaction completed? This is the time that a
>>> single request will take.
>>
>> The transaction is completed when the vm state is sent to the
>> secondary, and the primary receives the ack to it.  Please let me
>> know if the answer is too vague.  What I can tell is that it
>> can't be super fast.
>>
>>>>> On the other hand, if you complete before actually writing out, you
>>>>> don't get timeouts, but you signal success to the guest when the request
>>>>> could still fail. What would you do in this case? With a writeback cache
>>>>> mode we're fine, we can just fail the next flush (until then nothing is
>>>>> guaranteed to be on disk and order doesn't matter either), but with
>>>>> cache=writethrough we're in serious trouble.
>>>>>
>>>>> Have you thought about this problem? Maybe we end up having to flush the
>>>>> event-tap queue for each single write in writethrough mode.
>>>>
>>>> Yes, and that's what I'm trying to do at this point.
>>>
>>> Oh, I must have missed that code. Which patch/function should I look at?
>>
>> Maybe I miss-answered to your question.  The device may receive
>> timeouts.
>
> We should pay attention that the guest does not see timeouts. I'm not
> expecting that I/O will be super fast, and as long as it is only a
> performance problem we can live with it.
>
> However, as soon as the guest gets timeouts it reports I/O errors and
> eventually offlines the block device. At this point it's not a
> performance problem any more, but also a correctness problem.
>
> This is why I suggested that we flush the event-tap queue (i.e. complete
> the transaction) immediately after an I/O request has been issued
> instead of waiting for other events that would complete the transaction.

Right.  event-tap doesn't queue at a specific interval.  It'll
schedule the transaction as a bh once events are tapped.  The
purpose of the queue is to store requests initiated during the
transaction.  So I believe the current implementation should be doing
what you're expecting.  However, if the guest dirtied a huge amount
of RAM and initiated block requests, we may get timeouts even if we
started the transaction right away.

Yoshi

>> If timeouts didn't happen, the requests are flushed
>> one-by-one in writethrough because we're calling qemu_aio_flush
>> and bdrv_flush together.
>
> I think this is what we must do.
>
> Kevin
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Kevin Wolf - Jan. 20, 2011, 2:21 p.m.
Am 20.01.2011 14:50, schrieb Yoshiaki Tamura:
> 2011/1/20 Kevin Wolf <kwolf@redhat.com>:
>> Am 20.01.2011 11:39, schrieb Yoshiaki Tamura:
>>> 2011/1/20 Kevin Wolf <kwolf@redhat.com>:
>>>> Am 20.01.2011 06:19, schrieb Yoshiaki Tamura:
>>>>>>>>> +        return;
>>>>>>>>> +    }
>>>>>>>>> +
>>>>>>>>> +    bdrv_aio_writev(bs, blk_req->reqs[0].sector, blk_req->reqs[0].qiov,
>>>>>>>>> +                    blk_req->reqs[0].nb_sectors, blk_req->reqs[0].cb,
>>>>>>>>> +                    blk_req->reqs[0].opaque);
>>>>>>>>
>>>>>>>> Same here.
>>>>>>>>
>>>>>>>>> +    bdrv_flush(bs);
>>>>>>>>
>>>>>>>> This looks really strange. What is this supposed to do?
>>>>>>>>
>>>>>>>> One point is that you write it immediately after bdrv_aio_write, so you
>>>>>>>> get an fsync for which you don't know if it includes the current write
>>>>>>>> request or if it doesn't. Which data do you want to get flushed to the disk?
>>>>>>>
>>>>>>> I was expecting to flush the aio request that was just initiated.
>>>>>>> Am I misunderstanding the function?
>>>>>>
>>>>>> Seems so. The function names don't use really clear terminology either,
>>>>>> so you're not the first one to fall in this trap. Basically we have:
>>>>>>
>>>>>> * qemu_aio_flush() waits for all AIO requests to complete. I think you
>>>>>> wanted to have exactly this, but only for a single block device. Such a
>>>>>> function doesn't exist yet.
>>>>>>
>>>>>> * bdrv_flush() makes sure that all successfully completed requests are
>>>>>> written to disk (by calling fsync)
>>>>>>
>>>>>> * bdrv_aio_flush() is the asynchronous version of bdrv_flush, i.e. run
>>>>>> the fsync in the thread pool
>>>>>
>>>>> Then what I wanted to do is, call qemu_aio_flush first, then
>>>>> bdrv_flush.  It should be like live migration.
>>>>
>>>> Okay, that makes sense. :-)
>>>>
>>>>>>>> The other thing is that you introduce a bdrv_flush for each request,
>>>>>>>> basically forcing everyone to something very similar to writethrough
>>>>>>>> mode. I'm sure this will have a big impact on performance.
>>>>>>>
>>>>>>> The reason is to avoid inversion of queued requests.  Although
>>>>>>> processing one-by-one is heavy, wouldn't having requests flushed
>>>>>>> to disk out of order break the disk image?
>>>>>>
>>>>>> No, that's fine. If a guest issues two requests at the same time, they
>>>>>> may complete in any order. You just need to make sure that you don't
>>>>>> call the completion callback before the request really has completed.
>>>>>
>>>>> We need to flush requests, meaning aio and fsync, before sending
>>>>> the final state of the guests, to make sure we can switch to the
>>>>> secondary safely.
>>>>
>>>> In theory I think you could just re-submit the requests on the secondary
>>>> if they had not completed yet.
>>>>
>>>> But you're right, let's keep things simple for the start.
>>>>
>>>>>> I'm just starting to wonder if the guest won't timeout the requests if
>>>>>> they are queued for too long. Even more, with IDE, it can only handle
>>>>>> one request at a time, so not completing requests doesn't sound like a
>>>>>> good idea at all. In what intervals is the event-tap queue flushed?
>>>>>
>>>>> The requests are flushed once each transaction completes.  So
>>>>> it's not with specific intervals.
>>>>
>>>> Right. So when is a transaction completed? This is the time that a
>>>> single request will take.
>>>
>>> The transaction is completed when the vm state is sent to the
>>> secondary, and the primary receives the ack to it.  Please let me
>>> know if the answer is too vague.  What I can tell is that it
>>> can't be super fast.
>>>
>>>>>> On the other hand, if you complete before actually writing out, you
>>>>>> don't get timeouts, but you signal success to the guest when the request
>>>>>> could still fail. What would you do in this case? With a writeback cache
>>>>>> mode we're fine, we can just fail the next flush (until then nothing is
>>>>>> guaranteed to be on disk and order doesn't matter either), but with
>>>>>> cache=writethrough we're in serious trouble.
>>>>>>
>>>>>> Have you thought about this problem? Maybe we end up having to flush the
>>>>>> event-tap queue for each single write in writethrough mode.
>>>>>
>>>>> Yes, and that's what I'm trying to do at this point.
>>>>
>>>> Oh, I must have missed that code. Which patch/function should I look at?
>>>
>>> Maybe I miss-answered to your question.  The device may receive
>>> timeouts.
>>
>> We should pay attention that the guest does not see timeouts. I'm not
>> expecting that I/O will be super fast, and as long as it is only a
>> performance problem we can live with it.
>>
>> However, as soon as the guest gets timeouts it reports I/O errors and
>> eventually offlines the block device. At this point it's not a
>> performance problem any more, but also a correctness problem.
>>
>> This is why I suggested that we flush the event-tap queue (i.e. complete
>> the transaction) immediately after an I/O request has been issued
>> instead of waiting for other events that would complete the transaction.
> 
> Right.  event-tap doesn't queue at specific interval.  It'll
> schedule the transaction as bh once events are tapped .  The
> purpose of the queue is store requests initiated while the
> transaction.  

Ok, now I got it. :-)

So the patches are already doing the best we can do.

> So I believe current implementation should be doing
> what you're expecting.  However, if the guest dirtied huge amount
> of ram and initiated block requests, we may get timeouts even we
> started transaction right away.

Right. We'll have to live with that for now. If it happens, bad luck.

Kevin
Yoshiaki Tamura - Jan. 20, 2011, 3:48 p.m.
2011/1/20 Kevin Wolf <kwolf@redhat.com>:
> Am 20.01.2011 14:50, schrieb Yoshiaki Tamura:
>> 2011/1/20 Kevin Wolf <kwolf@redhat.com>:
>>> Am 20.01.2011 11:39, schrieb Yoshiaki Tamura:
>>>> 2011/1/20 Kevin Wolf <kwolf@redhat.com>:
>>>>> Am 20.01.2011 06:19, schrieb Yoshiaki Tamura:
>>>>>>>>>> +        return;
>>>>>>>>>> +    }
>>>>>>>>>> +
>>>>>>>>>> +    bdrv_aio_writev(bs, blk_req->reqs[0].sector, blk_req->reqs[0].qiov,
>>>>>>>>>> +                    blk_req->reqs[0].nb_sectors, blk_req->reqs[0].cb,
>>>>>>>>>> +                    blk_req->reqs[0].opaque);
>>>>>>>>>
>>>>>>>>> Same here.
>>>>>>>>>
>>>>>>>>>> +    bdrv_flush(bs);
>>>>>>>>>
>>>>>>>>> This looks really strange. What is this supposed to do?
>>>>>>>>>
>>>>>>>>> One point is that you write it immediately after bdrv_aio_write, so you
>>>>>>>>> get an fsync for which you don't know if it includes the current write
>>>>>>>>> request or if it doesn't. Which data do you want to get flushed to the disk?
>>>>>>>>
>>>>>>>> I was expecting to flush the aio request that was just initiated.
>>>>>>>> Am I misunderstanding the function?
>>>>>>>
>>>>>>> Seems so. The function names don't use really clear terminology either,
>>>>>>> so you're not the first one to fall in this trap. Basically we have:
>>>>>>>
>>>>>>> * qemu_aio_flush() waits for all AIO requests to complete. I think you
>>>>>>> wanted to have exactly this, but only for a single block device. Such a
>>>>>>> function doesn't exist yet.
>>>>>>>
>>>>>>> * bdrv_flush() makes sure that all successfully completed requests are
>>>>>>> written to disk (by calling fsync)
>>>>>>>
>>>>>>> * bdrv_aio_flush() is the asynchronous version of bdrv_flush, i.e. run
>>>>>>> the fsync in the thread pool
>>>>>>
>>>>>> Then what I wanted to do is, call qemu_aio_flush first, then
>>>>>> bdrv_flush.  It should be like live migration.
>>>>>
>>>>> Okay, that makes sense. :-)
>>>>>
>>>>>>>>> The other thing is that you introduce a bdrv_flush for each request,
>>>>>>>>> basically forcing everyone to something very similar to writethrough
>>>>>>>>> mode. I'm sure this will have a big impact on performance.
>>>>>>>>
>>>>>>>> The reason is to avoid inversion of queued requests.  Although
>>>>>>>> processing one-by-one is heavy, wouldn't having requests flushed
>>>>>>>> to disk out of order break the disk image?
>>>>>>>
>>>>>>> No, that's fine. If a guest issues two requests at the same time, they
>>>>>>> may complete in any order. You just need to make sure that you don't
>>>>>>> call the completion callback before the request really has completed.
>>>>>>
>>>>>> We need to flush requests, meaning aio and fsync, before sending
>>>>>> the final state of the guests, to make sure we can switch to the
>>>>>> secondary safely.
>>>>>
>>>>> In theory I think you could just re-submit the requests on the secondary
>>>>> if they had not completed yet.
>>>>>
>>>>> But you're right, let's keep things simple for the start.
>>>>>
>>>>>>> I'm just starting to wonder if the guest won't timeout the requests if
>>>>>>> they are queued for too long. Even more, with IDE, it can only handle
>>>>>>> one request at a time, so not completing requests doesn't sound like a
>>>>>>> good idea at all. In what intervals is the event-tap queue flushed?
>>>>>>
>>>>>> The requests are flushed once each transaction completes.  So
>>>>>> it's not with specific intervals.
>>>>>
>>>>> Right. So when is a transaction completed? This is the time that a
>>>>> single request will take.
>>>>
>>>> The transaction is completed when the vm state is sent to the
>>>> secondary, and the primary receives the ack to it.  Please let me
>>>> know if the answer is too vague.  What I can tell is that it
>>>> can't be super fast.
>>>>
>>>>>>> On the other hand, if you complete before actually writing out, you
>>>>>>> don't get timeouts, but you signal success to the guest when the request
>>>>>>> could still fail. What would you do in this case? With a writeback cache
>>>>>>> mode we're fine, we can just fail the next flush (until then nothing is
>>>>>>> guaranteed to be on disk and order doesn't matter either), but with
>>>>>>> cache=writethrough we're in serious trouble.
>>>>>>>
>>>>>>> Have you thought about this problem? Maybe we end up having to flush the
>>>>>>> event-tap queue for each single write in writethrough mode.
>>>>>>
>>>>>> Yes, and that's what I'm trying to do at this point.
>>>>>
>>>>> Oh, I must have missed that code. Which patch/function should I look at?
>>>>
>>>> Maybe I miss-answered to your question.  The device may receive
>>>> timeouts.
>>>
>>> We should pay attention that the guest does not see timeouts. I'm not
>>> expecting that I/O will be super fast, and as long as it is only a
>>> performance problem we can live with it.
>>>
>>> However, as soon as the guest gets timeouts it reports I/O errors and
>>> eventually offlines the block device. At this point it's not a
>>> performance problem any more, but also a correctness problem.
>>>
>>> This is why I suggested that we flush the event-tap queue (i.e. complete
>>> the transaction) immediately after an I/O request has been issued
>>> instead of waiting for other events that would complete the transaction.
>>
>> Right.  event-tap doesn't queue at specific interval.  It'll
>> schedule the transaction as bh once events are tapped .  The
>> purpose of the queue is store requests initiated while the
>> transaction.
>
> Ok, now I got it. :-)
>
> So the patches are already doing the best we can do.
>
>> So I believe current implementation should be doing
>> what you're expecting.  However, if the guest dirtied huge amount
>> of ram and initiated block requests, we may get timeouts even we
>> started transaction right away.
>
> Right. We'll have to live with that for now. If it happens, bad luck.


Yeah.  But I'll keep thinking to remove that limitation :)

Yoshi

>
> Kevin
>
>

Patch

diff --git a/Makefile.target b/Makefile.target
index e15b1c4..f36cd75 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -199,6 +199,7 @@  obj-y += rwhandler.o
 obj-$(CONFIG_KVM) += kvm.o kvm-all.o
 obj-$(CONFIG_NO_KVM) += kvm-stub.o
 LIBS+=-lz
+obj-y += event-tap.o
 
 QEMU_CFLAGS += $(VNC_TLS_CFLAGS)
 QEMU_CFLAGS += $(VNC_SASL_CFLAGS)
diff --git a/event-tap.c b/event-tap.c
new file mode 100644
index 0000000..f492708
--- /dev/null
+++ b/event-tap.c
@@ -0,0 +1,847 @@ 
+/*
+ * Event Tap functions for QEMU
+ *
+ * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu-common.h"
+#include "qemu-error.h"
+#include "block.h"
+#include "block_int.h"
+#include "ioport.h"
+#include "osdep.h"
+#include "sysemu.h"
+#include "hw/hw.h"
+#include "net.h"
+#include "event-tap.h"
+#include "trace.h"
+
+/*
+ * Global event-tap state.  The declaration order is significant:
+ * event_tap_free_log() tests ">= EVENT_TAP_LOAD" to recognise the
+ * load/replay states.
+ */
+enum EVENT_TAP_STATE {
+    EVENT_TAP_OFF,      /* initial value */
+    EVENT_TAP_ON,       /* set by event_tap_register() */
+    EVENT_TAP_FLUSH,    /* set while event_tap_flush_one() runs */
+    EVENT_TAP_LOAD,     /* set by event_tap_load() */
+    EVENT_TAP_REPLAY,   /* set while event_tap_replay() runs */
+};
+
+static enum EVENT_TAP_STATE event_tap_state = EVENT_TAP_OFF;
+/* Placeholder returned by the block proxies instead of a real AIOCB. */
+static BlockDriverAIOCB dummy_acb; /* we may need a pool for dummies */
+
+/*
+ * One logged port write.  On replay, index 0/1/2 selects
+ * cpu_outb()/cpu_outw()/cpu_outl() respectively.
+ */
+typedef struct EventTapIOport {
+    uint32_t address;
+    uint32_t data;
+    int      index;
+} EventTapIOport;
+
+/* Largest MMIO payload that is logged; event_tap_mmio() ignores longer ones. */
+#define MMIO_BUF_SIZE 8
+
+/* One logged MMIO access: len bytes of buf for the given address. */
+typedef struct EventTapMMIO {
+    uint64_t address;
+    uint8_t  buf[MMIO_BUF_SIZE];
+    int      len;
+} EventTapMMIO;
+
+/*
+ * A queued network transmission.  On the tap side the fields are filled in
+ * by event_tap_alloc_net_req(), on the load side by event_tap_net_load().
+ */
+typedef struct EventTapNetReq {
+    char *device_name;      /* heap copy of the client name */
+    int iovcnt;
+    struct iovec *iov;      /* tap side: caller's vector if async, else copy */
+    int vlan_id;            /* only assigned when the client has a vlan */
+    bool vlan_needed;
+    bool async;
+    NetPacketSent *sent_cb;
+} EventTapNetReq;
+
+/* Capacity of the per-event request arrays below. */
+#define MAX_BLOCK_REQUEST 32
+
+/*
+ * A queued block request.  reqs[] holds the I/O parameters with cb/opaque
+ * replaced by the values handed to event_tap_alloc_blk_req(), while
+ * cb[]/opaque[] keep the callbacks of the original submitter.
+ */
+typedef struct EventTapBlkReq {
+    char *device_name;      /* heap copy of the block device name */
+    int num_reqs;
+    int num_cbs;            /* completions still outstanding */
+    bool is_flush;
+    BlockRequest reqs[MAX_BLOCK_REQUEST];
+    BlockDriverCompletionFunc *cb[MAX_BLOCK_REQUEST];
+    void *opaque[MAX_BLOCK_REQUEST];
+} EventTapBlkReq;
+
+/*
+ * Bits of EventTapLog.mode.  IOPORT/MMIO describe the access recorded by
+ * event_tap_ioport()/event_tap_mmio(); NET/BLK describe the request
+ * attached by event_tap_packet()/event_tap_bdrv().
+ */
+#define EVENT_TAP_IOPORT (1 << 0)
+#define EVENT_TAP_MMIO   (1 << 1)
+#define EVENT_TAP_NET    (1 << 2)
+#define EVENT_TAP_BLK    (1 << 3)
+
+/* Selects the IOPORT/MMIO bits; its complement selects NET/BLK. */
+#define EVENT_TAP_TYPE_MASK (EVENT_TAP_NET - 1)
+
+/* One event: an optional I/O access plus the request that followed it. */
+typedef struct EventTapLog {
+    int mode;
+    union {
+        EventTapIOport ioport;
+        EventTapMMIO mmio;
+    };
+    union {
+        EventTapNetReq net_req;
+        EventTapBlkReq blk_req;
+    };
+    QTAILQ_ENTRY(EventTapLog) node;
+} EventTapLog;
+
+/*
+ * Log filled by event_tap_ioport()/event_tap_mmio() that has not been
+ * queued yet; reset to NULL once a request is attached to it.
+ */
+static EventTapLog *last_event_tap;
+
+/* Events waiting to be flushed, saved or replayed (appended at the tail). */
+static QTAILQ_HEAD(, EventTapLog) event_list;
+/* Released logs kept for reuse by event_tap_alloc_log(). */
+static QTAILQ_HEAD(, EventTapLog) event_pool;
+
+/* Callback installed by event_tap_register(); run from the bottom half. */
+static int (*event_tap_cb)(void);
+/* Non-NULL while a bottom half is pending. */
+static QEMUBH *event_tap_bh;
+/* Handle returned when event_tap_replay() is registered. */
+static VMChangeStateEntry *vmstate;
+
+/*
+ * Bottom-half handler: invoke the registered callback, if any, then delete
+ * the one-shot bh so that event_tap_schedule_bh() can create a new one.
+ * The argument is unused.
+ */
+static void event_tap_bh_cb(void *p)
+{
+    if (event_tap_cb) {
+        event_tap_cb();
+    }
+
+    qemu_bh_delete(event_tap_bh);
+    event_tap_bh = NULL;
+}
+
+/*
+ * Arrange for event_tap_bh_cb() to run from a bottom half.  A request made
+ * while a bottom half is already pending is ignored for now.
+ */
+static void event_tap_schedule_bh(void)
+{
+    trace_event_tap_ignore_bh(!!event_tap_bh);
+
+    if (!event_tap_bh) {
+        event_tap_bh = qemu_bh_new(event_tap_bh_cb, NULL);
+        qemu_bh_schedule(event_tap_bh);
+    }
+}
+
+/*
+ * Fill in net_req for a packet sent by client vc.
+ *
+ * The client name is duplicated.  For an async request the caller's iovec
+ * array is referenced as is; otherwise the array and every buffer it points
+ * to are copied.  vlan_id is only written when the client has a vlan.
+ */
+static void event_tap_alloc_net_req(EventTapNetReq *net_req,
+                                   VLANClientState *vc,
+                                   const struct iovec *iov, int iovcnt,
+                                   NetPacketSent *sent_cb, bool async)
+{
+    struct iovec *copy;
+    int n;
+
+    net_req->iovcnt = iovcnt;
+    net_req->async = async;
+    net_req->device_name = qemu_strdup(vc->name);
+    net_req->sent_cb = sent_cb;
+
+    net_req->vlan_needed = vc->vlan != NULL;
+    if (net_req->vlan_needed) {
+        net_req->vlan_id = vc->vlan->id;
+    }
+
+    if (async) {
+        /* borrow the caller's vector, dropping the const qualifier */
+        net_req->iov = (struct iovec *)iov;
+        return;
+    }
+
+    copy = qemu_malloc(sizeof(struct iovec) * iovcnt);
+    for (n = 0; n < iovcnt; n++) {
+        copy[n].iov_base = qemu_malloc(iov[n].iov_len);
+        memcpy(copy[n].iov_base, iov[n].iov_base, iov[n].iov_len);
+        copy[n].iov_len = iov[n].iov_len;
+    }
+    net_req->iov = copy;
+}
+
+/*
+ * Fill in blk_req from num_reqs entries of reqs.
+ *
+ * Each stored request gets cb/opaque as its completion callback, while the
+ * callback and opaque of the incoming request are kept in blk_req->cb[] and
+ * blk_req->opaque[].  The device name is duplicated.  The qiov pointers are
+ * stored as is, not copied.
+ */
+static void event_tap_alloc_blk_req(EventTapBlkReq *blk_req,
+                                    BlockDriverState *bs, BlockRequest *reqs,
+                                    int num_reqs, BlockDriverCompletionFunc *cb,
+                                    void *opaque, bool is_flush)
+{
+    int n;
+
+    blk_req->device_name = qemu_strdup(bs->device_name);
+    blk_req->is_flush = is_flush;
+    blk_req->num_reqs = num_reqs;
+    /* one completion is expected per request */
+    blk_req->num_cbs = num_reqs;
+
+    for (n = 0; n < num_reqs; n++) {
+        BlockRequest *dst = &blk_req->reqs[n];
+
+        dst->sector = reqs[n].sector;
+        dst->nb_sectors = reqs[n].nb_sectors;
+        dst->qiov = reqs[n].qiov;
+        dst->cb = cb;
+        dst->opaque = opaque;
+
+        blk_req->cb[n] = reqs[n].cb;
+        blk_req->opaque[n] = reqs[n].opaque;
+    }
+}
+
+/*
+ * Get an EventTapLog: reuse the first entry of event_pool when there is
+ * one, otherwise allocate a zeroed log.
+ */
+static void *event_tap_alloc_log(void)
+{
+    EventTapLog *log = QTAILQ_FIRST(&event_pool);
+
+    if (log) {
+        QTAILQ_REMOVE(&event_pool, log, node);
+        return log;
+    }
+
+    return qemu_mallocz(sizeof(EventTapLog));
+}
+
+/*
+ * Release the payload attached to log and put the log back on event_pool.
+ *
+ * What is freed depends on who allocated it:
+ *  - net, not async: the iovec array and its buffers are always private
+ *    copies and are freed;
+ *  - net, async: the iovec array is only freed in the load/replay states,
+ *    where event_tap_net_load() allocated it; the buffers are never freed;
+ *  - blk: the qiovs are only freed in the load/replay states, where
+ *    event_tap_blk_load() allocated them (flush requests carry none).
+ * The device name is freed in both the net and the blk case.
+ */
+static void event_tap_free_log(EventTapLog *log)
+{
+    int i, mode = log->mode & ~EVENT_TAP_TYPE_MASK;
+
+    if (mode == EVENT_TAP_NET) {
+        EventTapNetReq *net_req = &log->net_req;
+
+        if (!net_req->async) {
+            for (i = 0; i < net_req->iovcnt; i++) {
+                qemu_free(net_req->iov[i].iov_base);
+            }
+            qemu_free(net_req->iov);
+        } else if (event_tap_state >= EVENT_TAP_LOAD) {
+            qemu_free(net_req->iov);
+        }
+
+        qemu_free(net_req->device_name);
+    } else if (mode == EVENT_TAP_BLK) {
+        EventTapBlkReq *blk_req = &log->blk_req;
+
+        if (event_tap_state >= EVENT_TAP_LOAD && !blk_req->is_flush) {
+            for (i = 0; i < blk_req->num_reqs; i++) {
+                qemu_iovec_destroy(blk_req->reqs[i].qiov);
+                qemu_free(blk_req->reqs[i].qiov);
+            }
+        }
+
+        qemu_free(blk_req->device_name);
+    }
+
+    /* a recycled log must look unused to event_tap_packet()/_bdrv() */
+    log->mode = 0;
+
+    /* return the log to event_pool */
+    QTAILQ_INSERT_HEAD(&event_pool, log, node);
+}
+
+/* Free every log that is parked on event_pool, leaving the pool empty. */
+static void event_tap_free_pool(void)
+{
+    EventTapLog *log;
+
+    while ((log = QTAILQ_FIRST(&event_pool)) != NULL) {
+        QTAILQ_REMOVE(&event_pool, log, node);
+        qemu_free(log);
+    }
+}
+
+/*
+ * Completion callback installed on queued block requests; opaque is the
+ * EventTapBlkReq embedded in its EventTapLog.
+ *
+ * Each call counts one completion.  When the last one arrives, every
+ * original callback is invoked with the ret of that last completion and the
+ * log is released.
+ */
+static void event_tap_blk_cb(void *opaque, int ret)
+{
+    EventTapLog *log = container_of(opaque, EventTapLog, blk_req);
+    EventTapBlkReq *blk_req = opaque;
+    int i;
+
+    blk_req->num_cbs--;
+
+    /* all outstanding requests are flushed */
+    if (blk_req->num_cbs == 0) {
+        for (i = 0; i < blk_req->num_reqs; i++) {
+            blk_req->cb[i](blk_req->opaque[i], ret);
+        }
+
+        event_tap_free_log(log);
+    }
+}
+
+/*
+ * Queue a network packet as an event.
+ *
+ * The request is attached to last_event_tap when an I/O access has been
+ * recorded, otherwise to a fresh log.  If the log already carries a net or
+ * blk request, the packet is not queued and only a trace point is hit.
+ * The bottom half is scheduled only when event_list was empty before the
+ * insertion.
+ */
+static void event_tap_packet(VLANClientState *vc, const struct iovec *iov,
+                            int iovcnt, NetPacketSent *sent_cb, bool async)
+{
+    int empty;
+    EventTapLog *log = last_event_tap;
+
+    if (!log) {
+        trace_event_tap_no_event();
+        log = event_tap_alloc_log();
+    }
+
+    if (log->mode & ~EVENT_TAP_TYPE_MASK) {
+        trace_event_tap_already_used(log->mode & ~EVENT_TAP_TYPE_MASK);
+        return;
+    }
+
+    log->mode |= EVENT_TAP_NET;
+    event_tap_alloc_net_req(&log->net_req, vc, iov, iovcnt, sent_cb, async);
+
+    /* sample emptiness before inserting: it decides whether to kick the bh */
+    empty = QTAILQ_EMPTY(&event_list);
+    QTAILQ_INSERT_TAIL(&event_list, log, node);
+    last_event_tap = NULL;
+
+    if (empty) {
+        event_tap_schedule_bh();
+    }
+}
+
+/*
+ * Queue block requests (or a flush, when is_flush is set) as an event.
+ *
+ * The request is attached to last_event_tap when an I/O access has been
+ * recorded, otherwise to a fresh log.  If the log already carries a net or
+ * blk request, nothing is queued and only a trace point is hit.  The stored
+ * requests complete through event_tap_blk_cb().  The bottom half is
+ * scheduled only when event_list was empty before the insertion.
+ */
+static void event_tap_bdrv(BlockDriverState *bs, BlockRequest *reqs,
+                           int num_reqs, bool is_flush)
+{
+    EventTapLog *log = last_event_tap;
+    int empty;
+
+    if (!log) {
+        trace_event_tap_no_event();
+        log = event_tap_alloc_log();
+    }
+
+    if (log->mode & ~EVENT_TAP_TYPE_MASK) {
+        trace_event_tap_already_used(log->mode & ~EVENT_TAP_TYPE_MASK);
+        return;
+    }
+
+    log->mode |= EVENT_TAP_BLK;
+    event_tap_alloc_blk_req(&log->blk_req, bs, reqs, num_reqs,
+                            event_tap_blk_cb, &log->blk_req, is_flush);
+
+    /* sample emptiness before inserting: it decides whether to kick the bh */
+    empty = QTAILQ_EMPTY(&event_list);
+    QTAILQ_INSERT_TAIL(&event_list, log, node);
+    last_event_tap = NULL;
+
+    if (empty) {
+        event_tap_schedule_bh();
+    }
+}
+
+/*
+ * Proxy for an asynchronous vectored write; event-tap must be on.
+ *
+ * The write is queued through event_tap_bdrv() instead of being issued.
+ * The value returned is the shared dummy_acb, not a handle for this
+ * particular request.
+ */
+BlockDriverAIOCB *event_tap_bdrv_aio_writev(BlockDriverState *bs,
+                                            int64_t sector_num,
+                                            QEMUIOVector *iov,
+                                            int nb_sectors,
+                                            BlockDriverCompletionFunc *cb,
+                                            void *opaque)
+{
+    BlockRequest req = {
+        .sector = sector_num,
+        .nb_sectors = nb_sectors,
+        .qiov = iov,
+        .cb = cb,
+        .opaque = opaque,
+    };
+
+    assert(event_tap_state == EVENT_TAP_ON);
+
+    event_tap_bdrv(bs, &req, 1, false);
+
+    /* return a dummy_acb pointer to prevent from failing */
+    return &dummy_acb;
+}
+
+/*
+ * Proxy for an asynchronous flush; event-tap must be on.
+ *
+ * A request that carries only cb/opaque is queued through event_tap_bdrv()
+ * with the flush flag set.  The shared dummy_acb is returned.
+ */
+BlockDriverAIOCB *event_tap_bdrv_aio_flush(BlockDriverState *bs,
+                                           BlockDriverCompletionFunc *cb,
+                                           void *opaque)
+{
+    /* every member not named here starts out as zero */
+    BlockRequest req = {
+        .cb = cb,
+        .opaque = opaque,
+    };
+
+    assert(event_tap_state == EVENT_TAP_ON);
+
+    event_tap_bdrv(bs, &req, 1, true);
+
+    return &dummy_acb;
+}
+
+/*
+ * Proxy for a synchronous packet send; event-tap must be on.
+ *
+ * The buffer is wrapped in a one-element iovec and queued as a non-async
+ * request without a completion callback.
+ */
+void event_tap_send_packet(VLANClientState *vc, const uint8_t *buf, int size)
+{
+    struct iovec single = {
+        .iov_base = (uint8_t *)buf,
+        .iov_len = size,
+    };
+
+    assert(event_tap_state == EVENT_TAP_ON);
+
+    event_tap_packet(vc, &single, 1, NULL, false);
+}
+/*
+ * Proxy for an asynchronous vectored packet send; event-tap must be on.
+ *
+ * The packet is queued as an async request and 0 is always returned.
+ * NOTE(review): presumably 0 tells the caller the packet was queued and
+ * sent_cb will follow -- confirm against qemu_sendv_packet_async().
+ */
+ssize_t event_tap_sendv_packet_async(VLANClientState *vc,
+                                     const struct iovec *iov,
+                                     int iovcnt, NetPacketSent *sent_cb)
+{
+    assert(event_tap_state == EVENT_TAP_ON);
+    event_tap_packet(vc, iov, iovcnt, sent_cb, 1);
+    return 0;
+}
+
+/*
+ * Turn event-tap on and install cb as the function run from the bottom
+ * half.
+ *
+ * Returns 0 on success, or -EINVAL if event-tap is not off, cb is NULL or a
+ * callback is already installed.
+ */
+int event_tap_register(int (*cb)(void))
+{
+    if (event_tap_state != EVENT_TAP_OFF) {
+        error_report("event-tap is already on");
+        return -EINVAL;
+    }
+
+    if (!cb || event_tap_cb) {
+        error_report("can't set event_tap_cb");
+        return -EINVAL;
+    }
+
+    event_tap_cb = cb;
+    event_tap_state = EVENT_TAP_ON;
+
+    return 0;
+}
+
+/*
+ * Turn event-tap off: drop the callback, flush every queued event and free
+ * the log pool.  Reports an error and does nothing if event-tap is already
+ * off.
+ */
+void event_tap_unregister(void)
+{
+    if (event_tap_state == EVENT_TAP_OFF) {
+        error_report("event-tap is already off");
+        return;
+    }
+
+    /* from here on a pending bottom half finds no callback to run */
+    event_tap_cb = NULL;
+
+    event_tap_flush();
+    event_tap_free_pool();
+
+    /*
+     * Switch off only after flushing: event_tap_flush_one() sets the state
+     * back to EVENT_TAP_ON once it has drained a non-empty queue, which
+     * would otherwise leave event-tap on after unregistering.
+     */
+    event_tap_state = EVENT_TAP_OFF;
+}
+
+/*
+ * Returns non-zero only in the EVENT_TAP_ON state; flushing, loading and
+ * replaying all count as "not on".
+ */
+int event_tap_is_on(void)
+{
+    return (event_tap_state == EVENT_TAP_ON);
+}
+
+/*
+ * Record a port write in last_event_tap, allocating the log if needed.
+ * Does nothing unless event-tap is on.
+ *
+ * The mode is assigned, not or-ed, so an access recorded earlier in the
+ * same log is replaced.
+ */
+void event_tap_ioport(int index, uint32_t address, uint32_t data)
+{
+    if (event_tap_state != EVENT_TAP_ON) {
+        return;
+    }
+
+    if (!last_event_tap) {
+        last_event_tap = event_tap_alloc_log();
+    }
+
+    last_event_tap->mode = EVENT_TAP_IOPORT;
+    last_event_tap->ioport.index = index;
+    last_event_tap->ioport.address = address;
+    last_event_tap->ioport.data = data;
+}
+
+/*
+ * Record an MMIO access of len bytes in last_event_tap, allocating the log
+ * if needed.  Does nothing unless event-tap is on, and ignores accesses
+ * longer than MMIO_BUF_SIZE.
+ *
+ * The mode is assigned, not or-ed, so an access recorded earlier in the
+ * same log is replaced.
+ */
+void event_tap_mmio(uint64_t address, uint8_t *buf, int len)
+{
+    if (event_tap_state != EVENT_TAP_ON || len > MMIO_BUF_SIZE) {
+        return;
+    }
+
+    if (!last_event_tap) {
+        last_event_tap = event_tap_alloc_log();
+    }
+
+    last_event_tap->mode = EVENT_TAP_MMIO;
+    last_event_tap->mmio.address = address;
+    last_event_tap->mmio.len = len;
+    memcpy(last_event_tap->mmio.buf, buf, len);
+}
+
+/*
+ * Hand a queued packet to the net layer.
+ *
+ * The client is looked up by vlan id and name, or by netdev name when no
+ * vlan is involved.  An async request is sent with
+ * qemu_sendv_packet_async(); when that returns non-zero, sent_cb is called
+ * here with the returned length.  A non-async request sends only iov[0].
+ *
+ * NOTE(review): vc is used without a NULL check -- confirm the lookups
+ * cannot fail for a name that was valid when the request was queued.
+ */
+static void event_tap_net_flush(EventTapNetReq *net_req)
+{
+    VLANClientState *vc;
+    ssize_t len;
+
+    if (net_req->vlan_needed) {
+        vc = qemu_find_vlan_client_by_name(NULL, net_req->vlan_id,
+                                           net_req->device_name);
+    } else {
+        vc = qemu_find_netdev(net_req->device_name);
+    }
+
+    if (net_req->async) {
+        len = qemu_sendv_packet_async(vc, net_req->iov, net_req->iovcnt,
+                                      net_req->sent_cb);
+        if (len) {
+            net_req->sent_cb(vc, len);
+        } else {
+            /* packets are queued in the net layer */
+            trace_event_tap_append_packet();
+        }
+    } else {
+        qemu_send_packet(vc, net_req->iov[0].iov_base,
+                         net_req->iov[0].iov_len);
+    }
+}
+
+/*
+ * Submit a queued block request to the device named in blk_req.
+ *
+ * A flush request becomes bdrv_aio_flush(); anything else becomes
+ * bdrv_aio_writev() followed by bdrv_flush().  Only reqs[0] is submitted,
+ * whatever num_reqs says.
+ *
+ * NOTE(review): bs is used without a NULL check -- confirm bdrv_find()
+ * cannot fail here.
+ */
+static void event_tap_blk_flush(EventTapBlkReq *blk_req)
+{
+    BlockDriverState *bs;
+
+    bs = bdrv_find(blk_req->device_name);
+
+    if (blk_req->is_flush) {
+        bdrv_aio_flush(bs, blk_req->reqs[0].cb, blk_req->reqs[0].opaque);
+        return;
+    }
+
+    bdrv_aio_writev(bs, blk_req->reqs[0].sector, blk_req->reqs[0].qiov,
+                    blk_req->reqs[0].nb_sectors, blk_req->reqs[0].cb,
+                    blk_req->reqs[0].opaque);
+    bdrv_flush(bs);
+}
+
+/*
+ * Issue the event at the head of event_list, then call qemu_aio_flush().
+ *
+ * Returns 1 if the queue is empty (on entry or after this step), 0 if more
+ * events remain, and -EINVAL if the head event has an unknown mode.
+ *
+ * The state is EVENT_TAP_FLUSH while the event is issued and becomes
+ * EVENT_TAP_ON again once the queue has been drained.  On the -EINVAL path
+ * the state is left at EVENT_TAP_FLUSH.
+ */
+int event_tap_flush_one(void)
+{
+    EventTapLog *log;
+    int ret;
+
+    if (QTAILQ_EMPTY(&event_list)) {
+        return 1;
+    }
+
+    event_tap_state = EVENT_TAP_FLUSH;
+
+    log = QTAILQ_FIRST(&event_list);
+    switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
+    case EVENT_TAP_NET:
+        event_tap_net_flush(&log->net_req);
+        QTAILQ_REMOVE(&event_list, log, node);
+        event_tap_free_log(log);
+        break;
+    case EVENT_TAP_BLK:
+        event_tap_blk_flush(&log->blk_req);
+        /* not freed here: event_tap_blk_cb() releases the log */
+        QTAILQ_REMOVE(&event_list, log, node);
+        break;
+    default:
+        error_report("Unknown state %d", log->mode);
+        QTAILQ_REMOVE(&event_list, log, node);
+        event_tap_free_log(log);
+        return -EINVAL;
+    }
+
+    qemu_aio_flush();
+    ret = QTAILQ_EMPTY(&event_list);
+    event_tap_state = ret ? EVENT_TAP_ON : EVENT_TAP_FLUSH;
+
+    return ret;
+}
+
+/*
+ * Flush the whole event queue, one event at a time.  An error reported by
+ * event_tap_flush_one() is fatal: it is logged and the process aborts.
+ */
+void event_tap_flush(void)
+{
+    int ret;
+
+    while ((ret = event_tap_flush_one()) == 0) {
+        /* more events are queued */
+    }
+
+    if (ret < 0) {
+        error_report("error flushing event-tap requests");
+        abort();
+    }
+}
+
+/*
+ * VM change state handler that replays the loaded events once the VM runs.
+ *
+ * Must be entered in the EVENT_TAP_LOAD state.  For each event:
+ *  - a non-async net request is sent directly and its recorded I/O access
+ *    is not replayed;
+ *  - otherwise the recorded port or MMIO write is issued again.
+ * Afterwards all events are released, event-tap is switched off and the
+ * handler removes itself.  opaque and reason are unused.
+ */
+static void event_tap_replay(void *opaque, int running, int reason)
+{
+    EventTapLog *log, *next;
+
+    if (!running) {
+        return;
+    }
+
+    assert(event_tap_state == EVENT_TAP_LOAD);
+
+    event_tap_state = EVENT_TAP_REPLAY;
+
+    QTAILQ_FOREACH(log, &event_list, node) {
+        if ((log->mode & ~EVENT_TAP_TYPE_MASK) == EVENT_TAP_NET) {
+            EventTapNetReq *net_req = &log->net_req;
+            if (!net_req->async) {
+                event_tap_net_flush(net_req);
+                continue;
+            }
+        }
+
+        switch (log->mode & EVENT_TAP_TYPE_MASK) {
+        case EVENT_TAP_IOPORT:
+            /* index picks the width of the port write */
+            switch (log->ioport.index) {
+            case 0:
+                cpu_outb(log->ioport.address, log->ioport.data);
+                break;
+            case 1:
+                cpu_outw(log->ioport.address, log->ioport.data);
+                break;
+            case 2:
+                cpu_outl(log->ioport.address, log->ioport.data);
+                break;
+            }
+            break;
+        case EVENT_TAP_MMIO:
+            cpu_physical_memory_rw(log->mmio.address,
+                                   log->mmio.buf,
+                                   log->mmio.len, 1);
+            break;
+        case 0:
+            trace_event_tap_replay_no_event();
+            break;
+        default:
+            /* bails out in the REPLAY state with the handler still set */
+            error_report("Unknown state %d", log->mode);
+            QTAILQ_REMOVE(&event_list, log, node);
+            event_tap_free_log(log);
+            return;
+        }
+    }
+
+    /* remove event logs from queue */
+    QTAILQ_FOREACH_SAFE(log, &event_list, node, next) {
+        QTAILQ_REMOVE(&event_list, log, node);
+        event_tap_free_log(log);
+    }
+
+    event_tap_state = EVENT_TAP_OFF;
+    qemu_del_vm_change_state_handler(vmstate);
+}
+
+/*
+ * Write a logged port access as three big-endian 32-bit words: index,
+ * address, data.  The data word must be full width, since replay may issue
+ * it with cpu_outw() or cpu_outl().
+ */
+static inline void event_tap_ioport_save(QEMUFile *f, EventTapIOport *ioport)
+{
+    qemu_put_be32(f, ioport->index);
+    qemu_put_be32(f, ioport->address);
+    qemu_put_be32(f, ioport->data);
+}
+
+/* Read back a port access in the layout used by event_tap_ioport_save(). */
+static inline void event_tap_ioport_load(QEMUFile *f,
+                                         EventTapIOport *ioport)
+{
+    ioport->index = qemu_get_be32(f);
+    ioport->address = qemu_get_be32(f);
+    ioport->data = qemu_get_be32(f);
+}
+
+/*
+ * Write a logged MMIO access: address (be64), length (one byte), then
+ * that many bytes of payload.
+ */
+static inline void event_tap_mmio_save(QEMUFile *f, EventTapMMIO *mmio)
+{
+    qemu_put_be64(f, mmio->address);
+    qemu_put_byte(f, mmio->len);
+    qemu_put_buffer(f, mmio->buf, mmio->len);
+}
+
+/*
+ * Read a logged MMIO access: address (be64), length (one byte), payload.
+ *
+ * The length comes from the stream and can be as large as 255, while
+ * mmio->buf only holds MMIO_BUF_SIZE bytes.  An oversized record is
+ * reported, truncated to the buffer size, and its surplus bytes are
+ * consumed so that the following records are still read correctly.
+ */
+static inline void event_tap_mmio_load(QEMUFile *f, EventTapMMIO *mmio)
+{
+    int len, surplus = 0;
+
+    mmio->address = qemu_get_be64(f);
+    len = qemu_get_byte(f);
+
+    if (len > MMIO_BUF_SIZE) {
+        error_report("event-tap: mmio length %d exceeds %d, truncating",
+                     len, MMIO_BUF_SIZE);
+        surplus = len - MMIO_BUF_SIZE;
+        len = MMIO_BUF_SIZE;
+    }
+
+    mmio->len = len;
+    qemu_get_buffer(f, mmio->buf, len);
+
+    /* skip what did not fit */
+    while (surplus-- > 0) {
+        qemu_get_byte(f);
+    }
+}
+
+/*
+ * Serialize one queued network request.
+ *
+ * Layout: name length (one byte), name, vlan_id (be32), vlan_needed,
+ * async, iovcnt (be32), then for each element its length (be64) followed
+ * by either the ram address of the buffer (async) or the buffer contents.
+ */
+static void event_tap_net_save(QEMUFile *f, EventTapNetReq *net_req)
+{
+    ram_addr_t page_addr;
+    int i, len;
+
+    len = strlen(net_req->device_name);
+    qemu_put_byte(f, len);
+    qemu_put_buffer(f, (uint8_t *)net_req->device_name, len);
+    /* vlan_id is an int: a single byte would truncate ids above 255 */
+    qemu_put_be32(f, net_req->vlan_id);
+    qemu_put_byte(f, net_req->vlan_needed);
+    qemu_put_byte(f, net_req->async);
+    qemu_put_be32(f, net_req->iovcnt);
+
+    for (i = 0; i < net_req->iovcnt; i++) {
+        qemu_put_be64(f, net_req->iov[i].iov_len);
+        if (net_req->async) {
+            /* the buffer is referenced by its ram address, not copied */
+            page_addr =
+                qemu_ram_addr_from_host_nofail(net_req->iov[i].iov_base);
+            qemu_put_be64(f, page_addr);
+        } else {
+            qemu_put_buffer(f, (uint8_t *)net_req->iov[i].iov_base,
+                            net_req->iov[i].iov_len);
+        }
+    }
+}
+
+/*
+ * Rebuild a network request from the layout written by
+ * event_tap_net_save().
+ *
+ * The name and the iovec array are allocated here.  Async buffers are
+ * resolved from their ram address with qemu_get_ram_ptr(); non-async
+ * buffers are allocated and filled from the stream.  sent_cb is not part
+ * of the stream and is left untouched.
+ */
+static void event_tap_net_load(QEMUFile *f, EventTapNetReq *net_req)
+{
+    ram_addr_t page_addr;
+    int i, len;
+
+    len = qemu_get_byte(f);
+    net_req->device_name = qemu_malloc(len + 1);
+    qemu_get_buffer(f, (uint8_t *)net_req->device_name, len);
+    net_req->device_name[len] = '\0';
+    net_req->vlan_id = qemu_get_be32(f);
+    net_req->vlan_needed = qemu_get_byte(f);
+    net_req->async = qemu_get_byte(f);
+    net_req->iovcnt = qemu_get_be32(f);
+    net_req->iov = qemu_malloc(sizeof(struct iovec) * net_req->iovcnt);
+
+    for (i = 0; i < net_req->iovcnt; i++) {
+        net_req->iov[i].iov_len = qemu_get_be64(f);
+        if (net_req->async) {
+            page_addr = qemu_get_be64(f);
+            net_req->iov[i].iov_base = qemu_get_ram_ptr(page_addr);
+        } else {
+            net_req->iov[i].iov_base = qemu_malloc(net_req->iov[i].iov_len);
+            qemu_get_buffer(f, (uint8_t *)net_req->iov[i].iov_base,
+                            net_req->iov[i].iov_len);
+        }
+    }
+}
+
+/*
+ * Serialize one queued block request.
+ *
+ * Layout: name length (one byte), name, num_reqs (one byte), is_flush (one
+ * byte).  Unless it is a flush, each request follows as sector (be64),
+ * nb_sectors (be32), niov (be32) and, per iovec element, the ram address
+ * of the buffer (be64) and its length (be64).  Buffer contents are not
+ * written.
+ */
+static void event_tap_blk_save(QEMUFile *f, EventTapBlkReq *blk_req)
+{
+    BlockRequest *req;
+    ram_addr_t page_addr;
+    int i, j, len;
+
+    len = strlen(blk_req->device_name);
+    qemu_put_byte(f, len);
+    qemu_put_buffer(f, (uint8_t *)blk_req->device_name, len);
+    qemu_put_byte(f, blk_req->num_reqs);
+    qemu_put_byte(f, blk_req->is_flush);
+
+    /* a flush carries no request data */
+    if (blk_req->is_flush) {
+        return;
+    }
+
+    for (i = 0; i < blk_req->num_reqs; i++) {
+        req = &blk_req->reqs[i];
+        qemu_put_be64(f, req->sector);
+        qemu_put_be32(f, req->nb_sectors);
+        qemu_put_be32(f, req->qiov->niov);
+
+        for (j = 0; j < req->qiov->niov; j++) {
+            page_addr =
+                qemu_ram_addr_from_host_nofail(req->qiov->iov[j].iov_base);
+            qemu_put_be64(f, page_addr);
+            qemu_put_be64(f, req->qiov->iov[j].iov_len);
+        }
+    }
+}
+
+/*
+ * Rebuild a block request from the layout written by event_tap_blk_save().
+ *
+ * The name and one QEMUIOVector per request are allocated here; the iovec
+ * elements point at the ram addresses read from the stream.  num_cbs and
+ * the callback fields are not part of the stream and are left untouched.
+ *
+ * NOTE(review): num_reqs is taken from the stream without being checked
+ * against MAX_BLOCK_REQUEST -- confirm the stream can be trusted here.
+ */
+static void event_tap_blk_load(QEMUFile *f, EventTapBlkReq *blk_req)
+{
+    BlockRequest *req;
+    ram_addr_t page_addr;
+    int i, j, len, niov;
+
+    len = qemu_get_byte(f);
+    blk_req->device_name = qemu_malloc(len + 1);
+    qemu_get_buffer(f, (uint8_t *)blk_req->device_name, len);
+    blk_req->device_name[len] = '\0';
+    blk_req->num_reqs = qemu_get_byte(f);
+    blk_req->is_flush = qemu_get_byte(f);
+
+    /* a flush carries no request data */
+    if (blk_req->is_flush) {
+        return;
+    }
+
+    for (i = 0; i < blk_req->num_reqs; i++) {
+        req = &blk_req->reqs[i];
+        req->sector = qemu_get_be64(f);
+        req->nb_sectors = qemu_get_be32(f);
+        req->qiov = qemu_mallocz(sizeof(QEMUIOVector));
+        niov = qemu_get_be32(f);
+        qemu_iovec_init(req->qiov, niov);
+
+        for (j = 0; j < niov; j++) {
+            void *iov_base;
+            size_t iov_len;
+            page_addr = qemu_get_be64(f);
+            iov_base = qemu_get_ram_ptr(page_addr);
+            iov_len = qemu_get_be64(f);
+            qemu_iovec_add(req->qiov, iov_base, iov_len);
+        }
+    }
+}
+
+/*
+ * Save handler: write every event on event_list, in order, followed by a
+ * zero byte as end marker.
+ *
+ * Each record is the mode byte, the recorded I/O access (if any) and the
+ * net or blk request.  On an unknown mode the function reports an error
+ * and returns at once, without writing the end marker.  opaque is unused.
+ */
+static void event_tap_save(QEMUFile *f, void *opaque)
+{
+    EventTapLog *log;
+
+    QTAILQ_FOREACH(log, &event_list, node) {
+        qemu_put_byte(f, log->mode);
+
+        switch (log->mode & EVENT_TAP_TYPE_MASK) {
+        case EVENT_TAP_IOPORT:
+            event_tap_ioport_save(f, &log->ioport);
+            break;
+        case EVENT_TAP_MMIO:
+            event_tap_mmio_save(f, &log->mmio);
+            break;
+        case 0:
+            trace_event_tap_save_no_event();
+            break;
+        default:
+            error_report("Unknown state %d", log->mode);
+            return;
+        }
+
+        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
+        case EVENT_TAP_NET:
+            event_tap_net_save(f, &log->net_req);
+            break;
+        case EVENT_TAP_BLK:
+            event_tap_blk_save(f, &log->blk_req);
+            break;
+        default:
+            error_report("Unknown state %d", log->mode);
+            return;
+        }
+    }
+
+    qemu_put_byte(f, 0); /* EOF */
+}
+
+/*
+ * Load handler: rebuild event_list from the stream written by
+ * event_tap_save().
+ *
+ * Switches to the EVENT_TAP_LOAD state, releases whatever is still queued
+ * and then reads records until the terminating zero mode byte.  Returns 0
+ * on success, or -EINVAL when a record carries an unknown mode.  opaque
+ * and version_id are unused.
+ */
+static int event_tap_load(QEMUFile *f, void *opaque, int version_id)
+{
+    EventTapLog *log, *next;
+    int mode;
+
+    event_tap_state = EVENT_TAP_LOAD;
+
+    QTAILQ_FOREACH_SAFE(log, &event_list, node, next) {
+        QTAILQ_REMOVE(&event_list, log, node);
+        event_tap_free_log(log);
+    }
+
+    /* loop until EOF */
+    while ((mode = qemu_get_byte(f)) != 0) {
+        log = event_tap_alloc_log();
+
+        log->mode = mode;
+        switch (log->mode & EVENT_TAP_TYPE_MASK) {
+        case EVENT_TAP_IOPORT:
+            event_tap_ioport_load(f, &log->ioport);
+            break;
+        case EVENT_TAP_MMIO:
+            event_tap_mmio_load(f, &log->mmio);
+            break;
+        case 0:
+            trace_event_tap_load_no_event();
+            break;
+        default:
+            error_report("Unknown state %d", log->mode);
+            /*
+             * No net/blk payload has been read for this log yet.  Clear
+             * the mode so that event_tap_free_log() does not try to free
+             * the leftover payload pointers of a recycled log.
+             */
+            log->mode = 0;
+            event_tap_free_log(log);
+            return -EINVAL;
+        }
+
+        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
+        case EVENT_TAP_NET:
+            event_tap_net_load(f, &log->net_req);
+            break;
+        case EVENT_TAP_BLK:
+            event_tap_blk_load(f, &log->blk_req);
+            break;
+        default:
+            /* neither exactly NET nor BLK: free_log() frees no payload */
+            error_report("Unknown state %d", log->mode);
+            event_tap_free_log(log);
+            return -EINVAL;
+        }
+
+        QTAILQ_INSERT_TAIL(&event_list, log, node);
+    }
+
+    return 0;
+}
+
+/*
+ * Register event_tap_replay() as a VM change state handler and remember
+ * the handle so that the handler can remove itself afterwards.
+ */
+void event_tap_schedule_replay(void)
+{
+    vmstate = qemu_add_vm_change_state_handler(event_tap_replay, NULL);
+}
+
+/*
+ * Initialise the event queue and the log pool, and register the
+ * "event-tap" save/load handlers.
+ */
+void event_tap_init(void)
+{
+    QTAILQ_INIT(&event_list);
+    QTAILQ_INIT(&event_pool);
+    register_savevm(NULL, "event-tap", 0, 1,
+                    event_tap_save, event_tap_load, &last_event_tap);
+}
diff --git a/event-tap.h b/event-tap.h
new file mode 100644
index 0000000..2558250
--- /dev/null
+++ b/event-tap.h
@@ -0,0 +1,42 @@ 
+/*
+ * Event Tap functions for QEMU
+ *
+ * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#ifndef EVENT_TAP_H
+#define EVENT_TAP_H
+
+#include "qemu-common.h"
+#include "net.h"
+#include "block.h"
+
+int event_tap_register(int (*cb)(void));
+void event_tap_unregister(void);
+int event_tap_is_on(void); /* qemu-tool.c stub always returns 0 */
+void event_tap_ioport(int index, uint32_t address, uint32_t data);
+void event_tap_mmio(uint64_t address, uint8_t *buf, int len);
+void event_tap_init(void); /* init queues, register "event-tap" savevm */
+void event_tap_flush(void);
+int event_tap_flush_one(void);
+void event_tap_schedule_replay(void); /* installs VM change-state handler */
+
+void event_tap_send_packet(VLANClientState *vc, const uint8_t *buf, int size);
+ssize_t event_tap_sendv_packet_async(VLANClientState *vc,
+                                     const struct iovec *iov,
+                                     int iovcnt, NetPacketSent *sent_cb);
+
+BlockDriverAIOCB *event_tap_bdrv_aio_writev(BlockDriverState *bs,
+                                            int64_t sector_num,
+                                            QEMUIOVector *iov,
+                                            int nb_sectors,
+                                            BlockDriverCompletionFunc *cb,
+                                            void *opaque);
+BlockDriverAIOCB *event_tap_bdrv_aio_flush(BlockDriverState *bs,
+                                           BlockDriverCompletionFunc *cb,
+                                           void *opaque);
+
+#endif /* EVENT_TAP_H */
diff --git a/qemu-tool.c b/qemu-tool.c
index 392e1c9..dcbd566 100644
--- a/qemu-tool.c
+++ b/qemu-tool.c
@@ -16,6 +16,7 @@ 
 #include "qemu-timer.h"
 #include "qemu-log.h"
 #include "sysemu.h"
+#include "event-tap.h"
 
 #include <sys/time.h>
 
@@ -111,3 +112,26 @@  int qemu_set_fd_handler2(int fd,
 {
     return 0;
 }
+
+/* qemu-tool.c stub: no write is issued, cb is never called; returns NULL. */
+BlockDriverAIOCB *event_tap_bdrv_aio_writev(BlockDriverState *bs,
+                                            int64_t sector_num,
+                                            QEMUIOVector *iov, int nb_sectors,
+                                            BlockDriverCompletionFunc *cb,
+                                            void *opaque)
+{
+    return NULL;
+}
+
+/* qemu-tool.c stub: nothing is flushed, cb is never called; returns NULL. */
+BlockDriverAIOCB *event_tap_bdrv_aio_flush(BlockDriverState *bs,
+    BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return NULL;
+}
+
+int event_tap_is_on(void)
+{
+    return 0; /* qemu-tool.c stub: unconditionally 0 */
+}
+
diff --git a/trace-events b/trace-events
index b8c6012..f3244fb 100644
--- a/trace-events
+++ b/trace-events
@@ -229,3 +229,12 @@  disable ft_trans_put_ready(void) "file is ready to put"
 disable ft_trans_get_ready(void) "file is ready to get"
 disable ft_trans_cb(void *cb) "callback %p"
 
+# event-tap.c
+disable event_tap_ignore_bh(int bh) "event_tap_bh is already scheduled %d"
+disable event_tap_net_cb(char *s, ssize_t len) "%s: %zd bytes packet was sent"
+disable event_tap_no_event(void) "no last_event_tap"
+disable event_tap_already_used(int mode) "last_event_tap already used %d"
+disable event_tap_append_packet(void) "This packet is appended"
+disable event_tap_replay_no_event(void) "No event to replay"
+disable event_tap_save_no_event(void) "No event to save"
+disable event_tap_load_no_event(void) "No event to load"