Patchwork [09/21] Introduce event-tap.

login
register
mail settings
Submitter Yoshiaki Tamura
Date Nov. 25, 2010, 6:06 a.m.
Message ID <1290665220-26478-10-git-send-email-tamura.yoshiaki@lab.ntt.co.jp>
Download mbox | patch
Permalink /patch/73002/
State New
Headers show

Comments

Yoshiaki Tamura - Nov. 25, 2010, 6:06 a.m.
event-tap controls when to start FT transaction, and provides proxy
functions to called from net/block devices.  While FT transaction, it
queues up net/block requests, and flush them when the transaction gets
completed.

Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
---
 Makefile.target |    1 +
 block.h         |    9 +
 event-tap.c     |  794 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 event-tap.h     |   34 +++
 net.h           |    4 +
 net/queue.c     |    1 +
 6 files changed, 843 insertions(+), 0 deletions(-)
 create mode 100644 event-tap.c
 create mode 100644 event-tap.h
Stefan Hajnoczi - Nov. 29, 2010, 11 a.m.
On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
<tamura.yoshiaki@lab.ntt.co.jp> wrote:
> event-tap controls when to start FT transaction, and provides proxy
> functions to called from net/block devices.  While FT transaction, it
> queues up net/block requests, and flush them when the transaction gets
> completed.
>
> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
> ---
>  Makefile.target |    1 +
>  block.h         |    9 +
>  event-tap.c     |  794 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  event-tap.h     |   34 +++
>  net.h           |    4 +
>  net/queue.c     |    1 +
>  6 files changed, 843 insertions(+), 0 deletions(-)
>  create mode 100644 event-tap.c
>  create mode 100644 event-tap.h

event_tap_state is checked at the beginning of several functions.  If
there is an unexpected state the function silently returns.  Should
these checks really be assert() so there is an abort and backtrace if
the program ever reaches this state?

> +typedef struct EventTapBlkReq {
> +    char *device_name;
> +    int num_reqs;
> +    int num_cbs;
> +    bool is_multiwrite;

Is multiwrite logging necessary?  If event tap is called from within
the block layer then multiwrite is turned into one or more
bdrv_aio_writev() calls.

> +static void event_tap_replay(void *opaque, int running, int reason)
> +{
> +    EventTapLog *log, *next;
> +
> +    if (!running) {
> +        return;
> +    }
> +
> +    if (event_tap_state != EVENT_TAP_LOAD) {
> +        return;
> +    }
> +
> +    event_tap_state = EVENT_TAP_REPLAY;
> +
> +    QTAILQ_FOREACH(log, &event_list, node) {
> +        EventTapBlkReq *blk_req;
> +
> +        /* event resume */
> +        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
> +        case EVENT_TAP_NET:
> +            event_tap_net_flush(&log->net_req);
> +            break;
> +        case EVENT_TAP_BLK:
> +            blk_req = &log->blk_req;
> +            if ((log->mode & EVENT_TAP_TYPE_MASK) == EVENT_TAP_IOPORT) {
> +                switch (log->ioport.index) {
> +                case 0:
> +                    cpu_outb(log->ioport.address, log->ioport.data);
> +                    break;
> +                case 1:
> +                    cpu_outw(log->ioport.address, log->ioport.data);
> +                    break;
> +                case 2:
> +                    cpu_outl(log->ioport.address, log->ioport.data);
> +                    break;
> +                }
> +            } else {
> +                /* EVENT_TAP_MMIO */
> +                cpu_physical_memory_rw(log->mmio.address,
> +                                       log->mmio.buf,
> +                                       log->mmio.len, 1);
> +            }
> +            break;

Why are net tx packets replayed at the net level but blk requests are
replayed at the pio/mmio level?

I expected everything to replay either as pio/mmio or as net/block.

> +static void event_tap_blk_load(QEMUFile *f, EventTapBlkReq *blk_req)
> +{
> +    BlockRequest *req;
> +    ram_addr_t page_addr;
> +    int i, j, len;
> +
> +    len = qemu_get_byte(f);
> +    blk_req->device_name = qemu_malloc(len + 1);
> +    qemu_get_buffer(f, (uint8_t *)blk_req->device_name, len);
> +    blk_req->device_name[len] = '\0';
> +    blk_req->num_reqs = qemu_get_byte(f);
> +
> +    for (i = 0; i < blk_req->num_reqs; i++) {
> +        req = &blk_req->reqs[i];
> +        req->sector = qemu_get_be64(f);
> +        req->nb_sectors = qemu_get_be32(f);
> +        req->qiov = qemu_malloc(sizeof(QEMUIOVector));

It would make sense to have common QEMUIOVector load/save functions
instead of inlining this code here.

> +static int event_tap_load(QEMUFile *f, void *opaque, int version_id)
> +{
> +    EventTapLog *log, *next;
> +    int mode;
> +
> +    event_tap_state = EVENT_TAP_LOAD;
> +
> +    QTAILQ_FOREACH_SAFE(log, &event_list, node, next) {
> +        QTAILQ_REMOVE(&event_list, log, node);
> +        event_tap_free_log(log);
> +    }
> +
> +    /* loop until EOF */
> +    while ((mode = qemu_get_byte(f)) != 0) {
> +        EventTapLog *log = event_tap_alloc_log();
> +
> +        log->mode = mode;
> +        switch (log->mode & EVENT_TAP_TYPE_MASK) {
> +        case EVENT_TAP_IOPORT:
> +            event_tap_ioport_load(f, &log->ioport);
> +            break;
> +        case EVENT_TAP_MMIO:
> +            event_tap_mmio_load(f, &log->mmio);
> +            break;
> +        case 0:
> +            DPRINTF("No event\n");
> +            break;
> +        default:
> +            fprintf(stderr, "Unknown state %d\n", log->mode);
> +            return -1;

log is leaked here...

> +        }
> +
> +        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
> +        case EVENT_TAP_NET:
> +            event_tap_net_load(f, &log->net_req);
> +            break;
> +        case EVENT_TAP_BLK:
> +            event_tap_blk_load(f, &log->blk_req);
> +            break;
> +        default:
> +            fprintf(stderr, "Unknown state %d\n", log->mode);
> +            return -1;

...and here.

Stefan
Marcelo Tosatti - Nov. 30, 2010, 1:19 a.m.
On Thu, Nov 25, 2010 at 03:06:48PM +0900, Yoshiaki Tamura wrote:
> event-tap controls when to start FT transaction, and provides proxy
> functions to called from net/block devices.  While FT transaction, it
> queues up net/block requests, and flush them when the transaction gets
> completed.
> 
> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>

> +static void event_tap_alloc_blk_req(EventTapBlkReq *blk_req,
> +                                    BlockDriverState *bs, BlockRequest *reqs,
> +                                    int num_reqs, BlockDriverCompletionFunc *cb,
> +                                    void *opaque, bool is_multiwrite)
> +{
> +    int i;
> +
> +    blk_req->num_reqs = num_reqs;
> +    blk_req->num_cbs = num_reqs;
> +    blk_req->device_name = qemu_strdup(bs->device_name);
> +    blk_req->is_multiwrite = is_multiwrite;
> +
> +    for (i = 0; i < num_reqs; i++) {
> +        blk_req->reqs[i].sector = reqs[i].sector;
> +        blk_req->reqs[i].nb_sectors = reqs[i].nb_sectors;
> +        blk_req->reqs[i].qiov = reqs[i].qiov;
> +        blk_req->reqs[i].cb = cb;
> +        blk_req->reqs[i].opaque = opaque;
> +        blk_req->cb[i] = reqs[i].cb;
> +        blk_req->opaque[i] = reqs[i].opaque;
> +    }    
> +}                                   

bdrv_aio_flush should also be logged, so that guest initiated flush is
respected on replay.
Yoshiaki Tamura - Nov. 30, 2010, 9:28 a.m.
2010/11/30 Marcelo Tosatti <mtosatti@redhat.com>:
> On Thu, Nov 25, 2010 at 03:06:48PM +0900, Yoshiaki Tamura wrote:
>> event-tap controls when to start FT transaction, and provides proxy
>> functions to called from net/block devices.  While FT transaction, it
>> queues up net/block requests, and flush them when the transaction gets
>> completed.
>>
>> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
>> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
>
>> +static void event_tap_alloc_blk_req(EventTapBlkReq *blk_req,
>> +                                    BlockDriverState *bs, BlockRequest *reqs,
>> +                                    int num_reqs, BlockDriverCompletionFunc *cb,
>> +                                    void *opaque, bool is_multiwrite)
>> +{
>> +    int i;
>> +
>> +    blk_req->num_reqs = num_reqs;
>> +    blk_req->num_cbs = num_reqs;
>> +    blk_req->device_name = qemu_strdup(bs->device_name);
>> +    blk_req->is_multiwrite = is_multiwrite;
>> +
>> +    for (i = 0; i < num_reqs; i++) {
>> +        blk_req->reqs[i].sector = reqs[i].sector;
>> +        blk_req->reqs[i].nb_sectors = reqs[i].nb_sectors;
>> +        blk_req->reqs[i].qiov = reqs[i].qiov;
>> +        blk_req->reqs[i].cb = cb;
>> +        blk_req->reqs[i].opaque = opaque;
>> +        blk_req->cb[i] = reqs[i].cb;
>> +        blk_req->opaque[i] = reqs[i].opaque;
>> +    }
>> +}
>
> bdrv_aio_flush should also be logged, so that guest initiated flush is
> respected on replay.

In the current implementation w/o flush logging, there might be
order inversion after replay?

Yoshi
Yoshiaki Tamura - Nov. 30, 2010, 9:50 a.m.
2010/11/29 Stefan Hajnoczi <stefanha@gmail.com>:
> On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
> <tamura.yoshiaki@lab.ntt.co.jp> wrote:
>> event-tap controls when to start FT transaction, and provides proxy
>> functions to called from net/block devices.  While FT transaction, it
>> queues up net/block requests, and flush them when the transaction gets
>> completed.
>>
>> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
>> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
>> ---
>>  Makefile.target |    1 +
>>  block.h         |    9 +
>>  event-tap.c     |  794 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  event-tap.h     |   34 +++
>>  net.h           |    4 +
>>  net/queue.c     |    1 +
>>  6 files changed, 843 insertions(+), 0 deletions(-)
>>  create mode 100644 event-tap.c
>>  create mode 100644 event-tap.h
>
> event_tap_state is checked at the beginning of several functions.  If
> there is an unexpected state the function silently returns.  Should
> these checks really be assert() so there is an abort and backtrace if
> the program ever reaches this state?

I'm wondering whether abort is too strong, but I think you're
right because stopping Kemari may not be enough as an error
handling.

>
>> +typedef struct EventTapBlkReq {
>> +    char *device_name;
>> +    int num_reqs;
>> +    int num_cbs;
>> +    bool is_multiwrite;
>
> Is multiwrite logging necessary?  If event tap is called from within
> the block layer then multiwrite is turned into one or more
> bdrv_aio_writev() calls.

If we move event-tap into block layer, I guess it won't be
necessary.

>> +static void event_tap_replay(void *opaque, int running, int reason)
>> +{
>> +    EventTapLog *log, *next;
>> +
>> +    if (!running) {
>> +        return;
>> +    }
>> +
>> +    if (event_tap_state != EVENT_TAP_LOAD) {
>> +        return;
>> +    }
>> +
>> +    event_tap_state = EVENT_TAP_REPLAY;
>> +
>> +    QTAILQ_FOREACH(log, &event_list, node) {
>> +        EventTapBlkReq *blk_req;
>> +
>> +        /* event resume */
>> +        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
>> +        case EVENT_TAP_NET:
>> +            event_tap_net_flush(&log->net_req);
>> +            break;
>> +        case EVENT_TAP_BLK:
>> +            blk_req = &log->blk_req;
>> +            if ((log->mode & EVENT_TAP_TYPE_MASK) == EVENT_TAP_IOPORT) {
>> +                switch (log->ioport.index) {
>> +                case 0:
>> +                    cpu_outb(log->ioport.address, log->ioport.data);
>> +                    break;
>> +                case 1:
>> +                    cpu_outw(log->ioport.address, log->ioport.data);
>> +                    break;
>> +                case 2:
>> +                    cpu_outl(log->ioport.address, log->ioport.data);
>> +                    break;
>> +                }
>> +            } else {
>> +                /* EVENT_TAP_MMIO */
>> +                cpu_physical_memory_rw(log->mmio.address,
>> +                                       log->mmio.buf,
>> +                                       log->mmio.len, 1);
>> +            }
>> +            break;
>
> Why are net tx packets replayed at the net level but blk requests are
> replayed at the pio/mmio level?
>
> I expected everything to replay either as pio/mmio or as net/block.

It's my mistake, sorry about that.  We're just in way of moving
replay from pio/mmio to net/block, and I mixed up.  I'll revert
it to pio/mmio replay in the next spin.

BTW, I would like to ask a question regarding this.  There is a
callback which net/block calls after processing the requests, and
is there a clean way to set this callback on the failovered
host upon replay?

>> +static void event_tap_blk_load(QEMUFile *f, EventTapBlkReq *blk_req)
>> +{
>> +    BlockRequest *req;
>> +    ram_addr_t page_addr;
>> +    int i, j, len;
>> +
>> +    len = qemu_get_byte(f);
>> +    blk_req->device_name = qemu_malloc(len + 1);
>> +    qemu_get_buffer(f, (uint8_t *)blk_req->device_name, len);
>> +    blk_req->device_name[len] = '\0';
>> +    blk_req->num_reqs = qemu_get_byte(f);
>> +
>> +    for (i = 0; i < blk_req->num_reqs; i++) {
>> +        req = &blk_req->reqs[i];
>> +        req->sector = qemu_get_be64(f);
>> +        req->nb_sectors = qemu_get_be32(f);
>> +        req->qiov = qemu_malloc(sizeof(QEMUIOVector));
>
> It would make sense to have common QEMUIOVector load/save functions
> instead of inlining this code here.

OK.

>> +static int event_tap_load(QEMUFile *f, void *opaque, int version_id)
>> +{
>> +    EventTapLog *log, *next;
>> +    int mode;
>> +
>> +    event_tap_state = EVENT_TAP_LOAD;
>> +
>> +    QTAILQ_FOREACH_SAFE(log, &event_list, node, next) {
>> +        QTAILQ_REMOVE(&event_list, log, node);
>> +        event_tap_free_log(log);
>> +    }
>> +
>> +    /* loop until EOF */
>> +    while ((mode = qemu_get_byte(f)) != 0) {
>> +        EventTapLog *log = event_tap_alloc_log();
>> +
>> +        log->mode = mode;
>> +        switch (log->mode & EVENT_TAP_TYPE_MASK) {
>> +        case EVENT_TAP_IOPORT:
>> +            event_tap_ioport_load(f, &log->ioport);
>> +            break;
>> +        case EVENT_TAP_MMIO:
>> +            event_tap_mmio_load(f, &log->mmio);
>> +            break;
>> +        case 0:
>> +            DPRINTF("No event\n");
>> +            break;
>> +        default:
>> +            fprintf(stderr, "Unknown state %d\n", log->mode);
>> +            return -1;
>
> log is leaked here...

Oops:(

>
>> +        }
>> +
>> +        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
>> +        case EVENT_TAP_NET:
>> +            event_tap_net_load(f, &log->net_req);
>> +            break;
>> +        case EVENT_TAP_BLK:
>> +            event_tap_blk_load(f, &log->blk_req);
>> +            break;
>> +        default:
>> +            fprintf(stderr, "Unknown state %d\n", log->mode);
>> +            return -1;
>
> ...and here.

Oops again:(
Will fix them.

Yoshi

>
> Stefan
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Stefan Hajnoczi - Nov. 30, 2010, 10:04 a.m.
On Tue, Nov 30, 2010 at 9:50 AM, Yoshiaki Tamura
<tamura.yoshiaki@lab.ntt.co.jp> wrote:
> 2010/11/29 Stefan Hajnoczi <stefanha@gmail.com>:
>> On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
>> <tamura.yoshiaki@lab.ntt.co.jp> wrote:
>>> event-tap controls when to start FT transaction, and provides proxy
>>> functions to called from net/block devices.  While FT transaction, it
>>> queues up net/block requests, and flush them when the transaction gets
>>> completed.
>>>
>>> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
>>> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
>>> ---
>>>  Makefile.target |    1 +
>>>  block.h         |    9 +
>>>  event-tap.c     |  794 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  event-tap.h     |   34 +++
>>>  net.h           |    4 +
>>>  net/queue.c     |    1 +
>>>  6 files changed, 843 insertions(+), 0 deletions(-)
>>>  create mode 100644 event-tap.c
>>>  create mode 100644 event-tap.h
>>
>> event_tap_state is checked at the beginning of several functions.  If
>> there is an unexpected state the function silently returns.  Should
>> these checks really be assert() so there is an abort and backtrace if
>> the program ever reaches this state?

Fancier error handling would work too.  For example cleaning up,
turning off Kemari, and producing an error message with
error_report().  In that case we need to think through the state of
the environment carefully and make sure we don't cause secondary
failures (like memory leaks).

> BTW, I would like to ask a question regarding this.  There is a
> callback which net/block calls after processing the requests, and
> is there a clean way to set this callback on the failovered
> host upon replay?

I think this is a limitation in the current design.  If requests are
re-issued by Kemari at the net/block level, how will the higher layers
know about these requests?  How will they be prepared to accept
callbacks?

Stefan
Yoshiaki Tamura - Nov. 30, 2010, 10:20 a.m.
2010/11/30 Stefan Hajnoczi <stefanha@gmail.com>:
> On Tue, Nov 30, 2010 at 9:50 AM, Yoshiaki Tamura
> <tamura.yoshiaki@lab.ntt.co.jp> wrote:
>> 2010/11/29 Stefan Hajnoczi <stefanha@gmail.com>:
>>> On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
>>> <tamura.yoshiaki@lab.ntt.co.jp> wrote:
>>>> event-tap controls when to start FT transaction, and provides proxy
>>>> functions to called from net/block devices.  While FT transaction, it
>>>> queues up net/block requests, and flush them when the transaction gets
>>>> completed.
>>>>
>>>> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
>>>> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
>>>> ---
>>>>  Makefile.target |    1 +
>>>>  block.h         |    9 +
>>>>  event-tap.c     |  794 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  event-tap.h     |   34 +++
>>>>  net.h           |    4 +
>>>>  net/queue.c     |    1 +
>>>>  6 files changed, 843 insertions(+), 0 deletions(-)
>>>>  create mode 100644 event-tap.c
>>>>  create mode 100644 event-tap.h
>>>
>>> event_tap_state is checked at the beginning of several functions.  If
>>> there is an unexpected state the function silently returns.  Should
>>> these checks really be assert() so there is an abort and backtrace if
>>> the program ever reaches this state?
>
> Fancier error handling would work too.  For example cleaning up,
> turning off Kemari, and producing an error message with
> error_report().  In that case we need to think through the state of
> the environment carefully and make sure we don't cause secondary
> failures (like memory leaks).

Turning off Kemari should include canceling the transaction which
notifies the secondary.  So same as you commented for
ft_trans_file error handling, I would implement better handling
for event-tap again.

>> BTW, I would like to ask a question regarding this.  There is a
>> callback which net/block calls after processing the requests, and
>> is there a clean way to set this callback on the failovered
>> host upon replay?
>
> I think this is a limitation in the current design.  If requests are
> re-issued by Kemari at the net/block level, how will the higher layers
> know about these requests?  How will they be prepared to accept
> callbacks?

That's why we're using pio/mmio replay at this moment.  With a
dirty hack in device emulators setting callbacks before replay,
block/net replay seems to work, but I don't think that to be a
correct solution.

Yoshi

>
> Stefan
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Marcelo Tosatti - Nov. 30, 2010, 10:25 a.m.
On Tue, Nov 30, 2010 at 06:28:55PM +0900, Yoshiaki Tamura wrote:
> 2010/11/30 Marcelo Tosatti <mtosatti@redhat.com>:
> > On Thu, Nov 25, 2010 at 03:06:48PM +0900, Yoshiaki Tamura wrote:
> >> event-tap controls when to start FT transaction, and provides proxy
> >> functions to called from net/block devices.  While FT transaction, it
> >> queues up net/block requests, and flush them when the transaction gets
> >> completed.
> >>
> >> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
> >> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
> >
> >> +static void event_tap_alloc_blk_req(EventTapBlkReq *blk_req,
> >> +                                    BlockDriverState *bs, BlockRequest *reqs,
> >> +                                    int num_reqs, BlockDriverCompletionFunc *cb,
> >> +                                    void *opaque, bool is_multiwrite)
> >> +{
> >> +    int i;
> >> +
> >> +    blk_req->num_reqs = num_reqs;
> >> +    blk_req->num_cbs = num_reqs;
> >> +    blk_req->device_name = qemu_strdup(bs->device_name);
> >> +    blk_req->is_multiwrite = is_multiwrite;
> >> +
> >> +    for (i = 0; i < num_reqs; i++) {
> >> +        blk_req->reqs[i].sector = reqs[i].sector;
> >> +        blk_req->reqs[i].nb_sectors = reqs[i].nb_sectors;
> >> +        blk_req->reqs[i].qiov = reqs[i].qiov;
> >> +        blk_req->reqs[i].cb = cb;
> >> +        blk_req->reqs[i].opaque = opaque;
> >> +        blk_req->cb[i] = reqs[i].cb;
> >> +        blk_req->opaque[i] = reqs[i].opaque;
> >> +    }
> >> +}
> >
> > bdrv_aio_flush should also be logged, so that guest initiated flush is
> > respected on replay.
> 
> In the current implementation w/o flush logging, there might be
> order inversion after replay?
> 
> Yoshi

Yes, since a vcpu is allowed to continue after synchronization is
scheduled via a bh. For virtio-blk, for example:

1) bdrv_aio_write, event queued.
2) bdrv_aio_flush
3) bdrv_aio_write, event queued.

On replay, there is no flush between the two writes.

Why can't synchronization be done from event-tap itself, synchronously,
to avoid this kind of problem?

The way you hook synchronization into savevm seems unclean. Perhaps
better separation between standard savevm path and FT savevm would make
it cleaner.
Yoshiaki Tamura - Nov. 30, 2010, 10:35 a.m.
Marcelo Tosatti wrote:
> On Tue, Nov 30, 2010 at 06:28:55PM +0900, Yoshiaki Tamura wrote:
>> 2010/11/30 Marcelo Tosatti<mtosatti@redhat.com>:
>>> On Thu, Nov 25, 2010 at 03:06:48PM +0900, Yoshiaki Tamura wrote:
>>>> event-tap controls when to start FT transaction, and provides proxy
>>>> functions to called from net/block devices.  While FT transaction, it
>>>> queues up net/block requests, and flush them when the transaction gets
>>>> completed.
>>>>
>>>> Signed-off-by: Yoshiaki Tamura<tamura.yoshiaki@lab.ntt.co.jp>
>>>> Signed-off-by: OHMURA Kei<ohmura.kei@lab.ntt.co.jp>
>>>
>>>> +static void event_tap_alloc_blk_req(EventTapBlkReq *blk_req,
>>>> +                                    BlockDriverState *bs, BlockRequest *reqs,
>>>> +                                    int num_reqs, BlockDriverCompletionFunc *cb,
>>>> +                                    void *opaque, bool is_multiwrite)
>>>> +{
>>>> +    int i;
>>>> +
>>>> +    blk_req->num_reqs = num_reqs;
>>>> +    blk_req->num_cbs = num_reqs;
>>>> +    blk_req->device_name = qemu_strdup(bs->device_name);
>>>> +    blk_req->is_multiwrite = is_multiwrite;
>>>> +
>>>> +    for (i = 0; i<  num_reqs; i++) {
>>>> +        blk_req->reqs[i].sector = reqs[i].sector;
>>>> +        blk_req->reqs[i].nb_sectors = reqs[i].nb_sectors;
>>>> +        blk_req->reqs[i].qiov = reqs[i].qiov;
>>>> +        blk_req->reqs[i].cb = cb;
>>>> +        blk_req->reqs[i].opaque = opaque;
>>>> +        blk_req->cb[i] = reqs[i].cb;
>>>> +        blk_req->opaque[i] = reqs[i].opaque;
>>>> +    }
>>>> +}
>>>
>>> bdrv_aio_flush should also be logged, so that guest initiated flush is
>>> respected on replay.
>>
>> In the current implementation w/o flush logging, there might be
>> order inversion after replay?
>>
>> Yoshi
>
> Yes, since a vcpu is allowed to continue after synchronization is
> scheduled via a bh. For virtio-blk, for example:
>
> 1) bdrv_aio_write, event queued.
> 2) bdrv_aio_flush
> 3) bdrv_aio_write, event queued.
>
> On replay, there is no flush between the two writes.
>
> Why can't synchronization be done from event-tap itself, synchronously,
> to avoid this kind of problem?

Thanks.  I would fix it.

> The way you hook synchronization into savevm seems unclean. Perhaps
> better separation between standard savevm path and FT savevm would make
> it cleaner.

I think you're mentioning about the changes in migration.c?

Yoshi
Marcelo Tosatti - Nov. 30, 2010, 1:11 p.m.
On Tue, Nov 30, 2010 at 07:35:54PM +0900, Yoshiaki Tamura wrote:
> Marcelo Tosatti wrote:
> >On Tue, Nov 30, 2010 at 06:28:55PM +0900, Yoshiaki Tamura wrote:
> >>2010/11/30 Marcelo Tosatti<mtosatti@redhat.com>:
> >>>On Thu, Nov 25, 2010 at 03:06:48PM +0900, Yoshiaki Tamura wrote:
> >>>>event-tap controls when to start FT transaction, and provides proxy
> >>>>functions to called from net/block devices.  While FT transaction, it
> >>>>queues up net/block requests, and flush them when the transaction gets
> >>>>completed.
> >>>>
> >>>>Signed-off-by: Yoshiaki Tamura<tamura.yoshiaki@lab.ntt.co.jp>
> >>>>Signed-off-by: OHMURA Kei<ohmura.kei@lab.ntt.co.jp>
> >>>
> >>>>+static void event_tap_alloc_blk_req(EventTapBlkReq *blk_req,
> >>>>+                                    BlockDriverState *bs, BlockRequest *reqs,
> >>>>+                                    int num_reqs, BlockDriverCompletionFunc *cb,
> >>>>+                                    void *opaque, bool is_multiwrite)
> >>>>+{
> >>>>+    int i;
> >>>>+
> >>>>+    blk_req->num_reqs = num_reqs;
> >>>>+    blk_req->num_cbs = num_reqs;
> >>>>+    blk_req->device_name = qemu_strdup(bs->device_name);
> >>>>+    blk_req->is_multiwrite = is_multiwrite;
> >>>>+
> >>>>+    for (i = 0; i<  num_reqs; i++) {
> >>>>+        blk_req->reqs[i].sector = reqs[i].sector;
> >>>>+        blk_req->reqs[i].nb_sectors = reqs[i].nb_sectors;
> >>>>+        blk_req->reqs[i].qiov = reqs[i].qiov;
> >>>>+        blk_req->reqs[i].cb = cb;
> >>>>+        blk_req->reqs[i].opaque = opaque;
> >>>>+        blk_req->cb[i] = reqs[i].cb;
> >>>>+        blk_req->opaque[i] = reqs[i].opaque;
> >>>>+    }
> >>>>+}
> >>>
> >>>bdrv_aio_flush should also be logged, so that guest initiated flush is
> >>>respected on replay.
> >>
> >>In the current implementation w/o flush logging, there might be
> >>order inversion after replay?
> >>
> >>Yoshi
> >
> >Yes, since a vcpu is allowed to continue after synchronization is
> >scheduled via a bh. For virtio-blk, for example:
> >
> >1) bdrv_aio_write, event queued.
> >2) bdrv_aio_flush
> >3) bdrv_aio_write, event queued.
> >
> >On replay, there is no flush between the two writes.
> >
> >Why can't synchronization be done from event-tap itself, synchronously,
> >to avoid this kind of problem?
> 
> Thanks.  I would fix it.
> 
> >The way you hook synchronization into savevm seems unclean. Perhaps
> >better separation between standard savevm path and FT savevm would make
> >it cleaner.
> 
> I think you're mentioning about the changes in migration.c?
> 
> Yoshi

The important point is to stop vcpu activity after the event is queued,
and resume once synchronization is performed. Stopping the vm after
Kemari event queueing should do it, once Michael's "stable migration"
patchset is in (and net/block layers fixed).
Yoshiaki Tamura - Jan. 4, 2011, 11:02 a.m.
2010/11/29 Stefan Hajnoczi <stefanha@gmail.com>:
> On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
> <tamura.yoshiaki@lab.ntt.co.jp> wrote:
>> event-tap controls when to start FT transaction, and provides proxy
>> functions to called from net/block devices.  While FT transaction, it
>> queues up net/block requests, and flush them when the transaction gets
>> completed.
>>
>> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
>> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
>> ---
>>  Makefile.target |    1 +
>>  block.h         |    9 +
>>  event-tap.c     |  794 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  event-tap.h     |   34 +++
>>  net.h           |    4 +
>>  net/queue.c     |    1 +
>>  6 files changed, 843 insertions(+), 0 deletions(-)
>>  create mode 100644 event-tap.c
>>  create mode 100644 event-tap.h
>
> event_tap_state is checked at the beginning of several functions.  If
> there is an unexpected state the function silently returns.  Should
> these checks really be assert() so there is an abort and backtrace if
> the program ever reaches this state?
>
>> +typedef struct EventTapBlkReq {
>> +    char *device_name;
>> +    int num_reqs;
>> +    int num_cbs;
>> +    bool is_multiwrite;
>
> Is multiwrite logging necessary?  If event tap is called from within
> the block layer then multiwrite is turned into one or more
> bdrv_aio_writev() calls.
>
>> +static void event_tap_replay(void *opaque, int running, int reason)
>> +{
>> +    EventTapLog *log, *next;
>> +
>> +    if (!running) {
>> +        return;
>> +    }
>> +
>> +    if (event_tap_state != EVENT_TAP_LOAD) {
>> +        return;
>> +    }
>> +
>> +    event_tap_state = EVENT_TAP_REPLAY;
>> +
>> +    QTAILQ_FOREACH(log, &event_list, node) {
>> +        EventTapBlkReq *blk_req;
>> +
>> +        /* event resume */
>> +        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
>> +        case EVENT_TAP_NET:
>> +            event_tap_net_flush(&log->net_req);
>> +            break;
>> +        case EVENT_TAP_BLK:
>> +            blk_req = &log->blk_req;
>> +            if ((log->mode & EVENT_TAP_TYPE_MASK) == EVENT_TAP_IOPORT) {
>> +                switch (log->ioport.index) {
>> +                case 0:
>> +                    cpu_outb(log->ioport.address, log->ioport.data);
>> +                    break;
>> +                case 1:
>> +                    cpu_outw(log->ioport.address, log->ioport.data);
>> +                    break;
>> +                case 2:
>> +                    cpu_outl(log->ioport.address, log->ioport.data);
>> +                    break;
>> +                }
>> +            } else {
>> +                /* EVENT_TAP_MMIO */
>> +                cpu_physical_memory_rw(log->mmio.address,
>> +                                       log->mmio.buf,
>> +                                       log->mmio.len, 1);
>> +            }
>> +            break;
>
> Why are net tx packets replayed at the net level but blk requests are
> replayed at the pio/mmio level?
>
> I expected everything to replay either as pio/mmio or as net/block.

Stefan,

After doing some heavy load tests, I realized that we have to
take a hybrid approach to replay for now.  This is because when a
device moves to the next state (e.g. virtio decreases inuse) is
different between net and block.  For example, virtio-net
decreases inuse upon returning from the net layer, but virtio-blk
does that inside of the callback.  If we only use pio/mmio
replay, even though event-tap tries to replay net requests, some
get lost because the state has proceeded already.  This doesn't
happen with block, because the state is still old enough to
replay.  Note that using hybrid approach won't cause duplicated
requests on the secondary.

Thanks,

Yoshi
Stefan Hajnoczi - Jan. 4, 2011, 11:14 a.m.
On Tue, Jan 4, 2011 at 11:02 AM, Yoshiaki Tamura
<tamura.yoshiaki@lab.ntt.co.jp> wrote:
> After doing some heavy load tests, I realized that we have to
> take a hybrid approach to replay for now.  This is because when a
> device moves to the next state (e.g. virtio decreases inuse) is
> different between net and block.  For example, virtio-net
> decreases inuse upon returning from the net layer, but virtio-blk
> does that inside of the callback.  If we only use pio/mmio
> replay, even though event-tap tries to replay net requests, some
> get lost because the state has proceeded already.  This doesn't
> happen with block, because the state is still old enough to
> replay.  Note that using hybrid approach won't cause duplicated
> requests on the secondary.

Thanks Yoshi.  I think I understand what you're saying.

Stefan
Michael S. Tsirkin - Jan. 4, 2011, 11:19 a.m.
On Tue, Jan 04, 2011 at 08:02:54PM +0900, Yoshiaki Tamura wrote:
> 2010/11/29 Stefan Hajnoczi <stefanha@gmail.com>:
> > On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
> > <tamura.yoshiaki@lab.ntt.co.jp> wrote:
> >> event-tap controls when to start FT transaction, and provides proxy
> >> functions to called from net/block devices.  While FT transaction, it
> >> queues up net/block requests, and flush them when the transaction gets
> >> completed.
> >>
> >> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
> >> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
> >> ---
> >>  Makefile.target |    1 +
> >>  block.h         |    9 +
> >>  event-tap.c     |  794 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >>  event-tap.h     |   34 +++
> >>  net.h           |    4 +
> >>  net/queue.c     |    1 +
> >>  6 files changed, 843 insertions(+), 0 deletions(-)
> >>  create mode 100644 event-tap.c
> >>  create mode 100644 event-tap.h
> >
> > event_tap_state is checked at the beginning of several functions.  If
> > there is an unexpected state the function silently returns.  Should
> > these checks really be assert() so there is an abort and backtrace if
> > the program ever reaches this state?
> >
> >> +typedef struct EventTapBlkReq {
> >> +    char *device_name;
> >> +    int num_reqs;
> >> +    int num_cbs;
> >> +    bool is_multiwrite;
> >
> > Is multiwrite logging necessary?  If event tap is called from within
> > the block layer then multiwrite is turned into one or more
> > bdrv_aio_writev() calls.
> >
> >> +static void event_tap_replay(void *opaque, int running, int reason)
> >> +{
> >> +    EventTapLog *log, *next;
> >> +
> >> +    if (!running) {
> >> +        return;
> >> +    }
> >> +
> >> +    if (event_tap_state != EVENT_TAP_LOAD) {
> >> +        return;
> >> +    }
> >> +
> >> +    event_tap_state = EVENT_TAP_REPLAY;
> >> +
> >> +    QTAILQ_FOREACH(log, &event_list, node) {
> >> +        EventTapBlkReq *blk_req;
> >> +
> >> +        /* event resume */
> >> +        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
> >> +        case EVENT_TAP_NET:
> >> +            event_tap_net_flush(&log->net_req);
> >> +            break;
> >> +        case EVENT_TAP_BLK:
> >> +            blk_req = &log->blk_req;
> >> +            if ((log->mode & EVENT_TAP_TYPE_MASK) == EVENT_TAP_IOPORT) {
> >> +                switch (log->ioport.index) {
> >> +                case 0:
> >> +                    cpu_outb(log->ioport.address, log->ioport.data);
> >> +                    break;
> >> +                case 1:
> >> +                    cpu_outw(log->ioport.address, log->ioport.data);
> >> +                    break;
> >> +                case 2:
> >> +                    cpu_outl(log->ioport.address, log->ioport.data);
> >> +                    break;
> >> +                }
> >> +            } else {
> >> +                /* EVENT_TAP_MMIO */
> >> +                cpu_physical_memory_rw(log->mmio.address,
> >> +                                       log->mmio.buf,
> >> +                                       log->mmio.len, 1);
> >> +            }
> >> +            break;
> >
> > Why are net tx packets replayed at the net level but blk requests are
> > replayed at the pio/mmio level?
> >
> > I expected everything to replay either as pio/mmio or as net/block.
> 
> Stefan,
> 
> After doing some heavy load tests, I realized that we have to
> take a hybrid approach to replay for now.  This is because when a
> device moves to the next state (e.g. virtio decreases inuse) is
> different between net and block.  For example, virtio-net
> decreases inuse upon returning from the net layer,
> but virtio-blk
> does that inside of the callback.

For TX, virtio-net calls virtqueue_push from virtio_net_tx_complete.
For RX, virtio-net calls virtqueue_flush from virtio_net_receive.
Both are invoked from a callback.

> If we only use pio/mmio
> replay, even though event-tap tries to replay net requests, some
> get lost because the state has proceeded already.

It seems that all you need to do to avoid this is to
delay the callback?

> This doesn't
> happen with block, because the state is still old enough to
> replay.  Note that using hybrid approach won't cause duplicated
> requests on the secondary.

An assumption devices make is that a buffer is unused once
completion callback was invoked. Does this violate that assumption?
Yoshiaki Tamura - Jan. 4, 2011, 12:20 p.m.
2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
> On Tue, Jan 04, 2011 at 08:02:54PM +0900, Yoshiaki Tamura wrote:
>> 2010/11/29 Stefan Hajnoczi <stefanha@gmail.com>:
>> > On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
>> > <tamura.yoshiaki@lab.ntt.co.jp> wrote:
>> >> event-tap controls when to start FT transaction, and provides proxy
>> >> functions to called from net/block devices.  While FT transaction, it
>> >> queues up net/block requests, and flush them when the transaction gets
>> >> completed.
>> >>
>> >> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
>> >> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
>> >> ---
>> >>  Makefile.target |    1 +
>> >>  block.h         |    9 +
>> >>  event-tap.c     |  794 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> >>  event-tap.h     |   34 +++
>> >>  net.h           |    4 +
>> >>  net/queue.c     |    1 +
>> >>  6 files changed, 843 insertions(+), 0 deletions(-)
>> >>  create mode 100644 event-tap.c
>> >>  create mode 100644 event-tap.h
>> >
>> > event_tap_state is checked at the beginning of several functions.  If
>> > there is an unexpected state the function silently returns.  Should
>> > these checks really be assert() so there is an abort and backtrace if
>> > the program ever reaches this state?
>> >
>> >> +typedef struct EventTapBlkReq {
>> >> +    char *device_name;
>> >> +    int num_reqs;
>> >> +    int num_cbs;
>> >> +    bool is_multiwrite;
>> >
>> > Is multiwrite logging necessary?  If event tap is called from within
>> > the block layer then multiwrite is turned into one or more
>> > bdrv_aio_writev() calls.
>> >
>> >> +static void event_tap_replay(void *opaque, int running, int reason)
>> >> +{
>> >> +    EventTapLog *log, *next;
>> >> +
>> >> +    if (!running) {
>> >> +        return;
>> >> +    }
>> >> +
>> >> +    if (event_tap_state != EVENT_TAP_LOAD) {
>> >> +        return;
>> >> +    }
>> >> +
>> >> +    event_tap_state = EVENT_TAP_REPLAY;
>> >> +
>> >> +    QTAILQ_FOREACH(log, &event_list, node) {
>> >> +        EventTapBlkReq *blk_req;
>> >> +
>> >> +        /* event resume */
>> >> +        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
>> >> +        case EVENT_TAP_NET:
>> >> +            event_tap_net_flush(&log->net_req);
>> >> +            break;
>> >> +        case EVENT_TAP_BLK:
>> >> +            blk_req = &log->blk_req;
>> >> +            if ((log->mode & EVENT_TAP_TYPE_MASK) == EVENT_TAP_IOPORT) {
>> >> +                switch (log->ioport.index) {
>> >> +                case 0:
>> >> +                    cpu_outb(log->ioport.address, log->ioport.data);
>> >> +                    break;
>> >> +                case 1:
>> >> +                    cpu_outw(log->ioport.address, log->ioport.data);
>> >> +                    break;
>> >> +                case 2:
>> >> +                    cpu_outl(log->ioport.address, log->ioport.data);
>> >> +                    break;
>> >> +                }
>> >> +            } else {
>> >> +                /* EVENT_TAP_MMIO */
>> >> +                cpu_physical_memory_rw(log->mmio.address,
>> >> +                                       log->mmio.buf,
>> >> +                                       log->mmio.len, 1);
>> >> +            }
>> >> +            break;
>> >
>> > Why are net tx packets replayed at the net level but blk requests are
>> > replayed at the pio/mmio level?
>> >
>> > I expected everything to replay either as pio/mmio or as net/block.
>>
>> Stefan,
>>
>> After doing some heavy load tests, I realized that we have to
>> take a hybrid approach to replay for now.  This is because when a
>> device moves to the next state (e.g. virtio decreases inuse) is
>> different between net and block.  For example, virtio-net
>> decreases inuse upon returning from the net layer,
>> but virtio-blk
>> does that inside of the callback.
>
> For TX, virtio-net calls virtqueue_push from virtio_net_tx_complete.
> For RX, virtio-net calls virtqueue_flush from virtio_net_receive.
> Both are invoked from a callback.
>
>> If we only use pio/mmio
>> replay, even though event-tap tries to replay net requests, some
>> get lost because the state has proceeded already.
>
> It seems that all you need to do to avoid this is to
> delay the callback?

Yeah, if it's possible.  But if you take a look at virtio-net,
you'll see that virtio_push is called immediately after calling
qemu_sendv_packet while virtio-blk does that in the callback.

>
>> This doesn't
>> happen with block, because the state is still old enough to
>> replay.  Note that using hybrid approach won't cause duplicated
>> requests on the secondary.
>
> An assumption devices make is that a buffer is unused once
> completion callback was invoked. Does this violate that assumption?

No, it shouldn't.  In case of net with net layer replay, we copy
the content of the requests, and in case of block, because we
haven't called the callback yet, the requests remains fresh.

Yoshi

>
> --
> MST
>
>
Michael S. Tsirkin - Jan. 4, 2011, 1:10 p.m.
On Tue, Jan 04, 2011 at 09:20:53PM +0900, Yoshiaki Tamura wrote:
> 2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
> > On Tue, Jan 04, 2011 at 08:02:54PM +0900, Yoshiaki Tamura wrote:
> >> 2010/11/29 Stefan Hajnoczi <stefanha@gmail.com>:
> >> > On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
> >> > <tamura.yoshiaki@lab.ntt.co.jp> wrote:
> >> >> event-tap controls when to start FT transaction, and provides proxy
> >> >> functions to called from net/block devices.  While FT transaction, it
> >> >> queues up net/block requests, and flush them when the transaction gets
> >> >> completed.
> >> >>
> >> >> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
> >> >> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
> >> >> ---
> >> >>  Makefile.target |    1 +
> >> >>  block.h         |    9 +
> >> >>  event-tap.c     |  794 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >> >>  event-tap.h     |   34 +++
> >> >>  net.h           |    4 +
> >> >>  net/queue.c     |    1 +
> >> >>  6 files changed, 843 insertions(+), 0 deletions(-)
> >> >>  create mode 100644 event-tap.c
> >> >>  create mode 100644 event-tap.h
> >> >
> >> > event_tap_state is checked at the beginning of several functions.  If
> >> > there is an unexpected state the function silently returns.  Should
> >> > these checks really be assert() so there is an abort and backtrace if
> >> > the program ever reaches this state?
> >> >
> >> >> +typedef struct EventTapBlkReq {
> >> >> +    char *device_name;
> >> >> +    int num_reqs;
> >> >> +    int num_cbs;
> >> >> +    bool is_multiwrite;
> >> >
> >> > Is multiwrite logging necessary?  If event tap is called from within
> >> > the block layer then multiwrite is turned into one or more
> >> > bdrv_aio_writev() calls.
> >> >
> >> >> +static void event_tap_replay(void *opaque, int running, int reason)
> >> >> +{
> >> >> +    EventTapLog *log, *next;
> >> >> +
> >> >> +    if (!running) {
> >> >> +        return;
> >> >> +    }
> >> >> +
> >> >> +    if (event_tap_state != EVENT_TAP_LOAD) {
> >> >> +        return;
> >> >> +    }
> >> >> +
> >> >> +    event_tap_state = EVENT_TAP_REPLAY;
> >> >> +
> >> >> +    QTAILQ_FOREACH(log, &event_list, node) {
> >> >> +        EventTapBlkReq *blk_req;
> >> >> +
> >> >> +        /* event resume */
> >> >> +        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
> >> >> +        case EVENT_TAP_NET:
> >> >> +            event_tap_net_flush(&log->net_req);
> >> >> +            break;
> >> >> +        case EVENT_TAP_BLK:
> >> >> +            blk_req = &log->blk_req;
> >> >> +            if ((log->mode & EVENT_TAP_TYPE_MASK) == EVENT_TAP_IOPORT) {
> >> >> +                switch (log->ioport.index) {
> >> >> +                case 0:
> >> >> +                    cpu_outb(log->ioport.address, log->ioport.data);
> >> >> +                    break;
> >> >> +                case 1:
> >> >> +                    cpu_outw(log->ioport.address, log->ioport.data);
> >> >> +                    break;
> >> >> +                case 2:
> >> >> +                    cpu_outl(log->ioport.address, log->ioport.data);
> >> >> +                    break;
> >> >> +                }
> >> >> +            } else {
> >> >> +                /* EVENT_TAP_MMIO */
> >> >> +                cpu_physical_memory_rw(log->mmio.address,
> >> >> +                                       log->mmio.buf,
> >> >> +                                       log->mmio.len, 1);
> >> >> +            }
> >> >> +            break;
> >> >
> >> > Why are net tx packets replayed at the net level but blk requests are
> >> > replayed at the pio/mmio level?
> >> >
> >> > I expected everything to replay either as pio/mmio or as net/block.
> >>
> >> Stefan,
> >>
> >> After doing some heavy load tests, I realized that we have to
> >> take a hybrid approach to replay for now.  This is because when a
> >> device moves to the next state (e.g. virtio decreases inuse) is
> >> different between net and block.  For example, virtio-net
> >> decreases inuse upon returning from the net layer,
> >> but virtio-blk
> >> does that inside of the callback.
> >
> > For TX, virtio-net calls virtqueue_push from virtio_net_tx_complete.
> > For RX, virtio-net calls virtqueue_flush from virtio_net_receive.
> > Both are invoked from a callback.
> >
> >> If we only use pio/mmio
> >> replay, even though event-tap tries to replay net requests, some
> >> get lost because the state has proceeded already.
> >
> > It seems that all you need to do to avoid this is to
> > delay the callback?
> 
> Yeah, if it's possible.  But if you take a look at virtio-net,
> you'll see that virtio_push is called immediately after calling
> qemu_sendv_packet
> while virtio-blk does that in the callback.

This is only if the packet was sent immediately.
I was referring to the case where the packet is queued.

> >
> >> This doesn't
> >> happen with block, because the state is still old enough to
> >> replay.  Note that using hybrid approach won't cause duplicated
> >> requests on the secondary.
> >
> > An assumption devices make is that a buffer is unused once
> > completion callback was invoked. Does this violate that assumption?
> 
> No, it shouldn't.  In case of net with net layer replay, we copy
> the content of the requests, and in case of block, because we
> haven't called the callback yet, the requests remains fresh.
> 
> Yoshi
> 

Yes, as long as you copy it should be fine.  Maybe it's a good idea for
event-tap to queue all packets to avoid the copy and avoid the need to
replay at the net level.

> >
> > --
> > MST
> >
> >
Yoshiaki Tamura - Jan. 4, 2011, 1:45 p.m.
2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
> On Tue, Jan 04, 2011 at 09:20:53PM +0900, Yoshiaki Tamura wrote:
>> 2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
>> > On Tue, Jan 04, 2011 at 08:02:54PM +0900, Yoshiaki Tamura wrote:
>> >> 2010/11/29 Stefan Hajnoczi <stefanha@gmail.com>:
>> >> > On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
>> >> > <tamura.yoshiaki@lab.ntt.co.jp> wrote:
>> >> >> event-tap controls when to start FT transaction, and provides proxy
>> >> >> functions to called from net/block devices.  While FT transaction, it
>> >> >> queues up net/block requests, and flush them when the transaction gets
>> >> >> completed.
>> >> >>
>> >> >> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
>> >> >> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
>> >> >> ---
>> >> >>  Makefile.target |    1 +
>> >> >>  block.h         |    9 +
>> >> >>  event-tap.c     |  794 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> >> >>  event-tap.h     |   34 +++
>> >> >>  net.h           |    4 +
>> >> >>  net/queue.c     |    1 +
>> >> >>  6 files changed, 843 insertions(+), 0 deletions(-)
>> >> >>  create mode 100644 event-tap.c
>> >> >>  create mode 100644 event-tap.h
>> >> >
>> >> > event_tap_state is checked at the beginning of several functions.  If
>> >> > there is an unexpected state the function silently returns.  Should
>> >> > these checks really be assert() so there is an abort and backtrace if
>> >> > the program ever reaches this state?
>> >> >
>> >> >> +typedef struct EventTapBlkReq {
>> >> >> +    char *device_name;
>> >> >> +    int num_reqs;
>> >> >> +    int num_cbs;
>> >> >> +    bool is_multiwrite;
>> >> >
>> >> > Is multiwrite logging necessary?  If event tap is called from within
>> >> > the block layer then multiwrite is turned into one or more
>> >> > bdrv_aio_writev() calls.
>> >> >
>> >> >> +static void event_tap_replay(void *opaque, int running, int reason)
>> >> >> +{
>> >> >> +    EventTapLog *log, *next;
>> >> >> +
>> >> >> +    if (!running) {
>> >> >> +        return;
>> >> >> +    }
>> >> >> +
>> >> >> +    if (event_tap_state != EVENT_TAP_LOAD) {
>> >> >> +        return;
>> >> >> +    }
>> >> >> +
>> >> >> +    event_tap_state = EVENT_TAP_REPLAY;
>> >> >> +
>> >> >> +    QTAILQ_FOREACH(log, &event_list, node) {
>> >> >> +        EventTapBlkReq *blk_req;
>> >> >> +
>> >> >> +        /* event resume */
>> >> >> +        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
>> >> >> +        case EVENT_TAP_NET:
>> >> >> +            event_tap_net_flush(&log->net_req);
>> >> >> +            break;
>> >> >> +        case EVENT_TAP_BLK:
>> >> >> +            blk_req = &log->blk_req;
>> >> >> +            if ((log->mode & EVENT_TAP_TYPE_MASK) == EVENT_TAP_IOPORT) {
>> >> >> +                switch (log->ioport.index) {
>> >> >> +                case 0:
>> >> >> +                    cpu_outb(log->ioport.address, log->ioport.data);
>> >> >> +                    break;
>> >> >> +                case 1:
>> >> >> +                    cpu_outw(log->ioport.address, log->ioport.data);
>> >> >> +                    break;
>> >> >> +                case 2:
>> >> >> +                    cpu_outl(log->ioport.address, log->ioport.data);
>> >> >> +                    break;
>> >> >> +                }
>> >> >> +            } else {
>> >> >> +                /* EVENT_TAP_MMIO */
>> >> >> +                cpu_physical_memory_rw(log->mmio.address,
>> >> >> +                                       log->mmio.buf,
>> >> >> +                                       log->mmio.len, 1);
>> >> >> +            }
>> >> >> +            break;
>> >> >
>> >> > Why are net tx packets replayed at the net level but blk requests are
>> >> > replayed at the pio/mmio level?
>> >> >
>> >> > I expected everything to replay either as pio/mmio or as net/block.
>> >>
>> >> Stefan,
>> >>
>> >> After doing some heavy load tests, I realized that we have to
>> >> take a hybrid approach to replay for now.  This is because when a
>> >> device moves to the next state (e.g. virtio decreases inuse) is
>> >> different between net and block.  For example, virtio-net
>> >> decreases inuse upon returning from the net layer,
>> >> but virtio-blk
>> >> does that inside of the callback.
>> >
>> > For TX, virtio-net calls virtqueue_push from virtio_net_tx_complete.
>> > For RX, virtio-net calls virtqueue_flush from virtio_net_receive.
>> > Both are invoked from a callback.
>> >
>> >> If we only use pio/mmio
>> >> replay, even though event-tap tries to replay net requests, some
>> >> get lost because the state has proceeded already.
>> >
>> > It seems that all you need to do to avoid this is to
>> > delay the callback?
>>
>> Yeah, if it's possible.  But if you take a look at virtio-net,
>> you'll see that virtio_push is called immediately after calling
>> qemu_sendv_packet
>> while virtio-blk does that in the callback.
>
> This is only if the packet was sent immediately.
> I was referring to the case where the packet is queued.

I see.  I usually don't see packets get queued in the net layer.
What would be the effect to devices?  Restraint sending packets?

>
>> >
>> >> This doesn't
>> >> happen with block, because the state is still old enough to
>> >> replay.  Note that using hybrid approach won't cause duplicated
>> >> requests on the secondary.
>> >
>> > An assumption devices make is that a buffer is unused once
>> > completion callback was invoked. Does this violate that assumption?
>>
>> No, it shouldn't.  In case of net with net layer replay, we copy
>> the content of the requests, and in case of block, because we
>> haven't called the callback yet, the requests remains fresh.
>>
>> Yoshi
>>
>
> Yes, as long as you copy it should be fine.  Maybe it's a good idea for
> event-tap to queue all packets to avoid the copy and avoid the need to
> replay at the net level.

If queuing works fine for the devices, it seems to be a good
idea.  I think the ordering issue doesn't happen still.

Yoshi

>
>> >
>> > --
>> > MST
>> >
>> >
>
>
Michael S. Tsirkin - Jan. 4, 2011, 2:42 p.m.
On Tue, Jan 04, 2011 at 10:45:13PM +0900, Yoshiaki Tamura wrote:
> 2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
> > On Tue, Jan 04, 2011 at 09:20:53PM +0900, Yoshiaki Tamura wrote:
> >> 2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
> >> > On Tue, Jan 04, 2011 at 08:02:54PM +0900, Yoshiaki Tamura wrote:
> >> >> 2010/11/29 Stefan Hajnoczi <stefanha@gmail.com>:
> >> >> > On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
> >> >> > <tamura.yoshiaki@lab.ntt.co.jp> wrote:
> >> >> >> event-tap controls when to start FT transaction, and provides proxy
> >> >> >> functions to called from net/block devices.  While FT transaction, it
> >> >> >> queues up net/block requests, and flush them when the transaction gets
> >> >> >> completed.
> >> >> >>
> >> >> >> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
> >> >> >> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
> >> >> >> ---
> >> >> >>  Makefile.target |    1 +
> >> >> >>  block.h         |    9 +
> >> >> >>  event-tap.c     |  794 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >> >> >>  event-tap.h     |   34 +++
> >> >> >>  net.h           |    4 +
> >> >> >>  net/queue.c     |    1 +
> >> >> >>  6 files changed, 843 insertions(+), 0 deletions(-)
> >> >> >>  create mode 100644 event-tap.c
> >> >> >>  create mode 100644 event-tap.h
> >> >> >
> >> >> > event_tap_state is checked at the beginning of several functions.  If
> >> >> > there is an unexpected state the function silently returns.  Should
> >> >> > these checks really be assert() so there is an abort and backtrace if
> >> >> > the program ever reaches this state?
> >> >> >
> >> >> >> +typedef struct EventTapBlkReq {
> >> >> >> +    char *device_name;
> >> >> >> +    int num_reqs;
> >> >> >> +    int num_cbs;
> >> >> >> +    bool is_multiwrite;
> >> >> >
> >> >> > Is multiwrite logging necessary?  If event tap is called from within
> >> >> > the block layer then multiwrite is turned into one or more
> >> >> > bdrv_aio_writev() calls.
> >> >> >
> >> >> >> +static void event_tap_replay(void *opaque, int running, int reason)
> >> >> >> +{
> >> >> >> +    EventTapLog *log, *next;
> >> >> >> +
> >> >> >> +    if (!running) {
> >> >> >> +        return;
> >> >> >> +    }
> >> >> >> +
> >> >> >> +    if (event_tap_state != EVENT_TAP_LOAD) {
> >> >> >> +        return;
> >> >> >> +    }
> >> >> >> +
> >> >> >> +    event_tap_state = EVENT_TAP_REPLAY;
> >> >> >> +
> >> >> >> +    QTAILQ_FOREACH(log, &event_list, node) {
> >> >> >> +        EventTapBlkReq *blk_req;
> >> >> >> +
> >> >> >> +        /* event resume */
> >> >> >> +        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
> >> >> >> +        case EVENT_TAP_NET:
> >> >> >> +            event_tap_net_flush(&log->net_req);
> >> >> >> +            break;
> >> >> >> +        case EVENT_TAP_BLK:
> >> >> >> +            blk_req = &log->blk_req;
> >> >> >> +            if ((log->mode & EVENT_TAP_TYPE_MASK) == EVENT_TAP_IOPORT) {
> >> >> >> +                switch (log->ioport.index) {
> >> >> >> +                case 0:
> >> >> >> +                    cpu_outb(log->ioport.address, log->ioport.data);
> >> >> >> +                    break;
> >> >> >> +                case 1:
> >> >> >> +                    cpu_outw(log->ioport.address, log->ioport.data);
> >> >> >> +                    break;
> >> >> >> +                case 2:
> >> >> >> +                    cpu_outl(log->ioport.address, log->ioport.data);
> >> >> >> +                    break;
> >> >> >> +                }
> >> >> >> +            } else {
> >> >> >> +                /* EVENT_TAP_MMIO */
> >> >> >> +                cpu_physical_memory_rw(log->mmio.address,
> >> >> >> +                                       log->mmio.buf,
> >> >> >> +                                       log->mmio.len, 1);
> >> >> >> +            }
> >> >> >> +            break;
> >> >> >
> >> >> > Why are net tx packets replayed at the net level but blk requests are
> >> >> > replayed at the pio/mmio level?
> >> >> >
> >> >> > I expected everything to replay either as pio/mmio or as net/block.
> >> >>
> >> >> Stefan,
> >> >>
> >> >> After doing some heavy load tests, I realized that we have to
> >> >> take a hybrid approach to replay for now.  This is because when a
> >> >> device moves to the next state (e.g. virtio decreases inuse) is
> >> >> different between net and block.  For example, virtio-net
> >> >> decreases inuse upon returning from the net layer,
> >> >> but virtio-blk
> >> >> does that inside of the callback.
> >> >
> >> > For TX, virtio-net calls virtqueue_push from virtio_net_tx_complete.
> >> > For RX, virtio-net calls virtqueue_flush from virtio_net_receive.
> >> > Both are invoked from a callback.
> >> >
> >> >> If we only use pio/mmio
> >> >> replay, even though event-tap tries to replay net requests, some
> >> >> get lost because the state has proceeded already.
> >> >
> >> > It seems that all you need to do to avoid this is to
> >> > delay the callback?
> >>
> >> Yeah, if it's possible.  But if you take a look at virtio-net,
> >> you'll see that virtio_push is called immediately after calling
> >> qemu_sendv_packet
> >> while virtio-blk does that in the callback.
> >
> > This is only if the packet was sent immediately.
> > I was referring to the case where the packet is queued.
> 
> I see.  I usually don't see packets get queued in the net layer.
> What would be the effect to devices?  Restraint sending packets?

Yes.

> >
> >> >
> >> >> This doesn't
> >> >> happen with block, because the state is still old enough to
> >> >> replay.  Note that using hybrid approach won't cause duplicated
> >> >> requests on the secondary.
> >> >
> >> > An assumption devices make is that a buffer is unused once
> >> > completion callback was invoked. Does this violate that assumption?
> >>
> >> No, it shouldn't.  In case of net with net layer replay, we copy
> >> the content of the requests, and in case of block, because we
> >> haven't called the callback yet, the requests remains fresh.
> >>
> >> Yoshi
> >>
> >
> > Yes, as long as you copy it should be fine.  Maybe it's a good idea for
> > event-tap to queue all packets to avoid the copy and avoid the need to
> > replay at the net level.
> 
> If queuing works fine for the devices, it seems to be a good
> idea.  I think the ordering issue doesn't happen still.
> 
> Yoshi

If you replay and both net and pio level, it becomes complex.
Maybe it's ok, but certainly harder to reason about.

> >
> >> >
> >> > --
> >> > MST
> >> >
> >> >
> >
> >
Yoshiaki Tamura - Jan. 6, 2011, 8:47 a.m.
2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
> On Tue, Jan 04, 2011 at 10:45:13PM +0900, Yoshiaki Tamura wrote:
>> 2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
>> > On Tue, Jan 04, 2011 at 09:20:53PM +0900, Yoshiaki Tamura wrote:
>> >> 2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
>> >> > On Tue, Jan 04, 2011 at 08:02:54PM +0900, Yoshiaki Tamura wrote:
>> >> >> 2010/11/29 Stefan Hajnoczi <stefanha@gmail.com>:
>> >> >> > On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
>> >> >> > <tamura.yoshiaki@lab.ntt.co.jp> wrote:
>> >> >> >> event-tap controls when to start FT transaction, and provides proxy
>> >> >> >> functions to called from net/block devices.  While FT transaction, it
>> >> >> >> queues up net/block requests, and flush them when the transaction gets
>> >> >> >> completed.
>> >> >> >>
>> >> >> >> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
>> >> >> >> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
>> >> >> >> ---
>> >> >> >>  Makefile.target |    1 +
>> >> >> >>  block.h         |    9 +
>> >> >> >>  event-tap.c     |  794 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> >> >> >>  event-tap.h     |   34 +++
>> >> >> >>  net.h           |    4 +
>> >> >> >>  net/queue.c     |    1 +
>> >> >> >>  6 files changed, 843 insertions(+), 0 deletions(-)
>> >> >> >>  create mode 100644 event-tap.c
>> >> >> >>  create mode 100644 event-tap.h
>> >> >> >
>> >> >> > event_tap_state is checked at the beginning of several functions.  If
>> >> >> > there is an unexpected state the function silently returns.  Should
>> >> >> > these checks really be assert() so there is an abort and backtrace if
>> >> >> > the program ever reaches this state?
>> >> >> >
>> >> >> >> +typedef struct EventTapBlkReq {
>> >> >> >> +    char *device_name;
>> >> >> >> +    int num_reqs;
>> >> >> >> +    int num_cbs;
>> >> >> >> +    bool is_multiwrite;
>> >> >> >
>> >> >> > Is multiwrite logging necessary?  If event tap is called from within
>> >> >> > the block layer then multiwrite is turned into one or more
>> >> >> > bdrv_aio_writev() calls.
>> >> >> >
>> >> >> >> +static void event_tap_replay(void *opaque, int running, int reason)
>> >> >> >> +{
>> >> >> >> +    EventTapLog *log, *next;
>> >> >> >> +
>> >> >> >> +    if (!running) {
>> >> >> >> +        return;
>> >> >> >> +    }
>> >> >> >> +
>> >> >> >> +    if (event_tap_state != EVENT_TAP_LOAD) {
>> >> >> >> +        return;
>> >> >> >> +    }
>> >> >> >> +
>> >> >> >> +    event_tap_state = EVENT_TAP_REPLAY;
>> >> >> >> +
>> >> >> >> +    QTAILQ_FOREACH(log, &event_list, node) {
>> >> >> >> +        EventTapBlkReq *blk_req;
>> >> >> >> +
>> >> >> >> +        /* event resume */
>> >> >> >> +        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
>> >> >> >> +        case EVENT_TAP_NET:
>> >> >> >> +            event_tap_net_flush(&log->net_req);
>> >> >> >> +            break;
>> >> >> >> +        case EVENT_TAP_BLK:
>> >> >> >> +            blk_req = &log->blk_req;
>> >> >> >> +            if ((log->mode & EVENT_TAP_TYPE_MASK) == EVENT_TAP_IOPORT) {
>> >> >> >> +                switch (log->ioport.index) {
>> >> >> >> +                case 0:
>> >> >> >> +                    cpu_outb(log->ioport.address, log->ioport.data);
>> >> >> >> +                    break;
>> >> >> >> +                case 1:
>> >> >> >> +                    cpu_outw(log->ioport.address, log->ioport.data);
>> >> >> >> +                    break;
>> >> >> >> +                case 2:
>> >> >> >> +                    cpu_outl(log->ioport.address, log->ioport.data);
>> >> >> >> +                    break;
>> >> >> >> +                }
>> >> >> >> +            } else {
>> >> >> >> +                /* EVENT_TAP_MMIO */
>> >> >> >> +                cpu_physical_memory_rw(log->mmio.address,
>> >> >> >> +                                       log->mmio.buf,
>> >> >> >> +                                       log->mmio.len, 1);
>> >> >> >> +            }
>> >> >> >> +            break;
>> >> >> >
>> >> >> > Why are net tx packets replayed at the net level but blk requests are
>> >> >> > replayed at the pio/mmio level?
>> >> >> >
>> >> >> > I expected everything to replay either as pio/mmio or as net/block.
>> >> >>
>> >> >> Stefan,
>> >> >>
>> >> >> After doing some heavy load tests, I realized that we have to
>> >> >> take a hybrid approach to replay for now.  This is because when a
>> >> >> device moves to the next state (e.g. virtio decreases inuse) is
>> >> >> different between net and block.  For example, virtio-net
>> >> >> decreases inuse upon returning from the net layer,
>> >> >> but virtio-blk
>> >> >> does that inside of the callback.
>> >> >
>> >> > For TX, virtio-net calls virtqueue_push from virtio_net_tx_complete.
>> >> > For RX, virtio-net calls virtqueue_flush from virtio_net_receive.
>> >> > Both are invoked from a callback.
>> >> >
>> >> >> If we only use pio/mmio
>> >> >> replay, even though event-tap tries to replay net requests, some
>> >> >> get lost because the state has proceeded already.
>> >> >
>> >> > It seems that all you need to do to avoid this is to
>> >> > delay the callback?
>> >>
>> >> Yeah, if it's possible.  But if you take a look at virtio-net,
>> >> you'll see that virtio_push is called immediately after calling
>> >> qemu_sendv_packet
>> >> while virtio-blk does that in the callback.
>> >
>> > This is only if the packet was sent immediately.
>> > I was referring to the case where the packet is queued.
>>
>> I see.  I usually don't see packets get queued in the net layer.
>> What would be the effect to devices?  Restraint sending packets?
>
> Yes.
>
>> >
>> >> >
>> >> >> This doesn't
>> >> >> happen with block, because the state is still old enough to
>> >> >> replay.  Note that using hybrid approach won't cause duplicated
>> >> >> requests on the secondary.
>> >> >
>> >> > An assumption devices make is that a buffer is unused once
>> >> > completion callback was invoked. Does this violate that assumption?
>> >>
>> >> No, it shouldn't.  In case of net with net layer replay, we copy
>> >> the content of the requests, and in case of block, because we
>> >> haven't called the callback yet, the requests remains fresh.
>> >>
>> >> Yoshi
>> >>
>> >
>> > Yes, as long as you copy it should be fine.  Maybe it's a good idea for
>> > event-tap to queue all packets to avoid the copy and avoid the need to
>> > replay at the net level.
>>
>> If queuing works fine for the devices, it seems to be a good
>> idea.  I think the ordering issue doesn't happen still.
>>
>> Yoshi
>
> If you replay and both net and pio level, it becomes complex.
> Maybe it's ok, but certainly harder to reason about.

Michael,

It seems queuing at event-tap like in net layer works for devices
that use qemu_send_packet_async as you suggested.  But for those
that use qemu_send_packet, we still need to copy the contents
just like net layer queuing does, and net level replay should be
kept to handle it.
Thanks,

Yoshi

>
>> >
>> >> >
>> >> > --
>> >> > MST
>> >> >
>> >> >
>> >
>> >
>
>
Michael S. Tsirkin - Jan. 6, 2011, 9:36 a.m.
On Thu, Jan 06, 2011 at 05:47:27PM +0900, Yoshiaki Tamura wrote:
> 2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
> > On Tue, Jan 04, 2011 at 10:45:13PM +0900, Yoshiaki Tamura wrote:
> >> 2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
> >> > On Tue, Jan 04, 2011 at 09:20:53PM +0900, Yoshiaki Tamura wrote:
> >> >> 2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
> >> >> > On Tue, Jan 04, 2011 at 08:02:54PM +0900, Yoshiaki Tamura wrote:
> >> >> >> 2010/11/29 Stefan Hajnoczi <stefanha@gmail.com>:
> >> >> >> > On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
> >> >> >> > <tamura.yoshiaki@lab.ntt.co.jp> wrote:
> >> >> >> >> event-tap controls when to start FT transaction, and provides proxy
> >> >> >> >> functions to called from net/block devices.  While FT transaction, it
> >> >> >> >> queues up net/block requests, and flush them when the transaction gets
> >> >> >> >> completed.
> >> >> >> >>
> >> >> >> >> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
> >> >> >> >> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
> >> >> >> >> ---
> >> >> >> >>  Makefile.target |    1 +
> >> >> >> >>  block.h         |    9 +
> >> >> >> >>  event-tap.c     |  794 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >> >> >> >>  event-tap.h     |   34 +++
> >> >> >> >>  net.h           |    4 +
> >> >> >> >>  net/queue.c     |    1 +
> >> >> >> >>  6 files changed, 843 insertions(+), 0 deletions(-)
> >> >> >> >>  create mode 100644 event-tap.c
> >> >> >> >>  create mode 100644 event-tap.h
> >> >> >> >
> >> >> >> > event_tap_state is checked at the beginning of several functions.  If
> >> >> >> > there is an unexpected state the function silently returns.  Should
> >> >> >> > these checks really be assert() so there is an abort and backtrace if
> >> >> >> > the program ever reaches this state?
> >> >> >> >
> >> >> >> >> +typedef struct EventTapBlkReq {
> >> >> >> >> +    char *device_name;
> >> >> >> >> +    int num_reqs;
> >> >> >> >> +    int num_cbs;
> >> >> >> >> +    bool is_multiwrite;
> >> >> >> >
> >> >> >> > Is multiwrite logging necessary?  If event tap is called from within
> >> >> >> > the block layer then multiwrite is turned into one or more
> >> >> >> > bdrv_aio_writev() calls.
> >> >> >> >
> >> >> >> >> +static void event_tap_replay(void *opaque, int running, int reason)
> >> >> >> >> +{
> >> >> >> >> +    EventTapLog *log, *next;
> >> >> >> >> +
> >> >> >> >> +    if (!running) {
> >> >> >> >> +        return;
> >> >> >> >> +    }
> >> >> >> >> +
> >> >> >> >> +    if (event_tap_state != EVENT_TAP_LOAD) {
> >> >> >> >> +        return;
> >> >> >> >> +    }
> >> >> >> >> +
> >> >> >> >> +    event_tap_state = EVENT_TAP_REPLAY;
> >> >> >> >> +
> >> >> >> >> +    QTAILQ_FOREACH(log, &event_list, node) {
> >> >> >> >> +        EventTapBlkReq *blk_req;
> >> >> >> >> +
> >> >> >> >> +        /* event resume */
> >> >> >> >> +        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
> >> >> >> >> +        case EVENT_TAP_NET:
> >> >> >> >> +            event_tap_net_flush(&log->net_req);
> >> >> >> >> +            break;
> >> >> >> >> +        case EVENT_TAP_BLK:
> >> >> >> >> +            blk_req = &log->blk_req;
> >> >> >> >> +            if ((log->mode & EVENT_TAP_TYPE_MASK) == EVENT_TAP_IOPORT) {
> >> >> >> >> +                switch (log->ioport.index) {
> >> >> >> >> +                case 0:
> >> >> >> >> +                    cpu_outb(log->ioport.address, log->ioport.data);
> >> >> >> >> +                    break;
> >> >> >> >> +                case 1:
> >> >> >> >> +                    cpu_outw(log->ioport.address, log->ioport.data);
> >> >> >> >> +                    break;
> >> >> >> >> +                case 2:
> >> >> >> >> +                    cpu_outl(log->ioport.address, log->ioport.data);
> >> >> >> >> +                    break;
> >> >> >> >> +                }
> >> >> >> >> +            } else {
> >> >> >> >> +                /* EVENT_TAP_MMIO */
> >> >> >> >> +                cpu_physical_memory_rw(log->mmio.address,
> >> >> >> >> +                                       log->mmio.buf,
> >> >> >> >> +                                       log->mmio.len, 1);
> >> >> >> >> +            }
> >> >> >> >> +            break;
> >> >> >> >
> >> >> >> > Why are net tx packets replayed at the net level but blk requests are
> >> >> >> > replayed at the pio/mmio level?
> >> >> >> >
> >> >> >> > I expected everything to replay either as pio/mmio or as net/block.
> >> >> >>
> >> >> >> Stefan,
> >> >> >>
> >> >> >> After doing some heavy load tests, I realized that we have to
> >> >> >> take a hybrid approach to replay for now.  This is because when a
> >> >> >> device moves to the next state (e.g. virtio decreases inuse) is
> >> >> >> different between net and block.  For example, virtio-net
> >> >> >> decreases inuse upon returning from the net layer,
> >> >> >> but virtio-blk
> >> >> >> does that inside of the callback.
> >> >> >
> >> >> > For TX, virtio-net calls virtqueue_push from virtio_net_tx_complete.
> >> >> > For RX, virtio-net calls virtqueue_flush from virtio_net_receive.
> >> >> > Both are invoked from a callback.
> >> >> >
> >> >> >> If we only use pio/mmio
> >> >> >> replay, even though event-tap tries to replay net requests, some
> >> >> >> get lost because the state has proceeded already.
> >> >> >
> >> >> > It seems that all you need to do to avoid this is to
> >> >> > delay the callback?
> >> >>
> >> >> Yeah, if it's possible.  But if you take a look at virtio-net,
> >> >> you'll see that virtio_push is called immediately after calling
> >> >> qemu_sendv_packet
> >> >> while virtio-blk does that in the callback.
> >> >
> >> > This is only if the packet was sent immediately.
> >> > I was referring to the case where the packet is queued.
> >>
> >> I see.  I usually don't see packets get queued in the net layer.
> >> What would be the effect to devices?  Restraint sending packets?
> >
> > Yes.
> >
> >> >
> >> >> >
> >> >> >> This doesn't
> >> >> >> happen with block, because the state is still old enough to
> >> >> >> replay.  Note that using hybrid approach won't cause duplicated
> >> >> >> requests on the secondary.
> >> >> >
> >> >> > An assumption devices make is that a buffer is unused once
> >> >> > completion callback was invoked. Does this violate that assumption?
> >> >>
> >> >> No, it shouldn't.  In case of net with net layer replay, we copy
> >> >> the content of the requests, and in case of block, because we
> >> >> haven't called the callback yet, the requests remains fresh.
> >> >>
> >> >> Yoshi
> >> >>
> >> >
> >> > Yes, as long as you copy it should be fine.  Maybe it's a good idea for
> >> > event-tap to queue all packets to avoid the copy and avoid the need to
> >> > replay at the net level.
> >>
> >> If queuing works fine for the devices, it seems to be a good
> >> idea.  I think the ordering issue doesn't happen still.
> >>
> >> Yoshi
> >
> > If you replay and both net and pio level, it becomes complex.
> > Maybe it's ok, but certainly harder to reason about.
> 
> Michael,
> 
> It seems queuing at event-tap like in net layer works for devices
> that use qemu_send_packet_async as you suggested.  But for those
> that use qemu_send_packet, we still need to copy the contents
> just like net layer queuing does, and net level replay should be
> kept to handle it.
> Thanks,
> 
> Yoshi

Right. And I think it's fine. What I found confusing was
where both virtio (because avail idx is moved back) and
the net layer replay the packet.


> >
> >> >
> >> >> >
> >> >> > --
> >> >> > MST
> >> >> >
> >> >> >
> >> >
> >> >
> >
> >
Yoshiaki Tamura - Jan. 6, 2011, 9:41 a.m.
2011/1/6 Michael S. Tsirkin <mst@redhat.com>:
> On Thu, Jan 06, 2011 at 05:47:27PM +0900, Yoshiaki Tamura wrote:
>> 2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
>> > On Tue, Jan 04, 2011 at 10:45:13PM +0900, Yoshiaki Tamura wrote:
>> >> 2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
>> >> > On Tue, Jan 04, 2011 at 09:20:53PM +0900, Yoshiaki Tamura wrote:
>> >> >> 2011/1/4 Michael S. Tsirkin <mst@redhat.com>:
>> >> >> > On Tue, Jan 04, 2011 at 08:02:54PM +0900, Yoshiaki Tamura wrote:
>> >> >> >> 2010/11/29 Stefan Hajnoczi <stefanha@gmail.com>:
>> >> >> >> > On Thu, Nov 25, 2010 at 6:06 AM, Yoshiaki Tamura
>> >> >> >> > <tamura.yoshiaki@lab.ntt.co.jp> wrote:
>> >> >> >> >> event-tap controls when to start FT transaction, and provides proxy
>> >> >> >> >> functions to called from net/block devices.  While FT transaction, it
>> >> >> >> >> queues up net/block requests, and flush them when the transaction gets
>> >> >> >> >> completed.
>> >> >> >> >>
>> >> >> >> >> Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
>> >> >> >> >> Signed-off-by: OHMURA Kei <ohmura.kei@lab.ntt.co.jp>
>> >> >> >> >> ---
>> >> >> >> >>  Makefile.target |    1 +
>> >> >> >> >>  block.h         |    9 +
>> >> >> >> >>  event-tap.c     |  794 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>> >> >> >> >>  event-tap.h     |   34 +++
>> >> >> >> >>  net.h           |    4 +
>> >> >> >> >>  net/queue.c     |    1 +
>> >> >> >> >>  6 files changed, 843 insertions(+), 0 deletions(-)
>> >> >> >> >>  create mode 100644 event-tap.c
>> >> >> >> >>  create mode 100644 event-tap.h
>> >> >> >> >
>> >> >> >> > event_tap_state is checked at the beginning of several functions.  If
>> >> >> >> > there is an unexpected state the function silently returns.  Should
>> >> >> >> > these checks really be assert() so there is an abort and backtrace if
>> >> >> >> > the program ever reaches this state?
>> >> >> >> >
>> >> >> >> >> +typedef struct EventTapBlkReq {
>> >> >> >> >> +    char *device_name;
>> >> >> >> >> +    int num_reqs;
>> >> >> >> >> +    int num_cbs;
>> >> >> >> >> +    bool is_multiwrite;
>> >> >> >> >
>> >> >> >> > Is multiwrite logging necessary?  If event tap is called from within
>> >> >> >> > the block layer then multiwrite is turned into one or more
>> >> >> >> > bdrv_aio_writev() calls.
>> >> >> >> >
>> >> >> >> >> +static void event_tap_replay(void *opaque, int running, int reason)
>> >> >> >> >> +{
>> >> >> >> >> +    EventTapLog *log, *next;
>> >> >> >> >> +
>> >> >> >> >> +    if (!running) {
>> >> >> >> >> +        return;
>> >> >> >> >> +    }
>> >> >> >> >> +
>> >> >> >> >> +    if (event_tap_state != EVENT_TAP_LOAD) {
>> >> >> >> >> +        return;
>> >> >> >> >> +    }
>> >> >> >> >> +
>> >> >> >> >> +    event_tap_state = EVENT_TAP_REPLAY;
>> >> >> >> >> +
>> >> >> >> >> +    QTAILQ_FOREACH(log, &event_list, node) {
>> >> >> >> >> +        EventTapBlkReq *blk_req;
>> >> >> >> >> +
>> >> >> >> >> +        /* event resume */
>> >> >> >> >> +        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
>> >> >> >> >> +        case EVENT_TAP_NET:
>> >> >> >> >> +            event_tap_net_flush(&log->net_req);
>> >> >> >> >> +            break;
>> >> >> >> >> +        case EVENT_TAP_BLK:
>> >> >> >> >> +            blk_req = &log->blk_req;
>> >> >> >> >> +            if ((log->mode & EVENT_TAP_TYPE_MASK) == EVENT_TAP_IOPORT) {
>> >> >> >> >> +                switch (log->ioport.index) {
>> >> >> >> >> +                case 0:
>> >> >> >> >> +                    cpu_outb(log->ioport.address, log->ioport.data);
>> >> >> >> >> +                    break;
>> >> >> >> >> +                case 1:
>> >> >> >> >> +                    cpu_outw(log->ioport.address, log->ioport.data);
>> >> >> >> >> +                    break;
>> >> >> >> >> +                case 2:
>> >> >> >> >> +                    cpu_outl(log->ioport.address, log->ioport.data);
>> >> >> >> >> +                    break;
>> >> >> >> >> +                }
>> >> >> >> >> +            } else {
>> >> >> >> >> +                /* EVENT_TAP_MMIO */
>> >> >> >> >> +                cpu_physical_memory_rw(log->mmio.address,
>> >> >> >> >> +                                       log->mmio.buf,
>> >> >> >> >> +                                       log->mmio.len, 1);
>> >> >> >> >> +            }
>> >> >> >> >> +            break;
>> >> >> >> >
>> >> >> >> > Why are net tx packets replayed at the net level but blk requests are
>> >> >> >> > replayed at the pio/mmio level?
>> >> >> >> >
>> >> >> >> > I expected everything to replay either as pio/mmio or as net/block.
>> >> >> >>
>> >> >> >> Stefan,
>> >> >> >>
>> >> >> >> After doing some heavy load tests, I realized that we have to
>> >> >> >> take a hybrid approach to replay for now.  This is because when a
>> >> >> >> device moves to the next state (e.g. virtio decreases inuse) is
>> >> >> >> different between net and block.  For example, virtio-net
>> >> >> >> decreases inuse upon returning from the net layer,
>> >> >> >> but virtio-blk
>> >> >> >> does that inside of the callback.
>> >> >> >
>> >> >> > For TX, virtio-net calls virtqueue_push from virtio_net_tx_complete.
>> >> >> > For RX, virtio-net calls virtqueue_flush from virtio_net_receive.
>> >> >> > Both are invoked from a callback.
>> >> >> >
>> >> >> >> If we only use pio/mmio
>> >> >> >> replay, even though event-tap tries to replay net requests, some
>> >> >> >> get lost because the state has proceeded already.
>> >> >> >
>> >> >> > It seems that all you need to do to avoid this is to
>> >> >> > delay the callback?
>> >> >>
>> >> >> Yeah, if it's possible.  But if you take a look at virtio-net,
>> >> >> you'll see that virtio_push is called immediately after calling
>> >> >> qemu_sendv_packet
>> >> >> while virtio-blk does that in the callback.
>> >> >
>> >> > This is only if the packet was sent immediately.
>> >> > I was referring to the case where the packet is queued.
>> >>
>> >> I see.  I usually don't see packets get queued in the net layer.
>> >> What would be the effect to devices?  Restraint sending packets?
>> >
>> > Yes.
>> >
>> >> >
>> >> >> >
>> >> >> >> This doesn't
>> >> >> >> happen with block, because the state is still old enough to
>> >> >> >> replay.  Note that using hybrid approach won't cause duplicated
>> >> >> >> requests on the secondary.
>> >> >> >
>> >> >> > An assumption devices make is that a buffer is unused once
>> >> >> > completion callback was invoked. Does this violate that assumption?
>> >> >>
>> >> >> No, it shouldn't.  In case of net with net layer replay, we copy
>> >> >> the content of the requests, and in case of block, because we
>> >> >> haven't called the callback yet, the requests remains fresh.
>> >> >>
>> >> >> Yoshi
>> >> >>
>> >> >
>> >> > Yes, as long as you copy it should be fine.  Maybe it's a good idea for
>> >> > event-tap to queue all packets to avoid the copy and avoid the need to
>> >> > replay at the net level.
>> >>
>> >> If queuing works fine for the devices, it seems to be a good
>> >> idea.  I think the ordering issue doesn't happen still.
>> >>
>> >> Yoshi
>> >
>> > If you replay and both net and pio level, it becomes complex.
>> > Maybe it's ok, but certainly harder to reason about.
>>
>> Michael,
>>
>> It seems queuing at event-tap like in net layer works for devices
>> that use qemu_send_packet_async as you suggested.  But for those
>> that use qemu_send_packet, we still need to copy the contents
>> just like net layer queuing does, and net level replay should be
>> kept to handle it.
>> Thanks,
>>
>> Yoshi
>
> Right. And I think it's fine. What I found confusing was
> where both virtio (because avail idx is moved back) and
> the net layer replay the packet.

I agree, and that part is fixed.  There won't be double layer
replay for the same device.

Yoshi

>
>
>> >
>> >> >
>> >> >> >
>> >> >> > --
>> >> >> > MST
>> >> >> >
>> >> >> >
>> >> >
>> >> >
>> >
>> >
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

Patch

diff --git a/Makefile.target b/Makefile.target
index 2800f47..3922d79 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -197,6 +197,7 @@  obj-y += rwhandler.o
 obj-$(CONFIG_KVM) += kvm.o kvm-all.o
 obj-$(CONFIG_NO_KVM) += kvm-stub.o
 LIBS+=-lz
+obj-y += event-tap.o
 
 QEMU_CFLAGS += $(VNC_TLS_CFLAGS)
 QEMU_CFLAGS += $(VNC_SASL_CFLAGS)
diff --git a/block.h b/block.h
index 78ecfac..0f07617 100644
--- a/block.h
+++ b/block.h
@@ -116,6 +116,12 @@  BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
 BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
                                   QEMUIOVector *iov, int nb_sectors,
                                   BlockDriverCompletionFunc *cb, void *opaque);
+
+BlockDriverAIOCB *bdrv_aio_writev_proxy(BlockDriverState *bs,
+                                        int64_t sector_num, QEMUIOVector *iov,
+                                        int nb_sectors,
+                                        BlockDriverCompletionFunc *cb,
+                                        void *opaque);
 BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
 				 BlockDriverCompletionFunc *cb, void *opaque);
 void bdrv_aio_cancel(BlockDriverAIOCB *acb);
@@ -134,6 +140,9 @@  typedef struct BlockRequest {
 
 int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs,
     int num_reqs);
+int bdrv_aio_multiwrite_proxy(BlockDriverState *bs, BlockRequest *reqs,
+                              int num_reqs);
+
 
 /* sg packet commands */
 int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf);
diff --git a/event-tap.c b/event-tap.c
new file mode 100644
index 0000000..cf7a38a
--- /dev/null
+++ b/event-tap.c
@@ -0,0 +1,794 @@ 
+/*
+ * Event Tap functions for QEMU
+ *
+ * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation. 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu-common.h"
+#include "block.h"
+#include "block_int.h"
+#include "ioport.h"
+#include "osdep.h"
+#include "sysemu.h"
+#include "hw/hw.h"
+#include "net.h"
+#include "event-tap.h"
+
+// #define DEBUG_EVENT_TAP
+
+#ifdef DEBUG_EVENT_TAP
+#define DPRINTF(fmt, ...) \
+    do { printf("event-tap: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+static enum EVENT_TAP_STATE event_tap_state = EVENT_TAP_OFF;
+static BlockDriverAIOCB dummy_acb; /* we may need a pool for dummies */
+
+typedef struct EventTapIOport {
+    uint32_t address;
+    uint32_t data;    
+    int      index;
+} EventTapIOport;
+
+#define MMIO_BUF_SIZE 8
+
+typedef struct EventTapMMIO {
+    uint64_t address;
+    uint8_t  buf[MMIO_BUF_SIZE];
+    int      len;
+} EventTapMMIO;
+
+typedef struct EventTapNetReq {
+    char *device_name;
+    int iovcnt;
+    struct iovec *iov;
+    int vlan_id;
+    bool vlan_needed;
+    bool async;
+} EventTapNetReq;
+
+#define MAX_BLOCK_REQUEST 32
+
+typedef struct EventTapBlkReq {
+    char *device_name;
+    int num_reqs;
+    int num_cbs;
+    bool is_multiwrite;
+    BlockRequest reqs[MAX_BLOCK_REQUEST];
+    BlockDriverCompletionFunc *cb[MAX_BLOCK_REQUEST];
+    void *opaque[MAX_BLOCK_REQUEST];
+} EventTapBlkReq;
+
+#define EVENT_TAP_IOPORT (1 << 0)
+#define EVENT_TAP_MMIO   (1 << 1)
+#define EVENT_TAP_NET    (1 << 2)
+#define EVENT_TAP_BLK    (1 << 3)
+
+#define EVENT_TAP_TYPE_MASK (EVENT_TAP_NET - 1)
+
+typedef struct EventTapLog {
+    int mode;
+    union {
+        EventTapIOport ioport ;    
+        EventTapMMIO mmio;
+    };
+    union {
+        EventTapNetReq net_req;
+        EventTapBlkReq blk_req;
+    };
+    QTAILQ_ENTRY(EventTapLog) node;
+} EventTapLog;
+
+static EventTapLog *last_event_tap;
+
+static QTAILQ_HEAD(, EventTapLog) event_list;
+static QTAILQ_HEAD(, EventTapLog) event_pool;
+
+static int (*event_tap_cb)(void);
+static QEMUBH *event_tap_bh;
+static VMChangeStateEntry *vmstate;
+
+static void event_tap_bh_cb(void *p)
+{
+    event_tap_cb();
+    qemu_bh_delete(event_tap_bh);
+    event_tap_bh = NULL;
+}
+
+static int event_tap_schedule_bh(void)
+{
+    /* if bh is already set, we ignore it for now */
+    if (event_tap_bh) {
+        DPRINTF("event_tap_bh is already scheduled\n");
+        return 0;
+    }
+
+    event_tap_bh = qemu_bh_new(event_tap_bh_cb, NULL);
+    qemu_bh_schedule(event_tap_bh);
+
+    return 0;
+}
+
+static int event_tap_alloc_net_req(EventTapNetReq *net_req, 
+                                   VLANClientState *vc,
+                                   const struct iovec *iov, int iovcnt,
+                                   NetPacketSent *sent_cb, bool async)
+{
+    int i, ret = 0;
+
+    net_req->iovcnt = iovcnt;
+    net_req->async = async;
+    net_req->device_name = qemu_strdup(vc->name);
+
+    if (vc->vlan) {
+        net_req->vlan_needed = 1;
+        net_req->vlan_id = vc->vlan->id;
+    } else {
+        net_req->vlan_needed = 0;
+    }
+
+    net_req->iov = qemu_malloc(sizeof(struct iovec) * iovcnt);
+
+    for (i = 0; i < iovcnt; i++) {
+        net_req->iov[i].iov_base = qemu_malloc(iov[i].iov_len);
+        memcpy(net_req->iov[i].iov_base, iov[i].iov_base, iov[i].iov_len);
+        net_req->iov[i].iov_len = iov[i].iov_len;
+        ret += iov[i].iov_len;
+    }
+
+    return ret;
+}
+
+static void event_tap_alloc_blk_req(EventTapBlkReq *blk_req,
+                                    BlockDriverState *bs, BlockRequest *reqs,
+                                    int num_reqs, BlockDriverCompletionFunc *cb,
+                                    void *opaque, bool is_multiwrite)
+{
+    int i;
+
+    blk_req->num_reqs = num_reqs;
+    blk_req->num_cbs = num_reqs;
+    blk_req->device_name = qemu_strdup(bs->device_name);
+    blk_req->is_multiwrite = is_multiwrite;
+
+    for (i = 0; i < num_reqs; i++) {
+        blk_req->reqs[i].sector = reqs[i].sector;
+        blk_req->reqs[i].nb_sectors = reqs[i].nb_sectors;
+        blk_req->reqs[i].qiov = reqs[i].qiov;
+        blk_req->reqs[i].cb = cb;
+        blk_req->reqs[i].opaque = opaque;
+        blk_req->cb[i] = reqs[i].cb;
+        blk_req->opaque[i] = reqs[i].opaque;
+    }    
+}                                   
+
+static void *event_tap_alloc_log(void)
+{
+    EventTapLog *log;
+
+    if (QTAILQ_EMPTY(&event_pool)) {
+        log = qemu_mallocz(sizeof(EventTapLog));
+    } else {
+        log = QTAILQ_FIRST(&event_pool);
+        QTAILQ_REMOVE(&event_pool, log, node);
+    }
+
+    return log;
+}
+
+static void event_tap_free_log(EventTapLog *log)
+{
+    int i, mode = log->mode & ~EVENT_TAP_TYPE_MASK;
+
+    if (mode == EVENT_TAP_NET) {
+        EventTapNetReq *net_req = &log->net_req;
+        for (i = 0; i < net_req->iovcnt; i++) {
+            qemu_free(net_req->iov[i].iov_base);
+        }
+        qemu_free(net_req->iov);
+        qemu_free(net_req->device_name);
+    } else if (mode == EVENT_TAP_BLK) {
+        EventTapBlkReq *blk_req = &log->blk_req;
+
+        if (event_tap_state >= EVENT_TAP_LOAD) {
+            for (i = 0; i < blk_req->num_reqs; i++) {
+                qemu_free(blk_req->reqs[i].qiov->iov);
+                qemu_free(blk_req->reqs[i].qiov);
+            }
+        }
+        qemu_free(blk_req->device_name);
+    }
+
+    log->mode = 0;
+
+    /* return the log to event_pool */
+    QTAILQ_INSERT_HEAD(&event_pool, log, node);
+}
+
+static void event_tap_free_pool(void)
+{
+    EventTapLog *log, *next;
+
+    QTAILQ_FOREACH_SAFE(log, &event_pool, node, next) {
+        QTAILQ_REMOVE(&event_pool, log, node);
+        qemu_free(log);
+    }
+}
+
+/* This func is called by qemu_net_queue_flush() when a packet is appended */
+static void event_tap_net_cb(VLANClientState *vc, ssize_t len)
+{
+    DPRINTF("%s: %zd bytes packet was sended\n", vc->name, len);
+}
+
+static void event_tap_blk_cb(void *opaque, int ret)
+{
+    EventTapLog *log = container_of(opaque, EventTapLog, blk_req);
+    EventTapBlkReq *blk_req = opaque;
+    int i;
+
+    blk_req->num_cbs--;
+    if (blk_req->num_cbs == 0) {
+        /* all outstanding requests are flushed */
+        for (i = 0; i < blk_req->num_reqs; i++) {
+            blk_req->cb[i](blk_req->opaque[i], ret);
+        }
+        event_tap_free_log(log);
+    }
+}
+
+static int net_event_tap(VLANClientState *vc, const struct iovec *iov,
+                         int iovcnt, NetPacketSent *sent_cb, bool async)
+{
+    int ret = 0, empty;
+    EventTapLog *log = last_event_tap;
+
+    if (!log) {
+        DPRINTF("no last_event_tap\n");
+        log = event_tap_alloc_log();
+    }
+
+    if (log->mode & ~EVENT_TAP_TYPE_MASK) {
+        DPRINTF("last_event_tap already used %d\n",
+                log->mode & ~EVENT_TAP_TYPE_MASK);
+        return ret;
+    }
+
+    log->mode |= EVENT_TAP_NET;
+    ret = event_tap_alloc_net_req(&log->net_req, vc, iov, iovcnt, sent_cb,
+                                  async);
+
+    empty = QTAILQ_EMPTY(&event_list); 
+    QTAILQ_INSERT_TAIL(&event_list, log, node);
+    last_event_tap = NULL;
+
+    if (empty) {
+        event_tap_schedule_bh();
+    }
+
+    return ret;
+}
+
+static void bdrv_event_tap(BlockDriverState *bs, BlockRequest *reqs,
+                           int num_reqs, bool is_multiwrite)
+{
+    EventTapLog *log = last_event_tap;
+    int empty;
+
+    if (!log) {
+        DPRINTF("no last_event_tap\n");
+        log = event_tap_alloc_log();
+    }
+    if (log->mode & ~EVENT_TAP_TYPE_MASK) {
+        DPRINTF("last_event_tap already used\n");
+        return;
+    }
+
+    log->mode |= EVENT_TAP_BLK;
+    event_tap_alloc_blk_req(&log->blk_req, bs, reqs, num_reqs, event_tap_blk_cb,
+                            &log->blk_req, is_multiwrite);
+
+    empty = QTAILQ_EMPTY(&event_list); 
+    QTAILQ_INSERT_TAIL(&event_list, log, node);
+    last_event_tap = NULL;
+
+    if (empty) {
+        event_tap_schedule_bh();
+    }
+}
+
+BlockDriverAIOCB *bdrv_aio_writev_proxy(BlockDriverState *bs,
+                                        int64_t sector_num,
+                                        QEMUIOVector *iov,
+                                        int nb_sectors,
+                                        BlockDriverCompletionFunc *cb,
+                                        void *opaque)
+{
+    if (event_tap_state == EVENT_TAP_ON) {
+        BlockRequest req;
+
+        req.sector = sector_num;
+        req.nb_sectors = nb_sectors;
+        req.qiov = iov;
+        req.cb = cb;
+        req.opaque = opaque;
+        bdrv_event_tap(bs, &req, 1, 0);
+
+        /* return a dummy_acb pointer to prevent from failing */
+        return &dummy_acb;
+    }
+
+    return bdrv_aio_writev(bs, sector_num, iov, nb_sectors, cb, opaque);
+}
+
+int bdrv_aio_multiwrite_proxy(BlockDriverState *bs, BlockRequest *reqs,
+                              int num_reqs)
+{
+    if (event_tap_state == EVENT_TAP_ON) {
+        bdrv_event_tap(bs, reqs, num_reqs, 1);
+        return 0;
+    }
+
+    return bdrv_aio_multiwrite(bs, reqs, num_reqs);
+}
+
+void qemu_send_packet_proxy(VLANClientState *vc, const uint8_t *buf, int size)
+{
+    if (event_tap_state == EVENT_TAP_ON) {
+        struct iovec iov;
+        iov.iov_base = (uint8_t*)buf;
+        iov.iov_len = size;
+
+        net_event_tap(vc, &iov, 1, NULL, 0);
+        return;
+    }
+
+    return qemu_send_packet(vc, buf, size);
+}
+ssize_t qemu_sendv_packet_async_proxy(VLANClientState *vc,
+                                      const struct iovec *iov,
+                                      int iovcnt, NetPacketSent *sent_cb)
+{
+    if (event_tap_state == EVENT_TAP_ON) {
+        return net_event_tap(vc, iov, iovcnt, sent_cb, 1);
+    }
+
+    return qemu_sendv_packet_async(vc, iov, iovcnt, sent_cb);
+}
+
+int event_tap_register(int (*cb)(void))
+{
+    if (cb == NULL || event_tap_state != EVENT_TAP_OFF)
+        return -1;
+    if (event_tap_cb == NULL)
+        event_tap_cb = cb;
+
+    event_tap_state = EVENT_TAP_ON;
+
+    return 0;
+}
+
+int event_tap_unregister(void)
+{
+    if (event_tap_state == EVENT_TAP_OFF)
+        return -1;
+
+    event_tap_state = EVENT_TAP_OFF;
+    event_tap_cb = NULL;
+
+    event_tap_flush();
+    event_tap_free_pool();
+
+    return 0;
+}
+
+void event_tap_suspend(void)
+{
+    if (event_tap_state == EVENT_TAP_ON) {
+        event_tap_state = EVENT_TAP_SUSPEND;
+    }
+}
+
+void event_tap_resume(void)
+{
+    if (event_tap_state == EVENT_TAP_SUSPEND) {
+        event_tap_state = EVENT_TAP_ON;
+    }
+}
+
+int event_tap_get_state(void)
+{
+    return event_tap_state;
+}
+
+void event_tap_ioport(int index, uint32_t address, uint32_t data)
+{
+    if (event_tap_state != EVENT_TAP_ON) {
+        return;
+    }
+
+    if (!last_event_tap) {
+        last_event_tap = event_tap_alloc_log();
+    }
+
+    last_event_tap->mode = EVENT_TAP_IOPORT;
+    last_event_tap->ioport.index = index;
+    last_event_tap->ioport.address = address;
+    last_event_tap->ioport.data = data;
+}
+
+void event_tap_mmio(uint64_t address, uint8_t *buf, int len)
+{
+    if (event_tap_state != EVENT_TAP_ON || len > MMIO_BUF_SIZE) {
+        return;
+    }
+
+    if (!last_event_tap) {
+        last_event_tap = event_tap_alloc_log();
+    }
+
+    last_event_tap->mode = EVENT_TAP_MMIO;
+    last_event_tap->mmio.address = address;
+    last_event_tap->mmio.len = len;
+    memcpy(last_event_tap->mmio.buf, buf, len);
+}
+
+static void event_tap_net_flush(EventTapNetReq *net_req)
+{
+    VLANClientState *vc;
+    ssize_t len;
+
+    if (net_req->vlan_needed) {
+        vc = qemu_find_vlan_client_by_name(NULL, net_req->vlan_id,
+                                           net_req->device_name);
+    } else {
+        vc = qemu_find_netdev(net_req->device_name);
+    }
+
+    if (net_req->async) {
+        len = qemu_sendv_packet_async(vc, net_req->iov, net_req->iovcnt,
+                                      event_tap_net_cb);
+        if (len == 0) {
+            DPRINTF("This packet is appended\n");
+        }
+    } else {
+        qemu_send_packet(vc, net_req->iov[0].iov_base,
+                         net_req->iov[0].iov_len);
+    }
+}
+
+static void event_tap_blk_flush(EventTapBlkReq *blk_req)
+{
+    BlockDriverState *bs;
+
+    bs = bdrv_find(blk_req->device_name);
+
+    if (blk_req->is_multiwrite) {
+        bdrv_aio_multiwrite(bs, blk_req->reqs, blk_req->num_reqs);
+    } else {
+        bdrv_aio_writev(bs, blk_req->reqs[0].sector, blk_req->reqs[0].qiov,
+                        blk_req->reqs[0].nb_sectors, blk_req->cb[0],
+                        blk_req->opaque[0]);
+    }
+}
+
+/* returns 1 if the queue gets emtpy */
+int event_tap_flush_one(void)
+{
+    EventTapLog *log;
+
+    if (QTAILQ_EMPTY(&event_list)) {
+        return 1;
+    }
+
+    log = QTAILQ_FIRST(&event_list);
+    switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
+    case EVENT_TAP_NET:
+        event_tap_net_flush(&log->net_req);
+        QTAILQ_REMOVE(&event_list, log, node);
+        event_tap_free_log(log);
+        break;
+    case EVENT_TAP_BLK:
+        event_tap_blk_flush(&log->blk_req);
+        QTAILQ_REMOVE(&event_list, log, node);
+        break;
+    default:
+        fprintf(stderr, "Unknown state %d\n", log->mode);
+        return -1;
+    }
+
+    return QTAILQ_EMPTY(&event_list);
+}
+
+void event_tap_flush(void)
+{
+    int ret;
+    do {
+        ret = event_tap_flush_one();
+    } while (ret == 0);
+}
+
+static void event_tap_replay(void *opaque, int running, int reason)
+{
+    EventTapLog *log, *next;
+
+    if (!running) {
+        return;
+    }
+
+    if (event_tap_state != EVENT_TAP_LOAD) {
+        return;
+    }
+
+    event_tap_state = EVENT_TAP_REPLAY;
+
+    QTAILQ_FOREACH(log, &event_list, node) {
+        EventTapBlkReq *blk_req;
+
+        /* event resume */
+        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
+        case EVENT_TAP_NET:
+            event_tap_net_flush(&log->net_req);
+            break;
+        case EVENT_TAP_BLK:
+            blk_req = &log->blk_req;
+            if ((log->mode & EVENT_TAP_TYPE_MASK) == EVENT_TAP_IOPORT) {
+                switch (log->ioport.index) {
+                case 0:
+                    cpu_outb(log->ioport.address, log->ioport.data);
+                    break;
+                case 1:
+                    cpu_outw(log->ioport.address, log->ioport.data);
+                    break;
+                case 2:
+                    cpu_outl(log->ioport.address, log->ioport.data);
+                    break;
+                }
+            } else {
+                /* EVENT_TAP_MMIO */
+                cpu_physical_memory_rw(log->mmio.address,
+                                       log->mmio.buf,
+                                       log->mmio.len, 1);
+            }
+            break;
+        case 0:
+            DPRINTF("No event\n");
+            break;
+        default:
+            fprintf(stderr, "Unknown state %d\n", log->mode);
+            return;
+        }
+    }
+
+    /* remove event logs from queue */
+    QTAILQ_FOREACH_SAFE(log, &event_list, node, next) {
+        QTAILQ_REMOVE(&event_list, log, node);
+        event_tap_free_log(log);
+    }
+
+    event_tap_state = EVENT_TAP_OFF;
+    qemu_del_vm_change_state_handler(vmstate);
+}
+
+static inline void event_tap_ioport_save(QEMUFile *f, EventTapIOport *ioport)
+{
+    qemu_put_be32(f, ioport->index);
+    qemu_put_be32(f, ioport->address);
+    qemu_put_byte(f, ioport->data);
+}
+
+static inline void event_tap_ioport_load(QEMUFile *f,
+                                         EventTapIOport *ioport)
+{
+    ioport->index = qemu_get_be32(f);
+    ioport->address = qemu_get_be32(f);
+    ioport->data = qemu_get_byte(f);
+}
+
+static inline void event_tap_mmio_save(QEMUFile *f, EventTapMMIO *mmio)
+{
+    qemu_put_be64(f, mmio->address);
+    qemu_put_byte(f, mmio->len);
+    qemu_put_buffer(f, mmio->buf, mmio->len);
+}
+
+static inline void event_tap_mmio_load(QEMUFile *f, EventTapMMIO *mmio)
+{
+    mmio->address = qemu_get_be64(f);
+    mmio->len = qemu_get_byte(f);
+    qemu_get_buffer(f, mmio->buf, mmio->len);
+}
+
+static void event_tap_net_save(QEMUFile *f, EventTapNetReq *net_req)
+{
+    int i, len;
+
+    len = strlen(net_req->device_name);
+    qemu_put_byte(f, len);
+    qemu_put_buffer(f, (uint8_t *)net_req->device_name, len);
+    qemu_put_byte(f, net_req->vlan_id);
+    qemu_put_byte(f, net_req->vlan_needed);
+    qemu_put_byte(f, net_req->iovcnt);
+
+    for (i = 0; i < net_req->iovcnt; i++) {
+        qemu_put_be64(f, net_req->iov[i].iov_len);
+        qemu_put_buffer(f, (uint8_t *)net_req->iov[i].iov_base,
+                        net_req->iov[i].iov_len);
+    }
+}
+
+static void event_tap_net_load(QEMUFile *f, EventTapNetReq *net_req)
+{
+    int i, len;
+
+    len = qemu_get_byte(f);
+    net_req->device_name = qemu_malloc(len + 1);
+    qemu_get_buffer(f, (uint8_t *)net_req->device_name, len);
+    net_req->device_name[len] = '\0';
+    net_req->vlan_id = qemu_get_byte(f);
+    net_req->vlan_needed = qemu_get_byte(f);
+    net_req->iovcnt = qemu_get_byte(f);
+    net_req->iov = qemu_malloc(sizeof(struct iovec) * net_req->iovcnt);
+
+    for (i = 0; i < net_req->iovcnt; i++) {
+        net_req->iov[i].iov_len = qemu_get_be64(f);
+        net_req->iov[i].iov_base = qemu_malloc(net_req->iov[i].iov_len);
+        qemu_get_buffer(f, (uint8_t *)net_req->iov[i].iov_base,
+                        net_req->iov[i].iov_len);
+    }
+}
+
+static void event_tap_blk_save(QEMUFile *f, EventTapBlkReq *blk_req)
+{
+    BlockRequest *req;
+    ram_addr_t page_addr;
+    int i, j, len;
+
+    len = strlen(blk_req->device_name);
+    qemu_put_byte(f, len);
+    qemu_put_buffer(f, (uint8_t *)blk_req->device_name, len);
+    qemu_put_byte(f, blk_req->num_reqs);
+
+    for (i = 0; i < blk_req->num_reqs; i++) {
+        req = &blk_req->reqs[i];
+        qemu_put_be64(f, req->sector);
+        qemu_put_be32(f, req->nb_sectors);
+        qemu_put_byte(f, req->qiov->niov);
+        for (j = 0; j < req->qiov->niov; j++) {
+            page_addr =
+                qemu_ram_addr_from_host_nofail(req->qiov->iov[j].iov_base);
+            qemu_put_be64(f, page_addr);
+            qemu_put_be64(f, req->qiov->iov[j].iov_len);
+        }
+    }
+}
+
+static void event_tap_blk_load(QEMUFile *f, EventTapBlkReq *blk_req)
+{
+    BlockRequest *req;
+    ram_addr_t page_addr;
+    int i, j, len;
+
+    len = qemu_get_byte(f);
+    blk_req->device_name = qemu_malloc(len + 1);
+    qemu_get_buffer(f, (uint8_t *)blk_req->device_name, len);
+    blk_req->device_name[len] = '\0';
+    blk_req->num_reqs = qemu_get_byte(f);
+
+    for (i = 0; i < blk_req->num_reqs; i++) {
+        req = &blk_req->reqs[i];
+        req->sector = qemu_get_be64(f);
+        req->nb_sectors = qemu_get_be32(f);
+        req->qiov = qemu_malloc(sizeof(QEMUIOVector));
+        req->qiov->niov = qemu_get_byte(f);
+        req->qiov->iov = qemu_malloc(sizeof(struct iovec) * req->qiov->niov);
+        for (j = 0; j < req->qiov->niov; j++) {
+            page_addr = qemu_get_be64(f);
+            req->qiov->iov[j].iov_base = qemu_get_ram_ptr(page_addr);
+            req->qiov->iov[j].iov_len = qemu_get_be64(f);
+        }
+    }
+}
+
+static void event_tap_save(QEMUFile *f, void *opaque)
+{
+    EventTapLog *log;
+
+    QTAILQ_FOREACH(log, &event_list, node) {
+        qemu_put_byte(f, log->mode);
+        DPRINTF("log->mode=%d\n", log->mode);
+        switch (log->mode & EVENT_TAP_TYPE_MASK) {
+        case EVENT_TAP_IOPORT:
+            event_tap_ioport_save(f, &log->ioport);
+            break;
+        case EVENT_TAP_MMIO:
+            event_tap_mmio_save(f, &log->mmio);
+            break;
+        case 0:
+            DPRINTF("No event\n");
+            break;
+        default:
+            fprintf(stderr, "Unknown state %d\n", log->mode);
+            return;
+        }
+
+        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
+        case EVENT_TAP_NET:
+            event_tap_net_save(f, &log->net_req);
+            break;
+        case EVENT_TAP_BLK:
+            event_tap_blk_save(f, &log->blk_req);
+            break;
+        default:
+            fprintf(stderr, "Unknown state %d\n", log->mode);
+            return;
+        }
+    }
+
+    qemu_put_byte(f, 0); /* EOF */
+}
+
+static int event_tap_load(QEMUFile *f, void *opaque, int version_id)
+{
+    EventTapLog *log, *next;
+    int mode;
+
+    event_tap_state = EVENT_TAP_LOAD;
+
+    QTAILQ_FOREACH_SAFE(log, &event_list, node, next) {
+        QTAILQ_REMOVE(&event_list, log, node);
+        event_tap_free_log(log);
+    }
+
+    /* loop until EOF */
+    while ((mode = qemu_get_byte(f)) != 0) {
+        EventTapLog *log = event_tap_alloc_log();
+
+        log->mode = mode;
+        switch (log->mode & EVENT_TAP_TYPE_MASK) {
+        case EVENT_TAP_IOPORT:
+            event_tap_ioport_load(f, &log->ioport);
+            break;
+        case EVENT_TAP_MMIO:
+            event_tap_mmio_load(f, &log->mmio);
+            break;
+        case 0:
+            DPRINTF("No event\n");
+            break;
+        default:
+            fprintf(stderr, "Unknown state %d\n", log->mode);
+            return -1;
+        }
+
+        switch (log->mode & ~EVENT_TAP_TYPE_MASK) {
+        case EVENT_TAP_NET:
+            event_tap_net_load(f, &log->net_req);
+            break;
+        case EVENT_TAP_BLK:
+            event_tap_blk_load(f, &log->blk_req);
+            break;
+        default:
+            fprintf(stderr, "Unknown state %d\n", log->mode);
+            return -1;
+        }
+
+        QTAILQ_INSERT_TAIL(&event_list, log, node);
+    }
+
+    return 0;
+}
+
+void event_tap_init(void)
+{
+    QTAILQ_INIT(&event_list);
+    QTAILQ_INIT(&event_pool);    
+    register_savevm(NULL, "event-tap", 0, 1,
+                    event_tap_save, event_tap_load, &last_event_tap);
+    vmstate = qemu_add_vm_change_state_handler(event_tap_replay, NULL);
+}
diff --git a/event-tap.h b/event-tap.h
new file mode 100644
index 0000000..61b9bbc
--- /dev/null
+++ b/event-tap.h
@@ -0,0 +1,34 @@ 
+/*
+ * Event Tap functions for QEMU
+ *
+ * Copyright (c) 2010 Nippon Telegraph and Telephone Corporation. 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#ifndef EVENT_TAP_H
+#define EVENT_TAP_H
+
+#include "qemu-common.h"
+
+enum EVENT_TAP_STATE {
+    EVENT_TAP_OFF,
+    EVENT_TAP_ON,
+    EVENT_TAP_SUSPEND,
+    EVENT_TAP_LOAD,
+    EVENT_TAP_REPLAY,
+};
+
+int event_tap_register(int (*cb)(void));
+int event_tap_unregister(void);
+void event_tap_suspend(void);
+void event_tap_resume(void);
+int event_tap_get_state(void);
+void event_tap_ioport(int index, uint32_t address, uint32_t data);
+void event_tap_mmio(uint64_t address, uint8_t *buf, int len);
+void event_tap_init(void);
+void event_tap_flush(void);
+int event_tap_flush_one(void);
+
+#endif
diff --git a/net.h b/net.h
index 44c31a9..93fd403 100644
--- a/net.h
+++ b/net.h
@@ -105,6 +105,10 @@  ssize_t qemu_sendv_packet(VLANClientState *vc, const struct iovec *iov,
 ssize_t qemu_sendv_packet_async(VLANClientState *vc, const struct iovec *iov,
                                 int iovcnt, NetPacketSent *sent_cb);
 void qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size);
+void qemu_send_packet_proxy(VLANClientState *vc, const uint8_t *buf, int size);
+ssize_t qemu_sendv_packet_async_proxy(VLANClientState *vc,
+                                      const struct iovec *iov,
+                                      int iovcnt, NetPacketSent *sent_cb);
 ssize_t qemu_send_packet_raw(VLANClientState *vc, const uint8_t *buf, int size);
 ssize_t qemu_send_packet_async(VLANClientState *vc, const uint8_t *buf,
                                int size, NetPacketSent *sent_cb);
diff --git a/net/queue.c b/net/queue.c
index 2ea6cd0..e7a35b0 100644
--- a/net/queue.c
+++ b/net/queue.c
@@ -258,3 +258,4 @@  void qemu_net_queue_flush(NetQueue *queue)
         qemu_free(packet);
     }
 }
+