Patchwork [2/3] Add support for live block copy

login
register
mail settings
Submitter Marcelo Tosatti
Date Feb. 22, 2011, 5 p.m.
Message ID <20110222170115.710717278@redhat.com>
Download mbox | patch
Permalink /patch/83998/
State New
Headers show

Comments

Marcelo Tosatti - Feb. 22, 2011, 5 p.m.
Support live image copy + switch. That is, copy an image backing 
a guest hard disk to a destination image (destination image must 
be created separately), and switch to this copy.

Command syntax:

block_copy device filename [commit_filename] [-i] -- live block copy device to image
             optional commit filename 
             -i for incremental copy (base image shared between src and destination)

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Anthony Liguori - Feb. 22, 2011, 8:50 p.m.
On 02/22/2011 11:00 AM, Marcelo Tosatti wrote:
> Support live image copy + switch. That is, copy an image backing
> a guest hard disk to a destination image (destination image must
> be created separately), and switch to this copy.
>
> Command syntax:
>
> block_copy device filename [commit_filename] [-i] -- live block copy device to image
>               optional commit filename
>               -i for incremental copy (base image shared between src and destination)
>
> Signed-off-by: Marcelo Tosatti<mtosatti@redhat.com>
>
> Index: qemu/block-copy.c
> ===================================================================
> --- /dev/null
> +++ qemu/block-copy.c
> @@ -0,0 +1,741 @@
> +/*
> + * QEMU live block copy
> + *
> + * Copyright (C) 2010 Red Hat Inc.
> + *
> + * Authors: Marcelo Tosatti<mtosatti@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu-common.h"
> +#include "block_int.h"
> +#include "blockdev.h"
> +#include "qemu-queue.h"
> +#include "qemu-timer.h"
> +#include "monitor.h"
> +#include "block-copy.h"
> +#include "migration.h"
> +#include "sysemu.h"
> +#include "qjson.h"
> +#include<assert.h>
> +
> +#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK<<  BDRV_SECTOR_BITS)
> +#define MAX_IS_ALLOCATED_SEARCH 65536
> +
> +/*
> + * Stages:
> + *
> + * STAGE_BULK: bulk reads/writes in progress
> + * STAGE_BULK_FINISHED: bulk reads finished, bulk writes in progress
> + * STAGE_DIRTY: bulk writes finished, dirty reads/writes in progress
> + * STAGE_SWITCH_FINISHED: switched to new image.
> + */
> +
> +enum BdrvCopyStage {
> +    STAGE_BULK,
> +    STAGE_BULK_FINISHED,
> +    STAGE_DIRTY,
> +    STAGE_SWITCH_FINISHED,
> +};
> +
> +typedef struct BdrvCopyState {
> +    BlockDriverState *src;
> +    BlockDriverState *dst;
> +    bool shared_base;
> +
> +    int64_t curr_sector;
> +    int64_t completed_sectors;
> +    int64_t nr_sectors;
> +
> +    enum BdrvCopyStage stage;
> +    int inflight_reads;
> +    int error;
> +    int failed;
> +    int cancelled;
> +    QLIST_HEAD(, BdrvCopyBlock) io_list;
> +    unsigned long *aio_bitmap;
> +    QEMUTimer *aio_timer;
> +    QLIST_ENTRY(BdrvCopyState) list;
> +
> +    int64_t blocks;
> +    int64_t total_time;
> +
> +    char src_device_name[32];
> +    char dst_filename[1024];
> +    int commit_fd;
> +} BdrvCopyState;
> +
> +typedef struct BdrvCopyBlock {
> +    BdrvCopyState *state;
> +    uint8_t *buf;
> +    int64_t sector;
> +    int64_t nr_sectors;
> +    struct iovec iov;
> +    QEMUIOVector qiov;
> +    BlockDriverAIOCB *aiocb;
> +    int64_t time;
> +    QLIST_ENTRY(BdrvCopyBlock) list;
> +} BdrvCopyBlock;
> +
> +static QLIST_HEAD(, BdrvCopyState) block_copy_list =
> +    QLIST_HEAD_INITIALIZER(block_copy_list);
> +
> +static void alloc_aio_bitmap(BdrvCopyState *s)
> +{
> +    BlockDriverState *bs = s->src;
> +    int64_t bitmap_size;
> +
> +    bitmap_size = (bdrv_getlength(bs)>>  BDRV_SECTOR_BITS) +
> +            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
> +    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
> +
> +    s->aio_bitmap = qemu_mallocz(bitmap_size);
> +}
> +
> +static bool aio_inflight(BdrvCopyState *s, int64_t sector)
> +{
> +    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> +    if (s->aio_bitmap&&
> +        (sector<<  BDRV_SECTOR_BITS)<  bdrv_getlength(s->src)) {
> +        return !!(s->aio_bitmap[chunk / (sizeof(unsigned long) * 8)]&
> +            (1UL<<  (chunk % (sizeof(unsigned long) * 8))));
> +    } else {
> +        return 0;
> +    }
> +}
> +
> +static void set_aio_inflight(BdrvCopyState *s, int64_t sector_num,
> +                             int nb_sectors, int set)
> +{
> +    int64_t start, end;
> +    unsigned long val, idx, bit;
> +
> +    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
> +    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> +    for (; start<= end; start++) {
> +        idx = start / (sizeof(unsigned long) * 8);
> +        bit = start % (sizeof(unsigned long) * 8);
> +        val = s->aio_bitmap[idx];
> +        if (set) {
> +            if (!(val&  (1UL<<  bit))) {
> +                val |= 1UL<<  bit;
> +            }
> +        } else {
> +            if (val&  (1UL<<  bit)) {
> +                val&= ~(1UL<<  bit);
> +            }
> +        }
> +        s->aio_bitmap[idx] = val;
> +    }
> +}
> +
> +static void blkcopy_set_stage(BdrvCopyState *s, enum BdrvCopyStage stage)
> +{
> +    s->stage = stage;
> +
> +    switch (stage) {
> +    case STAGE_BULK:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK);
> +        break;
> +    case STAGE_BULK_FINISHED:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK_FINISHED);
> +        break;
> +    case STAGE_DIRTY:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_DIRTY);
> +        break;
> +    case STAGE_SWITCH_FINISHED:
> +        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED);
> +        break;
> +    default:
> +        break;
> +    }
> +}
> +
> +static void blk_copy_handle_cb_error(BdrvCopyState *s, int ret)
> +{
> +    s->error = ret;
> +    qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> +}
> +
> +static inline void add_avg_transfer_time(BdrvCopyState *s, int64_t time)
> +{
> +    s->blocks++;
> +    s->total_time += time;
> +}
> +
> +static void blk_copy_write_cb(void *opaque, int ret)
> +{
> +    BdrvCopyBlock *blk = opaque;
> +    BdrvCopyState *s = blk->state;
> +
> +    if (ret<  0) {
> +        QLIST_REMOVE(blk, list);
> +        qemu_free(blk->buf);
> +        qemu_free(blk);
> +        blk_copy_handle_cb_error(s, ret);
> +        return;
> +    }
> +
> +    QLIST_REMOVE(blk, list);
> +    add_avg_transfer_time(s, qemu_get_clock_ns(rt_clock) - blk->time);
> +
> +    /* schedule switch to STAGE_DIRTY on last bulk write completion */
> +    if (blk->state->stage == STAGE_BULK_FINISHED) {
> +        qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> +    }
> +
> +    if (blk->state->stage>  STAGE_BULK_FINISHED) {
> +        set_aio_inflight(blk->state, blk->sector, blk->nr_sectors, 0);
> +    }
> +
> +    qemu_free(blk->buf);
> +    qemu_free(blk);
> +}
> +
> +static void blk_copy_issue_write(BdrvCopyState *s, BdrvCopyBlock *read_blk)
> +{
> +    BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
> +    blk->state = s;
> +    blk->sector = read_blk->sector;
> +    blk->nr_sectors = read_blk->nr_sectors;
> +    blk->time = read_blk->time;
> +    blk->buf = read_blk->buf;
> +    QLIST_INSERT_HEAD(&s->io_list, blk, list);
> +
> +    blk->iov.iov_base = read_blk->buf;
> +    blk->iov.iov_len = read_blk->iov.iov_len;
> +    qemu_iovec_init_external(&blk->qiov,&blk->iov, 1);
> +
> +    BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_AIO_WRITE);
> +    blk->aiocb = bdrv_aio_writev(s->dst, blk->sector,&blk->qiov,
> +                                 blk->iov.iov_len / BDRV_SECTOR_SIZE,
> +                                 blk_copy_write_cb, blk);
> +    if (!blk->aiocb) {
> +        s->error = 1;
> +        goto error;
> +    }
> +
> +    return;
> +
> +error:
> +    QLIST_REMOVE(blk, list);
> +    qemu_free(read_blk->buf);
> +    qemu_free(blk);
> +}
> +
> +static void blk_copy_read_cb(void *opaque, int ret)
> +{
> +    BdrvCopyBlock *blk = opaque;
> +    BdrvCopyState *s = blk->state;
> +
> +    s->inflight_reads--;
> +    if (ret<  0) {
> +        QLIST_REMOVE(blk, list);
> +        qemu_free(blk->buf);
> +        qemu_free(blk);
> +        blk_copy_handle_cb_error(s, ret);
> +        return;
> +    }
> +    blk_copy_issue_write(s, blk);
> +    QLIST_REMOVE(blk, list);
> +    qemu_free(blk);
> +    qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
> +}
> +
> +static void blk_copy_issue_read(BdrvCopyState *s, int64_t sector,
> +                                int nr_sectors)
> +{
> +    BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
> +    blk->buf = qemu_mallocz(BLOCK_SIZE);
> +    blk->state = s;
> +    blk->sector = sector;
> +    blk->nr_sectors = nr_sectors;
> +    QLIST_INSERT_HEAD(&s->io_list, blk, list);
> +
> +    blk->iov.iov_base = blk->buf;
> +    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
> +    qemu_iovec_init_external(&blk->qiov,&blk->iov, 1);
> +
> +    s->inflight_reads++;
> +    blk->time = qemu_get_clock_ns(rt_clock);
> +    blk->aiocb = bdrv_aio_readv(s->src, sector,&blk->qiov, nr_sectors,
> +                                blk_copy_read_cb, blk);
> +    if (!blk->aiocb) {
> +        s->error = 1;
> +        goto error;
> +    }
> +
> +    return;
> +
> +error:
> +    s->inflight_reads--;
> +    QLIST_REMOVE(blk, list);
> +    qemu_free(blk->buf);
> +    qemu_free(blk);
> +}
> +
> +static bool blkcopy_can_switch(BdrvCopyState *s)
> +{
> +    int64_t remaining_dirty;
> +    int64_t avg_transfer_time;
> +
> +    remaining_dirty = bdrv_get_dirty_count(s->src);
> +    if (remaining_dirty == 0 || s->blocks == 0) {
> +        return true;
> +    }
> +
> +    avg_transfer_time = s->total_time / s->blocks;
> +    if ((remaining_dirty * avg_transfer_time)<= migrate_max_downtime()) {
> +        return true;
> +    }
> +    return false;
> +}
> +
> +static int blk_issue_reads_dirty(BdrvCopyState *s)
> +{
> +    int64_t sector;
> +
> +    for (sector = s->curr_sector; sector<  s->nr_sectors;) {
> +        if (bdrv_get_dirty(s->src, sector)&&  !aio_inflight(s, sector)) {
> +            int nr_sectors = MIN(s->nr_sectors - s->curr_sector,
> +                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
> +
> +            blk_copy_issue_read(s, sector, nr_sectors);
> +            bdrv_reset_dirty(s->src, sector, nr_sectors);
> +            set_aio_inflight(s, sector, nr_sectors, 1);
> +            break;
> +        }
> +
> +        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
> +        s->curr_sector = sector;
> +    }
> +
> +    if (sector>= s->nr_sectors) {
> +        s->curr_sector = 0;
> +    }
> +    return 0;
> +}
> +
> +static int blk_issue_reads_bulk(BdrvCopyState *s)
> +{
> +    int nr_sectors;
> +    int64_t curr_sector = s->curr_sector;
> +
> +    if (s->shared_base) {
> +        while (curr_sector<  s->nr_sectors&&
> +                !bdrv_is_allocated(s->src, curr_sector,
> +                                   MAX_IS_ALLOCATED_SEARCH,&nr_sectors)) {
> +                curr_sector += nr_sectors;
> +        }
> +    }
> +
> +    if (curr_sector>= s->nr_sectors) {
> +        s->curr_sector = 0;
> +        return 1;
> +    }
> +
> +    curr_sector&= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
> +    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> +    blk_copy_issue_read(s, s->curr_sector, nr_sectors);
> +    s->curr_sector += nr_sectors;
> +    s->completed_sectors = curr_sector;
> +    return 0;
> +}
> +
> +static void blkcopy_finish(BdrvCopyState *s)
> +{
> +    int64_t sector;
> +    uint8_t *buf;
> +
> +    buf = qemu_malloc(BLOCK_SIZE);
> +
> +    /* FIXME: speed up loop, get_next_dirty_block? */
> +    for (sector = 0; sector<  s->nr_sectors;
> +         sector += BDRV_SECTORS_PER_DIRTY_CHUNK) {
> +        if (bdrv_get_dirty(s->src, sector)) {
> +            int nr_sectors = MIN(s->nr_sectors - sector,
> +                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
> +
> +            memset(buf, 0, BLOCK_SIZE);
> +            if (bdrv_read(s->src, sector, buf, nr_sectors)<  0) {
> +                goto error;
> +            }
> +            if (bdrv_write(s->dst, sector, buf, nr_sectors)<  0) {
> +                goto error;
> +            }
> +            bdrv_reset_dirty(s->src, sector, nr_sectors);
> +        }
> +
> +        if (bdrv_get_dirty_count(s->src) == 0)
> +            break;
> +    }
> +    qemu_free(buf);
> +    return;
> +
> +error:
> +    qemu_free(buf);
> +    s->error = 1;
> +}
> +
> +static int write_commit_file(BdrvCopyState *s)
> +{
> +    char commit_msg[1400];
> +    const char *buf = commit_msg;
> +    int len, ret;
> +
> +    sprintf(commit_msg, "commit QEMU block_copy %s ->  %s\n", s->src_device_name,
> +                        s->dst_filename);
> +
> +    len = strlen(commit_msg);
> +    while (len>  0) {
> +        ret = write(s->commit_fd, buf, len);
> +        if (ret == -1&&  errno == EINTR) {
> +            continue;
> +        }
> +        if (ret<= 0) {
> +            return -errno;
> +        }
> +        buf += ret;
> +        len -= ret;
> +    }
> +
> +    if (fsync(s->commit_fd) == -1) {
> +        return -errno;
> +    }
>
>    

This is more or less black magic.  What is this commit file used for and 
why aren't we using something like a QMP event?

Regards,

Anthony Liguori
Anthony Liguori - Feb. 22, 2011, 8:50 p.m.
On 02/22/2011 11:00 AM, Marcelo Tosatti wrote:

This patch is large and not properly inlined.  Can it be split up into 
more reviewable chunks?

Regards,

Anthony Liguori
Marcelo Tosatti - Feb. 22, 2011, 9:07 p.m.
On Tue, Feb 22, 2011 at 02:50:09PM -0600, Anthony Liguori wrote:
> >+static int write_commit_file(BdrvCopyState *s)
> >+{
> >+    char commit_msg[1400];
> >+    const char *buf = commit_msg;
> >+    int len, ret;
> >+
> >+    sprintf(commit_msg, "commit QEMU block_copy %s ->  %s\n", s->src_device_name,
> >+                        s->dst_filename);
> >+
> >+    len = strlen(commit_msg);
> >+    while (len>  0) {
> >+        ret = write(s->commit_fd, buf, len);
> >+        if (ret == -1&&  errno == EINTR) {
> >+            continue;
> >+        }
> >+        if (ret<= 0) {
> >+            return -errno;
> >+        }
> >+        buf += ret;
> >+        len -= ret;
> >+    }
> >+
> >+    if (fsync(s->commit_fd) == -1) {
> >+        return -errno;
> >+    }
> >
> 
> This is more or less black magic.  What is this commit file used for
> and why aren't we using something like a QMP event?

The commit file is considered reliable storage for the result of image
switch operation. Think of the following scenario:

- mgmt app requests live copy of guests ide1-hd0 
from /a/image.img to /b/image.img.
- mgmt app dies.
- guest switches to /b/image.img, /a/image.img is outdated.
- guest dies.

Notifying the switch via QMP would not be reliable in this case.

Will add this as a comment in the code.
Anthony Liguori - Feb. 22, 2011, 9:11 p.m.
On 02/22/2011 03:07 PM, Marcelo Tosatti wrote:
> On Tue, Feb 22, 2011 at 02:50:09PM -0600, Anthony Liguori wrote:
>    
>>> +static int write_commit_file(BdrvCopyState *s)
>>> +{
>>> +    char commit_msg[1400];
>>> +    const char *buf = commit_msg;
>>> +    int len, ret;
>>> +
>>> +    sprintf(commit_msg, "commit QEMU block_copy %s ->   %s\n", s->src_device_name,
>>> +                        s->dst_filename);
>>> +
>>> +    len = strlen(commit_msg);
>>> +    while (len>   0) {
>>> +        ret = write(s->commit_fd, buf, len);
>>> +        if (ret == -1&&   errno == EINTR) {
>>> +            continue;
>>> +        }
>>> +        if (ret<= 0) {
>>> +            return -errno;
>>> +        }
>>> +        buf += ret;
>>> +        len -= ret;
>>> +    }
>>> +
>>> +    if (fsync(s->commit_fd) == -1) {
>>> +        return -errno;
>>> +    }
>>>
>>>        
>> This is more or less black magic.  What is this commit file used for
>> and why aren't we using something like a QMP event?
>>      
> The commit file is considered reliable storage for the result of image
> switch operation. Think of the following scenario:
>
> - mgmt app requests live copy of guests ide1-hd0
> from /a/image.img to /b/image.img.
> - mgmt app dies.
> - guest switches to /b/image.img, /a/image.img is outdated.
> - guest dies.
>
> Notifying the switch via QMP would not be reliable in this case.
>    

But this is true of any number of operations in QEMU such as hotplug 
where if a management tool dies after requesting hotplug and then the 
guest dies before the management tool reconnects, it's never known 
whether it's truly connected or not.

The only way to robustly solve this is for QEMU to maintain a stateful 
config file.  A one-off for this particular command doesn't seem wise to me.

Regards,

Anthony Liguori

> Will add this as a comment in the code.
>
>
Anthony Liguori - Feb. 22, 2011, 9:16 p.m.
On 02/22/2011 11:00 AM, Marcelo Tosatti wrote:
> +
> +static void blkcopy_switch(BdrvCopyState *s)
> +{
> +    char src_filename[1024];
> +    int open_flags;
> +
> +    strncpy(src_filename, s->src->filename, sizeof(src_filename));
> +    open_flags = s->src->open_flags;
>    

strncpy doesn't leave a NULL terminator if it runs out of space.  I 
don't think that's your desire here.

Regards,

Anthony Liguori
Marcelo Tosatti - Feb. 22, 2011, 11:09 p.m.
On Tue, Feb 22, 2011 at 03:11:06PM -0600, Anthony Liguori wrote:
> On 02/22/2011 03:07 PM, Marcelo Tosatti wrote:
> >On Tue, Feb 22, 2011 at 02:50:09PM -0600, Anthony Liguori wrote:
> >>>+static int write_commit_file(BdrvCopyState *s)
> >>>+{
> >>>+    char commit_msg[1400];
> >>>+    const char *buf = commit_msg;
> >>>+    int len, ret;
> >>>+
> >>>+    sprintf(commit_msg, "commit QEMU block_copy %s ->   %s\n", s->src_device_name,
> >>>+                        s->dst_filename);
> >>>+
> >>>+    len = strlen(commit_msg);
> >>>+    while (len>   0) {
> >>>+        ret = write(s->commit_fd, buf, len);
> >>>+        if (ret == -1&&   errno == EINTR) {
> >>>+            continue;
> >>>+        }
> >>>+        if (ret<= 0) {
> >>>+            return -errno;
> >>>+        }
> >>>+        buf += ret;
> >>>+        len -= ret;
> >>>+    }
> >>>+
> >>>+    if (fsync(s->commit_fd) == -1) {
> >>>+        return -errno;
> >>>+    }
> >>>
> >>This is more or less black magic.  What is this commit file used for
> >>and why aren't we using something like a QMP event?
> >The commit file is considered reliable storage for the result of image
> >switch operation. Think of the following scenario:
> >
> >- mgmt app requests live copy of guests ide1-hd0
> >from /a/image.img to /b/image.img.
> >- mgmt app dies.
> >- guest switches to /b/image.img, /a/image.img is outdated.
> >- guest dies.
> >
> >Notifying the switch via QMP would not be reliable in this case.
> 
> But this is true of any number of operations in QEMU such as hotplug
> where if a management tool dies after requesting hotplug and then
> the guest dies before the management tool reconnects, it's never
> known whether it's truly connected or not.

This is a different case. The mgmt tool should be able to safely store
the desired device tree before hotplugging a device.

And even if you consider that should be done by QEMU, in a config file,
restarting the guest with or without the device is not going to result
in data corruption/loss.

Image switch is critical because using an outdated image will result in
data corruption.

> The only way to robustly solve this is for QEMU to maintain a
> stateful config file.  A one-off for this particular command doesn't
> seem wise to me.

I don't disagree the commit message could be in a config file, but since
that does not exist, a separate file is a workable solution. Also, the
separate commit file is not incompatible with future improvements.
Anthony Liguori - Feb. 22, 2011, 11:14 p.m.
On 02/22/2011 05:09 PM, Marcelo Tosatti wrote:
> On Tue, Feb 22, 2011 at 03:11:06PM -0600, Anthony Liguori wrote:
>    
>> On 02/22/2011 03:07 PM, Marcelo Tosatti wrote:
>>      
>>> On Tue, Feb 22, 2011 at 02:50:09PM -0600, Anthony Liguori wrote:
>>>        
>>>>> +static int write_commit_file(BdrvCopyState *s)
>>>>> +{
>>>>> +    char commit_msg[1400];
>>>>> +    const char *buf = commit_msg;
>>>>> +    int len, ret;
>>>>> +
>>>>> +    sprintf(commit_msg, "commit QEMU block_copy %s ->    %s\n", s->src_device_name,
>>>>> +                        s->dst_filename);
>>>>> +
>>>>> +    len = strlen(commit_msg);
>>>>> +    while (len>    0) {
>>>>> +        ret = write(s->commit_fd, buf, len);
>>>>> +        if (ret == -1&&    errno == EINTR) {
>>>>> +            continue;
>>>>> +        }
>>>>> +        if (ret<= 0) {
>>>>> +            return -errno;
>>>>> +        }
>>>>> +        buf += ret;
>>>>> +        len -= ret;
>>>>> +    }
>>>>> +
>>>>> +    if (fsync(s->commit_fd) == -1) {
>>>>> +        return -errno;
>>>>> +    }
>>>>>
>>>>>            
>>>> This is more or less black magic.  What is this commit file used for
>>>> and why aren't we using something like a QMP event?
>>>>          
>>> The commit file is considered reliable storage for the result of image
>>> switch operation. Think of the following scenario:
>>>
>>> - mgmt app requests live copy of guests ide1-hd0
>>>        
>> >from /a/image.img to /b/image.img.
>>      
>>> - mgmt app dies.
>>> - guest switches to /b/image.img, /a/image.img is outdated.
>>> - guest dies.
>>>
>>> Notifying the switch via QMP would not be reliable in this case.
>>>        
>> But this is true of any number of operations in QEMU such as hotplug
>> where if a management tool dies after requesting hotplug and then
>> the guest dies before the management tool reconnects, it's never
>> known whether it's truly connected or not.
>>      
> This is a different case. The mgmt tool should be able to safely store
> the desired device tree before hotplugging a device.
>
> And even if you consider that should be done by QEMU, in a config file,
> restarting the guest with or without the device is not going to result
> in data corruption/loss.
>    

If a guest started writing to a disk and is storing important data 
there, and then the guest is restarted and the disk (and data) isn't 
there, I'd classify that as data loss.

>> The only way to robustly solve this is for QEMU to maintain a
>> stateful config file.  A one-off for this particular command doesn't
>> seem wise to me.
>>      
> I don't disagree the commit message could be in a config file, but since
> that does not exist, a separate file is a workable solution.

I disagree.  This is something we'll have to maintain forever and what's 
to stop us from having additional one-off mechanisms to deal with this 
same problem.

-drive already ties into the qemuopts infrastructure and we have 
readconfig and writeconfig.  I don't think we're missing any major 
pieces to do this in a more proper fashion.

Regards,

Anthony Liguori

>   Also, the
> separate commit file is not incompatible with future improvements.
>
>
Avi Kivity - Feb. 23, 2011, 12:46 p.m.
On 02/22/2011 11:11 PM, Anthony Liguori wrote:
>> The commit file is considered reliable storage for the result of image
>> switch operation. Think of the following scenario:
>>
>> - mgmt app requests live copy of guests ide1-hd0
>> from /a/image.img to /b/image.img.
>> - mgmt app dies.
>> - guest switches to /b/image.img, /a/image.img is outdated.
>> - guest dies.
>>
>> Notifying the switch via QMP would not be reliable in this case.
>
> But this is true of any number of operations in QEMU such as hotplug 
> where if a management tool dies after requesting hotplug and then the 
> guest dies before the management tool reconnects, it's never known 
> whether it's truly connected or not.
>
> The only way to robustly solve this is for QEMU to maintain a stateful 
> config file.

Or, the management tool can remember that it tries to hotplug, 
reconnect, query qemu (or just retry), and proceed according to the result:

- insert record in config for the new device, status = inactive
- commit
- tell qemu to hotplug
<crash>
- restart, read database, see inactive device
- query qemu
- if device is there, mark as active, commit
- else, tell qemu to hotplug, when successful, mark as active, commit


>   A one-off for this particular command doesn't seem wise to me.

Strongly agreed.
Avi Kivity - Feb. 23, 2011, 1:01 p.m.
On 02/23/2011 01:14 AM, Anthony Liguori wrote:
>
> -drive already ties into the qemuopts infrastructure and we have 
> readconfig and writeconfig.  I don't think we're missing any major 
> pieces to do this in a more proper fashion.

The problem with qemu config files is that it splits the authoritative 
source of where images are stored into two.  Is it in the management 
tool's database or is it in qemu's config file?

For the problem at hand, one solution is to make qemu stop after the 
copy, and then management can issue an additional command to rearrange 
the disk and resume the guest.  A drawback here is that if management 
dies, the guest is stopped until it restarts.  We also make management 
latency guest visible, even if it doesn't die at an inconvenient place.

An alternative approach is to have the copy be performed by a new 
layered block format driver:

- create a new image, type = live-copy, containing three pieces of 
information
    - source image
    - destination image
    - copy state (initially nothing is copied)
- tell qemu switch to the new image
- qemu starts copying, updates copy state as needed
- copy finishes, event is emitted; reads and writes still serviced
- management receives event, switches qemu to destination image
- management removes live-copy image

If management dies while this is happening, it can simply query the 
state of the copy.  Similarly, if qemu dies, the copy state is 
persistent (could be 0/1 or real range of blocks).
Anthony Liguori - Feb. 23, 2011, 2:35 p.m.
On 02/23/2011 07:01 AM, Avi Kivity wrote:
> On 02/23/2011 01:14 AM, Anthony Liguori wrote:
>>
>> -drive already ties into the qemuopts infrastructure and we have 
>> readconfig and writeconfig.  I don't think we're missing any major 
>> pieces to do this in a more proper fashion.
>
> The problem with qemu config files is that it splits the authoritative 
> source of where images are stored into two.  Is it in the management 
> tool's database or is it in qemu's config file?

I like to use the phrase "stateful config file".  To me, it's just a 
database for QEMU to persist data about the VM.  It's the only way for 
QEMU to make certain transactions atomic in the face of QEMU crashing.

The user visible config file is a totally different concept.  A 
management tool launches QEMU and tells it where to keep it's state 
database.  The management application may prepopulate the state database 
or it may just use an empty file.

QEMU uses the state database to store information that is created 
dynamically.  For instance, devices added through device_add.  A device 
added via -device wouldn't necessary get added to the state database.

Practically speaking, it let's you invoke QEMU with a fixed command 
line, while still using the monitor to make changes that would otherwise 
require the command line being updated.

> For the problem at hand, one solution is to make qemu stop after the 
> copy, and then management can issue an additional command to rearrange 
> the disk and resume the guest.  A drawback here is that if management 
> dies, the guest is stopped until it restarts.  We also make management 
> latency guest visible, even if it doesn't die at an inconvenient place.
>
> An alternative approach is to have the copy be performed by a new 
> layered block format driver:
>
> - create a new image, type = live-copy, containing three pieces of 
> information
>    - source image
>    - destination image
>    - copy state (initially nothing is copied)
> - tell qemu switch to the new image
> - qemu starts copying, updates copy state as needed
> - copy finishes, event is emitted; reads and writes still serviced
> - management receives event, switches qemu to destination image
> - management removes live-copy image
>
> If management dies while this is happening, it can simply query the 
> state of the copy.  Similarly, if qemu dies, the copy state is 
> persistent (could be 0/1 or real range of blocks).

This is a more elegant solution to the problem than the commit problem 
but it's also a one-off.  I think we have a generic problem here and we 
ought to try to solve it generically (within reason).

Regards,

Anthony Liguori
Avi Kivity - Feb. 23, 2011, 3:31 p.m.
On 02/23/2011 04:35 PM, Anthony Liguori wrote:
> On 02/23/2011 07:01 AM, Avi Kivity wrote:
>> On 02/23/2011 01:14 AM, Anthony Liguori wrote:
>>>
>>> -drive already ties into the qemuopts infrastructure and we have 
>>> readconfig and writeconfig.  I don't think we're missing any major 
>>> pieces to do this in a more proper fashion.
>>
>> The problem with qemu config files is that it splits the 
>> authoritative source of where images are stored into two.  Is it in 
>> the management tool's database or is it in qemu's config file?
>
> I like to use the phrase "stateful config file".  To me, it's just a 
> database for QEMU to persist data about the VM.  It's the only way for 
> QEMU to make certain transactions atomic in the face of QEMU crashing.
>
> The user visible config file is a totally different concept.  A 
> management tool launches QEMU and tells it where to keep it's state 
> database.  The management application may prepopulate the state 
> database or it may just use an empty file.

In that case the word 'config' is misleading.  To me, it implies that 
the user configures something, and qemu reads it, not something mostly 
internal to qemu.

Qemu does keep state.  Currently only images, but in theory also the 
on-board NVRAM.

>
> QEMU uses the state database to store information that is created 
> dynamically.  For instance, devices added through device_add.  A 
> device added via -device wouldn't necessary get added to the state 
> database.
>
> Practically speaking, it let's you invoke QEMU with a fixed command 
> line, while still using the monitor to make changes that would 
> otherwise require the command line being updated.

Then the invoker quickly loses track of what the actual state is.  It 
can't just remember which commands it issued (presumably in response to 
the user updating user visible state).  It has to parse the stateful 
config file qemu outputs.  But at which points should it parse it?

I don't think it's reasonable to have three different ways to interact 
with qemu, all needed: the command line, reading and writing the 
stateful config file, and the monitor.  I'd rather push for starting 
qemu with a blank guest and assembling (cold-plugging) all the hardware 
via the monitor before starting the guest.

>> For the problem at hand, one solution is to make qemu stop after the 
>> copy, and then management can issue an additional command to 
>> rearrange the disk and resume the guest.  A drawback here is that if 
>> management dies, the guest is stopped until it restarts.  We also 
>> make management latency guest visible, even if it doesn't die at an 
>> inconvenient place.
>>
>> An alternative approach is to have the copy be performed by a new 
>> layered block format driver:
>>
>> - create a new image, type = live-copy, containing three pieces of 
>> information
>>    - source image
>>    - destination image
>>    - copy state (initially nothing is copied)
>> - tell qemu switch to the new image
>> - qemu starts copying, updates copy state as needed
>> - copy finishes, event is emitted; reads and writes still serviced
>> - management receives event, switches qemu to destination image
>> - management removes live-copy image
>>
>> If management dies while this is happening, it can simply query the 
>> state of the copy.  Similarly, if qemu dies, the copy state is 
>> persistent (could be 0/1 or real range of blocks).
>
> This is a more elegant solution to the problem than the commit problem 
> but it's also a one-off.  I think we have a generic problem here and 
> we ought to try to solve it generically (within reason).

Can you give more examples?

I think I demonstrated that hot-plug can be solved via the existing 
interfaces.
Anthony Liguori - Feb. 23, 2011, 4:01 p.m.
On 02/23/2011 09:31 AM, Avi Kivity wrote:
> On 02/23/2011 04:35 PM, Anthony Liguori wrote:
>> On 02/23/2011 07:01 AM, Avi Kivity wrote:
>>> On 02/23/2011 01:14 AM, Anthony Liguori wrote:
>>>>
>>>> -drive already ties into the qemuopts infrastructure and we have 
>>>> readconfig and writeconfig.  I don't think we're missing any major 
>>>> pieces to do this in a more proper fashion.
>>>
>>> The problem with qemu config files is that it splits the 
>>> authoritative source of where images are stored into two.  Is it in 
>>> the management tool's database or is it in qemu's config file?
>>
>> I like to use the phrase "stateful config file".  To me, it's just a 
>> database for QEMU to persist data about the VM.  It's the only way 
>> for QEMU to make certain transactions atomic in the face of QEMU 
>> crashing.
>>
>> The user visible config file is a totally different concept.  A 
>> management tool launches QEMU and tells it where to keep it's state 
>> database.  The management application may prepopulate the state 
>> database or it may just use an empty file.
>
> In that case the word 'config' is misleading.  To me, it implies that 
> the user configures something, and qemu reads it, not something mostly 
> internal to qemu.

Understood.

>
> Qemu does keep state.  Currently only images, but in theory also the 
> on-board NVRAM.

Yeah, this is a good example of an area where a "stateful config file" 
would be useful.  I like the idea of storing this sort of thing in a 
text file with a config structure because a user certainly wants to be 
able to specify the boot order.  Being able to tweak this kind of stuff 
adds a lot of interesting capabilities.

>>
>> QEMU uses the state database to store information that is created 
>> dynamically.  For instance, devices added through device_add.  A 
>> device added via -device wouldn't necessary get added to the state 
>> database.
>>
>> Practically speaking, it let's you invoke QEMU with a fixed command 
>> line, while still using the monitor to make changes that would 
>> otherwise require the command line being updated.
>
> Then the invoker quickly loses track of what the actual state is.  It 
> can't just remember which commands it issued (presumably in response 
> to the user updating user visible state).  It has to parse the 
> stateful config file qemu outputs.

Well specifically, it has to ask QEMU and QEMU can tell it the current 
state via a nice structured data format over QMP.  It's a hell of a lot 
easier than the management tool trying to do this outside of QEMU.

>   But at which points should it parse it?

I was thinking that we should post events whenever we change the 
stateful config.  That would let the management tool have a mechanism 
for determining when settings have been changed.  Of course, if the 
management tool crashes, it should re-read at startup.

> I don't think it's reasonable to have three different ways to interact 
> with qemu, all needed: the command line, reading and writing the 
> stateful config file, and the monitor.  I'd rather push for starting 
> qemu with a blank guest and assembling (cold-plugging) all the 
> hardware via the monitor before starting the guest.

Yes.   I view the command line as optional.  To me, this is the ideal 
interaction:

1) start qemu with an empty stateful config file

2) issue monitor commands to create all devices and backends

3) the stateful config file totally captures the state of all of the 
issued QMP commands.  The management tool can relaunch the guest just by 
passing the stateful config file to QEMU.

4) when the management tool needs to "extract" a config file, it can 
read the stateful config (through the monitor) and generate it's own config.

5) the management tool should treat the stateful config file as more or 
less opaque.  It shouldn't be visible to end user.

In the non-managed case, users should interact directly with the config 
file.

>>> For the problem at hand, one solution is to make qemu stop after the 
>>> copy, and then management can issue an additional command to 
>>> rearrange the disk and resume the guest.  A drawback here is that if 
>>> management dies, the guest is stopped until it restarts.  We also 
>>> make management latency guest visible, even if it doesn't die at an 
>>> inconvenient place.
>>>
>>> An alternative approach is to have the copy be performed by a new 
>>> layered block format driver:
>>>
>>> - create a new image, type = live-copy, containing three pieces of 
>>> information
>>>    - source image
>>>    - destination image
>>>    - copy state (initially nothing is copied)
>>> - tell qemu switch to the new image
>>> - qemu starts copying, updates copy state as needed
>>> - copy finishes, event is emitted; reads and writes still serviced
>>> - management receives event, switches qemu to destination image
>>> - management removes live-copy image
>>>
>>> If management dies while this is happening, it can simply query the 
>>> state of the copy.  Similarly, if qemu dies, the copy state is 
>>> persistent (could be 0/1 or real range of blocks).
>>
>> This is a more elegant solution to the problem than the commit 
>> problem but it's also a one-off.  I think we have a generic problem 
>> here and we ought to try to solve it generically (within reason).
>
> Can you give more examples?
>
> I think I demonstrated that hot-plug can be solved via the existing 
> interfaces.

Sure.  CMOS settings right now are not persisted across reboot.  Guest 
initiated activities like IDE or PCI eject are tricky to persist 
correctly within a management tool.

We could add events for all of this things but it's all racy since 
events are posted.  If we have a stateful config file, we can make all 
of these things non-racy and post an event that the config has changed.  
If there's a crash, the management tool can read the config on startup 
to catch up on missed events.

I think the nature of a posted event management interface is such that 
we need a stateful config that persists across QEMU invocations.

Regards,

Anthony Liguori
Avi Kivity - Feb. 23, 2011, 4:14 p.m.
On 02/23/2011 06:01 PM, Anthony Liguori wrote:
>
>>
>> Qemu does keep state.  Currently only images, but in theory also the 
>> on-board NVRAM.
>
> Yeah, this is a good example of an area where a "stateful config file" 
> would be useful.  I like the idea of storing this sort of thing in a 
> text file with a config structure because a user certainly wants to be 
> able to specify the boot order.  Being able to tweak this kind of 
> stuff adds a lot of interesting capabilities.

My preference would be a binary file (a disk image, in fact), with a 
tool to play with the known fields.  It allows a management tool to 
reuse its storage stack.

>
>>>
>>> QEMU uses the state database to store information that is created 
>>> dynamically.  For instance, devices added through device_add.  A 
>>> device added via -device wouldn't necessary get added to the state 
>>> database.
>>>
>>> Practically speaking, it let's you invoke QEMU with a fixed command 
>>> line, while still using the monitor to make changes that would 
>>> otherwise require the command line being updated.
>>
>> Then the invoker quickly loses track of what the actual state is.  It 
>> can't just remember which commands it issued (presumably in response 
>> to the user updating user visible state).  It has to parse the 
>> stateful config file qemu outputs.
>
> Well specifically, it has to ask QEMU and QEMU can tell it the current 
> state via a nice structured data format over QMP.  It's a hell of a 
> lot easier than the management tool trying to do this outside of QEMU.

So, if qemu crashes, the management tool has to start it up to find out 
what the current state is.

>
>>   But at which points should it parse it?
>
> I was thinking that we should post events whenever we change the 
> stateful config.  That would let the management tool have a mechanism 
> for determining when settings have been changed.  Of course, if the 
> management tool crashes, it should re-read at startup.
>
>> I don't think it's reasonable to have three different ways to 
>> interact with qemu, all needed: the command line, reading and writing 
>> the stateful config file, and the monitor.  I'd rather push for 
>> starting qemu with a blank guest and assembling (cold-plugging) all 
>> the hardware via the monitor before starting the guest.
>
> Yes.   I view the command line as optional.  To me, this is the ideal 
> interaction:
>
> 1) start qemu with an empty stateful config file
>
> 2) issue monitor commands to create all devices and backends
>
> 3) the stateful config file totally captures the state of all of the 
> issued QMP commands.  The management tool can relaunch the guest just 
> by passing the stateful config file to QEMU.
>
> 4) when the management tool needs to "extract" a config file, it can 
> read the stateful config (through the monitor) and generate it's own 
> config.
>
> 5) the management tool should treat the stateful config file as more 
> or less opaque.  It shouldn't be visible to end user.
>
> In the non-managed case, users should interact directly with the 
> config file.

Doesn't the stateful non-config file becomes a failure point?  It has to 
be on shared and redundant storage?

To me, it seems a lot easier to require management to replay any 
commands that hadn't been acknowledged (due to management failure), or 
to query qemu as to its current state (if it is alive).  Management 
already needs stable, redundant config storage anyway (often a database).

>> Can you give more examples?
>>
>> I think I demonstrated that hot-plug can be solved via the existing 
>> interfaces.
>
> Sure.  CMOS settings right now are not persisted across reboot. 

That is handled easily with an NVRAM disk image.

> Guest initiated activities like IDE or PCI eject are tricky to persist 
> correctly within a management tool.
>
> We could add events for all of this things but it's all racy since 
> events are posted.  If we have a stateful config file, we can make all 
> of these things non-racy and post an event that the config has 
> changed.  If there's a crash, the management tool can read the config 
> on startup to catch up on missed events.

If qemu crashes, these events are meaningless.  If management crashes, 
it has to query qemu for all state that it wants to keep track of via 
events.

>
> I think the nature of a posted event management interface is such that 
> we need a stateful config that persists across QEMU invocations.

I'm not convinced, and I think making qemu manage even more state 
creates more problems.
Peter Maydell - Feb. 23, 2011, 4:17 p.m.
On 23 February 2011 16:01, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Yes.   I view the command line as optional.  To me, this is the ideal
> interaction:
>
> 1) start qemu with an empty stateful config file
>
> 2) issue monitor commands to create all devices and backends
>
> 3) the stateful config file totally captures the state of all of the issued
> QMP commands.  The management tool can relaunch the guest just by passing
> the stateful config file to QEMU.
>
> 4) when the management tool needs to "extract" a config file, it can read
> the stateful config (through the monitor) and generate it's own config.
>
> 5) the management tool should treat the stateful config file as more or less
> opaque.  It shouldn't be visible to end user.
>
> In the non-managed case, users should interact directly with the config
> file.

For the typical TCG use case, that sounds a whole lot worse than
the current situation of "just run qemu with a suitable command
line"...

-- PMM
Anthony Liguori - Feb. 23, 2011, 4:28 p.m.
On 02/23/2011 10:14 AM, Avi Kivity wrote:
> On 02/23/2011 06:01 PM, Anthony Liguori wrote:
>>
>>>
>>> Qemu does keep state.  Currently only images, but in theory also the 
>>> on-board NVRAM.
>>
>> Yeah, this is a good example of an area where a "stateful config 
>> file" would be useful.  I like the idea of storing this sort of thing 
>> in a text file with a config structure because a user certainly wants 
>> to be able to specify the boot order.  Being able to tweak this kind 
>> of stuff adds a lot of interesting capabilities.
>
> My preference would be a binary file (a disk image, in fact), with a 
> tool to play with the known fields.  It allows a management tool to 
> reuse its storage stack.

I'm not sure yet what I think is best.  To be honest, I was planning on 
tackling this for 0.16 and focusing on the QMP side of the world for 
0.15 so I don't have as well formed opinions yet.

>> Well specifically, it has to ask QEMU and QEMU can tell it the 
>> current state via a nice structured data format over QMP.  It's a 
>> hell of a lot easier than the management tool trying to do this 
>> outside of QEMU.
>
> So, if qemu crashes, the management tool has to start it up to find 
> out what the current state is.

Depends on how opaque we make the state file.  I've been thinking a 
simple ini syntax with a well supported set of keys.  In that case, a 
management tool can read it without starting QEMU.

>>
>>>   But at which points should it parse it?
>>
>> I was thinking that we should post events whenever we change the 
>> stateful config.  That would let the management tool have a mechanism 
>> for determining when settings have been changed.  Of course, if the 
>> management tool crashes, it should re-read at startup.
>>
>>> I don't think it's reasonable to have three different ways to 
>>> interact with qemu, all needed: the command line, reading and 
>>> writing the stateful config file, and the monitor.  I'd rather push 
>>> for starting qemu with a blank guest and assembling (cold-plugging) 
>>> all the hardware via the monitor before starting the guest.
>>
>> Yes.   I view the command line as optional.  To me, this is the ideal 
>> interaction:
>>
>> 1) start qemu with an empty stateful config file
>>
>> 2) issue monitor commands to create all devices and backends
>>
>> 3) the stateful config file totally captures the state of all of the 
>> issued QMP commands.  The management tool can relaunch the guest just 
>> by passing the stateful config file to QEMU.
>>
>> 4) when the management tool needs to "extract" a config file, it can 
>> read the stateful config (through the monitor) and generate it's own 
>> config.
>>
>> 5) the management tool should treat the stateful config file as more 
>> or less opaque.  It shouldn't be visible to end user.
>>
>> In the non-managed case, users should interact directly with the 
>> config file.
>
> Doesn't the stateful non-config file becomes a failure point?  It has 
> to be on shared and redundant storage?

It depends on what your availability model is and how frequently your 
management tool backs up the config.  As of right now, we have a pretty 
glaring reliability hole here so adding a stateful "non-config" can only 
improve things.

> To me, it seems a lot easier to require management to replay any 
> commands that hadn't been acknowledged (due to management failure), or 
> to query qemu as to its current state (if it is alive). 

You still have the race condition around guest initiated events like 
eject.  Unless you have an acknowledged event from a management tool 
(which we can't do in QMP today) whereas you don't complete the guest 
initiated eject operation until management ack's it, we need to store 
that state ourself.

I don't like the idea of making a management tool such an integral part 
of the functional paths.  Not having a stateful config file also means 
that this problem isn't solved in any form without a really 
sophisticated management stack.  I'm a big fan of being robust in the 
face of not-so sophisticated management tools.

> Management already needs stable, redundant config storage anyway 
> (often a database).
>
>>> Can you give more examples?
>>>
>>> I think I demonstrated that hot-plug can be solved via the existing 
>>> interfaces.
>>
>> Sure.  CMOS settings right now are not persisted across reboot. 
>
> That is handled easily with an NVRAM disk image.
>
>> Guest initiated activities like IDE or PCI eject are tricky to 
>> persist correctly within a management tool.
>>
>> We could add events for all of this things but it's all racy since 
>> events are posted.  If we have a stateful config file, we can make 
>> all of these things non-racy and post an event that the config has 
>> changed.  If there's a crash, the management tool can read the config 
>> on startup to catch up on missed events.
>
> If qemu crashes, these events are meaningless.  If management crashes, 
> it has to query qemu for all state that it wants to keep track of via 
> events.

Think power failure, not qemu crash.  In the event of a power failure, 
any hardware change initiated by the guest ought to be consistent with 
when the guest has restarted.  If you eject the CDROM tray and then lose 
power, its still ejected after the power comes back on.

>>
>> I think the nature of a posted event management interface is such 
>> that we need a stateful config that persists across QEMU invocations.
>
> I'm not convinced, and I think making qemu manage even more state 
> creates more problems.

Well this patch series is making qemu management more state.  The only 
question is whether we do this as a one-off mechanism or whether we 
architect a general mechanism to do it.

How much state we store can always be up for discussion but I think it's 
undeniable that we need to store more state than we're storing today (none).

Regards,

Anthony Liguori
Anthony Liguori - Feb. 23, 2011, 4:30 p.m.
On 02/23/2011 10:17 AM, Peter Maydell wrote:
> On 23 February 2011 16:01, Anthony Liguori<anthony@codemonkey.ws>  wrote:
>    
>> Yes.   I view the command line as optional.  To me, this is the ideal
>> interaction:
>>
>> 1) start qemu with an empty stateful config file
>>
>> 2) issue monitor commands to create all devices and backends
>>
>> 3) the stateful config file totally captures the state of all of the issued
>> QMP commands.  The management tool can relaunch the guest just by passing
>> the stateful config file to QEMU.
>>
>> 4) when the management tool needs to "extract" a config file, it can read
>> the stateful config (through the monitor) and generate it's own config.
>>
>> 5) the management tool should treat the stateful config file as more or less
>> opaque.  It shouldn't be visible to end user.
>>
>> In the non-managed case, users should interact directly with the config
>> file.
>>      
> For the typical TCG use case, that sounds a whole lot worse than
> the current situation of "just run qemu with a suitable command
> line"...
>    

This is only an option.  Running qemu with no stateful config and just 
using command line parameters will continue to be well supported.

Internally, no stateful config is essentially equivalent to using a 
stateful config in -snapshot mode where all of the changes are just 
lost.  The command line parameters will all eventually be implemented 
more or less in terms of internal QMP commands.

This is another reason for the QAPI refactoring.  -device handling (and 
all iterations of it) just becomes a call to qmp_device_add().

Regards,

Anthony Liguori

> -- PMM
>
>
Avi Kivity - Feb. 23, 2011, 5:18 p.m.
On 02/23/2011 06:28 PM, Anthony Liguori wrote:
>
>>> Well specifically, it has to ask QEMU and QEMU can tell it the 
>>> current state via a nice structured data format over QMP.  It's a 
>>> hell of a lot easier than the management tool trying to do this 
>>> outside of QEMU.
>>
>> So, if qemu crashes, the management tool has to start it up to find 
>> out what the current state is.
>
> Depends on how opaque we make the state file.  I've been thinking a 
> simple ini syntax with a well supported set of keys.  In that case, a 
> management tool can read it without starting QEMU.

Then the management stack has to worry about yet another way of 
interacting via qemu.  I'd like to limit it to the monitor.

>>
>> Doesn't the stateful non-config file becomes a failure point?  It has 
>> to be on shared and redundant storage?
>
> It depends on what your availability model is and how frequently your 
> management tool backs up the config.  As of right now, we have a 
> pretty glaring reliability hole here so adding a stateful "non-config" 
> can only improve things.

I think the solutions I pointed out close the hole with the existing 
interfaces.

>
>> To me, it seems a lot easier to require management to replay any 
>> commands that hadn't been acknowledged (due to management failure), 
>> or to query qemu as to its current state (if it is alive). 
>
> You still have the race condition around guest initiated events like 
> eject.  Unless you have an acknowledged event from a management tool 
> (which we can't do in QMP today) whereas you don't complete the guest 
> initiated eject operation until management ack's it, we need to store 
> that state ourself.

I don't see why.

If management crashes, it queries the eject state when it reconnects to 
qemu.
If qemu crashes, the eject state is lost, but that is fine.  My CD-ROM 
drive tray pulls itself in when the machine is started.

>
> I don't like the idea of making a management tool such an integral 
> part of the functional paths. 

I agree that we don't want qemu to wait on the management stack any more 
than necessary.

> Not having a stateful config file also means that this problem isn't 
> solved in any form without a really sophisticated management stack.  
> I'm a big fan of being robust in the face of not-so sophisticated 
> management tools.

You're introducing the need for additional code in the management layer, 
the care and feeding for the stateful non-config file.

>> If qemu crashes, these events are meaningless.  If management 
>> crashes, it has to query qemu for all state that it wants to keep 
>> track of via events.
>
> Think power failure, not qemu crash.  In the event of a power failure, 
> any hardware change initiated by the guest ought to be consistent with 
> when the guest has restarted.  If you eject the CDROM tray and then 
> lose power, its still ejected after the power comes back on.

Not on all machines.

Let's list guest state which is independent of power.  That would be 
wither NVRAM of various types, or physical alterations.  CD-ROM eject is 
one.  Are there others?

>
>>>
>>> I think the nature of a posted event management interface is such 
>>> that we need a stateful config that persists across QEMU invocations.
>>
>> I'm not convinced, and I think making qemu manage even more state 
>> creates more problems.
>
> Well this patch series is making qemu management more state.  The only 
> question is whether we do this as a one-off mechanism or whether we 
> architect a general mechanism to do it.
>
> How much state we store can always be up for discussion but I think 
> it's undeniable that we need to store more state than we're storing 
> today (none).

I think my solution (multiplexing block format driver) fits the 
requirements for live-copy perfectly.  In fact it has a name - it's a 
RAID-1 driver started in degraded mode.  It could be useful other use cases.
Markus Armbruster - Feb. 23, 2011, 5:26 p.m.
Avi Kivity <avi@redhat.com> writes:

> On 02/23/2011 04:35 PM, Anthony Liguori wrote:
>> On 02/23/2011 07:01 AM, Avi Kivity wrote:
>>> On 02/23/2011 01:14 AM, Anthony Liguori wrote:
>>>>
>>>> -drive already ties into the qemuopts infrastructure and we have
>>>> readconfig and writeconfig.  I don't think we're missing any major
>>>> pieces to do this in a more proper fashion.
>>>
>>> The problem with qemu config files is that it splits the
>>> authoritative source of where images are stored into two.  Is it in
>>> the management tool's database or is it in qemu's config file?
>>
>> I like to use the phrase "stateful config file".  To me, it's just a
>> database for QEMU to persist data about the VM.  It's the only way
>> for QEMU to make certain transactions atomic in the face of QEMU
>> crashing.
>>
>> The user visible config file is a totally different concept.  A
>> management tool launches QEMU and tells it where to keep it's state
>> database.  The management application may prepopulate the state
>> database or it may just use an empty file.
>
> In that case the word 'config' is misleading.  To me, it implies that
> the user configures something, and qemu reads it, not something mostly
> internal to qemu.
>
> Qemu does keep state.  Currently only images, but in theory also the
> on-board NVRAM.
>
>>
>> QEMU uses the state database to store information that is created
>> dynamically.  For instance, devices added through device_add.  A
>> device added via -device wouldn't necessary get added to the state
>> database.
>>
>> Practically speaking, it let's you invoke QEMU with a fixed command
>> line, while still using the monitor to make changes that would
>> otherwise require the command line being updated.
>
> Then the invoker quickly loses track of what the actual state is.  It
> can't just remember which commands it issued (presumably in response
> to the user updating user visible state).  It has to parse the
> stateful config file qemu outputs.  But at which points should it
> parse it?
>
> I don't think it's reasonable to have three different ways to interact
> with qemu, all needed: the command line, reading and writing the
> stateful config file, and the monitor.  I'd rather push for starting
> qemu with a blank guest and assembling (cold-plugging) all the
> hardware via the monitor before starting the guest.

Exactly.  And qdev has brought his within our reach.

[...]
Marcelo Tosatti - Feb. 23, 2011, 5:49 p.m.
On Wed, Feb 23, 2011 at 03:01:14PM +0200, Avi Kivity wrote:
> On 02/23/2011 01:14 AM, Anthony Liguori wrote:
> >
> >-drive already ties into the qemuopts infrastructure and we have
> >readconfig and writeconfig.  I don't think we're missing any major
> >pieces to do this in a more proper fashion.
> 
> The problem with qemu config files is that it splits the
> authoritative source of where images are stored into two.  Is it in
> the management tool's database or is it in qemu's config file?
> 
> For the problem at hand, one solution is to make qemu stop after the
> copy, and then management can issue an additional command to
> rearrange the disk and resume the guest.  A drawback here is that if
> management dies, the guest is stopped until it restarts.  We also
> make management latency guest visible, even if it doesn't die at an
> inconvenient place.
> 
> An alternative approach is to have the copy be performed by a new
> layered block format driver:
> 
> - create a new image, type = live-copy, containing three pieces of
> information
>    - source image
>    - destination image
>    - copy state (initially nothing is copied)
> - tell qemu switch to the new image
> - qemu starts copying, updates copy state as needed
> - copy finishes, event is emitted; reads and writes still serviced
> - management receives event, switches qemu to destination image
> - management removes live-copy image
> 
> If management dies while this is happening, it can simply query the
> state of the copy.  
> Similarly, if qemu dies, the copy state is persistent (could be 0/1 or
> real range of blocks).

You don't know if a given block is uptodate or not without the dirty
bitmap. So unless you also keep track of dirty log somehow, this is
meaningless.

So a commit file as proposed indicates copy state (in 0/1 fashion). The
difference in your proposal is that such information is stored inside a
special purpose image format?
Anthony Liguori - Feb. 23, 2011, 7:06 p.m.
On 02/22/2011 11:00 AM, Marcelo Tosatti wrote:
> Index: qemu/qerror.h
> ===================================================================
> --- qemu.orig/qerror.h
> +++ qemu/qerror.h
> @@ -171,4 +171,13 @@ QError *qobject_to_qerror(const QObject
>   #define QERR_VNC_SERVER_FAILED \
>       "{ 'class': 'VNCServerFailed', 'data': { 'target': %s } }"
>
> +#define QERR_BLOCKCOPY_IN_PROGRESS \
> +    "{ 'class': 'BlockCopyInProgress', 'data': { 'device': %s } }"
>    

The caller already knows the device name by virtue of issuing the 
command so this is redundant.

I think a better approach would be a QERR_IN_PROGRESS 'data': { 
'operation': %s }

For block copy, we'd say QERR_IN_PROGRESS("block copy").

> +
> +#define QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS \
> +    "{ 'class': 'BlockCopyImageSizeDiffers', 'data': {} }"
> +
> +#define QERR_MIGRATION_IN_PROGRESS \
> +    "{ 'class': 'MigrationInProgress', 'data': {} }"
>    

Then QERR_IN_PROGRESS("live migration")

>   #endif /* QERROR_H */
> Index: qemu/qmp-commands.hx
> ===================================================================
> --- qemu.orig/qmp-commands.hx
> +++ qemu/qmp-commands.hx
> @@ -581,6 +581,75 @@ Example:
>   EQMP
>
>       {
> +        .name       = "block_copy",
> +        .args_type  = "device:s,filename:s,commit_filename:s?,inc:-i",
> +        .params     = "device filename [commit_filename] [-i]",
> +        .help       = "live block copy device to image"
> +                      "\n\t\t\t optional commit filename "
> +                      "\n\t\t\t -i for incremental copy "
> +                      "(base image shared between src and destination)",
> +        .user_print = monitor_user_noop,
> +        .mhandler.cmd_new = do_bdrv_copy,
> +    },
> +
> +SQMP
> +block-copy
> +-------
> +
> +Live block copy.
>    

I'm not sure copy really describes what we're doing here.  Maybe 
migrate-block?

> +Arguments:
> +
> +- "device": device name (json-string)
> +- "filename": target image filename (json-string)
>    

Is this a created image?  Is this an image to create?

To future proof for blockdev, we should make this argument optional and 
if it's not specified throw an error about missing argument.  This let's 
us introduce an optional blockdev argument such that we can use a 
blockdev name.

> +- "commit_filename": target commit filename (json-string, optional)
>    

I think we should drop this.

> +- "inc": incremental disk copy (json-bool, optional)
>    

Let's use the full name (incremental) and we need to describe in detail 
what the semantics of this are.  Will it scan the target block device to 
identify identical blocks?

> +Example:
> +
> +->  { "execute": "block_copy",
> +                            "arguments": { "device": "ide0-hd1",
> +                               "filename": "/mnt/new-disk.img",
> +                               "commit_filename: "/mnt/commit-new-disk.img"
> +                             } }
> +
> +<- { "return": {} }
> +
> +Notes:
> +
> +(1) The 'query-block-copy' command should be used to check block copy progress
> +    and final result (this information is provided by the 'status' member)
> +(2) Boolean argument "inc" defaults to false
>    

We should also document error semantics.  What errors are expected and why?

> +EQMP
> +
> +    {
> +        .name       = "block_copy_cancel",
> +        .args_type  = "device:s",
> +        .params     = "device",
> +        .help       = "cancel live block copy",
> +        .user_print = monitor_user_noop,
> +        .mhandler.cmd_new = do_bdrv_copy_cancel,
> +    },
> +
> +SQMP
> +block_copy_cancel
> +--------------
> +
> +Cancel live block copy.
> +
> +Arguments:
> +
> +- device: device name (json-string)
> +
> +Example:
> +
> +->  { "execute": "block_copy_cancel", "arguments": { "device": "ide0-hd1" } }
> +<- { "return": {} }
>    

cancel-block-migration?

What happens if:
  - No block copy is active anymore (it's completed)
  - A block copy was never started
  - device refers to a device that no longer exists
  - device refers to a device with no media present

If this command succeeds, what the state of target?  If I resume the 
operation with the incremental flag set, will it Just Work?

> +
> +EQMP
> +
> +    {
>           .name       = "netdev_add",
>           .args_type  = "netdev:O",
>           .params     = "[user|tap|socket],id=str[,prop=value][,...]",
> @@ -1740,6 +1809,44 @@ Examples:
>   EQMP
>
>   SQMP
> +query-block-copy
> +-------------
> +
> +Live block copy status.
> +
> +Each block copy instance information is stored in a json-object and the returned
> +value is a json-array of all instances.
> +
> +Each json-object contains the following:
> +
> +- "device": device name (json-string)
> +- "status": block copy status (json-string)
> +    - Possible values: "active", "failed", "completed"
> +- "info": A json-object with the statistics information, if status is "active":
> +    - "percentage": percentage completed (json-int)
>    

So let's fold this into query-block

And let's also report the statistics as MigrationStats which is the format:

{ 'MigrationStats': {'transferred': 'int', 'remaining': 'int', 'total': 
'int' } }

So I'd propose changing:

{ 'BlockDeviceStats': {'rd_bytes': 'int', 'wr_bytes': 'int',
                        'rd_operations': 'int', 'wr_operations': 'int',
                        'wr_highest_offset': 'int' } }
{ 'BlockStats': {'*device': 'str', 'stats': 'BlockDeviceStats',
                  '*parent': 'BlockStats'} }

[ 'query-blockstats', {}, {}, ['BlockStats'] ]

To:

{ 'BlockMigrationStatus': [ 'active', 'inactive', 'cancelled' ] }

{ 'BlockMigrationInfo': {'status': 'BlockMigrationStatus', 'stats': 
'BlockMigrationStats', '*target_filename': 'str' } }

{ 'BlockDeviceStats': {'rd_bytes': 'int', 'wr_bytes': 'int',
                        'rd_operations': 'int', 'wr_operations': 'int',
                        'wr_highest_offset': 'int', '*migration': 
'BlockMigrationInfo' } }
{ 'BlockStats': {'*device': 'str', 'stats': 'BlockDeviceStats',
                  '*parent': 'BlockStats'} }

[ 'query-blockstats', {}, {}, ['BlockStats'] ]

Regards,

Anthony Liguori

> +
> +Example:
> +
> +Block copy for "ide1-hd0" active and block copy for "ide1-hd1" failed:
> +
> +->  { "execute": "query-block-copy" }
> +<- {
> +      "return":[
> +        {"device":"ide1-hd0",
> +            "status":"active",
> +            "info":{
> +               "percentage":23,
> +            }
> +        },
> +        {"device":"ide1-hd1",
> +         "status":"failed"
> +        }
> +      ]
> +   }
> +
> +EQMP
> +
> +SQMP
>   query-balloon
>   -------------
>
> Index: qemu/Makefile.objs
> ===================================================================
> --- qemu.orig/Makefile.objs
> +++ qemu/Makefile.objs
> @@ -98,7 +98,7 @@ common-obj-y += buffered_file.o migratio
>   common-obj-y += qemu-char.o savevm.o #aio.o
>   common-obj-y += msmouse.o ps2.o
>   common-obj-y += qdev.o qdev-properties.o
> -common-obj-y += block-migration.o
> +common-obj-y += block-migration.o block-copy.o
>   common-obj-y += pflib.o
>
>   common-obj-$(CONFIG_BRLAPI) += baum.o
> Index: qemu/sysemu.h
> ===================================================================
> --- qemu.orig/sysemu.h
> +++ qemu/sysemu.h
> @@ -46,6 +46,7 @@ void qemu_del_vm_change_state_handler(VM
>   #define VMSTOP_SAVEVM    6
>   #define VMSTOP_LOADVM    7
>   #define VMSTOP_MIGRATE   8
> +#define VMSTOP_BLOCKCOPY 9
>
>   void vm_start(void);
>   void vm_stop(int reason);
>
>
>
Anthony Liguori - Feb. 23, 2011, 8:06 p.m.
On 02/23/2011 11:26 AM, Markus Armbruster wrote:
>> I don't think it's reasonable to have three different ways to interact
>> with qemu, all needed: the command line, reading and writing the
>> stateful config file, and the monitor.  I'd rather push for starting
>> qemu with a blank guest and assembling (cold-plugging) all the
>> hardware via the monitor before starting the guest.
>>      
> Exactly.  And qdev has brought his within our reach.
>    

Actually, QMP is the star of the show here, not qdev.

The way we get here is by incrementally converting the option handling 
to be qmp calls.  For instance, instead of:

case QEMU_OPTION_name:
       qemu_name = optarg;
       break;

We do:

case QEMU_OPTION_name:
        qmp_set_name(optarg, NULL);
        break;

When we can compile vl.c with nothing more than QMP dependencies, we've 
achieved the goals here.  This will mean adding a lot of new QMP commands.

There are some command line options that must be handled before the 
machine is initialized and QMP is normally run.  For 0.16, we can 
introduce a new QMP mode whereas the event loop gets to run before doing 
machine init and explicit create_machine() command is needed.  This is 
the final bit that will be needed to realize this goal.

A lot of good things come out of this.  Quite a lot of these new 
commands don't strictly need to run before machine init (like -name) 
which means we'll get the ability to change a lot of parameters without 
rebooting the guest which couldn't be changed before.

And this is all incremental stuff that can be done in parallel of the 
QAPI work.  We just need to do the work of adding the function calls (or 
function call wrappers where appropriate).

Regards,

Anthony Liguori

> [...]
>
>
Anthony Liguori - Feb. 23, 2011, 8:18 p.m.
On 02/23/2011 11:18 AM, Avi Kivity wrote:
> On 02/23/2011 06:28 PM, Anthony Liguori wrote:
>>
>>>> Well specifically, it has to ask QEMU and QEMU can tell it the 
>>>> current state via a nice structured data format over QMP.  It's a 
>>>> hell of a lot easier than the management tool trying to do this 
>>>> outside of QEMU.
>>>
>>> So, if qemu crashes, the management tool has to start it up to find 
>>> out what the current state is.
>>
>> Depends on how opaque we make the state file.  I've been thinking a 
>> simple ini syntax with a well supported set of keys.  In that case, a 
>> management tool can read it without starting QEMU.
>
> Then the management stack has to worry about yet another way of 
> interacting via qemu.

{ 'StateItem': { 'key': 'str', 'value': 'str' } }
{ 'StateSection': { 'kind': 'str', 'name': 'str', 'items': [ 'StateItem' 
] } }
{ 'StateInfo': { 'sections': [ 'StateSection' ] } }

{ 'query-state', {}, {}, 'StateInfo' }

A management tool never need to worry about anything other than this 
command if it so chooses.  If we have the pre-machine init mode for 
0.16, then this can even be used to inspect state without running a guest.

The fact that the state is visible in the filesystem is an 
implementation detail.

>   I'd like to limit it to the monitor.
>
>>>
>>> Doesn't the stateful non-config file becomes a failure point?  It 
>>> has to be on shared and redundant storage?
>>
>> It depends on what your availability model is and how frequently your 
>> management tool backs up the config.  As of right now, we have a 
>> pretty glaring reliability hole here so adding a stateful 
>> "non-config" can only improve things.
>
> I think the solutions I pointed out close the hole with the existing 
> interfaces.

It doesn't work for eject unless you interpose an acknowledged event.  
Ultimately, this is a simple problem.  If you want reliability, we 
either need symmetric RPCs so that the device model can call (and wait) 
to the management layer to acknowledge a change or QEMU can post an 
event to the management layer, and maintain the state in a reliable fashion.

>>
>>> To me, it seems a lot easier to require management to replay any 
>>> commands that hadn't been acknowledged (due to management failure), 
>>> or to query qemu as to its current state (if it is alive). 
>>
>> You still have the race condition around guest initiated events like 
>> eject.  Unless you have an acknowledged event from a management tool 
>> (which we can't do in QMP today) whereas you don't complete the guest 
>> initiated eject operation until management ack's it, we need to store 
>> that state ourself.
>
> I don't see why.
>
> If management crashes, it queries the eject state when it reconnects 
> to qemu.
> If qemu crashes, the eject state is lost, but that is fine.  My CD-ROM 
> drive tray pulls itself in when the machine is started.

Pick any of a number of possible events that change the machine's 
state.  We can wave our hands at some things saying they don't matter 
and do one off solutions for others, or we can just have a robust way of 
handling this consistently.

>>
>> I don't like the idea of making a management tool such an integral 
>> part of the functional paths. 
>
> I agree that we don't want qemu to wait on the management stack any 
> more than necessary.
>
>> Not having a stateful config file also means that this problem isn't 
>> solved in any form without a really sophisticated management stack.  
>> I'm a big fan of being robust in the face of not-so sophisticated 
>> management tools.
>
> You're introducing the need for additional code in the management 
> layer, the care and feeding for the stateful non-config file.

If a management layer ignores the stateful non-config file, as you like 
to call it, it'll get the same semantics it has today.  I think managing 
a single thing is a whole lot easier than managing an NVRAM file, a 
block migration layering file, and all of the future things we're going 
to add once we decide they are important too.

>>> If qemu crashes, these events are meaningless.  If management 
>>> crashes, it has to query qemu for all state that it wants to keep 
>>> track of via events.
>>
>> Think power failure, not qemu crash.  In the event of a power 
>> failure, any hardware change initiated by the guest ought to be 
>> consistent with when the guest has restarted.  If you eject the CDROM 
>> tray and then lose power, its still ejected after the power comes 
>> back on.
>
> Not on all machines.
>
> Let's list guest state which is independent of power.  That would be 
> wither NVRAM of various types, or physical alterations.  CD-ROM eject 
> is one.  Are there others?

Any indirect qemu state.  Block migration is an example, but other 
examples would be VNC server information (like current password), WCE 
setting (depending on whether we modelled eeprom for the drivers), and 
persisted device settings (lots of devices have eeprom these days).

>>
>>>>
>>>> I think the nature of a posted event management interface is such 
>>>> that we need a stateful config that persists across QEMU invocations.
>>>
>>> I'm not convinced, and I think making qemu manage even more state 
>>> creates more problems.
>>
>> Well this patch series is making qemu management more state.  The 
>> only question is whether we do this as a one-off mechanism or whether 
>> we architect a general mechanism to do it.
>>
>> How much state we store can always be up for discussion but I think 
>> it's undeniable that we need to store more state than we're storing 
>> today (none).
>
> I think my solution (multiplexing block format driver) fits the 
> requirements for live-copy perfectly.  In fact it has a name - it's a 
> RAID-1 driver started in degraded mode.  It could be useful other use 
> cases.

It feels a bit awkward to me to be honest.

Regards,

Anthony Liguori
Marcelo Tosatti - Feb. 23, 2011, 8:44 p.m.
On Wed, Feb 23, 2011 at 02:18:31PM -0600, Anthony Liguori wrote:
> On 02/23/2011 11:18 AM, Avi Kivity wrote:
> >On 02/23/2011 06:28 PM, Anthony Liguori wrote:
> >>
> >>>>Well specifically, it has to ask QEMU and QEMU can tell it
> >>>>the current state via a nice structured data format over
> >>>>QMP.  It's a hell of a lot easier than the management tool
> >>>>trying to do this outside of QEMU.
> >>>
> >>>So, if qemu crashes, the management tool has to start it up to
> >>>find out what the current state is.
> >>
> >>Depends on how opaque we make the state file.  I've been
> >>thinking a simple ini syntax with a well supported set of keys.
> >>In that case, a management tool can read it without starting
> >>QEMU.
> >
> >Then the management stack has to worry about yet another way of
> >interacting via qemu.
> 
> { 'StateItem': { 'key': 'str', 'value': 'str' } }
> { 'StateSection': { 'kind': 'str', 'name': 'str', 'items': [
> 'StateItem' ] } }
> { 'StateInfo': { 'sections': [ 'StateSection' ] } }
> 
> { 'query-state', {}, {}, 'StateInfo' }
> 
> A management tool never need to worry about anything other than this
> command if it so chooses.  If we have the pre-machine init mode for
> 0.16, then this can even be used to inspect state without running a
> guest.
> 
> The fact that the state is visible in the filesystem is an
> implementation detail.
> 
> >  I'd like to limit it to the monitor.
> >
> >>>
> >>>Doesn't the stateful non-config file becomes a failure point?
> >>>It has to be on shared and redundant storage?
> >>
> >>It depends on what your availability model is and how frequently
> >>your management tool backs up the config.  As of right now, we
> >>have a pretty glaring reliability hole here so adding a stateful
> >>"non-config" can only improve things.
> >
> >I think the solutions I pointed out close the hole with the
> >existing interfaces.
> 
> It doesn't work for eject unless you interpose an acknowledged
> event.  Ultimately, this is a simple problem.  If you want
> reliability, we either need symmetric RPCs so that the device model
> can call (and wait) to the management layer to acknowledge a change
> or QEMU can post an event to the management layer, and maintain the
> state in a reliable fashion.
> 
> >>
> >>>To me, it seems a lot easier to require management to replay
> >>>any commands that hadn't been acknowledged (due to management
> >>>failure), or to query qemu as to its current state (if it is
> >>>alive).
> >>
> >>You still have the race condition around guest initiated events
> >>like eject.  Unless you have an acknowledged event from a
> >>management tool (which we can't do in QMP today) whereas you
> >>don't complete the guest initiated eject operation until
> >>management ack's it, we need to store that state ourself.
> >
> >I don't see why.
> >
> >If management crashes, it queries the eject state when it
> >reconnects to qemu.
> >If qemu crashes, the eject state is lost, but that is fine.  My
> >CD-ROM drive tray pulls itself in when the machine is started.
> 
> Pick any of a number of possible events that change the machine's
> state.  We can wave our hands at some things saying they don't
> matter and do one off solutions for others, or we can just have a
> robust way of handling this consistently.
> 
> >>
> >>I don't like the idea of making a management tool such an
> >>integral part of the functional paths.
> >
> >I agree that we don't want qemu to wait on the management stack
> >any more than necessary.
> >
> >>Not having a stateful config file also means that this problem
> >>isn't solved in any form without a really sophisticated
> >>management stack.  I'm a big fan of being robust in the face of
> >>not-so sophisticated management tools.
> >
> >You're introducing the need for additional code in the management
> >layer, the care and feeding for the stateful non-config file.
> 
> If a management layer ignores the stateful non-config file, as you
> like to call it, it'll get the same semantics it has today.  I think
> managing a single thing is a whole lot easier than managing an NVRAM
> file, a block migration layering file, and all of the future things
> we're going to add once we decide they are important too.
> 
> >>>If qemu crashes, these events are meaningless.  If management
> >>>crashes, it has to query qemu for all state that it wants to
> >>>keep track of via events.
> >>
> >>Think power failure, not qemu crash.  In the event of a power
> >>failure, any hardware change initiated by the guest ought to be
> >>consistent with when the guest has restarted.  If you eject the
> >>CDROM tray and then lose power, its still ejected after the
> >>power comes back on.
> >
> >Not on all machines.
> >
> >Let's list guest state which is independent of power.  That would
> >be wither NVRAM of various types, or physical alterations.  CD-ROM
> >eject is one.  Are there others?
> 
> Any indirect qemu state.  Block migration is an example, but other
> examples would be VNC server information (like current password),
> WCE setting (depending on whether we modelled eeprom for the
> drivers), and persisted device settings (lots of devices have eeprom
> these days).

Agreed.

Why a separate location for this "stateful non-config" section, however?

The state in question (backing image property, device presence, VNC
info, etc) is already represented in either host or guest configuration,
so why not simply expose that?
Anthony Liguori - Feb. 23, 2011, 9:41 p.m.
On 02/23/2011 02:44 PM, Marcelo Tosatti wrote:
>
>> Any indirect qemu state.  Block migration is an example, but other
>> examples would be VNC server information (like current password),
>> WCE setting (depending on whether we modelled eeprom for the
>> drivers), and persisted device settings (lots of devices have eeprom
>> these days).
>>      
> Agreed.
>
> Why a separate location for this "stateful non-config" section, however?
>
> The state in question (backing image property, device presence, VNC
> info, etc) is already represented in either host or guest configuration,
> so why not simply expose that?
>    

Hrm, are you suggesting that the "stateful non-config" be hidden 
directly from the user such that only existing monitor interfaces are 
used to query it's contents?

I'll have to think about that a bit.  Obviously, the existing commands 
would still be authoritative even with a query-state command.  I think 
the only question is whether there's value in making this totally opaque.

Regards,

Anthony Liguori
James Brown - Feb. 24, 2011, 5:41 a.m.
Who I can do it?
Markus Armbruster - Feb. 24, 2011, 7:37 a.m.
Anthony Liguori <anthony@codemonkey.ws> writes:

> On 02/23/2011 11:18 AM, Avi Kivity wrote:
>> On 02/23/2011 06:28 PM, Anthony Liguori wrote:
>>>
>>>>> Well specifically, it has to ask QEMU and QEMU can tell it the
>>>>> current state via a nice structured data format over QMP.  It's a
>>>>> hell of a lot easier than the management tool trying to do this
>>>>> outside of QEMU.
>>>>
>>>> So, if qemu crashes, the management tool has to start it up to
>>>> find out what the current state is.
>>>
>>> Depends on how opaque we make the state file.  I've been thinking a
>>> simple ini syntax with a well supported set of keys.  In that case,
>>> a management tool can read it without starting QEMU.
>>
>> Then the management stack has to worry about yet another way of
>> interacting via qemu.
>
> { 'StateItem': { 'key': 'str', 'value': 'str' } }
> { 'StateSection': { 'kind': 'str', 'name': 'str', 'items': [
> StateItem' ] } }
> { 'StateInfo': { 'sections': [ 'StateSection' ] } }
>
> { 'query-state', {}, {}, 'StateInfo' }
>
> A management tool never need to worry about anything other than this
> command if it so chooses.  If we have the pre-machine init mode for
> 0.16, then this can even be used to inspect state without running a
> guest.
>
> The fact that the state is visible in the filesystem is an
> implementation detail.
>
>>   I'd like to limit it to the monitor.
>>
>>>>
>>>> Doesn't the stateful non-config file becomes a failure point?  It
>>>> has to be on shared and redundant storage?
>>>
>>> It depends on what your availability model is and how frequently
>>> your management tool backs up the config.  As of right now, we have
>>> a pretty glaring reliability hole here so adding a stateful
>>> "non-config" can only improve things.
>>
>> I think the solutions I pointed out close the hole with the existing
>> interfaces.
>
> It doesn't work for eject unless you interpose an acknowledged event.
> Ultimately, this is a simple problem.  If you want reliability, we
> either need symmetric RPCs so that the device model can call (and
> wait) to the management layer to acknowledge a change or QEMU can post
> an event to the management layer, and maintain the state in a reliable
> fashion.
>
>>>
>>>> To me, it seems a lot easier to require management to replay any
>>>> commands that hadn't been acknowledged (due to management
>>>> failure), or to query qemu as to its current state (if it is
>>>> alive). 
>>>
>>> You still have the race condition around guest initiated events
>>> like eject.  Unless you have an acknowledged event from a
>>> management tool (which we can't do in QMP today) whereas you don't
>>> complete the guest initiated eject operation until management ack's
>>> it, we need to store that state ourself.
>>
>> I don't see why.
>>
>> If management crashes, it queries the eject state when it reconnects
>> to qemu.
>> If qemu crashes, the eject state is lost, but that is fine.  My
>> CD-ROM drive tray pulls itself in when the machine is started.
>
> Pick any of a number of possible events that change the machine's
> state.  We can wave our hands at some things saying they don't matter
> and do one off solutions for others, or we can just have a robust way
> of handling this consistently.

What machine state is to be preserved across crashes?  Maybe I'm naive,
but I can't think of anything but non-volatile device memory,
e.g. disks, RTC CMOS, EEPROM.

I can't quite see why we need conceptually different mechanisms for
disks and all the rest.

[...]
Avi Kivity - Feb. 24, 2011, 8:54 a.m.
On 02/23/2011 10:18 PM, Anthony Liguori wrote:
>> Then the management stack has to worry about yet another way of 
>> interacting via qemu.
>
>
> { 'StateItem': { 'key': 'str', 'value': 'str' } }
> { 'StateSection': { 'kind': 'str', 'name': 'str', 'items': [ 
> 'StateItem' ] } }
> { 'StateInfo': { 'sections': [ 'StateSection' ] } }
>
> { 'query-state', {}, {}, 'StateInfo' }
>
> A management tool never need to worry about anything other than this 
> command if it so chooses.  If we have the pre-machine init mode for 
> 0.16, then this can even be used to inspect state without running a 
> guest.

So we have yet another information tree.  If we store the cd-rom eject 
state here, then we need to make an association between the device path 
of the cd-rom, and the StateItem key.

Far better to store it in the device itself.  For example, we could make 
a layered block format driver that stores the eject state and a "backing 
file" containing the actual media.  Eject and media change would be 
recorded in the block format driver's state.  You could then hot-unplug 
a USB cd-writer and hot-plug it back into a different guest, 
implementing a virtual sneakernet.

>
> The fact that the state is visible in the filesystem is an 
> implementation detail.

A detail that has to be catered for by the management stack - it has to 
provide a safe place for it, back it up, etc.

>
>>   I'd like to limit it to the monitor.
>>
>>>>
>>>> Doesn't the stateful non-config file becomes a failure point?  It 
>>>> has to be on shared and redundant storage?
>>>
>>> It depends on what your availability model is and how frequently 
>>> your management tool backs up the config.  As of right now, we have 
>>> a pretty glaring reliability hole here so adding a stateful 
>>> "non-config" can only improve things.
>>
>> I think the solutions I pointed out close the hole with the existing 
>> interfaces.
>
> It doesn't work for eject unless you interpose an acknowledged event.  
> Ultimately, this is a simple problem.  If you want reliability, we 
> either need symmetric RPCs so that the device model can call (and 
> wait) to the management layer to acknowledge a change or QEMU can post 
> an event to the management layer, and maintain the state in a reliable 
> fashion.

I don't see why it doesn't work.  Please explain.

>>> You still have the race condition around guest initiated events like 
>>> eject.  Unless you have an acknowledged event from a management tool 
>>> (which we can't do in QMP today) whereas you don't complete the 
>>> guest initiated eject operation until management ack's it, we need 
>>> to store that state ourself.
>>
>> I don't see why.
>>
>> If management crashes, it queries the eject state when it reconnects 
>> to qemu.
>> If qemu crashes, the eject state is lost, but that is fine.  My 
>> CD-ROM drive tray pulls itself in when the machine is started.
>
> Pick any of a number of possible events that change the machine's 
> state.  We can wave our hands at some things saying they don't matter 
> and do one off solutions for others, or we can just have a robust way 
> of handling this consistently.

Both block live copy and cd-rom eject state can be solved with layered 
block format drivers.  I don't think a central place for random data 
makes sense.  State belongs near the device that maintains it, esp. if 
the device is hot-pluggable, so it's easy to associate the state with 
the device.

>>
>> You're introducing the need for additional code in the management 
>> layer, the care and feeding for the stateful non-config file.
>
> If a management layer ignores the stateful non-config file, as you 
> like to call it, it'll get the same semantics it has today.  I think 
> managing a single thing is a whole lot easier than managing an NVRAM 
> file, a block migration layering file, and all of the future things 
> we're going to add once we decide they are important too.

I disagree.  Storing NVRAM as a disk image is a simple extension of 
existing management tools.  Block live-copy and cd-rom eject state also 
make sense as per-image state if you take hotunplug and hotplug into 
account.

>
>>>> If qemu crashes, these events are meaningless.  If management 
>>>> crashes, it has to query qemu for all state that it wants to keep 
>>>> track of via events.
>>>
>>> Think power failure, not qemu crash.  In the event of a power 
>>> failure, any hardware change initiated by the guest ought to be 
>>> consistent with when the guest has restarted.  If you eject the 
>>> CDROM tray and then lose power, its still ejected after the power 
>>> comes back on.
>>
>> Not on all machines.
>>
>> Let's list guest state which is independent of power.  That would be 
>> wither NVRAM of various types, or physical alterations.  CD-ROM eject 
>> is one.  Are there others?
>
> Any indirect qemu state.  Block migration is an example, but other 
> examples would be VNC server information (like current password), WCE 
> setting (depending on whether we modelled eeprom for the drivers), and 
> persisted device settings (lots of devices have eeprom these days).

Device settings should be stored with the devices, not with qemu.

Suppose we take the cold-plug on startup via the monitor approach.  So 
we start with a bare machine, cold plug stuff into it.  Now qemu has to 
reconcile the stateful non-config file with the hardware.  What if 
something has changed?  A device moved into a different slot?

If a network card has eeprom, we can specify it with -device 
rtl8139,eeprom=id, where id specifies a disk image for the eeprom.

>> I think my solution (multiplexing block format driver) fits the 
>> requirements for live-copy perfectly.  In fact it has a name - it's a 
>> RAID-1 driver started in degraded mode.  It could be useful other use 
>> cases.
>
> It feels a bit awkward to me to be honest.
>

Not to me.
Avi Kivity - Feb. 24, 2011, 8:58 a.m.
On 02/23/2011 07:49 PM, Marcelo Tosatti wrote:
> On Wed, Feb 23, 2011 at 03:01:14PM +0200, Avi Kivity wrote:
> >  On 02/23/2011 01:14 AM, Anthony Liguori wrote:
> >  >
> >  >-drive already ties into the qemuopts infrastructure and we have
> >  >readconfig and writeconfig.  I don't think we're missing any major
> >  >pieces to do this in a more proper fashion.
> >
> >  The problem with qemu config files is that it splits the
> >  authoritative source of where images are stored into two.  Is it in
> >  the management tool's database or is it in qemu's config file?
> >
> >  For the problem at hand, one solution is to make qemu stop after the
> >  copy, and then management can issue an additional command to
> >  rearrange the disk and resume the guest.  A drawback here is that if
> >  management dies, the guest is stopped until it restarts.  We also
> >  make management latency guest visible, even if it doesn't die at an
> >  inconvenient place.
> >
> >  An alternative approach is to have the copy be performed by a new
> >  layered block format driver:
> >
> >  - create a new image, type = live-copy, containing three pieces of
> >  information
> >     - source image
> >     - destination image
> >     - copy state (initially nothing is copied)
> >  - tell qemu switch to the new image
> >  - qemu starts copying, updates copy state as needed
> >  - copy finishes, event is emitted; reads and writes still serviced
> >  - management receives event, switches qemu to destination image
> >  - management removes live-copy image
> >
> >  If management dies while this is happening, it can simply query the
> >  state of the copy.
> >  Similarly, if qemu dies, the copy state is persistent (could be 0/1 or
> >  real range of blocks).
>
> You don't know if a given block is uptodate or not without the dirty
> bitmap. So unless you also keep track of dirty log somehow, this is
> meaningless.

First, you no longer need the dirty bitmap.  Since all writes go through 
the layered block format driver, you know first-hand what is dirty and 
what isn't.

> So a commit file as proposed indicates copy state (in 0/1 fashion). The
> difference in your proposal is that such information is stored inside a
> special purpose image format?
>

It could also store the already synced range.

The difference is that the file is self contained.  You could hot-unplug 
the image and hot-plug it later (continuing the copy with qemu-img), or 
live migrate it.  In fact I think a qemu RAID-1 driver removes the 
restriction that you can't live-migrate and live-copy simultaneously.
Stefan Hajnoczi - Feb. 24, 2011, 10 a.m.
On Thu, Feb 24, 2011 at 5:41 AM, James Brown <jbrownfirst@gmail.com> wrote:
> Who I can do it?

Please see http://lists.nongnu.org/mailman/listinfo/qemu-devel for
info on how to unsubscribe.

Stefan
Markus Armbruster - Feb. 24, 2011, 12:15 p.m.
Anthony Liguori <anthony@codemonkey.ws> writes:

> On 02/23/2011 11:26 AM, Markus Armbruster wrote:
>>> I don't think it's reasonable to have three different ways to interact
>>> with qemu, all needed: the command line, reading and writing the
>>> stateful config file, and the monitor.  I'd rather push for starting
>>> qemu with a blank guest and assembling (cold-plugging) all the
>>> hardware via the monitor before starting the guest.
>>>      
>> Exactly.  And qdev has brought his within our reach.
>>    
>
> Actually, QMP is the star of the show here, not qdev.
>
> The way we get here is by incrementally converting the option handling
> to be qmp calls.  For instance, instead of:
>
> case QEMU_OPTION_name:
>       qemu_name = optarg;
>       break;
>
> We do:
>
> case QEMU_OPTION_name:
>        qmp_set_name(optarg, NULL);
>        break;
>
> When we can compile vl.c with nothing more than QMP dependencies,
> we've achieved the goals here.  This will mean adding a lot of new QMP
> commands.
>
> There are some command line options that must be handled before the
> machine is initialized and QMP is normally run.  For 0.16, we can
> introduce a new QMP mode whereas the event loop gets to run before
> doing machine init and explicit create_machine() command is needed.
> This is the final bit that will be needed to realize this goal.
>
> A lot of good things come out of this.  Quite a lot of these new
> commands don't strictly need to run before machine init (like -name)
> which means we'll get the ability to change a lot of parameters
> without rebooting the guest which couldn't be changed before.
>
> And this is all incremental stuff that can be done in parallel of the
> QAPI work.  We just need to do the work of adding the function calls
> (or function call wrappers where appropriate).

Well, I wouldn't put "let's create a sane separation between option
parsing and machine configuration" under the "QEMU *Monitor* Protocol"
flag, but I certainly don't mind you hoisting whatever flag pleases you.
Marcelo Tosatti - Feb. 24, 2011, 2:39 p.m.
On Wed, Feb 23, 2011 at 03:41:30PM -0600, Anthony Liguori wrote:
> On 02/23/2011 02:44 PM, Marcelo Tosatti wrote:
> >
> >>Any indirect qemu state.  Block migration is an example, but other
> >>examples would be VNC server information (like current password),
> >>WCE setting (depending on whether we modelled eeprom for the
> >>drivers), and persisted device settings (lots of devices have eeprom
> >>these days).
> >Agreed.
> >
> >Why a separate location for this "stateful non-config" section, however?
> >
> >The state in question (backing image property, device presence, VNC
> >info, etc) is already represented in either host or guest configuration,
> >so why not simply expose that?
> 
> Hrm, are you suggesting that the "stateful non-config" be hidden
> directly from the user such that only existing monitor interfaces
> are used to query it's contents?

Yes. So that the guest can be restarted with the same VM parameters.

> I'll have to think about that a bit.  Obviously, the existing
> commands would still be authoritative even with a query-state
> command.  I think the only question is whether there's value in
> making this totally opaque.

Why you need a query-state command at all?

> 
> Regards,
> 
> Anthony Liguori
>
Anthony Liguori - Feb. 24, 2011, 3 p.m.
On 02/24/2011 02:54 AM, Avi Kivity wrote:
> On 02/23/2011 10:18 PM, Anthony Liguori wrote:
>>> Then the management stack has to worry about yet another way of 
>>> interacting via qemu.
>>
>>
>> { 'StateItem': { 'key': 'str', 'value': 'str' } }
>> { 'StateSection': { 'kind': 'str', 'name': 'str', 'items': [ 
>> 'StateItem' ] } }
>> { 'StateInfo': { 'sections': [ 'StateSection' ] } }
>>
>> { 'query-state', {}, {}, 'StateInfo' }
>>
>> A management tool never need to worry about anything other than this 
>> command if it so chooses.  If we have the pre-machine init mode for 
>> 0.16, then this can even be used to inspect state without running a 
>> guest.
>
> So we have yet another information tree.  If we store the cd-rom eject 
> state here, then we need to make an association between the device 
> path of the cd-rom, and the StateItem key.

And this linkage is key.

Let's say I launch QEMU with:

qemu -cdrom ~/foo.img

And then in the monitor, I do:

(qemu) eject ide1-cd0

The question is, what command can I now use to launch the same qemu 
instance?

When I think of stateful config, what I really think of is a way to spit 
out a command line that essentially becomes, "this is how you now launch 
QEMU".

In this case, it would be:

qemu -cdrom ~/foo.img -device ide-disk,id=ide1-cd0,drive=

Or, we could think of this in terms of:

qemu -cdrom ~/foo.img -readconfig foo.cfg

Where foo.cfg contained:

[device "ide1-cd0"]
driver="ide-disk"
drive=""

So what I'm really suggesting is that we generate foo.cfg whenever 
monitor commands do things that change the command line and introduce a 
new option to reflect this, IOW:

qemu -cdrom ~/foo.img -config foo.cfg


> Far better to store it in the device itself.  For example, we could 
> make a layered block format driver that stores the eject state and a 
> "backing file" containing the actual media.  Eject and media change 
> would be recorded in the block format driver's state.  You could then 
> hot-unplug a USB cd-writer and hot-plug it back into a different 
> guest, implementing a virtual sneakernet.

I think you're far too hung up on "store it in the device itself".  The 
recipe to create the device model is not intrinsic to the device model.  
It's an independent thing that's a combination of the command line 
arguments and any executed monitor commands.

Maybe a better way to think about the stateful config file is a 
mechanism to replay the monitor history.

>>
>> The fact that the state is visible in the filesystem is an 
>> implementation detail.
>
> A detail that has to be catered for by the management stack - it has 
> to provide a safe place for it, back it up, etc.

If it cares for QEMU to preserve state.  Today, this all gets thrown away.

>> It doesn't work for eject unless you interpose an acknowledged 
>> event.  Ultimately, this is a simple problem.  If you want 
>> reliability, we either need symmetric RPCs so that the device model 
>> can call (and wait) to the management layer to acknowledge a change 
>> or QEMU can post an event to the management layer, and maintain the 
>> state in a reliable fashion.
>
> I don't see why it doesn't work.  Please explain.

1) guest eject
2) qemu posts eject event
3) qemu acknowledges eject to the guest
4) management tool sees eject event and updates guest config

There's a race between 3 & 4.  It can only be addressed by interposing 4 
between 2 and 3 OR making qemu persist this state between 2 and 3 such 
that the management tool can reliably query it.

>>>> You still have the race condition around guest initiated events 
>>>> like eject.  Unless you have an acknowledged event from a 
>>>> management tool (which we can't do in QMP today) whereas you don't 
>>>> complete the guest initiated eject operation until management ack's 
>>>> it, we need to store that state ourself.
>>>
>>> I don't see why.
>>>
>>> If management crashes, it queries the eject state when it reconnects 
>>> to qemu.
>>> If qemu crashes, the eject state is lost, but that is fine.  My 
>>> CD-ROM drive tray pulls itself in when the machine is started.
>>
>> Pick any of a number of possible events that change the machine's 
>> state.  We can wave our hands at some things saying they don't matter 
>> and do one off solutions for others, or we can just have a robust way 
>> of handling this consistently.
>
> Both block live copy and cd-rom eject state can be solved with layered 
> block format drivers.  I don't think a central place for random data 
> makes sense.  State belongs near the device that maintains it, esp. if 
> the device is hot-pluggable, so it's easy to associate the state with 
> the device.
>
>>>
>>> You're introducing the need for additional code in the management 
>>> layer, the care and feeding for the stateful non-config file.
>>
>> If a management layer ignores the stateful non-config file, as you 
>> like to call it, it'll get the same semantics it has today.  I think 
>> managing a single thing is a whole lot easier than managing an NVRAM 
>> file, a block migration layering file, and all of the future things 
>> we're going to add once we decide they are important too.
>
> I disagree.  Storing NVRAM as a disk image is a simple extension of 
> existing management tools.  Block live-copy and cd-rom eject state 
> also make sense as per-image state if you take hotunplug and hotplug 
> into account.

Everything can be stored in a block driver but when the data is highly 
structured, isn't it nice to expose it in a structured, human readable 
way?  I know I'd personally prefer a text representation of CMOS than a 
binary blob.

>>
>>>>> If qemu crashes, these events are meaningless.  If management 
>>>>> crashes, it has to query qemu for all state that it wants to keep 
>>>>> track of via events.
>>>>
>>>> Think power failure, not qemu crash.  In the event of a power 
>>>> failure, any hardware change initiated by the guest ought to be 
>>>> consistent with when the guest has restarted.  If you eject the 
>>>> CDROM tray and then lose power, its still ejected after the power 
>>>> comes back on.
>>>
>>> Not on all machines.
>>>
>>> Let's list guest state which is independent of power.  That would be 
>>> wither NVRAM of various types, or physical alterations.  CD-ROM 
>>> eject is one.  Are there others?
>>
>> Any indirect qemu state.  Block migration is an example, but other 
>> examples would be VNC server information (like current password), WCE 
>> setting (depending on whether we modelled eeprom for the drivers), 
>> and persisted device settings (lots of devices have eeprom these days).
>
> Device settings should be stored with the devices, not with qemu.
>
> Suppose we take the cold-plug on startup via the monitor approach.  So 
> we start with a bare machine, cold plug stuff into it.  Now qemu has 
> to reconcile the stateful non-config file with the hardware.  What if 
> something has changed?  A device moved into a different slot?

Sorry, I'm confused.  Is there anything in the stateful config file when 
we start up?  If so, the act of starting up will add a bunch of hardware.

> If a network card has eeprom, we can specify it with -device 
> rtl8139,eeprom=id, where id specifies a disk image for the eeprom.

We could, but then we'll end up with a bunch of little block devices.  
That seems less than ideal to me.

Technically, mac address is stored on eeprom and we store that as a 
device property today.  We can't persist device properties even though 
you can change the mac address of a network card and it does persist 
across reboots.  Are you advocating that we introduce an eeprom for 
every network card (all in a slightly different format) and have special 
tools to manipulate the eeprom to store and view the mac address?

Regards,

Anthony Liguori

>>> I think my solution (multiplexing block format driver) fits the 
>>> requirements for live-copy perfectly.  In fact it has a name - it's 
>>> a RAID-1 driver started in degraded mode.  It could be useful other 
>>> use cases.
>>
>> It feels a bit awkward to me to be honest.
>>
>
> Not to me.
>
Marcelo Tosatti - Feb. 24, 2011, 3:14 p.m.
On Thu, Feb 24, 2011 at 10:58:10AM +0200, Avi Kivity wrote:
> On 02/23/2011 07:49 PM, Marcelo Tosatti wrote:
> >On Wed, Feb 23, 2011 at 03:01:14PM +0200, Avi Kivity wrote:
> >>  On 02/23/2011 01:14 AM, Anthony Liguori wrote:
> >>  >
> >>  >-drive already ties into the qemuopts infrastructure and we have
> >>  >readconfig and writeconfig.  I don't think we're missing any major
> >>  >pieces to do this in a more proper fashion.
> >>
> >>  The problem with qemu config files is that it splits the
> >>  authoritative source of where images are stored into two.  Is it in
> >>  the management tool's database or is it in qemu's config file?
> >>
> >>  For the problem at hand, one solution is to make qemu stop after the
> >>  copy, and then management can issue an additional command to
> >>  rearrange the disk and resume the guest.  A drawback here is that if
> >>  management dies, the guest is stopped until it restarts.  We also
> >>  make management latency guest visible, even if it doesn't die at an
> >>  inconvenient place.
> >>
> >>  An alternative approach is to have the copy be performed by a new
> >>  layered block format driver:
> >>
> >>  - create a new image, type = live-copy, containing three pieces of
> >>  information
> >>     - source image
> >>     - destination image
> >>     - copy state (initially nothing is copied)
> >>  - tell qemu switch to the new image

There is a similar situation with atomicity here. Mgmt app requests a
switch and dies immediately, before receiving the command reply. Qemu
crashes. Which image is the uptodate one, source or live-copy?

> >>  - qemu starts copying, updates copy state as needed
> >>  - copy finishes, event is emitted; reads and writes still serviced
> >>  - management receives event, switches qemu to destination image
> >>  - management removes live-copy image
> >>
> >>  If management dies while this is happening, it can simply query the
> >>  state of the copy.
> >>  Similarly, if qemu dies, the copy state is persistent (could be 0/1 or
> >>  real range of blocks).
> >
> >You don't know if a given block is uptodate or not without the dirty
> >bitmap. So unless you also keep track of dirty log somehow, this is
> >meaningless.
> 
> First, you no longer need the dirty bitmap.  Since all writes go
> through the layered block format driver, you know first-hand what is
> dirty and what isn't.
> 
> >So a commit file as proposed indicates copy state (in 0/1 fashion). The
> >difference in your proposal is that such information is stored inside a
> >special purpose image format?
> >
> 
> It could also store the already synced range.
> 
> The difference is that the file is self contained.  


> You could hot-unplug the image and hot-plug it later (continuing the
> copy with qemu-img),

Then there's no need for live copy. qemu-img does that already.

> or live migrate it. 

You can live migrate (but not live migrate with live block migration)
with live copy in progress, its just that its not supported yet.

> In fact I think a qemu RAID-1 driver
> removes the restriction that you can't live-migrate and live-copy
> simultaneously.

As mentioned its just an implementation detail.
Avi Kivity - Feb. 24, 2011, 3:22 p.m.
On 02/24/2011 05:00 PM, Anthony Liguori wrote:
> On 02/24/2011 02:54 AM, Avi Kivity wrote:
>> On 02/23/2011 10:18 PM, Anthony Liguori wrote:
>>>> Then the management stack has to worry about yet another way of 
>>>> interacting via qemu.
>>>
>>>
>>> { 'StateItem': { 'key': 'str', 'value': 'str' } }
>>> { 'StateSection': { 'kind': 'str', 'name': 'str', 'items': [ 
>>> 'StateItem' ] } }
>>> { 'StateInfo': { 'sections': [ 'StateSection' ] } }
>>>
>>> { 'query-state', {}, {}, 'StateInfo' }
>>>
>>> A management tool never need to worry about anything other than this 
>>> command if it so chooses.  If we have the pre-machine init mode for 
>>> 0.16, then this can even be used to inspect state without running a 
>>> guest.
>>
>> So we have yet another information tree.  If we store the cd-rom 
>> eject state here, then we need to make an association between the 
>> device path of the cd-rom, and the StateItem key.
>
> And this linkage is key.
>
> Let's say I launch QEMU with:
>
> qemu -cdrom ~/foo.img
>
> And then in the monitor, I do:
>
> (qemu) eject ide1-cd0
>
> The question is, what command can I now use to launch the same qemu 
> instance?
>
> When I think of stateful config, what I really think of is a way to 
> spit out a command line that essentially becomes, "this is how you now 
> launch QEMU".
>
> In this case, it would be:
>
> qemu -cdrom ~/foo.img -device ide-disk,id=ide1-cd0,drive=
>
> Or, we could think of this in terms of:
>
> qemu -cdrom ~/foo.img -readconfig foo.cfg
>
> Where foo.cfg contained:
>
> [device "ide1-cd0"]
> driver="ide-disk"
> drive=""
>
> So what I'm really suggesting is that we generate foo.cfg whenever 
> monitor commands do things that change the command line and introduce 
> a new option to reflect this, IOW:
>
> qemu -cdrom ~/foo.img -config foo.cfg

If you move the cdrom to a different IDE channel, you have to update the 
stateful non-config file.

Whereas if you do

    $ qemu-img create -f cd-tray -b ~/foo.img ~/foo-media-tray.img
    $ qemu -cdrom ~/foo-media-tray.img

the cd-rom tray state will be tracked in the image file.

>
>> Far better to store it in the device itself.  For example, we could 
>> make a layered block format driver that stores the eject state and a 
>> "backing file" containing the actual media.  Eject and media change 
>> would be recorded in the block format driver's state.  You could then 
>> hot-unplug a USB cd-writer and hot-plug it back into a different 
>> guest, implementing a virtual sneakernet.
>
> I think you're far too hung up on "store it in the device itself".  
> The recipe to create the device model is not intrinsic to the device 
> model.  It's an independent thing that's a combination of the command 
> line arguments and any executed monitor commands.
>
> Maybe a better way to think about the stateful config file is a 
> mechanism to replay the monitor history.

Again the question is who is the authoritative source of the 
configuration.  Is it the management tool or is it qemu?

The management tool already has to keep track of (the optional parts of) 
the guest device tree.  It cannot start reading the stateful non-config 
file at random points in time.  So all that is left is the guest 
controlled portions of the device tree, which are pretty rare, and 
random events like live-copy migration.  I think that introducing a new 
authoritative source of information will create a lot of problems.

>>>
>>> The fact that the state is visible in the filesystem is an 
>>> implementation detail.
>>
>> A detail that has to be catered for by the management stack - it has 
>> to provide a safe place for it, back it up, etc.
>
> If it cares for QEMU to preserve state.  Today, this all gets thrown 
> away.

Right, but we should make it easy, not hard.

>
>>> It doesn't work for eject unless you interpose an acknowledged 
>>> event.  Ultimately, this is a simple problem.  If you want 
>>> reliability, we either need symmetric RPCs so that the device model 
>>> can call (and wait) to the management layer to acknowledge a change 
>>> or QEMU can post an event to the management layer, and maintain the 
>>> state in a reliable fashion.
>>
>> I don't see why it doesn't work.  Please explain.
>
> 1) guest eject
> 2) qemu posts eject event
> 3) qemu acknowledges eject to the guest
> 4) management tool sees eject event and updates guest config
>
> There's a race between 3 & 4.  It can only be addressed by interposing 
> 4 between 2 and 3 OR making qemu persist this state between 2 and 3 
> such that the management tool can reliably query it.

If "it" is my cd-rom tray block format driver, it works.  It's really 
the same in action as the stateful non-config, except it's part of the 
device/image, not a central location.

>> I disagree.  Storing NVRAM as a disk image is a simple extension of 
>> existing management tools.  Block live-copy and cd-rom eject state 
>> also make sense as per-image state if you take hotunplug and hotplug 
>> into account.
>
> Everything can be stored in a block driver but when the data is highly 
> structured, isn't it nice to expose it in a structured, human readable 
> way?  I know I'd personally prefer a text representation of CMOS than 
> a binary blob.

Have a tool expose it.  Part of the range is unspecified anyway.

Using a block format driver means that we don't have to care about a 
crash during a write, that we can snapshot it, etc.

>> Device settings should be stored with the devices, not with qemu.
>>
>> Suppose we take the cold-plug on startup via the monitor approach.  
>> So we start with a bare machine, cold plug stuff into it.  Now qemu 
>> has to reconcile the stateful non-config file with the hardware.  
>> What if something has changed?  A device moved into a different slot?
>
> Sorry, I'm confused.  Is there anything in the stateful config file 
> when we start up?  If so, the act of starting up will add a bunch of 
> hardware.

Suppose it has information about ide1-cd0's media tray.  Now we restart 
qemu and cold-plug the cdrom into ide0-cd0.  What happens to the 
information?

Or do we store both the cdrom and media tray information in there?  If 
so the management stack has to edit the file to make changes (perhaps 
via the monitor).

>
>> If a network card has eeprom, we can specify it with -device 
>> rtl8139,eeprom=id, where id specifies a disk image for the eeprom.
>
> We could, but then we'll end up with a bunch of little block devices.  
> That seems less than ideal to me.
>
> Technically, mac address is stored on eeprom and we store that as a 
> device property today.  We can't persist device properties even though 
> you can change the mac address of a network card and it does persist 
> across reboots.  Are you advocating that we introduce an eeprom for 
> every network card (all in a slightly different format) and have 
> special tools to manipulate the eeprom to store and view the mac address?

Yes -- if we really want to support it.  Obviously we have to store the 
entire eeprom, not just the portion containing the MAC address, so it's 
not just a key/value store.  A card may even have its firmware in flash.
Avi Kivity - Feb. 24, 2011, 3:28 p.m.
On 02/24/2011 05:14 PM, Marcelo Tosatti wrote:
> >  >>   The problem with qemu config files is that it splits the
> >  >>   authoritative source of where images are stored into two.  Is it in
> >  >>   the management tool's database or is it in qemu's config file?
> >  >>
> >  >>   For the problem at hand, one solution is to make qemu stop after the
> >  >>   copy, and then management can issue an additional command to
> >  >>   rearrange the disk and resume the guest.  A drawback here is that if
> >  >>   management dies, the guest is stopped until it restarts.  We also
> >  >>   make management latency guest visible, even if it doesn't die at an
> >  >>   inconvenient place.
> >  >>
> >  >>   An alternative approach is to have the copy be performed by a new
> >  >>   layered block format driver:
> >  >>
> >  >>   - create a new image, type = live-copy, containing three pieces of
> >  >>   information
> >  >>      - source image
> >  >>      - destination image
> >  >>      - copy state (initially nothing is copied)
> >  >>   - tell qemu switch to the new image
>
> There is a similar situation with atomicity here. Mgmt app requests a
> switch and dies immediately, before receiving the command reply. Qemu
> crashes. Which image is the uptodate one, source or live-copy?

live-copy (or it's new name, RAID-1).  Once you've created it it is 
equivalent to source.  Once it switches to state=synced you can switch 
back to either source or destination (I guess by telling qemu to detach 
the one you don't want first, so it falls back to state=degraded).

>
> >  You could hot-unplug the image and hot-plug it later (continuing the
> >  copy with qemu-img),
>
> Then there's no need for live copy. qemu-img does that already.

It will start from the beginning.

> >  or live migrate it.
>
> You can live migrate (but not live migrate with live block migration)
> with live copy in progress, its just that its not supported yet.

A RAID-1 driver will work with block live migration too.

> >  In fact I think a qemu RAID-1 driver
> >  removes the restriction that you can't live-migrate and live-copy
> >  simultaneously.
>
> As mentioned its just an implementation detail.

I think it's an important one.  It moves the code from the generic layer 
to a driver.  It allows generic RAID-1 functionality (for high availablity).
Marcelo Tosatti - Feb. 24, 2011, 4:39 p.m.
On Thu, Feb 24, 2011 at 05:28:20PM +0200, Avi Kivity wrote:
> On 02/24/2011 05:14 PM, Marcelo Tosatti wrote:
> >>  >>   The problem with qemu config files is that it splits the
> >>  >>   authoritative source of where images are stored into two.  Is it in
> >>  >>   the management tool's database or is it in qemu's config file?
> >>  >>
> >>  >>   For the problem at hand, one solution is to make qemu stop after the
> >>  >>   copy, and then management can issue an additional command to
> >>  >>   rearrange the disk and resume the guest.  A drawback here is that if
> >>  >>   management dies, the guest is stopped until it restarts.  We also
> >>  >>   make management latency guest visible, even if it doesn't die at an
> >>  >>   inconvenient place.
> >>  >>
> >>  >>   An alternative approach is to have the copy be performed by a new
> >>  >>   layered block format driver:
> >>  >>
> >>  >>   - create a new image, type = live-copy, containing three pieces of
> >>  >>   information
> >>  >>      - source image
> >>  >>      - destination image
> >>  >>      - copy state (initially nothing is copied)
> >>  >>   - tell qemu switch to the new image
> >
> >There is a similar situation with atomicity here. Mgmt app requests a
> >switch and dies immediately, before receiving the command reply. Qemu
> >crashes. Which image is the uptodate one, source or live-copy?
> 
> live-copy (or it's new name, RAID-1).  Once you've created it it is
> equivalent to source.  Once it switches to state=synced you can
> switch back to either source or destination (I guess by telling qemu
> to detach the one you don't want first, so it falls back to
> state=degraded).
> 
> >
> >>  You could hot-unplug the image and hot-plug it later (continuing the
> >>  copy with qemu-img),
> >
> >Then there's no need for live copy. qemu-img does that already.
> 
> It will start from the beginning.
> 
> >>  or live migrate it.
> >
> >You can live migrate (but not live migrate with live block migration)
> >with live copy in progress, its just that its not supported yet.
> 
> A RAID-1 driver will work with block live migration too.

Nobody cares about that one (block copy and block live migration in
progress). In fact, i doubt anybody cares about parallel block migration
and block copy either (mgmt can easily cope with that limitation, stop
live copy if migration is needed).

> >>  In fact I think a qemu RAID-1 driver
> >>  removes the restriction that you can't live-migrate and live-copy
> >>  simultaneously.
> >
> >As mentioned its just an implementation detail.

I meant the restriction of live-migrate and live-copy in parallel.

> I think it's an important one.  It moves the code from the generic
> layer to a driver. 

Well it is a nice idea, but devil is in the details:

- Guest writes must invalidate in progress live copy reads
and live copy writes, so you have to maintain a queue for live
copy AIO.
- Live copy writes must be aware of in progress guest AIO writes, so
you have to maintain a queue for guest AIO.
- Guest writes must be mirrored to source and destination.
- qemu-img must handle this new format. 

So my view ATM is that this is overengineering.

> It allows generic RAID-1 functionality (for high
> availablity).

Isnt HA responsability of the host filesystem?
Avi Kivity - Feb. 24, 2011, 5:32 p.m.
On 02/24/2011 06:39 PM, Marcelo Tosatti wrote:
> >  >
> >  >You can live migrate (but not live migrate with live block migration)
> >  >with live copy in progress, its just that its not supported yet.
> >
> >  A RAID-1 driver will work with block live migration too.
>
> Nobody cares about that one (block copy and block live migration in
> progress). In fact, i doubt anybody cares about parallel block migration
> and block copy either (mgmt can easily cope with that limitation, stop
> live copy if migration is needed).

It's a lot better to have an orthogonal feature set where everything 
works with everything.  These sort of constraints aren't very good, 
especially as they aren't documented.  The code that deals with live 
migration and live copy may be in different places and now we force the 
management tool authors to tangle it up.

> >  >>   In fact I think a qemu RAID-1 driver
> >  >>   removes the restriction that you can't live-migrate and live-copy
> >  >>   simultaneously.
> >  >
> >  >As mentioned its just an implementation detail.
>
> I meant the restriction of live-migrate and live-copy in parallel.
>
> >  I think it's an important one.  It moves the code from the generic
> >  layer to a driver.
>
> Well it is a nice idea, but devil is in the details:
>
> - Guest writes must invalidate in progress live copy reads
> and live copy writes, so you have to maintain a queue for live
> copy AIO.

You just add the locations to a queue and fire them off again when the 
live copy write returns.

> - Live copy writes must be aware of in progress guest AIO writes, so
> you have to maintain a queue for guest AIO.
> - Guest writes must be mirrored to source and destination.

Don't you need to do this for live-copy?  Ah, you do it asynchronously 
from the dirty log.

You can do the same with the RAID-1 driver by invalidating the 
destination block instead of dual-issuing the write.

> - qemu-img must handle this new format.
>
> So my view ATM is that this is overengineering.

I believe it will actually be smaller since there's no interfacing with 
the dirty log.

> >  It allows generic RAID-1 functionality (for high
> >  availablity).
>
> Isnt HA responsability of the host filesystem?

So is sparseness, snapshots, etc.  Yet we do them in qemu.  In a perfect 
world we'd have just raw files (and use RAID over loop or something for 
live-copy).
Anthony Liguori - Feb. 24, 2011, 5:45 p.m.
On 02/24/2011 10:39 AM, Marcelo Tosatti wrote:
> On Thu, Feb 24, 2011 at 05:28:20PM +0200, Avi Kivity wrote:
>    
>> On 02/24/2011 05:14 PM, Marcelo Tosatti wrote:
>>      
>>>>   >>    The problem with qemu config files is that it splits the
>>>>   >>    authoritative source of where images are stored into two.  Is it in
>>>>   >>    the management tool's database or is it in qemu's config file?
>>>>   >>
>>>>   >>    For the problem at hand, one solution is to make qemu stop after the
>>>>   >>    copy, and then management can issue an additional command to
>>>>   >>    rearrange the disk and resume the guest.  A drawback here is that if
>>>>   >>    management dies, the guest is stopped until it restarts.  We also
>>>>   >>    make management latency guest visible, even if it doesn't die at an
>>>>   >>    inconvenient place.
>>>>   >>
>>>>   >>    An alternative approach is to have the copy be performed by a new
>>>>   >>    layered block format driver:
>>>>   >>
>>>>   >>    - create a new image, type = live-copy, containing three pieces of
>>>>   >>    information
>>>>   >>       - source image
>>>>   >>       - destination image
>>>>   >>       - copy state (initially nothing is copied)
>>>>   >>    - tell qemu switch to the new image
>>>>          
>>> There is a similar situation with atomicity here. Mgmt app requests a
>>> switch and dies immediately, before receiving the command reply. Qemu
>>> crashes. Which image is the uptodate one, source or live-copy?
>>>        
>> live-copy (or it's new name, RAID-1).  Once you've created it it is
>> equivalent to source.  Once it switches to state=synced you can
>> switch back to either source or destination (I guess by telling qemu
>> to detach the one you don't want first, so it falls back to
>> state=degraded).
>>
>>      
>>>        
>>>>   You could hot-unplug the image and hot-plug it later (continuing the
>>>>   copy with qemu-img),
>>>>          
>>> Then there's no need for live copy. qemu-img does that already.
>>>        
>> It will start from the beginning.
>>
>>      
>>>>   or live migrate it.
>>>>          
>>> You can live migrate (but not live migrate with live block migration)
>>> with live copy in progress, its just that its not supported yet.
>>>        
>> A RAID-1 driver will work with block live migration too.
>>      
> Nobody cares about that one (block copy and block live migration in
> progress). In fact, i doubt anybody cares about parallel block migration
> and block copy either (mgmt can easily cope with that limitation, stop
> live copy if migration is needed).
>
>    
>>>>   In fact I think a qemu RAID-1 driver
>>>>   removes the restriction that you can't live-migrate and live-copy
>>>>   simultaneously.
>>>>          
>>> As mentioned its just an implementation detail.
>>>        
> I meant the restriction of live-migrate and live-copy in parallel.
>
>    
>> I think it's an important one.  It moves the code from the generic
>> layer to a driver.
>>      
> Well it is a nice idea, but devil is in the details:
>
> - Guest writes must invalidate in progress live copy reads
> and live copy writes, so you have to maintain a queue for live
> copy AIO.
> - Live copy writes must be aware of in progress guest AIO writes, so
> you have to maintain a queue for guest AIO.
> - Guest writes must be mirrored to source and destination.
> - qemu-img must handle this new format.
>
> So my view ATM is that this is overengineering.
>    

Yeah, it feels like we're introducing QEMU level RAID.  At what point 
are we going to add RAID5 support and not just RAID1.  And it certainly 
begs the question of whether this use-case can be satisfied by just 
using Linux's RAID stack leaving one drive degraded and enabling the 
other drive when the time comes to fail over.

Regards,

Anthony Liguori
Anthony Liguori - Feb. 24, 2011, 5:58 p.m.
On 02/24/2011 09:22 AM, Avi Kivity wrote:
> On 02/24/2011 05:00 PM, Anthony Liguori wrote:
>> On 02/24/2011 02:54 AM, Avi Kivity wrote:
>>> On 02/23/2011 10:18 PM, Anthony Liguori wrote:
>>>>> Then the management stack has to worry about yet another way of 
>>>>> interacting via qemu.
>>>>
>>>>
>>>> { 'StateItem': { 'key': 'str', 'value': 'str' } }
>>>> { 'StateSection': { 'kind': 'str', 'name': 'str', 'items': [ 
>>>> 'StateItem' ] } }
>>>> { 'StateInfo': { 'sections': [ 'StateSection' ] } }
>>>>
>>>> { 'query-state', {}, {}, 'StateInfo' }
>>>>
>>>> A management tool never need to worry about anything other than 
>>>> this command if it so chooses.  If we have the pre-machine init 
>>>> mode for 0.16, then this can even be used to inspect state without 
>>>> running a guest.
>>>
>>> So we have yet another information tree.  If we store the cd-rom 
>>> eject state here, then we need to make an association between the 
>>> device path of the cd-rom, and the StateItem key.
>>
>> And this linkage is key.
>>
>> Let's say I launch QEMU with:
>>
>> qemu -cdrom ~/foo.img
>>
>> And then in the monitor, I do:
>>
>> (qemu) eject ide1-cd0
>>
>> The question is, what command can I now use to launch the same qemu 
>> instance?
>>
>> When I think of stateful config, what I really think of is a way to 
>> spit out a command line that essentially becomes, "this is how you 
>> now launch QEMU".
>>
>> In this case, it would be:
>>
>> qemu -cdrom ~/foo.img -device ide-disk,id=ide1-cd0,drive=
>>
>> Or, we could think of this in terms of:
>>
>> qemu -cdrom ~/foo.img -readconfig foo.cfg
>>
>> Where foo.cfg contained:
>>
>> [device "ide1-cd0"]
>> driver="ide-disk"
>> drive=""
>>
>> So what I'm really suggesting is that we generate foo.cfg whenever 
>> monitor commands do things that change the command line and introduce 
>> a new option to reflect this, IOW:
>>
>> qemu -cdrom ~/foo.img -config foo.cfg
>
> If you move the cdrom to a different IDE channel, you have to update 
> the stateful non-config file.
>
> Whereas if you do
>
>    $ qemu-img create -f cd-tray -b ~/foo.img ~/foo-media-tray.img
>    $ qemu -cdrom ~/foo-media-tray.img
>
> the cd-rom tray state will be tracked in the image file.

Yeah, but how do you move it?  If you do a remove/add through QMP, then 
the config file will reflect things just fine.

If you want to do it outside of QEMU, then you can just ignore the 
config file and manage all of the state yourself.  But it's never going 
to work as well (it will be racy) and you're pushing a tremendous amount 
of knowledge that ultimately belongs in QEMU (what state needs to 
persist) to something that isn't QEMU which means it's probably not 
going to be done correctly.

I know you're a big fan of the omnipotent management tool but my 
experience has been that we need to help the management tooling folks 
more by expecting less from them.

>>> Far better to store it in the device itself.  For example, we could 
>>> make a layered block format driver that stores the eject state and a 
>>> "backing file" containing the actual media.  Eject and media change 
>>> would be recorded in the block format driver's state.  You could 
>>> then hot-unplug a USB cd-writer and hot-plug it back into a 
>>> different guest, implementing a virtual sneakernet.
>>
>> I think you're far too hung up on "store it in the device itself".  
>> The recipe to create the device model is not intrinsic to the device 
>> model.  It's an independent thing that's a combination of the command 
>> line arguments and any executed monitor commands.
>>
>> Maybe a better way to think about the stateful config file is a 
>> mechanism to replay the monitor history.
>
> Again the question is who is the authoritative source of the 
> configuration.  Is it the management tool or is it qemu?

QEMU.   No question about it.  At any point in time, we are the 
authoritative source of what the guest's configuration is.  There's no 
doubt about it.  A management tool can try to keep up with us, but 
ultimately we are the only ones that know for sure.

We have all of this information internally.  Just persisting it is not a 
major architectural change.  It's something we should have been doing 
(arguably) from the very beginning.

> The management tool already has to keep track of (the optional parts 
> of) the guest device tree.  It cannot start reading the stateful 
> non-config file at random points in time.  So all that is left is the 
> guest controlled portions of the device tree, which are pretty rare, 
> and random events like live-copy migration.  I think that introducing 
> a new authoritative source of information will create a lot of problems.

QEMU has always been the authoritative source.  Nothing new has been 
introduced.  We never persisted the machine's configuration which meant 
management tools had to try to aggressively keep up with us which is 
intrinsically error prone.  Fixing this will only improve existing 
management tools.

>
>>>>
>>>> The fact that the state is visible in the filesystem is an 
>>>> implementation detail.
>>>
>>> A detail that has to be catered for by the management stack - it has 
>>> to provide a safe place for it, back it up, etc.
>>
>> If it cares for QEMU to preserve state.  Today, this all gets thrown 
>> away.
>
> Right, but we should make it easy, not hard.

Yeah, I fail to see how this makes it hard.  We conveniently are saying, 
hey, this is all the state that needs to be persisted.  We'll persist it 
for you if you want, otherwise, we'll expose it in a central location.

If the tool wants to ignore it and guess based on various combinations 
of other commands, more power to it.

>>
>>>> It doesn't work for eject unless you interpose an acknowledged 
>>>> event.  Ultimately, this is a simple problem.  If you want 
>>>> reliability, we either need symmetric RPCs so that the device model 
>>>> can call (and wait) to the management layer to acknowledge a change 
>>>> or QEMU can post an event to the management layer, and maintain the 
>>>> state in a reliable fashion.
>>>
>>> I don't see why it doesn't work.  Please explain.
>>
>> 1) guest eject
>> 2) qemu posts eject event
>> 3) qemu acknowledges eject to the guest
>> 4) management tool sees eject event and updates guest config
>>
>> There's a race between 3 & 4.  It can only be addressed by 
>> interposing 4 between 2 and 3 OR making qemu persist this state 
>> between 2 and 3 such that the management tool can reliably query it.
>
> If "it" is my cd-rom tray block format driver, it works.  It's really 
> the same in action as the stateful non-config, except it's part of the 
> device/image, not a central location.

Because you've introduced a one-off.  Having a bunch of one-offs 
(especially being a bunch of new block formats!) is not going to make 
things simpler for management tools.

>>> I disagree.  Storing NVRAM as a disk image is a simple extension of 
>>> existing management tools.  Block live-copy and cd-rom eject state 
>>> also make sense as per-image state if you take hotunplug and hotplug 
>>> into account.
>>
>> Everything can be stored in a block driver but when the data is 
>> highly structured, isn't it nice to expose it in a structured, human 
>> readable way?  I know I'd personally prefer a text representation of 
>> CMOS than a binary blob.
>
> Have a tool expose it.  Part of the range is unspecified anyway.

I guess we need to agree to disagree then.

> Using a block format driver means that we don't have to care about a 
> crash during a write, that we can snapshot it, etc.

Why?  We always need to care about a crash during write.  What I've been 
thinking for a config file is the class approach of using a ~ and .# 
file to make sure that we write out the new file and then atomically 
rename it to get the new contents.  Yeah, it's a bit heavy weight but 
this shouldn't be a very common thing to update.

>>> Device settings should be stored with the devices, not with qemu.
>>>
>>> Suppose we take the cold-plug on startup via the monitor approach.  
>>> So we start with a bare machine, cold plug stuff into it.  Now qemu 
>>> has to reconcile the stateful non-config file with the hardware.  
>>> What if something has changed?  A device moved into a different slot?
>>
>> Sorry, I'm confused.  Is there anything in the stateful config file 
>> when we start up?  If so, the act of starting up will add a bunch of 
>> hardware.
>
> Suppose it has information about ide1-cd0's media tray.  Now we 
> restart qemu and cold-plug the cdrom into ide0-cd0.  What happens to 
> the information?

Whether media is present is not a property of a blockdev, it's a 
property of a device.  What does it even mean to use your media-tray 
format with something like a CMOS device?

>> Technically, mac address is stored on eeprom and we store that as a 
>> device property today.  We can't persist device properties even 
>> though you can change the mac address of a network card and it does 
>> persist across reboots.  Are you advocating that we introduce an 
>> eeprom for every network card (all in a slightly different format) 
>> and have special tools to manipulate the eeprom to store and view the 
>> mac address?
>
>
> Yes -- if we really want to support it.  Obviously we have to store 
> the entire eeprom, not just the portion containing the MAC address, so 
> it's not just a key/value store.  A card may even have its firmware in 
> flash.

I think that's overengineering.  I think we can go very far by just 
persisting small amounts of information in a central location.  We're 
not building a cycle-accurate simulator here afterall.

Regards,

Anthony Liguori
Stefan Hajnoczi - Feb. 25, 2011, 7:16 a.m.
On Wed, Feb 23, 2011 at 3:31 PM, Avi Kivity <avi@redhat.com> wrote:
> On 02/23/2011 04:35 PM, Anthony Liguori wrote:
>>
>> QEMU uses the state database to store information that is created
>> dynamically.  For instance, devices added through device_add.  A device
>> added via -device wouldn't necessary get added to the state database.
>>
>> Practically speaking, it let's you invoke QEMU with a fixed command line,
>> while still using the monitor to make changes that would otherwise require
>> the command line being updated.
>
> Then the invoker quickly loses track of what the actual state is.  It can't
> just remember which commands it issued (presumably in response to the user
> updating user visible state).  It has to parse the stateful config file qemu
> outputs.  But at which points should it parse it?
>
> I don't think it's reasonable to have three different ways to interact with
> qemu, all needed: the command line, reading and writing the stateful config
> file, and the monitor.  I'd rather push for starting qemu with a blank guest
> and assembling (cold-plugging) all the hardware via the monitor before
> starting the guest.

This is why I asked about unifying the stateful config with qdev or
QemuOpts in the machine init mailing list thread.  If we have n
different ways of toggling things inside QEMU including command-line,
HMP, QMP, and stateful config, then a large chunk of QEMU is just
going to be parsing and formatting these external interfaces.

Stefan
Marcelo Tosatti - Feb. 26, 2011, 12:02 a.m.
On Wed, Feb 23, 2011 at 01:06:46PM -0600, Anthony Liguori wrote:
> On 02/22/2011 11:00 AM, Marcelo Tosatti wrote:
> >Index: qemu/qerror.h
> >===================================================================
> >--- qemu.orig/qerror.h
> >+++ qemu/qerror.h
> >@@ -171,4 +171,13 @@ QError *qobject_to_qerror(const QObject
> >  #define QERR_VNC_SERVER_FAILED \
> >      "{ 'class': 'VNCServerFailed', 'data': { 'target': %s } }"
> >
> >+#define QERR_BLOCKCOPY_IN_PROGRESS \
> >+    "{ 'class': 'BlockCopyInProgress', 'data': { 'device': %s } }"
> 
> The caller already knows the device name by virtue of issuing the
> command so this is redundant.
> 
> I think a better approach would be a QERR_IN_PROGRESS 'data': {
> 'operation': %s }
> 
> For block copy, we'd say QERR_IN_PROGRESS("block copy").
> 
> >+
> >+#define QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS \
> >+    "{ 'class': 'BlockCopyImageSizeDiffers', 'data': {} }"
> >+
> >+#define QERR_MIGRATION_IN_PROGRESS \
> >+    "{ 'class': 'MigrationInProgress', 'data': {} }"
> 
> Then QERR_IN_PROGRESS("live migration")

Can the error format change like that? What about applications that make
use of it? If it can change, sure. (libvirt.git does not seem to be
aware of MigrationInProgress).

> >  #endif /* QERROR_H */
> >Index: qemu/qmp-commands.hx
> >===================================================================
> >--- qemu.orig/qmp-commands.hx
> >+++ qemu/qmp-commands.hx
> >@@ -581,6 +581,75 @@ Example:
> >  EQMP
> >
> >      {
> >+        .name       = "block_copy",
> >+        .args_type  = "device:s,filename:s,commit_filename:s?,inc:-i",
> >+        .params     = "device filename [commit_filename] [-i]",
> >+        .help       = "live block copy device to image"
> >+                      "\n\t\t\t optional commit filename "
> >+                      "\n\t\t\t -i for incremental copy "
> >+                      "(base image shared between src and destination)",
> >+        .user_print = monitor_user_noop,
> >+        .mhandler.cmd_new = do_bdrv_copy,
> >+    },
> >+
> >+SQMP
> >+block-copy
> >+-------
> >+
> >+Live block copy.
> 
> I'm not sure copy really describes what we're doing here.  Maybe
> migrate-block?

Problem its easy to confuse "migrate-block" with "block migration". I
could not come up with a better, non-confusing name than "live block
copy".

> >+Arguments:
> >+
> >+- "device": device name (json-string)
> >+- "filename": target image filename (json-string)
> 
> Is this a created image?  Is this an image to create?

A previously created image.

> To future proof for blockdev, we should make this argument optional
> and if it's not specified throw an error about missing argument.
> This let's us introduce an optional blockdev argument such that we
> can use a blockdev name.

What you mean "blockdev"?

> 
> >+- "commit_filename": target commit filename (json-string, optional)
> 
> I think we should drop this.

Why? Sorry but this can't wait for non-config persistent storage. This
mistake was made in the past with irqchip for example, lets not repeat
it.

Its OK to deprecate "commit_filename" in favour of its location in
non-config persistent storage. 

Its not the end of the world for a mgmt app to handle change (not saying
its not a good principle) such as this.

> >+- "inc": incremental disk copy (json-bool, optional)
> 
> Let's use the full name (incremental) and we need to describe in
> detail what the semantics of this are.  Will it scan the target
> block device to identify identical blocks?

No, it does not attempt to identify identical blocks, yet.

You are right, i'll write down a document to describe these details.

> 
> >+Example:
> >+
> >+->  { "execute": "block_copy",
> >+                            "arguments": { "device": "ide0-hd1",
> >+                               "filename": "/mnt/new-disk.img",
> >+                               "commit_filename: "/mnt/commit-new-disk.img"
> >+                             } }
> >+
> >+<- { "return": {} }
> >+
> >+Notes:
> >+
> >+(1) The 'query-block-copy' command should be used to check block copy progress
> >+    and final result (this information is provided by the 'status' member)
> >+(2) Boolean argument "inc" defaults to false
> 
> We should also document error semantics.  What errors are expected and why?

Fair.

> >+EQMP
> >+
> >+    {
> >+        .name       = "block_copy_cancel",
> >+        .args_type  = "device:s",
> >+        .params     = "device",
> >+        .help       = "cancel live block copy",
> >+        .user_print = monitor_user_noop,
> >+        .mhandler.cmd_new = do_bdrv_copy_cancel,
> >+    },
> >+
> >+SQMP
> >+block_copy_cancel
> >+--------------
> >+
> >+Cancel live block copy.
> >+
> >+Arguments:
> >+
> >+- device: device name (json-string)
> >+
> >+Example:
> >+
> >+->  { "execute": "block_copy_cancel", "arguments": { "device": "ide0-hd1" } }
> >+<- { "return": {} }
> 
> cancel-block-migration?

Again, conflicts with "block migration" from live migration.

> What happens if:
>  - No block copy is active anymore (it's completed)

cancel succeeds.

>  - A block copy was never started

qerror_report(QERR_DEVICE_NOT_FOUND, device);

>  - device refers to a device that no longer exists

qerror_report(QERR_DEVICE_NOT_FOUND, device);

>  - device refers to a device with no media present

Right, this should be dealt with.

> If this command succeeds, what the state of target?  

It will be used as the image backing the guest block device. But
probably i don't understand your question.

> If I resume the
> operation with the incremental flag set, will it Just Work?

As in:

- block_copy ide0-hd1 /mnt/aa.img
- block_copy ide0-hd1 /mnt/aa.img -i

? 

The second command will fail with QERR_BLOCKCOPY_IN_PROGRESS.

> >+-------------
> >+
> >+Live block copy status.
> >+
> >+Each block copy instance information is stored in a json-object and the returned
> >+value is a json-array of all instances.
> >+
> >+Each json-object contains the following:
> >+
> >+- "device": device name (json-string)
> >+- "status": block copy status (json-string)
> >+    - Possible values: "active", "failed", "completed"
> >+- "info": A json-object with the statistics information, if status is "active":
> >+    - "percentage": percentage completed (json-int)
> 
> So let's fold this into query-block
> 
> And let's also report the statistics as MigrationStats which is the format:
> 
> { 'MigrationStats': {'transferred': 'int', 'remaining': 'int',
> 'total': 'int' } }
> 
> So I'd propose changing:
> 
> { 'BlockDeviceStats': {'rd_bytes': 'int', 'wr_bytes': 'int',
>                        'rd_operations': 'int', 'wr_operations': 'int',
>                        'wr_highest_offset': 'int' } }
> { 'BlockStats': {'*device': 'str', 'stats': 'BlockDeviceStats',
>                  '*parent': 'BlockStats'} }
> 
> [ 'query-blockstats', {}, {}, ['BlockStats'] ]
> 
> To:
> 
> { 'BlockMigrationStatus': [ 'active', 'inactive', 'cancelled' ] }
> 
> { 'BlockMigrationInfo': {'status': 'BlockMigrationStatus', 'stats':
> 'BlockMigrationStats', '*target_filename': 'str' } }
> 
> { 'BlockDeviceStats': {'rd_bytes': 'int', 'wr_bytes': 'int',
>                        'rd_operations': 'int', 'wr_operations': 'int',
>                        'wr_highest_offset': 'int', '*migration':
> 'BlockMigrationInfo' } }
> { 'BlockStats': {'*device': 'str', 'stats': 'BlockDeviceStats',
>                  '*parent': 'BlockStats'} }
> 
> [ 'query-blockstats', {}, {}, ['BlockStats'] ]

That makes sense, other than the name conflict already mentioned.

Thanks
Anthony Liguori - Feb. 26, 2011, 1:45 p.m.
On 02/25/2011 06:02 PM, Marcelo Tosatti wrote:
> On Wed, Feb 23, 2011 at 01:06:46PM -0600, Anthony Liguori wrote:
>    
>> On 02/22/2011 11:00 AM, Marcelo Tosatti wrote:
>>      
>>> Index: qemu/qerror.h
>>> ===================================================================
>>> --- qemu.orig/qerror.h
>>> +++ qemu/qerror.h
>>> @@ -171,4 +171,13 @@ QError *qobject_to_qerror(const QObject
>>>   #define QERR_VNC_SERVER_FAILED \
>>>       "{ 'class': 'VNCServerFailed', 'data': { 'target': %s } }"
>>>
>>> +#define QERR_BLOCKCOPY_IN_PROGRESS \
>>> +    "{ 'class': 'BlockCopyInProgress', 'data': { 'device': %s } }"
>>>        
>> The caller already knows the device name by virtue of issuing the
>> command so this is redundant.
>>
>> I think a better approach would be a QERR_IN_PROGRESS 'data': {
>> 'operation': %s }
>>
>> For block copy, we'd say QERR_IN_PROGRESS("block copy").
>>
>>      
>>> +
>>> +#define QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS \
>>> +    "{ 'class': 'BlockCopyImageSizeDiffers', 'data': {} }"
>>> +
>>> +#define QERR_MIGRATION_IN_PROGRESS \
>>> +    "{ 'class': 'MigrationInProgress', 'data': {} }"
>>>        
>> Then QERR_IN_PROGRESS("live migration")
>>      
> Can the error format change like that? What about applications that make
> use of it? If it can change, sure. (libvirt.git does not seem to be
> aware of MigrationInProgress).
>    

You're adding MigrationInProgress in this patch so I don't see what 
could be using it.

>    
>>>   #endif /* QERROR_H */
>>> Index: qemu/qmp-commands.hx
>>> ===================================================================
>>> --- qemu.orig/qmp-commands.hx
>>> +++ qemu/qmp-commands.hx
>>> @@ -581,6 +581,75 @@ Example:
>>>   EQMP
>>>
>>>       {
>>> +        .name       = "block_copy",
>>> +        .args_type  = "device:s,filename:s,commit_filename:s?,inc:-i",
>>> +        .params     = "device filename [commit_filename] [-i]",
>>> +        .help       = "live block copy device to image"
>>> +                      "\n\t\t\t optional commit filename "
>>> +                      "\n\t\t\t -i for incremental copy "
>>> +                      "(base image shared between src and destination)",
>>> +        .user_print = monitor_user_noop,
>>> +        .mhandler.cmd_new = do_bdrv_copy,
>>> +    },
>>> +
>>> +SQMP
>>> +block-copy
>>> +-------
>>> +
>>> +Live block copy.
>>>        
>> I'm not sure copy really describes what we're doing here.  Maybe
>> migrate-block?
>>      
> Problem its easy to confuse "migrate-block" with "block migration". I
> could not come up with a better, non-confusing name than "live block
> copy".
>    

Yeah, I don't think I like my suggested names any better to be honest.

>>> +Arguments:
>>> +
>>> +- "device": device name (json-string)
>>> +- "filename": target image filename (json-string)
>>>        
>> Is this a created image?  Is this an image to create?
>>      
> A previously created image.
>
>    
>> To future proof for blockdev, we should make this argument optional
>> and if it's not specified throw an error about missing argument.
>> This let's us introduce an optional blockdev argument such that we
>> can use a blockdev name.
>>      
> What you mean "blockdev"?
>    

-drive file=image.qcow2,if=none,id=foo

'foo' is a named blockdev.  We don't have a way to add these through the 
monitor (yet) but we will for 0.15.

>>> +- "commit_filename": target commit filename (json-string, optional)
>>>        
>> I think we should drop this.
>>      
> Why? Sorry but this can't wait for non-config persistent storage. This
> mistake was made in the past with irqchip for example, lets not repeat
> it.
>
> Its OK to deprecate "commit_filename" in favour of its location in
> non-config persistent storage.
>
> Its not the end of the world for a mgmt app to handle change (not saying
> its not a good principle) such as this.
>    

Even as a one off, it's not a very good solution to the problem.  We'd 
be way better of just having nothing here than using the commit file.  
What are the semantics of a half written file?  How does a management 
tool detect a half written file?

>> What happens if:
>>   - No block copy is active anymore (it's completed)
>>      
> cancel succeeds.
>    

I think it would be better to fail.

>>   - device refers to a device with no media present
>>      
> Right, this should be dealt with.
>
>    
>> If this command succeeds, what the state of target?
>>      
> It will be used as the image backing the guest block device. But
> probably i don't understand your question.
>    

If I execute block-copy, then I do block-copy-cancel, if it succeeds, am 
I guaranteed that we're still using the old block device?  (The answer 
is no because we don't error out if the migration has completed already).

>> If I resume the
>> operation with the incremental flag set, will it Just Work?
>>      
> As in:
>
> - block_copy ide0-hd1 /mnt/aa.img
> - block_copy ide0-hd1 /mnt/aa.img -i
>
> ?
>
> The second command will fail with QERR_BLOCKCOPY_IN_PROGRESS.
>    

No, as in:

block_copy ide0-hd1 /mnt/aa.img
block_copy_cancel ide0-h1
block_copy ide0-h1 /mnt/aa.img -i

Does it pick up where it left off or does it start all over again?

Regards,

Anthony Liguori
Anthony Liguori - Feb. 26, 2011, 3:32 p.m.
On 02/25/2011 06:02 PM, Marcelo Tosatti wrote:
>>> +- "commit_filename": target commit filename (json-string, optional)
>>>        
>> I think we should drop this.
>>      
> Why? Sorry but this can't wait for non-config persistent storage. This
> mistake was made in the past with irqchip for example, lets not repeat
> it.
>
> Its OK to deprecate "commit_filename" in favour of its location in
> non-config persistent storage.
>
> Its not the end of the world for a mgmt app to handle change (not saying
> its not a good principle) such as this

If you're looking for a stop-gap, I'd suggest adding an optional 
"action" parameter.  This parameter would take a string with valid 
values of "stop", "none".

If action=stop, then when block copy finishes, the guest will be stopped 
and an event will be sent.  This lets a management tool get notified, 
update it's config, and then continue the guest.  If the management tool 
crashes, it can do a query-block-copy to see that the copy finished 
along with a query-status to fix the guest.

This is more in line with how our other commands work.

Regards,

Anthony Liguori
Avi Kivity - Feb. 27, 2011, 9:10 a.m.
On 02/24/2011 07:58 PM, Anthony Liguori wrote:
>> If you move the cdrom to a different IDE channel, you have to update 
>> the stateful non-config file.
>>
>> Whereas if you do
>>
>>    $ qemu-img create -f cd-tray -b ~/foo.img ~/foo-media-tray.img
>>    $ qemu -cdrom ~/foo-media-tray.img
>>
>> the cd-rom tray state will be tracked in the image file.
>
>
> Yeah, but how do you move it? 

There is no need to move the file at all.  Simply point the new drive at 
the media tray.

> If you do a remove/add through QMP, then the config file will reflect 
> things just fine.

If all access to the state file is through QMP then it becomes more 
palatable.  A bit on that later.

>
> If you want to do it outside of QEMU, then you can just ignore the 
> config file and manage all of the state yourself.  But it's never 
> going to work as well (it will be racy) and you're pushing a 
> tremendous amount of knowledge that ultimately belongs in QEMU (what 
> state needs to persist) to something that isn't QEMU which means it's 
> probably not going to be done correctly.
>
> I know you're a big fan of the omnipotent management tool but my 
> experience has been that we need to help the management tooling folks 
> more by expecting less from them.

I thought that's what I'm doing by separating the state out.  It's easy 
for management to assemble configuration from their database and convert 
it into a centralized representation (like a qemu command line).  It's a 
lot harder to disassemble a central state representation and move it 
back to the database.

Using QMP is better than directly accessing the state file since qemu 
does the disassembly for you (provided the command references the device 
using its normal path, not some random key).  The file just becomes a 
way to survive a crash, and all management needs to know about is to 
make it available and back it up.  But it means that everything must be 
done via QMP, including assembly of the machine, otherwise the state 
file can become stale.

Separating the state out to the device is even easier, since management 
is already expected to take care of disk images.  All that's needed is 
to create the media tray image once, then you can forget about it 
completely.

>>
>> Again the question is who is the authoritative source of the 
>> configuration.  Is it the management tool or is it qemu?
>
> QEMU.   No question about it.  At any point in time, we are the 
> authoritative source of what the guest's configuration is.  There's no 
> doubt about it.  A management tool can try to keep up with us, but 
> ultimately we are the only ones that know for sure.
>
> We have all of this information internally.  Just persisting it is not 
> a major architectural change.  It's something we should have been 
> doing (arguably) from the very beginning.

That's a huge divergence from how management tools are written.  
Currently they contain the required guest configuration, a 
representation of what's the current live configuration, and they issue 
monitor commands to move the live configuration towards the required 
configuration (or just generate a qemu command line).  What you're 
describing is completely different, I'm not even sure what it is.

>
>> The management tool already has to keep track of (the optional parts 
>> of) the guest device tree.  It cannot start reading the stateful 
>> non-config file at random points in time.  So all that is left is the 
>> guest controlled portions of the device tree, which are pretty rare, 
>> and random events like live-copy migration.  I think that introducing 
>> a new authoritative source of information will create a lot of problems.
>
> QEMU has always been the authoritative source.  Nothing new has been 
> introduced.  We never persisted the machine's configuration which 
> meant management tools had to try to aggressively keep up with us 
> which is intrinsically error prone.  Fixing this will only improve 
> existing management tools.

If you look at management tools, they believe they are the authoritative 
source of configuration information (not guest state, which is more or 
less ignored).

>>
>> Right, but we should make it easy, not hard.
>
> Yeah, I fail to see how this makes it hard.  We conveniently are 
> saying, hey, this is all the state that needs to be persisted.  We'll 
> persist it for you if you want, otherwise, we'll expose it in a 
> central location.

The state-in-a-file is just a blob.  Don't expect the tool to parse it 
and reassociate the various bits to its own representation.  Exposing it 
via QMP commands is a lot better though.

> If the tool wants to ignore it and guess based on various combinations 
> of other commands, more power to it.
>>>> I don't see why it doesn't work.  Please explain.
>>>
>>> 1) guest eject
>>> 2) qemu posts eject event
>>> 3) qemu acknowledges eject to the guest
>>> 4) management tool sees eject event and updates guest config
>>>
>>> There's a race between 3 & 4.  It can only be addressed by 
>>> interposing 4 between 2 and 3 OR making qemu persist this state 
>>> between 2 and 3 such that the management tool can reliably query it.
>>
>> If "it" is my cd-rom tray block format driver, it works.  It's really 
>> the same in action as the stateful non-config, except it's part of 
>> the device/image, not a central location.
>
> Because you've introduced a one-off.  Having a bunch of one-offs 
> (especially being a bunch of new block formats!) is not going to make 
> things simpler for management tools.

I don't see it as a one-off.  We have some state, we store it in a state 
file.  Like we store each image in its own file.

>>
>> Have a tool expose it.  Part of the range is unspecified anyway.
>
> I guess we need to agree to disagree then.

We can just extend our existing disagreement agreement.

>
>> Using a block format driver means that we don't have to care about a 
>> crash during a write, that we can snapshot it, etc.
>
> Why? 

Because block format drivers are already prepared for it.

> We always need to care about a crash during write.  What I've been 
> thinking for a config file is the class approach of using a ~ and .# 
> file to make sure that we write out the new file and then atomically 
> rename it to get the new contents.  Yeah, it's a bit heavy weight but 
> this shouldn't be a very common thing to update. 

>
>>
>> Suppose it has information about ide1-cd0's media tray.  Now we 
>> restart qemu and cold-plug the cdrom into ide0-cd0.  What happens to 
>> the information?
>
> Whether media is present is not a property of a blockdev, it's a 
> property of a device.

You're making it the property of a device path (I guess you could make 
it the property of a device's id).

>   What does it even mean to use your media-tray format with something 
> like a CMOS device?

Nothing.

>
>>> Technically, mac address is stored on eeprom and we store that as a 
>>> device property today.  We can't persist device properties even 
>>> though you can change the mac address of a network card and it does 
>>> persist across reboots.  Are you advocating that we introduce an 
>>> eeprom for every network card (all in a slightly different format) 
>>> and have special tools to manipulate the eeprom to store and view 
>>> the mac address?
>>
>>
>> Yes -- if we really want to support it.  Obviously we have to store 
>> the entire eeprom, not just the portion containing the MAC address, 
>> so it's not just a key/value store.  A card may even have its 
>> firmware in flash.
>
> I think that's overengineering.  I think we can go very far by just 
> persisting small amounts of information in a central location.  We're 
> not building a cycle-accurate simulator here afterall.

Well, the whole thing is rather theoretical.
Avi Kivity - Feb. 27, 2011, 9:22 a.m.
On 02/24/2011 07:45 PM, Anthony Liguori wrote:
>
> Yeah, it feels like we're introducing QEMU level RAID. 

We already did, with sheepdog (well the RAID code is really outside qemu 
itself).

> At what point are we going to add RAID5 support and not just RAID1.  
> And it certainly begs the question of whether this use-case can be 
> satisfied by just using Linux's RAID stack leaving one drive degraded 
> and enabling the other drive when the time comes to fail over.

The magic word is btrfs, we already have atomic live copy outside qemu 
('cp --reflink').  If you also want an atomic switch you have to move it 
inside qemu so you can switch to the new file.

Using one of the Linux RAID-1 implementations (md, dm, or drbd) is also 
possible but these are really designed for block devices, not files.
Dor Laor - Feb. 27, 2011, 9:55 a.m.
On 02/27/2011 11:10 AM, Avi Kivity wrote:
> On 02/24/2011 07:58 PM, Anthony Liguori wrote:
>>> If you move the cdrom to a different IDE channel, you have to update
>>> the stateful non-config file.
>>>
>>> Whereas if you do
>>>
>>> $ qemu-img create -f cd-tray -b ~/foo.img ~/foo-media-tray.img
>>> $ qemu -cdrom ~/foo-media-tray.img
>>>
>>> the cd-rom tray state will be tracked in the image file.
>>
>>
>> Yeah, but how do you move it?
>
> There is no need to move the file at all. Simply point the new drive at
> the media tray.
>
>> If you do a remove/add through QMP, then the config file will reflect
>> things just fine.
>
> If all access to the state file is through QMP then it becomes more
> palatable. A bit on that later.
>
>>
>> If you want to do it outside of QEMU, then you can just ignore the
>> config file and manage all of the state yourself. But it's never going
>> to work as well (it will be racy) and you're pushing a tremendous
>> amount of knowledge that ultimately belongs in QEMU (what state needs
>> to persist) to something that isn't QEMU which means it's probably not
>> going to be done correctly.
>>
>> I know you're a big fan of the omnipotent management tool but my
>> experience has been that we need to help the management tooling folks
>> more by expecting less from them.
>
> I thought that's what I'm doing by separating the state out. It's easy
> for management to assemble configuration from their database and convert
> it into a centralized representation (like a qemu command line). It's a
> lot harder to disassemble a central state representation and move it
> back to the database.
>
> Using QMP is better than directly accessing the state file since qemu
> does the disassembly for you (provided the command references the device
> using its normal path, not some random key). The file just becomes a way
> to survive a crash, and all management needs to know about is to make it
> available and back it up. But it means that everything must be done via
> QMP, including assembly of the machine, otherwise the state file can
> become stale.
>
> Separating the state out to the device is even easier, since management
> is already expected to take care of disk images. All that's needed is to
> create the media tray image once, then you can forget about it completely.
>
>>>
>>> Again the question is who is the authoritative source of the
>>> configuration. Is it the management tool or is it qemu?
>>
>> QEMU. No question about it. At any point in time, we are the
>> authoritative source of what the guest's configuration is. There's no
>> doubt about it. A management tool can try to keep up with us, but
>> ultimately we are the only ones that know for sure.
>>
>> We have all of this information internally. Just persisting it is not
>> a major architectural change. It's something we should have been doing
>> (arguably) from the very beginning.
>
> That's a huge divergence from how management tools are written.
> Currently they contain the required guest configuration, a
> representation of what's the current live configuration, and they issue
> monitor commands to move the live configuration towards the required
> configuration (or just generate a qemu command line). What you're
> describing is completely different, I'm not even sure what it is.
>
>>
>>> The management tool already has to keep track of (the optional parts
>>> of) the guest device tree. It cannot start reading the stateful
>>> non-config file at random points in time. So all that is left is the
>>> guest controlled portions of the device tree, which are pretty rare,
>>> and random events like live-copy migration. I think that introducing
>>> a new authoritative source of information will create a lot of problems.
>>
>> QEMU has always been the authoritative source. Nothing new has been
>> introduced. We never persisted the machine's configuration which meant
>> management tools had to try to aggressively keep up with us which is
>> intrinsically error prone. Fixing this will only improve existing
>> management tools.
>
> If you look at management tools, they believe they are the authoritative
> source of configuration information (not guest state, which is more or
> less ignored).
>
>>>
>>> Right, but we should make it easy, not hard.
>>
>> Yeah, I fail to see how this makes it hard. We conveniently are
>> saying, hey, this is all the state that needs to be persisted. We'll
>> persist it for you if you want, otherwise, we'll expose it in a
>> central location.
>
> The state-in-a-file is just a blob. Don't expect the tool to parse it
> and reassociate the various bits to its own representation. Exposing it
> via QMP commands is a lot better though.
>
>> If the tool wants to ignore it and guess based on various combinations
>> of other commands, more power to it.
>>>>> I don't see why it doesn't work. Please explain.
>>>>
>>>> 1) guest eject
>>>> 2) qemu posts eject event
>>>> 3) qemu acknowledges eject to the guest
>>>> 4) management tool sees eject event and updates guest config
>>>>
>>>> There's a race between 3 & 4. It can only be addressed by
>>>> interposing 4 between 2 and 3 OR making qemu persist this state
>>>> between 2 and 3 such that the management tool can reliably query it.
>>>
>>> If "it" is my cd-rom tray block format driver, it works. It's really
>>> the same in action as the stateful non-config, except it's part of
>>> the device/image, not a central location.
>>
>> Because you've introduced a one-off. Having a bunch of one-offs
>> (especially being a bunch of new block formats!) is not going to make
>> things simpler for management tools.
>
> I don't see it as a one-off. We have some state, we store it in a state
> file. Like we store each image in its own file.
>
>>>
>>> Have a tool expose it. Part of the range is unspecified anyway.
>>
>> I guess we need to agree to disagree then.
>
> We can just extend our existing disagreement agreement.
>
>>
>>> Using a block format driver means that we don't have to care about a
>>> crash during a write, that we can snapshot it, etc.
>>
>> Why?
>
> Because block format drivers are already prepared for it.
>
>> We always need to care about a crash during write. What I've been
>> thinking for a config file is the class approach of using a ~ and .#
>> file to make sure that we write out the new file and then atomically
>> rename it to get the new contents. Yeah, it's a bit heavy weight but
>> this shouldn't be a very common thing to update.
>
>>
>>>
>>> Suppose it has information about ide1-cd0's media tray. Now we
>>> restart qemu and cold-plug the cdrom into ide0-cd0. What happens to
>>> the information?
>>
>> Whether media is present is not a property of a blockdev, it's a
>> property of a device.
>
> You're making it the property of a device path (I guess you could make
> it the property of a device's id).
>
>> What does it even mean to use your media-tray format with something
>> like a CMOS device?
>
> Nothing.
>
>>
>>>> Technically, mac address is stored on eeprom and we store that as a
>>>> device property today. We can't persist device properties even
>>>> though you can change the mac address of a network card and it does
>>>> persist across reboots. Are you advocating that we introduce an
>>>> eeprom for every network card (all in a slightly different format)
>>>> and have special tools to manipulate the eeprom to store and view
>>>> the mac address?
>>>
>>>
>>> Yes -- if we really want to support it. Obviously we have to store
>>> the entire eeprom, not just the portion containing the MAC address,
>>> so it's not just a key/value store. A card may even have its firmware
>>> in flash.
>>
>> I think that's overengineering. I think we can go very far by just
>> persisting small amounts of information in a central location. We're
>> not building a cycle-accurate simulator here afterall.
>
> Well, the whole thing is rather theoretical.

What about a simpler approach were QMP events will be written to a 
event-log-file (or even named pipe). This way apps will have the option 
of reading the log event log post mgmt crash and will be able to realize 
the correct guest state. This can solve the NVRAM changes and also the 
block live copy. It also keeps the management model the same and there 
is no need to mange these files since the persistent info is kep by the 
management else where.
Anthony Liguori - Feb. 27, 2011, 1:49 p.m.
On 02/27/2011 03:55 AM, Dor Laor wrote:
> What about a simpler approach were QMP events will be written to a 
> event-log-file (or even named pipe).

The management tool can just use a small daemon that does nothing other 
than write QMP events to a log.  There's no real need for this code to 
live in QEMU.

Since events are posted, even if we wrote it in QEMU, the event wouldn't 
be guaranteed on disk by the time the event invocation returns.

Regards,

Anthony Liguori
Anthony Liguori - Feb. 27, 2011, 2 p.m.
On 02/27/2011 03:10 AM, Avi Kivity wrote:
> On 02/24/2011 07:58 PM, Anthony Liguori wrote:
>>> If you move the cdrom to a different IDE channel, you have to update 
>>> the stateful non-config file.
>>>
>>> Whereas if you do
>>>
>>>    $ qemu-img create -f cd-tray -b ~/foo.img ~/foo-media-tray.img
>>>    $ qemu -cdrom ~/foo-media-tray.img
>>>
>>> the cd-rom tray state will be tracked in the image file.
>>
>>
>> Yeah, but how do you move it? 
>
> There is no need to move the file at all.  Simply point the new drive 
> at the media tray.

No, I was asking, how do you move the cdrom to a different IDE channel.  
Are you using QMP?  Are you changing the command line arguments?

>
>> If you do a remove/add through QMP, then the config file will reflect 
>> things just fine.
>
> If all access to the state file is through QMP then it becomes more 
> palatable.  A bit on that later.

As I think I've mentioned before, I hadn't really thought about an 
opaque state file but I'm not necessary opposed to it.  I don't see an 
obvious advantage to making it opaque but I agree it should be 
accessible via QMP.

>> If you want to do it outside of QEMU, then you can just ignore the 
>> config file and manage all of the state yourself.  But it's never 
>> going to work as well (it will be racy) and you're pushing a 
>> tremendous amount of knowledge that ultimately belongs in QEMU (what 
>> state needs to persist) to something that isn't QEMU which means it's 
>> probably not going to be done correctly.
>>
>> I know you're a big fan of the omnipotent management tool but my 
>> experience has been that we need to help the management tooling folks 
>> more by expecting less from them.
>
> I thought that's what I'm doing by separating the state out.  It's 
> easy for management to assemble configuration from their database and 
> convert it into a centralized representation (like a qemu command 
> line).  It's a lot harder to disassemble a central state 
> representation and move it back to the database.
>
> Using QMP is better than directly accessing the state file since qemu 
> does the disassembly for you (provided the command references the 
> device using its normal path, not some random key).  The file just 
> becomes a way to survive a crash, and all management needs to know 
> about is to make it available and back it up.  But it means that 
> everything must be done via QMP, including assembly of the machine, 
> otherwise the state file can become stale.
>
> Separating the state out to the device is even easier, since 
> management is already expected to take care of disk images.  All 
> that's needed is to create the media tray image once, then you can 
> forget about it completely.

Except that instead of having one state file, we might have a dozen 
additional "device state" files.

>>>
>>> Again the question is who is the authoritative source of the 
>>> configuration.  Is it the management tool or is it qemu?
>>
>> QEMU.   No question about it.  At any point in time, we are the 
>> authoritative source of what the guest's configuration is.  There's 
>> no doubt about it.  A management tool can try to keep up with us, but 
>> ultimately we are the only ones that know for sure.
>>
>> We have all of this information internally.  Just persisting it is 
>> not a major architectural change.  It's something we should have been 
>> doing (arguably) from the very beginning.
>
> That's a huge divergence from how management tools are written.

This is one of the reasons why management tooling around QEMU needs 
quite a bit of improving.

There is simply no way a management tool can do a good job of being an 
authoritative source of configuration.  The races we're discussion is a 
good example of why.

But beyond those races, QEMU is the only entity that knows with 
certainty what bits of information are important to persist in order to 
preserve a guest across shutdown/restart.  The fact that we've punted 
this problem for so long has only ensured that management tools are 
either intrinsically broken or only support the most minimal subset of 
functionality we actually support.

>   Currently they contain the required guest configuration, a 
> representation of what's the current live configuration, and they 
> issue monitor commands to move the live configuration towards the 
> required configuration (or just generate a qemu command line).  What 
> you're describing is completely different, I'm not even sure what it is.

Management tools shouldn't have to think about how the monitor commands 
they issue impact the invocation options of QEMU.

>
>>
>>> The management tool already has to keep track of (the optional parts 
>>> of) the guest device tree.  It cannot start reading the stateful 
>>> non-config file at random points in time.  So all that is left is 
>>> the guest controlled portions of the device tree, which are pretty 
>>> rare, and random events like live-copy migration.  I think that 
>>> introducing a new authoritative source of information will create a 
>>> lot of problems.
>>
>> QEMU has always been the authoritative source.  Nothing new has been 
>> introduced.  We never persisted the machine's configuration which 
>> meant management tools had to try to aggressively keep up with us 
>> which is intrinsically error prone.  Fixing this will only improve 
>> existing management tools.
>
> If you look at management tools, they believe they are the 
> authoritative source of configuration information (not guest state, 
> which is more or less ignored).

It's because we've given them no other option.

>>>
>>> Right, but we should make it easy, not hard.
>>
>> Yeah, I fail to see how this makes it hard.  We conveniently are 
>> saying, hey, this is all the state that needs to be persisted.  We'll 
>> persist it for you if you want, otherwise, we'll expose it in a 
>> central location.
>
> The state-in-a-file is just a blob.  Don't expect the tool to parse it 
> and reassociate the various bits to its own representation.  Exposing 
> it via QMP commands is a lot better though.

I don't really see this as being a major issue.  There's no such thing 
as a "blob".  If someone wants to manipulate the state, they will.   We 
need to keep compatibility to support migrating from version-to-version.

I agree that we want to provide QMP interfaces to work with the state 
file.  But I don't think we should be hostile to manual manipulation.

Regards,

Anthony Liguori
Avi Kivity - Feb. 27, 2011, 3:31 p.m.
On 02/27/2011 04:00 PM, Anthony Liguori wrote:
> On 02/27/2011 03:10 AM, Avi Kivity wrote:
>> On 02/24/2011 07:58 PM, Anthony Liguori wrote:
>>>> If you move the cdrom to a different IDE channel, you have to 
>>>> update the stateful non-config file.
>>>>
>>>> Whereas if you do
>>>>
>>>>    $ qemu-img create -f cd-tray -b ~/foo.img ~/foo-media-tray.img
>>>>    $ qemu -cdrom ~/foo-media-tray.img
>>>>
>>>> the cd-rom tray state will be tracked in the image file.
>>>
>>>
>>> Yeah, but how do you move it? 
>>
>> There is no need to move the file at all.  Simply point the new drive 
>> at the media tray.
>
> No, I was asking, how do you move the cdrom to a different IDE 
> channel.  Are you using QMP?  Are you changing the command line 
> arguments?

Yes.

If we're doing hot-move (not really relevant to ide-cd) then you'd use 
QMP.  If you're editing a virtual machine that is down, or scheduling a 
change for the next reboot, then you're using command line arguments (or 
cold-plugging into a stopped guest).

Requiring management to remember the old configuration and issue delta 
commands to move the device for the cold-plug case is increased 
complexity IMO.

>
>>
>>> If you do a remove/add through QMP, then the config file will 
>>> reflect things just fine.
>>
>> If all access to the state file is through QMP then it becomes more 
>> palatable.  A bit on that later.
>
> As I think I've mentioned before, I hadn't really thought about an 
> opaque state file but I'm not necessary opposed to it.  I don't see an 
> obvious advantage to making it opaque but I agree it should be 
> accessible via QMP.

The advantage is that we keep the management tool talking to one 
interface (I don't think we should prevent users from interpreting it, 
just make it unnecessary).

>>
>> I thought that's what I'm doing by separating the state out.  It's 
>> easy for management to assemble configuration from their database and 
>> convert it into a centralized representation (like a qemu command 
>> line).  It's a lot harder to disassemble a central state 
>> representation and move it back to the database.
>>
>> Using QMP is better than directly accessing the state file since qemu 
>> does the disassembly for you (provided the command references the 
>> device using its normal path, not some random key).  The file just 
>> becomes a way to survive a crash, and all management needs to know 
>> about is to make it available and back it up.  But it means that 
>> everything must be done via QMP, including assembly of the machine, 
>> otherwise the state file can become stale.
>>
>> Separating the state out to the device is even easier, since 
>> management is already expected to take care of disk images.  All 
>> that's needed is to create the media tray image once, then you can 
>> forget about it completely.
>
> Except that instead of having one state file, we might have a dozen 
> additional "device state" files.

That is fine.  We already have one state file per block device.

>>> QEMU.   No question about it.  At any point in time, we are the 
>>> authoritative source of what the guest's configuration is.  There's 
>>> no doubt about it.  A management tool can try to keep up with us, 
>>> but ultimately we are the only ones that know for sure.
>>>
>>> We have all of this information internally.  Just persisting it is 
>>> not a major architectural change.  It's something we should have 
>>> been doing (arguably) from the very beginning.
>>
>> That's a huge divergence from how management tools are written.
>
> This is one of the reasons why management tooling around QEMU needs 
> quite a bit of improving.
>
> There is simply no way a management tool can do a good job of being an 
> authoritative source of configuration.  The races we're discussion is 
> a good example of why.

What we're discussing is not configuration.  It is non-volatile state.  
Configuration comes from the user; state comes from the guest (the 
management tool may edit state; but the guest cannot edit the 
configuration).

I agree 100% the management tool cannot be the authoritative source of 
state.

My position is:
- the management tool should be 100% in control of configuration (how 
the guest is put together from its components)
- qemu should be 100% in control of state (memory, disk state, NVRAM in 
various components, cd-rom eject state, explosive bolts for payload 
separation, self-destruct mechanism, etc.)
- the management tool should have access to state using the same 
identifiers it used to create the devices that contain the state
- it is preferable to store state "in" the device so that when the 
configuration changes, state is maintained (like hot-unplug of a network 
card with NVRAM followed by hot-plug of the same card).
- the angular momentum of the planet we (presumably) are on won't 
change, whatever we do [1]

>
> But beyond those races, QEMU is the only entity that knows with 
> certainty what bits of information are important to persist in order 
> to preserve a guest across shutdown/restart.  The fact that we've 
> punted this problem for so long has only ensured that management tools 
> are either intrinsically broken or only support the most minimal 
> subset of functionality we actually support.

I'm not arguing about that.  I just want to stress again the difference 
between state and configuration.  Qemu has no authority, in my mind, as 
to configuration.  Only state.

>>   Currently they contain the required guest configuration, a 
>> representation of what's the current live configuration, and they 
>> issue monitor commands to move the live configuration towards the 
>> required configuration (or just generate a qemu command line).  What 
>> you're describing is completely different, I'm not even sure what it is.
>
> Management tools shouldn't have to think about how the monitor 
> commands they issue impact the invocation options of QEMU.

They have to, when creating a guest from scratch.

But I admit, this throws a new light (for me) on things.  What's the 
implications?
- must have a qemu instance running when editing configuration, even 
when the guest is down
- cannot add additional information to configuration; must store it in 
an external database and cross-reference it with the qemu data using the 
device ID
- when editing non-hotpluggable configuration for the next boot, must 
maintain old config somewhere, so we can issue delta commands later 
(might be needed for current way of doing things)
- no transactions/queries/etc except on non-authoritative source
- issues with shared-nothing design (well, can store the configuration 
file using DRBD).

>>
>> If you look at management tools, they believe they are the 
>> authoritative source of configuration information (not guest state, 
>> which is more or less ignored).
>
> It's because we've given them no other option.

It's the natural way of doing it.  You have a web interface that talks 
to a database.  When you want to list all VMs that have network cards on 
the production subnet, you issue a database query and get a recordset.  
How do you do that when the authoritative source of information is 
spread across a cluster?

>
>>>>
>>>> Right, but we should make it easy, not hard.
>>>
>>> Yeah, I fail to see how this makes it hard.  We conveniently are 
>>> saying, hey, this is all the state that needs to be persisted.  
>>> We'll persist it for you if you want, otherwise, we'll expose it in 
>>> a central location.
>>
>> The state-in-a-file is just a blob.  Don't expect the tool to parse 
>> it and reassociate the various bits to its own representation.  
>> Exposing it via QMP commands is a lot better though.
>
> I don't really see this as being a major issue.  There's no such thing 
> as a "blob".  If someone wants to manipulate the state, they will.   
> We need to keep compatibility to support migrating from 
> version-to-version.
>
> I agree that we want to provide QMP interfaces to work with the state 
> file.  But I don't think we should be hostile to manual manipulation.

No, not hostile.  We should make QMP commands sufficient to deal with 
it, that's all.


[1] in fact, it does change, due to tidal effects.
Dor Laor - Feb. 27, 2011, 4:02 p.m.
On 02/27/2011 03:49 PM, Anthony Liguori wrote:
> On 02/27/2011 03:55 AM, Dor Laor wrote:
>> What about a simpler approach were QMP events will be written to a
>> event-log-file (or even named pipe).
>
> The management tool can just use a small daemon that does nothing other
> than write QMP events to a log. There's no real need for this code to
> live in QEMU.
>

IIUC in case the management daemon will run qemu using named pipe for 
qmp it will happen automatically. If you agree to this approach it will 
simplify the more complex config file option (although it is nice to 
have as independent option for single hosts managed by simpler mgmts)

> Since events are posted, even if we wrote it in QEMU, the event wouldn't
> be guaranteed on disk by the time the event invocation returns.
>
> Regards,
>
> Anthony Liguori
>
Anthony Liguori - Feb. 27, 2011, 5:25 p.m.
On 02/27/2011 10:02 AM, Dor Laor wrote:
> On 02/27/2011 03:49 PM, Anthony Liguori wrote:
>> On 02/27/2011 03:55 AM, Dor Laor wrote:
>>> What about a simpler approach were QMP events will be written to a
>>> event-log-file (or even named pipe).
>>
>> The management tool can just use a small daemon that does nothing other
>> than write QMP events to a log. There's no real need for this code to
>> live in QEMU.
>>
>
> IIUC in case the management daemon will run qemu using named pipe for 
> qmp it will happen automatically.

No, the event model is changing (it was always intended to change 
though).  Events will need explicit registration so it's necessary to 
have bidirectional communication.

So you can't just do -qmp foo -chardev file:event.log,id=foo.  You won't 
actually see most of the events.

Regards,

Anthony Liguori

> If you agree to this approach it will simplify the more complex config 
> file option (although it is nice to have as independent option for 
> single hosts managed by simpler mgmts)
>
>> Since events are posted, even if we wrote it in QEMU, the event wouldn't
>> be guaranteed on disk by the time the event invocation returns.
>>
>> Regards,
>>
>> Anthony Liguori
>>
>
>
Anthony Liguori - Feb. 27, 2011, 5:41 p.m.
On 02/27/2011 09:31 AM, Avi Kivity wrote:
> On 02/27/2011 04:00 PM, Anthony Liguori wrote:
>> On 02/27/2011 03:10 AM, Avi Kivity wrote:
>>> On 02/24/2011 07:58 PM, Anthony Liguori wrote:
>>>>> If you move the cdrom to a different IDE channel, you have to 
>>>>> update the stateful non-config file.
>>>>>
>>>>> Whereas if you do
>>>>>
>>>>>    $ qemu-img create -f cd-tray -b ~/foo.img ~/foo-media-tray.img
>>>>>    $ qemu -cdrom ~/foo-media-tray.img
>>>>>
>>>>> the cd-rom tray state will be tracked in the image file.
>>>>
>>>>
>>>> Yeah, but how do you move it? 
>>>
>>> There is no need to move the file at all.  Simply point the new 
>>> drive at the media tray.
>>
>> No, I was asking, how do you move the cdrom to a different IDE 
>> channel.  Are you using QMP?  Are you changing the command line 
>> arguments?
>
> Yes.
>
> If we're doing hot-move (not really relevant to ide-cd) then you'd use 
> QMP.  If you're editing a virtual machine that is down, or scheduling 
> a change for the next reboot, then you're using command line arguments 
> (or cold-plugging into a stopped guest).
>
> Requiring management to remember the old configuration and issue delta 
> commands to move the device for the cold-plug case is increased 
> complexity IMO.
>
>>
>>>
>>>> If you do a remove/add through QMP, then the config file will 
>>>> reflect things just fine.
>>>
>>> If all access to the state file is through QMP then it becomes more 
>>> palatable.  A bit on that later.
>>
>> As I think I've mentioned before, I hadn't really thought about an 
>> opaque state file but I'm not necessary opposed to it.  I don't see 
>> an obvious advantage to making it opaque but I agree it should be 
>> accessible via QMP.
>
> The advantage is that we keep the management tool talking to one 
> interface (I don't think we should prevent users from interpreting it, 
> just make it unnecessary).
>
>>>
>>> I thought that's what I'm doing by separating the state out.  It's 
>>> easy for management to assemble configuration from their database 
>>> and convert it into a centralized representation (like a qemu 
>>> command line).  It's a lot harder to disassemble a central state 
>>> representation and move it back to the database.
>>>
>>> Using QMP is better than directly accessing the state file since 
>>> qemu does the disassembly for you (provided the command references 
>>> the device using its normal path, not some random key).  The file 
>>> just becomes a way to survive a crash, and all management needs to 
>>> know about is to make it available and back it up.  But it means 
>>> that everything must be done via QMP, including assembly of the 
>>> machine, otherwise the state file can become stale.
>>>
>>> Separating the state out to the device is even easier, since 
>>> management is already expected to take care of disk images.  All 
>>> that's needed is to create the media tray image once, then you can 
>>> forget about it completely.
>>
>> Except that instead of having one state file, we might have a dozen 
>> additional "device state" files.
>
> That is fine.  We already have one state file per block device.
>
>>>> QEMU.   No question about it.  At any point in time, we are the 
>>>> authoritative source of what the guest's configuration is.  There's 
>>>> no doubt about it.  A management tool can try to keep up with us, 
>>>> but ultimately we are the only ones that know for sure.
>>>>
>>>> We have all of this information internally.  Just persisting it is 
>>>> not a major architectural change.  It's something we should have 
>>>> been doing (arguably) from the very beginning.
>>>
>>> That's a huge divergence from how management tools are written.
>>
>> This is one of the reasons why management tooling around QEMU needs 
>> quite a bit of improving.
>>
>> There is simply no way a management tool can do a good job of being 
>> an authoritative source of configuration.  The races we're discussion 
>> is a good example of why.
>
> What we're discussing is not configuration.  It is non-volatile 
> state.  Configuration comes from the user; state comes from the guest 
> (the management tool may edit state; but the guest cannot edit the 
> configuration).
>
> I agree 100% the management tool cannot be the authoritative source of 
> state.
>
> My position is:
> - the management tool should be 100% in control of configuration (how 
> the guest is put together from its components)
> - qemu should be 100% in control of state (memory, disk state, NVRAM 
> in various components, cd-rom eject state, explosive bolts for payload 
> separation, self-destruct mechanism, etc.)

There simply is not such a clean separation between the two because 
things that the guest does affects the configuration of the guest.

Hot plug, removable media eject, persistent device settings (whether 
it's CMOS or EEPROM) all disrupt this model.

If you really wanted to have this separation, you'd have to be very 
strict about making all guest settings not be specified in config.  You 
would need to do:

qemu-img create -f e1000-eprom -o macaddr=12:23:45:67:78:90 e1000.0.rom
qemu-img create -f e1000-eprom -o macaddr=12:23:45:67:78:91 e1000.1.rom

qemu -device e1000,id=e1000.0,eeprom=e1000.0.rom -device 
e1000,id=e1000.1,eeprom=e1000.1.rom

And now I need a tool that lets me modify e1000-eprom images if I want 
to change the mac address dynamically (say I'm trying to clone a VM).

This type of model can be workable but as I said earlier, I think it's 
overengineering the problem.

We don't separate configuration from guest state today.  Instead of 
setting ourselves up for failure by setting an unrealistic standard that 
we try to achieve and never do, let's embrace the system that is working 
for us today.  We are authoritative for everything and guest state is 
intimately tied to the virtual machine configuration.

>>
>> But beyond those races, QEMU is the only entity that knows with 
>> certainty what bits of information are important to persist in order 
>> to preserve a guest across shutdown/restart.  The fact that we've 
>> punted this problem for so long has only ensured that management 
>> tools are either intrinsically broken or only support the most 
>> minimal subset of functionality we actually support.
>
> I'm not arguing about that.  I just want to stress again the 
> difference between state and configuration.  Qemu has no authority, in 
> my mind, as to configuration.  Only state.

Being the one that creates a guest based on configuration, I would say 
that we most certainly do.

>>>   Currently they contain the required guest configuration, a 
>>> representation of what's the current live configuration, and they 
>>> issue monitor commands to move the live configuration towards the 
>>> required configuration (or just generate a qemu command line).  What 
>>> you're describing is completely different, I'm not even sure what it 
>>> is.
>>
>> Management tools shouldn't have to think about how the monitor 
>> commands they issue impact the invocation options of QEMU.
>
> They have to, when creating a guest from scratch.
>
> But I admit, this throws a new light (for me) on things.  What's the 
> implications?
> - must have a qemu instance running when editing configuration, even 
> when the guest is down

QMP is an API.  Whether a qemu instance is launched is an implementation 
detail.  This could all be hidden completely with libqmp.

> - cannot add additional information to configuration; must store it in 
> an external database and cross-reference it with the qemu data using 
> the device ID

Don't confuse a management tool's notion of configuration with QEMU's 
configuration.

A management tools config is used to initially create and then 
manipulate an existing guest.   If the management tool supports 
out-of-band manipulation of a configuration file, then it needs to 
determine how the configuration file changed and execute the appropriate 
commands.

> - when editing non-hotpluggable configuration for the next boot, must 
> maintain old config somewhere, so we can issue delta commands later 
> (might be needed for current way of doing things)

Yes, it is.  libvirt kind of cheats here and just deletes the old VM and 
creates a new one when editing the XML IIUC.

> - no transactions/queries/etc except on non-authoritative source
> - issues with shared-nothing design (well, can store the configuration 
> file using DRBD).

In both cases, today a management tool races with QEMU so both of these 
points are currently true.

>>> If you look at management tools, they believe they are the 
>>> authoritative source of configuration information (not guest state, 
>>> which is more or less ignored).
>>
>> It's because we've given them no other option.
>
> It's the natural way of doing it.  You have a web interface that talks 
> to a database.  When you want to list all VMs that have network cards 
> on the production subnet, you issue a database query and get a 
> recordset.  How do you do that when the authoritative source of 
> information is spread across a cluster?

This problem still exists today.  A guest can eject a network card on 
it's own (without the management tool issuing a device_del command).  
QEMU will delete the NIC when this happens.  The same is true with CDROM 
eject.

Management tools are simply not authoritative today.

Regards,

Anthony Liguori
Avi Kivity - Feb. 28, 2011, 8:38 a.m.
On 02/27/2011 07:41 PM, Anthony Liguori wrote:
>>
>> I agree 100% the management tool cannot be the authoritative source 
>> of state.
>>
>> My position is:
>> - the management tool should be 100% in control of configuration (how 
>> the guest is put together from its components)
>> - qemu should be 100% in control of state (memory, disk state, NVRAM 
>> in various components, cd-rom eject state, explosive bolts for 
>> payload separation, self-destruct mechanism, etc.)
>
>
> There simply is not such a clean separation between the two because 
> things that the guest does affects the configuration of the guest.
>
> Hot plug, 

I don't think hotunplug works this way.  When the guest ejects the pci 
or usb device, it simply stops working with the device and disconnects 
the power.  There is nothing non-volatile going on, no spring-loaded 
lever that pushes the device out.  If the server reboots immediately 
after hotunplug, but before the user physically removes the device, then 
the server will see the device when it boots up.

> removable media eject, 

Here, we do have a single bit of non-volatile storage.

> persistent device settings (whether it's CMOS or EEPROM) all disrupt 
> this model.

These are just arrays of bits, most of them with no standard 
interpretation.  So a block device fits them perfectly.

>
> If you really wanted to have this separation, you'd have to be very 
> strict about making all guest settings not be specified in config.  
> You would need to do:
>
> qemu-img create -f e1000-eprom -o macaddr=12:23:45:67:78:90 e1000.0.rom
> qemu-img create -f e1000-eprom -o macaddr=12:23:45:67:78:91 e1000.1.rom
>
> qemu -device e1000,id=e1000.0,eeprom=e1000.0.rom -device 
> e1000,id=e1000.1,eeprom=e1000.1.rom
>
> And now I need a tool that lets me modify e1000-eprom images if I want 
> to change the mac address dynamically (say I'm trying to clone a VM).
>
> This type of model can be workable but as I said earlier, I think it's 
> overengineering the problem.

In fact I don't think anyone wants this.  Usually management wants the 
assigned MAC to be used without the guest playing games with it.  So 
it's more or less pointless however it's implemented.

>
> We don't separate configuration from guest state today.  Instead of 
> setting ourselves up for failure by setting an unrealistic standard 
> that we try to achieve and never do, let's embrace the system that is 
> working for us today.  We are authoritative for everything and guest 
> state is intimately tied to the virtual machine configuration.

"we are authoritative for everything" is a clean break from everything 
that's being done today.  It's also a clean break from the model of 
central management plus database.  We can't force it on people.

Non-volatile state is not intimately tied to configuration.  We store 
block device state completely outside the configuration.  What's left is 
the CD-ROM tray, CMOS memory, and network card EEPROM.  We could argue 
back and forth about where exactly they belong, but they aren't really 
worth the conversation since they are meaningless for real-life use.

>
>>>
>>> But beyond those races, QEMU is the only entity that knows with 
>>> certainty what bits of information are important to persist in order 
>>> to preserve a guest across shutdown/restart.  The fact that we've 
>>> punted this problem for so long has only ensured that management 
>>> tools are either intrinsically broken or only support the most 
>>> minimal subset of functionality we actually support.
>>
>> I'm not arguing about that.  I just want to stress again the 
>> difference between state and configuration.  Qemu has no authority, 
>> in my mind, as to configuration.  Only state.
>
> Being the one that creates a guest based on configuration, I would say 
> that we most certainly do.

That is not what being authoritative means.

In a virt-manager deployment, libvirt is the authoritative source of 
guest configuration.  In a RHEV-M deployment, the RHEV-M database is the 
authoritative source of guest configuration.  You can completely replace 
the host machine and your guest will recreate just fine as long as the 
authoritative source is intact.

>
>>>>   Currently they contain the required guest configuration, a 
>>>> representation of what's the current live configuration, and they 
>>>> issue monitor commands to move the live configuration towards the 
>>>> required configuration (or just generate a qemu command line).  
>>>> What you're describing is completely different, I'm not even sure 
>>>> what it is.
>>>
>>> Management tools shouldn't have to think about how the monitor 
>>> commands they issue impact the invocation options of QEMU.
>>
>> They have to, when creating a guest from scratch.
>>
>> But I admit, this throws a new light (for me) on things.  What's the 
>> implications?
>> - must have a qemu instance running when editing configuration, even 
>> when the guest is down
>
> QMP is an API.  Whether a qemu instance is launched is an 
> implementation detail.  This could all be hidden completely with libqmp.

QMP is first and foremost a protocol.

>
>> - cannot add additional information to configuration; must store it 
>> in an external database and cross-reference it with the qemu data 
>> using the device ID
>
> Don't confuse a management tool's notion of configuration with QEMU's 
> configuration.
>
> A management tools config is used to initially create and then 
> manipulate an existing guest.   If the management tool supports 
> out-of-band manipulation of a configuration file, then it needs to 
> determine how the configuration file changed and execute the 
> appropriate commands.

I wasn't talking about that.  I was talking about data that is 
meaningful to a user but not meaningful to qemu.  That sort of data 
doesn't store well if qemu is the authoritative source.

> Yes, it is.  libvirt kind of cheats here and just deletes the old VM 
> and creates a new one when editing the XML IIUC.
>
>> - no transactions/queries/etc except on non-authoritative source
>> - issues with shared-nothing design (well, can store the 
>> configuration file using DRBD).
>
> In both cases, today a management tool races with QEMU so both of 
> these points are currently true.

No, it doesn't.  If the guest ejects a network card, the network card is 
still there.  Queries against the database still return correct results.

>
>>>> If you look at management tools, they believe they are the 
>>>> authoritative source of configuration information (not guest state, 
>>>> which is more or less ignored).
>>>
>>> It's because we've given them no other option.
>>
>> It's the natural way of doing it.  You have a web interface that 
>> talks to a database.  When you want to list all VMs that have network 
>> cards on the production subnet, you issue a database query and get a 
>> recordset.  How do you do that when the authoritative source of 
>> information is spread across a cluster?
>
> This problem still exists today.  A guest can eject a network card on 
> it's own (without the management tool issuing a device_del command).  
> QEMU will delete the NIC when this happens. 

I think that's a bug.

> The same is true with CDROM eject.

CDROM tray position is state, not configuration.

>
> Management tools are simply not authoritative today.
>
> Regards,
>
> Anthony Liguori
Dor Laor - Feb. 28, 2011, 8:58 a.m.
On 02/27/2011 07:25 PM, Anthony Liguori wrote:
> On 02/27/2011 10:02 AM, Dor Laor wrote:
>> On 02/27/2011 03:49 PM, Anthony Liguori wrote:
>>> On 02/27/2011 03:55 AM, Dor Laor wrote:
>>>> What about a simpler approach were QMP events will be written to a
>>>> event-log-file (or even named pipe).
>>>
>>> The management tool can just use a small daemon that does nothing other
>>> than write QMP events to a log. There's no real need for this code to
>>> live in QEMU.
>>>
>>
>> IIUC in case the management daemon will run qemu using named pipe for
>> qmp it will happen automatically.
>
> No, the event model is changing (it was always intended to change
> though). Events will need explicit registration so it's necessary to
> have bidirectional communication.

But the mgmt->qemu direction is only about event registration, it's not 
valuable info.
It's a simpler approach than relaying on another config file and it is 
safer than having a separate mgmt daemon to log these events.
Cross posting to libvirt-list to hear their view.

>
> So you can't just do -qmp foo -chardev file:event.log,id=foo. You won't
> actually see most of the events.
>
> Regards,
>
> Anthony Liguori
>
>> If you agree to this approach it will simplify the more complex config
>> file option (although it is nice to have as independent option for
>> single hosts managed by simpler mgmts)
>>
>>> Since events are posted, even if we wrote it in QEMU, the event wouldn't
>>> be guaranteed on disk by the time the event invocation returns.
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>
>>
>>
>
>
Anthony Liguori - Feb. 28, 2011, 12:45 p.m.
On 02/28/2011 02:38 AM, Avi Kivity wrote:
>>
>> We don't separate configuration from guest state today.  Instead of 
>> setting ourselves up for failure by setting an unrealistic standard 
>> that we try to achieve and never do, let's embrace the system that is 
>> working for us today.  We are authoritative for everything and guest 
>> state is intimately tied to the virtual machine configuration.
>
> "we are authoritative for everything" is a clean break from everything 
> that's being done today.  It's also a clean break from the model of 
> central management plus database.  We can't force it on people.

No, it isn't.  This has been the way QEMU has always been.

QEMU has always and will always continue to be useful in the absence of 
a management layer.  That means that it will mix modifications to 
configuration with guest actions.

To avoid races, a management tool needs to either poll QEMU or listen 
for events to know when QEMU initiates a configuration change.  This is 
how tools are written today.

The only thing being discussed is how to handle a very small and rare 
race condition whereas QEMU may send an notification to a crashed 
management tool *and* QEMU crashes before the management tool restarts.

The only two options to solve this problem are synchronous notifications 
or a QEMU global state file.  Since the former is a radical change to 
our existing API, the later is a much more reasonable option.

If a management tool doesn't care about this race, it can simply not use 
the global state file.

Regards,

Anthony Liguori
Avi Kivity - Feb. 28, 2011, 1:21 p.m.
On 02/28/2011 02:45 PM, Anthony Liguori wrote:
> On 02/28/2011 02:38 AM, Avi Kivity wrote:
>>>
>>> We don't separate configuration from guest state today.  Instead of 
>>> setting ourselves up for failure by setting an unrealistic standard 
>>> that we try to achieve and never do, let's embrace the system that 
>>> is working for us today.  We are authoritative for everything and 
>>> guest state is intimately tied to the virtual machine configuration.
>>
>> "we are authoritative for everything" is a clean break from 
>> everything that's being done today.  It's also a clean break from the 
>> model of central management plus database.  We can't force it on people.
>
> No, it isn't.  This has been the way QEMU has always been.
>
> QEMU has always and will always continue to be useful in the absence 
> of a management layer.  That means that it will mix modifications to 
> configuration with guest actions.
>
> To avoid races, a management tool needs to either poll QEMU or listen 
> for events to know when QEMU initiates a configuration change.  This 
> is how tools are written today.
>
> The only thing being discussed is how to handle a very small and rare 
> race condition whereas QEMU may send an notification to a crashed 
> management tool *and* QEMU crashes before the management tool restarts.
>
> The only two options to solve this problem are synchronous 
> notifications or a QEMU global state file.  Since the former is a 
> radical change to our existing API, the later is a much more 
> reasonable option.
>
> If a management tool doesn't care about this race, it can simply not 
> use the global state file.

You're just ignoring what I've written.
Anthony Liguori - Feb. 28, 2011, 5:33 p.m.
On Feb 28, 2011 7:21 AM, "Avi Kivity" <avi@redhat.com> wrote:
>
> On 02/28/2011 02:45 PM, Anthony Liguori wrote:
>>
>> On 02/28/2011 02:38 AM, Avi Kivity wrote:
>>>>
>>>>
>>>> We don't separate configuration from guest state today.  Instead of
setting ourselves up for failure by setting an unrealistic standard that we
try to achieve and never do, let's embrace the system that is working for us
today.  We are authoritative for everything and guest state is intimately
tied to the virtual machine configuration.
>>>
>>>
>>> "we are authoritative for everything" is a clean break from everything
that's being done today.  It's also a clean break from the model of central
management plus database.  We can't force it on people.
>>
>>
>> No, it isn't.  This has been the way QEMU has always been.
>>
>> QEMU has always and will always continue to be useful in the absence of a
management layer.  That means that it will mix modifications to
configuration with guest actions.
>>
>> To avoid races, a management tool needs to either poll QEMU or listen for
events to know when QEMU initiates a configuration change.  This is how
tools are written today.
>>
>> The only thing being discussed is how to handle a very small and rare
race condition whereas QEMU may send an notification to a crashed management
tool *and* QEMU crashes before the management tool restarts.
>>
>> The only two options to solve this problem are synchronous notifications
or a QEMU global state file.  Since the former is a radical change to our
existing API, the later is a much more reasonable option.
>>
>> If a management tool doesn't care about this race, it can simply not use
the global state file.
>
>
> You're just ignoring what I've written.

No, you're just impervious to my subtle attempt to refocus the discussion on
solving a practical problem.

There's a lot of good, reasonably straight forward changes we can make that
have a high return on investment.  The only suggestion I'm making beyond
Marcelo's original patch is that we use a structured format and that we make
it possible to use the same file to solve this problem in multiple places.

I don't think this creates a fundamental break in how management tools
interact with QEMU.  I don't think introducing RAID support in the block
layer is a reasonable alternative.

Regards,

Anthony Liguori
>
>
> --
> error compiling committee.c: too many arguments to function
>
Avi Kivity - Feb. 28, 2011, 5:47 p.m.
On 02/28/2011 07:33 PM, Anthony Liguori wrote:
>
> >
> > You're just ignoring what I've written.
>
> No, you're just impervious to my subtle attempt to refocus the 
> discussion on solving a practical problem.
>
> There's a lot of good, reasonably straight forward changes we can make 
> that have a high return on investment.
>

Is making qemu the authoritative source of configuration information a 
straightforward change?  Is the return on it high?  Is the investment low?

"No" to all three (ignoring for the moment whether it is good or not, 
which we were debating).

> The only suggestion I'm making beyond Marcelo's original patch is that 
> we use a structured format and that we make it possible to use the 
> same file to solve this problem in multiple places.
>

No, you're suggesting a lot more than that.

> I don't think this creates a fundamental break in how management tools 
> interact with QEMU.  I don't think introducing RAID support in the 
> block layer is a reasonable alternative.
>
>

Why not?

Something that avoids the whole state thing altogether:

- instead of atomically switching when live copy is done, keep on 
issuing writes to both the origin and the live copy
- issue a notification to management
- management receives the notification, and issues an atomic blockdev 
switch command

this is really the RAID-1 solution but without the state file (credit 
Dor).  An advantage is that there is no additional latency when trying 
to catch up to the dirty bitmap.
Anthony Liguori - Feb. 28, 2011, 6:12 p.m.
On Feb 28, 2011 11:47 AM, "Avi Kivity" <avi@redhat.com> wrote:
>
> On 02/28/2011 07:33 PM, Anthony Liguori wrote:
>>
>>
>> >
>> > You're just ignoring what I've written.
>>
>> No, you're just impervious to my subtle attempt to refocus the discussion
on solving a practical problem.
>>
>> There's a lot of good, reasonably straight forward changes we can make
that have a high return on investment.
>>
>
> Is making qemu the authoritative source of configuration information a
straightforward change?  Is the return on it high?  Is the investment low?

I think this is where we fundamentally disagree.  My position is that QEMU
is already the authoritative source.  Having a state file doesn't change
anything.

Do a hot unplug of a network device with upstream libvirt with acpiphp
unloaded, consult libvirt and then consult the monitor to see who has the
right view of the guests config.

To me, that's the definition of authoritative.

> "No" to all three (ignoring for the moment whether it is good or not,
which we were debating).
>
>
>> The only suggestion I'm making beyond Marcelo's original patch is that we
use a structured format and that we make it possible to use the same file to
solve this problem in multiple places.
>>
>
> No, you're suggesting a lot more than that.

That's exactly what I'm suggesting from a technical perspective.

>> I don't think this creates a fundamental break in how management tools
interact with QEMU.  I don't think introducing RAID support in the block
layer is a reasonable alternative.
>>
>>
>
> Why not?

Because its a lot of complexity and code that can go wrong while only
solving the race for one specific case.  Not to mention that we double the
iop rate.

> Something that avoids the whole state thing altogether:
>
> - instead of atomically switching when live copy is done, keep on issuing
writes to both the origin and the live copy
> - issue a notification to management
> - management receives the notification, and issues an atomic blockdev
switch command

> this is really the RAID-1 solution but without the state file (credit
Dor).  An advantage is that there is no additional latency when trying to
catch up to the dirty bitmap.

It still suffers from the two generals problem.  You cannot solve this
without making one node reliable and that takes us back to it being either
QEMU (posted event and state file) or the management tool (sync event).

Regards,

Anthony Liguori

>
> --
> error compiling committee.c: too many arguments to function
>
Marcelo Tosatti - Feb. 28, 2011, 6:56 p.m.
On Mon, Feb 28, 2011 at 07:47:13PM +0200, Avi Kivity wrote:
> On 02/28/2011 07:33 PM, Anthony Liguori wrote:
> >
> >>
> >> You're just ignoring what I've written.
> >
> >No, you're just impervious to my subtle attempt to refocus the
> >discussion on solving a practical problem.
> >
> >There's a lot of good, reasonably straight forward changes we can
> >make that have a high return on investment.
> >
> 
> Is making qemu the authoritative source of configuration information
> a straightforward change?  Is the return on it high?  Is the
> investment low?
> 
> "No" to all three (ignoring for the moment whether it is good or
> not, which we were debating).
> 
> >The only suggestion I'm making beyond Marcelo's original patch is
> >that we use a structured format and that we make it possible to
> >use the same file to solve this problem in multiple places.
> >
> 
> No, you're suggesting a lot more than that.
> 
> >I don't think this creates a fundamental break in how management
> >tools interact with QEMU.  I don't think introducing RAID support
> >in the block layer is a reasonable alternative.
> >
> >
> 
> Why not?
> 
> Something that avoids the whole state thing altogether:
> 
> - instead of atomically switching when live copy is done, keep on
> issuing writes to both the origin and the live copy
> - issue a notification to management
> - management receives the notification, and issues an atomic
> blockdev switch command

How do you know if QEMU performed the switch or not? That is,
how can the switch command be atomic?

> this is really the RAID-1 solution but without the state file
> (credit Dor).  An advantage is that there is no additional latency
> when trying to catch up to the dirty bitmap.

So you're implementing a virtual block driver not visible to the user as
an image format. The image format would allow storage of persistent copy
progress, etc. so you lose all that.

All of that to avoid introducing a commit file which is not part of
global qemu state.
Marcelo Tosatti - Feb. 28, 2011, 7:09 p.m.
On Sat, Feb 26, 2011 at 07:45:44AM -0600, Anthony Liguori wrote:
> >>>+- "commit_filename": target commit filename (json-string, optional)
> >>I think we should drop this.
> >Why? Sorry but this can't wait for non-config persistent storage. This
> >mistake was made in the past with irqchip for example, lets not repeat
> >it.
> >
> >Its OK to deprecate "commit_filename" in favour of its location in
> >non-config persistent storage.
> >
> >Its not the end of the world for a mgmt app to handle change (not saying
> >its not a good principle) such as this.
> 
> Even as a one off, it's not a very good solution to the problem.
> We'd be way better of just having nothing here than using the commit
> file.  What are the semantics of a half written file?  How does a
> management tool detect a half written file?

If the commit file contains the full commit message, it can be
considered valid. Otherwise, it should be considered invalid.

Stopping the guest and waiting for mgmt to issue a continue command is a
solution, but it has drawbacks introduced by reliance on mgmt app (what 
if mgmt app crashes, latency, etc). 

But it seems that it is preferred over a commit file. 

Addressing the other comments in the meantime, thanks for input.
Marcelo Tosatti - March 1, 2011, 2:35 a.m.
On Sat, Feb 26, 2011 at 07:45:44AM -0600, Anthony Liguori wrote:
> >>>+- "filename": target image filename (json-string)
> >>Is this a created image?  Is this an image to create?
> >A previously created image.
> >
> >>To future proof for blockdev, we should make this argument optional
> >>and if it's not specified throw an error about missing argument.
> >>This let's us introduce an optional blockdev argument such that we
> >>can use a blockdev name.
> >What you mean "blockdev"?
> 
> -drive file=image.qcow2,if=none,id=foo
> 
> 'foo' is a named blockdev.  We don't have a way to add these through
> the monitor (yet) but we will for 0.15.

Fail to see the point of having a named blockdev to specify the target?

> >>If I resume the
> >>operation with the incremental flag set, will it Just Work?
> >As in:
> >
> >- block_copy ide0-hd1 /mnt/aa.img
> >- block_copy ide0-hd1 /mnt/aa.img -i
> >
> >?
> >
> >The second command will fail with QERR_BLOCKCOPY_IN_PROGRESS.
> 
> No, as in:
> 
> block_copy ide0-hd1 /mnt/aa.img
> block_copy_cancel ide0-h1
> block_copy ide0-h1 /mnt/aa.img -i
> 
> Does it pick up where it left off or does it start all over again?

It starts all over again.
Dor Laor - March 1, 2011, 8:59 a.m.
On 02/28/2011 08:12 PM, Anthony Liguori wrote:
>
> On Feb 28, 2011 11:47 AM, "Avi Kivity" <avi@redhat.com
> <mailto:avi@redhat.com>> wrote:
>  >
>  > On 02/28/2011 07:33 PM, Anthony Liguori wrote:
>  >>
>  >>
>  >> >
>  >> > You're just ignoring what I've written.
>  >>
>  >> No, you're just impervious to my subtle attempt to refocus the
> discussion on solving a practical problem.
>  >>
>  >> There's a lot of good, reasonably straight forward changes we can
> make that have a high return on investment.
>  >>
>  >
>  > Is making qemu the authoritative source of configuration information
> a straightforward change?  Is the return on it high?  Is the investment low?
>
> I think this is where we fundamentally disagree.  My position is that
> QEMU is already the authoritative source.  Having a state file doesn't
> change anything.
>
> Do a hot unplug of a network device with upstream libvirt with acpiphp
> unloaded, consult libvirt and then consult the monitor to see who has
> the right view of the guests config.
>
> To me, that's the definition of authoritative.
>
>  > "No" to all three (ignoring for the moment whether it is good or not,
> which we were debating).
>  >
>  >
>  >> The only suggestion I'm making beyond Marcelo's original patch is
> that we use a structured format and that we make it possible to use the
> same file to solve this problem in multiple places.
>  >>
>  >
>  > No, you're suggesting a lot more than that.
>
> That's exactly what I'm suggesting from a technical perspective.
>
>  >> I don't think this creates a fundamental break in how management
> tools interact with QEMU.  I don't think introducing RAID support in the
> block layer is a reasonable alternative.
>  >>
>  >>
>  >
>  > Why not?
>
> Because its a lot of complexity and code that can go wrong while only
> solving the race for one specific case.  Not to mention that we double
> the iop rate.
>
>  > Something that avoids the whole state thing altogether:
>  >
>  > - instead of atomically switching when live copy is done, keep on
> issuing writes to both the origin and the live copy
>  > - issue a notification to management
>  > - management receives the notification, and issues an atomic blockdev
> switch command
>
>  > this is really the RAID-1 solution but without the state file (credit
> Dor).  An advantage is that there is no additional latency when trying
> to catch up to the dirty bitmap.
>
> It still suffers from the two generals problem.  You cannot solve this
> without making one node reliable and that takes us back to it being
> either QEMU (posted event and state file) or the management tool (sync
> event).

It is safe w/o a state file by changing the basic live copy algorithm:

1. Live copy in progress stage
    Once live copy command is issued, a dirty bit map is created for
    tracking. There is a single pass over the entire image where we copy
    blocks from the src to the dst.

    Write commands for blocks that were already copied will be done
    twice for the src and dst.

    Once the full copy single pass ends, we trigger a QMP event that
    this stage can end.

    The live copy stage keeps running till the management issue a switch
    command. When it will happen, the switch is immediate and no need to
    copy additional blocks (but flush pending IOs).

2. Management sends a switch command.
    Qemu stops the doubling the IO and switches to the destination.
    End.

Now let's see the error case:
- qemu failure over stage #1
   No matter what happens, the management will start qemu with the
   source image. The destination will be erased, no matter how much we
   copied.
- management failure over stage #1
   The new mgmt daemon needs to query qemu's status.
   Management can continue as before.
- qemu+mgmt failure at stage #1
   The management should just run qemu with the source image.
- mgmt failure post sending stage #2 command.
   The mgmt DB states that we switched, just need to connect to qemu.
- qemu failure before/after getting the stage #2 event.
   Management will need just to execute new qemu with the dst image
- Failure of both qemu & mgmt in stage #2
   The same as above.

Pros:
  - Fast switch over time, minimal latency
  - No external storage/config needed
  - No need to wait for mgmt

Thanks,
Dor

>
> Regards,
>
> Anthony Liguori
>
>  >
>  > --
>  > error compiling committee.c: too many arguments to function
>  >
>
Avi Kivity - March 1, 2011, 9:39 a.m.
On 02/28/2011 08:12 PM, Anthony Liguori wrote:
>
>
> On Feb 28, 2011 11:47 AM, "Avi Kivity" <avi@redhat.com 
> <mailto:avi@redhat.com>> wrote:
> >
> > On 02/28/2011 07:33 PM, Anthony Liguori wrote:
> >>
> >>
> >> >
> >> > You're just ignoring what I've written.
> >>
> >> No, you're just impervious to my subtle attempt to refocus the 
> discussion on solving a practical problem.
> >>
> >> There's a lot of good, reasonably straight forward changes we can 
> make that have a high return on investment.
> >>
> >
> > Is making qemu the authoritative source of configuration information 
> a straightforward change?  Is the return on it high?  Is the 
> investment low?
>
> I think this is where we fundamentally disagree.  My position is that 
> QEMU is already the authoritative source.  Having a state file doesn't 
> change anything.
>
> Do a hot unplug of a network device with upstream libvirt with acpiphp 
> unloaded, consult libvirt and then consult the monitor to see who has 
> the right view of the guests config.
>

libvirt is right and the monitor is wrong.

On real hardware, calling _EJ0 doesn't affect the configuration one 
little bit (if I understand it correctly).  It just turns off power to 
the slot.  If you power-cycle, the card will be there.

In the real world, the authoritative source of configuration is a human 
with a screwdriver.  The virtualized equivalent is the management tool.

> To me, that's the definition of authoritative.
>
> > "No" to all three (ignoring for the moment whether it is good or 
> not, which we were debating).
> >
> >
> >> The only suggestion I'm making beyond Marcelo's original patch is 
> that we use a structured format and that we make it possible to use 
> the same file to solve this problem in multiple places.
> >>
> >
> > No, you're suggesting a lot more than that.
>
> That's exactly what I'm suggesting from a technical perspective.
>

Unless I'm hallucinating, you're suggesting quite a bit more.  A 
revolution in how qemu is to be managed.

> >> I don't think this creates a fundamental break in how management 
> tools interact with QEMU.  I don't think introducing RAID support in 
> the block layer is a reasonable alternative.
> >>
> >>
> >
> > Why not?
>
> Because its a lot of complexity and code that can go wrong while only 
> solving the race for one specific case.  Not to mention that we double 
> the iop rate.
>

IMO it's of similar complexity.  The number of I/Os don't change (reads 
stay the same, and any write that has already been mirrored needs to be 
re-mirrored in both cases.  We do gain lower latency switchover and we 
package the code as a block format driver instead of core block code.  
We decouple the dependencies from live migration.

> > Something that avoids the whole state thing altogether:
> >
> > - instead of atomically switching when live copy is done, keep on 
> issuing writes to both the origin and the live copy
> > - issue a notification to management
> > - management receives the notification, and issues an atomic 
> blockdev switch command
>
> > this is really the RAID-1 solution but without the state file 
> (credit Dor).  An advantage is that there is no additional latency 
> when trying to catch up to the dirty bitmap.
>
> It still suffers from the two generals problem.  You cannot solve this 
> without making one node reliable and that takes us back to it being 
> either QEMU (posted event and state file) or the management tool (sync 
> event).
>
>

It works without either.  If qemu fails, you simply re-mirror 
everything.  If the management tool fails, it re-subscribes to the 
mirror-complete event, queries whether it already happened in its 
absence, and if it did, requests the switchover.
Avi Kivity - March 1, 2011, 9:45 a.m.
On 02/28/2011 08:56 PM, Marcelo Tosatti wrote:
> >
> >  Something that avoids the whole state thing altogether:
> >
> >  - instead of atomically switching when live copy is done, keep on
> >  issuing writes to both the origin and the live copy
> >  - issue a notification to management
> >  - management receives the notification, and issues an atomic
> >  blockdev switch command
>
> How do you know if QEMU performed the switch or not? That is,
> how can the switch command be atomic?

If you fail while qemu processes the command, you query it to see what 
it's current state.  You can simply re-issue the command; it's idempotent.

If qemu fails before you issue the switch you discard the new copy.  If 
it fails after the switch (whether it acked or not) you discard the 
original.

> >  this is really the RAID-1 solution but without the state file
> >  (credit Dor).  An advantage is that there is no additional latency
> >  when trying to catch up to the dirty bitmap.
>
> So you're implementing a virtual block driver not visible to the user as
> an image format. The image format would allow storage of persistent copy
> progress, etc. so you lose all that.
>
> All of that to avoid introducing a commit file which is not part of
> global qemu state.

I think it has other advantages as well - code isolation, live migration 
compatibility, reduced switchover times.

Really the bad thing about the commit file is its un-documented-ness and 
ad-hoc nature which means the management tool is unlikely to get things 
right.

Qemu maintaining state is fine.  It already does - in block devices, and 
should do more.  We just have to take the right approach to do it (and 
be careful not to make it manage configuration).
Anthony Liguori - March 1, 2011, 3:51 p.m.
On 03/01/2011 04:39 AM, Avi Kivity wrote:
> On 02/28/2011 08:12 PM, Anthony Liguori wrote:
>>
>>
>> On Feb 28, 2011 11:47 AM, "Avi Kivity" <avi@redhat.com 
>> <mailto:avi@redhat.com>> wrote:
>> >
>> > On 02/28/2011 07:33 PM, Anthony Liguori wrote:
>> >>
>> >>
>> >> >
>> >> > You're just ignoring what I've written.
>> >>
>> >> No, you're just impervious to my subtle attempt to refocus the 
>> discussion on solving a practical problem.
>> >>
>> >> There's a lot of good, reasonably straight forward changes we can 
>> make that have a high return on investment.
>> >>
>> >
>> > Is making qemu the authoritative source of configuration 
>> information a straightforward change?  Is the return on it high?  Is 
>> the investment low?
>>
>> I think this is where we fundamentally disagree.  My position is that 
>> QEMU is already the authoritative source.  Having a state file 
>> doesn't change anything.
>>
>> Do a hot unplug of a network device with upstream libvirt with 
>> acpiphp unloaded, consult libvirt and then consult the monitor to see 
>> who has the right view of the guests config.
>>
>
> libvirt is right and the monitor is wrong.
>
> On real hardware, calling _EJ0 doesn't affect the configuration one 
> little bit (if I understand it correctly).  It just turns off power to 
> the slot.  If you power-cycle, the card will be there.

It's up to the hardware vendor.  Since it's ACPI, it can result in any 
number of operations.  Usually, there's some logic to flip on an LED or 
something.

There's nothing that prevents a vendor from ejecting the card.  My point 
is that there aren't cleanly separated lines in the real world.

>> To me, that's the definition of authoritative.
>>
>> > "No" to all three (ignoring for the moment whether it is good or 
>> not, which we were debating).
>> >
>> >
>> >> The only suggestion I'm making beyond Marcelo's original patch is 
>> that we use a structured format and that we make it possible to use 
>> the same file to solve this problem in multiple places.
>> >>
>> >
>> > No, you're suggesting a lot more than that.
>>
>> That's exactly what I'm suggesting from a technical perspective.
>>
>
> Unless I'm hallucinating, you're suggesting quite a bit more.  A 
> revolution in how qemu is to be managed.

Let me take another route to see if I can't persuade you.

First, let's clarify your proposal.  You want to introduce a new block 
format
that references to block devices.  It may also store a dirty bitmap to keep
track of which blocks are out of sync.  Hopefully, it goes without saying
that the dirty bitmap is strictly optional (it's a performance 
optimization) so
let's ignore it.

Your format, as a text file, looks like:

[raid1]
primary=diska.img
secondary=diskb.img
active=primary

To use it, here's the sequence:

0) qemu uses disk A for a block device

1) create a raid1 block device pointing to disk A and disk B.

2) management tool asks qemu to us the new raid1 block device.

3) qemu acks (2)

4) at some point, the mirror completes, writes are going to both disks

5) qemu sends out an event indicating that the disks are in sync

6) management tool then sends a command to fail over to disk B

7) qemu acks (6)

We're making the management tool the "authoritative" source of how to launch
QEMU.  That means that the management tool ultimately determines which 
command
line to relaunch QEMU with.

Here are the races:

A) If QEMU crashes between (2) and (3), it may have issues a write to 
the new
    raid1 block device before the management tool sees (3).  If this 
happens,
    when the management tool restarts QEMU with disk A, we're left with a
    dangling raid1 block device.  Not a critical failure, but not ideal.

B) If QEMU crashes between (6) and (7), QEMU may have started writing to 
disk
    B before the management tool sees (7).  This means that the 
management tool
    will create the guest with the raid1 block device which no longer is the
    correct disk.  This could fail in subtly bad ways.  Depending on how 
read
    is implemented (if you try to do striping for instance), bad data 
could be
    returned.  You could try to implement a policy of always reading 
from B if
    the block has been copied but this gets harry really quickly.  It's
    definitely not RAID1 anymore.

You may observe that the problem is not the RAID1 mechanism, but 
changing from
using a normal device and the RAID1 mechanism.  It would then be wise to 
say,
let's always use this image format.  Since that eliminates the race, we 
don't
really need the copy bitmap anymore.

Now we're left with a simple format that just refers to two filenames.  
However,
block devices are more than just a filename.  It needs a format, cache
settings, etc.  So let's put this all in the RAID1 block format.  We 
also need
a way to indicate which block device is selected.

Let's make it a text file for purposes of discussion.  It will look 
something
like:

[primary]
filename=diska.img
cache=none
format=raw

[secondary]
filename=diskb.img
cache=writethrough
format=qcow2

[global]
active=primary

Since we might want to mirror multiple drives at once, we should probablyn
support having multiple drives configured which means we need to not 
just have
a single active entry, but an entry associated with a particular device.

[drive "diskA"]
filename=diska.img
cache=none
format=raw

[drive "diskB"]
filename=diskb.img
cache=writethrough
format=qcow2

[device "vda"]
drive=diskB

And this is exactly what I'm proposing.  It's really the natural 
generalization
of what you're proposing.

So basically, the only differences are:

  1) always use the new RAID1 format
  2) drop the progress bitmap
  3) support multiple devices per file
  4) let drive properties be specified beyond filename

All reasonable things to do.

Regards,

Anthony Liguori
Dor Laor - March 1, 2011, 10:27 p.m.
On 03/01/2011 05:51 PM, Anthony Liguori wrote:
> On 03/01/2011 04:39 AM, Avi Kivity wrote:
>> On 02/28/2011 08:12 PM, Anthony Liguori wrote:
>>>
>>>
>>> On Feb 28, 2011 11:47 AM, "Avi Kivity" <avi@redhat.com
>>> <mailto:avi@redhat.com>> wrote:
>>> >
>>> > On 02/28/2011 07:33 PM, Anthony Liguori wrote:
>>> >>
>>> >>
>>> >> >
>>> >> > You're just ignoring what I've written.
>>> >>
>>> >> No, you're just impervious to my subtle attempt to refocus the
>>> discussion on solving a practical problem.
>>> >>
>>> >> There's a lot of good, reasonably straight forward changes we can
>>> make that have a high return on investment.
>>> >>
>>> >
>>> > Is making qemu the authoritative source of configuration
>>> information a straightforward change? Is the return on it high? Is
>>> the investment low?
>>>
>>> I think this is where we fundamentally disagree. My position is that
>>> QEMU is already the authoritative source. Having a state file doesn't
>>> change anything.
>>>
>>> Do a hot unplug of a network device with upstream libvirt with
>>> acpiphp unloaded, consult libvirt and then consult the monitor to see
>>> who has the right view of the guests config.
>>>
>>
>> libvirt is right and the monitor is wrong.
>>
>> On real hardware, calling _EJ0 doesn't affect the configuration one
>> little bit (if I understand it correctly). It just turns off power to
>> the slot. If you power-cycle, the card will be there.
>
> It's up to the hardware vendor. Since it's ACPI, it can result in any
> number of operations. Usually, there's some logic to flip on an LED or
> something.
>
> There's nothing that prevents a vendor from ejecting the card. My point
> is that there aren't cleanly separated lines in the real world.
>
>>> To me, that's the definition of authoritative.
>>>
>>> > "No" to all three (ignoring for the moment whether it is good or
>>> not, which we were debating).
>>> >
>>> >
>>> >> The only suggestion I'm making beyond Marcelo's original patch is
>>> that we use a structured format and that we make it possible to use
>>> the same file to solve this problem in multiple places.
>>> >>
>>> >
>>> > No, you're suggesting a lot more than that.
>>>
>>> That's exactly what I'm suggesting from a technical perspective.
>>>
>>
>> Unless I'm hallucinating, you're suggesting quite a bit more. A
>> revolution in how qemu is to be managed.
>
> Let me take another route to see if I can't persuade you.
>
> First, let's clarify your proposal. You want to introduce a new block
> format

No. That was Avi's initial proposal, after we talked we realized that it 
is not needed and we can use plain files w/o any new configuration.
Pretty much similar to what you're proposing below, just w/o the 
configuration files.

> that references to block devices. It may also store a dirty bitmap to keep
> track of which blocks are out of sync. Hopefully, it goes without saying
> that the dirty bitmap is strictly optional (it's a performance
> optimization) so
> let's ignore it.
>
> Your format, as a text file, looks like:
>
> [raid1]
> primary=diska.img
> secondary=diskb.img
> active=primary
>
> To use it, here's the sequence:
>
> 0) qemu uses disk A for a block device
>
> 1) create a raid1 block device pointing to disk A and disk B.
>
> 2) management tool asks qemu to us the new raid1 block device.
>
> 3) qemu acks (2)
>
> 4) at some point, the mirror completes, writes are going to both disks
>
> 5) qemu sends out an event indicating that the disks are in sync
>
> 6) management tool then sends a command to fail over to disk B
>
> 7) qemu acks (6)

7) is not a must when there is no raid.

>
> We're making the management tool the "authoritative" source of how to
> launch
> QEMU. That means that the management tool ultimately determines which
> command
> line to relaunch QEMU with.

This is what we have today regardless of live copy. How else would you 
track many hot plug/unplug operations and live migration afterwards?
For enterprise usage, that's the best case. It's also true for a single 
host w/ libvirt and virt-manager.

>
> Here are the races:
>
> A) If QEMU crashes between (2) and (3), it may have issues a write to
> the new
> raid1 block device before the management tool sees (3). If this happens,
> when the management tool restarts QEMU with disk A, we're left with a
> dangling raid1 block device. Not a critical failure, but not ideal.

Once there is no raid there is no race.

>
> B) If QEMU crashes between (6) and (7), QEMU may have started writing to
> disk
> B before the management tool sees (7). This means that the management tool
> will create the guest with the raid1 block device which no longer is the
> correct disk. This could fail in subtly bad ways. Depending on how read
> is implemented (if you try to do striping for instance), bad data could be
> returned. You could try to implement a policy of always reading from B if
> the block has been copied but this gets harry really quickly. It's
> definitely not RAID1 anymore.

Exactly! Drop the raid and always read from B post #6.
This is what I was suggesting before.

>
> You may observe that the problem is not the RAID1 mechanism, but
> changing from
> using a normal device and the RAID1 mechanism. It would then be wise to
> say,
> let's always use this image format. Since that eliminates the race, we
> don't
> really need the copy bitmap anymore.
>
> Now we're left with a simple format that just refers to two filenames.

Ok, looks good. A management app won't need the files below since it 
manages everything by its own.

> However,
> block devices are more than just a filename. It needs a format, cache
> settings, etc. So let's put this all in the RAID1 block format. We also
> need
> a way to indicate which block device is selected.
>
> Let's make it a text file for purposes of discussion. It will look
> something
> like:
>
> [primary]
> filename=diska.img
> cache=none
> format=raw
>
> [secondary]
> filename=diskb.img
> cache=writethrough
> format=qcow2
>
> [global]
> active=primary
>
> Since we might want to mirror multiple drives at once, we should probablyn
> support having multiple drives configured which means we need to not
> just have
> a single active entry, but an entry associated with a particular device.
>
> [drive "diskA"]
> filename=diska.img
> cache=none
> format=raw
>
> [drive "diskB"]
> filename=diskb.img
> cache=writethrough
> format=qcow2
>
> [device "vda"]
> drive=diskB
>
> And this is exactly what I'm proposing. It's really the natural
> generalization
> of what you're proposing.
>
> So basically, the only differences are:
>
> 1) always use the new RAID1 format
> 2) drop the progress bitmap
> 3) support multiple devices per file
> 4) let drive properties be specified beyond filename
>
> All reasonable things to do.
>
> Regards,
>
> Anthony Liguori
>
Anthony Liguori - March 2, 2011, 12:39 p.m.
On 03/01/2011 03:59 AM, Dor Laor wrote:
> On 02/28/2011 08:12 PM, Anthony Liguori wrote:
>>
>> On Feb 28, 2011 11:47 AM, "Avi Kivity" <avi@redhat.com
>> <mailto:avi@redhat.com>> wrote:
>> >
>> > On 02/28/2011 07:33 PM, Anthony Liguori wrote:
>> >>
>> >>
>> >> >
>> >> > You're just ignoring what I've written.
>> >>
>> >> No, you're just impervious to my subtle attempt to refocus the
>> discussion on solving a practical problem.
>> >>
>> >> There's a lot of good, reasonably straight forward changes we can
>> make that have a high return on investment.
>> >>
>> >
>> > Is making qemu the authoritative source of configuration information
>> a straightforward change?  Is the return on it high?  Is the 
>> investment low?
>>
>> I think this is where we fundamentally disagree.  My position is that
>> QEMU is already the authoritative source.  Having a state file doesn't
>> change anything.
>>
>> Do a hot unplug of a network device with upstream libvirt with acpiphp
>> unloaded, consult libvirt and then consult the monitor to see who has
>> the right view of the guests config.
>>
>> To me, that's the definition of authoritative.
>>
>> > "No" to all three (ignoring for the moment whether it is good or not,
>> which we were debating).
>> >
>> >
>> >> The only suggestion I'm making beyond Marcelo's original patch is
>> that we use a structured format and that we make it possible to use the
>> same file to solve this problem in multiple places.
>> >>
>> >
>> > No, you're suggesting a lot more than that.
>>
>> That's exactly what I'm suggesting from a technical perspective.
>>
>> >> I don't think this creates a fundamental break in how management
>> tools interact with QEMU.  I don't think introducing RAID support in the
>> block layer is a reasonable alternative.
>> >>
>> >>
>> >
>> > Why not?
>>
>> Because its a lot of complexity and code that can go wrong while only
>> solving the race for one specific case.  Not to mention that we double
>> the iop rate.
>>
>> > Something that avoids the whole state thing altogether:
>> >
>> > - instead of atomically switching when live copy is done, keep on
>> issuing writes to both the origin and the live copy
>> > - issue a notification to management
>> > - management receives the notification, and issues an atomic blockdev
>> switch command
>>
>> > this is really the RAID-1 solution but without the state file (credit
>> Dor).  An advantage is that there is no additional latency when trying
>> to catch up to the dirty bitmap.
>>
>> It still suffers from the two generals problem.  You cannot solve this
>> without making one node reliable and that takes us back to it being
>> either QEMU (posted event and state file) or the management tool (sync
>> event).
>
> It is safe w/o a state file by changing the basic live copy algorithm:
>
> 1. Live copy in progress stage
>    Once live copy command is issued, a dirty bit map is created for
>    tracking. There is a single pass over the entire image where we copy
>    blocks from the src to the dst.
>
>    Write commands for blocks that were already copied will be done
>    twice for the src and dst.
>
>    Once the full copy single pass ends, we trigger a QMP event that
>    this stage can end.
>
>    The live copy stage keeps running till the management issue a switch
>    command. When it will happen, the switch is immediate and no need to
>    copy additional blocks (but flush pending IOs).
>
> 2. Management sends a switch command.
>    Qemu stops the doubling the IO and switches to the destination.
>    End.

Here is where your race is:

2. Management sends a switch command

3. QEMU receives switch command

4. QEMU stops doubling IO and switches to the destination

5. QEMU sends acknowledgement of switch command

6. Management receives acknowledge of switch command

7. Management changes internal state definition to reflect the new 
destination

If QEMU or the management tool crashes after step 4 and before step 6, 
when the management tool restarts QEMU with the source image, data loss 
will have occurred (and potentially corruption if a flush had happened).

This all boils down to the Two Generals Problem[1].  It's simply not 
fixable without making one end reliable and that means that someone 
needs to fsync() something *after* the switchover happens but before the 
first write happens.  That can be QEMU (Avi's RAID proposal and my state 
file proposal) or it can be the management tool (if we introduce 
synchronous events).

[1] http://en.wikipedia.org/wiki/Two_Generals%27_Problem

Regards,

Anthony Liguori
Avi Kivity - March 2, 2011, 1 p.m.
On 03/02/2011 02:39 PM, Anthony Liguori wrote:
>
> Here is where your race is:
>
> 2. Management sends a switch command
>
> 3. QEMU receives switch command
>
> 4. QEMU stops doubling IO and switches to the destination
>
> 5. QEMU sends acknowledgement of switch command
>
> 6. Management receives acknowledge of switch command
>
> 7. Management changes internal state definition to reflect the new 
> destination
>
> If QEMU or the management tool crashes after step 4 and before step 6, 
> when the management tool restarts QEMU with the source image, data 
> loss will have occurred (and potentially corruption if a flush had 
> happened).

No.  After step 2, any qemu restart will be with the destination image.  
If the management tool restarts, it can query the state (or just 
re-issue the switch command, which is idempotent).

>
> This all boils down to the Two Generals Problem[1].  It's simply not 
> fixable without making one end reliable and that means that someone 
> needs to fsync() something *after* the switchover happens but before 
> the first write happens.  That can be QEMU (Avi's RAID proposal and my 
> state file proposal) or it can be the management tool (if we introduce 
> synchronous events).

The two problems are not equivalent.  Once the management tool receives 
acknowledgement that the switch occurred, the protocol terminates.
Anthony Liguori - March 2, 2011, 3:07 p.m.
On 03/02/2011 08:00 AM, Avi Kivity wrote:
> On 03/02/2011 02:39 PM, Anthony Liguori wrote:
>>
>> Here is where your race is:
>>
>> 2. Management sends a switch command
>>
>> 3. QEMU receives switch command
>>
>> 4. QEMU stops doubling IO and switches to the destination
>>
>> 5. QEMU sends acknowledgement of switch command
>>
>> 6. Management receives acknowledge of switch command
>>
>> 7. Management changes internal state definition to reflect the new 
>> destination
>>
>> If QEMU or the management tool crashes after step 4 and before step 
>> 6, when the management tool restarts QEMU with the source image, data 
>> loss will have occurred (and potentially corruption if a flush had 
>> happened).
>
> No.  After step 2, any qemu restart will be with the destination 
> image.  If the management tool restarts, it can query the state (or 
> just re-issue the switch command, which is idempotent).

Yeah, I think you're right, although I need to think through it a bit more.

Regards,

Anthony Liguori
Avi Kivity - March 2, 2011, 4:30 p.m.
On 03/01/2011 05:51 PM, Anthony Liguori wrote:
>>> Do a hot unplug of a network device with upstream libvirt with 
>>> acpiphp unloaded, consult libvirt and then consult the monitor to 
>>> see who has the right view of the guests config.
>>>
>>
>> libvirt is right and the monitor is wrong.
>>
>> On real hardware, calling _EJ0 doesn't affect the configuration one 
>> little bit (if I understand it correctly).  It just turns off power 
>> to the slot.  If you power-cycle, the card will be there.
>
> It's up to the hardware vendor.  Since it's ACPI, it can result in any 
> number of operations.  Usually, there's some logic to flip on an LED 
> or something.
>
> There's nothing that prevents a vendor from ejecting the card.  My 
> point is that there aren't cleanly separated lines in the real world.

We can implement out virtual hardware like real hardware, or we can do 
some new stuff, and break our management model in the process.

>>>
>>
>> Unless I'm hallucinating, you're suggesting quite a bit more.  A 
>> revolution in how qemu is to be managed.
>
> Let me take another route to see if I can't persuade you.
>
> First, let's clarify your proposal.  You want to introduce a new block 
> format
> that references to block devices.  It may also store a dirty bitmap to 
> keep
> track of which blocks are out of sync.  Hopefully, it goes without saying
> that the dirty bitmap is strictly optional (it's a performance 
> optimization) so
> let's ignore it.

(as was related elsewhere, the state is also optional)

>
> Your format, as a text file, looks like:
>
> [raid1]
> primary=diska.img
> secondary=diskb.img
> active=primary
>
> To use it, here's the sequence:
>
> 0) qemu uses disk A for a block device
>
> 1) create a raid1 block device pointing to disk A and disk B.
>
> 2) management tool asks qemu to us the new raid1 block device.
>
> 3) qemu acks (2)
>
> 4) at some point, the mirror completes, writes are going to both disks
>
> 5) qemu sends out an event indicating that the disks are in sync
>
> 6) management tool then sends a command to fail over to disk B
>
> 7) qemu acks (6)
>
> We're making the management tool the "authoritative" source of how to 
> launch
> QEMU.  That means that the management tool ultimately determines which 
> command
> line to relaunch QEMU with.
>
> Here are the races:
>
> A) If QEMU crashes between (2) and (3), it may have issues a write to 
> the new
>    raid1 block device before the management tool sees (3).  If this 
> happens,
>    when the management tool restarts QEMU with disk A, we're left with a
>    dangling raid1 block device.  Not a critical failure, but not ideal.

You can restart qemu with the RAID1 blockdev.

>
> B) If QEMU crashes between (6) and (7), QEMU may have started writing 
> to disk
>    B before the management tool sees (7).  This means that the 
> management tool
>    will create the guest with the raid1 block device which no longer 
> is the
>    correct disk.  This could fail in subtly bad ways.  Depending on 
> how read
>    is implemented (if you try to do striping for instance), bad data 
> could be
>    returned.  You could try to implement a policy of always reading 
> from B if
>    the block has been copied but this gets harry really quickly.  It's
>    definitely not RAID1 anymore.

As related elsewhere, you restart qemu with image B.

The trick is to partition the problem into idempotent commands; these 
allow you to recover from any failure.

>
> You may observe that the problem is not the RAID1 mechanism, but 
> changing from
> using a normal device and the RAID1 mechanism.  It would then be wise 
> to say,
> let's always use this image format.  Since that eliminates the race, 
> we don't
> really need the copy bitmap anymore.
>
> Now we're left with a simple format that just refers to two 
> filenames.  However,
> block devices are more than just a filename.  It needs a format, cache
> settings, etc.  So let's put this all in the RAID1 block format.  We 
> also need
> a way to indicate which block device is selected.
>
> Let's make it a text file for purposes of discussion.  It will look 
> something
> like:
>
> [primary]
> filename=diska.img
> cache=none
> format=raw
>
> [secondary]
> filename=diskb.img
> cache=writethrough
> format=qcow2
>
> [global]
> active=primary
>
> Since we might want to mirror multiple drives at once, we should 
> probablyn
> support having multiple drives configured which means we need to not 
> just have
> a single active entry, but an entry associated with a particular device.

Or you have one file per RAID-1 image set.  This is important because 
images are not associated with a qemu instance.  You can hot-unplug an 
image from one qemu and hot-plug it into another.

>
> [drive "diskA"]
> filename=diska.img
> cache=none
> format=raw
>
> [drive "diskB"]
> filename=diskb.img
> cache=writethrough
> format=qcow2
>
> [device "vda"]
> drive=diskB
>
> And this is exactly what I'm proposing. 

It's exactly what I'm opposing.  Making qemu manage all this stuff.

> It's really the natural generalization
> of what you're proposing.
>
> So basically, the only differences are:
>
>  1) always use the new RAID1 format
>  2) drop the progress bitmap
>  3) support multiple devices per file
>  4) let drive properties be specified beyond filename
>
> All reasonable things to do.

Well, I dislike 3, and the whole "qemu is authoritative source of 
configuration" thing.
Anthony Liguori - March 2, 2011, 9:55 p.m.
On 03/02/2011 11:30 AM, Avi Kivity wrote:
>> It's really the natural generalization
>> of what you're proposing.
>>
>> So basically, the only differences are:
>>
>>  1) always use the new RAID1 format
>>  2) drop the progress bitmap
>>  3) support multiple devices per file
>>  4) let drive properties be specified beyond filename
>>
>> All reasonable things to do.
>
> Well, I dislike 3, and the whole "qemu is authoritative source of 
> configuration" thing.

Yeah, at this point, I think the new no-stored-state approach is 
obviously better than anything proposed.  So we'll have to revisit this 
discussion if/when there's another use-case that demands it.

Regards,

Anthony Liguori

Patch

Index: qemu/block-copy.c
===================================================================
--- /dev/null
+++ qemu/block-copy.c
@@ -0,0 +1,741 @@ 
+/*
+ * QEMU live block copy
+ *
+ * Copyright (C) 2010 Red Hat Inc.
+ *
+ * Authors: Marcelo Tosatti <mtosatti@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "blockdev.h"
+#include "qemu-queue.h"
+#include "qemu-timer.h"
+#include "monitor.h"
+#include "block-copy.h"
+#include "migration.h"
+#include "sysemu.h"
+#include "qjson.h"
+#include <assert.h>
+
+#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
+#define MAX_IS_ALLOCATED_SEARCH 65536
+
+/*
+ * Stages:
+ *
+ * STAGE_BULK: bulk reads/writes in progress
+ * STAGE_BULK_FINISHED: bulk reads finished, bulk writes in progress
+ * STAGE_DIRTY: bulk writes finished, dirty reads/writes in progress
+ * STAGE_SWITCH_FINISHED: switched to new image.
+ */
+
+enum BdrvCopyStage {
+    STAGE_BULK,
+    STAGE_BULK_FINISHED,
+    STAGE_DIRTY,
+    STAGE_SWITCH_FINISHED,
+};
+
+typedef struct BdrvCopyState {
+    BlockDriverState *src;
+    BlockDriverState *dst;
+    bool shared_base;
+
+    int64_t curr_sector;
+    int64_t completed_sectors;
+    int64_t nr_sectors;
+
+    enum BdrvCopyStage stage;
+    int inflight_reads;
+    int error;
+    int failed;
+    int cancelled;
+    QLIST_HEAD(, BdrvCopyBlock) io_list;
+    unsigned long *aio_bitmap;
+    QEMUTimer *aio_timer;
+    QLIST_ENTRY(BdrvCopyState) list;
+
+    int64_t blocks;
+    int64_t total_time;
+
+    char src_device_name[32];
+    char dst_filename[1024];
+    int commit_fd;
+} BdrvCopyState;
+
+typedef struct BdrvCopyBlock {
+    BdrvCopyState *state;
+    uint8_t *buf;
+    int64_t sector;
+    int64_t nr_sectors;
+    struct iovec iov;
+    QEMUIOVector qiov;
+    BlockDriverAIOCB *aiocb;
+    int64_t time;
+    QLIST_ENTRY(BdrvCopyBlock) list;
+} BdrvCopyBlock;
+
+static QLIST_HEAD(, BdrvCopyState) block_copy_list =
+    QLIST_HEAD_INITIALIZER(block_copy_list);
+
+static void alloc_aio_bitmap(BdrvCopyState *s)
+{
+    BlockDriverState *bs = s->src;
+    int64_t bitmap_size;
+
+    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
+            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
+    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
+
+    s->aio_bitmap = qemu_mallocz(bitmap_size);
+}
+
+static bool aio_inflight(BdrvCopyState *s, int64_t sector)
+{
+    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+    if (s->aio_bitmap &&
+        (sector << BDRV_SECTOR_BITS) < bdrv_getlength(s->src)) {
+        return !!(s->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
+            (1UL << (chunk % (sizeof(unsigned long) * 8))));
+    } else {
+        return 0;
+    }
+}
+
+static void set_aio_inflight(BdrvCopyState *s, int64_t sector_num,
+                             int nb_sectors, int set)
+{
+    int64_t start, end;
+    unsigned long val, idx, bit;
+
+    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
+    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+    for (; start <= end; start++) {
+        idx = start / (sizeof(unsigned long) * 8);
+        bit = start % (sizeof(unsigned long) * 8);
+        val = s->aio_bitmap[idx];
+        if (set) {
+            if (!(val & (1UL << bit))) {
+                val |= 1UL << bit;
+            }
+        } else {
+            if (val & (1UL << bit)) {
+                val &= ~(1UL << bit);
+            }
+        }
+        s->aio_bitmap[idx] = val;
+    }
+}
+
+static void blkcopy_set_stage(BdrvCopyState *s, enum BdrvCopyStage stage)
+{
+    s->stage = stage;
+
+    switch (stage) {
+    case STAGE_BULK:
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK);
+        break;
+    case STAGE_BULK_FINISHED:
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK_FINISHED);
+        break;
+    case STAGE_DIRTY:
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_DIRTY);
+        break;
+    case STAGE_SWITCH_FINISHED:
+        BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED);
+        break;
+    default:
+        break;
+    }
+}
+
+static void blk_copy_handle_cb_error(BdrvCopyState *s, int ret)
+{
+    s->error = ret;
+    qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
+}
+
+static inline void add_avg_transfer_time(BdrvCopyState *s, int64_t time)
+{
+    s->blocks++;
+    s->total_time += time;
+}
+
+static void blk_copy_write_cb(void *opaque, int ret)
+{
+    BdrvCopyBlock *blk = opaque;
+    BdrvCopyState *s = blk->state;
+
+    if (ret < 0) {
+        QLIST_REMOVE(blk, list);
+        qemu_free(blk->buf);
+        qemu_free(blk);
+        blk_copy_handle_cb_error(s, ret);
+        return;
+    }
+
+    QLIST_REMOVE(blk, list);
+    add_avg_transfer_time(s, qemu_get_clock_ns(rt_clock) - blk->time);
+
+    /* schedule switch to STAGE_DIRTY on last bulk write completion */
+    if (blk->state->stage == STAGE_BULK_FINISHED) {
+        qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
+    }
+
+    if (blk->state->stage > STAGE_BULK_FINISHED) {
+        set_aio_inflight(blk->state, blk->sector, blk->nr_sectors, 0);
+    }
+
+    qemu_free(blk->buf);
+    qemu_free(blk);
+}
+
+static void blk_copy_issue_write(BdrvCopyState *s, BdrvCopyBlock *read_blk)
+{
+    BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
+    blk->state = s;
+    blk->sector = read_blk->sector;
+    blk->nr_sectors = read_blk->nr_sectors;
+    blk->time = read_blk->time;
+    blk->buf = read_blk->buf;
+    QLIST_INSERT_HEAD(&s->io_list, blk, list);
+
+    blk->iov.iov_base = read_blk->buf;
+    blk->iov.iov_len = read_blk->iov.iov_len;
+    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
+
+    BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_AIO_WRITE);
+    blk->aiocb = bdrv_aio_writev(s->dst, blk->sector, &blk->qiov,
+                                 blk->iov.iov_len / BDRV_SECTOR_SIZE,
+                                 blk_copy_write_cb, blk);
+    if (!blk->aiocb) {
+        s->error = 1;
+        goto error;
+    }
+
+    return;
+
+error:
+    QLIST_REMOVE(blk, list);
+    qemu_free(read_blk->buf);
+    qemu_free(blk);
+}
+
+static void blk_copy_read_cb(void *opaque, int ret)
+{
+    BdrvCopyBlock *blk = opaque;
+    BdrvCopyState *s = blk->state;
+
+    s->inflight_reads--;
+    if (ret < 0) {
+        QLIST_REMOVE(blk, list);
+        qemu_free(blk->buf);
+        qemu_free(blk);
+        blk_copy_handle_cb_error(s, ret);
+        return;
+    }
+    blk_copy_issue_write(s, blk);
+    QLIST_REMOVE(blk, list);
+    qemu_free(blk);
+    qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
+}
+
+static void blk_copy_issue_read(BdrvCopyState *s, int64_t sector,
+                                int nr_sectors)
+{
+    BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
+    blk->buf = qemu_mallocz(BLOCK_SIZE);
+    blk->state = s;
+    blk->sector = sector;
+    blk->nr_sectors = nr_sectors;
+    QLIST_INSERT_HEAD(&s->io_list, blk, list);
+
+    blk->iov.iov_base = blk->buf;
+    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
+    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
+
+    s->inflight_reads++;
+    blk->time = qemu_get_clock_ns(rt_clock);
+    blk->aiocb = bdrv_aio_readv(s->src, sector, &blk->qiov, nr_sectors,
+                                blk_copy_read_cb, blk);
+    if (!blk->aiocb) {
+        s->error = 1;
+        goto error;
+    }
+
+    return;
+
+error:
+    s->inflight_reads--;
+    QLIST_REMOVE(blk, list);
+    qemu_free(blk->buf);
+    qemu_free(blk);
+}
+
+static bool blkcopy_can_switch(BdrvCopyState *s)
+{
+    int64_t remaining_dirty;
+    int64_t avg_transfer_time;
+
+    remaining_dirty = bdrv_get_dirty_count(s->src);
+    if (remaining_dirty == 0 || s->blocks == 0) {
+        return true;
+    }
+
+    avg_transfer_time = s->total_time / s->blocks;
+    if ((remaining_dirty * avg_transfer_time) <= migrate_max_downtime()) {
+        return true;
+    }
+    return false;
+}
+
+static int blk_issue_reads_dirty(BdrvCopyState *s)
+{
+    int64_t sector;
+
+    for (sector = s->curr_sector; sector < s->nr_sectors;) {
+        if (bdrv_get_dirty(s->src, sector) && !aio_inflight(s, sector)) {
+            int nr_sectors = MIN(s->nr_sectors - s->curr_sector,
+                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+            blk_copy_issue_read(s, sector, nr_sectors);
+            bdrv_reset_dirty(s->src, sector, nr_sectors);
+            set_aio_inflight(s, sector, nr_sectors, 1);
+            break;
+        }
+
+        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+        s->curr_sector = sector;
+    }
+
+    if (sector >= s->nr_sectors) {
+        s->curr_sector = 0;
+    }
+    return 0;
+}
+
+static int blk_issue_reads_bulk(BdrvCopyState *s)
+{
+    int nr_sectors;
+    int64_t curr_sector = s->curr_sector;
+
+    if (s->shared_base) {
+        while (curr_sector < s->nr_sectors &&
+                !bdrv_is_allocated(s->src, curr_sector,
+                                   MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
+                curr_sector += nr_sectors;
+        }
+    }
+
+    if (curr_sector >= s->nr_sectors) {
+        s->curr_sector = 0;
+        return 1;
+    }
+
+    curr_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
+    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+    blk_copy_issue_read(s, s->curr_sector, nr_sectors);
+    s->curr_sector += nr_sectors;
+    s->completed_sectors = curr_sector;
+    return 0;
+}
+
+static void blkcopy_finish(BdrvCopyState *s)
+{
+    int64_t sector;
+    uint8_t *buf;
+
+    buf = qemu_malloc(BLOCK_SIZE);
+
+    /* FIXME: speed up loop, get_next_dirty_block? */
+    for (sector = 0; sector < s->nr_sectors;
+         sector += BDRV_SECTORS_PER_DIRTY_CHUNK) {
+        if (bdrv_get_dirty(s->src, sector)) {
+            int nr_sectors = MIN(s->nr_sectors - sector,
+                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+            memset(buf, 0, BLOCK_SIZE);
+            if (bdrv_read(s->src, sector, buf, nr_sectors) < 0) {
+                goto error;
+            }
+            if (bdrv_write(s->dst, sector, buf, nr_sectors) < 0) {
+                goto error;
+            }
+            bdrv_reset_dirty(s->src, sector, nr_sectors);
+        }
+
+        if (bdrv_get_dirty_count(s->src) == 0)
+            break;
+    }
+    qemu_free(buf);
+    return;
+
+error:
+    qemu_free(buf);
+    s->error = 1;
+}
+
+static int write_commit_file(BdrvCopyState *s)
+{
+    char commit_msg[1400];
+    const char *buf = commit_msg;
+    int len, ret;
+
+    sprintf(commit_msg, "commit QEMU block_copy %s -> %s\n", s->src_device_name,
+                        s->dst_filename);
+
+    len = strlen(commit_msg);
+    while (len > 0) {
+        ret = write(s->commit_fd, buf, len);
+        if (ret == -1 && errno == EINTR) {
+            continue;
+        }
+        if (ret <= 0) {
+            return -errno;
+        }
+        buf += ret;
+        len -= ret;
+    }
+
+    if (fsync(s->commit_fd) == -1) {
+        return -errno;
+    }
+
+    return 0;
+}
+
+static void blkcopy_cleanup(BdrvCopyState *s)
+{
+    assert(s->inflight_reads == 0);
+    assert(QLIST_EMPTY(&s->io_list));
+    bdrv_set_dirty_tracking(s->src, 0);
+    drive_put_ref(drive_get_by_blockdev(s->src));
+    bdrv_set_in_use(s->src, 0);
+    if (s->stage >= STAGE_DIRTY)
+        qemu_free(s->aio_bitmap);
+    qemu_del_timer(s->aio_timer);
+    if (s->commit_fd)
+        close(s->commit_fd);
+}
+
+static void blkcopy_free(BdrvCopyState *s)
+{
+    QLIST_REMOVE(s, list);
+    qemu_free(s);
+}
+
+static void handle_error(BdrvCopyState *s)
+{
+    if (!QLIST_EMPTY(&s->io_list))
+        return;
+    s->failed = 1;
+    blkcopy_cleanup(s);
+}
+
+static void blkcopy_switch(BdrvCopyState *s)
+{
+    char src_filename[1024];
+    int open_flags;
+
+    strncpy(src_filename, s->src->filename, sizeof(src_filename));
+    open_flags = s->src->open_flags;
+
+    assert(s->stage == STAGE_DIRTY);
+
+    vm_stop(VMSTOP_BLOCKCOPY);
+    /* flush any guest writes, dirty bitmap uptodate after this.
+     * copy AIO also finished.
+     */
+    qemu_aio_flush();
+    assert(QLIST_EMPTY(&s->io_list));
+    if (s->error) {
+        handle_error(s);
+        goto vm_start;
+    }
+    blkcopy_finish(s);
+    if (s->error) {
+        handle_error(s);
+        goto vm_start;
+    }
+    assert(bdrv_get_dirty_count(s->src) == 0);
+    bdrv_flush_all();
+
+    bdrv_close(s->src);
+    bdrv_close(s->dst);
+    if (bdrv_open(s->src, s->dst->filename, s->src->open_flags, NULL) < 0) {
+        s->failed = 1;
+        goto err;
+    }
+
+    if (s->commit_fd && write_commit_file(s)) {
+        s->failed = 1;
+        bdrv_close(s->src);
+        goto err;
+    }
+
+    blkcopy_set_stage(s, STAGE_SWITCH_FINISHED);
+    blkcopy_cleanup(s);
+vm_start:
+    vm_start();
+    return;
+
+err:
+    if (bdrv_open(s->src, src_filename, open_flags, NULL) < 0) {
+        error_report("%s: %s: cannot fallback to source image\n", __func__,
+                     s->dst_filename);
+        abort();
+    }
+    blkcopy_cleanup(s);
+    goto vm_start;
+}
+
+#define BLKCOPY_INFLIGHT 2
+
+/*
+ * To simplify the implementation, the IO completion callbacks do not
+ * handle stage control or submit IO for further blocks. A timer is used
+ * for such purpose.
+ */
+
+static void aio_timer(void *opaque)
+{
+    BdrvCopyState *s = opaque;
+
+    assert(s->cancelled == 0);
+
+    if (s->error) {
+        handle_error(s);
+        return;
+    }
+
+    while (s->stage == STAGE_BULK) {
+        if (s->inflight_reads >= BLKCOPY_INFLIGHT) {
+            break;
+        }
+        if (blk_issue_reads_bulk(s)) {
+            blkcopy_set_stage(s, STAGE_BULK_FINISHED);
+        }
+    }
+
+    if (s->stage == STAGE_BULK_FINISHED) {
+        if (QLIST_EMPTY(&s->io_list)) {
+            blkcopy_set_stage(s, STAGE_DIRTY);
+            alloc_aio_bitmap(s);
+        }
+    }
+
+    while (s->stage == STAGE_DIRTY) {
+        if (s->inflight_reads >= BLKCOPY_INFLIGHT) {
+            break;
+        }
+        blk_issue_reads_dirty(s);
+        if (blkcopy_can_switch(s)) {
+            BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_SWITCH_START);
+            blkcopy_switch(s);
+            return;
+        }
+    }
+}
+
+static int bdrv_copy(Monitor *mon, const char * device, BlockDriverState *src,
+                     BlockDriverState *dst, const char *commit_file,
+                     bool shared_base)
+{
+    int64_t sectors;
+    BdrvCopyState *blkcopy, *safe;
+    int f;
+
+    QLIST_FOREACH_SAFE(blkcopy, &block_copy_list, list, safe) {
+        if (!strcmp(blkcopy->src_device_name, src->device_name)) {
+            if (blkcopy->stage == STAGE_SWITCH_FINISHED || blkcopy->failed) {
+                blkcopy_free(blkcopy);
+            } else {
+                qerror_report(QERR_BLOCKCOPY_IN_PROGRESS, src->device_name);
+                return -1;
+            }
+        }
+    }
+
+    sectors = bdrv_getlength(src) >> BDRV_SECTOR_BITS;
+    if (sectors != bdrv_getlength(dst) >> BDRV_SECTOR_BITS) {
+        qerror_report(QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS);
+        return -1;
+    }
+
+    if (commit_file) {
+        f = open(commit_file, O_CREAT|O_WRONLY, S_IRUSR);
+        if (f == -1) {
+            qerror_report(QERR_OPEN_FILE_FAILED, commit_file);
+            return -1;
+        }
+    }
+
+    blkcopy = qemu_mallocz(sizeof(BdrvCopyState));
+    blkcopy->src = src;
+    blkcopy->dst = dst;
+    blkcopy->curr_sector = 0;
+    blkcopy->nr_sectors = sectors;
+    blkcopy_set_stage(blkcopy, STAGE_BULK);
+    blkcopy->aio_timer = qemu_new_timer(rt_clock, aio_timer, blkcopy);
+    blkcopy->shared_base = shared_base;
+    blkcopy->commit_fd = f;
+    strncpy(blkcopy->src_device_name, blkcopy->src->device_name,
+            sizeof(blkcopy->src_device_name) - 1);
+    strncpy(blkcopy->dst_filename, blkcopy->dst->filename,
+            sizeof(blkcopy->dst_filename) - 1);
+
+    bdrv_set_dirty_tracking(src, 1);
+    qemu_mod_timer(blkcopy->aio_timer, qemu_get_clock(rt_clock));
+    drive_get_ref(drive_get_by_blockdev(src));
+    bdrv_set_in_use(src, 1);
+
+    QLIST_INSERT_HEAD(&block_copy_list, blkcopy, list);
+    return 0;
+}
+
+int do_bdrv_copy(Monitor *mon, const QDict *qdict, QObject **ret_data)
+{
+    const char *device = qdict_get_str(qdict, "device");
+    const char *filename = qdict_get_str(qdict, "filename");
+    const char *commit_file = qdict_get_try_str(qdict, "commit_filename");
+    bool shared_base = qdict_get_try_bool(qdict, "inc", 0);
+    BlockDriverState *new_bs, *bs;
+    int ret;
+
+    if (migration_active()) {
+        qerror_report(QERR_MIGRATION_IN_PROGRESS);
+        return -1;
+    }
+
+    bs = bdrv_find(device);
+    if (!bs) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, device);
+        return -1;
+    }
+
+    new_bs = bdrv_new("");
+    if (bdrv_open(new_bs, filename, bs->open_flags, NULL) < 0) {
+        bdrv_delete(new_bs);
+        qerror_report(QERR_OPEN_FILE_FAILED, filename);
+        return -1;
+    }
+
+    ret = bdrv_copy(mon, device, bs, new_bs, commit_file, shared_base);
+    return ret;
+}
+
+int do_bdrv_copy_cancel(Monitor *mon, const QDict *qdict, QObject **ret_data)
+{
+    BdrvCopyState *blkcopy, *s = NULL;
+    const char *device = qdict_get_str(qdict, "device");
+
+    QLIST_FOREACH(blkcopy, &block_copy_list, list) {
+        if (!strcmp(blkcopy->src_device_name, device)) {
+            s = blkcopy;
+            break;
+        }
+    }
+
+    if (!s) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, device);
+        return -1;
+    }
+
+    s->cancelled = 1;
+    do {
+        qemu_aio_flush();
+    } while (!QLIST_EMPTY(&s->io_list));
+    blkcopy_cleanup(s);
+    blkcopy_free(s);
+
+    return 0;
+}
+
+static void blockcopy_print_dict(QObject *obj, void *opaque)
+{
+    QDict *c_dict;
+    Monitor *mon = opaque;
+
+    c_dict = qobject_to_qdict(obj);
+
+    monitor_printf(mon, "%s: status=%s ",
+                        qdict_get_str(c_dict, "device"),
+                        qdict_get_str(c_dict, "status"));
+
+    if (qdict_haskey(c_dict, "info")) {
+        QDict *qdict = qobject_to_qdict(qdict_get(c_dict, "info"));
+
+        monitor_printf(mon, "percentage=%ld %%",
+                       qdict_get_int(qdict, "percentage"));
+    }
+
+    monitor_printf(mon, "\n");
+}
+
+void do_info_blockcopy_print(Monitor *mon, const QObject *data)
+{
+    qlist_iter(qobject_to_qlist(data), blockcopy_print_dict, mon);
+}
+
+void do_info_blockcopy(Monitor *mon, QObject **ret_data)
+{
+    QList *c_list;
+    BdrvCopyState *s;
+
+    c_list = qlist_new();
+
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        QObject *c_obj;
+        static const char *status[] = { "failed", "active", "completed" };
+        int i;
+
+        if (s->failed) {
+            i = 0;
+        } else if (s->stage < STAGE_SWITCH_FINISHED) {
+            i = 1;
+        } else {
+            i = 2;
+        }
+
+        c_obj = qobject_from_jsonf("{ 'device': %s, 'status': %s }",
+                                    s->src_device_name, status[i]);
+
+        if (i == 1) {
+            QDict *dict = qobject_to_qdict(c_obj);
+            QObject *obj;
+
+            /* FIXME: add dirty stage progress? */
+            obj = qobject_from_jsonf("{ 'percentage': %" PRId64 "}",
+                                     s->completed_sectors * 100 / s->nr_sectors);
+            qdict_put_obj(dict, "info", obj);
+        }
+        qlist_append_obj(c_list, c_obj);
+    }
+
+    *ret_data = QOBJECT(c_list);
+}
+
+bool block_copy_active(void)
+{
+    BdrvCopyState *s;
+
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        if (s->failed) {
+            continue;
+        }
+        if (s->stage < STAGE_SWITCH_FINISHED) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
Index: qemu/block-copy.h
===================================================================
--- /dev/null
+++ qemu/block-copy.h
@@ -0,0 +1,25 @@ 
+/*
+ * QEMU live block copy
+ *
+ * Copyright (C) 2010 Red Hat Inc.
+ *
+ * Authors: Marcelo Tosatti <mtosatti@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef BLOCK_COPY_H
+#define BLOCK_COPY_H
+
+int do_bdrv_copy(Monitor *mon, const QDict *qdict, QObject **ret_data);
+int do_bdrv_copy_cancel(Monitor *mon, const QDict *qdict, QObject **ret_data);
+
+void do_info_blockcopy_print(Monitor *mon, const QObject *data);
+void do_info_blockcopy(Monitor *mon, QObject **ret_data);
+
+bool block_copy_active(void);
+
+#endif /* BLOCK_COPY_H */
+
Index: qemu/hmp-commands.hx
===================================================================
--- qemu.orig/hmp-commands.hx
+++ qemu/hmp-commands.hx
@@ -806,6 +806,43 @@  Set maximum speed to @var{value} (in byt
 ETEXI
 
     {
+        .name       = "block_copy",
+        .args_type  = "device:s,filename:s,commit_filename:s?,inc:-i",
+        .params     = "device filename [commit_filename] [-i]",
+        .help       = "live block copy device to image"
+                      "\n\t\t\t optional commit filename "
+                      "\n\t\t\t -i for incremental copy "
+                      "(base image shared between src and destination)",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy,
+    },
+
+STEXI
+@item block_copy @var{device} @var{filename} [@var{commit_filename}] [-i]
+@findex block_copy
+Live copy block device @var{device} to image @var{filename}.
+        -i for incremental copy (base image is shared)
+
+Optionally a commit message is written to @var{commit_filename}
+once the switch to the new image is performed.
+ETEXI
+
+    {
+        .name       = "block_copy_cancel",
+        .args_type  = "device:s",
+        .params     = "device",
+        .help       = "cancel live block copy",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy_cancel,
+    },
+
+STEXI
+@item block_copy_cancel @var{device}
+@findex block_copy_cancel
+Cancel live block copy on @var{device}.
+ETEXI
+
+    {
         .name       = "migrate_set_downtime",
         .args_type  = "value:T",
         .params     = "value",
@@ -1352,6 +1389,8 @@  show device tree
 show qdev device model list
 @item info roms
 show roms
+@item info block-copy
+show block copy status
 @end table
 ETEXI
 
Index: qemu/monitor.c
===================================================================
--- qemu.orig/monitor.c
+++ qemu/monitor.c
@@ -45,6 +45,7 @@ 
 #include "balloon.h"
 #include "qemu-timer.h"
 #include "migration.h"
+#include "block-copy.h"
 #include "kvm.h"
 #include "acl.h"
 #include "qint.h"
@@ -3109,6 +3110,14 @@  static const mon_cmd_t info_cmds[] = {
     },
 #endif
     {
+        .name       = "block-copy",
+        .args_type  = "",
+        .params     = "",
+        .help       = "show block copy status",
+        .user_print = do_info_blockcopy_print,
+        .mhandler.info_new = do_info_blockcopy,
+    },
+    {
         .name       = NULL,
     },
 };
@@ -3250,6 +3259,14 @@  static const mon_cmd_t qmp_query_cmds[] 
         .mhandler.info_async = do_info_balloon,
         .flags      = MONITOR_CMD_ASYNC,
     },
+    {
+        .name       = "block-copy",
+        .args_type  = "",
+        .params     = "",
+        .help       = "show block copy status",
+        .user_print = do_info_blockcopy_print,
+        .mhandler.info_new = do_info_blockcopy,
+    },
     { /* NULL */ },
 };
 
Index: qemu/block.h
===================================================================
--- qemu.orig/block.h
+++ qemu/block.h
@@ -299,6 +299,13 @@  typedef enum {
     BLKDBG_CLUSTER_ALLOC_BYTES,
     BLKDBG_CLUSTER_FREE,
 
+    BLKDBG_BLKCOPY_STAGE_BULK,
+    BLKDBG_BLKCOPY_STAGE_BULK_FINISHED,
+    BLKDBG_BLKCOPY_STAGE_DIRTY,
+    BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED,
+    BLKDBG_BLKCOPY_SWITCH_START,
+    BLKDBG_BLKCOPY_AIO_WRITE,
+
     BLKDBG_EVENT_MAX,
 } BlkDebugEvent;
 
Index: qemu/block/blkdebug.c
===================================================================
--- qemu.orig/block/blkdebug.c
+++ qemu/block/blkdebug.c
@@ -178,6 +178,14 @@  static const char *event_names[BLKDBG_EV
     [BLKDBG_CLUSTER_ALLOC]                  = "cluster_alloc",
     [BLKDBG_CLUSTER_ALLOC_BYTES]            = "cluster_alloc_bytes",
     [BLKDBG_CLUSTER_FREE]                   = "cluster_free",
+
+
+    [BLKDBG_BLKCOPY_STAGE_BULK]             = "blkcopy_stage_bulk",
+    [BLKDBG_BLKCOPY_STAGE_BULK_FINISHED]    = "blkcopy_stage_bulk_finished",
+    [BLKDBG_BLKCOPY_STAGE_DIRTY]            = "blkcopy_stage_dirty",
+    [BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED]  = "blkcopy_stage_switch_finished",
+    [BLKDBG_BLKCOPY_SWITCH_START]           = "blkcopy_switch_start",
+    [BLKDBG_BLKCOPY_AIO_WRITE]              = "blkcopy_aio_write",
 };
 
 static int get_event_by_name(const char *name, BlkDebugEvent *event)
Index: qemu/qerror.c
===================================================================
--- qemu.orig/qerror.c
+++ qemu/qerror.c
@@ -209,6 +209,18 @@  static const QErrorStringTable qerror_ta
         .error_fmt = QERR_VNC_SERVER_FAILED,
         .desc      = "Could not start VNC server on %(target)",
     },
+    {
+        .error_fmt = QERR_BLOCKCOPY_IN_PROGRESS,
+        .desc      = "Block copy for %(device) in progress",
+    },
+    {
+        .error_fmt = QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS,
+        .desc      = "Length of destination image differs from source image",
+    },
+    {
+        .error_fmt = QERR_MIGRATION_IN_PROGRESS,
+        .desc      = "Migration in progress",
+    },
     {}
 };
 
Index: qemu/qerror.h
===================================================================
--- qemu.orig/qerror.h
+++ qemu/qerror.h
@@ -171,4 +171,13 @@  QError *qobject_to_qerror(const QObject 
 #define QERR_VNC_SERVER_FAILED \
     "{ 'class': 'VNCServerFailed', 'data': { 'target': %s } }"
 
+#define QERR_BLOCKCOPY_IN_PROGRESS \
+    "{ 'class': 'BlockCopyInProgress', 'data': { 'device': %s } }"
+
+#define QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS \
+    "{ 'class': 'BlockCopyImageSizeDiffers', 'data': {} }"
+
+#define QERR_MIGRATION_IN_PROGRESS \
+    "{ 'class': 'MigrationInProgress', 'data': {} }"
+
 #endif /* QERROR_H */
Index: qemu/qmp-commands.hx
===================================================================
--- qemu.orig/qmp-commands.hx
+++ qemu/qmp-commands.hx
@@ -581,6 +581,75 @@  Example:
 EQMP
 
     {
+        .name       = "block_copy",
+        .args_type  = "device:s,filename:s,commit_filename:s?,inc:-i",
+        .params     = "device filename [commit_filename] [-i]",
+        .help       = "live block copy device to image"
+                      "\n\t\t\t optional commit filename "
+                      "\n\t\t\t -i for incremental copy "
+                      "(base image shared between src and destination)",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy,
+    },
+
+SQMP
+block-copy
+-------
+
+Live block copy.
+
+Arguments:
+
+- "device": device name (json-string)
+- "filename": target image filename (json-string)
+- "commit_filename": target commit filename (json-string, optional)
+- "inc": incremental disk copy (json-bool, optional)
+
+Example:
+
+-> { "execute": "block_copy",
+                            "arguments": { "device": "ide0-hd1",
+                               "filename": "/mnt/new-disk.img",
+                               "commit_filename: "/mnt/commit-new-disk.img"
+                             } }
+
+<- { "return": {} }
+
+Notes:
+
+(1) The 'query-block-copy' command should be used to check block copy progress
+    and final result (this information is provided by the 'status' member)
+(2) Boolean argument "inc" defaults to false
+
+EQMP
+
+    {
+        .name       = "block_copy_cancel",
+        .args_type  = "device:s",
+        .params     = "device",
+        .help       = "cancel live block copy",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy_cancel,
+    },
+
+SQMP
+block_copy_cancel
+--------------
+
+Cancel live block copy.
+
+Arguments:
+
+- device: device name (json-string)
+
+Example:
+
+-> { "execute": "block_copy_cancel", "arguments": { "device": "ide0-hd1" } }
+<- { "return": {} }
+
+EQMP
+
+    {
         .name       = "netdev_add",
         .args_type  = "netdev:O",
         .params     = "[user|tap|socket],id=str[,prop=value][,...]",
@@ -1740,6 +1809,44 @@  Examples:
 EQMP
 
 SQMP
+query-block-copy
+-------------
+
+Live block copy status.
+
+Each block copy instance information is stored in a json-object and the returned
+value is a json-array of all instances.
+
+Each json-object contains the following:
+
+- "device": device name (json-string)
+- "status": block copy status (json-string)
+    - Possible values: "active", "failed", "completed"
+- "info": A json-object with the statistics information, if status is "active":
+    - "percentage": percentage completed (json-int)
+
+Example:
+
+Block copy for "ide1-hd0" active and block copy for "ide1-hd1" failed:
+
+-> { "execute": "query-block-copy" }
+<- {
+      "return":[
+        {"device":"ide1-hd0",
+            "status":"active",
+            "info":{
+               "percentage":23,
+            }
+        },
+        {"device":"ide1-hd1",
+         "status":"failed"
+        }
+      ]
+   }
+
+EQMP
+
+SQMP
 query-balloon
 -------------
 
Index: qemu/Makefile.objs
===================================================================
--- qemu.orig/Makefile.objs
+++ qemu/Makefile.objs
@@ -98,7 +98,7 @@  common-obj-y += buffered_file.o migratio
 common-obj-y += qemu-char.o savevm.o #aio.o
 common-obj-y += msmouse.o ps2.o
 common-obj-y += qdev.o qdev-properties.o
-common-obj-y += block-migration.o
+common-obj-y += block-migration.o block-copy.o
 common-obj-y += pflib.o
 
 common-obj-$(CONFIG_BRLAPI) += baum.o
Index: qemu/sysemu.h
===================================================================
--- qemu.orig/sysemu.h
+++ qemu/sysemu.h
@@ -46,6 +46,7 @@  void qemu_del_vm_change_state_handler(VM
 #define VMSTOP_SAVEVM    6
 #define VMSTOP_LOADVM    7
 #define VMSTOP_MIGRATE   8
+#define VMSTOP_BLOCKCOPY 9
 
 void vm_start(void);
 void vm_stop(int reason);