diff mbox series

[v7,4/5] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT

Message ID 1524550428-27173-5-git-send-email-wei.w.wang@intel.com
State New
Headers show
Series virtio-balloon: free page hint reporting support | expand

Commit Message

Wang, Wei W April 24, 2018, 6:13 a.m. UTC
The new feature enables the virtio-balloon device to receive hints of
guest free pages from the free page vq.

balloon_free_page_start - start guest free page hint reporting.
balloon_free_page_stop - stop guest free page hint reporting.

Note: balloon will report pages which were free at the time
of this call. As the reporting happens asynchronously, dirty bit logging
must be enabled before this call is made. Guest reporting must be
disabled before the migration dirty bitmap is synchronized.

TODO:
- handle the case when page poisoning is in use

Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Liang Li <liang.z.li@intel.com>
CC: Michael S. Tsirkin <mst@redhat.com>
CC: Dr. David Alan Gilbert <dgilbert@redhat.com>
CC: Juan Quintela <quintela@redhat.com>
---
 balloon.c                                       |  58 +++++-
 hw/virtio/virtio-balloon.c                      | 241 ++++++++++++++++++++++--
 include/hw/virtio/virtio-balloon.h              |  27 ++-
 include/standard-headers/linux/virtio_balloon.h |   7 +
 include/sysemu/balloon.h                        |  15 +-
 5 files changed, 319 insertions(+), 29 deletions(-)

Comments

Michael S. Tsirkin May 29, 2018, 3:24 p.m. UTC | #1
On Tue, Apr 24, 2018 at 02:13:47PM +0800, Wei Wang wrote:
> The new feature enables the virtio-balloon device to receive hints of
> guest free pages from the free page vq.
> 
> balloon_free_page_start - start guest free page hint reporting.
> balloon_free_page_stop - stop guest free page hint reporting.
> 
> Note: balloon will report pages which were free at the time
> of this call. As the reporting happens asynchronously, dirty bit logging
> must be enabled before this call is made. Guest reporting must be
> disabled before the migration dirty bitmap is synchronized.
> 
> TODO:
> - handle the case when page poisoning is in use
>
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> Signed-off-by: Liang Li <liang.z.li@intel.com>
> CC: Michael S. Tsirkin <mst@redhat.com>
> CC: Dr. David Alan Gilbert <dgilbert@redhat.com>
> CC: Juan Quintela <quintela@redhat.com>
> ---
>  balloon.c                                       |  58 +++++-
>  hw/virtio/virtio-balloon.c                      | 241 ++++++++++++++++++++++--
>  include/hw/virtio/virtio-balloon.h              |  27 ++-
>  include/standard-headers/linux/virtio_balloon.h |   7 +
>  include/sysemu/balloon.h                        |  15 +-
>  5 files changed, 319 insertions(+), 29 deletions(-)
> 
> diff --git a/balloon.c b/balloon.c
> index 6bf0a96..87a0410 100644
> --- a/balloon.c
> +++ b/balloon.c
> @@ -36,6 +36,9 @@
>  
>  static QEMUBalloonEvent *balloon_event_fn;
>  static QEMUBalloonStatus *balloon_stat_fn;
> +static QEMUBalloonFreePageSupport *balloon_free_page_support_fn;
> +static QEMUBalloonFreePageStart *balloon_free_page_start_fn;
> +static QEMUBalloonFreePageStop *balloon_free_page_stop_fn;
>  static void *balloon_opaque;
>  static bool balloon_inhibited;
>  
> @@ -64,19 +67,51 @@ static bool have_balloon(Error **errp)
>      return true;
>  }
>  
> -int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
> -                             QEMUBalloonStatus *stat_func, void *opaque)
> +bool balloon_free_page_support(void)
>  {
> -    if (balloon_event_fn || balloon_stat_fn || balloon_opaque) {
> -        /* We're already registered one balloon handler.  How many can
> -         * a guest really have?
> -         */
> -        return -1;
> +    return balloon_free_page_support_fn &&
> +           balloon_free_page_support_fn(balloon_opaque);
> +}
> +
> +/*
> + * Balloon will report pages which were free at the time of this call. As the
> + * reporting happens asynchronously, dirty bit logging must be enabled before
> + * this call is made.
> + */
> +void balloon_free_page_start(void)
> +{
> +    balloon_free_page_start_fn(balloon_opaque);
> +}

Please create notifier support, not a single global.


> +
> +/*
> + * Guest reporting must be disabled before the migration dirty bitmap is
> + * synchronized.
> + */
> +void balloon_free_page_stop(void)
> +{
> +    balloon_free_page_stop_fn(balloon_opaque);
> +}
> +
> +void qemu_add_balloon_handler(QEMUBalloonEvent *event_fn,
> +                              QEMUBalloonStatus *stat_fn,
> +                              QEMUBalloonFreePageSupport *free_page_support_fn,
> +                              QEMUBalloonFreePageStart *free_page_start_fn,
> +                              QEMUBalloonFreePageStop *free_page_stop_fn,
> +                              void *opaque)
> +{
> +    if (balloon_event_fn || balloon_stat_fn || balloon_free_page_support_fn ||
> +        balloon_free_page_start_fn || balloon_free_page_stop_fn ||
> +        balloon_opaque) {
> +        /* We already registered one balloon handler. */
> +        return;
>      }
> -    balloon_event_fn = event_func;
> -    balloon_stat_fn = stat_func;
> +
> +    balloon_event_fn = event_fn;
> +    balloon_stat_fn = stat_fn;
> +    balloon_free_page_support_fn = free_page_support_fn;
> +    balloon_free_page_start_fn = free_page_start_fn;
> +    balloon_free_page_stop_fn = free_page_stop_fn;
>      balloon_opaque = opaque;
> -    return 0;
>  }
>  
>  void qemu_remove_balloon_handler(void *opaque)
> @@ -86,6 +121,9 @@ void qemu_remove_balloon_handler(void *opaque)
>      }
>      balloon_event_fn = NULL;
>      balloon_stat_fn = NULL;
> +    balloon_free_page_support_fn = NULL;
> +    balloon_free_page_start_fn = NULL;
> +    balloon_free_page_stop_fn = NULL;
>      balloon_opaque = NULL;
>  }
>  
> diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
> index f456cea..13bf0db 100644
> --- a/hw/virtio/virtio-balloon.c
> +++ b/hw/virtio/virtio-balloon.c
> @@ -31,6 +31,7 @@
>  
>  #include "hw/virtio/virtio-bus.h"
>  #include "hw/virtio/virtio-access.h"
> +#include "migration/misc.h"
>  
>  #define BALLOON_PAGE_SIZE  (1 << VIRTIO_BALLOON_PFN_SHIFT)
>  
> @@ -308,6 +309,125 @@ out:
>      }
>  }
>  
> +static void virtio_balloon_poll_free_page_hints(void *opaque)
> +{
> +    VirtQueueElement *elem;
> +    VirtIOBalloon *dev = opaque;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> +    VirtQueue *vq = dev->free_page_vq;
> +    uint32_t id;
> +    size_t size;
> +
> +    while (1) {
> +        qemu_mutex_lock(&dev->free_page_lock);
> +        while (dev->block_iothread) {
> +            qemu_cond_wait(&dev->free_page_cond, &dev->free_page_lock);
> +        }
> +
> +        /*
> +         * If the migration thread actively stops the reporting, exit
> +         * immediately.
> +         */
> +        if (dev->free_page_report_status == FREE_PAGE_REPORT_S_STOP) {

Please refactor this : move loop body into a function so
you can do lock/unlock in a single place.

> +            qemu_mutex_unlock(&dev->free_page_lock);
> +            break;
> +        }
> +
> +        elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
> +        if (!elem) {
> +            qemu_mutex_unlock(&dev->free_page_lock);
> +            continue;
> +        }
> +
> +        if (elem->out_num) {
> +            size = iov_to_buf(elem->out_sg, elem->out_num, 0, &id, sizeof(id));
> +            virtqueue_push(vq, elem, size);
> +            g_free(elem);
> +
> +            virtio_tswap32s(vdev, &id);
> +            if (unlikely(size != sizeof(id))) {
> +                virtio_error(vdev, "received an incorrect cmd id");
> +                break;
> +            }
> +            if (id == dev->free_page_report_cmd_id) {
> +                dev->free_page_report_status = FREE_PAGE_REPORT_S_START;
> +            } else {
> +                /*
> +                 * Stop the optimization only when it has started. This
> +                 * avoids a stale stop sign for the previous command.
> +                 */
> +                if (dev->free_page_report_status == FREE_PAGE_REPORT_S_START) {
> +                    dev->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
> +                    qemu_mutex_unlock(&dev->free_page_lock);
> +                    break;
> +                }
> +            }
> +        }
> +
> +        if (elem->in_num) {
> +            /* TODO: send the poison value to the destination */
> +            if (dev->free_page_report_status == FREE_PAGE_REPORT_S_START &&
> +                !dev->poison_val) {
> +                qemu_guest_free_page_hint(elem->in_sg[0].iov_base,
> +                                          elem->in_sg[0].iov_len);
> +            }
> +            virtqueue_push(vq, elem, 0);
> +            g_free(elem);
> +        }
> +        qemu_mutex_unlock(&dev->free_page_lock);
> +    }
> +    virtio_notify(vdev, vq);
> +}
> +
> +static bool virtio_balloon_free_page_support(void *opaque)
> +{
> +    VirtIOBalloon *s = opaque;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +
> +    return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT);

or if poison is negotiated.

> +}
> +
> +static void virtio_balloon_free_page_start(void *opaque)
> +{
> +    VirtIOBalloon *s = opaque;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +
> +    /* For the stop and copy phase, we don't need to start the optimization */
> +    if (!vdev->vm_running) {
> +        return;
> +    }
> +
> +    if (s->free_page_report_cmd_id == UINT_MAX) {
> +        s->free_page_report_cmd_id =
> +                       VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN;
> +    } else {
> +        s->free_page_report_cmd_id++;
> +    }
> +
> +    s->free_page_report_status = FREE_PAGE_REPORT_S_REQUESTED;
> +    virtio_notify_config(vdev);
> +    qemu_bh_schedule(s->free_page_bh);
> +}
> +
> +static void virtio_balloon_free_page_stop(void *opaque)
> +{
> +    VirtIOBalloon *s = opaque;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +
> +    if (s->free_page_report_status == FREE_PAGE_REPORT_S_STOP) {
> +        return;

Please just reverse the logic.

> +    } else {
> +        qemu_mutex_lock(&s->free_page_lock);
> +        /*
> +         * The guest hasn't done the reporting, so host sends a notification
> +         * to the guest to actively stop the reporting.
> +         */
> +        s->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
> +        qemu_mutex_unlock(&s->free_page_lock);
> +        virtio_notify_config(vdev);
> +    }
> +}
> +
>  static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
>  {
>      VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
> @@ -315,6 +435,17 @@ static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
>  
>      config.num_pages = cpu_to_le32(dev->num_pages);
>      config.actual = cpu_to_le32(dev->actual);
> +    if (virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> +        config.poison_val = cpu_to_le32(dev->poison_val);
> +    }
> +
> +    if (dev->free_page_report_status == FREE_PAGE_REPORT_S_STOP) {
> +        config.free_page_report_cmd_id =
> +                       cpu_to_le32(VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID);
> +    } else {
> +        config.free_page_report_cmd_id =
> +                       cpu_to_le32(dev->free_page_report_cmd_id);
> +    }
>  
>      trace_virtio_balloon_get_config(config.num_pages, config.actual);
>      memcpy(config_data, &config, sizeof(struct virtio_balloon_config));
> @@ -368,6 +499,7 @@ static void virtio_balloon_set_config(VirtIODevice *vdev,
>                          ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT),
>                          &error_abort);
>      }
> +    dev->poison_val = le32_to_cpu(config.poison_val);
>      trace_virtio_balloon_set_config(dev->actual, oldactual);
>  }
>  
> @@ -377,6 +509,11 @@ static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
>      VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
>      f |= dev->host_features;
>      virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ);
> +
> +    if (dev->host_features & 1ULL << VIRTIO_BALLOON_F_FREE_PAGE_HINT) {
> +        virtio_add_feature(&f, VIRTIO_BALLOON_F_PAGE_POISON);
> +    }
> +
>      return f;
>  }
>  
> @@ -413,6 +550,18 @@ static int virtio_balloon_post_load_device(void *opaque, int version_id)
>      return 0;
>  }
>  
> +static const VMStateDescription vmstate_virtio_balloon_free_page_report = {
> +    .name = "virtio-balloon-device/free-page-report",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = virtio_balloon_free_page_support,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT32(free_page_report_cmd_id, VirtIOBalloon),
> +        VMSTATE_UINT32(poison_val, VirtIOBalloon),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>  static const VMStateDescription vmstate_virtio_balloon_device = {
>      .name = "virtio-balloon-device",
>      .version_id = 1,
> @@ -423,30 +572,42 @@ static const VMStateDescription vmstate_virtio_balloon_device = {
>          VMSTATE_UINT32(actual, VirtIOBalloon),
>          VMSTATE_END_OF_LIST()
>      },
> +    .subsections = (const VMStateDescription * []) {
> +        &vmstate_virtio_balloon_free_page_report,
> +        NULL
> +    }
>  };
>  
>  static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
>  {
>      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>      VirtIOBalloon *s = VIRTIO_BALLOON(dev);
> -    int ret;
>  
>      virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON,
>                  sizeof(struct virtio_balloon_config));
>  
> -    ret = qemu_add_balloon_handler(virtio_balloon_to_target,
> -                                   virtio_balloon_stat, s);
> -
> -    if (ret < 0) {
> -        error_setg(errp, "Only one balloon device is supported");
> -        virtio_cleanup(vdev);
> -        return;
> -    }
> -
>      s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
>      s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
>      s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
> -
> +    if (virtio_has_feature(s->host_features,
> +                           VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> +        s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE, NULL);
> +        s->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
> +        s->free_page_report_cmd_id =
> +                           VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN - 1;
> +        if (s->iothread) {
> +            object_ref(OBJECT(s->iothread));
> +            s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread),
> +                                       virtio_balloon_poll_free_page_hints, s);
> +            qemu_mutex_init(&s->free_page_lock);
> +            qemu_cond_init(&s->free_page_cond);
> +            s->block_iothread = false;
> +        } else {
> +            /* Simply disable this feature if the iothread wasn't created. */
> +            s->host_features &= ~(1 << VIRTIO_BALLOON_F_FREE_PAGE_HINT);
> +            virtio_error(vdev, "iothread is missing");
> +        }
> +    }
>      reset_stats(s);
>  }
>  
> @@ -455,6 +616,10 @@ static void virtio_balloon_device_unrealize(DeviceState *dev, Error **errp)
>      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>      VirtIOBalloon *s = VIRTIO_BALLOON(dev);
>  
> +    if (virtio_balloon_free_page_support(s)) {
> +        qemu_bh_delete(s->free_page_bh);
> +        virtio_balloon_free_page_stop(s);
> +    }
>      balloon_stats_destroy_timer(s);
>      qemu_remove_balloon_handler(s);
>      virtio_cleanup(vdev);
> @@ -464,6 +629,10 @@ static void virtio_balloon_device_reset(VirtIODevice *vdev)
>  {
>      VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
>  
> +    if (virtio_balloon_free_page_support(s)) {
> +        virtio_balloon_free_page_stop(s);
> +    }
> +
>      if (s->stats_vq_elem != NULL) {
>          virtqueue_unpop(s->svq, s->stats_vq_elem, 0);
>          g_free(s->stats_vq_elem);
> @@ -475,11 +644,47 @@ static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status)
>  {
>      VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
>  
> -    if (!s->stats_vq_elem && vdev->vm_running &&
> -        (status & VIRTIO_CONFIG_S_DRIVER_OK) && virtqueue_rewind(s->svq, 1)) {
> -        /* poll stats queue for the element we have discarded when the VM
> -         * was stopped */
> -        virtio_balloon_receive_stats(vdev, s->svq);
> +    if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
> +        if (!s->stats_vq_elem && vdev->vm_running &&
> +            virtqueue_rewind(s->svq, 1)) {
> +            /*
> +             * Poll stats queue for the element we have discarded when the VM
> +             * was stopped.
> +             */
> +            virtio_balloon_receive_stats(vdev, s->svq);
> +        }
> +
> +        if (virtio_balloon_free_page_support(s)) {
> +            qemu_add_balloon_handler(virtio_balloon_to_target,
> +                                     virtio_balloon_stat,
> +                                     virtio_balloon_free_page_support,
> +                                     virtio_balloon_free_page_start,
> +                                     virtio_balloon_free_page_stop,
> +                                     s);
> +        } else {
> +            qemu_add_balloon_handler(virtio_balloon_to_target,
> +                                     virtio_balloon_stat, NULL, NULL, NULL, s);
> +        }
> +    }
> +
> +    if (virtio_balloon_free_page_support(s)) {
> +        /*
> +         * The VM is woken up and the iothread was blocked, so signal it to
> +         * continue.
> +         */
> +        if (vdev->vm_running && s->block_iothread) {
> +            qemu_mutex_lock(&s->free_page_lock);
> +            s->block_iothread = false;
> +            qemu_cond_signal(&s->free_page_cond);
> +            qemu_mutex_unlock(&s->free_page_lock);
> +        }
> +
> +        /* The VM is stopped, block the iothread. */
> +        if (!vdev->vm_running) {
> +            qemu_mutex_lock(&s->free_page_lock);
> +            s->block_iothread = true;
> +            qemu_mutex_unlock(&s->free_page_lock);
> +        }
>      }
>  }
>  
> @@ -509,6 +714,10 @@ static const VMStateDescription vmstate_virtio_balloon = {
>  static Property virtio_balloon_properties[] = {
>      DEFINE_PROP_BIT("deflate-on-oom", VirtIOBalloon, host_features,
>                      VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false),
> +    DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features,
> +                    VIRTIO_BALLOON_F_FREE_PAGE_HINT, false),
> +    DEFINE_PROP_LINK("iothread", VirtIOBalloon, iothread, TYPE_IOTHREAD,
> +                     IOThread *),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h
> index 1ea13bd..f865832 100644
> --- a/include/hw/virtio/virtio-balloon.h
> +++ b/include/hw/virtio/virtio-balloon.h
> @@ -18,11 +18,14 @@
>  #include "standard-headers/linux/virtio_balloon.h"
>  #include "hw/virtio/virtio.h"
>  #include "hw/pci/pci.h"
> +#include "sysemu/iothread.h"
>  
>  #define TYPE_VIRTIO_BALLOON "virtio-balloon-device"
>  #define VIRTIO_BALLOON(obj) \
>          OBJECT_CHECK(VirtIOBalloon, (obj), TYPE_VIRTIO_BALLOON)
>  
> +#define VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN 0x80000000
> +
>  typedef struct virtio_balloon_stat VirtIOBalloonStat;
>  
>  typedef struct virtio_balloon_stat_modern {
> @@ -31,15 +34,37 @@ typedef struct virtio_balloon_stat_modern {
>         uint64_t val;
>  } VirtIOBalloonStatModern;
>  
> +enum virtio_balloon_free_page_report_status {
> +    FREE_PAGE_REPORT_S_STOP = 0,
> +    FREE_PAGE_REPORT_S_REQUESTED = 1,
> +    FREE_PAGE_REPORT_S_START = 2,
> +};
> +
>  typedef struct VirtIOBalloon {
>      VirtIODevice parent_obj;
> -    VirtQueue *ivq, *dvq, *svq;
> +    VirtQueue *ivq, *dvq, *svq, *free_page_vq;
> +    uint32_t free_page_report_status;
>      uint32_t num_pages;
>      uint32_t actual;
> +    uint32_t free_page_report_cmd_id;
> +    uint32_t poison_val;
>      uint64_t stats[VIRTIO_BALLOON_S_NR];
>      VirtQueueElement *stats_vq_elem;
>      size_t stats_vq_offset;
>      QEMUTimer *stats_timer;
> +    IOThread *iothread;
> +    QEMUBH *free_page_bh;
> +    /*
> +     * Lock to synchronize threads to access the free page reporting related
> +     * fields (e.g. free_page_report_status).
> +     */
> +    QemuMutex free_page_lock;
> +    QemuCond  free_page_cond;
> +    /*
> +     * Set to block iothread to continue reading free page hints as the VM is
> +     * stopped.
> +     */
> +    bool block_iothread;
>      int64_t stats_last_update;
>      int64_t stats_poll_interval;
>      uint32_t host_features;
> diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h
> index 7b0a41b..f89e80f 100644
> --- a/include/standard-headers/linux/virtio_balloon.h
> +++ b/include/standard-headers/linux/virtio_balloon.h
> @@ -34,15 +34,22 @@
>  #define VIRTIO_BALLOON_F_MUST_TELL_HOST	0 /* Tell before reclaiming pages */
>  #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
> +#define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */
> +#define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
>  
>  /* Size of a PFN in the balloon interface. */
>  #define VIRTIO_BALLOON_PFN_SHIFT 12
>  
> +#define VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID 0
>  struct virtio_balloon_config {
>  	/* Number of pages host wants Guest to give up. */
>  	uint32_t num_pages;
>  	/* Number of pages we've actually got in balloon. */
>  	uint32_t actual;
> +	/* Free page report command id, readonly by guest */
> +	uint32_t free_page_report_cmd_id;
> +	/* Stores PAGE_POISON if page poisoning is in use */
> +	uint32_t poison_val;
>  };
>  
>  #define VIRTIO_BALLOON_S_SWAP_IN  0   /* Amount of memory swapped in */
> diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h
> index 66543ae..6561a08 100644
> --- a/include/sysemu/balloon.h
> +++ b/include/sysemu/balloon.h
> @@ -18,11 +18,22 @@
>  
>  typedef void (QEMUBalloonEvent)(void *opaque, ram_addr_t target);
>  typedef void (QEMUBalloonStatus)(void *opaque, BalloonInfo *info);
> +typedef bool (QEMUBalloonFreePageSupport)(void *opaque);
> +typedef void (QEMUBalloonFreePageStart)(void *opaque);
> +typedef void (QEMUBalloonFreePageStop)(void *opaque);
>  
> -int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
> -			     QEMUBalloonStatus *stat_func, void *opaque);
>  void qemu_remove_balloon_handler(void *opaque);
>  bool qemu_balloon_is_inhibited(void);
>  void qemu_balloon_inhibit(bool state);
> +bool balloon_free_page_support(void);
> +void balloon_free_page_start(void);
> +void balloon_free_page_stop(void);
> +
> +void qemu_add_balloon_handler(QEMUBalloonEvent *event_fn,
> +                              QEMUBalloonStatus *stat_fn,
> +                              QEMUBalloonFreePageSupport *free_page_support_fn,
> +                              QEMUBalloonFreePageStart *free_page_start_fn,
> +                              QEMUBalloonFreePageStop *free_page_stop_fn,
> +                              void *opaque);
>  
>  #endif
> -- 
> 1.8.3.1
Wang, Wei W May 30, 2018, 9:12 a.m. UTC | #2
On 05/29/2018 11:24 PM, Michael S. Tsirkin wrote:
> On Tue, Apr 24, 2018 at 02:13:47PM +0800, Wei Wang wrote:
>> +/*
>> + * Balloon will report pages which were free at the time of this call. As the
>> + * reporting happens asynchronously, dirty bit logging must be enabled before
>> + * this call is made.
>> + */
>> +void balloon_free_page_start(void)
>> +{
>> +    balloon_free_page_start_fn(balloon_opaque);
>> +}
> Please create notifier support, not a single global.

OK. The start is called at the end of bitmap_sync, and the stop is 
called at the beginning of bitmap_sync. In this case, we will need to 
add two migration states, MIGRATION_STATUS_BEFORE_BITMAP_SYNC and 
MIGRATION_STATUS_AFTER_BITMAP_SYNC, right?


>
> +static void virtio_balloon_poll_free_page_hints(void *opaque)
> +{
> +    VirtQueueElement *elem;
> +    VirtIOBalloon *dev = opaque;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> +    VirtQueue *vq = dev->free_page_vq;
> +    uint32_t id;
> +    size_t size;
> +
> +    while (1) {
> +        qemu_mutex_lock(&dev->free_page_lock);
> +        while (dev->block_iothread) {
> +            qemu_cond_wait(&dev->free_page_cond, &dev->free_page_lock);
> +        }
> +
> +        /*
> +         * If the migration thread actively stops the reporting, exit
> +         * immediately.
> +         */
> +        if (dev->free_page_report_status == FREE_PAGE_REPORT_S_STOP) {
> Please refactor this : move loop body into a function so
> you can do lock/unlock in a single place.

Sounds good.

>
> +
> +static bool virtio_balloon_free_page_support(void *opaque)
> +{
> +    VirtIOBalloon *s = opaque;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> +
> +    return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT);
> or if poison is negotiated.

Will make it
return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT) && 
!virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)


Best,
Wei
Michael S. Tsirkin May 30, 2018, 12:47 p.m. UTC | #3
On Wed, May 30, 2018 at 05:12:09PM +0800, Wei Wang wrote:
> On 05/29/2018 11:24 PM, Michael S. Tsirkin wrote:
> > On Tue, Apr 24, 2018 at 02:13:47PM +0800, Wei Wang wrote:
> > > +/*
> > > + * Balloon will report pages which were free at the time of this call. As the
> > > + * reporting happens asynchronously, dirty bit logging must be enabled before
> > > + * this call is made.
> > > + */
> > > +void balloon_free_page_start(void)
> > > +{
> > > +    balloon_free_page_start_fn(balloon_opaque);
> > > +}
> > Please create notifier support, not a single global.
> 
> OK. The start is called at the end of bitmap_sync, and the stop is called at
> the beginning of bitmap_sync. In this case, we will need to add two
> migration states, MIGRATION_STATUS_BEFORE_BITMAP_SYNC and
> MIGRATION_STATUS_AFTER_BITMAP_SYNC, right?

If that's the way you do it, you need to ask migration guys, not me.

> 
> > 
> > +static void virtio_balloon_poll_free_page_hints(void *opaque)
> > +{
> > +    VirtQueueElement *elem;
> > +    VirtIOBalloon *dev = opaque;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> > +    VirtQueue *vq = dev->free_page_vq;
> > +    uint32_t id;
> > +    size_t size;
> > +
> > +    while (1) {
> > +        qemu_mutex_lock(&dev->free_page_lock);
> > +        while (dev->block_iothread) {
> > +            qemu_cond_wait(&dev->free_page_cond, &dev->free_page_lock);
> > +        }
> > +
> > +        /*
> > +         * If the migration thread actively stops the reporting, exit
> > +         * immediately.
> > +         */
> > +        if (dev->free_page_report_status == FREE_PAGE_REPORT_S_STOP) {
> > Please refactor this : move loop body into a function so
> > you can do lock/unlock in a single place.
> 
> Sounds good.
> 
> > 
> > +
> > +static bool virtio_balloon_free_page_support(void *opaque)
> > +{
> > +    VirtIOBalloon *s = opaque;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > +
> > +    return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT);
> > or if poison is negotiated.
> 
> Will make it
> return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT) &&
> !virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)


I mean the reverse:
	virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT) ||
	virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)


If poison has been negotiated you must migrate the
guest supplied value even if you don't use it for hints.


> 
> 
> Best,
> Wei
Wang, Wei W May 31, 2018, 2:27 a.m. UTC | #4
On 05/30/2018 08:47 PM, Michael S. Tsirkin wrote:
> On Wed, May 30, 2018 at 05:12:09PM +0800, Wei Wang wrote:
>> On 05/29/2018 11:24 PM, Michael S. Tsirkin wrote:
>>> On Tue, Apr 24, 2018 at 02:13:47PM +0800, Wei Wang wrote:
>>>> +/*
>>>> + * Balloon will report pages which were free at the time of this call. As the
>>>> + * reporting happens asynchronously, dirty bit logging must be enabled before
>>>> + * this call is made.
>>>> + */
>>>> +void balloon_free_page_start(void)
>>>> +{
>>>> +    balloon_free_page_start_fn(balloon_opaque);
>>>> +}
>>> Please create notifier support, not a single global.
>> OK. The start is called at the end of bitmap_sync, and the stop is called at
>> the beginning of bitmap_sync. In this case, we will need to add two
>> migration states, MIGRATION_STATUS_BEFORE_BITMAP_SYNC and
>> MIGRATION_STATUS_AFTER_BITMAP_SYNC, right?
> If that's the way you do it, you need to ask migration guys, not me.

Yeah, I know.. thanks for the virtio part.

>>> +
>>> +static bool virtio_balloon_free_page_support(void *opaque)
>>> +{
>>> +    VirtIOBalloon *s = opaque;
>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
>>> +
>>> +    return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT);
>>> or if poison is negotiated.
>> Will make it
>> return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT) &&
>> !virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)
>
> I mean the reverse:
> 	virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT) ||
> 	virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)
>
>
> If poison has been negotiated you must migrate the
> guest supplied value even if you don't use it for hints.


Just a little confused with the logic. Writing it that way means that we 
are taking this possibility "virtio_vdev_has_feature(vdev, 
VIRTIO_BALLOON_F_FREE_PAGE_HINT)=fasle, virtio_vdev_has_feature(vdev, 
VIRTIO_BALLOON_F_PAGE_POISON)=true" into account, and let the support 
function return true when F_FREE_PAGE_HINT isn't supported.

If guest doesn't support F_FREE_PAGE_HINT, it doesn't support the free 
page reporting (even the free page vq). I'm not sure why we tell the 
migration thread that the free page reporting feature is supported via 
this support function. If the support function simply returns false when 
F_FREE_PAGE_HINT isn't negotiated, the legacy migration already migrates 
the poisoned pages (not skipped, but may be compressed).

I think it would be better to simply use the original "return 
virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)" here.


Best,
Wei
Michael S. Tsirkin May 31, 2018, 5:42 p.m. UTC | #5
On Thu, May 31, 2018 at 10:27:00AM +0800, Wei Wang wrote:
> On 05/30/2018 08:47 PM, Michael S. Tsirkin wrote:
> > On Wed, May 30, 2018 at 05:12:09PM +0800, Wei Wang wrote:
> > > On 05/29/2018 11:24 PM, Michael S. Tsirkin wrote:
> > > > On Tue, Apr 24, 2018 at 02:13:47PM +0800, Wei Wang wrote:
> > > > > +/*
> > > > > + * Balloon will report pages which were free at the time of this call. As the
> > > > > + * reporting happens asynchronously, dirty bit logging must be enabled before
> > > > > + * this call is made.
> > > > > + */
> > > > > +void balloon_free_page_start(void)
> > > > > +{
> > > > > +    balloon_free_page_start_fn(balloon_opaque);
> > > > > +}
> > > > Please create notifier support, not a single global.
> > > OK. The start is called at the end of bitmap_sync, and the stop is called at
> > > the beginning of bitmap_sync. In this case, we will need to add two
> > > migration states, MIGRATION_STATUS_BEFORE_BITMAP_SYNC and
> > > MIGRATION_STATUS_AFTER_BITMAP_SYNC, right?
> > If that's the way you do it, you need to ask migration guys, not me.
> 
> Yeah, I know.. thanks for the virtio part.
> 
> > > > +
> > > > +static bool virtio_balloon_free_page_support(void *opaque)
> > > > +{
> > > > +    VirtIOBalloon *s = opaque;
> > > > +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
> > > > +
> > > > +    return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT);
> > > > or if poison is negotiated.
> > > Will make it
> > > return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT) &&
> > > !virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)
> > 
> > I mean the reverse:
> > 	virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT) ||
> > 	virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)
> > 
> > 
> > If poison has been negotiated you must migrate the
> > guest supplied value even if you don't use it for hints.
> 
> 
> Just a little confused with the logic. Writing it that way means that we are
> taking this possibility "virtio_vdev_has_feature(vdev,
> VIRTIO_BALLOON_F_FREE_PAGE_HINT)=fasle, virtio_vdev_has_feature(vdev,
> VIRTIO_BALLOON_F_PAGE_POISON)=true" into account, and let the support
> function return true when F_FREE_PAGE_HINT isn't supported.

All I am saying is that in this configuration, you must migrate
the poison value programmed by guest even if you do not
yet use it without VIRTIO_BALLOON_F_FREE_PAGE_HINT.

Right now you have
a section:
+    .needed = virtio_balloon_free_page_support,

which includes the poison value.

So if guest migrates after writing the poison value,
it's lost. Not nice.

> If guest doesn't support F_FREE_PAGE_HINT, it doesn't support the free page
> reporting (even the free page vq). I'm not sure why we tell the migration
> thread that the free page reporting feature is supported via this support
> function. If the support function simply returns false when F_FREE_PAGE_HINT
> isn't negotiated, the legacy migration already migrates the poisoned pages
> (not skipped, but may be compressed).
> 
> I think it would be better to simply use the original "return
> virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)" here.


So maybe you should put the poison value in a separate section then.


> 
> Best,
> Wei
Wang, Wei W June 1, 2018, 3:18 a.m. UTC | #6
On 06/01/2018 01:42 AM, Michael S. Tsirkin wrote:
> On Thu, May 31, 2018 at 10:27:00AM +0800, Wei Wang wrote:
>> On 05/30/2018 08:47 PM, Michael S. Tsirkin wrote:
>>> On Wed, May 30, 2018 at 05:12:09PM +0800, Wei Wang wrote:
>>>> On 05/29/2018 11:24 PM, Michael S. Tsirkin wrote:
>>>>> On Tue, Apr 24, 2018 at 02:13:47PM +0800, Wei Wang wrote:
>>>>>> +/*
>>>>>> + * Balloon will report pages which were free at the time of this call. As the
>>>>>> + * reporting happens asynchronously, dirty bit logging must be enabled before
>>>>>> + * this call is made.
>>>>>> + */
>>>>>> +void balloon_free_page_start(void)
>>>>>> +{
>>>>>> +    balloon_free_page_start_fn(balloon_opaque);
>>>>>> +}
>>>>> Please create notifier support, not a single global.
>>>> OK. The start is called at the end of bitmap_sync, and the stop is called at
>>>> the beginning of bitmap_sync. In this case, we will need to add two
>>>> migration states, MIGRATION_STATUS_BEFORE_BITMAP_SYNC and
>>>> MIGRATION_STATUS_AFTER_BITMAP_SYNC, right?
>>> If that's the way you do it, you need to ask migration guys, not me.
>> Yeah, I know.. thanks for the virtio part.
>>
>>>>> +
>>>>> +static bool virtio_balloon_free_page_support(void *opaque)
>>>>> +{
>>>>> +    VirtIOBalloon *s = opaque;
>>>>> +    VirtIODevice *vdev = VIRTIO_DEVICE(s);
>>>>> +
>>>>> +    return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT);
>>>>> or if poison is negotiated.
>>>> Will make it
>>>> return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT) &&
>>>> !virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)
>>> I mean the reverse:
>>> 	virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT) ||
>>> 	virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)
>>>
>>>
>>> If poison has been negotiated you must migrate the
>>> guest supplied value even if you don't use it for hints.
>>
>> Just a little confused with the logic. Writing it that way means that we are
>> taking this possibility "virtio_vdev_has_feature(vdev,
>> VIRTIO_BALLOON_F_FREE_PAGE_HINT)=fasle, virtio_vdev_has_feature(vdev,
>> VIRTIO_BALLOON_F_PAGE_POISON)=true" into account, and let the support
>> function return true when F_FREE_PAGE_HINT isn't supported.
> All I am saying is that in this configuration, you must migrate
> the poison value programmed by guest even if you do not
> yet use it without VIRTIO_BALLOON_F_FREE_PAGE_HINT.
>
> Right now you have
> a section:
> +    .needed = virtio_balloon_free_page_support,
>
> which includes the poison value.
>
> So if guest migrates after writing the poison value,
> it's lost. Not nice.
>
>> If guest doesn't support F_FREE_PAGE_HINT, it doesn't support the free page
>> reporting (even the free page vq). I'm not sure why we tell the migration
>> thread that the free page reporting feature is supported via this support
>> function. If the support function simply returns false when F_FREE_PAGE_HINT
>> isn't negotiated, the legacy migration already migrates the poisoned pages
>> (not skipped, but may be compressed).
>>
>> I think it would be better to simply use the original "return
>> virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)" here.
>
> So maybe you should put the poison value in a separate section then.

Yes, that looks good to me, thanks.

Best,
Wei
Wang, Wei W June 4, 2018, 8:04 a.m. UTC | #7
On 05/30/2018 08:47 PM, Michael S. Tsirkin wrote:
> On Wed, May 30, 2018 at 05:12:09PM +0800, Wei Wang wrote:
>> On 05/29/2018 11:24 PM, Michael S. Tsirkin wrote:
>>> On Tue, Apr 24, 2018 at 02:13:47PM +0800, Wei Wang wrote:
>>>> +/*
>>>> + * Balloon will report pages which were free at the time of this call. As the
>>>> + * reporting happens asynchronously, dirty bit logging must be enabled before
>>>> + * this call is made.
>>>> + */
>>>> +void balloon_free_page_start(void)
>>>> +{
>>>> +    balloon_free_page_start_fn(balloon_opaque);
>>>> +}
>>> Please create notifier support, not a single global.
>> OK. The start is called at the end of bitmap_sync, and the stop is called at
>> the beginning of bitmap_sync. In this case, we will need to add two
>> migration states, MIGRATION_STATUS_BEFORE_BITMAP_SYNC and
>> MIGRATION_STATUS_AFTER_BITMAP_SYNC, right?

Peter, do you have any thought about this?

Currently, the usage of free page optimization isn't limited to the 
first stage. It is used in each stage. A global call to start the free 
page optimization is made after bitmap sync, and another global call to 
stop the optimization is made before bitmap sync. It is simple to just 
use global calls.

If we change the implementation to use notifiers, I think we will need 
to add two new MigrationStatus as above. Would you think that is 
worthwhile for some reason?

Best,
Wei
Peter Xu June 5, 2018, 6:58 a.m. UTC | #8
On Mon, Jun 04, 2018 at 04:04:51PM +0800, Wei Wang wrote:
> On 05/30/2018 08:47 PM, Michael S. Tsirkin wrote:
> > On Wed, May 30, 2018 at 05:12:09PM +0800, Wei Wang wrote:
> > > On 05/29/2018 11:24 PM, Michael S. Tsirkin wrote:
> > > > On Tue, Apr 24, 2018 at 02:13:47PM +0800, Wei Wang wrote:
> > > > > +/*
> > > > > + * Balloon will report pages which were free at the time of this call. As the
> > > > > + * reporting happens asynchronously, dirty bit logging must be enabled before
> > > > > + * this call is made.
> > > > > + */
> > > > > +void balloon_free_page_start(void)
> > > > > +{
> > > > > +    balloon_free_page_start_fn(balloon_opaque);
> > > > > +}
> > > > Please create notifier support, not a single global.
> > > OK. The start is called at the end of bitmap_sync, and the stop is called at
> > > the beginning of bitmap_sync. In this case, we will need to add two
> > > migration states, MIGRATION_STATUS_BEFORE_BITMAP_SYNC and
> > > MIGRATION_STATUS_AFTER_BITMAP_SYNC, right?
> 
> Peter, do you have any thought about this?
> 
> Currently, the usage of free page optimization isn't limited to the first
> stage. It is used in each stage. A global call to start the free page
> optimization is made after bitmap sync, and another global call to stop the
> optimization is made before bitmap sync. It is simple to just use global
> calls.
> 
> If we change the implementation to use notifiers, I think we will need to
> add two new MigrationStatus as above. Would you think that is worthwhile for
> some reason?

I'm a bit confused.  Could you elaborate why we need those extra
states?

Or, to ask a more general question - could you elaborate a bit on how
you order these operations?  I would be really glad if you can point
me to some documents for the feature.  Is there any latest virtio
document that I can refer to (or old cover letter links)?  It'll be
good if the document could mention about things like:

- why we need this feature? Is that purely for migration purpose?  Or
  it can be used somewhere else too?
- high level stuff about how this is implemented, e.g.:
  - the protocol of the new virtio queues
  - how we should get the free page hints (please see below)

For now, what I see is that we do:

(1) stop hinting
(2) sync bitmap
(3) start hinting

Why this order?  My understanding is that obviously there is a race
between the page hinting thread and the dirty bitmap tracking part
(which is done in KVM).  How do we make sure there is no race?

An direct question is that, do we need to make sure step (1) must be
before step (2)?  Asked since I see that currently step (1) is an
async operation (taking a lock, set status, then return).  Then would
such an async operation satisfy any ordering requirement after all?

Btw, I would appreciate if you can push your new trees (both QEMU and
kernel) to the links you mentioned in the cover letter - I noticed
that they are not the same as what you have posted on the list.

Thanks,
Wang, Wei W June 5, 2018, 1:22 p.m. UTC | #9
On 06/05/2018 02:58 PM, Peter Xu wrote:
> On Mon, Jun 04, 2018 at 04:04:51PM +0800, Wei Wang wrote:
>> On 05/30/2018 08:47 PM, Michael S. Tsirkin wrote:
>>> On Wed, May 30, 2018 at 05:12:09PM +0800, Wei Wang wrote:
>>>> On 05/29/2018 11:24 PM, Michael S. Tsirkin wrote:
>>>>> On Tue, Apr 24, 2018 at 02:13:47PM +0800, Wei Wang wrote:
>>>>>> +/*
>>>>>> + * Balloon will report pages which were free at the time of this call. As the
>>>>>> + * reporting happens asynchronously, dirty bit logging must be enabled before
>>>>>> + * this call is made.
>>>>>> + */
>>>>>> +void balloon_free_page_start(void)
>>>>>> +{
>>>>>> +    balloon_free_page_start_fn(balloon_opaque);
>>>>>> +}
>>>>> Please create notifier support, not a single global.
>>>> OK. The start is called at the end of bitmap_sync, and the stop is called at
>>>> the beginning of bitmap_sync. In this case, we will need to add two
>>>> migration states, MIGRATION_STATUS_BEFORE_BITMAP_SYNC and
>>>> MIGRATION_STATUS_AFTER_BITMAP_SYNC, right?
>> Peter, do you have any thought about this?
>>
>> Currently, the usage of free page optimization isn't limited to the first
>> stage. It is used in each stage. A global call to start the free page
>> optimization is made after bitmap sync, and another global call to stop the
>> optimization is made before bitmap sync. It is simple to just use global
>> calls.
>>
>> If we change the implementation to use notifiers, I think we will need to
>> add two new MigrationStatus as above. Would you think that is worthwhile for
>> some reason?
> I'm a bit confused.  Could you elaborate why we need those extra
> states?

Sure. Notifiers are used when an event happens. In this case, it would 
be a state change, which invokes the state change callback. So I think 
we probably need to add 2 new states for the start and stop callback.


> Or, to ask a more general question - could you elaborate a bit on how
> you order these operations?  I would be really glad if you can point
> me to some documents for the feature.  Is there any latest virtio
> document that I can refer to (or old cover letter links)?  It'll be
> good if the document could mention about things like:

I haven't made documents to explain it yet. It's planed to be ready 
after this code series is done. But I'm glad to answer the questions below.


>
> - why we need this feature? Is that purely for migration purpose?  Or
>    it can be used somewhere else too?

Yes. Migration is the one that currently benefits a lot from this 
feature. I haven't thought of others so far. It is common that new 
features start with just 1 or 2 typical use cases.


> - high level stuff about how this is implemented, e.g.:
>    - the protocol of the new virtio queues
>    - how we should get the free page hints (please see below)

The high-level introduction would be
1. host sends a start cmd id to the guest;
2. the guest starts a new round of reporting by sending a cmd_id+free 
page hints to host;
3. QEMU side optimization code applies the free page hints (filter them 
from the dirty bitmap) only when the reported cmd id matches the one 
that was just sent.

The protocol was suggested by Michael and has been thoroughly discussed 
when upstreaming the kernel part. It might not be necessary to go over 
that again :)
I would suggest to focus on the supplied interface and its usage in live 
migration. That is, now we have two APIs, start() and stop(), to start 
and stop the optimization.

1) where in the migration code should we use them (do you agree with the 
step (1), (2), (3) you concluded below?)
2) how should we use them, directly do global call or via notifiers?

>
> For now, what I see is that we do:
>
> (1) stop hinting
> (2) sync bitmap
> (3) start hinting
>
> Why this order?

We start to filter out free pages from the dirty bitmap only when all 
the dirty bits are ready there, i.e. after sync bitmap. To some degree, 
the action of synchronizing bitmap indicates the end of the last round 
and the beginning of the new round, so we stop the free page 
optimization for the old round when the old round ends.


>   My understanding is that obviously there is a race
> between the page hinting thread and the dirty bitmap tracking part
> (which is done in KVM).  How do we make sure there is no race?

Could you please explain more about the race you saw? (free page is 
reported from the guest, and the bitmap is tracked in KVM)



>
> An direct question is that, do we need to make sure step (1) must be
> before step (2)?  Asked since I see that currently step (1) is an
> async operation (taking a lock, set status, then return).  Then would
> such an async operation satisfy any ordering requirement after all?

Yes. Step(1) guarantees us that the QEMU side optimization call has 
exited (we don't need to rely on guest side ACK because the guest could 
be in any state). This is enough. If the guest continues to report after 
that, that reported hints will be detected as stale hints and dropped in 
the next start of optimization.


>
> Btw, I would appreciate if you can push your new trees (both QEMU and
> kernel) to the links you mentioned in the cover letter - I noticed
> that they are not the same as what you have posted on the list.
>

Sure.
For kernel part, you can get it from linux-next: 
git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
For the v7 QEMU part: 
git://github.com/wei-w-wang/qemu-free-page-hint.git (my connection to 
github is too slow, it would be ready in 24hours, I can also send you 
the raw patches via email if you need)


Best,
Wei
Peter Xu June 6, 2018, 5:42 a.m. UTC | #10
On Tue, Jun 05, 2018 at 09:22:51PM +0800, Wei Wang wrote:
> On 06/05/2018 02:58 PM, Peter Xu wrote:
> > On Mon, Jun 04, 2018 at 04:04:51PM +0800, Wei Wang wrote:
> > > On 05/30/2018 08:47 PM, Michael S. Tsirkin wrote:
> > > > On Wed, May 30, 2018 at 05:12:09PM +0800, Wei Wang wrote:
> > > > > On 05/29/2018 11:24 PM, Michael S. Tsirkin wrote:
> > > > > > On Tue, Apr 24, 2018 at 02:13:47PM +0800, Wei Wang wrote:
> > > > > > > +/*
> > > > > > > + * Balloon will report pages which were free at the time of this call. As the
> > > > > > > + * reporting happens asynchronously, dirty bit logging must be enabled before
> > > > > > > + * this call is made.
> > > > > > > + */
> > > > > > > +void balloon_free_page_start(void)
> > > > > > > +{
> > > > > > > +    balloon_free_page_start_fn(balloon_opaque);
> > > > > > > +}
> > > > > > Please create notifier support, not a single global.
> > > > > OK. The start is called at the end of bitmap_sync, and the stop is called at
> > > > > the beginning of bitmap_sync. In this case, we will need to add two
> > > > > migration states, MIGRATION_STATUS_BEFORE_BITMAP_SYNC and
> > > > > MIGRATION_STATUS_AFTER_BITMAP_SYNC, right?
> > > Peter, do you have any thought about this?
> > > 
> > > Currently, the usage of free page optimization isn't limited to the first
> > > stage. It is used in each stage. A global call to start the free page
> > > optimization is made after bitmap sync, and another global call to stop the
> > > optimization is made before bitmap sync. It is simple to just use global
> > > calls.
> > > 
> > > If we change the implementation to use notifiers, I think we will need to
> > > add two new MigrationStatus as above. Would you think that is worthwhile for
> > > some reason?
> > I'm a bit confused.  Could you elaborate why we need those extra
> > states?
> 
> Sure. Notifiers are used when an event happens. In this case, it would be a
> state change, which invokes the state change callback. So I think we
> probably need to add 2 new states for the start and stop callback.

IMHO migration states do not suite here.  IMHO bitmap syncing is too
frequently an operation, especially at the end of a precopy migration.
If you really want to introduce some notifiers, I would prefer
something new rather than fiddling around with migration state.  E.g.,
maybe a new migration event notifiers, then introduce two new events
for both start/end of bitmap syncing.

> 
> 
> > Or, to ask a more general question - could you elaborate a bit on how
> > you order these operations?  I would be really glad if you can point
> > me to some documents for the feature.  Is there any latest virtio
> > document that I can refer to (or old cover letter links)?  It'll be
> > good if the document could mention about things like:
> 
> I haven't made documents to explain it yet. It's planed to be ready after
> this code series is done. But I'm glad to answer the questions below.

Ok, thanks.  If we are very sure we'll have a document, IMHO it'll be
very nice at least for reviewers to have the document as long as
prototyping is finished... But it's okay.

> 
> 
> > 
> > - why we need this feature? Is that purely for migration purpose?  Or
> >    it can be used somewhere else too?
> 
> Yes. Migration is the one that currently benefits a lot from this feature. I
> haven't thought of others so far. It is common that new features start with
> just 1 or 2 typical use cases.

Yes, it was a pure question actually; this is okay to me.

> 
> 
> > - high level stuff about how this is implemented, e.g.:
> >    - the protocol of the new virtio queues
> >    - how we should get the free page hints (please see below)
> 
> The high-level introduction would be
> 1. host sends a start cmd id to the guest;
> 2. the guest starts a new round of reporting by sending a cmd_id+free page
> hints to host;
> 3. QEMU side optimization code applies the free page hints (filter them from
> the dirty bitmap) only when the reported cmd id matches the one that was
> just sent.
> 
> The protocol was suggested by Michael and has been thoroughly discussed when
> upstreaming the kernel part. It might not be necessary to go over that again
> :)

I don't mean we should go back to review the content again; I mean we
still need to have such a knowledge on some of the details. Since
there is no document to properly define the interface between
migration code and the balloon API yet, IMHO it's still useful even
for a reviewer from migration pov to fully understand what's that
behind, especially this is quite low-level stuff to play around with
guest pages, and it contains some tricky points and potential
cross-over with e.g. dirty page trackings.

> I would suggest to focus on the supplied interface and its usage in live
> migration. That is, now we have two APIs, start() and stop(), to start and
> stop the optimization.
> 
> 1) where in the migration code should we use them (do you agree with the
> step (1), (2), (3) you concluded below?)
> 2) how should we use them, directly do global call or via notifiers?

I don't know how Dave and Juan might think; here I tend to agree with
Michael that some notifier framework should be nicer.

> 
> > 
> > For now, what I see is that we do:
> > 
> > (1) stop hinting
> > (2) sync bitmap
> > (3) start hinting
> > 
> > Why this order?
> 
> We start to filter out free pages from the dirty bitmap only when all the
> dirty bits are ready there, i.e. after sync bitmap. To some degree, the
> action of synchronizing bitmap indicates the end of the last round and the
> beginning of the new round, so we stop the free page optimization for the
> old round when the old round ends.

Yeh this looks sane to me.

> 
> 
> >   My understanding is that obviously there is a race
> > between the page hinting thread and the dirty bitmap tracking part
> > (which is done in KVM).  How do we make sure there is no race?
> 
> Could you please explain more about the race you saw? (free page is reported
> from the guest, and the bitmap is tracked in KVM)

It's the one I mentioned below...

> 
> 
> 
> > 
> > An direct question is that, do we need to make sure step (1) must be
> > before step (2)?  Asked since I see that currently step (1) is an
> > async operation (taking a lock, set status, then return).  Then would
> > such an async operation satisfy any ordering requirement after all?
> 
> Yes. Step(1) guarantees us that the QEMU side optimization call has exited
> (we don't need to rely on guest side ACK because the guest could be in any
> state).

This is not that obvious to me.  For now I think it's true, since when
we call stop() we'll take the mutex, meanwhile the mutex is actually
always held by the iothread (in the big loop in
virtio_balloon_poll_free_page_hints) until either:

- it sleeps in qemu_cond_wait() [1], or
- it leaves the big loop [2]

Since I don't see anyone who will set dev->block_iothread to true for
the balloon device, then [1] cannot happen; then I think when stop()
has taken the mutex then the thread must have quitted the big loop,
which goes to path [2].  I am not sure my understanding is correct,
but in all cases "Step(1) guarantees us that the QEMU side
optimization call has exited" is not obvious to me.  Would you add
some comment to the code (or even improve the code itself somehow) to
help people understand that?

For example, I saw that the old github code has a pthread_join() (in
that old code it was not using iothread at all).  That's a very good
code example so that people can understand that it's a synchronous
operations.

> This is enough. If the guest continues to report after that, that
> reported hints will be detected as stale hints and dropped in the next start
> of optimization.

This is not clear to me too.  Say, stop() only sets the balloon status
to STOP, AFAIU it does not really modify the cmd_id field immediately,
then how will the new coming hints be known as stale hints?

> 
> 
> > 
> > Btw, I would appreciate if you can push your new trees (both QEMU and
> > kernel) to the links you mentioned in the cover letter - I noticed
> > that they are not the same as what you have posted on the list.
> > 
> 
> Sure.
> For kernel part, you can get it from linux-next:
> git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
> For the v7 QEMU part: git://github.com/wei-w-wang/qemu-free-page-hint.git
> (my connection to github is too slow, it would be ready in 24hours, I can
> also send you the raw patches via email if you need)

No need to post patches; I can read the ones on the list for sure.
It's just a reminder in case you forgot to push the tree when sending
new versions.  Thanks,
Peter Xu June 6, 2018, 6:43 a.m. UTC | #11
On Tue, Apr 24, 2018 at 02:13:47PM +0800, Wei Wang wrote:

[...]

> +static void virtio_balloon_poll_free_page_hints(void *opaque)
> +{
> +    VirtQueueElement *elem;
> +    VirtIOBalloon *dev = opaque;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> +    VirtQueue *vq = dev->free_page_vq;
> +    uint32_t id;
> +    size_t size;
> +
> +    while (1) {
> +        qemu_mutex_lock(&dev->free_page_lock);
> +        while (dev->block_iothread) {
> +            qemu_cond_wait(&dev->free_page_cond, &dev->free_page_lock);
> +        }
> +
> +        /*
> +         * If the migration thread actively stops the reporting, exit
> +         * immediately.
> +         */
> +        if (dev->free_page_report_status == FREE_PAGE_REPORT_S_STOP) {
> +            qemu_mutex_unlock(&dev->free_page_lock);
> +            break;
> +        }
> +
> +        elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
> +        if (!elem) {
> +            qemu_mutex_unlock(&dev->free_page_lock);
> +            continue;
> +        }
> +
> +        if (elem->out_num) {
> +            size = iov_to_buf(elem->out_sg, elem->out_num, 0, &id, sizeof(id));
> +            virtqueue_push(vq, elem, size);

Silly question: is this sending the same id back to guest?  Why?

> +            g_free(elem);
> +
> +            virtio_tswap32s(vdev, &id);
> +            if (unlikely(size != sizeof(id))) {
> +                virtio_error(vdev, "received an incorrect cmd id");

Forgot to unlock?

Maybe we can just move the lock operations outside:

  mutex_lock();
  while (1) {
    ...
    if (block) {
      qemu_cond_wait();
    }
    ...
    if (skip) {
      continue;
    }
    ...
    if (error) {
      break;
    }
    ...
  }
  mutex_unlock();

> +                break;
> +            }
> +            if (id == dev->free_page_report_cmd_id) {
> +                dev->free_page_report_status = FREE_PAGE_REPORT_S_START;
> +            } else {
> +                /*
> +                 * Stop the optimization only when it has started. This
> +                 * avoids a stale stop sign for the previous command.
> +                 */
> +                if (dev->free_page_report_status == FREE_PAGE_REPORT_S_START) {
> +                    dev->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
> +                    qemu_mutex_unlock(&dev->free_page_lock);
> +                    break;
> +                }
> +            }
> +        }
> +
> +        if (elem->in_num) {
> +            /* TODO: send the poison value to the destination */
> +            if (dev->free_page_report_status == FREE_PAGE_REPORT_S_START &&
> +                !dev->poison_val) {
> +                qemu_guest_free_page_hint(elem->in_sg[0].iov_base,
> +                                          elem->in_sg[0].iov_len);
> +            }
> +            virtqueue_push(vq, elem, 0);
> +            g_free(elem);
> +        }
> +        qemu_mutex_unlock(&dev->free_page_lock);
> +    }
> +    virtio_notify(vdev, vq);
> +}

[...]

> +static const VMStateDescription vmstate_virtio_balloon_free_page_report = {
> +    .name = "virtio-balloon-device/free-page-report",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = virtio_balloon_free_page_support,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT32(free_page_report_cmd_id, VirtIOBalloon),
> +        VMSTATE_UINT32(poison_val, VirtIOBalloon),

(could we move all the poison-related lines into another patch or
 postpone?  after all we don't support it yet, do we?)

> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>  static const VMStateDescription vmstate_virtio_balloon_device = {
>      .name = "virtio-balloon-device",
>      .version_id = 1,
> @@ -423,30 +572,42 @@ static const VMStateDescription vmstate_virtio_balloon_device = {
>          VMSTATE_UINT32(actual, VirtIOBalloon),
>          VMSTATE_END_OF_LIST()
>      },
> +    .subsections = (const VMStateDescription * []) {
> +        &vmstate_virtio_balloon_free_page_report,
> +        NULL
> +    }
>  };
>  
>  static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
>  {
>      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>      VirtIOBalloon *s = VIRTIO_BALLOON(dev);
> -    int ret;
>  
>      virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON,
>                  sizeof(struct virtio_balloon_config));
>  
> -    ret = qemu_add_balloon_handler(virtio_balloon_to_target,
> -                                   virtio_balloon_stat, s);
> -
> -    if (ret < 0) {
> -        error_setg(errp, "Only one balloon device is supported");
> -        virtio_cleanup(vdev);
> -        return;
> -    }
> -
>      s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
>      s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
>      s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
> -
> +    if (virtio_has_feature(s->host_features,
> +                           VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> +        s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE, NULL);
> +        s->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
> +        s->free_page_report_cmd_id =
> +                           VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN - 1;

Why explicitly -1?  I thought ID_MIN would be fine too?

> +        if (s->iothread) {
> +            object_ref(OBJECT(s->iothread));
> +            s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread),
> +                                       virtio_balloon_poll_free_page_hints, s);

Just to mention that now we can create internal iothreads.  Please
have a look at iothread_create().

> +            qemu_mutex_init(&s->free_page_lock);
> +            qemu_cond_init(&s->free_page_cond);
> +            s->block_iothread = false;
> +        } else {
> +            /* Simply disable this feature if the iothread wasn't created. */
> +            s->host_features &= ~(1 << VIRTIO_BALLOON_F_FREE_PAGE_HINT);
> +            virtio_error(vdev, "iothread is missing");
> +        }
> +    }
>      reset_stats(s);
>  }

Regards,
Wang, Wei W June 6, 2018, 10:04 a.m. UTC | #12
On 06/06/2018 01:42 PM, Peter Xu wrote:
>
> IMHO migration states do not suite here.  IMHO bitmap syncing is too
> frequently an operation, especially at the end of a precopy migration.
> If you really want to introduce some notifiers, I would prefer
> something new rather than fiddling around with migration state.  E.g.,
> maybe a new migration event notifiers, then introduce two new events
> for both start/end of bitmap syncing.

Please see if below aligns to what you meant:

MigrationState {
...
+ int ram_save_state;

}

typedef enum RamSaveState {
     RAM_SAVE_BEGIN = 0,
     RAM_SAVE_END = 1,
     RAM_SAVE_MAX = 2
}

then at the step 1) and 3) you concluded somewhere below, we change the 
state and invoke the callback.


Btw, the migration_state_notifiers is already there, but seems not 
really used (I only tracked spice-core.c called 
add_migration_state_change_notifier). I thought adding new migration 
states can reuse all that we have.
What's your real concern about that? (not sure how defining new events 
would make a difference)

>> I would suggest to focus on the supplied interface and its usage in live
>> migration. That is, now we have two APIs, start() and stop(), to start and
>> stop the optimization.
>>
>> 1) where in the migration code should we use them (do you agree with the
>> step (1), (2), (3) you concluded below?)
>> 2) how should we use them, directly do global call or via notifiers?
> I don't know how Dave and Juan might think; here I tend to agree with
> Michael that some notifier framework should be nicer.
>

What would be the advantages of using notifiers here?



> This is not that obvious to me.  For now I think it's true, since when
> we call stop() we'll take the mutex, meanwhile the mutex is actually
> always held by the iothread (in the big loop in
> virtio_balloon_poll_free_page_hints) until either:
>
> - it sleeps in qemu_cond_wait() [1], or
> - it leaves the big loop [2]
>
> Since I don't see anyone who will set dev->block_iothread to true for
> the balloon device, then [1] cannot happen;

there is a case in virtio_balloon_set_status which sets 
dev->block_iothread to true.

Did you mean the free_page_lock mutex? it is released at the bottom of 
the while() loop in virtio_balloon_poll_free_page_hint. It's actually 
released for every hint. That is,

while(1){
     take the lock;
     process 1 hint from the vq;
     release the lock;
}

>   then I think when stop()
> has taken the mutex then the thread must have quitted the big loop,
> which goes to path [2].  I am not sure my understanding is correct,
> but in all cases "Step(1) guarantees us that the QEMU side
> optimization call has exited" is not obvious to me.  Would you add
> some comment to the code (or even improve the code itself somehow) to
> help people understand that?
>
> For example, I saw that the old github code has a pthread_join() (in
> that old code it was not using iothread at all).  That's a very good
> code example so that people can understand that it's a synchronous
> operations.

>> This is enough. If the guest continues to report after that, that
>> reported hints will be detected as stale hints and dropped in the next start
>> of optimization.
> This is not clear to me too.  Say, stop() only sets the balloon status
> to STOP, AFAIU it does not really modify the cmd_id field immediately,
> then how will the new coming hints be known as stale hints?

Yes, you get that correctly - stop() only sets the status to STOP. On 
the other side, virtio_balloon_poll_free_page_hints will stop when it 
sees the staus is STOP. The free_page_lock guarantees that when stop() 
returns, virtio_balloon_poll_free_page_hints will not proceed. When 
virtio_balloon_poll_free_page_hints exits, the coming hints are not 
processed any more. They just stay in the vq. The next time start() is 
called, virtio_balloon_poll_free_page_hints works again, it will first 
drop all those stale hints.
I'll see where I could add some comments to explain.


Best,
Wei
Wang, Wei W June 6, 2018, 10:11 a.m. UTC | #13
On 06/06/2018 02:43 PM, Peter Xu wrote:
> On Tue, Apr 24, 2018 at 02:13:47PM +0800, Wei Wang wrote:
>
> [...]
>
> +        if (elem->out_num) {
> +            size = iov_to_buf(elem->out_sg, elem->out_num, 0, &id, sizeof(id));
> +            virtqueue_push(vq, elem, size);
> Silly question: is this sending the same id back to guest?  Why?

No. It's just giving back the used buffer.

>
>> +            g_free(elem);
>> +
>> +            virtio_tswap32s(vdev, &id);
>> +            if (unlikely(size != sizeof(id))) {
>> +                virtio_error(vdev, "received an incorrect cmd id");
> Forgot to unlock?
>
> Maybe we can just move the lock operations outside:
>
>    mutex_lock();
>    while (1) {
>      ...
>      if (block) {
>        qemu_cond_wait();
>      }
>      ...
>      if (skip) {
>        continue;
>      }
>      ...
>      if (error) {
>        break;
>      }
>      ...
>    }
>    mutex_unlock();


I got similar comments from Michael, and it will be
while (1) {
lock;
func();
unlock();
}

All the unlock inside the body will be gone.

> [...]
>> +static const VMStateDescription vmstate_virtio_balloon_free_page_report = {
>> +    .name = "virtio-balloon-device/free-page-report",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .needed = virtio_balloon_free_page_support,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_UINT32(free_page_report_cmd_id, VirtIOBalloon),
>> +        VMSTATE_UINT32(poison_val, VirtIOBalloon),
> (could we move all the poison-related lines into another patch or
>   postpone?  after all we don't support it yet, do we?)
>

  We don't support migrating poison value, but guest maybe use it, so we 
are actually disabling this feature in that case. Probably good to leave 
the code together to handle that case.


>> +    if (virtio_has_feature(s->host_features,
>> +                           VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
>> +        s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE, NULL);
>> +        s->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
>> +        s->free_page_report_cmd_id =
>> +                           VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN - 1;
> Why explicitly -1?  I thought ID_MIN would be fine too?

Yes, that will also be fine. Since we states that the cmd id will be 
from [MIN, MAX], and we make s->free_page_report_cmd_id++ in start(), 
using VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN here will make it [MIN 
+ 1, MAX].

>
>> +        if (s->iothread) {
>> +            object_ref(OBJECT(s->iothread));
>> +            s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread),
>> +                                       virtio_balloon_poll_free_page_hints, s);
> Just to mention that now we can create internal iothreads.  Please
> have a look at iothread_create().

Thanks. I noticed that, but I think configuring via the cmd line can let 
people share the iothread with other devices that need it.

Best,
Wei
Peter Xu June 6, 2018, 11:02 a.m. UTC | #14
On Wed, Jun 06, 2018 at 06:04:23PM +0800, Wei Wang wrote:
> On 06/06/2018 01:42 PM, Peter Xu wrote:
> > 
> > IMHO migration states do not suite here.  IMHO bitmap syncing is too
> > frequently an operation, especially at the end of a precopy migration.
> > If you really want to introduce some notifiers, I would prefer
> > something new rather than fiddling around with migration state.  E.g.,
> > maybe a new migration event notifiers, then introduce two new events
> > for both start/end of bitmap syncing.
> 
> Please see if below aligns to what you meant:
> 
> MigrationState {
> ...
> + int ram_save_state;
> 
> }
> 
> typedef enum RamSaveState {
>     RAM_SAVE_BEGIN = 0,
>     RAM_SAVE_END = 1,
>     RAM_SAVE_MAX = 2
> }
> 
> then at the step 1) and 3) you concluded somewhere below, we change the
> state and invoke the callback.

I mean something like this:

1693c64c27 ("postcopy: Add notifier chain", 2018-03-20)

That was a postcopy-only notifier.  Maybe we can generalize it into a
more common notifier for the migration framework so that we can even
register with non-postcopy events like bitmap syncing?

> 
> 
> Btw, the migration_state_notifiers is already there, but seems not really
> used (I only tracked spice-core.c called
> add_migration_state_change_notifier). I thought adding new migration states
> can reuse all that we have.
> What's your real concern about that? (not sure how defining new events would
> make a difference)

Migration state is exposed via control path (QMP).  Adding new states
mean that the QMP clients will see more.  IMO that's not really
anything that a QMP client will need to know, instead we can keep it
internally.  That's a reason from compatibility pov.

Meanwhile, it's not really a state-thing at all for me.  It looks
really more like hook or event (start/stop of sync).

> 
> > > I would suggest to focus on the supplied interface and its usage in live
> > > migration. That is, now we have two APIs, start() and stop(), to start and
> > > stop the optimization.
> > > 
> > > 1) where in the migration code should we use them (do you agree with the
> > > step (1), (2), (3) you concluded below?)
> > > 2) how should we use them, directly do global call or via notifiers?
> > I don't know how Dave and Juan might think; here I tend to agree with
> > Michael that some notifier framework should be nicer.
> > 
> 
> What would be the advantages of using notifiers here?

Isolation of modules?  Then migration/ram.c at least won't need to
include something like "balloon.h".

And I think it's also possible too if some other modules would like to
hook at these places someday.

> 
> 
> 
> > This is not that obvious to me.  For now I think it's true, since when
> > we call stop() we'll take the mutex, meanwhile the mutex is actually
> > always held by the iothread (in the big loop in
> > virtio_balloon_poll_free_page_hints) until either:
> > 
> > - it sleeps in qemu_cond_wait() [1], or
> > - it leaves the big loop [2]
> > 
> > Since I don't see anyone who will set dev->block_iothread to true for
> > the balloon device, then [1] cannot happen;
> 
> there is a case in virtio_balloon_set_status which sets dev->block_iothread
> to true.
> 
> Did you mean the free_page_lock mutex? it is released at the bottom of the
> while() loop in virtio_balloon_poll_free_page_hint. It's actually released
> for every hint. That is,
> 
> while(1){
>     take the lock;
>     process 1 hint from the vq;
>     release the lock;
> }

Ah, so now I understand why you need the lock to be inside the loop,
since the loop is busy polling actually.  Is it possible to do this in
an async way?  I'm a bit curious on how much time will it use to do
one round of the free page hints (e.g., an idle guest with 8G mem, or
any configuration you tested)?  I suppose during that time the
iothread will be held steady with 100% cpu usage, am I right?

> 
> >   then I think when stop()
> > has taken the mutex then the thread must have quitted the big loop,
> > which goes to path [2].  I am not sure my understanding is correct,
> > but in all cases "Step(1) guarantees us that the QEMU side
> > optimization call has exited" is not obvious to me.  Would you add
> > some comment to the code (or even improve the code itself somehow) to
> > help people understand that?
> > 
> > For example, I saw that the old github code has a pthread_join() (in
> > that old code it was not using iothread at all).  That's a very good
> > code example so that people can understand that it's a synchronous
> > operations.
> 
> > > This is enough. If the guest continues to report after that, that
> > > reported hints will be detected as stale hints and dropped in the next start
> > > of optimization.
> > This is not clear to me too.  Say, stop() only sets the balloon status
> > to STOP, AFAIU it does not really modify the cmd_id field immediately,
> > then how will the new coming hints be known as stale hints?
> 
> Yes, you get that correctly - stop() only sets the status to STOP. On the
> other side, virtio_balloon_poll_free_page_hints will stop when it sees the
> staus is STOP. The free_page_lock guarantees that when stop() returns,
> virtio_balloon_poll_free_page_hints will not proceed. When
> virtio_balloon_poll_free_page_hints exits, the coming hints are not
> processed any more. They just stay in the vq. The next time start() is
> called, virtio_balloon_poll_free_page_hints works again, it will first drop
> all those stale hints.
> I'll see where I could add some comments to explain.

That'll be nice.

Thanks,
Peter Xu June 7, 2018, 3:17 a.m. UTC | #15
On Wed, Jun 06, 2018 at 06:11:50PM +0800, Wei Wang wrote:
> On 06/06/2018 02:43 PM, Peter Xu wrote:
> > On Tue, Apr 24, 2018 at 02:13:47PM +0800, Wei Wang wrote:
> > 
> > [...]
> > 
> > +        if (elem->out_num) {
> > +            size = iov_to_buf(elem->out_sg, elem->out_num, 0, &id, sizeof(id));
> > +            virtqueue_push(vq, elem, size);
> > Silly question: is this sending the same id back to guest?  Why?
> 
> No. It's just giving back the used buffer.

Oops, sorry!

> 
> > 
> > > +            g_free(elem);
> > > +
> > > +            virtio_tswap32s(vdev, &id);
> > > +            if (unlikely(size != sizeof(id))) {
> > > +                virtio_error(vdev, "received an incorrect cmd id");
> > Forgot to unlock?
> > 
> > Maybe we can just move the lock operations outside:
> > 
> >    mutex_lock();
> >    while (1) {
> >      ...
> >      if (block) {
> >        qemu_cond_wait();
> >      }
> >      ...
> >      if (skip) {
> >        continue;
> >      }
> >      ...
> >      if (error) {
> >        break;
> >      }
> >      ...
> >    }
> >    mutex_unlock();
> 
> 
> I got similar comments from Michael, and it will be
> while (1) {
> lock;
> func();
> unlock();
> }
> 
> All the unlock inside the body will be gone.

Ok I think I have more question on this part...

Actually AFAICT this new feature uses iothread in a way very similar
to the block layer, so I digged a bit on how block layer used the
iothreads.  I see that the block code is using something like
virtio_queue_aio_set_host_notifier_handler() to hook up the
iothread/aiocontext and the ioeventfd, however here you are manually
creating one QEMUBH and bound that to the new context.  Should you
also use something like the block layer?  Then IMHO you can avoid
using a busy loop there (assuming the performance does not really
matter that much here for page hintings), and all the packet handling
can again be based on interrupts from the guest (ioeventfd).

[1]

> 
> > [...]
> > > +static const VMStateDescription vmstate_virtio_balloon_free_page_report = {
> > > +    .name = "virtio-balloon-device/free-page-report",
> > > +    .version_id = 1,
> > > +    .minimum_version_id = 1,
> > > +    .needed = virtio_balloon_free_page_support,
> > > +    .fields = (VMStateField[]) {
> > > +        VMSTATE_UINT32(free_page_report_cmd_id, VirtIOBalloon),
> > > +        VMSTATE_UINT32(poison_val, VirtIOBalloon),
> > (could we move all the poison-related lines into another patch or
> >   postpone?  after all we don't support it yet, do we?)
> > 
> 
>  We don't support migrating poison value, but guest maybe use it, so we are
> actually disabling this feature in that case. Probably good to leave the
> code together to handle that case.

Could we just avoid declaring that feature bit in emulation code
completely?  I mean, we support VIRTIO_BALLOON_F_FREE_PAGE_HINT first
as the first step (as you mentioned in commit message, the POISON is a
TODO).  Then when you really want to completely support the POISON
bit, you can put all that into a separate patch.  Would that work?

> 
> 
> > > +    if (virtio_has_feature(s->host_features,
> > > +                           VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> > > +        s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE, NULL);
> > > +        s->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
> > > +        s->free_page_report_cmd_id =
> > > +                           VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN - 1;
> > Why explicitly -1?  I thought ID_MIN would be fine too?
> 
> Yes, that will also be fine. Since we states that the cmd id will be from
> [MIN, MAX], and we make s->free_page_report_cmd_id++ in start(), using
> VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN here will make it [MIN + 1, MAX].

Then I would prefer we just use the MIN value, otherwise IMO we'd
better have a comment mentioning about why that -1 is there.

> 
> > 
> > > +        if (s->iothread) {
> > > +            object_ref(OBJECT(s->iothread));
> > > +            s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread),
> > > +                                       virtio_balloon_poll_free_page_hints, s);
> > Just to mention that now we can create internal iothreads.  Please
> > have a look at iothread_create().
> 
> Thanks. I noticed that, but I think configuring via the cmd line can let
> people share the iothread with other devices that need it.

Ok.  Please have a look at my previous comment at [1].  I'm not sure
whether my understanding is correct.  But in case if so, not sure
whether we can avoid this QEMUBH here.

Thanks,
Wang, Wei W June 7, 2018, 5:24 a.m. UTC | #16
On 06/06/2018 07:02 PM, Peter Xu wrote:
> On Wed, Jun 06, 2018 at 06:04:23PM +0800, Wei Wang wrote:
>> On 06/06/2018 01:42 PM, Peter Xu wrote:
>>> IMHO migration states do not suite here.  IMHO bitmap syncing is too
>>> frequently an operation, especially at the end of a precopy migration.
>>> If you really want to introduce some notifiers, I would prefer
>>> something new rather than fiddling around with migration state.  E.g.,
>>> maybe a new migration event notifiers, then introduce two new events
>>> for both start/end of bitmap syncing.
>> Please see if below aligns to what you meant:
>>
>> MigrationState {
>> ...
>> + int ram_save_state;
>>
>> }
>>
>> typedef enum RamSaveState {
>>      RAM_SAVE_BEGIN = 0,
>>      RAM_SAVE_END = 1,
>>      RAM_SAVE_MAX = 2
>> }
>>
>> then at the step 1) and 3) you concluded somewhere below, we change the
>> state and invoke the callback.
> I mean something like this:
>
> 1693c64c27 ("postcopy: Add notifier chain", 2018-03-20)
>
> That was a postcopy-only notifier.  Maybe we can generalize it into a
> more common notifier for the migration framework so that we can even
> register with non-postcopy events like bitmap syncing?

Precopy already has its own notifiers: git 99a0db9b
If we want to reuse, that one would be more suitable. I think mixing 
non-related events into one notifier list isn't nice.

>>
>> Btw, the migration_state_notifiers is already there, but seems not really
>> used (I only tracked spice-core.c called
>> add_migration_state_change_notifier). I thought adding new migration states
>> can reuse all that we have.
>> What's your real concern about that? (not sure how defining new events would
>> make a difference)
> Migration state is exposed via control path (QMP).  Adding new states
> mean that the QMP clients will see more.  IMO that's not really
> anything that a QMP client will need to know, instead we can keep it
> internally.  That's a reason from compatibility pov.
>
> Meanwhile, it's not really a state-thing at all for me.  It looks
> really more like hook or event (start/stop of sync).

Thanks for sharing your concerns in detail, which are quite helpful for 
the discussion. To reuse 99a0db9b, we can also add sub-states (or say 
events), instead of new migration states.
For example, we can still define "enum RamSaveState" as above, which can 
be an indication for the notifier queued on the 99a0db9b notider_list to 
decide whether to call start or stop.
Does this solve your concern?


>
>>>> I would suggest to focus on the supplied interface and its usage in live
>>>> migration. That is, now we have two APIs, start() and stop(), to start and
>>>> stop the optimization.
>>>>
>>>> 1) where in the migration code should we use them (do you agree with the
>>>> step (1), (2), (3) you concluded below?)
>>>> 2) how should we use them, directly do global call or via notifiers?
>>> I don't know how Dave and Juan might think; here I tend to agree with
>>> Michael that some notifier framework should be nicer.
>>>
>> What would be the advantages of using notifiers here?
> Isolation of modules?  Then migration/ram.c at least won't need to
> include something like "balloon.h".
>
> And I think it's also possible too if some other modules would like to
> hook at these places someday.

OK, I can implement it with notifiers, but this (framework) is usually 
added when someday there is a second user who needs a callback at the 
same place.



>>
>>
>>> This is not that obvious to me.  For now I think it's true, since when
>>> we call stop() we'll take the mutex, meanwhile the mutex is actually
>>> always held by the iothread (in the big loop in
>>> virtio_balloon_poll_free_page_hints) until either:
>>>
>>> - it sleeps in qemu_cond_wait() [1], or
>>> - it leaves the big loop [2]
>>>
>>> Since I don't see anyone who will set dev->block_iothread to true for
>>> the balloon device, then [1] cannot happen;
>> there is a case in virtio_balloon_set_status which sets dev->block_iothread
>> to true.
>>
>> Did you mean the free_page_lock mutex? it is released at the bottom of the
>> while() loop in virtio_balloon_poll_free_page_hint. It's actually released
>> for every hint. That is,
>>
>> while(1){
>>      take the lock;
>>      process 1 hint from the vq;
>>      release the lock;
>> }
> Ah, so now I understand why you need the lock to be inside the loop,
> since the loop is busy polling actually.  Is it possible to do this in
> an async way?

We need to use polling here because of some back story in the guest side 
(due to some locks being held) that makes it a barrier to sending 
notifications for each hints.

> I'm a bit curious on how much time will it use to do
> one round of the free page hints (e.g., an idle guest with 8G mem, or
> any configuration you tested)?  I suppose during that time the
> iothread will be held steady with 100% cpu usage, am I right?

Compared to the time spent by the legacy migration to send free pages, 
that small amount of CPU usage spent on filtering free pages could be 
neglected.
Grinding a chopper will not hold up the work of cutting firewood :)

Best,
Wei
Wang, Wei W June 7, 2018, 5:29 a.m. UTC | #17
On 06/07/2018 11:17 AM, Peter Xu wrote:
> On Wed, Jun 06, 2018 at 06:11:50PM +0800, Wei Wang wrote:
>
> I got similar comments from Michael, and it will be
> while (1) {
> lock;
> func();
> unlock();
> }
>
> All the unlock inside the body will be gone.
> Ok I think I have more question on this part...
>
> Actually AFAICT this new feature uses iothread in a way very similar
> to the block layer, so I digged a bit on how block layer used the
> iothreads.  I see that the block code is using something like
> virtio_queue_aio_set_host_notifier_handler() to hook up the
> iothread/aiocontext and the ioeventfd, however here you are manually
> creating one QEMUBH and bound that to the new context.  Should you
> also use something like the block layer?  Then IMHO you can avoid
> using a busy loop there (assuming the performance does not really
> matter that much here for page hintings), and all the packet handling
> can again be based on interrupts from the guest (ioeventfd).
>
> [1]

Also mentioned in another discussion thread that it's better to not let 
guest send notifications. Otherwise, we would have used the virtqueue 
door bell to notify host.
So we need to use polling here, and Michael suggested to implemented in 
BH, which sounds good to me.


>
>>> [...]
>>>> +static const VMStateDescription vmstate_virtio_balloon_free_page_report = {
>>>> +    .name = "virtio-balloon-device/free-page-report",
>>>> +    .version_id = 1,
>>>> +    .minimum_version_id = 1,
>>>> +    .needed = virtio_balloon_free_page_support,
>>>> +    .fields = (VMStateField[]) {
>>>> +        VMSTATE_UINT32(free_page_report_cmd_id, VirtIOBalloon),
>>>> +        VMSTATE_UINT32(poison_val, VirtIOBalloon),
>>> (could we move all the poison-related lines into another patch or
>>>    postpone?  after all we don't support it yet, do we?)
>>>
>>   We don't support migrating poison value, but guest maybe use it, so we are
>> actually disabling this feature in that case. Probably good to leave the
>> code together to handle that case.
> Could we just avoid declaring that feature bit in emulation code
> completely?  I mean, we support VIRTIO_BALLOON_F_FREE_PAGE_HINT first
> as the first step (as you mentioned in commit message, the POISON is a
> TODO).  Then when you really want to completely support the POISON
> bit, you can put all that into a separate patch.  Would that work?
>

Not really. The F_PAGE_POISON isn't a feature configured via QEMU cmd 
line like F_FREE_PAGE_HINT. We always set F_PAGE_POISON if 
F_FREE_PAGE_HINT is enabled. It is used to detect if the guest is using 
page poison.


>>
>>>> +    if (virtio_has_feature(s->host_features,
>>>> +                           VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
>>>> +        s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE, NULL);
>>>> +        s->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
>>>> +        s->free_page_report_cmd_id =
>>>> +                           VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN - 1;
>>> Why explicitly -1?  I thought ID_MIN would be fine too?
>> Yes, that will also be fine. Since we states that the cmd id will be from
>> [MIN, MAX], and we make s->free_page_report_cmd_id++ in start(), using
>> VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN here will make it [MIN + 1, MAX].
> Then I would prefer we just use the MIN value, otherwise IMO we'd
> better have a comment mentioning about why that -1 is there.

Sure, we can do that.

Best,
Wei
Peter Xu June 7, 2018, 6:32 a.m. UTC | #18
On Thu, Jun 07, 2018 at 01:24:29PM +0800, Wei Wang wrote:
> On 06/06/2018 07:02 PM, Peter Xu wrote:
> > On Wed, Jun 06, 2018 at 06:04:23PM +0800, Wei Wang wrote:
> > > On 06/06/2018 01:42 PM, Peter Xu wrote:
> > > > IMHO migration states do not suite here.  IMHO bitmap syncing is too
> > > > frequently an operation, especially at the end of a precopy migration.
> > > > If you really want to introduce some notifiers, I would prefer
> > > > something new rather than fiddling around with migration state.  E.g.,
> > > > maybe a new migration event notifiers, then introduce two new events
> > > > for both start/end of bitmap syncing.
> > > Please see if below aligns to what you meant:
> > > 
> > > MigrationState {
> > > ...
> > > + int ram_save_state;
> > > 
> > > }
> > > 
> > > typedef enum RamSaveState {
> > >      RAM_SAVE_BEGIN = 0,
> > >      RAM_SAVE_END = 1,
> > >      RAM_SAVE_MAX = 2
> > > }
> > > 
> > > then at the step 1) and 3) you concluded somewhere below, we change the
> > > state and invoke the callback.
> > I mean something like this:
> > 
> > 1693c64c27 ("postcopy: Add notifier chain", 2018-03-20)
> > 
> > That was a postcopy-only notifier.  Maybe we can generalize it into a
> > more common notifier for the migration framework so that we can even
> > register with non-postcopy events like bitmap syncing?
> 
> Precopy already has its own notifiers: git 99a0db9b
> If we want to reuse, that one would be more suitable. I think mixing
> non-related events into one notifier list isn't nice.

I think that's only for migration state changes?

> 
> > > 
> > > Btw, the migration_state_notifiers is already there, but seems not really
> > > used (I only tracked spice-core.c called
> > > add_migration_state_change_notifier). I thought adding new migration states
> > > can reuse all that we have.
> > > What's your real concern about that? (not sure how defining new events would
> > > make a difference)
> > Migration state is exposed via control path (QMP).  Adding new states
> > mean that the QMP clients will see more.  IMO that's not really
> > anything that a QMP client will need to know, instead we can keep it
> > internally.  That's a reason from compatibility pov.
> > 
> > Meanwhile, it's not really a state-thing at all for me.  It looks
> > really more like hook or event (start/stop of sync).
> 
> Thanks for sharing your concerns in detail, which are quite helpful for the
> discussion. To reuse 99a0db9b, we can also add sub-states (or say events),
> instead of new migration states.
> For example, we can still define "enum RamSaveState" as above, which can be
> an indication for the notifier queued on the 99a0db9b notider_list to decide
> whether to call start or stop.
> Does this solve your concern?

Frankly speaking I don't fully understand how you would add that
sub-state.  If you are confident with the idea, maybe you can post
your new version with the change, then I can read the code.

> 
> 
> > 
> > > > > I would suggest to focus on the supplied interface and its usage in live
> > > > > migration. That is, now we have two APIs, start() and stop(), to start and
> > > > > stop the optimization.
> > > > > 
> > > > > 1) where in the migration code should we use them (do you agree with the
> > > > > step (1), (2), (3) you concluded below?)
> > > > > 2) how should we use them, directly do global call or via notifiers?
> > > > I don't know how Dave and Juan might think; here I tend to agree with
> > > > Michael that some notifier framework should be nicer.
> > > > 
> > > What would be the advantages of using notifiers here?
> > Isolation of modules?  Then migration/ram.c at least won't need to
> > include something like "balloon.h".
> > 
> > And I think it's also possible too if some other modules would like to
> > hook at these places someday.
> 
> OK, I can implement it with notifiers, but this (framework) is usually added
> when someday there is a second user who needs a callback at the same place.
> 
> 
> 
> > > 
> > > 
> > > > This is not that obvious to me.  For now I think it's true, since when
> > > > we call stop() we'll take the mutex, meanwhile the mutex is actually
> > > > always held by the iothread (in the big loop in
> > > > virtio_balloon_poll_free_page_hints) until either:
> > > > 
> > > > - it sleeps in qemu_cond_wait() [1], or
> > > > - it leaves the big loop [2]
> > > > 
> > > > Since I don't see anyone who will set dev->block_iothread to true for
> > > > the balloon device, then [1] cannot happen;
> > > there is a case in virtio_balloon_set_status which sets dev->block_iothread
> > > to true.
> > > 
> > > Did you mean the free_page_lock mutex? it is released at the bottom of the
> > > while() loop in virtio_balloon_poll_free_page_hint. It's actually released
> > > for every hint. That is,
> > > 
> > > while(1){
> > >      take the lock;
> > >      process 1 hint from the vq;
> > >      release the lock;
> > > }
> > Ah, so now I understand why you need the lock to be inside the loop,
> > since the loop is busy polling actually.  Is it possible to do this in
> > an async way?
> 
> We need to use polling here because of some back story in the guest side
> (due to some locks being held) that makes it a barrier to sending
> notifications for each hints.

Any link to the "back story" that I can learn about? :) If it's too
complicated a problem and you think I don't need to understand at all,
please feel free to do so.  Then I would assume at least Michael has
fully acknowledged that idea, and I can just stop putting more time on
this part.

Besides, if you are going to use a busy loop, then I would be not
quite sure about whether you really want to share that iothread with
others, since AFAIU that's not how iothread is designed (which is
mostly event-based and should not welcome out-of-control blocking in
the handler of events).  Though I am not 100% confident about my
understaning on that, I only raise this question up.  Anyway you'll
just take over the thread for a while without sharing, and after the
burst IOs it's mostly never used (until the next bitmap sync).  Then
it seems a bit confusing to me on why you need to share that after
all.

> 
> > I'm a bit curious on how much time will it use to do
> > one round of the free page hints (e.g., an idle guest with 8G mem, or
> > any configuration you tested)?  I suppose during that time the
> > iothread will be held steady with 100% cpu usage, am I right?
> 
> Compared to the time spent by the legacy migration to send free pages, that
> small amount of CPU usage spent on filtering free pages could be neglected.
> Grinding a chopper will not hold up the work of cutting firewood :)

Sorry I didn't express myself clearly.

My question was that, have you measured how long time it will take
from starting of the free page hints (when balloon state is set to
FREE_PAGE_REPORT_S_REQUESTED), until it completes (when QEMU receives
the VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID, then set the status to
FREE_PAGE_REPORT_S_STOP)?
Peter Xu June 7, 2018, 6:58 a.m. UTC | #19
On Thu, Jun 07, 2018 at 01:29:22PM +0800, Wei Wang wrote:
> On 06/07/2018 11:17 AM, Peter Xu wrote:
> > On Wed, Jun 06, 2018 at 06:11:50PM +0800, Wei Wang wrote:
> > 
> > I got similar comments from Michael, and it will be
> > while (1) {
> > lock;
> > func();
> > unlock();
> > }
> > 
> > All the unlock inside the body will be gone.
> > Ok I think I have more question on this part...
> > 
> > Actually AFAICT this new feature uses iothread in a way very similar
> > to the block layer, so I digged a bit on how block layer used the
> > iothreads.  I see that the block code is using something like
> > virtio_queue_aio_set_host_notifier_handler() to hook up the
> > iothread/aiocontext and the ioeventfd, however here you are manually
> > creating one QEMUBH and bound that to the new context.  Should you
> > also use something like the block layer?  Then IMHO you can avoid
> > using a busy loop there (assuming the performance does not really
> > matter that much here for page hintings), and all the packet handling
> > can again be based on interrupts from the guest (ioeventfd).
> > 
> > [1]
> 
> Also mentioned in another discussion thread that it's better to not let
> guest send notifications. Otherwise, we would have used the virtqueue door
> bell to notify host.
> So we need to use polling here, and Michael suggested to implemented in BH,
> which sounds good to me.

(We're discussing the same problem in the other thread, so let's do it
 there)

> 
> 
> > 
> > > > [...]
> > > > > +static const VMStateDescription vmstate_virtio_balloon_free_page_report = {
> > > > > +    .name = "virtio-balloon-device/free-page-report",
> > > > > +    .version_id = 1,
> > > > > +    .minimum_version_id = 1,
> > > > > +    .needed = virtio_balloon_free_page_support,
> > > > > +    .fields = (VMStateField[]) {
> > > > > +        VMSTATE_UINT32(free_page_report_cmd_id, VirtIOBalloon),
> > > > > +        VMSTATE_UINT32(poison_val, VirtIOBalloon),
> > > > (could we move all the poison-related lines into another patch or
> > > >    postpone?  after all we don't support it yet, do we?)
> > > > 
> > >   We don't support migrating poison value, but guest maybe use it, so we are
> > > actually disabling this feature in that case. Probably good to leave the
> > > code together to handle that case.
> > Could we just avoid declaring that feature bit in emulation code
> > completely?  I mean, we support VIRTIO_BALLOON_F_FREE_PAGE_HINT first
> > as the first step (as you mentioned in commit message, the POISON is a
> > TODO).  Then when you really want to completely support the POISON
> > bit, you can put all that into a separate patch.  Would that work?
> > 
> 
> Not really. The F_PAGE_POISON isn't a feature configured via QEMU cmd line
> like F_FREE_PAGE_HINT. We always set F_PAGE_POISON if F_FREE_PAGE_HINT is
> enabled. It is used to detect if the guest is using page poison.

Ok I think I kind of understand.  But it seems strange to me to have
this as a feature bit.  I thought it suites more to be a config field
so that guest could setup.  Like, we can have 1 byte to setup "whether
PAGE_POISON is used in the guest", another 1 byte to setup "what is
the PAGE_POISON value if it's enabled".

Asked since I see this in virtio spec (v1.0, though I guess it won't
change) in chapter "2.2.1 Driver Requirements: Feature Bits":

"The driver MUST NOT accept a feature which the device did not offer"

Then I'm curious what would happen if:

- a emulator (not QEMU) only offered F_FREE_PAGE_HINT, not F_POISON
- a guest that enabled PAGE_POISON

Then how the driver could tell the host that PAGE_POISON is enabled
considering that guest should never set that feature bit if the
emulation code didn't provide it?
Wang, Wei W June 7, 2018, 11:59 a.m. UTC | #20
On 06/07/2018 02:32 PM, Peter Xu wrote:
> On Thu, Jun 07, 2018 at 01:24:29PM +0800, Wei Wang wrote:
>> On 06/06/2018 07:02 PM, Peter Xu wrote:
>>> On Wed, Jun 06, 2018 at 06:04:23PM +0800, Wei Wang wrote:
>>>> On 06/06/2018 01:42 PM, Peter Xu wrote:
>>>>> IMHO migration states do not suite here.  IMHO bitmap syncing is too
>>>>> frequently an operation, especially at the end of a precopy migration.
>>>>> If you really want to introduce some notifiers, I would prefer
>>>>> something new rather than fiddling around with migration state.  E.g.,
>>>>> maybe a new migration event notifiers, then introduce two new events
>>>>> for both start/end of bitmap syncing.
>>>> Please see if below aligns to what you meant:
>>>>
>>>> MigrationState {
>>>> ...
>>>> + int ram_save_state;
>>>>
>>>> }
>>>>
>>>> typedef enum RamSaveState {
>>>>       RAM_SAVE_BEGIN = 0,
>>>>       RAM_SAVE_END = 1,
>>>>       RAM_SAVE_MAX = 2
>>>> }
>>>>
>>>> then at the step 1) and 3) you concluded somewhere below, we change the
>>>> state and invoke the callback.
>>> I mean something like this:
>>>
>>> 1693c64c27 ("postcopy: Add notifier chain", 2018-03-20)
>>>
>>> That was a postcopy-only notifier.  Maybe we can generalize it into a
>>> more common notifier for the migration framework so that we can even
>>> register with non-postcopy events like bitmap syncing?
>> Precopy already has its own notifiers: git 99a0db9b
>> If we want to reuse, that one would be more suitable. I think mixing
>> non-related events into one notifier list isn't nice.
> I think that's only for migration state changes?
>
>>>> Btw, the migration_state_notifiers is already there, but seems not really
>>>> used (I only tracked spice-core.c called
>>>> add_migration_state_change_notifier). I thought adding new migration states
>>>> can reuse all that we have.
>>>> What's your real concern about that? (not sure how defining new events would
>>>> make a difference)
>>> Migration state is exposed via control path (QMP).  Adding new states
>>> mean that the QMP clients will see more.  IMO that's not really
>>> anything that a QMP client will need to know, instead we can keep it
>>> internally.  That's a reason from compatibility pov.
>>>
>>> Meanwhile, it's not really a state-thing at all for me.  It looks
>>> really more like hook or event (start/stop of sync).
>> Thanks for sharing your concerns in detail, which are quite helpful for the
>> discussion. To reuse 99a0db9b, we can also add sub-states (or say events),
>> instead of new migration states.
>> For example, we can still define "enum RamSaveState" as above, which can be
>> an indication for the notifier queued on the 99a0db9b notider_list to decide
>> whether to call start or stop.
>> Does this solve your concern?
> Frankly speaking I don't fully understand how you would add that
> sub-state.  If you are confident with the idea, maybe you can post
> your new version with the change, then I can read the code.

Sure. Code is more straightforward for this one. Let's check it in the 
new version.

>>>>> This is not that obvious to me.  For now I think it's true, since when
>>>>> we call stop() we'll take the mutex, meanwhile the mutex is actually
>>>>> always held by the iothread (in the big loop in
>>>>> virtio_balloon_poll_free_page_hints) until either:
>>>>>
>>>>> - it sleeps in qemu_cond_wait() [1], or
>>>>> - it leaves the big loop [2]
>>>>>
>>>>> Since I don't see anyone who will set dev->block_iothread to true for
>>>>> the balloon device, then [1] cannot happen;
>>>> there is a case in virtio_balloon_set_status which sets dev->block_iothread
>>>> to true.
>>>>
>>>> Did you mean the free_page_lock mutex? it is released at the bottom of the
>>>> while() loop in virtio_balloon_poll_free_page_hint. It's actually released
>>>> for every hint. That is,
>>>>
>>>> while(1){
>>>>       take the lock;
>>>>       process 1 hint from the vq;
>>>>       release the lock;
>>>> }
>>> Ah, so now I understand why you need the lock to be inside the loop,
>>> since the loop is busy polling actually.  Is it possible to do this in
>>> an async way?
>> We need to use polling here because of some back story in the guest side
>> (due to some locks being held) that makes it a barrier to sending
>> notifications for each hints.
> Any link to the "back story" that I can learn about? :) If it's too
> complicated a problem and you think I don't need to understand at all,
> please feel free to do so.

I searched a little bit, and forgot where we discussed this one. But the 
conclusion is that we don't want kick happens when the mm lock is held. 
Also, polling is a good idea here to me.
There are 32 versions of kernel patch discussions scattered, interesting 
to watch, but might take too much time. Also people usually have 
different thoughts (sometimes with partial understanding) when they 
watch something (we even have many different versions of implementations 
ourselves if you check the whole 32 versions). It's not easy to get here 
with many consensus. That's why I hope our discussion could be more 
focused on the migration part, which is the last part that has not be 
fully finalized.



> Then I would assume at least Michael has
> fully acknowledged that idea, and I can just stop putting more time on
> this part.

Yes, he's been on the loop since the beginning.


>
> Besides, if you are going to use a busy loop, then I would be not
> quite sure about whether you really want to share that iothread with
> others, since AFAIU that's not how iothread is designed (which is
> mostly event-based and should not welcome out-of-control blocking in
> the handler of events).  Though I am not 100% confident about my
> understaning on that, I only raise this question up.  Anyway you'll
> just take over the thread for a while without sharing, and after the
> burst IOs it's mostly never used (until the next bitmap sync).  Then
> it seems a bit confusing to me on why you need to share that after
> all.

Not necessarily _need_ to share it, I meant it can be shared using qemu 
command line.
Live migration doesn't happen all the time, and that optimization 
doesn't run that long, if users want to have other BHs run in this 
iothread context, they can only create one iothread via the qemu cmd line.


>
>>> I'm a bit curious on how much time will it use to do
>>> one round of the free page hints (e.g., an idle guest with 8G mem, or
>>> any configuration you tested)?  I suppose during that time the
>>> iothread will be held steady with 100% cpu usage, am I right?
>> Compared to the time spent by the legacy migration to send free pages, that
>> small amount of CPU usage spent on filtering free pages could be neglected.
>> Grinding a chopper will not hold up the work of cutting firewood :)
> Sorry I didn't express myself clearly.
>
> My question was that, have you measured how long time it will take
> from starting of the free page hints (when balloon state is set to
> FREE_PAGE_REPORT_S_REQUESTED), until it completes (when QEMU receives
> the VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID, then set the status to
> FREE_PAGE_REPORT_S_STOP)?
>

I vaguely remember it's several ms (for around 7.5G free pages) long 
time ago. What would be the concern behind that number you want to know?

Best,
Wei
Wang, Wei W June 7, 2018, 12:01 p.m. UTC | #21
On 06/07/2018 02:58 PM, Peter Xu wrote:
> On Thu, Jun 07, 2018 at 01:29:22PM +0800, Wei Wang wrote:
>>>>> [...]
>>>>>> +static const VMStateDescription vmstate_virtio_balloon_free_page_report = {
>>>>>> +    .name = "virtio-balloon-device/free-page-report",
>>>>>> +    .version_id = 1,
>>>>>> +    .minimum_version_id = 1,
>>>>>> +    .needed = virtio_balloon_free_page_support,
>>>>>> +    .fields = (VMStateField[]) {
>>>>>> +        VMSTATE_UINT32(free_page_report_cmd_id, VirtIOBalloon),
>>>>>> +        VMSTATE_UINT32(poison_val, VirtIOBalloon),
>>>>> (could we move all the poison-related lines into another patch or
>>>>>     postpone?  after all we don't support it yet, do we?)
>>>>>
>>>>    We don't support migrating poison value, but guest maybe use it, so we are
>>>> actually disabling this feature in that case. Probably good to leave the
>>>> code together to handle that case.
>>> Could we just avoid declaring that feature bit in emulation code
>>> completely?  I mean, we support VIRTIO_BALLOON_F_FREE_PAGE_HINT first
>>> as the first step (as you mentioned in commit message, the POISON is a
>>> TODO).  Then when you really want to completely support the POISON
>>> bit, you can put all that into a separate patch.  Would that work?
>>>
>> Not really. The F_PAGE_POISON isn't a feature configured via QEMU cmd line
>> like F_FREE_PAGE_HINT. We always set F_PAGE_POISON if F_FREE_PAGE_HINT is
>> enabled. It is used to detect if the guest is using page poison.
> Ok I think I kind of understand.  But it seems strange to me to have
> this as a feature bit.  I thought it suites more to be a config field
> so that guest could setup.  Like, we can have 1 byte to setup "whether
> PAGE_POISON is used in the guest", another 1 byte to setup "what is
> the PAGE_POISON value if it's enabled".

This is also suggested by Michael, which sounds good to me. Using config 
is doable, but that doesn't show advantages over using feature bits.



>
> Asked since I see this in virtio spec (v1.0, though I guess it won't
> change) in chapter "2.2.1 Driver Requirements: Feature Bits":
>
> "The driver MUST NOT accept a feature which the device did not offer"
>
> Then I'm curious what would happen if:
>
> - a emulator (not QEMU) only offered F_FREE_PAGE_HINT, not F_POISON
> - a guest that enabled PAGE_POISON
>
> Then how the driver could tell the host that PAGE_POISON is enabled
> considering that guest should never set that feature bit if the
> emulation code didn't provide it?
>

All the emulator implementations need to follow the virtio spec. We will 
finally have this feature written to the virtio-balloon device section, 
and state that the F_PAGE_POISON needs to be set on the device when 
F_FREE_PAGE_HINT is set on the device.


Best,
Wei
Peter Xu June 8, 2018, 1:37 a.m. UTC | #22
On Thu, Jun 07, 2018 at 08:01:42PM +0800, Wei Wang wrote:
> On 06/07/2018 02:58 PM, Peter Xu wrote:
> > On Thu, Jun 07, 2018 at 01:29:22PM +0800, Wei Wang wrote:
> > > > > > [...]
> > > > > > > +static const VMStateDescription vmstate_virtio_balloon_free_page_report = {
> > > > > > > +    .name = "virtio-balloon-device/free-page-report",
> > > > > > > +    .version_id = 1,
> > > > > > > +    .minimum_version_id = 1,
> > > > > > > +    .needed = virtio_balloon_free_page_support,
> > > > > > > +    .fields = (VMStateField[]) {
> > > > > > > +        VMSTATE_UINT32(free_page_report_cmd_id, VirtIOBalloon),
> > > > > > > +        VMSTATE_UINT32(poison_val, VirtIOBalloon),
> > > > > > (could we move all the poison-related lines into another patch or
> > > > > >     postpone?  after all we don't support it yet, do we?)
> > > > > > 
> > > > >    We don't support migrating poison value, but guest maybe use it, so we are
> > > > > actually disabling this feature in that case. Probably good to leave the
> > > > > code together to handle that case.
> > > > Could we just avoid declaring that feature bit in emulation code
> > > > completely?  I mean, we support VIRTIO_BALLOON_F_FREE_PAGE_HINT first
> > > > as the first step (as you mentioned in commit message, the POISON is a
> > > > TODO).  Then when you really want to completely support the POISON
> > > > bit, you can put all that into a separate patch.  Would that work?
> > > > 
> > > Not really. The F_PAGE_POISON isn't a feature configured via QEMU cmd line
> > > like F_FREE_PAGE_HINT. We always set F_PAGE_POISON if F_FREE_PAGE_HINT is
> > > enabled. It is used to detect if the guest is using page poison.
> > Ok I think I kind of understand.  But it seems strange to me to have
> > this as a feature bit.  I thought it suites more to be a config field
> > so that guest could setup.  Like, we can have 1 byte to setup "whether
> > PAGE_POISON is used in the guest", another 1 byte to setup "what is
> > the PAGE_POISON value if it's enabled".
> 
> This is also suggested by Michael, which sounds good to me. Using config is
> doable, but that doesn't show advantages over using feature bits.
> 
> 
> 
> > 
> > Asked since I see this in virtio spec (v1.0, though I guess it won't
> > change) in chapter "2.2.1 Driver Requirements: Feature Bits":
> > 
> > "The driver MUST NOT accept a feature which the device did not offer"
> > 
> > Then I'm curious what would happen if:
> > 
> > - a emulator (not QEMU) only offered F_FREE_PAGE_HINT, not F_POISON
> > - a guest that enabled PAGE_POISON
> > 
> > Then how the driver could tell the host that PAGE_POISON is enabled
> > considering that guest should never set that feature bit if the
> > emulation code didn't provide it?
> > 
> 
> All the emulator implementations need to follow the virtio spec. We will
> finally have this feature written to the virtio-balloon device section, and
> state that the F_PAGE_POISON needs to be set on the device when
> F_FREE_PAGE_HINT is set on the device.

Okay.  Still I would think a single feature cleaner here since they
are actually tightly bound together, e.g., normally AFAIU this only
happens when we introduce FEATURE1, after a while we introduced
FEATURE2, then we need to have two features there since there are
emulators that are already running only with FEATURE1.

AFAICT the thing behind is that your kernel patches are split (one for
FEATURE1, one for FEATURE2), however when you only have FEATURE1 it's
actually broken (if without the POISON support, PAGE_HINT feature
might be broken).  So it would be nicer if the kernel patches are
squashed so that no commit would broke any guest.  And, if they are
squashed then IMHO we don't need two feature bits at all. ;)

But anyway, I understand it now.  Thanks,
Peter Xu June 8, 2018, 1:58 a.m. UTC | #23
On Fri, Jun 08, 2018 at 09:37:23AM +0800, Peter Xu wrote:

[...]

> > > Asked since I see this in virtio spec (v1.0, though I guess it won't
> > > change) in chapter "2.2.1 Driver Requirements: Feature Bits":
> > > 
> > > "The driver MUST NOT accept a feature which the device did not offer"
> > > 
> > > Then I'm curious what would happen if:
> > > 
> > > - a emulator (not QEMU) only offered F_FREE_PAGE_HINT, not F_POISON
> > > - a guest that enabled PAGE_POISON
> > > 
> > > Then how the driver could tell the host that PAGE_POISON is enabled
> > > considering that guest should never set that feature bit if the
> > > emulation code didn't provide it?
> > > 
> > 
> > All the emulator implementations need to follow the virtio spec. We will
> > finally have this feature written to the virtio-balloon device section, and
> > state that the F_PAGE_POISON needs to be set on the device when
> > F_FREE_PAGE_HINT is set on the device.
> 
> Okay.  Still I would think a single feature cleaner here since they
> are actually tightly bound together, e.g., normally AFAIU this only
> happens when we introduce FEATURE1, after a while we introduced
> FEATURE2, then we need to have two features there since there are
> emulators that are already running only with FEATURE1.
> 
> AFAICT the thing behind is that your kernel patches are split (one for
> FEATURE1, one for FEATURE2), however when you only have FEATURE1 it's
> actually broken (if without the POISON support, PAGE_HINT feature
> might be broken).  So it would be nicer if the kernel patches are
> squashed so that no commit would broke any guest.  And, if they are
> squashed then IMHO we don't need two feature bits at all. ;)
> 
> But anyway, I understand it now.  Thanks,

This also reminds me that since we're going to declare both features
in this single patch, the final version of the patch should contain
the implementation of poisoned bits rather than a todo, am I right?

Regards,
Michael S. Tsirkin June 8, 2018, 1:58 a.m. UTC | #24
On Fri, Jun 08, 2018 at 09:37:23AM +0800, Peter Xu wrote:
> > > Asked since I see this in virtio spec (v1.0, though I guess it won't
> > > change) in chapter "2.2.1 Driver Requirements: Feature Bits":
> > > 
> > > "The driver MUST NOT accept a feature which the device did not offer"
> > > 
> > > Then I'm curious what would happen if:
> > > 
> > > - a emulator (not QEMU) only offered F_FREE_PAGE_HINT, not F_POISON
> > > - a guest that enabled PAGE_POISON
> > > 
> > > Then how the driver could tell the host that PAGE_POISON is enabled
> > > considering that guest should never set that feature bit if the
> > > emulation code didn't provide it?

It wouldn't. It just has to deal with the fact that host can discard
writes to hinted pages. Right now driver deals with it simply by
disabling F_FREE_PAGE_HINT.
Peter Xu June 8, 2018, 2:17 a.m. UTC | #25
On Thu, Jun 07, 2018 at 07:59:22PM +0800, Wei Wang wrote:
> On 06/07/2018 02:32 PM, Peter Xu wrote:
> > On Thu, Jun 07, 2018 at 01:24:29PM +0800, Wei Wang wrote:
> > > On 06/06/2018 07:02 PM, Peter Xu wrote:
> > > > On Wed, Jun 06, 2018 at 06:04:23PM +0800, Wei Wang wrote:
> > > > > On 06/06/2018 01:42 PM, Peter Xu wrote:
> > > > > > IMHO migration states do not suite here.  IMHO bitmap syncing is too
> > > > > > frequently an operation, especially at the end of a precopy migration.
> > > > > > If you really want to introduce some notifiers, I would prefer
> > > > > > something new rather than fiddling around with migration state.  E.g.,
> > > > > > maybe a new migration event notifiers, then introduce two new events
> > > > > > for both start/end of bitmap syncing.
> > > > > Please see if below aligns to what you meant:
> > > > > 
> > > > > MigrationState {
> > > > > ...
> > > > > + int ram_save_state;
> > > > > 
> > > > > }
> > > > > 
> > > > > typedef enum RamSaveState {
> > > > >       RAM_SAVE_BEGIN = 0,
> > > > >       RAM_SAVE_END = 1,
> > > > >       RAM_SAVE_MAX = 2
> > > > > }
> > > > > 
> > > > > then at the step 1) and 3) you concluded somewhere below, we change the
> > > > > state and invoke the callback.
> > > > I mean something like this:
> > > > 
> > > > 1693c64c27 ("postcopy: Add notifier chain", 2018-03-20)
> > > > 
> > > > That was a postcopy-only notifier.  Maybe we can generalize it into a
> > > > more common notifier for the migration framework so that we can even
> > > > register with non-postcopy events like bitmap syncing?
> > > Precopy already has its own notifiers: git 99a0db9b
> > > If we want to reuse, that one would be more suitable. I think mixing
> > > non-related events into one notifier list isn't nice.
> > I think that's only for migration state changes?
> > 
> > > > > Btw, the migration_state_notifiers is already there, but seems not really
> > > > > used (I only tracked spice-core.c called
> > > > > add_migration_state_change_notifier). I thought adding new migration states
> > > > > can reuse all that we have.
> > > > > What's your real concern about that? (not sure how defining new events would
> > > > > make a difference)
> > > > Migration state is exposed via control path (QMP).  Adding new states
> > > > mean that the QMP clients will see more.  IMO that's not really
> > > > anything that a QMP client will need to know, instead we can keep it
> > > > internally.  That's a reason from compatibility pov.
> > > > 
> > > > Meanwhile, it's not really a state-thing at all for me.  It looks
> > > > really more like hook or event (start/stop of sync).
> > > Thanks for sharing your concerns in detail, which are quite helpful for the
> > > discussion. To reuse 99a0db9b, we can also add sub-states (or say events),
> > > instead of new migration states.
> > > For example, we can still define "enum RamSaveState" as above, which can be
> > > an indication for the notifier queued on the 99a0db9b notider_list to decide
> > > whether to call start or stop.
> > > Does this solve your concern?
> > Frankly speaking I don't fully understand how you would add that
> > sub-state.  If you are confident with the idea, maybe you can post
> > your new version with the change, then I can read the code.
> 
> Sure. Code is more straightforward for this one. Let's check it in the new
> version.
> 
> > > > > > This is not that obvious to me.  For now I think it's true, since when
> > > > > > we call stop() we'll take the mutex, meanwhile the mutex is actually
> > > > > > always held by the iothread (in the big loop in
> > > > > > virtio_balloon_poll_free_page_hints) until either:
> > > > > > 
> > > > > > - it sleeps in qemu_cond_wait() [1], or
> > > > > > - it leaves the big loop [2]
> > > > > > 
> > > > > > Since I don't see anyone who will set dev->block_iothread to true for
> > > > > > the balloon device, then [1] cannot happen;
> > > > > there is a case in virtio_balloon_set_status which sets dev->block_iothread
> > > > > to true.
> > > > > 
> > > > > Did you mean the free_page_lock mutex? it is released at the bottom of the
> > > > > while() loop in virtio_balloon_poll_free_page_hint. It's actually released
> > > > > for every hint. That is,
> > > > > 
> > > > > while(1){
> > > > >       take the lock;
> > > > >       process 1 hint from the vq;
> > > > >       release the lock;
> > > > > }
> > > > Ah, so now I understand why you need the lock to be inside the loop,
> > > > since the loop is busy polling actually.  Is it possible to do this in
> > > > an async way?
> > > We need to use polling here because of some back story in the guest side
> > > (due to some locks being held) that makes it a barrier to sending
> > > notifications for each hints.
> > Any link to the "back story" that I can learn about? :) If it's too
> > complicated a problem and you think I don't need to understand at all,
> > please feel free to do so.
> 
> I searched a little bit, and forgot where we discussed this one. But the
> conclusion is that we don't want kick happens when the mm lock is held.
> Also, polling is a good idea here to me.
> There are 32 versions of kernel patch discussions scattered, interesting to
> watch, but might take too much time. Also people usually have different
> thoughts (sometimes with partial understanding) when they watch something
> (we even have many different versions of implementations ourselves if you
> check the whole 32 versions). It's not easy to get here with many consensus.
> That's why I hope our discussion could be more focused on the migration
> part, which is the last part that has not be fully finalized.

It's ok.

I'd be focused on migration part if you have a very clear interface
declared. :) You know, it was not even clear to me before I read the
series on whether the free_page_stop() operation is synchronous. And
IMHO that's really important even if I focus on migration review.

I'd say I'll treat reviewers somehow differently from you.  But I
don't think that worth a debate.

> 
> 
> 
> > Then I would assume at least Michael has
> > fully acknowledged that idea, and I can just stop putting more time on
> > this part.
> 
> Yes, he's been on the loop since the beginning.
> 
> 
> > 
> > Besides, if you are going to use a busy loop, then I would be not
> > quite sure about whether you really want to share that iothread with
> > others, since AFAIU that's not how iothread is designed (which is
> > mostly event-based and should not welcome out-of-control blocking in
> > the handler of events).  Though I am not 100% confident about my
> > understaning on that, I only raise this question up.  Anyway you'll
> > just take over the thread for a while without sharing, and after the
> > burst IOs it's mostly never used (until the next bitmap sync).  Then
> > it seems a bit confusing to me on why you need to share that after
> > all.
> 
> Not necessarily _need_ to share it, I meant it can be shared using qemu
> command line.
> Live migration doesn't happen all the time, and that optimization doesn't
> run that long, if users want to have other BHs run in this iothread context,
> they can only create one iothread via the qemu cmd line.

IMO iothreads and aiocontexts are for event-driven model.  Busy loop
is not an event-driven model.  Here if we want a busy loop I'll create
a thread when start page hinting, then join the thread when done.

But I'll stop commenting on this.  Please prepare a more clear
interface for migration in your next post.  I'll read that.

> 
> 
> > 
> > > > I'm a bit curious on how much time will it use to do
> > > > one round of the free page hints (e.g., an idle guest with 8G mem, or
> > > > any configuration you tested)?  I suppose during that time the
> > > > iothread will be held steady with 100% cpu usage, am I right?
> > > Compared to the time spent by the legacy migration to send free pages, that
> > > small amount of CPU usage spent on filtering free pages could be neglected.
> > > Grinding a chopper will not hold up the work of cutting firewood :)
> > Sorry I didn't express myself clearly.
> > 
> > My question was that, have you measured how long time it will take
> > from starting of the free page hints (when balloon state is set to
> > FREE_PAGE_REPORT_S_REQUESTED), until it completes (when QEMU receives
> > the VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID, then set the status to
> > FREE_PAGE_REPORT_S_STOP)?
> > 
> 
> I vaguely remember it's several ms (for around 7.5G free pages) long time
> ago. What would be the concern behind that number you want to know?

Because roughly I know the time between two bitmap syncs.  Then I will
know how possible a free page hinting process won't stop until the
next bitmap sync happens.

Thanks,
Peter Xu June 8, 2018, 2:34 a.m. UTC | #26
On Fri, Jun 08, 2018 at 04:58:21AM +0300, Michael S. Tsirkin wrote:
> On Fri, Jun 08, 2018 at 09:37:23AM +0800, Peter Xu wrote:
> > > > Asked since I see this in virtio spec (v1.0, though I guess it won't
> > > > change) in chapter "2.2.1 Driver Requirements: Feature Bits":
> > > > 
> > > > "The driver MUST NOT accept a feature which the device did not offer"
> > > > 
> > > > Then I'm curious what would happen if:
> > > > 
> > > > - a emulator (not QEMU) only offered F_FREE_PAGE_HINT, not F_POISON
> > > > - a guest that enabled PAGE_POISON
> > > > 
> > > > Then how the driver could tell the host that PAGE_POISON is enabled
> > > > considering that guest should never set that feature bit if the
> > > > emulation code didn't provide it?
> 
> It wouldn't. It just has to deal with the fact that host can discard
> writes to hinted pages. Right now driver deals with it simply by
> disabling F_FREE_PAGE_HINT.

Ah I see.  Thanks Michael.

Then it seems to me that it's more important to implement the F_POISON
along with where it is declared since otherwise it'll be a real broken
(device declares F_POISON, guest assumes it can handle the POISON so
guest will enable FREE_PAGE_HINT, however the device can't really
handle that).

Or, if the guest driver is capable to drop F_FREE_PAGE_HINT when
F_POISON is not declared, we can safely split the two features into
two patches in QEMU too.

Regards,
Michael S. Tsirkin June 8, 2018, 2:49 a.m. UTC | #27
On Fri, Jun 08, 2018 at 10:34:25AM +0800, Peter Xu wrote:
> On Fri, Jun 08, 2018 at 04:58:21AM +0300, Michael S. Tsirkin wrote:
> > On Fri, Jun 08, 2018 at 09:37:23AM +0800, Peter Xu wrote:
> > > > > Asked since I see this in virtio spec (v1.0, though I guess it won't
> > > > > change) in chapter "2.2.1 Driver Requirements: Feature Bits":
> > > > > 
> > > > > "The driver MUST NOT accept a feature which the device did not offer"
> > > > > 
> > > > > Then I'm curious what would happen if:
> > > > > 
> > > > > - a emulator (not QEMU) only offered F_FREE_PAGE_HINT, not F_POISON
> > > > > - a guest that enabled PAGE_POISON
> > > > > 
> > > > > Then how the driver could tell the host that PAGE_POISON is enabled
> > > > > considering that guest should never set that feature bit if the
> > > > > emulation code didn't provide it?
> > 
> > It wouldn't. It just has to deal with the fact that host can discard
> > writes to hinted pages. Right now driver deals with it simply by
> > disabling F_FREE_PAGE_HINT.
> 
> Ah I see.  Thanks Michael.
> 
> Then it seems to me that it's more important to implement the F_POISON
> along with where it is declared since otherwise it'll be a real broken
> (device declares F_POISON, guest assumes it can handle the POISON so
> guest will enable FREE_PAGE_HINT, however the device can't really
> handle that).

It seems to handle it fine, it just ignores the hints.

> Or, if the guest driver is capable to drop F_FREE_PAGE_HINT when
> F_POISON is not declared, we can safely split the two features into
> two patches in QEMU too.
> 
> Regards,
> 
> -- 
> Peter Xu
Peter Xu June 8, 2018, 3:34 a.m. UTC | #28
On Fri, Jun 08, 2018 at 05:49:26AM +0300, Michael S. Tsirkin wrote:
> On Fri, Jun 08, 2018 at 10:34:25AM +0800, Peter Xu wrote:
> > On Fri, Jun 08, 2018 at 04:58:21AM +0300, Michael S. Tsirkin wrote:
> > > On Fri, Jun 08, 2018 at 09:37:23AM +0800, Peter Xu wrote:
> > > > > > Asked since I see this in virtio spec (v1.0, though I guess it won't
> > > > > > change) in chapter "2.2.1 Driver Requirements: Feature Bits":
> > > > > > 
> > > > > > "The driver MUST NOT accept a feature which the device did not offer"
> > > > > > 
> > > > > > Then I'm curious what would happen if:
> > > > > > 
> > > > > > - a emulator (not QEMU) only offered F_FREE_PAGE_HINT, not F_POISON
> > > > > > - a guest that enabled PAGE_POISON
> > > > > > 
> > > > > > Then how the driver could tell the host that PAGE_POISON is enabled
> > > > > > considering that guest should never set that feature bit if the
> > > > > > emulation code didn't provide it?
> > > 
> > > It wouldn't. It just has to deal with the fact that host can discard
> > > writes to hinted pages. Right now driver deals with it simply by
> > > disabling F_FREE_PAGE_HINT.
> > 
> > Ah I see.  Thanks Michael.
> > 
> > Then it seems to me that it's more important to implement the F_POISON
> > along with where it is declared since otherwise it'll be a real broken
> > (device declares F_POISON, guest assumes it can handle the POISON so
> > guest will enable FREE_PAGE_HINT, however the device can't really
> > handle that).
> 
> It seems to handle it fine, it just ignores the hints.

Ok I misunderstood.  Then that's fine.

The message in the commit message is a bit misleading:

"TODO: - handle the case when page poisoning is in use"

It seems to me that:

"Now we handle the page poisoning by dropping the page hints
 directly.  In the future we might do something better."

Regards,
Wang, Wei W June 8, 2018, 7:14 a.m. UTC | #29
On 06/08/2018 10:17 AM, Peter Xu wrote:
> On Thu, Jun 07, 2018 at 07:59:22PM +0800, Wei Wang wrote:
>> Not necessarily _need_ to share it, I meant it can be shared using qemu
>> command line.
>> Live migration doesn't happen all the time, and that optimization doesn't
>> run that long, if users want to have other BHs run in this iothread context,
>> they can only create one iothread via the qemu cmd line.
> IMO iothreads and aiocontexts are for event-driven model.

To me it's just a thread which polls for submitted callbacks to run. 
When migration reaches the place that needs to submit the optimization 
function, it calls start() to submit it. I'm not sure why there is a 
worry about what's inside the callback.

> Busy loop
> is not an event-driven model.  Here if we want a busy loop I'll create
> a thread when start page hinting, then join the thread when done.

  The old (v4) implementation worked that way as you mentioned above, 
and Michael suggested to use iothread in the previous discussion. I'm 
fine with both actually. For the virtio part, we've had many 
discussions, I would take the choice I had with Michael before, unless 
there is an obvious advantage (e.g. proved better performance).


>
> But I'll stop commenting on this.  Please prepare a more clear
> interface for migration in your next post.  I'll read that.
>

Sure, thanks. The new version is coming soon.




>>
>>>>> I'm a bit curious on how much time will it use to do
>>>>> one round of the free page hints (e.g., an idle guest with 8G mem, or
>>>>> any configuration you tested)?  I suppose during that time the
>>>>> iothread will be held steady with 100% cpu usage, am I right?
>>>> Compared to the time spent by the legacy migration to send free pages, that
>>>> small amount of CPU usage spent on filtering free pages could be neglected.
>>>> Grinding a chopper will not hold up the work of cutting firewood :)
>>> Sorry I didn't express myself clearly.
>>>
>>> My question was that, have you measured how long time it will take
>>> from starting of the free page hints (when balloon state is set to
>>> FREE_PAGE_REPORT_S_REQUESTED), until it completes (when QEMU receives
>>> the VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID, then set the status to
>>> FREE_PAGE_REPORT_S_STOP)?
>>>
>> I vaguely remember it's several ms (for around 7.5G free pages) long time
>> ago. What would be the concern behind that number you want to know?
> Because roughly I know the time between two bitmap syncs.  Then I will
> know how possible a free page hinting process won't stop until the
> next bitmap sync happens.

We have a function, stop(), to stop the optimization before the next 
bitmap sync if the optimization is still running. But I never saw that 
case happens (the free page hinting finishes itself before that).

Best,
Wei
Wang, Wei W June 8, 2018, 7:31 a.m. UTC | #30
On 06/07/2018 02:32 PM, Peter Xu wrote:
>>>> Btw, the migration_state_notifiers is already there, but seems not really
>>>> used (I only tracked spice-core.c called
>>>> add_migration_state_change_notifier). I thought adding new migration states
>>>> can reuse all that we have.
>>>> What's your real concern about that? (not sure how defining new events would
>>>> make a difference)
>>> Migration state is exposed via control path (QMP).  Adding new states
>>> mean that the QMP clients will see more.  IMO that's not really
>>> anything that a QMP client will need to know, instead we can keep it
>>> internally.  That's a reason from compatibility pov.
>>>
>>> Meanwhile, it's not really a state-thing at all for me.  It looks
>>> really more like hook or event (start/stop of sync).
>> Thanks for sharing your concerns in detail, which are quite helpful for the
>> discussion. To reuse 99a0db9b, we can also add sub-states (or say events),
>> instead of new migration states.
>> For example, we can still define "enum RamSaveState" as above, which can be
>> an indication for the notifier queued on the 99a0db9b notider_list to decide
>> whether to call start or stop.
>> Does this solve your concern?
> Frankly speaking I don't fully understand how you would add that
> sub-state.  If you are confident with the idea, maybe you can post
> your new version with the change, then I can read the code.

Reusing 99a0db9b functions well, but I find it is more clear to let ram 
save state have it's own notifier list..will show how that works in v8.

Best,
Wei
diff mbox series

Patch

diff --git a/balloon.c b/balloon.c
index 6bf0a96..87a0410 100644
--- a/balloon.c
+++ b/balloon.c
@@ -36,6 +36,9 @@ 
 
 static QEMUBalloonEvent *balloon_event_fn;
 static QEMUBalloonStatus *balloon_stat_fn;
+static QEMUBalloonFreePageSupport *balloon_free_page_support_fn;
+static QEMUBalloonFreePageStart *balloon_free_page_start_fn;
+static QEMUBalloonFreePageStop *balloon_free_page_stop_fn;
 static void *balloon_opaque;
 static bool balloon_inhibited;
 
@@ -64,19 +67,51 @@  static bool have_balloon(Error **errp)
     return true;
 }
 
-int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
-                             QEMUBalloonStatus *stat_func, void *opaque)
+bool balloon_free_page_support(void)
 {
-    if (balloon_event_fn || balloon_stat_fn || balloon_opaque) {
-        /* We're already registered one balloon handler.  How many can
-         * a guest really have?
-         */
-        return -1;
+    return balloon_free_page_support_fn &&
+           balloon_free_page_support_fn(balloon_opaque);
+}
+
+/*
+ * Balloon will report pages which were free at the time of this call. As the
+ * reporting happens asynchronously, dirty bit logging must be enabled before
+ * this call is made.
+ */
+void balloon_free_page_start(void)
+{
+    balloon_free_page_start_fn(balloon_opaque);
+}
+
+/*
+ * Guest reporting must be disabled before the migration dirty bitmap is
+ * synchronized.
+ */
+void balloon_free_page_stop(void)
+{
+    balloon_free_page_stop_fn(balloon_opaque);
+}
+
+void qemu_add_balloon_handler(QEMUBalloonEvent *event_fn,
+                              QEMUBalloonStatus *stat_fn,
+                              QEMUBalloonFreePageSupport *free_page_support_fn,
+                              QEMUBalloonFreePageStart *free_page_start_fn,
+                              QEMUBalloonFreePageStop *free_page_stop_fn,
+                              void *opaque)
+{
+    if (balloon_event_fn || balloon_stat_fn || balloon_free_page_support_fn ||
+        balloon_free_page_start_fn || balloon_free_page_stop_fn ||
+        balloon_opaque) {
+        /* We already registered one balloon handler. */
+        return;
     }
-    balloon_event_fn = event_func;
-    balloon_stat_fn = stat_func;
+
+    balloon_event_fn = event_fn;
+    balloon_stat_fn = stat_fn;
+    balloon_free_page_support_fn = free_page_support_fn;
+    balloon_free_page_start_fn = free_page_start_fn;
+    balloon_free_page_stop_fn = free_page_stop_fn;
     balloon_opaque = opaque;
-    return 0;
 }
 
 void qemu_remove_balloon_handler(void *opaque)
@@ -86,6 +121,9 @@  void qemu_remove_balloon_handler(void *opaque)
     }
     balloon_event_fn = NULL;
     balloon_stat_fn = NULL;
+    balloon_free_page_support_fn = NULL;
+    balloon_free_page_start_fn = NULL;
+    balloon_free_page_stop_fn = NULL;
     balloon_opaque = NULL;
 }
 
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index f456cea..13bf0db 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -31,6 +31,7 @@ 
 
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/virtio-access.h"
+#include "migration/misc.h"
 
 #define BALLOON_PAGE_SIZE  (1 << VIRTIO_BALLOON_PFN_SHIFT)
 
@@ -308,6 +309,125 @@  out:
     }
 }
 
+static void virtio_balloon_poll_free_page_hints(void *opaque)
+{
+    VirtQueueElement *elem;
+    VirtIOBalloon *dev = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+    VirtQueue *vq = dev->free_page_vq;
+    uint32_t id;
+    size_t size;
+
+    while (1) {
+        qemu_mutex_lock(&dev->free_page_lock);
+        while (dev->block_iothread) {
+            qemu_cond_wait(&dev->free_page_cond, &dev->free_page_lock);
+        }
+
+        /*
+         * If the migration thread actively stops the reporting, exit
+         * immediately.
+         */
+        if (dev->free_page_report_status == FREE_PAGE_REPORT_S_STOP) {
+            qemu_mutex_unlock(&dev->free_page_lock);
+            break;
+        }
+
+        elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+        if (!elem) {
+            qemu_mutex_unlock(&dev->free_page_lock);
+            continue;
+        }
+
+        if (elem->out_num) {
+            size = iov_to_buf(elem->out_sg, elem->out_num, 0, &id, sizeof(id));
+            virtqueue_push(vq, elem, size);
+            g_free(elem);
+
+            virtio_tswap32s(vdev, &id);
+            if (unlikely(size != sizeof(id))) {
+                virtio_error(vdev, "received an incorrect cmd id");
+                break;
+            }
+            if (id == dev->free_page_report_cmd_id) {
+                dev->free_page_report_status = FREE_PAGE_REPORT_S_START;
+            } else {
+                /*
+                 * Stop the optimization only when it has started. This
+                 * avoids a stale stop sign for the previous command.
+                 */
+                if (dev->free_page_report_status == FREE_PAGE_REPORT_S_START) {
+                    dev->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
+                    qemu_mutex_unlock(&dev->free_page_lock);
+                    break;
+                }
+            }
+        }
+
+        if (elem->in_num) {
+            /* TODO: send the poison value to the destination */
+            if (dev->free_page_report_status == FREE_PAGE_REPORT_S_START &&
+                !dev->poison_val) {
+                qemu_guest_free_page_hint(elem->in_sg[0].iov_base,
+                                          elem->in_sg[0].iov_len);
+            }
+            virtqueue_push(vq, elem, 0);
+            g_free(elem);
+        }
+        qemu_mutex_unlock(&dev->free_page_lock);
+    }
+    virtio_notify(vdev, vq);
+}
+
+static bool virtio_balloon_free_page_support(void *opaque)
+{
+    VirtIOBalloon *s = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+    return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT);
+}
+
+static void virtio_balloon_free_page_start(void *opaque)
+{
+    VirtIOBalloon *s = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+    /* For the stop and copy phase, we don't need to start the optimization */
+    if (!vdev->vm_running) {
+        return;
+    }
+
+    if (s->free_page_report_cmd_id == UINT_MAX) {
+        s->free_page_report_cmd_id =
+                       VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN;
+    } else {
+        s->free_page_report_cmd_id++;
+    }
+
+    s->free_page_report_status = FREE_PAGE_REPORT_S_REQUESTED;
+    virtio_notify_config(vdev);
+    qemu_bh_schedule(s->free_page_bh);
+}
+
+static void virtio_balloon_free_page_stop(void *opaque)
+{
+    VirtIOBalloon *s = opaque;
+    VirtIODevice *vdev = VIRTIO_DEVICE(s);
+
+    if (s->free_page_report_status == FREE_PAGE_REPORT_S_STOP) {
+        return;
+    } else {
+        qemu_mutex_lock(&s->free_page_lock);
+        /*
+         * The guest hasn't done the reporting, so host sends a notification
+         * to the guest to actively stop the reporting.
+         */
+        s->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
+        qemu_mutex_unlock(&s->free_page_lock);
+        virtio_notify_config(vdev);
+    }
+}
+
 static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
 {
     VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
@@ -315,6 +435,17 @@  static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
 
     config.num_pages = cpu_to_le32(dev->num_pages);
     config.actual = cpu_to_le32(dev->actual);
+    if (virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
+        config.poison_val = cpu_to_le32(dev->poison_val);
+    }
+
+    if (dev->free_page_report_status == FREE_PAGE_REPORT_S_STOP) {
+        config.free_page_report_cmd_id =
+                       cpu_to_le32(VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID);
+    } else {
+        config.free_page_report_cmd_id =
+                       cpu_to_le32(dev->free_page_report_cmd_id);
+    }
 
     trace_virtio_balloon_get_config(config.num_pages, config.actual);
     memcpy(config_data, &config, sizeof(struct virtio_balloon_config));
@@ -368,6 +499,7 @@  static void virtio_balloon_set_config(VirtIODevice *vdev,
                         ((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT),
                         &error_abort);
     }
+    dev->poison_val = le32_to_cpu(config.poison_val);
     trace_virtio_balloon_set_config(dev->actual, oldactual);
 }
 
@@ -377,6 +509,11 @@  static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
     VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
     f |= dev->host_features;
     virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ);
+
+    if (dev->host_features & 1ULL << VIRTIO_BALLOON_F_FREE_PAGE_HINT) {
+        virtio_add_feature(&f, VIRTIO_BALLOON_F_PAGE_POISON);
+    }
+
     return f;
 }
 
@@ -413,6 +550,18 @@  static int virtio_balloon_post_load_device(void *opaque, int version_id)
     return 0;
 }
 
+static const VMStateDescription vmstate_virtio_balloon_free_page_report = {
+    .name = "virtio-balloon-device/free-page-report",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = virtio_balloon_free_page_support,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(free_page_report_cmd_id, VirtIOBalloon),
+        VMSTATE_UINT32(poison_val, VirtIOBalloon),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription vmstate_virtio_balloon_device = {
     .name = "virtio-balloon-device",
     .version_id = 1,
@@ -423,30 +572,42 @@  static const VMStateDescription vmstate_virtio_balloon_device = {
         VMSTATE_UINT32(actual, VirtIOBalloon),
         VMSTATE_END_OF_LIST()
     },
+    .subsections = (const VMStateDescription * []) {
+        &vmstate_virtio_balloon_free_page_report,
+        NULL
+    }
 };
 
 static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtIOBalloon *s = VIRTIO_BALLOON(dev);
-    int ret;
 
     virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON,
                 sizeof(struct virtio_balloon_config));
 
-    ret = qemu_add_balloon_handler(virtio_balloon_to_target,
-                                   virtio_balloon_stat, s);
-
-    if (ret < 0) {
-        error_setg(errp, "Only one balloon device is supported");
-        virtio_cleanup(vdev);
-        return;
-    }
-
     s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
     s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
     s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
-
+    if (virtio_has_feature(s->host_features,
+                           VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+        s->free_page_vq = virtio_add_queue(vdev, VIRTQUEUE_MAX_SIZE, NULL);
+        s->free_page_report_status = FREE_PAGE_REPORT_S_STOP;
+        s->free_page_report_cmd_id =
+                           VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN - 1;
+        if (s->iothread) {
+            object_ref(OBJECT(s->iothread));
+            s->free_page_bh = aio_bh_new(iothread_get_aio_context(s->iothread),
+                                       virtio_balloon_poll_free_page_hints, s);
+            qemu_mutex_init(&s->free_page_lock);
+            qemu_cond_init(&s->free_page_cond);
+            s->block_iothread = false;
+        } else {
+            /* Simply disable this feature if the iothread wasn't created. */
+            s->host_features &= ~(1 << VIRTIO_BALLOON_F_FREE_PAGE_HINT);
+            virtio_error(vdev, "iothread is missing");
+        }
+    }
     reset_stats(s);
 }
 
@@ -455,6 +616,10 @@  static void virtio_balloon_device_unrealize(DeviceState *dev, Error **errp)
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VirtIOBalloon *s = VIRTIO_BALLOON(dev);
 
+    if (virtio_balloon_free_page_support(s)) {
+        qemu_bh_delete(s->free_page_bh);
+        virtio_balloon_free_page_stop(s);
+    }
     balloon_stats_destroy_timer(s);
     qemu_remove_balloon_handler(s);
     virtio_cleanup(vdev);
@@ -464,6 +629,10 @@  static void virtio_balloon_device_reset(VirtIODevice *vdev)
 {
     VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
 
+    if (virtio_balloon_free_page_support(s)) {
+        virtio_balloon_free_page_stop(s);
+    }
+
     if (s->stats_vq_elem != NULL) {
         virtqueue_unpop(s->svq, s->stats_vq_elem, 0);
         g_free(s->stats_vq_elem);
@@ -475,11 +644,47 @@  static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status)
 {
     VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
 
-    if (!s->stats_vq_elem && vdev->vm_running &&
-        (status & VIRTIO_CONFIG_S_DRIVER_OK) && virtqueue_rewind(s->svq, 1)) {
-        /* poll stats queue for the element we have discarded when the VM
-         * was stopped */
-        virtio_balloon_receive_stats(vdev, s->svq);
+    if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
+        if (!s->stats_vq_elem && vdev->vm_running &&
+            virtqueue_rewind(s->svq, 1)) {
+            /*
+             * Poll stats queue for the element we have discarded when the VM
+             * was stopped.
+             */
+            virtio_balloon_receive_stats(vdev, s->svq);
+        }
+
+        if (virtio_balloon_free_page_support(s)) {
+            qemu_add_balloon_handler(virtio_balloon_to_target,
+                                     virtio_balloon_stat,
+                                     virtio_balloon_free_page_support,
+                                     virtio_balloon_free_page_start,
+                                     virtio_balloon_free_page_stop,
+                                     s);
+        } else {
+            qemu_add_balloon_handler(virtio_balloon_to_target,
+                                     virtio_balloon_stat, NULL, NULL, NULL, s);
+        }
+    }
+
+    if (virtio_balloon_free_page_support(s)) {
+        /*
+         * The VM is woken up and the iothread was blocked, so signal it to
+         * continue.
+         */
+        if (vdev->vm_running && s->block_iothread) {
+            qemu_mutex_lock(&s->free_page_lock);
+            s->block_iothread = false;
+            qemu_cond_signal(&s->free_page_cond);
+            qemu_mutex_unlock(&s->free_page_lock);
+        }
+
+        /* The VM is stopped, block the iothread. */
+        if (!vdev->vm_running) {
+            qemu_mutex_lock(&s->free_page_lock);
+            s->block_iothread = true;
+            qemu_mutex_unlock(&s->free_page_lock);
+        }
     }
 }
 
@@ -509,6 +714,10 @@  static const VMStateDescription vmstate_virtio_balloon = {
 static Property virtio_balloon_properties[] = {
     DEFINE_PROP_BIT("deflate-on-oom", VirtIOBalloon, host_features,
                     VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false),
+    DEFINE_PROP_BIT("free-page-hint", VirtIOBalloon, host_features,
+                    VIRTIO_BALLOON_F_FREE_PAGE_HINT, false),
+    DEFINE_PROP_LINK("iothread", VirtIOBalloon, iothread, TYPE_IOTHREAD,
+                     IOThread *),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h
index 1ea13bd..f865832 100644
--- a/include/hw/virtio/virtio-balloon.h
+++ b/include/hw/virtio/virtio-balloon.h
@@ -18,11 +18,14 @@ 
 #include "standard-headers/linux/virtio_balloon.h"
 #include "hw/virtio/virtio.h"
 #include "hw/pci/pci.h"
+#include "sysemu/iothread.h"
 
 #define TYPE_VIRTIO_BALLOON "virtio-balloon-device"
 #define VIRTIO_BALLOON(obj) \
         OBJECT_CHECK(VirtIOBalloon, (obj), TYPE_VIRTIO_BALLOON)
 
+#define VIRTIO_BALLOON_FREE_PAGE_REPORT_CMD_ID_MIN 0x80000000
+
 typedef struct virtio_balloon_stat VirtIOBalloonStat;
 
 typedef struct virtio_balloon_stat_modern {
@@ -31,15 +34,37 @@  typedef struct virtio_balloon_stat_modern {
        uint64_t val;
 } VirtIOBalloonStatModern;
 
+enum virtio_balloon_free_page_report_status {
+    FREE_PAGE_REPORT_S_STOP = 0,
+    FREE_PAGE_REPORT_S_REQUESTED = 1,
+    FREE_PAGE_REPORT_S_START = 2,
+};
+
 typedef struct VirtIOBalloon {
     VirtIODevice parent_obj;
-    VirtQueue *ivq, *dvq, *svq;
+    VirtQueue *ivq, *dvq, *svq, *free_page_vq;
+    uint32_t free_page_report_status;
     uint32_t num_pages;
     uint32_t actual;
+    uint32_t free_page_report_cmd_id;
+    uint32_t poison_val;
     uint64_t stats[VIRTIO_BALLOON_S_NR];
     VirtQueueElement *stats_vq_elem;
     size_t stats_vq_offset;
     QEMUTimer *stats_timer;
+    IOThread *iothread;
+    QEMUBH *free_page_bh;
+    /*
+     * Lock to synchronize threads to access the free page reporting related
+     * fields (e.g. free_page_report_status).
+     */
+    QemuMutex free_page_lock;
+    QemuCond  free_page_cond;
+    /*
+     * Set to block iothread to continue reading free page hints as the VM is
+     * stopped.
+     */
+    bool block_iothread;
     int64_t stats_last_update;
     int64_t stats_poll_interval;
     uint32_t host_features;
diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h
index 7b0a41b..f89e80f 100644
--- a/include/standard-headers/linux/virtio_balloon.h
+++ b/include/standard-headers/linux/virtio_balloon.h
@@ -34,15 +34,22 @@ 
 #define VIRTIO_BALLOON_F_MUST_TELL_HOST	0 /* Tell before reclaiming pages */
 #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
+#define VIRTIO_BALLOON_F_FREE_PAGE_HINT 3 /* VQ to report free pages */
+#define VIRTIO_BALLOON_F_PAGE_POISON	4 /* Guest is using page poisoning */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
 
+#define VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID 0
 struct virtio_balloon_config {
 	/* Number of pages host wants Guest to give up. */
 	uint32_t num_pages;
 	/* Number of pages we've actually got in balloon. */
 	uint32_t actual;
+	/* Free page report command id, readonly by guest */
+	uint32_t free_page_report_cmd_id;
+	/* Stores PAGE_POISON if page poisoning is in use */
+	uint32_t poison_val;
 };
 
 #define VIRTIO_BALLOON_S_SWAP_IN  0   /* Amount of memory swapped in */
diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h
index 66543ae..6561a08 100644
--- a/include/sysemu/balloon.h
+++ b/include/sysemu/balloon.h
@@ -18,11 +18,22 @@ 
 
 typedef void (QEMUBalloonEvent)(void *opaque, ram_addr_t target);
 typedef void (QEMUBalloonStatus)(void *opaque, BalloonInfo *info);
+typedef bool (QEMUBalloonFreePageSupport)(void *opaque);
+typedef void (QEMUBalloonFreePageStart)(void *opaque);
+typedef void (QEMUBalloonFreePageStop)(void *opaque);
 
-int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
-			     QEMUBalloonStatus *stat_func, void *opaque);
 void qemu_remove_balloon_handler(void *opaque);
 bool qemu_balloon_is_inhibited(void);
 void qemu_balloon_inhibit(bool state);
+bool balloon_free_page_support(void);
+void balloon_free_page_start(void);
+void balloon_free_page_stop(void);
+
+void qemu_add_balloon_handler(QEMUBalloonEvent *event_fn,
+                              QEMUBalloonStatus *stat_fn,
+                              QEMUBalloonFreePageSupport *free_page_support_fn,
+                              QEMUBalloonFreePageStart *free_page_start_fn,
+                              QEMUBalloonFreePageStop *free_page_stop_fn,
+                              void *opaque);
 
 #endif