diff mbox series

[v17,6/6] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_VQ

Message ID 1509696786-1597-7-git-send-email-wei.w.wang@intel.com
State New
Headers show
Series Virtio-balloon Enhancement | expand

Commit Message

Wang, Wei W Nov. 3, 2017, 8:13 a.m. UTC
Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
support of reporting hints of guest free pages to the host via
virtio-balloon. The host requests the guest to report the free pages by
sending commands via the virtio-balloon configuration registers.

When the guest starts to report, the first element added to the free page
vq is a sequence id of the start reporting command. The id is given by
the host, and it indicates whether the following free pages correspond
to the command. For example, the host may stop the report and start again
with a new command id. The obsolete pages for the previous start command
can be detected on the host by the id mismatch. The id is added to the
vq using an output buffer, and the free pages are added to the vq using
input buffers.

Here are some explanations of the added configuration registers:
- host2guest_cmd: a register used by the host to send commands to the
guest.
- guest2host_cmd: written by the guest to ACK to the host about the
commands that have been received. The host will clear the corresponding
bits on the host2guest_cmd register. The guest also uses this register
to send commands to the host (e.g. when it finishes free page reporting).
- free_page_cmd_id: the sequence id of the free page report command
given by the host.

Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Liang Li <liang.z.li@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
---
 drivers/virtio/virtio_balloon.c     | 234 ++++++++++++++++++++++++++++++++----
 include/uapi/linux/virtio_balloon.h |  11 ++
 2 files changed, 223 insertions(+), 22 deletions(-)

Comments

Wang, Wei W Nov. 13, 2017, 10:34 a.m. UTC | #1
Ping for comments, thanks.

On 11/03/2017 04:13 PM, Wei Wang wrote:
> Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
> support of reporting hints of guest free pages to the host via
> virtio-balloon. The host requests the guest to report the free pages by
> sending commands via the virtio-balloon configuration registers.
>
> When the guest starts to report, the first element added to the free page
> vq is a sequence id of the start reporting command. The id is given by
> the host, and it indicates whether the following free pages correspond
> to the command. For example, the host may stop the report and start again
> with a new command id. The obsolete pages for the previous start command
> can be detected by the id dismatching on the host. The id is added to the
> vq using an output buffer, and the free pages are added to the vq using
> input buffer.
>
> Here are some explainations about the added configuration registers:
> - host2guest_cmd: a register used by the host to send commands to the
> guest.
> - guest2host_cmd: written by the guest to ACK to the host about the
> commands that have been received. The host will clear the corresponding
> bits on the host2guest_cmd register. The guest also uses this register
> to send commands to the host (e.g. when finish free page reporting).
> - free_page_cmd_id: the sequence id of the free page report command
> given by the host.
>
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> Signed-off-by: Liang Li <liang.z.li@intel.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Michal Hocko <mhocko@kernel.org>
> ---
>   drivers/virtio/virtio_balloon.c     | 234 ++++++++++++++++++++++++++++++++----
>   include/uapi/linux/virtio_balloon.h |  11 ++
>   2 files changed, 223 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index b31fc25..4087f04 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -55,7 +55,12 @@ static struct vfsmount *balloon_mnt;
>   
>   struct virtio_balloon {
>   	struct virtio_device *vdev;
> -	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
> +	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> +
> +	/* Balloon's own wq for cpu-intensive work items */
> +	struct workqueue_struct *balloon_wq;
> +	/* The free page reporting work item submitted to the balloon wq */
> +	struct work_struct report_free_page_work;
>   
>   	/* The balloon servicing is delegated to a freezable workqueue. */
>   	struct work_struct update_balloon_stats_work;
> @@ -65,6 +70,10 @@ struct virtio_balloon {
>   	spinlock_t stop_update_lock;
>   	bool stop_update;
>   
> +	/* Stop reporting free pages */
> +	bool report_free_page_stop;
> +	uint32_t free_page_cmd_id;
> +
>   	/* Waiting for host to ack the pages we released. */
>   	wait_queue_head_t acked;
>   
> @@ -191,6 +200,30 @@ static void send_balloon_page_sg(struct virtio_balloon *vb,
>   		kick_and_wait(vq, vb->acked);
>   }
>   
> +static void send_free_page_sg(struct virtqueue *vq, void *addr, uint32_t size)
> +{
> +	int err = 0;
> +	unsigned int len;
> +
> +	/* Detach all the used buffers from the vq */
> +	while (virtqueue_get_buf(vq, &len))
> +		;
> +
> +	/*
> +	 * Since this is an optimization feature, losing a couple of free
> +	 * pages to report isn't important. We simply resturn without adding
> +	 * the page if the vq is full.
> +	 */
> +	if (vq->num_free) {
> +		err = add_one_sg(vq, addr, size);
> +		BUG_ON(err);
> +	}
> +
> +	/* Batch till the vq is full */
> +	if (!vq->num_free)
> +		virtqueue_kick(vq);
> +}
> +
>   /*
>    * Send balloon pages in sgs to host. The balloon pages are recorded in the
>    * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
> @@ -495,9 +528,8 @@ static void stats_handle_request(struct virtio_balloon *vb)
>   	virtqueue_kick(vq);
>   }
>   
> -static void virtballoon_changed(struct virtio_device *vdev)
> +static void virtballoon_cmd_balloon_memory(struct virtio_balloon *vb)
>   {
> -	struct virtio_balloon *vb = vdev->priv;
>   	unsigned long flags;
>   
>   	spin_lock_irqsave(&vb->stop_update_lock, flags);
> @@ -506,6 +538,50 @@ static void virtballoon_changed(struct virtio_device *vdev)
>   	spin_unlock_irqrestore(&vb->stop_update_lock, flags);
>   }
>   
> +static void virtballoon_cmd_report_free_page_start(struct virtio_balloon *vb)
> +{
> +	unsigned long flags;
> +
> +	vb->report_free_page_stop = false;
> +	spin_lock_irqsave(&vb->stop_update_lock, flags);
> +	if (!vb->stop_update)
> +		queue_work(vb->balloon_wq, &vb->report_free_page_work);
> +	spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> +}
> +
> +static void virtballoon_changed(struct virtio_device *vdev)
> +{
> +	struct virtio_balloon *vb = vdev->priv;
> +	u32 host2guest_cmd, guest2host_cmd = 0;
> +
> +	if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
> +		virtballoon_cmd_balloon_memory(vb);
> +		return;
> +	}
> +
> +	virtio_cread(vb->vdev, struct virtio_balloon_config, host2guest_cmd,
> +		     &host2guest_cmd);
> +
> +	if (host2guest_cmd & VIRTIO_BALLOON_CMD_BALLOON_MEMORY) {
> +		virtballoon_cmd_balloon_memory(vb);
> +		guest2host_cmd |= VIRTIO_BALLOON_CMD_BALLOON_MEMORY;
> +	}
> +
> +	if (host2guest_cmd & VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_START) {
> +		virtballoon_cmd_report_free_page_start(vb);
> +		guest2host_cmd |= VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_START;
> +	}
> +
> +	if (host2guest_cmd & VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP) {
> +		vb->report_free_page_stop = true;
> +		guest2host_cmd |= VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP;
> +	}
> +
> +	/* Ack to the host about the commands that have been received */
> +	virtio_cwrite(vb->vdev, struct virtio_balloon_config, guest2host_cmd,
> +		      &guest2host_cmd);
> +}
> +
>   static inline s64 towards_target(struct virtio_balloon *vb)
>   {
>   	s64 target;
> @@ -597,42 +673,147 @@ static void update_balloon_size_func(struct work_struct *work)
>   		queue_work(system_freezable_wq, work);
>   }
>   
> -static int init_vqs(struct virtio_balloon *vb)
> +static bool virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
> +					   unsigned long nr_pages)
>   {
> -	struct virtqueue *vqs[3];
> -	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
> -	static const char * const names[] = { "inflate", "deflate", "stats" };
> -	int err, nvqs;
> +	struct virtio_balloon *vb = (struct virtio_balloon *)opaque;
> +	void *addr = (void *)pfn_to_kaddr(pfn);
> +	uint32_t len = nr_pages << PAGE_SHIFT;
> +
> +	if (vb->report_free_page_stop)
> +		return false;
> +
> +	send_free_page_sg(vb->free_page_vq, addr, len);
>   
> +	return true;
> +}
> +
> +static void report_free_page_end(struct virtio_balloon *vb)
> +{
> +	u32 cmd = VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP;
>   	/*
> -	 * We expect two virtqueues: inflate and deflate, and
> -	 * optionally stat.
> +	 * The host may have already requested to stop the reporting before we
> +	 * finish, so no need to notify the host in this case.
>   	 */
> -	nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
> -	err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
> +	if (vb->report_free_page_stop)
> +		return;
> +	vb->report_free_page_stop = true;
> +
> +	virtio_cwrite(vb->vdev, struct virtio_balloon_config, guest2host_cmd,
> +		      &cmd);
> +}
> +
> +static void report_free_page_cmd_id(struct virtio_balloon *vb)
> +{
> +	struct scatterlist sg;
> +	int err;
> +
> +	virtio_cread(vb->vdev, struct virtio_balloon_config, free_page_cmd_id,
> +		     &vb->free_page_cmd_id);
> +	sg_init_one(&sg, &vb->free_page_cmd_id, sizeof(uint32_t));
> +	err = virtqueue_add_outbuf(vb->free_page_vq, &sg, 1,
> +				   &vb->free_page_cmd_id, GFP_KERNEL);
> +	BUG_ON(err);
> +}
> +
> +static void report_free_page(struct work_struct *work)
> +{
> +	struct virtio_balloon *vb;
> +
> +	vb = container_of(work, struct virtio_balloon, report_free_page_work);
> +	report_free_page_cmd_id(vb);
> +	walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
> +	/*
> +	 * The last few free page blocks that were added may not reach the
> +	 * batch size, but need a kick to notify the device to handle them.
> +	 */
> +	virtqueue_kick(vb->free_page_vq);
> +	report_free_page_end(vb);
> +}
> +
> +static int init_vqs(struct virtio_balloon *vb)
> +{
> +	struct virtqueue **vqs;
> +	vq_callback_t **callbacks;
> +	const char **names;
> +	struct scatterlist sg;
> +	int i, nvqs, err = -ENOMEM;
> +
> +	/* Inflateq and deflateq are used unconditionally */
> +	nvqs = 2;
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ))
> +		nvqs++;
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ))
> +		nvqs++;
> +
> +	/* Allocate space for find_vqs parameters */
> +	vqs = kcalloc(nvqs, sizeof(*vqs), GFP_KERNEL);
> +	if (!vqs)
> +		goto err_vq;
> +	callbacks = kmalloc_array(nvqs, sizeof(*callbacks), GFP_KERNEL);
> +	if (!callbacks)
> +		goto err_callback;
> +	names = kmalloc_array(nvqs, sizeof(*names), GFP_KERNEL);
> +	if (!names)
> +		goto err_names;
> +
> +	callbacks[0] = balloon_ack;
> +	names[0] = "inflate";
> +	callbacks[1] = balloon_ack;
> +	names[1] = "deflate";
> +
> +	i = 2;
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> +		callbacks[i] = stats_request;
> +		names[i] = "stats";
> +		i++;
> +	}
> +
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
> +		callbacks[i] = NULL;
> +		names[i] = "free_page_vq";
> +	}
> +
> +	err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names,
> +					 NULL, NULL);
>   	if (err)
> -		return err;
> +		goto err_find;
>   
>   	vb->inflate_vq = vqs[0];
>   	vb->deflate_vq = vqs[1];
> +	i = 2;
>   	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> -		struct scatterlist sg;
> -		unsigned int num_stats;
> -		vb->stats_vq = vqs[2];
> -
> +		vb->stats_vq = vqs[i++];
>   		/*
>   		 * Prime this virtqueue with one buffer so the hypervisor can
>   		 * use it to signal us later (it can't be broken yet!).
>   		 */
> -		num_stats = update_balloon_stats(vb);
> -
> -		sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
> +		sg_init_one(&sg, vb->stats, sizeof(vb->stats));
>   		if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
> -		    < 0)
> -			BUG();
> +		    < 0) {
> +			dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
> +				 __func__);
> +			goto err_find;
> +		}
>   		virtqueue_kick(vb->stats_vq);
>   	}
> +
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ))
> +		vb->free_page_vq = vqs[i];
> +
> +	kfree(names);
> +	kfree(callbacks);
> +	kfree(vqs);
>   	return 0;
> +
> +err_find:
> +	kfree(names);
> +err_names:
> +	kfree(callbacks);
> +err_callback:
> +	kfree(vqs);
> +err_vq:
> +	return err;
>   }
>   
>   #ifdef CONFIG_BALLOON_COMPACTION
> @@ -761,6 +942,13 @@ static int virtballoon_probe(struct virtio_device *vdev)
>   	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_SG))
>   		xb_init(&vb->page_xb);
>   
> +	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
> +		vb->balloon_wq = alloc_workqueue("balloon-wq",
> +					WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
> +		INIT_WORK(&vb->report_free_page_work, report_free_page);
> +		vb->report_free_page_stop = true;
> +	}
> +
>   	vb->nb.notifier_call = virtballoon_oom_notify;
>   	vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
>   	err = register_oom_notifier(&vb->nb);
> @@ -825,6 +1013,7 @@ static void virtballoon_remove(struct virtio_device *vdev)
>   	spin_unlock_irq(&vb->stop_update_lock);
>   	cancel_work_sync(&vb->update_balloon_size_work);
>   	cancel_work_sync(&vb->update_balloon_stats_work);
> +	cancel_work_sync(&vb->report_free_page_work);
>   
>   	remove_common(vb);
>   #ifdef CONFIG_BALLOON_COMPACTION
> @@ -878,6 +1067,7 @@ static unsigned int features[] = {
>   	VIRTIO_BALLOON_F_STATS_VQ,
>   	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
>   	VIRTIO_BALLOON_F_SG,
> +	VIRTIO_BALLOON_F_FREE_PAGE_VQ,
>   };
>   
>   static struct virtio_driver virtio_balloon_driver = {
> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index 37780a7..b758484 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -35,15 +35,26 @@
>   #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
>   #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
>   #define VIRTIO_BALLOON_F_SG		3 /* Use sg instead of PFN lists */
> +#define VIRTIO_BALLOON_F_FREE_PAGE_VQ	4 /* VQ to report free pages */
>   
>   /* Size of a PFN in the balloon interface. */
>   #define VIRTIO_BALLOON_PFN_SHIFT 12
>   
> +#define	VIRTIO_BALLOON_CMD_BALLOON_MEMORY		(1 << 0)
> +#define	VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_START	(1 << 1)
> +#define	VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP	(1 << 2)
> +
>   struct virtio_balloon_config {
>   	/* Number of pages host wants Guest to give up. */
>   	__u32 num_pages;
>   	/* Number of pages we've actually got in balloon. */
>   	__u32 actual;
> +	/* Host-to-guest command, readonly by guest */
> +	__u32 host2guest_cmd;
> +	/* Sequence id of the free_page report command, readonly by guest */
> +	__u32 free_page_cmd_id;
> +	/* Guest-to-host command */
> +	__u32 guest2host_cmd;
>   };
>   
>   #define VIRTIO_BALLOON_S_SWAP_IN  0   /* Amount of memory swapped in */
Michael S. Tsirkin Nov. 13, 2017, 5:32 p.m. UTC | #2
You should Cc Nitesh who is working on a related feature.

On Mon, Nov 13, 2017 at 06:34:48PM +0800, Wei Wang wrote:
> Ping for comments, thanks.
> 
> On 11/03/2017 04:13 PM, Wei Wang wrote:
> > Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
> > support of reporting hints of guest free pages to the host via
> > virtio-balloon. The host requests the guest to report the free pages by
> > sending commands via the virtio-balloon configuration registers.
> > 
> > When the guest starts to report, the first element added to the free page
> > vq is a sequence id of the start reporting command. The id is given by
> > the host, and it indicates whether the following free pages correspond
> > to the command. For example, the host may stop the report and start again
> > with a new command id. The obsolete pages for the previous start command
> > can be detected by the id dismatching on the host. The id is added to the
> > vq using an output buffer, and the free pages are added to the vq using
> > input buffer.
> > 
> > Here are some explainations about the added configuration registers:
> > - host2guest_cmd: a register used by the host to send commands to the
> > guest.
> > - guest2host_cmd: written by the guest to ACK to the host about the
> > commands that have been received. The host will clear the corresponding
> > bits on the host2guest_cmd register. The guest also uses this register
> > to send commands to the host (e.g. when finish free page reporting).

I am not sure what is the role of guest2host_cmd. Reporting of
the correct cmd id seems sufficient indication that guest
received the start command. Not getting any more seems sufficient
to detect stop.


> > - free_page_cmd_id: the sequence id of the free page report command
> > given by the host.
> > 
> > Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> > Signed-off-by: Liang Li <liang.z.li@intel.com>
> > Cc: Michael S. Tsirkin <mst@redhat.com>
> > Cc: Michal Hocko <mhocko@kernel.org>
> > ---
> >   drivers/virtio/virtio_balloon.c     | 234 ++++++++++++++++++++++++++++++++----
> >   include/uapi/linux/virtio_balloon.h |  11 ++
> >   2 files changed, 223 insertions(+), 22 deletions(-)
> > 
> > diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> > index b31fc25..4087f04 100644
> > --- a/drivers/virtio/virtio_balloon.c
> > +++ b/drivers/virtio/virtio_balloon.c
> > @@ -55,7 +55,12 @@ static struct vfsmount *balloon_mnt;
> >   struct virtio_balloon {
> >   	struct virtio_device *vdev;
> > -	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
> > +	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> > +
> > +	/* Balloon's own wq for cpu-intensive work items */
> > +	struct workqueue_struct *balloon_wq;
> > +	/* The free page reporting work item submitted to the balloon wq */
> > +	struct work_struct report_free_page_work;
> >   	/* The balloon servicing is delegated to a freezable workqueue. */
> >   	struct work_struct update_balloon_stats_work;
> > @@ -65,6 +70,10 @@ struct virtio_balloon {
> >   	spinlock_t stop_update_lock;
> >   	bool stop_update;
> > +	/* Stop reporting free pages */
> > +	bool report_free_page_stop;

I would invert the logic here: bool report_free_page;

> > +	uint32_t free_page_cmd_id;
> > +
> >   	/* Waiting for host to ack the pages we released. */
> >   	wait_queue_head_t acked;
> > @@ -191,6 +200,30 @@ static void send_balloon_page_sg(struct virtio_balloon *vb,
> >   		kick_and_wait(vq, vb->acked);
> >   }
> > +static void send_free_page_sg(struct virtqueue *vq, void *addr, uint32_t size)
> > +{
> > +	int err = 0;
> > +	unsigned int len;
> > +
> > +	/* Detach all the used buffers from the vq */
> > +	while (virtqueue_get_buf(vq, &len))
> > +		;
> > +
> > +	/*
> > +	 * Since this is an optimization feature, losing a couple of free
> > +	 * pages to report isn't important. We simply resturn without adding
> > +	 * the page if the vq is full.
> > +	 */
> > +	if (vq->num_free) {
> > +		err = add_one_sg(vq, addr, size);
> > +		BUG_ON(err);
> > +	}
> > +
> > +	/* Batch till the vq is full */
> > +	if (!vq->num_free)
> > +		virtqueue_kick(vq);
> > +}
> > +
> >   /*
> >    * Send balloon pages in sgs to host. The balloon pages are recorded in the
> >    * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
> > @@ -495,9 +528,8 @@ static void stats_handle_request(struct virtio_balloon *vb)
> >   	virtqueue_kick(vq);
> >   }
> > -static void virtballoon_changed(struct virtio_device *vdev)
> > +static void virtballoon_cmd_balloon_memory(struct virtio_balloon *vb)
> >   {
> > -	struct virtio_balloon *vb = vdev->priv;
> >   	unsigned long flags;
> >   	spin_lock_irqsave(&vb->stop_update_lock, flags);
> > @@ -506,6 +538,50 @@ static void virtballoon_changed(struct virtio_device *vdev)
> >   	spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> >   }
> > +static void virtballoon_cmd_report_free_page_start(struct virtio_balloon *vb)
> > +{
> > +	unsigned long flags;
> > +
> > +	vb->report_free_page_stop = false;

this flag is used a lot outside any locks. Why is this safe?
Please add some comments explaining access to this flag.

> > +	spin_lock_irqsave(&vb->stop_update_lock, flags);
> > +	if (!vb->stop_update)
> > +		queue_work(vb->balloon_wq, &vb->report_free_page_work);
> > +	spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> > +}
> > +
> > +static void virtballoon_changed(struct virtio_device *vdev)
> > +{
> > +	struct virtio_balloon *vb = vdev->priv;
> > +	u32 host2guest_cmd, guest2host_cmd = 0;
> > +
> > +	if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
> > +		virtballoon_cmd_balloon_memory(vb);
> > +		return;

This might be a handy feature: one can disable balloon without
hot-unplug. But I would use a separate feature flag to
control it.

> > +	}
> > +
> > +	virtio_cread(vb->vdev, struct virtio_balloon_config, host2guest_cmd,
> > +		     &host2guest_cmd);
> > +
> > +	if (host2guest_cmd & VIRTIO_BALLOON_CMD_BALLOON_MEMORY) {
> > +		virtballoon_cmd_balloon_memory(vb);
> > +		guest2host_cmd |= VIRTIO_BALLOON_CMD_BALLOON_MEMORY;
> > +	}
> > +
> > +	if (host2guest_cmd & VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_START) {
> > +		virtballoon_cmd_report_free_page_start(vb);
> > +		guest2host_cmd |= VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_START;
> > +	}
> > +
> > +	if (host2guest_cmd & VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP) {
> > +		vb->report_free_page_stop = true;
> > +		guest2host_cmd |= VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP;
> > +	}

I am not sure why free page has start+stop but e.g. balloon has a single
bit. In fact I would really just use command id. When it changes, we
know a new report is needed.

> > +
> > +	/* Ack to the host about the commands that have been received */
> > +	virtio_cwrite(vb->vdev, struct virtio_balloon_config, guest2host_cmd,
> > +		      &guest2host_cmd);

So the same register is used to ack stop command and to signal
end of report. This seems buggy.

> > +}
> > +
> >   static inline s64 towards_target(struct virtio_balloon *vb)
> >   {
> >   	s64 target;
> > @@ -597,42 +673,147 @@ static void update_balloon_size_func(struct work_struct *work)
> >   		queue_work(system_freezable_wq, work);
> >   }
> > -static int init_vqs(struct virtio_balloon *vb)
> > +static bool virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
> > +					   unsigned long nr_pages)
> >   {
> > -	struct virtqueue *vqs[3];
> > -	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
> > -	static const char * const names[] = { "inflate", "deflate", "stats" };
> > -	int err, nvqs;
> > +	struct virtio_balloon *vb = (struct virtio_balloon *)opaque;
> > +	void *addr = (void *)pfn_to_kaddr(pfn);

How do we know all free pages have a kaddr?

> > +	uint32_t len = nr_pages << PAGE_SHIFT;
> > +
> > +	if (vb->report_free_page_stop)
> > +		return false;
> > +
> > +	send_free_page_sg(vb->free_page_vq, addr, len);
> > +	return true;
> > +}
> > +
> > +static void report_free_page_end(struct virtio_balloon *vb)
> > +{
> > +	u32 cmd = VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP;
> >   	/*
> > -	 * We expect two virtqueues: inflate and deflate, and
> > -	 * optionally stat.
> > +	 * The host may have already requested to stop the reporting before we
> > +	 * finish, so no need to notify the host in this case.
> >   	 */
> > -	nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
> > -	err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
> > +	if (vb->report_free_page_stop)
> > +		return;
> > +	vb->report_free_page_stop = true;
> > +
> > +	virtio_cwrite(vb->vdev, struct virtio_balloon_config, guest2host_cmd,
> > +		      &cmd);

Wouldn't it be easier to add a buffer in the queue?

> > +}
> > +
> > +static void report_free_page_cmd_id(struct virtio_balloon *vb)
> > +{
> > +	struct scatterlist sg;
> > +	int err;
> > +
> > +	virtio_cread(vb->vdev, struct virtio_balloon_config, free_page_cmd_id,
> > +		     &vb->free_page_cmd_id);
> > +	sg_init_one(&sg, &vb->free_page_cmd_id, sizeof(uint32_t));
> > +	err = virtqueue_add_outbuf(vb->free_page_vq, &sg, 1,
> > +				   &vb->free_page_cmd_id, GFP_KERNEL);
> > +	BUG_ON(err);
> > +}
> > +
> > +static void report_free_page(struct work_struct *work)
> > +{
> > +	struct virtio_balloon *vb;
> > +
> > +	vb = container_of(work, struct virtio_balloon, report_free_page_work);
> > +	report_free_page_cmd_id(vb);
> > +	walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
> > +	/*
> > +	 * The last few free page blocks that were added may not reach the
> > +	 * batch size, but need a kick to notify the device to handle them.
> > +	 */
> > +	virtqueue_kick(vb->free_page_vq);
> > +	report_free_page_end(vb);
> > +}
> > +
> > +static int init_vqs(struct virtio_balloon *vb)
> > +{
> > +	struct virtqueue **vqs;
> > +	vq_callback_t **callbacks;
> > +	const char **names;
> > +	struct scatterlist sg;
> > +	int i, nvqs, err = -ENOMEM;
> > +
> > +	/* Inflateq and deflateq are used unconditionally */
> > +	nvqs = 2;
> > +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ))
> > +		nvqs++;
> > +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ))
> > +		nvqs++;
> > +
> > +	/* Allocate space for find_vqs parameters */
> > +	vqs = kcalloc(nvqs, sizeof(*vqs), GFP_KERNEL);
> > +	if (!vqs)
> > +		goto err_vq;
> > +	callbacks = kmalloc_array(nvqs, sizeof(*callbacks), GFP_KERNEL);
> > +	if (!callbacks)
> > +		goto err_callback;
> > +	names = kmalloc_array(nvqs, sizeof(*names), GFP_KERNEL);
> > +	if (!names)
> > +		goto err_names;
> > +
> > +	callbacks[0] = balloon_ack;
> > +	names[0] = "inflate";
> > +	callbacks[1] = balloon_ack;
> > +	names[1] = "deflate";
> > +
> > +	i = 2;
> > +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> > +		callbacks[i] = stats_request;
> > +		names[i] = "stats";
> > +		i++;
> > +	}
> > +
> > +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
> > +		callbacks[i] = NULL;
> > +		names[i] = "free_page_vq";
> > +	}
> > +
> > +	err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names,
> > +					 NULL, NULL);
> >   	if (err)
> > -		return err;
> > +		goto err_find;
> >   	vb->inflate_vq = vqs[0];
> >   	vb->deflate_vq = vqs[1];
> > +	i = 2;
> >   	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> > -		struct scatterlist sg;
> > -		unsigned int num_stats;
> > -		vb->stats_vq = vqs[2];
> > -
> > +		vb->stats_vq = vqs[i++];
> >   		/*
> >   		 * Prime this virtqueue with one buffer so the hypervisor can
> >   		 * use it to signal us later (it can't be broken yet!).
> >   		 */
> > -		num_stats = update_balloon_stats(vb);
> > -
> > -		sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
> > +		sg_init_one(&sg, vb->stats, sizeof(vb->stats));
> >   		if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
> > -		    < 0)
> > -			BUG();
> > +		    < 0) {
> > +			dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
> > +				 __func__);
> > +			goto err_find;
> > +		}
> >   		virtqueue_kick(vb->stats_vq);
> >   	}
> > +
> > +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ))
> > +		vb->free_page_vq = vqs[i];
> > +
> > +	kfree(names);
> > +	kfree(callbacks);
> > +	kfree(vqs);
> >   	return 0;
> > +
> > +err_find:
> > +	kfree(names);
> > +err_names:
> > +	kfree(callbacks);
> > +err_callback:
> > +	kfree(vqs);
> > +err_vq:
> > +	return err;
> >   }
> >   #ifdef CONFIG_BALLOON_COMPACTION
> > @@ -761,6 +942,13 @@ static int virtballoon_probe(struct virtio_device *vdev)
> >   	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_SG))
> >   		xb_init(&vb->page_xb);
> > +	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
> > +		vb->balloon_wq = alloc_workqueue("balloon-wq",
> > +					WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
> > +		INIT_WORK(&vb->report_free_page_work, report_free_page);
> > +		vb->report_free_page_stop = true;
> > +	}
> > +
> >   	vb->nb.notifier_call = virtballoon_oom_notify;
> >   	vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
> >   	err = register_oom_notifier(&vb->nb);
> > @@ -825,6 +1013,7 @@ static void virtballoon_remove(struct virtio_device *vdev)
> >   	spin_unlock_irq(&vb->stop_update_lock);
> >   	cancel_work_sync(&vb->update_balloon_size_work);
> >   	cancel_work_sync(&vb->update_balloon_stats_work);
> > +	cancel_work_sync(&vb->report_free_page_work);
> >   	remove_common(vb);
> >   #ifdef CONFIG_BALLOON_COMPACTION
> > @@ -878,6 +1067,7 @@ static unsigned int features[] = {
> >   	VIRTIO_BALLOON_F_STATS_VQ,
> >   	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
> >   	VIRTIO_BALLOON_F_SG,
> > +	VIRTIO_BALLOON_F_FREE_PAGE_VQ,
> >   };
> >   static struct virtio_driver virtio_balloon_driver = {
> > diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> > index 37780a7..b758484 100644
> > --- a/include/uapi/linux/virtio_balloon.h
> > +++ b/include/uapi/linux/virtio_balloon.h
> > @@ -35,15 +35,26 @@
> >   #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
> >   #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
> >   #define VIRTIO_BALLOON_F_SG		3 /* Use sg instead of PFN lists */
> > +#define VIRTIO_BALLOON_F_FREE_PAGE_VQ	4 /* VQ to report free pages */
> >   /* Size of a PFN in the balloon interface. */
> >   #define VIRTIO_BALLOON_PFN_SHIFT 12
> > +#define	VIRTIO_BALLOON_CMD_BALLOON_MEMORY		(1 << 0)
> > +#define	VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_START	(1 << 1)
> > +#define	VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP	(1 << 2)
> > +
> >   struct virtio_balloon_config {
> >   	/* Number of pages host wants Guest to give up. */
> >   	__u32 num_pages;
> >   	/* Number of pages we've actually got in balloon. */
> >   	__u32 actual;
> > +	/* Host-to-guest command, readonly by guest */
> > +	__u32 host2guest_cmd;
> > +	/* Sequence id of the free_page report command, readonly by guest */
> > +	__u32 free_page_cmd_id;
> > +	/* Guest-to-host command */
> > +	__u32 guest2host_cmd;
> >   };
> >   #define VIRTIO_BALLOON_S_SWAP_IN  0   /* Amount of memory swapped in */
Wang, Wei W Nov. 14, 2017, 12:02 p.m. UTC | #3
On 11/14/2017 01:32 AM, Michael S. Tsirkin wrote:
>> - guest2host_cmd: written by the guest to ACK to the host about the
>> commands that have been received. The host will clear the corresponding
>> bits on the host2guest_cmd register. The guest also uses this register
>> to send commands to the host (e.g. when finish free page reporting).
> I am not sure what is the role of guest2host_cmd. Reporting of
> the correct cmd id seems sufficient indication that guest
> received the start command. Not getting any more seems sufficient
> to detect stop.
>

I think the issue is that when the host is waiting for the guest to report
pages, it does not know whether the guest is going to report more or whether
the report is already done. That's why we need a way to let the guest tell
the host "the report is done, don't wait for more", so that the host can
continue to the next step - sending the non-free pages to the
destination. The following method is a summary of the other comments,
with some new ideas. Please check whether it looks good.

Two new configuration registers in total:
- cmd_reg: the command register, combined from the previous host2guest
and guest2host registers. I think we can use the same register for host
requests and guest ACKs, since a guest write traps to QEMU; that is,
all the writes to the register are performed in QEMU, and we can keep
things working correctly there.
- cmd_id_reg: the sequence id of the free page report command.

-- free page report:
     - host requests the guest to start reporting by "cmd_reg | 
REPORT_START";
     - guest ACKs to the host about receiving the start reporting 
request by "cmd_reg | REPORT_START", host will clear the flag bit once 
receiving the ACK.
     - host requests the guest to stop reporting by "cmd_reg | REPORT_STOP";
     - guest ACKs to the host about receiving the stop reporting request 
by "cmd_reg | REPORT_STOP", host will clear the flag once receiving the ACK.
     - guest tells the host about the start of the reporting by writing 
"cmd id" into an outbuf, which is added to the free page vq.
     - guest tells the host about the end of the reporting by writing 
"0" into an outbuf, which is added to the free page vq. (we reserve 
"id=0" as the stop sign)

-- ballooning:
     - host requests the guest to start ballooning by "cmd_reg | 
BALLOONING";
     - guest ACKs to the host about receiving the request by "cmd_reg | 
BALLOONING", host will clear the flag once receiving the ACK.


Some more explanations:
-- Why not let the host request the guest to start the free page 
reporting simply by writing a new cmd id to the cmd_id_reg?
The configuration interrupt is shared among all the features - 
ballooning, free page reporting, and future feature extensions which 
need host-to-guest requests. Some features may need to add other 
feature-specific configuration registers; for example, free page 
reporting needs the cmd_id_reg, which is not used by ballooning. The rule 
here is that the feature-specific registers are read only when that 
feature is requested via the cmd_reg. For example, the cmd_id_reg is read 
only when "cmd_reg & REPORT_START" is non-zero. Otherwise, when the driver 
receives a configuration interrupt, it has to read both the cmd_reg and 
cmd_id registers to know what is requested by the host - think about the 
case where ballooning requests are sent frequently while free page 
reporting isn't requested: the guest has to read the cmd_id register every 
time a ballooning request is sent by the host, which is unnecessary. If 
future new features follow this style, there will be more unnecessary 
VMexits to read the unused feature specific registers.
So I think it is good to have a central control of the feature request 
via only one cmd register - reading that one is enough to know what is 
requested by the host.


Best,
Wei
Michael S. Tsirkin Nov. 14, 2017, 9:21 p.m. UTC | #4
On Tue, Nov 14, 2017 at 08:02:03PM +0800, Wei Wang wrote:
> On 11/14/2017 01:32 AM, Michael S. Tsirkin wrote:
> > > - guest2host_cmd: written by the guest to ACK to the host about the
> > > commands that have been received. The host will clear the corresponding
> > > bits on the host2guest_cmd register. The guest also uses this register
> > > to send commands to the host (e.g. when finish free page reporting).
> > I am not sure what is the role of guest2host_cmd. Reporting of
> > the correct cmd id seems sufficient indication that guest
> > received the start command. Not getting any more seems sufficient
> > to detect stop.
> > 
> 
> I think the issue is when the host is waiting for the guest to report pages,
> it does not know whether the guest is going to report more or the report is
> done already. That's why we need a way to let the guest tell the host "the
> report is done, don't wait for more", then the host continues to the next
> step - sending the non-free pages to the destination. The following method
> is a conclusion of other comments, with some new thought. Please have a
> check if it is good.

config won't work well for this IMHO.
Writes to config register are hard to synchronize with the VQ.
For example, guest sends free pages, host says stop, meanwhile
guest sends stop for 1st set of pages.

How about adding a buffer with "stop" in the VQ instead?
Wastes a VQ entry which you will need to reserve for this
but is it a big deal?


> Two new configuration registers in total:
> - cmd_reg: the command register, combined from the previous host2guest and
> guest2host. I think we can use the same register for host requesting and
> guest ACKing, since the guest writing will trap to QEMU, that is, all the
> writes to the register are performed in QEMU, and we can keep things work in
> a correct way there.
> - cmd_id_reg: the sequence id of the free page report command.
> 
> -- free page report:
>     - host requests the guest to start reporting by "cmd_reg |
> REPORT_START";
>     - guest ACKs to the host about receiving the start reporting request by
> "cmd_reg | REPORT_START", host will clear the flag bit once receiving the
> ACK.
>     - host requests the guest to stop reporting by "cmd_reg | REPORT_STOP";
>     - guest ACKs to the host about receiving the stop reporting request by
> "cmd_reg | REPORT_STOP", host will clear the flag once receiving the ACK.
>     - guest tells the host about the start of the reporting by writing "cmd
> id" into an outbuf, which is added to the free page vq.
>     - guest tells the host about the end of the reporting by writing "0"
> into an outbuf, which is added to the free page vq. (we reserve "id=0" as
> the stop sign)
> 
> -- ballooning:
>     - host requests the guest to start ballooning by "cmd_reg | BALLOONING";
>     - guest ACKs to the host about receiving the request by "cmd_reg |
> BALLOONING", host will clear the flag once receiving the ACK.
> 
> 
> Some more explanations:
> -- Why not let the host request the guest to start the free page reporting
> simply by writing a new cmd id to the cmd_id_reg?
> The configuration interrupt is shared among all the features - ballooning,
> free page reporting, and future feature extensions which need host-to-guest
> requests. Some features may need to add other feature specific configuration
> registers, like free page reporting need the cmd_id_reg, which is not used
> by ballooning. The rule here is that the feature specific registers are read
> only when that feature is requested via the cmd_reg. For example, the
> cmd_id_reg is read only when "cmd_reg | REPORT_START" is true. Otherwise,
> when the driver receives a configuration interrupt, it has to read both
> cmd_reg and cmd_id registers to know what are requested by the host - think
> about the case that ballooning requests are sent frequently while free page
> reporting isn't requested, the guest has to read the cmd_id register every
> time a ballooning request is sent by the host, which is not necessary. If
> future new features follow this style, there will be more unnecessary
> VMexits to read the unused feature specific registers.
> So I think it is good to have a central control of the feature request via
> only one cmd register - reading that one is enough to know what is requested
> by the host.
> 

Right now you are increasing the cost of balloon request 3x though.


How about we establish a baseline with a simple interface, and
then add the command register when it's actually beneficial.



> Best,
> Wei
Wang, Wei W Nov. 15, 2017, 3:47 a.m. UTC | #5
On 11/15/2017 05:21 AM, Michael S. Tsirkin wrote:
> On Tue, Nov 14, 2017 at 08:02:03PM +0800, Wei Wang wrote:
>> On 11/14/2017 01:32 AM, Michael S. Tsirkin wrote:
>>>> - guest2host_cmd: written by the guest to ACK to the host about the
>>>> commands that have been received. The host will clear the corresponding
>>>> bits on the host2guest_cmd register. The guest also uses this register
>>>> to send commands to the host (e.g. when finish free page reporting).
>>> I am not sure what is the role of guest2host_cmd. Reporting of
>>> the correct cmd id seems sufficient indication that guest
>>> received the start command. Not getting any more seems sufficient
>>> to detect stop.
>>>
>> I think the issue is when the host is waiting for the guest to report pages,
>> it does not know whether the guest is going to report more or the report is
>> done already. That's why we need a way to let the guest tell the host "the
>> report is done, don't wait for more", then the host continues to the next
>> step - sending the non-free pages to the destination. The following method
>> is a conclusion of other comments, with some new thought. Please have a
>> check if it is good.
> config won't work well for this IMHO.
> Writes to config register are hard to synchronize with the VQ.
> For example, guest sends free pages, host says stop, meanwhile
> guest sends stop for 1st set of pages.

I still don't see an issue with this. Please see below:
(before jumping into the discussion, just make sure I've well explained 
this point: now host-to-guest commands are done via config, and 
guest-to-host commands are done via the free page vq)

Case: Host starts to request the reporting with cmd_id=1. Some time 
later, Host writes "stop" to config; meanwhile, the guest happens to 
finish the reporting and plans to actively send a "stop" command via the 
free page vq.
           Essentially, this is like a sync between two threads - if we 
view the config interrupt handler as one thread, another is the free 
page reporting worker thread.

         - what the config handler does is simply:
               1.1:  WRITE_ONCE(vb->reporting_stop, true);

         - what the reporting thread will do is
               2.1:  WRITE_ONCE(vb->reporting_stop, true);
               2.2:  send_stop_to_host_via_vq();

 From the guest point of view, no matter 1.1 is executed first or 2.1 
first, it doesn't make a difference to the end result - 
vb->reporting_stop is set.

 From the host point of view, it knows that cmd_id=1 has truly stopped 
the reporting when it receives a "stop" sign via the vq.


> How about adding a buffer with "stop" in the VQ instead?
> Wastes a VQ entry which you will need to reserve for this
> but is it a big deal?

The free page vq is guest-to-host direction. Using it for host-to-guest 
requests will make it bidirectional, which will result in the same issue 
described before: https://lkml.org/lkml/2017/10/11/1009 (the first response)

On the other hand, I think adding another new vq for host-to-guest 
requesting doesn't make a difference in essence, compared to using 
config (same 1.1, 2.1, 2.2 above), but will be more complicated.


>> Two new configuration registers in total:
>> - cmd_reg: the command register, combined from the previous host2guest and
>> guest2host. I think we can use the same register for host requesting and
>> guest ACKing, since the guest writing will trap to QEMU, that is, all the
>> writes to the register are performed in QEMU, and we can keep things work in
>> a correct way there.
>> - cmd_id_reg: the sequence id of the free page report command.
>>
>> -- free page report:
>>      - host requests the guest to start reporting by "cmd_reg |
>> REPORT_START";
>>      - guest ACKs to the host about receiving the start reporting request by
>> "cmd_reg | REPORT_START", host will clear the flag bit once receiving the
>> ACK.
>>      - host requests the guest to stop reporting by "cmd_reg | REPORT_STOP";
>>      - guest ACKs to the host about receiving the stop reporting request by
>> "cmd_reg | REPORT_STOP", host will clear the flag once receiving the ACK.
>>      - guest tells the host about the start of the reporting by writing "cmd
>> id" into an outbuf, which is added to the free page vq.
>>      - guest tells the host about the end of the reporting by writing "0"
>> into an outbuf, which is added to the free page vq. (we reserve "id=0" as
>> the stop sign)
>>
>> -- ballooning:
>>      - host requests the guest to start ballooning by "cmd_reg | BALLOONING";
>>      - guest ACKs to the host about receiving the request by "cmd_reg |
>> BALLOONING", host will clear the flag once receiving the ACK.
>>
>>
>> Some more explanations:
>> -- Why not let the host request the guest to start the free page reporting
>> simply by writing a new cmd id to the cmd_id_reg?
>> The configuration interrupt is shared among all the features - ballooning,
>> free page reporting, and future feature extensions which need host-to-guest
>> requests. Some features may need to add other feature specific configuration
>> registers, like free page reporting need the cmd_id_reg, which is not used
>> by ballooning. The rule here is that the feature specific registers are read
>> only when that feature is requested via the cmd_reg. For example, the
>> cmd_id_reg is read only when "cmd_reg | REPORT_START" is true. Otherwise,
>> when the driver receives a configuration interrupt, it has to read both
>> cmd_reg and cmd_id registers to know what are requested by the host - think
>> about the case that ballooning requests are sent frequently while free page
>> reporting isn't requested, the guest has to read the cmd_id register every
>> time a ballooning request is sent by the host, which is not necessary. If
>> future new features follow this style, there will be more unnecessary
>> VMexits to read the unused feature specific registers.
>> So I think it is good to have a central control of the feature request via
>> only one cmd register - reading that one is enough to know what is requested
>> by the host.
>>
> Right now you are increasing the cost of balloon request 3x though.

Not that much, I think, just a cmd register read and ACK, and this 
should be negligible compared to the ballooning time.
(I don't see a difference in the performance testing either).

Best,
Wei
Michael S. Tsirkin Nov. 15, 2017, 1:26 p.m. UTC | #6
On Wed, Nov 15, 2017 at 11:47:58AM +0800, Wei Wang wrote:
> On 11/15/2017 05:21 AM, Michael S. Tsirkin wrote:
> > On Tue, Nov 14, 2017 at 08:02:03PM +0800, Wei Wang wrote:
> > > On 11/14/2017 01:32 AM, Michael S. Tsirkin wrote:
> > > > > - guest2host_cmd: written by the guest to ACK to the host about the
> > > > > commands that have been received. The host will clear the corresponding
> > > > > bits on the host2guest_cmd register. The guest also uses this register
> > > > > to send commands to the host (e.g. when finish free page reporting).
> > > > I am not sure what is the role of guest2host_cmd. Reporting of
> > > > the correct cmd id seems sufficient indication that guest
> > > > received the start command. Not getting any more seems sufficient
> > > > to detect stop.
> > > > 
> > > I think the issue is when the host is waiting for the guest to report pages,
> > > it does not know whether the guest is going to report more or the report is
> > > done already. That's why we need a way to let the guest tell the host "the
> > > report is done, don't wait for more", then the host continues to the next
> > > step - sending the non-free pages to the destination. The following method
> > > is a conclusion of other comments, with some new thought. Please have a
> > > check if it is good.
> > config won't work well for this IMHO.
> > Writes to config register are hard to synchronize with the VQ.
> > For example, guest sends free pages, host says stop, meanwhile
> > guest sends stop for 1st set of pages.
> 
> I still don't see an issue with this. Please see below:
> (before jumping into the discussion, just make sure I've well explained this
> point: now host-to-guest commands are done via config, and guest-to-host
> commands are done via the free page vq)

This is fine by me actually. But right now you have guest to host
not going through vq, going through command register instead -
this is how sending stop to host seems to happen.
If you make it go through vq then I think all will be well.

> 
> Case: Host starts to request the reporting with cmd_id=1. Some time later,
> Host writes "stop" to config, meantime guest happens to finish the reporting
> and plan to actively send a "stop" command from the free_page_vq().
>           Essentially, this is like a sync between two threads - if we view
> the config interrupt handler as one thread, another is the free page
> reporting worker thread.
> 
>         - what the config handler does is simply:
>               1.1:  WRITE_ONCE(vb->reporting_stop, true);
> 
>         - what the reporting thread will do is
>               2.1:  WRITE_ONCE(vb->reporting_stop, true);
>               2.2:  send_stop_to_host_via_vq();
> 
> From the guest point of view, no matter 1.1 is executed first or 2.1 first,
> it doesn't make a difference to the end result - vb->reporting_stop is set.
> 
> From the host point of view, it knows that cmd_id=1 has truly stopped the
> reporting when it receives a "stop" sign via the vq.
> 
> 
> > How about adding a buffer with "stop" in the VQ instead?
> > Wastes a VQ entry which you will need to reserve for this
> > but is it a big deal?
> 
> The free page vq is guest-to-host direction.

Yes, for guest to host stop sign.

> Using it for host-to-guest
> requests will make it bidirectional, which will result in the same issue
> described before: https://lkml.org/lkml/2017/10/11/1009 (the first response)
> 
> On the other hand, I think adding another new vq for host-to-guest
> requesting doesn't make a difference in essence, compared to using config
> (same 1.1, 2.1, 2.2 above), but will be more complicated.

I agree with this. Host to guest can just increment the "free command id"
register.

> 
> > > Two new configuration registers in total:
> > > - cmd_reg: the command register, combined from the previous host2guest and
> > > guest2host. I think we can use the same register for host requesting and
> > > guest ACKing, since the guest writing will trap to QEMU, that is, all the
> > > writes to the register are performed in QEMU, and we can keep things work in
> > > a correct way there.
> > > - cmd_id_reg: the sequence id of the free page report command.
> > > 
> > > -- free page report:
> > >      - host requests the guest to start reporting by "cmd_reg |
> > > REPORT_START";
> > >      - guest ACKs to the host about receiving the start reporting request by
> > > "cmd_reg | REPORT_START", host will clear the flag bit once receiving the
> > > ACK.
> > >      - host requests the guest to stop reporting by "cmd_reg | REPORT_STOP";
> > >      - guest ACKs to the host about receiving the stop reporting request by
> > > "cmd_reg | REPORT_STOP", host will clear the flag once receiving the ACK.
> > >      - guest tells the host about the start of the reporting by writing "cmd
> > > id" into an outbuf, which is added to the free page vq.
> > >      - guest tells the host about the end of the reporting by writing "0"
> > > into an outbuf, which is added to the free page vq. (we reserve "id=0" as
> > > the stop sign)
> > > 
> > > -- ballooning:
> > >      - host requests the guest to start ballooning by "cmd_reg | BALLOONING";
> > >      - guest ACKs to the host about receiving the request by "cmd_reg |
> > > BALLOONING", host will clear the flag once receiving the ACK.
> > > 
> > > 
> > > Some more explanations:
> > > -- Why not let the host request the guest to start the free page reporting
> > > simply by writing a new cmd id to the cmd_id_reg?
> > > The configuration interrupt is shared among all the features - ballooning,
> > > free page reporting, and future feature extensions which need host-to-guest
> > > requests. Some features may need to add other feature specific configuration
> > > registers, like free page reporting need the cmd_id_reg, which is not used
> > > by ballooning. The rule here is that the feature specific registers are read
> > > only when that feature is requested via the cmd_reg. For example, the
> > > cmd_id_reg is read only when "cmd_reg | REPORT_START" is true. Otherwise,
> > > when the driver receives a configuration interrupt, it has to read both
> > > cmd_reg and cmd_id registers to know what are requested by the host - think
> > > about the case that ballooning requests are sent frequently while free page
> > > reporting isn't requested, the guest has to read the cmd_id register every
> > > time a ballooning request is sent by the host, which is not necessary. If
> > > future new features follow this style, there will be more unnecessary
> > > VMexits to read the unused feature specific registers.
> > > So I think it is good to have a central control of the feature request via
> > > only one cmd register - reading that one is enough to know what is requested
> > > by the host.
> > > 
> > Right now you are increasing the cost of balloon request 3x though.
> 
> Not that much, I think, just a cmd register read and ACK, and this should be
> neglected compared to the ballooning time.
> (I don't see a difference in the performance testing either).
> 
> Best,
> Wei
Michael S. Tsirkin Nov. 15, 2017, 8:32 p.m. UTC | #7
On Fri, Nov 03, 2017 at 04:13:06PM +0800, Wei Wang wrote:
> Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
> support of reporting hints of guest free pages to the host via
> virtio-balloon. The host requests the guest to report the free pages by
> sending commands via the virtio-balloon configuration registers.
> 
> When the guest starts to report, the first element added to the free page
> vq is a sequence id of the start reporting command. The id is given by
> the host, and it indicates whether the following free pages correspond
> to the command. For example, the host may stop the report and start again
> with a new command id. The obsolete pages for the previous start command
> can be detected by the id dismatching on the host. The id is added to the
> vq using an output buffer, and the free pages are added to the vq using
> input buffer.
> 
> Here are some explainations about the added configuration registers:
> - host2guest_cmd: a register used by the host to send commands to the
> guest.
> - guest2host_cmd: written by the guest to ACK to the host about the
> commands that have been received. The host will clear the corresponding
> bits on the host2guest_cmd register. The guest also uses this register
> to send commands to the host (e.g. when finish free page reporting).
> - free_page_cmd_id: the sequence id of the free page report command
> given by the host.
> 
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> Signed-off-by: Liang Li <liang.z.li@intel.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Michal Hocko <mhocko@kernel.org>
> ---
>  drivers/virtio/virtio_balloon.c     | 234 ++++++++++++++++++++++++++++++++----
>  include/uapi/linux/virtio_balloon.h |  11 ++
>  2 files changed, 223 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index b31fc25..4087f04 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -55,7 +55,12 @@ static struct vfsmount *balloon_mnt;
>  
>  struct virtio_balloon {
>  	struct virtio_device *vdev;
> -	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
> +	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> +
> +	/* Balloon's own wq for cpu-intensive work items */
> +	struct workqueue_struct *balloon_wq;
> +	/* The free page reporting work item submitted to the balloon wq */
> +	struct work_struct report_free_page_work;
>  
>  	/* The balloon servicing is delegated to a freezable workqueue. */
>  	struct work_struct update_balloon_stats_work;
> @@ -65,6 +70,10 @@ struct virtio_balloon {
>  	spinlock_t stop_update_lock;
>  	bool stop_update;
>  
> +	/* Stop reporting free pages */
> +	bool report_free_page_stop;
> +	uint32_t free_page_cmd_id;
> +
>  	/* Waiting for host to ack the pages we released. */
>  	wait_queue_head_t acked;
>  
> @@ -191,6 +200,30 @@ static void send_balloon_page_sg(struct virtio_balloon *vb,
>  		kick_and_wait(vq, vb->acked);
>  }
>  
> +static void send_free_page_sg(struct virtqueue *vq, void *addr, uint32_t size)
> +{
> +	int err = 0;
> +	unsigned int len;
> +
> +	/* Detach all the used buffers from the vq */
> +	while (virtqueue_get_buf(vq, &len))
> +		;
> +
> +	/*
> +	 * Since this is an optimization feature, losing a couple of free
> +	 * pages to report isn't important. We simply resturn without adding
> +	 * the page if the vq is full.
> +	 */
> +	if (vq->num_free) {
> +		err = add_one_sg(vq, addr, size);
> +		BUG_ON(err);
> +	}
> +
> +	/* Batch till the vq is full */
> +	if (!vq->num_free)
> +		virtqueue_kick(vq);
> +}
> +
>  /*
>   * Send balloon pages in sgs to host. The balloon pages are recorded in the
>   * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
> @@ -495,9 +528,8 @@ static void stats_handle_request(struct virtio_balloon *vb)
>  	virtqueue_kick(vq);
>  }
>  
> -static void virtballoon_changed(struct virtio_device *vdev)
> +static void virtballoon_cmd_balloon_memory(struct virtio_balloon *vb)
>  {
> -	struct virtio_balloon *vb = vdev->priv;
>  	unsigned long flags;
>  
>  	spin_lock_irqsave(&vb->stop_update_lock, flags);
> @@ -506,6 +538,50 @@ static void virtballoon_changed(struct virtio_device *vdev)
>  	spin_unlock_irqrestore(&vb->stop_update_lock, flags);
>  }
>  
> +static void virtballoon_cmd_report_free_page_start(struct virtio_balloon *vb)
> +{
> +	unsigned long flags;
> +
> +	vb->report_free_page_stop = false;
> +	spin_lock_irqsave(&vb->stop_update_lock, flags);
> +	if (!vb->stop_update)
> +		queue_work(vb->balloon_wq, &vb->report_free_page_work);
> +	spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> +}
> +
> +static void virtballoon_changed(struct virtio_device *vdev)
> +{
> +	struct virtio_balloon *vb = vdev->priv;
> +	u32 host2guest_cmd, guest2host_cmd = 0;
> +
> +	if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
> +		virtballoon_cmd_balloon_memory(vb);
> +		return;
> +	}
> +
> +	virtio_cread(vb->vdev, struct virtio_balloon_config, host2guest_cmd,
> +		     &host2guest_cmd);
> +
> +	if (host2guest_cmd & VIRTIO_BALLOON_CMD_BALLOON_MEMORY) {
> +		virtballoon_cmd_balloon_memory(vb);
> +		guest2host_cmd |= VIRTIO_BALLOON_CMD_BALLOON_MEMORY;
> +	}
> +
> +	if (host2guest_cmd & VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_START) {
> +		virtballoon_cmd_report_free_page_start(vb);
> +		guest2host_cmd |= VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_START;
> +	}
> +
> +	if (host2guest_cmd & VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP) {
> +		vb->report_free_page_stop = true;
> +		guest2host_cmd |= VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP;
> +	}
> +
> +	/* Ack to the host about the commands that have been received */
> +	virtio_cwrite(vb->vdev, struct virtio_balloon_config, guest2host_cmd,
> +		      &guest2host_cmd);
> +}
> +
>  static inline s64 towards_target(struct virtio_balloon *vb)
>  {
>  	s64 target;
> @@ -597,42 +673,147 @@ static void update_balloon_size_func(struct work_struct *work)
>  		queue_work(system_freezable_wq, work);
>  }
>  
> -static int init_vqs(struct virtio_balloon *vb)
> +static bool virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
> +					   unsigned long nr_pages)
>  {
> -	struct virtqueue *vqs[3];
> -	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
> -	static const char * const names[] = { "inflate", "deflate", "stats" };
> -	int err, nvqs;
> +	struct virtio_balloon *vb = (struct virtio_balloon *)opaque;
> +	void *addr = (void *)pfn_to_kaddr(pfn);
> +	uint32_t len = nr_pages << PAGE_SHIFT;
> +
> +	if (vb->report_free_page_stop)
> +		return false;
> +
> +	send_free_page_sg(vb->free_page_vq, addr, len);
>  
> +	return true;
> +}
> +
> +static void report_free_page_end(struct virtio_balloon *vb)
> +{
> +	u32 cmd = VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP;
>  	/*
> -	 * We expect two virtqueues: inflate and deflate, and
> -	 * optionally stat.
> +	 * The host may have already requested to stop the reporting before we
> +	 * finish, so no need to notify the host in this case.
>  	 */
> -	nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
> -	err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
> +	if (vb->report_free_page_stop)
> +		return;
> +	vb->report_free_page_stop = true;
> +
> +	virtio_cwrite(vb->vdev, struct virtio_balloon_config, guest2host_cmd,
> +		      &cmd);
> +}
> +
> +static void report_free_page_cmd_id(struct virtio_balloon *vb)
> +{
> +	struct scatterlist sg;
> +	int err;
> +
> +	virtio_cread(vb->vdev, struct virtio_balloon_config, free_page_cmd_id,
> +		     &vb->free_page_cmd_id);
> +	sg_init_one(&sg, &vb->free_page_cmd_id, sizeof(uint32_t));
> +	err = virtqueue_add_outbuf(vb->free_page_vq, &sg, 1,
> +				   &vb->free_page_cmd_id, GFP_KERNEL);
> +	BUG_ON(err);
> +}
> +
> +static void report_free_page(struct work_struct *work)
> +{
> +	struct virtio_balloon *vb;
> +
> +	vb = container_of(work, struct virtio_balloon, report_free_page_work);
> +	report_free_page_cmd_id(vb);
> +	walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
> +	/*
> +	 * The last few free page blocks that were added may not reach the
> +	 * batch size, but need a kick to notify the device to handle them.
> +	 */
> +	virtqueue_kick(vb->free_page_vq);
> +	report_free_page_end(vb);
> +}
> +

I think there's an issue here: if pages are poisoned and hypervisor
subsequently drops them, testing them after allocation will
trigger a false positive.

The specific configuration:

PAGE_POISONING on
PAGE_POISONING_NO_SANITY off
PAGE_POISONING_ZERO off


Solutions:
1. disable the feature in that configuration
	suggested as an initial step
2. pass poison value to host so it can validate page content
   before it drops it
3. pass poison value to host so it can init allocated pages with that value

In fact one nice side effect would be that unmap
becomes safe even though free list is not locked anymore.

It would be interesting to see whether this last has
any value performance-wise.


> +static int init_vqs(struct virtio_balloon *vb)
> +{
> +	struct virtqueue **vqs;
> +	vq_callback_t **callbacks;
> +	const char **names;
> +	struct scatterlist sg;
> +	int i, nvqs, err = -ENOMEM;
> +
> +	/* Inflateq and deflateq are used unconditionally */
> +	nvqs = 2;
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ))
> +		nvqs++;
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ))
> +		nvqs++;
> +
> +	/* Allocate space for find_vqs parameters */
> +	vqs = kcalloc(nvqs, sizeof(*vqs), GFP_KERNEL);
> +	if (!vqs)
> +		goto err_vq;
> +	callbacks = kmalloc_array(nvqs, sizeof(*callbacks), GFP_KERNEL);
> +	if (!callbacks)
> +		goto err_callback;
> +	names = kmalloc_array(nvqs, sizeof(*names), GFP_KERNEL);
> +	if (!names)
> +		goto err_names;
> +
> +	callbacks[0] = balloon_ack;
> +	names[0] = "inflate";
> +	callbacks[1] = balloon_ack;
> +	names[1] = "deflate";
> +
> +	i = 2;
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> +		callbacks[i] = stats_request;
> +		names[i] = "stats";
> +		i++;
> +	}
> +
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
> +		callbacks[i] = NULL;
> +		names[i] = "free_page_vq";
> +	}
> +
> +	err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names,
> +					 NULL, NULL);
>  	if (err)
> -		return err;
> +		goto err_find;
>  
>  	vb->inflate_vq = vqs[0];
>  	vb->deflate_vq = vqs[1];
> +	i = 2;
>  	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
> -		struct scatterlist sg;
> -		unsigned int num_stats;
> -		vb->stats_vq = vqs[2];
> -
> +		vb->stats_vq = vqs[i++];
>  		/*
>  		 * Prime this virtqueue with one buffer so the hypervisor can
>  		 * use it to signal us later (it can't be broken yet!).
>  		 */
> -		num_stats = update_balloon_stats(vb);
> -
> -		sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
> +		sg_init_one(&sg, vb->stats, sizeof(vb->stats));
>  		if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
> -		    < 0)
> -			BUG();
> +		    < 0) {
> +			dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
> +				 __func__);
> +			goto err_find;
> +		}
>  		virtqueue_kick(vb->stats_vq);
>  	}
> +
> +	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ))
> +		vb->free_page_vq = vqs[i];
> +
> +	kfree(names);
> +	kfree(callbacks);
> +	kfree(vqs);
>  	return 0;
> +
> +err_find:
> +	kfree(names);
> +err_names:
> +	kfree(callbacks);
> +err_callback:
> +	kfree(vqs);
> +err_vq:
> +	return err;
>  }
>  
>  #ifdef CONFIG_BALLOON_COMPACTION
> @@ -761,6 +942,13 @@ static int virtballoon_probe(struct virtio_device *vdev)
>  	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_SG))
>  		xb_init(&vb->page_xb);
>  
> +	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
> +		vb->balloon_wq = alloc_workqueue("balloon-wq",
> +					WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
> +		INIT_WORK(&vb->report_free_page_work, report_free_page);
> +		vb->report_free_page_stop = true;
> +	}
> +
>  	vb->nb.notifier_call = virtballoon_oom_notify;
>  	vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
>  	err = register_oom_notifier(&vb->nb);
> @@ -825,6 +1013,7 @@ static void virtballoon_remove(struct virtio_device *vdev)
>  	spin_unlock_irq(&vb->stop_update_lock);
>  	cancel_work_sync(&vb->update_balloon_size_work);
>  	cancel_work_sync(&vb->update_balloon_stats_work);
> +	cancel_work_sync(&vb->report_free_page_work);
>  
>  	remove_common(vb);
>  #ifdef CONFIG_BALLOON_COMPACTION
> @@ -878,6 +1067,7 @@ static unsigned int features[] = {
>  	VIRTIO_BALLOON_F_STATS_VQ,
>  	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
>  	VIRTIO_BALLOON_F_SG,
> +	VIRTIO_BALLOON_F_FREE_PAGE_VQ,
>  };
>  
>  static struct virtio_driver virtio_balloon_driver = {
> diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
> index 37780a7..b758484 100644
> --- a/include/uapi/linux/virtio_balloon.h
> +++ b/include/uapi/linux/virtio_balloon.h
> @@ -35,15 +35,26 @@
>  #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
>  #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
>  #define VIRTIO_BALLOON_F_SG		3 /* Use sg instead of PFN lists */
> +#define VIRTIO_BALLOON_F_FREE_PAGE_VQ	4 /* VQ to report free pages */
>  
>  /* Size of a PFN in the balloon interface. */
>  #define VIRTIO_BALLOON_PFN_SHIFT 12
>  
> +#define	VIRTIO_BALLOON_CMD_BALLOON_MEMORY		(1 << 0)
> +#define	VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_START	(1 << 1)
> +#define	VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP	(1 << 2)
> +
>  struct virtio_balloon_config {
>  	/* Number of pages host wants Guest to give up. */
>  	__u32 num_pages;
>  	/* Number of pages we've actually got in balloon. */
>  	__u32 actual;
> +	/* Host-to-guest command, readonly by guest */
> +	__u32 host2guest_cmd;
> +	/* Sequence id of the free_page report command, readonly by guest */
> +	__u32 free_page_cmd_id;
> +	/* Guest-to-host command */
> +	__u32 guest2host_cmd;
>  };
>  
>  #define VIRTIO_BALLOON_S_SWAP_IN  0   /* Amount of memory swapped in */
> -- 
> 2.7.4
Wang, Wei W Nov. 16, 2017, 11:59 a.m. UTC | #8
On 11/15/2017 09:26 PM, Michael S. Tsirkin wrote:
> On Wed, Nov 15, 2017 at 11:47:58AM +0800, Wei Wang wrote:
>> On 11/15/2017 05:21 AM, Michael S. Tsirkin wrote:
>>> On Tue, Nov 14, 2017 at 08:02:03PM +0800, Wei Wang wrote:
>>>> On 11/14/2017 01:32 AM, Michael S. Tsirkin wrote:
>>>>>> - guest2host_cmd: written by the guest to ACK to the host about the
>>>>>> commands that have been received. The host will clear the corresponding
>>>>>> bits on the host2guest_cmd register. The guest also uses this register
>>>>>> to send commands to the host (e.g. when finish free page reporting).
>>>>> I am not sure what is the role of guest2host_cmd. Reporting of
>>>>> the correct cmd id seems sufficient indication that guest
>>>>> received the start command. Not getting any more seems sufficient
>>>>> to detect stop.
>>>>>
>>>> I think the issue is when the host is waiting for the guest to report pages,
>>>> it does not know whether the guest is going to report more or the report is
>>>> done already. That's why we need a way to let the guest tell the host "the
>>>> report is done, don't wait for more", then the host continues to the next
>>>> step - sending the non-free pages to the destination. The following method
>>>> is a conclusion of other comments, with some new thought. Please have a
>>>> check if it is good.
>>> config won't work well for this IMHO.
>>> Writes to config register are hard to synchronize with the VQ.
>>> For example, guest sends free pages, host says stop, meanwhile
>>> guest sends stop for 1st set of pages.
>> I still don't see an issue with this. Please see below:
>> (before jumping into the discussion, just make sure I've well explained this
>> point: now host-to-guest commands are done via config, and guest-to-host
>> commands are done via the free page vq)
> This is fine by me actually. But right now you have guest to host
> not going through vq, going through command register instead -
> this is how sending stop to host seems to happen.
> If you make it go through vq then I think all will be well.
>
>> Case: Host starts to request the reporting with cmd_id=1. Some time later,
>> Host writes "stop" to config, meantime guest happens to finish the reporting
>> and plan to actively send a "stop" command from the free_page_vq().
>>            Essentially, this is like a sync between two threads - if we view
>> the config interrupt handler as one thread, another is the free page
>> reporting worker thread.
>>
>>          - what the config handler does is simply:
>>                1.1:  WRITE_ONCE(vb->reporting_stop, true);
>>
>>          - what the reporting thread will do is
>>                2.1:  WRITE_ONCE(vb->reporting_stop, true);
>>                2.2:  send_stop_to_host_via_vq();
>>
>>  From the guest point of view, no matter 1.1 is executed first or 2.1 first,
>> it doesn't make a difference to the end result - vb->reporting_stop is set.
>>
>>  From the host point of view, it knows that cmd_id=1 has truly stopped the
>> reporting when it receives a "stop" sign via the vq.
>>
>>
>>> How about adding a buffer with "stop" in the VQ instead?
>>> Wastes a VQ entry which you will need to reserve for this
>>> but is it a big deal?
>> The free page vq is guest-to-host direction.
> Yes, for guest to host stop sign.
>
>> Using it for host-to-guest
>> requests will make it bidirectional, which will result in the same issue
>> described before: https://lkml.org/lkml/2017/10/11/1009 (the first response)
>>
>> On the other hand, I think adding another new vq for host-to-guest
>> requesting doesn't make a difference in essence, compared to using config
>> (same 1.1, 2.1, 2.2 above), but will be more complicated.
> I agree with this. Host to guest can just incremenent the "free command id"
> register.


OK, thanks for the suggestions. I think there is one more issue left here:

Previously, when the guest receives a config interrupt, it blindly adds 
the balloon work item to the workqueue in virtballoon_changed(), because 
only ballooning uses the config.
Now, free page reporting is requested via config, too.

We have the following two options:

Option 1: add "diff = towards_target()" to virtballoon_changed(), and if 
diff = 0, it will not add the balloon work item to the wq.

Option 2: add "cmd" for the host-to-guest request, and add the item when
the "cmd & CMD_BALLOON" bit is set.

I'm inclined to take option 1 now. Which one would you prefer?

Best,
Wei
Wang, Wei W Nov. 16, 2017, 1:27 p.m. UTC | #9
On 11/16/2017 04:32 AM, Michael S. Tsirkin wrote:
> On Fri, Nov 03, 2017 at 04:13:06PM +0800, Wei Wang wrote:
>> Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
>> support of reporting hints of guest free pages to the host via
>> virtio-balloon. The host requests the guest to report the free pages by
>> sending commands via the virtio-balloon configuration registers.
>>
>> When the guest starts to report, the first element added to the free page
>> vq is a sequence id of the start reporting command. The id is given by
>> the host, and it indicates whether the following free pages correspond
>> to the command. For example, the host may stop the report and start again
>> with a new command id. The obsolete pages for the previous start command
>> can be detected by the id dismatching on the host. The id is added to the
>> vq using an output buffer, and the free pages are added to the vq using
>> input buffer.
>>
>> Here are some explainations about the added configuration registers:
>> - host2guest_cmd: a register used by the host to send commands to the
>> guest.
>> - guest2host_cmd: written by the guest to ACK to the host about the
>> commands that have been received. The host will clear the corresponding
>> bits on the host2guest_cmd register. The guest also uses this register
>> to send commands to the host (e.g. when finish free page reporting).
>> - free_page_cmd_id: the sequence id of the free page report command
>> given by the host.
>>
>> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
>> Signed-off-by: Liang Li <liang.z.li@intel.com>
>> Cc: Michael S. Tsirkin <mst@redhat.com>
>> Cc: Michal Hocko <mhocko@kernel.org>
>> ---
>>
>> +
>> +static void report_free_page(struct work_struct *work)
>> +{
>> +	struct virtio_balloon *vb;
>> +
>> +	vb = container_of(work, struct virtio_balloon, report_free_page_work);
>> +	report_free_page_cmd_id(vb);
>> +	walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
>> +	/*
>> +	 * The last few free page blocks that were added may not reach the
>> +	 * batch size, but need a kick to notify the device to handle them.
>> +	 */
>> +	virtqueue_kick(vb->free_page_vq);
>> +	report_free_page_end(vb);
>> +}
>> +
> I think there's an issue here: if pages are poisoned and hypervisor
> subsequently drops them, testing them after allocation will
> trigger a false positive.
>
> The specific configuration:
>
> PAGE_POISONING on
> PAGE_POISONING_NO_SANITY off
> PAGE_POISONING_ZERO off
>
>
> Solutions:
> 1. disable the feature in that configuration
> 	suggested as an initial step

Thanks for the finding.
Similar to this option: I'm wondering whether we could make
walk_free_mem_block() simply return if that option is on?
That is, at the beginning of the function:
     if (page_poisoning_enabled())
                 return;

I think in most usages, people would not choose to use the poisoning 
option due to the added overhead.


Probably we could make it a separate fix patch of this report following 
patch 5 to explain the above reasons in the commit.

> 2. pass poison value to host so it can validate page content
>     before it drops it
> 3. pass poison value to host so it can init allocated pages with that value
>
> In fact one nice side effect would be that unmap
> becomes safe even though free list is not locked anymore.

I haven't got this point yet. How would it bring a performance benefit?

> It would be interesting to see whether this last has
> any value performance-wise.
>

Best,
Wei
Wang, Wei W Nov. 17, 2017, 11:35 a.m. UTC | #10
On 11/16/2017 09:27 PM, Wei Wang wrote:
> On 11/16/2017 04:32 AM, Michael S. Tsirkin wrote:
>> On Fri, Nov 03, 2017 at 04:13:06PM +0800, Wei Wang wrote:
>>> Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
>>> support of reporting hints of guest free pages to the host via
>>> virtio-balloon. The host requests the guest to report the free pages by
>>> sending commands via the virtio-balloon configuration registers.
>>>
>>> When the guest starts to report, the first element added to the free 
>>> page
>>> vq is a sequence id of the start reporting command. The id is given by
>>> the host, and it indicates whether the following free pages correspond
>>> to the command. For example, the host may stop the report and start 
>>> again
>>> with a new command id. The obsolete pages for the previous start 
>>> command
>>> can be detected by the id dismatching on the host. The id is added 
>>> to the
>>> vq using an output buffer, and the free pages are added to the vq using
>>> input buffer.
>>>
>>> Here are some explainations about the added configuration registers:
>>> - host2guest_cmd: a register used by the host to send commands to the
>>> guest.
>>> - guest2host_cmd: written by the guest to ACK to the host about the
>>> commands that have been received. The host will clear the corresponding
>>> bits on the host2guest_cmd register. The guest also uses this register
>>> to send commands to the host (e.g. when finish free page reporting).
>>> - free_page_cmd_id: the sequence id of the free page report command
>>> given by the host.
>>>
>>> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
>>> Signed-off-by: Liang Li <liang.z.li@intel.com>
>>> Cc: Michael S. Tsirkin <mst@redhat.com>
>>> Cc: Michal Hocko <mhocko@kernel.org>
>>> ---
>>>
>>> +
>>> +static void report_free_page(struct work_struct *work)
>>> +{
>>> +    struct virtio_balloon *vb;
>>> +
>>> +    vb = container_of(work, struct virtio_balloon, 
>>> report_free_page_work);
>>> +    report_free_page_cmd_id(vb);
>>> +    walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
>>> +    /*
>>> +     * The last few free page blocks that were added may not reach the
>>> +     * batch size, but need a kick to notify the device to handle 
>>> them.
>>> +     */
>>> +    virtqueue_kick(vb->free_page_vq);
>>> +    report_free_page_end(vb);
>>> +}
>>> +
>> I think there's an issue here: if pages are poisoned and hypervisor
>> subsequently drops them, testing them after allocation will
>> trigger a false positive.
>>
>> The specific configuration:
>>
>> PAGE_POISONING on
>> PAGE_POISONING_NO_SANITY off
>> PAGE_POISONING_ZERO off
>>
>>
>> Solutions:
>> 1. disable the feature in that configuration
>>     suggested as an initial step
>
> Thanks for the finding.
> Similar to this option: I'm thinking could we make 
> walk_free_mem_block() simply return if that option is on?
> That is, at the beginning of the function:
>     if (!page_poisoning_enabled())
>                 return;
>


Having thought about it more, I think it would be better to put this logic
in virtio_balloon:

         send_free_page_cmd_id(vb, &vb->start_cmd_id);
         if (page_poisoning_enabled() &&
             !IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
                 walk_free_mem_block(vb, 0, 
&virtio_balloon_send_free_pages);
         send_free_page_cmd_id(vb, &vb->stop_cmd_id);


walk_free_mem_block() should be a more generic API, and this potential 
page poisoning issue is specific to live migration which is only one use 
case of this function, so I think it is better to handle it in the 
special use case itself.

Best,
Wei
Wang, Wei W Nov. 17, 2017, 11:48 a.m. UTC | #11
On 11/17/2017 07:35 PM, Wei Wang wrote:
> On 11/16/2017 09:27 PM, Wei Wang wrote:
>> On 11/16/2017 04:32 AM, Michael S. Tsirkin wrote:
>>> On Fri, Nov 03, 2017 at 04:13:06PM +0800, Wei Wang wrote:
>>>> Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
>>>> support of reporting hints of guest free pages to the host via
>>>> virtio-balloon. The host requests the guest to report the free 
>>>> pages by
>>>> sending commands via the virtio-balloon configuration registers.
>>>>
>>>> When the guest starts to report, the first element added to the 
>>>> free page
>>>> vq is a sequence id of the start reporting command. The id is given by
>>>> the host, and it indicates whether the following free pages correspond
>>>> to the command. For example, the host may stop the report and start 
>>>> again
>>>> with a new command id. The obsolete pages for the previous start 
>>>> command
>>>> can be detected by the id dismatching on the host. The id is added 
>>>> to the
>>>> vq using an output buffer, and the free pages are added to the vq 
>>>> using
>>>> input buffer.
>>>>
>>>> Here are some explainations about the added configuration registers:
>>>> - host2guest_cmd: a register used by the host to send commands to the
>>>> guest.
>>>> - guest2host_cmd: written by the guest to ACK to the host about the
>>>> commands that have been received. The host will clear the 
>>>> corresponding
>>>> bits on the host2guest_cmd register. The guest also uses this register
>>>> to send commands to the host (e.g. when finish free page reporting).
>>>> - free_page_cmd_id: the sequence id of the free page report command
>>>> given by the host.
>>>>
>>>> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
>>>> Signed-off-by: Liang Li <liang.z.li@intel.com>
>>>> Cc: Michael S. Tsirkin <mst@redhat.com>
>>>> Cc: Michal Hocko <mhocko@kernel.org>
>>>> ---
>>>>
>>>> +
>>>> +static void report_free_page(struct work_struct *work)
>>>> +{
>>>> +    struct virtio_balloon *vb;
>>>> +
>>>> +    vb = container_of(work, struct virtio_balloon, 
>>>> report_free_page_work);
>>>> +    report_free_page_cmd_id(vb);
>>>> +    walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
>>>> +    /*
>>>> +     * The last few free page blocks that were added may not reach 
>>>> the
>>>> +     * batch size, but need a kick to notify the device to handle 
>>>> them.
>>>> +     */
>>>> +    virtqueue_kick(vb->free_page_vq);
>>>> +    report_free_page_end(vb);
>>>> +}
>>>> +
>>> I think there's an issue here: if pages are poisoned and hypervisor
>>> subsequently drops them, testing them after allocation will
>>> trigger a false positive.
>>>
>>> The specific configuration:
>>>
>>> PAGE_POISONING on
>>> PAGE_POISONING_NO_SANITY off
>>> PAGE_POISONING_ZERO off
>>>
>>>
>>> Solutions:
>>> 1. disable the feature in that configuration
>>>     suggested as an initial step
>>
>> Thanks for the finding.
>> Similar to this option: I'm thinking could we make 
>> walk_free_mem_block() simply return if that option is on?
>> That is, at the beginning of the function:
>>     if (!page_poisoning_enabled())
>>                 return;
>>
>
>
> Thought about it more, I think it would be better to put this logic to 
> virtio_balloon:
>
>         send_free_page_cmd_id(vb, &vb->start_cmd_id);
>         if (page_poisoning_enabled() &&
>             !IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
>                 walk_free_mem_block(vb, 0, 
> &virtio_balloon_send_free_pages);

The logic should be inverted:
     if (!(page_poisoning_enabled() &&
             !IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY)))

Best,
Wei
Michael S. Tsirkin Nov. 17, 2017, 12:44 p.m. UTC | #12
On Fri, Nov 17, 2017 at 07:35:03PM +0800, Wei Wang wrote:
> On 11/16/2017 09:27 PM, Wei Wang wrote:
> > On 11/16/2017 04:32 AM, Michael S. Tsirkin wrote:
> > > On Fri, Nov 03, 2017 at 04:13:06PM +0800, Wei Wang wrote:
> > > > Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
> > > > support of reporting hints of guest free pages to the host via
> > > > virtio-balloon. The host requests the guest to report the free pages by
> > > > sending commands via the virtio-balloon configuration registers.
> > > > 
> > > > When the guest starts to report, the first element added to the
> > > > free page
> > > > vq is a sequence id of the start reporting command. The id is given by
> > > > the host, and it indicates whether the following free pages correspond
> > > > to the command. For example, the host may stop the report and
> > > > start again
> > > > with a new command id. The obsolete pages for the previous start
> > > > command
> > > > can be detected by the id dismatching on the host. The id is
> > > > added to the
> > > > vq using an output buffer, and the free pages are added to the vq using
> > > > input buffer.
> > > > 
> > > > Here are some explainations about the added configuration registers:
> > > > - host2guest_cmd: a register used by the host to send commands to the
> > > > guest.
> > > > - guest2host_cmd: written by the guest to ACK to the host about the
> > > > commands that have been received. The host will clear the corresponding
> > > > bits on the host2guest_cmd register. The guest also uses this register
> > > > to send commands to the host (e.g. when finish free page reporting).
> > > > - free_page_cmd_id: the sequence id of the free page report command
> > > > given by the host.
> > > > 
> > > > Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> > > > Signed-off-by: Liang Li <liang.z.li@intel.com>
> > > > Cc: Michael S. Tsirkin <mst@redhat.com>
> > > > Cc: Michal Hocko <mhocko@kernel.org>
> > > > ---
> > > > 
> > > > +
> > > > +static void report_free_page(struct work_struct *work)
> > > > +{
> > > > +    struct virtio_balloon *vb;
> > > > +
> > > > +    vb = container_of(work, struct virtio_balloon,
> > > > report_free_page_work);
> > > > +    report_free_page_cmd_id(vb);
> > > > +    walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
> > > > +    /*
> > > > +     * The last few free page blocks that were added may not reach the
> > > > +     * batch size, but need a kick to notify the device to
> > > > handle them.
> > > > +     */
> > > > +    virtqueue_kick(vb->free_page_vq);
> > > > +    report_free_page_end(vb);
> > > > +}
> > > > +
> > > I think there's an issue here: if pages are poisoned and hypervisor
> > > subsequently drops them, testing them after allocation will
> > > trigger a false positive.
> > > 
> > > The specific configuration:
> > > 
> > > PAGE_POISONING on
> > > PAGE_POISONING_NO_SANITY off
> > > PAGE_POISONING_ZERO off
> > > 
> > > 
> > > Solutions:
> > > 1. disable the feature in that configuration
> > >     suggested as an initial step
> > 
> > Thanks for the finding.
> > Similar to this option: I'm thinking could we make walk_free_mem_block()
> > simply return if that option is on?
> > That is, at the beginning of the function:
> >     if (!page_poisoning_enabled())
> >                 return;
> > 
> 
> 
> Thought about it more, I think it would be better to put this logic to
> virtio_balloon:
> 
>         send_free_page_cmd_id(vb, &vb->start_cmd_id);
>         if (page_poisoning_enabled() &&
>             !IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
>                 walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
>         send_free_page_cmd_id(vb, &vb->stop_cmd_id);
> 
> 
> walk_free_mem_block() should be a more generic API, and this potential page
> poisoning issue is specific to live migration which is only one use case of
> this function, so I think it is better to handle it in the special use case
> itself.
> 
> Best,
> Wei
> 

It's a quick work-around but it doesn't make me very happy.

AFAIK e.g. RHEL has a debug kernel with poisoning enabled.
If this never uses free page hinting at all, it will
be much less useful for debugging guests.
Michael S. Tsirkin Nov. 17, 2017, 1:18 p.m. UTC | #13
On Thu, Nov 16, 2017 at 09:27:24PM +0800, Wei Wang wrote:
> On 11/16/2017 04:32 AM, Michael S. Tsirkin wrote:
> > On Fri, Nov 03, 2017 at 04:13:06PM +0800, Wei Wang wrote:
> > > Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature indicates the
> > > support of reporting hints of guest free pages to the host via
> > > virtio-balloon. The host requests the guest to report the free pages by
> > > sending commands via the virtio-balloon configuration registers.
> > > 
> > > When the guest starts to report, the first element added to the free page
> > > vq is a sequence id of the start reporting command. The id is given by
> > > the host, and it indicates whether the following free pages correspond
> > > to the command. For example, the host may stop the report and start again
> > > with a new command id. The obsolete pages for the previous start command
> > > can be detected by the id dismatching on the host. The id is added to the
> > > vq using an output buffer, and the free pages are added to the vq using
> > > input buffer.
> > > 
> > > Here are some explainations about the added configuration registers:
> > > - host2guest_cmd: a register used by the host to send commands to the
> > > guest.
> > > - guest2host_cmd: written by the guest to ACK to the host about the
> > > commands that have been received. The host will clear the corresponding
> > > bits on the host2guest_cmd register. The guest also uses this register
> > > to send commands to the host (e.g. when finish free page reporting).
> > > - free_page_cmd_id: the sequence id of the free page report command
> > > given by the host.
> > > 
> > > Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> > > Signed-off-by: Liang Li <liang.z.li@intel.com>
> > > Cc: Michael S. Tsirkin <mst@redhat.com>
> > > Cc: Michal Hocko <mhocko@kernel.org>
> > > ---
> > > 
> > > +
> > > +static void report_free_page(struct work_struct *work)
> > > +{
> > > +	struct virtio_balloon *vb;
> > > +
> > > +	vb = container_of(work, struct virtio_balloon, report_free_page_work);
> > > +	report_free_page_cmd_id(vb);
> > > +	walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
> > > +	/*
> > > +	 * The last few free page blocks that were added may not reach the
> > > +	 * batch size, but need a kick to notify the device to handle them.
> > > +	 */
> > > +	virtqueue_kick(vb->free_page_vq);
> > > +	report_free_page_end(vb);
> > > +}
> > > +
> > I think there's an issue here: if pages are poisoned and hypervisor
> > subsequently drops them, testing them after allocation will
> > trigger a false positive.
> > 
> > The specific configuration:
> > 
> > PAGE_POISONING on
> > PAGE_POISONING_NO_SANITY off
> > PAGE_POISONING_ZERO off
> > 
> > 
> > Solutions:
> > 1. disable the feature in that configuration
> > 	suggested as an initial step
> 
> Thanks for the finding.
> Similar to this option: I'm thinking could we make walk_free_mem_block()
> simply return if that option is on?
> That is, at the beginning of the function:
>     if (!page_poisoning_enabled())
>                 return;
> 
> I think in most usages, people would not choose to use the poisoning option
> due to the added overhead.
> 
> 
> Probably we could make it a separate fix patch of this report following
> patch 5 to explain the above reasons in the commit.
> 
> > 2. pass poison value to host so it can validate page content
> >     before it drops it
> > 3. pass poison value to host so it can init allocated pages with that value
> > 
> > In fact one nice side effect would be that unmap
> > becomes safe even though free list is not locked anymore.
> 
> I haven't got this point yet,  how would it bring performance benefit?

Upon getting a free page, the host could check that its content
matches the poison value. If it doesn't, the page has been used.

But let's ignore this for now.

> > It would be interesting to see whether this last has
> > any value performance-wise.
> > 
> 
> Best,
> Wei
Wang, Wei W Nov. 18, 2017, 5:22 a.m. UTC | #14
On Friday, November 17, 2017 8:45 PM, Michael S. Tsirkin wrote:
> On Fri, Nov 17, 2017 at 07:35:03PM +0800, Wei Wang wrote:
> > On 11/16/2017 09:27 PM, Wei Wang wrote:
> > > On 11/16/2017 04:32 AM, Michael S. Tsirkin wrote:
> > > > On Fri, Nov 03, 2017 at 04:13:06PM +0800, Wei Wang wrote:
> > > > > Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature
> > > > > indicates the support of reporting hints of guest free pages to
> > > > > the host via virtio-balloon. The host requests the guest to
> > > > > report the free pages by sending commands via the virtio-balloon
> configuration registers.
> > > > >
> > > > > When the guest starts to report, the first element added to the
> > > > > free page vq is a sequence id of the start reporting command.
> > > > > The id is given by the host, and it indicates whether the
> > > > > following free pages correspond to the command. For example, the
> > > > > host may stop the report and start again with a new command id.
> > > > > The obsolete pages for the previous start command can be
> > > > > detected by the id dismatching on the host. The id is added to
> > > > > the vq using an output buffer, and the free pages are added to
> > > > > the vq using input buffer.
> > > > >
> > > > > Here are some explainations about the added configuration registers:
> > > > > - host2guest_cmd: a register used by the host to send commands
> > > > > to the guest.
> > > > > - guest2host_cmd: written by the guest to ACK to the host about
> > > > > the commands that have been received. The host will clear the
> > > > > corresponding bits on the host2guest_cmd register. The guest
> > > > > also uses this register to send commands to the host (e.g. when finish
> free page reporting).
> > > > > - free_page_cmd_id: the sequence id of the free page report
> > > > > command given by the host.
> > > > >
> > > > > Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> > > > > Signed-off-by: Liang Li <liang.z.li@intel.com>
> > > > > Cc: Michael S. Tsirkin <mst@redhat.com>
> > > > > Cc: Michal Hocko <mhocko@kernel.org>
> > > > > ---
> > > > >
> > > > > +
> > > > > +static void report_free_page(struct work_struct *work) {
> > > > > +    struct virtio_balloon *vb;
> > > > > +
> > > > > +    vb = container_of(work, struct virtio_balloon,
> > > > > report_free_page_work);
> > > > > +    report_free_page_cmd_id(vb);
> > > > > +    walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
> > > > > +    /*
> > > > > +     * The last few free page blocks that were added may not reach the
> > > > > +     * batch size, but need a kick to notify the device to
> > > > > handle them.
> > > > > +     */
> > > > > +    virtqueue_kick(vb->free_page_vq);
> > > > > +    report_free_page_end(vb);
> > > > > +}
> > > > > +
> > > > I think there's an issue here: if pages are poisoned and
> > > > hypervisor subsequently drops them, testing them after allocation
> > > > will trigger a false positive.
> > > >
> > > > The specific configuration:
> > > >
> > > > PAGE_POISONING on
> > > > PAGE_POISONING_NO_SANITY off
> > > > PAGE_POISONING_ZERO off
> > > >
> > > >
> > > > Solutions:
> > > > 1. disable the feature in that configuration
> > > >     suggested as an initial step
> > >
> > > Thanks for the finding.
> > > Similar to this option: I'm thinking could we make
> > > walk_free_mem_block() simply return if that option is on?
> > > That is, at the beginning of the function:
> > >     if (!page_poisoning_enabled())
> > >                 return;
> > >
> >
> >
> > Thought about it more, I think it would be better to put this logic to
> > virtio_balloon:
> >
> >         send_free_page_cmd_id(vb, &vb->start_cmd_id);
> >         if (page_poisoning_enabled() &&
> >             !IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
> >                 walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
> >         send_free_page_cmd_id(vb, &vb->stop_cmd_id);
> >
> >
> > walk_free_mem_block() should be a more generic API, and this potential
> > page poisoning issue is specific to live migration which is only one
> > use case of this function, so I think it is better to handle it in the
> > special use case itself.
> >
> > Best,
> > Wei
> >
> 
> It's a quick work-around but it doesn't make me very happy.
> 
> AFAIK e.g. RHEL has a debug kernel with poisoning enabled.
> If this never uses free page hinting at all, it will be much less useful for
> debugging guests.
> 

I understand your concern. I think people who use debugging guests don't regard performance as the first priority, and most vendors usually wouldn't use debugging guests for their products.

How about taking it as the initial solution? We can explore more solutions after this series is done.

Best,
Wei
Michael S. Tsirkin Nov. 19, 2017, 3:11 p.m. UTC | #15
On Sat, Nov 18, 2017 at 05:22:28AM +0000, Wang, Wei W wrote:
> On Friday, November 17, 2017 8:45 PM, Michael S. Tsirkin wrote:
> > On Fri, Nov 17, 2017 at 07:35:03PM +0800, Wei Wang wrote:
> > > On 11/16/2017 09:27 PM, Wei Wang wrote:
> > > > On 11/16/2017 04:32 AM, Michael S. Tsirkin wrote:
> > > > > On Fri, Nov 03, 2017 at 04:13:06PM +0800, Wei Wang wrote:
> > > > > > Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_VQ feature
> > > > > > indicates the support of reporting hints of guest free pages to
> > > > > > the host via virtio-balloon. The host requests the guest to
> > > > > > report the free pages by sending commands via the virtio-balloon
> > configuration registers.
> > > > > >
> > > > > > When the guest starts to report, the first element added to the
> > > > > > free page vq is a sequence id of the start reporting command.
> > > > > > The id is given by the host, and it indicates whether the
> > > > > > following free pages correspond to the command. For example, the
> > > > > > host may stop the report and start again with a new command id.
> > > > > > The obsolete pages for the previous start command can be
> > > > > > detected by the id dismatching on the host. The id is added to
> > > > > > the vq using an output buffer, and the free pages are added to
> > > > > > the vq using input buffer.
> > > > > >
> > > > > > Here are some explainations about the added configuration registers:
> > > > > > - host2guest_cmd: a register used by the host to send commands
> > > > > > to the guest.
> > > > > > - guest2host_cmd: written by the guest to ACK to the host about
> > > > > > the commands that have been received. The host will clear the
> > > > > > corresponding bits on the host2guest_cmd register. The guest
> > > > > > also uses this register to send commands to the host (e.g. when finish
> > free page reporting).
> > > > > > - free_page_cmd_id: the sequence id of the free page report
> > > > > > command given by the host.
> > > > > >
> > > > > > Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> > > > > > Signed-off-by: Liang Li <liang.z.li@intel.com>
> > > > > > Cc: Michael S. Tsirkin <mst@redhat.com>
> > > > > > Cc: Michal Hocko <mhocko@kernel.org>
> > > > > > ---
> > > > > >
> > > > > > +
> > > > > > +static void report_free_page(struct work_struct *work) {
> > > > > > +    struct virtio_balloon *vb;
> > > > > > +
> > > > > > +    vb = container_of(work, struct virtio_balloon,
> > > > > > report_free_page_work);
> > > > > > +    report_free_page_cmd_id(vb);
> > > > > > +    walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
> > > > > > +    /*
> > > > > > +     * The last few free page blocks that were added may not reach the
> > > > > > +     * batch size, but need a kick to notify the device to
> > > > > > handle them.
> > > > > > +     */
> > > > > > +    virtqueue_kick(vb->free_page_vq);
> > > > > > +    report_free_page_end(vb);
> > > > > > +}
> > > > > > +
> > > > > I think there's an issue here: if pages are poisoned and
> > > > > hypervisor subsequently drops them, testing them after allocation
> > > > > will trigger a false positive.
> > > > >
> > > > > The specific configuration:
> > > > >
> > > > > PAGE_POISONING on
> > > > > PAGE_POISONING_NO_SANITY off
> > > > > PAGE_POISONING_ZERO off
> > > > >
> > > > >
> > > > > Solutions:
> > > > > 1. disable the feature in that configuration
> > > > >     suggested as an initial step
> > > >
> > > > Thanks for the finding.
> > > > Similar to this option: I'm thinking could we make
> > > > walk_free_mem_block() simply return if that option is on?
> > > > That is, at the beginning of the function:
> > > >     if (!page_poisoning_enabled())
> > > >                 return;
> > > >
> > >
> > >
> > > Thought about it more, I think it would be better to put this logic to
> > > virtio_balloon:
> > >
> > >         send_free_page_cmd_id(vb, &vb->start_cmd_id);
> > >         if (page_poisoning_enabled() &&
> > >             !IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
> > >                 walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
> > >         send_free_page_cmd_id(vb, &vb->stop_cmd_id);
> > >
> > >
> > > walk_free_mem_block() should be a more generic API, and this potential
> > > page poisoning issue is specific to live migration which is only one
> > > use case of this function, so I think it is better to handle it in the
> > > special use case itself.
> > >
> > > Best,
> > > Wei
> > >
> > 
> > It's a quick work-around but it doesn't make me very happy.
> > 
> > AFAIK e.g. RHEL has a debug kernel with poisoning enabled.
> > If this never uses free page hinting at all, it will be much less useful for
> > debugging guests.
> > 
> 
> I understand your concern. I think people who use debugging guests
> don't regard performance as the first priority, and most vendors
> usually wouldn't use debugging guests for their products.

And when one of these crashes, but only after migration, what do you do?
A very common step for Red Hat support is to ask people to try
reproducing with a debug build.

IOW, being able to debug guests is important; if a debugging guest takes
a significantly different path from a non-debug one, we have a problem.

> 
> How about taking it as the initial solution? We can exploit more
> solutions after this series is done.
> 
> Best,
> Wei

I think it's fine as a separate patch.
Wang, Wei W Nov. 20, 2017, 11:42 a.m. UTC | #16
On 11/14/2017 01:32 AM, Michael S. Tsirkin wrote:
> You should Cc Nitesh who is working on a related feature.

OK, I'll do that. There are two more issues that haven't been discussed
yet; please check them below.

>
> On Mon, Nov 13, 2017 at 06:34:48PM +0800, Wei Wang wrote:
>> Ping for comments, thanks.
>>
>> On 11/03/2017 04:13 PM, Wei Wang wrote:
>>> +static void virtballoon_cmd_report_free_page_start(struct virtio_balloon *vb)
>>> +{
>>> +	unsigned long flags;
>>> +
>>> +	vb->report_free_page_stop = false;
> this flag is used a lot outside any locks. Why is this safe?
> Please add some comments explaining access to this flag.

I will invert the logic as suggested (vb->report_free_page). I also plan
to simplify its usage as below.

The flag is set or cleared in the config handler according to the
new_cmd_id given by the host:

new_cmd_id == 0:          WRITE_ONCE(vb->report_free_page, false); // stop reporting
new_cmd_id != old_cmd_id: WRITE_ONCE(vb->report_free_page, true);  // start reporting


The flag is read by virtio_balloon_send_free_pages() - the callback to 
report free pages:

if (!READ_ONCE(vb->report_free_page))
                 return false;

I don't find where it could be unsafe then (the flag is written by the 
config handler only).



>
>>> +}
>>> +
>>>    static inline s64 towards_target(struct virtio_balloon *vb)
>>>    {
>>>    	s64 target;
>>> @@ -597,42 +673,147 @@ static void update_balloon_size_func(struct work_struct *work)
>>>    		queue_work(system_freezable_wq, work);
>>>    }
>>> -static int init_vqs(struct virtio_balloon *vb)
>>> +static bool virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
>>> +					   unsigned long nr_pages)
>>>    {
>>> -	struct virtqueue *vqs[3];
>>> -	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
>>> -	static const char * const names[] = { "inflate", "deflate", "stats" };
>>> -	int err, nvqs;
>>> +	struct virtio_balloon *vb = (struct virtio_balloon *)opaque;
>>> +	void *addr = (void *)pfn_to_kaddr(pfn);
> How do we know all free pages have a kaddr?

For x86_64, it works well since the kernel has all the physical memory
mapped already. But for a 32-bit kernel, yes, high memory usually
isn't mapped and thus has no kaddr. Essentially, this pfn_to_kaddr
conversion isn't necessary; we do it here because the current virtio API
is based on "struct scatterlist", which takes a kaddr, and this kaddr is
then converted back to a physical address in virtqueue_add() when it is
assigned to desc->addr.

I think a better solution would be to add a new API, which directly 
assigns the caller's guest physical address to desc->addr, similar to 
the previous implementation "add_one_chunk()" 
(https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg02452.html). 
But we can change that to a general virtio API:
virtqueue_add_one_desc(struct virtqueue *_vq, u64 base_addr, u32 size, 
bool in_desc, void *data);

What do you think?

Best,
Wei
diff mbox series

Patch

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index b31fc25..4087f04 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -55,7 +55,12 @@  static struct vfsmount *balloon_mnt;
 
 struct virtio_balloon {
 	struct virtio_device *vdev;
-	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
+
+	/* Balloon's own wq for cpu-intensive work items */
+	struct workqueue_struct *balloon_wq;
+	/* The free page reporting work item submitted to the balloon wq */
+	struct work_struct report_free_page_work;
 
 	/* The balloon servicing is delegated to a freezable workqueue. */
 	struct work_struct update_balloon_stats_work;
@@ -65,6 +70,10 @@  struct virtio_balloon {
 	spinlock_t stop_update_lock;
 	bool stop_update;
 
+	/* Stop reporting free pages */
+	bool report_free_page_stop;
+	uint32_t free_page_cmd_id;
+
 	/* Waiting for host to ack the pages we released. */
 	wait_queue_head_t acked;
 
@@ -191,6 +200,30 @@  static void send_balloon_page_sg(struct virtio_balloon *vb,
 		kick_and_wait(vq, vb->acked);
 }
 
+static void send_free_page_sg(struct virtqueue *vq, void *addr, uint32_t size)
+{
+	int err = 0;
+	unsigned int len;
+
+	/* Detach all the used buffers from the vq */
+	while (virtqueue_get_buf(vq, &len))
+		;
+
+	/*
+	 * Since this is an optimization feature, losing a couple of free
+	 * pages to report isn't important. We simply return without adding
+	 * the page if the vq is full.
+	 */
+	if (vq->num_free) {
+		err = add_one_sg(vq, addr, size);
+		BUG_ON(err);
+	}
+
+	/* Batch till the vq is full */
+	if (!vq->num_free)
+		virtqueue_kick(vq);
+}
+
 /*
  * Send balloon pages in sgs to host. The balloon pages are recorded in the
  * page xbitmap. Each bit in the bitmap corresponds to a page of PAGE_SIZE.
@@ -495,9 +528,8 @@  static void stats_handle_request(struct virtio_balloon *vb)
 	virtqueue_kick(vq);
 }
 
-static void virtballoon_changed(struct virtio_device *vdev)
+static void virtballoon_cmd_balloon_memory(struct virtio_balloon *vb)
 {
-	struct virtio_balloon *vb = vdev->priv;
 	unsigned long flags;
 
 	spin_lock_irqsave(&vb->stop_update_lock, flags);
@@ -506,6 +538,50 @@  static void virtballoon_changed(struct virtio_device *vdev)
 	spin_unlock_irqrestore(&vb->stop_update_lock, flags);
 }
 
+static void virtballoon_cmd_report_free_page_start(struct virtio_balloon *vb)
+{
+	unsigned long flags;
+
+	vb->report_free_page_stop = false;
+	spin_lock_irqsave(&vb->stop_update_lock, flags);
+	if (!vb->stop_update)
+		queue_work(vb->balloon_wq, &vb->report_free_page_work);
+	spin_unlock_irqrestore(&vb->stop_update_lock, flags);
+}
+
+static void virtballoon_changed(struct virtio_device *vdev)
+{
+	struct virtio_balloon *vb = vdev->priv;
+	u32 host2guest_cmd, guest2host_cmd = 0;
+
+	if (!virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
+		virtballoon_cmd_balloon_memory(vb);
+		return;
+	}
+
+	virtio_cread(vb->vdev, struct virtio_balloon_config, host2guest_cmd,
+		     &host2guest_cmd);
+
+	if (host2guest_cmd & VIRTIO_BALLOON_CMD_BALLOON_MEMORY) {
+		virtballoon_cmd_balloon_memory(vb);
+		guest2host_cmd |= VIRTIO_BALLOON_CMD_BALLOON_MEMORY;
+	}
+
+	if (host2guest_cmd & VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_START) {
+		virtballoon_cmd_report_free_page_start(vb);
+		guest2host_cmd |= VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_START;
+	}
+
+	if (host2guest_cmd & VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP) {
+		vb->report_free_page_stop = true;
+		guest2host_cmd |= VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP;
+	}
+
+	/* Ack to the host about the commands that have been received */
+	virtio_cwrite(vb->vdev, struct virtio_balloon_config, guest2host_cmd,
+		      &guest2host_cmd);
+}
+
 static inline s64 towards_target(struct virtio_balloon *vb)
 {
 	s64 target;
@@ -597,42 +673,147 @@  static void update_balloon_size_func(struct work_struct *work)
 		queue_work(system_freezable_wq, work);
 }
 
-static int init_vqs(struct virtio_balloon *vb)
+static bool virtio_balloon_send_free_pages(void *opaque, unsigned long pfn,
+					   unsigned long nr_pages)
 {
-	struct virtqueue *vqs[3];
-	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
-	static const char * const names[] = { "inflate", "deflate", "stats" };
-	int err, nvqs;
+	struct virtio_balloon *vb = (struct virtio_balloon *)opaque;
+	void *addr = (void *)pfn_to_kaddr(pfn);
+	uint32_t len = nr_pages << PAGE_SHIFT;
+
+	if (vb->report_free_page_stop)
+		return false;
+
+	send_free_page_sg(vb->free_page_vq, addr, len);
 
+	return true;
+}
+
+static void report_free_page_end(struct virtio_balloon *vb)
+{
+	u32 cmd = VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP;
 	/*
-	 * We expect two virtqueues: inflate and deflate, and
-	 * optionally stat.
+	 * The host may have already requested to stop the reporting before we
+	 * finish, so no need to notify the host in this case.
 	 */
-	nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
-	err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL);
+	if (vb->report_free_page_stop)
+		return;
+	vb->report_free_page_stop = true;
+
+	virtio_cwrite(vb->vdev, struct virtio_balloon_config, guest2host_cmd,
+		      &cmd);
+}
+
+static void report_free_page_cmd_id(struct virtio_balloon *vb)
+{
+	struct scatterlist sg;
+	int err;
+
+	virtio_cread(vb->vdev, struct virtio_balloon_config, free_page_cmd_id,
+		     &vb->free_page_cmd_id);
+	sg_init_one(&sg, &vb->free_page_cmd_id, sizeof(uint32_t));
+	err = virtqueue_add_outbuf(vb->free_page_vq, &sg, 1,
+				   &vb->free_page_cmd_id, GFP_KERNEL);
+	BUG_ON(err);
+}
+
+static void report_free_page(struct work_struct *work)
+{
+	struct virtio_balloon *vb;
+
+	vb = container_of(work, struct virtio_balloon, report_free_page_work);
+	report_free_page_cmd_id(vb);
+	walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
+	/*
+	 * The last few free page blocks that were added may not reach the
+	 * batch size, but need a kick to notify the device to handle them.
+	 */
+	virtqueue_kick(vb->free_page_vq);
+	report_free_page_end(vb);
+}
+
+static int init_vqs(struct virtio_balloon *vb)
+{
+	struct virtqueue **vqs;
+	vq_callback_t **callbacks;
+	const char **names;
+	struct scatterlist sg;
+	int i, nvqs, err = -ENOMEM;
+
+	/* Inflateq and deflateq are used unconditionally */
+	nvqs = 2;
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ))
+		nvqs++;
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ))
+		nvqs++;
+
+	/* Allocate space for find_vqs parameters */
+	vqs = kcalloc(nvqs, sizeof(*vqs), GFP_KERNEL);
+	if (!vqs)
+		goto err_vq;
+	callbacks = kmalloc_array(nvqs, sizeof(*callbacks), GFP_KERNEL);
+	if (!callbacks)
+		goto err_callback;
+	names = kmalloc_array(nvqs, sizeof(*names), GFP_KERNEL);
+	if (!names)
+		goto err_names;
+
+	callbacks[0] = balloon_ack;
+	names[0] = "inflate";
+	callbacks[1] = balloon_ack;
+	names[1] = "deflate";
+
+	i = 2;
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
+		callbacks[i] = stats_request;
+		names[i] = "stats";
+		i++;
+	}
+
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
+		callbacks[i] = NULL;
+		names[i] = "free_page_vq";
+	}
+
+	err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names,
+					 NULL, NULL);
 	if (err)
-		return err;
+		goto err_find;
 
 	vb->inflate_vq = vqs[0];
 	vb->deflate_vq = vqs[1];
+	i = 2;
 	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
-		struct scatterlist sg;
-		unsigned int num_stats;
-		vb->stats_vq = vqs[2];
-
+		vb->stats_vq = vqs[i++];
 		/*
 		 * Prime this virtqueue with one buffer so the hypervisor can
 		 * use it to signal us later (it can't be broken yet!).
 		 */
-		num_stats = update_balloon_stats(vb);
-
-		sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
+		sg_init_one(&sg, vb->stats, sizeof(vb->stats));
 		if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
-		    < 0)
-			BUG();
+		    < 0) {
+			dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
+				 __func__);
+			goto err_find;
+		}
 		virtqueue_kick(vb->stats_vq);
 	}
+
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ))
+		vb->free_page_vq = vqs[i];
+
+	kfree(names);
+	kfree(callbacks);
+	kfree(vqs);
 	return 0;
+
+err_find:
+	kfree(names);
+err_names:
+	kfree(callbacks);
+err_callback:
+	kfree(vqs);
+err_vq:
+	return err;
 }
 
 #ifdef CONFIG_BALLOON_COMPACTION
@@ -761,6 +942,13 @@  static int virtballoon_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_SG))
 		xb_init(&vb->page_xb);
 
+	if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_VQ)) {
+		vb->balloon_wq = alloc_workqueue("balloon-wq",
+					WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0);
+		INIT_WORK(&vb->report_free_page_work, report_free_page);
+		vb->report_free_page_stop = true;
+	}
+
 	vb->nb.notifier_call = virtballoon_oom_notify;
 	vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
 	err = register_oom_notifier(&vb->nb);
@@ -825,6 +1013,7 @@  static void virtballoon_remove(struct virtio_device *vdev)
 	spin_unlock_irq(&vb->stop_update_lock);
 	cancel_work_sync(&vb->update_balloon_size_work);
 	cancel_work_sync(&vb->update_balloon_stats_work);
+	cancel_work_sync(&vb->report_free_page_work);
 
 	remove_common(vb);
 #ifdef CONFIG_BALLOON_COMPACTION
@@ -878,6 +1067,7 @@  static unsigned int features[] = {
 	VIRTIO_BALLOON_F_STATS_VQ,
 	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
 	VIRTIO_BALLOON_F_SG,
+	VIRTIO_BALLOON_F_FREE_PAGE_VQ,
 };
 
 static struct virtio_driver virtio_balloon_driver = {
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index 37780a7..b758484 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -35,15 +35,26 @@ 
 #define VIRTIO_BALLOON_F_STATS_VQ	1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM	2 /* Deflate balloon on OOM */
 #define VIRTIO_BALLOON_F_SG		3 /* Use sg instead of PFN lists */
+#define VIRTIO_BALLOON_F_FREE_PAGE_VQ	4 /* VQ to report free pages */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
 
+#define	VIRTIO_BALLOON_CMD_BALLOON_MEMORY		(1 << 0)
+#define	VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_START	(1 << 1)
+#define	VIRTIO_BALLOON_CMD_REPORT_FREE_PAGE_STOP	(1 << 2)
+
 struct virtio_balloon_config {
 	/* Number of pages host wants Guest to give up. */
 	__u32 num_pages;
 	/* Number of pages we've actually got in balloon. */
 	__u32 actual;
+	/* Host-to-guest command, readonly by guest */
+	__u32 host2guest_cmd;
+	/* Sequence id of the free_page report command, readonly by guest */
+	__u32 free_page_cmd_id;
+	/* Guest-to-host command */
+	__u32 guest2host_cmd;
 };
 
 #define VIRTIO_BALLOON_S_SWAP_IN  0   /* Amount of memory swapped in */