diff mbox

[kernel,v5,5/5] virtio-balloon: tell host vm's unused page info

Message ID 1480495397-23225-6-git-send-email-liang.z.li@intel.com
State New
Headers show

Commit Message

Li, Liang Z Nov. 30, 2016, 8:43 a.m. UTC
This patch contains two parts:

One is to add a new API to mm go get the unused page information.
The virtio balloon driver will use this new API added to get the
unused page info and send it to hypervisor(QEMU) to speed up live
migration. During sending the bitmap, some the pages may be modified
and are used by the guest, this inaccuracy can be corrected by the
dirty page logging mechanism.

One is to add support the request for vm's unused page information,
QEMU can make use of unused page information and the dirty page
logging mechanism to skip the transportation of some of these unused
pages, this is very helpful to reduce the network traffic and speed
up the live migration process.

Signed-off-by: Liang Li <liang.z.li@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Cornelia Huck <cornelia.huck@de.ibm.com>
Cc: Amit Shah <amit.shah@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
---
 drivers/virtio/virtio_balloon.c | 126 +++++++++++++++++++++++++++++++++++++---
 include/linux/mm.h              |   3 +-
 mm/page_alloc.c                 |  72 +++++++++++++++++++++++
 3 files changed, 193 insertions(+), 8 deletions(-)

Comments

Dave Hansen Nov. 30, 2016, 7:15 p.m. UTC | #1
On 11/30/2016 12:43 AM, Liang Li wrote:
> +static void send_unused_pages_info(struct virtio_balloon *vb,
> +				unsigned long req_id)
> +{
> +	struct scatterlist sg_in;
> +	unsigned long pos = 0;
> +	struct virtqueue *vq = vb->req_vq;
> +	struct virtio_balloon_resp_hdr *hdr = vb->resp_hdr;
> +	int ret, order;
> +
> +	mutex_lock(&vb->balloon_lock);
> +
> +	for (order = MAX_ORDER - 1; order >= 0; order--) {

I scratched my head for a bit on this one.  Why are you walking over
orders, *then* zones.  I *think* you're doing it because you can
efficiently fill the bitmaps at a given order for all zones, then move
to a new bitmap.  But, it would be interesting to document this.

> +		pos = 0;
> +		ret = get_unused_pages(vb->resp_data,
> +			 vb->resp_buf_size / sizeof(unsigned long),
> +			 order, &pos);

FWIW, get_unsued_pages() is a pretty bad name.  "get" usually implies
bumping reference counts or consuming something.  You're just
"recording" or "marking" them.

> +		if (ret == -ENOSPC) {
> +			void *new_resp_data;
> +
> +			new_resp_data = kmalloc(2 * vb->resp_buf_size,
> +						GFP_KERNEL);
> +			if (new_resp_data) {
> +				kfree(vb->resp_data);
> +				vb->resp_data = new_resp_data;
> +				vb->resp_buf_size *= 2;

What happens to the data in ->resp_data at this point?  Doesn't this
just throw it away?

...
> +struct page_info_item {
> +	__le64 start_pfn : 52; /* start pfn for the bitmap */
> +	__le64 page_shift : 6; /* page shift width, in bytes */
> +	__le64 bmap_len : 6;  /* bitmap length, in bytes */
> +};

Is 'bmap_len' too short?  a 64-byte buffer is a bit tiny.  Right?

> +static int  mark_unused_pages(struct zone *zone,
> +		unsigned long *unused_pages, unsigned long size,
> +		int order, unsigned long *pos)
> +{
> +	unsigned long pfn, flags;
> +	unsigned int t;
> +	struct list_head *curr;
> +	struct page_info_item *info;
> +
> +	if (zone_is_empty(zone))
> +		return 0;
> +
> +	spin_lock_irqsave(&zone->lock, flags);
> +
> +	if (*pos + zone->free_area[order].nr_free > size)
> +		return -ENOSPC;

Urg, so this won't partially fill?  So, what the nr_free pages limit
where we no longer fit in the kmalloc()'d buffer where this simply won't
work?

> +	for (t = 0; t < MIGRATE_TYPES; t++) {
> +		list_for_each(curr, &zone->free_area[order].free_list[t]) {
> +			pfn = page_to_pfn(list_entry(curr, struct page, lru));
> +			info = (struct page_info_item *)(unused_pages + *pos);
> +			info->start_pfn = pfn;
> +			info->page_shift = order + PAGE_SHIFT;
> +			*pos += 1;
> +		}
> +	}

Do we need to fill in ->bmap_len here?
Li, Liang Z Dec. 4, 2016, 1:13 p.m. UTC | #2
> On 11/30/2016 12:43 AM, Liang Li wrote:
> > +static void send_unused_pages_info(struct virtio_balloon *vb,
> > +				unsigned long req_id)
> > +{
> > +	struct scatterlist sg_in;
> > +	unsigned long pos = 0;
> > +	struct virtqueue *vq = vb->req_vq;
> > +	struct virtio_balloon_resp_hdr *hdr = vb->resp_hdr;
> > +	int ret, order;
> > +
> > +	mutex_lock(&vb->balloon_lock);
> > +
> > +	for (order = MAX_ORDER - 1; order >= 0; order--) {
> 
> I scratched my head for a bit on this one.  Why are you walking over orders,
> *then* zones.  I *think* you're doing it because you can efficiently fill the
> bitmaps at a given order for all zones, then move to a new bitmap.  But, it
> would be interesting to document this.
> 

Yes, use the order is somewhat strange, but it's helpful to keep the API simple. 
Do you think it's acceptable?

> > +		pos = 0;
> > +		ret = get_unused_pages(vb->resp_data,
> > +			 vb->resp_buf_size / sizeof(unsigned long),
> > +			 order, &pos);
> 
> FWIW, get_unsued_pages() is a pretty bad name.  "get" usually implies
> bumping reference counts or consuming something.  You're just "recording"
> or "marking" them.
> 

Will change to mark_unused_pages().

> > +		if (ret == -ENOSPC) {
> > +			void *new_resp_data;
> > +
> > +			new_resp_data = kmalloc(2 * vb->resp_buf_size,
> > +						GFP_KERNEL);
> > +			if (new_resp_data) {
> > +				kfree(vb->resp_data);
> > +				vb->resp_data = new_resp_data;
> > +				vb->resp_buf_size *= 2;
> 
> What happens to the data in ->resp_data at this point?  Doesn't this just
> throw it away?
> 

Yes, so we should make sure the data in resp_data is not inuse.

> ...
> > +struct page_info_item {
> > +	__le64 start_pfn : 52; /* start pfn for the bitmap */
> > +	__le64 page_shift : 6; /* page shift width, in bytes */
> > +	__le64 bmap_len : 6;  /* bitmap length, in bytes */ };
> 
> Is 'bmap_len' too short?  a 64-byte buffer is a bit tiny.  Right?
> 

Currently, we just use the 8 bytes and 0 bytes bitmap, should we support more than 64 bytes?

> > +static int  mark_unused_pages(struct zone *zone,
> > +		unsigned long *unused_pages, unsigned long size,
> > +		int order, unsigned long *pos)
> > +{
> > +	unsigned long pfn, flags;
> > +	unsigned int t;
> > +	struct list_head *curr;
> > +	struct page_info_item *info;
> > +
> > +	if (zone_is_empty(zone))
> > +		return 0;
> > +
> > +	spin_lock_irqsave(&zone->lock, flags);
> > +
> > +	if (*pos + zone->free_area[order].nr_free > size)
> > +		return -ENOSPC;
> 
> Urg, so this won't partially fill?  So, what the nr_free pages limit where we no
> longer fit in the kmalloc()'d buffer where this simply won't work?
> 

Yes.  My initial implementation is partially fill, it's better for the worst case.
I thought the above code is more efficient for most case ...
Do you think partially fill the bitmap is better?
 
> > +	for (t = 0; t < MIGRATE_TYPES; t++) {
> > +		list_for_each(curr, &zone->free_area[order].free_list[t]) {
> > +			pfn = page_to_pfn(list_entry(curr, struct page, lru));
> > +			info = (struct page_info_item *)(unused_pages +
> *pos);
> > +			info->start_pfn = pfn;
> > +			info->page_shift = order + PAGE_SHIFT;
> > +			*pos += 1;
> > +		}
> > +	}
> 
> Do we need to fill in ->bmap_len here?

For integrity, the bmap_len should be filled, will add.
Omit this step just because QEMU assume the ->bmp_len is 0 and ignore this field.

Thanks for your comment!

Liang
Dave Hansen Dec. 5, 2016, 5:22 p.m. UTC | #3
On 12/04/2016 05:13 AM, Li, Liang Z wrote:
>> On 11/30/2016 12:43 AM, Liang Li wrote:
>>> +static void send_unused_pages_info(struct virtio_balloon *vb,
>>> +				unsigned long req_id)
>>> +{
>>> +	struct scatterlist sg_in;
>>> +	unsigned long pos = 0;
>>> +	struct virtqueue *vq = vb->req_vq;
>>> +	struct virtio_balloon_resp_hdr *hdr = vb->resp_hdr;
>>> +	int ret, order;
>>> +
>>> +	mutex_lock(&vb->balloon_lock);
>>> +
>>> +	for (order = MAX_ORDER - 1; order >= 0; order--) {
>>
>> I scratched my head for a bit on this one.  Why are you walking over orders,
>> *then* zones.  I *think* you're doing it because you can efficiently fill the
>> bitmaps at a given order for all zones, then move to a new bitmap.  But, it
>> would be interesting to document this.
> 
> Yes, use the order is somewhat strange, but it's helpful to keep the API simple. 
> Do you think it's acceptable?

Yeah, it's fine.  Just comment it, please.

>>> +		if (ret == -ENOSPC) {
>>> +			void *new_resp_data;
>>> +
>>> +			new_resp_data = kmalloc(2 * vb->resp_buf_size,
>>> +						GFP_KERNEL);
>>> +			if (new_resp_data) {
>>> +				kfree(vb->resp_data);
>>> +				vb->resp_data = new_resp_data;
>>> +				vb->resp_buf_size *= 2;
>>
>> What happens to the data in ->resp_data at this point?  Doesn't this just
>> throw it away?
> 
> Yes, so we should make sure the data in resp_data is not inuse.

But doesn't it have valid data that we just collected and haven't told
the hypervisor about yet?  Aren't we throwing away good data that cost
us something to collect?

>> ...
>>> +struct page_info_item {
>>> +	__le64 start_pfn : 52; /* start pfn for the bitmap */
>>> +	__le64 page_shift : 6; /* page shift width, in bytes */

What does a page_shift "in bytes" mean? :)

>>> +	__le64 bmap_len : 6;  /* bitmap length, in bytes */ };
>>
>> Is 'bmap_len' too short?  a 64-byte buffer is a bit tiny.  Right?
> 
> Currently, we just use the 8 bytes and 0 bytes bitmap, should we support more than 64 bytes?

It just means that with this format, you end up wasting at least ~1/8th
of the space with metadata.  That's a bit unfortunate, but I guess it's
not fatal.

I'd definitely call it out in the patch description and make sure other
folks take a look at it.

There's a somewhat easy fix, but that would make the qemu implementation
more complicated: You could just have bmap_len==0x3f imply that there's
another field that contains an extended bitmap length for when you need
long bitmaps.

But, as you note, there's no need for it, so it's a matter of trading
the extra complexity versus the desire to not habing to change the ABI
again for longer (hopefully).

>>> +static int  mark_unused_pages(struct zone *zone,
>>> +		unsigned long *unused_pages, unsigned long size,
>>> +		int order, unsigned long *pos)
>>> +{
>>> +	unsigned long pfn, flags;
>>> +	unsigned int t;
>>> +	struct list_head *curr;
>>> +	struct page_info_item *info;
>>> +
>>> +	if (zone_is_empty(zone))
>>> +		return 0;
>>> +
>>> +	spin_lock_irqsave(&zone->lock, flags);
>>> +
>>> +	if (*pos + zone->free_area[order].nr_free > size)
>>> +		return -ENOSPC;
>>
>> Urg, so this won't partially fill?  So, what the nr_free pages limit where we no
>> longer fit in the kmalloc()'d buffer where this simply won't work?
> 
> Yes.  My initial implementation is partially fill, it's better for the worst case.
> I thought the above code is more efficient for most case ...
> Do you think partially fill the bitmap is better?

Could you please answer the question I asked?

Because if you don't get this right, it could mean that there are system
that simply *fail* here.
Li, Liang Z Dec. 6, 2016, 4:47 a.m. UTC | #4
> >>> +	mutex_lock(&vb->balloon_lock);
> >>> +
> >>> +	for (order = MAX_ORDER - 1; order >= 0; order--) {
> >>
> >> I scratched my head for a bit on this one.  Why are you walking over
> >> orders,
> >> *then* zones.  I *think* you're doing it because you can efficiently
> >> fill the bitmaps at a given order for all zones, then move to a new
> >> bitmap.  But, it would be interesting to document this.
> >
> > Yes, use the order is somewhat strange, but it's helpful to keep the API simple.
> > Do you think it's acceptable?
> 
> Yeah, it's fine.  Just comment it, please.
> 
Good!

> >>> +		if (ret == -ENOSPC) {
> >>> +			void *new_resp_data;
> >>> +
> >>> +			new_resp_data = kmalloc(2 * vb->resp_buf_size,
> >>> +						GFP_KERNEL);
> >>> +			if (new_resp_data) {
> >>> +				kfree(vb->resp_data);
> >>> +				vb->resp_data = new_resp_data;
> >>> +				vb->resp_buf_size *= 2;
> >>
> >> What happens to the data in ->resp_data at this point?  Doesn't this
> >> just throw it away?
> >
> > Yes, so we should make sure the data in resp_data is not inuse.
> 
> But doesn't it have valid data that we just collected and haven't told the
> hypervisor about yet?  Aren't we throwing away good data that cost us
> something to collect?

Indeed.  Some filled data may exist for the previous zone. Should we
change the API to 
'int get_unused_pages(unsigned long *unused_pages, unsigned long size,
		int order, unsigned long *pos, struct zone *zone)' ?

then we can use the 'zone' to record the zone to retry and not discard the
filled data.

> >> ...
> >>> +struct page_info_item {
> >>> +	__le64 start_pfn : 52; /* start pfn for the bitmap */
> >>> +	__le64 page_shift : 6; /* page shift width, in bytes */
> 
> What does a page_shift "in bytes" mean? :)

Obviously, you know. :o
I will try to make it clear.

> 
> >>> +	__le64 bmap_len : 6;  /* bitmap length, in bytes */ };
> >>
> >> Is 'bmap_len' too short?  a 64-byte buffer is a bit tiny.  Right?
> >
> > Currently, we just use the 8 bytes and 0 bytes bitmap, should we support
> more than 64 bytes?
> 
> It just means that with this format, you end up wasting at least ~1/8th of the
> space with metadata.  That's a bit unfortunate, but I guess it's not fatal.
> 
> I'd definitely call it out in the patch description and make sure other folks take
> a look at it.

OK.

> 
> There's a somewhat easy fix, but that would make the qemu implementation
> more complicated: You could just have bmap_len==0x3f imply that there's
> another field that contains an extended bitmap length for when you need long
> bitmaps.
> 
> But, as you note, there's no need for it, so it's a matter of trading the extra
> complexity versus the desire to not habing to change the ABI again for longer
> (hopefully).
> 

Your suggestion still works without changing the current code, just reserve
 ' bmap_len==0x3f' for future extension, and it's not used by the current code.

> >>> +static int  mark_unused_pages(struct zone *zone,
> >>> +		unsigned long *unused_pages, unsigned long size,
> >>> +		int order, unsigned long *pos)
> >>> +{
> >>> +	unsigned long pfn, flags;
> >>> +	unsigned int t;
> >>> +	struct list_head *curr;
> >>> +	struct page_info_item *info;
> >>> +
> >>> +	if (zone_is_empty(zone))
> >>> +		return 0;
> >>> +
> >>> +	spin_lock_irqsave(&zone->lock, flags);
> >>> +
> >>> +	if (*pos + zone->free_area[order].nr_free > size)
> >>> +		return -ENOSPC;
> >>
> >> Urg, so this won't partially fill?  So, what the nr_free pages limit
> >> where we no longer fit in the kmalloc()'d buffer where this simply won't
> work?
> >
> > Yes.  My initial implementation is partially fill, it's better for the worst case.
> > I thought the above code is more efficient for most case ...
> > Do you think partially fill the bitmap is better?
> 
> Could you please answer the question I asked?
> 

For your question:
-------------------------------------------------------------------------------------------------------
>So, what the nr_free pages limit where we no longer fit in the kmalloc()'d buffer
> where this simply won't work?
------------------------------------------------------------------------------------------------------
No, if the buffer is not big enough to save 'nr_free'  pages, get_unused_pages() will return
'-ENOSPC', and the following code will try to allocate a 2x times size buffer for retrying,
until the proper size buffer is allocated. The current order will not be skipped unless the
buffer allocation failed.

> Because if you don't get this right, it could mean that there are system that
> simply *fail* here.
diff mbox

Patch

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index c3ddec3..2626cc0 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -56,7 +56,7 @@ 
 
 struct virtio_balloon {
 	struct virtio_device *vdev;
-	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+	struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *req_vq;
 
 	/* The balloon servicing is delegated to a freezable workqueue. */
 	struct work_struct update_balloon_stats_work;
@@ -75,6 +75,8 @@  struct virtio_balloon {
 	void *resp_hdr;
 	/* Pointer to the start address of response data. */
 	unsigned long *resp_data;
+	/* Size of response data buffer. */
+	unsigned long resp_buf_size;
 	/* Pointer offset of the response data. */
 	unsigned long resp_pos;
 	/* Bitmap and bitmap count used to tell the host the pages */
@@ -83,6 +85,8 @@  struct virtio_balloon {
 	unsigned int nr_page_bmap;
 	/* Used to record the processed pfn range */
 	unsigned long min_pfn, max_pfn, start_pfn, end_pfn;
+	/* Request header */
+	struct virtio_balloon_req_hdr req_hdr;
 	/*
 	 * The pages we've told the Host we're not using are enqueued
 	 * at vb_dev_info->pages list.
@@ -551,6 +555,58 @@  static void update_balloon_stats(struct virtio_balloon *vb)
 				pages_to_bytes(available));
 }
 
+static void send_unused_pages_info(struct virtio_balloon *vb,
+				unsigned long req_id)
+{
+	struct scatterlist sg_in;
+	unsigned long pos = 0;
+	struct virtqueue *vq = vb->req_vq;
+	struct virtio_balloon_resp_hdr *hdr = vb->resp_hdr;
+	int ret, order;
+
+	mutex_lock(&vb->balloon_lock);
+
+	for (order = MAX_ORDER - 1; order >= 0; order--) {
+		pos = 0;
+		ret = get_unused_pages(vb->resp_data,
+			 vb->resp_buf_size / sizeof(unsigned long),
+			 order, &pos);
+		if (ret == -ENOSPC) {
+			void *new_resp_data;
+
+			new_resp_data = kmalloc(2 * vb->resp_buf_size,
+						GFP_KERNEL);
+			if (new_resp_data) {
+				kfree(vb->resp_data);
+				vb->resp_data = new_resp_data;
+				vb->resp_buf_size *= 2;
+				order++;
+				continue;
+			} else
+				dev_warn(&vb->vdev->dev,
+					 "%s: omit some %d order pages\n",
+					 __func__, order);
+		}
+
+		if (pos > 0) {
+			vb->resp_pos = pos;
+			hdr->cmd = BALLOON_GET_UNUSED_PAGES;
+			hdr->id = req_id;
+			if (order > 0)
+				hdr->flag = BALLOON_FLAG_CONT;
+			else
+				hdr->flag = BALLOON_FLAG_DONE;
+
+			send_resp_data(vb, vq, true);
+		}
+	}
+
+	mutex_unlock(&vb->balloon_lock);
+	sg_init_one(&sg_in, &vb->req_hdr, sizeof(vb->req_hdr));
+	virtqueue_add_inbuf(vq, &sg_in, 1, &vb->req_hdr, GFP_KERNEL);
+	virtqueue_kick(vq);
+}
+
 /*
  * While most virtqueues communicate guest-initiated requests to the hypervisor,
  * the stats queue operates in reverse.  The driver initializes the virtqueue
@@ -685,18 +741,56 @@  static void update_balloon_size_func(struct work_struct *work)
 		queue_work(system_freezable_wq, work);
 }
 
+static void misc_handle_rq(struct virtio_balloon *vb)
+{
+	struct virtio_balloon_req_hdr *ptr_hdr;
+	unsigned int len;
+
+	ptr_hdr = virtqueue_get_buf(vb->req_vq, &len);
+	if (!ptr_hdr || len != sizeof(vb->req_hdr))
+		return;
+
+	switch (ptr_hdr->cmd) {
+	case BALLOON_GET_UNUSED_PAGES:
+		send_unused_pages_info(vb, ptr_hdr->param);
+		break;
+	default:
+		break;
+	}
+}
+
+static void misc_request(struct virtqueue *vq)
+{
+	struct virtio_balloon *vb = vq->vdev->priv;
+
+	misc_handle_rq(vb);
+}
+
 static int init_vqs(struct virtio_balloon *vb)
 {
-	struct virtqueue *vqs[3];
-	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack, stats_request };
-	static const char * const names[] = { "inflate", "deflate", "stats" };
+	struct virtqueue *vqs[4];
+	vq_callback_t *callbacks[] = { balloon_ack, balloon_ack,
+					 stats_request, misc_request };
+	static const char * const names[] = { "inflate", "deflate", "stats",
+						 "misc" };
 	int err, nvqs;
 
 	/*
 	 * We expect two virtqueues: inflate and deflate, and
 	 * optionally stat.
 	 */
-	nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 3 : 2;
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HOST_REQ_VQ))
+		nvqs = 4;
+	else if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ))
+		nvqs = 3;
+	else
+		nvqs = 2;
+
+	if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ)) {
+		__virtio_clear_bit(vb->vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
+		__virtio_clear_bit(vb->vdev, VIRTIO_BALLOON_F_HOST_REQ_VQ);
+	}
+
 	err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names);
 	if (err)
 		return err;
@@ -717,6 +811,18 @@  static int init_vqs(struct virtio_balloon *vb)
 			BUG();
 		virtqueue_kick(vb->stats_vq);
 	}
+	if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_HOST_REQ_VQ)) {
+		struct scatterlist sg_in;
+
+		vb->req_vq = vqs[3];
+		sg_init_one(&sg_in, &vb->req_hdr, sizeof(vb->req_hdr));
+		if (virtqueue_add_inbuf(vb->req_vq, &sg_in, 1,
+		    &vb->req_hdr, GFP_KERNEL) < 0)
+			__virtio_clear_bit(vb->vdev,
+					VIRTIO_BALLOON_F_HOST_REQ_VQ);
+		else
+			virtqueue_kick(vb->req_vq);
+	}
 	return 0;
 }
 
@@ -850,11 +956,13 @@  static int virtballoon_probe(struct virtio_device *vdev)
 	vb->resp_hdr = kzalloc(sizeof(struct virtio_balloon_resp_hdr),
 				 GFP_KERNEL);
 	/* Clear the feature bit if memory allocation fails */
-	if (!vb->resp_hdr)
+	if (!vb->resp_hdr) {
 		__virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
-	else {
+		__virtio_clear_bit(vdev, VIRTIO_BALLOON_F_HOST_REQ_VQ);
+	} else {
 		vb->page_bitmap[0] = kmalloc(BALLOON_BMAP_SIZE, GFP_KERNEL);
 		if (!vb->page_bitmap[0]) {
+			__virtio_clear_bit(vdev, VIRTIO_BALLOON_F_HOST_REQ_VQ);
 			__virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_BITMAP);
 			kfree(vb->resp_hdr);
 		} else {
@@ -863,12 +971,15 @@  static int virtballoon_probe(struct virtio_device *vdev)
 			if (!vb->resp_data) {
 				__virtio_clear_bit(vdev,
 						VIRTIO_BALLOON_F_PAGE_BITMAP);
+				__virtio_clear_bit(vdev,
+						VIRTIO_BALLOON_F_HOST_REQ_VQ);
 				kfree(vb->page_bitmap[0]);
 				kfree(vb->resp_hdr);
 			}
 		}
 	}
 	vb->resp_pos = 0;
+	vb->resp_buf_size = BALLOON_BMAP_SIZE;
 	mutex_init(&vb->balloon_lock);
 	init_waitqueue_head(&vb->acked);
 	vb->vdev = vdev;
@@ -988,6 +1099,7 @@  static int virtballoon_restore(struct virtio_device *vdev)
 	VIRTIO_BALLOON_F_STATS_VQ,
 	VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
 	VIRTIO_BALLOON_F_PAGE_BITMAP,
+	VIRTIO_BALLOON_F_HOST_REQ_VQ,
 };
 
 static struct virtio_driver virtio_balloon_driver = {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a92c8d7..e05ca86 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1772,7 +1772,8 @@  static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
 extern void free_area_init_node(int nid, unsigned long * zones_size,
 		unsigned long zone_start_pfn, unsigned long *zholes_size);
 extern void free_initmem(void);
-
+extern int get_unused_pages(unsigned long *unused_pages, unsigned long size,
+				int order, unsigned long *pos);
 /*
  * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
  * into the buddy system. The freed pages will be poisoned with pattern
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6de9440..d5a5952 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4428,6 +4428,78 @@  void show_free_areas(unsigned int filter)
 	show_swap_cache_info();
 }
 
+struct page_info_item {
+	__le64 start_pfn : 52; /* start pfn for the bitmap */
+	__le64 page_shift : 6; /* page shift width, in bytes */
+	__le64 bmap_len : 6;  /* bitmap length, in bytes */
+};
+
+static int  mark_unused_pages(struct zone *zone,
+		unsigned long *unused_pages, unsigned long size,
+		int order, unsigned long *pos)
+{
+	unsigned long pfn, flags;
+	unsigned int t;
+	struct list_head *curr;
+	struct page_info_item *info;
+
+	if (zone_is_empty(zone))
+		return 0;
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	if (*pos + zone->free_area[order].nr_free > size)
+		return -ENOSPC;
+	for (t = 0; t < MIGRATE_TYPES; t++) {
+		list_for_each(curr, &zone->free_area[order].free_list[t]) {
+			pfn = page_to_pfn(list_entry(curr, struct page, lru));
+			info = (struct page_info_item *)(unused_pages + *pos);
+			info->start_pfn = pfn;
+			info->page_shift = order + PAGE_SHIFT;
+			*pos += 1;
+		}
+	}
+
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	return 0;
+}
+
+/*
+ * During live migration, page is always discardable unless it's
+ * content is needed by the system.
+ * get_unused_pages provides an API to get the unused pages, these
+ * unused pages can be discarded if there is no modification since
+ * the request. Some other mechanism, like the dirty page logging
+ * can be used to track the modification.
+ *
+ * This function scans the free page list to get the unused pages
+ * with the specified order, and set the corresponding element in
+ * the bitmap if an unused page is found.
+ *
+ * return -EINVAL if parameters are invalid
+ * return -ENOSPC when bitmap can't contain the pages
+ * return 0 when sccess
+ */
+int get_unused_pages(unsigned long *unused_pages, unsigned long size,
+		int order, unsigned long *pos)
+{
+	struct zone *zone;
+	int ret = 0;
+
+	if (unused_pages == NULL || pos == NULL || *pos < 0)
+		return -EINVAL;
+
+	for_each_populated_zone(zone) {
+		ret = mark_unused_pages(zone, unused_pages, size, order, pos);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(get_unused_pages);
+
 static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
 {
 	zoneref->zone = zone;