
[RFC] post copy chardevice (was Re: [RFC] postcopy livemigration proposal)

Message ID 20110812110737.GA13791@valinux.co.jp
State New

Commit Message

Isaku Yamahata Aug. 12, 2011, 11:07 a.m. UTC
Here is what I have right now for the post-copy chardevice.
A sample userland program will follow.
I hope it gives you a more concrete idea and helps further discussion.
This is just for discussion, so it's incomplete.

I'm open to other ideas and quite happy to throw away this patch and
go for a better approach.

thanks,

From e262979e95b3c5a095c8cb0bc178309baa861a3f Mon Sep 17 00:00:00 2001
Message-Id: <e262979e95b3c5a095c8cb0bc178309baa861a3f.1313146664.git.yamahata@valinux.co.jp>
From: Isaku Yamahata <yamahata@valinux.co.jp>
Date: Wed, 10 Aug 2011 18:28:05 +0900
Subject: [PATCH] kvm/postcopy: chardevice for postcopy

This is a character device that hooks page access.
A page fault in the mapped area is reported to another user process by
this char driver. That process then fills in the page contents and
resolves the page fault.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
---
 arch/x86/kvm/Kconfig     |    1 +
 arch/x86/kvm/Makefile    |    1 +
 include/linux/kvm.h      |   45 +++
 include/linux/kvm_host.h |    2 +
 mm/memcontrol.c          |    1 +
 mm/shmem.c               |    1 +
 virt/kvm/Kconfig         |    3 +
 virt/kvm/kvm_main.c      |    6 +
 virt/kvm/vmem.c          |  847 ++++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/vmem.h          |   68 ++++
 10 files changed, 975 insertions(+), 0 deletions(-)
 create mode 100644 virt/kvm/vmem.c
 create mode 100644 virt/kvm/vmem.h

Comments

Isaku Yamahata Aug. 12, 2011, 11:09 a.m. UTC | #1
A sample userland program for testing the post-copy chardevice.

===========================================================================
/*
 * sample user land for post copy vmem
 *
 * Copyright (c) 2011,
 * National Institute of Advanced Industrial Science and Technology
 *
 * https://sites.google.com/site/grivonhome/quick-kvm-migration
 * Author: Isaku Yamahata <yamahata at valinux co jp>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 */

#include <err.h>
#include <inttypes.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

//#include <linux/kvm.h>
#define __user
#include "my-kvm.h"

#if 1
#define DPRINTF(format, ...) \
	printf("%s:%d "format, __func__, __LINE__, ## __VA_ARGS__)
#else
#define DPRINTF(format, ...)	do { } while (0)
#endif

#define VMEM_NR_PAGES	8

/* Daemon ("server") side: fills page contents into the shmem backing store
 * and resolves the qemu side's page faults through the vmem ioctls. */
void server(int vmem_fd, int shmem_fd, size_t size, size_t page_size)
{
	int nr_pages = size / page_size;

	void* shmem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
			   shmem_fd, 0);
	if (shmem == MAP_FAILED) {
		err(EXIT_FAILURE, "server: mmap(\"shmem\")");
	}
	close(shmem_fd);

	DPRINTF("KVM_VMEM_READY\n");
	if (ioctl(vmem_fd, KVM_VMEM_READY) < 0) {
		err(EXIT_FAILURE, "server: KVM_VMEM_READY");
	}

	struct kvm_vmem_page_request page_request;
	page_request.pgoffs = malloc(sizeof(*page_request.pgoffs) * nr_pages);
	if (page_request.pgoffs == NULL) {
		err(EXIT_FAILURE, "server: malloc(\"page_request.pgoffs\")");
	}

	struct kvm_vmem_page_cached page_cached;
	page_cached.pgoffs = malloc(sizeof(*page_cached.pgoffs) * nr_pages);
	if (page_cached.pgoffs == NULL) {
		err(EXIT_FAILURE, "server: malloc(\"page_cached.pgoffs\")");
	}

	int fill = 0;
	fill++;
	memset(shmem, fill, page_size);

	page_cached.nr = 1;
	page_cached.pgoffs[0] = 0;

	DPRINTF("KVM_VMEM_MARK_PAGE_CACHED\n");
	if (ioctl(vmem_fd, KVM_VMEM_MARK_PAGE_CACHED, &page_cached)) {
		err(EXIT_FAILURE, "server: KVM_VMEM_MARK_PAGE_CACHED");
	}

	struct kvm_vmem_page_range page_range = {
		.pgoff = 0,
		.nr_pages = 1,
	};
	struct kvm_vmem_make_pages_present pages_present = {
		.nr = 1,
		.ranges = &page_range,
	};
	DPRINTF("KVM_VMEM_MAKE_PAGES_PRESENT\n");
	if (ioctl(vmem_fd, KVM_VMEM_MAKE_PAGES_PRESENT, &pages_present) < 0) {
		err(EXIT_FAILURE, "server: KVM_VMEM_MAKE_PAGES_PRESENT");
	}

	int page_served = 1;
	while (page_served < nr_pages) {
		DPRINTF("KVM_VMEM_GET_PAGE_REQUEST\n");
		page_request.nr = nr_pages;
		if (ioctl(vmem_fd, KVM_VMEM_GET_PAGE_REQUEST, &page_request)) {
			err(EXIT_FAILURE, "server: KVM_VMEM_GET_PAGE_REQUEST");
		}

		DPRINTF("request.nr %d\n", page_request.nr);
		page_cached.nr = 0;
		int i;
		for (i = 0; i < page_request.nr; ++i) {
			memset(shmem + page_size * page_request.pgoffs[i],
			       fill, page_size);
			fill++;
			page_cached.pgoffs[page_cached.nr] =
				page_request.pgoffs[i];
			page_cached.nr++;
			DPRINTF("request[%d] %lx fill: %d\n",
				i, (unsigned long)page_request.pgoffs[i],
				fill - 1);
		}
		DPRINTF("KVM_VMEM_MARK_PAGE_CACHED\n");
		if (ioctl(vmem_fd, KVM_VMEM_MARK_PAGE_CACHED,
			  &page_cached) < 0) {
			err(EXIT_FAILURE, "server: KVM_VMEM_MARK_PAGE_CACHED");
		}
		page_served += page_cached.nr;
	}

#if 0
	DPRINTF("KVM_VMEM_MAKE_VMA_ANONYMOUS\n");
	if (ioctl(vmem_fd, KVM_VMEM_MAKE_VMA_ANONYMOUS)) {
		err(EXIT_FAILURE, "server: KVM_VMEM_MAKE_VMA_ANONYMOUS");
	}
#endif
	munmap(shmem, size);
	close(vmem_fd);
}

/* Qemu side: maps the vmem device and touches pages in a scattered order,
 * generating the page faults that the server resolves. */
void qemu(int vmem_fd, size_t size, size_t page_size)
{
	DPRINTF("mmap\n");
	void *ram = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE,
			 vmem_fd, 0);
	if (ram == MAP_FAILED) {
		err(EXIT_FAILURE, "qemu: mmap");
	}

	DPRINTF("KVM_VMEM_WAIT_READY\n");
	if (ioctl(vmem_fd, KVM_VMEM_WAIT_READY) < 0) {
		err(EXIT_FAILURE, "qemu: KVM_VMEM_WAIT_READY");
	}
	DPRINTF("close\n");
	close(vmem_fd);

	int pages[] = {7, 1, 6, 2, 0, 5, 3, 4};
	int val[VMEM_NR_PAGES];
	int i;
	for (i = 0; i < VMEM_NR_PAGES; ++i) {
		if (i == 2 || i == 6)
			sleep(1);
		DPRINTF("access to %d\n", pages[i]);
		fflush(stdout);
		val[i] = *(uint8_t*)(ram + page_size * pages[i]);
		DPRINTF("page:%d val[i=%d]=%d\n", pages[i], i, val[i]);
	}

	munmap(ram, size);
}

int main(int argc, char **argv)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);
	if (kvm_fd < 0) {
		perror("can't open /dev/kvm");
		exit(EXIT_FAILURE);
	}

	int vmem_dev_fd = ioctl(kvm_fd, KVM_CREATE_VMEM_DEV);
	if (vmem_dev_fd < 0) {
		err(EXIT_FAILURE, "can't create vmem_dev");
	}

	long page_size = sysconf(_SC_PAGESIZE);
	struct kvm_vmem_create create = {
		.size = VMEM_NR_PAGES * page_size,
	};
	if (ioctl(vmem_dev_fd, KVM_CREATE_VMEM, &create) < 0) {
		err(EXIT_FAILURE, "KVM_CREATE_VMEM");
	}
	close(vmem_dev_fd);

	int vmem_fd = create.vmem_fd;
	int shmem_fd = create.shmem_fd;
	size_t size = create.size;

	if (ftruncate(shmem_fd, size) < 0) {
		err(EXIT_FAILURE, "truncate(\"shmem_fd\")");
	}

	printf("vmem_fd %d shmem_fd %d\n", vmem_fd, shmem_fd);
	fflush(stdout);

	pid_t child = fork();
	if (child < 0) {
		err(EXIT_FAILURE, "fork");
	}
	if (child == 0) {
		sleep(1);
		printf("server pid: %d\n", getpid());
		server(vmem_fd, shmem_fd, size, page_size);
		return 0;
	}

	printf("qemu pid: %d server pid: %d\n", getpid(), child);
	close(shmem_fd);
	qemu(vmem_fd, size, page_size);
	return 0;
}

===========================================================================
Blue Swirl Aug. 12, 2011, 9:26 p.m. UTC | #2
On Fri, Aug 12, 2011 at 11:07 AM, Isaku Yamahata <yamahata@valinux.co.jp> wrote:
> Here is the what I have right now for post copy chardevice.
> The sample user land will follow.
> It would give you more concrete idea and help further discussion, I hope.
> This is just for discussion, so it's incomplete.
>
> I'm open to other ideas and quite happy to throw away this patch and
> go for better way.
>
> thanks,
>
> From e262979e95b3c5a095c8cb0bc178309baa861a3f Mon Sep 17 00:00:00 2001
> Message-Id: <e262979e95b3c5a095c8cb0bc178309baa861a3f.1313146664.git.yamahata@valinux.co.jp>
> From: Isaku Yamahata <yamahata@valinux.co.jp>
> Date: Wed, 10 Aug 2011 18:28:05 +0900
> Subject: [PATCH] kvm/postcopy: chardevice for postcopy
>
> This is a character device to hook page access.
> The page fault in the area is reported to another user process by
> this chardriver. Then, the process fills the page contents and
> resolves the page fault.
>
> Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
> ---
>  arch/x86/kvm/Kconfig     |    1 +
>  arch/x86/kvm/Makefile    |    1 +
>  include/linux/kvm.h      |   45 +++
>  include/linux/kvm_host.h |    2 +
>  mm/memcontrol.c          |    1 +
>  mm/shmem.c               |    1 +
>  virt/kvm/Kconfig         |    3 +
>  virt/kvm/kvm_main.c      |    6 +
>  virt/kvm/vmem.c          |  847 ++++++++++++++++++++++++++++++++++++++++++++++
>  virt/kvm/vmem.h          |   68 ++++
>  10 files changed, 975 insertions(+), 0 deletions(-)
>  create mode 100644 virt/kvm/vmem.c
>  create mode 100644 virt/kvm/vmem.h
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 0a09b58..dcbd52e 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -29,6 +29,7 @@ config KVM
>        select HAVE_KVM_EVENTFD
>        select KVM_APIC_ARCHITECTURE
>        select KVM_ASYNC_PF
> +       select KVM_VMEM
>        select USER_RETURN_NOTIFIER
>        select KVM_MMIO
>        select TASKSTATS
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index f15501f..6125f4c 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -10,6 +10,7 @@ kvm-y                 += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
>                                assigned-dev.o)
>  kvm-$(CONFIG_IOMMU_API)        += $(addprefix ../../../virt/kvm/, iommu.o)
>  kvm-$(CONFIG_KVM_ASYNC_PF)     += $(addprefix ../../../virt/kvm/, async_pf.o)
> +kvm-$(CONFIG_KVM_VMEM) += $(addprefix ../../../virt/kvm/, vmem.o)
>
>  kvm-y                  += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
>                           i8254.o timer.o
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 55f5afb..623109e 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo {
>  #define KVM_CAP_PPC_SMT 64
>  #define KVM_CAP_PPC_RMA        65
>  #define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
> +#define KVM_CAP_POST_COPY_MEMORY 67
>
>  #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -760,6 +761,50 @@ struct kvm_clock_data {
>  /* Available with KVM_CAP_RMA */
>  #define KVM_ALLOCATE_RMA         _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
>
> +struct kvm_vmem_create {
> +       __u64 size;     /* in bytes */
> +       __s32 vmem_fd;
> +       __s32 shmem_fd;
> +};
> +
> +struct kvm_vmem_page_request {
> +       __u32 nr;

Padding will be needed here on 64 bit hosts unless the order is switched.

> +       __u64 __user *pgoffs;
> +};
> +
> +struct kvm_vmem_page_cached {
> +       __u32 nr;

Also here.

> +       __u64 __user *pgoffs;
> +};
> +
> +struct kvm_vmem_page_range {
> +       __u64 pgoff;
> +       __u64 nr_pages;
> +};
> +
> +struct kvm_vmem_make_pages_present {
> +       __u32 nr;

And here.

> +       struct kvm_vmem_page_range __user *ranges;
> +};
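
For concreteness, a sketch of the kind of layout fix being suggested, using
kvm_vmem_page_request as the example (the member names are from the patch;
the rearrangement itself is only illustrative):

/*
 * On a 64-bit kernel the __u64 * member is 8-byte aligned, so the compiler
 * inserts a 4-byte hole after 'nr'.  Making the padding explicit (or
 * switching the member order) keeps the layout predictable:
 */
struct kvm_vmem_page_request {
	__u32 nr;
	__u32 pad;		/* explicit padding instead of a hidden hole */
	__u64 __user *pgoffs;
};

A fuller fix would also carry the user pointer as a __u64 so that the
structure has the same size and layout for 32-bit and 64-bit userspace.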
> +
> +/* Available with KVM_CAP_POST_COPY_MEMORY */
> +#define KVM_CREATE_VMEM_DEV       _IO(KVMIO,  0xb0)
> +
> +/* ioctl for vmem_dev fd */
> +#define KVM_CREATE_VMEM                  _IOR(KVMIO, 0xb1, __u32)
> +
> +/* ioctl for vmem fd */
> +#define KVM_VMEM_WAIT_READY      _IO(KVMIO,  0xb2)
> +#define KVM_VMEM_READY           _IO(KVMIO,  0xb3)
> +#define KVM_VMEM_GET_PAGE_REQUEST \
> +       _IOWR(KVMIO, 0xb4, struct kvm_vmem_page_request)
> +#define KVM_VMEM_MARK_PAGE_CACHED \
> +       _IOW(KVMIO,  0xb5, struct kvm_vmem_page_cached)
> +#define KVM_VMEM_MAKE_PAGES_PRESENT \
> +       _IOW(KVMIO,  0xb6, struct kvm_vmem_make_pages_present)
> +#define KVM_VMEM_MAKE_VMA_ANONYMOUS _IO(KVMIO, 0xb7)
> +
> +
>  #define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
>
>  struct kvm_assigned_pci_dev {
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ff4d406..8b3dafa 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -222,6 +222,8 @@ struct kvm_irq_routing_table {};
>
>  #endif
>
> +long kvm_dev_ioctl_create_vmem_dev(void);
> +
>  struct kvm_memslots {
>        int nmemslots;
>        u64 generation;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e013b8e..7f3fc4e 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2838,6 +2838,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
>
>        return ret;
>  }
> +EXPORT_SYMBOL_GPL(mem_cgroup_cache_charge);
>
>  /*
>  * While swap-in, try_charge -> commit or cancel, the page is locked.
> diff --git a/mm/shmem.c b/mm/shmem.c
> index fcedf54..ae7d61f 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -3035,6 +3035,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
>        vma->vm_flags |= VM_CAN_NONLINEAR;
>        return 0;
>  }
> +EXPORT_SYMBOL_GPL(shmem_zero_setup);
>
>  /**
>  * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index f63ccb0..d3040ea 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -18,3 +18,6 @@ config KVM_MMIO
>
>  config KVM_ASYNC_PF
>        bool
> +
> +config KVM_VMEM
> +       bool
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index aefdda3..9e47e20 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2184,6 +2184,7 @@ static long kvm_dev_ioctl_check_extension_generic(long arg)
>        case KVM_CAP_SET_BOOT_CPU_ID:
>  #endif
>        case KVM_CAP_INTERNAL_ERROR_DATA:
> +       case KVM_CAP_POST_COPY_MEMORY:
>                return 1;
>  #ifdef CONFIG_HAVE_KVM_IRQCHIP
>        case KVM_CAP_IRQ_ROUTING:
> @@ -2233,6 +2234,11 @@ static long kvm_dev_ioctl(struct file *filp,
>        case KVM_TRACE_DISABLE:
>                r = -EOPNOTSUPP;
>                break;
> +#ifdef CONFIG_KVM_VMEM
> +       case KVM_CREATE_VMEM_DEV:
> +               r = kvm_dev_ioctl_create_vmem_dev();
> +               break;
> +#endif
>        default:
>                return kvm_arch_dev_ioctl(filp, ioctl, arg);
>        }
> diff --git a/virt/kvm/vmem.c b/virt/kvm/vmem.c
> new file mode 100644
> index 0000000..b413663
> --- /dev/null
> +++ b/virt/kvm/vmem.c
> @@ -0,0 +1,847 @@
> +/*
> + * KVM post copy vmem
> + *
> + * Copyright (c) 2011,
> + * National Institute of Advanced Industrial Science and Technology
> + *
> + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> + * Author: Isaku Yamahata <yamahata at valinux co jp>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.

The current address is:

51 Franklin Street, Fifth Floor
Boston, MA 02110-1301
USA

Then there is the version used in QEMU:
if not, see <http://www.gnu.org/licenses/>.

I don't know which one is preferred in the kernel.

> + */
> +
> +#include <linux/kvm_host.h>
> +#include <linux/kvm.h>
> +#include <linux/pagemap.h>
> +#include <linux/mm.h>
> +#include <linux/memcontrol.h>
> +#include <linux/poll.h>
> +#include <linux/file.h>
> +#include <linux/anon_inodes.h>
> +#include "vmem.h"
> +
> +static void kvm_vmem_release_fake_vmf(int ret, struct vm_fault *fake_vmf)
> +{
> +       if (ret & VM_FAULT_LOCKED) {
> +               unlock_page(fake_vmf->page);
> +       }
> +       page_cache_release(fake_vmf->page);
> +}
> +
> +static int kvm_vmem_minor_fault(struct kvm_vmem *vmem,
> +                               struct vm_area_struct *vma,
> +                               struct vm_fault *vmf)
> +{
> +       struct vm_fault fake_vmf;
> +       int ret;
> +       struct page *page;
> +
> +       BUG_ON(!test_bit(vmf->pgoff, vmem->cached));
> +       fake_vmf = *vmf;
> +       fake_vmf.page = NULL;
> +       ret = vmem->vma->vm_ops->fault(vmem->vma, &fake_vmf);
> +       if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))
> +               return ret;
> +
> +       /*
> +        * TODO: pull out fake_vmf->page from shmem file and donate it
> +        * to this vma resolving the page fault.
> +        * vmf->page = fake_vmf->page;
> +        */
> +
> +       page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
> +       if (!page)
> +               return VM_FAULT_OOM;
> +       if (mem_cgroup_cache_charge(page, vma->vm_mm, GFP_KERNEL)) {
> +               kvm_vmem_release_fake_vmf(ret, &fake_vmf);
> +               page_cache_release(page);
> +               return VM_FAULT_OOM;
> +       }
> +
> +       copy_highpage(page, fake_vmf.page);
> +       kvm_vmem_release_fake_vmf(ret, &fake_vmf);
> +
> +       ret |= VM_FAULT_LOCKED;
> +       SetPageUptodate(page);
> +       vmf->page = page;
> +       set_bit(vmf->pgoff, vmem->faulted);
> +
> +       return ret;
> +}
> +
> +static int kvm_vmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +       struct file *filp = vma->vm_file;
> +       struct kvm_vmem *vmem = filp->private_data;
> +
> +       if (vmf->pgoff >= vmem->pgoff_end) {
> +               return VM_FAULT_SIGBUS;
> +       }
> +
> +       BUG_ON(test_bit(vmf->pgoff, vmem->faulted));
> +
> +       if (!test_bit(vmf->pgoff, vmem->cached)) {
> +               /* major fault */
> +               unsigned long bit;
> +               DEFINE_WAIT(wait);
> +
> +               if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
> +                       /* async page fault */
> +                       spin_lock(&vmem->lock);
> +                       if (vmem->async_req_nr < vmem->async_req_max) {
> +                               vmem->async_req[vmem->async_req_nr] =
> +                                       vmf->pgoff;
> +                               vmem->async_req_nr++;
> +                       }
> +                       spin_unlock(&vmem->lock);
> +                       wake_up_poll(&vmem->req_wait, POLLIN);
> +
> +                       if (test_bit(vmf->pgoff, vmem->cached))
> +                               return kvm_vmem_minor_fault(vmem, vma, vmf);
> +                       return VM_FAULT_MAJOR | VM_FAULT_RETRY;
> +               }
> +
> +               spin_lock(&vmem->lock);
> +               bit = find_first_zero_bit(vmem->sync_wait_bitmap,
> +                                         vmem->sync_req_max);
> +               if (likely(bit < vmem->sync_req_max)) {
> +                       vmem->sync_req[bit] = vmf->pgoff;
> +                       prepare_to_wait(&vmem->page_wait[bit], &wait,
> +                                       TASK_UNINTERRUPTIBLE);
> +                       set_bit(bit, vmem->sync_req_bitmap);
> +                       set_bit(bit, vmem->sync_wait_bitmap);
> +                       spin_unlock(&vmem->lock);
> +                       wake_up_poll(&vmem->req_wait, POLLIN);
> +
> +                       if (!test_bit(vmf->pgoff, vmem->cached))
> +                               schedule();
> +                       finish_wait(&vmem->page_wait[bit], &wait);
> +                       clear_bit(bit, vmem->sync_wait_bitmap);
> +               } else {
> +                       struct kvm_vmem_page_req_list page_req_list = {
> +                               .pgoff = vmf->pgoff,
> +                       };
> +                       vmem->req_list_nr++;
> +                       list_add_tail(&page_req_list.list, &vmem->req_list);
> +                       wake_up_poll(&vmem->req_wait, POLLIN);
> +                       for (;;) {
> +                               prepare_to_wait(&vmem->req_list_wait, &wait,
> +                                               TASK_UNINTERRUPTIBLE);
> +                               if (test_bit(vmf->pgoff, vmem->cached)) {
> +                                       vmem->req_list_nr--;
> +                                       break;
> +                               }
> +                               spin_unlock(&vmem->lock);
> +                               schedule();
> +                               spin_lock(&vmem->lock);
> +                       }
> +                       spin_unlock(&vmem->lock);
> +                       finish_wait(&vmem->req_list_wait, &wait);
> +               }
> +
> +               return kvm_vmem_minor_fault(vmem, vma, vmf) | VM_FAULT_MAJOR;
> +       }
> +
> +       return kvm_vmem_minor_fault(vmem, vma, vmf);
> +}
> +
> +/* for partial munmap */
> +static void kvm_vmem_vma_open(struct vm_area_struct *vma)
> +{
> +       struct file *filp = vma->vm_file;
> +       struct kvm_vmem *vmem = filp->private_data;
> +
> +       spin_lock(&vmem->lock);
> +       vmem->vma_nr++;
> +       spin_unlock(&vmem->lock);
> +}
> +
> +static void kvm_vmem_vma_close(struct vm_area_struct *vma)
> +{
> +       struct file *filp = vma->vm_file;
> +       struct kvm_vmem *vmem = filp->private_data;
> +       struct task_struct *task = NULL;
> +
> +       spin_lock(&vmem->lock);
> +       vmem->vma_nr--;
> +       if (vmem->vma_nr == 0) {
> +               task = vmem->task;
> +               vmem->task = NULL;
> +       }
> +       spin_unlock(&vmem->lock);
> +
> +       if (task)
> +               put_task_struct(task);
> +}
> +
> +static const struct vm_operations_struct kvm_vmem_vm_ops = {
> +       .open = kvm_vmem_vma_open,
> +       .close = kvm_vmem_vma_close,
> +       .fault = kvm_vmem_fault,
> +};
> +
> +static int kvm_vmem_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> +       struct kvm_vmem *vmem = filp->private_data;
> +       int error;
> +
> +       /* allow mmap() only once */
> +       spin_lock(&vmem->lock);
> +       if (vmem->mmapped) {
> +               error = -EBUSY;
> +               goto out;
> +       }
> +       if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff >
> +           vmem->pgoff_end) {
> +               error = -EINVAL;
> +               goto out;
> +       }
> +
> +       vmem->mmapped = true;
> +       vmem->vma_nr = 1;
> +       vmem->vm_start = vma->vm_start;
> +       get_task_struct(current);
> +       vmem->task = current;
> +       spin_unlock(&vmem->lock);
> +
> +       vma->vm_ops = &kvm_vmem_vm_ops;
> +       vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
> +       vma->vm_flags &= ~VM_SHARED;
> +       return 0;
> +
> +out:
> +       spin_unlock(&vmem->lock);
> +       return error;
> +}
> +
> +static bool kvm_vmem_req_pending(struct kvm_vmem* vmem)
> +{
> +       return !list_empty(&vmem->req_list) ||
> +               !bitmap_empty(vmem->sync_req_bitmap, vmem->sync_req_max) ||
> +               (vmem->async_req_nr > 0);
> +}
> +
> +static unsigned int kvm_vmem_poll(struct file* filp, poll_table *wait)
> +{
> +       struct kvm_vmem *vmem = filp->private_data;
> +       unsigned int events = 0;
> +
> +       poll_wait(filp, &vmem->req_wait, wait);
> +
> +       spin_lock(&vmem->lock);
> +       if (kvm_vmem_req_pending(vmem))
> +               events |= POLLIN;
> +       spin_unlock(&vmem->lock);
> +
> +       return events;
> +}
> +
> +/*
> + * return value
> + * true: finished
> + * false: more request
> + */
> +static bool kvm_vmem_copy_page_request(struct kvm_vmem *vmem,
> +                                      pgoff_t *pgoffs, int req_max,
> +                                      int *req_nr)
> +{
> +       struct kvm_vmem_page_req_list *req_list;
> +       struct kvm_vmem_page_req_list *tmp;
> +
> +       unsigned long bit;
> +
> +       *req_nr = 0;
> +       list_for_each_entry_safe(req_list, tmp, &vmem->req_list, list) {
> +               list_del(&req_list->list);
> +               pgoffs[*req_nr] = req_list->pgoff;
> +               (*req_nr)++;
> +               if (*req_nr >= req_max)
> +                       return false;
> +       }
> +
> +       bit = 0;
> +       for (;;) {
> +               bit = find_next_bit(vmem->sync_req_bitmap, vmem->sync_req_max,
> +                                   bit);
> +               if (bit >= vmem->sync_req_max)
> +                       break;
> +               pgoffs[*req_nr] = vmem->sync_req[bit];
> +               (*req_nr)++;
> +               clear_bit(bit, vmem->sync_req_bitmap);
> +               if (*req_nr >= req_max)
> +                       return false;
> +               bit++;
> +       }
> +
> +       if (vmem->async_req_nr > 0) {
> +               int nr = min(req_max - *req_nr, vmem->async_req_nr);
> +               memcpy(pgoffs + *req_nr, vmem->async_req,
> +                      sizeof(*vmem->async_req) * nr);
> +               vmem->async_req_nr -= nr;
> +               *req_nr += nr;
> +               memmove(vmem->async_req, vmem->sync_req + nr,
> +                       vmem->async_req_nr * sizeof(*vmem->async_req));
> +
> +       }
> +       return vmem->async_req_nr == 0;
> +}
> +
> +static int kvm_vmem_get_page_request(struct kvm_vmem *vmem,
> +                                    struct kvm_vmem_page_request *page_req)
> +{
> +       DEFINE_WAIT(wait);
> +#define REQ_MAX        ((__u32)32)
> +       pgoff_t pgoffs[REQ_MAX];
> +       __u32 req_copied = 0;
> +       int ret = 0;
> +
> +       spin_lock(&vmem->lock);
> +       for (;;) {
> +               prepare_to_wait(&vmem->req_wait, &wait, TASK_INTERRUPTIBLE);
> +               if (kvm_vmem_req_pending(vmem)) {
> +                       break;
> +               }
> +               if (signal_pending(current)) {
> +                       ret = -ERESTARTSYS;
> +                       break;
> +               }
> +               spin_unlock(&vmem->lock);
> +               schedule();
> +               spin_lock(&vmem->lock);
> +       }
> +       finish_wait(&vmem->req_wait, &wait);
> +       if (ret)
> +               goto out_unlock;
> +
> +       while (req_copied < page_req->nr) {
> +               int req_max;
> +               int req_nr;
> +               bool finished;
> +               req_max = min(page_req->nr - req_copied, REQ_MAX);
> +               finished = kvm_vmem_copy_page_request(vmem, pgoffs, req_max,
> +                                                     &req_nr);
> +
> +               spin_unlock(&vmem->lock);
> +
> +               if (req_nr > 0) {
> +                       ret = 0;
> +                       if (copy_to_user(page_req->pgoffs + req_copied, pgoffs,
> +                                        sizeof(*pgoffs) * req_nr)) {
> +                               ret = -EFAULT;
> +                               goto out;
> +                       }
> +               }
> +               req_copied += req_nr;
> +               if (finished)
> +                       goto out;
> +
> +               spin_lock(&vmem->lock);
> +       }
> +
> +out_unlock:
> +       spin_unlock(&vmem->lock);
> +out:
> +       page_req->nr = req_copied;
> +       return ret;
> +}
> +
> +static int kvm_vmem_mark_page_cached(struct kvm_vmem *vmem,
> +                                    struct kvm_vmem_page_cached *page_cached)
> +{
> +       int ret = 0;
> +#define PG_MAX ((__u32)32)
> +       __u64 pgoffs[PG_MAX];
> +       __u32 nr;
> +       unsigned long bit;
> +       bool wake_up_list = false;
> +
> +       nr = 0;
> +       while (nr < page_cached->nr) {
> +               __u32 todo = min(PG_MAX, (page_cached->nr - nr));
> +               int i;
> +
> +               if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
> +                                  sizeof(*pgoffs) * todo)) {
> +                       ret = -EFAULT;
> +                       goto out;
> +               }
> +               for (i = 0; i < todo; ++i) {
> +                       if (pgoffs[i] >= vmem->pgoff_end) {
> +                               ret = -EINVAL;
> +                               goto out;
> +                       }
> +                       set_bit(pgoffs[i], vmem->cached);
> +               }
> +               nr += todo;
> +       }
> +
> +       spin_lock(&vmem->lock);
> +       bit = 0;
> +       for (;;) {
> +               bit = find_next_bit(vmem->sync_wait_bitmap, vmem->sync_req_max,
> +                                   bit);
> +               if (bit >= vmem->sync_req_max)
> +                       break;
> +               if (test_bit(vmem->sync_req[bit], vmem->cached))
> +                       wake_up(&vmem->page_wait[bit]);
> +               bit++;
> +       }
> +
> +       if (vmem->req_list_nr > 0)
> +               wake_up_list = true;
> +       spin_unlock(&vmem->lock);
> +
> +       if (wake_up_list)
> +               wake_up_all(&vmem->req_list_wait);
> +
> +out:
> +       return ret;
> +}
> +
> +static bool kvm_vmem_is_vmem_vma(const struct kvm_vmem *vmem,
> +                                const struct vm_area_struct *vma)
> +{
> +       return vma->vm_file && vma->vm_file->private_data == vmem;
> +}
> +
> +static void kvm_vmem_make_pages_present_entry(struct kvm_vmem *vmem,
> +                                             struct kvm_vmem_page_range *range,
> +                                             struct task_struct *task,
> +                                             struct mm_struct *mm,
> +                                             unsigned long vm_start)
> +{
> +       unsigned long pgoff = range->pgoff;
> +       unsigned long range_end = range->pgoff + range->nr_pages;
> +
> +       down_read(&mm->mmap_sem);
> +
> +       while (pgoff < range->pgoff + range->nr_pages) {
> +               unsigned long pgoff_end;
> +               struct vm_area_struct *vma;
> +               unsigned long saddr;
> +               unsigned long eaddr;
> +
> +               /* search unfaulted range */
> +               spin_lock(&vmem->lock);
> +               pgoff = find_next_zero_bit(vmem->faulted, range_end, pgoff);
> +               if (pgoff >= range_end) {
> +                       spin_unlock(&vmem->lock);
> +                       break;
> +               }
> +               pgoff_end = find_next_bit(vmem->faulted, range_end, pgoff);
> +               spin_unlock(&vmem->lock);
> +
> +               saddr = vm_start + (pgoff << PAGE_SHIFT);
> +               eaddr = vm_start + (pgoff_end << PAGE_SHIFT);
> +               vma = find_vma(mm, saddr);
> +               if (vma == NULL) {
> +                       break;
> +               }
> +               if (eaddr < vma->vm_start) {
> +                       pgoff = (vma->vm_start - vm_start) >> PAGE_SHIFT;
> +                       continue;
> +               }
> +
> +               if (kvm_vmem_is_vmem_vma(vmem, vma)) {
> +                       unsigned long start = max(vma->vm_start, saddr);
> +                       unsigned long end = min(vma->vm_end, eaddr);
> +                       int nr_pages = (end - start) >> PAGE_SHIFT;
> +                       get_user_pages(task, mm, start, nr_pages,
> +                                      1, 1, NULL, NULL);
> +                       pgoff = (end - vm_start) >> PAGE_SHIFT;
> +               } else {
> +                       pgoff = (vma->vm_end - vm_start) >> PAGE_SHIFT;
> +               }
> +       }
> +
> +       up_read(&mm->mmap_sem);
> +}
> +
> +static int kvm_vmem_make_pages_present(
> +       struct kvm_vmem *vmem,
> +       struct kvm_vmem_make_pages_present *pages_present)
> +{
> +       struct task_struct *task;
> +       struct mm_struct *mm;
> +       pgoff_t pgoff_end;
> +       unsigned long vm_start;
> +       unsigned long vm_eaddr;
> +
> +#define NUM_ENTRIES    ((__u32)32)
> +       struct kvm_vmem_page_range kranges[NUM_ENTRIES];
> +       __u32 nr = 0;
> +       int ret;
> +
> +       spin_lock(&vmem->lock);
> +       task = vmem->task;
> +       pgoff_end = vmem->pgoff_end;
> +       vm_start = vmem->vm_start;
> +       vm_eaddr = vm_start + vmem->size;
> +       spin_unlock(&vmem->lock);
> +       if (task == NULL)
> +               return 0;
> +       mm = get_task_mm(task);
> +       if (mm == NULL)
> +               return 0;
> +
> +       ret = 0;
> +       while (nr < pages_present->nr) {
> +               int nr_ranges = min(NUM_ENTRIES, pages_present->nr - nr);
> +               int i;
> +
> +               if (copy_from_user(&kranges, pages_present->ranges + nr,
> +                                  sizeof(kranges[0]) * nr_ranges)) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +               for (i = 0; i < nr_ranges; ++i) {
> +                       struct kvm_vmem_page_range *range = &kranges[i];
> +                       if (range->pgoff >= pgoff_end ||
> +                           range->nr_pages >= pgoff_end ||
> +                           range->pgoff + range->nr_pages >= pgoff_end) {
> +                               ret = -EINVAL;
> +                               break;
> +                       }
> +                       kvm_vmem_make_pages_present_entry(vmem, range,
> +                                                         task, mm, vm_start);
> +               }
> +               nr += nr_ranges;
> +       }
> +
> +       mmput(mm);
> +       return ret;
> +}
> +
> +static int kvm_vmem_make_vma_anonymous(struct kvm_vmem *vmem)
> +{
> +#if 1
> +       return -ENOSYS;
> +#else
> +       unsigned long saddr;
> +       unsigned long eaddr;
> +       unsigned long addr;
> +       unsigned long bit;
> +       struct task_struct *task;
> +       struct mm_struct *mm;
> +
> +       spin_lock(&vmem->lock);
> +       task = vmem->task;
> +       saddr = vmem->vm_start;
> +       eaddr = saddr + vmem->size;
> +       bit = find_first_zero_bit(vmem->faulted, vmem->pgoff_end);
> +       if (bit < vmem->pgoff_end) {
> +               spin_unlock(&vmem->lock);
> +               return -EBUSY;
> +       }
> +       spin_unlock(&vmem->lock);
> +       if (task == NULL)
> +               return 0;
> +       mm = get_task_mm(task);
> +       if (mm == NULL)
> +               return 0;
> +
> +       addr = saddr;
> +       down_write(&mm->mmap_sem);
> +       while (addr < eaddr) {
> +               struct vm_area_struct *vma;
> +               vma = find_vma(mm, addr);
> +               if (kvm_vmem_is_vmem_vma(vmem, vma)) {
> +                       /* XXX incorrect. race/locking and more fix up */
> +                       struct file *filp = vma->vm_file;
> +                       vma->vm_ops->close(vma);
> +                       vma->vm_ops = NULL;
> +                       vma->vm_file = NULL;
> +                       /* vma->vm_flags */
> +                       fput(filp);
> +               }
> +               addr = vma->vm_end;
> +       }
> +       up_write(&mm->mmap_sem);
> +
> +       mmput(mm);
> +       return 0;
> +#endif
> +}
> +
> +static void kvm_vmem_ready(struct kvm_vmem *vmem)
> +{
> +       spin_lock(&vmem->lock);
> +       vmem->ready = true;
> +       spin_unlock(&vmem->lock);
> +       wake_up_interruptible(&vmem->ready_wait);
> +}
> +
> +static int kvm_vmem_wait_ready(struct kvm_vmem *vmem)
> +{
> +       int ret = 0;
> +       DEFINE_WAIT(wait);
> +
> +       spin_lock(&vmem->lock);
> +       for (;;) {
> +               prepare_to_wait(&vmem->ready_wait, &wait, TASK_INTERRUPTIBLE);
> +               if (vmem->ready) {
> +                       break;
> +               }
> +               if (signal_pending(current)) {
> +                       ret = -ERESTARTSYS;
> +                       break;
> +               }
> +               spin_unlock(&vmem->lock);
> +               schedule();
> +               spin_lock(&vmem->lock);
> +       }
> +       spin_unlock(&vmem->lock);
> +       finish_wait(&vmem->ready_wait, &wait);
> +       return ret;
> +}
> +
> +static long kvm_vmem_ioctl(struct file *filp, unsigned int ioctl,
> +                          unsigned long arg)
> +{
> +       struct kvm_vmem *vmem = filp->private_data;
> +       void __user *argp = (void __user *) arg;
> +       long ret = 0;
> +
> +       switch (ioctl) {
> +       case KVM_VMEM_READY:
> +               kvm_vmem_ready(vmem);
> +               ret = 0;
> +               break;
> +       case KVM_VMEM_WAIT_READY:
> +               ret = kvm_vmem_wait_ready(vmem);
> +               break;
> +       case KVM_VMEM_GET_PAGE_REQUEST: {
> +               struct kvm_vmem_page_request page_request;
> +               ret = -EFAULT;
> +               if (copy_from_user(&page_request, argp, sizeof(page_request)))
> +                       break;
> +               ret = kvm_vmem_get_page_request(vmem, &page_request);
> +               if (ret == 0 &&
> +                   copy_to_user(argp +
> +                                offsetof(struct kvm_vmem_page_request, nr),
> +                                &page_request.nr,
> +                                sizeof(page_request.nr))) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +               break;
> +       }
> +       case KVM_VMEM_MARK_PAGE_CACHED: {
> +               struct kvm_vmem_page_cached page_cached;
> +               ret = -EFAULT;
> +               if (copy_from_user(&page_cached, argp, sizeof(page_cached)))
> +                       break;
> +               ret = kvm_vmem_mark_page_cached(vmem, &page_cached);
> +               break;
> +       }
> +       case KVM_VMEM_MAKE_PAGES_PRESENT: {
> +               struct kvm_vmem_make_pages_present pages_present;
> +               ret = -EFAULT;
> +               if (copy_from_user(&pages_present, argp,
> +                                  sizeof(pages_present)))
> +                       break;
> +               ret = kvm_vmem_make_pages_present(vmem, &pages_present);
> +               break;
> +       }
> +       case KVM_VMEM_MAKE_VMA_ANONYMOUS:
> +               ret = kvm_vmem_make_vma_anonymous(vmem);
> +               break;
> +       default:
> +               ret = -EINVAL;
> +               break;
> +       }
> +       return ret;
> +}
> +
> +static unsigned long kvm_vmem_bitmap_bytes(const struct kvm_vmem *vmem)
> +{
> +       return round_up(vmem->pgoff_end, BITS_PER_LONG) / 8;
> +}
> +
> +static void kvm_vmem_free(struct kvm_vmem *vmem)
> +{
> +       if (vmem->task) {
> +               put_task_struct(vmem->task);
> +               vmem->task = NULL;
> +       }
> +
> +       if (vmem->shmem_filp)
> +               fput(vmem->shmem_filp);
> +       if (kvm_vmem_bitmap_bytes(vmem) > PAGE_SIZE) {
> +               vfree(vmem->cached);
> +               vfree(vmem->faulted);
> +       } else {
> +               kfree(vmem->cached);
> +               kfree(vmem->faulted);
> +       }
> +       kfree(vmem->vma);
> +       kfree(vmem->async_req);
> +       kfree(vmem->sync_req_bitmap);
> +       kfree(vmem->sync_wait_bitmap);
> +       kfree(vmem->page_wait);
> +       kfree(vmem->sync_req);
> +       kfree(vmem);
> +}
> +
> +static int kvm_vmem_release(struct inode *inode, struct file *filp)
> +{
> +       struct kvm_vmem *vmem = filp->private_data;
> +       kvm_vmem_free(vmem);
> +       return 0;
> +}
> +
> +static struct file_operations kvm_vmem_fops = {
> +       .release        = kvm_vmem_release,
> +       .unlocked_ioctl = kvm_vmem_ioctl,
> +       .mmap           = kvm_vmem_mmap,
> +       .poll           = kvm_vmem_poll,
> +       .llseek         = noop_llseek,
> +};
> +
> +static int kvm_create_vmem(struct kvm_vmem_create *create)
> +{
> +       int error = 0;
> +       struct kvm_vmem *vmem = NULL;
> +       struct vm_area_struct *vma = NULL;
> +       int shmem_fd;
> +       unsigned long bitmap_bytes;
> +       unsigned long sync_bitmap_bytes;
> +       int i;
> +
> +       vmem = kzalloc(sizeof(*vmem), GFP_KERNEL);
> +       vmem->task = NULL;

Is this needed? Doesn't kzalloc() return zeroed memory?

> +       vmem->mmapped = false;
> +       spin_lock_init(&vmem->lock);
> +       vmem->size = roundup(create->size, PAGE_SIZE);
> +       vmem->pgoff_end = vmem->size >> PAGE_SHIFT;
> +       init_waitqueue_head(&vmem->req_wait);
> +
> +       vma = kzalloc(sizeof(*vma), GFP_KERNEL);
> +       vma->vm_start = 0;

Also here.

> +       vma->vm_end = vmem->size;
> +       /* this shmem file is used for temporal buffer for pages
> +          so it's unlikely that so many pages exists in this shmem file */
> +       vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
> +               VM_DONTEXPAND;
> +       vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
> +       vma->vm_pgoff = 0;
> +       INIT_LIST_HEAD(&vma->anon_vma_chain);
> +
> +       vmem->vma = vma;
> +
> +       shmem_fd = get_unused_fd();
> +       if (shmem_fd < 0) {
> +               error = shmem_fd;
> +               goto out;
> +       }
> +       error = shmem_zero_setup(vma);
> +       if (error < 0) {
> +               put_unused_fd(shmem_fd);
> +               goto out;
> +       }
> +       vmem->shmem_filp = vma->vm_file;
> +       get_file(vmem->shmem_filp);
> +       fd_install(shmem_fd, vma->vm_file);
> +       create->shmem_fd = shmem_fd;
> +
> +       create->vmem_fd = anon_inode_getfd("kvm-vmem",
> +                                          &kvm_vmem_fops, vmem, O_RDWR);
> +       if (create->vmem_fd < 0) {
> +               error = create->vmem_fd;
> +               goto out;
> +       }
> +
> +       bitmap_bytes = kvm_vmem_bitmap_bytes(vmem);
> +       if (bitmap_bytes > PAGE_SIZE) {
> +               vmem->cached = vzalloc(bitmap_bytes);
> +               vmem->faulted = vzalloc(bitmap_bytes);
> +       } else {
> +               vmem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
> +               vmem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
> +       }
> +
> +#define ASYNC_REQ_MAX  (ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
> +       vmem->async_req_max = ASYNC_REQ_MAX;
> +       vmem->async_req_nr = 0;
> +       vmem->async_req = kzalloc(sizeof(*vmem->async_req), GFP_KERNEL);
> +
> +#define SYNC_REQ_MAX   (KVM_MAX_VCPUS)
> +       vmem->sync_req_max = round_up(SYNC_REQ_MAX, BITS_PER_LONG);
> +       sync_bitmap_bytes = sizeof(unsigned long) *
> +               (vmem->sync_req_max / BITS_PER_LONG);
> +       vmem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> +       vmem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> +       vmem->page_wait = kzalloc(sizeof(*vmem->page_wait) *
> +                                 vmem->sync_req_max, GFP_KERNEL);
> +       for (i = 0; i < vmem->sync_req_max; ++i)
> +               init_waitqueue_head(&vmem->page_wait[i]);
> +       vmem->sync_req = kzalloc(sizeof(*vmem->sync_req) *
> +                                vmem->sync_req_max, GFP_KERNEL);
> +
> +       vmem->req_list_nr = 0;
> +       INIT_LIST_HEAD(&vmem->req_list);
> +       init_waitqueue_head(&vmem->req_list_wait);
> +
> +       init_waitqueue_head(&vmem->ready_wait);
> +       vmem->ready = false;
> +
> +       return 0;
> +
> + out:
> +       kvm_vmem_free(vmem);
> +       return error;
> +}
> +
> +static long kvm_vmem_dev_ioctl(struct file *filp, unsigned int ioctl,
> +                              unsigned long arg)
> +{
> +       void __user *argp = (void __user *) arg;
> +       long ret;
> +
> +       switch (ioctl) {
> +       case KVM_CREATE_VMEM: {
> +               struct kvm_vmem_create create;
> +               if (copy_from_user(&create, argp, sizeof(create))) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +               ret = kvm_create_vmem(&create);
> +               if (copy_to_user(argp, &create, sizeof(create))) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +               break;
> +       }
> +       default:
> +               ret = -EINVAL;
> +               break;
> +       }
> +       return ret;
> +}
> +
> +static int kvm_vmem_dev_release(struct inode *inode, struct file *filp)
> +{
> +       return 0;
> +}
> +
> +static struct file_operations kvm_vmem_dev_fops = {
> +       .release = kvm_vmem_dev_release,
> +       .unlocked_ioctl = kvm_vmem_dev_ioctl,
> +};
> +
> +long kvm_dev_ioctl_create_vmem_dev(void)
> +{
> +       return anon_inode_getfd("kvm-vmem-dev", &kvm_vmem_dev_fops,
> +                               NULL, O_RDWR);
> +}
> diff --git a/virt/kvm/vmem.h b/virt/kvm/vmem.h
> new file mode 100644
> index 0000000..bc7e8cf
> --- /dev/null
> +++ b/virt/kvm/vmem.h
> @@ -0,0 +1,68 @@
> +/*
> + * KVM post copy vmem
> + *
> + * Copyright (c) 2011,
> + * National Institute of Advanced Industrial Science and Technology
> + *
> + * https://sites.google.com/site/grivonhome/quick-kvm-migration
> + * Author: Isaku Yamahata <yamahata at valinux co jp>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.

Old address also here.

> + */
> +
> +#ifndef __KVM_VMEM_H__
> +#define __KVM_VMEM_H__
> +
> +struct kvm_vmem_page_req_list {
> +       struct list_head list;
> +       pgoff_t pgoff;
> +};
> +
> +struct kvm_vmem {
> +       loff_t size;
> +       pgoff_t pgoff_end;
> +       spinlock_t lock;
> +
> +       wait_queue_head_t req_wait;
> +
> +       int async_req_max;
> +       int async_req_nr;
> +       pgoff_t *async_req;
> +
> +       int sync_req_max;

'int' between pointers would mean 4 bytes of structure padding on 64 bit hosts.

> +       unsigned long *sync_req_bitmap;
> +       unsigned long *sync_wait_bitmap;
> +       pgoff_t *sync_req;
> +       wait_queue_head_t *page_wait;
> +
> +       int req_list_nr;
> +       struct list_head req_list;
> +       wait_queue_head_t req_list_wait;
> +
> +       unsigned long *cached;
> +       unsigned long *faulted;
> +
> +       bool mmapped;
> +       unsigned long vm_start;
> +       unsigned int vma_nr;
> +       struct task_struct *task;
> +
> +       wait_queue_head_t ready_wait;
> +       bool ready;
> +
> +       struct file *shmem_filp;
> +       struct vm_area_struct *vma;
> +};
> +
> +#endif /* __KVM_VMEM_H__ */
> --
> 1.7.1.1
>
>
> --
> yamahata
>
>
Avi Kivity Aug. 15, 2011, 7:29 p.m. UTC | #3
On 08/12/2011 04:07 AM, Isaku Yamahata wrote:
> This is a character device to hook page access.
> The page fault in the area is reported to another user process by
> this chardriver. Then, the process fills the page contents and
> resolves the page fault.

Have you considered CUSE (character device in userspace, fs/fuse/cuse.c)?

> index 55f5afb..623109e 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo {
>   #define KVM_CAP_PPC_SMT 64
>   #define KVM_CAP_PPC_RMA	65
>   #define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
> +#define KVM_CAP_POST_COPY_MEMORY 67
>
>   #ifdef KVM_CAP_IRQ_ROUTING
>
> @@ -760,6 +761,50 @@ struct kvm_clock_data {
>   /* Available with KVM_CAP_RMA */
>   #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
>
> +struct kvm_vmem_create {
> +	__u64 size;	/* in bytes */
> +	__s32 vmem_fd;
> +	__s32 shmem_fd;
> +};

Should really be outside kvm.h (and virt/kvm), since it's not kvm specific.

> +
> +struct kvm_vmem_page_request {
> +	__u32 nr;
> +	__u64 __user *pgoffs;
> +};
> +
> +struct kvm_vmem_page_cached {
> +	__u32 nr;
> +	__u64 __user *pgoffs;
> +};
> +
> +struct kvm_vmem_page_range {
> +	__u64 pgoff;
> +	__u64 nr_pages;
> +};
> +
> +struct kvm_vmem_make_pages_present {
> +	__u32 nr;
> +	struct kvm_vmem_page_range __user *ranges;
> +};

This is madvise(MADV_WILLNEED), is it not?
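
For comparison, a minimal sketch of what the existing interface looks like
when the mapping process itself hints a range in (the ioctl above is meant
to be driven by a separate process instead, as discussed below):

#include <sys/mman.h>

/* Sketch only: ask the kernel to fault in part of an existing mapping. */
static int prefault_hint(void *ram, size_t page_size,
			 unsigned long pgoff, unsigned long nr_pages)
{
	return madvise((char *)ram + pgoff * page_size,
		       nr_pages * page_size, MADV_WILLNEED);
}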

> +
> +/* Available with KVM_CAP_POST_COPY_MEMORY */
> +#define KVM_CREATE_VMEM_DEV       _IO(KVMIO,  0xb0)
> +
> +/* ioctl for vmem_dev fd */
> +#define KVM_CREATE_VMEM		  _IOR(KVMIO, 0xb1, __u32)
> +
> +/* ioctl for vmem fd */
> +#define KVM_VMEM_WAIT_READY	  _IO(KVMIO,  0xb2)
> +#define KVM_VMEM_READY		  _IO(KVMIO,  0xb3)
> +#define KVM_VMEM_GET_PAGE_REQUEST \
> +	_IOWR(KVMIO, 0xb4, struct kvm_vmem_page_request)
> +#define KVM_VMEM_MARK_PAGE_CACHED \
> +	_IOW(KVMIO,  0xb5, struct kvm_vmem_page_cached)
> +#define KVM_VMEM_MAKE_PAGES_PRESENT \
> +	_IOW(KVMIO,  0xb6, struct kvm_vmem_make_pages_present)
> +#define KVM_VMEM_MAKE_VMA_ANONYMOUS _IO(KVMIO, 0xb7)

Can you explain these in some more detail?
Isaku Yamahata Aug. 16, 2011, 1:42 a.m. UTC | #4
On Mon, Aug 15, 2011 at 12:29:37PM -0700, Avi Kivity wrote:
> On 08/12/2011 04:07 AM, Isaku Yamahata wrote:
>> This is a character device to hook page access.
>> The page fault in the area is reported to another user process by
>> this chardriver. Then, the process fills the page contents and
>> resolves the page fault.
>
> Have you considered CUSE (character device in userspace, fs/fuse/cuse.c)?

Looking at dev.c and cuse.c, CUSE doesn't seem to support mmap or a
fault handler.

>
>> index 55f5afb..623109e 100644
>> --- a/include/linux/kvm.h
>> +++ b/include/linux/kvm.h
>> @@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo {
>>   #define KVM_CAP_PPC_SMT 64
>>   #define KVM_CAP_PPC_RMA	65
>>   #define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
>> +#define KVM_CAP_POST_COPY_MEMORY 67
>>
>>   #ifdef KVM_CAP_IRQ_ROUTING
>>
>> @@ -760,6 +761,50 @@ struct kvm_clock_data {
>>   /* Available with KVM_CAP_RMA */
>>   #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
>>
>> +struct kvm_vmem_create {
>> +	__u64 size;	/* in bytes */
>> +	__s32 vmem_fd;
>> +	__s32 shmem_fd;
>> +};
>
> Should really be outside kvm.h (and virt/kvm), since it's not kvm specific.

Okay. I'll un-kvm it.

>> +
>> +struct kvm_vmem_page_request {
>> +	__u32 nr;
>> +	__u64 __user *pgoffs;
>> +};
>> +
>> +struct kvm_vmem_page_cached {
>> +	__u32 nr;
>> +	__u64 __user *pgoffs;
>> +};
>> +
>> +struct kvm_vmem_page_range {
>> +	__u64 pgoff;
>> +	__u64 nr_pages;
>> +};
>> +
>> +struct kvm_vmem_make_pages_present {
>> +	__u32 nr;
>> +	struct kvm_vmem_page_range __user *ranges;
>> +};
>
> This is madvise(MADV_WILLNEED), is it not?

Another process, not the qemu process, issues it,
and it makes the pages present in the qemu process's address space.


>> +
>> +/* Available with KVM_CAP_POST_COPY_MEMORY */
>> +#define KVM_CREATE_VMEM_DEV       _IO(KVMIO,  0xb0)
>> +
>> +/* ioctl for vmem_dev fd */
>> +#define KVM_CREATE_VMEM		  _IOR(KVMIO, 0xb1, __u32)
>> +
>> +/* ioctl for vmem fd */
>> +#define KVM_VMEM_WAIT_READY	  _IO(KVMIO,  0xb2)
>> +#define KVM_VMEM_READY		  _IO(KVMIO,  0xb3)
>> +#define KVM_VMEM_GET_PAGE_REQUEST \
>> +	_IOWR(KVMIO, 0xb4, struct kvm_vmem_page_request)
>> +#define KVM_VMEM_MARK_PAGE_CACHED \
>> +	_IOW(KVMIO,  0xb5, struct kvm_vmem_page_cached)
>> +#define KVM_VMEM_MAKE_PAGES_PRESENT \
>> +	_IOW(KVMIO,  0xb6, struct kvm_vmem_make_pages_present)
>> +#define KVM_VMEM_MAKE_VMA_ANONYMOUS _IO(KVMIO, 0xb7)
>
> Can you explain these in some more detail?


KVM_CREATE_VMEM_DEV: create vmem-dev device from the kvm device
                     for qemu
KVM_CREATE_VMEM: create vmem device from the vmem-dev device.
                 (note: qemu creates more than one memory region.)


KVM_VMEM_WAIT_READY: wait for KVM_VMEM_READY
                     for qemu
KVM_VMEM_READY: unblock KVM_VMEM_WAIT_READY
                for daemon uses
These are for qemu and the daemon to synchronise before entering the postcopy stage.


KVM_VMEM_GET_PAGE_REQUEST: retrieve page faults of the qemu process
KVM_VMEM_MARK_PAGE_CACHED: mark the specified pages pulled from the source
                           for daemon
KVM_VMEM_MAKE_PAGES_PRESENT: make the specified pages present in qemu
                             virtual address space
                             for daemon uses
KVM_VMEM_MAKE_VMA_ANONYMOUS: make the specified vma in the qemu process
                             anonymous
			     I'm not sure whether this can be implemented
                             or not.
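
For reference, the daemon-side use of KVM_VMEM_GET_PAGE_REQUEST and
KVM_VMEM_MARK_PAGE_CACHED boils down to a loop like the one in the sample
program posted earlier in this thread.  A condensed sketch (error handling
trimmed; fill_page() is a placeholder for whatever pulls the page contents
from the source host):

static void serve_pages(int vmem_fd, void *shmem, size_t page_size,
			__u32 nr_pages, __u64 *pgoffs)
{
	struct kvm_vmem_page_request req = { .pgoffs = pgoffs };
	struct kvm_vmem_page_cached cached = { .pgoffs = pgoffs };
	__u32 i;

	for (;;) {
		req.nr = nr_pages;
		if (ioctl(vmem_fd, KVM_VMEM_GET_PAGE_REQUEST, &req) < 0)
			break;
		for (i = 0; i < req.nr; i++)
			/* write the pulled page into
			 * shmem + req.pgoffs[i] * page_size */
			fill_page(shmem, page_size, req.pgoffs[i]);
		cached.nr = req.nr;
		if (ioctl(vmem_fd, KVM_VMEM_MARK_PAGE_CACHED, &cached) < 0)
			break;
	}
}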

I think the following workflow on the destination helps.

        qemu on the destination
              |
              V
        open(/dev/kvm)
              |
              V
        KVM_CREATE_VMEM_DEV
              |
              V
        Here we have two file descriptors to
        vmem device and shmem file
              |
              |
              |                                  daemon on the destination
              V                                  
        fork()---------------------------------------,
              |                                      |
              V                                      |
        close(socket)                                V
        close(shmem)                              mmap(shmem file)
              |                                      |
              V                                      V
        mmap(vmem device) for guest RAM           close(shmem file)
              |                                      |
              V                                      |
        KVM_VMEM_WAIT_READY <---------------------KVM_VMEM_READY
              |                                      |
              V                                      |
        close(vmem device)                        Here the daemon takes over
              |                                      ownership of the socket
        entering post copy stage                  to the source
        start guest execution                        |
              |                                      |
              V                                      V
        access guest RAM                          KVM_VMEM_GET_PAGE_REQUEST
              |                                      |
              V                                      V
        page fault ------------------------------>page offset is returned
        block                                        |
                                                     V
                                                  pull page from the source
                                                  write the page contents
                                                  to the shmem.
                                                     |
                                                     V
        unblock     <-----------------------------KVM_VMEM_MARK_PAGE_CACHED
        the fault handler returns the page
        page fault is resolved
              |
              |                                   pages can be pulled
              |                                   backgroundly
              |                                      |
              |                                      V
              |                                   KVM_VMEM_MARK_PAGE_CACHED 
              |                                      |
              V                                      V
        The specified pages<----------------------KVM_VMEM_MAKE_PAGES_PRESENT
        are made present                             |
        so future page fault is avoided.             |
              |                                      |
              V                                      V

                 all the pages are pulled from the source

              |                                      |
              V                                      V
        the vma becomes anonymous<----------------KVM_VMEM_MAKE_VMA_ANONYMOUS
       (note: I'm not sure if this can be implemented or not)
              |                                      |
              V                                      V
        migration completes                        exit()


thanks,
--
yamahata
Avi Kivity Aug. 16, 2011, 1:40 p.m. UTC | #5
On 08/15/2011 06:42 PM, Isaku Yamahata wrote:
> On Mon, Aug 15, 2011 at 12:29:37PM -0700, Avi Kivity wrote:
> >  On 08/12/2011 04:07 AM, Isaku Yamahata wrote:
> >>  This is a character device to hook page access.
> >>  The page fault in the area is reported to another user process by
> >>  this chardriver. Then, the process fills the page contents and
> >>  resolves the page fault.
> >
> >  Have you considered CUSE (character device in userspace, fs/fuse/cuse.c)?
>
> By looking at dev.c and cuse.c, it doesn't seem to support mmap and
> fault handler.

If performance is sufficient, this would be the preferred path.  Enhance 
an existing API which can be useful to others, rather than add a new one.

> >>  +
> >>  +struct kvm_vmem_make_pages_present {
> >>  +	__u32 nr;
> >>  +	struct kvm_vmem_page_range __user *ranges;
> >>  +};
> >
> >  This is madvise(MADV_WILLNEED), is it not?
>
> Another process, not the qemu process, issues it,
> and it makes the pages present in the qemu process address space.

That process just issues these calls in a loop until all memory is
present, yes?  It seems those few lines could be easily added to qemu.
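
Something like this (just a sketch, not tested; chunk_pgoff/chunk_nr_pages
stand for whatever range bookkeeping the background pull already keeps):

	struct kvm_vmem_page_range range = {
		.pgoff = chunk_pgoff,
		.nr_pages = chunk_nr_pages,
	};
	struct kvm_vmem_make_pages_present present = {
		.nr = 1,
		.ranges = &range,
	};

	ioctl(vmem_fd, KVM_VMEM_MAKE_PAGES_PRESENT, &present);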

>
>
> >  Can you explain these in some more detail?
>
>
> KVM_CREATE_VMEM_DEV: create vmem-dev device from kvm device
>                      for qemu
> KVM_CREATE_VMEM: create vmem device from vmem-dev device.
>                   (note:qemu creates more than one memory region.)
>
>
> KVM_VMEM_WAIT_READY: wait for KVM_VMEM_READY
>                       for qemu
> KVM_VMEM_READY: unblock KVM_VMEM_WAIT_READY
>                  for daemon uses
> These are for qemu and daemon to synchronise to enter postcopy stage.

These are eliminated if we fold the daemon into qemu.  Also, this could
just be a semaphore or other synchronization mechanism.

>
> KVM_VMEM_GET_PAGE_REQUEST: retrieve page fault of qemu process

Equivalent to the fault callback of CUSE (if we add it)?

> KVM_VMEM_MARK_PAGE_CACHED: mark the specified pages pulled from the source
>                             for daemon

Equivalent to returning from that callback with a new page?

> KVM_VMEM_MAKE_PAGES_PRESENT: make the specified pages present in qemu
>                               virtual address space
>                               for daemon uses
> KVM_VMEM_MAKE_VMA_ANONYMOUS: make the specified vma in the qemu process
>                               anonymous
> 			     I'm not sure whether this can be implemented
>                               or not.
>
> I think the following work flow on the destination helps.
>
>          qemu on the destination
>                |
>                V
>          open(/dev/kvm)
>                |
>                V
>          KVM_CREATE_VMEM_DEV
>                |
>                V
>          Here we have two file descriptors to
>          vmem device and shmem file
>                |
>                |
>                |                                  daemon on the destination
>                V
>          fork()---------------------------------------,
>                |                                      |
>                V                                      |
>          close(socket)                                V
>          close(shmem)                              mmap(shmem file)
>                |                                      |
>                V                                      V
>          mmap(vmem device) for guest RAM           close(shmem file)
>                |                                      |
>                V                                      |
>          KVM_VMEM_READY_WAIT<---------------------KVM_VMEM_READY
>                |                                      |
>                V                                      |
>          close(vmem device)                        Here the daemon takes over
>                |                                   the owner of the socket
>          entering post copy stage                  to the source
>          start guest execution                        |
>                |                                      |
>                V                                      V
>          access guest RAM                          KVM_VMEM_GET_PAGE_REQUEST
>                |                                      |
>                V                                      V
>          page fault ------------------------------>page offset is returned
>          block                                        |
>                                                       V
>                                                    pull page from the source
>                                                    write the page contents
>                                                    to the shmem.
>                                                       |
>                                                       V
>          unblock<-----------------------------KVM_VMEM_MARK_PAGE_CACHED
>          the fault handler returns the page
>          page fault is resolved
>                |
>                |                                   pages can be pulled
>                |                                   backgroundly
>                |                                      |
>                |                                      V
>                |                                   KVM_VMEM_MARK_PAGE_CACHED
>                |                                      |
>                V                                      V
>          The specified pages<----------------------KVM_VMEM_MAKE_PAGES_PRESENT
>          are made present                             |
>          so future page fault is avoided.             |
>                |                                      |
>                V                                      V
>
>                   all the pages are pulled from the source
>
>                |                                      |
>                V                                      V
>          the vma becomes anonymous<----------------KVM_VMEM_MAKE_VMA_ANONYMOUS
>         (note: I'm not sure if this can be implemented or not)
>                |                                      |
>                V                                      V
>          migration completes                        exit()
>

Yes, thanks, this was very helpful.
diff mbox

Patch

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 0a09b58..dcbd52e 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -29,6 +29,7 @@  config KVM
 	select HAVE_KVM_EVENTFD
 	select KVM_APIC_ARCHITECTURE
 	select KVM_ASYNC_PF
+	select KVM_VMEM
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
 	select TASKSTATS
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..6125f4c 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -10,6 +10,7 @@  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				assigned-dev.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
+kvm-$(CONFIG_KVM_VMEM)	+= $(addprefix ../../../virt/kvm/, vmem.o)
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
 			   i8254.o timer.o
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 55f5afb..623109e 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -554,6 +554,7 @@  struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_SMT 64
 #define KVM_CAP_PPC_RMA	65
 #define KVM_CAP_MAX_VCPUS 66       /* returns max vcpus per vm */
+#define KVM_CAP_POST_COPY_MEMORY 67
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -760,6 +761,50 @@  struct kvm_clock_data {
 /* Available with KVM_CAP_RMA */
 #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
 
+struct kvm_vmem_create {
+	__u64 size;	/* in bytes */
+	__s32 vmem_fd;
+	__s32 shmem_fd;
+};
+
+struct kvm_vmem_page_request {
+	__u32 nr;
+	__u64 __user *pgoffs;
+};
+
+struct kvm_vmem_page_cached {
+	__u32 nr;
+	__u64 __user *pgoffs;
+};
+
+struct kvm_vmem_page_range {
+	__u64 pgoff;
+	__u64 nr_pages;
+};
+
+struct kvm_vmem_make_pages_present {
+	__u32 nr;
+	struct kvm_vmem_page_range __user *ranges;
+};
+
+/* Available with KVM_CAP_POST_COPY_MEMORY */
+#define KVM_CREATE_VMEM_DEV       _IO(KVMIO,  0xb0)
+
+/* ioctl for vmem_dev fd */
+#define KVM_CREATE_VMEM		  _IOR(KVMIO, 0xb1, __u32)
+
+/* ioctl for vmem fd */
+#define KVM_VMEM_WAIT_READY	  _IO(KVMIO,  0xb2)
+#define KVM_VMEM_READY		  _IO(KVMIO,  0xb3)
+#define KVM_VMEM_GET_PAGE_REQUEST \
+	_IOWR(KVMIO, 0xb4, struct kvm_vmem_page_request)
+#define KVM_VMEM_MARK_PAGE_CACHED \
+	_IOW(KVMIO,  0xb5, struct kvm_vmem_page_cached)
+#define KVM_VMEM_MAKE_PAGES_PRESENT \
+	_IOW(KVMIO,  0xb6, struct kvm_vmem_make_pages_present)
+#define KVM_VMEM_MAKE_VMA_ANONYMOUS _IO(KVMIO, 0xb7)
+
+
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
 struct kvm_assigned_pci_dev {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ff4d406..8b3dafa 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -222,6 +222,8 @@  struct kvm_irq_routing_table {};
 
 #endif
 
+long kvm_dev_ioctl_create_vmem_dev(void);
+
 struct kvm_memslots {
 	int nmemslots;
 	u64 generation;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e013b8e..7f3fc4e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2838,6 +2838,7 @@  int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(mem_cgroup_cache_charge);
 
 /*
  * While swap-in, try_charge -> commit or cancel, the page is locked.
diff --git a/mm/shmem.c b/mm/shmem.c
index fcedf54..ae7d61f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3035,6 +3035,7 @@  int shmem_zero_setup(struct vm_area_struct *vma)
 	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(shmem_zero_setup);
 
 /**
  * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index f63ccb0..d3040ea 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -18,3 +18,6 @@  config KVM_MMIO
 
 config KVM_ASYNC_PF
        bool
+
+config KVM_VMEM
+       bool
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index aefdda3..9e47e20 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2184,6 +2184,7 @@  static long kvm_dev_ioctl_check_extension_generic(long arg)
 	case KVM_CAP_SET_BOOT_CPU_ID:
 #endif
 	case KVM_CAP_INTERNAL_ERROR_DATA:
+	case KVM_CAP_POST_COPY_MEMORY:
 		return 1;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 	case KVM_CAP_IRQ_ROUTING:
@@ -2233,6 +2234,11 @@  static long kvm_dev_ioctl(struct file *filp,
 	case KVM_TRACE_DISABLE:
 		r = -EOPNOTSUPP;
 		break;
+#ifdef CONFIG_KVM_VMEM
+	case KVM_CREATE_VMEM_DEV:
+		r = kvm_dev_ioctl_create_vmem_dev();
+		break;
+#endif
 	default:
 		return kvm_arch_dev_ioctl(filp, ioctl, arg);
 	}
diff --git a/virt/kvm/vmem.c b/virt/kvm/vmem.c
new file mode 100644
index 0000000..b413663
--- /dev/null
+++ b/virt/kvm/vmem.c
@@ -0,0 +1,847 @@ 
+/*
+ * KVM post copy vmem
+ *
+ * Copyright (c) 2011,
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/pagemap.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include "vmem.h"
+
+static void kvm_vmem_release_fake_vmf(int ret, struct vm_fault *fake_vmf)
+{
+	if (ret & VM_FAULT_LOCKED) {
+		unlock_page(fake_vmf->page);
+	}
+	page_cache_release(fake_vmf->page);
+}
+
+static int kvm_vmem_minor_fault(struct kvm_vmem *vmem,
+				struct vm_area_struct *vma,
+				struct vm_fault *vmf)
+{
+	struct vm_fault fake_vmf;
+	int ret;
+	struct page *page;
+
+	BUG_ON(!test_bit(vmf->pgoff, vmem->cached));
+	fake_vmf = *vmf;
+	fake_vmf.page = NULL;
+	ret = vmem->vma->vm_ops->fault(vmem->vma, &fake_vmf);
+	if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))
+		return ret;
+
+	/*
+	 * TODO: pull out fake_vmf->page from shmem file and donate it
+	 * to this vma resolving the page fault.
+	 * vmf->page = fake_vmf->page;
+	 */
+
+	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+	if (!page)
+		return VM_FAULT_OOM;
+	if (mem_cgroup_cache_charge(page, vma->vm_mm, GFP_KERNEL)) {
+		kvm_vmem_release_fake_vmf(ret, &fake_vmf);
+		page_cache_release(page);
+		return VM_FAULT_OOM;
+	}
+
+	copy_highpage(page, fake_vmf.page);
+	kvm_vmem_release_fake_vmf(ret, &fake_vmf);
+
+	ret |= VM_FAULT_LOCKED;
+	SetPageUptodate(page);
+	vmf->page = page;
+	set_bit(vmf->pgoff, vmem->faulted);
+
+	return ret;
+}
+
+static int kvm_vmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *filp = vma->vm_file;
+	struct kvm_vmem *vmem = filp->private_data;
+
+	if (vmf->pgoff >= vmem->pgoff_end) {
+		return VM_FAULT_SIGBUS;
+	}
+
+	BUG_ON(test_bit(vmf->pgoff, vmem->faulted));
+
+	if (!test_bit(vmf->pgoff, vmem->cached)) {
+		/* major fault */
+		unsigned long bit;
+		DEFINE_WAIT(wait);
+
+		if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
+			/* async page fault */
+			spin_lock(&vmem->lock);
+			if (vmem->async_req_nr < vmem->async_req_max) {
+				vmem->async_req[vmem->async_req_nr] =
+					vmf->pgoff;
+				vmem->async_req_nr++;
+			}
+			spin_unlock(&vmem->lock);
+			wake_up_poll(&vmem->req_wait, POLLIN);
+
+			if (test_bit(vmf->pgoff, vmem->cached))
+				return kvm_vmem_minor_fault(vmem, vma, vmf);
+			return VM_FAULT_MAJOR | VM_FAULT_RETRY;
+		}
+
+		spin_lock(&vmem->lock);
+		bit = find_first_zero_bit(vmem->sync_wait_bitmap,
+					  vmem->sync_req_max);
+		if (likely(bit < vmem->sync_req_max)) {
+			vmem->sync_req[bit] = vmf->pgoff;
+			prepare_to_wait(&vmem->page_wait[bit], &wait,
+					TASK_UNINTERRUPTIBLE);
+			set_bit(bit, vmem->sync_req_bitmap);
+			set_bit(bit, vmem->sync_wait_bitmap);
+			spin_unlock(&vmem->lock);
+			wake_up_poll(&vmem->req_wait, POLLIN);
+
+			if (!test_bit(vmf->pgoff, vmem->cached))
+				schedule();
+			finish_wait(&vmem->page_wait[bit], &wait);
+			clear_bit(bit, vmem->sync_wait_bitmap);
+		} else {
+			struct kvm_vmem_page_req_list page_req_list = {
+				.pgoff = vmf->pgoff,
+			};
+			vmem->req_list_nr++;
+			list_add_tail(&page_req_list.list, &vmem->req_list);
+			wake_up_poll(&vmem->req_wait, POLLIN);
+			for (;;) {
+				prepare_to_wait(&vmem->req_list_wait, &wait,
+						TASK_UNINTERRUPTIBLE);
+				if (test_bit(vmf->pgoff, vmem->cached)) {
+					vmem->req_list_nr--;
+					break;
+				}
+				spin_unlock(&vmem->lock);
+				schedule();
+				spin_lock(&vmem->lock);
+			}
+			spin_unlock(&vmem->lock);
+			finish_wait(&vmem->req_list_wait, &wait);
+		}
+
+		return kvm_vmem_minor_fault(vmem, vma, vmf) | VM_FAULT_MAJOR;
+	}
+
+	return kvm_vmem_minor_fault(vmem, vma, vmf);
+}
+
+/* for partial munmap */
+static void kvm_vmem_vma_open(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct kvm_vmem *vmem = filp->private_data;
+
+	spin_lock(&vmem->lock);
+	vmem->vma_nr++;
+	spin_unlock(&vmem->lock);
+}
+
+static void kvm_vmem_vma_close(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct kvm_vmem *vmem = filp->private_data;
+	struct task_struct *task = NULL;
+
+	spin_lock(&vmem->lock);
+	vmem->vma_nr--;
+	if (vmem->vma_nr == 0) {
+		task = vmem->task;
+		vmem->task = NULL;
+	}
+	spin_unlock(&vmem->lock);
+
+	if (task)
+		put_task_struct(task);
+}
+
+static const struct vm_operations_struct kvm_vmem_vm_ops = {
+	.open = kvm_vmem_vma_open,
+	.close = kvm_vmem_vma_close,
+	.fault = kvm_vmem_fault,
+};
+
+static int kvm_vmem_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct kvm_vmem *vmem = filp->private_data;
+	int error;
+
+	/* allow mmap() only once */
+	spin_lock(&vmem->lock);
+	if (vmem->mmapped) {
+		error = -EBUSY;
+		goto out;
+	}
+	if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff >
+	    vmem->pgoff_end) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	vmem->mmapped = true;
+	vmem->vma_nr = 1;
+	vmem->vm_start = vma->vm_start;
+	get_task_struct(current);
+	vmem->task = current;
+	spin_unlock(&vmem->lock);
+
+	vma->vm_ops = &kvm_vmem_vm_ops;
+	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+	vma->vm_flags &= ~VM_SHARED;
+	return 0;
+
+out:
+	spin_unlock(&vmem->lock);
+	return error;
+}
+
+static bool kvm_vmem_req_pending(struct kvm_vmem* vmem)
+{
+	return !list_empty(&vmem->req_list) ||
+		!bitmap_empty(vmem->sync_req_bitmap, vmem->sync_req_max) ||
+		(vmem->async_req_nr > 0);
+}
+
+static unsigned int kvm_vmem_poll(struct file* filp, poll_table *wait)
+{
+	struct kvm_vmem *vmem = filp->private_data;
+	unsigned int events = 0;
+
+	poll_wait(filp, &vmem->req_wait, wait);
+
+	spin_lock(&vmem->lock);
+	if (kvm_vmem_req_pending(vmem))
+		events |= POLLIN;
+	spin_unlock(&vmem->lock);
+
+	return events;
+}
+
+/*
+ * return value
+ * true: finished
+ * false: more request
+ */
+static bool kvm_vmem_copy_page_request(struct kvm_vmem *vmem,
+				       pgoff_t *pgoffs, int req_max,
+				       int *req_nr)
+{
+	struct kvm_vmem_page_req_list *req_list;
+	struct kvm_vmem_page_req_list *tmp;
+
+	unsigned long bit;
+
+	*req_nr = 0;
+	list_for_each_entry_safe(req_list, tmp, &vmem->req_list, list) {
+		list_del(&req_list->list);
+		pgoffs[*req_nr] = req_list->pgoff;
+		(*req_nr)++;
+		if (*req_nr >= req_max)
+			return false;
+	}
+
+	bit = 0;
+	for (;;) {
+		bit = find_next_bit(vmem->sync_req_bitmap, vmem->sync_req_max,
+				    bit);
+		if (bit >= vmem->sync_req_max)
+			break;
+		pgoffs[*req_nr] = vmem->sync_req[bit];
+		(*req_nr)++;
+		clear_bit(bit, vmem->sync_req_bitmap);
+		if (*req_nr >= req_max)
+			return false;
+		bit++;
+	}
+
+	if (vmem->async_req_nr > 0) {
+		int nr = min(req_max - *req_nr, vmem->async_req_nr);
+		memcpy(pgoffs + *req_nr, vmem->async_req,
+		       sizeof(*vmem->async_req) * nr);
+		vmem->async_req_nr -= nr;
+		*req_nr += nr;
+	memmove(vmem->async_req, vmem->async_req + nr,
+			vmem->async_req_nr * sizeof(*vmem->async_req));
+
+	}
+	return vmem->async_req_nr == 0;
+}
+
+static int kvm_vmem_get_page_request(struct kvm_vmem *vmem,
+				     struct kvm_vmem_page_request *page_req)
+{
+	DEFINE_WAIT(wait);
+#define REQ_MAX	((__u32)32)
+	pgoff_t pgoffs[REQ_MAX];
+	__u32 req_copied = 0;
+	int ret = 0;
+
+	spin_lock(&vmem->lock);
+	for (;;) {
+		prepare_to_wait(&vmem->req_wait, &wait, TASK_INTERRUPTIBLE);
+		if (kvm_vmem_req_pending(vmem)) {
+			break;
+		}
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		spin_unlock(&vmem->lock);
+		schedule();
+		spin_lock(&vmem->lock);
+	}
+	finish_wait(&vmem->req_wait, &wait);
+	if (ret)
+		goto out_unlock;
+
+	while (req_copied < page_req->nr) {
+		int req_max;
+		int req_nr;
+		bool finished;
+		req_max = min(page_req->nr - req_copied, REQ_MAX);
+		finished = kvm_vmem_copy_page_request(vmem, pgoffs, req_max,
+						      &req_nr);
+
+		spin_unlock(&vmem->lock);
+
+		if (req_nr > 0) {
+			ret = 0;
+			if (copy_to_user(page_req->pgoffs + req_copied, pgoffs,
+					 sizeof(*pgoffs) * req_nr)) {
+				ret = -EFAULT;
+				goto out;
+			}
+		}
+		req_copied += req_nr;
+		if (finished)
+			goto out;
+
+		spin_lock(&vmem->lock);
+	}
+
+out_unlock:
+	spin_unlock(&vmem->lock);
+out:
+	page_req->nr = req_copied;
+	return ret;
+}
+
+static int kvm_vmem_mark_page_cached(struct kvm_vmem *vmem,
+				     struct kvm_vmem_page_cached *page_cached)
+{
+	int ret = 0;
+#define PG_MAX	((__u32)32)
+	__u64 pgoffs[PG_MAX];
+	__u32 nr;
+	unsigned long bit;
+	bool wake_up_list = false;
+
+	nr = 0;
+	while (nr < page_cached->nr) {
+		__u32 todo = min(PG_MAX, (page_cached->nr - nr));
+		int i;
+
+		if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
+				   sizeof(*pgoffs) * todo)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		for (i = 0; i < todo; ++i) {
+			if (pgoffs[i] >= vmem->pgoff_end) {
+				ret = -EINVAL;
+				goto out;
+			}
+			set_bit(pgoffs[i], vmem->cached);
+		}
+		nr += todo;
+	}
+
+	spin_lock(&vmem->lock);
+	bit = 0;
+	for (;;) {
+		bit = find_next_bit(vmem->sync_wait_bitmap, vmem->sync_req_max,
+				    bit);
+		if (bit >= vmem->sync_req_max)
+			break;
+		if (test_bit(vmem->sync_req[bit], vmem->cached))
+			wake_up(&vmem->page_wait[bit]);
+		bit++;
+	}
+
+	if (vmem->req_list_nr > 0)
+		wake_up_list = true;
+	spin_unlock(&vmem->lock);
+
+	if (wake_up_list)
+		wake_up_all(&vmem->req_list_wait);
+
+out:
+	return ret;
+}
+
+static bool kvm_vmem_is_vmem_vma(const struct kvm_vmem *vmem,
+				 const struct vm_area_struct *vma)
+{
+	return vma->vm_file && vma->vm_file->private_data == vmem;
+}
+
+static void kvm_vmem_make_pages_present_entry(struct kvm_vmem *vmem,
+					      struct kvm_vmem_page_range *range,
+					      struct task_struct *task,
+					      struct mm_struct *mm,
+					      unsigned long vm_start)
+{
+	unsigned long pgoff = range->pgoff;
+	unsigned long range_end = range->pgoff + range->nr_pages;
+
+	down_read(&mm->mmap_sem);
+
+	while (pgoff < range->pgoff + range->nr_pages) {
+		unsigned long pgoff_end;
+		struct vm_area_struct *vma;
+		unsigned long saddr;
+		unsigned long eaddr;
+
+		/* search unfaulted range */
+		spin_lock(&vmem->lock);
+		pgoff = find_next_zero_bit(vmem->faulted, range_end, pgoff);
+		if (pgoff >= range_end) {
+			spin_unlock(&vmem->lock);
+			break;
+		}
+		pgoff_end = find_next_bit(vmem->faulted, range_end, pgoff);
+		spin_unlock(&vmem->lock);
+
+		saddr = vm_start + (pgoff << PAGE_SHIFT);
+		eaddr = vm_start + (pgoff_end << PAGE_SHIFT);
+		vma = find_vma(mm, saddr);
+		if (vma == NULL) {
+			break;
+		}
+		if (eaddr < vma->vm_start) {
+			pgoff = (vma->vm_start - vm_start) >> PAGE_SHIFT;
+			continue;
+		}
+
+		if (kvm_vmem_is_vmem_vma(vmem, vma)) {
+			unsigned long start = max(vma->vm_start, saddr);
+			unsigned long end = min(vma->vm_end, eaddr);
+			int nr_pages = (end - start) >> PAGE_SHIFT;
+			get_user_pages(task, mm, start, nr_pages,
+				       1, 1, NULL, NULL);
+			pgoff = (end - vm_start) >> PAGE_SHIFT;
+		} else {
+			pgoff = (vma->vm_end - vm_start) >> PAGE_SHIFT;
+		}
+	}
+
+	up_read(&mm->mmap_sem);
+}
+
+static int kvm_vmem_make_pages_present(
+	struct kvm_vmem *vmem,
+	struct kvm_vmem_make_pages_present *pages_present)
+{
+	struct task_struct *task;
+	struct mm_struct *mm;
+	pgoff_t pgoff_end;
+	unsigned long vm_start;
+	unsigned long vm_eaddr;
+
+#define NUM_ENTRIES	((__u32)32)
+	struct kvm_vmem_page_range kranges[NUM_ENTRIES];
+	__u32 nr = 0;
+	int ret;
+
+	spin_lock(&vmem->lock);
+	task = vmem->task;
+	pgoff_end = vmem->pgoff_end;
+	vm_start = vmem->vm_start;
+	vm_eaddr = vm_start + vmem->size;
+	spin_unlock(&vmem->lock);
+	if (task == NULL)
+		return 0;
+	mm = get_task_mm(task);
+	if (mm == NULL)
+		return 0;
+
+	ret = 0;
+	while (nr < pages_present->nr) {
+		int nr_ranges = min(NUM_ENTRIES, pages_present->nr - nr);
+		int i;
+
+		if (copy_from_user(&kranges, pages_present->ranges + nr,
+				   sizeof(kranges[0]) * nr_ranges)) {
+			ret = -EFAULT;
+			break;
+		}
+		for (i = 0; i < nr_ranges; ++i) {
+			struct kvm_vmem_page_range *range = &kranges[i];
+			if (range->pgoff >= pgoff_end ||
+			    range->nr_pages >= pgoff_end ||
+			    range->pgoff + range->nr_pages >= pgoff_end) {
+				ret = -EINVAL;
+				break;
+			}
+			kvm_vmem_make_pages_present_entry(vmem, range,
+							  task, mm, vm_start);
+		}
+		nr += nr_ranges;
+	}
+
+	mmput(mm);
+	return ret;
+}
+
+static int kvm_vmem_make_vma_anonymous(struct kvm_vmem *vmem)
+{
+#if 1
+	return -ENOSYS;
+#else
+	unsigned long saddr;
+	unsigned long eaddr;
+	unsigned long addr;
+	unsigned long bit;
+	struct task_struct *task;
+	struct mm_struct *mm;
+
+	spin_lock(&vmem->lock);
+	task = vmem->task;
+	saddr = vmem->vm_start;
+	eaddr = saddr + vmem->size;
+	bit = find_first_zero_bit(vmem->faulted, vmem->pgoff_end);
+	if (bit < vmem->pgoff_end) {
+		spin_unlock(&vmem->lock);
+		return -EBUSY;
+	}
+	spin_unlock(&vmem->lock);
+	if (task == NULL)
+		return 0;
+	mm = get_task_mm(task);
+	if (mm == NULL)
+		return 0;
+
+	addr = saddr;
+	down_write(&mm->mmap_sem);
+	while (addr < eaddr) {
+		struct vm_area_struct *vma;
+		vma = find_vma(mm, addr);
+		if (kvm_vmem_is_vmem_vma(vmem, vma)) {
+			/* XXX incorrect. race/locking and more fix up */
+			struct file *filp = vma->vm_file;
+			vma->vm_ops->close(vma);
+			vma->vm_ops = NULL;
+			vma->vm_file = NULL;
+			/* vma->vm_flags */
+			fput(filp);
+		}
+		addr = vma->vm_end;
+	}
+	up_write(&mm->mmap_sem);
+
+	mmput(mm);
+	return 0;
+#endif
+}
+
+static void kvm_vmem_ready(struct kvm_vmem *vmem)
+{
+	spin_lock(&vmem->lock);
+	vmem->ready = true;
+	spin_unlock(&vmem->lock);
+	wake_up_interruptible(&vmem->ready_wait);
+}
+
+static int kvm_vmem_wait_ready(struct kvm_vmem *vmem)
+{
+	int ret = 0;
+	DEFINE_WAIT(wait);
+
+	spin_lock(&vmem->lock);
+	for (;;) {
+		prepare_to_wait(&vmem->ready_wait, &wait, TASK_INTERRUPTIBLE);
+		if (vmem->ready) {
+			break;
+		}
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		spin_unlock(&vmem->lock);
+		schedule();
+		spin_lock(&vmem->lock);
+	}
+	spin_unlock(&vmem->lock);
+	finish_wait(&vmem->ready_wait, &wait);
+	return ret;
+}
+
+static long kvm_vmem_ioctl(struct file *filp, unsigned int ioctl,
+			   unsigned long arg)
+{
+	struct kvm_vmem *vmem = filp->private_data;
+	void __user *argp = (void __user *) arg;
+	long ret = 0;
+
+	switch (ioctl) {
+	case KVM_VMEM_READY:
+		kvm_vmem_ready(vmem);
+		ret = 0;
+		break;
+	case KVM_VMEM_WAIT_READY:
+		ret = kvm_vmem_wait_ready(vmem);
+		break;
+	case KVM_VMEM_GET_PAGE_REQUEST: {
+		struct kvm_vmem_page_request page_request;
+		ret = -EFAULT;
+		if (copy_from_user(&page_request, argp, sizeof(page_request)))
+			break;
+		ret = kvm_vmem_get_page_request(vmem, &page_request);
+		if (ret == 0 &&
+		    copy_to_user(argp +
+				 offsetof(struct kvm_vmem_page_request, nr),
+				 &page_request.nr,
+				 sizeof(page_request.nr))) {
+			ret = -EFAULT;
+			break;
+		}
+		break;
+	}
+	case KVM_VMEM_MARK_PAGE_CACHED: {
+		struct kvm_vmem_page_cached page_cached;
+		ret = -EFAULT;
+		if (copy_from_user(&page_cached, argp, sizeof(page_cached)))
+			break;
+		ret = kvm_vmem_mark_page_cached(vmem, &page_cached);
+		break;
+	}
+	case KVM_VMEM_MAKE_PAGES_PRESENT: {
+		struct kvm_vmem_make_pages_present pages_present;
+		ret = -EFAULT;
+		if (copy_from_user(&pages_present, argp,
+				   sizeof(pages_present)))
+			break;
+		ret = kvm_vmem_make_pages_present(vmem, &pages_present);
+		break;
+	}
+	case KVM_VMEM_MAKE_VMA_ANONYMOUS:
+		ret = kvm_vmem_make_vma_anonymous(vmem);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+static unsigned long kvm_vmem_bitmap_bytes(const struct kvm_vmem *vmem)
+{
+	return round_up(vmem->pgoff_end, BITS_PER_LONG) / 8;
+}
+
+static void kvm_vmem_free(struct kvm_vmem *vmem)
+{
+	if (vmem->task) {
+		put_task_struct(vmem->task);
+		vmem->task = NULL;
+	}
+
+	if (vmem->shmem_filp)
+		fput(vmem->shmem_filp);
+	if (kvm_vmem_bitmap_bytes(vmem) > PAGE_SIZE) {
+		vfree(vmem->cached);
+		vfree(vmem->faulted);
+	} else {
+		kfree(vmem->cached);
+		kfree(vmem->faulted);
+	}
+	kfree(vmem->vma);
+	kfree(vmem->async_req);
+	kfree(vmem->sync_req_bitmap);
+	kfree(vmem->sync_wait_bitmap);
+	kfree(vmem->page_wait);
+	kfree(vmem->sync_req);
+	kfree(vmem);
+}
+
+static int kvm_vmem_release(struct inode *inode, struct file *filp)
+{
+	struct kvm_vmem *vmem = filp->private_data;
+	kvm_vmem_free(vmem);
+	return 0;
+}
+
+static struct file_operations kvm_vmem_fops = {
+	.release	= kvm_vmem_release,
+	.unlocked_ioctl = kvm_vmem_ioctl,
+	.mmap		= kvm_vmem_mmap,
+	.poll		= kvm_vmem_poll,
+	.llseek		= noop_llseek,
+};
+
+static int kvm_create_vmem(struct kvm_vmem_create *create)
+{
+	int error = 0;
+	struct kvm_vmem *vmem = NULL;
+	struct vm_area_struct *vma = NULL;
+	int shmem_fd;
+	unsigned long bitmap_bytes;
+	unsigned long sync_bitmap_bytes;
+	int i;
+
+	vmem = kzalloc(sizeof(*vmem), GFP_KERNEL);
+	vmem->task = NULL;
+	vmem->mmapped = false;
+	spin_lock_init(&vmem->lock);
+	vmem->size = roundup(create->size, PAGE_SIZE);
+	vmem->pgoff_end = vmem->size >> PAGE_SHIFT;
+	init_waitqueue_head(&vmem->req_wait);
+
+	vma = kzalloc(sizeof(*vma), GFP_KERNEL);
+	vma->vm_start = 0;
+	vma->vm_end = vmem->size;
+	/* this shmem file is used as a temporary buffer for pages,
+	   so it's unlikely that many pages exist in this shmem file */
+	vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
+		VM_DONTEXPAND;
+	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+	vma->vm_pgoff = 0;
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
+
+	vmem->vma = vma;
+
+	shmem_fd = get_unused_fd();
+	if (shmem_fd < 0) {
+		error = shmem_fd;
+		goto out;
+	}
+	error = shmem_zero_setup(vma);
+	if (error < 0) {
+		put_unused_fd(shmem_fd);
+		goto out;
+	}
+	vmem->shmem_filp = vma->vm_file;
+	get_file(vmem->shmem_filp);
+	fd_install(shmem_fd, vma->vm_file);
+	create->shmem_fd = shmem_fd;
+
+	create->vmem_fd = anon_inode_getfd("kvm-vmem",
+					   &kvm_vmem_fops, vmem, O_RDWR);
+	if (create->vmem_fd < 0) {
+		error = create->vmem_fd;
+		goto out;
+	}
+
+	bitmap_bytes = kvm_vmem_bitmap_bytes(vmem);
+	if (bitmap_bytes > PAGE_SIZE) {
+		vmem->cached = vzalloc(bitmap_bytes);
+		vmem->faulted = vzalloc(bitmap_bytes);
+	} else {
+		vmem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
+		vmem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
+	}
+
+#define ASYNC_REQ_MAX	(ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
+	vmem->async_req_max = ASYNC_REQ_MAX;
+	vmem->async_req_nr = 0;
+	vmem->async_req = kzalloc(sizeof(*vmem->async_req) *
+				  vmem->async_req_max, GFP_KERNEL);
+
+#define SYNC_REQ_MAX	(KVM_MAX_VCPUS)
+	vmem->sync_req_max = round_up(SYNC_REQ_MAX, BITS_PER_LONG);
+	sync_bitmap_bytes = sizeof(unsigned long) *
+		(vmem->sync_req_max / BITS_PER_LONG);
+	vmem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
+	vmem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
+	vmem->page_wait = kzalloc(sizeof(*vmem->page_wait) *
+				  vmem->sync_req_max, GFP_KERNEL);
+	for (i = 0; i < vmem->sync_req_max; ++i)
+		init_waitqueue_head(&vmem->page_wait[i]);
+	vmem->sync_req = kzalloc(sizeof(*vmem->sync_req) *
+				 vmem->sync_req_max, GFP_KERNEL);
+
+	vmem->req_list_nr = 0;
+	INIT_LIST_HEAD(&vmem->req_list);
+	init_waitqueue_head(&vmem->req_list_wait);
+
+	init_waitqueue_head(&vmem->ready_wait);
+	vmem->ready = false;
+
+	return 0;
+
+ out:
+	kvm_vmem_free(vmem);
+	return error;
+}
+
+static long kvm_vmem_dev_ioctl(struct file *filp, unsigned int ioctl,
+			       unsigned long arg)
+{
+	void __user *argp = (void __user *) arg;
+	long ret;
+
+	switch (ioctl) {
+	case KVM_CREATE_VMEM: {
+		struct kvm_vmem_create create;
+		if (copy_from_user(&create, argp, sizeof(create))) {
+			ret = -EFAULT;
+			break;
+		}
+		ret = kvm_create_vmem(&create);
+		if (copy_to_user(argp, &create, sizeof(create))) {
+			ret = -EFAULT;
+			break;
+		}
+		break;
+	}
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+static int kvm_vmem_dev_release(struct inode *inode, struct file *filp)
+{
+	return 0;
+}
+
+static struct file_operations kvm_vmem_dev_fops = {
+	.release = kvm_vmem_dev_release,
+	.unlocked_ioctl = kvm_vmem_dev_ioctl,
+};
+
+long kvm_dev_ioctl_create_vmem_dev(void)
+{
+	return anon_inode_getfd("kvm-vmem-dev", &kvm_vmem_dev_fops,
+				NULL, O_RDWR);
+}
diff --git a/virt/kvm/vmem.h b/virt/kvm/vmem.h
new file mode 100644
index 0000000..bc7e8cf
--- /dev/null
+++ b/virt/kvm/vmem.h
@@ -0,0 +1,68 @@ 
+/*
+ * KVM post copy vmem
+ *
+ * Copyright (c) 2011,
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#ifndef __KVM_VMEM_H__
+#define __KVM_VMEM_H__
+
+struct kvm_vmem_page_req_list {
+	struct list_head list;
+	pgoff_t pgoff;
+};
+
+struct kvm_vmem {
+	loff_t size;
+	pgoff_t pgoff_end;
+	spinlock_t lock;
+
+	wait_queue_head_t req_wait;
+
+	int async_req_max;
+	int async_req_nr;
+	pgoff_t *async_req;
+
+	int sync_req_max;
+	unsigned long *sync_req_bitmap;
+	unsigned long *sync_wait_bitmap;
+	pgoff_t *sync_req;
+	wait_queue_head_t *page_wait;
+
+	int req_list_nr;
+	struct list_head req_list;
+	wait_queue_head_t req_list_wait;
+
+	unsigned long *cached;
+	unsigned long *faulted;
+
+	bool mmapped;
+	unsigned long vm_start;
+	unsigned int vma_nr;
+	struct task_struct *task;
+
+	wait_queue_head_t ready_wait;
+	bool ready;
+
+	struct file *shmem_filp;
+	struct vm_area_struct *vma;
+};
+
+#endif /* __KVM_VMEM_H__ */