
[2/2] umem: chardevice for kvm postcopy

Message ID 055fb826d9f0d88910f71b1afed0c06e4bfb8a76.1325055066.git.yamahata@valinux.co.jp
State New

Commit Message

Isaku Yamahata Dec. 29, 2011, 1:26 a.m. UTC
This is a character device to hook page access.
A page fault in the mapped area is reported to another user process by
this char driver; that process then fills in the page contents and
resolves the fault.

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
---
 drivers/char/Kconfig  |    9 +
 drivers/char/Makefile |    1 +
 drivers/char/umem.c   |  898 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/umem.h  |   83 +++++
 4 files changed, 991 insertions(+), 0 deletions(-)
 create mode 100644 drivers/char/umem.c
 create mode 100644 include/linux/umem.h

Comments

Avi Kivity Dec. 29, 2011, 11:17 a.m. UTC | #1
On 12/29/2011 03:26 AM, Isaku Yamahata wrote:
> This is a character device to hook page access.
> A page fault in the mapped area is reported to another user process by
> this char driver; that process then fills in the page contents and
> resolves the fault.
>
>  
> +config UMEM
> +        tristate "/dev/umem user process backed memory support"

tab

> +	default n
> +	help
> +	  User process backed memory driver provides /dev/umem device.
> +	  The /dev/umem device is designed for some sort of distributed
> +	  shared memory. Especially post-copy live migration with KVM.
> +	  When in doubt, say "N".
> +

Need documentation of the protocol between the kernel and userspace; not
just the ioctls, but also how faults are propagated.
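
For illustration, the daemon side of the protocol as this patch defines it
would look roughly like the sketch below (untested; fetch_page() stands in
for however the daemon actually obtains page contents, e.g. from the
migration source, and 4K pages are assumed). The consumer mmap()s the umem
fd; its faults become requests that the daemon pulls with
UMEM_GET_PAGE_REQUEST, stages into the shmem file, and acknowledges with
UMEM_MARK_PAGE_CACHED:

  /* needs <poll.h>, <sys/ioctl.h>, <unistd.h>, <linux/umem.h> */
  static void serve_pages(int umem_fd, int shmem_fd)
  {
          __u64 pgoffs[32];
          char buf[4096];                         /* one page */
          struct umem_page_request req = { .pgoffs = pgoffs };
          struct umem_page_cached cached = { .pgoffs = pgoffs };
          struct pollfd pfd = { .fd = umem_fd, .events = POLLIN };
          __u32 i;

          for (;;) {
                  poll(&pfd, 1, -1);              /* wait for faulting pages */
                  req.nr = 32;                    /* in: capacity, out: count */
                  if (ioctl(umem_fd, UMEM_GET_PAGE_REQUEST, &req) < 0)
                          break;
                  for (i = 0; i < req.nr; i++) {
                          fetch_page(buf, pgoffs[i]);  /* placeholder */
                          /* stage the page into the shmem file at the same offset */
                          pwrite(shmem_fd, buf, sizeof(buf),
                                 (off_t)pgoffs[i] * sizeof(buf));
                  }
                  cached.nr = req.nr;             /* wakes the faulting threads */
                  ioctl(umem_fd, UMEM_MARK_PAGE_CACHED, &cached);
          }
  }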

> +
> +struct umem_page_req_list {
> +	struct list_head list;
> +	pgoff_t pgoff;
> +};
> +
>
> +
> +
> +static int umem_mark_page_cached(struct umem *umem,
> +				 struct umem_page_cached *page_cached)
> +{
> +	int ret = 0;
> +#define PG_MAX	((__u32)32)
> +	__u64 pgoffs[PG_MAX];
> +	__u32 nr;
> +	unsigned long bit;
> +	bool wake_up_list = false;
> +
> +	nr = 0;
> +	while (nr < page_cached->nr) {
> +		__u32 todo = min(PG_MAX, (page_cached->nr - nr));
> +		int i;
> +
> +		if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
> +				   sizeof(*pgoffs) * todo)) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +		for (i = 0; i < todo; ++i) {
> +			if (pgoffs[i] >= umem->pgoff_end) {
> +				ret = -EINVAL;
> +				goto out;
> +			}
> +			set_bit(pgoffs[i], umem->cached);
> +		}
> +		nr += todo;
> +	}
> +

Probably need an smp_wmb() here.

> +	spin_lock(&umem->lock);
> +	bit = 0;
> +	for (;;) {
> +		bit = find_next_bit(umem->sync_wait_bitmap, umem->sync_req_max,
> +				    bit);
> +		if (bit >= umem->sync_req_max)
> +			break;
> +		if (test_bit(umem->sync_req[bit], umem->cached))
> +			wake_up(&umem->page_wait[bit]);

Why not do this test in the loop above?

> +		bit++;
> +	}
> +
> +	if (umem->req_list_nr > 0)
> +		wake_up_list = true;
> +	spin_unlock(&umem->lock);
> +
> +	if (wake_up_list)
> +		wake_up_all(&umem->req_list_wait);
> +
> +out:
> +	return ret;
> +}
> +
> +
> +
> +static void umem_put(struct umem *umem)
> +{
> +	int ret;
> +
> +	mutex_lock(&umem_list_mutex);
> +	ret = kref_put(&umem->kref, umem_free);
> +	if (ret == 0) {
> +		mutex_unlock(&umem_list_mutex);
> +	}

This looks wrong.

> +}
> +
> +
> +static int umem_create_umem(struct umem_create *create)
> +{
> +	int error = 0;
> +	struct umem *umem = NULL;
> +	struct vm_area_struct *vma;
> +	int shmem_fd;
> +	unsigned long bitmap_bytes;
> +	unsigned long sync_bitmap_bytes;
> +	int i;
> +
> +	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
> +	umem->name = create->name;
> +	kref_init(&umem->kref);
> +	INIT_LIST_HEAD(&umem->list);
> +
> +	mutex_lock(&umem_list_mutex);
> +	error = umem_add_list(umem);
> +	if (error) {
> +		goto out;
> +	}
> +
> +	umem->task = NULL;
> +	umem->mmapped = false;
> +	spin_lock_init(&umem->lock);
> +	umem->size = roundup(create->size, PAGE_SIZE);
> +	umem->pgoff_end = umem->size >> PAGE_SHIFT;
> +	init_waitqueue_head(&umem->req_wait);
> +
> +	vma = &umem->vma;
> +	vma->vm_start = 0;
> +	vma->vm_end = umem->size;
> +	/* this shmem file is used as a temporary buffer for pages,
> +	   so it's unlikely that many pages exist in this shmem file */
> +	vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
> +		VM_DONTEXPAND;
> +	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
> +	vma->vm_pgoff = 0;
> +	INIT_LIST_HEAD(&vma->anon_vma_chain);
> +
> +	shmem_fd = get_unused_fd();
> +	if (shmem_fd < 0) {
> +		error = shmem_fd;
> +		goto out;
> +	}
> +	error = shmem_zero_setup(vma);
> +	if (error < 0) {
> +		put_unused_fd(shmem_fd);
> +		goto out;
> +	}
> +	umem->shmem_filp = vma->vm_file;
> +	get_file(umem->shmem_filp);
> +	fd_install(shmem_fd, vma->vm_file);
> +	create->shmem_fd = shmem_fd;
> +
> +	create->umem_fd = anon_inode_getfd("umem",
> +					   &umem_fops, umem, O_RDWR);
> +	if (create->umem_fd < 0) {
> +		error = create->umem_fd;
> +		goto out;
> +	}
> +
> +	bitmap_bytes = umem_bitmap_bytes(umem);
> +	if (bitmap_bytes > PAGE_SIZE) {
> +		umem->cached = vzalloc(bitmap_bytes);
> +		umem->faulted = vzalloc(bitmap_bytes);
> +	} else {
> +		umem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
> +		umem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
> +	}
> +
> +	/* those constants are not exported.
> +	   They are just used for default value */
> +#define KVM_MAX_VCPUS	256
> +#define ASYNC_PF_PER_VCPU 64

Best to avoid defaults and require userspace choose.

> +
> +#define ASYNC_REQ_MAX	(ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
> +	if (create->async_req_max == 0)
> +		create->async_req_max = ASYNC_REQ_MAX;
> +	umem->async_req_max = create->async_req_max;
> +	umem->async_req_nr = 0;
> +	umem->async_req = kzalloc(
> +		sizeof(*umem->async_req) * umem->async_req_max,
> +		GFP_KERNEL);
> +
> +#define SYNC_REQ_MAX	(KVM_MAX_VCPUS)
> +	if (create->sync_req_max == 0)
> +		create->sync_req_max = SYNC_REQ_MAX;
> +	umem->sync_req_max = round_up(create->sync_req_max, BITS_PER_LONG);
> +	sync_bitmap_bytes = sizeof(unsigned long) *
> +		(umem->sync_req_max / BITS_PER_LONG);
> +	umem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> +	umem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> +	umem->page_wait = kzalloc(sizeof(*umem->page_wait) *
> +				  umem->sync_req_max, GFP_KERNEL);
> +	for (i = 0; i < umem->sync_req_max; ++i)
> +		init_waitqueue_head(&umem->page_wait[i]);
> +	umem->sync_req = kzalloc(sizeof(*umem->sync_req) *
> +				 umem->sync_req_max, GFP_KERNEL);
> +
> +	umem->req_list_nr = 0;
> +	INIT_LIST_HEAD(&umem->req_list);
> +	init_waitqueue_head(&umem->req_list_wait);
> +
> +	mutex_unlock(&umem_list_mutex);
> +	return 0;
> +
> + out:
> +	umem_free(&umem->kref);
> +	return error;
> +}
> +
> +
> +static int umem_reattach_umem(struct umem_create *create)
> +{
> +	struct umem *entry;
> +
> +	mutex_lock(&umem_list_mutex);
> +	list_for_each_entry(entry, &umem_list, list) {
> +		if (umem_name_eq(&entry->name, &create->name)) {
> +			kref_get(&entry->kref);
> +			mutex_unlock(&umem_list_mutex);
> +
> +			create->shmem_fd = get_unused_fd();
> +			if (create->shmem_fd < 0) {
> +				umem_put(entry);
> +				return create->shmem_fd;
> +			}
> +			create->umem_fd = anon_inode_getfd(
> +				"umem", &umem_fops, entry, O_RDWR);
> +			if (create->umem_fd < 0) {
> +				put_unused_fd(create->shmem_fd);
> +				umem_put(entry);
> +				return create->umem_fd;
> +			}
> +			get_file(entry->shmem_filp);
> +			fd_install(create->shmem_fd, entry->shmem_filp);
> +
> +			create->size = entry->size;
> +			create->sync_req_max = entry->sync_req_max;
> +			create->async_req_max = entry->async_req_max;
> +			return 0;
> +		}
> +	}
> +	mutex_unlock(&umem_list_mutex);
> +
> +	return -ENOENT;
> +}

Can you explain how reattach is used?

> +
> +static long umem_dev_ioctl(struct file *filp, unsigned int ioctl,
> +			   unsigned long arg)
> +{
> +	void __user *argp = (void __user *) arg;
> +	long ret;
> +	struct umem_create *create = NULL;
> +
> +
> +	switch (ioctl) {
> +	case UMEM_DEV_CREATE_UMEM:
> +		create = kmalloc(sizeof(*create), GFP_KERNEL);
> +		if (copy_from_user(create, argp, sizeof(*create))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +		ret = umem_create_umem(create);
> +		if (copy_to_user(argp, create, sizeof(*create))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +		break;

A simpler approach is that open("/dev/umem") returns an mmap()able fd.
You need to call an ioctl() to set the size, etc., but then you only
operate on that fd.

> +	case UMEM_DEV_LIST:
> +		ret = umem_list_umem(argp);
> +		break;
> +	case UMEM_DEV_REATTACH:
> +		create = kmalloc(sizeof(*create), GFP_KERNEL);
> +		if (copy_from_user(create, argp, sizeof(*create))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +		ret = umem_reattach_umem(create);
> +		if (copy_to_user(argp, create, sizeof(*create))) {
> +			ret = -EFAULT;
> +			break;
> +		}
> +		break;
> +	default:
> +		ret = -EINVAL;
> +		break;
> +	}
> +
> +	kfree(create);
> +	return ret;
> +}
> +
> +
> +#ifdef __KERNEL__
> +#include <linux/compiler.h>
> +#else
> +#define __user
> +#endif

I think a #include <linux/compiler.h> is sufficient, the export process
(see include/linux/Kbuild, add an entry there) takes care of __user.
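
For reference, assuming the header stays at include/linux/umem.h, the export
entry would be a single line in include/linux/Kbuild:

  header-y += umem.h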

> +
> +#define UMEM_ID_MAX	256
> +#define UMEM_NAME_MAX	256
> +
> +struct umem_name {
> +	char id[UMEM_ID_MAX];		/* non-zero terminated */
> +	char name[UMEM_NAME_MAX];	/* non-zero terminated */
> +};

IMO, it would be better to avoid names, and use opaque __u64 identifiers
assigned by userspace, or perhaps file descriptors generated by the
kernel.  With names come the complications of namespaces, etc.  One user
can DoS another by grabbing a name that it knows the other user wants to
use.

> +
> +struct umem_create {
> +	__u64 size;	/* in bytes */
> +	__s32 umem_fd;
> +	__s32 shmem_fd;
> +	__u32 async_req_max;
> +	__u32 sync_req_max;
> +	struct umem_name name;
> +};
> +
> +struct umem_page_request {
> +	__u64 __user *pgoffs;

Pointers change their size in 32-bit and 64-bit userspace, best to avoid
them.

> +	__u32 nr;
> +	__u32 padding;
> +};
> +
> +struct umem_page_cached {
> +	__u64 __user *pgoffs;
> +	__u32 nr;
> +	__u32 padding;
> +};
> +
> +#define UMEMIO	0x1E
> +
> +/* ioctl for umem_dev fd */
> +#define UMEM_DEV_CREATE_UMEM	_IOWR(UMEMIO, 0x0, struct umem_create)
> +#define UMEM_DEV_LIST		_IOWR(UMEMIO, 0x1, struct umem_list)

Why is _LIST needed?

> +#define UMEM_DEV_REATTACH	_IOWR(UMEMIO, 0x2, struct umem_create)
> +
> +/* ioctl for umem fd */
> +#define UMEM_GET_PAGE_REQUEST	_IOWR(UMEMIO, 0x10, struct umem_page_request)
> +#define UMEM_MARK_PAGE_CACHED	_IOW (UMEMIO, 0x11, struct umem_page_cached)

You could make the GET_PAGE_REQUEST / MARK_PAGE_CACHED protocol run over
file descriptors, instead of an ioctl.  It allows you to implement the
other side in either the kernel or userspace.  This is similar to how
kvm uses an eventfd for communication with vhost-net in the kernel, or
an implementation in userspace.
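
One way to picture that suggestion (a sketch only, not something this patch
implements): the umem fd itself could carry the requests as a stream of
__u64 page offsets, with the daemon answering over the same fd once the
pages are staged:

  __u64 pgoffs[32];
  ssize_t n;

  /* kernel -> daemon: offsets of pages that faulted */
  n = read(umem_fd, pgoffs, sizeof(pgoffs));

  /* ... fetch the pages and stage them into the shmem file ... */

  /* daemon -> kernel: offsets now present; wakes the faulting threads */
  write(umem_fd, pgoffs, n);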

> +#define UMEM_MAKE_VMA_ANONYMOUS	_IO  (UMEMIO, 0x12)
> +
> +#endif /* __LINUX_UMEM_H */
Isaku Yamahata Dec. 29, 2011, 12:22 p.m. UTC | #2
Thank you for the review.

On Thu, Dec 29, 2011 at 01:17:51PM +0200, Avi Kivity wrote:
> > +	default n
> > +	help
> > +	  User process backed memory driver provides /dev/umem device.
> > +	  The /dev/umem device is designed for some sort of distributed
> > +	  shared memory. Especially post-copy live migration with KVM.
> > +	  When in doubt, say "N".
> > +
> 
> Need documentation of the protocol between the kernel and userspace; not
> just the ioctls, but also how faults are propagated.

Will do.

> 
> > +
> > +struct umem_page_req_list {
> > +	struct list_head list;
> > +	pgoff_t pgoff;
> > +};
> > +
> >
> > +
> > +
> > +static int umem_mark_page_cached(struct umem *umem,
> > +				 struct umem_page_cached *page_cached)
> > +{
> > +	int ret = 0;
> > +#define PG_MAX	((__u32)32)
> > +	__u64 pgoffs[PG_MAX];
> > +	__u32 nr;
> > +	unsigned long bit;
> > +	bool wake_up_list = false;
> > +
> > +	nr = 0;
> > +	while (nr < page_cached->nr) {
> > +		__u32 todo = min(PG_MAX, (page_cached->nr - nr));
> > +		int i;
> > +
> > +		if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
> > +				   sizeof(*pgoffs) * todo)) {
> > +			ret = -EFAULT;
> > +			goto out;
> > +		}
> > +		for (i = 0; i < todo; ++i) {
> > +			if (pgoffs[i] >= umem->pgoff_end) {
> > +				ret = -EINVAL;
> > +				goto out;
> > +			}
> > +			set_bit(pgoffs[i], umem->cached);
> > +		}
> > +		nr += todo;
> > +	}
> > +
> 
> Probably need an smp_wmb() here.
> 
> > +	spin_lock(&umem->lock);
> > +	bit = 0;
> > +	for (;;) {
> > +		bit = find_next_bit(umem->sync_wait_bitmap, umem->sync_req_max,
> > +				    bit);
> > +		if (bit >= umem->sync_req_max)
> > +			break;
> > +		if (test_bit(umem->sync_req[bit], umem->cached))
> > +			wake_up(&umem->page_wait[bit]);
> 
> Why not do this test in the loop above?
> 
> > +		bit++;
> > +	}
> > +
> > +	if (umem->req_list_nr > 0)
> > +		wake_up_list = true;
> > +	spin_unlock(&umem->lock);
> > +
> > +	if (wake_up_list)
> > +		wake_up_all(&umem->req_list_wait);
> > +
> > +out:
> > +	return ret;
> > +}
> > +
> > +
> > +
> > +static void umem_put(struct umem *umem)
> > +{
> > +	int ret;
> > +
> > +	mutex_lock(&umem_list_mutex);
> > +	ret = kref_put(&umem->kref, umem_free);
> > +	if (ret == 0) {
> > +		mutex_unlock(&umem_list_mutex);
> > +	}
> 
> This looks wrong.
> 
> > +}
> > +
> > +
> > +static int umem_create_umem(struct umem_create *create)
> > +{
> > +	int error = 0;
> > +	struct umem *umem = NULL;
> > +	struct vm_area_struct *vma;
> > +	int shmem_fd;
> > +	unsigned long bitmap_bytes;
> > +	unsigned long sync_bitmap_bytes;
> > +	int i;
> > +
> > +	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
> > +	umem->name = create->name;
> > +	kref_init(&umem->kref);
> > +	INIT_LIST_HEAD(&umem->list);
> > +
> > +	mutex_lock(&umem_list_mutex);
> > +	error = umem_add_list(umem);
> > +	if (error) {
> > +		goto out;
> > +	}
> > +
> > +	umem->task = NULL;
> > +	umem->mmapped = false;
> > +	spin_lock_init(&umem->lock);
> > +	umem->size = roundup(create->size, PAGE_SIZE);
> > +	umem->pgoff_end = umem->size >> PAGE_SHIFT;
> > +	init_waitqueue_head(&umem->req_wait);
> > +
> > +	vma = &umem->vma;
> > +	vma->vm_start = 0;
> > +	vma->vm_end = umem->size;
> > +	/* this shmem file is used as a temporary buffer for pages,
> > +	   so it's unlikely that many pages exist in this shmem file */
> > +	vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
> > +		VM_DONTEXPAND;
> > +	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
> > +	vma->vm_pgoff = 0;
> > +	INIT_LIST_HEAD(&vma->anon_vma_chain);
> > +
> > +	shmem_fd = get_unused_fd();
> > +	if (shmem_fd < 0) {
> > +		error = shmem_fd;
> > +		goto out;
> > +	}
> > +	error = shmem_zero_setup(vma);
> > +	if (error < 0) {
> > +		put_unused_fd(shmem_fd);
> > +		goto out;
> > +	}
> > +	umem->shmem_filp = vma->vm_file;
> > +	get_file(umem->shmem_filp);
> > +	fd_install(shmem_fd, vma->vm_file);
> > +	create->shmem_fd = shmem_fd;
> > +
> > +	create->umem_fd = anon_inode_getfd("umem",
> > +					   &umem_fops, umem, O_RDWR);
> > +	if (create->umem_fd < 0) {
> > +		error = create->umem_fd;
> > +		goto out;
> > +	}
> > +
> > +	bitmap_bytes = umem_bitmap_bytes(umem);
> > +	if (bitmap_bytes > PAGE_SIZE) {
> > +		umem->cached = vzalloc(bitmap_bytes);
> > +		umem->faulted = vzalloc(bitmap_bytes);
> > +	} else {
> > +		umem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
> > +		umem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
> > +	}
> > +
> > +	/* those constants are not exported.
> > +	   They are just used for default value */
> > +#define KVM_MAX_VCPUS	256
> > +#define ASYNC_PF_PER_VCPU 64
> 
> Best to avoid defaults and require userspace choose.

Okay.


> > +
> > +#define ASYNC_REQ_MAX	(ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
> > +	if (create->async_req_max == 0)
> > +		create->async_req_max = ASYNC_REQ_MAX;
> > +	umem->async_req_max = create->async_req_max;
> > +	umem->async_req_nr = 0;
> > +	umem->async_req = kzalloc(
> > +		sizeof(*umem->async_req) * umem->async_req_max,
> > +		GFP_KERNEL);
> > +
> > +#define SYNC_REQ_MAX	(KVM_MAX_VCPUS)
> > +	if (create->sync_req_max == 0)
> > +		create->sync_req_max = SYNC_REQ_MAX;
> > +	umem->sync_req_max = round_up(create->sync_req_max, BITS_PER_LONG);
> > +	sync_bitmap_bytes = sizeof(unsigned long) *
> > +		(umem->sync_req_max / BITS_PER_LONG);
> > +	umem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> > +	umem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> > +	umem->page_wait = kzalloc(sizeof(*umem->page_wait) *
> > +				  umem->sync_req_max, GFP_KERNEL);
> > +	for (i = 0; i < umem->sync_req_max; ++i)
> > +		init_waitqueue_head(&umem->page_wait[i]);
> > +	umem->sync_req = kzalloc(sizeof(*umem->sync_req) *
> > +				 umem->sync_req_max, GFP_KERNEL);
> > +
> > +	umem->req_list_nr = 0;
> > +	INIT_LIST_HEAD(&umem->req_list);
> > +	init_waitqueue_head(&umem->req_list_wait);
> > +
> > +	mutex_unlock(&umem_list_mutex);
> > +	return 0;
> > +
> > + out:
> > +	umem_free(&umem->kref);
> > +	return error;
> > +}
> > +
> > +
> > +static int umem_reattach_umem(struct umem_create *create)
> > +{
> > +	struct umem *entry;
> > +
> > +	mutex_lock(&umem_list_mutex);
> > +	list_for_each_entry(entry, &umem_list, list) {
> > +		if (umem_name_eq(&entry->name, &create->name)) {
> > +			kref_get(&entry->kref);
> > +			mutex_unlock(&umem_list_mutex);
> > +
> > +			create->shmem_fd = get_unused_fd();
> > +			if (create->shmem_fd < 0) {
> > +				umem_put(entry);
> > +				return create->shmem_fd;
> > +			}
> > +			create->umem_fd = anon_inode_getfd(
> > +				"umem", &umem_fops, entry, O_RDWR);
> > +			if (create->umem_fd < 0) {
> > +				put_unused_fd(create->shmem_fd);
> > +				umem_put(entry);
> > +				return create->umem_fd;
> > +			}
> > +			get_file(entry->shmem_filp);
> > +			fd_install(create->shmem_fd, entry->shmem_filp);
> > +
> > +			create->size = entry->size;
> > +			create->sync_req_max = entry->sync_req_max;
> > +			create->async_req_max = entry->async_req_max;
> > +			return 0;
> > +		}
> > +	}
> > +	mutex_unlock(&umem_list_mutex);
> > +
> > +	return -ENOENT;
> > +}
> 
> Can you explain how reattach is used?
> 
> > +
> > +static long umem_dev_ioctl(struct file *filp, unsigned int ioctl,
> > +			   unsigned long arg)
> > +{
> > +	void __user *argp = (void __user *) arg;
> > +	long ret;
> > +	struct umem_create *create = NULL;
> > +
> > +
> > +	switch (ioctl) {
> > +	case UMEM_DEV_CREATE_UMEM:
> > +		create = kmalloc(sizeof(*create), GFP_KERNEL);
> > +		if (copy_from_user(create, argp, sizeof(*create))) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +		ret = umem_create_umem(create);
> > +		if (copy_to_user(argp, create, sizeof(*create))) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +		break;
> 
> A simpler approach is that open("/dev/umem") returns an mmap()able fd.
> You need to call an ioctl() to set the size, etc., but then you only
> operate on that fd.

So you are suggesting that /dev/umem and /dev/umemctl should be introduced
and the functionality split between them.


> > +	case UMEM_DEV_LIST:
> > +		ret = umem_list_umem(argp);
> > +		break;
> > +	case UMEM_DEV_REATTACH:
> > +		create = kmalloc(sizeof(*create), GFP_KERNEL);
> > +		if (copy_from_user(create, argp, sizeof(*create))) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +		ret = umem_reattach_umem(create);
> > +		if (copy_to_user(argp, create, sizeof(*create))) {
> > +			ret = -EFAULT;
> > +			break;
> > +		}
> > +		break;
> > +	default:
> > +		ret = -EINVAL;
> > +		break;
> > +	}
> > +
> > +	kfree(create);
> > +	return ret;
> > +}
> > +
> > +
> > +#ifdef __KERNEL__
> > +#include <linux/compiler.h>
> > +#else
> > +#define __user
> > +#endif
> 
> I think a #include <linux/compiler.h> is sufficient, the export process
> (see include/linux/Kbuild, add an entry there) takes care of __user.
> 
> > +
> > +#define UMEM_ID_MAX	256
> > +#define UMEM_NAME_MAX	256
> > +
> > +struct umem_name {
> > +	char id[UMEM_ID_MAX];		/* non-zero terminated */
> > +	char name[UMEM_NAME_MAX];	/* non-zero terminated */
> > +};
> 
> IMO, it would be better to avoid names, and use opaque __u64 identifiers
> assigned by userspace, or perhaps file descriptors generated by the
> kernel.  With names come the complications of namespaces, etc.  One user
> can DoS another by grabbing a name that it knows the other user wants to
> use.

So how about the kernel assigning identifiers which are system global?


> > +
> > +struct umem_create {
> > +	__u64 size;	/* in bytes */
> > +	__s32 umem_fd;
> > +	__s32 shmem_fd;
> > +	__u32 async_req_max;
> > +	__u32 sync_req_max;
> > +	struct umem_name name;
> > +};
> > +
> > +struct umem_page_request {
> > +	__u64 __user *pgoffs;
> 
> Pointers change their size in 32-bit and 64-bit userspace, best to avoid
> them.

Ah yes, right. How about the following?
struct {
       __u32 nr;
       __u32 padding;
       __u64 pgoffs[0];
}

> > +	__u32 nr;
> > +	__u32 padding;
> > +};
> > +
> > +struct umem_page_cached {
> > +	__u64 __user *pgoffs;
> > +	__u32 nr;
> > +	__u32 padding;
> > +};
> > +
> > +#define UMEMIO	0x1E
> > +
> > +/* ioctl for umem_dev fd */
> > +#define UMEM_DEV_CREATE_UMEM	_IOWR(UMEMIO, 0x0, struct umem_create)
> > +#define UMEM_DEV_LIST		_IOWR(UMEMIO, 0x1, struct umem_list)
> 
> Why is _LIST needed?
> 
> > +#define UMEM_DEV_REATTACH	_IOWR(UMEMIO, 0x2, struct umem_create)
> > +
> > +/* ioctl for umem fd */
> > +#define UMEM_GET_PAGE_REQUEST	_IOWR(UMEMIO, 0x10, struct umem_page_request)
> > +#define UMEM_MARK_PAGE_CACHED	_IOW (UMEMIO, 0x11, struct umem_page_cached)
> 
> You could make the GET_PAGE_REQUEST / MARK_PAGE_CACHED protocol run over
> file descriptors, instead of an ioctl.  It allows you to implement the
> other side in either the kernel or userspace.  This is similar to how
> kvm uses an eventfd for communication with vhost-net in the kernel, or
> an implementation in userspace.

Do you mean that read/write on file descriptors is better than ioctl?
Okay, it would be easy to convert ioctl into read/write.


> > +#define UMEM_MAKE_VMA_ANONYMOUS	_IO  (UMEMIO, 0x12)
> > +
> > +#endif /* __LINUX_UMEM_H */
> 
> 
> -- 
> error compiling committee.c: too many arguments to function
>
Avi Kivity Dec. 29, 2011, 12:47 p.m. UTC | #3
On 12/29/2011 02:22 PM, Isaku Yamahata wrote:
> > 
> > A simpler approach is that open("/dev/umem") returns an mmap()able fd.
> > You need to call an ioctl() to set the size, etc., but then you only
> > operate on that fd.
>
> So you are suggesting that /dev/umem and /dev/umemctl should be introduced
> and the functionality split between them.

No; perhaps I'm missing some functionality, but I'm suggesting

  fd = open("/dev/umem");
  ftruncate(fd, size);
  struct umem_config config =  { ... };
  ioctl(fd, UMEM_CONFIG, &config);
  mmap(..., fd, size);

> > 
> > IMO, it would be better to avoid names, and use opaque __u64 identifiers
> > assigned by userspace, or perhaps file descriptors generated by the
> > kernel.  With names come the complications of namespaces, etc.  One user
> > can DoS another by grabbing a name that it knows the other user wants to
> > use.
>
> So how about the kernel assigning identifiers which are system global?

Depends on what you do with the identifiers.  Something like reattach
needs security; you can't just reattach to any random umem segment.

It's really best to stick with file descriptors, which already have a
security model.

>
>
> > > +
> > > +struct umem_create {
> > > +	__u64 size;	/* in bytes */
> > > +	__s32 umem_fd;
> > > +	__s32 shmem_fd;
> > > +	__u32 async_req_max;
> > > +	__u32 sync_req_max;
> > > +	struct umem_name name;
> > > +};
> > > +
> > > +struct umem_page_request {
> > > +	__u64 __user *pgoffs;
> > 
> > Pointers change their size in 32-bit and 64-bit userspace, best to avoid
> > them.
>
> Ah yes, right. How about the following?
> struct {
>        __u32 nr;
>        __u32 padding;
>        __u64 pgoffs[0];
> }

Sure.

If we use a pipe to transport requests, you can just send them as a
sequence of __u64 addresses.

> > > +
> > > +/* ioctl for umem fd */
> > > +#define UMEM_GET_PAGE_REQUEST	_IOWR(UMEMIO, 0x10, struct umem_page_request)
> > > +#define UMEM_MARK_PAGE_CACHED	_IOW (UMEMIO, 0x11, struct umem_page_cached)
> > 
> > You could make the GET_PAGE_REQUEST / MARK_PAGE_CACHED protocol run over
> > file descriptors, instead of an ioctl.  It allows you to implement the
> > other side in either the kernel or userspace.  This is similar to how
> > kvm uses an eventfd for communication with vhost-net in the kernel, or
> > an implementation in userspace.
>
> Do you mean that read/write on file descriptors is better than ioctl?
> Okay, it would be easy to convert ioctl into read/write.

Yes, they already provide synchronization.  And if you want to implement
a umem provider over RDMA in the kernel, then it's easy to add it; it's
not trivial for the kernel to issue ioctls but reads/writes are easy.

It's also easy to pass file descriptors among processes.
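
For instance, a umem fd could be handed to another process over a unix domain
socket with SCM_RIGHTS; a minimal sketch (needs <sys/socket.h> and <string.h>,
error handling omitted):

  static int send_fd(int sock, int fd)
  {
          char dummy = 0;
          struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
          union {
                  struct cmsghdr align;
                  char buf[CMSG_SPACE(sizeof(int))];
          } u;
          struct msghdr msg = {
                  .msg_iov = &iov, .msg_iovlen = 1,
                  .msg_control = u.buf, .msg_controllen = sizeof(u.buf),
          };
          struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

          cmsg->cmsg_level = SOL_SOCKET;
          cmsg->cmsg_type = SCM_RIGHTS;
          cmsg->cmsg_len = CMSG_LEN(sizeof(int));
          memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
          return sendmsg(sock, &msg, 0);  /* receiver: recvmsg() + CMSG_DATA() */
  }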

How do FUSE/CUSE pass requests?

Patch

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 4364303..001e3e4 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -15,6 +15,15 @@  config DEVKMEM
 	  kind of kernel debugging operations.
 	  When in doubt, say "N".
 
+config UMEM
+        tristate "/dev/umem user process backed memory support"
+	default n
+	help
+	  User process backed memory driver provides /dev/umem device.
+	  The /dev/umem device is designed for some sort of distributed
+	  shared memory. Especially post-copy live migration with KVM.
+	  When in doubt, say "N".
+
 config STALDRV
 	bool "Stallion multiport serial support"
 	depends on SERIAL_NONSTANDARD
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 32762ba..1eb14dc 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -3,6 +3,7 @@ 
 #
 
 obj-y				+= mem.o random.o
+obj-$(CONFIG_UMEM)		+= umem.o
 obj-$(CONFIG_TTY_PRINTK)	+= ttyprintk.o
 obj-y				+= misc.o
 obj-$(CONFIG_ATARI_DSP56K)	+= dsp56k.o
diff --git a/drivers/char/umem.c b/drivers/char/umem.c
new file mode 100644
index 0000000..df669fb
--- /dev/null
+++ b/drivers/char/umem.c
@@ -0,0 +1,898 @@ 
+/*
+ * UMEM: user process backed memory.
+ *
+ * Copyright (c) 2011,
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/memcontrol.h>
+#include <linux/poll.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+#include <linux/miscdevice.h>
+#include <linux/umem.h>
+
+struct umem_page_req_list {
+	struct list_head list;
+	pgoff_t pgoff;
+};
+
+struct umem {
+	loff_t size;
+	pgoff_t pgoff_end;
+	spinlock_t lock;
+
+	wait_queue_head_t req_wait;
+
+	int async_req_max;
+	int async_req_nr;
+	pgoff_t *async_req;
+
+	int sync_req_max;
+	unsigned long *sync_req_bitmap;
+	unsigned long *sync_wait_bitmap;
+	pgoff_t *sync_req;
+	wait_queue_head_t *page_wait;
+
+	int req_list_nr;
+	struct list_head req_list;
+	wait_queue_head_t req_list_wait;
+
+	unsigned long *cached;
+	unsigned long *faulted;
+
+	bool mmapped;
+	unsigned long vm_start;
+	unsigned int vma_nr;
+	struct task_struct *task;
+
+	struct file *shmem_filp;
+	struct vm_area_struct vma;
+
+	struct kref kref;
+	struct list_head list;
+	struct umem_name name;
+};
+
+
+static LIST_HEAD(umem_list);
+DEFINE_MUTEX(umem_list_mutex);
+
+static bool umem_name_eq(const struct umem_name *lhs,
+			  const struct umem_name *rhs)
+{
+	return memcmp(lhs->id, rhs->id, sizeof(lhs->id)) == 0 &&
+		memcmp(lhs->name, rhs->name, sizeof(lhs->name)) == 0;
+}
+
+static int umem_add_list(struct umem *umem)
+{
+	struct umem *entry;
+	BUG_ON(!mutex_is_locked(&umem_list_mutex));
+	list_for_each_entry(entry, &umem_list, list) {
+		if (umem_name_eq(&entry->name, &umem->name)) {
+			mutex_unlock(&umem_list_mutex);
+			return -EBUSY;
+		}
+	}
+
+	list_add(&umem->list, &umem_list);
+	return 0;
+}
+
+static void umem_release_fake_vmf(int ret, struct vm_fault *fake_vmf)
+{
+	if (ret & VM_FAULT_LOCKED) {
+		unlock_page(fake_vmf->page);
+	}
+	page_cache_release(fake_vmf->page);
+}
+
+static int umem_minor_fault(struct umem *umem,
+			    struct vm_area_struct *vma,
+			    struct vm_fault *vmf)
+{
+	struct vm_fault fake_vmf;
+	int ret;
+	struct page *page;
+
+	BUG_ON(!test_bit(vmf->pgoff, umem->cached));
+	fake_vmf = *vmf;
+	fake_vmf.page = NULL;
+	ret = umem->vma.vm_ops->fault(&umem->vma, &fake_vmf);
+	if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))
+		return ret;
+
+	/*
+	 * TODO: pull out fake_vmf->page from shmem file and donate it
+	 * to this vma resolving the page fault.
+	 * vmf->page = fake_vmf->page;
+	 */
+
+	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+	if (!page)
+		return VM_FAULT_OOM;
+	if (mem_cgroup_cache_charge(page, vma->vm_mm, GFP_KERNEL)) {
+		umem_release_fake_vmf(ret, &fake_vmf);
+		page_cache_release(page);
+		return VM_FAULT_OOM;
+	}
+
+	copy_highpage(page, fake_vmf.page);
+	umem_release_fake_vmf(ret, &fake_vmf);
+
+	ret |= VM_FAULT_LOCKED;
+	SetPageUptodate(page);
+	vmf->page = page;
+	set_bit(vmf->pgoff, umem->faulted);
+
+	return ret;
+}
+
+static int umem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *filp = vma->vm_file;
+	struct umem *umem = filp->private_data;
+
+	if (vmf->pgoff >= umem->pgoff_end) {
+		return VM_FAULT_SIGBUS;
+	}
+
+	BUG_ON(test_bit(vmf->pgoff, umem->faulted));
+
+	if (!test_bit(vmf->pgoff, umem->cached)) {
+		/* major fault */
+		unsigned long bit;
+		DEFINE_WAIT(wait);
+
+		if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
+			/* async page fault */
+			spin_lock(&umem->lock);
+			if (umem->async_req_nr < umem->async_req_max) {
+				umem->async_req[umem->async_req_nr] =
+					vmf->pgoff;
+				umem->async_req_nr++;
+			}
+			spin_unlock(&umem->lock);
+			wake_up_poll(&umem->req_wait, POLLIN);
+
+			if (test_bit(vmf->pgoff, umem->cached))
+				return umem_minor_fault(umem, vma, vmf);
+			return VM_FAULT_MAJOR | VM_FAULT_RETRY;
+		}
+
+		spin_lock(&umem->lock);
+		bit = find_first_zero_bit(umem->sync_wait_bitmap,
+					  umem->sync_req_max);
+		if (likely(bit < umem->sync_req_max)) {
+			umem->sync_req[bit] = vmf->pgoff;
+			prepare_to_wait(&umem->page_wait[bit], &wait,
+					TASK_UNINTERRUPTIBLE);
+			set_bit(bit, umem->sync_req_bitmap);
+			set_bit(bit, umem->sync_wait_bitmap);
+			spin_unlock(&umem->lock);
+			wake_up_poll(&umem->req_wait, POLLIN);
+
+			if (!test_bit(vmf->pgoff, umem->cached))
+				schedule();
+			finish_wait(&umem->page_wait[bit], &wait);
+			clear_bit(bit, umem->sync_wait_bitmap);
+		} else {
+			struct umem_page_req_list page_req_list = {
+				.pgoff = vmf->pgoff,
+			};
+			umem->req_list_nr++;
+			list_add_tail(&page_req_list.list, &umem->req_list);
+			wake_up_poll(&umem->req_wait, POLLIN);
+			for (;;) {
+				prepare_to_wait(&umem->req_list_wait, &wait,
+						TASK_UNINTERRUPTIBLE);
+				if (test_bit(vmf->pgoff, umem->cached)) {
+					umem->req_list_nr--;
+					break;
+				}
+				spin_unlock(&umem->lock);
+				schedule();
+				spin_lock(&umem->lock);
+			}
+			spin_unlock(&umem->lock);
+			finish_wait(&umem->req_list_wait, &wait);
+		}
+
+		return umem_minor_fault(umem, vma, vmf) | VM_FAULT_MAJOR;
+	}
+
+	return umem_minor_fault(umem, vma, vmf);
+}
+
+/* for partial munmap */
+static void umem_vma_open(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct umem *umem = filp->private_data;
+
+	spin_lock(&umem->lock);
+	umem->vma_nr++;
+	spin_unlock(&umem->lock);
+}
+
+static void umem_vma_close(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct umem *umem = filp->private_data;
+	struct task_struct *task = NULL;
+
+	spin_lock(&umem->lock);
+	umem->vma_nr--;
+	if (umem->vma_nr == 0) {
+		task = umem->task;
+		umem->task = NULL;
+	}
+	spin_unlock(&umem->lock);
+
+	if (task)
+		put_task_struct(task);
+}
+
+static const struct vm_operations_struct umem_vm_ops = {
+	.open = umem_vma_open,
+	.close = umem_vma_close,
+	.fault = umem_fault,
+};
+
+static int umem_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct umem *umem = filp->private_data;
+	int error;
+
+	/* allow mmap() only once */
+	spin_lock(&umem->lock);
+	if (umem->mmapped) {
+		error = -EBUSY;
+		goto out;
+	}
+	if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff >
+	    umem->pgoff_end) {
+		error = -EINVAL;
+		goto out;
+	}
+
+	umem->mmapped = true;
+	umem->vma_nr = 1;
+	umem->vm_start = vma->vm_start;
+	get_task_struct(current);
+	umem->task = current;
+	spin_unlock(&umem->lock);
+
+	vma->vm_ops = &umem_vm_ops;
+	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+	vma->vm_flags &= ~VM_SHARED;
+	return 0;
+
+out:
+	spin_unlock(&umem->lock);
+	return error;
+}
+
+static bool umem_req_pending(struct umem* umem)
+{
+	return !list_empty(&umem->req_list) ||
+		!bitmap_empty(umem->sync_req_bitmap, umem->sync_req_max) ||
+		(umem->async_req_nr > 0);
+}
+
+static unsigned int umem_poll(struct file* filp, poll_table *wait)
+{
+	struct umem *umem = filp->private_data;
+	unsigned int events = 0;
+
+	poll_wait(filp, &umem->req_wait, wait);
+
+	spin_lock(&umem->lock);
+	if (umem_req_pending(umem))
+		events |= POLLIN;
+	spin_unlock(&umem->lock);
+
+	return events;
+}
+
+/*
+ * return value
+ * true: finished
+ * false: more request
+ */
+static bool umem_copy_page_request(struct umem *umem,
+				   pgoff_t *pgoffs, int req_max,
+				   int *req_nr)
+{
+	struct umem_page_req_list *req_list;
+	struct umem_page_req_list *tmp;
+
+	unsigned long bit;
+
+	*req_nr = 0;
+	list_for_each_entry_safe(req_list, tmp, &umem->req_list, list) {
+		list_del(&req_list->list);
+		pgoffs[*req_nr] = req_list->pgoff;
+		(*req_nr)++;
+		if (*req_nr >= req_max)
+			return false;
+	}
+
+	bit = 0;
+	for (;;) {
+		bit = find_next_bit(umem->sync_req_bitmap, umem->sync_req_max,
+				    bit);
+		if (bit >= umem->sync_req_max)
+			break;
+		pgoffs[*req_nr] = umem->sync_req[bit];
+		(*req_nr)++;
+		clear_bit(bit, umem->sync_req_bitmap);
+		if (*req_nr >= req_max)
+			return false;
+		bit++;
+	}
+
+	if (umem->async_req_nr > 0) {
+		int nr = min(req_max - *req_nr, umem->async_req_nr);
+		memcpy(pgoffs + *req_nr, umem->async_req,
+		       sizeof(*umem->async_req) * nr);
+		umem->async_req_nr -= nr;
+		*req_nr += nr;
+		memmove(umem->async_req, umem->sync_req + nr,
+			umem->async_req_nr * sizeof(*umem->async_req));
+
+	}
+	return umem->async_req_nr == 0;
+}
+
+static int umem_get_page_request(struct umem *umem,
+				 struct umem_page_request *page_req)
+{
+	DEFINE_WAIT(wait);
+#define REQ_MAX	((__u32)32)
+	pgoff_t pgoffs[REQ_MAX];
+	__u32 req_copied = 0;
+	int ret = 0;
+
+	spin_lock(&umem->lock);
+	for (;;) {
+		prepare_to_wait(&umem->req_wait, &wait, TASK_INTERRUPTIBLE);
+		if (umem_req_pending(umem)) {
+			break;
+		}
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+		spin_unlock(&umem->lock);
+		schedule();
+		spin_lock(&umem->lock);
+	}
+	finish_wait(&umem->req_wait, &wait);
+	if (ret)
+		goto out_unlock;
+
+	while (req_copied < page_req->nr) {
+		int req_max;
+		int req_nr;
+		bool finished;
+		req_max = min(page_req->nr - req_copied, REQ_MAX);
+		finished = umem_copy_page_request(umem, pgoffs, req_max,
+						  &req_nr);
+
+		spin_unlock(&umem->lock);
+
+		if (req_nr > 0) {
+			ret = 0;
+			if (copy_to_user(page_req->pgoffs + req_copied, pgoffs,
+					 sizeof(*pgoffs) * req_nr)) {
+				ret = -EFAULT;
+				goto out;
+			}
+		}
+		req_copied += req_nr;
+		if (finished)
+			goto out;
+
+		spin_lock(&umem->lock);
+	}
+
+out_unlock:
+	spin_unlock(&umem->lock);
+out:
+	page_req->nr = req_copied;
+	return ret;
+}
+
+static int umem_mark_page_cached(struct umem *umem,
+				 struct umem_page_cached *page_cached)
+{
+	int ret = 0;
+#define PG_MAX	((__u32)32)
+	__u64 pgoffs[PG_MAX];
+	__u32 nr;
+	unsigned long bit;
+	bool wake_up_list = false;
+
+	nr = 0;
+	while (nr < page_cached->nr) {
+		__u32 todo = min(PG_MAX, (page_cached->nr - nr));
+		int i;
+
+		if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
+				   sizeof(*pgoffs) * todo)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		for (i = 0; i < todo; ++i) {
+			if (pgoffs[i] >= umem->pgoff_end) {
+				ret = -EINVAL;
+				goto out;
+			}
+			set_bit(pgoffs[i], umem->cached);
+		}
+		nr += todo;
+	}
+
+	spin_lock(&umem->lock);
+	bit = 0;
+	for (;;) {
+		bit = find_next_bit(umem->sync_wait_bitmap, umem->sync_req_max,
+				    bit);
+		if (bit >= umem->sync_req_max)
+			break;
+		if (test_bit(umem->sync_req[bit], umem->cached))
+			wake_up(&umem->page_wait[bit]);
+		bit++;
+	}
+
+	if (umem->req_list_nr > 0)
+		wake_up_list = true;
+	spin_unlock(&umem->lock);
+
+	if (wake_up_list)
+		wake_up_all(&umem->req_list_wait);
+
+out:
+	return ret;
+}
+
+static int umem_make_vma_anonymous(struct umem *umem)
+{
+#if 1
+	return -ENOSYS;
+#else
+	unsigned long saddr;
+	unsigned long eaddr;
+	unsigned long addr;
+	unsigned long bit;
+	struct task_struct *task;
+	struct mm_struct *mm;
+
+	spin_lock(&umem->lock);
+	task = umem->task;
+	saddr = umem->vm_start;
+	eaddr = saddr + umem->size;
+	bit = find_first_zero_bit(umem->faulted, umem->pgoff_end);
+	if (bit < umem->pgoff_end) {
+		spin_unlock(&umem->lock);
+		return -EBUSY;
+	}
+	spin_unlock(&umem->lock);
+	if (task == NULL)
+		return 0;
+	mm = get_task_mm(task);
+	if (mm == NULL)
+		return 0;
+
+	addr = saddr;
+	down_write(&mm->mmap_sem);
+	while (addr < eaddr) {
+		struct vm_area_struct *vma;
+		vma = find_vma(mm, addr);
+		if (umem_is_umem_vma(umem, vma)) {
+			/* XXX incorrect. race/locking and more fix up */
+			struct file *filp = vma->vm_file;
+			vma->vm_ops->close(vma);
+			vma->vm_ops = NULL;
+			vma->vm_file = NULL;
+			/* vma->vm_flags */
+			fput(filp);
+		}
+		addr = vma->vm_end;
+	}
+	up_write(&mm->mmap_sem);
+
+	mmput(mm);
+	return 0;
+#endif
+}
+
+static long umem_ioctl(struct file *filp, unsigned int ioctl,
+			   unsigned long arg)
+{
+	struct umem *umem = filp->private_data;
+	void __user *argp = (void __user *) arg;
+	long ret = 0;
+
+	switch (ioctl) {
+	case UMEM_GET_PAGE_REQUEST: {
+		struct umem_page_request page_request;
+		ret = -EFAULT;
+		if (copy_from_user(&page_request, argp, sizeof(page_request)))
+			break;
+		ret = umem_get_page_request(umem, &page_request);
+		if (ret == 0 &&
+		    copy_to_user(argp +
+				 offsetof(struct umem_page_request, nr),
+				 &page_request.nr,
+				 sizeof(page_request.nr))) {
+			ret = -EFAULT;
+			break;
+		}
+		break;
+	}
+	case UMEM_MARK_PAGE_CACHED: {
+		struct umem_page_cached page_cached;
+		ret = -EFAULT;
+		if (copy_from_user(&page_cached, argp, sizeof(page_cached)))
+			break;
+		ret = umem_mark_page_cached(umem, &page_cached);
+		break;
+	}
+	case UMEM_MAKE_VMA_ANONYMOUS:
+		ret = umem_make_vma_anonymous(umem);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+static unsigned long umem_bitmap_bytes(const struct umem *umem)
+{
+	return round_up(umem->pgoff_end, BITS_PER_LONG) / 8;
+}
+
+
+static void umem_free(struct kref *kref)
+{
+	struct umem *umem = container_of(kref, struct umem, kref);
+
+	BUG_ON(!mutex_is_locked(&umem_list_mutex));
+	list_del(&umem->list);
+	mutex_unlock(&umem_list_mutex);
+
+	if (umem->task) {
+		put_task_struct(umem->task);
+		umem->task = NULL;
+	}
+
+	if (umem->shmem_filp)
+		fput(umem->shmem_filp);
+	if (umem_bitmap_bytes(umem) > PAGE_SIZE) {
+		vfree(umem->cached);
+		vfree(umem->faulted);
+	} else {
+		kfree(umem->cached);
+		kfree(umem->faulted);
+	}
+	kfree(umem->async_req);
+	kfree(umem->sync_req_bitmap);
+	kfree(umem->sync_wait_bitmap);
+	kfree(umem->page_wait);
+	kfree(umem->sync_req);
+	kfree(umem);
+}
+
+static void umem_put(struct umem *umem)
+{
+	int ret;
+
+	mutex_lock(&umem_list_mutex);
+	ret = kref_put(&umem->kref, umem_free);
+	if (ret == 0) {
+		mutex_unlock(&umem_list_mutex);
+	}
+}
+
+static int umem_release(struct inode *inode, struct file *filp)
+{
+	struct umem *umem = filp->private_data;
+	umem_put(umem);
+	return 0;
+}
+
+static struct file_operations umem_fops = {
+	.release	= umem_release,
+	.unlocked_ioctl = umem_ioctl,
+	.mmap		= umem_mmap,
+	.poll		= umem_poll,
+	.llseek		= noop_llseek,
+};
+
+static int umem_create_umem(struct umem_create *create)
+{
+	int error = 0;
+	struct umem *umem = NULL;
+	struct vm_area_struct *vma;
+	int shmem_fd;
+	unsigned long bitmap_bytes;
+	unsigned long sync_bitmap_bytes;
+	int i;
+
+	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+	umem->name = create->name;
+	kref_init(&umem->kref);
+	INIT_LIST_HEAD(&umem->list);
+
+	mutex_lock(&umem_list_mutex);
+	error = umem_add_list(umem);
+	if (error) {
+		goto out;
+	}
+
+	umem->task = NULL;
+	umem->mmapped = false;
+	spin_lock_init(&umem->lock);
+	umem->size = roundup(create->size, PAGE_SIZE);
+	umem->pgoff_end = umem->size >> PAGE_SHIFT;
+	init_waitqueue_head(&umem->req_wait);
+
+	vma = &umem->vma;
+	vma->vm_start = 0;
+	vma->vm_end = umem->size;
+	/* this shmem file is used as a temporary buffer for pages,
+	   so it's unlikely that many pages exist in this shmem file */
+	vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
+		VM_DONTEXPAND;
+	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+	vma->vm_pgoff = 0;
+	INIT_LIST_HEAD(&vma->anon_vma_chain);
+
+	shmem_fd = get_unused_fd();
+	if (shmem_fd < 0) {
+		error = shmem_fd;
+		goto out;
+	}
+	error = shmem_zero_setup(vma);
+	if (error < 0) {
+		put_unused_fd(shmem_fd);
+		goto out;
+	}
+	umem->shmem_filp = vma->vm_file;
+	get_file(umem->shmem_filp);
+	fd_install(shmem_fd, vma->vm_file);
+	create->shmem_fd = shmem_fd;
+
+	create->umem_fd = anon_inode_getfd("umem",
+					   &umem_fops, umem, O_RDWR);
+	if (create->umem_fd < 0) {
+		error = create->umem_fd;
+		goto out;
+	}
+
+	bitmap_bytes = umem_bitmap_bytes(umem);
+	if (bitmap_bytes > PAGE_SIZE) {
+		umem->cached = vzalloc(bitmap_bytes);
+		umem->faulted = vzalloc(bitmap_bytes);
+	} else {
+		umem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
+		umem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
+	}
+
+	/* those constants are not exported.
+	   They are just used for default value */
+#define KVM_MAX_VCPUS	256
+#define ASYNC_PF_PER_VCPU 64
+
+#define ASYNC_REQ_MAX	(ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
+	if (create->async_req_max == 0)
+		create->async_req_max = ASYNC_REQ_MAX;
+	umem->async_req_max = create->async_req_max;
+	umem->async_req_nr = 0;
+	umem->async_req = kzalloc(
+		sizeof(*umem->async_req) * umem->async_req_max,
+		GFP_KERNEL);
+
+#define SYNC_REQ_MAX	(KVM_MAX_VCPUS)
+	if (create->sync_req_max == 0)
+		create->sync_req_max = SYNC_REQ_MAX;
+	umem->sync_req_max = round_up(create->sync_req_max, BITS_PER_LONG);
+	sync_bitmap_bytes = sizeof(unsigned long) *
+		(umem->sync_req_max / BITS_PER_LONG);
+	umem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
+	umem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
+	umem->page_wait = kzalloc(sizeof(*umem->page_wait) *
+				  umem->sync_req_max, GFP_KERNEL);
+	for (i = 0; i < umem->sync_req_max; ++i)
+		init_waitqueue_head(&umem->page_wait[i]);
+	umem->sync_req = kzalloc(sizeof(*umem->sync_req) *
+				 umem->sync_req_max, GFP_KERNEL);
+
+	umem->req_list_nr = 0;
+	INIT_LIST_HEAD(&umem->req_list);
+	init_waitqueue_head(&umem->req_list_wait);
+
+	mutex_unlock(&umem_list_mutex);
+	return 0;
+
+ out:
+	umem_free(&umem->kref);
+	return error;
+}
+
+static int umem_list_umem(struct umem_list __user *u_list)
+{
+	struct umem_list k_list;
+	struct umem *entry;
+	struct umem_name __user *u_name = u_list->names;
+	__u32 nr = 0;
+
+	if (copy_from_user(&k_list, u_list, sizeof(k_list))) {
+		return -EFAULT;
+	}
+
+	mutex_lock(&umem_list_mutex);
+	list_for_each_entry(entry, &umem_list, list) {
+		if (nr < k_list.nr) {
+			if (copy_to_user(u_name, &entry->name,
+					 sizeof(entry->name))) {
+				mutex_unlock(&umem_list_mutex);
+				return -EFAULT;
+			}
+			u_name++;
+		}
+		nr++;
+	}
+	mutex_unlock(&umem_list_mutex);
+
+	k_list.nr = nr;
+	if (copy_to_user(u_list, &k_list, sizeof(k_list))) {
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int umem_reattach_umem(struct umem_create *create)
+{
+	struct umem *entry;
+
+	mutex_lock(&umem_list_mutex);
+	list_for_each_entry(entry, &umem_list, list) {
+		if (umem_name_eq(&entry->name, &create->name)) {
+			kref_get(&entry->kref);
+			mutex_unlock(&umem_list_mutex);
+
+			create->shmem_fd = get_unused_fd();
+			if (create->shmem_fd < 0) {
+				umem_put(entry);
+				return create->shmem_fd;
+			}
+			create->umem_fd = anon_inode_getfd(
+				"umem", &umem_fops, entry, O_RDWR);
+			if (create->umem_fd < 0) {
+				put_unused_fd(create->shmem_fd);
+				umem_put(entry);
+				return create->umem_fd;
+			}
+			get_file(entry->shmem_filp);
+			fd_install(create->shmem_fd, entry->shmem_filp);
+
+			create->size = entry->size;
+			create->sync_req_max = entry->sync_req_max;
+			create->async_req_max = entry->async_req_max;
+			return 0;
+		}
+	}
+	mutex_unlock(&umem_list_mutex);
+
+	return -ENOENT;
+}
+
+static long umem_dev_ioctl(struct file *filp, unsigned int ioctl,
+			   unsigned long arg)
+{
+	void __user *argp = (void __user *) arg;
+	long ret;
+	struct umem_create *create = NULL;
+
+
+	switch (ioctl) {
+	case UMEM_DEV_CREATE_UMEM:
+		create = kmalloc(sizeof(*create), GFP_KERNEL);
+		if (copy_from_user(create, argp, sizeof(*create))) {
+			ret = -EFAULT;
+			break;
+		}
+		ret = umem_create_umem(create);
+		if (copy_to_user(argp, create, sizeof(*create))) {
+			ret = -EFAULT;
+			break;
+		}
+		break;
+	case UMEM_DEV_LIST:
+		ret = umem_list_umem(argp);
+		break;
+	case UMEM_DEV_REATTACH:
+		create = kmalloc(sizeof(*create), GFP_KERNEL);
+		if (copy_from_user(create, argp, sizeof(*create))) {
+			ret = -EFAULT;
+			break;
+		}
+		ret = umem_reattach_umem(create);
+		if (copy_to_user(argp, create, sizeof(*create))) {
+			ret = -EFAULT;
+			break;
+		}
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	kfree(create);
+	return ret;
+}
+
+static int umem_dev_release(struct inode *inode, struct file *filp)
+{
+	return 0;
+}
+
+static struct file_operations umem_dev_fops = {
+	.release = umem_dev_release,
+	.unlocked_ioctl = umem_dev_ioctl,
+};
+
+static struct miscdevice umem_dev = {
+	MISC_DYNAMIC_MINOR,
+	"umem",
+	&umem_dev_fops,
+};
+
+static int __init umem_init(void)
+{
+	int r;
+	r = misc_register(&umem_dev);
+	if (r) {
+		printk(KERN_ERR "umem: misc device register failed\n");
+		return r;
+	}
+	return 0;
+}
+module_init(umem_init);
+
+static void __exit umem_exit(void)
+{
+	misc_deregister(&umem_dev);
+}
+module_exit(umem_exit);
+
+MODULE_DESCRIPTION("UMEM user process backed memory driver "
+		   "for distributed shared memory");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Isaku Yamahata");
diff --git a/include/linux/umem.h b/include/linux/umem.h
new file mode 100644
index 0000000..e1a8633
--- /dev/null
+++ b/include/linux/umem.h
@@ -0,0 +1,83 @@ 
+/*
+ * User process backed memory.
+ * This is mainly for KVM post copy.
+ *
+ * Copyright (c) 2011,
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __LINUX_UMEM_H
+#define __LINUX_UMEM_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#ifdef __KERNEL__
+#include <linux/compiler.h>
+#else
+#define __user
+#endif
+
+#define UMEM_ID_MAX	256
+#define UMEM_NAME_MAX	256
+
+struct umem_name {
+	char id[UMEM_ID_MAX];		/* non-zero terminated */
+	char name[UMEM_NAME_MAX];	/* non-zero terminated */
+};
+
+struct umem_list {
+	__u32 nr;
+	__u32 padding;
+	struct umem_name names[0];
+};
+
+struct umem_create {
+	__u64 size;	/* in bytes */
+	__s32 umem_fd;
+	__s32 shmem_fd;
+	__u32 async_req_max;
+	__u32 sync_req_max;
+	struct umem_name name;
+};
+
+struct umem_page_request {
+	__u64 __user *pgoffs;
+	__u32 nr;
+	__u32 padding;
+};
+
+struct umem_page_cached {
+	__u64 __user *pgoffs;
+	__u32 nr;
+	__u32 padding;
+};
+
+#define UMEMIO	0x1E
+
+/* ioctl for umem_dev fd */
+#define UMEM_DEV_CREATE_UMEM	_IOWR(UMEMIO, 0x0, struct umem_create)
+#define UMEM_DEV_LIST		_IOWR(UMEMIO, 0x1, struct umem_list)
+#define UMEM_DEV_REATTACH	_IOWR(UMEMIO, 0x2, struct umem_create)
+
+/* ioctl for umem fd */
+#define UMEM_GET_PAGE_REQUEST	_IOWR(UMEMIO, 0x10, struct umem_page_request)
+#define UMEM_MARK_PAGE_CACHED	_IOW (UMEMIO, 0x11, struct umem_page_cached)
+#define UMEM_MAKE_VMA_ANONYMOUS	_IO  (UMEMIO, 0x12)
+
+#endif /* __LINUX_UMEM_H */
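
For completeness, a minimal sketch of creating and mapping a region with the
interface above (untested; the size, the names and the MAP_PRIVATE choice are
only illustrative, error handling omitted):

  int dev_fd = open("/dev/umem", O_RDWR);
  struct umem_create create = {
          .size = 1ULL << 30,     /* 1GB; rounded up to page size by the driver */
          .async_req_max = 0,     /* 0 = use the driver defaults */
          .sync_req_max = 0,
  };
  void *ram;

  strncpy(create.name.id, "migrate-0", UMEM_ID_MAX);
  strncpy(create.name.name, "guest-ram", UMEM_NAME_MAX);
  ioctl(dev_fd, UMEM_DEV_CREATE_UMEM, &create);

  /* the consumer (e.g. qemu) maps the umem fd; faults on this mapping are
     what the daemon serves through create.shmem_fd */
  ram = mmap(NULL, create.size, PROT_READ | PROT_WRITE,
             MAP_PRIVATE, create.umem_fd, 0);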