diff mbox

[PATCHv2] rdma: add a new IB_ACCESS_GIFT flag

Message ID 20130324155153.GA8597@redhat.com
State New
Headers show

Commit Message

Michael S. Tsirkin March 24, 2013, 3:51 p.m. UTC
At the moment registering an MR breaks COW.  This breaks memory
overcommit for users such as KVM: we have a lot of COW pages, e.g.
instances of the zero page or pages shared using KSM.

If the application does not care that adapter sees stale data (for
example, it tracks writes reregisters and resends), it can use a new
IBV_ACCESS_GIFT flag to prevent registration from breaking COW.

The semantics are similar to that of SPLICE_F_GIFT thus the name.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---

Please review and consider for 3.10.

Changes from v1:
	rename APP_READONLY to _GIFT: similar to vmsplice's F_GIFT.

 drivers/infiniband/core/umem.c | 21 ++++++++++++---------
 include/rdma/ib_verbs.h        |  9 ++++++++-
 2 files changed, 20 insertions(+), 10 deletions(-)

Comments

Michael S. Tsirkin April 2, 2013, 3:51 p.m. UTC | #1
On Sun, Mar 24, 2013 at 05:51:53PM +0200, Michael S. Tsirkin wrote:
> At the moment registering an MR breaks COW.  This breaks memory
> overcommit for users such as KVM: we have a lot of COW pages, e.g.
> instances of the zero page or pages shared using KSM.
> 
> If the application does not care that adapter sees stale data (for
> example, it tracks writes reregisters and resends), it can use a new
> IBV_ACCESS_GIFT flag to prevent registration from breaking COW.
> 
> The semantics are similar to that of SPLICE_F_GIFT thus the name.
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

Roland, Michael is yet to test this but could you please
confirm whether this looks acceptable to you?

> ---
> 
> Please review and consider for 3.10.
> 
> Changes from v1:
> 	rename APP_READONLY to _GIFT: similar to vmsplice's F_GIFT.
> 
>  drivers/infiniband/core/umem.c | 21 ++++++++++++---------
>  include/rdma/ib_verbs.h        |  9 ++++++++-
>  2 files changed, 20 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index a841123..5dee86d 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -89,6 +89,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>  	int ret;
>  	int off;
>  	int i;
> +	bool gift, writable;
>  	DEFINE_DMA_ATTRS(attrs);
>  
>  	if (dmasync)
> @@ -96,6 +97,15 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>  
>  	if (!can_do_mlock())
>  		return ERR_PTR(-EPERM);
> +	/*
> +	 * We ask for writable memory if any access flags other than
> +	 * "remote read" or "gift" are set.  "Local write" and "remote write"
> +	 * obviously require write access.  "Remote atomic" can do
> +	 * things like fetch and add, which will modify memory, and
> +	 * "MW bind" can change permissions by binding a window.
> +	 */
> +	gift  = access & IB_ACCESS_GIFT;
> +	writable = access & ~(IB_ACCESS_REMOTE_READ | IB_ACCESS_GIFT);
>  
>  	umem = kmalloc(sizeof *umem, GFP_KERNEL);
>  	if (!umem)
> @@ -105,14 +115,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>  	umem->length    = size;
>  	umem->offset    = addr & ~PAGE_MASK;
>  	umem->page_size = PAGE_SIZE;
> -	/*
> -	 * We ask for writable memory if any access flags other than
> -	 * "remote read" are set.  "Local write" and "remote write"
> -	 * obviously require write access.  "Remote atomic" can do
> -	 * things like fetch and add, which will modify memory, and
> -	 * "MW bind" can change permissions by binding a window.
> -	 */
> -	umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);
> +	umem->writable  = writable;
>  
>  	/* We assume the memory is from hugetlb until proved otherwise */
>  	umem->hugetlb   = 1;
> @@ -152,7 +155,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>  		ret = get_user_pages(current, current->mm, cur_base,
>  				     min_t(unsigned long, npages,
>  					   PAGE_SIZE / sizeof (struct page *)),
> -				     1, !umem->writable, page_list, vma_list);
> +				     !gift, !umem->writable, page_list, vma_list);
>  
>  		if (ret < 0)
>  			goto out;
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 98cc4b2..2e6e13c 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -871,7 +871,14 @@ enum ib_access_flags {
>  	IB_ACCESS_REMOTE_READ	= (1<<2),
>  	IB_ACCESS_REMOTE_ATOMIC	= (1<<3),
>  	IB_ACCESS_MW_BIND	= (1<<4),
> -	IB_ZERO_BASED		= (1<<5)
> +	IB_ZERO_BASED		= (1<<5),
> +	/*
> +	 * IB_ACCESS_GIFT: This memory is a gift to the adapter.  If memory is
> +	 * modified after registration, the local version and data seen by the
> +	 * adapter through this region rkey may differ.
> +	 * Only legal with IB_ACCESS_REMOTE_READ or no permissions.
> +	 */
> +	IB_ACCESS_GIFT		= (1<<6)
>  };
>  
>  struct ib_phys_buf {
> -- 
> MST
Roland Dreier April 2, 2013, 4:57 p.m. UTC | #2
On Tue, Apr 2, 2013 at 8:51 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>> At the moment registering an MR breaks COW.  This breaks memory
>> overcommit for users such as KVM: we have a lot of COW pages, e.g.
>> instances of the zero page or pages shared using KSM.
>>
>> If the application does not care that adapter sees stale data (for
>> example, it tracks writes reregisters and resends), it can use a new
>> IBV_ACCESS_GIFT flag to prevent registration from breaking COW.
>>
>> The semantics are similar to that of SPLICE_F_GIFT thus the name.
>>
>> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>
> Roland, Michael is yet to test this but could you please
> confirm whether this looks acceptable to you?

The patch itself is reasonable I guess, given the needs of this particular app.

I'm not particularly happy with the name of the flag.  The analogy
with SPLICE_F_GIFT doesn't seem particularly strong and I'm not
convinced even the splice flag name is very understandable.  But in
the RDMA case there's not really any sense in which we're "gifting"
memory to the adapter -- we're just telling the library "please don't
trigger copy-on-write" and it doesn't seem particularly easy for users
to understand that from the flag name.

 - R.
Michael S. Tsirkin April 2, 2013, 5:05 p.m. UTC | #3
On Tue, Apr 02, 2013 at 09:57:38AM -0700, Roland Dreier wrote:
> On Tue, Apr 2, 2013 at 8:51 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> >> At the moment registering an MR breaks COW.  This breaks memory
> >> overcommit for users such as KVM: we have a lot of COW pages, e.g.
> >> instances of the zero page or pages shared using KSM.
> >>
> >> If the application does not care that adapter sees stale data (for
> >> example, it tracks writes reregisters and resends), it can use a new
> >> IBV_ACCESS_GIFT flag to prevent registration from breaking COW.
> >>
> >> The semantics are similar to that of SPLICE_F_GIFT thus the name.
> >>
> >> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> >
> > Roland, Michael is yet to test this but could you please
> > confirm whether this looks acceptable to you?
> 
> The patch itself is reasonable I guess, given the needs of this particular app.
> 
> I'm not particularly happy with the name of the flag.  The analogy
> with SPLICE_F_GIFT doesn't seem particularly strong and I'm not
> convinced even the splice flag name is very understandable.  But in
> the RDMA case there's not really any sense in which we're "gifting"
> memory to the adapter -- we're just telling the library "please don't
> trigger copy-on-write" and it doesn't seem particularly easy for users
> to understand that from the flag name.
> 
>  - R.

The point really is that any writes by application
won't be seen until re-registration, right?
OK, what's a better name?  IBV_ACCESS_NON_COHERENT?
Please tell me what is preferable and we'll go ahead with it.
mrhines@linux.vnet.ibm.com April 2, 2013, 10:17 p.m. UTC | #4
I'm getting around to it, Michael, I promise =).

Just came back from vacation.

I have to re-build the ib_ucm kernel module from the original SUSE 
kernel that I'm using along before I can test it......

The machines I'm using are slightly tied up with other things, so its 
taking me a little time to prepare to apply the patch and test the new 
kernel module...

- Michael

On 04/02/2013 01:05 PM, Michael S. Tsirkin wrote:
> On Tue, Apr 02, 2013 at 09:57:38AM -0700, Roland Dreier wrote:
>> On Tue, Apr 2, 2013 at 8:51 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
>>>> At the moment registering an MR breaks COW.  This breaks memory
>>>> overcommit for users such as KVM: we have a lot of COW pages, e.g.
>>>> instances of the zero page or pages shared using KSM.
>>>>
>>>> If the application does not care that adapter sees stale data (for
>>>> example, it tracks writes reregisters and resends), it can use a new
>>>> IBV_ACCESS_GIFT flag to prevent registration from breaking COW.
>>>>
>>>> The semantics are similar to that of SPLICE_F_GIFT thus the name.
>>>>
>>>> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>>> Roland, Michael is yet to test this but could you please
>>> confirm whether this looks acceptable to you?
>> The patch itself is reasonable I guess, given the needs of this particular app.
>>
>> I'm not particularly happy with the name of the flag.  The analogy
>> with SPLICE_F_GIFT doesn't seem particularly strong and I'm not
>> convinced even the splice flag name is very understandable.  But in
>> the RDMA case there's not really any sense in which we're "gifting"
>> memory to the adapter -- we're just telling the library "please don't
>> trigger copy-on-write" and it doesn't seem particularly easy for users
>> to understand that from the flag name.
>>
>>   - R.
> The point really is that any writes by application
> won't be seen until re-registration, right?
> OK, what's a better name?  IBV_ACCESS_NON_COHERENT?
> Please tell me what is preferable and we'll go ahead with it.
>
Michael S. Tsirkin April 3, 2013, 4:07 p.m. UTC | #5
On Tue, Apr 02, 2013 at 08:05:21PM +0300, Michael S. Tsirkin wrote:
> On Tue, Apr 02, 2013 at 09:57:38AM -0700, Roland Dreier wrote:
> > On Tue, Apr 2, 2013 at 8:51 AM, Michael S. Tsirkin <mst@redhat.com> wrote:
> > >> At the moment registering an MR breaks COW.  This breaks memory
> > >> overcommit for users such as KVM: we have a lot of COW pages, e.g.
> > >> instances of the zero page or pages shared using KSM.
> > >>
> > >> If the application does not care that adapter sees stale data (for
> > >> example, it tracks writes reregisters and resends), it can use a new
> > >> IBV_ACCESS_GIFT flag to prevent registration from breaking COW.
> > >>
> > >> The semantics are similar to that of SPLICE_F_GIFT thus the name.
> > >>
> > >> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > >
> > > Roland, Michael is yet to test this but could you please
> > > confirm whether this looks acceptable to you?
> > 
> > The patch itself is reasonable I guess, given the needs of this particular app.
> > 
> > I'm not particularly happy with the name of the flag.  The analogy
> > with SPLICE_F_GIFT doesn't seem particularly strong and I'm not
> > convinced even the splice flag name is very understandable.  But in
> > the RDMA case there's not really any sense in which we're "gifting"
> > memory to the adapter -- we're just telling the library "please don't
> > trigger copy-on-write" and it doesn't seem particularly easy for users
> > to understand that from the flag name.
> > 
> >  - R.
> 
> The point really is that any writes by application
> won't be seen until re-registration, right?
> OK, what's a better name?  IBV_ACCESS_NON_COHERENT?
> Please tell me what is preferable and we'll go ahead with it.

Um. ping? We are at -rc5 and things need to fall into place
if we are to have it in 3.10 ...

> -- 
> MST
mrhines@linux.vnet.ibm.com April 5, 2013, 8:17 p.m. UTC | #6
The userland part of the patch was missing (IBV_ACCESS_GIFT).

I added flag that to /usr/include in addition to this patch and did a 
test RDMA migrate and it seems to work without any problems.

I also removed the IBV_*_WRITE flags on the sender-side and activated 
cgroups with the "memory.memsw.limit_in_bytes" activated and the 
migration with RDMA also succeeded without any problems (both with *and* 
without GIFT also worked).

Any additional tests you would like?


- Michael

On 03/24/2013 11:51 AM, Michael S. Tsirkin wrote:
> At the moment registering an MR breaks COW.  This breaks memory
> overcommit for users such as KVM: we have a lot of COW pages, e.g.
> instances of the zero page or pages shared using KSM.
>
> If the application does not care that adapter sees stale data (for
> example, it tracks writes reregisters and resends), it can use a new
> IBV_ACCESS_GIFT flag to prevent registration from breaking COW.
>
> The semantics are similar to that of SPLICE_F_GIFT thus the name.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>
> Please review and consider for 3.10.
>
> Changes from v1:
> 	rename APP_READONLY to _GIFT: similar to vmsplice's F_GIFT.
>
>   drivers/infiniband/core/umem.c | 21 ++++++++++++---------
>   include/rdma/ib_verbs.h        |  9 ++++++++-
>   2 files changed, 20 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index a841123..5dee86d 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -89,6 +89,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>   	int ret;
>   	int off;
>   	int i;
> +	bool gift, writable;
>   	DEFINE_DMA_ATTRS(attrs);
>
>   	if (dmasync)
> @@ -96,6 +97,15 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>
>   	if (!can_do_mlock())
>   		return ERR_PTR(-EPERM);
> +	/*
> +	 * We ask for writable memory if any access flags other than
> +	 * "remote read" or "gift" are set.  "Local write" and "remote write"
> +	 * obviously require write access.  "Remote atomic" can do
> +	 * things like fetch and add, which will modify memory, and
> +	 * "MW bind" can change permissions by binding a window.
> +	 */
> +	gift  = access & IB_ACCESS_GIFT;
> +	writable = access & ~(IB_ACCESS_REMOTE_READ | IB_ACCESS_GIFT);
>
>   	umem = kmalloc(sizeof *umem, GFP_KERNEL);
>   	if (!umem)
> @@ -105,14 +115,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>   	umem->length    = size;
>   	umem->offset    = addr & ~PAGE_MASK;
>   	umem->page_size = PAGE_SIZE;
> -	/*
> -	 * We ask for writable memory if any access flags other than
> -	 * "remote read" are set.  "Local write" and "remote write"
> -	 * obviously require write access.  "Remote atomic" can do
> -	 * things like fetch and add, which will modify memory, and
> -	 * "MW bind" can change permissions by binding a window.
> -	 */
> -	umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);
> +	umem->writable  = writable;
>
>   	/* We assume the memory is from hugetlb until proved otherwise */
>   	umem->hugetlb   = 1;
> @@ -152,7 +155,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
>   		ret = get_user_pages(current, current->mm, cur_base,
>   				     min_t(unsigned long, npages,
>   					   PAGE_SIZE / sizeof (struct page *)),
> -				     1, !umem->writable, page_list, vma_list);
> +				     !gift, !umem->writable, page_list, vma_list);
>
>   		if (ret < 0)
>   			goto out;
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 98cc4b2..2e6e13c 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -871,7 +871,14 @@ enum ib_access_flags {
>   	IB_ACCESS_REMOTE_READ	= (1<<2),
>   	IB_ACCESS_REMOTE_ATOMIC	= (1<<3),
>   	IB_ACCESS_MW_BIND	= (1<<4),
> -	IB_ZERO_BASED		= (1<<5)
> +	IB_ZERO_BASED		= (1<<5),
> +	/*
> +	 * IB_ACCESS_GIFT: This memory is a gift to the adapter.  If memory is
> +	 * modified after registration, the local version and data seen by the
> +	 * adapter through this region rkey may differ.
> +	 * Only legal with IB_ACCESS_REMOTE_READ or no permissions.
> +	 */
> +	IB_ACCESS_GIFT		= (1<<6)
>   };
>
>   struct ib_phys_buf {
Roland Dreier April 5, 2013, 8:43 p.m. UTC | #7
On Fri, Apr 5, 2013 at 1:17 PM, Michael R. Hines
<mrhines@linux.vnet.ibm.com> wrote:
> I also removed the IBV_*_WRITE flags on the sender-side and activated
> cgroups with the "memory.memsw.limit_in_bytes" activated and the migration
> with RDMA also succeeded without any problems (both with *and* without GIFT
> also worked).

Not sure I'm interpreting this correctly.  Are you saying that things
worked without actually setting the GIFT flag?   In which case why are
we adding this flag?

 - R.
mrhines@linux.vnet.ibm.com April 5, 2013, 8:51 p.m. UTC | #8
Sorry, I was wrong. ignore the comments about cgroups. That's still 
broken. (i.e. trying to register RDMA memory while using a cgroup swap 
limit cause the process get killed).

But the GIFT flag patch works (my understanding is that GIFT flag allows 
the adapter to transmit stale memory information, it does not have 
anything to do with cgroups specifically).

Am I missing something? I was only testing the GIFT flag patch.

Note: I only turned it on - I did not verify the (non) consitency of the 
memory that was transmitted.

- Michael


On 04/05/2013 04:43 PM, Roland Dreier wrote:
> On Fri, Apr 5, 2013 at 1:17 PM, Michael R. Hines
> <mrhines@linux.vnet.ibm.com> wrote:
>> I also removed the IBV_*_WRITE flags on the sender-side and activated
>> cgroups with the "memory.memsw.limit_in_bytes" activated and the migration
>> with RDMA also succeeded without any problems (both with *and* without GIFT
>> also worked).
> Not sure I'm interpreting this correctly.  Are you saying that things
> worked without actually setting the GIFT flag?   In which case why are
> we adding this flag?
>
>   - R.
>
mrhines@linux.vnet.ibm.com April 5, 2013, 8:54 p.m. UTC | #9
To be more specific, here's what I did:

1. apply kernel module patch - re-insert module
1. QEMU does: ibv_reg_mr(........IBV_ACCESS_GIFT | IBV_ACCESS_REMOTE_READ)
2. Start the RDMA migration
3. Migration completes without any errors

This test does *not* work with a cgroup swap limit, however. The process 
gets killed. (Both with and without GIFT)

- Michael

On 04/05/2013 04:43 PM, Roland Dreier wrote:
> On Fri, Apr 5, 2013 at 1:17 PM, Michael R. Hines
> <mrhines@linux.vnet.ibm.com> wrote:
>> I also removed the IBV_*_WRITE flags on the sender-side and activated
>> cgroups with the "memory.memsw.limit_in_bytes" activated and the migration
>> with RDMA also succeeded without any problems (both with *and* without GIFT
>> also worked).
> Not sure I'm interpreting this correctly.  Are you saying that things
> worked without actually setting the GIFT flag?   In which case why are
> we adding this flag?
>
>   - R.
>
Roland Dreier April 5, 2013, 9:03 p.m. UTC | #10
On Fri, Apr 5, 2013 at 1:51 PM, Michael R. Hines
<mrhines@linux.vnet.ibm.com> wrote:
> Sorry, I was wrong. ignore the comments about cgroups. That's still broken.
> (i.e. trying to register RDMA memory while using a cgroup swap limit cause
> the process get killed).
>
> But the GIFT flag patch works (my understanding is that GIFT flag allows the
> adapter to transmit stale memory information, it does not have anything to
> do with cgroups specifically).

The point of the GIFT patch is to avoid triggering copy-on-write so
that memory doesn't blow up during migration.  If that doesn't work
then there's no point to the patch.

 - R.
mrhines@linux.vnet.ibm.com April 5, 2013, 9:32 p.m. UTC | #11
Well, I have the "is_dup_page()" commented out.......when RDMA is 
activated.....

Is there something else in QEMU that could be touching the page that I 
don't know about?

- Michael


On 04/05/2013 05:03 PM, Roland Dreier wrote:
> On Fri, Apr 5, 2013 at 1:51 PM, Michael R. Hines
> <mrhines@linux.vnet.ibm.com> wrote:
>> Sorry, I was wrong. ignore the comments about cgroups. That's still broken.
>> (i.e. trying to register RDMA memory while using a cgroup swap limit cause
>> the process get killed).
>>
>> But the GIFT flag patch works (my understanding is that GIFT flag allows the
>> adapter to transmit stale memory information, it does not have anything to
>> do with cgroups specifically).
> The point of the GIFT patch is to avoid triggering copy-on-write so
> that memory doesn't blow up during migration.  If that doesn't work
> then there's no point to the patch.
>
>   - R.
>
Michael S. Tsirkin April 9, 2013, 4:39 p.m. UTC | #12
On Fri, Apr 05, 2013 at 04:54:39PM -0400, Michael R. Hines wrote:
> To be more specific, here's what I did:
> 
> 1. apply kernel module patch - re-insert module
> 1. QEMU does: ibv_reg_mr(........IBV_ACCESS_GIFT | IBV_ACCESS_REMOTE_READ)
> 2. Start the RDMA migration
> 3. Migration completes without any errors
> 
> This test does *not* work with a cgroup swap limit, however. The
> process gets killed. (Both with and without GIFT)
> 
> - Michael

Try to attach a debugger and see where it is when it gets killed?

> On 04/05/2013 04:43 PM, Roland Dreier wrote:
> >On Fri, Apr 5, 2013 at 1:17 PM, Michael R. Hines
> ><mrhines@linux.vnet.ibm.com> wrote:
> >>I also removed the IBV_*_WRITE flags on the sender-side and activated
> >>cgroups with the "memory.memsw.limit_in_bytes" activated and the migration
> >>with RDMA also succeeded without any problems (both with *and* without GIFT
> >>also worked).
> >Not sure I'm interpreting this correctly.  Are you saying that things
> >worked without actually setting the GIFT flag?   In which case why are
> >we adding this flag?
> >
> >  - R.
> >
>
mrhines@linux.vnet.ibm.com April 9, 2013, 5:56 p.m. UTC | #13
On 04/09/2013 12:39 PM, Michael S. Tsirkin wrote:
> On Fri, Apr 05, 2013 at 04:54:39PM -0400, Michael R. Hines wrote:
>> To be more specific, here's what I did:
>>
>> 1. apply kernel module patch - re-insert module
>> 1. QEMU does: ibv_reg_mr(........IBV_ACCESS_GIFT | IBV_ACCESS_REMOTE_READ)
>> 2. Start the RDMA migration
>> 3. Migration completes without any errors
>>
>> This test does *not* work with a cgroup swap limit, however. The
>> process gets killed. (Both with and without GIFT)
>>
>> - Michael
> Try to attach a debugger and see where it is when it gets killed?
>

It's killed by cgroups - not a CPU exception.

The same test works fine using TCP migration with cgroups - everything 
is fine there.

The memory that RDMA attempted to register hits some kind of cgroups policy
which results in a kernel message saying that the cgroup swap limit was hit
and then it goes ahead and kills the process altogether.

It's not a QEMU problem - it seems to be a kernel bug.
Michael S. Tsirkin April 9, 2013, 7 p.m. UTC | #14
On Tue, Apr 09, 2013 at 01:56:00PM -0400, Michael R. Hines wrote:
> On 04/09/2013 12:39 PM, Michael S. Tsirkin wrote:
> >On Fri, Apr 05, 2013 at 04:54:39PM -0400, Michael R. Hines wrote:
> >>To be more specific, here's what I did:
> >>
> >>1. apply kernel module patch - re-insert module
> >>1. QEMU does: ibv_reg_mr(........IBV_ACCESS_GIFT | IBV_ACCESS_REMOTE_READ)
> >>2. Start the RDMA migration
> >>3. Migration completes without any errors
> >>
> >>This test does *not* work with a cgroup swap limit, however. The
> >>process gets killed. (Both with and without GIFT)
> >>
> >>- Michael
> >Try to attach a debugger and see where it is when it gets killed?
> >
> 
> It's killed by cgroups - not a CPU exception.
> 
> The same test works fine using TCP migration with cgroups -
> everything is fine there.
> 
> The memory that RDMA attempted to register hits some kind of cgroups policy
> which results in a kernel message saying that the cgroup swap limit was hit
> and then it goes ahead and kills the process altogether.
> 
> It's not a QEMU problem - it seems to be a kernel bug.

Maybe cgroup swap limit really is buggy. That's interesting, but not
really related to this patch.  What's interesting is whether we save
lots memory by using this patch.
Couldn't you dump the pagemap for the qemu process and calculate real
memory usage before and after applying the patch?
Michael S. Tsirkin April 9, 2013, 7:03 p.m. UTC | #15
presumably is_dup_page reads the page, so should not break COW ...

I'm not sure about the cgroups swap limit - you might have
too many non COW pages so attempting to fault them all in
makes you exceed the limit. You really should look at
what is going on in the pagemap, to see if there's
measureable gain from the patch.


On Fri, Apr 05, 2013 at 05:32:30PM -0400, Michael R. Hines wrote:
> Well, I have the "is_dup_page()" commented out.......when RDMA is
> activated.....
> 
> Is there something else in QEMU that could be touching the page that
> I don't know about?
> 
> - Michael
> 
> 
> On 04/05/2013 05:03 PM, Roland Dreier wrote:
> >On Fri, Apr 5, 2013 at 1:51 PM, Michael R. Hines
> ><mrhines@linux.vnet.ibm.com> wrote:
> >>Sorry, I was wrong. ignore the comments about cgroups. That's still broken.
> >>(i.e. trying to register RDMA memory while using a cgroup swap limit cause
> >>the process get killed).
> >>
> >>But the GIFT flag patch works (my understanding is that GIFT flag allows the
> >>adapter to transmit stale memory information, it does not have anything to
> >>do with cgroups specifically).
> >The point of the GIFT patch is to avoid triggering copy-on-write so
> >that memory doesn't blow up during migration.  If that doesn't work
> >then there's no point to the patch.
> >
> >  - R.
> >
Michael S. Tsirkin April 9, 2013, 7:09 p.m. UTC | #16
On Fri, Apr 05, 2013 at 02:03:33PM -0700, Roland Dreier wrote:
> On Fri, Apr 5, 2013 at 1:51 PM, Michael R. Hines
> <mrhines@linux.vnet.ibm.com> wrote:
> > Sorry, I was wrong. ignore the comments about cgroups. That's still broken.
> > (i.e. trying to register RDMA memory while using a cgroup swap limit cause
> > the process get killed).
> >
> > But the GIFT flag patch works (my understanding is that GIFT flag allows the
> > adapter to transmit stale memory information, it does not have anything to
> > do with cgroups specifically).
> 
> The point of the GIFT patch is to avoid triggering copy-on-write so
> that memory doesn't blow up during migration.  If that doesn't work
> then there's no point to the patch.
> 
>  - R.

Absolutely. Checking whether an OOM gets triggered looks like a heavy
handed approach to testing the feature though.
It's relevant, but there could be many other reasons for it to trigger.
See Documentation/cgroups/memory.txt section "Troubleshooting".

It's easier to just check whether this patch reduces the memory consumption,
that's the point really.
Michael S. Tsirkin April 9, 2013, 7:12 p.m. UTC | #17
On Fri, Apr 05, 2013 at 01:43:49PM -0700, Roland Dreier wrote:
> On Fri, Apr 5, 2013 at 1:17 PM, Michael R. Hines
> <mrhines@linux.vnet.ibm.com> wrote:
> > I also removed the IBV_*_WRITE flags on the sender-side and activated
> > cgroups with the "memory.memsw.limit_in_bytes" activated and the migration
> > with RDMA also succeeded without any problems (both with *and* without GIFT
> > also worked).
> 
> Not sure I'm interpreting this correctly.  Are you saying that things
> worked without actually setting the GIFT flag?   In which case why are
> we adding this flag?
> 
>  - R.

We are adding the flag to reduce memory when there's lots of COW pages.
There's no guarantee there will be COW pages so I expect things to work
both with and without breaking COW, just using much more memory when we
break COW.
Michael S. Tsirkin April 9, 2013, 8:34 p.m. UTC | #18
On Fri, Apr 05, 2013 at 04:17:36PM -0400, Michael R. Hines wrote:
> The userland part of the patch was missing (IBV_ACCESS_GIFT).
> 
> I added flag that to /usr/include in addition to this patch and did
> a test RDMA migrate and it seems to work without any problems.
> 
> I also removed the IBV_*_WRITE flags on the sender-side and
> activated cgroups with the "memory.memsw.limit_in_bytes" activated
> and the migration with RDMA also succeeded without any problems
> (both with *and* without GIFT also worked).
> 
> Any additional tests you would like?
> 
> 
> - Michael

RDMA can't really work with swap so not sure how that's relevant.

Please check memory.usage_in_bytes - is it lower with
the GIFT flag?  I think this is what we really care about.
Michael S. Tsirkin April 9, 2013, 8:37 p.m. UTC | #19
On Tue, Apr 09, 2013 at 11:34:09PM +0300, Michael S. Tsirkin wrote:
> On Fri, Apr 05, 2013 at 04:17:36PM -0400, Michael R. Hines wrote:
> > The userland part of the patch was missing (IBV_ACCESS_GIFT).
> > 
> > I added flag that to /usr/include in addition to this patch and did
> > a test RDMA migrate and it seems to work without any problems.
> > 
> > I also removed the IBV_*_WRITE flags on the sender-side and
> > activated cgroups with the "memory.memsw.limit_in_bytes" activated
> > and the migration with RDMA also succeeded without any problems
> > (both with *and* without GIFT also worked).
> > 
> > Any additional tests you would like?
> > 
> > 
> > - Michael
> 
> RDMA can't really work with swap so not sure how that's relevant.
> 
> Please check memory.usage_in_bytes - is it lower with
> the GIFT flag?  I think this is what we really care about.

oh and no reason to set memsw.limit_in_bytes I think.

> -- 
> MST
mrhines@linux.vnet.ibm.com April 10, 2013, 1:26 a.m. UTC | #20
With respect, I'm going to offload testing this patch back to the author =)
because I'm trying to address all of Paolo's other minor issues
with the RDMA patch before we can merge.

Since dynamic page registration (as you requested) is now fully
implemented, this patch is less urgent since we now have a
mechanism in place to avoid page pinning on both sides of the migration.

- Michael

On 04/09/2013 03:03 PM, Michael S. Tsirkin wrote:
> presumably is_dup_page reads the page, so should not break COW ...
>
> I'm not sure about the cgroups swap limit - you might have
> too many non COW pages so attempting to fault them all in
> makes you exceed the limit. You really should look at
> what is going on in the pagemap, to see if there's
> measureable gain from the patch.
>
>
> On Fri, Apr 05, 2013 at 05:32:30PM -0400, Michael R. Hines wrote:
>> Well, I have the "is_dup_page()" commented out.......when RDMA is
>> activated.....
>>
>> Is there something else in QEMU that could be touching the page that
>> I don't know about?
>>
>> - Michael
>>
>>
>> On 04/05/2013 05:03 PM, Roland Dreier wrote:
>>> On Fri, Apr 5, 2013 at 1:51 PM, Michael R. Hines
>>> <mrhines@linux.vnet.ibm.com> wrote:
>>>> Sorry, I was wrong. ignore the comments about cgroups. That's still broken.
>>>> (i.e. trying to register RDMA memory while using a cgroup swap limit cause
>>>> the process get killed).
>>>>
>>>> But the GIFT flag patch works (my understanding is that GIFT flag allows the
>>>> adapter to transmit stale memory information, it does not have anything to
>>>> do with cgroups specifically).
>>> The point of the GIFT patch is to avoid triggering copy-on-write so
>>> that memory doesn't blow up during migration.  If that doesn't work
>>> then there's no point to the patch.
>>>
>>>   - R.
>>>
Michael S. Tsirkin April 10, 2013, 3:24 a.m. UTC | #21
On Tue, Apr 09, 2013 at 09:26:59PM -0400, Michael R. Hines wrote:
> With respect, I'm going to offload testing this patch back to the author =)
> because I'm trying to address all of Paolo's other minor issues
> with the RDMA patch before we can merge.

Fair enough, this likely means it won't happen anytime soon though.

> Since dynamic page registration (as you requested) is now fully
> implemented, this patch is less urgent since we now have a
> mechanism in place to avoid page pinning on both sides of the migration.
> 
> - Michael
> 

Which mechanism do you refer to? You patches still seem to pin
each page in guest memory at some point, which will break all
COW. In particular any pagemap tricks to detect duplicates
on source that I suggested won't work.

> On 04/09/2013 03:03 PM, Michael S. Tsirkin wrote:
> >presumably is_dup_page reads the page, so should not break COW ...
> >
> >I'm not sure about the cgroups swap limit - you might have
> >too many non COW pages so attempting to fault them all in
> >makes you exceed the limit. You really should look at
> >what is going on in the pagemap, to see if there's
> >measureable gain from the patch.
> >
> >
> >On Fri, Apr 05, 2013 at 05:32:30PM -0400, Michael R. Hines wrote:
> >>Well, I have the "is_dup_page()" commented out.......when RDMA is
> >>activated.....
> >>
> >>Is there something else in QEMU that could be touching the page that
> >>I don't know about?
> >>
> >>- Michael
> >>
> >>
> >>On 04/05/2013 05:03 PM, Roland Dreier wrote:
> >>>On Fri, Apr 5, 2013 at 1:51 PM, Michael R. Hines
> >>><mrhines@linux.vnet.ibm.com> wrote:
> >>>>Sorry, I was wrong. ignore the comments about cgroups. That's still broken.
> >>>>(i.e. trying to register RDMA memory while using a cgroup swap limit cause
> >>>>the process get killed).
> >>>>
> >>>>But the GIFT flag patch works (my understanding is that GIFT flag allows the
> >>>>adapter to transmit stale memory information, it does not have anything to
> >>>>do with cgroups specifically).
> >>>The point of the GIFT patch is to avoid triggering copy-on-write so
> >>>that memory doesn't blow up during migration.  If that doesn't work
> >>>then there's no point to the patch.
> >>>
> >>>  - R.
> >>>
mrhines@linux.vnet.ibm.com April 10, 2013, 4:32 a.m. UTC | #22
On 04/09/2013 11:24 PM, Michael S. Tsirkin wrote:
> Which mechanism do you refer to? You patches still seem to pin each 
> page in guest memory at some point, which will break all COW. In 
> particular any pagemap tricks to detect duplicates on source that I 
> suggested won't work. 

Sorry, I mispoke. I'm reffering to dynamic server page registration.

Of course it does not eliminate pinning - but it does mitigate the foot 
print of the VM as a feature that was requested.

I have implemented it and documented it.

- Michael

>> On 04/09/2013 03:03 PM, Michael S. Tsirkin wrote:
>>> presumably is_dup_page reads the page, so should not break COW ...
>>>
>>> I'm not sure about the cgroups swap limit - you might have
>>> too many non COW pages so attempting to fault them all in
>>> makes you exceed the limit. You really should look at
>>> what is going on in the pagemap, to see if there's
>>> measureable gain from the patch.
>>>
>>>
>>> On Fri, Apr 05, 2013 at 05:32:30PM -0400, Michael R. Hines wrote:
>>>> Well, I have the "is_dup_page()" commented out.......when RDMA is
>>>> activated.....
>>>>
>>>> Is there something else in QEMU that could be touching the page that
>>>> I don't know about?
>>>>
>>>> - Michael
>>>>
>>>>
>>>> On 04/05/2013 05:03 PM, Roland Dreier wrote:
>>>>> On Fri, Apr 5, 2013 at 1:51 PM, Michael R. Hines
>>>>> <mrhines@linux.vnet.ibm.com> wrote:
>>>>>> Sorry, I was wrong. ignore the comments about cgroups. That's still broken.
>>>>>> (i.e. trying to register RDMA memory while using a cgroup swap limit cause
>>>>>> the process get killed).
>>>>>>
>>>>>> But the GIFT flag patch works (my understanding is that GIFT flag allows the
>>>>>> adapter to transmit stale memory information, it does not have anything to
>>>>>> do with cgroups specifically).
>>>>> The point of the GIFT patch is to avoid triggering copy-on-write so
>>>>> that memory doesn't blow up during migration.  If that doesn't work
>>>>> then there's no point to the patch.
>>>>>
>>>>>   - R.
>>>>>
Michael S. Tsirkin April 10, 2013, 5:32 a.m. UTC | #23
On Wed, Apr 10, 2013 at 12:32:31AM -0400, Michael R. Hines wrote:
> On 04/09/2013 11:24 PM, Michael S. Tsirkin wrote:
> >Which mechanism do you refer to? You patches still seem to pin
> >each page in guest memory at some point, which will break all COW.
> >In particular any pagemap tricks to detect duplicates on source
> >that I suggested won't work.
> 
> Sorry, I mispoke. I'm reffering to dynamic server page registration.
> 
> Of course it does not eliminate pinning - but it does mitigate the
> foot print of the VM as a feature that was requested.
> 
> I have implemented it and documented it.
> 
> - Michael

Okay, but GIFT is supposed to be used on send side: it's only allowed
with local/remote read access, and serves to reduce memory usage
on send side.
For example, disable zero page detection and look at memory usage
on send side before and after migration.
Dynamic registration on the receive side is nice but seems
completely unrelated ...
Michael S. Tsirkin April 10, 2013, 3:05 p.m. UTC | #24
On Wed, Apr 10, 2013 at 11:48:24AM -0400, Michael R. Hines wrote:
> 
> There's a very nice, simple client/server RDMA application on the
> internet you can use to test your patch.
> 
> http://thegeekinthecorner.wordpress.com/2010/09/28/
> rdma-read-and-write-with-ib-verbs/
> 
> This guy provides the source code which dumps several gigabytes over RDMA
> to the other side.
> 
> There's no need to run QEMU to test your patch,
> assuming you have access to infiniband hardware.
> 
> - Michael


Does this app have any COW pages?

> On 04/10/2013 01:32 AM, Michael S. Tsirkin wrote:
> 
>     On Wed, Apr 10, 2013 at 12:32:31AM -0400, Michael R. Hines wrote:
> 
>         On 04/09/2013 11:24 PM, Michael S. Tsirkin wrote:
> 
>             Which mechanism do you refer to? You patches still seem to pin
>             each page in guest memory at some point, which will break all COW.
>             In particular any pagemap tricks to detect duplicates on source
>             that I suggested won't work.
> 
>         Sorry, I mispoke. I'm reffering to dynamic server page registration.
> 
>         Of course it does not eliminate pinning - but it does mitigate the
>         foot print of the VM as a feature that was requested.
> 
>         I have implemented it and documented it.
> 
>         - Michael
> 
>     Okay, but GIFT is supposed to be used on send side: it's only allowed
>     with local/remote read access, and serves to reduce memory usage
>     on send side.
>     For example, disable zero page detection and look at memory usage
>     on send side before and after migration.
>     Dynamic registration on the receive side is nice but seems
>     completely unrelated ...
> 
> 
>
mrhines@linux.vnet.ibm.com April 10, 2013, 3:48 p.m. UTC | #25
There's a very nice, simple client/server RDMA application on the
internet you can use to test your patch.

http://thegeekinthecorner.wordpress.com/2010/09/28/rdma-read-and-write-with-ib-verbs/

This guy provides the source code which dumps several gigabytes over RDMA
to the other side.

There's no need to run QEMU to test your patch,
assuming you have access to infiniband hardware.

- Michael

On 04/10/2013 01:32 AM, Michael S. Tsirkin wrote:
> On Wed, Apr 10, 2013 at 12:32:31AM -0400, Michael R. Hines wrote:
>> On 04/09/2013 11:24 PM, Michael S. Tsirkin wrote:
>>> Which mechanism do you refer to? You patches still seem to pin
>>> each page in guest memory at some point, which will break all COW.
>>> In particular any pagemap tricks to detect duplicates on source
>>> that I suggested won't work.
>> Sorry, I mispoke. I'm reffering to dynamic server page registration.
>>
>> Of course it does not eliminate pinning - but it does mitigate the
>> foot print of the VM as a feature that was requested.
>>
>> I have implemented it and documented it.
>>
>> - Michael
> Okay, but GIFT is supposed to be used on send side: it's only allowed
> with local/remote read access, and serves to reduce memory usage
> on send side.
> For example, disable zero page detection and look at memory usage
> on send side before and after migration.
> Dynamic registration on the receive side is nice but seems
> completely unrelated ...
>
mrhines@linux.vnet.ibm.com April 10, 2013, 4:28 p.m. UTC | #26
On 04/10/2013 11:05 AM, Michael S. Tsirkin wrote:
> On Wed, Apr 10, 2013 at 11:48:24AM -0400, Michael R. Hines wrote:
>> There's a very nice, simple client/server RDMA application on the
>> internet you can use to test your patch.
>>
>> http://thegeekinthecorner.wordpress.com/2010/09/28/
>> rdma-read-and-write-with-ib-verbs/
>>
>> This guy provides the source code which dumps several gigabytes over RDMA
>> to the other side.
>>
>> There's no need to run QEMU to test your patch,
>> assuming you have access to infiniband hardware.
>>
>> - Michael
>
> Does this app have any COW pages?

It can easily be made to be. It's just a couple of C source files.

I've only used it for basic testing.
diff mbox

Patch

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index a841123..5dee86d 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -89,6 +89,7 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	int ret;
 	int off;
 	int i;
+	bool gift, writable;
 	DEFINE_DMA_ATTRS(attrs);
 
 	if (dmasync)
@@ -96,6 +97,15 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 
 	if (!can_do_mlock())
 		return ERR_PTR(-EPERM);
+	/*
+	 * We ask for writable memory if any access flags other than
+	 * "remote read" or "gift" are set.  "Local write" and "remote write"
+	 * obviously require write access.  "Remote atomic" can do
+	 * things like fetch and add, which will modify memory, and
+	 * "MW bind" can change permissions by binding a window.
+	 */
+	gift  = access & IB_ACCESS_GIFT;
+	writable = access & ~(IB_ACCESS_REMOTE_READ | IB_ACCESS_GIFT);
 
 	umem = kmalloc(sizeof *umem, GFP_KERNEL);
 	if (!umem)
@@ -105,14 +115,7 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	umem->length    = size;
 	umem->offset    = addr & ~PAGE_MASK;
 	umem->page_size = PAGE_SIZE;
-	/*
-	 * We ask for writable memory if any access flags other than
-	 * "remote read" are set.  "Local write" and "remote write"
-	 * obviously require write access.  "Remote atomic" can do
-	 * things like fetch and add, which will modify memory, and
-	 * "MW bind" can change permissions by binding a window.
-	 */
-	umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);
+	umem->writable  = writable;
 
 	/* We assume the memory is from hugetlb until proved otherwise */
 	umem->hugetlb   = 1;
@@ -152,7 +155,7 @@  struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 		ret = get_user_pages(current, current->mm, cur_base,
 				     min_t(unsigned long, npages,
 					   PAGE_SIZE / sizeof (struct page *)),
-				     1, !umem->writable, page_list, vma_list);
+				     !gift, !umem->writable, page_list, vma_list);
 
 		if (ret < 0)
 			goto out;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 98cc4b2..2e6e13c 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -871,7 +871,14 @@  enum ib_access_flags {
 	IB_ACCESS_REMOTE_READ	= (1<<2),
 	IB_ACCESS_REMOTE_ATOMIC	= (1<<3),
 	IB_ACCESS_MW_BIND	= (1<<4),
-	IB_ZERO_BASED		= (1<<5)
+	IB_ZERO_BASED		= (1<<5),
+	/*
+	 * IB_ACCESS_GIFT: This memory is a gift to the adapter.  If memory is
+	 * modified after registration, the local version and data seen by the
+	 * adapter through this region rkey may differ.
+	 * Only legal with IB_ACCESS_REMOTE_READ or no permissions.
+	 */
+	IB_ACCESS_GIFT		= (1<<6)
 };
 
 struct ib_phys_buf {