[v3,04/20] mm: VMA sequence count

Message ID 1504894024-2750-5-git-send-email-ldufour@linux.vnet.ibm.com
State New
Headers show
Series
  • Speculative page faults
Related show

Commit Message

Laurent Dufour Sept. 8, 2017, 6:06 p.m.
From: Peter Zijlstra <peterz@infradead.org>

Wrap the VMA modifications (vma_adjust/unmap_page_range) with sequence
counts such that we can easily test if a VMA is changed.

The unmap_page_range() one allows us to make assumptions about
page-tables; when we find the seqcount hasn't changed we can assume
page-tables are still valid.

The flip side is that we cannot distinguish between a vma_adjust() and
the unmap_page_range() -- where with the former we could have
re-checked the vma bounds against the address.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

[Port to 4.12 kernel]
[Fix lock dependency between mapping->i_mmap_rwsem and vma->vm_sequence]
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
---
 include/linux/mm_types.h |  1 +
 mm/memory.c              |  2 ++
 mm/mmap.c                | 21 ++++++++++++++++++---
 3 files changed, 21 insertions(+), 3 deletions(-)

Comments

Sergey Senozhatsky Sept. 13, 2017, 11:53 a.m. | #1
Hi,

On (09/08/17 20:06), Laurent Dufour wrote:
[..]
> @@ -903,6 +910,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>  		mm->map_count--;
>  		mpol_put(vma_policy(next));
>  		kmem_cache_free(vm_area_cachep, next);
> +		write_seqcount_end(&next->vm_sequence);
>  		/*
>  		 * In mprotect's case 6 (see comments on vma_merge),
>  		 * we must remove another next too. It would clutter
> @@ -932,11 +940,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>  		if (remove_next == 2) {
>  			remove_next = 1;
>  			end = next->vm_end;
> +			write_seqcount_end(&vma->vm_sequence);
>  			goto again;
> -		}
> -		else if (next)
> +		} else if (next) {
> +			if (next != vma)
> +				write_seqcount_begin_nested(&next->vm_sequence,
> +							    SINGLE_DEPTH_NESTING);
>  			vma_gap_update(next);
> -		else {
> +		} else {
>  			/*
>  			 * If remove_next == 2 we obviously can't
>  			 * reach this path.
> @@ -962,6 +973,10 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>  	if (insert && file)
>  		uprobe_mmap(insert);
>  
> +	if (next && next != vma)
> +		write_seqcount_end(&next->vm_sequence);
> +	write_seqcount_end(&vma->vm_sequence);


ok, so what I got on my box is:

vm_munmap()  -> down_write_killable(&mm->mmap_sem)
 do_munmap()
  __split_vma()
   __vma_adjust()  -> write_seqcount_begin(&vma->vm_sequence)
                   -> write_seqcount_begin_nested(&next->vm_sequence, SINGLE_DEPTH_NESTING)

so this gives 3 dependencies  ->mmap_sem   ->   ->vm_seq
                              ->vm_seq     ->   ->vm_seq/1
                              ->mmap_sem   ->   ->vm_seq/1


SyS_mremap() -> down_write_killable(&current->mm->mmap_sem)
 move_vma()   -> write_seqcount_begin(&vma->vm_sequence)
              -> write_seqcount_begin_nested(&new_vma->vm_sequence, SINGLE_DEPTH_NESTING);
  move_page_tables()
   __pte_alloc()
    pte_alloc_one()
     __alloc_pages_nodemask()
      fs_reclaim_acquire()


I think here we have prepare_alloc_pages() call, that does

        -> fs_reclaim_acquire(gfp_mask)
        -> fs_reclaim_release(gfp_mask)

so that adds one more dependency  ->mmap_sem   ->   ->vm_seq    ->   fs_reclaim
                                  ->mmap_sem   ->   ->vm_seq/1  ->   fs_reclaim


now, under memory pressure we hit the slow path and perform direct
reclaim. direct reclaim is done under fs_reclaim lock, so we end up
with the following call chain

__alloc_pages_nodemask()
 __alloc_pages_slowpath()
  __perform_reclaim()       ->   fs_reclaim_acquire(gfp_mask);
   try_to_free_pages()
    shrink_node()
     shrink_active_list()
      rmap_walk_file()      ->   i_mmap_lock_read(mapping);


and this break the existing dependency. since we now take the leaf lock
(fs_reclaim) first and the the root lock (->mmap_sem).


well, seems to be the case.

	-ss
Laurent Dufour Sept. 13, 2017, 4:56 p.m. | #2
Hi Sergey,

On 13/09/2017 13:53, Sergey Senozhatsky wrote:
> Hi,
> 
> On (09/08/17 20:06), Laurent Dufour wrote:
> [..]
>> @@ -903,6 +910,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>>  		mm->map_count--;
>>  		mpol_put(vma_policy(next));
>>  		kmem_cache_free(vm_area_cachep, next);
>> +		write_seqcount_end(&next->vm_sequence);
>>  		/*
>>  		 * In mprotect's case 6 (see comments on vma_merge),
>>  		 * we must remove another next too. It would clutter
>> @@ -932,11 +940,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>>  		if (remove_next == 2) {
>>  			remove_next = 1;
>>  			end = next->vm_end;
>> +			write_seqcount_end(&vma->vm_sequence);
>>  			goto again;
>> -		}
>> -		else if (next)
>> +		} else if (next) {
>> +			if (next != vma)
>> +				write_seqcount_begin_nested(&next->vm_sequence,
>> +							    SINGLE_DEPTH_NESTING);
>>  			vma_gap_update(next);
>> -		else {
>> +		} else {
>>  			/*
>>  			 * If remove_next == 2 we obviously can't
>>  			 * reach this path.
>> @@ -962,6 +973,10 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
>>  	if (insert && file)
>>  		uprobe_mmap(insert);
>>  
>> +	if (next && next != vma)
>> +		write_seqcount_end(&next->vm_sequence);
>> +	write_seqcount_end(&vma->vm_sequence);
> 
> 
> ok, so what I got on my box is:
> 
> vm_munmap()  -> down_write_killable(&mm->mmap_sem)
>  do_munmap()
>   __split_vma()
>    __vma_adjust()  -> write_seqcount_begin(&vma->vm_sequence)
>                    -> write_seqcount_begin_nested(&next->vm_sequence, SINGLE_DEPTH_NESTING)
> 
> so this gives 3 dependencies  ->mmap_sem   ->   ->vm_seq
>                               ->vm_seq     ->   ->vm_seq/1
>                               ->mmap_sem   ->   ->vm_seq/1
> 
> 
> SyS_mremap() -> down_write_killable(&current->mm->mmap_sem)
>  move_vma()   -> write_seqcount_begin(&vma->vm_sequence)
>               -> write_seqcount_begin_nested(&new_vma->vm_sequence, SINGLE_DEPTH_NESTING);
>   move_page_tables()
>    __pte_alloc()
>     pte_alloc_one()
>      __alloc_pages_nodemask()
>       fs_reclaim_acquire()
> 
> 
> I think here we have prepare_alloc_pages() call, that does
> 
>         -> fs_reclaim_acquire(gfp_mask)
>         -> fs_reclaim_release(gfp_mask)
> 
> so that adds one more dependency  ->mmap_sem   ->   ->vm_seq    ->   fs_reclaim
>                                   ->mmap_sem   ->   ->vm_seq/1  ->   fs_reclaim
> 
> 
> now, under memory pressure we hit the slow path and perform direct
> reclaim. direct reclaim is done under fs_reclaim lock, so we end up
> with the following call chain
> 
> __alloc_pages_nodemask()
>  __alloc_pages_slowpath()
>   __perform_reclaim()       ->   fs_reclaim_acquire(gfp_mask);
>    try_to_free_pages()
>     shrink_node()
>      shrink_active_list()
>       rmap_walk_file()      ->   i_mmap_lock_read(mapping);
> 
> 
> and this break the existing dependency. since we now take the leaf lock
> (fs_reclaim) first and the the root lock (->mmap_sem).

Thanks for looking at this.
I'm sorry, I should have miss something.

My understanding is that there are 2 chains of locks:
 1. from __vma_adjust() mmap_sem -> i_mmap_rwsem -> vm_seq
 2. from move_vmap() mmap_sem -> vm_seq -> fs_reclaim
 2. from __alloc_pages_nodemask() fs_reclaim -> i_mmap_rwsem

So the solution would be to have in __vma_adjust()
 mmap_sem -> vm_seq -> i_mmap_rwsem

But this will raised the following dependency from  unmap_mapping_range()
unmap_mapping_range() 		-> i_mmap_rwsem
 unmap_mapping_range_tree()
  unmap_mapping_range_vma()
   zap_page_range_single()
    unmap_single_vma()
     unmap_page_range()	 	-> vm_seq

And there is no way to get rid of it easily as in unmap_mapping_range()
there is no VMA identified yet.

That's being said I can't see any clear way to get lock dependency cleaned
here.
Furthermore, this is not clear to me how a deadlock could happen as vm_seq
is a sequence lock, and there is no way to get blocked here.

Cheers,
Laurent.

> 
> well, seems to be the case.
> 
> 	-ss
>
Sergey Senozhatsky Sept. 14, 2017, 12:31 a.m. | #3
Hi,

On (09/13/17 18:56), Laurent Dufour wrote:
> Hi Sergey,
> 
> On 13/09/2017 13:53, Sergey Senozhatsky wrote:
> > Hi,
> > 
> > On (09/08/17 20:06), Laurent Dufour wrote:
[..]
> > ok, so what I got on my box is:
> > 
> > vm_munmap()  -> down_write_killable(&mm->mmap_sem)
> >  do_munmap()
> >   __split_vma()
> >    __vma_adjust()  -> write_seqcount_begin(&vma->vm_sequence)
> >                    -> write_seqcount_begin_nested(&next->vm_sequence, SINGLE_DEPTH_NESTING)
> > 
> > so this gives 3 dependencies  ->mmap_sem   ->   ->vm_seq
> >                               ->vm_seq     ->   ->vm_seq/1
> >                               ->mmap_sem   ->   ->vm_seq/1
> > 
> > 
> > SyS_mremap() -> down_write_killable(&current->mm->mmap_sem)
> >  move_vma()   -> write_seqcount_begin(&vma->vm_sequence)
> >               -> write_seqcount_begin_nested(&new_vma->vm_sequence, SINGLE_DEPTH_NESTING);
> >   move_page_tables()
> >    __pte_alloc()
> >     pte_alloc_one()
> >      __alloc_pages_nodemask()
> >       fs_reclaim_acquire()
> > 
> > 
> > I think here we have prepare_alloc_pages() call, that does
> > 
> >         -> fs_reclaim_acquire(gfp_mask)
> >         -> fs_reclaim_release(gfp_mask)
> > 
> > so that adds one more dependency  ->mmap_sem   ->   ->vm_seq    ->   fs_reclaim
> >                                   ->mmap_sem   ->   ->vm_seq/1  ->   fs_reclaim
> > 
> > 
> > now, under memory pressure we hit the slow path and perform direct
> > reclaim. direct reclaim is done under fs_reclaim lock, so we end up
> > with the following call chain
> > 
> > __alloc_pages_nodemask()
> >  __alloc_pages_slowpath()
> >   __perform_reclaim()       ->   fs_reclaim_acquire(gfp_mask);
> >    try_to_free_pages()
> >     shrink_node()
> >      shrink_active_list()
> >       rmap_walk_file()      ->   i_mmap_lock_read(mapping);
> > 
> > 
> > and this break the existing dependency. since we now take the leaf lock
> > (fs_reclaim) first and the the root lock (->mmap_sem).
> 
> Thanks for looking at this.
> I'm sorry, I should have miss something.

no prob :)


> My understanding is that there are 2 chains of locks:
>  1. from __vma_adjust() mmap_sem -> i_mmap_rwsem -> vm_seq
>  2. from move_vmap() mmap_sem -> vm_seq -> fs_reclaim
>  2. from __alloc_pages_nodemask() fs_reclaim -> i_mmap_rwsem

yes, as far as lockdep warning suggests.

> So the solution would be to have in __vma_adjust()
>  mmap_sem -> vm_seq -> i_mmap_rwsem
> 
> But this will raised the following dependency from  unmap_mapping_range()
> unmap_mapping_range() 		-> i_mmap_rwsem
>  unmap_mapping_range_tree()
>   unmap_mapping_range_vma()
>    zap_page_range_single()
>     unmap_single_vma()
>      unmap_page_range()	 	-> vm_seq
> 
> And there is no way to get rid of it easily as in unmap_mapping_range()
> there is no VMA identified yet.
> 
> That's being said I can't see any clear way to get lock dependency cleaned
> here.
> Furthermore, this is not clear to me how a deadlock could happen as vm_seq
> is a sequence lock, and there is no way to get blocked here.

as far as I understand,
   seq locks can deadlock, technically. not on the write() side, but on
the read() side:

read_seqcount_begin()
 raw_read_seqcount_begin()
   __read_seqcount_begin()

and __read_seqcount_begin() spins for ever

   __read_seqcount_begin()
   {
    repeat:
     ret = READ_ONCE(s->sequence);
     if (unlikely(ret & 1)) {
         cpu_relax();
         goto repeat;
     }
     return ret;
   }


so if there are two CPUs, one doing write_seqcount() and the other one
doing read_seqcount() then what can happen is something like this

	CPU0					CPU1

						fs_reclaim_acquire()
	write_seqcount_begin()
	fs_reclaim_acquire()			read_seqcount_begin()
	write_seqcount_end()

CPU0 can't write_seqcount_end() because of fs_reclaim_acquire() from
CPU1, CPU1 can't read_seqcount_begin() because CPU0 did write_seqcount_begin()
and now waits for fs_reclaim_acquire(). makes sense?

	-ss
Laurent Dufour Sept. 14, 2017, 7:55 a.m. | #4
Hi,

On 14/09/2017 02:31, Sergey Senozhatsky wrote:
> Hi,
> 
> On (09/13/17 18:56), Laurent Dufour wrote:
>> Hi Sergey,
>>
>> On 13/09/2017 13:53, Sergey Senozhatsky wrote:
>>> Hi,
>>>
>>> On (09/08/17 20:06), Laurent Dufour wrote:
> [..]
>>> ok, so what I got on my box is:
>>>
>>> vm_munmap()  -> down_write_killable(&mm->mmap_sem)
>>>  do_munmap()
>>>   __split_vma()
>>>    __vma_adjust()  -> write_seqcount_begin(&vma->vm_sequence)
>>>                    -> write_seqcount_begin_nested(&next->vm_sequence, SINGLE_DEPTH_NESTING)
>>>
>>> so this gives 3 dependencies  ->mmap_sem   ->   ->vm_seq
>>>                               ->vm_seq     ->   ->vm_seq/1
>>>                               ->mmap_sem   ->   ->vm_seq/1
>>>
>>>
>>> SyS_mremap() -> down_write_killable(&current->mm->mmap_sem)
>>>  move_vma()   -> write_seqcount_begin(&vma->vm_sequence)
>>>               -> write_seqcount_begin_nested(&new_vma->vm_sequence, SINGLE_DEPTH_NESTING);
>>>   move_page_tables()
>>>    __pte_alloc()
>>>     pte_alloc_one()
>>>      __alloc_pages_nodemask()
>>>       fs_reclaim_acquire()
>>>
>>>
>>> I think here we have prepare_alloc_pages() call, that does
>>>
>>>         -> fs_reclaim_acquire(gfp_mask)
>>>         -> fs_reclaim_release(gfp_mask)
>>>
>>> so that adds one more dependency  ->mmap_sem   ->   ->vm_seq    ->   fs_reclaim
>>>                                   ->mmap_sem   ->   ->vm_seq/1  ->   fs_reclaim
>>>
>>>
>>> now, under memory pressure we hit the slow path and perform direct
>>> reclaim. direct reclaim is done under fs_reclaim lock, so we end up
>>> with the following call chain
>>>
>>> __alloc_pages_nodemask()
>>>  __alloc_pages_slowpath()
>>>   __perform_reclaim()       ->   fs_reclaim_acquire(gfp_mask);
>>>    try_to_free_pages()
>>>     shrink_node()
>>>      shrink_active_list()
>>>       rmap_walk_file()      ->   i_mmap_lock_read(mapping);
>>>
>>>
>>> and this break the existing dependency. since we now take the leaf lock
>>> (fs_reclaim) first and the the root lock (->mmap_sem).
>>
>> Thanks for looking at this.
>> I'm sorry, I should have miss something.
> 
> no prob :)
> 
> 
>> My understanding is that there are 2 chains of locks:
>>  1. from __vma_adjust() mmap_sem -> i_mmap_rwsem -> vm_seq
>>  2. from move_vmap() mmap_sem -> vm_seq -> fs_reclaim
>>  2. from __alloc_pages_nodemask() fs_reclaim -> i_mmap_rwsem
> 
> yes, as far as lockdep warning suggests.
> 
>> So the solution would be to have in __vma_adjust()
>>  mmap_sem -> vm_seq -> i_mmap_rwsem
>>
>> But this will raised the following dependency from  unmap_mapping_range()
>> unmap_mapping_range() 		-> i_mmap_rwsem
>>  unmap_mapping_range_tree()
>>   unmap_mapping_range_vma()
>>    zap_page_range_single()
>>     unmap_single_vma()
>>      unmap_page_range()	 	-> vm_seq
>>
>> And there is no way to get rid of it easily as in unmap_mapping_range()
>> there is no VMA identified yet.
>>
>> That's being said I can't see any clear way to get lock dependency cleaned
>> here.
>> Furthermore, this is not clear to me how a deadlock could happen as vm_seq
>> is a sequence lock, and there is no way to get blocked here.
> 
> as far as I understand,
>    seq locks can deadlock, technically. not on the write() side, but on
> the read() side:
> 
> read_seqcount_begin()
>  raw_read_seqcount_begin()
>    __read_seqcount_begin()
> 
> and __read_seqcount_begin() spins for ever
> 
>    __read_seqcount_begin()
>    {
>     repeat:
>      ret = READ_ONCE(s->sequence);
>      if (unlikely(ret & 1)) {
>          cpu_relax();
>          goto repeat;
>      }
>      return ret;
>    }
> 
> 
> so if there are two CPUs, one doing write_seqcount() and the other one
> doing read_seqcount() then what can happen is something like this
> 
> 	CPU0					CPU1
> 
> 						fs_reclaim_acquire()
> 	write_seqcount_begin()
> 	fs_reclaim_acquire()			read_seqcount_begin()
> 	write_seqcount_end()
> 
> CPU0 can't write_seqcount_end() because of fs_reclaim_acquire() from
> CPU1, CPU1 can't read_seqcount_begin() because CPU0 did write_seqcount_begin()
> and now waits for fs_reclaim_acquire(). makes sense?

Yes, this makes sense.

But in the case of this series, there is no call to
__read_seqcount_begin(), and the reader (the speculative page fault
handler), is just checking for (vm_seq & 1) and if this is true, simply
exit the speculative path without waiting.
So there is no deadlock possibility.

The bad case would be to have 2 concurrent threads calling
write_seqcount_begin() on the same VMA, leading a wrongly freed sequence
lock but this can't happen because of the mmap_sem holding for write in
such a case.

Cheers,
Laurent.
Sergey Senozhatsky Sept. 14, 2017, 8:13 a.m. | #5
Hi,

On (09/14/17 09:55), Laurent Dufour wrote:
[..]
> > so if there are two CPUs, one doing write_seqcount() and the other one
> > doing read_seqcount() then what can happen is something like this
> > 
> > 	CPU0					CPU1
> > 
> > 						fs_reclaim_acquire()
> > 	write_seqcount_begin()
> > 	fs_reclaim_acquire()			read_seqcount_begin()
> > 	write_seqcount_end()
> > 
> > CPU0 can't write_seqcount_end() because of fs_reclaim_acquire() from
> > CPU1, CPU1 can't read_seqcount_begin() because CPU0 did write_seqcount_begin()
> > and now waits for fs_reclaim_acquire(). makes sense?
> 
> Yes, this makes sense.
> 
> But in the case of this series, there is no call to
> __read_seqcount_begin(), and the reader (the speculative page fault
> handler), is just checking for (vm_seq & 1) and if this is true, simply
> exit the speculative path without waiting.
> So there is no deadlock possibility.

probably lockdep just knows that those locks interleave at some
point.


by the way, I think there is one path that can spin

find_vma_srcu()
 read_seqbegin()
  read_seqcount_begin()
   raw_read_seqcount_begin()
    __read_seqcount_begin()

	-ss
Laurent Dufour Sept. 14, 2017, 8:58 a.m. | #6
On 14/09/2017 10:13, Sergey Senozhatsky wrote:
> Hi,
> 
> On (09/14/17 09:55), Laurent Dufour wrote:
> [..]
>>> so if there are two CPUs, one doing write_seqcount() and the other one
>>> doing read_seqcount() then what can happen is something like this
>>>
>>> 	CPU0					CPU1
>>>
>>> 						fs_reclaim_acquire()
>>> 	write_seqcount_begin()
>>> 	fs_reclaim_acquire()			read_seqcount_begin()
>>> 	write_seqcount_end()
>>>
>>> CPU0 can't write_seqcount_end() because of fs_reclaim_acquire() from
>>> CPU1, CPU1 can't read_seqcount_begin() because CPU0 did write_seqcount_begin()
>>> and now waits for fs_reclaim_acquire(). makes sense?
>>
>> Yes, this makes sense.
>>
>> But in the case of this series, there is no call to
>> __read_seqcount_begin(), and the reader (the speculative page fault
>> handler), is just checking for (vm_seq & 1) and if this is true, simply
>> exit the speculative path without waiting.
>> So there is no deadlock possibility.
> 
> probably lockdep just knows that those locks interleave at some
> point.
> 
> 
> by the way, I think there is one path that can spin
> 
> find_vma_srcu()
>  read_seqbegin()
>   read_seqcount_begin()
>    raw_read_seqcount_begin()
>     __read_seqcount_begin()


That's right, but here this is the  sequence counter mm->mm_seq, not the
vm_seq one.

This one is held to protect walking the VMA list "locklessly"...

Cheers,
Laurent.
Sergey Senozhatsky Sept. 14, 2017, 9:11 a.m. | #7
On (09/14/17 10:58), Laurent Dufour wrote:
[..]
> That's right, but here this is the  sequence counter mm->mm_seq, not the
> vm_seq one.

d'oh... you are right.

	-ss
Laurent Dufour Sept. 14, 2017, 9:15 a.m. | #8
On 14/09/2017 11:11, Sergey Senozhatsky wrote:
> On (09/14/17 10:58), Laurent Dufour wrote:
> [..]
>> That's right, but here this is the  sequence counter mm->mm_seq, not the
>> vm_seq one.
> 
> d'oh... you are right.

So I'm doubting about the probability of a deadlock here, but I don't like
to see lockdep complaining. Is there an easy way to make it happy ?
Sergey Senozhatsky Sept. 14, 2017, 9:40 a.m. | #9
On (09/14/17 11:15), Laurent Dufour wrote:
> On 14/09/2017 11:11, Sergey Senozhatsky wrote:
> > On (09/14/17 10:58), Laurent Dufour wrote:
> > [..]
> >> That's right, but here this is the  sequence counter mm->mm_seq, not the
> >> vm_seq one.
> > 
> > d'oh... you are right.
> 
> So I'm doubting about the probability of a deadlock here, but I don't like
> to see lockdep complaining. Is there an easy way to make it happy ?


 /*
  * well... answering your question - it seems raw versions of seqcount
  * functions don't call lockdep's lock_acquire/lock_release...
  *
  * but I have never told you that. never.
  */


lockdep, perhaps, can be wrong sometimes, and may be it's one of those
cases. may be not... I'm not a MM guy myself.

below is a lockdep splat I got yesterday. that's v3 of SPF patch set.


[ 2763.365898] ======================================================
[ 2763.365899] WARNING: possible circular locking dependency detected
[ 2763.365902] 4.13.0-next-20170913-dbg-00039-ge3c06ea4b028-dirty #1837 Not tainted
[ 2763.365903] ------------------------------------------------------
[ 2763.365905] khugepaged/42 is trying to acquire lock:
[ 2763.365906]  (&mapping->i_mmap_rwsem){++++}, at: [<ffffffff811181cc>] rmap_walk_file+0x5a/0x142
[ 2763.365913] 
               but task is already holding lock:
[ 2763.365915]  (fs_reclaim){+.+.}, at: [<ffffffff810e99dc>] fs_reclaim_acquire+0x12/0x35
[ 2763.365920] 
               which lock already depends on the new lock.

[ 2763.365922] 
               the existing dependency chain (in reverse order) is:
[ 2763.365924] 
               -> #3 (fs_reclaim){+.+.}:
[ 2763.365930]        lock_acquire+0x176/0x19e
[ 2763.365932]        fs_reclaim_acquire+0x32/0x35
[ 2763.365934]        __alloc_pages_nodemask+0x6d/0x1f9
[ 2763.365937]        pte_alloc_one+0x17/0x62
[ 2763.365940]        __pte_alloc+0x1f/0x83
[ 2763.365943]        move_page_tables+0x2c3/0x5a2
[ 2763.365944]        move_vma.isra.25+0xff/0x29f
[ 2763.365946]        SyS_mremap+0x41b/0x49e
[ 2763.365949]        entry_SYSCALL_64_fastpath+0x18/0xad
[ 2763.365951] 
               -> #2 (&vma->vm_sequence/1){+.+.}:
[ 2763.365955]        lock_acquire+0x176/0x19e
[ 2763.365958]        write_seqcount_begin_nested+0x1b/0x1d
[ 2763.365959]        __vma_adjust+0x1c4/0x5f1
[ 2763.365961]        __split_vma+0x12c/0x181
[ 2763.365963]        do_munmap+0x128/0x2af
[ 2763.365965]        vm_munmap+0x5a/0x73
[ 2763.365968]        elf_map+0xb1/0xce
[ 2763.365970]        load_elf_binary+0x91e/0x137a
[ 2763.365973]        search_binary_handler+0x70/0x1f3
[ 2763.365974]        do_execveat_common+0x45e/0x68e
[ 2763.365978]        call_usermodehelper_exec_async+0xf7/0x11f
[ 2763.365980]        ret_from_fork+0x27/0x40
[ 2763.365981] 
               -> #1 (&vma->vm_sequence){+.+.}:
[ 2763.365985]        lock_acquire+0x176/0x19e
[ 2763.365987]        write_seqcount_begin_nested+0x1b/0x1d
[ 2763.365989]        __vma_adjust+0x1a9/0x5f1
[ 2763.365991]        __split_vma+0x12c/0x181
[ 2763.365993]        do_munmap+0x128/0x2af
[ 2763.365994]        vm_munmap+0x5a/0x73
[ 2763.365996]        elf_map+0xb1/0xce
[ 2763.365998]        load_elf_binary+0x91e/0x137a
[ 2763.365999]        search_binary_handler+0x70/0x1f3
[ 2763.366001]        do_execveat_common+0x45e/0x68e
[ 2763.366003]        call_usermodehelper_exec_async+0xf7/0x11f
[ 2763.366005]        ret_from_fork+0x27/0x40
[ 2763.366006] 
               -> #0 (&mapping->i_mmap_rwsem){++++}:
[ 2763.366010]        __lock_acquire+0xa72/0xca0
[ 2763.366012]        lock_acquire+0x176/0x19e
[ 2763.366015]        down_read+0x3b/0x55
[ 2763.366017]        rmap_walk_file+0x5a/0x142
[ 2763.366018]        page_referenced+0xfc/0x134
[ 2763.366022]        shrink_active_list+0x1ac/0x37d
[ 2763.366024]        shrink_node_memcg.constprop.72+0x3ca/0x567
[ 2763.366026]        shrink_node+0x3f/0x14c
[ 2763.366028]        try_to_free_pages+0x288/0x47a
[ 2763.366030]        __alloc_pages_slowpath+0x3a7/0xa49
[ 2763.366032]        __alloc_pages_nodemask+0xf1/0x1f9
[ 2763.366035]        khugepaged+0xc8/0x167c
[ 2763.366037]        kthread+0x133/0x13b
[ 2763.366039]        ret_from_fork+0x27/0x40
[ 2763.366040] 
               other info that might help us debug this:

[ 2763.366042] Chain exists of:
                 &mapping->i_mmap_rwsem --> &vma->vm_sequence/1 --> fs_reclaim

[ 2763.366048]  Possible unsafe locking scenario:

[ 2763.366049]        CPU0                    CPU1
[ 2763.366050]        ----                    ----
[ 2763.366051]   lock(fs_reclaim);
[ 2763.366054]                                lock(&vma->vm_sequence/1);
[ 2763.366056]                                lock(fs_reclaim);
[ 2763.366058]   lock(&mapping->i_mmap_rwsem);
[ 2763.366061] 
                *** DEADLOCK ***

[ 2763.366063] 1 lock held by khugepaged/42:
[ 2763.366064]  #0:  (fs_reclaim){+.+.}, at: [<ffffffff810e99dc>] fs_reclaim_acquire+0x12/0x35
[ 2763.366068] 
               stack backtrace:
[ 2763.366071] CPU: 2 PID: 42 Comm: khugepaged Not tainted 4.13.0-next-20170913-dbg-00039-ge3c06ea4b028-dirty #1837
[ 2763.366073] Call Trace:
[ 2763.366077]  dump_stack+0x67/0x8e
[ 2763.366080]  print_circular_bug+0x2a1/0x2af
[ 2763.366083]  ? graph_unlock+0x69/0x69
[ 2763.366085]  check_prev_add+0x76/0x20d
[ 2763.366087]  ? graph_unlock+0x69/0x69
[ 2763.366090]  __lock_acquire+0xa72/0xca0
[ 2763.366093]  ? __save_stack_trace+0xa3/0xbf
[ 2763.366096]  lock_acquire+0x176/0x19e
[ 2763.366098]  ? rmap_walk_file+0x5a/0x142
[ 2763.366100]  down_read+0x3b/0x55
[ 2763.366102]  ? rmap_walk_file+0x5a/0x142
[ 2763.366103]  rmap_walk_file+0x5a/0x142
[ 2763.366106]  page_referenced+0xfc/0x134
[ 2763.366108]  ? page_vma_mapped_walk_done.isra.17+0xb/0xb
[ 2763.366109]  ? page_get_anon_vma+0x6d/0x6d
[ 2763.366112]  shrink_active_list+0x1ac/0x37d
[ 2763.366115]  shrink_node_memcg.constprop.72+0x3ca/0x567
[ 2763.366118]  ? ___might_sleep+0xd5/0x234
[ 2763.366121]  shrink_node+0x3f/0x14c
[ 2763.366123]  try_to_free_pages+0x288/0x47a
[ 2763.366126]  __alloc_pages_slowpath+0x3a7/0xa49
[ 2763.366128]  ? ___might_sleep+0xd5/0x234
[ 2763.366131]  __alloc_pages_nodemask+0xf1/0x1f9
[ 2763.366133]  khugepaged+0xc8/0x167c
[ 2763.366138]  ? remove_wait_queue+0x47/0x47
[ 2763.366140]  ? collapse_shmem.isra.45+0x828/0x828
[ 2763.366142]  kthread+0x133/0x13b
[ 2763.366145]  ? __list_del_entry+0x1d/0x1d
[ 2763.366147]  ret_from_fork+0x27/0x40

	-ss
Laurent Dufour Sept. 15, 2017, 12:38 p.m. | #10
Hi,

On 14/09/2017 11:40, Sergey Senozhatsky wrote:
> On (09/14/17 11:15), Laurent Dufour wrote:
>> On 14/09/2017 11:11, Sergey Senozhatsky wrote:
>>> On (09/14/17 10:58), Laurent Dufour wrote:
>>> [..]
>>>> That's right, but here this is the  sequence counter mm->mm_seq, not the
>>>> vm_seq one.
>>>
>>> d'oh... you are right.
>>
>> So I'm doubting about the probability of a deadlock here, but I don't like
>> to see lockdep complaining. Is there an easy way to make it happy ?
> 
> 
>  /*
>   * well... answering your question - it seems raw versions of seqcount
>   * functions don't call lockdep's lock_acquire/lock_release...
>   *
>   * but I have never told you that. never.
>   */

Hum... I'm not sure that would be the best way since in other case lockdep
checks are valid, but if getting rid of locked's warning is required to get
this series upstream, I'd use raw versions... Please advice...

> 
> lockdep, perhaps, can be wrong sometimes, and may be it's one of those
> cases. may be not... I'm not a MM guy myself.

From the code reading I can't see any valid reason for a circular lock
dependency.

> below is a lockdep splat I got yesterday. that's v3 of SPF patch set.

This is exactly the same you got previously, and I still can't see how the
chain "&mapping->i_mmap_rwsem --> &vma->vm_sequence/1 --> fs_reclaim" could
happen.

Cheers,
Laurent.

> 
> [ 2763.365898] ======================================================
> [ 2763.365899] WARNING: possible circular locking dependency detected
> [ 2763.365902] 4.13.0-next-20170913-dbg-00039-ge3c06ea4b028-dirty #1837 Not tainted
> [ 2763.365903] ------------------------------------------------------
> [ 2763.365905] khugepaged/42 is trying to acquire lock:
> [ 2763.365906]  (&mapping->i_mmap_rwsem){++++}, at: [<ffffffff811181cc>] rmap_walk_file+0x5a/0x142
> [ 2763.365913] 
>                but task is already holding lock:
> [ 2763.365915]  (fs_reclaim){+.+.}, at: [<ffffffff810e99dc>] fs_reclaim_acquire+0x12/0x35
> [ 2763.365920] 
>                which lock already depends on the new lock.
> 
> [ 2763.365922] 
>                the existing dependency chain (in reverse order) is:
> [ 2763.365924] 
>                -> #3 (fs_reclaim){+.+.}:
> [ 2763.365930]        lock_acquire+0x176/0x19e
> [ 2763.365932]        fs_reclaim_acquire+0x32/0x35
> [ 2763.365934]        __alloc_pages_nodemask+0x6d/0x1f9
> [ 2763.365937]        pte_alloc_one+0x17/0x62
> [ 2763.365940]        __pte_alloc+0x1f/0x83
> [ 2763.365943]        move_page_tables+0x2c3/0x5a2
> [ 2763.365944]        move_vma.isra.25+0xff/0x29f
> [ 2763.365946]        SyS_mremap+0x41b/0x49e
> [ 2763.365949]        entry_SYSCALL_64_fastpath+0x18/0xad
> [ 2763.365951] 
>                -> #2 (&vma->vm_sequence/1){+.+.}:
> [ 2763.365955]        lock_acquire+0x176/0x19e
> [ 2763.365958]        write_seqcount_begin_nested+0x1b/0x1d
> [ 2763.365959]        __vma_adjust+0x1c4/0x5f1
> [ 2763.365961]        __split_vma+0x12c/0x181
> [ 2763.365963]        do_munmap+0x128/0x2af
> [ 2763.365965]        vm_munmap+0x5a/0x73
> [ 2763.365968]        elf_map+0xb1/0xce
> [ 2763.365970]        load_elf_binary+0x91e/0x137a
> [ 2763.365973]        search_binary_handler+0x70/0x1f3
> [ 2763.365974]        do_execveat_common+0x45e/0x68e
> [ 2763.365978]        call_usermodehelper_exec_async+0xf7/0x11f
> [ 2763.365980]        ret_from_fork+0x27/0x40
> [ 2763.365981] 
>                -> #1 (&vma->vm_sequence){+.+.}:
> [ 2763.365985]        lock_acquire+0x176/0x19e
> [ 2763.365987]        write_seqcount_begin_nested+0x1b/0x1d
> [ 2763.365989]        __vma_adjust+0x1a9/0x5f1
> [ 2763.365991]        __split_vma+0x12c/0x181
> [ 2763.365993]        do_munmap+0x128/0x2af
> [ 2763.365994]        vm_munmap+0x5a/0x73
> [ 2763.365996]        elf_map+0xb1/0xce
> [ 2763.365998]        load_elf_binary+0x91e/0x137a
> [ 2763.365999]        search_binary_handler+0x70/0x1f3
> [ 2763.366001]        do_execveat_common+0x45e/0x68e
> [ 2763.366003]        call_usermodehelper_exec_async+0xf7/0x11f
> [ 2763.366005]        ret_from_fork+0x27/0x40
> [ 2763.366006] 
>                -> #0 (&mapping->i_mmap_rwsem){++++}:
> [ 2763.366010]        __lock_acquire+0xa72/0xca0
> [ 2763.366012]        lock_acquire+0x176/0x19e
> [ 2763.366015]        down_read+0x3b/0x55
> [ 2763.366017]        rmap_walk_file+0x5a/0x142
> [ 2763.366018]        page_referenced+0xfc/0x134
> [ 2763.366022]        shrink_active_list+0x1ac/0x37d
> [ 2763.366024]        shrink_node_memcg.constprop.72+0x3ca/0x567
> [ 2763.366026]        shrink_node+0x3f/0x14c
> [ 2763.366028]        try_to_free_pages+0x288/0x47a
> [ 2763.366030]        __alloc_pages_slowpath+0x3a7/0xa49
> [ 2763.366032]        __alloc_pages_nodemask+0xf1/0x1f9
> [ 2763.366035]        khugepaged+0xc8/0x167c
> [ 2763.366037]        kthread+0x133/0x13b
> [ 2763.366039]        ret_from_fork+0x27/0x40
> [ 2763.366040] 
>                other info that might help us debug this:
> 
> [ 2763.366042] Chain exists of:
>                  &mapping->i_mmap_rwsem --> &vma->vm_sequence/1 --> fs_reclaim
> 
> [ 2763.366048]  Possible unsafe locking scenario:
> 
> [ 2763.366049]        CPU0                    CPU1
> [ 2763.366050]        ----                    ----
> [ 2763.366051]   lock(fs_reclaim);
> [ 2763.366054]                                lock(&vma->vm_sequence/1);
> [ 2763.366056]                                lock(fs_reclaim);
> [ 2763.366058]   lock(&mapping->i_mmap_rwsem);
> [ 2763.366061] 
>                 *** DEADLOCK ***
> 
> [ 2763.366063] 1 lock held by khugepaged/42:
> [ 2763.366064]  #0:  (fs_reclaim){+.+.}, at: [<ffffffff810e99dc>] fs_reclaim_acquire+0x12/0x35
> [ 2763.366068] 
>                stack backtrace:
> [ 2763.366071] CPU: 2 PID: 42 Comm: khugepaged Not tainted 4.13.0-next-20170913-dbg-00039-ge3c06ea4b028-dirty #1837
> [ 2763.366073] Call Trace:
> [ 2763.366077]  dump_stack+0x67/0x8e
> [ 2763.366080]  print_circular_bug+0x2a1/0x2af
> [ 2763.366083]  ? graph_unlock+0x69/0x69
> [ 2763.366085]  check_prev_add+0x76/0x20d
> [ 2763.366087]  ? graph_unlock+0x69/0x69
> [ 2763.366090]  __lock_acquire+0xa72/0xca0
> [ 2763.366093]  ? __save_stack_trace+0xa3/0xbf
> [ 2763.366096]  lock_acquire+0x176/0x19e
> [ 2763.366098]  ? rmap_walk_file+0x5a/0x142
> [ 2763.366100]  down_read+0x3b/0x55
> [ 2763.366102]  ? rmap_walk_file+0x5a/0x142
> [ 2763.366103]  rmap_walk_file+0x5a/0x142
> [ 2763.366106]  page_referenced+0xfc/0x134
> [ 2763.366108]  ? page_vma_mapped_walk_done.isra.17+0xb/0xb
> [ 2763.366109]  ? page_get_anon_vma+0x6d/0x6d
> [ 2763.366112]  shrink_active_list+0x1ac/0x37d
> [ 2763.366115]  shrink_node_memcg.constprop.72+0x3ca/0x567
> [ 2763.366118]  ? ___might_sleep+0xd5/0x234
> [ 2763.366121]  shrink_node+0x3f/0x14c
> [ 2763.366123]  try_to_free_pages+0x288/0x47a
> [ 2763.366126]  __alloc_pages_slowpath+0x3a7/0xa49
> [ 2763.366128]  ? ___might_sleep+0xd5/0x234
> [ 2763.366131]  __alloc_pages_nodemask+0xf1/0x1f9
> [ 2763.366133]  khugepaged+0xc8/0x167c
> [ 2763.366138]  ? remove_wait_queue+0x47/0x47
> [ 2763.366140]  ? collapse_shmem.isra.45+0x828/0x828
> [ 2763.366142]  kthread+0x133/0x13b
> [ 2763.366145]  ? __list_del_entry+0x1d/0x1d
> [ 2763.366147]  ret_from_fork+0x27/0x40
> 
> 	-ss
>

Patch

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 46f4ecf5479a..df9a530c8ca1 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -344,6 +344,7 @@  struct vm_area_struct {
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+	seqcount_t vm_sequence;
 } __randomize_layout;
 
 struct core_thread {
diff --git a/mm/memory.c b/mm/memory.c
index 530d887ca885..f250e7c92948 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1499,6 +1499,7 @@  void unmap_page_range(struct mmu_gather *tlb,
 	unsigned long next;
 
 	BUG_ON(addr >= end);
+	write_seqcount_begin(&vma->vm_sequence);
 	tlb_start_vma(tlb, vma);
 	pgd = pgd_offset(vma->vm_mm, addr);
 	do {
@@ -1508,6 +1509,7 @@  void unmap_page_range(struct mmu_gather *tlb,
 		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
 	} while (pgd++, addr = next, addr != end);
 	tlb_end_vma(tlb, vma);
+	write_seqcount_end(&vma->vm_sequence);
 }
 
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 680506faceae..0a0012c7e50c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -558,6 +558,8 @@  void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 	else
 		mm->highest_vm_end = vm_end_gap(vma);
 
+	seqcount_init(&vma->vm_sequence);
+
 	/*
 	 * vma->vm_prev wasn't known when we followed the rbtree to find the
 	 * correct insertion point for that vma. As a result, we could not
@@ -799,6 +801,11 @@  int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		}
 	}
 
+	write_seqcount_begin(&vma->vm_sequence);
+	if (next && next != vma)
+		write_seqcount_begin_nested(&next->vm_sequence,
+					    SINGLE_DEPTH_NESTING);
+
 	anon_vma = vma->anon_vma;
 	if (!anon_vma && adjust_next)
 		anon_vma = next->anon_vma;
@@ -903,6 +910,7 @@  int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		mm->map_count--;
 		mpol_put(vma_policy(next));
 		kmem_cache_free(vm_area_cachep, next);
+		write_seqcount_end(&next->vm_sequence);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
@@ -932,11 +940,14 @@  int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 		if (remove_next == 2) {
 			remove_next = 1;
 			end = next->vm_end;
+			write_seqcount_end(&vma->vm_sequence);
 			goto again;
-		}
-		else if (next)
+		} else if (next) {
+			if (next != vma)
+				write_seqcount_begin_nested(&next->vm_sequence,
+							    SINGLE_DEPTH_NESTING);
 			vma_gap_update(next);
-		else {
+		} else {
 			/*
 			 * If remove_next == 2 we obviously can't
 			 * reach this path.
@@ -962,6 +973,10 @@  int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	if (insert && file)
 		uprobe_mmap(insert);
 
+	if (next && next != vma)
+		write_seqcount_end(&next->vm_sequence);
+	write_seqcount_end(&vma->vm_sequence);
+
 	validate_mm(mm);
 
 	return 0;