[bpf,v2,3/6] bpf: sockhash fix omitted bucket lock in sock_close

Message ID: 20180614164457.24994.33710.stgit@john-Precision-Tower-5810
State: Changes Requested, archived
Delegated to: BPF Maintainers
Series: BPF fixes for sockhash

Commit Message

John Fastabend June 14, 2018, 4:44 p.m. UTC
First, in tcp_close, reduce the scope of sk_callback_lock(): the lock is
only needed to protect smap_release_sock(); the ingress and cork lists
are protected by the sock lock. Holding the lock over a wider scope is
harmless, but may confuse a reader into inferring it is in fact needed
there.

Next, in sock_hash_delete_elem() the pattern is as follows,

  sock_hash_delete_elem()
     [...]
     spin_lock(bucket_lock)
     l = lookup_elem_raw()
     if (l)
        hlist_del_rcu()
        write_lock(sk_callback_lock)
         .... destroy psock ...
        write_unlock(sk_callback_lock)
     spin_unlock(bucket_lock)

The ordering is necessary because we only know the {p}sock after
dereferencing the hash table, which we cannot do unless we hold the
bucket lock. Once we have the bucket lock and the psock element, the
element is deleted from the hashmap so that any other path doing a
lookup will fail. Finally, the refcnt is decremented and, if it reaches
zero, the psock is destroyed.
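
For reference, a minimal C sketch of that delete-side ordering, pieced
together from the helpers visible in the patch below (__select_bucket(),
lookup_elem_raw(), smap_release_sock(), free_htab_elem()). It is a
simplified illustration with a made-up function name, not the exact
sock_hash_delete_elem() in the tree:

  /* Sketch only: bucket lock -> lookup -> unlink -> drop psock ref. */
  static int sock_hash_delete_elem_sketch(struct bpf_map *map, void *key)
  {
          struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
          u32 hash = htab_map_hash(key, map->key_size);
          struct bucket *b = __select_bucket(htab, hash);
          struct htab_elem *l;
          int err = -ENOENT;

          raw_spin_lock_bh(&b->lock);            /* bucket lock first */
          l = lookup_elem_raw(&b->head, hash, key, map->key_size);
          if (l) {
                  struct sock *sk = l->sk;
                  struct smap_psock *psock = smap_psock_sk(sk);

                  hlist_del_rcu(&l->hash_node);  /* hide from other lookups */
                  write_lock_bh(&sk->sk_callback_lock);
                  smap_release_sock(psock, sk);  /* dec refcnt, destroy at 0 */
                  write_unlock_bh(&sk->sk_callback_lock);
                  free_htab_elem(htab, l);       /* kfree_rcu() the element */
                  err = 0;
          }
          raw_spin_unlock_bh(&b->lock);
          return err;
  }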

In parallel with the above (or with freeing the map), a TCP close event
may trigger tcp_close(), which at the moment omits the bucket lock
altogether (oops!). The flow there looks like this,

  bpf_tcp_close()
     [...]
     write_lock(sk_callback_lock)
     for each psock->maps // list of maps this sock is part of
         hlist_del_rcu(ref_hash_node);
         .... destroy psock ...
     write_unlock(sk_callback_lock)

Obviously, and as demonstrated by syzbot, this is broken because
we can have multiple threads deleting entries via hlist_del_rcu().

To fix this we might be tempted to wrap the hlist operation in the
bucket lock, but that would create a lock inversion problem: to follow
the existing locking rules, the maps list needs the sk_callback_lock,
but we need the bucket lock to do the hlist_del_rcu(). To resolve the
inversion, note that when bpf_tcp_close() is called no updates can
happen in parallel, due to the ESTABLISHED state check in the update
logic, so pop the head of the list repeatedly and drop the reference
until none are left. If a delete happens in parallel from the BPF API
that is OK as well, because it will do a similar action: look up the
sock in the map/hash, delete it from the map/hash, and decrement the
refcnt. We check for this case before destroying the psock to ensure
we don't have two threads tearing down the same psock. The new logic
is as follows,

  bpf_tcp_close()
  e = psock_map_pop(psock->maps) // done with sk_callback_lock
  bucket_lock() // lock hash list bucket
  l = lookup_elem_raw(head, hash, key, key_size);
  if (l) {
     // only get here if the element was not already removed
     hlist_del_rcu()
     ... destroy psock...
  }
  bucket_unlock()

And finally, for all the above to work, add the missing sk_callback_lock
around smap_list_remove() in sock_hash_ctx_update_elem(). Otherwise a
delete and an update may corrupt the maps list.

(As an aside, the sk_callback_lock serves two purposes. The first is to
 protect updates to the sock callbacks sk_data_ready, sk_write_space,
 etc. The second is to protect the psock 'maps' list. The 'maps' list is
 used (as shown above) to delete all map/hash references to a sock when
 the sock is closed.)

(If we did not have the ESTABLISHED state guarantee from tcp_close then
 we could not ensure completion, because updates could happen forever
 and pin the thread in the delete loop.)
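
For context, the ESTABLISHED check relied on above sits in the map/hash
update paths and looks roughly like the following (paraphrased for
illustration, not part of this diff):

  /* The update paths refuse sockets that are not ESTABLISHED, so once
   * tcp_close() has moved the sock out of that state no new map/hash
   * entries can be added for it.
   */
  if (skops->sk->sk_state != TCP_ESTABLISHED)
          return -EOPNOTSUPP;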

Reported-by: syzbot+0ce137753c78f7b6acc1@syzkaller.appspotmail.com
Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 0 files changed

Comments

Martin KaFai Lau June 15, 2018, 5:41 a.m. UTC | #1
On Thu, Jun 14, 2018 at 09:44:57AM -0700, John Fastabend wrote:
> First in tcp_close, reduce scope of sk_callback_lock() the lock is
> only needed for protecting smap_release_sock() the ingress and cork
> lists are protected by sock lock. Having the lock in wider scope is
> harmless but may confuse the reader who may infer it is in fact
> needed.
> 
> Next, in sock_hash_delete_elem() the pattern is as follows,
> 
>   sock_hash_delete_elem()
>      [...]
>      spin_lock(bucket_lock)
>      l = lookup_elem_raw()
>      if (l)
>         hlist_del_rcu()
>         write_lock(sk_callback_lock)
>          .... destroy psock ...
>         write_unlock(sk_callback_lock)
>      spin_unlock(bucket_lock)
> 
> The ordering is necessary because we only know the {p}sock after
> dereferencing the hash table which we can't do unless we have the
> bucket lock held. Once we have the bucket lock and the psock element
> it is deleted from the hashmap to ensure any other path doing a lookup
> will fail. Finally, the refcnt is decremented and if zero the psock
> is destroyed.
> 
> In parallel with the above (or free'ing the map) a tcp close event
> may trigger tcp_close(). Which at the moment omits the bucket lock
> altogether (oops!) where the flow looks like this,
> 
>   bpf_tcp_close()
>      [...]
>      write_lock(sk_callback_lock)
>      for each psock->maps // list of maps this sock is part of
>          hlist_del_rcu(ref_hash_node);
>          .... destroy psock ...
>      write_unlock(sk_callback_lock)
> 
> Obviously, and demonstrated by syzbot, this is broken because
> we can have multiple threads deleting entries via hlist_del_rcu().
> 
> To fix this we might be tempted to wrap the hlist operation in a
> bucket lock but that would create a lock inversion problem. In
> summary to follow locking rules maps needs the sk_callback_lock but we
> need the bucket lock to do the hlist_del_rcu. To resolve the lock
> inversion problem note that when bpf_tcp_close is called no updates
> can happen in parallel, due to ESTABLISH state check in update logic,
> so pop the head of the list repeatedly and remove the reference until
> no more are left. If a delete happens in parallel from the BPF API
> that is OK as well because it will do a similar action, lookup the
> sock in the map/hash, delete it from the map/hash, and dec the refcnt.
> We check for this case before doing a destroy on the psock to ensure
> we don't have two threads tearing down a psock. The new logic is
> as follows,
> 
>   bpf_tcp_close()
>   e = psock_map_pop(psock->maps) // done with sk_callback_lock
>   bucket_lock() // lock hash list bucket
>   l = lookup_elem_raw(head, hash, key, key_size);
>   if (l) {
>      //only get here if elmnt was not already removed
>      hlist_del_rcu()
>      ... destroy psock...
>   }
>   bucket_unlock()
> 
> And finally for all the above to work add missing sk_callback_lock
> around smap_list_remove in sock_hash_ctx_update_elem(). Otherwise
> delete and update may corrupt maps list.
> 
> (As an aside the sk_callback_lock serves two purposes. The
>  first, is to update the sock callbacks sk_data_ready, sk_write_space,
>  etc. The second is to protect the psock 'maps' list. The 'maps' list
>  is used to (as shown above) to delete all map/hash references to a
>  sock when the sock is closed)
> 
> (If we did not have the ESTABLISHED state guarantee from tcp_close
>  then we could not ensure completion because updates could happen
>  forever and pin thread in delete loop.)
> 
> Reported-by: syzbot+0ce137753c78f7b6acc1@syzkaller.appspotmail.com
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> ---
>  0 files changed
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index f1ab52d..04764f5 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -258,16 +258,54 @@ static void bpf_tcp_release(struct sock *sk)
>  	rcu_read_unlock();
>  }
>  
> +static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
> +					 u32 hash, void *key, u32 key_size)
> +{
> +	struct htab_elem *l;
> +
> +	hlist_for_each_entry_rcu(l, head, hash_node) {
> +		if (l->hash == hash && !memcmp(&l->key, key, key_size))
> +			return l;
> +	}
> +
> +	return NULL;
> +}
> +
> +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> +	return &htab->buckets[hash & (htab->n_buckets - 1)];
> +}
> +
> +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> +	return &__select_bucket(htab, hash)->head;
> +}
> +
>  static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
>  {
>  	atomic_dec(&htab->count);
>  	kfree_rcu(l, rcu);
>  }
>  
> +struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
> +					   struct smap_psock *psock)
> +{
> +	struct smap_psock_map_entry *e;
> +
> +	write_lock_bh(&sk->sk_callback_lock);
> +	e = list_first_entry_or_null(&psock->maps,
> +				     struct smap_psock_map_entry,
> +				     list);
> +	if (e)
> +		list_del(&e->list);
> +	write_unlock_bh(&sk->sk_callback_lock);
> +	return e;
> +}
> +
>  static void bpf_tcp_close(struct sock *sk, long timeout)
>  {
>  	void (*close_fun)(struct sock *sk, long timeout);
> -	struct smap_psock_map_entry *e, *tmp;
> +	struct smap_psock_map_entry *e;
>  	struct sk_msg_buff *md, *mtmp;
>  	struct smap_psock *psock;
>  	struct sock *osk;
> @@ -286,7 +324,6 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
>  	 */
>  	close_fun = psock->save_close;
>  
> -	write_lock_bh(&sk->sk_callback_lock);
>  	if (psock->cork) {
>  		free_start_sg(psock->sock, psock->cork);
>  		kfree(psock->cork);
> @@ -299,20 +336,48 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
>  		kfree(md);
>  	}
>  
> -	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
> +	/* Sock is in TCP_CLOSE state so any concurrent adds or updates will be
> +	 * blocked by ESTABLISHED check. However, tcp_close() + delete + free
> +	 * can all run at the same time. If a tcp_close + delete happens each
> +	 * code path will remove the entry for the map/hash before deleting it.
> +	 * In the map case a xchg and then check to verify we have a sk protects
> +	 * two paths from tearing down the same object. For hash map we lock the
> +	 * bucket and remove the object from the hash map before destroying to
> +	 * ensure that only one reference exists. By pulling object off the head
> +	 * of the list with (with sk_callback_lock) if multiple deleters are
> +	 * running we avoid duplicate references.
> +	 */
> +	e = psock_map_pop(sk, psock);
> +	while (e) {
>  		if (e->entry) {
>  			osk = cmpxchg(e->entry, sk, NULL);
>  			if (osk == sk) {
> -				list_del(&e->list);
>  				smap_release_sock(psock, sk);
>  			}
>  		} else {
> -			hlist_del_rcu(&e->hash_link->hash_node);
> -			smap_release_sock(psock, e->hash_link->sk);
> -			free_htab_elem(e->htab, e->hash_link);
> +			struct htab_elem *link = e->hash_link;
> +			struct hlist_head *head;
> +			struct htab_elem *l;
> +			struct bucket *b;
> +
> +			b = __select_bucket(e->htab, link->hash);
> +			head = &b->head;
> +			raw_spin_lock_bh(&b->lock);
> +			l = lookup_elem_raw(head,
> +					    link->hash, link->key,
> +					    e->htab->elem_size);
> +			/* If another thread deleted this object skip deletion.
> +			 * The refcnt on psock may or may not be zero.
> +			 */
> +			if (l) {
> +				hlist_del_rcu(&e->hash_link->hash_node);
> +				smap_release_sock(psock, e->hash_link->sk);
> +				free_htab_elem(e->htab, e->hash_link);
> +			}
> +			raw_spin_unlock_bh(&b->lock);
>  		}
> +		e = psock_map_pop(sk, psock);
>  	}
> -	write_unlock_bh(&sk->sk_callback_lock);
>  	rcu_read_unlock();
>  	close_fun(sk, timeout);
>  }
> @@ -2088,16 +2153,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
>  	return ERR_PTR(err);
>  }
>  
> -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> -{
> -	return &htab->buckets[hash & (htab->n_buckets - 1)];
> -}
> -
> -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
> -{
> -	return &__select_bucket(htab, hash)->head;
> -}
> -
>  static void sock_hash_free(struct bpf_map *map)
>  {
>  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> @@ -2114,10 +2169,13 @@ static void sock_hash_free(struct bpf_map *map)
>  	 */
>  	rcu_read_lock();
>  	for (i = 0; i < htab->n_buckets; i++) {
> -		struct hlist_head *head = select_bucket(htab, i);
> +		struct bucket *b = __select_bucket(htab, i);
> +		struct hlist_head *head;
>  		struct hlist_node *n;
>  		struct htab_elem *l;
>  
> +		raw_spin_lock_bh(&b->lock);
There is a synchronize_rcu() at the beginning of sock_hash_free().
Taking a bucket lock of the map being freed at this point is a bit
suspicious. What may still access the map after synchronize_rcu()?


> +		head = &b->head;
>  		hlist_for_each_entry_safe(l, n, head, hash_node) {
>  			struct sock *sock = l->sk;
>  			struct smap_psock *psock;
> @@ -2137,6 +2195,7 @@ static void sock_hash_free(struct bpf_map *map)
>  			write_unlock_bh(&sock->sk_callback_lock);
>  			kfree(l);
>  		}
> +		raw_spin_unlock_bh(&b->lock);
>  	}
>  	rcu_read_unlock();
>  	bpf_map_area_free(htab->buckets);
> @@ -2167,19 +2226,6 @@ static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
>  	return l_new;
>  }
>  
> -static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
> -					 u32 hash, void *key, u32 key_size)
> -{
> -	struct htab_elem *l;
> -
> -	hlist_for_each_entry_rcu(l, head, hash_node) {
> -		if (l->hash == hash && !memcmp(&l->key, key, key_size))
> -			return l;
> -	}
> -
> -	return NULL;
> -}
> -
>  static inline u32 htab_map_hash(const void *key, u32 key_len)
>  {
>  	return jhash(key, key_len, 0);
> @@ -2307,8 +2353,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>  		psock = smap_psock_sk(l_old->sk);
>  
>  		hlist_del_rcu(&l_old->hash_node);
> +		write_lock_bh(&l_old->sk->sk_callback_lock);
>  		smap_list_remove(psock, NULL, l_old);
>  		smap_release_sock(psock, l_old->sk);
> +		write_unlock_bh(&l_old->sk->sk_callback_lock);
>  		free_htab_elem(htab, l_old);
>  	}
>  	raw_spin_unlock_bh(&b->lock);
>
John Fastabend June 15, 2018, 3:23 p.m. UTC | #2
On 06/14/2018 10:41 PM, Martin KaFai Lau wrote:
> On Thu, Jun 14, 2018 at 09:44:57AM -0700, John Fastabend wrote:
>> First in tcp_close, reduce scope of sk_callback_lock() the lock is
>> only needed for protecting smap_release_sock() the ingress and cork
>> lists are protected by sock lock. Having the lock in wider scope is
>> harmless but may confuse the reader who may infer it is in fact
>> needed.
>>
>> Next, in sock_hash_delete_elem() the pattern is as follows,
>>
>>   sock_hash_delete_elem()
>>      [...]
>>      spin_lock(bucket_lock)
>>      l = lookup_elem_raw()
>>      if (l)
>>         hlist_del_rcu()
>>         write_lock(sk_callback_lock)
>>          .... destroy psock ...
>>         write_unlock(sk_callback_lock)
>>      spin_unlock(bucket_lock)
>>
>> The ordering is necessary because we only know the {p}sock after
>> dereferencing the hash table which we can't do unless we have the
>> bucket lock held. Once we have the bucket lock and the psock element
>> it is deleted from the hashmap to ensure any other path doing a lookup
>> will fail. Finally, the refcnt is decremented and if zero the psock
>> is destroyed.
>>
>> In parallel with the above (or free'ing the map) a tcp close event
>> may trigger tcp_close(). Which at the moment omits the bucket lock
>> altogether (oops!) where the flow looks like this,
>>
>>   bpf_tcp_close()
>>      [...]
>>      write_lock(sk_callback_lock)
>>      for each psock->maps // list of maps this sock is part of
>>          hlist_del_rcu(ref_hash_node);
>>          .... destroy psock ...
>>      write_unlock(sk_callback_lock)
>>
>> Obviously, and demonstrated by syzbot, this is broken because
>> we can have multiple threads deleting entries via hlist_del_rcu().
>>
>> To fix this we might be tempted to wrap the hlist operation in a
>> bucket lock but that would create a lock inversion problem. In
>> summary to follow locking rules maps needs the sk_callback_lock but we
>> need the bucket lock to do the hlist_del_rcu. To resolve the lock
>> inversion problem note that when bpf_tcp_close is called no updates
>> can happen in parallel, due to ESTABLISH state check in update logic,
>> so pop the head of the list repeatedly and remove the reference until
>> no more are left. If a delete happens in parallel from the BPF API
>> that is OK as well because it will do a similar action, lookup the
>> sock in the map/hash, delete it from the map/hash, and dec the refcnt.
>> We check for this case before doing a destroy on the psock to ensure
>> we don't have two threads tearing down a psock. The new logic is
>> as follows,
>>
>>   bpf_tcp_close()
>>   e = psock_map_pop(psock->maps) // done with sk_callback_lock
>>   bucket_lock() // lock hash list bucket
>>   l = lookup_elem_raw(head, hash, key, key_size);
>>   if (l) {
>>      //only get here if elmnt was not already removed
>>      hlist_del_rcu()
>>      ... destroy psock...
>>   }
>>   bucket_unlock()
>>
>> And finally for all the above to work add missing sk_callback_lock
>> around smap_list_remove in sock_hash_ctx_update_elem(). Otherwise
>> delete and update may corrupt maps list.
>>
>> (As an aside the sk_callback_lock serves two purposes. The
>>  first, is to update the sock callbacks sk_data_ready, sk_write_space,
>>  etc. The second is to protect the psock 'maps' list. The 'maps' list
>>  is used to (as shown above) to delete all map/hash references to a
>>  sock when the sock is closed)
>>
>> (If we did not have the ESTABLISHED state guarantee from tcp_close
>>  then we could not ensure completion because updates could happen
>>  forever and pin thread in delete loop.)
>>
>> Reported-by: syzbot+0ce137753c78f7b6acc1@syzkaller.appspotmail.com
>> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
>> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
>> ---
>>  0 files changed
>>

^^^^ Will fix this '0 files changed' as well.

>>  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
>> @@ -2114,10 +2169,13 @@ static void sock_hash_free(struct bpf_map *map)
>>  	 */
>>  	rcu_read_lock();
>>  	for (i = 0; i < htab->n_buckets; i++) {
>> -		struct hlist_head *head = select_bucket(htab, i);
>> +		struct bucket *b = __select_bucket(htab, i);
>> +		struct hlist_head *head;
>>  		struct hlist_node *n;
>>  		struct htab_elem *l;
>>  
>> +		raw_spin_lock_bh(&b->lock);
> There is a synchronize_rcu() at the beginning of sock_hash_free().
> Taking the bucket's lock of the free-ing map at this point is a bit
> suspicious.  What may still access the map after synchronize_rcu()?
> 

tcp_close() may be called while the map is being freed. The sync_rcu
will only sync the BPF side.

.John
Martin KaFai Lau June 15, 2018, 3:45 p.m. UTC | #3
On Fri, Jun 15, 2018 at 08:23:14AM -0700, John Fastabend wrote:
> On 06/14/2018 10:41 PM, Martin KaFai Lau wrote:
> > On Thu, Jun 14, 2018 at 09:44:57AM -0700, John Fastabend wrote:
> >> First in tcp_close, reduce scope of sk_callback_lock() the lock is
> >> only needed for protecting smap_release_sock() the ingress and cork
> >> lists are protected by sock lock. Having the lock in wider scope is
> >> harmless but may confuse the reader who may infer it is in fact
> >> needed.
> >>
> >> Next, in sock_hash_delete_elem() the pattern is as follows,
> >>
> >>   sock_hash_delete_elem()
> >>      [...]
> >>      spin_lock(bucket_lock)
> >>      l = lookup_elem_raw()
> >>      if (l)
> >>         hlist_del_rcu()
> >>         write_lock(sk_callback_lock)
> >>          .... destroy psock ...
> >>         write_unlock(sk_callback_lock)
> >>      spin_unlock(bucket_lock)
> >>
> >> The ordering is necessary because we only know the {p}sock after
> >> dereferencing the hash table which we can't do unless we have the
> >> bucket lock held. Once we have the bucket lock and the psock element
> >> it is deleted from the hashmap to ensure any other path doing a lookup
> >> will fail. Finally, the refcnt is decremented and if zero the psock
> >> is destroyed.
> >>
> >> In parallel with the above (or free'ing the map) a tcp close event
> >> may trigger tcp_close(). Which at the moment omits the bucket lock
> >> altogether (oops!) where the flow looks like this,
> >>
> >>   bpf_tcp_close()
> >>      [...]
> >>      write_lock(sk_callback_lock)
> >>      for each psock->maps // list of maps this sock is part of
> >>          hlist_del_rcu(ref_hash_node);
> >>          .... destroy psock ...
> >>      write_unlock(sk_callback_lock)
> >>
> >> Obviously, and demonstrated by syzbot, this is broken because
> >> we can have multiple threads deleting entries via hlist_del_rcu().
> >>
> >> To fix this we might be tempted to wrap the hlist operation in a
> >> bucket lock but that would create a lock inversion problem. In
> >> summary to follow locking rules maps needs the sk_callback_lock but we
> >> need the bucket lock to do the hlist_del_rcu. To resolve the lock
> >> inversion problem note that when bpf_tcp_close is called no updates
> >> can happen in parallel, due to ESTABLISH state check in update logic,
> >> so pop the head of the list repeatedly and remove the reference until
> >> no more are left. If a delete happens in parallel from the BPF API
> >> that is OK as well because it will do a similar action, lookup the
> >> sock in the map/hash, delete it from the map/hash, and dec the refcnt.
> >> We check for this case before doing a destroy on the psock to ensure
> >> we don't have two threads tearing down a psock. The new logic is
> >> as follows,
> >>
> >>   bpf_tcp_close()
> >>   e = psock_map_pop(psock->maps) // done with sk_callback_lock
> >>   bucket_lock() // lock hash list bucket
> >>   l = lookup_elem_raw(head, hash, key, key_size);
> >>   if (l) {
> >>      //only get here if elmnt was not already removed
> >>      hlist_del_rcu()
> >>      ... destroy psock...
> >>   }
> >>   bucket_unlock()
> >>
> >> And finally for all the above to work add missing sk_callback_lock
> >> around smap_list_remove in sock_hash_ctx_update_elem(). Otherwise
> >> delete and update may corrupt maps list.
> >>
> >> (As an aside the sk_callback_lock serves two purposes. The
> >>  first, is to update the sock callbacks sk_data_ready, sk_write_space,
> >>  etc. The second is to protect the psock 'maps' list. The 'maps' list
> >>  is used to (as shown above) to delete all map/hash references to a
> >>  sock when the sock is closed)
> >>
> >> (If we did not have the ESTABLISHED state guarantee from tcp_close
> >>  then we could not ensure completion because updates could happen
> >>  forever and pin thread in delete loop.)
> >>
> >> Reported-by: syzbot+0ce137753c78f7b6acc1@syzkaller.appspotmail.com
> >> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> >> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> >> ---
> >>  0 files changed
> >>
> 
> ^^^^ Will fix this 0 files changes as well.
> 
> >>  	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> >> @@ -2114,10 +2169,13 @@ static void sock_hash_free(struct bpf_map *map)
> >>  	 */
> >>  	rcu_read_lock();
> >>  	for (i = 0; i < htab->n_buckets; i++) {
> >> -		struct hlist_head *head = select_bucket(htab, i);
> >> +		struct bucket *b = __select_bucket(htab, i);
> >> +		struct hlist_head *head;
> >>  		struct hlist_node *n;
> >>  		struct htab_elem *l;
> >>  
> >> +		raw_spin_lock_bh(&b->lock);
> > There is a synchronize_rcu() at the beginning of sock_hash_free().
> > Taking the bucket's lock of the free-ing map at this point is a bit
> > suspicious.  What may still access the map after synchronize_rcu()?
> > 
> 
> tcp_close() may be called while the map is being free. The sync_rcu will
> only sync the BPF side.
Thanks for the explanation.

hmm... so bpf_tcp_close(), which runs under rcu_read_lock(), may still
have a reference to this hash. Should it wait for another RCU grace
period before freeing htab->buckets and htab?
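
For illustration, the kind of change being suggested here would amount
to waiting for a second grace period after the bucket walk, so that any
bpf_tcp_close() still inside its rcu_read_lock() section (holding
e->htab) has finished before the bucket array goes away. A sketch only,
built from the helpers in the diff below; it is not something this
series does:

  	/* ...after the locked bucket walk in sock_hash_free()... */

  	/* A bpf_tcp_close() that raced with the walk above may still be
  	 * inside its rcu_read_lock() section with a pointer to this
  	 * htab, so wait one more grace period before the backing memory
  	 * is freed.
  	 */
  	synchronize_rcu();
  	bpf_map_area_free(htab->buckets);
  	kfree(htab);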

Patch

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index f1ab52d..04764f5 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -258,16 +258,54 @@  static void bpf_tcp_release(struct sock *sk)
 	rcu_read_unlock();
 }
 
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
+					 u32 hash, void *key, u32 key_size)
+{
+	struct htab_elem *l;
+
+	hlist_for_each_entry_rcu(l, head, hash_node) {
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+	}
+
+	return NULL;
+}
+
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &__select_bucket(htab, hash)->head;
+}
+
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
 	atomic_dec(&htab->count);
 	kfree_rcu(l, rcu);
 }
 
+struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
+					   struct smap_psock *psock)
+{
+	struct smap_psock_map_entry *e;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	e = list_first_entry_or_null(&psock->maps,
+				     struct smap_psock_map_entry,
+				     list);
+	if (e)
+		list_del(&e->list);
+	write_unlock_bh(&sk->sk_callback_lock);
+	return e;
+}
+
 static void bpf_tcp_close(struct sock *sk, long timeout)
 {
 	void (*close_fun)(struct sock *sk, long timeout);
-	struct smap_psock_map_entry *e, *tmp;
+	struct smap_psock_map_entry *e;
 	struct sk_msg_buff *md, *mtmp;
 	struct smap_psock *psock;
 	struct sock *osk;
@@ -286,7 +324,6 @@  static void bpf_tcp_close(struct sock *sk, long timeout)
 	 */
 	close_fun = psock->save_close;
 
-	write_lock_bh(&sk->sk_callback_lock);
 	if (psock->cork) {
 		free_start_sg(psock->sock, psock->cork);
 		kfree(psock->cork);
@@ -299,20 +336,48 @@  static void bpf_tcp_close(struct sock *sk, long timeout)
 		kfree(md);
 	}
 
-	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+	/* Sock is in TCP_CLOSE state so any concurrent adds or updates will be
+	 * blocked by ESTABLISHED check. However, tcp_close() + delete + free
+	 * can all run at the same time. If a tcp_close + delete happens each
+	 * code path will remove the entry for the map/hash before deleting it.
+	 * In the map case a xchg and then check to verify we have a sk protects
+	 * two paths from tearing down the same object. For hash map we lock the
+	 * bucket and remove the object from the hash map before destroying to
+	 * ensure that only one reference exists. By pulling object off the head
+	 * of the list with (with sk_callback_lock) if multiple deleters are
+	 * running we avoid duplicate references.
+	 */
+	e = psock_map_pop(sk, psock);
+	while (e) {
 		if (e->entry) {
 			osk = cmpxchg(e->entry, sk, NULL);
 			if (osk == sk) {
-				list_del(&e->list);
 				smap_release_sock(psock, sk);
 			}
 		} else {
-			hlist_del_rcu(&e->hash_link->hash_node);
-			smap_release_sock(psock, e->hash_link->sk);
-			free_htab_elem(e->htab, e->hash_link);
+			struct htab_elem *link = e->hash_link;
+			struct hlist_head *head;
+			struct htab_elem *l;
+			struct bucket *b;
+
+			b = __select_bucket(e->htab, link->hash);
+			head = &b->head;
+			raw_spin_lock_bh(&b->lock);
+			l = lookup_elem_raw(head,
+					    link->hash, link->key,
+					    e->htab->elem_size);
+			/* If another thread deleted this object skip deletion.
+			 * The refcnt on psock may or may not be zero.
+			 */
+			if (l) {
+				hlist_del_rcu(&e->hash_link->hash_node);
+				smap_release_sock(psock, e->hash_link->sk);
+				free_htab_elem(e->htab, e->hash_link);
+			}
+			raw_spin_unlock_bh(&b->lock);
 		}
+		e = psock_map_pop(sk, psock);
 	}
-	write_unlock_bh(&sk->sk_callback_lock);
 	rcu_read_unlock();
 	close_fun(sk, timeout);
 }
@@ -2088,16 +2153,6 @@  static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
 	return ERR_PTR(err);
 }
 
-static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
-{
-	return &htab->buckets[hash & (htab->n_buckets - 1)];
-}
-
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
-{
-	return &__select_bucket(htab, hash)->head;
-}
-
 static void sock_hash_free(struct bpf_map *map)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
@@ -2114,10 +2169,13 @@  static void sock_hash_free(struct bpf_map *map)
 	 */
 	rcu_read_lock();
 	for (i = 0; i < htab->n_buckets; i++) {
-		struct hlist_head *head = select_bucket(htab, i);
+		struct bucket *b = __select_bucket(htab, i);
+		struct hlist_head *head;
 		struct hlist_node *n;
 		struct htab_elem *l;
 
+		raw_spin_lock_bh(&b->lock);
+		head = &b->head;
 		hlist_for_each_entry_safe(l, n, head, hash_node) {
 			struct sock *sock = l->sk;
 			struct smap_psock *psock;
@@ -2137,6 +2195,7 @@  static void sock_hash_free(struct bpf_map *map)
 			write_unlock_bh(&sock->sk_callback_lock);
 			kfree(l);
 		}
+		raw_spin_unlock_bh(&b->lock);
 	}
 	rcu_read_unlock();
 	bpf_map_area_free(htab->buckets);
@@ -2167,19 +2226,6 @@  static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
 	return l_new;
 }
 
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
-					 u32 hash, void *key, u32 key_size)
-{
-	struct htab_elem *l;
-
-	hlist_for_each_entry_rcu(l, head, hash_node) {
-		if (l->hash == hash && !memcmp(&l->key, key, key_size))
-			return l;
-	}
-
-	return NULL;
-}
-
 static inline u32 htab_map_hash(const void *key, u32 key_len)
 {
 	return jhash(key, key_len, 0);
@@ -2307,8 +2353,10 @@  static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 		psock = smap_psock_sk(l_old->sk);
 
 		hlist_del_rcu(&l_old->hash_node);
+		write_lock_bh(&l_old->sk->sk_callback_lock);
 		smap_list_remove(psock, NULL, l_old);
 		smap_release_sock(psock, l_old->sk);
+		write_unlock_bh(&l_old->sk->sk_callback_lock);
 		free_htab_elem(htab, l_old);
 	}
 	raw_spin_unlock_bh(&b->lock);