[bpf,4/6] bpf: sockmap, tcp_disconnect to listen transition

Message ID 20180613175014.4264.84395.stgit@john-Precision-Tower-5810
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series
  • BPF fixes for sockhash
Related show

Commit Message

John Fastabend June 13, 2018, 5:50 p.m.
After adding checks to ensure TCP is in ESTABLISHED state when a
sock is added we need to also ensure that user does not transition
through tcp_disconnect() and back into ESTABLISHED state without
sockmap removing the sock.

To do this add unhash hook and remove sock from map there.

Reported-by: Eric Dumazet <edumazet@google.com>
Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 kernel/bpf/sockmap.c |   67 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 50 insertions(+), 17 deletions(-)

Comments

Martin KaFai Lau June 14, 2018, 12:56 a.m. | #1
On Wed, Jun 13, 2018 at 10:50:14AM -0700, John Fastabend wrote:
> After adding checks to ensure TCP is in ESTABLISHED state when a
> sock is added we need to also ensure that user does not transition
> through tcp_disconnect() and back into ESTABLISHED state without
> sockmap removing the sock.
> 
> To do this add unhash hook and remove sock from map there.
In bpf_tcp_init():
        sk->sk_prot = &tcp_bpf_proto;

I may have missed a lock while reading sockmap.c.
Is it possible that tcp_disconnect() is being called while
the above assignment is also being done (e.g. through BPF_MAP_UPDATE_ELEM)?
The same situation go for the ESTABLISHED check.

> 
> Reported-by: Eric Dumazet <edumazet@google.com>
> Fixes: 81110384441a ("bpf: sockmap, add hash map support")
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> ---
>  kernel/bpf/sockmap.c |   67 +++++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 50 insertions(+), 17 deletions(-)
> 
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 2e848cd..91c7b47 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -130,6 +130,7 @@ struct smap_psock {
>  
>  	struct proto *sk_proto;
>  	void (*save_close)(struct sock *sk, long timeout);
> +	void (*save_unhash)(struct sock *sk);
>  	void (*save_data_ready)(struct sock *sk);
>  	void (*save_write_space)(struct sock *sk);
>  };
> @@ -141,6 +142,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
>  			    int offset, size_t size, int flags);
>  static void bpf_tcp_close(struct sock *sk, long timeout);
> +static void bpf_tcp_unhash(struct sock *sk);
>  
>  static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
>  {
> @@ -182,6 +184,7 @@ static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
>  {
>  	prot[SOCKMAP_BASE]			= *base;
>  	prot[SOCKMAP_BASE].close		= bpf_tcp_close;
> +	prot[SOCKMAP_BASE].unhash		= bpf_tcp_unhash;
>  	prot[SOCKMAP_BASE].recvmsg		= bpf_tcp_recvmsg;
>  	prot[SOCKMAP_BASE].stream_memory_read	= bpf_tcp_stream_read;
>  
> @@ -215,6 +218,7 @@ static int bpf_tcp_init(struct sock *sk)
>  	}
>  
>  	psock->save_close = sk->sk_prot->close;
> +	psock->save_unhash = sk->sk_prot->unhash;
>  	psock->sk_proto = sk->sk_prot;
>  
>  	/* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
> @@ -302,28 +306,12 @@ struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
>  	return e;
>  }
>  
> -static void bpf_tcp_close(struct sock *sk, long timeout)
> +static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
>  {
> -	void (*close_fun)(struct sock *sk, long timeout);
>  	struct smap_psock_map_entry *e;
>  	struct sk_msg_buff *md, *mtmp;
> -	struct smap_psock *psock;
>  	struct sock *osk;
>  
> -	rcu_read_lock();
> -	psock = smap_psock_sk(sk);
> -	if (unlikely(!psock)) {
> -		rcu_read_unlock();
> -		return sk->sk_prot->close(sk, timeout);
> -	}
> -
> -	/* The psock may be destroyed anytime after exiting the RCU critial
> -	 * section so by the time we use close_fun the psock may no longer
> -	 * be valid. However, bpf_tcp_close is called with the sock lock
> -	 * held so the close hook and sk are still valid.
> -	 */
> -	close_fun = psock->save_close;
> -
>  	if (psock->cork) {
>  		free_start_sg(psock->sock, psock->cork);
>  		kfree(psock->cork);
> @@ -378,6 +366,51 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
>  		}
>  		e = psock_map_pop(sk, psock);
>  	}
> +}
> +
> +static void bpf_tcp_unhash(struct sock *sk)
> +{
> +	void (*unhash_fun)(struct sock *sk);
> +	struct smap_psock *psock;
> +
> +	rcu_read_lock();
> +	psock = smap_psock_sk(sk);
> +	if (unlikely(!psock)) {
> +		rcu_read_unlock();
> +		return sk->sk_prot->unhash(sk);
> +	}
> +
> +	/* The psock may be destroyed anytime after exiting the RCU critial
> +	 * section so by the time we use close_fun the psock may no longer
> +	 * be valid. However, bpf_tcp_close is called with the sock lock
> +	 * held so the close hook and sk are still valid.
> +	 */
> +	unhash_fun = psock->save_unhash;
> +	bpf_tcp_remove(sk, psock);
> +	rcu_read_unlock();
> +	unhash_fun(sk);
> +
> +}
> +
> +static void bpf_tcp_close(struct sock *sk, long timeout)
> +{
> +	void (*close_fun)(struct sock *sk, long timeout);
> +	struct smap_psock *psock;
> +
> +	rcu_read_lock();
> +	psock = smap_psock_sk(sk);
> +	if (unlikely(!psock)) {
> +		rcu_read_unlock();
> +		return sk->sk_prot->close(sk, timeout);
> +	}
> +
> +	/* The psock may be destroyed anytime after exiting the RCU critial
> +	 * section so by the time we use close_fun the psock may no longer
> +	 * be valid. However, bpf_tcp_close is called with the sock lock
> +	 * held so the close hook and sk are still valid.
> +	 */
> +	close_fun = psock->save_close;
> +	bpf_tcp_remove(sk, psock);
>  	rcu_read_unlock();
>  	close_fun(sk, timeout);
>  }
>
John Fastabend June 14, 2018, 5:48 a.m. | #2
On 06/13/2018 05:56 PM, Martin KaFai Lau wrote:
> On Wed, Jun 13, 2018 at 10:50:14AM -0700, John Fastabend wrote:
>> After adding checks to ensure TCP is in ESTABLISHED state when a
>> sock is added we need to also ensure that user does not transition
>> through tcp_disconnect() and back into ESTABLISHED state without
>> sockmap removing the sock.
>>
>> To do this add unhash hook and remove sock from map there.
> In bpf_tcp_init():
>         sk->sk_prot = &tcp_bpf_proto;
> 
> I may have missed a lock while reading sockmap.c.
> Is it possible that tcp_disconnect() is being called while
> the above assignment is also being done (e.g. through BPF_MAP_UPDATE_ELEM)?
> The same situation go for the ESTABLISHED check.
> 

Right because ESTABLISHED is checked without any locking its
possible that the state changes during the update (from userspce
BPF_MAP_UPDATE, from sock_ops program it is locked). I have
the below patch on my tree now, I was thinking to send it as
a follow on but on second thought it likely makes more sense
as part of the patch that adds the ESTABLISHED check.

Also after the below the sk_callback lock used to protect
psock->maps is becoming increasingly pointless it allows the
delete and map free ops to be called without taking the full
sock lock. It might be time to just drop it in bpf-next and
use the sock lock in the delete cases. The more annoying part
will be the delete will have to have different userspace and
bpf program helpers so we know when we need the lock.

--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -2074,17 +2074,20 @@ static int sock_map_update_elem(struct bpf_map *map,
                return -EINVAL;
        }

+       lock_sock(skops.sk);
        /* ULPs are currently supported only for TCP sockets in ESTABLISHED
         * state.
         */
        if (skops.sk->sk_type != SOCK_STREAM ||
            skops.sk->sk_protocol != IPPROTO_TCP ||
            skops.sk->sk_state != TCP_ESTABLISHED) {
-               fput(socket->file);
-               return -EOPNOTSUPP;
+               err = -EOPNOTSUPP;
+               goto out;
        }

        err = sock_map_ctx_update_elem(&skops, map, key, flags);
+out:
+       release_sock(skops.sk);
        fput(socket->file);
        return err;
 }
@@ -2423,17 +2426,20 @@ static int sock_hash_update_elem(struct bpf_map
*map,
                return -EINVAL;
        }

+       lock_sock(skops.sk);
        /* ULPs are currently supported only for TCP sockets in ESTABLISHED
         * state.
         */
        if (skops.sk->sk_type != SOCK_STREAM ||
            skops.sk->sk_protocol != IPPROTO_TCP ||
            skops.sk->sk_state != TCP_ESTABLISHED) {
-               fput(socket->file);
-               return -EOPNOTSUPP;
+               err = -EOPNOTSUPP;
+               goto out;
        }

        err = sock_hash_ctx_update_elem(&skops, map, key, flags);
+out:
+       release_sock(skops.sk);
        fput(socket->file);
        return err;
John Fastabend June 14, 2018, 4:47 p.m. | #3
On 06/13/2018 10:48 PM, John Fastabend wrote:
> On 06/13/2018 05:56 PM, Martin KaFai Lau wrote:
>> On Wed, Jun 13, 2018 at 10:50:14AM -0700, John Fastabend wrote:
>>> After adding checks to ensure TCP is in ESTABLISHED state when a
>>> sock is added we need to also ensure that user does not transition
>>> through tcp_disconnect() and back into ESTABLISHED state without
>>> sockmap removing the sock.
>>>
>>> To do this add unhash hook and remove sock from map there.
>> In bpf_tcp_init():
>>         sk->sk_prot = &tcp_bpf_proto;
>>
>> I may have missed a lock while reading sockmap.c.
>> Is it possible that tcp_disconnect() is being called while
>> the above assignment is also being done (e.g. through BPF_MAP_UPDATE_ELEM)?
>> The same situation go for the ESTABLISHED check.
>>
> 
> Right because ESTABLISHED is checked without any locking its
> possible that the state changes during the update (from userspce
> BPF_MAP_UPDATE, from sock_ops program it is locked). I have
> the below patch on my tree now, I was thinking to send it as
> a follow on but on second thought it likely makes more sense
> as part of the patch that adds the ESTABLISHED check.
> 
> Also after the below the sk_callback lock used to protect
> psock->maps is becoming increasingly pointless it allows the
> delete and map free ops to be called without taking the full
> sock lock. It might be time to just drop it in bpf-next and
> use the sock lock in the delete cases. The more annoying part
> will be the delete will have to have different userspace and
> bpf program helpers so we know when we need the lock.
> 
> --- a/kernel/bpf/sockmap.c

Hi Martin,

I went ahead and sent a v2 with the sock lock addition included.

Thanks,
John

Patch

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 2e848cd..91c7b47 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -130,6 +130,7 @@  struct smap_psock {
 
 	struct proto *sk_proto;
 	void (*save_close)(struct sock *sk, long timeout);
+	void (*save_unhash)(struct sock *sk);
 	void (*save_data_ready)(struct sock *sk);
 	void (*save_write_space)(struct sock *sk);
 };
@@ -141,6 +142,7 @@  static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
 			    int offset, size_t size, int flags);
 static void bpf_tcp_close(struct sock *sk, long timeout);
+static void bpf_tcp_unhash(struct sock *sk);
 
 static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 {
@@ -182,6 +184,7 @@  static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
 {
 	prot[SOCKMAP_BASE]			= *base;
 	prot[SOCKMAP_BASE].close		= bpf_tcp_close;
+	prot[SOCKMAP_BASE].unhash		= bpf_tcp_unhash;
 	prot[SOCKMAP_BASE].recvmsg		= bpf_tcp_recvmsg;
 	prot[SOCKMAP_BASE].stream_memory_read	= bpf_tcp_stream_read;
 
@@ -215,6 +218,7 @@  static int bpf_tcp_init(struct sock *sk)
 	}
 
 	psock->save_close = sk->sk_prot->close;
+	psock->save_unhash = sk->sk_prot->unhash;
 	psock->sk_proto = sk->sk_prot;
 
 	/* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
@@ -302,28 +306,12 @@  struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
 	return e;
 }
 
-static void bpf_tcp_close(struct sock *sk, long timeout)
+static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
 {
-	void (*close_fun)(struct sock *sk, long timeout);
 	struct smap_psock_map_entry *e;
 	struct sk_msg_buff *md, *mtmp;
-	struct smap_psock *psock;
 	struct sock *osk;
 
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock)) {
-		rcu_read_unlock();
-		return sk->sk_prot->close(sk, timeout);
-	}
-
-	/* The psock may be destroyed anytime after exiting the RCU critial
-	 * section so by the time we use close_fun the psock may no longer
-	 * be valid. However, bpf_tcp_close is called with the sock lock
-	 * held so the close hook and sk are still valid.
-	 */
-	close_fun = psock->save_close;
-
 	if (psock->cork) {
 		free_start_sg(psock->sock, psock->cork);
 		kfree(psock->cork);
@@ -378,6 +366,51 @@  static void bpf_tcp_close(struct sock *sk, long timeout)
 		}
 		e = psock_map_pop(sk, psock);
 	}
+}
+
+static void bpf_tcp_unhash(struct sock *sk)
+{
+	void (*unhash_fun)(struct sock *sk);
+	struct smap_psock *psock;
+
+	rcu_read_lock();
+	psock = smap_psock_sk(sk);
+	if (unlikely(!psock)) {
+		rcu_read_unlock();
+		return sk->sk_prot->unhash(sk);
+	}
+
+	/* The psock may be destroyed anytime after exiting the RCU critial
+	 * section so by the time we use close_fun the psock may no longer
+	 * be valid. However, bpf_tcp_close is called with the sock lock
+	 * held so the close hook and sk are still valid.
+	 */
+	unhash_fun = psock->save_unhash;
+	bpf_tcp_remove(sk, psock);
+	rcu_read_unlock();
+	unhash_fun(sk);
+
+}
+
+static void bpf_tcp_close(struct sock *sk, long timeout)
+{
+	void (*close_fun)(struct sock *sk, long timeout);
+	struct smap_psock *psock;
+
+	rcu_read_lock();
+	psock = smap_psock_sk(sk);
+	if (unlikely(!psock)) {
+		rcu_read_unlock();
+		return sk->sk_prot->close(sk, timeout);
+	}
+
+	/* The psock may be destroyed anytime after exiting the RCU critial
+	 * section so by the time we use close_fun the psock may no longer
+	 * be valid. However, bpf_tcp_close is called with the sock lock
+	 * held so the close hook and sk are still valid.
+	 */
+	close_fun = psock->save_close;
+	bpf_tcp_remove(sk, psock);
 	rcu_read_unlock();
 	close_fun(sk, timeout);
 }