diff mbox series

[bpf-next,v4,02/12] net, sk_msg: Annotate lockless access to sk_prot on clone

Message ID 20200123155534.114313-3-jakub@cloudflare.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series Extend SOCKMAP to store listening sockets | expand

Commit Message

Jakub Sitnicki Jan. 23, 2020, 3:55 p.m. UTC
sk_msg and ULP frameworks override protocol callbacks pointer in
sk->sk_prot, while tcp accesses it locklessly when cloning the listening
socket, that is with neither sk_lock nor sk_callback_lock held.

Once we enable use of listening sockets with sockmap (and hence sk_msg),
there will be shared access to sk->sk_prot if socket is getting cloned
while being inserted/deleted to/from the sockmap from another CPU:

Read side:

tcp_v4_rcv
  sk = __inet_lookup_skb(...)
  tcp_check_req(sk)
    inet_csk(sk)->icsk_af_ops->syn_recv_sock
      tcp_v4_syn_recv_sock
        tcp_create_openreq_child
          inet_csk_clone_lock
            sk_clone_lock
              READ_ONCE(sk->sk_prot)

Write side:

sock_map_ops->map_update_elem
  sock_map_update_elem
    sock_map_update_common
      sock_map_link_no_progs
        tcp_bpf_init
          tcp_bpf_update_sk_prot
            sk_psock_update_proto
              WRITE_ONCE(sk->sk_prot, ops)

sock_map_ops->map_delete_elem
  sock_map_delete_elem
    __sock_map_delete
     sock_map_unref
       sk_psock_put
         sk_psock_drop
           sk_psock_restore_proto
             tcp_update_ulp
               WRITE_ONCE(sk->sk_prot, proto)

Mark the shared access with READ_ONCE/WRITE_ONCE annotations.

Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
---
 include/linux/skmsg.h | 3 ++-
 net/core/sock.c       | 5 +++--
 net/ipv4/tcp_bpf.c    | 4 +++-
 net/ipv4/tcp_ulp.c    | 3 ++-
 net/tls/tls_main.c    | 3 ++-
 5 files changed, 12 insertions(+), 6 deletions(-)

Comments

Eric Dumazet Jan. 23, 2020, 5:18 p.m. UTC | #1
On 1/23/20 7:55 AM, Jakub Sitnicki wrote:
> sk_msg and ULP frameworks override protocol callbacks pointer in
> sk->sk_prot, while tcp accesses it locklessly when cloning the listening
> socket, that is with neither sk_lock nor sk_callback_lock held.
> 
> Once we enable use of listening sockets with sockmap (and hence sk_msg),
> there will be shared access to sk->sk_prot if socket is getting cloned
> while being inserted/deleted to/from the sockmap from another CPU:
> 
> Read side:
> 
> tcp_v4_rcv
>   sk = __inet_lookup_skb(...)
>   tcp_check_req(sk)
>     inet_csk(sk)->icsk_af_ops->syn_recv_sock
>       tcp_v4_syn_recv_sock
>         tcp_create_openreq_child
>           inet_csk_clone_lock
>             sk_clone_lock
>               READ_ONCE(sk->sk_prot)
> 
> Write side:
> 
> sock_map_ops->map_update_elem
>   sock_map_update_elem
>     sock_map_update_common
>       sock_map_link_no_progs
>         tcp_bpf_init
>           tcp_bpf_update_sk_prot
>             sk_psock_update_proto
>               WRITE_ONCE(sk->sk_prot, ops)
> 
> sock_map_ops->map_delete_elem
>   sock_map_delete_elem
>     __sock_map_delete
>      sock_map_unref
>        sk_psock_put
>          sk_psock_drop
>            sk_psock_restore_proto
>              tcp_update_ulp
>                WRITE_ONCE(sk->sk_prot, proto)
> 
> Mark the shared access with READ_ONCE/WRITE_ONCE annotations.
> 
> Acked-by: Martin KaFai Lau <kafai@fb.com>
> Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
> ---
>  include/linux/skmsg.h | 3 ++-
>  net/core/sock.c       | 5 +++--
>  net/ipv4/tcp_bpf.c    | 4 +++-
>  net/ipv4/tcp_ulp.c    | 3 ++-
>  net/tls/tls_main.c    | 3 ++-
>  5 files changed, 12 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index 41ea1258d15e..55c834a5c25e 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -352,7 +352,8 @@ static inline void sk_psock_update_proto(struct sock *sk,
>  	psock->saved_write_space = sk->sk_write_space;
>  
>  	psock->sk_proto = sk->sk_prot;
> -	sk->sk_prot = ops;
> +	/* Pairs with lockless read in sk_clone_lock() */
> +	WRITE_ONCE(sk->sk_prot, ops);


Note there are dozens of calls like

if (sk->sk_prot->handler)
    sk->sk_prot->handler(...);

Some of them being done lockless.

I know it is painful, but presumably we need

const struct proto *ops = READ_ONCE(sk->sk_prot);

if (ops->handler)
    ops->handler(....);
Jakub Sitnicki Jan. 23, 2020, 6:56 p.m. UTC | #2
On Thu, Jan 23, 2020 at 06:18 PM CET, Eric Dumazet wrote:
> On 1/23/20 7:55 AM, Jakub Sitnicki wrote:
>> sk_msg and ULP frameworks override protocol callbacks pointer in
>> sk->sk_prot, while tcp accesses it locklessly when cloning the listening
>> socket, that is with neither sk_lock nor sk_callback_lock held.
>>
>> Once we enable use of listening sockets with sockmap (and hence sk_msg),
>> there will be shared access to sk->sk_prot if socket is getting cloned
>> while being inserted/deleted to/from the sockmap from another CPU:
>>
>> Read side:
>>
>> tcp_v4_rcv
>>   sk = __inet_lookup_skb(...)
>>   tcp_check_req(sk)
>>     inet_csk(sk)->icsk_af_ops->syn_recv_sock
>>       tcp_v4_syn_recv_sock
>>         tcp_create_openreq_child
>>           inet_csk_clone_lock
>>             sk_clone_lock
>>               READ_ONCE(sk->sk_prot)
>>
>> Write side:
>>
>> sock_map_ops->map_update_elem
>>   sock_map_update_elem
>>     sock_map_update_common
>>       sock_map_link_no_progs
>>         tcp_bpf_init
>>           tcp_bpf_update_sk_prot
>>             sk_psock_update_proto
>>               WRITE_ONCE(sk->sk_prot, ops)
>>
>> sock_map_ops->map_delete_elem
>>   sock_map_delete_elem
>>     __sock_map_delete
>>      sock_map_unref
>>        sk_psock_put
>>          sk_psock_drop
>>            sk_psock_restore_proto
>>              tcp_update_ulp
>>                WRITE_ONCE(sk->sk_prot, proto)
>>
>> Mark the shared access with READ_ONCE/WRITE_ONCE annotations.
>>
>> Acked-by: Martin KaFai Lau <kafai@fb.com>
>> Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
>> ---
>>  include/linux/skmsg.h | 3 ++-
>>  net/core/sock.c       | 5 +++--
>>  net/ipv4/tcp_bpf.c    | 4 +++-
>>  net/ipv4/tcp_ulp.c    | 3 ++-
>>  net/tls/tls_main.c    | 3 ++-
>>  5 files changed, 12 insertions(+), 6 deletions(-)
>>
>> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
>> index 41ea1258d15e..55c834a5c25e 100644
>> --- a/include/linux/skmsg.h
>> +++ b/include/linux/skmsg.h
>> @@ -352,7 +352,8 @@ static inline void sk_psock_update_proto(struct sock *sk,
>>  	psock->saved_write_space = sk->sk_write_space;
>>
>>  	psock->sk_proto = sk->sk_prot;
>> -	sk->sk_prot = ops;
>> +	/* Pairs with lockless read in sk_clone_lock() */
>> +	WRITE_ONCE(sk->sk_prot, ops);
>
>
> Note there are dozens of calls like
>
> if (sk->sk_prot->handler)
>     sk->sk_prot->handler(...);
>
> Some of them being done lockless.
>
> I know it is painful, but presumably we need
>
> const struct proto *ops = READ_ONCE(sk->sk_prot);
>
> if (ops->handler)
>     ops->handler(....);

Yikes! That will be quite an audit. Thank you for taking a look.

Now I think I understand what John had in mind when asking for pushing
these annotations to the bpf tree as well [0].

Considering these are lacking today, can I do it as a follow up?

[0] https://lore.kernel.org/bpf/20200110105027.257877-1-jakub@cloudflare.com/T/#m6a4f84a922a393719a7ea7b33dafdb6c66b72827
John Fastabend Jan. 23, 2020, 7:15 p.m. UTC | #3
Jakub Sitnicki wrote:
> On Thu, Jan 23, 2020 at 06:18 PM CET, Eric Dumazet wrote:
> > On 1/23/20 7:55 AM, Jakub Sitnicki wrote:
> >> sk_msg and ULP frameworks override protocol callbacks pointer in
> >> sk->sk_prot, while tcp accesses it locklessly when cloning the listening
> >> socket, that is with neither sk_lock nor sk_callback_lock held.
> >>
> >> Once we enable use of listening sockets with sockmap (and hence sk_msg),
> >> there will be shared access to sk->sk_prot if socket is getting cloned
> >> while being inserted/deleted to/from the sockmap from another CPU:

[...]

> >>  include/linux/skmsg.h | 3 ++-
> >>  net/core/sock.c       | 5 +++--
> >>  net/ipv4/tcp_bpf.c    | 4 +++-
> >>  net/ipv4/tcp_ulp.c    | 3 ++-
> >>  net/tls/tls_main.c    | 3 ++-
> >>  5 files changed, 12 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> >> index 41ea1258d15e..55c834a5c25e 100644
> >> --- a/include/linux/skmsg.h
> >> +++ b/include/linux/skmsg.h
> >> @@ -352,7 +352,8 @@ static inline void sk_psock_update_proto(struct sock *sk,
> >>  	psock->saved_write_space = sk->sk_write_space;
> >>
> >>  	psock->sk_proto = sk->sk_prot;
> >> -	sk->sk_prot = ops;
> >> +	/* Pairs with lockless read in sk_clone_lock() */
> >> +	WRITE_ONCE(sk->sk_prot, ops);
> >
> >
> > Note there are dozens of calls like
> >
> > if (sk->sk_prot->handler)
> >     sk->sk_prot->handler(...);
> >
> > Some of them being done lockless.
> >
> > I know it is painful, but presumably we need

Correct.

> >
> > const struct proto *ops = READ_ONCE(sk->sk_prot);
> >
> > if (ops->handler)
> >     ops->handler(....);
> 
> Yikes! That will be quite an audit. Thank you for taking a look.
> 
> Now I think I understand what John had in mind when asking for pushing
> these annotations to the bpf tree as well [0].

Yep, this is what I meant. But your patches don't make the situation
any worse; it's already there.

> 
> Considering these are lacking today, can I do it as a follow up?

In my opinion yes. I think pulling your patches in is OK and I started
doing this conversion already so we can have a fix shortly. I didn't
want to push it into rc7 though so I'll push it next week or into
net-next tree.

.John
Jakub Sitnicki Jan. 27, 2020, 9:36 a.m. UTC | #4
On Thu, Jan 23, 2020 at 08:15 PM CET, John Fastabend wrote:
> Jakub Sitnicki wrote:
>> On Thu, Jan 23, 2020 at 06:18 PM CET, Eric Dumazet wrote:
>> > On 1/23/20 7:55 AM, Jakub Sitnicki wrote:
>> >> sk_msg and ULP frameworks override protocol callbacks pointer in
>> >> sk->sk_prot, while tcp accesses it locklessly when cloning the listening
>> >> socket, that is with neither sk_lock nor sk_callback_lock held.
>> >>
>> >> Once we enable use of listening sockets with sockmap (and hence sk_msg),
>> >> there will be shared access to sk->sk_prot if socket is getting cloned
>> >> while being inserted/deleted to/from the sockmap from another CPU:
>
> [...]
>
>> >>  include/linux/skmsg.h | 3 ++-
>> >>  net/core/sock.c       | 5 +++--
>> >>  net/ipv4/tcp_bpf.c    | 4 +++-
>> >>  net/ipv4/tcp_ulp.c    | 3 ++-
>> >>  net/tls/tls_main.c    | 3 ++-
>> >>  5 files changed, 12 insertions(+), 6 deletions(-)
>> >>
>> >> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
>> >> index 41ea1258d15e..55c834a5c25e 100644
>> >> --- a/include/linux/skmsg.h
>> >> +++ b/include/linux/skmsg.h
>> >> @@ -352,7 +352,8 @@ static inline void sk_psock_update_proto(struct sock *sk,
>> >>  	psock->saved_write_space = sk->sk_write_space;
>> >>
>> >>  	psock->sk_proto = sk->sk_prot;
>> >> -	sk->sk_prot = ops;
>> >> +	/* Pairs with lockless read in sk_clone_lock() */
>> >> +	WRITE_ONCE(sk->sk_prot, ops);
>> >
>> >
>> > Note there are dozens of calls like
>> >
>> > if (sk->sk_prot->handler)
>> >     sk->sk_prot->handler(...);
>> >
>> > Some of them being done lockless.
>> >
>> > I know it is painful, but presumably we need
>
> Correct.
>
>> >
>> > const struct proto *ops = READ_ONCE(sk->sk_prot);
>> >
>> > if (ops->handler)
>> >     ops->handler(....);

[...]

>>
>> Considering these are lacking today, can I do it as a follow up?
>
> In my opinion yes. I think pulling your patches in is OK and I started
> doing this conversion already so we can have a fix shortly. I didn't
> want to push it into rc7 though so I'll push it next week or into
> net-next tree.

Thanks, John. I'll take a look if there is any access to sk_prot, apart
from the clone path, applicable only to TCP listening sockets that needs
annotating.
diff mbox series

Patch

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 41ea1258d15e..55c834a5c25e 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -352,7 +352,8 @@  static inline void sk_psock_update_proto(struct sock *sk,
 	psock->saved_write_space = sk->sk_write_space;
 
 	psock->sk_proto = sk->sk_prot;
-	sk->sk_prot = ops;
+	/* Pairs with lockless read in sk_clone_lock() */
+	WRITE_ONCE(sk->sk_prot, ops);
 }
 
 static inline void sk_psock_restore_proto(struct sock *sk,
diff --git a/net/core/sock.c b/net/core/sock.c
index a4c8fac781ff..3953bb23f4d0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1792,16 +1792,17 @@  static void sk_init_common(struct sock *sk)
  */
 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 {
+	struct proto *prot = READ_ONCE(sk->sk_prot);
 	struct sock *newsk;
 	bool is_charged = true;
 
-	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
+	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
 	if (newsk != NULL) {
 		struct sk_filter *filter;
 
 		sock_copy(newsk, sk);
 
-		newsk->sk_prot_creator = sk->sk_prot;
+		newsk->sk_prot_creator = prot;
 
 		/* SANITY */
 		if (likely(newsk->sk_net_refcnt))
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index e38705165ac9..4f25aba44ead 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -648,8 +648,10 @@  static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock)
 	/* Reinit occurs when program types change e.g. TCP_BPF_TX is removed
 	 * or added requiring sk_prot hook updates. We keep original saved
 	 * hooks in this case.
+	 *
+	 * Pairs with lockless read in sk_clone_lock().
 	 */
-	sk->sk_prot = &tcp_bpf_prots[family][config];
+	WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]);
 }
 
 static int tcp_bpf_assert_proto_ops(struct proto *ops)
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index 12ab5db2b71c..acd1ea0a66f7 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -104,7 +104,8 @@  void tcp_update_ulp(struct sock *sk, struct proto *proto)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (!icsk->icsk_ulp_ops) {
-		sk->sk_prot = proto;
+		/* Pairs with lockless read in sk_clone_lock() */
+		WRITE_ONCE(sk->sk_prot, proto);
 		return;
 	}
 
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index dac24c7aa7d4..f0748e951dea 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -740,7 +740,8 @@  static void tls_update(struct sock *sk, struct proto *p)
 	if (likely(ctx))
 		ctx->sk_proto = p;
 	else
-		sk->sk_prot = p;
+		/* Pairs with lockless read in sk_clone_lock() */
+		WRITE_ONCE(sk->sk_prot, p);
 }
 
 static int tls_get_info(const struct sock *sk, struct sk_buff *skb)