Message ID | 20180613175014.4264.84395.stgit@john-Precision-Tower-5810 |
---|---|
State | Changes Requested, archived |
Delegated to: | BPF Maintainers |
Headers | show |
Series | BPF fixes for sockhash | expand |
On Wed, Jun 13, 2018 at 10:50:14AM -0700, John Fastabend wrote: > After adding checks to ensure TCP is in ESTABLISHED state when a > sock is added we need to also ensure that user does not transition > through tcp_disconnect() and back into ESTABLISHED state without > sockmap removing the sock. > > To do this add unhash hook and remove sock from map there. In bpf_tcp_init(): sk->sk_prot = &tcp_bpf_proto; I may have missed a lock while reading sockmap.c. Is it possible that tcp_disconnect() is being called while the above assignment is also being done (e.g. through BPF_MAP_UPDATE_ELEM)? The same situation go for the ESTABLISHED check. > > Reported-by: Eric Dumazet <edumazet@google.com> > Fixes: 81110384441a ("bpf: sockmap, add hash map support") > Signed-off-by: John Fastabend <john.fastabend@gmail.com> > --- > kernel/bpf/sockmap.c | 67 +++++++++++++++++++++++++++++++++++++------------- > 1 file changed, 50 insertions(+), 17 deletions(-) > > diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c > index 2e848cd..91c7b47 100644 > --- a/kernel/bpf/sockmap.c > +++ b/kernel/bpf/sockmap.c > @@ -130,6 +130,7 @@ struct smap_psock { > > struct proto *sk_proto; > void (*save_close)(struct sock *sk, long timeout); > + void (*save_unhash)(struct sock *sk); > void (*save_data_ready)(struct sock *sk); > void (*save_write_space)(struct sock *sk); > }; > @@ -141,6 +142,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, > static int bpf_tcp_sendpage(struct sock *sk, struct page *page, > int offset, size_t size, int flags); > static void bpf_tcp_close(struct sock *sk, long timeout); > +static void bpf_tcp_unhash(struct sock *sk); > > static inline struct smap_psock *smap_psock_sk(const struct sock *sk) > { > @@ -182,6 +184,7 @@ static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], > { > prot[SOCKMAP_BASE] = *base; > prot[SOCKMAP_BASE].close = bpf_tcp_close; > + prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash; > prot[SOCKMAP_BASE].recvmsg = 
bpf_tcp_recvmsg; > prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; > > @@ -215,6 +218,7 @@ static int bpf_tcp_init(struct sock *sk) > } > > psock->save_close = sk->sk_prot->close; > + psock->save_unhash = sk->sk_prot->unhash; > psock->sk_proto = sk->sk_prot; > > /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */ > @@ -302,28 +306,12 @@ struct smap_psock_map_entry *psock_map_pop(struct sock *sk, > return e; > } > > -static void bpf_tcp_close(struct sock *sk, long timeout) > +static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) > { > - void (*close_fun)(struct sock *sk, long timeout); > struct smap_psock_map_entry *e; > struct sk_msg_buff *md, *mtmp; > - struct smap_psock *psock; > struct sock *osk; > > - rcu_read_lock(); > - psock = smap_psock_sk(sk); > - if (unlikely(!psock)) { > - rcu_read_unlock(); > - return sk->sk_prot->close(sk, timeout); > - } > - > - /* The psock may be destroyed anytime after exiting the RCU critial > - * section so by the time we use close_fun the psock may no longer > - * be valid. However, bpf_tcp_close is called with the sock lock > - * held so the close hook and sk are still valid. > - */ > - close_fun = psock->save_close; > - > if (psock->cork) { > free_start_sg(psock->sock, psock->cork); > kfree(psock->cork); > @@ -378,6 +366,51 @@ static void bpf_tcp_close(struct sock *sk, long timeout) > } > e = psock_map_pop(sk, psock); > } > +} > + > +static void bpf_tcp_unhash(struct sock *sk) > +{ > + void (*unhash_fun)(struct sock *sk); > + struct smap_psock *psock; > + > + rcu_read_lock(); > + psock = smap_psock_sk(sk); > + if (unlikely(!psock)) { > + rcu_read_unlock(); > + return sk->sk_prot->unhash(sk); > + } > + > + /* The psock may be destroyed anytime after exiting the RCU critial > + * section so by the time we use close_fun the psock may no longer > + * be valid. However, bpf_tcp_close is called with the sock lock > + * held so the close hook and sk are still valid. 
> + */ > + unhash_fun = psock->save_unhash; > + bpf_tcp_remove(sk, psock); > + rcu_read_unlock(); > + unhash_fun(sk); > + > +} > + > +static void bpf_tcp_close(struct sock *sk, long timeout) > +{ > + void (*close_fun)(struct sock *sk, long timeout); > + struct smap_psock *psock; > + > + rcu_read_lock(); > + psock = smap_psock_sk(sk); > + if (unlikely(!psock)) { > + rcu_read_unlock(); > + return sk->sk_prot->close(sk, timeout); > + } > + > + /* The psock may be destroyed anytime after exiting the RCU critial > + * section so by the time we use close_fun the psock may no longer > + * be valid. However, bpf_tcp_close is called with the sock lock > + * held so the close hook and sk are still valid. > + */ > + close_fun = psock->save_close; > + bpf_tcp_remove(sk, psock); > rcu_read_unlock(); > close_fun(sk, timeout); > } >
On 06/13/2018 05:56 PM, Martin KaFai Lau wrote: > On Wed, Jun 13, 2018 at 10:50:14AM -0700, John Fastabend wrote: >> After adding checks to ensure TCP is in ESTABLISHED state when a >> sock is added we need to also ensure that user does not transition >> through tcp_disconnect() and back into ESTABLISHED state without >> sockmap removing the sock. >> >> To do this add unhash hook and remove sock from map there. > In bpf_tcp_init(): > sk->sk_prot = &tcp_bpf_proto; > > I may have missed a lock while reading sockmap.c. > Is it possible that tcp_disconnect() is being called while > the above assignment is also being done (e.g. through BPF_MAP_UPDATE_ELEM)? > The same situation go for the ESTABLISHED check. > Right, because ESTABLISHED is checked without any locking it's possible that the state changes during the update (from userspace BPF_MAP_UPDATE; from a sock_ops program it is locked). I have the below patch on my tree now, I was thinking to send it as a follow-on but on second thought it likely makes more sense as part of the patch that adds the ESTABLISHED check. Also after the below the sk_callback lock used to protect psock->maps is becoming increasingly pointless; it allows the delete and map free ops to be called without taking the full sock lock. It might be time to just drop it in bpf-next and use the sock lock in the delete cases. The more annoying part will be the delete will have to have different userspace and bpf program helpers so we know when we need the lock. --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -2074,17 +2074,20 @@ static int sock_map_update_elem(struct bpf_map *map, return -EINVAL; } + lock_sock(skops.sk); /* ULPs are currently supported only for TCP sockets in ESTABLISHED * state. 
*/ if (skops.sk->sk_type != SOCK_STREAM || skops.sk->sk_protocol != IPPROTO_TCP || skops.sk->sk_state != TCP_ESTABLISHED) { - fput(socket->file); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto out; } err = sock_map_ctx_update_elem(&skops, map, key, flags); +out: + release_sock(skops.sk); fput(socket->file); return err; } @@ -2423,17 +2426,20 @@ static int sock_hash_update_elem(struct bpf_map *map, return -EINVAL; } + lock_sock(skops.sk); /* ULPs are currently supported only for TCP sockets in ESTABLISHED * state. */ if (skops.sk->sk_type != SOCK_STREAM || skops.sk->sk_protocol != IPPROTO_TCP || skops.sk->sk_state != TCP_ESTABLISHED) { - fput(socket->file); - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + goto out; } err = sock_hash_ctx_update_elem(&skops, map, key, flags); +out: + release_sock(skops.sk); fput(socket->file); return err;
On 06/13/2018 10:48 PM, John Fastabend wrote: > On 06/13/2018 05:56 PM, Martin KaFai Lau wrote: >> On Wed, Jun 13, 2018 at 10:50:14AM -0700, John Fastabend wrote: >>> After adding checks to ensure TCP is in ESTABLISHED state when a >>> sock is added we need to also ensure that user does not transition >>> through tcp_disconnect() and back into ESTABLISHED state without >>> sockmap removing the sock. >>> >>> To do this add unhash hook and remove sock from map there. >> In bpf_tcp_init(): >> sk->sk_prot = &tcp_bpf_proto; >> >> I may have missed a lock while reading sockmap.c. >> Is it possible that tcp_disconnect() is being called while >> the above assignment is also being done (e.g. through BPF_MAP_UPDATE_ELEM)? >> The same situation go for the ESTABLISHED check. >> > > Right because ESTABLISHED is checked without any locking its > possible that the state changes during the update (from userspce > BPF_MAP_UPDATE, from sock_ops program it is locked). I have > the below patch on my tree now, I was thinking to send it as > a follow on but on second thought it likely makes more sense > as part of the patch that adds the ESTABLISHED check. > > Also after the below the sk_callback lock used to protect > psock->maps is becoming increasingly pointless it allows the > delete and map free ops to be called without taking the full > sock lock. It might be time to just drop it in bpf-next and > use the sock lock in the delete cases. The more annoying part > will be the delete will have to have different userspace and > bpf program helpers so we know when we need the lock. > > --- a/kernel/bpf/sockmap.c Hi Martin, I went ahead and sent a v2 with the sock lock addition included. Thanks, John
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 2e848cd..91c7b47 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -130,6 +130,7 @@ struct smap_psock { struct proto *sk_proto; void (*save_close)(struct sock *sk, long timeout); + void (*save_unhash)(struct sock *sk); void (*save_data_ready)(struct sock *sk); void (*save_write_space)(struct sock *sk); }; @@ -141,6 +142,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); static void bpf_tcp_close(struct sock *sk, long timeout); +static void bpf_tcp_unhash(struct sock *sk); static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { @@ -182,6 +184,7 @@ static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS], { prot[SOCKMAP_BASE] = *base; prot[SOCKMAP_BASE].close = bpf_tcp_close; + prot[SOCKMAP_BASE].unhash = bpf_tcp_unhash; prot[SOCKMAP_BASE].recvmsg = bpf_tcp_recvmsg; prot[SOCKMAP_BASE].stream_memory_read = bpf_tcp_stream_read; @@ -215,6 +218,7 @@ static int bpf_tcp_init(struct sock *sk) } psock->save_close = sk->sk_prot->close; + psock->save_unhash = sk->sk_prot->unhash; psock->sk_proto = sk->sk_prot; /* Build IPv6 sockmap whenever the address of tcpv6_prot changes */ @@ -302,28 +306,12 @@ struct smap_psock_map_entry *psock_map_pop(struct sock *sk, return e; } -static void bpf_tcp_close(struct sock *sk, long timeout) +static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock) { - void (*close_fun)(struct sock *sk, long timeout); struct smap_psock_map_entry *e; struct sk_msg_buff *md, *mtmp; - struct smap_psock *psock; struct sock *osk; - rcu_read_lock(); - psock = smap_psock_sk(sk); - if (unlikely(!psock)) { - rcu_read_unlock(); - return sk->sk_prot->close(sk, timeout); - } - - /* The psock may be destroyed anytime after exiting the RCU critial - * section so by the time we use close_fun the psock may no longer - * be valid. 
However, bpf_tcp_close is called with the sock lock - * held so the close hook and sk are still valid. - */ - close_fun = psock->save_close; - if (psock->cork) { free_start_sg(psock->sock, psock->cork); kfree(psock->cork); @@ -378,6 +366,51 @@ static void bpf_tcp_close(struct sock *sk, long timeout) } e = psock_map_pop(sk, psock); } +} + +static void bpf_tcp_unhash(struct sock *sk) +{ + void (*unhash_fun)(struct sock *sk); + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return sk->sk_prot->unhash(sk); + } + + /* The psock may be destroyed anytime after exiting the RCU critial + * section so by the time we use close_fun the psock may no longer + * be valid. However, bpf_tcp_close is called with the sock lock + * held so the close hook and sk are still valid. + */ + unhash_fun = psock->save_unhash; + bpf_tcp_remove(sk, psock); + rcu_read_unlock(); + unhash_fun(sk); + +} + +static void bpf_tcp_close(struct sock *sk, long timeout) +{ + void (*close_fun)(struct sock *sk, long timeout); + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return sk->sk_prot->close(sk, timeout); + } + + /* The psock may be destroyed anytime after exiting the RCU critial + * section so by the time we use close_fun the psock may no longer + * be valid. However, bpf_tcp_close is called with the sock lock + * held so the close hook and sk are still valid. + */ + close_fun = psock->save_close; + bpf_tcp_remove(sk, psock); rcu_read_unlock(); close_fun(sk, timeout); }
After adding checks to ensure TCP is in ESTABLISHED state when a sock is added we need to also ensure that the user does not transition through tcp_disconnect() and back into ESTABLISHED state without sockmap removing the sock. To do this, add an unhash hook and remove the sock from the map there. Reported-by: Eric Dumazet <edumazet@google.com> Fixes: 81110384441a ("bpf: sockmap, add hash map support") Signed-off-by: John Fastabend <john.fastabend@gmail.com> --- kernel/bpf/sockmap.c | 67 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 17 deletions(-)