[bpf-next,v2,2/3] bpf: sockmap, add sock close() hook to remove socks

Message ID: 20180126181424.13175.69428.stgit@john-Precision-Tower-5810
State: Changes Requested, archived
Delegated to: BPF Maintainers
Series: bpf: sockmap fixes

Commit Message

John Fastabend Jan. 26, 2018, 6:14 p.m. UTC
The selftests test_maps program was leaving dangling BPF sockmap
programs around because not all psock elements were removed from
the map. The elements in turn hold a reference on the BPF program
they are attached to, causing BPF programs to stay loaded even
after test_maps has completed.

The original intent was that sk_state_change() would be called
when TCP socks transitioned through the TCP_CLOSE state. However,
because socks may be in the SOCK_DEAD state or the sock may be a
listening socket, the event is not always triggered.

To resolve this, use the ULP infrastructure and register our own
proto close() handler. This fixes the above case.

Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
Reported-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/net/tcp.h    |    2 +
 kernel/bpf/sockmap.c |  146 +++++++++++++++++++++++++++-----------------------
 2 files changed, 81 insertions(+), 67 deletions(-)
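
In short, the patch below interposes on the socket's proto ops: a ULP
init handler saves the original close() and swaps sk->sk_prot for
tcp_bpf_proto, a copy of tcp_prot whose close() unlinks the sock from
any maps before chaining to the saved handler. A condensed sketch of
that mechanism (distilled from the diff below, not a complete excerpt):

/* Sketch: save the original proto ops and install the BPF copy.
 * tcp_bpf_proto is tcp_prot with .close overridden to bpf_tcp_close.
 */
static int bpf_tcp_init(struct sock *sk)
{
	struct smap_psock *psock = smap_psock_sk(sk);

	if (unlikely(!psock))
		return -EINVAL;
	psock->save_close = sk->sk_prot->close;	/* original tcp close() */
	psock->sk_proto = sk->sk_prot;		/* restored in bpf_tcp_release() */
	sk->sk_prot = &tcp_bpf_proto;
	return 0;
}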

Comments

John Fastabend Jan. 26, 2018, 9:02 p.m. UTC | #1
On 01/26/2018 10:14 AM, John Fastabend wrote:
> The selftests test_maps program was leaving dangling BPF sockmap
> programs around because not all psock elements were removed from
> the map. The elements in turn hold a reference on the BPF program
> they are attached to causing BPF programs to stay open even after
> test_maps has completed.
> 
> The original intent was that sk_state_change() would be called
> when TCP socks transitioned through the TCP_CLOSE state. However,
> because socks may be in the SOCK_DEAD state or the sock may be a
> listening socket, the event is not always triggered.
> 
> To resolve this, use the ULP infrastructure and register our own
> proto close() handler. This fixes the above case.
> 
> Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support")
> Reported-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> ---

[...]

v3 will be needed.

> +void bpf_tcp_close(struct sock *sk, long timeout)
> +{
> +	struct smap_psock_map_entry *e, *tmp;
> +	struct smap_psock *psock;
> +	struct sock *osk;
> +
> +	psock = smap_psock_sk(sk);
> +	if (unlikely(!psock))
> +		return sk->sk_prot->close(sk, timeout);
> +
> +	write_lock_bh(&sk->sk_callback_lock);
> +	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
> +		osk = cmpxchg(e->entry, sk, NULL);
> +		if (osk == sk) {
> +			list_del(&e->list);
> +			smap_release_sock(psock, sk);
> +		}
> +	}
> +	write_unlock_bh(&sk->sk_callback_lock);
> +	return psock->save_close(sk, timeout);

We need this to run inside an RCU read-side critical section; otherwise
the release op could presumably free the psock out from under us. I've
not actually triggered this case, but it seems possible. Also, the
save_close call cannot be made inside the RCU critical section, so we
need to cache the function pointer and call it at the end. This is OK
because we have the sock lock held.

Probably like this,

void bpf_tcp_close(struct sock *sk, long timeout)
{
        struct smap_psock_map_entry *e, *tmp;
        struct smap_psock *psock;
        struct sock *osk;
        void (*close_fun)(struct sock *sk, long timeout);

        rcu_read_lock();
        psock = smap_psock_sk(sk);
        if (unlikely(!psock)) {
                /* No psock attached: fall back to the original close(),
                 * but drop the RCU read lock first.
                 */
                rcu_read_unlock();
                return sk->sk_prot->close(sk, timeout);
        }

        /* Although the psock may be destroyed after an RCU grace period,
         * the sk will not be, because we are holding the sock lock here.
         * So we can call the original close routine outside the RCU
         * critical section.
         */
        close_fun = psock->save_close;

        write_lock_bh(&sk->sk_callback_lock);
        list_for_each_entry_safe(e, tmp, &psock->maps, list) {
                osk = cmpxchg(e->entry, sk, NULL);
                if (osk == sk) {
                        list_del(&e->list);
                        smap_release_sock(psock, sk);
                }
        }
        write_unlock_bh(&sk->sk_callback_lock);
        rcu_read_unlock();
        close_fun(sk, timeout);
}
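
For context on why an RCU read section suffices: smap_release_sock()
does not free the psock synchronously; it defers teardown through an
RCU callback, so a reader inside rcu_read_lock() keeps a valid psock
even after the last map entry drops its reference. Roughly, from the
surrounding sockmap.c (a from-memory sketch for illustration —
smap_destroy_psock and the exact free path are assumed, not quoted
from this series):

static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{
        psock->refcnt--;
        if (psock->refcnt)
                return;

        tcp_cleanup_ulp(sock);
        smap_stop_sock(psock, sock);
        clear_bit(SMAP_TX_RUNNING, &psock->state);
        rcu_assign_sk_user_data(sock, NULL);
        /* Defer the free one grace period so concurrent RCU readers,
         * such as bpf_tcp_close() above, can finish with the psock.
         */
        call_rcu_sched(&psock->rcu, smap_destroy_psock);
}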

Patch

diff --git a/include/net/tcp.h b/include/net/tcp.h
index ba10ca7..e082101 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1985,6 +1985,7 @@  static inline void tcp_listendrop(const struct sock *sk)
 
 enum {
 	TCP_ULP_TLS,
+	TCP_ULP_BPF,
 };
 
 struct tcp_ulp_ops {
@@ -2003,6 +2004,7 @@  struct tcp_ulp_ops {
 int tcp_register_ulp(struct tcp_ulp_ops *type);
 void tcp_unregister_ulp(struct tcp_ulp_ops *type);
 int tcp_set_ulp(struct sock *sk, const char *name);
+int tcp_set_ulp_id(struct sock *sk, const int ulp);
 void tcp_get_available_ulp(char *buf, size_t len);
 void tcp_cleanup_ulp(struct sock *sk);
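
The hunk above only declares tcp_set_ulp_id(); its implementation lives
in net/ipv4/tcp_ulp.c rather than in this two-file diff. Presumably it
mirrors tcp_set_ulp() but resolves the ULP by its enum uid instead of
by name, so in-kernel users like sockmap can attach a ULP that is not
user-visible. A hypothetical sketch of such a helper (illustrative
only; the __tcp_ulp_lookup() helper here is assumed):

int tcp_set_ulp_id(struct sock *sk, const int ulp)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_ulp_ops *ulp_ops;
	int err;

	if (icsk->icsk_ulp_ops)
		return -EEXIST;

	ulp_ops = __tcp_ulp_lookup(ulp);	/* walk tcp_ulp_list by uid */
	if (!ulp_ops)
		return -ENOLINK;

	err = ulp_ops->init(sk);
	if (err)
		return err;

	icsk->icsk_ulp_ops = ulp_ops;
	return 0;
}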
 
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 0314d17..4e7ba37 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -86,9 +86,10 @@  struct smap_psock {
 	struct work_struct tx_work;
 	struct work_struct gc_work;
 
+	struct proto *sk_proto;
+	void (*save_close)(struct sock *sk, long timeout);
 	void (*save_data_ready)(struct sock *sk);
 	void (*save_write_space)(struct sock *sk);
-	void (*save_state_change)(struct sock *sk);
 };
 
 static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
@@ -96,12 +97,80 @@  static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
+struct proto tcp_bpf_proto;
+static int bpf_tcp_init(struct sock *sk)
+{
+	struct smap_psock *psock;
+
+	psock = smap_psock_sk(sk);
+	if (unlikely(!psock))
+		return -EINVAL;
+
+	if (unlikely(psock->sk_proto))
+		return -EBUSY;
+
+	psock->save_close = sk->sk_prot->close;
+	psock->sk_proto = sk->sk_prot;
+	sk->sk_prot = &tcp_bpf_proto;
+	return 0;
+}
+
+static void bpf_tcp_release(struct sock *sk)
+{
+	struct smap_psock *psock = smap_psock_sk(sk);
+
+	if (likely(psock)) {
+		sk->sk_prot = psock->sk_proto;
+		psock->sk_proto = NULL;
+	}
+}
+
+static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
+
+void bpf_tcp_close(struct sock *sk, long timeout)
+{
+	struct smap_psock_map_entry *e, *tmp;
+	struct smap_psock *psock;
+	struct sock *osk;
+
+	psock = smap_psock_sk(sk);
+	if (unlikely(!psock))
+		return sk->sk_prot->close(sk, timeout);
+
+	write_lock_bh(&sk->sk_callback_lock);
+	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
+		osk = cmpxchg(e->entry, sk, NULL);
+		if (osk == sk) {
+			list_del(&e->list);
+			smap_release_sock(psock, sk);
+		}
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+	return psock->save_close(sk, timeout);
+}
+
 enum __sk_action {
 	__SK_DROP = 0,
 	__SK_PASS,
 	__SK_REDIRECT,
 };
 
+static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
+	.name		= "bpf_tcp",
+	.uid		= TCP_ULP_BPF,
+	.user_visible	= false,
+	.owner		= NULL,
+	.init		= bpf_tcp_init,
+	.release	= bpf_tcp_release,
+};
+
+static int bpf_tcp_ulp_register(void)
+{
+	tcp_bpf_proto = tcp_prot;
+	tcp_bpf_proto.close = bpf_tcp_close;
+	return tcp_register_ulp(&bpf_tcp_ulp_ops);
+}
+
 static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 {
 	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
@@ -166,68 +235,6 @@  static void smap_report_sk_error(struct smap_psock *psock, int err)
 	sk->sk_error_report(sk);
 }
 
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-
-/* Called with lock_sock(sk) held */
-static void smap_state_change(struct sock *sk)
-{
-	struct smap_psock_map_entry *e, *tmp;
-	struct smap_psock *psock;
-	struct socket_wq *wq;
-	struct sock *osk;
-
-	rcu_read_lock();
-
-	/* Allowing transitions into an established syn_recv states allows
-	 * for early binding sockets to a smap object before the connection
-	 * is established.
-	 */
-	switch (sk->sk_state) {
-	case TCP_SYN_SENT:
-	case TCP_SYN_RECV:
-	case TCP_ESTABLISHED:
-		break;
-	case TCP_CLOSE_WAIT:
-	case TCP_CLOSING:
-	case TCP_LAST_ACK:
-	case TCP_FIN_WAIT1:
-	case TCP_FIN_WAIT2:
-	case TCP_LISTEN:
-		break;
-	case TCP_CLOSE:
-		/* Only release if the map entry is in fact the sock in
-		 * question. There is a case where the operator deletes
-		 * the sock from the map, but the TCP sock is closed before
-		 * the psock is detached. Use cmpxchg to verify correct
-		 * sock is removed.
-		 */
-		psock = smap_psock_sk(sk);
-		if (unlikely(!psock))
-			break;
-		write_lock_bh(&sk->sk_callback_lock);
-		list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-			osk = cmpxchg(e->entry, sk, NULL);
-			if (osk == sk) {
-				list_del(&e->list);
-				smap_release_sock(psock, sk);
-			}
-		}
-		write_unlock_bh(&sk->sk_callback_lock);
-		break;
-	default:
-		psock = smap_psock_sk(sk);
-		if (unlikely(!psock))
-			break;
-		smap_report_sk_error(psock, EPIPE);
-		break;
-	}
-
-	wq = rcu_dereference(sk->sk_wq);
-	if (skwq_has_sleeper(wq))
-		wake_up_interruptible_all(&wq->wait);
-	rcu_read_unlock();
-}
-
 static void smap_read_sock_strparser(struct strparser *strp,
 				     struct sk_buff *skb)
 {
@@ -322,10 +329,8 @@  static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
 		return;
 	sk->sk_data_ready = psock->save_data_ready;
 	sk->sk_write_space = psock->save_write_space;
-	sk->sk_state_change = psock->save_state_change;
 	psock->save_data_ready = NULL;
 	psock->save_write_space = NULL;
-	psock->save_state_change = NULL;
 	strp_stop(&psock->strp);
 	psock->strp_enabled = false;
 }
@@ -350,6 +355,7 @@  static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
 	if (psock->refcnt)
 		return;
 
+	tcp_cleanup_ulp(sock);
 	smap_stop_sock(psock, sock);
 	clear_bit(SMAP_TX_RUNNING, &psock->state);
 	rcu_assign_sk_user_data(sock, NULL);
@@ -427,10 +433,8 @@  static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
 		return;
 	psock->save_data_ready = sk->sk_data_ready;
 	psock->save_write_space = sk->sk_write_space;
-	psock->save_state_change = sk->sk_state_change;
 	sk->sk_data_ready = smap_data_ready;
 	sk->sk_write_space = smap_write_space;
-	sk->sk_state_change = smap_state_change;
 	psock->strp_enabled = true;
 }
 
@@ -509,6 +513,10 @@  static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 	if (attr->value_size > KMALLOC_MAX_SIZE)
 		return ERR_PTR(-E2BIG);
 
+	err = bpf_tcp_ulp_register();
+	if (err && err != -EEXIST)
+		return ERR_PTR(err);
+
 	stab = kzalloc(sizeof(*stab), GFP_USER);
 	if (!stab)
 		return ERR_PTR(-ENOMEM);
@@ -754,6 +762,10 @@  static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
 			goto out_progs;
 		}
 
+		err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
+		if (err)
+			goto out_progs;
+
 		set_bit(SMAP_TX_RUNNING, &psock->state);
 	}