diff mbox series

[bpf-next,v2,3/4] bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT:

Message ID 20180327172322.11354.54016.stgit@john-Precision-Tower-5810
State Changes Requested, archived
Delegated to: BPF Maintainers
Headers show
Series bpf, sockmap BPF_F_INGRESS support | expand

Commit Message

John Fastabend March 27, 2018, 5:23 p.m. UTC
Add support for the BPF_F_INGRESS flag in the skb redirect helper. To
do this, convert the skb into a scatterlist and push it into the
ingress queue. This is the same logic that is used in the sk_msg
redirect helper, so it should feel familiar.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/linux/filter.h |    1 +
 kernel/bpf/sockmap.c   |   94 +++++++++++++++++++++++++++++++++++++++---------
 net/core/filter.c      |    2 +
 3 files changed, 78 insertions(+), 19 deletions(-)

Comments

Daniel Borkmann March 28, 2018, 2:21 p.m. UTC | #1
On 03/27/2018 07:23 PM, John Fastabend wrote:
> Add support for the BPF_F_INGRESS flag in skb redirect helper. To
> do this convert skb into a scatterlist and push into ingress queue.
> This is the same logic that is used in the sk_msg redirect helper
> so it should feel familiar.
> 
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> ---
>  include/linux/filter.h |    1 +
>  kernel/bpf/sockmap.c   |   94 +++++++++++++++++++++++++++++++++++++++---------
>  net/core/filter.c      |    2 +
>  3 files changed, 78 insertions(+), 19 deletions(-)
[...]
>  		if (!sg->length && md->sg_start == md->sg_end) {
>  			list_del(&md->list);
> +			if (md->skb)
> +				consume_skb(md->skb);
>  			kfree(md);
>  		}
>  	}
> @@ -1045,27 +1048,72 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
>  		__SK_DROP;
>  }
>  
> +static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
> +{
> +	struct sock *sk = psock->sock;
> +	int copied = 0, num_sg;
> +	struct sk_msg_buff *r;
> +
> +	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
> +	if (unlikely(!r))
> +		return -EAGAIN;
> +
> +	if (!sk_rmem_schedule(sk, skb, skb->len)) {
> +		kfree(r);
> +		return -EAGAIN;
> +	}
> +	sk_mem_charge(sk, skb->len);

Usually mem accounting is based on truesize. This is not done here since
you need the exact length of the skb for the sg list later on, right?

> +	sg_init_table(r->sg_data, MAX_SKB_FRAGS);
> +	num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
> +	if (unlikely(num_sg < 0)) {
> +		kfree(r);

Don't we need to undo the mem charge here in case of error?

> +		return num_sg;
> +	}
> +	copied = skb->len;
> +	r->sg_start = 0;
> +	r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg;
> +	r->skb = skb;
> +	list_add_tail(&r->list, &psock->ingress);
> +	sk->sk_data_ready(sk);
> +	return copied;
> +}
John Fastabend March 28, 2018, 3:45 p.m. UTC | #2
On 03/28/2018 07:21 AM, Daniel Borkmann wrote:
> On 03/27/2018 07:23 PM, John Fastabend wrote:
>> Add support for the BPF_F_INGRESS flag in skb redirect helper. To
>> do this convert skb into a scatterlist and push into ingress queue.
>> This is the same logic that is used in the sk_msg redirect helper
>> so it should feel familiar.
>>
>> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
>> ---
>>  include/linux/filter.h |    1 +
>>  kernel/bpf/sockmap.c   |   94 +++++++++++++++++++++++++++++++++++++++---------
>>  net/core/filter.c      |    2 +
>>  3 files changed, 78 insertions(+), 19 deletions(-)
> [...]
>>  		if (!sg->length && md->sg_start == md->sg_end) {
>>  			list_del(&md->list);
>> +			if (md->skb)
>> +				consume_skb(md->skb);
>>  			kfree(md);
>>  		}
>>  	}
>> @@ -1045,27 +1048,72 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
>>  		__SK_DROP;
>>  }
>>  
>> +static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
>> +{
>> +	struct sock *sk = psock->sock;
>> +	int copied = 0, num_sg;
>> +	struct sk_msg_buff *r;
>> +
>> +	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
>> +	if (unlikely(!r))
>> +		return -EAGAIN;
>> +
>> +	if (!sk_rmem_schedule(sk, skb, skb->len)) {
>> +		kfree(r);
>> +		return -EAGAIN;
>> +	}
>> +	sk_mem_charge(sk, skb->len);
> 
> Usually mem accounting is based on truesize. This is not done here since
> you need the exact length of the skb for the sg list later on, right?

Correct.

> 
>> +	sg_init_table(r->sg_data, MAX_SKB_FRAGS);
>> +	num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
>> +	if (unlikely(num_sg < 0)) {
>> +		kfree(r);
> 
> Don't we need to undo the mem charge here in case of error?
> 

Actually, I'll just move the sk_mem_charge() down below this error
check; then we don't need to unwind it.

>> +		return num_sg;
>> +	}
>> +	copied = skb->len;
>> +	r->sg_start = 0;
>> +	r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg;
>> +	r->skb = skb;
>> +	list_add_tail(&r->list, &psock->ingress);
>> +	sk->sk_data_ready(sk);
>> +	return copied;
>> +}
Daniel Borkmann March 28, 2018, 3:46 p.m. UTC | #3
On 03/28/2018 05:45 PM, John Fastabend wrote:
> On 03/28/2018 07:21 AM, Daniel Borkmann wrote:
>> On 03/27/2018 07:23 PM, John Fastabend wrote:
>>> Add support for the BPF_F_INGRESS flag in skb redirect helper. To
>>> do this convert skb into a scatterlist and push into ingress queue.
>>> This is the same logic that is used in the sk_msg redirect helper
>>> so it should feel familiar.
>>>
>>> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
>>> ---
>>>  include/linux/filter.h |    1 +
>>>  kernel/bpf/sockmap.c   |   94 +++++++++++++++++++++++++++++++++++++++---------
>>>  net/core/filter.c      |    2 +
>>>  3 files changed, 78 insertions(+), 19 deletions(-)
>> [...]
>>>  		if (!sg->length && md->sg_start == md->sg_end) {
>>>  			list_del(&md->list);
>>> +			if (md->skb)
>>> +				consume_skb(md->skb);
>>>  			kfree(md);
>>>  		}
>>>  	}
>>> @@ -1045,27 +1048,72 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
>>>  		__SK_DROP;
>>>  }
>>>  
>>> +static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
>>> +{
>>> +	struct sock *sk = psock->sock;
>>> +	int copied = 0, num_sg;
>>> +	struct sk_msg_buff *r;
>>> +
>>> +	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
>>> +	if (unlikely(!r))
>>> +		return -EAGAIN;
>>> +
>>> +	if (!sk_rmem_schedule(sk, skb, skb->len)) {
>>> +		kfree(r);
>>> +		return -EAGAIN;
>>> +	}
>>> +	sk_mem_charge(sk, skb->len);
>>
>> Usually mem accounting is based on truesize. This is not done here since
>> you need the exact length of the skb for the sg list later on, right?
> 
> Correct.
> 
>>
>>> +	sg_init_table(r->sg_data, MAX_SKB_FRAGS);
>>> +	num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
>>> +	if (unlikely(num_sg < 0)) {
>>> +		kfree(r);
>>
>> Don't we need to undo the mem charge here in case of error?
>>
> 
> Actually, I'll just move the sk_mem_charge() down below this error
> then we don't need to unwind it.

Agree, good point.
diff mbox series

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index d0e207f..7de778a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -521,6 +521,7 @@  struct sk_msg_buff {
 	__u32 key;
 	__u32 flags;
 	struct bpf_map *map;
+	struct sk_buff *skb;
 	struct list_head list;
 };
 
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index b30bada..b52e1c4 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -785,7 +785,8 @@  static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 				i++;
 				if (i == MAX_SKB_FRAGS)
 					i = 0;
-				put_page(page);
+				if (!md->skb)
+					put_page(page);
 			}
 			if (copied == len)
 				break;
@@ -794,6 +795,8 @@  static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 		if (!sg->length && md->sg_start == md->sg_end) {
 			list_del(&md->list);
+			if (md->skb)
+				consume_skb(md->skb);
 			kfree(md);
 		}
 	}
@@ -1045,27 +1048,72 @@  static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 		__SK_DROP;
 }
 
+static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
+{
+	struct sock *sk = psock->sock;
+	int copied = 0, num_sg;
+	struct sk_msg_buff *r;
+
+	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
+	if (unlikely(!r))
+		return -EAGAIN;
+
+	if (!sk_rmem_schedule(sk, skb, skb->len)) {
+		kfree(r);
+		return -EAGAIN;
+	}
+	sk_mem_charge(sk, skb->len);
+
+	sg_init_table(r->sg_data, MAX_SKB_FRAGS);
+	num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
+	if (unlikely(num_sg < 0)) {
+		kfree(r);
+		return num_sg;
+	}
+	copied = skb->len;
+	r->sg_start = 0;
+	r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg;
+	r->skb = skb;
+	list_add_tail(&r->list, &psock->ingress);
+	sk->sk_data_ready(sk);
+	return copied;
+}
+
 static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
 {
+	struct smap_psock *peer;
 	struct sock *sk;
+	__u32 in;
 	int rc;
 
 	rc = smap_verdict_func(psock, skb);
 	switch (rc) {
 	case __SK_REDIRECT:
 		sk = do_sk_redirect_map(skb);
-		if (likely(sk)) {
-			struct smap_psock *peer = smap_psock_sk(sk);
-
-			if (likely(peer &&
-				   test_bit(SMAP_TX_RUNNING, &peer->state) &&
-				   !sock_flag(sk, SOCK_DEAD) &&
-				   sock_writeable(sk))) {
-				skb_set_owner_w(skb, sk);
-				skb_queue_tail(&peer->rxqueue, skb);
-				schedule_work(&peer->tx_work);
-				break;
-			}
+		if (!sk) {
+			kfree_skb(skb);
+			break;
+		}
+
+		peer = smap_psock_sk(sk);
+		in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
+
+		if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) ||
+			     !test_bit(SMAP_TX_RUNNING, &peer->state))) {
+			kfree_skb(skb);
+			break;
+		}
+
+		if (!in && sock_writeable(sk)) {
+			skb_set_owner_w(skb, sk);
+			skb_queue_tail(&peer->rxqueue, skb);
+			schedule_work(&peer->tx_work);
+			break;
+		} else if (in &&
+			   atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
+			skb_queue_tail(&peer->rxqueue, skb);
+			schedule_work(&peer->tx_work);
+			break;
 		}
 	/* Fall through and free skb otherwise */
 	case __SK_DROP:
@@ -1127,15 +1175,23 @@  static void smap_tx_work(struct work_struct *w)
 	}
 
 	while ((skb = skb_dequeue(&psock->rxqueue))) {
+		__u32 flags;
+
 		rem = skb->len;
 		off = 0;
 start:
+		flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
 		do {
-			if (likely(psock->sock->sk_socket))
-				n = skb_send_sock_locked(psock->sock,
-							 skb, off, rem);
-			else
+			if (likely(psock->sock->sk_socket)) {
+				if (flags)
+					n = smap_do_ingress(psock, skb);
+				else
+					n = skb_send_sock_locked(psock->sock,
+								 skb, off, rem);
+			} else {
 				n = -EINVAL;
+			}
+
 			if (n <= 0) {
 				if (n == -EAGAIN) {
 					/* Retry when space is available */
@@ -1153,7 +1209,9 @@  static void smap_tx_work(struct work_struct *w)
 			rem -= n;
 			off += n;
 		} while (rem);
-		kfree_skb(skb);
+
+		if (!flags)
+			kfree_skb(skb);
 	}
 out:
 	release_sock(psock->sock);
diff --git a/net/core/filter.c b/net/core/filter.c
index 11b1f16..b46916d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1855,7 +1855,7 @@  int skb_do_redirect(struct sk_buff *skb)
 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
 	/* If user passes invalid input drop the packet. */
-	if (unlikely(flags))
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
 		return SK_DROP;
 
 	tcb->bpf.key = key;