Message ID: 20180327172322.11354.54016.stgit@john-Precision-Tower-5810
State: Changes Requested, archived
Delegated to: BPF Maintainers
Series: bpf, sockmap BPF_F_INGRESS support
On 03/27/2018 07:23 PM, John Fastabend wrote:
> Add support for the BPF_F_INGRESS flag in skb redirect helper. To
> do this convert skb into a scatterlist and push into ingress queue.
> This is the same logic that is used in the sk_msg redirect helper
> so it should feel familiar.
>
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> ---
>  include/linux/filter.h |    1 +
>  kernel/bpf/sockmap.c   |   94 +++++++++++++++++++++++++++++++++++++++---------
>  net/core/filter.c      |    2 +
>  3 files changed, 78 insertions(+), 19 deletions(-)
[...]
>  		if (!sg->length && md->sg_start == md->sg_end) {
>  			list_del(&md->list);
> +			if (md->skb)
> +				consume_skb(md->skb);
>  			kfree(md);
>  		}
>  	}
> @@ -1045,27 +1048,72 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
>  		__SK_DROP;
>  }
>
> +static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
> +{
> +	struct sock *sk = psock->sock;
> +	int copied = 0, num_sg;
> +	struct sk_msg_buff *r;
> +
> +	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
> +	if (unlikely(!r))
> +		return -EAGAIN;
> +
> +	if (!sk_rmem_schedule(sk, skb, skb->len)) {
> +		kfree(r);
> +		return -EAGAIN;
> +	}
> +	sk_mem_charge(sk, skb->len);

Usually mem accounting is based on truesize. This is not done here since
you need the exact length of the skb for the sg list later on, right?

> +	sg_init_table(r->sg_data, MAX_SKB_FRAGS);
> +	num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
> +	if (unlikely(num_sg < 0)) {
> +		kfree(r);

Don't we need to undo the mem charge here in case of error?

> +		return num_sg;
> +	}
> +	copied = skb->len;
> +	r->sg_start = 0;
> +	r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg;
> +	r->skb = skb;
> +	list_add_tail(&r->list, &psock->ingress);
> +	sk->sk_data_ready(sk);
> +	return copied;
> +}
On 03/28/2018 07:21 AM, Daniel Borkmann wrote:
> On 03/27/2018 07:23 PM, John Fastabend wrote:
>> Add support for the BPF_F_INGRESS flag in skb redirect helper. To
>> do this convert skb into a scatterlist and push into ingress queue.
>> This is the same logic that is used in the sk_msg redirect helper
>> so it should feel familiar.
>>
>> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
>> ---
>>  include/linux/filter.h |    1 +
>>  kernel/bpf/sockmap.c   |   94 +++++++++++++++++++++++++++++++++++++++---------
>>  net/core/filter.c      |    2 +
>>  3 files changed, 78 insertions(+), 19 deletions(-)
> [...]
>>  		if (!sg->length && md->sg_start == md->sg_end) {
>>  			list_del(&md->list);
>> +			if (md->skb)
>> +				consume_skb(md->skb);
>>  			kfree(md);
>>  		}
>>  	}
>> @@ -1045,27 +1048,72 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
>>  		__SK_DROP;
>>  }
>>
>> +static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
>> +{
>> +	struct sock *sk = psock->sock;
>> +	int copied = 0, num_sg;
>> +	struct sk_msg_buff *r;
>> +
>> +	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
>> +	if (unlikely(!r))
>> +		return -EAGAIN;
>> +
>> +	if (!sk_rmem_schedule(sk, skb, skb->len)) {
>> +		kfree(r);
>> +		return -EAGAIN;
>> +	}
>> +	sk_mem_charge(sk, skb->len);
>
> Usually mem accounting is based on truesize. This is not done here since
> you need the exact length of the skb for the sg list later on, right?

Correct.

>
>> +	sg_init_table(r->sg_data, MAX_SKB_FRAGS);
>> +	num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
>> +	if (unlikely(num_sg < 0)) {
>> +		kfree(r);
>
> Don't we need to undo the mem charge here in case of error?
>

Actually, I'll just move the sk_mem_charge() down below this error
then we don't need to unwind it.

>> +		return num_sg;
>> +	}
>> +	copied = skb->len;
>> +	r->sg_start = 0;
>> +	r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg;
>> +	r->skb = skb;
>> +	list_add_tail(&r->list, &psock->ingress);
>> +	sk->sk_data_ready(sk);
>> +	return copied;
>> +}
On 03/28/2018 05:45 PM, John Fastabend wrote:
> On 03/28/2018 07:21 AM, Daniel Borkmann wrote:
>> On 03/27/2018 07:23 PM, John Fastabend wrote:
>>> Add support for the BPF_F_INGRESS flag in skb redirect helper. To
>>> do this convert skb into a scatterlist and push into ingress queue.
>>> This is the same logic that is used in the sk_msg redirect helper
>>> so it should feel familiar.
>>>
>>> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
>>> ---
>>>  include/linux/filter.h |    1 +
>>>  kernel/bpf/sockmap.c   |   94 +++++++++++++++++++++++++++++++++++++++---------
>>>  net/core/filter.c      |    2 +
>>>  3 files changed, 78 insertions(+), 19 deletions(-)
>> [...]
>>>  		if (!sg->length && md->sg_start == md->sg_end) {
>>>  			list_del(&md->list);
>>> +			if (md->skb)
>>> +				consume_skb(md->skb);
>>>  			kfree(md);
>>>  		}
>>>  	}
>>> @@ -1045,27 +1048,72 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
>>>  		__SK_DROP;
>>>  }
>>>
>>> +static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
>>> +{
>>> +	struct sock *sk = psock->sock;
>>> +	int copied = 0, num_sg;
>>> +	struct sk_msg_buff *r;
>>> +
>>> +	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
>>> +	if (unlikely(!r))
>>> +		return -EAGAIN;
>>> +
>>> +	if (!sk_rmem_schedule(sk, skb, skb->len)) {
>>> +		kfree(r);
>>> +		return -EAGAIN;
>>> +	}
>>> +	sk_mem_charge(sk, skb->len);
>>
>> Usually mem accounting is based on truesize. This is not done here since
>> you need the exact length of the skb for the sg list later on, right?
>
> Correct.
>
>>
>>> +	sg_init_table(r->sg_data, MAX_SKB_FRAGS);
>>> +	num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
>>> +	if (unlikely(num_sg < 0)) {
>>> +		kfree(r);
>>
>> Don't we need to undo the mem charge here in case of error?
>>
>
> Actually, I'll just move the sk_mem_charge() down below this error
> then we don't need to unwind it.

Agree, good point.
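
For illustration, the reordering agreed on above would charge the socket
only once skb_to_sgvec() can no longer fail, roughly as follows (a sketch
based on this thread, not necessarily the exact follow-up patch):

	if (!sk_rmem_schedule(sk, skb, skb->len)) {
		kfree(r);
		return -EAGAIN;
	}

	sg_init_table(r->sg_data, MAX_SKB_FRAGS);
	num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
	if (unlikely(num_sg < 0)) {
		/* No sk_mem_charge() has happened yet, so there is
		 * nothing to unwind on this error path. */
		kfree(r);
		return num_sg;
	}
	/* Account the receive memory only on the success path. */
	sk_mem_charge(sk, skb->len);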
diff --git a/include/linux/filter.h b/include/linux/filter.h
index d0e207f..7de778a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -521,6 +521,7 @@ struct sk_msg_buff {
 	__u32 key;
 	__u32 flags;
 	struct bpf_map *map;
+	struct sk_buff *skb;
 	struct list_head list;
 };
 
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index b30bada..b52e1c4 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -785,7 +785,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			i++;
 			if (i == MAX_SKB_FRAGS)
 				i = 0;
-			put_page(page);
+			if (!md->skb)
+				put_page(page);
 		}
 		if (copied == len)
 			break;
@@ -794,6 +795,8 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 
 		if (!sg->length && md->sg_start == md->sg_end) {
 			list_del(&md->list);
+			if (md->skb)
+				consume_skb(md->skb);
 			kfree(md);
 		}
 	}
@@ -1045,27 +1048,72 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 		__SK_DROP;
 }
 
+static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
+{
+	struct sock *sk = psock->sock;
+	int copied = 0, num_sg;
+	struct sk_msg_buff *r;
+
+	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
+	if (unlikely(!r))
+		return -EAGAIN;
+
+	if (!sk_rmem_schedule(sk, skb, skb->len)) {
+		kfree(r);
+		return -EAGAIN;
+	}
+	sk_mem_charge(sk, skb->len);
+
+	sg_init_table(r->sg_data, MAX_SKB_FRAGS);
+	num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
+	if (unlikely(num_sg < 0)) {
+		kfree(r);
+		return num_sg;
+	}
+	copied = skb->len;
+	r->sg_start = 0;
+	r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg;
+	r->skb = skb;
+	list_add_tail(&r->list, &psock->ingress);
+	sk->sk_data_ready(sk);
+	return copied;
+}
+
 static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
 {
+	struct smap_psock *peer;
 	struct sock *sk;
+	__u32 in;
 	int rc;
 
 	rc = smap_verdict_func(psock, skb);
 	switch (rc) {
 	case __SK_REDIRECT:
 		sk = do_sk_redirect_map(skb);
-		if (likely(sk)) {
-			struct smap_psock *peer = smap_psock_sk(sk);
-
-			if (likely(peer &&
-				   test_bit(SMAP_TX_RUNNING, &peer->state) &&
-				   !sock_flag(sk, SOCK_DEAD) &&
-				   sock_writeable(sk))) {
-				skb_set_owner_w(skb, sk);
-				skb_queue_tail(&peer->rxqueue, skb);
-				schedule_work(&peer->tx_work);
-				break;
-			}
+		if (!sk) {
+			kfree_skb(skb);
+			break;
+		}
+
+		peer = smap_psock_sk(sk);
+		in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
+
+		if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) ||
+			     !test_bit(SMAP_TX_RUNNING, &peer->state))) {
+			kfree_skb(skb);
+			break;
+		}
+
+		if (!in && sock_writeable(sk)) {
+			skb_set_owner_w(skb, sk);
+			skb_queue_tail(&peer->rxqueue, skb);
+			schedule_work(&peer->tx_work);
+			break;
+		} else if (in &&
+			   atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
+			skb_queue_tail(&peer->rxqueue, skb);
+			schedule_work(&peer->tx_work);
+			break;
 		}
 		/* Fall through and free skb otherwise */
 	case __SK_DROP:
@@ -1127,15 +1175,23 @@ static void smap_tx_work(struct work_struct *w)
 	}
 
 	while ((skb = skb_dequeue(&psock->rxqueue))) {
+		__u32 flags;
+
 		rem = skb->len;
 		off = 0;
 start:
+		flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
 		do {
-			if (likely(psock->sock->sk_socket))
-				n = skb_send_sock_locked(psock->sock,
-							 skb, off, rem);
-			else
+			if (likely(psock->sock->sk_socket)) {
+				if (flags)
+					n = smap_do_ingress(psock, skb);
+				else
+					n = skb_send_sock_locked(psock->sock,
+								 skb, off, rem);
+			} else {
 				n = -EINVAL;
+			}
+
 			if (n <= 0) {
 				if (n == -EAGAIN) {
 					/* Retry when space is available */
@@ -1153,7 +1209,9 @@ static void smap_tx_work(struct work_struct *w)
 			rem -= n;
 			off += n;
 		} while (rem);
-		kfree_skb(skb);
+
+		if (!flags)
+			kfree_skb(skb);
 	}
 out:
 	release_sock(psock->sock);
diff --git a/net/core/filter.c b/net/core/filter.c
index 11b1f16..b46916d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1855,7 +1855,7 @@ int skb_do_redirect(struct sk_buff *skb)
 	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 
 	/* If user passes invalid input drop the packet. */
-	if (unlikely(flags))
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
 		return SK_DROP;
 
 	tcb->bpf.key = key;
Add support for the BPF_F_INGRESS flag in skb redirect helper. To
do this convert skb into a scatterlist and push into ingress queue.
This is the same logic that is used in the sk_msg redirect helper
so it should feel familiar.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/linux/filter.h |    1 +
 kernel/bpf/sockmap.c   |   94 +++++++++++++++++++++++++++++++++++++++---------
 net/core/filter.c      |    2 +
 3 files changed, 78 insertions(+), 19 deletions(-)
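
For context, a minimal sk_skb verdict program exercising the new flag
might look like the sketch below; the map name, key, section name, and
include paths are illustrative only and are not part of this patch:

	/* Hypothetical verdict program: redirect every skb to the socket
	 * stored at index 0 of sock_map, delivering it to that socket's
	 * ingress queue instead of queueing it for transmit. */
	#include <uapi/linux/bpf.h>
	#include "bpf_helpers.h"

	struct bpf_map_def SEC("maps") sock_map = {
		.type		= BPF_MAP_TYPE_SOCKMAP,
		.key_size	= sizeof(int),
		.value_size	= sizeof(int),
		.max_entries	= 1,
	};

	SEC("sk_skb/verdict")
	int bpf_redirect_ingress(struct __sk_buff *skb)
	{
		/* Passing 0 instead of BPF_F_INGRESS would queue the skb
		 * on the peer socket's transmit path as before. */
		return bpf_sk_redirect_map(skb, &sock_map, 0, BPF_F_INGRESS);
	}

	char _license[] SEC("license") = "GPL";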