Message ID | 20190214060939.101851-1-posk@google.com |
---|---|
State | Changes Requested |
Delegated to: | BPF Maintainers |
Headers | show |
Series | [bpf-next] bpf: fix memory leak in bpf_lwt_xmit_reroute | expand |
On 2/13/19 11:09 PM, Peter Oskolkov wrote: > On error the skb should be freed. Tested with diff/steps > provided by David Ahern. > > Reported-by: David Ahern <dsahern@gmail.com> > Fixes: 3bd0b15281af ("bpf: add handling of BPF_LWT_REROUTE to lwt_bpf.c") > Signed-off-by: Peter Oskolkov <posk@google.com> > --- > net/core/lwt_bpf.c | 24 ++++++++++++++++-------- > 1 file changed, 16 insertions(+), 8 deletions(-) > > diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c > index 32251f3fcda0..f3273cbb6b22 100644 > --- a/net/core/lwt_bpf.c > +++ b/net/core/lwt_bpf.c > @@ -179,18 +179,19 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) > struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); > int oif = l3mdev ? l3mdev->ifindex : 0; > struct dst_entry *dst = NULL; > + int err = -EAFNOSUPPORT; > struct sock *sk; > struct net *net; > bool ipv4; > - int err; > > if (skb->protocol == htons(ETH_P_IP)) > ipv4 = true; > else if (skb->protocol == htons(ETH_P_IPV6)) > ipv4 = false; > else > - return -EAFNOSUPPORT; > + goto err; > > + err = -EINVAL; > sk = sk_to_full_sk(skb->sk); > if (sk) { > if (sk->sk_bound_dev_if) > @@ -216,7 +217,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) > > rt = ip_route_output_key(net, &fl4); > if (IS_ERR(rt)) > - return -EINVAL; > + goto err; > dst = &rt->dst; > } else { > struct ipv6hdr *iph6 = ipv6_hdr(skb); > @@ -231,12 +232,15 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) > fl6.saddr = iph6->saddr; > > err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6); > - if (err || IS_ERR(dst)) > - return -EINVAL; > + if (err || IS_ERR(dst)) { > + err = -EINVAL; > + goto err; > + } > } > if (unlikely(dst->error)) { > dst_release(dst); > - return -EINVAL; > + err = -EINVAL; > + goto err; > } > > /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it EINVAL is a confusing return code; it is not an EINVAL problem, it is a routing problem: ... starting egress IPv4 encap test ping: sendmsg: Invalid argument FAIL: test_ping: 1 Versus returning the error from the lookup: ... starting egress IPv4 encap test ping: sendmsg: No route to host FAIL: test_ping: 1 diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index f3273cbb6b22..a1901ba319fc 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -191,7 +191,6 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) else goto err; - err = -EINVAL; sk = sk_to_full_sk(skb->sk); if (sk) { if (sk->sk_bound_dev_if) @@ -216,8 +215,10 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl4.saddr = iph->saddr; rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) + if (IS_ERR(rt)) { + err = PTR_ERR(rt); goto err; + } dst = &rt->dst; } else { struct ipv6hdr *iph6 = ipv6_hdr(skb); @@ -232,14 +233,12 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl6.saddr = iph6->saddr; err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6); - if (err || IS_ERR(dst)) { - err = -EINVAL; + if (err || IS_ERR(dst)) goto err; - } } if (unlikely(dst->error)) { dst_release(dst); - err = -EINVAL; + err = dst->error; goto err; } > @@ -246,17 +250,21 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) > */ > err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); > if (unlikely(err)) > - return err; > + goto err; > > skb_dst_drop(skb); > skb_dst_set(skb, dst); > > err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); > if (unlikely(err)) > - return err; > + goto err; > > /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ > return LWTUNNEL_XMIT_DONE; > + > +err: > + kfree_skb(skb); > + return err; > } > > static int bpf_xmit(struct sk_buff *skb) > I figured it was a leaked skb. Also, the test script needs to be updated as well with the negative tests -- ie., toggle the route from a dev/gateway to a reject (e.g.,unreachable) and back. Also, don't exit on the first failure - run all of them. Having the result line up is more user friendly. e.g., # ./fib_tests.sh Single path route test Start point TEST: IPv4 fibmatch [ OK ] TEST: IPv6 fibmatch [ OK ] Nexthop device deleted TEST: IPv4 fibmatch - no route [ OK ] TEST: IPv6 fibmatch - no route [ OK ] ...
On Thu, Feb 14, 2019 at 10:11 AM David Ahern <dsahern@gmail.com> wrote: > > On 2/13/19 11:09 PM, Peter Oskolkov wrote: > > On error the skb should be freed. Tested with diff/steps > > provided by David Ahern. > > > > Reported-by: David Ahern <dsahern@gmail.com> > > Fixes: 3bd0b15281af ("bpf: add handling of BPF_LWT_REROUTE to lwt_bpf.c") > > Signed-off-by: Peter Oskolkov <posk@google.com> > > --- > > net/core/lwt_bpf.c | 24 ++++++++++++++++-------- > > 1 file changed, 16 insertions(+), 8 deletions(-) > > > > diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c > > index 32251f3fcda0..f3273cbb6b22 100644 > > --- a/net/core/lwt_bpf.c > > +++ b/net/core/lwt_bpf.c > > @@ -179,18 +179,19 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) > > struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); > > int oif = l3mdev ? l3mdev->ifindex : 0; > > struct dst_entry *dst = NULL; > > + int err = -EAFNOSUPPORT; > > struct sock *sk; > > struct net *net; > > bool ipv4; > > - int err; > > > > if (skb->protocol == htons(ETH_P_IP)) > > ipv4 = true; > > else if (skb->protocol == htons(ETH_P_IPV6)) > > ipv4 = false; > > else > > - return -EAFNOSUPPORT; > > + goto err; > > > > + err = -EINVAL; > > sk = sk_to_full_sk(skb->sk); > > if (sk) { > > if (sk->sk_bound_dev_if) > > @@ -216,7 +217,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) > > > > rt = ip_route_output_key(net, &fl4); > > if (IS_ERR(rt)) > > - return -EINVAL; > > + goto err; > > dst = &rt->dst; > > } else { > > struct ipv6hdr *iph6 = ipv6_hdr(skb); > > @@ -231,12 +232,15 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) > > fl6.saddr = iph6->saddr; > > > > err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6); > > - if (err || IS_ERR(dst)) > > - return -EINVAL; > > + if (err || IS_ERR(dst)) { > > + err = -EINVAL; > > + goto err; > > + } > > } > > if (unlikely(dst->error)) { > > dst_release(dst); > > - return -EINVAL; > > + err = -EINVAL; > > + goto err; > > } > > > > /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it > > EINVAL is a confusing return code; it is not an EINVAL problem, it is a > routing problem: Thanks, David! Sent a v2 of the patch. > > ... > starting egress IPv4 encap test > ping: sendmsg: Invalid argument > FAIL: test_ping: 1 > > > Versus returning the error from the lookup: > ... > starting egress IPv4 encap test > ping: sendmsg: No route to host > FAIL: test_ping: 1 > > > diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c > index f3273cbb6b22..a1901ba319fc 100644 > --- a/net/core/lwt_bpf.c > +++ b/net/core/lwt_bpf.c > @@ -191,7 +191,6 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) > else > goto err; > > - err = -EINVAL; > sk = sk_to_full_sk(skb->sk); > if (sk) { > if (sk->sk_bound_dev_if) > @@ -216,8 +215,10 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) > fl4.saddr = iph->saddr; > > rt = ip_route_output_key(net, &fl4); > - if (IS_ERR(rt)) > + if (IS_ERR(rt)) { > + err = PTR_ERR(rt); > goto err; > + } > dst = &rt->dst; > } else { > struct ipv6hdr *iph6 = ipv6_hdr(skb); > @@ -232,14 +233,12 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) > fl6.saddr = iph6->saddr; > > err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6); > - if (err || IS_ERR(dst)) { > - err = -EINVAL; > + if (err || IS_ERR(dst)) > goto err; > - } > } > if (unlikely(dst->error)) { > dst_release(dst); > - err = -EINVAL; > + err = dst->error; > goto err; > } > > > > > > @@ -246,17 +250,21 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) > > */ > > err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); > > if (unlikely(err)) > > - return err; > > + goto err; > > > > skb_dst_drop(skb); > > skb_dst_set(skb, dst); > > > > err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); > > if (unlikely(err)) > > - return err; > > + goto err; > > > > /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ > > return LWTUNNEL_XMIT_DONE; > > + > > +err: > > + kfree_skb(skb); > > + return err; > > } > > > > static int bpf_xmit(struct sk_buff *skb) > > > > I figured it was a leaked skb. > > Also, the test script needs to be updated as well with the negative > tests -- ie., toggle the route from a dev/gateway to a reject > (e.g.,unreachable) and back. > > Also, don't exit on the first failure - run all of them. I'll refactor the test as you suggest here when I add VRF and GRO tests in a couple of weeks, if this is OK. > > Having the result line up is more user friendly. e.g., > > # ./fib_tests.sh > > Single path route test > Start point > TEST: IPv4 fibmatch [ OK ] > TEST: IPv6 fibmatch [ OK ] > Nexthop device deleted > TEST: IPv4 fibmatch - no route [ OK ] > TEST: IPv6 fibmatch - no route [ OK ] > ...
On 2/14/19 11:42 AM, Peter Oskolkov wrote: > I'll refactor the test as you suggest here > when I add VRF and GRO tests in a couple of weeks, if this is OK. IMO, the tests should go in with the feature, not a release later. If we are at -rc6 this week then you might get next week as well. The unreachable toggle is a fairly quick integration. GRO really should also get in the same cycle as the feature. Preferably VRF tests as well since you have the commands. The pretty printing cleanup can be done later.
On Thu, Feb 14, 2019 at 11:10 AM David Ahern <dsahern@gmail.com> wrote: > > On 2/14/19 11:42 AM, Peter Oskolkov wrote: > > I'll refactor the test as you suggest here > > when I add VRF and GRO tests in a couple of weeks, if this is OK. > > IMO, the tests should go in with the feature, not a release later. If we > are at -rc6 this week then you might get next week as well. > > The unreachable toggle is a fairly quick integration. GRO really should > also get in the same cycle as the feature. Preferably VRF tests as well > since you have the commands. OK, I'll work on the negative tests and GRO first. > > The pretty printing cleanup can be done later.
On 2/28/19 10:57 AM, Peter Oskolkov wrote: > David: I'm not sure how to test GSO (I assume we are talking about GSO > here) in > the selftest: the encapping code sets SKB_GSO_DODGY flag, and veth does > not support > dodginess: "tx-gso-robust: off [fixed]". > > If the "dodgy" flag is not set, then gso validation in dev.c passes, and > large GSO packets > happily go through; if the "dodgy" flag is set, "dodgy" GSO packets are > rejected, TCP does > segmentation, and non-GSO packets happily go through (with an mtu tweak > to the LWT tunnel). > > So I see three options: > - add a sysctl to _not_ set SKB_GSO_DODGY flag in lwt_bpf.c => > handle_gso_type(); > - change veth to accept "dodgy" GSO packets > - test the code "as is", meaning that GSO will be tried and disabled by > TCP stack > > Which approach would you prefer? > definitely not a sysctl. After that, I don't have a suggestion for GSO at the moment.
On Fri, Mar 1, 2019 at 9:27 PM David Ahern <dsahern@gmail.com> wrote: > > On 2/28/19 10:57 AM, Peter Oskolkov wrote: > > David: I'm not sure how to test GSO (I assume we are talking about GSO > > here) in > > the selftest: the encapping code sets SKB_GSO_DODGY flag, and veth does > > not support > > dodginess: "tx-gso-robust: off [fixed]". > > > > If the "dodgy" flag is not set, then gso validation in dev.c passes, and > > large GSO packets > > happily go through; if the "dodgy" flag is set, "dodgy" GSO packets are > > rejected, TCP does > > segmentation, and non-GSO packets happily go through (with an mtu tweak > > to the LWT tunnel). Very few devices unconditionally accept dodgy packets (only veth?). A device that lacks the robust gso feature will cause a gso packet with dodgy flag to enter software gso instead of passing to device segmentation offload. That should be perfect for checking that the packets can be segmented correctly with the new header. If the gso layer drops the packets, that is not due to dropping all dodgy sources. It will be dropped somewhere else inside gso, indication that something is not as expected with the packet. > > So I see three options: > > - add a sysctl to _not_ set SKB_GSO_DODGY flag in lwt_bpf.c => > > handle_gso_type(); > > - change veth to accept "dodgy" GSO packets Neither, as these would bypass segmentation offload and pass the large packet to the receive path. It is more interesting to validate the packet in gso. > > - test the code "as is", meaning that GSO will be tried and disabled by > > TCP stack > > > > Which approach would you prefer? > > > > definitely not a sysctl. > > After that, I don't have a suggestion for GSO at the moment.
On Sun, Mar 3, 2019 at 9:55 PM Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > > On Fri, Mar 1, 2019 at 9:27 PM David Ahern <dsahern@gmail.com> wrote: > > > > On 2/28/19 10:57 AM, Peter Oskolkov wrote: > > > David: I'm not sure how to test GSO (I assume we are talking about GSO > > > here) in > > > the selftest: the encapping code sets SKB_GSO_DODGY flag, and veth does > > > not support > > > dodginess: "tx-gso-robust: off [fixed]". > > > > > > If the "dodgy" flag is not set, then gso validation in dev.c passes, and > > > large GSO packets > > > happily go through; if the "dodgy" flag is set, "dodgy" GSO packets are > > > rejected, TCP does > > > segmentation, and non-GSO packets happily go through (with an mtu tweak > > > to the LWT tunnel). > > Very few devices unconditionally accept dodgy packets (only veth?). virtio-net, I meant. But there are a few other virtual devices, like macvlan and xen-netfront > A device that lacks the robust gso feature will cause a gso packet > with dodgy flag to enter software gso instead of passing to device > segmentation offload. > > That should be perfect for checking that the packets can be segmented > correctly with the new header. > > If the gso layer drops the packets, that is not due to dropping all > dodgy sources. It will be dropped somewhere else inside gso, > indication that something is not as expected with the packet. > > > > So I see three options: > > > - add a sysctl to _not_ set SKB_GSO_DODGY flag in lwt_bpf.c => > > > handle_gso_type(); > > > - change veth to accept "dodgy" GSO packets > > Neither, as these would bypass segmentation offload and pass the large > packet to the receive path. It is more interesting to validate the > packet in gso. That said, one solution would be to make veth gso_robust configurable through ethtool, by advertising it in hw_features. > > > - test the code "as is", meaning that GSO will be tried and disabled by > > > TCP stack > > > > > > Which approach would you prefer? > > > > > > > definitely not a sysctl. > > > > After that, I don't have a suggestion for GSO at the moment.
On Sun, Mar 3, 2019 at 6:55 PM Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > > On Fri, Mar 1, 2019 at 9:27 PM David Ahern <dsahern@gmail.com> wrote: > > > > On 2/28/19 10:57 AM, Peter Oskolkov wrote: > > > David: I'm not sure how to test GSO (I assume we are talking about GSO > > > here) in > > > the selftest: the encapping code sets SKB_GSO_DODGY flag, and veth does > > > not support > > > dodginess: "tx-gso-robust: off [fixed]". > > > > > > If the "dodgy" flag is not set, then gso validation in dev.c passes, and > > > large GSO packets > > > happily go through; if the "dodgy" flag is set, "dodgy" GSO packets are > > > rejected, TCP does > > > segmentation, and non-GSO packets happily go through (with an mtu tweak > > > to the LWT tunnel). > > Very few devices unconditionally accept dodgy packets (only veth?). > > A device that lacks the robust gso feature will cause a gso packet > with dodgy flag to enter software gso instead of passing to device > segmentation offload. > > That should be perfect for checking that the packets can be segmented > correctly with the new header. > > If the gso layer drops the packets, that is not due to dropping all > dodgy sources. It will be dropped somewhere else inside gso, > indication that something is not as expected with the packet. > > > > So I see three options: > > > - add a sysctl to _not_ set SKB_GSO_DODGY flag in lwt_bpf.c => > > > handle_gso_type(); > > > - change veth to accept "dodgy" GSO packets > > Neither, as these would bypass segmentation offload and pass the large > packet to the receive path. It is more interesting to validate the > packet in gso. I found the problem: skb->inner_protocol was not set, so software GSO fallback failed. I have a patch that fixes the issue: IPIP+GRE+TCP gso works! net-next is closed though... Will have to wait for net-next to reopen. > > > > - test the code "as is", meaning that GSO will be tried and disabled by > > > TCP stack > > > > > > Which approach would you prefer? > > > > > > > definitely not a sysctl. > > > > After that, I don't have a suggestion for GSO at the moment.
On 3/4/19 1:39 PM, Peter Oskolkov wrote: > I found the problem: skb->inner_protocol was not set, so software GSO > fallback failed. I have a patch that fixes the issue: IPIP+GRE+TCP > gso works! net-next is closed though... Will have to wait for net-next > to reopen. That's a bug fix. I suggest sending now.
On Mon, Mar 4, 2019 at 1:03 PM David Ahern <dsahern@gmail.com> wrote: > > On 3/4/19 1:39 PM, Peter Oskolkov wrote: > > I found the problem: skb->inner_protocol was not set, so software GSO > > fallback failed. I have a patch that fixes the issue: IPIP+GRE+TCP > > gso works! net-next is closed though... Will have to wait for net-next > > to reopen. > > That's a bug fix. I suggest sending now. I see the encap patches neither in net nor in bpf trees, only in net-next and bpf-next. And *-next trees are closed, so there is nowhere to send the fix. Am I missing something?
On 03/04/2019 02:37 PM, Peter Oskolkov wrote: > On Mon, Mar 4, 2019 at 1:03 PM David Ahern <dsahern@gmail.com> wrote: >> >> On 3/4/19 1:39 PM, Peter Oskolkov wrote: >>> I found the problem: skb->inner_protocol was not set, so software GSO >>> fallback failed. I have a patch that fixes the issue: IPIP+GRE+TCP >>> gso works! net-next is closed though... Will have to wait for net-next >>> to reopen. >> >> That's a bug fix. I suggest sending now. > > I see the encap patches neither in net nor in bpf trees, only in > net-next and bpf-next. And *-next trees are closed, so there is > nowhere to send the fix. Am I missing something? > David has not yet sent his pull request to Linus. This means your fix needs to target net-next net-next is closed for patches that are meant for linux-5.2, but 'open' for fixes (targeting 5.1)
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 32251f3fcda0..f3273cbb6b22 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -179,18 +179,19 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); int oif = l3mdev ? l3mdev->ifindex : 0; struct dst_entry *dst = NULL; + int err = -EAFNOSUPPORT; struct sock *sk; struct net *net; bool ipv4; - int err; if (skb->protocol == htons(ETH_P_IP)) ipv4 = true; else if (skb->protocol == htons(ETH_P_IPV6)) ipv4 = false; else - return -EAFNOSUPPORT; + goto err; + err = -EINVAL; sk = sk_to_full_sk(skb->sk); if (sk) { if (sk->sk_bound_dev_if) @@ -216,7 +217,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) - return -EINVAL; + goto err; dst = &rt->dst; } else { struct ipv6hdr *iph6 = ipv6_hdr(skb); @@ -231,12 +232,15 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl6.saddr = iph6->saddr; err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6); - if (err || IS_ERR(dst)) - return -EINVAL; + if (err || IS_ERR(dst)) { + err = -EINVAL; + goto err; + } } if (unlikely(dst->error)) { dst_release(dst); - return -EINVAL; + err = -EINVAL; + goto err; } /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it @@ -246,17 +250,21 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) */ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) - return err; + goto err; skb_dst_drop(skb); skb_dst_set(skb, dst); err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(err)) - return err; + goto err; /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ return LWTUNNEL_XMIT_DONE; + +err: + kfree_skb(skb); + return err; } static int bpf_xmit(struct sk_buff *skb)
On error the skb should be freed. Tested with diff/steps provided by David Ahern. Reported-by: David Ahern <dsahern@gmail.com> Fixes: 3bd0b15281af ("bpf: add handling of BPF_LWT_REROUTE to lwt_bpf.c") Signed-off-by: Peter Oskolkov <posk@google.com> --- net/core/lwt_bpf.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-)