Message ID | 20190523175429.13302-3-sthemmin@microsoft.com |
---|---|
State | Changes Requested |
Delegated to: | David Miller |
Headers | show |
Series | XDP generic fixes | expand |
On Thu, 2019-05-23 at 10:54 -0700, Stephen Hemminger wrote: > When a device is stacked like (team, bonding, failsafe or netvsc) the > XDP generic program for the parent device was not called. > > Move the call to XDP generic inside __netif_receive_skb_core where > it can be done multiple times for stacked case. > > Suggested-by: Jiri Pirko <jiri@resnulli.us> > Fixes: d445516966dc ("net: xdp: support xdp generic on virtual > devices") > Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com> > --- > v1 - call xdp_generic in netvsc handler > v2 - do xdp_generic in generic rx handler processing > v3 - move xdp_generic call inside the another pass loop > > net/core/dev.c | 56 ++++++++++------------------------------------ > ---- > 1 file changed, 11 insertions(+), 45 deletions(-) > > diff --git a/net/core/dev.c b/net/core/dev.c > index b6b8505cfb3e..696776e14d00 100644 > --- a/net/core/dev.c > +++ b/net/core/dev.c > @@ -4502,23 +4502,6 @@ static int netif_rx_internal(struct sk_buff > *skb) > > trace_netif_rx(skb); > > - if (static_branch_unlikely(&generic_xdp_needed_key)) { > - int ret; > - > - preempt_disable(); > - rcu_read_lock(); > - ret = do_xdp_generic(rcu_dereference(skb->dev- > >xdp_prog), skb); > - rcu_read_unlock(); > - preempt_enable(); > - > - /* Consider XDP consuming the packet a success from > - * the netdev point of view we do not want to count > - * this as an error. > - */ > - if (ret != XDP_PASS) > - return NET_RX_SUCCESS; > - } > - Adding Jesper, There is a small behavioral change due to this patch, the XDP program after this patch will run on the RPS CPU, if configured, which could cause some behavioral changes in xdp_redirect_cpu: bpf_redirect_map(cpu_map). Maybe this is acceptable, but it should be documented, as the current assumption dictates: XDP program runs on the core where the XDP frame/SKB was first seen. 
> #ifdef CONFIG_RPS > if (static_branch_unlikely(&rps_needed)) { > struct rps_dev_flow voidflow, *rflow = &voidflow; > @@ -4858,6 +4841,17 @@ static int __netif_receive_skb_core(struct > sk_buff *skb, bool pfmemalloc, > > __this_cpu_inc(softnet_data.processed); > > + if (static_branch_unlikely(&generic_xdp_needed_key)) { > + int ret2; > + > + preempt_disable(); > + ret2 = do_xdp_generic(rcu_dereference(skb->dev- > >xdp_prog), skb); > + preempt_enable(); > + > + if (ret2 != XDP_PASS) > + return NET_RX_DROP; > + } > + > if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || > skb->protocol == cpu_to_be16(ETH_P_8021AD)) { > skb = skb_vlan_untag(skb); > @@ -5178,19 +5172,6 @@ static int netif_receive_skb_internal(struct > sk_buff *skb) > if (skb_defer_rx_timestamp(skb)) > return NET_RX_SUCCESS; > > - if (static_branch_unlikely(&generic_xdp_needed_key)) { > - int ret; > - > - preempt_disable(); > - rcu_read_lock(); > - ret = do_xdp_generic(rcu_dereference(skb->dev- > >xdp_prog), skb); > - rcu_read_unlock(); > - preempt_enable(); > - > - if (ret != XDP_PASS) > - return NET_RX_DROP; > - } > - > rcu_read_lock(); > #ifdef CONFIG_RPS > if (static_branch_unlikely(&rps_needed)) { > @@ -5224,21 +5205,6 @@ static void > netif_receive_skb_list_internal(struct list_head *head) > } > list_splice_init(&sublist, head); > > - if (static_branch_unlikely(&generic_xdp_needed_key)) { > - preempt_disable(); > - rcu_read_lock(); > - list_for_each_entry_safe(skb, next, head, list) { > - xdp_prog = rcu_dereference(skb->dev->xdp_prog); > - skb_list_del_init(skb); > - if (do_xdp_generic(xdp_prog, skb) == XDP_PASS) > - list_add_tail(&skb->list, &sublist); > - } > - rcu_read_unlock(); > - preempt_enable(); > - /* Put passed packets back on main list */ > - list_splice_init(&sublist, head); > - } > - > rcu_read_lock(); > #ifdef CONFIG_RPS > if (static_branch_unlikely(&rps_needed)) {
On Thu, 23 May 2019 19:19:40 +0000 Saeed Mahameed <saeedm@mellanox.com> wrote: > On Thu, 2019-05-23 at 10:54 -0700, Stephen Hemminger wrote: > > When a device is stacked like (team, bonding, failsafe or netvsc) the > > XDP generic program for the parent device was not called. > > > > Move the call to XDP generic inside __netif_receive_skb_core where > > it can be done multiple times for stacked case. > > > > Suggested-by: Jiri Pirko <jiri@resnulli.us> > > Fixes: d445516966dc ("net: xdp: support xdp generic on virtual > > devices") > > Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com> > > --- > > v1 - call xdp_generic in netvsc handler > > v2 - do xdp_generic in generic rx handler processing > > v3 - move xdp_generic call inside the another pass loop > > > > net/core/dev.c | 56 ++++++++++------------------------------------ > > ---- > > 1 file changed, 11 insertions(+), 45 deletions(-) > > > > diff --git a/net/core/dev.c b/net/core/dev.c > > index b6b8505cfb3e..696776e14d00 100644 > > --- a/net/core/dev.c > > +++ b/net/core/dev.c > > @@ -4502,23 +4502,6 @@ static int netif_rx_internal(struct sk_buff > > *skb) > > > > trace_netif_rx(skb); > > > > - if (static_branch_unlikely(&generic_xdp_needed_key)) { > > - int ret; > > - > > - preempt_disable(); > > - rcu_read_lock(); > > - ret = do_xdp_generic(rcu_dereference(skb->dev- > > >xdp_prog), skb); > > - rcu_read_unlock(); > > - preempt_enable(); > > - > > - /* Consider XDP consuming the packet a success from > > - * the netdev point of view we do not want to count > > - * this as an error. > > - */ > > - if (ret != XDP_PASS) > > - return NET_RX_SUCCESS; > > - } > > - > > Adding Jesper, > > There is a small behavioral change due to this patch, > the XDP program after this patch will run on the RPS CPU, if > configured, which could cause some behavioral changes in > xdp_redirect_cpu: bpf_redirect_map(cpu_map). 
> > Maybe this is acceptable, but it should be documented, as the current > assumption dictates: XDP program runs on the core where the XDP > frame/SKB was first seen. Or maybe XDP should just force off RPS (like it does gro)
On 2019/5/24 上午3:19, Saeed Mahameed wrote: > On Thu, 2019-05-23 at 10:54 -0700, Stephen Hemminger wrote: >> When a device is stacked like (team, bonding, failsafe or netvsc) the >> XDP generic program for the parent device was not called. >> >> Move the call to XDP generic inside __netif_receive_skb_core where >> it can be done multiple times for stacked case. >> >> Suggested-by: Jiri Pirko <jiri@resnulli.us> >> Fixes: d445516966dc ("net: xdp: support xdp generic on virtual >> devices") >> Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com> >> --- >> v1 - call xdp_generic in netvsc handler >> v2 - do xdp_generic in generic rx handler processing >> v3 - move xdp_generic call inside the another pass loop >> >> net/core/dev.c | 56 ++++++++++------------------------------------ >> ---- >> 1 file changed, 11 insertions(+), 45 deletions(-) >> >> diff --git a/net/core/dev.c b/net/core/dev.c >> index b6b8505cfb3e..696776e14d00 100644 >> --- a/net/core/dev.c >> +++ b/net/core/dev.c >> @@ -4502,23 +4502,6 @@ static int netif_rx_internal(struct sk_buff >> *skb) >> >> trace_netif_rx(skb); >> >> - if (static_branch_unlikely(&generic_xdp_needed_key)) { >> - int ret; >> - >> - preempt_disable(); >> - rcu_read_lock(); >> - ret = do_xdp_generic(rcu_dereference(skb->dev- >>> xdp_prog), skb); >> - rcu_read_unlock(); >> - preempt_enable(); >> - >> - /* Consider XDP consuming the packet a success from >> - * the netdev point of view we do not want to count >> - * this as an error. >> - */ >> - if (ret != XDP_PASS) >> - return NET_RX_SUCCESS; >> - } >> - > Adding Jesper, > > There is a small behavioral change due to this patch, At least not for current code. 
We don't support skb in cpumap (though the fix should not be hard), in xdp_do_generic_redirect_map() we had: } else { /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; goto err; } > the XDP program after this patch will run on the RPS CPU, if > configured, which could cause some behavioral changes in > xdp_redirect_cpu: bpf_redirect_map(cpu_map). > > Maybe this is acceptable, but it should be documented, as the current > assumption dictates: XDP program runs on the core where the XDP > frame/SKB was first seen. At least for TUN, this is not true. XDP frames were built by vhost_net and passed to TUN. There's no guarantee that vhost_net kthread won't move to another core. Thanks
On Thu, 23 May 2019 13:15:44 -0700 Stephen Hemminger <stephen@networkplumber.org> wrote: > On Thu, 23 May 2019 19:19:40 +0000 > Saeed Mahameed <saeedm@mellanox.com> wrote: > > > On Thu, 2019-05-23 at 10:54 -0700, Stephen Hemminger wrote: > > > When a device is stacked like (team, bonding, failsafe or netvsc) the > > > XDP generic program for the parent device was not called. > > > > > > Move the call to XDP generic inside __netif_receive_skb_core where > > > it can be done multiple times for stacked case. > > > > > > Suggested-by: Jiri Pirko <jiri@resnulli.us> > > > Fixes: d445516966dc ("net: xdp: support xdp generic on virtual > > > devices") > > > Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com> > > > --- > > > v1 - call xdp_generic in netvsc handler > > > v2 - do xdp_generic in generic rx handler processing > > > v3 - move xdp_generic call inside the another pass loop > > > > > > net/core/dev.c | 56 ++++++++++------------------------------------ > > > ---- > > > 1 file changed, 11 insertions(+), 45 deletions(-) > > > > > > diff --git a/net/core/dev.c b/net/core/dev.c > > > index b6b8505cfb3e..696776e14d00 100644 > > > --- a/net/core/dev.c > > > +++ b/net/core/dev.c > > > @@ -4502,23 +4502,6 @@ static int netif_rx_internal(struct sk_buff > > > *skb) > > > > > > trace_netif_rx(skb); > > > > > > - if (static_branch_unlikely(&generic_xdp_needed_key)) { > > > - int ret; > > > - > > > - preempt_disable(); > > > - rcu_read_lock(); > > > - ret = do_xdp_generic(rcu_dereference(skb->dev- > > > >xdp_prog), skb); > > > - rcu_read_unlock(); > > > - preempt_enable(); > > > - > > > - /* Consider XDP consuming the packet a success from > > > - * the netdev point of view we do not want to count > > > - * this as an error. 
> > > - */ > > > - if (ret != XDP_PASS) > > > - return NET_RX_SUCCESS; > > > - } > > > - > > > > Adding Jesper, > > > > There is a small behavioral change due to this patch, > > the XDP program after this patch will run on the RPS CPU, if > > configured, which could cause some behavioral changes in > > xdp_redirect_cpu: bpf_redirect_map(cpu_map). > > > > Maybe this is acceptable, but it should be documented, as the current > > assumption dictates: XDP program runs on the core where the XDP > > frame/SKB was first seen. This does break some assumptions, that I worry about. I've not optimized generic XDP much, as this is suppose to be slow-path, but as you can see in my evaluation[1] generic-XDP do have a performance potential (XDP drop: native=12Mpps and generic=8.4Mpps), but the XDP-generic performance dies as soon as we e.g. do XDP_TX (native=10Mpps and generic=4Mpps). The reason is lack of bulking. We could implement the same kind of RX-bulking tricks as we do for XDP_REDIRECT, where bpf_redirect_map store frames in the map and send them once NAPI-poll exit via a xdp_do_flush_map(). These tricks depend on per-CPU data (bpf_redirect_info), thus I cannot see how this could work if XDP-generic now happens after RPS on a remote CPU. Notice, that TX bulking at XDP-generic level, is actually rather simple, as netstack TX path-code support xmit_more via creating a list of SKBs... Last time I hacked it up, I saw 20%-30% speedup... anyone motivated to do this? > > Or maybe XDP should just force off RPS (like it does gro) I guess, we could force off RPS. But I do see one valid use-case of combining CPUMAP redirect with RFS (Receive Flow Steering) which is part of RPS. Yes, I know we/I *also* have to implement generic-XDP-cpumap support. But for native-XDP CPUMAP redirect, from 1-2 RX-CPUs into N-remote CPUs via CPUMAP, and then lets RFS send SKBs to where the application runs, does make sense to me. (I do have an "assignment" to implement this in eBPF here[2]). 
[1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/Documentation/blogposts/xdp25_eval_generic_xdp_tx.rst [2] https://github.com/xdp-project/xdp-project/blob/master/areas/cpumap.org#cpumap-implement-dynamic-load-balancer-that-is-ooo-safe
On Fri, 24 May 2019 12:17:27 +0800 Jason Wang <jasowang@redhat.com> wrote: > > Maybe this is acceptable, but it should be documented, as the current > > assumption dictates: XDP program runs on the core where the XDP > > frame/SKB was first seen. > > > At lest for TUN, this is not true. XDP frames were built by vhost_net > and passed to TUN. There's no guarantee that vhost_net kthread won't > move to another core. This sounds a little scary, as we depend on per-CPU variables (e.g. bpf_redirect_info). Can the vhost_net kthread move between CPUs within/during the NAPI-poll?
On 2019/5/24 下午6:07, Jesper Dangaard Brouer wrote: > On Fri, 24 May 2019 12:17:27 +0800 > Jason Wang <jasowang@redhat.com> wrote: > >>> Maybe this is acceptable, but it should be documented, as the current >>> assumption dictates: XDP program runs on the core where the XDP >>> frame/SKB was first seen. >> >> At lest for TUN, this is not true. XDP frames were built by vhost_net >> and passed to TUN. There's no guarantee that vhost_net kthread won't >> move to another core. > This sound a little scary, as we depend on per-CPU variables (e.g. > bpf_redirect_info). Can the vhost_net kthread move between CPUs > within/during the NAPI-poll? The RX of TUN will not go for NAPI usually. What we did is: 1) Vhost kthread prepares an array of XDP frames and pass them to TUN through sendmsg 2) TUN will disable bh and run XDP for each frames then enable bh So kthread can move to another CPU before 2) but we guarantee that the per-CPU dependency of XDP work in 2). TUN indeed has a NAPI mode which is mainly used for hardening, and XDP was not implemented on that path (this could be fixed in the future). Thanks
On 2019/5/24 下午5:33, Jesper Dangaard Brouer wrote: > On Thu, 23 May 2019 13:15:44 -0700 > Stephen Hemminger <stephen@networkplumber.org> wrote: > >> On Thu, 23 May 2019 19:19:40 +0000 >> Saeed Mahameed <saeedm@mellanox.com> wrote: >> >>> On Thu, 2019-05-23 at 10:54 -0700, Stephen Hemminger wrote: >>>> When a device is stacked like (team, bonding, failsafe or netvsc) the >>>> XDP generic program for the parent device was not called. >>>> >>>> Move the call to XDP generic inside __netif_receive_skb_core where >>>> it can be done multiple times for stacked case. >>>> >>>> Suggested-by: Jiri Pirko <jiri@resnulli.us> >>>> Fixes: d445516966dc ("net: xdp: support xdp generic on virtual >>>> devices") >>>> Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com> >>>> --- >>>> v1 - call xdp_generic in netvsc handler >>>> v2 - do xdp_generic in generic rx handler processing >>>> v3 - move xdp_generic call inside the another pass loop >>>> >>>> net/core/dev.c | 56 ++++++++++------------------------------------ >>>> ---- >>>> 1 file changed, 11 insertions(+), 45 deletions(-) >>>> >>>> diff --git a/net/core/dev.c b/net/core/dev.c >>>> index b6b8505cfb3e..696776e14d00 100644 >>>> --- a/net/core/dev.c >>>> +++ b/net/core/dev.c >>>> @@ -4502,23 +4502,6 @@ static int netif_rx_internal(struct sk_buff >>>> *skb) >>>> >>>> trace_netif_rx(skb); >>>> >>>> - if (static_branch_unlikely(&generic_xdp_needed_key)) { >>>> - int ret; >>>> - >>>> - preempt_disable(); >>>> - rcu_read_lock(); >>>> - ret = do_xdp_generic(rcu_dereference(skb->dev- >>>>> xdp_prog), skb); >>>> - rcu_read_unlock(); >>>> - preempt_enable(); >>>> - >>>> - /* Consider XDP consuming the packet a success from >>>> - * the netdev point of view we do not want to count >>>> - * this as an error. 
>>>> - */ >>>> - if (ret != XDP_PASS) >>>> - return NET_RX_SUCCESS; >>>> - } >>>> - >>> Adding Jesper, >>> >>> There is a small behavioral change due to this patch, >>> the XDP program after this patch will run on the RPS CPU, if >>> configured, which could cause some behavioral changes in >>> xdp_redirect_cpu: bpf_redirect_map(cpu_map). >>> >>> Maybe this is acceptable, but it should be documented, as the current >>> assumption dictates: XDP program runs on the core where the XDP >>> frame/SKB was first seen. > This does break some assumptions, that I worry about. I've not > optimized generic XDP much, as this is suppose to be slow-path, but as > you can see in my evaluation[1] generic-XDP do have a performance > potential (XDP drop: native=12Mpps and generic=8.4Mpps), but the > XDP-generic performance dies as soon as we e.g. do XDP_TX > (native=10Mpps and generic=4Mpps). The reason is lack of bulking. > > We could implement the same kind of RX-bulking tricks as we do for > XDP_REDIRECT, where bpf_redirect_map store frames in the map and send > them once NAPI-poll exit via a xdp_do_flush_map(). These tricks > depend on per-CPU data (bpf_redirect_info), thus I cannot see how this > could work if XDP-generic now happens after RPS on a remote CPU. RPS uses backlog NAPI so per-CPU is probably not an issue. Thanks > > Notice, that TX bulking at XDP-generic level, is actually rather > simple, as netstack TX path-code support xmit_more via creating a list > of SKBs... Last time I hacked it up, I saw 20%-30% speedup... anyone > motivated to do this? > >> Or maybe XDP should just force off RPS (like it does gro) > I guess, we could force off RPS. But I do see one valid use-case of > combining CPUMAP redirect with RFS (Receive Flow Steering) which is part > of RPS. Yes, I know we/I *also* have to implement generic-XDP-cpumap > support. 
But for native-XDP CPUMAP redirect, from 1-2 RX-CPUs into > N-remote CPUs via CPUMAP, and then lets RFS send SKBs to where the > application runs, does make sense to me. (I do have an "assignment" to > implement this in eBPF here[2]). > > > [1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/Documentation/blogposts/xdp25_eval_generic_xdp_tx.rst > > [2] https://github.com/xdp-project/xdp-project/blob/master/areas/cpumap.org#cpumap-implement-dynamic-load-balancer-that-is-ooo-safe
From: Stephen Hemminger <stephen@networkplumber.org> Date: Thu, 23 May 2019 10:54:29 -0700 > @@ -4858,6 +4841,17 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc, > > __this_cpu_inc(softnet_data.processed); > > + if (static_branch_unlikely(&generic_xdp_needed_key)) { > + int ret2; > + > + preempt_disable(); > + ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); > + preempt_enable(); > + > + if (ret2 != XDP_PASS) > + return NET_RX_DROP; > + } This function just did a skb_reset_mac_len(). do_xdp_generic() can modify skb->mac_header. This means we have to redo the skb_reset_mac_len() to handle the potentially changed value.
diff --git a/net/core/dev.c b/net/core/dev.c index b6b8505cfb3e..696776e14d00 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4502,23 +4502,6 @@ static int netif_rx_internal(struct sk_buff *skb) trace_netif_rx(skb); - if (static_branch_unlikely(&generic_xdp_needed_key)) { - int ret; - - preempt_disable(); - rcu_read_lock(); - ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); - rcu_read_unlock(); - preempt_enable(); - - /* Consider XDP consuming the packet a success from - * the netdev point of view we do not want to count - * this as an error. - */ - if (ret != XDP_PASS) - return NET_RX_SUCCESS; - } - #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -4858,6 +4841,17 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc, __this_cpu_inc(softnet_data.processed); + if (static_branch_unlikely(&generic_xdp_needed_key)) { + int ret2; + + preempt_disable(); + ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + preempt_enable(); + + if (ret2 != XDP_PASS) + return NET_RX_DROP; + } + if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || skb->protocol == cpu_to_be16(ETH_P_8021AD)) { skb = skb_vlan_untag(skb); @@ -5178,19 +5172,6 @@ static int netif_receive_skb_internal(struct sk_buff *skb) if (skb_defer_rx_timestamp(skb)) return NET_RX_SUCCESS; - if (static_branch_unlikely(&generic_xdp_needed_key)) { - int ret; - - preempt_disable(); - rcu_read_lock(); - ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); - rcu_read_unlock(); - preempt_enable(); - - if (ret != XDP_PASS) - return NET_RX_DROP; - } - rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) { @@ -5224,21 +5205,6 @@ static void netif_receive_skb_list_internal(struct list_head *head) } list_splice_init(&sublist, head); - if (static_branch_unlikely(&generic_xdp_needed_key)) { - preempt_disable(); - rcu_read_lock(); - list_for_each_entry_safe(skb, next, head, list) { - 
xdp_prog = rcu_dereference(skb->dev->xdp_prog); - skb_list_del_init(skb); - if (do_xdp_generic(xdp_prog, skb) == XDP_PASS) - list_add_tail(&skb->list, &sublist); - } - rcu_read_unlock(); - preempt_enable(); - /* Put passed packets back on main list */ - list_splice_init(&sublist, head); - } - rcu_read_lock(); #ifdef CONFIG_RPS if (static_branch_unlikely(&rps_needed)) {
When a device is stacked like (team, bonding, failsafe or netvsc) the XDP generic program for the parent device was not called. Move the call to XDP generic inside __netif_receive_skb_core where it can be done multiple times for stacked case. Suggested-by: Jiri Pirko <jiri@resnulli.us> Fixes: d445516966dc ("net: xdp: support xdp generic on virtual devices") Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com> --- v1 - call xdp_generic in netvsc handler v2 - do xdp_generic in generic rx handler processing v3 - move xdp_generic call inside the another pass loop net/core/dev.c | 56 ++++++++++---------------------------------------- 1 file changed, 11 insertions(+), 45 deletions(-)