diff mbox series

[net-next,v2,1/2] ipv6: introduce and uses route look hints for list input

Message ID 643f2b258e275e915fa96ef0c635f9c5ff804c9d.1574071944.git.pabeni@redhat.com
State Changes Requested, archived
Headers show
Series net: introduce and use route hint | expand

Commit Message

Paolo Abeni Nov. 18, 2019, 11:01 a.m. UTC
When doing RX batch packet processing, we currently always repeat
the route lookup for each ingress packet. If policy routing is
configured, and IPV6_SUBTREES is disabled at build time, we
know that packets with the same destination address will use
the same dst.

This change tries to avoid per packet route lookup caching
the destination address of the latest successful lookup, and
reusing it for the next packet when the above conditions are
in place. Ingress traffic for most servers should fit.

The measured performance delta under UDP flood vs a recvmmsg
receiver is as follow:

vanilla		patched		delta
Kpps		Kpps		%
1431		1664		+14

In the worst-case scenario - each packet has a different
destination address - the performance delta is within noise
range.

v1 -> v2:
 - fix build issue with !CONFIG_IPV6_MULTIPLE_TABLES
 - fix potential race when fib6_has_custom_rules is set
   while processing a packet batch

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/ipv6/ip6_input.c | 40 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

Comments

Willem de Bruijn Nov. 18, 2019, 8:29 p.m. UTC | #1
On Mon, Nov 18, 2019 at 6:03 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> When doing RX batch packet processing, we currently always repeat
> the route lookup for each ingress packet. If policy routing is
> configured, and IPV6_SUBTREES is disabled at build time, we
> know that packets with the same destination address will use
> the same dst.
>
> This change tries to avoid per packet route lookup caching
> the destination address of the latest successful lookup, and
> reusing it for the next packet when the above conditions are
> in place. Ingress traffic for most servers should fit.
>
> The measured performance delta under UDP flood vs a recvmmsg
> receiver is as follow:
>
> vanilla         patched         delta
> Kpps            Kpps            %
> 1431            1664            +14

Since IPv4 speed-up is almost half and code considerably more complex,
maybe only do IPv6?

>
> In the worst-case scenario - each packet has a different
> destination address - the performance delta is within noise
> range.
>
> v1 -> v2:
>  - fix build issue with !CONFIG_IPV6_MULTIPLE_TABLES
>  - fix potential race when fib6_has_custom_rules is set
>    while processing a packet batch
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
>  net/ipv6/ip6_input.c | 40 ++++++++++++++++++++++++++++++++++++----
>  1 file changed, 36 insertions(+), 4 deletions(-)
>
> diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
> index ef7f707d9ae3..f559ad6b09ef 100644
> --- a/net/ipv6/ip6_input.c
> +++ b/net/ipv6/ip6_input.c
> @@ -44,10 +44,16 @@
>  #include <net/inet_ecn.h>
>  #include <net/dst_metadata.h>
>
> +struct ip6_route_input_hint {
> +       unsigned long   refdst;
> +       struct in6_addr daddr;
> +};
> +
>  INDIRECT_CALLABLE_DECLARE(void udp_v6_early_demux(struct sk_buff *));
>  INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *));
>  static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
> -                               struct sk_buff *skb)
> +                               struct sk_buff *skb,
> +                               struct ip6_route_input_hint *hint)
>  {
>         void (*edemux)(struct sk_buff *skb);
>
> @@ -59,7 +65,13 @@ static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
>                         INDIRECT_CALL_2(edemux, tcp_v6_early_demux,
>                                         udp_v6_early_demux, skb);
>         }
> -       if (!skb_valid_dst(skb))
> +
> +       if (skb_valid_dst(skb))
> +               return;
> +
> +       if (hint && ipv6_addr_equal(&hint->daddr, &ipv6_hdr(skb)->daddr))
> +               __skb_dst_copy(skb, hint->refdst);
> +       else
>                 ip6_route_input(skb);

Is it possible to do the address comparison in ip6_list_rcv_finish
itself and pass a pointer to refdst if safe? To avoid new struct
definition, memcpy and to have all logic in one place. Need to
keep a pointer to the prev skb, then, instead.

>  static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
>                                 struct list_head *head)
>  {
> +       struct ip6_route_input_hint _hint, *hint = NULL;
>         struct dst_entry *curr_dst = NULL;
>         struct sk_buff *skb, *next;
>         struct list_head sublist;
> @@ -104,9 +127,18 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
>                 skb = l3mdev_ip6_rcv(skb);
>                 if (!skb)
>                         continue;
> -               ip6_rcv_finish_core(net, sk, skb);
> +               ip6_rcv_finish_core(net, sk, skb, hint);
>                 dst = skb_dst(skb);
>                 if (curr_dst != dst) {
> +                       if (ip6_can_cache_route_hint(net)) {
> +                               _hint.refdst = skb->_skb_refdst;
> +                               memcpy(&_hint.daddr, &ipv6_hdr(skb)->daddr,
> +                                      sizeof(_hint.daddr));
> +                               hint = &_hint;
> +                       } else {
> +                               hint = NULL;
> +                       }

not needed. ip6_can_cache_route_hit is the same for all iterations of
the loop (indeed, compile time static), so if false, hint is never
set.




> +
>                         /* dispatch old sublist */
>                         if (!list_empty(&sublist))
>                                 ip6_sublist_rcv_finish(&sublist);
> --
> 2.21.0
>
Paolo Abeni Nov. 18, 2019, 9:58 p.m. UTC | #2
On Mon, 2019-11-18 at 15:29 -0500, Willem de Bruijn wrote:
> On Mon, Nov 18, 2019 at 6:03 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > When doing RX batch packet processing, we currently always repeat
> > the route lookup for each ingress packet. If policy routing is
> > configured, and IPV6_SUBTREES is disabled at build time, we
> > know that packets with the same destination address will use
> > the same dst.
> > 
> > This change tries to avoid per packet route lookup caching
> > the destination address of the latest successful lookup, and
> > reusing it for the next packet when the above conditions are
> > in place. Ingress traffic for most servers should fit.
> > 
> > The measured performance delta under UDP flood vs a recvmmsg
> > receiver is as follow:
> > 
> > vanilla         patched         delta
> > Kpps            Kpps            %
> > 1431            1664            +14
> 
> Since IPv4 speed-up is almost half and code considerably more complex,
> maybe only do IPv6?

uhmm... I would avoid that kind of assimmetry, and I would not look
down on a 8% speedup, if possible.

> > In the worst-case scenario - each packet has a different
> > destination address - the performance delta is within noise
> > range.
> > 
> > v1 -> v2:
> >  - fix build issue with !CONFIG_IPV6_MULTIPLE_TABLES
> >  - fix potential race when fib6_has_custom_rules is set
> >    while processing a packet batch
> > 
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> >  net/ipv6/ip6_input.c | 40 ++++++++++++++++++++++++++++++++++++----
> >  1 file changed, 36 insertions(+), 4 deletions(-)
> > 
> > diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
> > index ef7f707d9ae3..f559ad6b09ef 100644
> > --- a/net/ipv6/ip6_input.c
> > +++ b/net/ipv6/ip6_input.c
> > @@ -44,10 +44,16 @@
> >  #include <net/inet_ecn.h>
> >  #include <net/dst_metadata.h>
> > 
> > +struct ip6_route_input_hint {
> > +       unsigned long   refdst;
> > +       struct in6_addr daddr;
> > +};
> > +
> >  INDIRECT_CALLABLE_DECLARE(void udp_v6_early_demux(struct sk_buff *));
> >  INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *));
> >  static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
> > -                               struct sk_buff *skb)
> > +                               struct sk_buff *skb,
> > +                               struct ip6_route_input_hint *hint)
> >  {
> >         void (*edemux)(struct sk_buff *skb);
> > 
> > @@ -59,7 +65,13 @@ static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
> >                         INDIRECT_CALL_2(edemux, tcp_v6_early_demux,
> >                                         udp_v6_early_demux, skb);
> >         }
> > -       if (!skb_valid_dst(skb))
> > +
> > +       if (skb_valid_dst(skb))
> > +               return;
> > +
> > +       if (hint && ipv6_addr_equal(&hint->daddr, &ipv6_hdr(skb)->daddr))
> > +               __skb_dst_copy(skb, hint->refdst);
> > +       else
> >                 ip6_route_input(skb);
> 
> Is it possible to do the address comparison in ip6_list_rcv_finish
> itself and pass a pointer to refdst if safe? To avoid new struct
> definition, memcpy and to have all logic in one place. Need to
> keep a pointer to the prev skb, then, instead.

I haven't tought about that. Sounds promising. I'll try, thanks.

> >  static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
> >                                 struct list_head *head)
> >  {
> > +       struct ip6_route_input_hint _hint, *hint = NULL;
> >         struct dst_entry *curr_dst = NULL;
> >         struct sk_buff *skb, *next;
> >         struct list_head sublist;
> > @@ -104,9 +127,18 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
> >                 skb = l3mdev_ip6_rcv(skb);
> >                 if (!skb)
> >                         continue;
> > -               ip6_rcv_finish_core(net, sk, skb);
> > +               ip6_rcv_finish_core(net, sk, skb, hint);
> >                 dst = skb_dst(skb);
> >                 if (curr_dst != dst) {
> > +                       if (ip6_can_cache_route_hint(net)) {
> > +                               _hint.refdst = skb->_skb_refdst;
> > +                               memcpy(&_hint.daddr, &ipv6_hdr(skb)->daddr,
> > +                                      sizeof(_hint.daddr));
> > +                               hint = &_hint;
> > +                       } else {
> > +                               hint = NULL;
> > +                       }
> 
> not needed. ip6_can_cache_route_hit is the same for all iterations of
> the loop (indeed, compile time static), so if false, hint is never
> set.

I think this is needed, instead: if CONFIG_MULTIPLE_TABLES=y,
fib6_has_custom_rules can change at runtime - from 'false' to 'true'.
If we don't reset 'hint', we could end-up with use-after-free.

Cheers,

Paolo
Willem de Bruijn Nov. 19, 2019, 2:10 p.m. UTC | #3
On Mon, Nov 18, 2019 at 4:59 PM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Mon, 2019-11-18 at 15:29 -0500, Willem de Bruijn wrote:
> > On Mon, Nov 18, 2019 at 6:03 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > > When doing RX batch packet processing, we currently always repeat
> > > the route lookup for each ingress packet. If policy routing is
> > > configured, and IPV6_SUBTREES is disabled at build time, we
> > > know that packets with the same destination address will use
> > > the same dst.
> > >
> > > This change tries to avoid per packet route lookup caching
> > > the destination address of the latest successful lookup, and
> > > reusing it for the next packet when the above conditions are
> > > in place. Ingress traffic for most servers should fit.
> > >
> > > The measured performance delta under UDP flood vs a recvmmsg
> > > receiver is as follow:
> > >
> > > vanilla         patched         delta
> > > Kpps            Kpps            %
> > > 1431            1664            +14
> >
> > Since IPv4 speed-up is almost half and code considerably more complex,
> > maybe only do IPv6?
>
> uhmm... I would avoid that kind of assimmetry, and I would not look
> down on a 8% speedup, if possible.

Okay, that's fair.

> > > @@ -104,9 +127,18 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
> > >                 skb = l3mdev_ip6_rcv(skb);
> > >                 if (!skb)
> > >                         continue;
> > > -               ip6_rcv_finish_core(net, sk, skb);
> > > +               ip6_rcv_finish_core(net, sk, skb, hint);
> > >                 dst = skb_dst(skb);
> > >                 if (curr_dst != dst) {
> > > +                       if (ip6_can_cache_route_hint(net)) {
> > > +                               _hint.refdst = skb->_skb_refdst;
> > > +                               memcpy(&_hint.daddr, &ipv6_hdr(skb)->daddr,
> > > +                                      sizeof(_hint.daddr));
> > > +                               hint = &_hint;
> > > +                       } else {
> > > +                               hint = NULL;
> > > +                       }
> >
> > not needed. ip6_can_cache_route_hit is the same for all iterations of
> > the loop (indeed, compile time static), so if false, hint is never
> > set.
>
> I think this is needed, instead: if CONFIG_MULTIPLE_TABLES=y,
> fib6_has_custom_rules can change at runtime - from 'false' to 'true'.
> If we don't reset 'hint', we could end-up with use-after-free.

Uhm, of course, this is not compile time static at all. I clearly
missed a part.

But such a config change does not expect instantaneous effect on
packets in flight, like those in the recv rcu critical section? In
which case it should be safe to treat all skbs in the list the same.

I would need to read that code more closely to be certain, and the
current solution errs on the side of caution, so is definitely fine as
is, of course.
diff mbox series

Patch

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index ef7f707d9ae3..f559ad6b09ef 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -44,10 +44,16 @@ 
 #include <net/inet_ecn.h>
 #include <net/dst_metadata.h>
 
+struct ip6_route_input_hint {
+	unsigned long	refdst;
+	struct in6_addr daddr;
+};
+
 INDIRECT_CALLABLE_DECLARE(void udp_v6_early_demux(struct sk_buff *));
 INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *));
 static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
-				struct sk_buff *skb)
+				struct sk_buff *skb,
+				struct ip6_route_input_hint *hint)
 {
 	void (*edemux)(struct sk_buff *skb);
 
@@ -59,7 +65,13 @@  static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
 			INDIRECT_CALL_2(edemux, tcp_v6_early_demux,
 					udp_v6_early_demux, skb);
 	}
-	if (!skb_valid_dst(skb))
+
+	if (skb_valid_dst(skb))
+		return;
+
+	if (hint && ipv6_addr_equal(&hint->daddr, &ipv6_hdr(skb)->daddr))
+		__skb_dst_copy(skb, hint->refdst);
+	else
 		ip6_route_input(skb);
 }
 
@@ -71,7 +83,7 @@  int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	skb = l3mdev_ip6_rcv(skb);
 	if (!skb)
 		return NET_RX_SUCCESS;
-	ip6_rcv_finish_core(net, sk, skb);
+	ip6_rcv_finish_core(net, sk, skb, NULL);
 
 	return dst_input(skb);
 }
@@ -86,9 +98,20 @@  static void ip6_sublist_rcv_finish(struct list_head *head)
 	}
 }
 
+static bool ip6_can_cache_route_hint(struct net *net)
+{
+	return !IS_ENABLED(IPV6_SUBTREES) &&
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	       !net->ipv6.fib6_has_custom_rules;
+#else
+	       1;
+#endif
+}
+
 static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
 				struct list_head *head)
 {
+	struct ip6_route_input_hint _hint, *hint = NULL;
 	struct dst_entry *curr_dst = NULL;
 	struct sk_buff *skb, *next;
 	struct list_head sublist;
@@ -104,9 +127,18 @@  static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
 		skb = l3mdev_ip6_rcv(skb);
 		if (!skb)
 			continue;
-		ip6_rcv_finish_core(net, sk, skb);
+		ip6_rcv_finish_core(net, sk, skb, hint);
 		dst = skb_dst(skb);
 		if (curr_dst != dst) {
+			if (ip6_can_cache_route_hint(net)) {
+				_hint.refdst = skb->_skb_refdst;
+				memcpy(&_hint.daddr, &ipv6_hdr(skb)->daddr,
+				       sizeof(_hint.daddr));
+				hint = &_hint;
+			} else {
+				hint = NULL;
+			}
+
 			/* dispatch old sublist */
 			if (!list_empty(&sublist))
 				ip6_sublist_rcv_finish(&sublist);