diff mbox series

[net-next,v2,2/2] ipv4: use dst hint for ipv4 list receive

Message ID 592c763828171c414e8927878b1a22027e33dee7.1574071944.git.pabeni@redhat.com
State Changes Requested, archived
Headers show
Series net: introduce and use route hint | expand

Commit Message

Paolo Abeni Nov. 18, 2019, 11:01 a.m. UTC
This is similar to the previous change, with some additional ipv4-specific
quirks. Even when using the route hint we still have to perform
additional per-packet checks on source address validity: a new
helper is added to wrap them.

Moreover, the ipv4 route lookup, even in the absence of policy routing,
may depend on the packet's ToS, so we cache that value, too.

Explicitly avoid hints for local broadcast: this simplifies the code,
and broadcasts are a slower path anyway.

UDP flood performance vs recvmmsg() receiver:

vanilla		patched		delta
Kpps		Kpps		%
1683		1833		+8

In the worst case scenario - each packet has a different
destination address - the performance delta is within noise
range.

v1 -> v2:
 - fix build issue with !CONFIG_IP_MULTIPLE_TABLES

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/route.h | 11 +++++++++++
 net/ipv4/ip_input.c | 38 +++++++++++++++++++++++++++++++++-----
 net/ipv4/route.c    | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 82 insertions(+), 5 deletions(-)

Comments

David Ahern Nov. 18, 2019, 4:07 p.m. UTC | #1
On 11/18/19 4:01 AM, Paolo Abeni wrote:
> @@ -535,9 +540,20 @@ static void ip_sublist_rcv_finish(struct list_head *head)
>  	}
>  }
>  
> +static bool ip_can_cache_route_hint(struct net *net, struct rtable *rt)
> +{
> +	return rt->rt_type != RTN_BROADCAST &&
> +#ifdef CONFIG_IP_MULTIPLE_TABLES
> +	       !net->ipv6.fib6_has_custom_rules;

that should be ipv4, not ipv6, right?

Also, for readability it would be better to have 2 helpers in
include/net/fib_rules.h that return true/false and manage the net
namespace issue.

> +#else
> +	       1;
> +#endif
> +}
> +
Paolo Abeni Nov. 18, 2019, 4:31 p.m. UTC | #2
Hi,

Thank you for the feedback.

On Mon, 2019-11-18 at 09:07 -0700, David Ahern wrote:
> On 11/18/19 4:01 AM, Paolo Abeni wrote:
> > @@ -535,9 +540,20 @@ static void ip_sublist_rcv_finish(struct list_head *head)
> >  	}
> >  }
> >  
> > +static bool ip_can_cache_route_hint(struct net *net, struct rtable *rt)
> > +{
> > +	return rt->rt_type != RTN_BROADCAST &&
> > +#ifdef CONFIG_IP_MULTIPLE_TABLES
> > +	       !net->ipv6.fib6_has_custom_rules;
> 
> that should be ipv4, not ipv6, right?

Indeed. More coffee needed here, sorry.

> Also, for readability it would be better to have 2 helpers in
> include/net/fib_rules.h that return true/false and manage the net
> namespace issue.

Double checking I parsed the above correctly. Do you mean something
like the following - I think net/ip_fib.h fits more, as it already
deals with CONFIG_IP_MULTIPLE_TABLES?

---
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 52b2406a5dfc..b6c5cd544402 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -272,6 +272,11 @@ void fib_free_table(struct fib_table *tb);
 #define TABLE_LOCAL_INDEX      (RT_TABLE_LOCAL & (FIB_TABLE_HASHSZ - 1))
 #define TABLE_MAIN_INDEX       (RT_TABLE_MAIN  & (FIB_TABLE_HASHSZ - 1))
 
+static inline bool fib4_has_custom_rules(struct net *net)
+{
+       return false;	/* this variant never reports custom rules */
+}
+
 static inline struct fib_table *fib_get_table(struct net *net, u32 id)
 {
        struct hlist_node *tb_hlist;
@@ -341,6 +346,11 @@ void __net_exit fib4_rules_exit(struct net *net);
 struct fib_table *fib_new_table(struct net *net, u32 id);
 struct fib_table *fib_get_table(struct net *net, u32 id);
 
+static inline bool fib4_has_custom_rules(struct net *net)
+{
+       return net->ipv4.fib_has_custom_rules;	/* flag kept per net namespace */
+}
+
 int __fib_lookup(struct net *net, struct flowi4 *flp,
                 struct fib_result *res, unsigned int flags);
---
plus something similar for the previous patch, in include/net/ip6_fib.h

Thank you,

Paolo
David Ahern Nov. 18, 2019, 4:40 p.m. UTC | #3
On 11/18/19 9:31 AM, Paolo Abeni wrote:
>> Also, for readability it would be better to have 2 helpers in
>> include/net/fib_rules.h that return true/false and manage the net
>> namespace issue.
> 
> Double checking I parsed the above correctly. Do you mean something
> like the following - I think net/ip_fib.h fits more, as it already
> deals with CONFIG_IP_MULTIPLE_TABLES?

sure.

And it looks like they already exist in net/ipv4/fib_frontend.c, so
those can be moved to ip_fib.h
diff mbox series

Patch

diff --git a/include/net/route.h b/include/net/route.h
index 6c516840380d..f7a8a52318cd 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -185,6 +185,17 @@  int ip_route_input_rcu(struct sk_buff *skb, __be32 dst, __be32 src,
 		       u8 tos, struct net_device *devin,
 		       struct fib_result *res);
 
+struct ip_route_input_hint {
+	unsigned long	refdst;	/* skb->_skb_refdst of the packet that seeded the hint */
+	__be32		daddr;	/* destination address of that packet */
+	u8		tos;	/* its ToS; u8 so it compares equal to iph->tos for all values */
+	bool		local;	/* seeding route had rt_type == RTN_LOCAL */
+};
+
+int ip_route_use_hint(struct sk_buff *skb, __be32 dst, __be32 src,
+		      u8 tos, struct net_device *devin,
+		      struct ip_route_input_hint *hint);
+
 static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
 				 u8 tos, struct net_device *devin)
 {
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 24a95126e698..25f6fcc65380 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -305,7 +305,8 @@  static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
 INDIRECT_CALLABLE_DECLARE(int udp_v4_early_demux(struct sk_buff *));
 INDIRECT_CALLABLE_DECLARE(int tcp_v4_early_demux(struct sk_buff *));
 static int ip_rcv_finish_core(struct net *net, struct sock *sk,
-			      struct sk_buff *skb, struct net_device *dev)
+			      struct sk_buff *skb, struct net_device *dev,
+			      struct ip_route_input_hint *hint)
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	int (*edemux)(struct sk_buff *skb);
@@ -335,8 +336,12 @@  static int ip_rcv_finish_core(struct net *net, struct sock *sk,
 	 *	how the packet travels inside Linux networking.
 	 */
 	if (!skb_valid_dst(skb)) {
-		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
-					   iph->tos, dev);
+		if (hint && hint->daddr == iph->daddr && hint->tos == iph->tos)
+			err = ip_route_use_hint(skb, iph->daddr, iph->saddr,
+						iph->tos, dev, hint);
+		else
+			err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+						   iph->tos, dev);
 		if (unlikely(err))
 			goto drop_error;
 	}
@@ -408,7 +413,7 @@  static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	if (!skb)
 		return NET_RX_SUCCESS;
 
-	ret = ip_rcv_finish_core(net, sk, skb, dev);
+	ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
 	if (ret != NET_RX_DROP)
 		ret = dst_input(skb);
 	return ret;
@@ -535,9 +540,20 @@  static void ip_sublist_rcv_finish(struct list_head *head)
 	}
 }
 
+/* A route may seed the hint unless it is a broadcast or IPv4 custom rules exist. */
+static bool ip_can_cache_route_hint(struct net *net, struct rtable *rt)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	if (net->ipv4.fib_has_custom_rules)
+		return false;
+#endif
+	return rt->rt_type != RTN_BROADCAST;
+}
+
 static void ip_list_rcv_finish(struct net *net, struct sock *sk,
 			       struct list_head *head)
 {
+	struct ip_route_input_hint _hint, *hint = NULL;
 	struct dst_entry *curr_dst = NULL;
 	struct sk_buff *skb, *next;
 	struct list_head sublist;
@@ -554,11 +570,23 @@  static void ip_list_rcv_finish(struct net *net, struct sock *sk,
 		skb = l3mdev_ip_rcv(skb);
 		if (!skb)
 			continue;
-		if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP)
+		if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP)
 			continue;
 
 		dst = skb_dst(skb);
 		if (curr_dst != dst) {
+			struct rtable *rt = (struct rtable *)dst;
+
+			if (ip_can_cache_route_hint(net, rt)) {
+				_hint.refdst = skb->_skb_refdst;
+				_hint.daddr = ip_hdr(skb)->daddr;
+				_hint.tos = ip_hdr(skb)->tos;
+				_hint.local = rt->rt_type == RTN_LOCAL;
+				hint = &_hint;
+			} else {
+				hint = NULL;
+			}
+
 			/* dispatch old sublist */
 			if (!list_empty(&sublist))
 				ip_sublist_rcv_finish(&sublist);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index dcc4fa10138d..b0ddff17db80 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2019,6 +2019,44 @@  static int ip_mkroute_input(struct sk_buff *skb,
 	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
 }
 
+/* Implements the same saddr-related checks as ip_route_input_slow(),
+ * assuming daddr is valid and this is not a local broadcast. On success
+ * attaches the hint's dst to the skb instead of doing a route lookup.
+ */
+int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+		      u8 tos, struct net_device *dev,
+		      struct ip_route_input_hint *hint)
+{
+	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	struct net *net = dev_net(dev);
+	int err = -EINVAL;	/* returned as-is for the address-class rejections below */
+	u32 itag = 0;
+
+	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
+		goto martian_source;
+
+	if (ipv4_is_zeronet(saddr))
+		goto martian_source;
+
+	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+		goto martian_source;
+
+	if (hint->local) {	/* only hints flagged local get fib_validate_source() */
+		err = fib_validate_source(skb, saddr, daddr, tos, 0, dev,
+					  in_dev, &itag);
+		if (err < 0)
+			goto martian_source;
+	}
+
+	err = 0;
+	__skb_dst_copy(skb, hint->refdst);
+	return err;
+
+martian_source:
+	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+	return err;
+}
+
 /*
  *	NOTE. We drop all the packets that has local source
  *	addresses, because every properly looped back packet