diff mbox

[net-next,v3] route: allow to route in a peer netns via lwt framework

Message ID 1438175774-4408-1-git-send-email-nicolas.dichtel@6wind.com
State Rejected, archived
Delegated to: David Miller
Headers show

Commit Message

Nicolas Dichtel July 29, 2015, 1:16 p.m. UTC
This patch takes advantage of the newly added lwtunnel framework to
allow the user to set routes that point to a peer netns.

Packets are injected to the peer netns via the loopback device. It works
only when the output device is 'lo'.

Example:
ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo

The goal is to be scalable when the number of netns is high (10k or more).
With this patch, we can save two interfaces (veth) per netns, which helps
to reduce memory consumption and the time needed to create a netns.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---

v3: fix skb use after free in loopback_xmit()
    inc err stats if unable to find the peer netns
    fix a checkpatch style report

v2: rework loopback handling part (update stats and call skb_dst_force())
    fix ipv6 processing
    check lwtunnel type before converting data to a nsid

 drivers/net/loopback.c        | 40 +++++++++++++++++++++++++--------
 include/net/lwtunnel.h        | 27 ++++++++++++++++++++++
 include/uapi/linux/lwtunnel.h |  1 +
 net/core/net_namespace.c      | 52 +++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/route.c              |  9 ++++++--
 5 files changed, 118 insertions(+), 11 deletions(-)

Comments

Eric Dumazet July 29, 2015, 3:20 p.m. UTC | #1
On Wed, 2015-07-29 at 15:16 +0200, Nicolas Dichtel wrote:
> This patch takes advantage of the newly added lwtunnel framework to
> allow the user to set routes that point to a peer netns.
> 
> Packets are injected to the peer netns via the loopback device. It works
> only when the output device is 'lo'.
> 
> Example:
> ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo
> 
> The goal is to be scalable when the number of netns is high (10k or more).
> With this patch, we can save two interfaces (veth) per netns, which helps
> to reduce memory consumption and the time needed to create a netns.


Really this is a hack Nicolas.

get_net_ns_by_id() was not meant to be used in data (fast ???) path.

Same for get_net() and put_net()

Plumbing like that should not happen in lo start_xmit()


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nicolas Dichtel July 29, 2015, 9:17 p.m. UTC | #2
Le 29/07/2015 17:20, Eric Dumazet a écrit :
> On Wed, 2015-07-29 at 15:16 +0200, Nicolas Dichtel wrote:
>> This patch takes advantage of the newly added lwtunnel framework to
>> allow the user to set routes that point to a peer netns.
>>
>> Packets are injected to the peer netns via the loopback device. It works
>> only when the output device is 'lo'.
>>
>> Example:
>> ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo
>>
>> The goal is to be scalable when the number of netns is high (10k or more).
>> With this patch, we can save two interfaces (veth) per netns, which helps
>> to reduce memory consumption and the time needed to create a netns.
>
>
> Really this is a hack Nicolas.
>
> get_net_ns_by_id() was not meant to be used in data (fast ???) path.
>
> Same for get_net() and put_net()
>
> Plumbing like that should not happen in lo start_xmit()
Yes, I think you're right. I was a bit too enthusiastic about this new framework.
Do you think it would be acceptable if the netns was directly referenced
instead of a nsid?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index c76283c2f84a..1b83efcbfbb3 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -57,6 +57,7 @@ 
 #include <linux/percpu.h>
 #include <net/net_namespace.h>
 #include <linux/u64_stats_sync.h>
+#include <net/lwtunnel.h>
 
 struct pcpu_lstats {
 	u64			packets;
@@ -71,29 +72,49 @@  struct pcpu_lstats {
 static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 				 struct net_device *dev)
 {
+	int nsid = skb_lwt_netns_info(skb);
 	struct pcpu_lstats *lb_stats;
-	int len;
-
-	skb_orphan(skb);
+	struct net *peernet = NULL;
+	int len, ret;
 
 	/* Before queueing this packet to netif_rx(),
 	 * make sure dst is refcounted.
 	 */
 	skb_dst_force(skb);
 
-	skb->protocol = eth_type_trans(skb, dev);
-
-	/* it's OK to use per_cpu_ptr() because BHs are off */
-	lb_stats = this_cpu_ptr(dev->lstats);
+	if (nsid != NETNSA_NSID_NOT_ASSIGNED) {
+		peernet = get_net_ns_by_id(dev_net(dev), nsid);
+		if (!peernet) {
+			dev->stats.tx_errors++;
+			kfree_skb(skb);
+			goto end;
+		}
+
+		/* it's OK to use per_cpu_ptr() because BHs are off */
+		lb_stats = this_cpu_ptr(peernet->loopback_dev->lstats);
+		len = skb->len;
+		ret = dev_forward_skb(peernet->loopback_dev, skb);
+	} else {
+		skb_orphan(skb);
+
+		skb->protocol = eth_type_trans(skb, dev);
+
+		/* it's OK to use per_cpu_ptr() because BHs are off */
+		lb_stats = this_cpu_ptr(dev->lstats);
+		len = skb->len;
+		ret = netif_rx(skb);
+	}
 
-	len = skb->len;
-	if (likely(netif_rx(skb) == NET_RX_SUCCESS)) {
+	if (likely(ret == NET_RX_SUCCESS)) {
 		u64_stats_update_begin(&lb_stats->syncp);
 		lb_stats->bytes += len;
 		lb_stats->packets++;
 		u64_stats_update_end(&lb_stats->syncp);
 	}
 
+end:
+	if (peernet)
+		put_net(peernet);
 	return NETDEV_TX_OK;
 }
 
@@ -122,6 +143,7 @@  static struct rtnl_link_stats64 *loopback_get_stats64(struct net_device *dev,
 	stats->tx_packets = packets;
 	stats->rx_bytes   = bytes;
 	stats->tx_bytes   = bytes;
+	stats->tx_errors  = dev->stats.tx_errors;
 	return stats;
 }
 
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index b02039081b04..78376da1afa2 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -5,7 +5,9 @@ 
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/types.h>
+#include <linux/net_namespace.h>
 #include <net/route.h>
+#include <net/ip6_fib.h>
 
 #define LWTUNNEL_HASH_BITS   7
 #define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)
@@ -147,4 +149,29 @@  static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
 
 #endif
 
+static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate)
+{
+	return (u32 *)lwtstate->data;	/* view the state's data area as a u32 */
+}
+
+/* Return the nsid of skb's route if it has a NETNS lwtunnel state, else
+ * NETNSA_NSID_NOT_ASSIGNED.  Encap types are enum values: compare with ==. */
+static inline int skb_lwt_netns_info(struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct rtable *rt = (struct rtable *)skb_dst(skb);
+
+		if (rt && rt->rt_lwtstate &&
+		    rt->rt_lwtstate->type == LWTUNNEL_ENCAP_NETNS)
+			return *lwt_netns_info(rt->rt_lwtstate);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+		if (rt6 && rt6->rt6i_lwtstate &&
+		    rt6->rt6i_lwtstate->type == LWTUNNEL_ENCAP_NETNS)
+			return *lwt_netns_info(rt6->rt6i_lwtstate);
+	}
+
+	return NETNSA_NSID_NOT_ASSIGNED;
+}
 #endif /* __NET_LWTUNNEL_H */
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 31377bbea3f8..6715e7a1b335 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -7,6 +7,7 @@  enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_NONE,
 	LWTUNNEL_ENCAP_MPLS,
 	LWTUNNEL_ENCAP_IP,
+	LWTUNNEL_ENCAP_NETNS,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c2eb1b629b1..c1267aac373d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -20,6 +20,7 @@ 
 #include <net/netlink.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/lwtunnel.h>
 
 /*
  *	Our network namespace constructor/destructor lists
@@ -725,6 +726,56 @@  out:
 	rtnl_set_sk_err(net, RTNLGRP_NSID, err);
 }
 
+static int lwt_netns_build_state(struct net_device *dev, struct nlattr *nla,
+				 struct lwtunnel_state **ts)
+{
+	struct nlattr *tb[NETNSA_MAX + 1];
+	struct lwtunnel_state *newts;
+	int *nsid;
+	int ret;
+
+	ret = nla_parse_nested(tb, NETNSA_MAX, nla, rtnl_net_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[NETNSA_NSID])
+		return -EINVAL;	/* the nsid attribute is mandatory */
+
+	newts = lwtunnel_state_alloc(sizeof(*nsid));
+	if (!newts)
+		return -ENOMEM;
+
+	newts->len = sizeof(*nsid);
+	nsid = lwt_netns_info(newts);
+	*nsid = nla_get_s32(tb[NETNSA_NSID]);	/* kept in state data */
+	newts->type = LWTUNNEL_ENCAP_NETNS;
+
+	*ts = newts;	/* hand the new state back through ts */
+	return 0;
+}
+
+static int lwt_netns_fill_encap_info(struct sk_buff *skb,
+				     struct lwtunnel_state *lwtstate)
+{
+	int *nsid = lwt_netns_info(lwtstate);
+
+	if (nla_put_s32(skb, NETNSA_NSID, *nsid))
+		return -ENOMEM;	/* nla_put_s32() reported failure */
+
+	return 0;
+}
+
+static int lwt_netns_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	return nla_total_size(4);	/* one 4-byte NETNSA_NSID attribute */
+}
+
+static const struct lwtunnel_encap_ops lwt_netns_ops = {
+	.build_state = lwt_netns_build_state,	/* netlink attr -> state */
+	.fill_encap = lwt_netns_fill_encap_info,	/* state -> netlink attr */
+	.get_encap_size = lwt_netns_encap_nlsize,	/* netlink size of the attr */
+};
+
 static int __init net_ns_init(void)
 {
 	struct net_generic *ng;
@@ -762,6 +813,7 @@  static int __init net_ns_init(void)
 	rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
 		      NULL);
 
+	lwtunnel_encap_add_ops(&lwt_netns_ops, LWTUNNEL_ENCAP_NETNS);
 	return 0;
 }
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 54fccf0d705d..6e77d4b1380d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1796,10 +1796,13 @@  int ip6_route_add(struct fib6_config *cfg)
 	rt->rt6i_metric = cfg->fc_metric;
 
 	/* We cannot add true routes via loopback here,
-	   they would result in kernel looping; promote them to reject routes
+	 * they would result in kernel looping; promote them to reject routes.
+	 * Exception: routes that point to a peer netns.
 	 */
 	if ((cfg->fc_flags & RTF_REJECT) ||
 	    (dev && (dev->flags & IFF_LOOPBACK) &&
+	     (!rt->rt6i_lwtstate ||
+	      rt->rt6i_lwtstate->type != LWTUNNEL_ENCAP_NETNS) &&
 	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
 	     !(cfg->fc_flags & RTF_LOCAL))) {
 		/* hold loopback dev/idev if we haven't done so. */
@@ -2880,7 +2883,9 @@  static int rt6_fill_node(struct net *net,
 	}
 	else if (rt->rt6i_flags & RTF_LOCAL)
 		rtm->rtm_type = RTN_LOCAL;
-	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
+	else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK) &&
+		 (!rt->rt6i_lwtstate ||
+		  rt->rt6i_lwtstate->type != LWTUNNEL_ENCAP_NETNS))
 		rtm->rtm_type = RTN_LOCAL;
 	else
 		rtm->rtm_type = RTN_UNICAST;