diff mbox

[net-next] route: allow to route in a peer netns via lwt framework

Message ID 1437661349-17620-1-git-send-email-nicolas.dichtel@6wind.com
State Superseded, archived
Delegated to: David Miller
Headers show

Commit Message

Nicolas Dichtel July 23, 2015, 2:22 p.m. UTC
This patch takes advantage of the newly added lwtunnel framework to
allow the user to set routes that point to a peer netns.

Packets are injected into the peer netns via the loopback device. This works
only when the output device is 'lo'.

Example:
ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 drivers/net/loopback.c        | 16 +++++++++++++
 include/net/lwtunnel.h        | 23 +++++++++++++++++++
 include/uapi/linux/lwtunnel.h |  1 +
 net/core/net_namespace.c      | 52 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 92 insertions(+)

Comments

Roopa Prabhu July 23, 2015, 3:01 p.m. UTC | #1
On 7/23/15, 7:22 AM, Nicolas Dichtel wrote:
> This patch takes advantage of the newly added lwtunnel framework to
> allow the user to set routes that points to a peer netns.
>
> Packets are injected to the peer netns via the loopback device. It works
> only when the output device is 'lo'.
>
> Example:
> ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo
>
> Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
> ---
>   drivers/net/loopback.c        | 16 +++++++++++++
>   include/net/lwtunnel.h        | 23 +++++++++++++++++++
>   include/uapi/linux/lwtunnel.h |  1 +
>   net/core/net_namespace.c      | 52 +++++++++++++++++++++++++++++++++++++++++++
>   4 files changed, 92 insertions(+)
>
> diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
> index c76283c2f84a..758d02f592f9 100644
> --- a/drivers/net/loopback.c
> +++ b/drivers/net/loopback.c
> @@ -57,6 +57,7 @@
>   #include <linux/percpu.h>
>   #include <net/net_namespace.h>
>   #include <linux/u64_stats_sync.h>
> +#include <net/lwtunnel.h>
>   
>   struct pcpu_lstats {
>   	u64			packets;
> @@ -71,9 +72,23 @@ struct pcpu_lstats {
>   static netdev_tx_t loopback_xmit(struct sk_buff *skb,
>   				 struct net_device *dev)
>   {
> +	int nsid = skb_lwt_netns_info(skb);
>   	struct pcpu_lstats *lb_stats;
>   	int len;
>   
> +	if (nsid >= 0) {
> +		struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid);
> +
> +		if (!peernet) {
> +			kfree_skb(skb);
> +			goto end;
> +		}
> +
> +		dev_forward_skb(peernet->loopback_dev, skb);
> +		put_net(peernet);
> +		goto end;
> +	}
> +
>   	skb_orphan(skb);
>   
>   	/* Before queueing this packet to netif_rx(),
> @@ -94,6 +109,7 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
>   		u64_stats_update_end(&lb_stats->syncp);
>   	}
>   
> +end:
>   	return NETDEV_TX_OK;
>   }
>   
> diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
> index 918e03c1dafa..cc05ce3c1aae 100644
> --- a/include/net/lwtunnel.h
> +++ b/include/net/lwtunnel.h
> @@ -5,7 +5,9 @@
>   #include <linux/netdevice.h>
>   #include <linux/skbuff.h>
>   #include <linux/types.h>
> +#include <linux/net_namespace.h>
>   #include <net/route.h>
> +#include <net/ip6_fib.h>
>   
>   #define LWTUNNEL_HASH_BITS   7
>   #define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)
> @@ -141,4 +143,25 @@ static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
>   
>   #endif
>   
> +static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate)
> +{
> +	return (u32 *)lwtstate->data;
> +}
> +
> +static inline int skb_lwt_netns_info(struct sk_buff *skb)
> +{
> +	if (skb->protocol == htons(ETH_P_IP)) {
> +		struct rtable *rt = (struct rtable *)skb_dst(skb);
> +
> +		if (rt && rt->rt_lwtstate)
> +			return *lwt_netns_info(rt->rt_lwtstate);
> +	} else if (skb->protocol == htons(ETH_P_IPV6)) {
> +		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
> +
> +		if (rt6 && rt6->rt6i_lwtstate)
> +			return *lwt_netns_info(rt6->rt6i_lwtstate);
> +	}
> +
> +	return NETNSA_NSID_NOT_ASSIGNED;
> +}
>   #endif /* __NET_LWTUNNEL_H */
Since these APIs don't have to be netns-specific,
can they just be named lwtunnel_get_state_data and skb_lwtunnel_state?

Also, it seems like they should be declared for both CONFIG_LWTUNNEL 'y' and 'n'.

Thanks,
Roopa





--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nicolas Dichtel July 23, 2015, 3:25 p.m. UTC | #2
Le 23/07/2015 17:01, roopa a écrit :
> On 7/23/15, 7:22 AM, Nicolas Dichtel wrote:
[snip]
>> +static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate)
>> +{
>> +    return (u32 *)lwtstate->data;
>> +}
>> +
>> +static inline int skb_lwt_netns_info(struct sk_buff *skb)
>> +{
>> +    if (skb->protocol == htons(ETH_P_IP)) {
>> +        struct rtable *rt = (struct rtable *)skb_dst(skb);
>> +
>> +        if (rt && rt->rt_lwtstate)
>> +            return *lwt_netns_info(rt->rt_lwtstate);
>> +    } else if (skb->protocol == htons(ETH_P_IPV6)) {
>> +        struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
>> +
>> +        if (rt6 && rt6->rt6i_lwtstate)
>> +            return *lwt_netns_info(rt6->rt6i_lwtstate);
>> +    }
>> +
>> +    return NETNSA_NSID_NOT_ASSIGNED;
>> +}
>>   #endif /* __NET_LWTUNNEL_H */
> since these apis' don't have to be netns specific,
> Can they just be named lwtunnel_get_state_data and skb_lwtunnel_state ?
They are specific to netns because lwtstate->data is interpreted as a u32 *.
But I agree that a check against lwtstate->type is missing, to ensure that data
holds an nsid.

>
> and seems like they should be declared for both CONFIG_LWTUNNEL 'y' and 'n'.
It is outside the "#ifdef CONFIG_LWTUNNEL". I can successfully compile with and
without CONFIG_LWTUNNEL.

Thank you,
Nicolas
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roopa Prabhu July 23, 2015, 3:50 p.m. UTC | #3
On 7/23/15, 8:25 AM, Nicolas Dichtel wrote:
> Le 23/07/2015 17:01, roopa a écrit :
>> On 7/23/15, 7:22 AM, Nicolas Dichtel wrote:
> [snip]
>>> +static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate)
>>> +{
>>> +    return (u32 *)lwtstate->data;
>>> +}
>>> +
>>> +static inline int skb_lwt_netns_info(struct sk_buff *skb)
>>> +{
>>> +    if (skb->protocol == htons(ETH_P_IP)) {
>>> +        struct rtable *rt = (struct rtable *)skb_dst(skb);
>>> +
>>> +        if (rt && rt->rt_lwtstate)
>>> +            return *lwt_netns_info(rt->rt_lwtstate);
>>> +    } else if (skb->protocol == htons(ETH_P_IPV6)) {
>>> +        struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
>>> +
>>> +        if (rt6 && rt6->rt6i_lwtstate)
>>> +            return *lwt_netns_info(rt6->rt6i_lwtstate);
>>> +    }
>>> +
>>> +    return NETNSA_NSID_NOT_ASSIGNED;
>>> +}
>>>   #endif /* __NET_LWTUNNEL_H */
>> since these apis' don't have to be netns specific,
>> Can they just be named lwtunnel_get_state_data and skb_lwtunnel_state ?
> They are specific to netns because lwtstate->data is interpreted as an 
> u32 *.
> But I agree that a test is missing against lwtstate->type to ensure 
> that data
> will be a nsid.
>
Oh, OK. The APIs in lwtunnel.h today are not specific to an encap type;
they are generic, so a skb_lwtunnel_state() that returns struct
lwtunnel_state could go here.
The encap-specific ones can go in the respective callers. Recently
Thomas added a similar
skb_tunnel_info() for IP tunnels. I'd like to have a generic version
of your skb_lwt_netns_info in lwtunnel.h. I could use it in my MPLS
output function too.


>>
>> and seems like they should be declared for both CONFIG_LWTUNNEL 'y' 
>> and 'n'.
> It is outside the "#ifdef CONFIG_LWTUNNEL". I can successfully compile 
> with and
> without CONFIG_LWTUNNEL.
ok,

thanks,
Roopa

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nicolas Dichtel July 24, 2015, 12:24 p.m. UTC | #4
Le 23/07/2015 17:50, roopa a écrit :
> On 7/23/15, 8:25 AM, Nicolas Dichtel wrote:
>> Le 23/07/2015 17:01, roopa a écrit :
>>> On 7/23/15, 7:22 AM, Nicolas Dichtel wrote:
>> [snip]
>>>> +static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate)
>>>> +{
>>>> +    return (u32 *)lwtstate->data;
>>>> +}
>>>> +
>>>> +static inline int skb_lwt_netns_info(struct sk_buff *skb)
>>>> +{
>>>> +    if (skb->protocol == htons(ETH_P_IP)) {
>>>> +        struct rtable *rt = (struct rtable *)skb_dst(skb);
>>>> +
>>>> +        if (rt && rt->rt_lwtstate)
>>>> +            return *lwt_netns_info(rt->rt_lwtstate);
>>>> +    } else if (skb->protocol == htons(ETH_P_IPV6)) {
>>>> +        struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
>>>> +
>>>> +        if (rt6 && rt6->rt6i_lwtstate)
>>>> +            return *lwt_netns_info(rt6->rt6i_lwtstate);
>>>> +    }
>>>> +
>>>> +    return NETNSA_NSID_NOT_ASSIGNED;
>>>> +}
>>>>   #endif /* __NET_LWTUNNEL_H */
>>> since these apis' don't have to be netns specific,
>>> Can they just be named lwtunnel_get_state_data and skb_lwtunnel_state ?
>> They are specific to netns because lwtstate->data is interpreted as an u32 *.
>> But I agree that a test is missing against lwtstate->type to ensure that data
>> will be a nsid.
>>
> o ok..., the api's in lwtunnel.h today are not specific to an encap type.
> they are generic, so skb_lwtunnel_state() which returns struct lwtunnel_state
> could go here.
> the encap specific ones can go in the respective callers. Recently thomas added
> a similar
> skb_tunnel_info() for ip tunnels. I did  like to have a generic version of your
> skb_lwt_netns_info in lwtunnel.h. I could use it in my mpls output func too.
Sure, but my goal was to not create a new .h file just for these two helpers.
It's related to lwtunnel, thus I was thinking they can go here.


Regards,
Nicolas
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roopa Prabhu July 24, 2015, 1:50 p.m. UTC | #5
On 7/24/15, 5:24 AM, Nicolas Dichtel wrote:
> Sure, but my goal was to not create a new .h file just for these two 
> helpers.
> It's related to lwtunnel, thus I was thinking they can go here.
OK. Since your lwt namespace functions went into net_namespace.c, I
was thinking
these should really go into net_namespace.h. Does that work for you?
If it does not, then yes, they could live here.

Thanks,
Roopa

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nicolas Dichtel July 24, 2015, 2:11 p.m. UTC | #6
Le 24/07/2015 15:50, roopa a écrit :
> On 7/24/15, 5:24 AM, Nicolas Dichtel wrote:
>> Sure, but my goal was to not create a new .h file just for these two helpers.
>> It's related to lwtunnel, thus I was thinking they can go here.
> ok..., since your lwt namespace functions went into net_namespace.c, I was thinking
> these should really go into net_namespace.h. Does that work for you ?
Not so easy: it's a chicken-and-egg problem. If I add this to
net/net_namespace.h, I need to include net/lwtunnel.h, but that file already
includes net/net_namespace.h (which is included directly or indirectly by most of the
network headers).


Regards,
Nicolas
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Ahern July 24, 2015, 2:28 p.m. UTC | #7
On 7/23/15 8:22 AM, Nicolas Dichtel wrote:
>   static netdev_tx_t loopback_xmit(struct sk_buff *skb,
>   				 struct net_device *dev)
>   {
> +	int nsid = skb_lwt_netns_info(skb);
>   	struct pcpu_lstats *lb_stats;
>   	int len;
>
> +	if (nsid >= 0) {
> +		struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid);
> +
> +		if (!peernet) {

If nsid is >= 0 then the peer namespace should exist, right? So for this
failure path, why not increment the tx_error stat?


> +			kfree_skb(skb);
> +			goto end;
> +		}
> +
> +		dev_forward_skb(peernet->loopback_dev, skb);
> +		put_net(peernet);
> +		goto end;
> +	}
> +
>   	skb_orphan(skb);
>
>   	/* Before queueing this packet to netif_rx(),

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nicolas Dichtel July 24, 2015, 2:32 p.m. UTC | #8
Le 24/07/2015 16:28, David Ahern a écrit :
> On 7/23/15 8:22 AM, Nicolas Dichtel wrote:
>>   static netdev_tx_t loopback_xmit(struct sk_buff *skb,
>>                    struct net_device *dev)
>>   {
>> +    int nsid = skb_lwt_netns_info(skb);
>>       struct pcpu_lstats *lb_stats;
>>       int len;
>>
>> +    if (nsid >= 0) {
>> +        struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid);
>> +
>> +        if (!peernet) {
>
> If nsid is > 0 then the peer namespace should exist right? So for this failure
> path why not increment tx_error stat?
I was not sure about that, because before my patch we increment statistics only
in case of NET_RX_SUCCESS.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Ahern July 24, 2015, 3:19 p.m. UTC | #9
On 7/24/15 8:32 AM, Nicolas Dichtel wrote:
> Le 24/07/2015 16:28, David Ahern a écrit :
>> On 7/23/15 8:22 AM, Nicolas Dichtel wrote:
>>>   static netdev_tx_t loopback_xmit(struct sk_buff *skb,
>>>                    struct net_device *dev)
>>>   {
>>> +    int nsid = skb_lwt_netns_info(skb);
>>>       struct pcpu_lstats *lb_stats;
>>>       int len;
>>>
>>> +    if (nsid >= 0) {
>>> +        struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid);
>>> +
>>> +        if (!peernet) {
>>
>> If nsid is > 0 then the peer namespace should exist right? So for this
>> failure
>> path why not increment tx_error stat?
> I was not sure about that, because before my patch we increment
> statistics only
> in case of NET_RX_SUCCESS.

In this case you are knowingly dropping packets. It would be nice to have a
counter showing that.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nicolas Dichtel July 27, 2015, 8:07 p.m. UTC | #10
Le 24/07/2015 17:19, David Ahern a écrit :
> In this case you are knowingly dropping packets. Would be nice to have a counter
> showing that.
Ok.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index c76283c2f84a..758d02f592f9 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -57,6 +57,7 @@ 
 #include <linux/percpu.h>
 #include <net/net_namespace.h>
 #include <linux/u64_stats_sync.h>
+#include <net/lwtunnel.h>
 
 struct pcpu_lstats {
 	u64			packets;
@@ -71,9 +72,23 @@  struct pcpu_lstats {
 static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 				 struct net_device *dev)
 {
+	int nsid = skb_lwt_netns_info(skb);
 	struct pcpu_lstats *lb_stats;
 	int len;
 
+	if (nsid >= 0) {
+		struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid);
+
+		if (!peernet) {
+			kfree_skb(skb);
+			goto end;
+		}
+
+		dev_forward_skb(peernet->loopback_dev, skb);
+		put_net(peernet);
+		goto end;
+	}
+
 	skb_orphan(skb);
 
 	/* Before queueing this packet to netif_rx(),
@@ -94,6 +109,7 @@  static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 		u64_stats_update_end(&lb_stats->syncp);
 	}
 
+end:
 	return NETDEV_TX_OK;
 }
 
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 918e03c1dafa..cc05ce3c1aae 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -5,7 +5,9 @@ 
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/types.h>
+#include <linux/net_namespace.h>
 #include <net/route.h>
+#include <net/ip6_fib.h>
 
 #define LWTUNNEL_HASH_BITS   7
 #define LWTUNNEL_HASH_SIZE   (1 << LWTUNNEL_HASH_BITS)
@@ -141,4 +143,25 @@  static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb)
 
 #endif
 
+static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate)
+{
+	return (u32 *)lwtstate->data;
+}
+
+static inline int skb_lwt_netns_info(struct sk_buff *skb)
+{
+	if (skb->protocol == htons(ETH_P_IP)) {
+		struct rtable *rt = (struct rtable *)skb_dst(skb);
+
+		if (rt && rt->rt_lwtstate)
+			return *lwt_netns_info(rt->rt_lwtstate);
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+		if (rt6 && rt6->rt6i_lwtstate)
+			return *lwt_netns_info(rt6->rt6i_lwtstate);
+	}
+
+	return NETNSA_NSID_NOT_ASSIGNED;
+}
 #endif /* __NET_LWTUNNEL_H */
diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h
index 31377bbea3f8..6715e7a1b335 100644
--- a/include/uapi/linux/lwtunnel.h
+++ b/include/uapi/linux/lwtunnel.h
@@ -7,6 +7,7 @@  enum lwtunnel_encap_types {
 	LWTUNNEL_ENCAP_NONE,
 	LWTUNNEL_ENCAP_MPLS,
 	LWTUNNEL_ENCAP_IP,
+	LWTUNNEL_ENCAP_NETNS,
 	__LWTUNNEL_ENCAP_MAX,
 };
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c2eb1b629b1..c1267aac373d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -20,6 +20,7 @@ 
 #include <net/netlink.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <net/lwtunnel.h>
 
 /*
  *	Our network namespace constructor/destructor lists
@@ -725,6 +726,56 @@  out:
 	rtnl_set_sk_err(net, RTNLGRP_NSID, err);
 }
 
+static int lwt_netns_build_state(struct net_device *dev, struct nlattr *nla,
+				 struct lwtunnel_state **ts)
+{
+	struct nlattr *tb[NETNSA_MAX + 1];
+	struct lwtunnel_state *newts;
+	int *nsid;
+	int ret;
+
+	ret = nla_parse_nested(tb, NETNSA_MAX, nla, rtnl_net_policy);
+	if (ret < 0)
+		return ret;
+
+	if (!tb[NETNSA_NSID])
+		return -EINVAL;
+
+	newts = lwtunnel_state_alloc(sizeof(*nsid));
+	if (!newts)
+		return -ENOMEM;
+
+	newts->len = sizeof(*nsid);
+	nsid = lwt_netns_info(newts);
+	*nsid = nla_get_s32(tb[NETNSA_NSID]);
+	newts->type = LWTUNNEL_ENCAP_NETNS;
+
+	*ts = newts;
+	return 0;
+}
+
+static int lwt_netns_fill_encap_info(struct sk_buff *skb,
+				     struct lwtunnel_state *lwtstate)
+{
+	int *nsid = lwt_netns_info(lwtstate);
+
+	if (nla_put_s32(skb, NETNSA_NSID, *nsid))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int lwt_netns_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	return nla_total_size(4);	/* NETNSA_NSID */
+}
+
+static const struct lwtunnel_encap_ops lwt_netns_ops = {
+	.build_state = lwt_netns_build_state,
+	.fill_encap = lwt_netns_fill_encap_info,
+	.get_encap_size = lwt_netns_encap_nlsize,
+};
+
 static int __init net_ns_init(void)
 {
 	struct net_generic *ng;
@@ -762,6 +813,7 @@  static int __init net_ns_init(void)
 	rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
 		      NULL);
 
+	lwtunnel_encap_add_ops(&lwt_netns_ops, LWTUNNEL_ENCAP_NETNS);
 	return 0;
 }