Message ID | 1437661349-17620-1-git-send-email-nicolas.dichtel@6wind.com |
---|---|
State | Superseded, archived |
Delegated to: | David Miller |
Headers | show |
On 7/23/15, 7:22 AM, Nicolas Dichtel wrote: > This patch takes advantage of the newly added lwtunnel framework to > allow the user to set routes that points to a peer netns. > > Packets are injected to the peer netns via the loopback device. It works > only when the output device is 'lo'. > > Example: > ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo > > Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> > --- > drivers/net/loopback.c | 16 +++++++++++++ > include/net/lwtunnel.h | 23 +++++++++++++++++++ > include/uapi/linux/lwtunnel.h | 1 + > net/core/net_namespace.c | 52 +++++++++++++++++++++++++++++++++++++++++++ > 4 files changed, 92 insertions(+) > > diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c > index c76283c2f84a..758d02f592f9 100644 > --- a/drivers/net/loopback.c > +++ b/drivers/net/loopback.c > @@ -57,6 +57,7 @@ > #include <linux/percpu.h> > #include <net/net_namespace.h> > #include <linux/u64_stats_sync.h> > +#include <net/lwtunnel.h> > > struct pcpu_lstats { > u64 packets; > @@ -71,9 +72,23 @@ struct pcpu_lstats { > static netdev_tx_t loopback_xmit(struct sk_buff *skb, > struct net_device *dev) > { > + int nsid = skb_lwt_netns_info(skb); > struct pcpu_lstats *lb_stats; > int len; > > + if (nsid >= 0) { > + struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid); > + > + if (!peernet) { > + kfree_skb(skb); > + goto end; > + } > + > + dev_forward_skb(peernet->loopback_dev, skb); > + put_net(peernet); > + goto end; > + } > + > skb_orphan(skb); > > /* Before queueing this packet to netif_rx(), > @@ -94,6 +109,7 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb, > u64_stats_update_end(&lb_stats->syncp); > } > > +end: > return NETDEV_TX_OK; > } > > diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h > index 918e03c1dafa..cc05ce3c1aae 100644 > --- a/include/net/lwtunnel.h > +++ b/include/net/lwtunnel.h > @@ -5,7 +5,9 @@ > #include <linux/netdevice.h> > #include <linux/skbuff.h> > #include <linux/types.h> > 
+#include <linux/net_namespace.h> > #include <net/route.h> > +#include <net/ip6_fib.h> > > #define LWTUNNEL_HASH_BITS 7 > #define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) > @@ -141,4 +143,25 @@ static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) > > #endif > > +static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate) > +{ > + return (u32 *)lwtstate->data; > +} > + > +static inline int skb_lwt_netns_info(struct sk_buff *skb) > +{ > + if (skb->protocol == htons(ETH_P_IP)) { > + struct rtable *rt = (struct rtable *)skb_dst(skb); > + > + if (rt && rt->rt_lwtstate) > + return *lwt_netns_info(rt->rt_lwtstate); > + } else if (skb->protocol == htons(ETH_P_IPV6)) { > + struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); > + > + if (rt6 && rt6->rt6i_lwtstate) > + return *lwt_netns_info(rt6->rt6i_lwtstate); > + } > + > + return NETNSA_NSID_NOT_ASSIGNED; > +} > #endif /* __NET_LWTUNNEL_H */ since these apis' don't have to be netns specific, Can they just be named lwtunnel_get_state_data and skb_lwtunnel_state ? and seems like they should be declared for both CONFIG_LWTUNNEL 'y' and 'n'. Thanks, Roopa -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le 23/07/2015 17:01, roopa a écrit : > On 7/23/15, 7:22 AM, Nicolas Dichtel wrote: [snip] >> +static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate) >> +{ >> + return (u32 *)lwtstate->data; >> +} >> + >> +static inline int skb_lwt_netns_info(struct sk_buff *skb) >> +{ >> + if (skb->protocol == htons(ETH_P_IP)) { >> + struct rtable *rt = (struct rtable *)skb_dst(skb); >> + >> + if (rt && rt->rt_lwtstate) >> + return *lwt_netns_info(rt->rt_lwtstate); >> + } else if (skb->protocol == htons(ETH_P_IPV6)) { >> + struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); >> + >> + if (rt6 && rt6->rt6i_lwtstate) >> + return *lwt_netns_info(rt6->rt6i_lwtstate); >> + } >> + >> + return NETNSA_NSID_NOT_ASSIGNED; >> +} >> #endif /* __NET_LWTUNNEL_H */ > since these apis' don't have to be netns specific, > Can they just be named lwtunnel_get_state_data and skb_lwtunnel_state ? They are specific to netns because lwtstate->data is interpreted as an u32 *. But I agree that a test is missing against lwtstate->type to ensure that data will be a nsid. > > and seems like they should be declared for both CONFIG_LWTUNNEL 'y' and 'n'. It is outside the "#ifdef CONFIG_LWTUNNEL". I can successfully compile with and without CONFIG_LWTUNNEL. Thank you, Nicolas -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 7/23/15, 8:25 AM, Nicolas Dichtel wrote: > Le 23/07/2015 17:01, roopa a écrit : >> On 7/23/15, 7:22 AM, Nicolas Dichtel wrote: > [snip] >>> +static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate) >>> +{ >>> + return (u32 *)lwtstate->data; >>> +} >>> + >>> +static inline int skb_lwt_netns_info(struct sk_buff *skb) >>> +{ >>> + if (skb->protocol == htons(ETH_P_IP)) { >>> + struct rtable *rt = (struct rtable *)skb_dst(skb); >>> + >>> + if (rt && rt->rt_lwtstate) >>> + return *lwt_netns_info(rt->rt_lwtstate); >>> + } else if (skb->protocol == htons(ETH_P_IPV6)) { >>> + struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); >>> + >>> + if (rt6 && rt6->rt6i_lwtstate) >>> + return *lwt_netns_info(rt6->rt6i_lwtstate); >>> + } >>> + >>> + return NETNSA_NSID_NOT_ASSIGNED; >>> +} >>> #endif /* __NET_LWTUNNEL_H */ >> since these apis' don't have to be netns specific, >> Can they just be named lwtunnel_get_state_data and skb_lwtunnel_state ? > They are specific to netns because lwtstate->data is interpreted as an > u32 *. > But I agree that a test is missing against lwtstate->type to ensure > that data > will be a nsid. > o ok..., the api's in lwtunnel.h today are not specific to an encap type. they are generic, so skb_lwtunnel_state() which returns struct lwtunnel_state could go here. the encap specific ones can go in the respective callers. Recently thomas added a similar skb_tunnel_info() for ip tunnels. I did like to have a generic version of your skb_lwt_netns_info in lwtunnel.h. I could use it in my mpls output func too. >> >> and seems like they should be declared for both CONFIG_LWTUNNEL 'y' >> and 'n'. > It is outside the "#ifdef CONFIG_LWTUNNEL". I can successfully compile > with and > without CONFIG_LWTUNNEL. ok, thanks, Roopa -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le 23/07/2015 17:50, roopa a écrit : > On 7/23/15, 8:25 AM, Nicolas Dichtel wrote: >> Le 23/07/2015 17:01, roopa a écrit : >>> On 7/23/15, 7:22 AM, Nicolas Dichtel wrote: >> [snip] >>>> +static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate) >>>> +{ >>>> + return (u32 *)lwtstate->data; >>>> +} >>>> + >>>> +static inline int skb_lwt_netns_info(struct sk_buff *skb) >>>> +{ >>>> + if (skb->protocol == htons(ETH_P_IP)) { >>>> + struct rtable *rt = (struct rtable *)skb_dst(skb); >>>> + >>>> + if (rt && rt->rt_lwtstate) >>>> + return *lwt_netns_info(rt->rt_lwtstate); >>>> + } else if (skb->protocol == htons(ETH_P_IPV6)) { >>>> + struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); >>>> + >>>> + if (rt6 && rt6->rt6i_lwtstate) >>>> + return *lwt_netns_info(rt6->rt6i_lwtstate); >>>> + } >>>> + >>>> + return NETNSA_NSID_NOT_ASSIGNED; >>>> +} >>>> #endif /* __NET_LWTUNNEL_H */ >>> since these apis' don't have to be netns specific, >>> Can they just be named lwtunnel_get_state_data and skb_lwtunnel_state ? >> They are specific to netns because lwtstate->data is interpreted as an u32 *. >> But I agree that a test is missing against lwtstate->type to ensure that data >> will be a nsid. >> > o ok..., the api's in lwtunnel.h today are not specific to an encap type. > they are generic, so skb_lwtunnel_state() which returns struct lwtunnel_state > could go here. > the encap specific ones can go in the respective callers. Recently thomas added > a similar > skb_tunnel_info() for ip tunnels. I did like to have a generic version of your > skb_lwt_netns_info in lwtunnel.h. I could use it in my mpls output func too. Sure, but my goal was to not create a new .h file just for these two helpers. It's related to lwtunnel, thus I was thinking they can go here. Regards, Nicolas -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 7/24/15, 5:24 AM, Nicolas Dichtel wrote: > Sure, but my goal was to not create a new .h file just for these two > helpers. > It's related to lwtunnel, thus I was thinking they can go here. ok..., since your lwt namespace functions went into net_namespace.c, I was thinking these should really go into net_namespace.h. Does that work for you ? If that does not, then yes, they could live here. Thanks, Roopa -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le 24/07/2015 15:50, roopa a écrit : > On 7/24/15, 5:24 AM, Nicolas Dichtel wrote: >> Sure, but my goal was to not create a new .h file just for these two helpers. >> It's related to lwtunnel, thus I was thinking they can go here. > ok..., since your lwt namespace functions went into net_namespace.c, I was thinking > these should really go into net_namespace.h. Does that work for you ? Not so easy, it's a chicken-and-egg problem. If I add this to net/net_namespace.h, I need to include net/lwtunnel.h, but that file already includes net/net_namespace.h (which is included directly or indirectly by most of the network headers). Regards, Nicolas -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 7/23/15 8:22 AM, Nicolas Dichtel wrote: > static netdev_tx_t loopback_xmit(struct sk_buff *skb, > struct net_device *dev) > { > + int nsid = skb_lwt_netns_info(skb); > struct pcpu_lstats *lb_stats; > int len; > > + if (nsid >= 0) { > + struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid); > + > + if (!peernet) { If nsid is > 0 then the peer namespace should exist right? So for this failure path why not increment tx_error stat? > + kfree_skb(skb); > + goto end; > + } > + > + dev_forward_skb(peernet->loopback_dev, skb); > + put_net(peernet); > + goto end; > + } > + > skb_orphan(skb); > > /* Before queueing this packet to netif_rx(), -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le 24/07/2015 16:28, David Ahern a écrit : > On 7/23/15 8:22 AM, Nicolas Dichtel wrote: >> static netdev_tx_t loopback_xmit(struct sk_buff *skb, >> struct net_device *dev) >> { >> + int nsid = skb_lwt_netns_info(skb); >> struct pcpu_lstats *lb_stats; >> int len; >> >> + if (nsid >= 0) { >> + struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid); >> + >> + if (!peernet) { > > If nsid is > 0 then the peer namespace should exist right? So for this failure > path why not increment tx_error stat? I was not sure about that, because before my patch we increment statistics only in case of NET_RX_SUCCESS. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 7/24/15 8:32 AM, Nicolas Dichtel wrote: > Le 24/07/2015 16:28, David Ahern a écrit : >> On 7/23/15 8:22 AM, Nicolas Dichtel wrote: >>> static netdev_tx_t loopback_xmit(struct sk_buff *skb, >>> struct net_device *dev) >>> { >>> + int nsid = skb_lwt_netns_info(skb); >>> struct pcpu_lstats *lb_stats; >>> int len; >>> >>> + if (nsid >= 0) { >>> + struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid); >>> + >>> + if (!peernet) { >> >> If nsid is > 0 then the peer namespace should exist right? So for this >> failure >> path why not increment tx_error stat? > I was not sure about that, because before my patch we increment > statistics only > in case of NET_RX_SUCCESS. In this case you are knowingly dropping packets. Would be nice to have a counter showing that. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le 24/07/2015 17:19, David Ahern a écrit : > In this case you are knowingly dropping packets. Would be nice to have a counter > showing that. Ok. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index c76283c2f84a..758d02f592f9 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -57,6 +57,7 @@ #include <linux/percpu.h> #include <net/net_namespace.h> #include <linux/u64_stats_sync.h> +#include <net/lwtunnel.h> struct pcpu_lstats { u64 packets; @@ -71,9 +72,23 @@ struct pcpu_lstats { static netdev_tx_t loopback_xmit(struct sk_buff *skb, struct net_device *dev) { + int nsid = skb_lwt_netns_info(skb); struct pcpu_lstats *lb_stats; int len; + if (nsid >= 0) { + struct net *peernet = get_net_ns_by_id(dev_net(dev), nsid); + + if (!peernet) { + kfree_skb(skb); + goto end; + } + + dev_forward_skb(peernet->loopback_dev, skb); + put_net(peernet); + goto end; + } + skb_orphan(skb); /* Before queueing this packet to netif_rx(), @@ -94,6 +109,7 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb, u64_stats_update_end(&lb_stats->syncp); } +end: return NETDEV_TX_OK; } diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 918e03c1dafa..cc05ce3c1aae 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -5,7 +5,9 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> +#include <linux/net_namespace.h> #include <net/route.h> +#include <net/ip6_fib.h> #define LWTUNNEL_HASH_BITS 7 #define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) @@ -141,4 +143,25 @@ static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) #endif +static inline u32 *lwt_netns_info(struct lwtunnel_state *lwtstate) +{ + return (u32 *)lwtstate->data; +} + +static inline int skb_lwt_netns_info(struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IP)) { + struct rtable *rt = (struct rtable *)skb_dst(skb); + + if (rt && rt->rt_lwtstate) + return *lwt_netns_info(rt->rt_lwtstate); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb); + + if (rt6 && rt6->rt6i_lwtstate) + return 
*lwt_netns_info(rt6->rt6i_lwtstate); + } + + return NETNSA_NSID_NOT_ASSIGNED; +} #endif /* __NET_LWTUNNEL_H */ diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 31377bbea3f8..6715e7a1b335 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -7,6 +7,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_NONE, LWTUNNEL_ENCAP_MPLS, LWTUNNEL_ENCAP_IP, + LWTUNNEL_ENCAP_NETNS, __LWTUNNEL_ENCAP_MAX, }; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 2c2eb1b629b1..c1267aac373d 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -20,6 +20,7 @@ #include <net/netlink.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/lwtunnel.h> /* * Our network namespace constructor/destructor lists @@ -725,6 +726,56 @@ out: rtnl_set_sk_err(net, RTNLGRP_NSID, err); } +static int lwt_netns_build_state(struct net_device *dev, struct nlattr *nla, + struct lwtunnel_state **ts) +{ + struct nlattr *tb[NETNSA_MAX + 1]; + struct lwtunnel_state *newts; + int *nsid; + int ret; + + ret = nla_parse_nested(tb, NETNSA_MAX, nla, rtnl_net_policy); + if (ret < 0) + return ret; + + if (!tb[NETNSA_NSID]) + return -EINVAL; + + newts = lwtunnel_state_alloc(sizeof(*nsid)); + if (!newts) + return -ENOMEM; + + newts->len = sizeof(*nsid); + nsid = lwt_netns_info(newts); + *nsid = nla_get_s32(tb[NETNSA_NSID]); + newts->type = LWTUNNEL_ENCAP_NETNS; + + *ts = newts; + return 0; +} + +static int lwt_netns_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + int *nsid = lwt_netns_info(lwtstate); + + if (nla_put_s32(skb, NETNSA_NSID, *nsid)) + return -ENOMEM; + + return 0; +} + +static int lwt_netns_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(4); /* NETNSA_NSID */ +} + +static const struct lwtunnel_encap_ops lwt_netns_ops = { + .build_state = lwt_netns_build_state, + .fill_encap = lwt_netns_fill_encap_info, + .get_encap_size = 
lwt_netns_encap_nlsize, +}; + static int __init net_ns_init(void) { struct net_generic *ng; @@ -762,6 +813,7 @@ static int __init net_ns_init(void) rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid, NULL); + lwtunnel_encap_add_ops(&lwt_netns_ops, LWTUNNEL_ENCAP_NETNS); return 0; }
This patch takes advantage of the newly added lwtunnel framework to allow the user to set routes that point to a peer netns. Packets are injected into the peer netns via the loopback device. This works only when the output device is 'lo'. Example: ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com> --- drivers/net/loopback.c | 16 +++++++++++++ include/net/lwtunnel.h | 23 +++++++++++++++++++ include/uapi/linux/lwtunnel.h | 1 + net/core/net_namespace.c | 52 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+)