diff mbox

switching network namespace midway

Message ID 20121102022542.GD18091@kvack.org
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Benjamin LaHaise Nov. 2, 2012, 2:25 a.m. UTC
On Thu, Oct 25, 2012 at 09:15:34AM -0700, Eric W. Biederman wrote:
> > I've read IPv4 gre code, and it appears to do the right thing on rx, but it 
> > does *not* appear to handle namespaces correctly on transmit.  In general, 
> > I would expect pretty much all code to get namespace handling correct for 
> > the rx case.  I'll have a closer look at fixing this tomorrow if nobody else 
> > beats me to it.
> 
> It will be interesting to see what you come up with. 

Well, I finally had some time to work on the ip_gre module a bit today, 
and here's what I came up with.  The basic idea is to store the network 
namespace in the ip_tunnel structure at creation time for use in sending 
and receiving packets, allowing the gre network device to be safely moved 
into another network namespace.  This works for me in moving a gre tunnel 
into an lxc container, and survives module unload and namespace 
destruction.  I'll try to spend a bit more time adding similar support to 
the other ip_tunnel devices over the next few days.  Comments/thoughts?

		-ben

Comments

Eric W. Biederman Nov. 2, 2012, 6:18 a.m. UTC | #1
Benjamin LaHaise <bcrl@kvack.org> writes:

> On Thu, Oct 25, 2012 at 09:15:34AM -0700, Eric W. Biederman wrote:
>> > I've read IPv4 gre code, and it appears to do the right thing on rx, but it 
>> > does *not* appear to handle namespaces correctly on transmit.  In general, 
>> > I would expect pretty much all code to get namespace handling correct for 
>> > the rx case.  I'll have a closer look at fixing this tomorrow if nobody else 
>> > beats me to it.
>> 
>> It will be interesting to see what you come up with. 
>
> Well, I finally had some time to work on the ip_gre module a bit today, 
> and here's what I came up with.  The basic idea is to store the network 
> namespace in the ip_tunnel structure at creation time for use in sending 
> and receiving packets, allowing the gre network device to be safely moved 
> into another network namespace.  This works for me in moving a gre tunnel 
> into an lxc container, and survives module unload and namespace 
> destruction.  I'll try to spend a bit more time adding similar support to 
> the other ip_tunnel devices over the next few days.  Comments/thoughts?
>
> 		-ben

You need a per network namespace exit function to delete the tunnel when
the xmit direction goes away.  Otherwise we have a very nasty race if
the original network namespace exits.

NETNS_LOCAL may make sense on the reference device that is used to
support ioctls for creating devices.

ipgre_open ?  It looks like it needs to be handled.  Probably that
ip_route_output_gre needs to be moved.

ipv6?

Eric

> -- 
> "Thought is the essence of where you are now."
> 
> 
> diff --git a/include/net/ipip.h b/include/net/ipip.h
> index ddc077c..9cfba92 100644
> --- a/include/net/ipip.h
> +++ b/include/net/ipip.h
> @@ -19,6 +19,7 @@ struct ip_tunnel_6rd_parm {
>  struct ip_tunnel {
>  	struct ip_tunnel __rcu	*next;
>  	struct net_device	*dev;
> +	struct net		*net;		/* Namespace for packet i/o */
>  
>  	int			err_count;	/* Number of arrived ICMP errors */
>  	unsigned long		err_time;	/* Time when the last ICMP error arrived */
> diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
> index 7240f8e..705dc66 100644
> --- a/net/ipv4/ip_gre.c
> +++ b/net/ipv4/ip_gre.c
> @@ -461,6 +461,7 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
>  	dev_net_set(dev, net);
>  
>  	nt = netdev_priv(dev);
> +	nt->net = net;
>  	nt->parms = *parms;
>  	dev->rtnl_link_ops = &ipgre_link_ops;
>  
> @@ -484,8 +485,10 @@ failed_free:
>  
>  static void ipgre_tunnel_uninit(struct net_device *dev)
>  {
> -	struct net *net = dev_net(dev);
> -	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
> +	struct ip_tunnel *tunnel = netdev_priv(dev);
> +	struct ipgre_net *ign;
> +
> +	ign = net_generic(tunnel->net, ipgre_net_id);
>  
>  	ipgre_tunnel_unlink(ign, netdev_priv(dev));
>  	dev_put(dev);
> @@ -837,7 +840,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
>  			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
>  	}
>  
> -	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
> +	rt = ip_route_output_gre(tunnel->net, &fl4, dst, tiph->saddr,
>  				 tunnel->parms.o_key, RT_TOS(tos),
>  				 tunnel->parms.link);
>  	if (IS_ERR(rt)) {
> @@ -1010,7 +1013,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
>  		struct flowi4 fl4;
>  		struct rtable *rt;
>  
> -		rt = ip_route_output_gre(dev_net(dev), &fl4,
> +		rt = ip_route_output_gre(tunnel->net, &fl4,
>  					 iph->daddr, iph->saddr,
>  					 tunnel->parms.o_key,
>  					 RT_TOS(iph->tos),
> @@ -1341,7 +1344,6 @@ static void ipgre_tunnel_setup(struct net_device *dev)
>  	dev->flags		= IFF_NOARP;
>  	dev->iflink		= 0;
>  	dev->addr_len		= 4;
> -	dev->features		|= NETIF_F_NETNS_LOCAL;
>  	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
>  
>  	dev->features		|= GRE_FEATURES;
> @@ -1432,6 +1434,7 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
>  static int __net_init ipgre_init_net(struct net *net)
>  {
>  	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
> +	struct ip_tunnel *tunnel;
>  	int err;
>  
>  	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
> @@ -1445,6 +1448,9 @@ static int __net_init ipgre_init_net(struct net *net)
>  	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
>  	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
>  
> +	tunnel = netdev_priv(ign->fb_tunnel_dev);
> +	tunnel->net = net;
> +
>  	if ((err = register_netdev(ign->fb_tunnel_dev)))
>  		goto err_reg_dev;
>  
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin LaHaise Nov. 2, 2012, 2:03 p.m. UTC | #2
On Thu, Nov 01, 2012 at 11:18:58PM -0700, Eric W. Biederman wrote:
> You need a per network namespace exit function to delete the tunnel when
> the xmit direction goes away.  Otherwise we have a very nasty race if
> the original network namespace exits.

That already exists as ipgre_exit_net().  Since the ip_tunnel structure 
remains hashed in the network namespace that creation occurred in, this 
case should be covered.

> NETNS_LOCAL may make sense on the reference device that is used to
> support ioctls for creating devices.

*nod*  That makes sense.

> ipgre_open ?  It looks like it needs to be handled.  Probably that
> ip_route_output_gre needs to be moved.

Good catch.  Will respin with that changed.

> ipv6?

That's next on the list.  There are also issues with ipip, ipmr and 
ipvti, as well as their ipv6 versions.

		-ben
Eric W. Biederman Nov. 2, 2012, 8:45 p.m. UTC | #3
Benjamin LaHaise <bcrl@kvack.org> writes:

> On Thu, Nov 01, 2012 at 11:18:58PM -0700, Eric W. Biederman wrote:
>> You need a per network namespace exit function to delete the tunnel when
>> the xmit direction goes away.  Otherwise we have a very nasty race if
>> the original network namespace exits.
>
> That already exists as ipgre_exit_net().  Since the ip_tunnel structure 
> remains hashed in the network namespace that creation occurred in, this 
> case should be covered.

*blink*  I had looked for that, but I definitely missed that one.

The consequence of the design where we can run ioctls on network devices
we can't even see but whose ipaddrs are in our network namespace is
interesting.  Correct but interesting.

>> NETNS_LOCAL may make sense on the reference device that is used to
>> support ioctls for creating devices.
>
> *nod*  That makes sense.

After a second look.  fb_tunnel_dev is very special in the code so
allowing fb_tunnel_dev to change network namespace is almost certain
to create problems, so it should get a NETNS_LOCAL.

>> ipgre_open ?  It looks like it needs to be handled.  Probably that
>> ip_route_output_gre needs to be moved.
>
> Good catch.  Will respin with that changed.

I'm not seeing anything else but I will look again after you respin.

>> ipv6?
>
> That's next on the list.  There are also issues with ipip, ipmr and 
> ipvti, as well as their ipv6 versions.

I don't see sense in ip multicast routing tunnels (ipmr) changing
network namespaces as they are essentially aliases for other network
devices, and aren't really proper tunnels.

Eric
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Nicolas Dichtel June 24, 2013, 2:13 p.m. UTC | #4
This patch is a follow up of the thread "switching network namespace midway":
http://marc.info/?t=135101459500004&r=1&w=2

The goal of this serie is to add x-netns support for the module sit, ie the
encapsulation addresses and the network device are not owned by the same
namespace.

Example to configure a tunnel:

modprobe sit
ip netns add netns1
ip link add sit1 type sit remote 10.16.0.121 local 10.16.0.249
ip l s sit1 netns netns1
ip netns exec netns1 ip l s lo up
ip netns exec netns1 ip l s sit1 up
ip netns exec netns1 ip a a dev sit1 192.168.2.123 remote 192.168.2.121
ip netns exec netns1 ip -6 a a dev sit1 2001:1234::123 remote 2001:1234::121

I sent this serie as an RFC to get feedback from people. Once this serie is
approved, I will add the same feature for the module ipip and ip6_tunnel.

 include/linux/netdevice.h |  1 +
 include/net/ip_tunnels.h  |  1 +
 net/core/dev.c            | 34 ++++++++++++++++++++++----------
 net/ipv4/ip_tunnel.c      |  7 ++++++-
 net/ipv6/sit.c            | 49 ++++++++++++++++++++++++-----------------------
 5 files changed, 57 insertions(+), 35 deletions(-)

Comments are welcome.

Regards,
Nicolas
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/ipip.h b/include/net/ipip.h
index ddc077c..9cfba92 100644
--- a/include/net/ipip.h
+++ b/include/net/ipip.h
@@ -19,6 +19,7 @@  struct ip_tunnel_6rd_parm {
 struct ip_tunnel {
 	struct ip_tunnel __rcu	*next;
 	struct net_device	*dev;
+	struct net		*net;		/* Namespace for packet i/o */
 
 	int			err_count;	/* Number of arrived ICMP errors */
 	unsigned long		err_time;	/* Time when the last ICMP error arrived */
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 7240f8e..705dc66 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -461,6 +461,7 @@  static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
 	dev_net_set(dev, net);
 
 	nt = netdev_priv(dev);
+	nt->net = net;
 	nt->parms = *parms;
 	dev->rtnl_link_ops = &ipgre_link_ops;
 
@@ -484,8 +485,10 @@  failed_free:
 
 static void ipgre_tunnel_uninit(struct net_device *dev)
 {
-	struct net *net = dev_net(dev);
-	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+	struct ipgre_net *ign;
+
+	ign = net_generic(tunnel->net, ipgre_net_id);
 
 	ipgre_tunnel_unlink(ign, netdev_priv(dev));
 	dev_put(dev);
@@ -837,7 +840,7 @@  static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
 	}
 
-	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
+	rt = ip_route_output_gre(tunnel->net, &fl4, dst, tiph->saddr,
 				 tunnel->parms.o_key, RT_TOS(tos),
 				 tunnel->parms.link);
 	if (IS_ERR(rt)) {
@@ -1010,7 +1013,7 @@  static int ipgre_tunnel_bind_dev(struct net_device *dev)
 		struct flowi4 fl4;
 		struct rtable *rt;
 
-		rt = ip_route_output_gre(dev_net(dev), &fl4,
+		rt = ip_route_output_gre(tunnel->net, &fl4,
 					 iph->daddr, iph->saddr,
 					 tunnel->parms.o_key,
 					 RT_TOS(iph->tos),
@@ -1341,7 +1344,6 @@  static void ipgre_tunnel_setup(struct net_device *dev)
 	dev->flags		= IFF_NOARP;
 	dev->iflink		= 0;
 	dev->addr_len		= 4;
-	dev->features		|= NETIF_F_NETNS_LOCAL;
 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
 
 	dev->features		|= GRE_FEATURES;
@@ -1432,6 +1434,7 @@  static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
 static int __net_init ipgre_init_net(struct net *net)
 {
 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
+	struct ip_tunnel *tunnel;
 	int err;
 
 	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
@@ -1445,6 +1448,9 @@  static int __net_init ipgre_init_net(struct net *net)
 	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
 	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
 
+	tunnel = netdev_priv(ign->fb_tunnel_dev);
+	tunnel->net = net;
+
 	if ((err = register_netdev(ign->fb_tunnel_dev)))
 		goto err_reg_dev;