diff mbox

netns: unix: only allow to find out unix socket in same net namespace

Message ID 1377059473-25526-1-git-send-email-gaofeng@cn.fujitsu.com
State Rejected, archived
Delegated to: David Miller
Headers show

Commit Message

Gao feng Aug. 21, 2013, 4:31 a.m. UTC
Unix sockets are private resources of a net namespace;
allowing one net namespace to access another netns's unix
sockets is meaningless.

I'm researching a problem about shutdown from a container:
if the container shares the same file /run/systemd/private
with the host, then when we run shutdown -h xxx in the container, the
shutdown message will be sent to systemd-shutdownd
through the unix socket /run/systemd/private, and because
systemd-shutdownd is running in the host, the host
will end up being shut down.

We should make sure unix sockets are per net namespace to
avoid this problem.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/unix/af_unix.c |  8 ++++++--
 net/unix/diag.c    | 11 ++++++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

Comments

Gao feng Aug. 21, 2013, 4:58 a.m. UTC | #1
cc containers@lists.linux-foundation.org

On 08/21/2013 12:31 PM, Gao feng wrote:
> Unix sockets are private resources of net namespace,
> allowing one net namespace to access to other netns's unix
> sockets is meaningless.
> 
> I'm researching a problem about shutdown from container,
> if the cotainer shares the same file /run/systemd/private
> with host, when we run shutdown -h xxx in container, the
> shutdown message will be send to the systemd-shutdownd
> through unix socket /run/systemd/private, and because
> systemd-shutdownd is running in host, so finally, the host
> will become shutdown.
> 
> We should make sure unix sockets are per net namespace to
> avoid this problem.
> 
> Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
> ---
>  net/unix/af_unix.c |  8 ++++++--
>  net/unix/diag.c    | 11 ++++++++---
>  2 files changed, 14 insertions(+), 5 deletions(-)
> 
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index c4ce243..98e3689 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -295,7 +295,8 @@ static inline struct sock *unix_find_socket_byname(struct net *net,
>  	return s;
>  }
>  
> -static struct sock *unix_find_socket_byinode(struct inode *i)
> +static struct sock *unix_find_socket_byinode(struct net *net,
> +					     struct inode *i)
>  {
>  	struct sock *s;
>  
> @@ -304,6 +305,9 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
>  		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
>  		struct dentry *dentry = unix_sk(s)->path.dentry;
>  
> +		if (!net_eq(sock_net(s), net))
> +			continue;
> +
>  		if (dentry && dentry->d_inode == i) {
>  			sock_hold(s);
>  			goto found;
> @@ -784,7 +788,7 @@ static struct sock *unix_find_other(struct net *net,
>  		err = -ECONNREFUSED;
>  		if (!S_ISSOCK(inode->i_mode))
>  			goto put_fail;
> -		u = unix_find_socket_byinode(inode);
> +		u = unix_find_socket_byinode(net, inode);
>  		if (!u)
>  			goto put_fail;
>  
> diff --git a/net/unix/diag.c b/net/unix/diag.c
> index d591091..80ada12 100644
> --- a/net/unix/diag.c
> +++ b/net/unix/diag.c
> @@ -218,20 +218,25 @@ done:
>  	return skb->len;
>  }
>  
> -static struct sock *unix_lookup_by_ino(int ino)
> +static struct sock *unix_lookup_by_ino(struct net *net, int ino)
>  {
>  	int i;
>  	struct sock *sk;
>  
>  	spin_lock(&unix_table_lock);
>  	for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) {
> -		sk_for_each(sk, &unix_socket_table[i])
> +		sk_for_each(sk, &unix_socket_table[i]) {
> +
> +			if (!net_eq(sock_net(sk), net))
> +				continue;
> +
>  			if (ino == sock_i_ino(sk)) {
>  				sock_hold(sk);
>  				spin_unlock(&unix_table_lock);
>  
>  				return sk;
>  			}
> +		}
>  	}
>  
>  	spin_unlock(&unix_table_lock);
> @@ -251,7 +256,7 @@ static int unix_diag_get_exact(struct sk_buff *in_skb,
>  	if (req->udiag_ino == 0)
>  		goto out_nosk;
>  
> -	sk = unix_lookup_by_ino(req->udiag_ino);
> +	sk = unix_lookup_by_ino(net, req->udiag_ino);
>  	err = -ENOENT;
>  	if (sk == NULL)
>  		goto out_nosk;
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman Aug. 21, 2013, 5:30 a.m. UTC | #2
Gao feng <gaofeng@cn.fujitsu.com> writes:

> Unix sockets are private resources of net namespace,
> allowing one net namespace to access to other netns's unix
> sockets is meaningless.

Allowing one net namespace to access another netns's unix socket is
deliberate behavior.  This is a desired and useful feature, and
only a misconfiguration of visible files would allow this to be a
problem.

> I'm researching a problem about shutdown from container,
> if the cotainer shares the same file /run/systemd/private
> with host, when we run shutdown -h xxx in container, the
> shutdown message will be send to the systemd-shutdownd
> through unix socket /run/systemd/private, and because
> systemd-shutdownd is running in host, so finally, the host
> will become shutdown.

The simple answer is don't do that then.  I can see no reason
to share /run outside of the container unless you want this kind of
behavior.

Quite frankly I want this behavior if I am using network namespaces
to support multiple routing contexts. That is if I am using scripts
like:

ip netns add other
ip netns exec other script

I don't want to have to remember to say 
ip netns orig exec shutdown -h now

There are more compelling uses and there is no cost in supporting this
in the kernel.

What kind of misconfiguration caused someone to complain about this?


> We should make sure unix sockets are per net namespace to
> avoid this problem.

Nacked-by: "Eric W. Biederman" <ebiederm@xmission.com>


> Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
> ---
>  net/unix/af_unix.c |  8 ++++++--
>  net/unix/diag.c    | 11 ++++++++---
>  2 files changed, 14 insertions(+), 5 deletions(-)
>
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index c4ce243..98e3689 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -295,7 +295,8 @@ static inline struct sock *unix_find_socket_byname(struct net *net,
>  	return s;
>  }
>  
> -static struct sock *unix_find_socket_byinode(struct inode *i)
> +static struct sock *unix_find_socket_byinode(struct net *net,
> +					     struct inode *i)
>  {
>  	struct sock *s;
>  
> @@ -304,6 +305,9 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
>  		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
>  		struct dentry *dentry = unix_sk(s)->path.dentry;
>  
> +		if (!net_eq(sock_net(s), net))
> +			continue;
> +
>  		if (dentry && dentry->d_inode == i) {
>  			sock_hold(s);
>  			goto found;
> @@ -784,7 +788,7 @@ static struct sock *unix_find_other(struct net *net,
>  		err = -ECONNREFUSED;
>  		if (!S_ISSOCK(inode->i_mode))
>  			goto put_fail;
> -		u = unix_find_socket_byinode(inode);
> +		u = unix_find_socket_byinode(net, inode);
>  		if (!u)
>  			goto put_fail;
>  
> diff --git a/net/unix/diag.c b/net/unix/diag.c
> index d591091..80ada12 100644
> --- a/net/unix/diag.c
> +++ b/net/unix/diag.c
> @@ -218,20 +218,25 @@ done:
>  	return skb->len;
>  }
>  
> -static struct sock *unix_lookup_by_ino(int ino)
> +static struct sock *unix_lookup_by_ino(struct net *net, int ino)
>  {
>  	int i;
>  	struct sock *sk;
>  
>  	spin_lock(&unix_table_lock);
>  	for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) {
> -		sk_for_each(sk, &unix_socket_table[i])
> +		sk_for_each(sk, &unix_socket_table[i]) {
> +
> +			if (!net_eq(sock_net(sk), net))
> +				continue;
> +
>  			if (ino == sock_i_ino(sk)) {
>  				sock_hold(sk);
>  				spin_unlock(&unix_table_lock);
>  
>  				return sk;
>  			}
> +		}
>  	}
>  
>  	spin_unlock(&unix_table_lock);
> @@ -251,7 +256,7 @@ static int unix_diag_get_exact(struct sk_buff *in_skb,
>  	if (req->udiag_ino == 0)
>  		goto out_nosk;
>  
> -	sk = unix_lookup_by_ino(req->udiag_ino);
> +	sk = unix_lookup_by_ino(net, req->udiag_ino);
>  	err = -ENOENT;
>  	if (sk == NULL)
>  		goto out_nosk;
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gao feng Aug. 21, 2013, 6:54 a.m. UTC | #3
cc libvirt-list

On 08/21/2013 01:30 PM, Eric W. Biederman wrote:
> Gao feng <gaofeng@cn.fujitsu.com> writes:
> 
>> Unix sockets are private resources of net namespace,
>> allowing one net namespace to access to other netns's unix
>> sockets is meaningless.
> 
> Allowing one net namespace to access another netns's unix socket is
> deliberate behavior.  This is a desired and useful feature, and
> only a misconfiguration of visible files would allow this to be a
> problem.
> 
>> I'm researching a problem about shutdown from container,
>> if the cotainer shares the same file /run/systemd/private
>> with host, when we run shutdown -h xxx in container, the
>> shutdown message will be send to the systemd-shutdownd
>> through unix socket /run/systemd/private, and because
>> systemd-shutdownd is running in host, so finally, the host
>> will become shutdown.
> 
> The simple answer is don't do that then.  I can see no reason
> to share /run outside of the container unless you want this kind of
> behavior.
> 
> Quite frankly I want this behavior if I am using network namespaces
> to support multiple routing contexts. That is if I am using scripts
> like:
> 
> ip netns add other
> ip netns exec other script
> 
> I don't want to have to remember to say 
> ip netns orig exec shutdown -h now
> 
> There are more compelling uses and there is no cost in supporting this
> in the kernel.
> 
> What kind of misconfiguration caused someone to complain about this?
> 

libvirt lxc allows user to set up a container which shares the same root
directory with host.

seems like the unix sockets whose sun_path is an abstract socket address
are net namespace aware.

Should we use "abstract" type of address instead of a file system pathname
for systemd in this case?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric W. Biederman Aug. 21, 2013, 7:06 a.m. UTC | #4
Gao feng <gaofeng@cn.fujitsu.com> writes:

> cc libvirt-list
>
> On 08/21/2013 01:30 PM, Eric W. Biederman wrote:
>> Gao feng <gaofeng@cn.fujitsu.com> writes:
>> 
>>> Unix sockets are private resources of net namespace,
>>> allowing one net namespace to access to other netns's unix
>>> sockets is meaningless.
>> 
>> Allowing one net namespace to access another netns's unix socket is
>> deliberate behavior.  This is a desired and useful feature, and
>> only a misconfiguration of visible files would allow this to be a
>> problem.
>> 
>>> I'm researching a problem about shutdown from container,
>>> if the cotainer shares the same file /run/systemd/private
>>> with host, when we run shutdown -h xxx in container, the
>>> shutdown message will be send to the systemd-shutdownd
>>> through unix socket /run/systemd/private, and because
>>> systemd-shutdownd is running in host, so finally, the host
>>> will become shutdown.
>> 
>> The simple answer is don't do that then.  I can see no reason
>> to share /run outside of the container unless you want this kind of
>> behavior.
>> 
>> Quite frankly I want this behavior if I am using network namespaces
>> to support multiple routing contexts. That is if I am using scripts
>> like:
>> 
>> ip netns add other
>> ip netns exec other script
>> 
>> I don't want to have to remember to say 
>> ip netns orig exec shutdown -h now
>> 
>> There are more compelling uses and there is no cost in supporting this
>> in the kernel.
>> 
>> What kind of misconfiguration caused someone to complain about this?
>> 
>
> libvirt lxc allows user to set up a container which shares the same root
> directory with host.
>
> seems like the unix sockets whose sun_path is an abstract socket address
> are net namespace aware.
>
> Should we use "abstract" type of address instead of a file system pathname
> for systemd in this case?

I suspect libvirt should simply not share /run or any other normally
writable directory with the host.  Sharing /run /var/run or even /tmp
seems extremely dubious if you want some kind of containment, and
without strange things spilling through.

Eric

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gao feng Aug. 21, 2013, 7:22 a.m. UTC | #5
On 08/21/2013 03:06 PM, Eric W. Biederman wrote:
> Gao feng <gaofeng@cn.fujitsu.com> writes:
> 
>> cc libvirt-list
>>
>> On 08/21/2013 01:30 PM, Eric W. Biederman wrote:
>>> Gao feng <gaofeng@cn.fujitsu.com> writes:
>>>
>>>> Unix sockets are private resources of net namespace,
>>>> allowing one net namespace to access to other netns's unix
>>>> sockets is meaningless.
>>>
>>> Allowing one net namespace to access another netns's unix socket is
>>> deliberate behavior.  This is a desired and useful feature, and
>>> only a misconfiguration of visible files would allow this to be a
>>> problem.
>>>
>>>> I'm researching a problem about shutdown from container,
>>>> if the cotainer shares the same file /run/systemd/private
>>>> with host, when we run shutdown -h xxx in container, the
>>>> shutdown message will be send to the systemd-shutdownd
>>>> through unix socket /run/systemd/private, and because
>>>> systemd-shutdownd is running in host, so finally, the host
>>>> will become shutdown.
>>>
>>> The simple answer is don't do that then.  I can see no reason
>>> to share /run outside of the container unless you want this kind of
>>> behavior.
>>>
>>> Quite frankly I want this behavior if I am using network namespaces
>>> to support multiple routing contexts. That is if I am using scripts
>>> like:
>>>
>>> ip netns add other
>>> ip netns exec other script
>>>
>>> I don't want to have to remember to say 
>>> ip netns orig exec shutdown -h now
>>>
>>> There are more compelling uses and there is no cost in supporting this
>>> in the kernel.
>>>
>>> What kind of misconfiguration caused someone to complain about this?
>>>
>>
>> libvirt lxc allows user to set up a container which shares the same root
>> directory with host.
>>
>> seems like the unix sockets whose sun_path is an abstract socket address
>> are net namespace aware.
>>
>> Should we use "abstract" type of address instead of a file system pathname
>> for systemd in this case?
> 
> I suspect libvirt should simply not share /run or any other normally
> writable directory with the host.  Sharing /run /var/run or even /tmp
> seems extremely dubious if you want some kind of containment, and
> without strange things spilling through.
> 

right now I have only taken note of the unix socket /run/systemd/private,
but there may be many similar unix sockets, and they can exist in any
path. the strange problems will still happen.

anyway, I will send a patch to setup a fresh tmpfs for the /run directory of
container first.

Eric, Thanks for your help!
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kay Sievers Aug. 21, 2013, 9:51 a.m. UTC | #6
On Wed, Aug 21, 2013 at 9:22 AM, Gao feng <gaofeng@cn.fujitsu.com> wrote:
> On 08/21/2013 03:06 PM, Eric W. Biederman wrote:

>> I suspect libvirt should simply not share /run or any other normally
>> writable directory with the host.  Sharing /run /var/run or even /tmp
>> seems extremely dubious if you want some kind of containment, and
>> without strange things spilling through.

Right, /run or /var cannot be shared. It's not only about sockets,
many other things will also go really wrong that way.

> right now I only take note of the unix socket /run/systemd/private,
> but there may have many similar unix sockets, they can exist in any
> path. the strange problems will still happen.
>
> anyway, I will send a patch to setup a fresh tmpfs for the /run directory of
> container first.

This is what systemd-nspawn does for a container setup:
  http://cgit.freedesktop.org/systemd/systemd/tree/src/nspawn/nspawn.c#n350

Kay
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel P. Berrangé Aug. 21, 2013, 9:56 a.m. UTC | #7
On Wed, Aug 21, 2013 at 11:51:53AM +0200, Kay Sievers wrote:
> On Wed, Aug 21, 2013 at 9:22 AM, Gao feng <gaofeng@cn.fujitsu.com> wrote:
> > On 08/21/2013 03:06 PM, Eric W. Biederman wrote:
> 
> >> I suspect libvirt should simply not share /run or any other normally
> >> writable directory with the host.  Sharing /run /var/run or even /tmp
> >> seems extremely dubious if you want some kind of containment, and
> >> without strange things spilling through.
> 
> Right, /run or /var cannot be shared. It's not only about sockets,
> many other things will also go really wrong that way.

Libvirt already allows the app defining the container config to
set private mounts for any directory including /run and /var.

If an admin or app wants to run systemd inside a container, it is
their responsibility to ensure they setup the filesystem in a
suitable manner. Libvirt is not going to enforce use of a private
/run or /var, since that's a policy decision for a specific
use case.


Daniel
Eric W. Biederman Aug. 21, 2013, 10:42 a.m. UTC | #8
Gao feng <gaofeng@cn.fujitsu.com> writes:

> right now I only take note of the unix socket /run/systemd/private,
> but there may have many similar unix sockets, they can exist in any
> path. the strange problems will still happen.

It could just as easily have been a fifo in the filesystem, and the
result would have been the same.

The network namespace are all about communicating between network
namespaces and that is what was allowed here.

If you don't want a socket or a fifo or any other file to be used by a
container don't give it access to it.  It really is that simple.

Eric
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gao feng Aug. 22, 2013, 1:36 a.m. UTC | #9
On 08/21/2013 06:42 PM, Eric W. Biederman wrote:
> Gao feng <gaofeng@cn.fujitsu.com> writes:
> 
>> right now I only take note of the unix socket /run/systemd/private,
>> but there may have many similar unix sockets, they can exist in any
>> path. the strange problems will still happen.
> 
> It could just as easily have been a fifo in the filesystem, and the
> result would have been the same.
> 
> The network namespace are all about communicating between network
> namespaces and that is what was allowed here.
> 
> If you don't want a socket or a fifo or any other file to be used by a
> container don't give it access to it.  It really is that simple.
> 

Hmm, I tend to think you are right...

Thanks!
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Aug. 25, 2013, 5:16 p.m. UTC | #10
On Wed, 2013-08-21 at 11:51 +0200, Kay Sievers wrote:
> On Wed, Aug 21, 2013 at 9:22 AM, Gao feng <gaofeng@cn.fujitsu.com> wrote:
> > On 08/21/2013 03:06 PM, Eric W. Biederman wrote:
> 
> >> I suspect libvirt should simply not share /run or any other normally
> >> writable directory with the host.  Sharing /run /var/run or even /tmp
> >> seems extremely dubious if you want some kind of containment, and
> >> without strange things spilling through.
> 
> Right, /run or /var cannot be shared. It's not only about sockets,
> many other things will also go really wrong that way.

This is very narrow thinking about what a container might be and will
cause trouble as people start to create novel uses for containers in the
cloud if you try to impose this on our current infrastructure.

One of the cgroup only container uses we see at Parallels (so no
separate filesystem and no net namespaces) is pure apache load balancer
type shared hosting.  In this scenario, base apache is effectively
brought up in the host environment, but then spawned instances are
resource limited using cgroups according to what the customer has paid.
Obviously all apache instances are sharing /var and /run from the host
(mostly for logging and pid storage and static pages).  The reason some
hosters do this is that it allows much higher density simple web serving
(either static pages from quota limited chroots or dynamic pages limited
by database space constraints) because each "instance" shares so much
from the host.  The service is obviously much more basic than giving
each customer a container running apache, but it's much easier for the
hoster to administer and it serves the customer just as well for a large
cross section of use cases and for those it doesn't serve, the hoster
usually has separate container hosting (for a higher price, of course).

James

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Kay Sievers Aug. 25, 2013, 5:37 p.m. UTC | #11
On Sun, Aug 25, 2013 at 7:16 PM, James Bottomley
<jbottomley@parallels.com> wrote:
> On Wed, 2013-08-21 at 11:51 +0200, Kay Sievers wrote:
>> On Wed, Aug 21, 2013 at 9:22 AM, Gao feng <gaofeng@cn.fujitsu.com> wrote:
>> > On 08/21/2013 03:06 PM, Eric W. Biederman wrote:
>>
>> >> I suspect libvirt should simply not share /run or any other normally
>> >> writable directory with the host.  Sharing /run /var/run or even /tmp
>> >> seems extremely dubious if you want some kind of containment, and
>> >> without strange things spilling through.
>>
>> Right, /run or /var cannot be shared. It's not only about sockets,
>> many other things will also go really wrong that way.
>
> This is very narrow thinking about what a container might be and will
> cause trouble as people start to create novel uses for containers in the
> cloud if you try to impose this on our current infrastructure.
>
> One of the cgroup only container uses we see at Parallels (so no
> separate filesystem and no net namespaces) is pure apache load balancer
> type shared hosting.  In this scenario, base apache is effectively
> brought up in the host environment, but then spawned instances are
> resource limited using cgroups according to what the customer has paid.
> Obviously all apache instances are sharing /var and /run from the host
> (mostly for logging and pid storage and static pages).  The reason some
> hosters do this is that it allows much higher density simple web serving
> (either static pages from quota limited chroots or dynamic pages limited
> by database space constraints) because each "instance" shares so much
> from the host.  The service is obviously much more basic than giving
> each customer a container running apache, but it's much easier for the
> hoster to administer and it serves the customer just as well for a large
> cross section of use cases and for those it doesn't serve, the hoster
> usually has separate container hosting (for a higher price, of course).

The "container" as we talk about it has its own init, and no, it cannot
share /var or /run.

The stuff you talk about has nothing to do with that, it's not
different from all services or a multi-instantiated service on the
host sharing the same /run and /var.

Kay
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Aug. 25, 2013, 6:16 p.m. UTC | #12
On Sun, 2013-08-25 at 19:37 +0200, Kay Sievers wrote:
> On Sun, Aug 25, 2013 at 7:16 PM, James Bottomley
> <jbottomley@parallels.com> wrote:
> > On Wed, 2013-08-21 at 11:51 +0200, Kay Sievers wrote:
> >> On Wed, Aug 21, 2013 at 9:22 AM, Gao feng <gaofeng@cn.fujitsu.com> wrote:
> >> > On 08/21/2013 03:06 PM, Eric W. Biederman wrote:
> >>
> >> >> I suspect libvirt should simply not share /run or any other normally
> >> >> writable directory with the host.  Sharing /run /var/run or even /tmp
> >> >> seems extremely dubious if you want some kind of containment, and
> >> >> without strange things spilling through.
> >>
> >> Right, /run or /var cannot be shared. It's not only about sockets,
> >> many other things will also go really wrong that way.
> >
> > This is very narrow thinking about what a container might be and will
> > cause trouble as people start to create novel uses for containers in the
> > cloud if you try to impose this on our current infrastructure.
> >
> > One of the cgroup only container uses we see at Parallels (so no
> > separate filesystem and no net namespaces) is pure apache load balancer
> > type shared hosting.  In this scenario, base apache is effectively
> > brought up in the host environment, but then spawned instances are
> > resource limited using cgroups according to what the customer has paid.
> > Obviously all apache instances are sharing /var and /run from the host
> > (mostly for logging and pid storage and static pages).  The reason some
> > hosters do this is that it allows much higher density simple web serving
> > (either static pages from quota limited chroots or dynamic pages limited
> > by database space constraints) because each "instance" shares so much
> > from the host.  The service is obviously much more basic than giving
> > each customer a container running apache, but it's much easier for the
> > hoster to administer and it serves the customer just as well for a large
> > cross section of use cases and for those it doesn't serve, the hoster
> > usually has separate container hosting (for a higher price, of course).
> 
> The "container" as we talk about has it's own init, and no, it cannot
> share /var or /run.

This is what we would call an IaaS container: bringing up init and
effectively a new OS inside a container is the closest containers come
to being like hypervisors.  It's the most common use case of Parallels
containers in the field, so I'm certainly not telling you it's a bad
idea.

> The stuff you talk about has nothing to do with that, it's not
> different from all services or a multi-instantiated service on the
> host sharing the same /run and /var.

I gave you one example: a really simplistic one.  A more sophisticated
example is a PaaS or SaaS container where you bring the OS up in the
host but spawn a particular application into its own container (this is
essentially similar to what Docker does).  Often in this case, you do
add separate mount and network namespaces to make the application
isolated and migrateable with its own IP address.  The reason you share
init and most of the OS from the host is for elasticity and density,
which are fast becoming a holy grail type quest of cloud orchestration
systems: if you don't have to bring up the OS from init and you can just
start the application from a C/R image (orders of magnitude smaller than
a full system image) and slap on the necessary namespaces as you clone
it, you have something that comes online in miliseconds which is a feat
no hypervisor based virtualisation can match.

I'm not saying don't pursue the IaaS case, it's definitely useful ...
I'm just saying it would be a serious mistake to think that's the only
use case for containers and we certainly shouldn't adjust Linux to serve
only that use case.

James

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gao feng Aug. 26, 2013, 1:06 a.m. UTC | #13
On 08/26/2013 02:16 AM, James Bottomley wrote:
> On Sun, 2013-08-25 at 19:37 +0200, Kay Sievers wrote:
>> On Sun, Aug 25, 2013 at 7:16 PM, James Bottomley
>> <jbottomley@parallels.com> wrote:
>>> On Wed, 2013-08-21 at 11:51 +0200, Kay Sievers wrote:
>>>> On Wed, Aug 21, 2013 at 9:22 AM, Gao feng <gaofeng@cn.fujitsu.com> wrote:
>>>>> On 08/21/2013 03:06 PM, Eric W. Biederman wrote:
>>>>
>>>>>> I suspect libvirt should simply not share /run or any other normally
>>>>>> writable directory with the host.  Sharing /run /var/run or even /tmp
>>>>>> seems extremely dubious if you want some kind of containment, and
>>>>>> without strange things spilling through.
>>>>
>>>> Right, /run or /var cannot be shared. It's not only about sockets,
>>>> many other things will also go really wrong that way.
>>>
>>> This is very narrow thinking about what a container might be and will
>>> cause trouble as people start to create novel uses for containers in the
>>> cloud if you try to impose this on our current infrastructure.
>>>
>>> One of the cgroup only container uses we see at Parallels (so no
>>> separate filesystem and no net namespaces) is pure apache load balancer
>>> type shared hosting.  In this scenario, base apache is effectively
>>> brought up in the host environment, but then spawned instances are
>>> resource limited using cgroups according to what the customer has paid.
>>> Obviously all apache instances are sharing /var and /run from the host
>>> (mostly for logging and pid storage and static pages).  The reason some
>>> hosters do this is that it allows much higher density simple web serving
>>> (either static pages from quota limited chroots or dynamic pages limited
>>> by database space constraints) because each "instance" shares so much
>>> from the host.  The service is obviously much more basic than giving
>>> each customer a container running apache, but it's much easier for the
>>> hoster to administer and it serves the customer just as well for a large
>>> cross section of use cases and for those it doesn't serve, the hoster
>>> usually has separate container hosting (for a higher price, of course).
>>
>> The "container" as we talk about has it's own init, and no, it cannot
>> share /var or /run.
> 
> This is what we would call an IaaS container: bringing up init and
> effectively a new OS inside a container is the closest containers come
> to being like hypervisors.  It's the most common use case of Parallels
> containers in the field, so I'm certainly not telling you it's a bad
> idea.
> 
>> The stuff you talk about has nothing to do with that, it's not
>> different from all services or a multi-instantiated service on the
>> host sharing the same /run and /var.
> 
> I gave you one example: a really simplistic one.  A more sophisticated
> example is a PaaS or SaaS container where you bring the OS up in the
> host but spawn a particular application into its own container (this is
> essentially similar to what Docker does).  Often in this case, you do
> add separate mount and network namespaces to make the application
> isolated and migrateable with its own IP address.  The reason you share
> init and most of the OS from the host is for elasticity and density,
> which are fast becoming a holy grail type quest of cloud orchestration
> systems: if you don't have to bring up the OS from init and you can just
> start the application from a C/R image (orders of magnitude smaller than
> a full system image) and slap on the necessary namespaces as you clone
> it, you have something that comes online in miliseconds which is a feat
> no hypervisor based virtualisation can match.
> 
> I'm not saying don't pursue the IaaS case, it's definitely useful ...
> I'm just saying it would be a serious mistake to think that's the only
> use case for containers and we certainly shouldn't adjust Linux to serve
> only that use case.
>

The feature you said above vs. the container-reboot-host bug: I prefer to fix
the bug. And this feature can be achieved even if the container unshares the /run
directory from the host by default; for libvirt, the user can set the container
configuration to make the container share the /run directory with the host.

I would like to say, the reboot-from-container bug is more urgent and
needs to be fixed.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Aug. 26, 2013, 3:19 a.m. UTC | #14
On Mon, 2013-08-26 at 09:06 +0800, Gao feng wrote:
> On 08/26/2013 02:16 AM, James Bottomley wrote:
> > On Sun, 2013-08-25 at 19:37 +0200, Kay Sievers wrote:
> >> On Sun, Aug 25, 2013 at 7:16 PM, James Bottomley
> >> <jbottomley@parallels.com> wrote:
> >>> On Wed, 2013-08-21 at 11:51 +0200, Kay Sievers wrote:
> >>>> On Wed, Aug 21, 2013 at 9:22 AM, Gao feng <gaofeng@cn.fujitsu.com> wrote:
> >>>>> On 08/21/2013 03:06 PM, Eric W. Biederman wrote:
> >>>>
> >>>>>> I suspect libvirt should simply not share /run or any other normally
> >>>>>> writable directory with the host.  Sharing /run /var/run or even /tmp
> >>>>>> seems extremely dubious if you want some kind of containment, and
> >>>>>> without strange things spilling through.
> >>>>
> >>>> Right, /run or /var cannot be shared. It's not only about sockets,
> >>>> many other things will also go really wrong that way.
> >>>
> >>> This is very narrow thinking about what a container might be and will
> >>> cause trouble as people start to create novel uses for containers in the
> >>> cloud if you try to impose this on our current infrastructure.
> >>>
> >>> One of the cgroup only container uses we see at Parallels (so no
> >>> separate filesystem and no net namespaces) is pure apache load balancer
> >>> type shared hosting.  In this scenario, base apache is effectively
> >>> brought up in the host environment, but then spawned instances are
> >>> resource limited using cgroups according to what the customer has paid.
> >>> Obviously all apache instances are sharing /var and /run from the host
> >>> (mostly for logging and pid storage and static pages).  The reason some
> >>> hosters do this is that it allows much higher density simple web serving
> >>> (either static pages from quota limited chroots or dynamic pages limited
> >>> by database space constraints) because each "instance" shares so much
> >>> from the host.  The service is obviously much more basic than giving
> >>> each customer a container running apache, but it's much easier for the
> >>> hoster to administer and it serves the customer just as well for a large
> >>> cross section of use cases and for those it doesn't serve, the hoster
> >>> usually has separate container hosting (for a higher price, of course).
> >>
> >> The "container" as we talk about has it's own init, and no, it cannot
> >> share /var or /run.
> > 
> > This is what we would call an IaaS container: bringing up init and
> > effectively a new OS inside a container is the closest containers come
> > to being like hypervisors.  It's the most common use case of Parallels
> > containers in the field, so I'm certainly not telling you it's a bad
> > idea.
> > 
> >> The stuff you talk about has nothing to do with that, it's not
> >> different from all services or a multi-instantiated service on the
> >> host sharing the same /run and /var.
> > 
> > I gave you one example: a really simplistic one.  A more sophisticated
> > example is a PaaS or SaaS container where you bring the OS up in the
> > host but spawn a particular application into its own container (this is
> > essentially similar to what Docker does).  Often in this case, you do
> > add separate mount and network namespaces to make the application
> > isolated and migrateable with its own IP address.  The reason you share
> > init and most of the OS from the host is for elasticity and density,
> > which are fast becoming a holy grail type quest of cloud orchestration
> > systems: if you don't have to bring up the OS from init and you can just
> > start the application from a C/R image (orders of magnitude smaller than
> > a full system image) and slap on the necessary namespaces as you clone
> > it, you have something that comes online in miliseconds which is a feat
> > no hypervisor based virtualisation can match.
> > 
> > I'm not saying don't pursue the IaaS case, it's definitely useful ...
> > I'm just saying it would be a serious mistake to think that's the only
> > use case for containers and we certainly shouldn't adjust Linux to serve
> > only that use case.
> >
> 
> The feature you said above VS contianer-reboot-host bug, I prefer to
> fix
> the bug.

What bug?

>  and this feature can be achieved even container unshares /run
> directory
> with host by default, for libvirt, user can set the container
> configuration to
> make the container shares the /run directory with host.
> 
> I would like to say, the reboot from container bug is more urgent and
> need
> to be fixed.

Are you talking about the old bug where trying to reboot an lxc
container from within it would reboot the entire system?  If so, OpenVZ
has never suffered from that problem and I thought it was fixed
upstream.  I've not tested lxc tools, but the latest vzctl from the
openvz website will bring up a container on the vanilla 3.9 kernel
(provided you have USER_NS compiled in) can also be used to reboot the
container, so I see no reason it wouldn't work for lxc as well.

James

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gao feng Aug. 26, 2013, 3:35 a.m. UTC | #15
On 08/26/2013 11:19 AM, James Bottomley wrote:
> On Mon, 2013-08-26 at 09:06 +0800, Gao feng wrote:
>> On 08/26/2013 02:16 AM, James Bottomley wrote:
>>> On Sun, 2013-08-25 at 19:37 +0200, Kay Sievers wrote:
>>>> On Sun, Aug 25, 2013 at 7:16 PM, James Bottomley
>>>> <jbottomley@parallels.com> wrote:
>>>>> On Wed, 2013-08-21 at 11:51 +0200, Kay Sievers wrote:
>>>>>> On Wed, Aug 21, 2013 at 9:22 AM, Gao feng <gaofeng@cn.fujitsu.com> wrote:
>>>>>>> On 08/21/2013 03:06 PM, Eric W. Biederman wrote:
>>>>>>
>>>>>>>> I suspect libvirt should simply not share /run or any other normally
>>>>>>>> writable directory with the host.  Sharing /run /var/run or even /tmp
>>>>>>>> seems extremely dubious if you want some kind of containment, and
>>>>>>>> without strange things spilling through.
>>>>>>
>>>>>> Right, /run or /var cannot be shared. It's not only about sockets,
>>>>>> many other things will also go really wrong that way.
>>>>>
>>>>> This is very narrow thinking about what a container might be and will
>>>>> cause trouble as people start to create novel uses for containers in the
>>>>> cloud if you try to impose this on our current infrastructure.
>>>>>
>>>>> One of the cgroup only container uses we see at Parallels (so no
>>>>> separate filesystem and no net namespaces) is pure apache load balancer
>>>>> type shared hosting.  In this scenario, base apache is effectively
>>>>> brought up in the host environment, but then spawned instances are
>>>>> resource limited using cgroups according to what the customer has paid.
>>>>> Obviously all apache instances are sharing /var and /run from the host
>>>>> (mostly for logging and pid storage and static pages).  The reason some
>>>>> hosters do this is that it allows much higher density simple web serving
>>>>> (either static pages from quota limited chroots or dynamic pages limited
>>>>> by database space constraints) because each "instance" shares so much
>>>>> from the host.  The service is obviously much more basic than giving
>>>>> each customer a container running apache, but it's much easier for the
>>>>> hoster to administer and it serves the customer just as well for a large
>>>>> cross section of use cases and for those it doesn't serve, the hoster
>>>>> usually has separate container hosting (for a higher price, of course).
>>>>
>>>> The "container" as we talk about has it's own init, and no, it cannot
>>>> share /var or /run.
>>>
>>> This is what we would call an IaaS container: bringing up init and
>>> effectively a new OS inside a container is the closest containers come
>>> to being like hypervisors.  It's the most common use case of Parallels
>>> containers in the field, so I'm certainly not telling you it's a bad
>>> idea.
>>>
>>>> The stuff you talk about has nothing to do with that, it's not
>>>> different from all services or a multi-instantiated service on the
>>>> host sharing the same /run and /var.
>>>
>>> I gave you one example: a really simplistic one.  A more sophisticated
>>> example is a PaaS or SaaS container where you bring the OS up in the
>>> host but spawn a particular application into its own container (this is
>>> essentially similar to what Docker does).  Often in this case, you do
>>> add separate mount and network namespaces to make the application
>>> isolated and migrateable with its own IP address.  The reason you share
>>> init and most of the OS from the host is for elasticity and density,
>>> which are fast becoming a holy grail type quest of cloud orchestration
>>> systems: if you don't have to bring up the OS from init and you can just
>>> start the application from a C/R image (orders of magnitude smaller than
>>> a full system image) and slap on the necessary namespaces as you clone
>>> it, you have something that comes online in miliseconds which is a feat
>>> no hypervisor based virtualisation can match.
>>>
>>> I'm not saying don't pursue the IaaS case, it's definitely useful ...
>>> I'm just saying it would be a serious mistake to think that's the only
>>> use case for containers and we certainly shouldn't adjust Linux to serve
>>> only that use case.
>>>
>>
>> The feature you said above VS contianer-reboot-host bug, I prefer to
>> fix
>> the bug.
> 
> What bug?
> 
>>  and this feature can be achieved even container unshares /run
>> directory
>> with host by default, for libvirt, user can set the container
>> configuration to
>> make the container shares the /run directory with host.
>>
>> I would like to say, the reboot from container bug is more urgent and
>> need
>> to be fixed.
> 
> Are you talking about the old bug where trying to reboot an lxc
> container from within it would reboot the entire system? 

Yes, we are discussing this problem in this whole thread.

 If so, OpenVZ
> has never suffered from that problem and I thought it was fixed
> upstream.  I've not tested lxc tools, but the latest vzctl from the
> openvz website will bring up a container on the vanilla 3.9 kernel
> (provided you have USER_NS compiled in) can also be used to reboot the
> container, so I see no reason it wouldn't work for lxc as well.
> 

I'm using libvirt lxc, not lxc-tools.
Not all users enable user namespaces. I trust these container management
tools can have the right/proper settings to keep this reboot problem from
occurring, but I don't think this reboot problem can never happen in any
configuration.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Aug. 26, 2013, 3:53 a.m. UTC | #16
On Mon, 2013-08-26 at 11:35 +0800, Gao feng wrote:
> On 08/26/2013 11:19 AM, James Bottomley wrote:
> > Yes, we are discussing this problem in this whole thread.

I wasn't really watching that bit, since the problem looks solved to me.
I was just reacting against the unfortunate notion that a container
should run init.

>  If so, OpenVZ
> > has never suffered from that problem and I thought it was fixed
> > upstream.  I've not tested lxc tools, but the latest vzctl from the
> > openvz website will bring up a container on the vanilla 3.9 kernel
> > (provided you have USER_NS compiled in) can also be used to reboot the
> > container, so I see no reason it wouldn't work for lxc as well.
> > 
> 
> I'm using libvirt lxc not lxc-tools.
> Not all of users enable user namespace, I trust these container
> management
> tools can have right/proper setting which inhibit this reboot-problem
> occur.
> but I don't think this reboot-problem won't happen in any
> configuration.

It sounds like you're setting up your containers wrongly.  If a
container can reboot the system it means that host root capabilities
have leaked into the container, which is a big security no-no.  The
upstream way of avoiding this is USER_NS (because root in the container
is now not root in the host).  The OpenVZ kernel uses a different
mechanism to solve the problem, but we think USER_NS is the better way
to go on this.

James

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Serge E. Hallyn Aug. 26, 2013, 1:53 p.m. UTC | #17
Quoting Gao feng (gaofeng@cn.fujitsu.com):
> On 08/26/2013 11:19 AM, James Bottomley wrote:
> > On Mon, 2013-08-26 at 09:06 +0800, Gao feng wrote:
> >> On 08/26/2013 02:16 AM, James Bottomley wrote:
> >>> On Sun, 2013-08-25 at 19:37 +0200, Kay Sievers wrote:
> >>>> On Sun, Aug 25, 2013 at 7:16 PM, James Bottomley
> >>>> <jbottomley@parallels.com> wrote:
> >>>>> On Wed, 2013-08-21 at 11:51 +0200, Kay Sievers wrote:
> >>>>>> On Wed, Aug 21, 2013 at 9:22 AM, Gao feng <gaofeng@cn.fujitsu.com> wrote:
> >>>>>>> On 08/21/2013 03:06 PM, Eric W. Biederman wrote:
> >>>>>>
> >>>>>>>> I suspect libvirt should simply not share /run or any other normally
> >>>>>>>> writable directory with the host.  Sharing /run /var/run or even /tmp
> >>>>>>>> seems extremely dubious if you want some kind of containment, and
> >>>>>>>> without strange things spilling through.
> >>>>>>
> >>>>>> Right, /run or /var cannot be shared. It's not only about sockets,
> >>>>>> many other things will also go really wrong that way.
> >>>>>
> >>>>> This is very narrow thinking about what a container might be and will
> >>>>> cause trouble as people start to create novel uses for containers in the
> >>>>> cloud if you try to impose this on our current infrastructure.
> >>>>>
> >>>>> One of the cgroup only container uses we see at Parallels (so no
> >>>>> separate filesystem and no net namespaces) is pure apache load balancer
> >>>>> type shared hosting.  In this scenario, base apache is effectively
> >>>>> brought up in the host environment, but then spawned instances are
> >>>>> resource limited using cgroups according to what the customer has paid.
> >>>>> Obviously all apache instances are sharing /var and /run from the host
> >>>>> (mostly for logging and pid storage and static pages).  The reason some
> >>>>> hosters do this is that it allows much higher density simple web serving
> >>>>> (either static pages from quota limited chroots or dynamic pages limited
> >>>>> by database space constraints) because each "instance" shares so much
> >>>>> from the host.  The service is obviously much more basic than giving
> >>>>> each customer a container running apache, but it's much easier for the
> >>>>> hoster to administer and it serves the customer just as well for a large
> >>>>> cross section of use cases and for those it doesn't serve, the hoster
> >>>>> usually has separate container hosting (for a higher price, of course).
> >>>>
> >>>> The "container" as we talk about has it's own init, and no, it cannot
> >>>> share /var or /run.
> >>>
> >>> This is what we would call an IaaS container: bringing up init and
> >>> effectively a new OS inside a container is the closest containers come
> >>> to being like hypervisors.  It's the most common use case of Parallels
> >>> containers in the field, so I'm certainly not telling you it's a bad
> >>> idea.
> >>>
> >>>> The stuff you talk about has nothing to do with that, it's not
> >>>> different from all services or a multi-instantiated service on the
> >>>> host sharing the same /run and /var.
> >>>
> >>> I gave you one example: a really simplistic one.  A more sophisticated
> >>> example is a PaaS or SaaS container where you bring the OS up in the
> >>> host but spawn a particular application into its own container (this is
> >>> essentially similar to what Docker does).  Often in this case, you do
> >>> add separate mount and network namespaces to make the application
> >>> isolated and migrateable with its own IP address.  The reason you share
> >>> init and most of the OS from the host is for elasticity and density,
> >>> which are fast becoming a holy grail type quest of cloud orchestration
> >>> systems: if you don't have to bring up the OS from init and you can just
> >>> start the application from a C/R image (orders of magnitude smaller than
> >>> a full system image) and slap on the necessary namespaces as you clone
> >>> it, you have something that comes online in miliseconds which is a feat
> >>> no hypervisor based virtualisation can match.
> >>>
> >>> I'm not saying don't pursue the IaaS case, it's definitely useful ...
> >>> I'm just saying it would be a serious mistake to think that's the only
> >>> use case for containers and we certainly shouldn't adjust Linux to serve
> >>> only that use case.
> >>>
> >>
> >> The feature you said above VS contianer-reboot-host bug, I prefer to
> >> fix
> >> the bug.
> > 
> > What bug?
> > 
> >>  and this feature can be achieved even container unshares /run
> >> directory
> >> with host by default, for libvirt, user can set the container
> >> configuration to
> >> make the container shares the /run directory with host.
> >>
> >> I would like to say, the reboot from container bug is more urgent and
> >> need
> >> to be fixed.
> > 
> > Are you talking about the old bug where trying to reboot an lxc
> > container from within it would reboot the entire system? 
> 
> Yes, we are discussing this problem in this whole thread.
> 
>  If so, OpenVZ
> > has never suffered from that problem and I thought it was fixed
> > upstream.  I've not tested lxc tools, but the latest vzctl from the
> > openvz website will bring up a container on the vanilla 3.9 kernel
> > (provided you have USER_NS compiled in) can also be used to reboot the
> > container, so I see no reason it wouldn't work for lxc as well.
> > 
> 
> I'm using libvirt lxc not lxc-tools.
> Not all of users enable user namespace, I trust these container management
> tools can have right/proper setting which inhibit this reboot-problem occur.
> but I don't think this reboot-problem won't happen in any configuration.

On any recent kernel, reboot syscall from inside a non-init pid-ns will
not reboot the host.  If from within a non-init pid-ns you are managing
to reboot the host, then you have a problem with how userspace is set
up.  The container is being allowed to request init on the host to
do the reboot - i.e. by sharing the /dev/initctl inode with the host, or by
being in same net namespace as upstart on the host.

The fact that it's possible to create such containers is not a bug.

(On older kernels, you have to drop CAP_SYS_BOOT to prevent use of
reboot system call, as all lxc-like programs did.)

-serge
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c4ce243..98e3689 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -295,7 +295,8 @@  static inline struct sock *unix_find_socket_byname(struct net *net,
 	return s;
 }
 
-static struct sock *unix_find_socket_byinode(struct inode *i)
+static struct sock *unix_find_socket_byinode(struct net *net,
+					     struct inode *i)
 {
 	struct sock *s;
 
@@ -304,6 +305,9 @@  static struct sock *unix_find_socket_byinode(struct inode *i)
 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 		struct dentry *dentry = unix_sk(s)->path.dentry;
 
+		if (!net_eq(sock_net(s), net))
+			continue;
+
 		if (dentry && dentry->d_inode == i) {
 			sock_hold(s);
 			goto found;
@@ -784,7 +788,7 @@  static struct sock *unix_find_other(struct net *net,
 		err = -ECONNREFUSED;
 		if (!S_ISSOCK(inode->i_mode))
 			goto put_fail;
-		u = unix_find_socket_byinode(inode);
+		u = unix_find_socket_byinode(net, inode);
 		if (!u)
 			goto put_fail;
 
diff --git a/net/unix/diag.c b/net/unix/diag.c
index d591091..80ada12 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -218,20 +218,25 @@  done:
 	return skb->len;
 }
 
-static struct sock *unix_lookup_by_ino(int ino)
+static struct sock *unix_lookup_by_ino(struct net *net, int ino)
 {
 	int i;
 	struct sock *sk;
 
 	spin_lock(&unix_table_lock);
 	for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) {
-		sk_for_each(sk, &unix_socket_table[i])
+		sk_for_each(sk, &unix_socket_table[i]) {
+
+			if (!net_eq(sock_net(sk), net))
+				continue;
+
 			if (ino == sock_i_ino(sk)) {
 				sock_hold(sk);
 				spin_unlock(&unix_table_lock);
 
 				return sk;
 			}
+		}
 	}
 
 	spin_unlock(&unix_table_lock);
@@ -251,7 +256,7 @@  static int unix_diag_get_exact(struct sk_buff *in_skb,
 	if (req->udiag_ino == 0)
 		goto out_nosk;
 
-	sk = unix_lookup_by_ino(req->udiag_ino);
+	sk = unix_lookup_by_ino(net, req->udiag_ino);
 	err = -ENOENT;
 	if (sk == NULL)
 		goto out_nosk;