diff mbox

[net-next] netns: correctly use per-netns ipv4 sysctl_tcp_mem

Message ID 50079D47.6040001@huawei.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Huang Qiang July 19, 2012, 5:38 a.m. UTC
From: Yang Zhenzhang <yangzhenzhang@huawei.com>

Now, kernel allows each net namespace to independently set up its levels
for tcp memory pressure thresholds.

But there seems to be a bug, which can be reproduced with the following steps:

[root@host socket]# lxc-start -n test -f config /bin/bash
[root@net-test socket]# ip route add default via 192.168.58.2
[root@net-test socket]# echo 0 0 0 > /proc/sys/net/ipv4/tcp_mem
[root@net-test socket]# scp root@192.168.58.174:/home/tcp_mem_test .

and it can still transfer the "tcp_mem_test" file, which we expected it
would not be able to do.

It's because inet_init() (net/ipv4/af_inet.c) initializes tcp_prot.sysctl_mem:
tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;

So when the protocol is TCP, sk->sk_prot->sysctl_mem (see the following code)
always uses the ipv4 sysctl_tcp_mem of the init_net namespace rather than
that of its own net namespace.
This patch simply sets "prot" to net->ipv4.sysctl_tcp_mem when
the protocol type is TCP.

Signed-off-by: Yang Zhenzhang <yangzhenzhang@huawei.com>
Signed-off-by: Huang Qiang <h.huangqiang@huawei.com>
---
 include/net/sock.h |   24 ++++++++++++++++--------
 1 files changed, 16 insertions(+), 8 deletions(-)

Comments

Eric Dumazet July 19, 2012, 6:03 a.m. UTC | #1
On Thu, 2012-07-19 at 13:38 +0800, Huang Qiang wrote:
> From: Yang Zhenzhang <yangzhenzhang@huawei.com>
> 
> Now, kernel allows each net namespace to independently set up its levels
> for tcp memory pressure thresholds.
> 
> But it seems there is a bug, as using the following steps:
> 
> [root@host socket]# lxc-start -n test -f config /bin/bash
> [root@net-test socket]# ip route add default via 192.168.58.2
> [root@net-test socket]# echo 0 0 0 > /proc/sys/net/ipv4/tcp_mem
> [root@net-test socket]# scp root@192.168.58.174:/home/tcp_mem_test .
> 
> and it still can transport the "tcp_mem_test" file which we hope it
> would not.
> 
> It's because inet_init() (net/ipv4/af_inet.c) initializes tcp_prot.sysctl_mem:
> tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
> 
> So when the protocol is TCP, sk->sk_prot->sysctl_mem (see the following code)
> always uses the ipv4 sysctl_tcp_mem of the init_net namespace rather than
> that of its own net namespace.
> This patch simply sets "prot" to net->ipv4.sysctl_tcp_mem when
> the protocol type is TCP.
> 
> Signed-off-by: Yang Zhenzhang <yangzhenzhang@huawei.com>
> Signed-off-by: Huang Qiang <h.huangqiang@huawei.com>
> ---
>  include/net/sock.h |   24 ++++++++++++++++--------
>  1 files changed, 16 insertions(+), 8 deletions(-)
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 88de092..61f4363 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -59,6 +59,7 @@
>  #include <linux/static_key.h>
>  #include <linux/aio.h>
>  #include <linux/sched.h>
> +#include <linux/in.h>
> 
>  #include <linux/filter.h>
>  #include <linux/rculist_nulls.h>
> @@ -1064,14 +1065,6 @@ static inline void sk_enter_memory_pressure(struct sock *sk)
>  	sk->sk_prot->enter_memory_pressure(sk);
>  }
> 
> -static inline long sk_prot_mem_limits(const struct sock *sk, int index)
> -{
> -	long *prot = sk->sk_prot->sysctl_mem;
> -	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
> -		prot = sk->sk_cgrp->sysctl_mem;
> -	return prot[index];
> -}
> -
>  static inline void memcg_memory_allocated_add(struct cg_proto *prot,
>  					      unsigned long amt,
>  					      int *parent_status)
> @@ -2155,6 +2148,21 @@ static inline void sk_change_net(struct sock *sk, struct net *net)
>  	sock_net_set(sk, hold_net(net));
>  }
> 
> +static inline long sk_prot_mem_limits(const struct sock *sk, int index)
> +{
> +	long *prot = sk->sk_prot->sysctl_mem;
> +
> +	if (sk->sk_protocol == IPPROTO_TCP) {
> +		struct net *net = sock_net(sk);
> +		prot = net->ipv4.sysctl_tcp_mem;
> +	}
> +

	if (sk->sk_protocol == IPPROTO_TCP)
		prot = sock_net(sk)->ipv4.sysctl_tcp_mem;

> +	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
> +		prot = sk->sk_cgrp->sysctl_mem;
> +
> +	return prot[index];
> +}
> +
>  static inline struct sock *skb_steal_sock(struct sk_buff *skb)
>  {
>  	if (skb->sk) {


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Glauber Costa July 25, 2012, 12:45 p.m. UTC | #2
Hi,


On 07/19/2012 10:03 AM, Eric Dumazet wrote:
> On Thu, 2012-07-19 at 13:38 +0800, Huang Qiang wrote:
>> From: Yang Zhenzhang <yangzhenzhang@huawei.com>
>>
>> Now, kernel allows each net namespace to independently set up its levels
>> for tcp memory pressure thresholds.

Not really.

So the real limitation here is done by the memory controller in cgroup,
not the proc files. AFAIK, lxc does not (yet) touch that file by
default, but it does create a memcg placeholder for your container, where
you can set that yourself.

cgroups are outside the realm of the admin, however. So once the
limitation is in place, you might want to restrain them further,
and that's the role of the files in /proc.

The goal is to have something that is as close as possible to a real
system in a container, where an admin could freely set this. (but of
course, never going over its allowance)

You can note this by what reads in sysctl_ipv4.c, when that file is
written to:

#ifdef CONFIG_MEMCG_KMEM
        rcu_read_lock();
        memcg = mem_cgroup_from_task(current);

        tcp_prot_mem(memcg, vec[0], 0);
        tcp_prot_mem(memcg, vec[1], 1);
        tcp_prot_mem(memcg, vec[2], 2);
        rcu_read_unlock();
#endif

This function is defined in tcp_memcontrol.c

/*
 * tcp_prot_mem - store one TCP memory threshold for a memory cgroup.
 * @memcg: memory cgroup whose TCP state is looked up
 * @val:   value to store
 * @idx:   slot in the tcp_prot_mem[] array to overwrite
 *
 * Looks up the cg_proto for @memcg through tcp_prot.proto_cgroup() and
 * writes @val into tcp_prot_mem[@idx] of the associated tcp_memcontrol.
 * Does nothing if no cg_proto is returned for @memcg.
 *
 * NOTE(review): @idx is not range-checked here; the call site quoted in
 * this thread only passes 0, 1 and 2 -- confirm against the array size.
 */
void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx)
{
        struct tcp_memcontrol *tcp;
        struct cg_proto *cg_proto;

        cg_proto = tcp_prot.proto_cgroup(memcg);
        /* No cg_proto for this memcg: nothing to update. */
        if (!cg_proto)
                return;

        tcp = tcp_from_cgproto(cg_proto);

        /* Overwrite the selected slot with the new value. */
        tcp->tcp_prot_mem[idx] = val;
}

tcp_prot_mem[] ends up being the vector you access as:

	prot = sk->sk_cgrp->sysctl_mem;

in the function you patch.

I hope it helps.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/sock.h b/include/net/sock.h
index 88de092..61f4363 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -59,6 +59,7 @@ 
 #include <linux/static_key.h>
 #include <linux/aio.h>
 #include <linux/sched.h>
+#include <linux/in.h>

 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
@@ -1064,14 +1065,6 @@  static inline void sk_enter_memory_pressure(struct sock *sk)
 	sk->sk_prot->enter_memory_pressure(sk);
 }

-static inline long sk_prot_mem_limits(const struct sock *sk, int index)
-{
-	long *prot = sk->sk_prot->sysctl_mem;
-	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		prot = sk->sk_cgrp->sysctl_mem;
-	return prot[index];
-}
-
 static inline void memcg_memory_allocated_add(struct cg_proto *prot,
 					      unsigned long amt,
 					      int *parent_status)
@@ -2155,6 +2148,21 @@  static inline void sk_change_net(struct sock *sk, struct net *net)
 	sock_net_set(sk, hold_net(net));
 }

+static inline long sk_prot_mem_limits(const struct sock *sk, int index)
+{
+	long *prot = sk->sk_prot->sysctl_mem;
+
+	if (sk->sk_protocol == IPPROTO_TCP) {
+		struct net *net = sock_net(sk);
+		prot = net->ipv4.sysctl_tcp_mem;
+	}
+
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		prot = sk->sk_cgrp->sysctl_mem;
+
+	return prot[index];
+}
+
 static inline struct sock *skb_steal_sock(struct sk_buff *skb)
 {
 	if (skb->sk) {