
per-cgroup tcp buffer limitation

Message ID: 1315276556-10970-1-git-send-email-glommer@parallels.com
State: Changes Requested, archived
Delegated to: David Miller

Commit Message

Glauber Costa Sept. 6, 2011, 2:35 a.m. UTC
This patch introduces per-cgroup tcp buffer limitation. This allows
sysadmins to specify a maximum amount of kernel memory that
tcp connections can use at any point in time. TCP is the main interest
in this work, but extending it to other protocols would be easy.

It piggybacks on the memory control mechanism already present in
/proc/sys/net/ipv4/tcp_mem. There is a soft limit and a hard limit
that will suppress allocation when reached. For each cgroup, however,
the file kmem.tcp_maxmem will be used to cap those values.

The usage I have in mind here is containers. Each container will
define its own values for soft and hard limits, but none of them can
possibly be bigger than the value the box's sysadmin specified from
the outside.

To test for any performance impacts of this patch, I used netperf's
TCP_RR benchmark on localhost, so we can have both recv and snd in action.

The command line used was ./src/netperf -t TCP_RR -H localhost, and the
results were:

Without the patch
=================

Socket Size   Request  Resp.   Elapsed  Trans.
Send   Recv   Size     Size    Time     Rate
bytes  Bytes  bytes    bytes   secs.    per sec

16384  87380  1        1       10.00    26996.35
16384  87380

With the patch
===============

Local /Remote
Socket Size   Request  Resp.   Elapsed  Trans.
Send   Recv   Size     Size    Time     Rate
bytes  Bytes  bytes    bytes   secs.    per sec

16384  87380  1        1       10.00    27291.86
16384  87380

The difference is within a one-percent range.

Nesting cgroups doesn't seem to be the dominating factor either,
with nestings up to 10 levels not showing a significant performance
difference.

Comments

KAMEZAWA Hiroyuki Sept. 6, 2011, 10 a.m. UTC | #1
On Mon,  5 Sep 2011 23:35:56 -0300
Glauber Costa <glommer@parallels.com> wrote:

> This patch introduces per-cgroup tcp buffers limitation. This allows
> sysadmins to specify a maximum amount of kernel memory that
> tcp connections can use at any point in time. TCP is the main interest
> in this work, but extending it to other protocols would be easy.
> 
> It piggybacks in the memory control mechanism already present in
> /proc/sys/net/ipv4/tcp_mem. There is a soft limit, and a hard limit,
> that will suppress allocation when reached. For each cgroup, however,
> the file kmem.tcp_maxmem will be used to cap those values.
> 
> The usage I have in mind here is containers. Each container will
> define its own values for soft and hard limits, but none of them will
> be possibly bigger than the value the box' sysadmin specified from
> the outside.
> 
> To test for any performance impacts of this patch, I used netperf's
> TCP_RR benchmark on localhost, so we can have both recv and snd in action.
> 
> Command line used was ./src/netperf -t TCP_RR -H localhost, and the
> results:
> 
> Without the patch
> =================
> 
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
> 
> 16384  87380  1        1       10.00    26996.35
> 16384  87380
> 
> With the patch
> ===============
> 
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
> 
> 16384  87380  1        1       10.00    27291.86
> 16384  87380
> 
> The difference is within a one-percent range.
> 
> Nesting cgroups doesn't seem to be the dominating factor as well,
> with nestings up to 10 levels not showing a significant performance
> difference.
> 
> Signed-off-by: Glauber Costa <glommer@parallels.com>
> CC: David S. Miller <davem@davemloft.net>
> CC: Hiroyouki Kamezawa <kamezawa.hiroyu@jp.fujitsu.com>
> CC: Eric W. Biederman <ebiederm@xmission.com>
> ---
>  crypto/af_alg.c               |    7 ++-
>  include/linux/cgroup_subsys.h |    4 +
>  include/net/netns/ipv4.h      |    1 +
>  include/net/sock.h            |   66 +++++++++++++++-
>  include/net/tcp.h             |   12 ++-
>  include/net/udp.h             |    3 +-
>  include/trace/events/sock.h   |   10 +-
>  init/Kconfig                  |   11 +++
>  mm/Makefile                   |    1 +
>  net/core/sock.c               |  136 +++++++++++++++++++++++++++-------
>  net/decnet/af_decnet.c        |   21 +++++-
>  net/ipv4/proc.c               |    8 +-
>  net/ipv4/sysctl_net_ipv4.c    |   59 +++++++++++++--
>  net/ipv4/tcp.c                |  164 +++++++++++++++++++++++++++++++++++-----
>  net/ipv4/tcp_input.c          |   17 ++--
>  net/ipv4/tcp_ipv4.c           |   27 +++++--
>  net/ipv4/tcp_output.c         |    2 +-
>  net/ipv4/tcp_timer.c          |    2 +-
>  net/ipv4/udp.c                |   20 ++++-
>  net/ipv6/tcp_ipv6.c           |   16 +++-
>  net/ipv6/udp.c                |    4 +-
>  net/sctp/socket.c             |   35 +++++++--
>  22 files changed, 514 insertions(+), 112 deletions(-)

Hmm... could you please divide this into a few patches?

If I were you, I'd divide the patches into:

 - Kconfig/Makefile/kmem_cgroup.c skeleton.
 - changes to struct sock and macro definition
 - hooks to tcp.
 - hooks to udp
 - hooks to sctp

And why not include mm/kmem_cgroup.c?

Some comments below.


> 
> diff --git a/crypto/af_alg.c b/crypto/af_alg.c
> index ac33d5f..df168d8 100644
> --- a/crypto/af_alg.c
> +++ b/crypto/af_alg.c
> @@ -29,10 +29,15 @@ struct alg_type_list {
>  
>  static atomic_long_t alg_memory_allocated;
>  
> +static atomic_long_t *memory_allocated_alg(struct kmem_cgroup *sg)
> +{
> +	return &alg_memory_allocated;
> +}
> +
>  static struct proto alg_proto = {
>  	.name			= "ALG",
>  	.owner			= THIS_MODULE,
> -	.memory_allocated	= &alg_memory_allocated,
> +	.memory_allocated	= memory_allocated_alg,
>  	.obj_size		= sizeof(struct alg_sock),
>  };
>  
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index ac663c1..363b8e8 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -35,6 +35,10 @@ SUBSYS(cpuacct)
>  SUBSYS(mem_cgroup)
>  #endif
>  
> +#ifdef CONFIG_CGROUP_KMEM
> +SUBSYS(kmem)
> +#endif
> +
>  /* */
>  
>  #ifdef CONFIG_CGROUP_DEVICE
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index d786b4f..bbd023a 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -55,6 +55,7 @@ struct netns_ipv4 {
>  	int current_rt_cache_rebuild_count;
>  
>  	unsigned int sysctl_ping_group_range[2];
> +	long sysctl_tcp_mem[3];
>  
>  	atomic_t rt_genid;
>  	atomic_t dev_addr_genid;
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 8e4062f..e085148 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -62,7 +62,9 @@
>  #include <linux/atomic.h>
>  #include <net/dst.h>
>  #include <net/checksum.h>
> +#include <linux/kmem_cgroup.h>
>  
> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
>  /*
>   * This structure really needs to be cleaned up.
>   * Most of it is for TCP, and not used by any of
> @@ -339,6 +341,7 @@ struct sock {
>  #endif
>  	__u32			sk_mark;
>  	u32			sk_classid;
> +	struct kmem_cgroup	*sk_cgrp;
>  	void			(*sk_state_change)(struct sock *sk);
>  	void			(*sk_data_ready)(struct sock *sk, int bytes);
>  	void			(*sk_write_space)(struct sock *sk);
> @@ -786,16 +789,21 @@ struct proto {
>  
>  	/* Memory pressure */
>  	void			(*enter_memory_pressure)(struct sock *sk);
> -	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
> -	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
> +	/* Current allocated memory. */
> +	atomic_long_t		*(*memory_allocated)(struct kmem_cgroup *sg);
> +	/* Current number of sockets. */
> +	struct percpu_counter	*(*sockets_allocated)(struct kmem_cgroup *sg);
> +
> +	int			(*init_cgroup)(struct cgroup *cgrp,
> +					       struct cgroup_subsys *ss);
>  	/*
>  	 * Pressure flag: try to collapse.
>  	 * Technical note: it is used by multiple contexts non atomically.
>  	 * All the __sk_mem_schedule() is of this nature: accounting
>  	 * is strict, actions are advisory and have some latency.
>  	 */
> -	int			*memory_pressure;
> -	long			*sysctl_mem;
> +	int			*(*memory_pressure)(struct kmem_cgroup *sg);
> +	long			*(*prot_mem)(struct kmem_cgroup *sg);

Hmm. The socket interface callbacks don't have documentation?
Adding an explanation in Documentation is better, isn't it?


>  	int			*sysctl_wmem;
>  	int			*sysctl_rmem;
>  	int			max_header;
> @@ -826,6 +834,56 @@ struct proto {
>  #endif
>  };
>  
> +#define sk_memory_pressure(sk)						\
> +({									\
> +	int *__ret = NULL;						\
> +	if ((sk)->sk_prot->memory_pressure)				\
> +		__ret = (sk)->sk_prot->memory_pressure(sk->sk_cgrp);	\
> +	__ret;								\
> +})
> +
> +#define sk_sockets_allocated(sk)				\
> +({								\
> +	struct percpu_counter *__p;				\
> +	__p = (sk)->sk_prot->sockets_allocated(sk->sk_cgrp);	\
> +	__p;							\
> +})
> +
> +#define sk_memory_allocated(sk)					\
> +({								\
> +	atomic_long_t *__mem;					\
> +	__mem = (sk)->sk_prot->memory_allocated(sk->sk_cgrp);	\
> +	__mem;							\
> +})
> +
> +#define sk_prot_mem(sk)						\
> +({								\
> +	long *__mem = (sk)->sk_prot->prot_mem(sk->sk_cgrp);	\
> +	__mem;							\
> +})
> +
> +#define sg_memory_pressure(prot, sg)				\
> +({								\
> +	int *__ret = NULL;					\
> +	if (prot->memory_pressure)				\
> +		__ret = (prot)->memory_pressure(sg);		\
> +	__ret;							\
> +})
> +
> +#define sg_memory_allocated(prot, sg)				\
> +({								\
> +	atomic_long_t *__mem;					\
> +	__mem = (prot)->memory_allocated(sg);			\
> +	__mem;							\
> +})
> +
> +#define sg_sockets_allocated(prot, sg)				\
> +({								\
> +	struct percpu_counter *__p;				\
> +	__p = (prot)->sockets_allocated(sg);			\
> +	__p;							\
> +})
> +

Are all of these functions worth inlining?



>  extern int proto_register(struct proto *prot, int alloc_slab);
>  extern void proto_unregister(struct proto *prot);
>  
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 149a415..8e1ec4a 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -230,7 +230,6 @@ extern int sysctl_tcp_fack;
>  extern int sysctl_tcp_reordering;
>  extern int sysctl_tcp_ecn;
>  extern int sysctl_tcp_dsack;
> -extern long sysctl_tcp_mem[3];
>  extern int sysctl_tcp_wmem[3];
>  extern int sysctl_tcp_rmem[3];
>  extern int sysctl_tcp_app_win;
> @@ -253,9 +252,12 @@ extern int sysctl_tcp_cookie_size;
>  extern int sysctl_tcp_thin_linear_timeouts;
>  extern int sysctl_tcp_thin_dupack;
>  
> -extern atomic_long_t tcp_memory_allocated;
> -extern struct percpu_counter tcp_sockets_allocated;
> -extern int tcp_memory_pressure;
> +struct kmem_cgroup;
> +extern long *tcp_sysctl_mem(struct kmem_cgroup *sg);
> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg);
> +int *memory_pressure_tcp(struct kmem_cgroup *sg);
> +int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss);
> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg);
>  
>  /*
>   * The next routines deal with comparing 32 bit unsigned ints
> @@ -286,7 +288,7 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
>  	}
>  
>  	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
> -	    atomic_long_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])
> +	    atomic_long_read(sk_memory_allocated(sk)) > sk_prot_mem(sk)[2])

Why not have sk_memory_allocated() return the value?
Is it required to return a pointer?
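
For example, something like this (just a sketch, not tested) would let
callers compare the value directly:

	static inline long sk_memory_allocated(const struct sock *sk)
	{
		return atomic_long_read(sk->sk_prot->memory_allocated(sk->sk_cgrp));
	}

Then the check above becomes
	sk_memory_allocated(sk) > sk_prot_mem(sk)[2]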


>  		return true;
>  	return false;
>  }
> diff --git a/include/net/udp.h b/include/net/udp.h
> index 67ea6fc..0e27388 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -105,7 +105,8 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
>  
>  extern struct proto udp_prot;
>  
> -extern atomic_long_t udp_memory_allocated;
> +atomic_long_t *memory_allocated_udp(struct kmem_cgroup *sg);
> +long *udp_sysctl_mem(struct kmem_cgroup *sg);
>  
>  /* sysctl variables for udp */
>  extern long sysctl_udp_mem[3];
> diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
> index 779abb9..12a6083 100644
> --- a/include/trace/events/sock.h
> +++ b/include/trace/events/sock.h
> @@ -37,7 +37,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
>  
>  	TP_STRUCT__entry(
>  		__array(char, name, 32)
> -		__field(long *, sysctl_mem)
> +		__field(long *, prot_mem)
>  		__field(long, allocated)
>  		__field(int, sysctl_rmem)
>  		__field(int, rmem_alloc)
> @@ -45,7 +45,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
>  
>  	TP_fast_assign(
>  		strncpy(__entry->name, prot->name, 32);
> -		__entry->sysctl_mem = prot->sysctl_mem;
> +		__entry->prot_mem = sk->sk_prot->prot_mem(sk->sk_cgrp);
>  		__entry->allocated = allocated;
>  		__entry->sysctl_rmem = prot->sysctl_rmem[0];
>  		__entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
> @@ -54,9 +54,9 @@ TRACE_EVENT(sock_exceed_buf_limit,
>  	TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld "
>  		"sysctl_rmem=%d rmem_alloc=%d",
>  		__entry->name,
> -		__entry->sysctl_mem[0],
> -		__entry->sysctl_mem[1],
> -		__entry->sysctl_mem[2],
> +		__entry->prot_mem[0],
> +		__entry->prot_mem[1],
> +		__entry->prot_mem[2],
>  		__entry->allocated,
>  		__entry->sysctl_rmem,
>  		__entry->rmem_alloc)
> diff --git a/init/Kconfig b/init/Kconfig
> index d627783..5955ac2 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -690,6 +690,17 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
>  	  select this option (if, for some reason, they need to disable it
>  	  then swapaccount=0 does the trick).
>  
> +config CGROUP_KMEM
> +	bool "Kernel Memory Resource Controller for Control Groups"
> +	depends on CGROUPS
> +	help
> +	  The Kernel Memory cgroup can limit the amount of memory used by
> +	  certain kernel objects in the system. Those are fundamentally
> +	  different from the entities handled by the Memory Controller,
> +	  which are page-based, and can be swapped. Users of the kmem
> +	  cgroup can use it to guarantee that no group of processes will
> +	  ever exhaust kernel resources alone.
> +

This help text seems nice, but please add Documentation/cgroup/kmem.




>  config CGROUP_PERF
>  	bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
>  	depends on PERF_EVENTS && CGROUPS
> diff --git a/mm/Makefile b/mm/Makefile
> index 836e416..1b1aa24 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -45,6 +45,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
>  obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
>  obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
> +obj-$(CONFIG_CGROUP_KMEM) += kmem_cgroup.o
>  obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
>  obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
>  obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 3449df8..2d968ea 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -134,6 +134,24 @@
>  #include <net/tcp.h>
>  #endif
>  
> +static DEFINE_RWLOCK(proto_list_lock);
> +static LIST_HEAD(proto_list);
> +
> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> +{
> +	struct proto *proto;
> +	int ret = 0;
> +
> +	read_lock(&proto_list_lock);
> +	list_for_each_entry(proto, &proto_list, node) {
> +		if (proto->init_cgroup)
> +			ret |= proto->init_cgroup(cgrp, ss);
> +	}
> +	read_unlock(&proto_list_lock);
> +
> +	return ret;
> +}
> +
>  /*
>   * Each address family might have different locking rules, so we have
>   * one slock key per address family:
> @@ -1114,6 +1132,31 @@ void sock_update_classid(struct sock *sk)
>  EXPORT_SYMBOL(sock_update_classid);
>  #endif
>  
> +void sock_update_kmem_cgrp(struct sock *sk)
> +{
> +#ifdef CONFIG_CGROUP_KMEM
> +	sk->sk_cgrp = kcg_from_task(current);
> +
> +	/*
> +	 * We don't need to protect against anything task-related, because
> +	 * we are basically stuck with the sock pointer that won't change,
> +	 * even if the task that originated the socket changes cgroups.
> +	 *
> +	 * What we do have to guarantee, is that the chain leading us to
> +	 * the top level won't change under our noses. Incrementing the
> +	 * reference count via cgroup_exclude_rmdir guarantees that.
> +	 */
> +	cgroup_exclude_rmdir(&sk->sk_cgrp->css);
> +#endif
> +}

I'm not sure this kind of bare cgroup code in core/sock.c will be
welcomed by network guys.




> +
> +void sock_release_kmem_cgrp(struct sock *sk)
> +{
> +#ifdef CONFIG_CGROUP_KMEM
> +	cgroup_release_and_wakeup_rmdir(&sk->sk_cgrp->css);
> +#endif
> +}
> +
>  /**
>   *	sk_alloc - All socket objects are allocated here
>   *	@net: the applicable net namespace
> @@ -1139,6 +1182,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
>  		atomic_set(&sk->sk_wmem_alloc, 1);
>  
>  		sock_update_classid(sk);
> +		sock_update_kmem_cgrp(sk);
>  	}
>  
>  	return sk;
> @@ -1170,6 +1214,7 @@ static void __sk_free(struct sock *sk)
>  		put_cred(sk->sk_peer_cred);
>  	put_pid(sk->sk_peer_pid);
>  	put_net(sock_net(sk));
> +	sock_release_kmem_cgrp(sk);
>  	sk_prot_free(sk->sk_prot_creator, sk);
>  }
>  
> @@ -1287,8 +1332,8 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
>  		sk_set_socket(newsk, NULL);
>  		newsk->sk_wq = NULL;
>  
> -		if (newsk->sk_prot->sockets_allocated)
> -			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
> +		if (sk_sockets_allocated(sk))
> +			percpu_counter_inc(sk_sockets_allocated(sk));
How about 
	sk_sockets_allocated_inc(sk);
?
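
i.e. a small helper along these lines (rough sketch, untested):

	static inline void sk_sockets_allocated_inc(struct sock *sk)
	{
		if (sk->sk_prot->sockets_allocated)
			percpu_counter_inc(sk->sk_prot->sockets_allocated(sk->sk_cgrp));
	}

so the callers don't have to test and dereference the callback twice.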


>  
>  		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
>  		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
> @@ -1676,29 +1721,51 @@ EXPORT_SYMBOL(sk_wait_data);
>   */
>  int __sk_mem_schedule(struct sock *sk, int size, int kind)
>  {
> -	struct proto *prot = sk->sk_prot;
>  	int amt = sk_mem_pages(size);
> +	struct proto *prot = sk->sk_prot;
>  	long allocated;
> +	int *memory_pressure;
> +	long *prot_mem;
> +	int parent_failure = 0;
> +	struct kmem_cgroup *sg;
>  
>  	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
> -	allocated = atomic_long_add_return(amt, prot->memory_allocated);
> +
> +	memory_pressure = sk_memory_pressure(sk);
> +	prot_mem = sk_prot_mem(sk);
> +
> +	allocated = atomic_long_add_return(amt, sk_memory_allocated(sk));
> +
> +#ifdef CONFIG_CGROUP_KMEM
> +	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
> +		long alloc;
> +		/*
> +		 * Large nestings are not the common case, and stopping in the
> +		 * middle would be complicated enough, that we bill it all the
> +		 * way through the root, and if needed, unbill everything later
> +		 */
> +		alloc = atomic_long_add_return(amt,
> +					       sg_memory_allocated(prot, sg));
> +		parent_failure |= (alloc > sk_prot_mem(sk)[2]);
> +	}
> +#endif
> +
> +	/* Over hard limit (we, or our parents) */
> +	if (parent_failure || (allocated > prot_mem[2]))
> +		goto suppress_allocation;
>  
>  	/* Under limit. */
> -	if (allocated <= prot->sysctl_mem[0]) {
> -		if (prot->memory_pressure && *prot->memory_pressure)
> -			*prot->memory_pressure = 0;
> +	if (allocated <= prot_mem[0]) {
> +		if (memory_pressure && *memory_pressure)
> +			*memory_pressure = 0;
>  		return 1;
>  	}
>  
>  	/* Under pressure. */
> -	if (allocated > prot->sysctl_mem[1])
> +	if (allocated > prot_mem[1])
>  		if (prot->enter_memory_pressure)
>  			prot->enter_memory_pressure(sk);
>  
> -	/* Over hard limit. */
> -	if (allocated > prot->sysctl_mem[2])
> -		goto suppress_allocation;
> -
>  	/* guarantee minimum buffer size under pressure */
>  	if (kind == SK_MEM_RECV) {
>  		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
> @@ -1712,13 +1779,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
>  				return 1;
>  	}
>  
> -	if (prot->memory_pressure) {
> +	if (memory_pressure) {
>  		int alloc;
>  
> -		if (!*prot->memory_pressure)
> +		if (!*memory_pressure)
>  			return 1;
> -		alloc = percpu_counter_read_positive(prot->sockets_allocated);
> -		if (prot->sysctl_mem[2] > alloc *
> +		alloc = percpu_counter_read_positive(sk_sockets_allocated(sk));
> +		if (prot_mem[2] > alloc *
>  		    sk_mem_pages(sk->sk_wmem_queued +
>  				 atomic_read(&sk->sk_rmem_alloc) +
>  				 sk->sk_forward_alloc))
> @@ -1741,7 +1808,13 @@ suppress_allocation:
>  
>  	/* Alas. Undo changes. */
>  	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
> -	atomic_long_sub(amt, prot->memory_allocated);
> +
> +	atomic_long_sub(amt, sk_memory_allocated(sk));
> +
> +#ifdef CONFIG_CGROUP_KMEM
> +	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent)
> +		atomic_long_sub(amt, sg_memory_allocated(prot, sg));
> +#endif
>  	return 0;
>  }
>  EXPORT_SYMBOL(__sk_mem_schedule);
> @@ -1753,14 +1826,24 @@ EXPORT_SYMBOL(__sk_mem_schedule);
>  void __sk_mem_reclaim(struct sock *sk)
>  {
>  	struct proto *prot = sk->sk_prot;
> +	struct kmem_cgroup *sg = sk->sk_cgrp;
> +	int *memory_pressure = sk_memory_pressure(sk);
>  
>  	atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
> -		   prot->memory_allocated);
> +		   sk_memory_allocated(sk));
> +
> +#ifdef CONFIG_CGROUP_KMEM
> +	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
> +		atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
> +						sg_memory_allocated(prot, sg));
> +	}
> +#endif
> +
>  	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
>  
> -	if (prot->memory_pressure && *prot->memory_pressure &&
> -	    (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
> -		*prot->memory_pressure = 0;
> +	if (memory_pressure && *memory_pressure &&
> +	    (atomic_long_read(sk_memory_allocated(sk)) < sk_prot_mem(sk)[0]))
> +		*memory_pressure = 0;
>  }
>  EXPORT_SYMBOL(__sk_mem_reclaim);
>  

IMHO, I'd like to hide the atomic_long_xxx ops under kmem cgroup ops.

And use callbacks like
	kmem_cgroup_read(SOCKET_MEM_ALLOCATED, sk)

If other components use kmem_cgroup, a generic interface will be
helpful, because an optimization or fix in the generic interface will
improve all users of kmem_cgroup.
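
Roughly, an interface like this (the names are made up, just to show
the idea):

	/* generic accessors provided by kmem_cgroup */
	long kmem_cgroup_read(int member, struct kmem_cgroup *sg);
	long kmem_cgroup_add_return(int member, struct kmem_cgroup *sg, long amount);

	/* e.g. in __sk_mem_schedule() */
	allocated = kmem_cgroup_add_return(SOCKET_MEM_ALLOCATED, sk->sk_cgrp, amt);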



> @@ -2252,9 +2335,6 @@ void sk_common_release(struct sock *sk)
>  }
>  EXPORT_SYMBOL(sk_common_release);
>  
> -static DEFINE_RWLOCK(proto_list_lock);
> -static LIST_HEAD(proto_list);
> -
>  #ifdef CONFIG_PROC_FS
>  #define PROTO_INUSE_NR	64	/* should be enough for the first time */
>  struct prot_inuse {
> @@ -2479,13 +2559,17 @@ static char proto_method_implemented(const void *method)
>  
>  static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
>  {
> +	struct kmem_cgroup *sg = kcg_from_task(current);
> +
>  	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
>  			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
>  		   proto->name,
>  		   proto->obj_size,
>  		   sock_prot_inuse_get(seq_file_net(seq), proto),
> -		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
> -		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
> +		   proto->memory_allocated != NULL ?
> +			atomic_long_read(sg_memory_allocated(proto, sg)) : -1L,
> +		   proto->memory_pressure != NULL ?
> +			*sg_memory_pressure(proto, sg) ? "yes" : "no" : "NI",
>  		   proto->max_header,
>  		   proto->slab == NULL ? "no" : "yes",
>  		   module_name(proto->owner),
> diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
> index 19acd00..463b299 100644
> --- a/net/decnet/af_decnet.c
> +++ b/net/decnet/af_decnet.c
> @@ -458,13 +458,28 @@ static void dn_enter_memory_pressure(struct sock *sk)
>  	}
>  }
>  
> +static atomic_long_t *memory_allocated_dn(struct kmem_cgroup *sg)
> +{
> +	return &decnet_memory_allocated;
> +}
> +
> +static int *memory_pressure_dn(struct kmem_cgroup *sg)
> +{
> +	return &dn_memory_pressure;
> +}
> +
> +static long *dn_sysctl_mem(struct kmem_cgroup *sg)
> +{
> +	return sysctl_decnet_mem;
> +}
> +
>  static struct proto dn_proto = {
>  	.name			= "NSP",
>  	.owner			= THIS_MODULE,
>  	.enter_memory_pressure	= dn_enter_memory_pressure,
> -	.memory_pressure	= &dn_memory_pressure,
> -	.memory_allocated	= &decnet_memory_allocated,
> -	.sysctl_mem		= sysctl_decnet_mem,
> +	.memory_pressure	= memory_pressure_dn,
> +	.memory_allocated	= memory_allocated_dn,
> +	.prot_mem		= dn_sysctl_mem,
>  	.sysctl_wmem		= sysctl_decnet_wmem,
>  	.sysctl_rmem		= sysctl_decnet_rmem,
>  	.max_header		= DN_MAX_NSP_DATA_HEADER + 64,
> diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
> index b14ec7d..9c80acf 100644
> --- a/net/ipv4/proc.c
> +++ b/net/ipv4/proc.c
> @@ -52,20 +52,22 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
>  {
>  	struct net *net = seq->private;
>  	int orphans, sockets;
> +	struct kmem_cgroup *sg = kcg_from_task(current);
> +	struct percpu_counter *allocated = sg_sockets_allocated(&tcp_prot, sg);
>  
>  	local_bh_disable();
>  	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
> -	sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
> +	sockets = percpu_counter_sum_positive(allocated);
>  	local_bh_enable();
>  
>  	socket_seq_show(seq);
>  	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
>  		   sock_prot_inuse_get(net, &tcp_prot), orphans,
>  		   tcp_death_row.tw_count, sockets,
> -		   atomic_long_read(&tcp_memory_allocated));
> +		   atomic_long_read(sg_memory_allocated((&tcp_prot), sg)));
>  	seq_printf(seq, "UDP: inuse %d mem %ld\n",
>  		   sock_prot_inuse_get(net, &udp_prot),
> -		   atomic_long_read(&udp_memory_allocated));
> +		   atomic_long_read(sg_memory_allocated((&udp_prot), sg)));
>  	seq_printf(seq, "UDPLITE: inuse %d\n",
>  		   sock_prot_inuse_get(net, &udplite_prot));
>  	seq_printf(seq, "RAW: inuse %d\n",
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 69fd720..5e89480 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -14,6 +14,8 @@
>  #include <linux/init.h>
>  #include <linux/slab.h>
>  #include <linux/nsproxy.h>
> +#include <linux/kmem_cgroup.h>
> +#include <linux/swap.h>
>  #include <net/snmp.h>
>  #include <net/icmp.h>
>  #include <net/ip.h>
> @@ -174,6 +176,43 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
>  	return ret;
>  }
>  
> +static int ipv4_tcp_mem(ctl_table *ctl, int write,
> +			   void __user *buffer, size_t *lenp,
> +			   loff_t *ppos)
> +{
> +	int ret;
> +	unsigned long vec[3];
> +	struct kmem_cgroup *kmem = kcg_from_task(current);
> +	struct net *net = current->nsproxy->net_ns;
> +	int i;
> +
> +	ctl_table tmp = {
> +		.data = &vec,
> +		.maxlen = sizeof(vec),
> +		.mode = ctl->mode,
> +	};
> +
> +	if (!write) {
> +		ctl->data = &net->ipv4.sysctl_tcp_mem;
> +		return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
> +	}
> +
> +	ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
> +	if (ret)
> +		return ret;
> +
> +	for (i = 0; i < 3; i++)
> +		if (vec[i] > kmem->tcp_max_memory)
> +			return -EINVAL;
> +
> +	for (i = 0; i < 3; i++) {
> +		net->ipv4.sysctl_tcp_mem[i] = vec[i];
> +		kmem->tcp_prot_mem[i] = net->ipv4.sysctl_tcp_mem[i];
> +	}
> +
> +	return 0;
> +}
> +
>  static struct ctl_table ipv4_table[] = {
>  	{
>  		.procname	= "tcp_timestamps",
> @@ -433,13 +472,6 @@ static struct ctl_table ipv4_table[] = {
>  		.proc_handler	= proc_dointvec
>  	},
>  	{
> -		.procname	= "tcp_mem",
> -		.data		= &sysctl_tcp_mem,
> -		.maxlen		= sizeof(sysctl_tcp_mem),
> -		.mode		= 0644,
> -		.proc_handler	= proc_doulongvec_minmax
> -	},
> -	{
>  		.procname	= "tcp_wmem",
>  		.data		= &sysctl_tcp_wmem,
>  		.maxlen		= sizeof(sysctl_tcp_wmem),
> @@ -721,6 +753,12 @@ static struct ctl_table ipv4_net_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= ipv4_ping_group_range,
>  	},
> +	{
> +		.procname	= "tcp_mem",
> +		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_mem),
> +		.mode		= 0644,
> +		.proc_handler	= ipv4_tcp_mem,
> +	},
>  	{ }
>  };
>  
> @@ -734,6 +772,7 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
>  static __net_init int ipv4_sysctl_init_net(struct net *net)
>  {
>  	struct ctl_table *table;
> +	unsigned long limit;
>  
>  	table = ipv4_net_table;
>  	if (!net_eq(net, &init_net)) {
> @@ -769,6 +808,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
>  
>  	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
>  
> +	limit = nr_free_buffer_pages() / 8;
> +	limit = max(limit, 128UL);
> +	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
> +	net->ipv4.sysctl_tcp_mem[1] = limit;
> +	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
> +

What does this calculation mean? Is it documented somewhere?



>  	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
>  			net_ipv4_ctl_path, table);
>  	if (net->ipv4.ipv4_hdr == NULL)
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 46febca..e1918fa 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -266,6 +266,7 @@
>  #include <linux/crypto.h>
>  #include <linux/time.h>
>  #include <linux/slab.h>
> +#include <linux/nsproxy.h>
>  
>  #include <net/icmp.h>
>  #include <net/tcp.h>
> @@ -282,23 +283,12 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
>  struct percpu_counter tcp_orphan_count;
>  EXPORT_SYMBOL_GPL(tcp_orphan_count);
>  
> -long sysctl_tcp_mem[3] __read_mostly;
>  int sysctl_tcp_wmem[3] __read_mostly;
>  int sysctl_tcp_rmem[3] __read_mostly;
>  
> -EXPORT_SYMBOL(sysctl_tcp_mem);
>  EXPORT_SYMBOL(sysctl_tcp_rmem);
>  EXPORT_SYMBOL(sysctl_tcp_wmem);
>  
> -atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
> -EXPORT_SYMBOL(tcp_memory_allocated);
> -
> -/*
> - * Current number of TCP sockets.
> - */
> -struct percpu_counter tcp_sockets_allocated;
> -EXPORT_SYMBOL(tcp_sockets_allocated);
> -
>  /*
>   * TCP splice context
>   */
> @@ -308,23 +298,157 @@ struct tcp_splice_state {
>  	unsigned int flags;
>  };
>  
> +#ifdef CONFIG_CGROUP_KMEM
>  /*
>   * Pressure flag: try to collapse.
>   * Technical note: it is used by multiple contexts non atomically.
>   * All the __sk_mem_schedule() is of this nature: accounting
>   * is strict, actions are advisory and have some latency.
>   */
> -int tcp_memory_pressure __read_mostly;
> -EXPORT_SYMBOL(tcp_memory_pressure);
> -
>  void tcp_enter_memory_pressure(struct sock *sk)
>  {
> +	struct kmem_cgroup *sg = sk->sk_cgrp;
> +	if (!sg->tcp_memory_pressure) {
> +		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
> +		sg->tcp_memory_pressure = 1;
> +	}
> +}
> +
> +long *tcp_sysctl_mem(struct kmem_cgroup *sg)
> +{
> +	return sg->tcp_prot_mem;
> +}
> +
> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
> +{
> +	return &(sg->tcp_memory_allocated);
> +}
> +
> +static int tcp_write_maxmem(struct cgroup *cgrp, struct cftype *cft, u64 val)
> +{
> +	struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
> +	struct net *net = current->nsproxy->net_ns;
> +	int i;
> +
> +	if (!cgroup_lock_live_group(cgrp))
> +		return -ENODEV;
> +
> +	/*
> +	 * We can't allow more memory than our parents. Since this
> +	 * will be tested for all calls, by induction, there is no need
> +	 * to test any parent other than our own
> +	 * */
> +	if (sg->parent && (val > sg->parent->tcp_max_memory))
> +		val = sg->parent->tcp_max_memory;
> +
> +	sg->tcp_max_memory = val;
> +
> +	for (i = 0; i < 3; i++)
> +		sg->tcp_prot_mem[i]  = min_t(long, val,
> +					     net->ipv4.sysctl_tcp_mem[i]);
> +
> +	cgroup_unlock();
> +
> +	return 0;
> +}
> +

Do we really need cgroup_lock/unlock here?



> +static u64 tcp_read_maxmem(struct cgroup *cgrp, struct cftype *cft)
> +{
> +	struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
> +	u64 ret;
> +
> +	if (!cgroup_lock_live_group(cgrp))
> +		return -ENODEV;
> +	ret = sg->tcp_max_memory;
> +
> +	cgroup_unlock();
> +	return ret;
> +}
> +


Hmm, can't you implement this function as

	kmem_cgroup_read(SOCK_TCP_MAXMEM, sg);

? What do you think?

Thanks,
-Kame

Glauber Costa Sept. 6, 2011, 3:28 p.m. UTC | #2
On 09/06/2011 07:00 AM, KAMEZAWA Hiroyuki wrote:
> On Mon,  5 Sep 2011 23:35:56 -0300
> Glauber Costa<glommer@parallels.com>  wrote:
>
>> This patch introduces per-cgroup tcp buffers limitation. This allows
>> sysadmins to specify a maximum amount of kernel memory that
>> tcp connections can use at any point in time. TCP is the main interest
>> in this work, but extending it to other protocols would be easy.
>>
>> It piggybacks in the memory control mechanism already present in
>> /proc/sys/net/ipv4/tcp_mem. There is a soft limit, and a hard limit,
>> that will suppress allocation when reached. For each cgroup, however,
>> the file kmem.tcp_maxmem will be used to cap those values.
>>
>> The usage I have in mind here is containers. Each container will
>> define its own values for soft and hard limits, but none of them will
>> be possibly bigger than the value the box' sysadmin specified from
>> the outside.
>>
>> To test for any performance impacts of this patch, I used netperf's
>> TCP_RR benchmark on localhost, so we can have both recv and snd in action.
>>
>> Command line used was ./src/netperf -t TCP_RR -H localhost, and the
>> results:
>>
>> Without the patch
>> =================
>>
>> Socket Size   Request  Resp.   Elapsed  Trans.
>> Send   Recv   Size     Size    Time     Rate
>> bytes  Bytes  bytes    bytes   secs.    per sec
>>
>> 16384  87380  1        1       10.00    26996.35
>> 16384  87380
>>
>> With the patch
>> ===============
>>
>> Local /Remote
>> Socket Size   Request  Resp.   Elapsed  Trans.
>> Send   Recv   Size     Size    Time     Rate
>> bytes  Bytes  bytes    bytes   secs.    per sec
>>
>> 16384  87380  1        1       10.00    27291.86
>> 16384  87380
>>
>> The difference is within a one-percent range.
>>
>> Nesting cgroups doesn't seem to be the dominating factor as well,
>> with nestings up to 10 levels not showing a significant performance
>> difference.
>>
>> Signed-off-by: Glauber Costa<glommer@parallels.com>
>> CC: David S. Miller<davem@davemloft.net>
>> CC: Hiroyouki Kamezawa<kamezawa.hiroyu@jp.fujitsu.com>
>> CC: Eric W. Biederman<ebiederm@xmission.com>
>> ---
>>   crypto/af_alg.c               |    7 ++-
>>   include/linux/cgroup_subsys.h |    4 +
>>   include/net/netns/ipv4.h      |    1 +
>>   include/net/sock.h            |   66 +++++++++++++++-
>>   include/net/tcp.h             |   12 ++-
>>   include/net/udp.h             |    3 +-
>>   include/trace/events/sock.h   |   10 +-
>>   init/Kconfig                  |   11 +++
>>   mm/Makefile                   |    1 +
>>   net/core/sock.c               |  136 +++++++++++++++++++++++++++-------
>>   net/decnet/af_decnet.c        |   21 +++++-
>>   net/ipv4/proc.c               |    8 +-
>>   net/ipv4/sysctl_net_ipv4.c    |   59 +++++++++++++--
>>   net/ipv4/tcp.c                |  164 +++++++++++++++++++++++++++++++++++-----
>>   net/ipv4/tcp_input.c          |   17 ++--
>>   net/ipv4/tcp_ipv4.c           |   27 +++++--
>>   net/ipv4/tcp_output.c         |    2 +-
>>   net/ipv4/tcp_timer.c          |    2 +-
>>   net/ipv4/udp.c                |   20 ++++-
>>   net/ipv6/tcp_ipv6.c           |   16 +++-
>>   net/ipv6/udp.c                |    4 +-
>>   net/sctp/socket.c             |   35 +++++++--
>>   22 files changed, 514 insertions(+), 112 deletions(-)
>
> Hmm...could you please devide patches into a few patches ?
>
> If I was you, I'll devide the patches into
>
>   - Kconfig/Makefile/kmem_cgroup.c skelton.
>   - changes to struct sock and macro definition
>   - hooks to tcp.
>   - hooks to udp
>   - hooks to sctp

Sure, I can do it.

> And why not including mm/kmem_cgroup.c ?
Because I am an idiot and forgot to git add.

>
> some comments below.
>
>
>>
>> diff --git a/crypto/af_alg.c b/crypto/af_alg.c
>> index ac33d5f..df168d8 100644
>> --- a/crypto/af_alg.c
>> +++ b/crypto/af_alg.c
>> @@ -29,10 +29,15 @@ struct alg_type_list {
>>
>>   static atomic_long_t alg_memory_allocated;
>>
>> +static atomic_long_t *memory_allocated_alg(struct kmem_cgroup *sg)
>> +{
>> +	return&alg_memory_allocated;
>> +}
>> +
>>   static struct proto alg_proto = {
>>   	.name			= "ALG",
>>   	.owner			= THIS_MODULE,
>> -	.memory_allocated	=&alg_memory_allocated,
>> +	.memory_allocated	= memory_allocated_alg,
>>   	.obj_size		= sizeof(struct alg_sock),
>>   };
>>
>> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
>> index ac663c1..363b8e8 100644
>> --- a/include/linux/cgroup_subsys.h
>> +++ b/include/linux/cgroup_subsys.h
>> @@ -35,6 +35,10 @@ SUBSYS(cpuacct)
>>   SUBSYS(mem_cgroup)
>>   #endif
>>
>> +#ifdef CONFIG_CGROUP_KMEM
>> +SUBSYS(kmem)
>> +#endif
>> +
>>   /* */
>>
>>   #ifdef CONFIG_CGROUP_DEVICE
>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>> index d786b4f..bbd023a 100644
>> --- a/include/net/netns/ipv4.h
>> +++ b/include/net/netns/ipv4.h
>> @@ -55,6 +55,7 @@ struct netns_ipv4 {
>>   	int current_rt_cache_rebuild_count;
>>
>>   	unsigned int sysctl_ping_group_range[2];
>> +	long sysctl_tcp_mem[3];
>>
>>   	atomic_t rt_genid;
>>   	atomic_t dev_addr_genid;
>> diff --git a/include/net/sock.h b/include/net/sock.h
>> index 8e4062f..e085148 100644
>> --- a/include/net/sock.h
>> +++ b/include/net/sock.h
>> @@ -62,7 +62,9 @@
>>   #include<linux/atomic.h>
>>   #include<net/dst.h>
>>   #include<net/checksum.h>
>> +#include<linux/kmem_cgroup.h>
>>
>> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
>>   /*
>>    * This structure really needs to be cleaned up.
>>    * Most of it is for TCP, and not used by any of
>> @@ -339,6 +341,7 @@ struct sock {
>>   #endif
>>   	__u32			sk_mark;
>>   	u32			sk_classid;
>> +	struct kmem_cgroup	*sk_cgrp;
>>   	void			(*sk_state_change)(struct sock *sk);
>>   	void			(*sk_data_ready)(struct sock *sk, int bytes);
>>   	void			(*sk_write_space)(struct sock *sk);
>> @@ -786,16 +789,21 @@ struct proto {
>>
>>   	/* Memory pressure */
>>   	void			(*enter_memory_pressure)(struct sock *sk);
>> -	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
>> -	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
>> +	/* Current allocated memory. */
>> +	atomic_long_t		*(*memory_allocated)(struct kmem_cgroup *sg);
>> +	/* Current number of sockets. */
>> +	struct percpu_counter	*(*sockets_allocated)(struct kmem_cgroup *sg);
>> +
>> +	int			(*init_cgroup)(struct cgroup *cgrp,
>> +					       struct cgroup_subsys *ss);
>>   	/*
>>   	 * Pressure flag: try to collapse.
>>   	 * Technical note: it is used by multiple contexts non atomically.
>>   	 * All the __sk_mem_schedule() is of this nature: accounting
>>   	 * is strict, actions are advisory and have some latency.
>>   	 */
>> -	int			*memory_pressure;
>> -	long			*sysctl_mem;
>> +	int			*(*memory_pressure)(struct kmem_cgroup *sg);
>> +	long			*(*prot_mem)(struct kmem_cgroup *sg);
>
> Hmm. Socket interface callbacks doesn't have documentation ?
> Adding explanation in Documenation is better, isn't it ?

Okay, sure thing.

>
>>   	int			*sysctl_wmem;
>>   	int			*sysctl_rmem;
>>   	int			max_header;
>> @@ -826,6 +834,56 @@ struct proto {
>>   #endif
>>   };
>>
>> +#define sk_memory_pressure(sk)						\
>> +({									\
>> +	int *__ret = NULL;						\
>> +	if ((sk)->sk_prot->memory_pressure)				\
>> +		__ret = (sk)->sk_prot->memory_pressure(sk->sk_cgrp);	\
>> +	__ret;								\
>> +})
>> +
>> +#define sk_sockets_allocated(sk)				\
>> +({								\
>> +	struct percpu_counter *__p;				\
>> +	__p = (sk)->sk_prot->sockets_allocated(sk->sk_cgrp);	\
>> +	__p;							\
>> +})
>> +
>> +#define sk_memory_allocated(sk)					\
>> +({								\
>> +	atomic_long_t *__mem;					\
>> +	__mem = (sk)->sk_prot->memory_allocated(sk->sk_cgrp);	\
>> +	__mem;							\
>> +})
>> +
>> +#define sk_prot_mem(sk)						\
>> +({								\
>> +	long *__mem = (sk)->sk_prot->prot_mem(sk->sk_cgrp);	\
>> +	__mem;							\
>> +})
>> +
>> +#define sg_memory_pressure(prot, sg)				\
>> +({								\
>> +	int *__ret = NULL;					\
>> +	if (prot->memory_pressure)				\
>> +		__ret = (prot)->memory_pressure(sg);		\
>> +	__ret;							\
>> +})
>> +
>> +#define sg_memory_allocated(prot, sg)				\
>> +({								\
>> +	atomic_long_t *__mem;					\
>> +	__mem = (prot)->memory_allocated(sg);			\
>> +	__mem;							\
>> +})
>> +
>> +#define sg_sockets_allocated(prot, sg)				\
>> +({								\
>> +	struct percpu_counter *__p;				\
>> +	__p = (prot)->sockets_allocated(sg);			\
>> +	__p;							\
>> +})
>> +
>
> All functions are worth to be inlined ?
Following the law of minimum disruption, I wanted to make them all
valid lvalues in any expression; that's why they are written this way.
I see that below you suggest using things like
sg_sockets_allocated_inc(), _dec() and _read(). I prefer them to be
lvalues, but I am fine with either.
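
For example, keeping them as lvalues means the existing call sites keep
their current shape (the first hunk below is what the patch does today;
the second is a sketch of the helper style, with hypothetical names):

	/* lvalue style */
	if (memory_pressure && *memory_pressure)
		*memory_pressure = 0;

	/* helper style */
	if (sk_memory_pressure_read(sk))
		sk_memory_pressure_set(sk, 0);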


>
>
>>   extern int proto_register(struct proto *prot, int alloc_slab);
>>   extern void proto_unregister(struct proto *prot);
>>
>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>> index 149a415..8e1ec4a 100644
>> --- a/include/net/tcp.h
>> +++ b/include/net/tcp.h
>> @@ -230,7 +230,6 @@ extern int sysctl_tcp_fack;
>>   extern int sysctl_tcp_reordering;
>>   extern int sysctl_tcp_ecn;
>>   extern int sysctl_tcp_dsack;
>> -extern long sysctl_tcp_mem[3];
>>   extern int sysctl_tcp_wmem[3];
>>   extern int sysctl_tcp_rmem[3];
>>   extern int sysctl_tcp_app_win;
>> @@ -253,9 +252,12 @@ extern int sysctl_tcp_cookie_size;
>>   extern int sysctl_tcp_thin_linear_timeouts;
>>   extern int sysctl_tcp_thin_dupack;
>>
>> -extern atomic_long_t tcp_memory_allocated;
>> -extern struct percpu_counter tcp_sockets_allocated;
>> -extern int tcp_memory_pressure;
>> +struct kmem_cgroup;
>> +extern long *tcp_sysctl_mem(struct kmem_cgroup *sg);
>> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg);
>> +int *memory_pressure_tcp(struct kmem_cgroup *sg);
>> +int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss);
>> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg);
>>
>>   /*
>>    * The next routines deal with comparing 32 bit unsigned ints
>> @@ -286,7 +288,7 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
>>   	}
>>
>>   	if (sk->sk_wmem_queued>  SOCK_MIN_SNDBUF&&
>> -	    atomic_long_read(&tcp_memory_allocated)>  sysctl_tcp_mem[2])
>> +	    atomic_long_read(sk_memory_allocated(sk))>  sk_prot_mem(sk)[2])
>
> Why not sk_memory_allocated() returns the value ?
> Is it required to return pointer ?

Same thing. I don't feel strongly about this, and can change it.

>
>>   		return true;
>>   	return false;
>>   }
>> diff --git a/include/net/udp.h b/include/net/udp.h
>> index 67ea6fc..0e27388 100644
>> --- a/include/net/udp.h
>> +++ b/include/net/udp.h
>> @@ -105,7 +105,8 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
>>
>>   extern struct proto udp_prot;
>>
>> -extern atomic_long_t udp_memory_allocated;
>> +atomic_long_t *memory_allocated_udp(struct kmem_cgroup *sg);
>> +long *udp_sysctl_mem(struct kmem_cgroup *sg);
>>
>>   /* sysctl variables for udp */
>>   extern long sysctl_udp_mem[3];
>> diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
>> index 779abb9..12a6083 100644
>> --- a/include/trace/events/sock.h
>> +++ b/include/trace/events/sock.h
>> @@ -37,7 +37,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
>>
>>   	TP_STRUCT__entry(
>>   		__array(char, name, 32)
>> -		__field(long *, sysctl_mem)
>> +		__field(long *, prot_mem)
>>   		__field(long, allocated)
>>   		__field(int, sysctl_rmem)
>>   		__field(int, rmem_alloc)
>> @@ -45,7 +45,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
>>
>>   	TP_fast_assign(
>>   		strncpy(__entry->name, prot->name, 32);
>> -		__entry->sysctl_mem = prot->sysctl_mem;
>> +		__entry->prot_mem = sk->sk_prot->prot_mem(sk->sk_cgrp);
>>   		__entry->allocated = allocated;
>>   		__entry->sysctl_rmem = prot->sysctl_rmem[0];
>>   		__entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
>> @@ -54,9 +54,9 @@ TRACE_EVENT(sock_exceed_buf_limit,
>>   	TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld "
>>   		"sysctl_rmem=%d rmem_alloc=%d",
>>   		__entry->name,
>> -		__entry->sysctl_mem[0],
>> -		__entry->sysctl_mem[1],
>> -		__entry->sysctl_mem[2],
>> +		__entry->prot_mem[0],
>> +		__entry->prot_mem[1],
>> +		__entry->prot_mem[2],
>>   		__entry->allocated,
>>   		__entry->sysctl_rmem,
>>   		__entry->rmem_alloc)
>> diff --git a/init/Kconfig b/init/Kconfig
>> index d627783..5955ac2 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -690,6 +690,17 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
>>   	  select this option (if, for some reason, they need to disable it
>>   	  then swapaccount=0 does the trick).
>>
>> +config CGROUP_KMEM
>> +	bool "Kernel Memory Resource Controller for Control Groups"
>> +	depends on CGROUPS
>> +	help
>> +	  The Kernel Memory cgroup can limit the amount of memory used by
>> +	  certain kernel objects in the system. Those are fundamentally
>> +	  different from the entities handled by the Memory Controller,
>> +	  which are page-based, and can be swapped. Users of the kmem
>> +	  cgroup can use it to guarantee that no group of processes will
>> +	  ever exhaust kernel resources alone.
>> +
>
> This help seems nice but please add Documentation/cgroup/kmem.
Yes, sure.

>
>
>
>
>>   config CGROUP_PERF
>>   	bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
>>   	depends on PERF_EVENTS&&  CGROUPS
>> diff --git a/mm/Makefile b/mm/Makefile
>> index 836e416..1b1aa24 100644
>> --- a/mm/Makefile
>> +++ b/mm/Makefile
>> @@ -45,6 +45,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
>>   obj-$(CONFIG_QUICKLIST) += quicklist.o
>>   obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
>>   obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
>> +obj-$(CONFIG_CGROUP_KMEM) += kmem_cgroup.o
>>   obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
>>   obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
>>   obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index 3449df8..2d968ea 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -134,6 +134,24 @@
>>   #include<net/tcp.h>
>>   #endif
>>
>> +static DEFINE_RWLOCK(proto_list_lock);
>> +static LIST_HEAD(proto_list);
>> +
>> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
>> +{
>> +	struct proto *proto;
>> +	int ret = 0;
>> +
>> +	read_lock(&proto_list_lock);
>> +	list_for_each_entry(proto,&proto_list, node) {
>> +		if (proto->init_cgroup)
>> +			ret |= proto->init_cgroup(cgrp, ss);
>> +	}
>> +	read_unlock(&proto_list_lock);
>> +
>> +	return ret;
>> +}
>> +
>>   /*
>>    * Each address family might have different locking rules, so we have
>>    * one slock key per address family:
>> @@ -1114,6 +1132,31 @@ void sock_update_classid(struct sock *sk)
>>   EXPORT_SYMBOL(sock_update_classid);
>>   #endif
>>
>> +void sock_update_kmem_cgrp(struct sock *sk)
>> +{
>> +#ifdef CONFIG_CGROUP_KMEM
>> +	sk->sk_cgrp = kcg_from_task(current);
>> +
>> +	/*
>> +	 * We don't need to protect against anything task-related, because
>> +	 * we are basically stuck with the sock pointer that won't change,
>> +	 * even if the task that originated the socket changes cgroups.
>> +	 *
>> +	 * What we do have to guarantee, is that the chain leading us to
>> +	 * the top level won't change under our noses. Incrementing the
>> +	 * reference count via cgroup_exclude_rmdir guarantees that.
>> +	 */
>> +	cgroup_exclude_rmdir(&sk->sk_cgrp->css);
>> +#endif
>> +}
>
> I'm not sure this kind of bare cgroup code in core/sock.c will be
> welcomed by network guys.

What is your suggestion, then? Just a wrapper, or do you have something
else in mind?

>
>
>
>
>> +
>> +void sock_release_kmem_cgrp(struct sock *sk)
>> +{
>> +#ifdef CONFIG_CGROUP_KMEM
>> +	cgroup_release_and_wakeup_rmdir(&sk->sk_cgrp->css);
>> +#endif
>> +}
>> +
>>   /**
>>    *	sk_alloc - All socket objects are allocated here
>>    *	@net: the applicable net namespace
>> @@ -1139,6 +1182,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
>>   		atomic_set(&sk->sk_wmem_alloc, 1);
>>
>>   		sock_update_classid(sk);
>> +		sock_update_kmem_cgrp(sk);
>>   	}
>>
>>   	return sk;
>> @@ -1170,6 +1214,7 @@ static void __sk_free(struct sock *sk)
>>   		put_cred(sk->sk_peer_cred);
>>   	put_pid(sk->sk_peer_pid);
>>   	put_net(sock_net(sk));
>> +	sock_release_kmem_cgrp(sk);
>>   	sk_prot_free(sk->sk_prot_creator, sk);
>>   }
>>
>> @@ -1287,8 +1332,8 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
>>   		sk_set_socket(newsk, NULL);
>>   		newsk->sk_wq = NULL;
>>
>> -		if (newsk->sk_prot->sockets_allocated)
>> -			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
>> +		if (sk_sockets_allocated(sk))
>> +			percpu_counter_inc(sk_sockets_allocated(sk));
> How about
> 	sk_sockets_allocated_inc(sk);
> ?
>
>
>>
>>   		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
>>   		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
>> @@ -1676,29 +1721,51 @@ EXPORT_SYMBOL(sk_wait_data);
>>    */
>>   int __sk_mem_schedule(struct sock *sk, int size, int kind)
>>   {
>> -	struct proto *prot = sk->sk_prot;
>>   	int amt = sk_mem_pages(size);
>> +	struct proto *prot = sk->sk_prot;
>>   	long allocated;
>> +	int *memory_pressure;
>> +	long *prot_mem;
>> +	int parent_failure = 0;
>> +	struct kmem_cgroup *sg;
>>
>>   	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
>> -	allocated = atomic_long_add_return(amt, prot->memory_allocated);
>> +
>> +	memory_pressure = sk_memory_pressure(sk);
>> +	prot_mem = sk_prot_mem(sk);
>> +
>> +	allocated = atomic_long_add_return(amt, sk_memory_allocated(sk));
>> +
>> +#ifdef CONFIG_CGROUP_KMEM
>> +	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
>> +		long alloc;
>> +		/*
>> +		 * Large nestings are not the common case, and stopping in the
>> +		 * middle would be complicated enough, that we bill it all the
>> +		 * way through the root, and if needed, unbill everything later
>> +		 */
>> +		alloc = atomic_long_add_return(amt,
>> +					       sg_memory_allocated(prot, sg));
>> +		parent_failure |= (alloc>  sk_prot_mem(sk)[2]);
>> +	}
>> +#endif
>> +
>> +	/* Over hard limit (we, or our parents) */
>> +	if (parent_failure || (allocated>  prot_mem[2]))
>> +		goto suppress_allocation;
>>
>>   	/* Under limit. */
>> -	if (allocated<= prot->sysctl_mem[0]) {
>> -		if (prot->memory_pressure&&  *prot->memory_pressure)
>> -			*prot->memory_pressure = 0;
>> +	if (allocated<= prot_mem[0]) {
>> +		if (memory_pressure&&  *memory_pressure)
>> +			*memory_pressure = 0;
>>   		return 1;
>>   	}
>>
>>   	/* Under pressure. */
>> -	if (allocated>  prot->sysctl_mem[1])
>> +	if (allocated>  prot_mem[1])
>>   		if (prot->enter_memory_pressure)
>>   			prot->enter_memory_pressure(sk);
>>
>> -	/* Over hard limit. */
>> -	if (allocated>  prot->sysctl_mem[2])
>> -		goto suppress_allocation;
>> -
>>   	/* guarantee minimum buffer size under pressure */
>>   	if (kind == SK_MEM_RECV) {
>>   		if (atomic_read(&sk->sk_rmem_alloc)<  prot->sysctl_rmem[0])
>> @@ -1712,13 +1779,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
>>   				return 1;
>>   	}
>>
>> -	if (prot->memory_pressure) {
>> +	if (memory_pressure) {
>>   		int alloc;
>>
>> -		if (!*prot->memory_pressure)
>> +		if (!*memory_pressure)
>>   			return 1;
>> -		alloc = percpu_counter_read_positive(prot->sockets_allocated);
>> -		if (prot->sysctl_mem[2]>  alloc *
>> +		alloc = percpu_counter_read_positive(sk_sockets_allocated(sk));
>> +		if (prot_mem[2]>  alloc *
>>   		    sk_mem_pages(sk->sk_wmem_queued +
>>   				 atomic_read(&sk->sk_rmem_alloc) +
>>   				 sk->sk_forward_alloc))
>> @@ -1741,7 +1808,13 @@ suppress_allocation:
>>
>>   	/* Alas. Undo changes. */
>>   	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
>> -	atomic_long_sub(amt, prot->memory_allocated);
>> +
>> +	atomic_long_sub(amt, sk_memory_allocated(sk));
>> +
>> +#ifdef CONFIG_CGROUP_KMEM
>> +	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent)
>> +		atomic_long_sub(amt, sg_memory_allocated(prot, sg));
>> +#endif
>>   	return 0;
>>   }
>>   EXPORT_SYMBOL(__sk_mem_schedule);
>> @@ -1753,14 +1826,24 @@ EXPORT_SYMBOL(__sk_mem_schedule);
>>   void __sk_mem_reclaim(struct sock *sk)
>>   {
>>   	struct proto *prot = sk->sk_prot;
>> +	struct kmem_cgroup *sg = sk->sk_cgrp;
>> +	int *memory_pressure = sk_memory_pressure(sk);
>>
>>   	atomic_long_sub(sk->sk_forward_alloc>>  SK_MEM_QUANTUM_SHIFT,
>> -		   prot->memory_allocated);
>> +		   sk_memory_allocated(sk));
>> +
>> +#ifdef CONFIG_CGROUP_KMEM
>> +	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
>> +		atomic_long_sub(sk->sk_forward_alloc>>  SK_MEM_QUANTUM_SHIFT,
>> +						sg_memory_allocated(prot, sg));
>> +	}
>> +#endif
>> +
>>   	sk->sk_forward_alloc&= SK_MEM_QUANTUM - 1;
>>
>> -	if (prot->memory_pressure&&  *prot->memory_pressure&&
>> -	    (atomic_long_read(prot->memory_allocated)<  prot->sysctl_mem[0]))
>> -		*prot->memory_pressure = 0;
>> +	if (memory_pressure&&  *memory_pressure&&
>> +	    (atomic_long_read(sk_memory_allocated(sk))<  sk_prot_mem(sk)[0]))
>> +		*memory_pressure = 0;
>>   }
>>   EXPORT_SYMBOL(__sk_mem_reclaim);
>>
>
> IMHO, I like to hide atomic_long_xxxx ops under kmem cgroup ops.
>
> And use callbacks like
> 	kmem_cgroup_read(SOCKET_MEM_ALLOCATED, sk)
>
> If other component uses kmem_cgroup, a generic interface will be
> helpful because optimization/fix in generic interface will improve
> all users of kmem_cgroup.

I honestly don't like it too much.
It introduces at least one conditional to test for the kind of variable,
which can harm some fast paths, and the return value will not always be
of the same type. Also, kmem_cgroup_read would not do much besides
accessing the data. Not a likely hot target for future optimizations.
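
To make it concrete, I am afraid a generic read accessor ends up looking
roughly like this (sketch only, using your SOCKET_MEM_* names):

	long kmem_cgroup_read(int member, struct kmem_cgroup *sg)
	{
		switch (member) {
		case SOCKET_MEM_ALLOCATED:
			return atomic_long_read(&sg->tcp_memory_allocated);
		case SOCKET_MEM_PRESSURE:
			return sg->tcp_memory_pressure;
		default:
			return 0;
		}
	}

and every caller pays for the switch, while the types behind each
member still differ.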

>
>
>> @@ -2252,9 +2335,6 @@ void sk_common_release(struct sock *sk)
>>   }
>>   EXPORT_SYMBOL(sk_common_release);
>>
>> -static DEFINE_RWLOCK(proto_list_lock);
>> -static LIST_HEAD(proto_list);
>> -
>>   #ifdef CONFIG_PROC_FS
>>   #define PROTO_INUSE_NR	64	/* should be enough for the first time */
>>   struct prot_inuse {
>> @@ -2479,13 +2559,17 @@ static char proto_method_implemented(const void *method)
>>
>>   static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
>>   {
>> +	struct kmem_cgroup *sg = kcg_from_task(current);
>> +
>>   	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
>>   			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
>>   		   proto->name,
>>   		   proto->obj_size,
>>   		   sock_prot_inuse_get(seq_file_net(seq), proto),
>> -		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
>> -		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
>> +		   proto->memory_allocated != NULL ?
>> +			atomic_long_read(sg_memory_allocated(proto, sg)) : -1L,
>> +		   proto->memory_pressure != NULL ?
>> +			*sg_memory_pressure(proto, sg) ? "yes" : "no" : "NI",
>>   		   proto->max_header,
>>   		   proto->slab == NULL ? "no" : "yes",
>>   		   module_name(proto->owner),
>> diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
>> index 19acd00..463b299 100644
>> --- a/net/decnet/af_decnet.c
>> +++ b/net/decnet/af_decnet.c
>> @@ -458,13 +458,28 @@ static void dn_enter_memory_pressure(struct sock *sk)
>>   	}
>>   }
>>
>> +static atomic_long_t *memory_allocated_dn(struct kmem_cgroup *sg)
>> +{
>> +	return&decnet_memory_allocated;
>> +}
>> +
>> +static int *memory_pressure_dn(struct kmem_cgroup *sg)
>> +{
>> +	return&dn_memory_pressure;
>> +}
>> +
>> +static long *dn_sysctl_mem(struct kmem_cgroup *sg)
>> +{
>> +	return sysctl_decnet_mem;
>> +}
>> +
>>   static struct proto dn_proto = {
>>   	.name			= "NSP",
>>   	.owner			= THIS_MODULE,
>>   	.enter_memory_pressure	= dn_enter_memory_pressure,
>> -	.memory_pressure	=&dn_memory_pressure,
>> -	.memory_allocated	=&decnet_memory_allocated,
>> -	.sysctl_mem		= sysctl_decnet_mem,
>> +	.memory_pressure	= memory_pressure_dn,
>> +	.memory_allocated	= memory_allocated_dn,
>> +	.prot_mem		= dn_sysctl_mem,
>>   	.sysctl_wmem		= sysctl_decnet_wmem,
>>   	.sysctl_rmem		= sysctl_decnet_rmem,
>>   	.max_header		= DN_MAX_NSP_DATA_HEADER + 64,
>> diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
>> index b14ec7d..9c80acf 100644
>> --- a/net/ipv4/proc.c
>> +++ b/net/ipv4/proc.c
>> @@ -52,20 +52,22 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
>>   {
>>   	struct net *net = seq->private;
>>   	int orphans, sockets;
>> +	struct kmem_cgroup *sg = kcg_from_task(current);
>> +	struct percpu_counter *allocated = sg_sockets_allocated(&tcp_prot, sg);
>>
>>   	local_bh_disable();
>>   	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
>> -	sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
>> +	sockets = percpu_counter_sum_positive(allocated);
>>   	local_bh_enable();
>>
>>   	socket_seq_show(seq);
>>   	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
>>   		   sock_prot_inuse_get(net,&tcp_prot), orphans,
>>   		   tcp_death_row.tw_count, sockets,
>> -		   atomic_long_read(&tcp_memory_allocated));
>> +		   atomic_long_read(sg_memory_allocated((&tcp_prot), sg)));
>>   	seq_printf(seq, "UDP: inuse %d mem %ld\n",
>>   		   sock_prot_inuse_get(net,&udp_prot),
>> -		   atomic_long_read(&udp_memory_allocated));
>> +		   atomic_long_read(sg_memory_allocated((&udp_prot), sg)));
>>   	seq_printf(seq, "UDPLITE: inuse %d\n",
>>   		   sock_prot_inuse_get(net, &udplite_prot));
>>   	seq_printf(seq, "RAW: inuse %d\n",
>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> index 69fd720..5e89480 100644
>> --- a/net/ipv4/sysctl_net_ipv4.c
>> +++ b/net/ipv4/sysctl_net_ipv4.c
>> @@ -14,6 +14,8 @@
>>   #include <linux/init.h>
>>   #include <linux/slab.h>
>>   #include <linux/nsproxy.h>
>> +#include <linux/kmem_cgroup.h>
>> +#include <linux/swap.h>
>>   #include <net/snmp.h>
>>   #include <net/icmp.h>
>>   #include <net/ip.h>
>> @@ -174,6 +176,43 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
>>   	return ret;
>>   }
>>
>> +static int ipv4_tcp_mem(ctl_table *ctl, int write,
>> +			   void __user *buffer, size_t *lenp,
>> +			   loff_t *ppos)
>> +{
>> +	int ret;
>> +	unsigned long vec[3];
>> +	struct kmem_cgroup *kmem = kcg_from_task(current);
>> +	struct net *net = current->nsproxy->net_ns;
>> +	int i;
>> +
>> +	ctl_table tmp = {
>> +		.data = &vec,
>> +		.maxlen = sizeof(vec),
>> +		.mode = ctl->mode,
>> +	};
>> +
>> +	if (!write) {
>> +		ctl->data = &net->ipv4.sysctl_tcp_mem;
>> +		return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
>> +	}
>> +
>> +	ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
>> +	if (ret)
>> +		return ret;
>> +
>> +	for (i = 0; i < 3; i++)
>> +		if (vec[i] > kmem->tcp_max_memory)
>> +			return -EINVAL;
>> +
>> +	for (i = 0; i < 3; i++) {
>> +		net->ipv4.sysctl_tcp_mem[i] = vec[i];
>> +		kmem->tcp_prot_mem[i] = net->ipv4.sysctl_tcp_mem[i];
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>   static struct ctl_table ipv4_table[] = {
>>   	{
>>   		.procname	= "tcp_timestamps",
>> @@ -433,13 +472,6 @@ static struct ctl_table ipv4_table[] = {
>>   		.proc_handler	= proc_dointvec
>>   	},
>>   	{
>> -		.procname	= "tcp_mem",
>> -		.data		= &sysctl_tcp_mem,
>> -		.maxlen		= sizeof(sysctl_tcp_mem),
>> -		.mode		= 0644,
>> -		.proc_handler	= proc_doulongvec_minmax
>> -	},
>> -	{
>>   		.procname	= "tcp_wmem",
>>   		.data		= &sysctl_tcp_wmem,
>>   		.maxlen		= sizeof(sysctl_tcp_wmem),
>> @@ -721,6 +753,12 @@ static struct ctl_table ipv4_net_table[] = {
>>   		.mode		= 0644,
>>   		.proc_handler	= ipv4_ping_group_range,
>>   	},
>> +	{
>> +		.procname	= "tcp_mem",
>> +		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_mem),
>> +		.mode		= 0644,
>> +		.proc_handler	= ipv4_tcp_mem,
>> +	},
>>   	{ }
>>   };
>>
>> @@ -734,6 +772,7 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
>>   static __net_init int ipv4_sysctl_init_net(struct net *net)
>>   {
>>   	struct ctl_table *table;
>> +	unsigned long limit;
>>
>>   	table = ipv4_net_table;
>>   	if (!net_eq(net, &init_net)) {
>> @@ -769,6 +808,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
>>
>>   	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
>>
>> +	limit = nr_free_buffer_pages() / 8;
>> +	limit = max(limit, 128UL);
>> +	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
>> +	net->ipv4.sysctl_tcp_mem[1] = limit;
>> +	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
>> +
>
> What does this calculation mean? Is it documented somewhere?
I just copied the numbers from the socket initialization, because I
wanted to keep the same behaviour. There is no documentation there, and
there is none here. I don't know if there is any particular reason to
choose them over any other. If someone more knowledgeable comments on
it, I can use the chance to add a comment to the code.
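
For what it's worth, the arithmetic works out like this (a standalone
sketch, not kernel code, using a made-up page count just to show the
proportions and what each of the three values controls):

#include <stdio.h>

int main(void)
{
	/* pretend nr_free_buffer_pages() returned 1048576 (~4GB of 4K pages) */
	unsigned long pages = 1048576UL;
	unsigned long limit = pages / 8;

	if (limit < 128UL)
		limit = 128UL;

	long tcp_mem[3];
	tcp_mem[0] = limit / 4 * 3;	/* below this: no memory pressure */
	tcp_mem[1] = limit;		/* above this: enter memory pressure */
	tcp_mem[2] = tcp_mem[0] * 2;	/* hard limit: suppress allocation */

	/* prints: tcp_mem = 98304 131072 196608 (pages) */
	printf("tcp_mem = %ld %ld %ld (pages)\n",
	       tcp_mem[0], tcp_mem[1], tcp_mem[2]);
	return 0;
}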

>
>
>
>>   	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
>>   			net_ipv4_ctl_path, table);
>>   	if (net->ipv4.ipv4_hdr == NULL)
>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
>> index 46febca..e1918fa 100644
>> --- a/net/ipv4/tcp.c
>> +++ b/net/ipv4/tcp.c
>> @@ -266,6 +266,7 @@
>>   #include <linux/crypto.h>
>>   #include <linux/time.h>
>>   #include <linux/slab.h>
>> +#include <linux/nsproxy.h>
>>
>>   #include <net/icmp.h>
>>   #include <net/tcp.h>
>> @@ -282,23 +283,12 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
>>   struct percpu_counter tcp_orphan_count;
>>   EXPORT_SYMBOL_GPL(tcp_orphan_count);
>>
>> -long sysctl_tcp_mem[3] __read_mostly;
>>   int sysctl_tcp_wmem[3] __read_mostly;
>>   int sysctl_tcp_rmem[3] __read_mostly;
>>
>> -EXPORT_SYMBOL(sysctl_tcp_mem);
>>   EXPORT_SYMBOL(sysctl_tcp_rmem);
>>   EXPORT_SYMBOL(sysctl_tcp_wmem);
>>
>> -atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
>> -EXPORT_SYMBOL(tcp_memory_allocated);
>> -
>> -/*
>> - * Current number of TCP sockets.
>> - */
>> -struct percpu_counter tcp_sockets_allocated;
>> -EXPORT_SYMBOL(tcp_sockets_allocated);
>> -
>>   /*
>>    * TCP splice context
>>    */
>> @@ -308,23 +298,157 @@ struct tcp_splice_state {
>>   	unsigned int flags;
>>   };
>>
>> +#ifdef CONFIG_CGROUP_KMEM
>>   /*
>>    * Pressure flag: try to collapse.
>>    * Technical note: it is used by multiple contexts non atomically.
>>    * All the __sk_mem_schedule() is of this nature: accounting
>>    * is strict, actions are advisory and have some latency.
>>    */
>> -int tcp_memory_pressure __read_mostly;
>> -EXPORT_SYMBOL(tcp_memory_pressure);
>> -
>>   void tcp_enter_memory_pressure(struct sock *sk)
>>   {
>> +	struct kmem_cgroup *sg = sk->sk_cgrp;
>> +	if (!sg->tcp_memory_pressure) {
>> +		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
>> +		sg->tcp_memory_pressure = 1;
>> +	}
>> +}
>> +
>> +long *tcp_sysctl_mem(struct kmem_cgroup *sg)
>> +{
>> +	return sg->tcp_prot_mem;
>> +}
>> +
>> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
>> +{
>> +	return &(sg->tcp_memory_allocated);
>> +}
>> +
>> +static int tcp_write_maxmem(struct cgroup *cgrp, struct cftype *cft, u64 val)
>> +{
>> +	struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
>> +	struct net *net = current->nsproxy->net_ns;
>> +	int i;
>> +
>> +	if (!cgroup_lock_live_group(cgrp))
>> +		return -ENODEV;
>> +
>> +	/*
>> +	 * We can't allow more memory than our parents. Since this
>> +	 * will be tested for all calls, by induction, there is no need
>> +	 * to test any parent other than our own
>> +	 * */
>> +	if (sg->parent && (val > sg->parent->tcp_max_memory))
>> +		val = sg->parent->tcp_max_memory;
>> +
>> +	sg->tcp_max_memory = val;
>> +
>> +	for (i = 0; i < 3; i++)
>> +		sg->tcp_prot_mem[i]  = min_t(long, val,
>> +					     net->ipv4.sysctl_tcp_mem[i]);
>> +
>> +	cgroup_unlock();
>> +
>> +	return 0;
>> +}
>> +
>
> Do we really need cgroup_lock/unlock ?
The task writing to a cgroup file does not necessarily belong to it. This
means there is no guarantee that the cgroup will still exist through the
rest of this operation. Maybe this could be done with RCU, since all we
need is a reference to the cgroup (and its parent). But other cgroup file
writers seem to use this lock...
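
(Just to make the alternative concrete: a lock-free read could look
roughly like the sketch below. This is hypothetical, and it assumes the
kmem_cgroup stays valid across an RCU read-side section, which would
have to be verified against the cgroup core before relying on it.)

static u64 tcp_read_maxmem_rcu(struct cgroup *cgrp, struct cftype *cft)
{
	struct kmem_cgroup *sg;
	u64 ret;

	rcu_read_lock();
	sg = kcg_from_cgroup(cgrp);
	ret = sg->tcp_max_memory;
	rcu_read_unlock();

	return ret;
}
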
>
>
>
>> +static u64 tcp_read_maxmem(struct cgroup *cgrp, struct cftype *cft)
>> +{
>> +	struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
>> +	u64 ret;
>> +
>> +	if (!cgroup_lock_live_group(cgrp))
>> +		return -ENODEV;
>> +	ret = sg->tcp_max_memory;
>> +
>> +	cgroup_unlock();
>> +	return ret;
>> +}
>> +
>
>
> Hmm, can't you implement this function as
>
> 	kmem_cgroup_read(SOCK_TCP_MAXMEM, sg);

I can, but as I stated above, I don't think it is a nice change.
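
For reference, the accessor being suggested would presumably look
something like the hypothetical sketch below (the enum and its values
are made up here, just to show the shape being discussed):

enum kmem_cgroup_field {
	SOCK_TCP_MAXMEM,
	/* ... one value per exported field ... */
};

static u64 kmem_cgroup_read(enum kmem_cgroup_field field,
			    struct kmem_cgroup *sg)
{
	switch (field) {
	case SOCK_TCP_MAXMEM:
		return sg->tcp_max_memory;
	default:
		return 0;
	}
}

It trades the per-protocol wrapper functions for a central accessor
that has to know about every protocol-specific field.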

>
> Thanks,
> -Kame

Thank you very much for your attention.

Greg Thelen Sept. 6, 2011, 4:08 p.m. UTC | #3
On Mon, Sep 5, 2011 at 7:35 PM, Glauber Costa <glommer@parallels.com> wrote:
> This patch introduces per-cgroup tcp buffers limitation. This allows
> sysadmins to specify a maximum amount of kernel memory that
> tcp connections can use at any point in time. TCP is the main interest
> in this work, but extending it to other protocols would be easy.

With this approach we would be giving admins the ability to
independently limit user memory with memcg and kernel memory with this
new kmem cgroup.

At least in some situations admins prefer to give a particular
container X bytes without thinking about the kernel vs user split.
Sometimes the admin would prefer the kernel to keep the total
user+kernel memory below a certain threshold.  To achieve this with
this approach, would we need a user space agent to monitor both kernel
and user usage for a container and grow/shrink the memcg/kmem limits?

Do you foresee the kmem cgroup growing to include reclaimable slab,
where freeing one type of memory allows for reclaim of the other?

> It piggybacks in the memory control mechanism already present in
> /proc/sys/net/ipv4/tcp_mem. There is a soft limit, and a hard limit,
> that will suppress allocation when reached. For each cgroup, however,
> the file kmem.tcp_maxmem will be used to cap those values.
>
> The usage I have in mind here is containers. Each container will
> define its own values for soft and hard limits, but none of them will
> be possibly bigger than the value the box' sysadmin specified from
> the outside.
>
> To test for any performance impacts of this patch, I used netperf's
> TCP_RR benchmark on localhost, so we can have both recv and snd in action.
>
> Command line used was ./src/netperf -t TCP_RR -H localhost, and the
> results:
>
> Without the patch
> =================
>
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 16384  87380  1        1       10.00    26996.35
> 16384  87380
>
> With the patch
> ===============
>
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 16384  87380  1        1       10.00    27291.86
> 16384  87380
>
> The difference is within a one-percent range.
>
> Nesting cgroups doesn't seem to be the dominating factor as well,
> with nestings up to 10 levels not showing a significant performance
> difference.
>
> Signed-off-by: Glauber Costa <glommer@parallels.com>
> CC: David S. Miller <davem@davemloft.net>
> CC: Hiroyouki Kamezawa <kamezawa.hiroyu@jp.fujitsu.com>
> CC: Eric W. Biederman <ebiederm@xmission.com>
> ---
>  crypto/af_alg.c               |    7 ++-
>  include/linux/cgroup_subsys.h |    4 +
>  include/net/netns/ipv4.h      |    1 +
>  include/net/sock.h            |   66 +++++++++++++++-
>  include/net/tcp.h             |   12 ++-
>  include/net/udp.h             |    3 +-
>  include/trace/events/sock.h   |   10 +-
>  init/Kconfig                  |   11 +++
>  mm/Makefile                   |    1 +
>  net/core/sock.c               |  136 +++++++++++++++++++++++++++-------
>  net/decnet/af_decnet.c        |   21 +++++-
>  net/ipv4/proc.c               |    8 +-
>  net/ipv4/sysctl_net_ipv4.c    |   59 +++++++++++++--
>  net/ipv4/tcp.c                |  164 +++++++++++++++++++++++++++++++++++-----
>  net/ipv4/tcp_input.c          |   17 ++--
>  net/ipv4/tcp_ipv4.c           |   27 +++++--
>  net/ipv4/tcp_output.c         |    2 +-
>  net/ipv4/tcp_timer.c          |    2 +-
>  net/ipv4/udp.c                |   20 ++++-
>  net/ipv6/tcp_ipv6.c           |   16 +++-
>  net/ipv6/udp.c                |    4 +-
>  net/sctp/socket.c             |   35 +++++++--
>  22 files changed, 514 insertions(+), 112 deletions(-)
>
> diff --git a/crypto/af_alg.c b/crypto/af_alg.c
> index ac33d5f..df168d8 100644
> --- a/crypto/af_alg.c
> +++ b/crypto/af_alg.c
> @@ -29,10 +29,15 @@ struct alg_type_list {
>
>  static atomic_long_t alg_memory_allocated;
>
> +static atomic_long_t *memory_allocated_alg(struct kmem_cgroup *sg)
> +{
> +       return &alg_memory_allocated;
> +}
> +
>  static struct proto alg_proto = {
>        .name                   = "ALG",
>        .owner                  = THIS_MODULE,
> -       .memory_allocated       = &alg_memory_allocated,
> +       .memory_allocated       = memory_allocated_alg,
>        .obj_size               = sizeof(struct alg_sock),
>  };
>
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index ac663c1..363b8e8 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -35,6 +35,10 @@ SUBSYS(cpuacct)
>  SUBSYS(mem_cgroup)
>  #endif
>
> +#ifdef CONFIG_CGROUP_KMEM
> +SUBSYS(kmem)
> +#endif
> +
>  /* */
>
>  #ifdef CONFIG_CGROUP_DEVICE
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index d786b4f..bbd023a 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -55,6 +55,7 @@ struct netns_ipv4 {
>        int current_rt_cache_rebuild_count;
>
>        unsigned int sysctl_ping_group_range[2];
> +       long sysctl_tcp_mem[3];
>
>        atomic_t rt_genid;
>        atomic_t dev_addr_genid;
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 8e4062f..e085148 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -62,7 +62,9 @@
>  #include <linux/atomic.h>
>  #include <net/dst.h>
>  #include <net/checksum.h>
> +#include <linux/kmem_cgroup.h>
>
> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
>  /*
>  * This structure really needs to be cleaned up.
>  * Most of it is for TCP, and not used by any of
> @@ -339,6 +341,7 @@ struct sock {
>  #endif
>        __u32                   sk_mark;
>        u32                     sk_classid;
> +       struct kmem_cgroup      *sk_cgrp;
>        void                    (*sk_state_change)(struct sock *sk);
>        void                    (*sk_data_ready)(struct sock *sk, int bytes);
>        void                    (*sk_write_space)(struct sock *sk);
> @@ -786,16 +789,21 @@ struct proto {
>
>        /* Memory pressure */
>        void                    (*enter_memory_pressure)(struct sock *sk);
> -       atomic_long_t           *memory_allocated;      /* Current allocated memory. */
> -       struct percpu_counter   *sockets_allocated;     /* Current number of sockets. */
> +       /* Current allocated memory. */
> +       atomic_long_t           *(*memory_allocated)(struct kmem_cgroup *sg);
> +       /* Current number of sockets. */
> +       struct percpu_counter   *(*sockets_allocated)(struct kmem_cgroup *sg);
> +
> +       int                     (*init_cgroup)(struct cgroup *cgrp,
> +                                              struct cgroup_subsys *ss);
>        /*
>         * Pressure flag: try to collapse.
>         * Technical note: it is used by multiple contexts non atomically.
>         * All the __sk_mem_schedule() is of this nature: accounting
>         * is strict, actions are advisory and have some latency.
>         */
> -       int                     *memory_pressure;
> -       long                    *sysctl_mem;
> +       int                     *(*memory_pressure)(struct kmem_cgroup *sg);
> +       long                    *(*prot_mem)(struct kmem_cgroup *sg);
>        int                     *sysctl_wmem;
>        int                     *sysctl_rmem;
>        int                     max_header;
> @@ -826,6 +834,56 @@ struct proto {
>  #endif
>  };
>
> +#define sk_memory_pressure(sk)                                         \
> +({                                                                     \
> +       int *__ret = NULL;                                              \
> +       if ((sk)->sk_prot->memory_pressure)                             \
> +               __ret = (sk)->sk_prot->memory_pressure(sk->sk_cgrp);    \
> +       __ret;                                                          \
> +})
> +
> +#define sk_sockets_allocated(sk)                               \
> +({                                                             \
> +       struct percpu_counter *__p;                             \
> +       __p = (sk)->sk_prot->sockets_allocated(sk->sk_cgrp);    \
> +       __p;                                                    \
> +})
> +
> +#define sk_memory_allocated(sk)                                        \
> +({                                                             \
> +       atomic_long_t *__mem;                                   \
> +       __mem = (sk)->sk_prot->memory_allocated(sk->sk_cgrp);   \
> +       __mem;                                                  \
> +})
> +
> +#define sk_prot_mem(sk)                                                \
> +({                                                             \
> +       long *__mem = (sk)->sk_prot->prot_mem(sk->sk_cgrp);     \
> +       __mem;                                                  \
> +})
> +
> +#define sg_memory_pressure(prot, sg)                           \
> +({                                                             \
> +       int *__ret = NULL;                                      \
> +       if (prot->memory_pressure)                              \
> +               __ret = (prot)->memory_pressure(sg);            \
> +       __ret;                                                  \
> +})
> +
> +#define sg_memory_allocated(prot, sg)                          \
> +({                                                             \
> +       atomic_long_t *__mem;                                   \
> +       __mem = (prot)->memory_allocated(sg);                   \
> +       __mem;                                                  \
> +})
> +
> +#define sg_sockets_allocated(prot, sg)                         \
> +({                                                             \
> +       struct percpu_counter *__p;                             \
> +       __p = (prot)->sockets_allocated(sg);                    \
> +       __p;                                                    \
> +})
> +
>  extern int proto_register(struct proto *prot, int alloc_slab);
>  extern void proto_unregister(struct proto *prot);
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 149a415..8e1ec4a 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -230,7 +230,6 @@ extern int sysctl_tcp_fack;
>  extern int sysctl_tcp_reordering;
>  extern int sysctl_tcp_ecn;
>  extern int sysctl_tcp_dsack;
> -extern long sysctl_tcp_mem[3];
>  extern int sysctl_tcp_wmem[3];
>  extern int sysctl_tcp_rmem[3];
>  extern int sysctl_tcp_app_win;
> @@ -253,9 +252,12 @@ extern int sysctl_tcp_cookie_size;
>  extern int sysctl_tcp_thin_linear_timeouts;
>  extern int sysctl_tcp_thin_dupack;
>
> -extern atomic_long_t tcp_memory_allocated;
> -extern struct percpu_counter tcp_sockets_allocated;
> -extern int tcp_memory_pressure;
> +struct kmem_cgroup;
> +extern long *tcp_sysctl_mem(struct kmem_cgroup *sg);
> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg);
> +int *memory_pressure_tcp(struct kmem_cgroup *sg);
> +int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss);
> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg);
>
>  /*
>  * The next routines deal with comparing 32 bit unsigned ints
> @@ -286,7 +288,7 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
>        }
>
>        if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
> -           atomic_long_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])
> +           atomic_long_read(sk_memory_allocated(sk)) > sk_prot_mem(sk)[2])
>                return true;
>        return false;
>  }
> diff --git a/include/net/udp.h b/include/net/udp.h
> index 67ea6fc..0e27388 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -105,7 +105,8 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
>
>  extern struct proto udp_prot;
>
> -extern atomic_long_t udp_memory_allocated;
> +atomic_long_t *memory_allocated_udp(struct kmem_cgroup *sg);
> +long *udp_sysctl_mem(struct kmem_cgroup *sg);
>
>  /* sysctl variables for udp */
>  extern long sysctl_udp_mem[3];
> diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
> index 779abb9..12a6083 100644
> --- a/include/trace/events/sock.h
> +++ b/include/trace/events/sock.h
> @@ -37,7 +37,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
>
>        TP_STRUCT__entry(
>                __array(char, name, 32)
> -               __field(long *, sysctl_mem)
> +               __field(long *, prot_mem)
>                __field(long, allocated)
>                __field(int, sysctl_rmem)
>                __field(int, rmem_alloc)
> @@ -45,7 +45,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
>
>        TP_fast_assign(
>                strncpy(__entry->name, prot->name, 32);
> -               __entry->sysctl_mem = prot->sysctl_mem;
> +               __entry->prot_mem = sk->sk_prot->prot_mem(sk->sk_cgrp);
>                __entry->allocated = allocated;
>                __entry->sysctl_rmem = prot->sysctl_rmem[0];
>                __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
> @@ -54,9 +54,9 @@ TRACE_EVENT(sock_exceed_buf_limit,
>        TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld "
>                "sysctl_rmem=%d rmem_alloc=%d",
>                __entry->name,
> -               __entry->sysctl_mem[0],
> -               __entry->sysctl_mem[1],
> -               __entry->sysctl_mem[2],
> +               __entry->prot_mem[0],
> +               __entry->prot_mem[1],
> +               __entry->prot_mem[2],
>                __entry->allocated,
>                __entry->sysctl_rmem,
>                __entry->rmem_alloc)
> diff --git a/init/Kconfig b/init/Kconfig
> index d627783..5955ac2 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -690,6 +690,17 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
>          select this option (if, for some reason, they need to disable it
>          then swapaccount=0 does the trick).
>
> +config CGROUP_KMEM
> +       bool "Kernel Memory Resource Controller for Control Groups"
> +       depends on CGROUPS
> +       help
> +         The Kernel Memory cgroup can limit the amount of memory used by
> +         certain kernel objects in the system. Those are fundamentally
> +         different from the entities handled by the Memory Controller,
> +         which are page-based, and can be swapped. Users of the kmem
> +         cgroup can use it to guarantee that no group of processes will
> +         ever exhaust kernel resources alone.
> +
>  config CGROUP_PERF
>        bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
>        depends on PERF_EVENTS && CGROUPS
> diff --git a/mm/Makefile b/mm/Makefile
> index 836e416..1b1aa24 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -45,6 +45,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
>  obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
>  obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
> +obj-$(CONFIG_CGROUP_KMEM) += kmem_cgroup.o
>  obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
>  obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
>  obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 3449df8..2d968ea 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -134,6 +134,24 @@
>  #include <net/tcp.h>
>  #endif
>
> +static DEFINE_RWLOCK(proto_list_lock);
> +static LIST_HEAD(proto_list);
> +
> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
> +{
> +       struct proto *proto;
> +       int ret = 0;
> +
> +       read_lock(&proto_list_lock);
> +       list_for_each_entry(proto, &proto_list, node) {
> +               if (proto->init_cgroup)
> +                       ret |= proto->init_cgroup(cgrp, ss);
> +       }
> +       read_unlock(&proto_list_lock);
> +
> +       return ret;
> +}
> +
>  /*
>  * Each address family might have different locking rules, so we have
>  * one slock key per address family:
> @@ -1114,6 +1132,31 @@ void sock_update_classid(struct sock *sk)
>  EXPORT_SYMBOL(sock_update_classid);
>  #endif
>
> +void sock_update_kmem_cgrp(struct sock *sk)
> +{
> +#ifdef CONFIG_CGROUP_KMEM
> +       sk->sk_cgrp = kcg_from_task(current);
> +
> +       /*
> +        * We don't need to protect against anything task-related, because
> +        * we are basically stuck with the sock pointer that won't change,
> +        * even if the task that originated the socket changes cgroups.
> +        *
> +        * What we do have to guarantee, is that the chain leading us to
> +        * the top level won't change under our noses. Incrementing the
> +        * reference count via cgroup_exclude_rmdir guarantees that.
> +        */
> +       cgroup_exclude_rmdir(&sk->sk_cgrp->css);
> +#endif
> +}
> +
> +void sock_release_kmem_cgrp(struct sock *sk)
> +{
> +#ifdef CONFIG_CGROUP_KMEM
> +       cgroup_release_and_wakeup_rmdir(&sk->sk_cgrp->css);
> +#endif
> +}
> +
>  /**
>  *     sk_alloc - All socket objects are allocated here
>  *     @net: the applicable net namespace
> @@ -1139,6 +1182,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
>                atomic_set(&sk->sk_wmem_alloc, 1);
>
>                sock_update_classid(sk);
> +               sock_update_kmem_cgrp(sk);
>        }
>
>        return sk;
> @@ -1170,6 +1214,7 @@ static void __sk_free(struct sock *sk)
>                put_cred(sk->sk_peer_cred);
>        put_pid(sk->sk_peer_pid);
>        put_net(sock_net(sk));
> +       sock_release_kmem_cgrp(sk);
>        sk_prot_free(sk->sk_prot_creator, sk);
>  }
>
> @@ -1287,8 +1332,8 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
>                sk_set_socket(newsk, NULL);
>                newsk->sk_wq = NULL;
>
> -               if (newsk->sk_prot->sockets_allocated)
> -                       percpu_counter_inc(newsk->sk_prot->sockets_allocated);
> +               if (sk_sockets_allocated(sk))
> +                       percpu_counter_inc(sk_sockets_allocated(sk));
>
>                if (sock_flag(newsk, SOCK_TIMESTAMP) ||
>                    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
> @@ -1676,29 +1721,51 @@ EXPORT_SYMBOL(sk_wait_data);
>  */
>  int __sk_mem_schedule(struct sock *sk, int size, int kind)
>  {
> -       struct proto *prot = sk->sk_prot;
>        int amt = sk_mem_pages(size);
> +       struct proto *prot = sk->sk_prot;
>        long allocated;
> +       int *memory_pressure;
> +       long *prot_mem;
> +       int parent_failure = 0;
> +       struct kmem_cgroup *sg;
>
>        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
> -       allocated = atomic_long_add_return(amt, prot->memory_allocated);
> +
> +       memory_pressure = sk_memory_pressure(sk);
> +       prot_mem = sk_prot_mem(sk);
> +
> +       allocated = atomic_long_add_return(amt, sk_memory_allocated(sk));
> +
> +#ifdef CONFIG_CGROUP_KMEM
> +       for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
> +               long alloc;
> +               /*
> +                * Large nestings are not the common case, and stopping in the
> +                * middle would be complicated enough, that we bill it all the
> +                * way through the root, and if needed, unbill everything later
> +                */
> +               alloc = atomic_long_add_return(amt,
> +                                              sg_memory_allocated(prot, sg));
> +               parent_failure |= (alloc > sk_prot_mem(sk)[2]);
> +       }
> +#endif
> +
> +       /* Over hard limit (we, or our parents) */
> +       if (parent_failure || (allocated > prot_mem[2]))
> +               goto suppress_allocation;
>
>        /* Under limit. */
> -       if (allocated <= prot->sysctl_mem[0]) {
> -               if (prot->memory_pressure && *prot->memory_pressure)
> -                       *prot->memory_pressure = 0;
> +       if (allocated <= prot_mem[0]) {
> +               if (memory_pressure && *memory_pressure)
> +                       *memory_pressure = 0;
>                return 1;
>        }
>
>        /* Under pressure. */
> -       if (allocated > prot->sysctl_mem[1])
> +       if (allocated > prot_mem[1])
>                if (prot->enter_memory_pressure)
>                        prot->enter_memory_pressure(sk);
>
> -       /* Over hard limit. */
> -       if (allocated > prot->sysctl_mem[2])
> -               goto suppress_allocation;
> -
>        /* guarantee minimum buffer size under pressure */
>        if (kind == SK_MEM_RECV) {
>                if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
> @@ -1712,13 +1779,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
>                                return 1;
>        }
>
> -       if (prot->memory_pressure) {
> +       if (memory_pressure) {
>                int alloc;
>
> -               if (!*prot->memory_pressure)
> +               if (!*memory_pressure)
>                        return 1;
> -               alloc = percpu_counter_read_positive(prot->sockets_allocated);
> -               if (prot->sysctl_mem[2] > alloc *
> +               alloc = percpu_counter_read_positive(sk_sockets_allocated(sk));
> +               if (prot_mem[2] > alloc *
>                    sk_mem_pages(sk->sk_wmem_queued +
>                                 atomic_read(&sk->sk_rmem_alloc) +
>                                 sk->sk_forward_alloc))
> @@ -1741,7 +1808,13 @@ suppress_allocation:
>
>        /* Alas. Undo changes. */
>        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
> -       atomic_long_sub(amt, prot->memory_allocated);
> +
> +       atomic_long_sub(amt, sk_memory_allocated(sk));
> +
> +#ifdef CONFIG_CGROUP_KMEM
> +       for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent)
> +               atomic_long_sub(amt, sg_memory_allocated(prot, sg));
> +#endif
>        return 0;
>  }
>  EXPORT_SYMBOL(__sk_mem_schedule);
> @@ -1753,14 +1826,24 @@ EXPORT_SYMBOL(__sk_mem_schedule);
>  void __sk_mem_reclaim(struct sock *sk)
>  {
>        struct proto *prot = sk->sk_prot;
> +       struct kmem_cgroup *sg = sk->sk_cgrp;
> +       int *memory_pressure = sk_memory_pressure(sk);
>
>        atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
> -                  prot->memory_allocated);
> +                  sk_memory_allocated(sk));
> +
> +#ifdef CONFIG_CGROUP_KMEM
> +       for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
> +               atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
> +                                               sg_memory_allocated(prot, sg));
> +       }
> +#endif
> +
>        sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
>
> -       if (prot->memory_pressure && *prot->memory_pressure &&
> -           (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
> -               *prot->memory_pressure = 0;
> +       if (memory_pressure && *memory_pressure &&
> +           (atomic_long_read(sk_memory_allocated(sk)) < sk_prot_mem(sk)[0]))
> +               *memory_pressure = 0;
>  }
>  EXPORT_SYMBOL(__sk_mem_reclaim);
>
> @@ -2252,9 +2335,6 @@ void sk_common_release(struct sock *sk)
>  }
>  EXPORT_SYMBOL(sk_common_release);
>
> -static DEFINE_RWLOCK(proto_list_lock);
> -static LIST_HEAD(proto_list);
> -
>  #ifdef CONFIG_PROC_FS
>  #define PROTO_INUSE_NR 64      /* should be enough for the first time */
>  struct prot_inuse {
> @@ -2479,13 +2559,17 @@ static char proto_method_implemented(const void *method)
>
>  static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
>  {
> +       struct kmem_cgroup *sg = kcg_from_task(current);
> +
>        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
>                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
>                   proto->name,
>                   proto->obj_size,
>                   sock_prot_inuse_get(seq_file_net(seq), proto),
> -                  proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
> -                  proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
> +                  proto->memory_allocated != NULL ?
> +                       atomic_long_read(sg_memory_allocated(proto, sg)) : -1L,
> +                  proto->memory_pressure != NULL ?
> +                       *sg_memory_pressure(proto, sg) ? "yes" : "no" : "NI",
>                   proto->max_header,
>                   proto->slab == NULL ? "no" : "yes",
>                   module_name(proto->owner),
> diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
> index 19acd00..463b299 100644
> --- a/net/decnet/af_decnet.c
> +++ b/net/decnet/af_decnet.c
> @@ -458,13 +458,28 @@ static void dn_enter_memory_pressure(struct sock *sk)
>        }
>  }
>
> +static atomic_long_t *memory_allocated_dn(struct kmem_cgroup *sg)
> +{
> +       return &decnet_memory_allocated;
> +}
> +
> +static int *memory_pressure_dn(struct kmem_cgroup *sg)
> +{
> +       return &dn_memory_pressure;
> +}
> +
> +static long *dn_sysctl_mem(struct kmem_cgroup *sg)
> +{
> +       return sysctl_decnet_mem;
> +}
> +
>  static struct proto dn_proto = {
>        .name                   = "NSP",
>        .owner                  = THIS_MODULE,
>        .enter_memory_pressure  = dn_enter_memory_pressure,
> -       .memory_pressure        = &dn_memory_pressure,
> -       .memory_allocated       = &decnet_memory_allocated,
> -       .sysctl_mem             = sysctl_decnet_mem,
> +       .memory_pressure        = memory_pressure_dn,
> +       .memory_allocated       = memory_allocated_dn,
> +       .prot_mem               = dn_sysctl_mem,
>        .sysctl_wmem            = sysctl_decnet_wmem,
>        .sysctl_rmem            = sysctl_decnet_rmem,
>        .max_header             = DN_MAX_NSP_DATA_HEADER + 64,
> diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
> index b14ec7d..9c80acf 100644
> --- a/net/ipv4/proc.c
> +++ b/net/ipv4/proc.c
> @@ -52,20 +52,22 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
>  {
>        struct net *net = seq->private;
>        int orphans, sockets;
> +       struct kmem_cgroup *sg = kcg_from_task(current);
> +       struct percpu_counter *allocated = sg_sockets_allocated(&tcp_prot, sg);
>
>        local_bh_disable();
>        orphans = percpu_counter_sum_positive(&tcp_orphan_count);
> -       sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
> +       sockets = percpu_counter_sum_positive(allocated);
>        local_bh_enable();
>
>        socket_seq_show(seq);
>        seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
>                   sock_prot_inuse_get(net, &tcp_prot), orphans,
>                   tcp_death_row.tw_count, sockets,
> -                  atomic_long_read(&tcp_memory_allocated));
> +                  atomic_long_read(sg_memory_allocated((&tcp_prot), sg)));
>        seq_printf(seq, "UDP: inuse %d mem %ld\n",
>                   sock_prot_inuse_get(net, &udp_prot),
> -                  atomic_long_read(&udp_memory_allocated));
> +                  atomic_long_read(sg_memory_allocated((&udp_prot), sg)));
>        seq_printf(seq, "UDPLITE: inuse %d\n",
>                   sock_prot_inuse_get(net, &udplite_prot));
>        seq_printf(seq, "RAW: inuse %d\n",
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 69fd720..5e89480 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -14,6 +14,8 @@
>  #include <linux/init.h>
>  #include <linux/slab.h>
>  #include <linux/nsproxy.h>
> +#include <linux/kmem_cgroup.h>
> +#include <linux/swap.h>
>  #include <net/snmp.h>
>  #include <net/icmp.h>
>  #include <net/ip.h>
> @@ -174,6 +176,43 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
>        return ret;
>  }
>
> +static int ipv4_tcp_mem(ctl_table *ctl, int write,
> +                          void __user *buffer, size_t *lenp,
> +                          loff_t *ppos)
> +{
> +       int ret;
> +       unsigned long vec[3];
> +       struct kmem_cgroup *kmem = kcg_from_task(current);
> +       struct net *net = current->nsproxy->net_ns;
> +       int i;
> +
> +       ctl_table tmp = {
> +               .data = &vec,
> +               .maxlen = sizeof(vec),
> +               .mode = ctl->mode,
> +       };
> +
> +       if (!write) {
> +               ctl->data = &net->ipv4.sysctl_tcp_mem;
> +               return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
> +       }
> +
> +       ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
> +       if (ret)
> +               return ret;
> +
> +       for (i = 0; i < 3; i++)
> +               if (vec[i] > kmem->tcp_max_memory)
> +                       return -EINVAL;
> +
> +       for (i = 0; i < 3; i++) {
> +               net->ipv4.sysctl_tcp_mem[i] = vec[i];
> +               kmem->tcp_prot_mem[i] = net->ipv4.sysctl_tcp_mem[i];
> +       }
> +
> +       return 0;
> +}
> +
>  static struct ctl_table ipv4_table[] = {
>        {
>                .procname       = "tcp_timestamps",
> @@ -433,13 +472,6 @@ static struct ctl_table ipv4_table[] = {
>                .proc_handler   = proc_dointvec
>        },
>        {
> -               .procname       = "tcp_mem",
> -               .data           = &sysctl_tcp_mem,
> -               .maxlen         = sizeof(sysctl_tcp_mem),
> -               .mode           = 0644,
> -               .proc_handler   = proc_doulongvec_minmax
> -       },
> -       {
>                .procname       = "tcp_wmem",
>                .data           = &sysctl_tcp_wmem,
>                .maxlen         = sizeof(sysctl_tcp_wmem),
> @@ -721,6 +753,12 @@ static struct ctl_table ipv4_net_table[] = {
>                .mode           = 0644,
>                .proc_handler   = ipv4_ping_group_range,
>        },
> +       {
> +               .procname       = "tcp_mem",
> +               .maxlen         = sizeof(init_net.ipv4.sysctl_tcp_mem),
> +               .mode           = 0644,
> +               .proc_handler   = ipv4_tcp_mem,
> +       },
>        { }
>  };
>
> @@ -734,6 +772,7 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
>  static __net_init int ipv4_sysctl_init_net(struct net *net)
>  {
>        struct ctl_table *table;
> +       unsigned long limit;
>
>        table = ipv4_net_table;
>        if (!net_eq(net, &init_net)) {
> @@ -769,6 +808,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
>
>        net->ipv4.sysctl_rt_cache_rebuild_count = 4;
>
> +       limit = nr_free_buffer_pages() / 8;
> +       limit = max(limit, 128UL);
> +       net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
> +       net->ipv4.sysctl_tcp_mem[1] = limit;
> +       net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
> +
>        net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
>                        net_ipv4_ctl_path, table);
>        if (net->ipv4.ipv4_hdr == NULL)
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 46febca..e1918fa 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -266,6 +266,7 @@
>  #include <linux/crypto.h>
>  #include <linux/time.h>
>  #include <linux/slab.h>
> +#include <linux/nsproxy.h>
>
>  #include <net/icmp.h>
>  #include <net/tcp.h>
> @@ -282,23 +283,12 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
>  struct percpu_counter tcp_orphan_count;
>  EXPORT_SYMBOL_GPL(tcp_orphan_count);
>
> -long sysctl_tcp_mem[3] __read_mostly;
>  int sysctl_tcp_wmem[3] __read_mostly;
>  int sysctl_tcp_rmem[3] __read_mostly;
>
> -EXPORT_SYMBOL(sysctl_tcp_mem);
>  EXPORT_SYMBOL(sysctl_tcp_rmem);
>  EXPORT_SYMBOL(sysctl_tcp_wmem);
>
> -atomic_long_t tcp_memory_allocated;    /* Current allocated memory. */
> -EXPORT_SYMBOL(tcp_memory_allocated);
> -
> -/*
> - * Current number of TCP sockets.
> - */
> -struct percpu_counter tcp_sockets_allocated;
> -EXPORT_SYMBOL(tcp_sockets_allocated);
> -
>  /*
>  * TCP splice context
>  */
> @@ -308,23 +298,157 @@ struct tcp_splice_state {
>        unsigned int flags;
>  };
>
> +#ifdef CONFIG_CGROUP_KMEM
>  /*
>  * Pressure flag: try to collapse.
>  * Technical note: it is used by multiple contexts non atomically.
>  * All the __sk_mem_schedule() is of this nature: accounting
>  * is strict, actions are advisory and have some latency.
>  */
> -int tcp_memory_pressure __read_mostly;
> -EXPORT_SYMBOL(tcp_memory_pressure);
> -
>  void tcp_enter_memory_pressure(struct sock *sk)
>  {
> +       struct kmem_cgroup *sg = sk->sk_cgrp;
> +       if (!sg->tcp_memory_pressure) {
> +               NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
> +               sg->tcp_memory_pressure = 1;
> +       }
> +}
> +
> +long *tcp_sysctl_mem(struct kmem_cgroup *sg)
> +{
> +       return sg->tcp_prot_mem;
> +}
> +
> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
> +{
> +       return &(sg->tcp_memory_allocated);
> +}
> +
> +static int tcp_write_maxmem(struct cgroup *cgrp, struct cftype *cft, u64 val)
> +{
> +       struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
> +       struct net *net = current->nsproxy->net_ns;
> +       int i;
> +
> +       if (!cgroup_lock_live_group(cgrp))
> +               return -ENODEV;
> +
> +       /*
> +        * We can't allow more memory than our parents. Since this
> +        * will be tested for all calls, by induction, there is no need
> +        * to test any parent other than our own
> +        * */
> +       if (sg->parent && (val > sg->parent->tcp_max_memory))
> +               val = sg->parent->tcp_max_memory;
> +
> +       sg->tcp_max_memory = val;
> +
> +       for (i = 0; i < 3; i++)
> +               sg->tcp_prot_mem[i]  = min_t(long, val,
> +                                            net->ipv4.sysctl_tcp_mem[i]);
> +
> +       cgroup_unlock();
> +
> +       return 0;
> +}
> +
> +static u64 tcp_read_maxmem(struct cgroup *cgrp, struct cftype *cft)
> +{
> +       struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
> +       u64 ret;
> +
> +       if (!cgroup_lock_live_group(cgrp))
> +               return -ENODEV;
> +       ret = sg->tcp_max_memory;
> +
> +       cgroup_unlock();
> +       return ret;
> +}
> +
> +static struct cftype tcp_files[] = {
> +       {
> +               .name = "tcp_maxmem",
> +               .write_u64 = tcp_write_maxmem,
> +               .read_u64 = tcp_read_maxmem,
> +       },
> +};
> +
> +int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
> +{
> +       struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
> +       unsigned long limit;
> +       struct net *net = current->nsproxy->net_ns;
> +
> +       sg->tcp_memory_pressure = 0;
> +       atomic_long_set(&sg->tcp_memory_allocated, 0);
> +       percpu_counter_init(&sg->tcp_sockets_allocated, 0);
> +
> +       limit = nr_free_buffer_pages() / 8;
> +       limit = max(limit, 128UL);
> +
> +       if (sg->parent)
> +               sg->tcp_max_memory = sg->parent->tcp_max_memory;
> +       else
> +               sg->tcp_max_memory = limit * 2;
> +
> +       sg->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
> +       sg->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
> +       sg->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
> +
> +       return cgroup_add_files(cgrp, ss, tcp_files, ARRAY_SIZE(tcp_files));
> +}
> +EXPORT_SYMBOL(tcp_init_cgroup);
> +
> +int *memory_pressure_tcp(struct kmem_cgroup *sg)
> +{
> +       return &sg->tcp_memory_pressure;
> +}
> +
> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg)
> +{
> +       return &sg->tcp_sockets_allocated;
> +}
> +#else
> +
> +/* Current number of TCP sockets. */
> +struct percpu_counter tcp_sockets_allocated;
> +atomic_long_t tcp_memory_allocated;    /* Current allocated memory. */
> +int tcp_memory_pressure;
> +
> +int *memory_pressure_tcp(struct kmem_cgroup *sg)
> +{
> +       return &tcp_memory_pressure;
> +}
> +
> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg)
> +{
> +       return &tcp_sockets_allocated;
> +}
> +
> +void tcp_enter_memory_pressure(struct sock *sock)
> +{
>        if (!tcp_memory_pressure) {
>                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
>                tcp_memory_pressure = 1;
>        }
>  }
> +
> +long *tcp_sysctl_mem(struct kmem_cgroup *sg)
> +{
> +       return init_net.ipv4.sysctl_tcp_mem;
> +}
> +
> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
> +{
> +       return &tcp_memory_allocated;
> +}
> +#endif /* CONFIG_CGROUP_KMEM */
> +
> +EXPORT_SYMBOL(memory_pressure_tcp);
> +EXPORT_SYMBOL(sockets_allocated_tcp);
>  EXPORT_SYMBOL(tcp_enter_memory_pressure);
> +EXPORT_SYMBOL(tcp_sysctl_mem);
> +EXPORT_SYMBOL(memory_allocated_tcp);
>
>  /* Convert seconds to retransmits based on initial and max timeout */
>  static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
> @@ -3226,7 +3350,9 @@ void __init tcp_init(void)
>
>        BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
>
> +#ifndef CONFIG_CGROUP_KMEM
>        percpu_counter_init(&tcp_sockets_allocated, 0);
> +#endif
>        percpu_counter_init(&tcp_orphan_count, 0);
>        tcp_hashinfo.bind_bucket_cachep =
>                kmem_cache_create("tcp_bind_bucket",
> @@ -3277,14 +3403,10 @@ void __init tcp_init(void)
>        sysctl_tcp_max_orphans = cnt / 2;
>        sysctl_max_syn_backlog = max(128, cnt / 256);
>
> -       limit = nr_free_buffer_pages() / 8;
> -       limit = max(limit, 128UL);
> -       sysctl_tcp_mem[0] = limit / 4 * 3;
> -       sysctl_tcp_mem[1] = limit;
> -       sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
> -
>        /* Set per-socket limits to no more than 1/128 the pressure threshold */
> -       limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
> +       limit = (unsigned long)init_net.ipv4.sysctl_tcp_mem[1];
> +       limit <<= (PAGE_SHIFT - 7);
> +
>        max_share = min(4UL*1024*1024, limit);
>
>        sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index ea0d218..c44e830 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -316,7 +316,7 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
>        /* Check #1 */
>        if (tp->rcv_ssthresh < tp->window_clamp &&
>            (int)tp->rcv_ssthresh < tcp_space(sk) &&
> -           !tcp_memory_pressure) {
> +           !sk_memory_pressure(sk)) {
>                int incr;
>
>                /* Check #2. Increase window, if skb with such overhead
> @@ -393,15 +393,16 @@ static void tcp_clamp_window(struct sock *sk)
>  {
>        struct tcp_sock *tp = tcp_sk(sk);
>        struct inet_connection_sock *icsk = inet_csk(sk);
> +       struct proto *prot = sk->sk_prot;
>
>        icsk->icsk_ack.quick = 0;
>
> -       if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
> +       if (sk->sk_rcvbuf < prot->sysctl_rmem[2] &&
>            !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
> -           !tcp_memory_pressure &&
> -           atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
> +           !sk_memory_pressure(sk) &&
> +           atomic_long_read(sk_memory_allocated(sk)) < sk_prot_mem(sk)[0]) {
>                sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
> -                                   sysctl_tcp_rmem[2]);
> +                                   prot->sysctl_rmem[2]);
>        }
>        if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
>                tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
> @@ -4806,7 +4807,7 @@ static int tcp_prune_queue(struct sock *sk)
>
>        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
>                tcp_clamp_window(sk);
> -       else if (tcp_memory_pressure)
> +       else if (sk_memory_pressure(sk))
>                tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
>
>        tcp_collapse_ofo_queue(sk);
> @@ -4872,11 +4873,11 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
>                return 0;
>
>        /* If we are under global TCP memory pressure, do not expand.  */
> -       if (tcp_memory_pressure)
> +       if (sk_memory_pressure(sk))
>                return 0;
>
>        /* If we are under soft global TCP memory pressure, do not expand.  */
> -       if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
> +       if (atomic_long_read(sk_memory_allocated(sk)) >= sk_prot_mem(sk)[0])
>                return 0;
>
>        /* If we filled the congestion window, do not expand.  */
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 1c12b8e..af6c095 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1848,6 +1848,7 @@ static int tcp_v4_init_sock(struct sock *sk)
>  {
>        struct inet_connection_sock *icsk = inet_csk(sk);
>        struct tcp_sock *tp = tcp_sk(sk);
> +       struct kmem_cgroup *sg;
>
>        skb_queue_head_init(&tp->out_of_order_queue);
>        tcp_init_xmit_timers(sk);
> @@ -1901,7 +1902,13 @@ static int tcp_v4_init_sock(struct sock *sk)
>        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
>
>        local_bh_disable();
> -       percpu_counter_inc(&tcp_sockets_allocated);
> +       percpu_counter_inc(sk_sockets_allocated(sk));
> +
> +#ifdef CONFIG_CGROUP_KMEM
> +       for (sg = sk->sk_cgrp->parent; sg; sg = sg->parent)
> +               percpu_counter_inc(sg_sockets_allocated(sk->sk_prot, sg));
> +#endif
> +
>        local_bh_enable();
>
>        return 0;
> @@ -1910,6 +1917,7 @@ static int tcp_v4_init_sock(struct sock *sk)
>  void tcp_v4_destroy_sock(struct sock *sk)
>  {
>        struct tcp_sock *tp = tcp_sk(sk);
> +       struct kmem_cgroup *sg;
>
>        tcp_clear_xmit_timers(sk);
>
> @@ -1957,7 +1965,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
>                tp->cookie_values = NULL;
>        }
>
> -       percpu_counter_dec(&tcp_sockets_allocated);
> +       percpu_counter_dec(sk_sockets_allocated(sk));
> +#ifdef CONFIG_CGROUP_KMEM
> +       for (sg = sk->sk_cgrp->parent; sg; sg = sg->parent)
> +               percpu_counter_dec(sg_sockets_allocated(sk->sk_prot, sg));
> +#endif
>  }
>  EXPORT_SYMBOL(tcp_v4_destroy_sock);
>
> @@ -2598,11 +2610,14 @@ struct proto tcp_prot = {
>        .unhash                 = inet_unhash,
>        .get_port               = inet_csk_get_port,
>        .enter_memory_pressure  = tcp_enter_memory_pressure,
> -       .sockets_allocated      = &tcp_sockets_allocated,
> +       .memory_pressure        = memory_pressure_tcp,
> +       .sockets_allocated      = sockets_allocated_tcp,
>        .orphan_count           = &tcp_orphan_count,
> -       .memory_allocated       = &tcp_memory_allocated,
> -       .memory_pressure        = &tcp_memory_pressure,
> -       .sysctl_mem             = sysctl_tcp_mem,
> +       .memory_allocated       = memory_allocated_tcp,
> +#ifdef CONFIG_CGROUP_KMEM
> +       .init_cgroup            = tcp_init_cgroup,
> +#endif
> +       .prot_mem               = tcp_sysctl_mem,
>        .sysctl_wmem            = sysctl_tcp_wmem,
>        .sysctl_rmem            = sysctl_tcp_rmem,
>        .max_header             = MAX_TCP_HEADER,
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 882e0b0..06aeb31 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -1912,7 +1912,7 @@ u32 __tcp_select_window(struct sock *sk)
>        if (free_space < (full_space >> 1)) {
>                icsk->icsk_ack.quick = 0;
>
> -               if (tcp_memory_pressure)
> +               if (sk_memory_pressure(sk))
>                        tp->rcv_ssthresh = min(tp->rcv_ssthresh,
>                                               4U * tp->advmss);
>
> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> index ecd44b0..2c67617 100644
> --- a/net/ipv4/tcp_timer.c
> +++ b/net/ipv4/tcp_timer.c
> @@ -261,7 +261,7 @@ static void tcp_delack_timer(unsigned long data)
>        }
>
>  out:
> -       if (tcp_memory_pressure)
> +       if (sk_memory_pressure(sk))
>                sk_mem_reclaim(sk);
>  out_unlock:
>        bh_unlock_sock(sk);
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 1b5a193..6c08c65 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -120,9 +120,6 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min);
>  int sysctl_udp_wmem_min __read_mostly;
>  EXPORT_SYMBOL(sysctl_udp_wmem_min);
>
> -atomic_long_t udp_memory_allocated;
> -EXPORT_SYMBOL(udp_memory_allocated);
> -
>  #define MAX_UDP_PORTS 65536
>  #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
>
> @@ -1918,6 +1915,19 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
>  }
>  EXPORT_SYMBOL(udp_poll);
>
> +static atomic_long_t udp_memory_allocated;
> +atomic_long_t *memory_allocated_udp(struct kmem_cgroup *sg)
> +{
> +       return &udp_memory_allocated;
> +}
> +EXPORT_SYMBOL(memory_allocated_udp);
> +
> +long *udp_sysctl_mem(struct kmem_cgroup *sg)
> +{
> +       return sysctl_udp_mem;
> +}
> +EXPORT_SYMBOL(udp_sysctl_mem);
> +
>  struct proto udp_prot = {
>        .name              = "UDP",
>        .owner             = THIS_MODULE,
> @@ -1936,8 +1946,8 @@ struct proto udp_prot = {
>        .unhash            = udp_lib_unhash,
>        .rehash            = udp_v4_rehash,
>        .get_port          = udp_v4_get_port,
> -       .memory_allocated  = &udp_memory_allocated,
> -       .sysctl_mem        = sysctl_udp_mem,
> +       .memory_allocated  = &memory_allocated_udp,
> +       .prot_mem          = udp_sysctl_mem,
>        .sysctl_wmem       = &sysctl_udp_wmem_min,
>        .sysctl_rmem       = &sysctl_udp_rmem_min,
>        .obj_size          = sizeof(struct udp_sock),
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index d1fb63f..0762e68 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1959,6 +1959,7 @@ static int tcp_v6_init_sock(struct sock *sk)
>  {
>        struct inet_connection_sock *icsk = inet_csk(sk);
>        struct tcp_sock *tp = tcp_sk(sk);
> +       struct kmem_cgroup *sg;
>
>        skb_queue_head_init(&tp->out_of_order_queue);
>        tcp_init_xmit_timers(sk);
> @@ -2012,7 +2013,12 @@ static int tcp_v6_init_sock(struct sock *sk)
>        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
>
>        local_bh_disable();
> -       percpu_counter_inc(&tcp_sockets_allocated);
> +       percpu_counter_inc(sk_sockets_allocated(sk));
> +#ifdef CONFIG_CGROUP_KMEM
> +       for (sg = sk->sk_cgrp->parent; sg; sg = sg->parent)
> +               percpu_counter_dec(sg_sockets_allocated(sk->sk_prot, sg));
> +#endif
> +
>        local_bh_enable();
>
>        return 0;
> @@ -2221,11 +2227,11 @@ struct proto tcpv6_prot = {
>        .unhash                 = inet_unhash,
>        .get_port               = inet_csk_get_port,
>        .enter_memory_pressure  = tcp_enter_memory_pressure,
> -       .sockets_allocated      = &tcp_sockets_allocated,
> -       .memory_allocated       = &tcp_memory_allocated,
> -       .memory_pressure        = &tcp_memory_pressure,
> +       .sockets_allocated      = sockets_allocated_tcp,
> +       .memory_allocated       = memory_allocated_tcp,
> +       .memory_pressure        = memory_pressure_tcp,
>        .orphan_count           = &tcp_orphan_count,
> -       .sysctl_mem             = sysctl_tcp_mem,
> +       .prot_mem               = tcp_sysctl_mem,
>        .sysctl_wmem            = sysctl_tcp_wmem,
>        .sysctl_rmem            = sysctl_tcp_rmem,
>        .max_header             = MAX_TCP_HEADER,
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 29213b5..ef4b5b3 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -1465,8 +1465,8 @@ struct proto udpv6_prot = {
>        .unhash            = udp_lib_unhash,
>        .rehash            = udp_v6_rehash,
>        .get_port          = udp_v6_get_port,
> -       .memory_allocated  = &udp_memory_allocated,
> -       .sysctl_mem        = sysctl_udp_mem,
> +       .memory_allocated  = memory_allocated_udp,
> +       .prot_mem          = udp_sysctl_mem,
>        .sysctl_wmem       = &sysctl_udp_wmem_min,
>        .sysctl_rmem       = &sysctl_udp_rmem_min,
>        .obj_size          = sizeof(struct udp6_sock),
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 836aa63..1b0300d 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -119,11 +119,30 @@ static int sctp_memory_pressure;
>  static atomic_long_t sctp_memory_allocated;
>  struct percpu_counter sctp_sockets_allocated;
>
> +static long *sctp_sysctl_mem(struct kmem_cgroup *sg)
> +{
> +       return sysctl_sctp_mem;
> +}
> +
>  static void sctp_enter_memory_pressure(struct sock *sk)
>  {
>        sctp_memory_pressure = 1;
>  }
>
> +static int *memory_pressure_sctp(struct kmem_cgroup *sg)
> +{
> +       return &sctp_memory_pressure;
> +}
> +
> +static atomic_long_t *memory_allocated_sctp(struct kmem_cgroup *sg)
> +{
> +       return &sctp_memory_allocated;
> +}
> +
> +static struct percpu_counter *sockets_allocated_sctp(struct kmem_cgroup *sg)
> +{
> +       return &sctp_sockets_allocated;
> +}
>
>  /* Get the sndbuf space available at the time on the association.  */
>  static inline int sctp_wspace(struct sctp_association *asoc)
> @@ -6831,13 +6850,13 @@ struct proto sctp_prot = {
>        .unhash      =  sctp_unhash,
>        .get_port    =  sctp_get_port,
>        .obj_size    =  sizeof(struct sctp_sock),
> -       .sysctl_mem  =  sysctl_sctp_mem,
> +       .prot_mem    =  sctp_sysctl_mem,
>        .sysctl_rmem =  sysctl_sctp_rmem,
>        .sysctl_wmem =  sysctl_sctp_wmem,
> -       .memory_pressure = &sctp_memory_pressure,
> +       .memory_pressure = memory_pressure_sctp,
>        .enter_memory_pressure = sctp_enter_memory_pressure,
> -       .memory_allocated = &sctp_memory_allocated,
> -       .sockets_allocated = &sctp_sockets_allocated,
> +       .memory_allocated = memory_allocated_sctp,
> +       .sockets_allocated = sockets_allocated_sctp,
>  };
>
>  #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
> @@ -6863,12 +6882,12 @@ struct proto sctpv6_prot = {
>        .unhash         = sctp_unhash,
>        .get_port       = sctp_get_port,
>        .obj_size       = sizeof(struct sctp6_sock),
> -       .sysctl_mem     = sysctl_sctp_mem,
> +       .prot_mem       = sctp_sysctl_mem,
>        .sysctl_rmem    = sysctl_sctp_rmem,
>        .sysctl_wmem    = sysctl_sctp_wmem,
> -       .memory_pressure = &sctp_memory_pressure,
> +       .memory_pressure = memory_pressure_sctp,
>        .enter_memory_pressure = sctp_enter_memory_pressure,
> -       .memory_allocated = &sctp_memory_allocated,
> -       .sockets_allocated = &sctp_sockets_allocated,
> +       .memory_allocated = memory_allocated_sctp,
> +       .sockets_allocated = sockets_allocated_sctp,
>  };
>  #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
> --
> 1.7.6
>
Glauber Costa Sept. 6, 2011, 4:16 p.m. UTC | #4
On 09/06/2011 01:08 PM, Greg Thelen wrote:
> On Mon, Sep 5, 2011 at 7:35 PM, Glauber Costa<glommer@parallels.com>  wrote:
>> This patch introduces per-cgroup tcp buffers limitation. This allows
>> sysadmins to specify a maximum amount of kernel memory that
>> tcp connections can use at any point in time. TCP is the main interest
>> in this work, but extending it to other protocols would be easy.

Hello Greg,

> With this approach we would be giving admins the ability to
> independently limit user memory with memcg and kernel memory with this
> new kmem cgroup.
>
> At least in some situations admins prefer to give a particular
> container X bytes without thinking about the kernel vs user split.
> Sometimes the admin would prefer the kernel to keep the total
> user+kernel memory below a certain threshold.  To achieve this with
> this approach would we need a user space agent to monitor both kernel
> and user usage for a container and grow/shrink memcg/kmem limits?
Yes, I believe so. And this is not only valid for containers: the
information we expose in proc, sys, cgroups, etc. is always much finer
grained than many users want to deal with. Tools come in to fill this
gap.
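
For the TCP bits of this patch, the "outside" part boils down to a single
file write; a minimal sketch (the cgroup mount point and container name
below are only illustrative, nothing the patch mandates):

/*
 * Sketch only: the cgroup mount point and container name are
 * assumptions of this example, not something the patch defines.
 */
#include <stdio.h>
#include <stdlib.h>

static void write_str(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f || fputs(val, f) == EOF) {
                perror(path);
                exit(1);
        }
        fclose(f);
}

int main(void)
{
        /* Host side: cap container0's TCP buffer memory at 64k pages. */
        write_str("/cgroup/kmem/container0/kmem.tcp_maxmem", "65536");

        /*
         * Inside container0's own net namespace, the usual
         * /proc/sys/net/ipv4/tcp_mem knob keeps working, but
         * ipv4_tcp_mem() rejects any value above that cap with -EINVAL.
         */
        return 0;
}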

>
> Do you foresee the kmem cgroup growing to include reclaimable slab,
> where freeing one type of memory allows for reclaim of the other?
Yes, absolutely.

>
>> It piggybacks in the memory control mechanism already present in
>> /proc/sys/net/ipv4/tcp_mem. There is a soft limit, and a hard limit,
>> that will suppress allocation when reached. For each cgroup, however,
>> the file kmem.tcp_maxmem will be used to cap those values.
>>
>> The usage I have in mind here is containers. Each container will
>> define its own values for soft and hard limits, but none of them will
>> be possibly bigger than the value the box' sysadmin specified from
>> the outside.
>>
>> To test for any performance impacts of this patch, I used netperf's
>> TCP_RR benchmark on localhost, so we can have both recv and snd in action.
>>
>> Command line used was ./src/netperf -t TCP_RR -H localhost, and the
>> results:
>>
>> Without the patch
>> =================
>>
>> Socket Size   Request  Resp.   Elapsed  Trans.
>> Send   Recv   Size     Size    Time     Rate
>> bytes  Bytes  bytes    bytes   secs.    per sec
>>
>> 16384  87380  1        1       10.00    26996.35
>> 16384  87380
>>
>> With the patch
>> ===============
>>
>> Local /Remote
>> Socket Size   Request  Resp.   Elapsed  Trans.
>> Send   Recv   Size     Size    Time     Rate
>> bytes  Bytes  bytes    bytes   secs.    per sec
>>
>> 16384  87380  1        1       10.00    27291.86
>> 16384  87380
>>
>> The difference is within a one-percent range.
>>
>> Nesting cgroups doesn't seem to be the dominating factor as well,
>> with nestings up to 10 levels not showing a significant performance
>> difference.
>>
>> Signed-off-by: Glauber Costa<glommer@parallels.com>
>> CC: David S. Miller<davem@davemloft.net>
>> CC: Hiroyouki Kamezawa<kamezawa.hiroyu@jp.fujitsu.com>
>> CC: Eric W. Biederman<ebiederm@xmission.com>
>> ---
>>   crypto/af_alg.c               |    7 ++-
>>   include/linux/cgroup_subsys.h |    4 +
>>   include/net/netns/ipv4.h      |    1 +
>>   include/net/sock.h            |   66 +++++++++++++++-
>>   include/net/tcp.h             |   12 ++-
>>   include/net/udp.h             |    3 +-
>>   include/trace/events/sock.h   |   10 +-
>>   init/Kconfig                  |   11 +++
>>   mm/Makefile                   |    1 +
>>   net/core/sock.c               |  136 +++++++++++++++++++++++++++-------
>>   net/decnet/af_decnet.c        |   21 +++++-
>>   net/ipv4/proc.c               |    8 +-
>>   net/ipv4/sysctl_net_ipv4.c    |   59 +++++++++++++--
>>   net/ipv4/tcp.c                |  164 +++++++++++++++++++++++++++++++++++-----
>>   net/ipv4/tcp_input.c          |   17 ++--
>>   net/ipv4/tcp_ipv4.c           |   27 +++++--
>>   net/ipv4/tcp_output.c         |    2 +-
>>   net/ipv4/tcp_timer.c          |    2 +-
>>   net/ipv4/udp.c                |   20 ++++-
>>   net/ipv6/tcp_ipv6.c           |   16 +++-
>>   net/ipv6/udp.c                |    4 +-
>>   net/sctp/socket.c             |   35 +++++++--
>>   22 files changed, 514 insertions(+), 112 deletions(-)
>>
>> diff --git a/crypto/af_alg.c b/crypto/af_alg.c
>> index ac33d5f..df168d8 100644
>> --- a/crypto/af_alg.c
>> +++ b/crypto/af_alg.c
>> @@ -29,10 +29,15 @@ struct alg_type_list {
>>
>>   static atomic_long_t alg_memory_allocated;
>>
>> +static atomic_long_t *memory_allocated_alg(struct kmem_cgroup *sg)
>> +{
>> +       return&alg_memory_allocated;
>> +}
>> +
>>   static struct proto alg_proto = {
>>         .name                   = "ALG",
>>         .owner                  = THIS_MODULE,
>> -       .memory_allocated       =&alg_memory_allocated,
>> +       .memory_allocated       = memory_allocated_alg,
>>         .obj_size               = sizeof(struct alg_sock),
>>   };
>>
>> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
>> index ac663c1..363b8e8 100644
>> --- a/include/linux/cgroup_subsys.h
>> +++ b/include/linux/cgroup_subsys.h
>> @@ -35,6 +35,10 @@ SUBSYS(cpuacct)
>>   SUBSYS(mem_cgroup)
>>   #endif
>>
>> +#ifdef CONFIG_CGROUP_KMEM
>> +SUBSYS(kmem)
>> +#endif
>> +
>>   /* */
>>
>>   #ifdef CONFIG_CGROUP_DEVICE
>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>> index d786b4f..bbd023a 100644
>> --- a/include/net/netns/ipv4.h
>> +++ b/include/net/netns/ipv4.h
>> @@ -55,6 +55,7 @@ struct netns_ipv4 {
>>         int current_rt_cache_rebuild_count;
>>
>>         unsigned int sysctl_ping_group_range[2];
>> +       long sysctl_tcp_mem[3];
>>
>>         atomic_t rt_genid;
>>         atomic_t dev_addr_genid;
>> diff --git a/include/net/sock.h b/include/net/sock.h
>> index 8e4062f..e085148 100644
>> --- a/include/net/sock.h
>> +++ b/include/net/sock.h
>> @@ -62,7 +62,9 @@
>>   #include<linux/atomic.h>
>>   #include<net/dst.h>
>>   #include<net/checksum.h>
>> +#include<linux/kmem_cgroup.h>
>>
>> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
>>   /*
>>   * This structure really needs to be cleaned up.
>>   * Most of it is for TCP, and not used by any of
>> @@ -339,6 +341,7 @@ struct sock {
>>   #endif
>>         __u32                   sk_mark;
>>         u32                     sk_classid;
>> +       struct kmem_cgroup      *sk_cgrp;
>>         void                    (*sk_state_change)(struct sock *sk);
>>         void                    (*sk_data_ready)(struct sock *sk, int bytes);
>>         void                    (*sk_write_space)(struct sock *sk);
>> @@ -786,16 +789,21 @@ struct proto {
>>
>>         /* Memory pressure */
>>         void                    (*enter_memory_pressure)(struct sock *sk);
>> -       atomic_long_t           *memory_allocated;      /* Current allocated memory. */
>> -       struct percpu_counter   *sockets_allocated;     /* Current number of sockets. */
>> +       /* Current allocated memory. */
>> +       atomic_long_t           *(*memory_allocated)(struct kmem_cgroup *sg);
>> +       /* Current number of sockets. */
>> +       struct percpu_counter   *(*sockets_allocated)(struct kmem_cgroup *sg);
>> +
>> +       int                     (*init_cgroup)(struct cgroup *cgrp,
>> +                                              struct cgroup_subsys *ss);
>>         /*
>>          * Pressure flag: try to collapse.
>>          * Technical note: it is used by multiple contexts non atomically.
>>          * All the __sk_mem_schedule() is of this nature: accounting
>>          * is strict, actions are advisory and have some latency.
>>          */
>> -       int                     *memory_pressure;
>> -       long                    *sysctl_mem;
>> +       int                     *(*memory_pressure)(struct kmem_cgroup *sg);
>> +       long                    *(*prot_mem)(struct kmem_cgroup *sg);
>>         int                     *sysctl_wmem;
>>         int                     *sysctl_rmem;
>>         int                     max_header;
>> @@ -826,6 +834,56 @@ struct proto {
>>   #endif
>>   };
>>
>> +#define sk_memory_pressure(sk)                                         \
>> +({                                                                     \
>> +       int *__ret = NULL;                                              \
>> +       if ((sk)->sk_prot->memory_pressure)                             \
>> +               __ret = (sk)->sk_prot->memory_pressure(sk->sk_cgrp);    \
>> +       __ret;                                                          \
>> +})
>> +
>> +#define sk_sockets_allocated(sk)                               \
>> +({                                                             \
>> +       struct percpu_counter *__p;                             \
>> +       __p = (sk)->sk_prot->sockets_allocated(sk->sk_cgrp);    \
>> +       __p;                                                    \
>> +})
>> +
>> +#define sk_memory_allocated(sk)                                        \
>> +({                                                             \
>> +       atomic_long_t *__mem;                                   \
>> +       __mem = (sk)->sk_prot->memory_allocated(sk->sk_cgrp);   \
>> +       __mem;                                                  \
>> +})
>> +
>> +#define sk_prot_mem(sk)                                                \
>> +({                                                             \
>> +       long *__mem = (sk)->sk_prot->prot_mem(sk->sk_cgrp);     \
>> +       __mem;                                                  \
>> +})
>> +
>> +#define sg_memory_pressure(prot, sg)                           \
>> +({                                                             \
>> +       int *__ret = NULL;                                      \
>> +       if (prot->memory_pressure)                              \
>> +               __ret = (prot)->memory_pressure(sg);            \
>> +       __ret;                                                  \
>> +})
>> +
>> +#define sg_memory_allocated(prot, sg)                          \
>> +({                                                             \
>> +       atomic_long_t *__mem;                                   \
>> +       __mem = (prot)->memory_allocated(sg);                   \
>> +       __mem;                                                  \
>> +})
>> +
>> +#define sg_sockets_allocated(prot, sg)                         \
>> +({                                                             \
>> +       struct percpu_counter *__p;                             \
>> +       __p = (prot)->sockets_allocated(sg);                    \
>> +       __p;                                                    \
>> +})
>> +
>>   extern int proto_register(struct proto *prot, int alloc_slab);
>>   extern void proto_unregister(struct proto *prot);
>>
>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>> index 149a415..8e1ec4a 100644
>> --- a/include/net/tcp.h
>> +++ b/include/net/tcp.h
>> @@ -230,7 +230,6 @@ extern int sysctl_tcp_fack;
>>   extern int sysctl_tcp_reordering;
>>   extern int sysctl_tcp_ecn;
>>   extern int sysctl_tcp_dsack;
>> -extern long sysctl_tcp_mem[3];
>>   extern int sysctl_tcp_wmem[3];
>>   extern int sysctl_tcp_rmem[3];
>>   extern int sysctl_tcp_app_win;
>> @@ -253,9 +252,12 @@ extern int sysctl_tcp_cookie_size;
>>   extern int sysctl_tcp_thin_linear_timeouts;
>>   extern int sysctl_tcp_thin_dupack;
>>
>> -extern atomic_long_t tcp_memory_allocated;
>> -extern struct percpu_counter tcp_sockets_allocated;
>> -extern int tcp_memory_pressure;
>> +struct kmem_cgroup;
>> +extern long *tcp_sysctl_mem(struct kmem_cgroup *sg);
>> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg);
>> +int *memory_pressure_tcp(struct kmem_cgroup *sg);
>> +int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss);
>> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg);
>>
>>   /*
>>   * The next routines deal with comparing 32 bit unsigned ints
>> @@ -286,7 +288,7 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
>>         }
>>
>>         if (sk->sk_wmem_queued>  SOCK_MIN_SNDBUF&&
>> -           atomic_long_read(&tcp_memory_allocated)>  sysctl_tcp_mem[2])
>> +           atomic_long_read(sk_memory_allocated(sk))>  sk_prot_mem(sk)[2])
>>                 return true;
>>         return false;
>>   }
>> diff --git a/include/net/udp.h b/include/net/udp.h
>> index 67ea6fc..0e27388 100644
>> --- a/include/net/udp.h
>> +++ b/include/net/udp.h
>> @@ -105,7 +105,8 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
>>
>>   extern struct proto udp_prot;
>>
>> -extern atomic_long_t udp_memory_allocated;
>> +atomic_long_t *memory_allocated_udp(struct kmem_cgroup *sg);
>> +long *udp_sysctl_mem(struct kmem_cgroup *sg);
>>
>>   /* sysctl variables for udp */
>>   extern long sysctl_udp_mem[3];
>> diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
>> index 779abb9..12a6083 100644
>> --- a/include/trace/events/sock.h
>> +++ b/include/trace/events/sock.h
>> @@ -37,7 +37,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
>>
>>         TP_STRUCT__entry(
>>                 __array(char, name, 32)
>> -               __field(long *, sysctl_mem)
>> +               __field(long *, prot_mem)
>>                 __field(long, allocated)
>>                 __field(int, sysctl_rmem)
>>                 __field(int, rmem_alloc)
>> @@ -45,7 +45,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
>>
>>         TP_fast_assign(
>>                 strncpy(__entry->name, prot->name, 32);
>> -               __entry->sysctl_mem = prot->sysctl_mem;
>> +               __entry->prot_mem = sk->sk_prot->prot_mem(sk->sk_cgrp);
>>                 __entry->allocated = allocated;
>>                 __entry->sysctl_rmem = prot->sysctl_rmem[0];
>>                 __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
>> @@ -54,9 +54,9 @@ TRACE_EVENT(sock_exceed_buf_limit,
>>         TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld "
>>                 "sysctl_rmem=%d rmem_alloc=%d",
>>                 __entry->name,
>> -               __entry->sysctl_mem[0],
>> -               __entry->sysctl_mem[1],
>> -               __entry->sysctl_mem[2],
>> +               __entry->prot_mem[0],
>> +               __entry->prot_mem[1],
>> +               __entry->prot_mem[2],
>>                 __entry->allocated,
>>                 __entry->sysctl_rmem,
>>                 __entry->rmem_alloc)
>> diff --git a/init/Kconfig b/init/Kconfig
>> index d627783..5955ac2 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -690,6 +690,17 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
>>           select this option (if, for some reason, they need to disable it
>>           then swapaccount=0 does the trick).
>>
>> +config CGROUP_KMEM
>> +       bool "Kernel Memory Resource Controller for Control Groups"
>> +       depends on CGROUPS
>> +       help
>> +         The Kernel Memory cgroup can limit the amount of memory used by
>> +         certain kernel objects in the system. Those are fundamentally
>> +         different from the entities handled by the Memory Controller,
>> +         which are page-based, and can be swapped. Users of the kmem
>> +         cgroup can use it to guarantee that no group of processes will
>> +         ever exhaust kernel resources alone.
>> +
>>   config CGROUP_PERF
>>         bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
>>         depends on PERF_EVENTS&&  CGROUPS
>> diff --git a/mm/Makefile b/mm/Makefile
>> index 836e416..1b1aa24 100644
>> --- a/mm/Makefile
>> +++ b/mm/Makefile
>> @@ -45,6 +45,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
>>   obj-$(CONFIG_QUICKLIST) += quicklist.o
>>   obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
>>   obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
>> +obj-$(CONFIG_CGROUP_KMEM) += kmem_cgroup.o
>>   obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
>>   obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
>>   obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index 3449df8..2d968ea 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -134,6 +134,24 @@
>>   #include<net/tcp.h>
>>   #endif
>>
>> +static DEFINE_RWLOCK(proto_list_lock);
>> +static LIST_HEAD(proto_list);
>> +
>> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
>> +{
>> +       struct proto *proto;
>> +       int ret = 0;
>> +
>> +       read_lock(&proto_list_lock);
>> +       list_for_each_entry(proto,&proto_list, node) {
>> +               if (proto->init_cgroup)
>> +                       ret |= proto->init_cgroup(cgrp, ss);
>> +       }
>> +       read_unlock(&proto_list_lock);
>> +
>> +       return ret;
>> +}
>> +
>>   /*
>>   * Each address family might have different locking rules, so we have
>>   * one slock key per address family:
>> @@ -1114,6 +1132,31 @@ void sock_update_classid(struct sock *sk)
>>   EXPORT_SYMBOL(sock_update_classid);
>>   #endif
>>
>> +void sock_update_kmem_cgrp(struct sock *sk)
>> +{
>> +#ifdef CONFIG_CGROUP_KMEM
>> +       sk->sk_cgrp = kcg_from_task(current);
>> +
>> +       /*
>> +        * We don't need to protect against anything task-related, because
>> +        * we are basically stuck with the sock pointer that won't change,
>> +        * even if the task that originated the socket changes cgroups.
>> +        *
>> +        * What we do have to guarantee, is that the chain leading us to
>> +        * the top level won't change under our noses. Incrementing the
>> +        * reference count via cgroup_exclude_rmdir guarantees that.
>> +        */
>> +       cgroup_exclude_rmdir(&sk->sk_cgrp->css);
>> +#endif
>> +}
>> +
>> +void sock_release_kmem_cgrp(struct sock *sk)
>> +{
>> +#ifdef CONFIG_CGROUP_KMEM
>> +       cgroup_release_and_wakeup_rmdir(&sk->sk_cgrp->css);
>> +#endif
>> +}
>> +
>>   /**
>>   *     sk_alloc - All socket objects are allocated here
>>   *     @net: the applicable net namespace
>> @@ -1139,6 +1182,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
>>                 atomic_set(&sk->sk_wmem_alloc, 1);
>>
>>                 sock_update_classid(sk);
>> +               sock_update_kmem_cgrp(sk);
>>         }
>>
>>         return sk;
>> @@ -1170,6 +1214,7 @@ static void __sk_free(struct sock *sk)
>>                 put_cred(sk->sk_peer_cred);
>>         put_pid(sk->sk_peer_pid);
>>         put_net(sock_net(sk));
>> +       sock_release_kmem_cgrp(sk);
>>         sk_prot_free(sk->sk_prot_creator, sk);
>>   }
>>
>> @@ -1287,8 +1332,8 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
>>                 sk_set_socket(newsk, NULL);
>>                 newsk->sk_wq = NULL;
>>
>> -               if (newsk->sk_prot->sockets_allocated)
>> -                       percpu_counter_inc(newsk->sk_prot->sockets_allocated);
>> +               if (sk_sockets_allocated(sk))
>> +                       percpu_counter_inc(sk_sockets_allocated(sk));
>>
>>                 if (sock_flag(newsk, SOCK_TIMESTAMP) ||
>>                     sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
>> @@ -1676,29 +1721,51 @@ EXPORT_SYMBOL(sk_wait_data);
>>   */
>>   int __sk_mem_schedule(struct sock *sk, int size, int kind)
>>   {
>> -       struct proto *prot = sk->sk_prot;
>>         int amt = sk_mem_pages(size);
>> +       struct proto *prot = sk->sk_prot;
>>         long allocated;
>> +       int *memory_pressure;
>> +       long *prot_mem;
>> +       int parent_failure = 0;
>> +       struct kmem_cgroup *sg;
>>
>>         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
>> -       allocated = atomic_long_add_return(amt, prot->memory_allocated);
>> +
>> +       memory_pressure = sk_memory_pressure(sk);
>> +       prot_mem = sk_prot_mem(sk);
>> +
>> +       allocated = atomic_long_add_return(amt, sk_memory_allocated(sk));
>> +
>> +#ifdef CONFIG_CGROUP_KMEM
>> +       for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
>> +               long alloc;
>> +               /*
>> +                * Large nestings are not the common case, and stopping in the
>> +                * middle would be complicated enough, that we bill it all the
>> +                * way through the root, and if needed, unbill everything later
>> +                */
>> +               alloc = atomic_long_add_return(amt,
>> +                                              sg_memory_allocated(prot, sg));
>> +               parent_failure |= (alloc>  sk_prot_mem(sk)[2]);
>> +       }
>> +#endif
>> +
>> +       /* Over hard limit (we, or our parents) */
>> +       if (parent_failure || (allocated>  prot_mem[2]))
>> +               goto suppress_allocation;
>>
>>         /* Under limit. */
>> -       if (allocated<= prot->sysctl_mem[0]) {
>> -               if (prot->memory_pressure&&  *prot->memory_pressure)
>> -                       *prot->memory_pressure = 0;
>> +       if (allocated<= prot_mem[0]) {
>> +               if (memory_pressure&&  *memory_pressure)
>> +                       *memory_pressure = 0;
>>                 return 1;
>>         }
>>
>>         /* Under pressure. */
>> -       if (allocated>  prot->sysctl_mem[1])
>> +       if (allocated>  prot_mem[1])
>>                 if (prot->enter_memory_pressure)
>>                         prot->enter_memory_pressure(sk);
>>
>> -       /* Over hard limit. */
>> -       if (allocated>  prot->sysctl_mem[2])
>> -               goto suppress_allocation;
>> -
>>         /* guarantee minimum buffer size under pressure */
>>         if (kind == SK_MEM_RECV) {
>>                 if (atomic_read(&sk->sk_rmem_alloc)<  prot->sysctl_rmem[0])
>> @@ -1712,13 +1779,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
>>                                 return 1;
>>         }
>>
>> -       if (prot->memory_pressure) {
>> +       if (memory_pressure) {
>>                 int alloc;
>>
>> -               if (!*prot->memory_pressure)
>> +               if (!*memory_pressure)
>>                         return 1;
>> -               alloc = percpu_counter_read_positive(prot->sockets_allocated);
>> -               if (prot->sysctl_mem[2]>  alloc *
>> +               alloc = percpu_counter_read_positive(sk_sockets_allocated(sk));
>> +               if (prot_mem[2]>  alloc *
>>                     sk_mem_pages(sk->sk_wmem_queued +
>>                                  atomic_read(&sk->sk_rmem_alloc) +
>>                                  sk->sk_forward_alloc))
>> @@ -1741,7 +1808,13 @@ suppress_allocation:
>>
>>         /* Alas. Undo changes. */
>>         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
>> -       atomic_long_sub(amt, prot->memory_allocated);
>> +
>> +       atomic_long_sub(amt, sk_memory_allocated(sk));
>> +
>> +#ifdef CONFIG_CGROUP_KMEM
>> +       for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent)
>> +               atomic_long_sub(amt, sg_memory_allocated(prot, sg));
>> +#endif
>>         return 0;
>>   }
>>   EXPORT_SYMBOL(__sk_mem_schedule);
>> @@ -1753,14 +1826,24 @@ EXPORT_SYMBOL(__sk_mem_schedule);
>>   void __sk_mem_reclaim(struct sock *sk)
>>   {
>>         struct proto *prot = sk->sk_prot;
>> +       struct kmem_cgroup *sg = sk->sk_cgrp;
>> +       int *memory_pressure = sk_memory_pressure(sk);
>>
>>         atomic_long_sub(sk->sk_forward_alloc>>  SK_MEM_QUANTUM_SHIFT,
>> -                  prot->memory_allocated);
>> +                  sk_memory_allocated(sk));
>> +
>> +#ifdef CONFIG_CGROUP_KMEM
>> +       for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
>> +               atomic_long_sub(sk->sk_forward_alloc>>  SK_MEM_QUANTUM_SHIFT,
>> +                                               sg_memory_allocated(prot, sg));
>> +       }
>> +#endif
>> +
>>         sk->sk_forward_alloc&= SK_MEM_QUANTUM - 1;
>>
>> -       if (prot->memory_pressure&&  *prot->memory_pressure&&
>> -           (atomic_long_read(prot->memory_allocated)<  prot->sysctl_mem[0]))
>> -               *prot->memory_pressure = 0;
>> +       if (memory_pressure&&  *memory_pressure&&
>> +           (atomic_long_read(sk_memory_allocated(sk))<  sk_prot_mem(sk)[0]))
>> +               *memory_pressure = 0;
>>   }
>>   EXPORT_SYMBOL(__sk_mem_reclaim);
>>
>> @@ -2252,9 +2335,6 @@ void sk_common_release(struct sock *sk)
>>   }
>>   EXPORT_SYMBOL(sk_common_release);
>>
>> -static DEFINE_RWLOCK(proto_list_lock);
>> -static LIST_HEAD(proto_list);
>> -
>>   #ifdef CONFIG_PROC_FS
>>   #define PROTO_INUSE_NR 64      /* should be enough for the first time */
>>   struct prot_inuse {
>> @@ -2479,13 +2559,17 @@ static char proto_method_implemented(const void *method)
>>
>>   static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
>>   {
>> +       struct kmem_cgroup *sg = kcg_from_task(current);
>> +
>>         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
>>                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
>>                    proto->name,
>>                    proto->obj_size,
>>                    sock_prot_inuse_get(seq_file_net(seq), proto),
>> -                  proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
>> -                  proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
>> +                  proto->memory_allocated != NULL ?
>> +                       atomic_long_read(sg_memory_allocated(proto, sg)) : -1L,
>> +                  proto->memory_pressure != NULL ?
>> +                       *sg_memory_pressure(proto, sg) ? "yes" : "no" : "NI",
>>                    proto->max_header,
>>                    proto->slab == NULL ? "no" : "yes",
>>                    module_name(proto->owner),
>> diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
>> index 19acd00..463b299 100644
>> --- a/net/decnet/af_decnet.c
>> +++ b/net/decnet/af_decnet.c
>> @@ -458,13 +458,28 @@ static void dn_enter_memory_pressure(struct sock *sk)
>>         }
>>   }
>>
>> +static atomic_long_t *memory_allocated_dn(struct kmem_cgroup *sg)
>> +{
>> +       return&decnet_memory_allocated;
>> +}
>> +
>> +static int *memory_pressure_dn(struct kmem_cgroup *sg)
>> +{
>> +       return&dn_memory_pressure;
>> +}
>> +
>> +static long *dn_sysctl_mem(struct kmem_cgroup *sg)
>> +{
>> +       return sysctl_decnet_mem;
>> +}
>> +
>>   static struct proto dn_proto = {
>>         .name                   = "NSP",
>>         .owner                  = THIS_MODULE,
>>         .enter_memory_pressure  = dn_enter_memory_pressure,
>> -       .memory_pressure        =&dn_memory_pressure,
>> -       .memory_allocated       =&decnet_memory_allocated,
>> -       .sysctl_mem             = sysctl_decnet_mem,
>> +       .memory_pressure        = memory_pressure_dn,
>> +       .memory_allocated       = memory_allocated_dn,
>> +       .prot_mem               = dn_sysctl_mem,
>>         .sysctl_wmem            = sysctl_decnet_wmem,
>>         .sysctl_rmem            = sysctl_decnet_rmem,
>>         .max_header             = DN_MAX_NSP_DATA_HEADER + 64,
>> diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
>> index b14ec7d..9c80acf 100644
>> --- a/net/ipv4/proc.c
>> +++ b/net/ipv4/proc.c
>> @@ -52,20 +52,22 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
>>   {
>>         struct net *net = seq->private;
>>         int orphans, sockets;
>> +       struct kmem_cgroup *sg = kcg_from_task(current);
>> +       struct percpu_counter *allocated = sg_sockets_allocated(&tcp_prot, sg);
>>
>>         local_bh_disable();
>>         orphans = percpu_counter_sum_positive(&tcp_orphan_count);
>> -       sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
>> +       sockets = percpu_counter_sum_positive(allocated);
>>         local_bh_enable();
>>
>>         socket_seq_show(seq);
>>         seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
>>                    sock_prot_inuse_get(net,&tcp_prot), orphans,
>>                    tcp_death_row.tw_count, sockets,
>> -                  atomic_long_read(&tcp_memory_allocated));
>> +                  atomic_long_read(sg_memory_allocated((&tcp_prot), sg)));
>>         seq_printf(seq, "UDP: inuse %d mem %ld\n",
>>                    sock_prot_inuse_get(net,&udp_prot),
>> -                  atomic_long_read(&udp_memory_allocated));
>> +                  atomic_long_read(sg_memory_allocated((&udp_prot), sg)));
>>         seq_printf(seq, "UDPLITE: inuse %d\n",
>>                    sock_prot_inuse_get(net,&udplite_prot));
>>         seq_printf(seq, "RAW: inuse %d\n",
>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> index 69fd720..5e89480 100644
>> --- a/net/ipv4/sysctl_net_ipv4.c
>> +++ b/net/ipv4/sysctl_net_ipv4.c
>> @@ -14,6 +14,8 @@
>>   #include<linux/init.h>
>>   #include<linux/slab.h>
>>   #include<linux/nsproxy.h>
>> +#include<linux/kmem_cgroup.h>
>> +#include<linux/swap.h>
>>   #include<net/snmp.h>
>>   #include<net/icmp.h>
>>   #include<net/ip.h>
>> @@ -174,6 +176,43 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
>>         return ret;
>>   }
>>
>> +static int ipv4_tcp_mem(ctl_table *ctl, int write,
>> +                          void __user *buffer, size_t *lenp,
>> +                          loff_t *ppos)
>> +{
>> +       int ret;
>> +       unsigned long vec[3];
>> +       struct kmem_cgroup *kmem = kcg_from_task(current);
>> +       struct net *net = current->nsproxy->net_ns;
>> +       int i;
>> +
>> +       ctl_table tmp = {
>> +               .data =&vec,
>> +               .maxlen = sizeof(vec),
>> +               .mode = ctl->mode,
>> +       };
>> +
>> +       if (!write) {
>> +               ctl->data =&net->ipv4.sysctl_tcp_mem;
>> +               return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
>> +       }
>> +
>> +       ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
>> +       if (ret)
>> +               return ret;
>> +
>> +       for (i = 0; i<  3; i++)
>> +               if (vec[i]>  kmem->tcp_max_memory)
>> +                       return -EINVAL;
>> +
>> +       for (i = 0; i<  3; i++) {
>> +               net->ipv4.sysctl_tcp_mem[i] = vec[i];
>> +               kmem->tcp_prot_mem[i] = net->ipv4.sysctl_tcp_mem[i];
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>>   static struct ctl_table ipv4_table[] = {
>>         {
>>                 .procname       = "tcp_timestamps",
>> @@ -433,13 +472,6 @@ static struct ctl_table ipv4_table[] = {
>>                 .proc_handler   = proc_dointvec
>>         },
>>         {
>> -               .procname       = "tcp_mem",
>> -               .data           =&sysctl_tcp_mem,
>> -               .maxlen         = sizeof(sysctl_tcp_mem),
>> -               .mode           = 0644,
>> -               .proc_handler   = proc_doulongvec_minmax
>> -       },
>> -       {
>>                 .procname       = "tcp_wmem",
>>                 .data           =&sysctl_tcp_wmem,
>>                 .maxlen         = sizeof(sysctl_tcp_wmem),
>> @@ -721,6 +753,12 @@ static struct ctl_table ipv4_net_table[] = {
>>                 .mode           = 0644,
>>                 .proc_handler   = ipv4_ping_group_range,
>>         },
>> +       {
>> +               .procname       = "tcp_mem",
>> +               .maxlen         = sizeof(init_net.ipv4.sysctl_tcp_mem),
>> +               .mode           = 0644,
>> +               .proc_handler   = ipv4_tcp_mem,
>> +       },
>>         { }
>>   };
>>
>> @@ -734,6 +772,7 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
>>   static __net_init int ipv4_sysctl_init_net(struct net *net)
>>   {
>>         struct ctl_table *table;
>> +       unsigned long limit;
>>
>>         table = ipv4_net_table;
>>         if (!net_eq(net,&init_net)) {
>> @@ -769,6 +808,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
>>
>>         net->ipv4.sysctl_rt_cache_rebuild_count = 4;
>>
>> +       limit = nr_free_buffer_pages() / 8;
>> +       limit = max(limit, 128UL);
>> +       net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
>> +       net->ipv4.sysctl_tcp_mem[1] = limit;
>> +       net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
>> +
>>         net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
>>                         net_ipv4_ctl_path, table);
>>         if (net->ipv4.ipv4_hdr == NULL)
>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
>> index 46febca..e1918fa 100644
>> --- a/net/ipv4/tcp.c
>> +++ b/net/ipv4/tcp.c
>> @@ -266,6 +266,7 @@
>>   #include<linux/crypto.h>
>>   #include<linux/time.h>
>>   #include<linux/slab.h>
>> +#include<linux/nsproxy.h>
>>
>>   #include<net/icmp.h>
>>   #include<net/tcp.h>
>> @@ -282,23 +283,12 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
>>   struct percpu_counter tcp_orphan_count;
>>   EXPORT_SYMBOL_GPL(tcp_orphan_count);
>>
>> -long sysctl_tcp_mem[3] __read_mostly;
>>   int sysctl_tcp_wmem[3] __read_mostly;
>>   int sysctl_tcp_rmem[3] __read_mostly;
>>
>> -EXPORT_SYMBOL(sysctl_tcp_mem);
>>   EXPORT_SYMBOL(sysctl_tcp_rmem);
>>   EXPORT_SYMBOL(sysctl_tcp_wmem);
>>
>> -atomic_long_t tcp_memory_allocated;    /* Current allocated memory. */
>> -EXPORT_SYMBOL(tcp_memory_allocated);
>> -
>> -/*
>> - * Current number of TCP sockets.
>> - */
>> -struct percpu_counter tcp_sockets_allocated;
>> -EXPORT_SYMBOL(tcp_sockets_allocated);
>> -
>>   /*
>>   * TCP splice context
>>   */
>> @@ -308,23 +298,157 @@ struct tcp_splice_state {
>>         unsigned int flags;
>>   };
>>
>> +#ifdef CONFIG_CGROUP_KMEM
>>   /*
>>   * Pressure flag: try to collapse.
>>   * Technical note: it is used by multiple contexts non atomically.
>>   * All the __sk_mem_schedule() is of this nature: accounting
>>   * is strict, actions are advisory and have some latency.
>>   */
>> -int tcp_memory_pressure __read_mostly;
>> -EXPORT_SYMBOL(tcp_memory_pressure);
>> -
>>   void tcp_enter_memory_pressure(struct sock *sk)
>>   {
>> +       struct kmem_cgroup *sg = sk->sk_cgrp;
>> +       if (!sg->tcp_memory_pressure) {
>> +               NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
>> +               sg->tcp_memory_pressure = 1;
>> +       }
>> +}
>> +
>> +long *tcp_sysctl_mem(struct kmem_cgroup *sg)
>> +{
>> +       return sg->tcp_prot_mem;
>> +}
>> +
>> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
>> +{
>> +       return&(sg->tcp_memory_allocated);
>> +}
>> +
>> +static int tcp_write_maxmem(struct cgroup *cgrp, struct cftype *cft, u64 val)
>> +{
>> +       struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
>> +       struct net *net = current->nsproxy->net_ns;
>> +       int i;
>> +
>> +       if (!cgroup_lock_live_group(cgrp))
>> +               return -ENODEV;
>> +
>> +       /*
>> +        * We can't allow more memory than our parents. Since this
>> +        * will be tested for all calls, by induction, there is no need
>> +        * to test any parent other than our own
>> +        * */
>> +       if (sg->parent&&  (val>  sg->parent->tcp_max_memory))
>> +               val = sg->parent->tcp_max_memory;
>> +
>> +       sg->tcp_max_memory = val;
>> +
>> +       for (i = 0; i<  3; i++)
>> +               sg->tcp_prot_mem[i]  = min_t(long, val,
>> +                                            net->ipv4.sysctl_tcp_mem[i]);
>> +
>> +       cgroup_unlock();
>> +
>> +       return 0;
>> +}
>> +
>> +static u64 tcp_read_maxmem(struct cgroup *cgrp, struct cftype *cft)
>> +{
>> +       struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
>> +       u64 ret;
>> +
>> +       if (!cgroup_lock_live_group(cgrp))
>> +               return -ENODEV;
>> +       ret = sg->tcp_max_memory;
>> +
>> +       cgroup_unlock();
>> +       return ret;
>> +}
>> +
>> +static struct cftype tcp_files[] = {
>> +       {
>> +               .name = "tcp_maxmem",
>> +               .write_u64 = tcp_write_maxmem,
>> +               .read_u64 = tcp_read_maxmem,
>> +       },
>> +};
>> +
>> +int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
>> +{
>> +       struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
>> +       unsigned long limit;
>> +       struct net *net = current->nsproxy->net_ns;
>> +
>> +       sg->tcp_memory_pressure = 0;
>> +       atomic_long_set(&sg->tcp_memory_allocated, 0);
>> +       percpu_counter_init(&sg->tcp_sockets_allocated, 0);
>> +
>> +       limit = nr_free_buffer_pages() / 8;
>> +       limit = max(limit, 128UL);
>> +
>> +       if (sg->parent)
>> +               sg->tcp_max_memory = sg->parent->tcp_max_memory;
>> +       else
>> +               sg->tcp_max_memory = limit * 2;
>> +
>> +       sg->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
>> +       sg->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
>> +       sg->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
>> +
>> +       return cgroup_add_files(cgrp, ss, tcp_files, ARRAY_SIZE(tcp_files));
>> +}
>> +EXPORT_SYMBOL(tcp_init_cgroup);
>> +
>> +int *memory_pressure_tcp(struct kmem_cgroup *sg)
>> +{
>> +       return&sg->tcp_memory_pressure;
>> +}
>> +
>> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg)
>> +{
>> +       return&sg->tcp_sockets_allocated;
>> +}
>> +#else
>> +
>> +/* Current number of TCP sockets. */
>> +struct percpu_counter tcp_sockets_allocated;
>> +atomic_long_t tcp_memory_allocated;    /* Current allocated memory. */
>> +int tcp_memory_pressure;
>> +
>> +int *memory_pressure_tcp(struct kmem_cgroup *sg)
>> +{
>> +       return&tcp_memory_pressure;
>> +}
>> +
>> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg)
>> +{
>> +       return&tcp_sockets_allocated;
>> +}
>> +
>> +void tcp_enter_memory_pressure(struct sock *sock)
>> +{
>>         if (!tcp_memory_pressure) {
>>                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
>>                 tcp_memory_pressure = 1;
>>         }
>>   }
>> +
>> +long *tcp_sysctl_mem(struct kmem_cgroup *sg)
>> +{
>> +       return init_net.ipv4.sysctl_tcp_mem;
>> +}
>> +
>> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
>> +{
>> +       return&tcp_memory_allocated;
>> +}
>> +#endif /* CONFIG_CGROUP_KMEM */
>> +
>> +EXPORT_SYMBOL(memory_pressure_tcp);
>> +EXPORT_SYMBOL(sockets_allocated_tcp);
>>   EXPORT_SYMBOL(tcp_enter_memory_pressure);
>> +EXPORT_SYMBOL(tcp_sysctl_mem);
>> +EXPORT_SYMBOL(memory_allocated_tcp);
>>
>>   /* Convert seconds to retransmits based on initial and max timeout */
>>   static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
>> @@ -3226,7 +3350,9 @@ void __init tcp_init(void)
>>
>>         BUILD_BUG_ON(sizeof(struct tcp_skb_cb)>  sizeof(skb->cb));
>>
>> +#ifndef CONFIG_CGROUP_KMEM
>>         percpu_counter_init(&tcp_sockets_allocated, 0);
>> +#endif
>>         percpu_counter_init(&tcp_orphan_count, 0);
>>         tcp_hashinfo.bind_bucket_cachep =
>>                 kmem_cache_create("tcp_bind_bucket",
>> @@ -3277,14 +3403,10 @@ void __init tcp_init(void)
>>         sysctl_tcp_max_orphans = cnt / 2;
>>         sysctl_max_syn_backlog = max(128, cnt / 256);
>>
>> -       limit = nr_free_buffer_pages() / 8;
>> -       limit = max(limit, 128UL);
>> -       sysctl_tcp_mem[0] = limit / 4 * 3;
>> -       sysctl_tcp_mem[1] = limit;
>> -       sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
>> -
>>         /* Set per-socket limits to no more than 1/128 the pressure threshold */
>> -       limit = ((unsigned long)sysctl_tcp_mem[1])<<  (PAGE_SHIFT - 7);
>> +       limit = (unsigned long)init_net.ipv4.sysctl_tcp_mem[1];
>> +       limit<<= (PAGE_SHIFT - 7);
>> +
>>         max_share = min(4UL*1024*1024, limit);
>>
>>         sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> index ea0d218..c44e830 100644
>> --- a/net/ipv4/tcp_input.c
>> +++ b/net/ipv4/tcp_input.c
>> @@ -316,7 +316,7 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
>>         /* Check #1 */
>>         if (tp->rcv_ssthresh<  tp->window_clamp&&
>>             (int)tp->rcv_ssthresh<  tcp_space(sk)&&
>> -           !tcp_memory_pressure) {
>> +           !sk_memory_pressure(sk)) {
>>                 int incr;
>>
>>                 /* Check #2. Increase window, if skb with such overhead
>> @@ -393,15 +393,16 @@ static void tcp_clamp_window(struct sock *sk)
>>   {
>>         struct tcp_sock *tp = tcp_sk(sk);
>>         struct inet_connection_sock *icsk = inet_csk(sk);
>> +       struct proto *prot = sk->sk_prot;
>>
>>         icsk->icsk_ack.quick = 0;
>>
>> -       if (sk->sk_rcvbuf<  sysctl_tcp_rmem[2]&&
>> +       if (sk->sk_rcvbuf<  prot->sysctl_rmem[2]&&
>>             !(sk->sk_userlocks&  SOCK_RCVBUF_LOCK)&&
>> -           !tcp_memory_pressure&&
>> -           atomic_long_read(&tcp_memory_allocated)<  sysctl_tcp_mem[0]) {
>> +           !sk_memory_pressure(sk)&&
>> +           atomic_long_read(sk_memory_allocated(sk))<  sk_prot_mem(sk)[0]) {
>>                 sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
>> -                                   sysctl_tcp_rmem[2]);
>> +                                   prot->sysctl_rmem[2]);
>>         }
>>         if (atomic_read(&sk->sk_rmem_alloc)>  sk->sk_rcvbuf)
>>                 tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
>> @@ -4806,7 +4807,7 @@ static int tcp_prune_queue(struct sock *sk)
>>
>>         if (atomic_read(&sk->sk_rmem_alloc)>= sk->sk_rcvbuf)
>>                 tcp_clamp_window(sk);
>> -       else if (tcp_memory_pressure)
>> +       else if (sk_memory_pressure(sk))
>>                 tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
>>
>>         tcp_collapse_ofo_queue(sk);
>> @@ -4872,11 +4873,11 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
>>                 return 0;
>>
>>         /* If we are under global TCP memory pressure, do not expand.  */
>> -       if (tcp_memory_pressure)
>> +       if (sk_memory_pressure(sk))
>>                 return 0;
>>
>>         /* If we are under soft global TCP memory pressure, do not expand.  */
>> -       if (atomic_long_read(&tcp_memory_allocated)>= sysctl_tcp_mem[0])
>> +       if (atomic_long_read(sk_memory_allocated(sk))>= sk_prot_mem(sk)[0])
>>                 return 0;
>>
>>         /* If we filled the congestion window, do not expand.  */
>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>> index 1c12b8e..af6c095 100644
>> --- a/net/ipv4/tcp_ipv4.c
>> +++ b/net/ipv4/tcp_ipv4.c
>> @@ -1848,6 +1848,7 @@ static int tcp_v4_init_sock(struct sock *sk)
>>   {
>>         struct inet_connection_sock *icsk = inet_csk(sk);
>>         struct tcp_sock *tp = tcp_sk(sk);
>> +       struct kmem_cgroup *sg;
>>
>>         skb_queue_head_init(&tp->out_of_order_queue);
>>         tcp_init_xmit_timers(sk);
>> @@ -1901,7 +1902,13 @@ static int tcp_v4_init_sock(struct sock *sk)
>>         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
>>
>>         local_bh_disable();
>> -       percpu_counter_inc(&tcp_sockets_allocated);
>> +       percpu_counter_inc(sk_sockets_allocated(sk));
>> +
>> +#ifdef CONFIG_CGROUP_KMEM
>> +       for (sg = sk->sk_cgrp->parent; sg; sg = sg->parent)
>> +               percpu_counter_inc(sg_sockets_allocated(sk->sk_prot, sg));
>> +#endif
>> +
>>         local_bh_enable();
>>
>>         return 0;
>> @@ -1910,6 +1917,7 @@ static int tcp_v4_init_sock(struct sock *sk)
>>   void tcp_v4_destroy_sock(struct sock *sk)
>>   {
>>         struct tcp_sock *tp = tcp_sk(sk);
>> +       struct kmem_cgroup *sg;
>>
>>         tcp_clear_xmit_timers(sk);
>>
>> @@ -1957,7 +1965,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
>>                 tp->cookie_values = NULL;
>>         }
>>
>> -       percpu_counter_dec(&tcp_sockets_allocated);
>> +       percpu_counter_dec(sk_sockets_allocated(sk));
>> +#ifdef CONFIG_CGROUP_KMEM
>> +       for (sg = sk->sk_cgrp->parent; sg; sg = sg->parent)
>> +               percpu_counter_dec(sg_sockets_allocated(sk->sk_prot, sg));
>> +#endif
>>   }
>>   EXPORT_SYMBOL(tcp_v4_destroy_sock);
>>
>> @@ -2598,11 +2610,14 @@ struct proto tcp_prot = {
>>         .unhash                 = inet_unhash,
>>         .get_port               = inet_csk_get_port,
>>         .enter_memory_pressure  = tcp_enter_memory_pressure,
>> -       .sockets_allocated      =&tcp_sockets_allocated,
>> +       .memory_pressure        = memory_pressure_tcp,
>> +       .sockets_allocated      = sockets_allocated_tcp,
>>         .orphan_count           =&tcp_orphan_count,
>> -       .memory_allocated       =&tcp_memory_allocated,
>> -       .memory_pressure        =&tcp_memory_pressure,
>> -       .sysctl_mem             = sysctl_tcp_mem,
>> +       .memory_allocated       = memory_allocated_tcp,
>> +#ifdef CONFIG_CGROUP_KMEM
>> +       .init_cgroup            = tcp_init_cgroup,
>> +#endif
>> +       .prot_mem               = tcp_sysctl_mem,
>>         .sysctl_wmem            = sysctl_tcp_wmem,
>>         .sysctl_rmem            = sysctl_tcp_rmem,
>>         .max_header             = MAX_TCP_HEADER,
>> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
>> index 882e0b0..06aeb31 100644
>> --- a/net/ipv4/tcp_output.c
>> +++ b/net/ipv4/tcp_output.c
>> @@ -1912,7 +1912,7 @@ u32 __tcp_select_window(struct sock *sk)
>>         if (free_space<  (full_space>>  1)) {
>>                 icsk->icsk_ack.quick = 0;
>>
>> -               if (tcp_memory_pressure)
>> +               if (sk_memory_pressure(sk))
>>                         tp->rcv_ssthresh = min(tp->rcv_ssthresh,
>>                                                4U * tp->advmss);
>>
>> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
>> index ecd44b0..2c67617 100644
>> --- a/net/ipv4/tcp_timer.c
>> +++ b/net/ipv4/tcp_timer.c
>> @@ -261,7 +261,7 @@ static void tcp_delack_timer(unsigned long data)
>>         }
>>
>>   out:
>> -       if (tcp_memory_pressure)
>> +       if (sk_memory_pressure(sk))
>>                 sk_mem_reclaim(sk);
>>   out_unlock:
>>         bh_unlock_sock(sk);
>> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
>> index 1b5a193..6c08c65 100644
>> --- a/net/ipv4/udp.c
>> +++ b/net/ipv4/udp.c
>> @@ -120,9 +120,6 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min);
>>   int sysctl_udp_wmem_min __read_mostly;
>>   EXPORT_SYMBOL(sysctl_udp_wmem_min);
>>
>> -atomic_long_t udp_memory_allocated;
>> -EXPORT_SYMBOL(udp_memory_allocated);
>> -
>>   #define MAX_UDP_PORTS 65536
>>   #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
>>
>> @@ -1918,6 +1915,19 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
>>   }
>>   EXPORT_SYMBOL(udp_poll);
>>
>> +static atomic_long_t udp_memory_allocated;
>> +atomic_long_t *memory_allocated_udp(struct kmem_cgroup *sg)
>> +{
>> +       return&udp_memory_allocated;
>> +}
>> +EXPORT_SYMBOL(memory_allocated_udp);
>> +
>> +long *udp_sysctl_mem(struct kmem_cgroup *sg)
>> +{
>> +       return sysctl_udp_mem;
>> +}
>> +EXPORT_SYMBOL(udp_sysctl_mem);
>> +
>>   struct proto udp_prot = {
>>         .name              = "UDP",
>>         .owner             = THIS_MODULE,
>> @@ -1936,8 +1946,8 @@ struct proto udp_prot = {
>>         .unhash            = udp_lib_unhash,
>>         .rehash            = udp_v4_rehash,
>>         .get_port          = udp_v4_get_port,
>> -       .memory_allocated  =&udp_memory_allocated,
>> -       .sysctl_mem        = sysctl_udp_mem,
>> +       .memory_allocated  =&memory_allocated_udp,
>> +       .prot_mem          = udp_sysctl_mem,
>>         .sysctl_wmem       =&sysctl_udp_wmem_min,
>>         .sysctl_rmem       =&sysctl_udp_rmem_min,
>>         .obj_size          = sizeof(struct udp_sock),
>> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
>> index d1fb63f..0762e68 100644
>> --- a/net/ipv6/tcp_ipv6.c
>> +++ b/net/ipv6/tcp_ipv6.c
>> @@ -1959,6 +1959,7 @@ static int tcp_v6_init_sock(struct sock *sk)
>>   {
>>         struct inet_connection_sock *icsk = inet_csk(sk);
>>         struct tcp_sock *tp = tcp_sk(sk);
>> +       struct kmem_cgroup *sg;
>>
>>         skb_queue_head_init(&tp->out_of_order_queue);
>>         tcp_init_xmit_timers(sk);
>> @@ -2012,7 +2013,12 @@ static int tcp_v6_init_sock(struct sock *sk)
>>         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
>>
>>         local_bh_disable();
>> -       percpu_counter_inc(&tcp_sockets_allocated);
>> +       percpu_counter_inc(sk_sockets_allocated(sk));
>> +#ifdef CONFIG_CGROUP_KMEM
>> +       for (sg = sk->sk_cgrp->parent; sg; sg = sg->parent)
>> +               percpu_counter_dec(sg_sockets_allocated(sk->sk_prot, sg));
>> +#endif
>> +
>>         local_bh_enable();
>>
>>         return 0;
>> @@ -2221,11 +2227,11 @@ struct proto tcpv6_prot = {
>>         .unhash                 = inet_unhash,
>>         .get_port               = inet_csk_get_port,
>>         .enter_memory_pressure  = tcp_enter_memory_pressure,
>> -       .sockets_allocated      =&tcp_sockets_allocated,
>> -       .memory_allocated       =&tcp_memory_allocated,
>> -       .memory_pressure        =&tcp_memory_pressure,
>> +       .sockets_allocated      = sockets_allocated_tcp,
>> +       .memory_allocated       = memory_allocated_tcp,
>> +       .memory_pressure        = memory_pressure_tcp,
>>         .orphan_count           =&tcp_orphan_count,
>> -       .sysctl_mem             = sysctl_tcp_mem,
>> +       .prot_mem               = tcp_sysctl_mem,
>>         .sysctl_wmem            = sysctl_tcp_wmem,
>>         .sysctl_rmem            = sysctl_tcp_rmem,
>>         .max_header             = MAX_TCP_HEADER,
>> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
>> index 29213b5..ef4b5b3 100644
>> --- a/net/ipv6/udp.c
>> +++ b/net/ipv6/udp.c
>> @@ -1465,8 +1465,8 @@ struct proto udpv6_prot = {
>>         .unhash            = udp_lib_unhash,
>>         .rehash            = udp_v6_rehash,
>>         .get_port          = udp_v6_get_port,
>> -       .memory_allocated  =&udp_memory_allocated,
>> -       .sysctl_mem        = sysctl_udp_mem,
>> +       .memory_allocated  = memory_allocated_udp,
>> +       .prot_mem          = udp_sysctl_mem,
>>         .sysctl_wmem       =&sysctl_udp_wmem_min,
>>         .sysctl_rmem       =&sysctl_udp_rmem_min,
>>         .obj_size          = sizeof(struct udp6_sock),
>> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
>> index 836aa63..1b0300d 100644
>> --- a/net/sctp/socket.c
>> +++ b/net/sctp/socket.c
>> @@ -119,11 +119,30 @@ static int sctp_memory_pressure;
>>   static atomic_long_t sctp_memory_allocated;
>>   struct percpu_counter sctp_sockets_allocated;
>>
>> +static long *sctp_sysctl_mem(struct kmem_cgroup *sg)
>> +{
>> +       return sysctl_sctp_mem;
>> +}
>> +
>>   static void sctp_enter_memory_pressure(struct sock *sk)
>>   {
>>         sctp_memory_pressure = 1;
>>   }
>>
>> +static int *memory_pressure_sctp(struct kmem_cgroup *sg)
>> +{
>> +       return&sctp_memory_pressure;
>> +}
>> +
>> +static atomic_long_t *memory_allocated_sctp(struct kmem_cgroup *sg)
>> +{
>> +       return&sctp_memory_allocated;
>> +}
>> +
>> +static struct percpu_counter *sockets_allocated_sctp(struct kmem_cgroup *sg)
>> +{
>> +       return&sctp_sockets_allocated;
>> +}
>>
>>   /* Get the sndbuf space available at the time on the association.  */
>>   static inline int sctp_wspace(struct sctp_association *asoc)
>> @@ -6831,13 +6850,13 @@ struct proto sctp_prot = {
>>         .unhash      =  sctp_unhash,
>>         .get_port    =  sctp_get_port,
>>         .obj_size    =  sizeof(struct sctp_sock),
>> -       .sysctl_mem  =  sysctl_sctp_mem,
>> +       .prot_mem    =  sctp_sysctl_mem,
>>         .sysctl_rmem =  sysctl_sctp_rmem,
>>         .sysctl_wmem =  sysctl_sctp_wmem,
>> -       .memory_pressure =&sctp_memory_pressure,
>> +       .memory_pressure = memory_pressure_sctp,
>>         .enter_memory_pressure = sctp_enter_memory_pressure,
>> -       .memory_allocated =&sctp_memory_allocated,
>> -       .sockets_allocated =&sctp_sockets_allocated,
>> +       .memory_allocated = memory_allocated_sctp,
>> +       .sockets_allocated = sockets_allocated_sctp,
>>   };
>>
>>   #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
>> @@ -6863,12 +6882,12 @@ struct proto sctpv6_prot = {
>>         .unhash         = sctp_unhash,
>>         .get_port       = sctp_get_port,
>>         .obj_size       = sizeof(struct sctp6_sock),
>> -       .sysctl_mem     = sysctl_sctp_mem,
>> +       .prot_mem       = sctp_sysctl_mem,
>>         .sysctl_rmem    = sysctl_sctp_rmem,
>>         .sysctl_wmem    = sysctl_sctp_wmem,
>> -       .memory_pressure =&sctp_memory_pressure,
>> +       .memory_pressure = memory_pressure_sctp,
>>         .enter_memory_pressure = sctp_enter_memory_pressure,
>> -       .memory_allocated =&sctp_memory_allocated,
>> -       .sockets_allocated =&sctp_sockets_allocated,
>> +       .memory_allocated = memory_allocated_sctp,
>> +       .sockets_allocated = sockets_allocated_sctp,
>>   };
>>   #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
>> --
>> 1.7.6
>>

Greg Thelen Sept. 6, 2011, 10:12 p.m. UTC | #5
On Tue, Sep 6, 2011 at 9:16 AM, Glauber Costa <glommer@parallels.com> wrote:
> On 09/06/2011 01:08 PM, Greg Thelen wrote:
>>
>> On Mon, Sep 5, 2011 at 7:35 PM, Glauber Costa<glommer@parallels.com>
>>  wrote:
>>>
>>> This patch introduces per-cgroup tcp buffers limitation. This allows
>>> sysadmins to specify a maximum amount of kernel memory that
>>> tcp connections can use at any point in time. TCP is the main interest
>>> in this work, but extending it to other protocols would be easy.
>
> Hello Greg,
>
>> With this approach we would be giving admins the ability to
>> independently limit user memory with memcg and kernel memory with this
>> new kmem cgroup.
>>
>> At least in some situations admins prefer to give a particular
>> container X bytes without thinking about the kernel vs user split.
>> Sometimes the admin would prefer the kernel to keep the total
>> user+kernel memory below a certain threshold.  To achieve this with
>> this approach would we need a user space agent to monitor both kernel
>> and user usage for a container and grow/shrink memcg/kmem limits?
>
> Yes, I believe so. And this is not only valid for containers: the
> information we expose in proc, sys, cgroups, etc. is always much finer
> grained than many users want to deal with. Tools come in to fill this
> gap.

In your use cases, do jobs specify independent kmem usage limits and
user memory usage limits?

I presume for people who want to simply dedicate X bytes of memory to
container C that a user-space agent would need to poll both
memcg/X/memory.usage_in_bytes and kmem/X/kmem.usage_in_bytes (or some
other file) to determine if memory limits should be adjusted (i.e. if
kernel memory is growing, then user memory would need to shrink).  So
far my use cases involve a single memory limit which includes both
kernel and user memory.  So I would need a user space agent to poll
{memcg,kmem}.usage_in_bytes to apply pressure to memcg if kmem grows
and vice versa.
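
Very roughly, such an agent could look like the sketch below.  This is
only an illustration: the cgroup mount points and the kmem usage file
name are made up, since that part of the interface isn't defined yet.

/* Toy user-space balancer: keep container X's user + kernel memory
 * under a single budget by shrinking the memcg limit as kernel usage
 * grows.  All paths below are illustrative. */
#include <stdio.h>
#include <unistd.h>

#define BUDGET      (512UL << 20)   /* 512 MB total for the container */
#define KMEM_USAGE  "/cgroup/kmem/X/kmem.usage_in_bytes"
#define MEMCG_LIMIT "/cgroup/memory/X/memory.limit_in_bytes"

static unsigned long read_ul(const char *path)
{
        unsigned long val = 0;
        FILE *f = fopen(path, "r");

        if (f) {
                fscanf(f, "%lu", &val);
                fclose(f);
        }
        return val;
}

int main(void)
{
        for (;;) {
                unsigned long kmem = read_ul(KMEM_USAGE);
                unsigned long user = BUDGET > kmem ? BUDGET - kmem : 0;
                FILE *f = fopen(MEMCG_LIMIT, "w");

                /* whatever the kernel pins comes out of the user share */
                if (f) {
                        fprintf(f, "%lu\n", user);
                        fclose(f);
                }
                sleep(5);
        }
        return 0;
}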

Do you foresee instantiation of multiple kmem cgroups, so that a
process could be added into kmem/K1 or kmem/K2?  If so, do you plan on
supporting migration between cgroups and/or migration of kmem charge
from K1 to K2?

>> Do you foresee the kmem cgroup growing to include reclaimable slab,
>> where freeing one type of memory allows for reclaim of the other?
>
> Yes, absolutely.

Small comments below.

>>> It piggybacks in the memory control mechanism already present in
>>> /proc/sys/net/ipv4/tcp_mem. There is a soft limit, and a hard limit,
>>> that will suppress allocation when reached. For each cgroup, however,
>>> the file kmem.tcp_maxmem will be used to cap those values.
>>>
>>> The usage I have in mind here is containers. Each container will
>>> define its own values for soft and hard limits, but none of them will
>>> be possibly bigger than the value the box' sysadmin specified from
>>> the outside.
>>>
>>> To test for any performance impacts of this patch, I used netperf's
>>> TCP_RR benchmark on localhost, so we can have both recv and snd in
>>> action.
>>>
>>> Command line used was ./src/netperf -t TCP_RR -H localhost, and the
>>> results:
>>>
>>> Without the patch
>>> =================
>>>
>>> Socket Size   Request  Resp.   Elapsed  Trans.
>>> Send   Recv   Size     Size    Time     Rate
>>> bytes  Bytes  bytes    bytes   secs.    per sec
>>>
>>> 16384  87380  1        1       10.00    26996.35
>>> 16384  87380
>>>
>>> With the patch
>>> ===============
>>>
>>> Local /Remote
>>> Socket Size   Request  Resp.   Elapsed  Trans.
>>> Send   Recv   Size     Size    Time     Rate
>>> bytes  Bytes  bytes    bytes   secs.    per sec
>>>
>>> 16384  87380  1        1       10.00    27291.86
>>> 16384  87380
>>>
>>> The difference is within a one-percent range.
>>>
>>> Nesting cgroups doesn't seem to be the dominating factor as well,
>>> with nestings up to 10 levels not showing a significant performance
>>> difference.
>>>
>>> Signed-off-by: Glauber Costa<glommer@parallels.com>
>>> CC: David S. Miller<davem@davemloft.net>
>>> CC: Hiroyouki Kamezawa<kamezawa.hiroyu@jp.fujitsu.com>
>>> CC: Eric W. Biederman<ebiederm@xmission.com>
>>> ---
>>>  crypto/af_alg.c               |    7 ++-
>>>  include/linux/cgroup_subsys.h |    4 +
>>>  include/net/netns/ipv4.h      |    1 +
>>>  include/net/sock.h            |   66 +++++++++++++++-
>>>  include/net/tcp.h             |   12 ++-
>>>  include/net/udp.h             |    3 +-
>>>  include/trace/events/sock.h   |   10 +-
>>>  init/Kconfig                  |   11 +++
>>>  mm/Makefile                   |    1 +
>>>  net/core/sock.c               |  136 +++++++++++++++++++++++++++-------
>>>  net/decnet/af_decnet.c        |   21 +++++-
>>>  net/ipv4/proc.c               |    8 +-
>>>  net/ipv4/sysctl_net_ipv4.c    |   59 +++++++++++++--
>>>  net/ipv4/tcp.c                |  164
>>> +++++++++++++++++++++++++++++++++++-----
>>>  net/ipv4/tcp_input.c          |   17 ++--
>>>  net/ipv4/tcp_ipv4.c           |   27 +++++--
>>>  net/ipv4/tcp_output.c         |    2 +-
>>>  net/ipv4/tcp_timer.c          |    2 +-
>>>  net/ipv4/udp.c                |   20 ++++-
>>>  net/ipv6/tcp_ipv6.c           |   16 +++-
>>>  net/ipv6/udp.c                |    4 +-
>>>  net/sctp/socket.c             |   35 +++++++--
>>>  22 files changed, 514 insertions(+), 112 deletions(-)
>>>
>>> diff --git a/crypto/af_alg.c b/crypto/af_alg.c
>>> index ac33d5f..df168d8 100644
>>> --- a/crypto/af_alg.c
>>> +++ b/crypto/af_alg.c
>>> @@ -29,10 +29,15 @@ struct alg_type_list {
>>>
>>>  static atomic_long_t alg_memory_allocated;
>>>
>>> +static atomic_long_t *memory_allocated_alg(struct kmem_cgroup *sg)
>>> +{
>>> +       return&alg_memory_allocated;
>>> +}
>>> +
>>>  static struct proto alg_proto = {
>>>        .name                   = "ALG",
>>>        .owner                  = THIS_MODULE,
>>> -       .memory_allocated       =&alg_memory_allocated,
>>> +       .memory_allocated       = memory_allocated_alg,
>>>        .obj_size               = sizeof(struct alg_sock),
>>>  };
>>>
>>> diff --git a/include/linux/cgroup_subsys.h
>>> b/include/linux/cgroup_subsys.h
>>> index ac663c1..363b8e8 100644
>>> --- a/include/linux/cgroup_subsys.h
>>> +++ b/include/linux/cgroup_subsys.h
>>> @@ -35,6 +35,10 @@ SUBSYS(cpuacct)
>>>  SUBSYS(mem_cgroup)
>>>  #endif
>>>
>>> +#ifdef CONFIG_CGROUP_KMEM
>>> +SUBSYS(kmem)
>>> +#endif
>>> +
>>>  /* */
>>>
>>>  #ifdef CONFIG_CGROUP_DEVICE
>>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>>> index d786b4f..bbd023a 100644
>>> --- a/include/net/netns/ipv4.h
>>> +++ b/include/net/netns/ipv4.h
>>> @@ -55,6 +55,7 @@ struct netns_ipv4 {
>>>        int current_rt_cache_rebuild_count;
>>>
>>>        unsigned int sysctl_ping_group_range[2];
>>> +       long sysctl_tcp_mem[3];
>>>
>>>        atomic_t rt_genid;
>>>        atomic_t dev_addr_genid;
>>> diff --git a/include/net/sock.h b/include/net/sock.h
>>> index 8e4062f..e085148 100644
>>> --- a/include/net/sock.h
>>> +++ b/include/net/sock.h
>>> @@ -62,7 +62,9 @@
>>>  #include<linux/atomic.h>
>>>  #include<net/dst.h>
>>>  #include<net/checksum.h>
>>> +#include<linux/kmem_cgroup.h>
>>>
>>> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
>>>  /*
>>>  * This structure really needs to be cleaned up.
>>>  * Most of it is for TCP, and not used by any of
>>> @@ -339,6 +341,7 @@ struct sock {
>>>  #endif
>>>        __u32                   sk_mark;
>>>        u32                     sk_classid;
>>> +       struct kmem_cgroup      *sk_cgrp;
>>>        void                    (*sk_state_change)(struct sock *sk);
>>>        void                    (*sk_data_ready)(struct sock *sk, int
>>> bytes);
>>>        void                    (*sk_write_space)(struct sock *sk);
>>> @@ -786,16 +789,21 @@ struct proto {
>>>
>>>        /* Memory pressure */
>>>        void                    (*enter_memory_pressure)(struct sock *sk);
>>> -       atomic_long_t           *memory_allocated;      /* Current
>>> allocated memory. */
>>> -       struct percpu_counter   *sockets_allocated;     /* Current number
>>> of sockets. */
>>> +       /* Current allocated memory. */
>>> +       atomic_long_t           *(*memory_allocated)(struct kmem_cgroup
>>> *sg);
>>> +       /* Current number of sockets. */
>>> +       struct percpu_counter   *(*sockets_allocated)(struct kmem_cgroup
>>> *sg);
>>> +
>>> +       int                     (*init_cgroup)(struct cgroup *cgrp,
>>> +                                              struct cgroup_subsys *ss);
>>>        /*
>>>         * Pressure flag: try to collapse.
>>>         * Technical note: it is used by multiple contexts non atomically.
>>>         * All the __sk_mem_schedule() is of this nature: accounting
>>>         * is strict, actions are advisory and have some latency.
>>>         */
>>> -       int                     *memory_pressure;
>>> -       long                    *sysctl_mem;
>>> +       int                     *(*memory_pressure)(struct kmem_cgroup
>>> *sg);
>>> +       long                    *(*prot_mem)(struct kmem_cgroup *sg);
>>>        int                     *sysctl_wmem;
>>>        int                     *sysctl_rmem;
>>>        int                     max_header;
>>> @@ -826,6 +834,56 @@ struct proto {
>>>  #endif
>>>  };
>>>
>>> +#define sk_memory_pressure(sk)                                         \
>>> +({                                                                     \
>>> +       int *__ret = NULL;                                              \
>>> +       if ((sk)->sk_prot->memory_pressure)                             \
>>> +               __ret = (sk)->sk_prot->memory_pressure(sk->sk_cgrp);    \
>>> +       __ret;                                                          \
>>> +})
>>> +
>>> +#define sk_sockets_allocated(sk)                               \
>>> +({                                                             \
>>> +       struct percpu_counter *__p;                             \
>>> +       __p = (sk)->sk_prot->sockets_allocated(sk->sk_cgrp);    \
>>> +       __p;                                                    \
>>> +})

Could this be simplified as follows (the same applies to the following few macros):

static inline struct percpu_counter *sk_sockets_allocated(struct sock *sk)
{
        return sk->sk_prot->sockets_allocated(sk->sk_cgrp);
}
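
For completeness, the remaining helpers could follow the same pattern.
This is just a sketch mirroring the semantics of the posted macros:

static inline atomic_long_t *sk_memory_allocated(struct sock *sk)
{
        return sk->sk_prot->memory_allocated(sk->sk_cgrp);
}

static inline long *sk_prot_mem(struct sock *sk)
{
        return sk->sk_prot->prot_mem(sk->sk_cgrp);
}

static inline int *sk_memory_pressure(struct sock *sk)
{
        /* memory_pressure is optional in struct proto, keep the NULL check */
        return sk->sk_prot->memory_pressure ?
                sk->sk_prot->memory_pressure(sk->sk_cgrp) : NULL;
}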

>>> +#define sk_memory_allocated(sk)                                        \
>>> +({                                                             \
>>> +       atomic_long_t *__mem;                                   \
>>> +       __mem = (sk)->sk_prot->memory_allocated(sk->sk_cgrp);   \
>>> +       __mem;                                                  \
>>> +})
>>> +
>>> +#define sk_prot_mem(sk)                                                \
>>> +({                                                             \
>>> +       long *__mem = (sk)->sk_prot->prot_mem(sk->sk_cgrp);     \
>>> +       __mem;                                                  \
>>> +})
>>> +
>>> +#define sg_memory_pressure(prot, sg)                           \
>>> +({                                                             \
>>> +       int *__ret = NULL;                                      \
>>> +       if (prot->memory_pressure)                              \
>>> +               __ret = (prot)->memory_pressure(sg);            \
>>> +       __ret;                                                  \
>>> +})
>>> +
>>> +#define sg_memory_allocated(prot, sg)                          \
>>> +({                                                             \
>>> +       atomic_long_t *__mem;                                   \
>>> +       __mem = (prot)->memory_allocated(sg);                   \
>>> +       __mem;                                                  \
>>> +})
>>> +
>>> +#define sg_sockets_allocated(prot, sg)                         \
>>> +({                                                             \
>>> +       struct percpu_counter *__p;                             \
>>> +       __p = (prot)->sockets_allocated(sg);                    \
>>> +       __p;                                                    \
>>> +})
>>> +
>>>  extern int proto_register(struct proto *prot, int alloc_slab);
>>>  extern void proto_unregister(struct proto *prot);
>>>
>>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>>> index 149a415..8e1ec4a 100644
>>> --- a/include/net/tcp.h
>>> +++ b/include/net/tcp.h
>>> @@ -230,7 +230,6 @@ extern int sysctl_tcp_fack;
>>>  extern int sysctl_tcp_reordering;
>>>  extern int sysctl_tcp_ecn;
>>>  extern int sysctl_tcp_dsack;
>>> -extern long sysctl_tcp_mem[3];
>>>  extern int sysctl_tcp_wmem[3];
>>>  extern int sysctl_tcp_rmem[3];
>>>  extern int sysctl_tcp_app_win;
>>> @@ -253,9 +252,12 @@ extern int sysctl_tcp_cookie_size;
>>>  extern int sysctl_tcp_thin_linear_timeouts;
>>>  extern int sysctl_tcp_thin_dupack;
>>>
>>> -extern atomic_long_t tcp_memory_allocated;
>>> -extern struct percpu_counter tcp_sockets_allocated;
>>> -extern int tcp_memory_pressure;
>>> +struct kmem_cgroup;
>>> +extern long *tcp_sysctl_mem(struct kmem_cgroup *sg);
>>> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg);
>>> +int *memory_pressure_tcp(struct kmem_cgroup *sg);
>>> +int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss);
>>> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg);
>>>
>>>  /*
>>>  * The next routines deal with comparing 32 bit unsigned ints
>>> @@ -286,7 +288,7 @@ static inline bool tcp_too_many_orphans(struct sock
>>> *sk, int shift)
>>>        }
>>>
>>>        if (sk->sk_wmem_queued>  SOCK_MIN_SNDBUF&&
>>> -           atomic_long_read(&tcp_memory_allocated)>  sysctl_tcp_mem[2])
>>> +           atomic_long_read(sk_memory_allocated(sk))>
>>>  sk_prot_mem(sk)[2])
>>>                return true;
>>>        return false;
>>>  }
>>> diff --git a/include/net/udp.h b/include/net/udp.h
>>> index 67ea6fc..0e27388 100644
>>> --- a/include/net/udp.h
>>> +++ b/include/net/udp.h
>>> @@ -105,7 +105,8 @@ static inline struct udp_hslot *udp_hashslot2(struct
>>> udp_table *table,
>>>
>>>  extern struct proto udp_prot;
>>>
>>> -extern atomic_long_t udp_memory_allocated;
>>> +atomic_long_t *memory_allocated_udp(struct kmem_cgroup *sg);
>>> +long *udp_sysctl_mem(struct kmem_cgroup *sg);
>>>
>>>  /* sysctl variables for udp */
>>>  extern long sysctl_udp_mem[3];
>>> diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
>>> index 779abb9..12a6083 100644
>>> --- a/include/trace/events/sock.h
>>> +++ b/include/trace/events/sock.h
>>> @@ -37,7 +37,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
>>>
>>>        TP_STRUCT__entry(
>>>                __array(char, name, 32)
>>> -               __field(long *, sysctl_mem)
>>> +               __field(long *, prot_mem)
>>>                __field(long, allocated)
>>>                __field(int, sysctl_rmem)
>>>                __field(int, rmem_alloc)
>>> @@ -45,7 +45,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
>>>
>>>        TP_fast_assign(
>>>                strncpy(__entry->name, prot->name, 32);
>>> -               __entry->sysctl_mem = prot->sysctl_mem;
>>> +               __entry->prot_mem = sk->sk_prot->prot_mem(sk->sk_cgrp);
>>>                __entry->allocated = allocated;
>>>                __entry->sysctl_rmem = prot->sysctl_rmem[0];
>>>                __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
>>> @@ -54,9 +54,9 @@ TRACE_EVENT(sock_exceed_buf_limit,
>>>        TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld "
>>>                "sysctl_rmem=%d rmem_alloc=%d",
>>>                __entry->name,
>>> -               __entry->sysctl_mem[0],
>>> -               __entry->sysctl_mem[1],
>>> -               __entry->sysctl_mem[2],
>>> +               __entry->prot_mem[0],
>>> +               __entry->prot_mem[1],
>>> +               __entry->prot_mem[2],
>>>                __entry->allocated,
>>>                __entry->sysctl_rmem,
>>>                __entry->rmem_alloc)
>>> diff --git a/init/Kconfig b/init/Kconfig
>>> index d627783..5955ac2 100644
>>> --- a/init/Kconfig
>>> +++ b/init/Kconfig
>>> @@ -690,6 +690,17 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
>>>          select this option (if, for some reason, they need to disable it
>>>          then swapaccount=0 does the trick).
>>>
>>> +config CGROUP_KMEM
>>> +       bool "Kernel Memory Resource Controller for Control Groups"
>>> +       depends on CGROUPS
>>> +       help
>>> +         The Kernel Memory cgroup can limit the amount of memory used by
>>> +         certain kernel objects in the system. Those are fundamentally
>>> +         different from the entities handled by the Memory Controller,
>>> +         which are page-based, and can be swapped. Users of the kmem
>>> +         cgroup can use it to guarantee that no group of processes will
>>> +         ever exhaust kernel resources alone.
>>> +
>>>  config CGROUP_PERF
>>>        bool "Enable perf_event per-cpu per-container group (cgroup)
>>> monitoring"
>>>        depends on PERF_EVENTS&&  CGROUPS
>>> diff --git a/mm/Makefile b/mm/Makefile
>>> index 836e416..1b1aa24 100644
>>> --- a/mm/Makefile
>>> +++ b/mm/Makefile
>>> @@ -45,6 +45,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
>>>  obj-$(CONFIG_QUICKLIST) += quicklist.o
>>>  obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
>>>  obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
>>> +obj-$(CONFIG_CGROUP_KMEM) += kmem_cgroup.o
>>>  obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
>>>  obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
>>>  obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
>>> diff --git a/net/core/sock.c b/net/core/sock.c
>>> index 3449df8..2d968ea 100644
>>> --- a/net/core/sock.c
>>> +++ b/net/core/sock.c
>>> @@ -134,6 +134,24 @@
>>>  #include<net/tcp.h>
>>>  #endif
>>>
>>> +static DEFINE_RWLOCK(proto_list_lock);
>>> +static LIST_HEAD(proto_list);
>>> +
>>> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
>>> +{
>>> +       struct proto *proto;
>>> +       int ret = 0;
>>> +
>>> +       read_lock(&proto_list_lock);
>>> +       list_for_each_entry(proto,&proto_list, node) {
>>> +               if (proto->init_cgroup)
>>> +                       ret |= proto->init_cgroup(cgrp, ss);
>>> +       }
>>> +       read_unlock(&proto_list_lock);
>>> +
>>> +       return ret;
>>> +}
>>> +
>>>  /*
>>>  * Each address family might have different locking rules, so we have
>>>  * one slock key per address family:
>>> @@ -1114,6 +1132,31 @@ void sock_update_classid(struct sock *sk)
>>>  EXPORT_SYMBOL(sock_update_classid);
>>>  #endif
>>>
>>> +void sock_update_kmem_cgrp(struct sock *sk)
>>> +{
>>> +#ifdef CONFIG_CGROUP_KMEM
>>> +       sk->sk_cgrp = kcg_from_task(current);
>>> +
>>> +       /*
>>> +        * We don't need to protect against anything task-related,
>>> because
>>> +        * we are basically stuck with the sock pointer that won't
>>> change,
>>> +        * even if the task that originated the socket changes cgroups.
>>> +        *
>>> +        * What we do have to guarantee, is that the chain leading us to
>>> +        * the top level won't change under our noses. Incrementing the
>>> +        * reference count via cgroup_exclude_rmdir guarantees that.
>>> +        */
>>> +       cgroup_exclude_rmdir(&sk->sk_cgrp->css);
>>> +#endif
>>> +}
>>> +
>>> +void sock_release_kmem_cgrp(struct sock *sk)
>>> +{
>>> +#ifdef CONFIG_CGROUP_KMEM
>>> +       cgroup_release_and_wakeup_rmdir(&sk->sk_cgrp->css);
>>> +#endif
>>> +}
>>> +
>>>  /**
>>>  *     sk_alloc - All socket objects are allocated here
>>>  *     @net: the applicable net namespace
>>> @@ -1139,6 +1182,7 @@ struct sock *sk_alloc(struct net *net, int family,
>>> gfp_t priority,
>>>                atomic_set(&sk->sk_wmem_alloc, 1);
>>>
>>>                sock_update_classid(sk);
>>> +               sock_update_kmem_cgrp(sk);
>>>        }
>>>
>>>        return sk;
>>> @@ -1170,6 +1214,7 @@ static void __sk_free(struct sock *sk)
>>>                put_cred(sk->sk_peer_cred);
>>>        put_pid(sk->sk_peer_pid);
>>>        put_net(sock_net(sk));
>>> +       sock_release_kmem_cgrp(sk);
>>>        sk_prot_free(sk->sk_prot_creator, sk);
>>>  }
>>>
>>> @@ -1287,8 +1332,8 @@ struct sock *sk_clone(const struct sock *sk, const
>>> gfp_t priority)
>>>                sk_set_socket(newsk, NULL);
>>>                newsk->sk_wq = NULL;
>>>
>>> -               if (newsk->sk_prot->sockets_allocated)
>>> -
>>> percpu_counter_inc(newsk->sk_prot->sockets_allocated);
>>> +               if (sk_sockets_allocated(sk))
>>> +                       percpu_counter_inc(sk_sockets_allocated(sk));
>>>
>>>                if (sock_flag(newsk, SOCK_TIMESTAMP) ||
>>>                    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
>>> @@ -1676,29 +1721,51 @@ EXPORT_SYMBOL(sk_wait_data);
>>>  */
>>>  int __sk_mem_schedule(struct sock *sk, int size, int kind)
>>>  {
>>> -       struct proto *prot = sk->sk_prot;
>>>        int amt = sk_mem_pages(size);
>>> +       struct proto *prot = sk->sk_prot;
>>>        long allocated;
>>> +       int *memory_pressure;
>>> +       long *prot_mem;
>>> +       int parent_failure = 0;
>>> +       struct kmem_cgroup *sg;
>>>
>>>        sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
>>> -       allocated = atomic_long_add_return(amt, prot->memory_allocated);
>>> +
>>> +       memory_pressure = sk_memory_pressure(sk);
>>> +       prot_mem = sk_prot_mem(sk);
>>> +
>>> +       allocated = atomic_long_add_return(amt, sk_memory_allocated(sk));
>>> +
>>> +#ifdef CONFIG_CGROUP_KMEM
>>> +       for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
>>> +               long alloc;
>>> +               /*
>>> +                * Large nestings are not the common case, and stopping
>>> in the
>>> +                * middle would be complicated enough, that we bill it
>>> all the
>>> +                * way through the root, and if needed, unbill everything
>>> later
>>> +                */
>>> +               alloc = atomic_long_add_return(amt,
>>> +                                              sg_memory_allocated(prot,
>>> sg));
>>> +               parent_failure |= (alloc>  sk_prot_mem(sk)[2]);
>>> +       }
>>> +#endif
>>> +
>>> +       /* Over hard limit (we, or our parents) */
>>> +       if (parent_failure || (allocated>  prot_mem[2]))
>>> +               goto suppress_allocation;
>>>
>>>        /* Under limit. */
>>> -       if (allocated<= prot->sysctl_mem[0]) {
>>> -               if (prot->memory_pressure&&  *prot->memory_pressure)
>>> -                       *prot->memory_pressure = 0;
>>> +       if (allocated<= prot_mem[0]) {
>>> +               if (memory_pressure&&  *memory_pressure)
>>> +                       *memory_pressure = 0;
>>>                return 1;
>>>        }
>>>
>>>        /* Under pressure. */
>>> -       if (allocated>  prot->sysctl_mem[1])
>>> +       if (allocated>  prot_mem[1])
>>>                if (prot->enter_memory_pressure)
>>>                        prot->enter_memory_pressure(sk);
>>>
>>> -       /* Over hard limit. */
>>> -       if (allocated>  prot->sysctl_mem[2])
>>> -               goto suppress_allocation;
>>> -
>>>        /* guarantee minimum buffer size under pressure */
>>>        if (kind == SK_MEM_RECV) {
>>>                if (atomic_read(&sk->sk_rmem_alloc)<
>>>  prot->sysctl_rmem[0])
>>> @@ -1712,13 +1779,13 @@ int __sk_mem_schedule(struct sock *sk, int size,
>>> int kind)
>>>                                return 1;
>>>        }
>>>
>>> -       if (prot->memory_pressure) {
>>> +       if (memory_pressure) {
>>>                int alloc;
>>>
>>> -               if (!*prot->memory_pressure)
>>> +               if (!*memory_pressure)
>>>                        return 1;
>>> -               alloc =
>>> percpu_counter_read_positive(prot->sockets_allocated);
>>> -               if (prot->sysctl_mem[2]>  alloc *
>>> +               alloc =
>>> percpu_counter_read_positive(sk_sockets_allocated(sk));
>>> +               if (prot_mem[2]>  alloc *
>>>                    sk_mem_pages(sk->sk_wmem_queued +
>>>                                 atomic_read(&sk->sk_rmem_alloc) +
>>>                                 sk->sk_forward_alloc))
>>> @@ -1741,7 +1808,13 @@ suppress_allocation:
>>>
>>>        /* Alas. Undo changes. */
>>>        sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
>>> -       atomic_long_sub(amt, prot->memory_allocated);
>>> +
>>> +       atomic_long_sub(amt, sk_memory_allocated(sk));
>>> +
>>> +#ifdef CONFIG_CGROUP_KMEM
>>> +       for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent)
>>> +               atomic_long_sub(amt, sg_memory_allocated(prot, sg));
>>> +#endif
>>>        return 0;
>>>  }
>>>  EXPORT_SYMBOL(__sk_mem_schedule);
>>> @@ -1753,14 +1826,24 @@ EXPORT_SYMBOL(__sk_mem_schedule);
>>>  void __sk_mem_reclaim(struct sock *sk)
>>>  {
>>>        struct proto *prot = sk->sk_prot;
>>> +       struct kmem_cgroup *sg = sk->sk_cgrp;
>>> +       int *memory_pressure = sk_memory_pressure(sk);
>>>
>>>        atomic_long_sub(sk->sk_forward_alloc>>  SK_MEM_QUANTUM_SHIFT,
>>> -                  prot->memory_allocated);
>>> +                  sk_memory_allocated(sk));
>>> +
>>> +#ifdef CONFIG_CGROUP_KMEM
>>> +       for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
>>> +               atomic_long_sub(sk->sk_forward_alloc>>
>>>  SK_MEM_QUANTUM_SHIFT,
>>> +                                               sg_memory_allocated(prot,
>>> sg));
>>> +       }
>>> +#endif
>>> +
>>>        sk->sk_forward_alloc&= SK_MEM_QUANTUM - 1;
>>>
>>> -       if (prot->memory_pressure&&  *prot->memory_pressure&&
>>> -           (atomic_long_read(prot->memory_allocated)<
>>>  prot->sysctl_mem[0]))
>>> -               *prot->memory_pressure = 0;
>>> +       if (memory_pressure&&  *memory_pressure&&
>>> +           (atomic_long_read(sk_memory_allocated(sk))<
>>>  sk_prot_mem(sk)[0]))
>>> +               *memory_pressure = 0;
>>>  }
>>>  EXPORT_SYMBOL(__sk_mem_reclaim);
>>>
>>> @@ -2252,9 +2335,6 @@ void sk_common_release(struct sock *sk)
>>>  }
>>>  EXPORT_SYMBOL(sk_common_release);
>>>
>>> -static DEFINE_RWLOCK(proto_list_lock);
>>> -static LIST_HEAD(proto_list);
>>> -
>>>  #ifdef CONFIG_PROC_FS
>>>  #define PROTO_INUSE_NR 64      /* should be enough for the first time */
>>>  struct prot_inuse {
>>> @@ -2479,13 +2559,17 @@ static char proto_method_implemented(const void
>>> *method)
>>>
>>>  static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
>>>  {
>>> +       struct kmem_cgroup *sg = kcg_from_task(current);
>>> +
>>>        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
>>>                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c
>>> %2c %2c %2c %2c %2c %2c %2c\n",
>>>                   proto->name,
>>>                   proto->obj_size,
>>>                   sock_prot_inuse_get(seq_file_net(seq), proto),
>>> -                  proto->memory_allocated != NULL ?
>>> atomic_long_read(proto->memory_allocated) : -1L,
>>> -                  proto->memory_pressure != NULL ?
>>> *proto->memory_pressure ? "yes" : "no" : "NI",
>>> +                  proto->memory_allocated != NULL ?
>>> +                       atomic_long_read(sg_memory_allocated(proto, sg))
>>> : -1L,
>>> +                  proto->memory_pressure != NULL ?
>>> +                       *sg_memory_pressure(proto, sg) ? "yes" : "no" :
>>> "NI",
>>>                   proto->max_header,
>>>                   proto->slab == NULL ? "no" : "yes",
>>>                   module_name(proto->owner),
>>> diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
>>> index 19acd00..463b299 100644
>>> --- a/net/decnet/af_decnet.c
>>> +++ b/net/decnet/af_decnet.c
>>> @@ -458,13 +458,28 @@ static void dn_enter_memory_pressure(struct sock
>>> *sk)
>>>        }
>>>  }
>>>
>>> +static atomic_long_t *memory_allocated_dn(struct kmem_cgroup *sg)
>>> +{
>>> +       return&decnet_memory_allocated;
>>> +}
>>> +
>>> +static int *memory_pressure_dn(struct kmem_cgroup *sg)
>>> +{
>>> +       return&dn_memory_pressure;
>>> +}
>>> +
>>> +static long *dn_sysctl_mem(struct kmem_cgroup *sg)
>>> +{
>>> +       return sysctl_decnet_mem;
>>> +}
>>> +
>>>  static struct proto dn_proto = {
>>>        .name                   = "NSP",
>>>        .owner                  = THIS_MODULE,
>>>        .enter_memory_pressure  = dn_enter_memory_pressure,
>>> -       .memory_pressure        =&dn_memory_pressure,
>>> -       .memory_allocated       =&decnet_memory_allocated,
>>> -       .sysctl_mem             = sysctl_decnet_mem,
>>> +       .memory_pressure        = memory_pressure_dn,
>>> +       .memory_allocated       = memory_allocated_dn,
>>> +       .prot_mem               = dn_sysctl_mem,
>>>        .sysctl_wmem            = sysctl_decnet_wmem,
>>>        .sysctl_rmem            = sysctl_decnet_rmem,
>>>        .max_header             = DN_MAX_NSP_DATA_HEADER + 64,
>>> diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
>>> index b14ec7d..9c80acf 100644
>>> --- a/net/ipv4/proc.c
>>> +++ b/net/ipv4/proc.c
>>> @@ -52,20 +52,22 @@ static int sockstat_seq_show(struct seq_file *seq,
>>> void *v)
>>>  {
>>>        struct net *net = seq->private;
>>>        int orphans, sockets;
>>> +       struct kmem_cgroup *sg = kcg_from_task(current);
>>> +       struct percpu_counter *allocated =
>>> sg_sockets_allocated(&tcp_prot, sg);
>>>
>>>        local_bh_disable();
>>>        orphans = percpu_counter_sum_positive(&tcp_orphan_count);
>>> -       sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
>>> +       sockets = percpu_counter_sum_positive(allocated);
>>>        local_bh_enable();
>>>
>>>        socket_seq_show(seq);
>>>        seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem
>>> %ld\n",
>>>                   sock_prot_inuse_get(net,&tcp_prot), orphans,
>>>                   tcp_death_row.tw_count, sockets,
>>> -                  atomic_long_read(&tcp_memory_allocated));
>>> +                  atomic_long_read(sg_memory_allocated((&tcp_prot),
>>> sg)));
>>>        seq_printf(seq, "UDP: inuse %d mem %ld\n",
>>>                   sock_prot_inuse_get(net,&udp_prot),
>>> -                  atomic_long_read(&udp_memory_allocated));
>>> +                  atomic_long_read(sg_memory_allocated((&udp_prot),
>>> sg)));
>>>        seq_printf(seq, "UDPLITE: inuse %d\n",
>>>                   sock_prot_inuse_get(net,&udplite_prot));
>>>        seq_printf(seq, "RAW: inuse %d\n",
>>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>>> index 69fd720..5e89480 100644
>>> --- a/net/ipv4/sysctl_net_ipv4.c
>>> +++ b/net/ipv4/sysctl_net_ipv4.c
>>> @@ -14,6 +14,8 @@
>>>  #include<linux/init.h>
>>>  #include<linux/slab.h>
>>>  #include<linux/nsproxy.h>
>>> +#include<linux/kmem_cgroup.h>
>>> +#include<linux/swap.h>
>>>  #include<net/snmp.h>
>>>  #include<net/icmp.h>
>>>  #include<net/ip.h>
>>> @@ -174,6 +176,43 @@ static int proc_allowed_congestion_control(ctl_table
>>> *ctl,
>>>        return ret;
>>>  }
>>>
>>> +static int ipv4_tcp_mem(ctl_table *ctl, int write,
>>> +                          void __user *buffer, size_t *lenp,
>>> +                          loff_t *ppos)
>>> +{
>>> +       int ret;
>>> +       unsigned long vec[3];
>>> +       struct kmem_cgroup *kmem = kcg_from_task(current);
>>> +       struct net *net = current->nsproxy->net_ns;
>>> +       int i;
>>> +
>>> +       ctl_table tmp = {
>>> +               .data =&vec,
>>> +               .maxlen = sizeof(vec),
>>> +               .mode = ctl->mode,
>>> +       };
>>> +
>>> +       if (!write) {
>>> +               ctl->data =&net->ipv4.sysctl_tcp_mem;
>>> +               return proc_doulongvec_minmax(ctl, write, buffer, lenp,
>>> ppos);
>>> +       }
>>> +
>>> +       ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
>>> +       if (ret)
>>> +               return ret;
>>> +
>>> +       for (i = 0; i<  3; i++)
>>> +               if (vec[i]>  kmem->tcp_max_memory)
>>> +                       return -EINVAL;
>>> +
>>> +       for (i = 0; i<  3; i++) {
>>> +               net->ipv4.sysctl_tcp_mem[i] = vec[i];
>>> +               kmem->tcp_prot_mem[i] = net->ipv4.sysctl_tcp_mem[i];
>>> +       }
>>> +
>>> +       return 0;
>>> +}
>>> +
>>>  static struct ctl_table ipv4_table[] = {
>>>        {
>>>                .procname       = "tcp_timestamps",
>>> @@ -433,13 +472,6 @@ static struct ctl_table ipv4_table[] = {
>>>                .proc_handler   = proc_dointvec
>>>        },
>>>        {
>>> -               .procname       = "tcp_mem",
>>> -               .data           =&sysctl_tcp_mem,
>>> -               .maxlen         = sizeof(sysctl_tcp_mem),
>>> -               .mode           = 0644,
>>> -               .proc_handler   = proc_doulongvec_minmax
>>> -       },
>>> -       {
>>>                .procname       = "tcp_wmem",
>>>                .data           =&sysctl_tcp_wmem,
>>>                .maxlen         = sizeof(sysctl_tcp_wmem),
>>> @@ -721,6 +753,12 @@ static struct ctl_table ipv4_net_table[] = {
>>>                .mode           = 0644,
>>>                .proc_handler   = ipv4_ping_group_range,
>>>        },
>>> +       {
>>> +               .procname       = "tcp_mem",
>>> +               .maxlen         = sizeof(init_net.ipv4.sysctl_tcp_mem),
>>> +               .mode           = 0644,
>>> +               .proc_handler   = ipv4_tcp_mem,
>>> +       },
>>>        { }
>>>  };
>>>
>>> @@ -734,6 +772,7 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
>>>  static __net_init int ipv4_sysctl_init_net(struct net *net)
>>>  {
>>>        struct ctl_table *table;
>>> +       unsigned long limit;
>>>
>>>        table = ipv4_net_table;
>>>        if (!net_eq(net,&init_net)) {
>>> @@ -769,6 +808,12 @@ static __net_init int ipv4_sysctl_init_net(struct
>>> net *net)
>>>
>>>        net->ipv4.sysctl_rt_cache_rebuild_count = 4;
>>>
>>> +       limit = nr_free_buffer_pages() / 8;
>>> +       limit = max(limit, 128UL);
>>> +       net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
>>> +       net->ipv4.sysctl_tcp_mem[1] = limit;
>>> +       net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
>>> +
>>>        net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
>>>                        net_ipv4_ctl_path, table);
>>>        if (net->ipv4.ipv4_hdr == NULL)
>>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
>>> index 46febca..e1918fa 100644
>>> --- a/net/ipv4/tcp.c
>>> +++ b/net/ipv4/tcp.c
>>> @@ -266,6 +266,7 @@
>>>  #include<linux/crypto.h>
>>>  #include<linux/time.h>
>>>  #include<linux/slab.h>
>>> +#include<linux/nsproxy.h>
>>>
>>>  #include<net/icmp.h>
>>>  #include<net/tcp.h>
>>> @@ -282,23 +283,12 @@ int sysctl_tcp_fin_timeout __read_mostly =
>>> TCP_FIN_TIMEOUT;
>>>  struct percpu_counter tcp_orphan_count;
>>>  EXPORT_SYMBOL_GPL(tcp_orphan_count);
>>>
>>> -long sysctl_tcp_mem[3] __read_mostly;
>>>  int sysctl_tcp_wmem[3] __read_mostly;
>>>  int sysctl_tcp_rmem[3] __read_mostly;
>>>
>>> -EXPORT_SYMBOL(sysctl_tcp_mem);
>>>  EXPORT_SYMBOL(sysctl_tcp_rmem);
>>>  EXPORT_SYMBOL(sysctl_tcp_wmem);
>>>
>>> -atomic_long_t tcp_memory_allocated;    /* Current allocated memory. */
>>> -EXPORT_SYMBOL(tcp_memory_allocated);
>>> -
>>> -/*
>>> - * Current number of TCP sockets.
>>> - */
>>> -struct percpu_counter tcp_sockets_allocated;
>>> -EXPORT_SYMBOL(tcp_sockets_allocated);
>>> -
>>>  /*
>>>  * TCP splice context
>>>  */
>>> @@ -308,23 +298,157 @@ struct tcp_splice_state {
>>>        unsigned int flags;
>>>  };
>>>
>>> +#ifdef CONFIG_CGROUP_KMEM
>>>  /*
>>>  * Pressure flag: try to collapse.
>>>  * Technical note: it is used by multiple contexts non atomically.
>>>  * All the __sk_mem_schedule() is of this nature: accounting
>>>  * is strict, actions are advisory and have some latency.
>>>  */
>>> -int tcp_memory_pressure __read_mostly;
>>> -EXPORT_SYMBOL(tcp_memory_pressure);
>>> -
>>>  void tcp_enter_memory_pressure(struct sock *sk)
>>>  {
>>> +       struct kmem_cgroup *sg = sk->sk_cgrp;
>>> +       if (!sg->tcp_memory_pressure) {
>>> +               NET_INC_STATS(sock_net(sk),
>>> LINUX_MIB_TCPMEMORYPRESSURES);
>>> +               sg->tcp_memory_pressure = 1;
>>> +       }
>>> +}
>>> +
>>> +long *tcp_sysctl_mem(struct kmem_cgroup *sg)
>>> +{
>>> +       return sg->tcp_prot_mem;
>>> +}
>>> +
>>> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
>>> +{
>>> +       return&(sg->tcp_memory_allocated);
>>> +}
>>> +
>>> +static int tcp_write_maxmem(struct cgroup *cgrp, struct cftype *cft, u64
>>> val)
>>> +{
>>> +       struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
>>> +       struct net *net = current->nsproxy->net_ns;
>>> +       int i;
>>> +
>>> +       if (!cgroup_lock_live_group(cgrp))
>>> +               return -ENODEV;
>>> +
>>> +       /*
>>> +        * We can't allow more memory than our parents. Since this
>>> +        * will be tested for all calls, by induction, there is no need
>>> +        * to test any parent other than our own
>>> +        * */
>>> +       if (sg->parent&&  (val>  sg->parent->tcp_max_memory))
>>> +               val = sg->parent->tcp_max_memory;
>>> +
>>> +       sg->tcp_max_memory = val;
>>> +
>>> +       for (i = 0; i<  3; i++)
>>> +               sg->tcp_prot_mem[i]  = min_t(long, val,
>>> +
>>>  net->ipv4.sysctl_tcp_mem[i]);
>>> +
>>> +       cgroup_unlock();
>>> +
>>> +       return 0;
>>> +}
>>> +
>>> +static u64 tcp_read_maxmem(struct cgroup *cgrp, struct cftype *cft)
>>> +{
>>> +       struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
>>> +       u64 ret;
>>> +
>>> +       if (!cgroup_lock_live_group(cgrp))
>>> +               return -ENODEV;
>>> +       ret = sg->tcp_max_memory;
>>> +
>>> +       cgroup_unlock();
>>> +       return ret;
>>> +}
>>> +
>>> +static struct cftype tcp_files[] = {
>>> +       {
>>> +               .name = "tcp_maxmem",
>>> +               .write_u64 = tcp_write_maxmem,
>>> +               .read_u64 = tcp_read_maxmem,
>>> +       },
>>> +};
>>> +
>>> +int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
>>> +{
>>> +       struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
>>> +       unsigned long limit;
>>> +       struct net *net = current->nsproxy->net_ns;
>>> +
>>> +       sg->tcp_memory_pressure = 0;
>>> +       atomic_long_set(&sg->tcp_memory_allocated, 0);
>>> +       percpu_counter_init(&sg->tcp_sockets_allocated, 0);
>>> +
>>> +       limit = nr_free_buffer_pages() / 8;
>>> +       limit = max(limit, 128UL);
>>> +
>>> +       if (sg->parent)
>>> +               sg->tcp_max_memory = sg->parent->tcp_max_memory;
>>> +       else
>>> +               sg->tcp_max_memory = limit * 2;
>>> +
>>> +       sg->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
>>> +       sg->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
>>> +       sg->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
>>> +
>>> +       return cgroup_add_files(cgrp, ss, tcp_files,
>>> ARRAY_SIZE(tcp_files));
>>> +}
>>> +EXPORT_SYMBOL(tcp_init_cgroup);
>>> +
>>> +int *memory_pressure_tcp(struct kmem_cgroup *sg)
>>> +{
>>> +       return&sg->tcp_memory_pressure;
>>> +}
>>> +
>>> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg)
>>> +{
>>> +       return&sg->tcp_sockets_allocated;
>>> +}
>>> +#else
>>> +
>>> +/* Current number of TCP sockets. */
>>> +struct percpu_counter tcp_sockets_allocated;
>>> +atomic_long_t tcp_memory_allocated;    /* Current allocated memory. */
>>> +int tcp_memory_pressure;
>>> +
>>> +int *memory_pressure_tcp(struct kmem_cgroup *sg)
>>> +{
>>> +       return&tcp_memory_pressure;
>>> +}
>>> +
>>> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg)
>>> +{
>>> +       return&tcp_sockets_allocated;
>>> +}
>>> +
>>> +void tcp_enter_memory_pressure(struct sock *sock)
>>> +{
>>>        if (!tcp_memory_pressure) {
>>>                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
>>>                tcp_memory_pressure = 1;
>>>        }
>>>  }
>>> +
>>> +long *tcp_sysctl_mem(struct kmem_cgroup *sg)
>>> +{
>>> +       return init_net.ipv4.sysctl_tcp_mem;
>>> +}
>>> +
>>> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
>>> +{
>>> +       return&tcp_memory_allocated;
>>> +}
>>> +#endif /* CONFIG_CGROUP_KMEM */
>>> +
>>> +EXPORT_SYMBOL(memory_pressure_tcp);
>>> +EXPORT_SYMBOL(sockets_allocated_tcp);
>>>  EXPORT_SYMBOL(tcp_enter_memory_pressure);
>>> +EXPORT_SYMBOL(tcp_sysctl_mem);
>>> +EXPORT_SYMBOL(memory_allocated_tcp);
>>>
>>>  /* Convert seconds to retransmits based on initial and max timeout */
>>>  static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
>>> @@ -3226,7 +3350,9 @@ void __init tcp_init(void)
>>>
>>>        BUILD_BUG_ON(sizeof(struct tcp_skb_cb)>  sizeof(skb->cb));
>>>
>>> +#ifndef CONFIG_CGROUP_KMEM
>>>        percpu_counter_init(&tcp_sockets_allocated, 0);
>>> +#endif
>>>        percpu_counter_init(&tcp_orphan_count, 0);
>>>        tcp_hashinfo.bind_bucket_cachep =
>>>                kmem_cache_create("tcp_bind_bucket",
>>> @@ -3277,14 +3403,10 @@ void __init tcp_init(void)
>>>        sysctl_tcp_max_orphans = cnt / 2;
>>>        sysctl_max_syn_backlog = max(128, cnt / 256);
>>>
>>> -       limit = nr_free_buffer_pages() / 8;
>>> -       limit = max(limit, 128UL);
>>> -       sysctl_tcp_mem[0] = limit / 4 * 3;
>>> -       sysctl_tcp_mem[1] = limit;
>>> -       sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
>>> -
>>>        /* Set per-socket limits to no more than 1/128 the pressure
>>> threshold */
>>> -       limit = ((unsigned long)sysctl_tcp_mem[1])<<  (PAGE_SHIFT - 7);
>>> +       limit = (unsigned long)init_net.ipv4.sysctl_tcp_mem[1];
>>> +       limit<<= (PAGE_SHIFT - 7);
>>> +
>>>        max_share = min(4UL*1024*1024, limit);
>>>
>>>        sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
>>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>>> index ea0d218..c44e830 100644
>>> --- a/net/ipv4/tcp_input.c
>>> +++ b/net/ipv4/tcp_input.c
>>> @@ -316,7 +316,7 @@ static void tcp_grow_window(struct sock *sk, struct
>>> sk_buff *skb)
>>>        /* Check #1 */
>>>        if (tp->rcv_ssthresh<  tp->window_clamp&&
>>>            (int)tp->rcv_ssthresh<  tcp_space(sk)&&
>>> -           !tcp_memory_pressure) {
>>> +           !sk_memory_pressure(sk)) {
>>>                int incr;
>>>
>>>                /* Check #2. Increase window, if skb with such overhead
>>> @@ -393,15 +393,16 @@ static void tcp_clamp_window(struct sock *sk)
>>>  {
>>>        struct tcp_sock *tp = tcp_sk(sk);
>>>        struct inet_connection_sock *icsk = inet_csk(sk);
>>> +       struct proto *prot = sk->sk_prot;
>>>
>>>        icsk->icsk_ack.quick = 0;
>>>
>>> -       if (sk->sk_rcvbuf<  sysctl_tcp_rmem[2]&&
>>> +       if (sk->sk_rcvbuf<  prot->sysctl_rmem[2]&&
>>>            !(sk->sk_userlocks&  SOCK_RCVBUF_LOCK)&&
>>> -           !tcp_memory_pressure&&
>>> -           atomic_long_read(&tcp_memory_allocated)<  sysctl_tcp_mem[0])
>>> {
>>> +           !sk_memory_pressure(sk)&&
>>> +           atomic_long_read(sk_memory_allocated(sk))<
>>>  sk_prot_mem(sk)[0]) {
>>>                sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
>>> -                                   sysctl_tcp_rmem[2]);
>>> +                                   prot->sysctl_rmem[2]);
>>>        }
>>>        if (atomic_read(&sk->sk_rmem_alloc)>  sk->sk_rcvbuf)
>>>                tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
>>> @@ -4806,7 +4807,7 @@ static int tcp_prune_queue(struct sock *sk)
>>>
>>>        if (atomic_read(&sk->sk_rmem_alloc)>= sk->sk_rcvbuf)
>>>                tcp_clamp_window(sk);
>>> -       else if (tcp_memory_pressure)
>>> +       else if (sk_memory_pressure(sk))
>>>                tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
>>>
>>>        tcp_collapse_ofo_queue(sk);
>>> @@ -4872,11 +4873,11 @@ static int tcp_should_expand_sndbuf(struct sock
>>> *sk)
>>>                return 0;
>>>
>>>        /* If we are under global TCP memory pressure, do not expand.  */
>>> -       if (tcp_memory_pressure)
>>> +       if (sk_memory_pressure(sk))
>>>                return 0;
>>>
>>>        /* If we are under soft global TCP memory pressure, do not expand.
>>>  */
>>> -       if (atomic_long_read(&tcp_memory_allocated)>= sysctl_tcp_mem[0])
>>> +       if (atomic_long_read(sk_memory_allocated(sk))>=
>>> sk_prot_mem(sk)[0])
>>>                return 0;
>>>
>>>        /* If we filled the congestion window, do not expand.  */
>>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>>> index 1c12b8e..af6c095 100644
>>> --- a/net/ipv4/tcp_ipv4.c
>>> +++ b/net/ipv4/tcp_ipv4.c
>>> @@ -1848,6 +1848,7 @@ static int tcp_v4_init_sock(struct sock *sk)
>>>  {
>>>        struct inet_connection_sock *icsk = inet_csk(sk);
>>>        struct tcp_sock *tp = tcp_sk(sk);
>>> +       struct kmem_cgroup *sg;
>>>
>>>        skb_queue_head_init(&tp->out_of_order_queue);
>>>        tcp_init_xmit_timers(sk);
>>> @@ -1901,7 +1902,13 @@ static int tcp_v4_init_sock(struct sock *sk)
>>>        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
>>>
>>>        local_bh_disable();
>>> -       percpu_counter_inc(&tcp_sockets_allocated);
>>> +       percpu_counter_inc(sk_sockets_allocated(sk));
>>> +
>>> +#ifdef CONFIG_CGROUP_KMEM
>>> +       for (sg = sk->sk_cgrp->parent; sg; sg = sg->parent)
>>> +               percpu_counter_inc(sg_sockets_allocated(sk->sk_prot,
>>> sg));
>>> +#endif
>>> +
>>>        local_bh_enable();
>>>
>>>        return 0;
>>> @@ -1910,6 +1917,7 @@ static int tcp_v4_init_sock(struct sock *sk)
>>>  void tcp_v4_destroy_sock(struct sock *sk)
>>>  {
>>>        struct tcp_sock *tp = tcp_sk(sk);
>>> +       struct kmem_cgroup *sg;
>>>
>>>        tcp_clear_xmit_timers(sk);
>>>
>>> @@ -1957,7 +1965,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
>>>                tp->cookie_values = NULL;
>>>        }
>>>
>>> -       percpu_counter_dec(&tcp_sockets_allocated);
>>> +       percpu_counter_dec(sk_sockets_allocated(sk));
>>> +#ifdef CONFIG_CGROUP_KMEM
>>> +       for (sg = sk->sk_cgrp->parent; sg; sg = sg->parent)
>>> +               percpu_counter_dec(sg_sockets_allocated(sk->sk_prot,
>>> sg));
>>> +#endif
>>>  }
>>>  EXPORT_SYMBOL(tcp_v4_destroy_sock);
>>>
>>> @@ -2598,11 +2610,14 @@ struct proto tcp_prot = {
>>>        .unhash                 = inet_unhash,
>>>        .get_port               = inet_csk_get_port,
>>>        .enter_memory_pressure  = tcp_enter_memory_pressure,
>>> -       .sockets_allocated      =&tcp_sockets_allocated,
>>> +       .memory_pressure        = memory_pressure_tcp,
>>> +       .sockets_allocated      = sockets_allocated_tcp,
>>>        .orphan_count           =&tcp_orphan_count,
>>> -       .memory_allocated       =&tcp_memory_allocated,
>>> -       .memory_pressure        =&tcp_memory_pressure,
>>> -       .sysctl_mem             = sysctl_tcp_mem,
>>> +       .memory_allocated       = memory_allocated_tcp,
>>> +#ifdef CONFIG_CGROUP_KMEM
>>> +       .init_cgroup            = tcp_init_cgroup,
>>> +#endif
>>> +       .prot_mem               = tcp_sysctl_mem,
>>>        .sysctl_wmem            = sysctl_tcp_wmem,
>>>        .sysctl_rmem            = sysctl_tcp_rmem,
>>>        .max_header             = MAX_TCP_HEADER,
>>> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
>>> index 882e0b0..06aeb31 100644
>>> --- a/net/ipv4/tcp_output.c
>>> +++ b/net/ipv4/tcp_output.c
>>> @@ -1912,7 +1912,7 @@ u32 __tcp_select_window(struct sock *sk)
>>>        if (free_space<  (full_space>>  1)) {
>>>                icsk->icsk_ack.quick = 0;
>>>
>>> -               if (tcp_memory_pressure)
>>> +               if (sk_memory_pressure(sk))
>>>                        tp->rcv_ssthresh = min(tp->rcv_ssthresh,
>>>                                               4U * tp->advmss);
>>>
>>> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
>>> index ecd44b0..2c67617 100644
>>> --- a/net/ipv4/tcp_timer.c
>>> +++ b/net/ipv4/tcp_timer.c
>>> @@ -261,7 +261,7 @@ static void tcp_delack_timer(unsigned long data)
>>>        }
>>>
>>>  out:
>>> -       if (tcp_memory_pressure)
>>> +       if (sk_memory_pressure(sk))
>>>                sk_mem_reclaim(sk);
>>>  out_unlock:
>>>        bh_unlock_sock(sk);
>>> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
>>> index 1b5a193..6c08c65 100644
>>> --- a/net/ipv4/udp.c
>>> +++ b/net/ipv4/udp.c
>>> @@ -120,9 +120,6 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min);
>>>  int sysctl_udp_wmem_min __read_mostly;
>>>  EXPORT_SYMBOL(sysctl_udp_wmem_min);
>>>
>>> -atomic_long_t udp_memory_allocated;
>>> -EXPORT_SYMBOL(udp_memory_allocated);
>>> -
>>>  #define MAX_UDP_PORTS 65536
>>>  #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
>>>
>>> @@ -1918,6 +1915,19 @@ unsigned int udp_poll(struct file *file, struct
>>> socket *sock, poll_table *wait)
>>>  }
>>>  EXPORT_SYMBOL(udp_poll);
>>>
>>> +static atomic_long_t udp_memory_allocated;
>>> +atomic_long_t *memory_allocated_udp(struct kmem_cgroup *sg)
>>> +{
>>> +       return&udp_memory_allocated;
>>> +}
>>> +EXPORT_SYMBOL(memory_allocated_udp);
>>> +
>>> +long *udp_sysctl_mem(struct kmem_cgroup *sg)
>>> +{
>>> +       return sysctl_udp_mem;
>>> +}
>>> +EXPORT_SYMBOL(udp_sysctl_mem);
>>> +
>>>  struct proto udp_prot = {
>>>        .name              = "UDP",
>>>        .owner             = THIS_MODULE,
>>> @@ -1936,8 +1946,8 @@ struct proto udp_prot = {
>>>        .unhash            = udp_lib_unhash,
>>>        .rehash            = udp_v4_rehash,
>>>        .get_port          = udp_v4_get_port,
>>> -       .memory_allocated  =&udp_memory_allocated,
>>> -       .sysctl_mem        = sysctl_udp_mem,
>>> +       .memory_allocated  =&memory_allocated_udp,
>>> +       .prot_mem          = udp_sysctl_mem,
>>>        .sysctl_wmem       =&sysctl_udp_wmem_min,
>>>        .sysctl_rmem       =&sysctl_udp_rmem_min,
>>>        .obj_size          = sizeof(struct udp_sock),
>>> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
>>> index d1fb63f..0762e68 100644
>>> --- a/net/ipv6/tcp_ipv6.c
>>> +++ b/net/ipv6/tcp_ipv6.c
>>> @@ -1959,6 +1959,7 @@ static int tcp_v6_init_sock(struct sock *sk)
>>>  {
>>>        struct inet_connection_sock *icsk = inet_csk(sk);
>>>        struct tcp_sock *tp = tcp_sk(sk);
>>> +       struct kmem_cgroup *sg;
>>>
>>>        skb_queue_head_init(&tp->out_of_order_queue);
>>>        tcp_init_xmit_timers(sk);
>>> @@ -2012,7 +2013,12 @@ static int tcp_v6_init_sock(struct sock *sk)
>>>        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
>>>
>>>        local_bh_disable();
>>> -       percpu_counter_inc(&tcp_sockets_allocated);
>>> +       percpu_counter_inc(sk_sockets_allocated(sk));
>>> +#ifdef CONFIG_CGROUP_KMEM
>>> +       for (sg = sk->sk_cgrp->parent; sg; sg = sg->parent)
>>> +               percpu_counter_dec(sg_sockets_allocated(sk->sk_prot,
>>> sg));
>>> +#endif
>>> +
>>>        local_bh_enable();
>>>
>>>        return 0;
>>> @@ -2221,11 +2227,11 @@ struct proto tcpv6_prot = {
>>>        .unhash                 = inet_unhash,
>>>        .get_port               = inet_csk_get_port,
>>>        .enter_memory_pressure  = tcp_enter_memory_pressure,
>>> -       .sockets_allocated      =&tcp_sockets_allocated,
>>> -       .memory_allocated       =&tcp_memory_allocated,
>>> -       .memory_pressure        =&tcp_memory_pressure,
>>> +       .sockets_allocated      = sockets_allocated_tcp,
>>> +       .memory_allocated       = memory_allocated_tcp,
>>> +       .memory_pressure        = memory_pressure_tcp,
>>>        .orphan_count           =&tcp_orphan_count,
>>> -       .sysctl_mem             = sysctl_tcp_mem,
>>> +       .prot_mem               = tcp_sysctl_mem,
>>>        .sysctl_wmem            = sysctl_tcp_wmem,
>>>        .sysctl_rmem            = sysctl_tcp_rmem,
>>>        .max_header             = MAX_TCP_HEADER,
>>> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
>>> index 29213b5..ef4b5b3 100644
>>> --- a/net/ipv6/udp.c
>>> +++ b/net/ipv6/udp.c
>>> @@ -1465,8 +1465,8 @@ struct proto udpv6_prot = {
>>>        .unhash            = udp_lib_unhash,
>>>        .rehash            = udp_v6_rehash,
>>>        .get_port          = udp_v6_get_port,
>>> -       .memory_allocated  =&udp_memory_allocated,
>>> -       .sysctl_mem        = sysctl_udp_mem,
>>> +       .memory_allocated  = memory_allocated_udp,
>>> +       .prot_mem          = udp_sysctl_mem,
>>>        .sysctl_wmem       =&sysctl_udp_wmem_min,
>>>        .sysctl_rmem       =&sysctl_udp_rmem_min,
>>>        .obj_size          = sizeof(struct udp6_sock),
>>> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
>>> index 836aa63..1b0300d 100644
>>> --- a/net/sctp/socket.c
>>> +++ b/net/sctp/socket.c
>>> @@ -119,11 +119,30 @@ static int sctp_memory_pressure;
>>>  static atomic_long_t sctp_memory_allocated;
>>>  struct percpu_counter sctp_sockets_allocated;
>>>
>>> +static long *sctp_sysctl_mem(struct kmem_cgroup *sg)
>>> +{
>>> +       return sysctl_sctp_mem;
>>> +}
>>> +
>>>  static void sctp_enter_memory_pressure(struct sock *sk)
>>>  {
>>>        sctp_memory_pressure = 1;
>>>  }
>>>
>>> +static int *memory_pressure_sctp(struct kmem_cgroup *sg)
>>> +{
>>> +       return&sctp_memory_pressure;
>>> +}
>>> +
>>> +static atomic_long_t *memory_allocated_sctp(struct kmem_cgroup *sg)
>>> +{
>>> +       return&sctp_memory_allocated;
>>> +}
>>> +
>>> +static struct percpu_counter *sockets_allocated_sctp(struct kmem_cgroup
>>> *sg)
>>> +{
>>> +       return&sctp_sockets_allocated;
>>> +}
>>>
>>>  /* Get the sndbuf space available at the time on the association.  */
>>>  static inline int sctp_wspace(struct sctp_association *asoc)
>>> @@ -6831,13 +6850,13 @@ struct proto sctp_prot = {
>>>        .unhash      =  sctp_unhash,
>>>        .get_port    =  sctp_get_port,
>>>        .obj_size    =  sizeof(struct sctp_sock),
>>> -       .sysctl_mem  =  sysctl_sctp_mem,
>>> +       .prot_mem    =  sctp_sysctl_mem,
>>>        .sysctl_rmem =  sysctl_sctp_rmem,
>>>        .sysctl_wmem =  sysctl_sctp_wmem,
>>> -       .memory_pressure =&sctp_memory_pressure,
>>> +       .memory_pressure = memory_pressure_sctp,
>>>        .enter_memory_pressure = sctp_enter_memory_pressure,
>>> -       .memory_allocated =&sctp_memory_allocated,
>>> -       .sockets_allocated =&sctp_sockets_allocated,
>>> +       .memory_allocated = memory_allocated_sctp,
>>> +       .sockets_allocated = sockets_allocated_sctp,
>>>  };
>>>
>>>  #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
>>> @@ -6863,12 +6882,12 @@ struct proto sctpv6_prot = {
>>>        .unhash         = sctp_unhash,
>>>        .get_port       = sctp_get_port,
>>>        .obj_size       = sizeof(struct sctp6_sock),
>>> -       .sysctl_mem     = sysctl_sctp_mem,
>>> +       .prot_mem       = sctp_sysctl_mem,
>>>        .sysctl_rmem    = sysctl_sctp_rmem,
>>>        .sysctl_wmem    = sysctl_sctp_wmem,
>>> -       .memory_pressure =&sctp_memory_pressure,
>>> +       .memory_pressure = memory_pressure_sctp,
>>>        .enter_memory_pressure = sctp_enter_memory_pressure,
>>> -       .memory_allocated =&sctp_memory_allocated,
>>> -       .sockets_allocated =&sctp_sockets_allocated,
>>> +       .memory_allocated = memory_allocated_sctp,
>>> +       .sockets_allocated = sockets_allocated_sctp,
>>>  };
>>>  #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
>>> --
>>> 1.7.6
>>>
>
>
Glauber Costa Sept. 6, 2011, 10:37 p.m. UTC | #6
On 09/06/2011 07:12 PM, Greg Thelen wrote:
> On Tue, Sep 6, 2011 at 9:16 AM, Glauber Costa<glommer@parallels.com>  wrote:
>> On 09/06/2011 01:08 PM, Greg Thelen wrote:
>>>
>>> On Mon, Sep 5, 2011 at 7:35 PM, Glauber Costa<glommer@parallels.com>
>>>   wrote:
>>>>
>>>> This patch introduces per-cgroup tcp buffers limitation. This allows
>>>> sysadmins to specify a maximum amount of kernel memory that
>>>> tcp connections can use at any point in time. TCP is the main interest
>>>> in this work, but extending it to other protocols would be easy.
>>
>> Hello Greg,
>>
>>> With this approach we would be giving admins the ability to
>>> independently limit user memory with memcg and kernel memory with this
>>> new kmem cgroup.
>>>
>>> At least in some situations admins prefer to give a particular
>>> container X bytes without thinking about the kernel vs user split.
>>> Sometimes the admin would prefer the kernel to keep the total
>>> user+kernel memory below a certain threshold.  To achieve this with
>>> this approach would we need a user space agent to monitor both kernel
>>> and user usage for a container and grow/shrink memcg/kmem limits?
>>
>> Yes, I believe so. And this is not only valid for containers: the
>> information we expose in proc, sys, cgroups, etc, is always much more fine
>> grained than a considerable part of the users want. Tools come to fill this
>> gap.
>
> In your use cases do jobs separately specify independent kmem usage
> limits and user memory usage limits?

Yes, because they are different in nature: user memory can be
overcommitted, while kernel memory is pinned by its objects and can't go
to swap.

> I presume for people who want to simply dedicate X bytes of memory to
> container C that a user-space agent would need to poll both
> memcg/X/memory.usage_in_bytes and kmem/X/kmem.usage_in_bytes (or some
> other file) to determine if memory limits should be adjusted (i.e. if
> kernel memory is growing, then user memory would need to shrink).
Ok.

I think memcg's usage is really all you need here. At the end of the
day, it tells you how many pages your container has available. The whole
point of the kmem cgroup is not reservation or accounting of any kind.

Once a container (or cgroup) reaches a number of objects *pinned* in 
memory (therefore, non-reclaimable), you won't be able to grab anything 
from it.
> So
> far my use cases involve a single memory limit which includes both
> kernel and user memory.  So I would need a user space agent to poll
> {memcg,kmem}.usage_in_bytes to apply pressure to memcg if kmem grows
> and vice versa.

Maybe not.
If userspace memory works for you today (supposing it does), why change?
Right now you assign X bytes of user memory to a container, and the 
kernel memory is shared among all of them. If this works for you, 
kmem_cgroup won't change that. It just will impose limits over which
your kernel objects can't grow.

So you don't *need* a userspace agent doing this calculation, because 
fundamentally, nothing changed: I am not unbilling memory in memcg to 
bill it back in kmem_cg. Of course, once it is in, you will be able to 
do it in such a fine grained fashion if you decide to do so.

> Do you foresee instantiation of multiple kmem cgroups, so that a
> process could be added into kmem/K1 or kmem/K2?  If so do you plan on
> supporting migration between cgroups and/or migration of kmem charge
> between K1 to K2?
Yes, each container should have its own cgroup, so at least in the use
cases I am concerned, we will have a lot of them. But the usual 
lifecycle, is create, execute and die. Mobility between them
is not something I am overly concerned right now.


>>> Do you foresee the kmem cgroup growing to include reclaimable slab,
>>> where freeing one type of memory allows for reclaim of the other?
>>
>> Yes, absolutely.
>
> Small comments below.
>

>>>>   };
>>>>
>>>> +#define sk_memory_pressure(sk)                                         \
>>>> +({                                                                     \
>>>> +       int *__ret = NULL;                                              \
>>>> +       if ((sk)->sk_prot->memory_pressure)                             \
>>>> +               __ret = (sk)->sk_prot->memory_pressure(sk->sk_cgrp);    \
>>>> +       __ret;                                                          \
>>>> +})
>>>> +
>>>> +#define sk_sockets_allocated(sk)                               \
>>>> +({                                                             \
>>>> +       struct percpu_counter *__p;                             \
>>>> +       __p = (sk)->sk_prot->sockets_allocated(sk->sk_cgrp);    \
>>>> +       __p;                                                    \
>>>> +})
>
> Could this be simplified as (same applies to following few macros):
>
> static inline struct percpu_counter *sk_sockets_allocated(struct sock *sk)
> {
>          return sk->sk_prot->sockets_allocated(sk->sk_cgrp);
> }
Yes and no. Right now, I need them to be valid lvalues. But in the 
upcoming version of the patch, I will drop this requirement. Then
I will move to inline functions.
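
(For illustration, a minimal sketch of the inline-function form being
discussed, assuming the lvalue requirement is indeed dropped in the next
version; this is not part of the posted patch:)

static inline int *sk_memory_pressure(struct sock *sk)
{
	/* not every protocol has a pressure flag; callers must handle NULL */
	if (!sk->sk_prot->memory_pressure)
		return NULL;
	return sk->sk_prot->memory_pressure(sk->sk_cgrp);
}

static inline struct percpu_counter *sk_sockets_allocated(struct sock *sk)
{
	return sk->sk_prot->sockets_allocated(sk->sk_cgrp);
}

static inline long *sk_prot_mem(struct sock *sk)
{
	return sk->sk_prot->prot_mem(sk->sk_cgrp);
}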

Glauber Costa Sept. 6, 2011, 10:52 p.m. UTC | #7
On 09/06/2011 07:00 AM, KAMEZAWA Hiroyuki wrote:
> On Mon,  5 Sep 2011 23:35:56 -0300
> Glauber Costa<glommer@parallels.com>  wrote:
>
>> This patch introduces per-cgroup tcp buffers limitation. This allows
>> sysadmins to specify a maximum amount of kernel memory that
>> tcp connections can use at any point in time. TCP is the main interest
>> in this work, but extending it to other protocols would be easy.
>>
>> It piggybacks in the memory control mechanism already present in
>> /proc/sys/net/ipv4/tcp_mem. There is a soft limit, and a hard limit,
>> that will suppress allocation when reached. For each cgroup, however,
>> the file kmem.tcp_maxmem will be used to cap those values.
>>
>> The usage I have in mind here is containers. Each container will
>> define its own values for soft and hard limits, but none of them will
>> be possibly bigger than the value the box' sysadmin specified from
>> the outside.
>>
>> To test for any performance impacts of this patch, I used netperf's
>> TCP_RR benchmark on localhost, so we can have both recv and snd in action.
>>
>> Command line used was ./src/netperf -t TCP_RR -H localhost, and the
>> results:
>>
>> Without the patch
>> =================
>>
>> Socket Size   Request  Resp.   Elapsed  Trans.
>> Send   Recv   Size     Size    Time     Rate
>> bytes  Bytes  bytes    bytes   secs.    per sec
>>
>> 16384  87380  1        1       10.00    26996.35
>> 16384  87380
>>
>> With the patch
>> ===============
>>
>> Local /Remote
>> Socket Size   Request  Resp.   Elapsed  Trans.
>> Send   Recv   Size     Size    Time     Rate
>> bytes  Bytes  bytes    bytes   secs.    per sec
>>
>> 16384  87380  1        1       10.00    27291.86
>> 16384  87380
>>
>> The difference is within a one-percent range.
>>
>> Nesting cgroups doesn't seem to be the dominating factor as well,
>> with nestings up to 10 levels not showing a significant performance
>> difference.
>>
>> Signed-off-by: Glauber Costa<glommer@parallels.com>
>> CC: David S. Miller<davem@davemloft.net>
>> CC: Hiroyouki Kamezawa<kamezawa.hiroyu@jp.fujitsu.com>
>> CC: Eric W. Biederman<ebiederm@xmission.com>
>> ---
>>   crypto/af_alg.c               |    7 ++-
>>   include/linux/cgroup_subsys.h |    4 +
>>   include/net/netns/ipv4.h      |    1 +
>>   include/net/sock.h            |   66 +++++++++++++++-
>>   include/net/tcp.h             |   12 ++-
>>   include/net/udp.h             |    3 +-
>>   include/trace/events/sock.h   |   10 +-
>>   init/Kconfig                  |   11 +++
>>   mm/Makefile                   |    1 +
>>   net/core/sock.c               |  136 +++++++++++++++++++++++++++-------
>>   net/decnet/af_decnet.c        |   21 +++++-
>>   net/ipv4/proc.c               |    8 +-
>>   net/ipv4/sysctl_net_ipv4.c    |   59 +++++++++++++--
>>   net/ipv4/tcp.c                |  164 +++++++++++++++++++++++++++++++++++-----
>>   net/ipv4/tcp_input.c          |   17 ++--
>>   net/ipv4/tcp_ipv4.c           |   27 +++++--
>>   net/ipv4/tcp_output.c         |    2 +-
>>   net/ipv4/tcp_timer.c          |    2 +-
>>   net/ipv4/udp.c                |   20 ++++-
>>   net/ipv6/tcp_ipv6.c           |   16 +++-
>>   net/ipv6/udp.c                |    4 +-
>>   net/sctp/socket.c             |   35 +++++++--
>>   22 files changed, 514 insertions(+), 112 deletions(-)
>
> Hmm...could you please devide patches into a few patches ?
>
> If I was you, I'll devide the patches into
>
>   - Kconfig/Makefile/kmem_cgroup.c skelton.
>   - changes to struct sock and macro definition
>   - hooks to tcp.
>   - hooks to udp
>   - hooks to sctp

BTW: One thing to keep in mind is that the hooks can't go
entirely in separate patches. Most of them have to go in the same
patch in which we change struct sock. Otherwise bisectability is
gone.

> And why not including mm/kmem_cgroup.c ?
>
> some comments below.
>
>
>>
>> diff --git a/crypto/af_alg.c b/crypto/af_alg.c
>> index ac33d5f..df168d8 100644
>> --- a/crypto/af_alg.c
>> +++ b/crypto/af_alg.c
>> @@ -29,10 +29,15 @@ struct alg_type_list {
>>
>>   static atomic_long_t alg_memory_allocated;
>>
>> +static atomic_long_t *memory_allocated_alg(struct kmem_cgroup *sg)
>> +{
>> +	return&alg_memory_allocated;
>> +}
>> +
>>   static struct proto alg_proto = {
>>   	.name			= "ALG",
>>   	.owner			= THIS_MODULE,
>> -	.memory_allocated	=&alg_memory_allocated,
>> +	.memory_allocated	= memory_allocated_alg,
>>   	.obj_size		= sizeof(struct alg_sock),
>>   };
>>
>> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
>> index ac663c1..363b8e8 100644
>> --- a/include/linux/cgroup_subsys.h
>> +++ b/include/linux/cgroup_subsys.h
>> @@ -35,6 +35,10 @@ SUBSYS(cpuacct)
>>   SUBSYS(mem_cgroup)
>>   #endif
>>
>> +#ifdef CONFIG_CGROUP_KMEM
>> +SUBSYS(kmem)
>> +#endif
>> +
>>   /* */
>>
>>   #ifdef CONFIG_CGROUP_DEVICE
>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>> index d786b4f..bbd023a 100644
>> --- a/include/net/netns/ipv4.h
>> +++ b/include/net/netns/ipv4.h
>> @@ -55,6 +55,7 @@ struct netns_ipv4 {
>>   	int current_rt_cache_rebuild_count;
>>
>>   	unsigned int sysctl_ping_group_range[2];
>> +	long sysctl_tcp_mem[3];
>>
>>   	atomic_t rt_genid;
>>   	atomic_t dev_addr_genid;
>> diff --git a/include/net/sock.h b/include/net/sock.h
>> index 8e4062f..e085148 100644
>> --- a/include/net/sock.h
>> +++ b/include/net/sock.h
>> @@ -62,7 +62,9 @@
>>   #include<linux/atomic.h>
>>   #include<net/dst.h>
>>   #include<net/checksum.h>
>> +#include<linux/kmem_cgroup.h>
>>
>> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
>>   /*
>>    * This structure really needs to be cleaned up.
>>    * Most of it is for TCP, and not used by any of
>> @@ -339,6 +341,7 @@ struct sock {
>>   #endif
>>   	__u32			sk_mark;
>>   	u32			sk_classid;
>> +	struct kmem_cgroup	*sk_cgrp;
>>   	void			(*sk_state_change)(struct sock *sk);
>>   	void			(*sk_data_ready)(struct sock *sk, int bytes);
>>   	void			(*sk_write_space)(struct sock *sk);
>> @@ -786,16 +789,21 @@ struct proto {
>>
>>   	/* Memory pressure */
>>   	void			(*enter_memory_pressure)(struct sock *sk);
>> -	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
>> -	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
>> +	/* Current allocated memory. */
>> +	atomic_long_t		*(*memory_allocated)(struct kmem_cgroup *sg);
>> +	/* Current number of sockets. */
>> +	struct percpu_counter	*(*sockets_allocated)(struct kmem_cgroup *sg);
>> +
>> +	int			(*init_cgroup)(struct cgroup *cgrp,
>> +					       struct cgroup_subsys *ss);
>>   	/*
>>   	 * Pressure flag: try to collapse.
>>   	 * Technical note: it is used by multiple contexts non atomically.
>>   	 * All the __sk_mem_schedule() is of this nature: accounting
>>   	 * is strict, actions are advisory and have some latency.
>>   	 */
>> -	int			*memory_pressure;
>> -	long			*sysctl_mem;
>> +	int			*(*memory_pressure)(struct kmem_cgroup *sg);
>> +	long			*(*prot_mem)(struct kmem_cgroup *sg);
>
> Hmm. Socket interface callbacks doesn't have documentation ?
> Adding explanation in Documenation is better, isn't it ?
>
>
>>   	int			*sysctl_wmem;
>>   	int			*sysctl_rmem;
>>   	int			max_header;
>> @@ -826,6 +834,56 @@ struct proto {
>>   #endif
>>   };
>>
>> +#define sk_memory_pressure(sk)						\
>> +({									\
>> +	int *__ret = NULL;						\
>> +	if ((sk)->sk_prot->memory_pressure)				\
>> +		__ret = (sk)->sk_prot->memory_pressure(sk->sk_cgrp);	\
>> +	__ret;								\
>> +})
>> +
>> +#define sk_sockets_allocated(sk)				\
>> +({								\
>> +	struct percpu_counter *__p;				\
>> +	__p = (sk)->sk_prot->sockets_allocated(sk->sk_cgrp);	\
>> +	__p;							\
>> +})
>> +
>> +#define sk_memory_allocated(sk)					\
>> +({								\
>> +	atomic_long_t *__mem;					\
>> +	__mem = (sk)->sk_prot->memory_allocated(sk->sk_cgrp);	\
>> +	__mem;							\
>> +})
>> +
>> +#define sk_prot_mem(sk)						\
>> +({								\
>> +	long *__mem = (sk)->sk_prot->prot_mem(sk->sk_cgrp);	\
>> +	__mem;							\
>> +})
>> +
>> +#define sg_memory_pressure(prot, sg)				\
>> +({								\
>> +	int *__ret = NULL;					\
>> +	if (prot->memory_pressure)				\
>> +		__ret = (prot)->memory_pressure(sg);		\
>> +	__ret;							\
>> +})
>> +
>> +#define sg_memory_allocated(prot, sg)				\
>> +({								\
>> +	atomic_long_t *__mem;					\
>> +	__mem = (prot)->memory_allocated(sg);			\
>> +	__mem;							\
>> +})
>> +
>> +#define sg_sockets_allocated(prot, sg)				\
>> +({								\
>> +	struct percpu_counter *__p;				\
>> +	__p = (prot)->sockets_allocated(sg);			\
>> +	__p;							\
>> +})
>> +
>
> All functions are worth to be inlined ?
>
>
>
>>   extern int proto_register(struct proto *prot, int alloc_slab);
>>   extern void proto_unregister(struct proto *prot);
>>
>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>> index 149a415..8e1ec4a 100644
>> --- a/include/net/tcp.h
>> +++ b/include/net/tcp.h
>> @@ -230,7 +230,6 @@ extern int sysctl_tcp_fack;
>>   extern int sysctl_tcp_reordering;
>>   extern int sysctl_tcp_ecn;
>>   extern int sysctl_tcp_dsack;
>> -extern long sysctl_tcp_mem[3];
>>   extern int sysctl_tcp_wmem[3];
>>   extern int sysctl_tcp_rmem[3];
>>   extern int sysctl_tcp_app_win;
>> @@ -253,9 +252,12 @@ extern int sysctl_tcp_cookie_size;
>>   extern int sysctl_tcp_thin_linear_timeouts;
>>   extern int sysctl_tcp_thin_dupack;
>>
>> -extern atomic_long_t tcp_memory_allocated;
>> -extern struct percpu_counter tcp_sockets_allocated;
>> -extern int tcp_memory_pressure;
>> +struct kmem_cgroup;
>> +extern long *tcp_sysctl_mem(struct kmem_cgroup *sg);
>> +struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg);
>> +int *memory_pressure_tcp(struct kmem_cgroup *sg);
>> +int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss);
>> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg);
>>
>>   /*
>>    * The next routines deal with comparing 32 bit unsigned ints
>> @@ -286,7 +288,7 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
>>   	}
>>
>>   	if (sk->sk_wmem_queued>  SOCK_MIN_SNDBUF&&
>> -	    atomic_long_read(&tcp_memory_allocated)>  sysctl_tcp_mem[2])
>> +	    atomic_long_read(sk_memory_allocated(sk))>  sk_prot_mem(sk)[2])
>
> Why not sk_memory_allocated() returns the value ?
> Is it required to return pointer ?
>
>
>>   		return true;
>>   	return false;
>>   }
>> diff --git a/include/net/udp.h b/include/net/udp.h
>> index 67ea6fc..0e27388 100644
>> --- a/include/net/udp.h
>> +++ b/include/net/udp.h
>> @@ -105,7 +105,8 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
>>
>>   extern struct proto udp_prot;
>>
>> -extern atomic_long_t udp_memory_allocated;
>> +atomic_long_t *memory_allocated_udp(struct kmem_cgroup *sg);
>> +long *udp_sysctl_mem(struct kmem_cgroup *sg);
>>
>>   /* sysctl variables for udp */
>>   extern long sysctl_udp_mem[3];
>> diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
>> index 779abb9..12a6083 100644
>> --- a/include/trace/events/sock.h
>> +++ b/include/trace/events/sock.h
>> @@ -37,7 +37,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
>>
>>   	TP_STRUCT__entry(
>>   		__array(char, name, 32)
>> -		__field(long *, sysctl_mem)
>> +		__field(long *, prot_mem)
>>   		__field(long, allocated)
>>   		__field(int, sysctl_rmem)
>>   		__field(int, rmem_alloc)
>> @@ -45,7 +45,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
>>
>>   	TP_fast_assign(
>>   		strncpy(__entry->name, prot->name, 32);
>> -		__entry->sysctl_mem = prot->sysctl_mem;
>> +		__entry->prot_mem = sk->sk_prot->prot_mem(sk->sk_cgrp);
>>   		__entry->allocated = allocated;
>>   		__entry->sysctl_rmem = prot->sysctl_rmem[0];
>>   		__entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
>> @@ -54,9 +54,9 @@ TRACE_EVENT(sock_exceed_buf_limit,
>>   	TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld "
>>   		"sysctl_rmem=%d rmem_alloc=%d",
>>   		__entry->name,
>> -		__entry->sysctl_mem[0],
>> -		__entry->sysctl_mem[1],
>> -		__entry->sysctl_mem[2],
>> +		__entry->prot_mem[0],
>> +		__entry->prot_mem[1],
>> +		__entry->prot_mem[2],
>>   		__entry->allocated,
>>   		__entry->sysctl_rmem,
>>   		__entry->rmem_alloc)
>> diff --git a/init/Kconfig b/init/Kconfig
>> index d627783..5955ac2 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -690,6 +690,17 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
>>   	  select this option (if, for some reason, they need to disable it
>>   	  then swapaccount=0 does the trick).
>>
>> +config CGROUP_KMEM
>> +	bool "Kernel Memory Resource Controller for Control Groups"
>> +	depends on CGROUPS
>> +	help
>> +	  The Kernel Memory cgroup can limit the amount of memory used by
>> +	  certain kernel objects in the system. Those are fundamentally
>> +	  different from the entities handled by the Memory Controller,
>> +	  which are page-based, and can be swapped. Users of the kmem
>> +	  cgroup can use it to guarantee that no group of processes will
>> +	  ever exhaust kernel resources alone.
>> +
>
> This help seems nice but please add Documentation/cgroup/kmem.
>
>
>
>
>>   config CGROUP_PERF
>>   	bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
>>   	depends on PERF_EVENTS&&  CGROUPS
>> diff --git a/mm/Makefile b/mm/Makefile
>> index 836e416..1b1aa24 100644
>> --- a/mm/Makefile
>> +++ b/mm/Makefile
>> @@ -45,6 +45,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
>>   obj-$(CONFIG_QUICKLIST) += quicklist.o
>>   obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
>>   obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
>> +obj-$(CONFIG_CGROUP_KMEM) += kmem_cgroup.o
>>   obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
>>   obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
>>   obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index 3449df8..2d968ea 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -134,6 +134,24 @@
>>   #include<net/tcp.h>
>>   #endif
>>
>> +static DEFINE_RWLOCK(proto_list_lock);
>> +static LIST_HEAD(proto_list);
>> +
>> +int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
>> +{
>> +	struct proto *proto;
>> +	int ret = 0;
>> +
>> +	read_lock(&proto_list_lock);
>> +	list_for_each_entry(proto,&proto_list, node) {
>> +		if (proto->init_cgroup)
>> +			ret |= proto->init_cgroup(cgrp, ss);
>> +	}
>> +	read_unlock(&proto_list_lock);
>> +
>> +	return ret;
>> +}
>> +
>>   /*
>>    * Each address family might have different locking rules, so we have
>>    * one slock key per address family:
>> @@ -1114,6 +1132,31 @@ void sock_update_classid(struct sock *sk)
>>   EXPORT_SYMBOL(sock_update_classid);
>>   #endif
>>
>> +void sock_update_kmem_cgrp(struct sock *sk)
>> +{
>> +#ifdef CONFIG_CGROUP_KMEM
>> +	sk->sk_cgrp = kcg_from_task(current);
>> +
>> +	/*
>> +	 * We don't need to protect against anything task-related, because
>> +	 * we are basically stuck with the sock pointer that won't change,
>> +	 * even if the task that originated the socket changes cgroups.
>> +	 *
>> +	 * What we do have to guarantee, is that the chain leading us to
>> +	 * the top level won't change under our noses. Incrementing the
>> +	 * reference count via cgroup_exclude_rmdir guarantees that.
>> +	 */
>> +	cgroup_exclude_rmdir(&sk->sk_cgrp->css);
>> +#endif
>> +}
>
> I'm not sure this kind of bare cgroup code in core/sock.c will be
> welcomed by network guys.
>
>
>
>
>> +
>> +void sock_release_kmem_cgrp(struct sock *sk)
>> +{
>> +#ifdef CONFIG_CGROUP_KMEM
>> +	cgroup_release_and_wakeup_rmdir(&sk->sk_cgrp->css);
>> +#endif
>> +}
>> +
>>   /**
>>    *	sk_alloc - All socket objects are allocated here
>>    *	@net: the applicable net namespace
>> @@ -1139,6 +1182,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
>>   		atomic_set(&sk->sk_wmem_alloc, 1);
>>
>>   		sock_update_classid(sk);
>> +		sock_update_kmem_cgrp(sk);
>>   	}
>>
>>   	return sk;
>> @@ -1170,6 +1214,7 @@ static void __sk_free(struct sock *sk)
>>   		put_cred(sk->sk_peer_cred);
>>   	put_pid(sk->sk_peer_pid);
>>   	put_net(sock_net(sk));
>> +	sock_release_kmem_cgrp(sk);
>>   	sk_prot_free(sk->sk_prot_creator, sk);
>>   }
>>
>> @@ -1287,8 +1332,8 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
>>   		sk_set_socket(newsk, NULL);
>>   		newsk->sk_wq = NULL;
>>
>> -		if (newsk->sk_prot->sockets_allocated)
>> -			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
>> +		if (sk_sockets_allocated(sk))
>> +			percpu_counter_inc(sk_sockets_allocated(sk));
> How about
> 	sk_sockets_allocated_inc(sk);
> ?
>
>
>>
>>   		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
>>   		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
>> @@ -1676,29 +1721,51 @@ EXPORT_SYMBOL(sk_wait_data);
>>    */
>>   int __sk_mem_schedule(struct sock *sk, int size, int kind)
>>   {
>> -	struct proto *prot = sk->sk_prot;
>>   	int amt = sk_mem_pages(size);
>> +	struct proto *prot = sk->sk_prot;
>>   	long allocated;
>> +	int *memory_pressure;
>> +	long *prot_mem;
>> +	int parent_failure = 0;
>> +	struct kmem_cgroup *sg;
>>
>>   	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
>> -	allocated = atomic_long_add_return(amt, prot->memory_allocated);
>> +
>> +	memory_pressure = sk_memory_pressure(sk);
>> +	prot_mem = sk_prot_mem(sk);
>> +
>> +	allocated = atomic_long_add_return(amt, sk_memory_allocated(sk));
>> +
>> +#ifdef CONFIG_CGROUP_KMEM
>> +	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
>> +		long alloc;
>> +		/*
>> +		 * Large nestings are not the common case, and stopping in the
>> +		 * middle would be complicated enough, that we bill it all the
>> +		 * way through the root, and if needed, unbill everything later
>> +		 */
>> +		alloc = atomic_long_add_return(amt,
>> +					       sg_memory_allocated(prot, sg));
>> +		parent_failure |= (alloc>  sk_prot_mem(sk)[2]);
>> +	}
>> +#endif
>> +
>> +	/* Over hard limit (we, or our parents) */
>> +	if (parent_failure || (allocated>  prot_mem[2]))
>> +		goto suppress_allocation;
>>
>>   	/* Under limit. */
>> -	if (allocated<= prot->sysctl_mem[0]) {
>> -		if (prot->memory_pressure&&  *prot->memory_pressure)
>> -			*prot->memory_pressure = 0;
>> +	if (allocated<= prot_mem[0]) {
>> +		if (memory_pressure&&  *memory_pressure)
>> +			*memory_pressure = 0;
>>   		return 1;
>>   	}
>>
>>   	/* Under pressure. */
>> -	if (allocated>  prot->sysctl_mem[1])
>> +	if (allocated>  prot_mem[1])
>>   		if (prot->enter_memory_pressure)
>>   			prot->enter_memory_pressure(sk);
>>
>> -	/* Over hard limit. */
>> -	if (allocated>  prot->sysctl_mem[2])
>> -		goto suppress_allocation;
>> -
>>   	/* guarantee minimum buffer size under pressure */
>>   	if (kind == SK_MEM_RECV) {
>>   		if (atomic_read(&sk->sk_rmem_alloc)<  prot->sysctl_rmem[0])
>> @@ -1712,13 +1779,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
>>   				return 1;
>>   	}
>>
>> -	if (prot->memory_pressure) {
>> +	if (memory_pressure) {
>>   		int alloc;
>>
>> -		if (!*prot->memory_pressure)
>> +		if (!*memory_pressure)
>>   			return 1;
>> -		alloc = percpu_counter_read_positive(prot->sockets_allocated);
>> -		if (prot->sysctl_mem[2]>  alloc *
>> +		alloc = percpu_counter_read_positive(sk_sockets_allocated(sk));
>> +		if (prot_mem[2]>  alloc *
>>   		    sk_mem_pages(sk->sk_wmem_queued +
>>   				 atomic_read(&sk->sk_rmem_alloc) +
>>   				 sk->sk_forward_alloc))
>> @@ -1741,7 +1808,13 @@ suppress_allocation:
>>
>>   	/* Alas. Undo changes. */
>>   	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
>> -	atomic_long_sub(amt, prot->memory_allocated);
>> +
>> +	atomic_long_sub(amt, sk_memory_allocated(sk));
>> +
>> +#ifdef CONFIG_CGROUP_KMEM
>> +	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent)
>> +		atomic_long_sub(amt, sg_memory_allocated(prot, sg));
>> +#endif
>>   	return 0;
>>   }
>>   EXPORT_SYMBOL(__sk_mem_schedule);
>> @@ -1753,14 +1826,24 @@ EXPORT_SYMBOL(__sk_mem_schedule);
>>   void __sk_mem_reclaim(struct sock *sk)
>>   {
>>   	struct proto *prot = sk->sk_prot;
>> +	struct kmem_cgroup *sg = sk->sk_cgrp;
>> +	int *memory_pressure = sk_memory_pressure(sk);
>>
>>   	atomic_long_sub(sk->sk_forward_alloc>>  SK_MEM_QUANTUM_SHIFT,
>> -		   prot->memory_allocated);
>> +		   sk_memory_allocated(sk));
>> +
>> +#ifdef CONFIG_CGROUP_KMEM
>> +	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
>> +		atomic_long_sub(sk->sk_forward_alloc>>  SK_MEM_QUANTUM_SHIFT,
>> +						sg_memory_allocated(prot, sg));
>> +	}
>> +#endif
>> +
>>   	sk->sk_forward_alloc&= SK_MEM_QUANTUM - 1;
>>
>> -	if (prot->memory_pressure&&  *prot->memory_pressure&&
>> -	    (atomic_long_read(prot->memory_allocated)<  prot->sysctl_mem[0]))
>> -		*prot->memory_pressure = 0;
>> +	if (memory_pressure&&  *memory_pressure&&
>> +	    (atomic_long_read(sk_memory_allocated(sk))<  sk_prot_mem(sk)[0]))
>> +		*memory_pressure = 0;
>>   }
>>   EXPORT_SYMBOL(__sk_mem_reclaim);
>>
>
> IMHO, I like to hide atomic_long_xxxx ops under kmem cgroup ops.
>
> And use callbacks like
> 	kmem_cgroup_read(SOCKET_MEM_ALLOCATED, sk)
>
> If other component uses kmem_cgroup, a generic interface will be
> helpful because optimization/fix in generic interface will improve
> all users of kmem_cgroup.
>
>
>
>> @@ -2252,9 +2335,6 @@ void sk_common_release(struct sock *sk)
>>   }
>>   EXPORT_SYMBOL(sk_common_release);
>>
>> -static DEFINE_RWLOCK(proto_list_lock);
>> -static LIST_HEAD(proto_list);
>> -
>>   #ifdef CONFIG_PROC_FS
>>   #define PROTO_INUSE_NR	64	/* should be enough for the first time */
>>   struct prot_inuse {
>> @@ -2479,13 +2559,17 @@ static char proto_method_implemented(const void *method)
>>
>>   static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
>>   {
>> +	struct kmem_cgroup *sg = kcg_from_task(current);
>> +
>>   	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
>>   			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
>>   		   proto->name,
>>   		   proto->obj_size,
>>   		   sock_prot_inuse_get(seq_file_net(seq), proto),
>> -		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
>> -		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
>> +		   proto->memory_allocated != NULL ?
>> +			atomic_long_read(sg_memory_allocated(proto, sg)) : -1L,
>> +		   proto->memory_pressure != NULL ?
>> +			*sg_memory_pressure(proto, sg) ? "yes" : "no" : "NI",
>>   		   proto->max_header,
>>   		   proto->slab == NULL ? "no" : "yes",
>>   		   module_name(proto->owner),
>> diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
>> index 19acd00..463b299 100644
>> --- a/net/decnet/af_decnet.c
>> +++ b/net/decnet/af_decnet.c
>> @@ -458,13 +458,28 @@ static void dn_enter_memory_pressure(struct sock *sk)
>>   	}
>>   }
>>
>> +static atomic_long_t *memory_allocated_dn(struct kmem_cgroup *sg)
>> +{
>> +	return&decnet_memory_allocated;
>> +}
>> +
>> +static int *memory_pressure_dn(struct kmem_cgroup *sg)
>> +{
>> +	return&dn_memory_pressure;
>> +}
>> +
>> +static long *dn_sysctl_mem(struct kmem_cgroup *sg)
>> +{
>> +	return sysctl_decnet_mem;
>> +}
>> +
>>   static struct proto dn_proto = {
>>   	.name			= "NSP",
>>   	.owner			= THIS_MODULE,
>>   	.enter_memory_pressure	= dn_enter_memory_pressure,
>> -	.memory_pressure	=&dn_memory_pressure,
>> -	.memory_allocated	=&decnet_memory_allocated,
>> -	.sysctl_mem		= sysctl_decnet_mem,
>> +	.memory_pressure	= memory_pressure_dn,
>> +	.memory_allocated	= memory_allocated_dn,
>> +	.prot_mem		= dn_sysctl_mem,
>>   	.sysctl_wmem		= sysctl_decnet_wmem,
>>   	.sysctl_rmem		= sysctl_decnet_rmem,
>>   	.max_header		= DN_MAX_NSP_DATA_HEADER + 64,
>> diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
>> index b14ec7d..9c80acf 100644
>> --- a/net/ipv4/proc.c
>> +++ b/net/ipv4/proc.c
>> @@ -52,20 +52,22 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
>>   {
>>   	struct net *net = seq->private;
>>   	int orphans, sockets;
>> +	struct kmem_cgroup *sg = kcg_from_task(current);
>> +	struct percpu_counter *allocated = sg_sockets_allocated(&tcp_prot, sg);
>>
>>   	local_bh_disable();
>>   	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
>> -	sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
>> +	sockets = percpu_counter_sum_positive(allocated);
>>   	local_bh_enable();
>>
>>   	socket_seq_show(seq);
>>   	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
>>   		   sock_prot_inuse_get(net,&tcp_prot), orphans,
>>   		   tcp_death_row.tw_count, sockets,
>> -		   atomic_long_read(&tcp_memory_allocated));
>> +		   atomic_long_read(sg_memory_allocated((&tcp_prot), sg)));
>>   	seq_printf(seq, "UDP: inuse %d mem %ld\n",
>>   		   sock_prot_inuse_get(net,&udp_prot),
>> -		   atomic_long_read(&udp_memory_allocated));
>> +		   atomic_long_read(sg_memory_allocated((&udp_prot), sg)));
>>   	seq_printf(seq, "UDPLITE: inuse %d\n",
>>   		   sock_prot_inuse_get(net,&udplite_prot));
>>   	seq_printf(seq, "RAW: inuse %d\n",
>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> index 69fd720..5e89480 100644
>> --- a/net/ipv4/sysctl_net_ipv4.c
>> +++ b/net/ipv4/sysctl_net_ipv4.c
>> @@ -14,6 +14,8 @@
>>   #include<linux/init.h>
>>   #include<linux/slab.h>
>>   #include<linux/nsproxy.h>
>> +#include<linux/kmem_cgroup.h>
>> +#include<linux/swap.h>
>>   #include<net/snmp.h>
>>   #include<net/icmp.h>
>>   #include<net/ip.h>
>> @@ -174,6 +176,43 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
>>   	return ret;
>>   }
>>
>> +static int ipv4_tcp_mem(ctl_table *ctl, int write,
>> +			   void __user *buffer, size_t *lenp,
>> +			   loff_t *ppos)
>> +{
>> +	int ret;
>> +	unsigned long vec[3];
>> +	struct kmem_cgroup *kmem = kcg_from_task(current);
>> +	struct net *net = current->nsproxy->net_ns;
>> +	int i;
>> +
>> +	ctl_table tmp = {
>> +		.data =&vec,
>> +		.maxlen = sizeof(vec),
>> +		.mode = ctl->mode,
>> +	};
>> +
>> +	if (!write) {
>> +		ctl->data =&net->ipv4.sysctl_tcp_mem;
>> +		return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
>> +	}
>> +
>> +	ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
>> +	if (ret)
>> +		return ret;
>> +
>> +	for (i = 0; i<  3; i++)
>> +		if (vec[i]>  kmem->tcp_max_memory)
>> +			return -EINVAL;
>> +
>> +	for (i = 0; i<  3; i++) {
>> +		net->ipv4.sysctl_tcp_mem[i] = vec[i];
>> +		kmem->tcp_prot_mem[i] = net->ipv4.sysctl_tcp_mem[i];
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>   static struct ctl_table ipv4_table[] = {
>>   	{
>>   		.procname	= "tcp_timestamps",
>> @@ -433,13 +472,6 @@ static struct ctl_table ipv4_table[] = {
>>   		.proc_handler	= proc_dointvec
>>   	},
>>   	{
>> -		.procname	= "tcp_mem",
>> -		.data		=&sysctl_tcp_mem,
>> -		.maxlen		= sizeof(sysctl_tcp_mem),
>> -		.mode		= 0644,
>> -		.proc_handler	= proc_doulongvec_minmax
>> -	},
>> -	{
>>   		.procname	= "tcp_wmem",
>>   		.data		=&sysctl_tcp_wmem,
>>   		.maxlen		= sizeof(sysctl_tcp_wmem),
>> @@ -721,6 +753,12 @@ static struct ctl_table ipv4_net_table[] = {
>>   		.mode		= 0644,
>>   		.proc_handler	= ipv4_ping_group_range,
>>   	},
>> +	{
>> +		.procname	= "tcp_mem",
>> +		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_mem),
>> +		.mode		= 0644,
>> +		.proc_handler	= ipv4_tcp_mem,
>> +	},
>>   	{ }
>>   };
>>
>> @@ -734,6 +772,7 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
>>   static __net_init int ipv4_sysctl_init_net(struct net *net)
>>   {
>>   	struct ctl_table *table;
>> +	unsigned long limit;
>>
>>   	table = ipv4_net_table;
>>   	if (!net_eq(net,&init_net)) {
>> @@ -769,6 +808,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
>>
>>   	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
>>
>> +	limit = nr_free_buffer_pages() / 8;
>> +	limit = max(limit, 128UL);
>> +	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
>> +	net->ipv4.sysctl_tcp_mem[1] = limit;
>> +	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
>> +
>
> What this calculation means ? Documented somewhere ?
>
>
>
>>   	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
>>   			net_ipv4_ctl_path, table);
>>   	if (net->ipv4.ipv4_hdr == NULL)
>> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
>> index 46febca..e1918fa 100644
>> --- a/net/ipv4/tcp.c
>> +++ b/net/ipv4/tcp.c
>> @@ -266,6 +266,7 @@
>>   #include<linux/crypto.h>
>>   #include<linux/time.h>
>>   #include<linux/slab.h>
>> +#include<linux/nsproxy.h>
>>
>>   #include<net/icmp.h>
>>   #include<net/tcp.h>
>> @@ -282,23 +283,12 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
>>   struct percpu_counter tcp_orphan_count;
>>   EXPORT_SYMBOL_GPL(tcp_orphan_count);
>>
>> -long sysctl_tcp_mem[3] __read_mostly;
>>   int sysctl_tcp_wmem[3] __read_mostly;
>>   int sysctl_tcp_rmem[3] __read_mostly;
>>
>> -EXPORT_SYMBOL(sysctl_tcp_mem);
>>   EXPORT_SYMBOL(sysctl_tcp_rmem);
>>   EXPORT_SYMBOL(sysctl_tcp_wmem);
>>
>> -atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
>> -EXPORT_SYMBOL(tcp_memory_allocated);
>> -
>> -/*
>> - * Current number of TCP sockets.
>> - */
>> -struct percpu_counter tcp_sockets_allocated;
>> -EXPORT_SYMBOL(tcp_sockets_allocated);
>> -
>>   /*
>>    * TCP splice context
>>    */
>> @@ -308,23 +298,157 @@ struct tcp_splice_state {
>>   	unsigned int flags;
>>   };
>>
>> +#ifdef CONFIG_CGROUP_KMEM
>>   /*
>>    * Pressure flag: try to collapse.
>>    * Technical note: it is used by multiple contexts non atomically.
>>    * All the __sk_mem_schedule() is of this nature: accounting
>>    * is strict, actions are advisory and have some latency.
>>    */
>> -int tcp_memory_pressure __read_mostly;
>> -EXPORT_SYMBOL(tcp_memory_pressure);
>> -
>>   void tcp_enter_memory_pressure(struct sock *sk)
>>   {
>> +	struct kmem_cgroup *sg = sk->sk_cgrp;
>> +	if (!sg->tcp_memory_pressure) {
>> +		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
>> +		sg->tcp_memory_pressure = 1;
>> +	}
>> +}
>> +
>> +long *tcp_sysctl_mem(struct kmem_cgroup *sg)
>> +{
>> +	return sg->tcp_prot_mem;
>> +}
>> +
>> +atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
>> +{
>> +	return&(sg->tcp_memory_allocated);
>> +}
>> +
>> +static int tcp_write_maxmem(struct cgroup *cgrp, struct cftype *cft, u64 val)
>> +{
>> +	struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
>> +	struct net *net = current->nsproxy->net_ns;
>> +	int i;
>> +
>> +	if (!cgroup_lock_live_group(cgrp))
>> +		return -ENODEV;
>> +
>> +	/*
>> +	 * We can't allow more memory than our parents. Since this
>> +	 * will be tested for all calls, by induction, there is no need
>> +	 * to test any parent other than our own
>> +	 * */
>> +	if (sg->parent&&  (val>  sg->parent->tcp_max_memory))
>> +		val = sg->parent->tcp_max_memory;
>> +
>> +	sg->tcp_max_memory = val;
>> +
>> +	for (i = 0; i<  3; i++)
>> +		sg->tcp_prot_mem[i]  = min_t(long, val,
>> +					     net->ipv4.sysctl_tcp_mem[i]);
>> +
>> +	cgroup_unlock();
>> +
>> +	return 0;
>> +}
>> +
>
> Do we really need cgroup_lock/unlock ?
>
>
>
>> +static u64 tcp_read_maxmem(struct cgroup *cgrp, struct cftype *cft)
>> +{
>> +	struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
>> +	u64 ret;
>> +
>> +	if (!cgroup_lock_live_group(cgrp))
>> +		return -ENODEV;
>> +	ret = sg->tcp_max_memory;
>> +
>> +	cgroup_unlock();
>> +	return ret;
>> +}
>> +
>
>
> Hmm, can't you implement this function as
>
> 	kmem_cgroup_read(SOCK_TCP_MAXMEM, sg);
>
> ? How do you think ?
>
> Thanks,
> -Kame
>
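
(For illustration, a rough sketch of the generic read accessor being
suggested; the enum and its item names are invented here just to show the
shape of the interface, and are not part of the posted patch:)

enum kmem_cgroup_stat {
	SOCK_TCP_MAXMEM,	/* sg->tcp_max_memory */
	SOCKET_MEM_ALLOCATED,	/* sg->tcp_memory_allocated */
};

static u64 kmem_cgroup_read(enum kmem_cgroup_stat item, struct kmem_cgroup *sg)
{
	switch (item) {
	case SOCK_TCP_MAXMEM:
		return sg->tcp_max_memory;
	case SOCKET_MEM_ALLOCATED:
		return atomic_long_read(&sg->tcp_memory_allocated);
	}
	return 0;
}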

Paul Menage Sept. 7, 2011, 1:08 a.m. UTC | #8
On Mon, Sep 5, 2011 at 7:35 PM, Glauber Costa <glommer@parallels.com> wrote:
> This patch introduces per-cgroup tcp buffers limitation. This allows
> sysadmins to specify a maximum amount of kernel memory that
> tcp connections can use at any point in time. TCP is the main interest
> in this work, but extending it to other protocols would be easy.

The general idea of limiting total socket buffer memory consumed by a
cgroup is a fine idea, but I think it needs to be integrated more
closely with the existing kernel memory cgroup tracking efforts,
especially if you're trying to use as generic a name as "kmem" for it.

I agree with Kamezawa's comments that you need a lot more documentation.

Paul
Glauber Costa Sept. 7, 2011, 1:09 a.m. UTC | #9
On 09/06/2011 10:08 PM, Paul Menage wrote:
> On Mon, Sep 5, 2011 at 7:35 PM, Glauber Costa<glommer@parallels.com>  wrote:
>> This patch introduces per-cgroup tcp buffers limitation. This allows
>> sysadmins to specify a maximum amount of kernel memory that
>> tcp connections can use at any point in time. TCP is the main interest
>> in this work, but extending it to other protocols would be easy.

Hi Paul,


> The general idea of limiting total socket buffer memory consumed by a
> cgroup is a fine idea, but I think it needs to be integrated more
> closely with the existing kernel memory cgroup tracking efforts,
> especially if you're trying to use as generic a name as "kmem" for it.
Can you be more specific?

The generic part of kmem cgroup in this patch is quite simple. I think 
any other patchset would have a very easy time merging things into it.
90% of it is sockets.

>
> I agree with Kamezawa's comments that you need a lot more documentation.
Working on it right now.
Paul Menage Sept. 7, 2011, 1:29 a.m. UTC | #10
On Tue, Sep 6, 2011 at 6:09 PM, Glauber Costa <glommer@parallels.com> wrote:
>
> Can you be more specific?

Maybe if you include the source to kmem_cgroup.c :-)

Things like the reporting of stats to user space, configuring limits,
etc, ought to be common with the other kernel memory tracking. (Sadly,
I've lost track of the status of the other kernel memory effort -
Kamezawa or the OpenVZ folks probably have a better handle on that).

Paul
Glauber Costa Sept. 7, 2011, 1:32 a.m. UTC | #11
On 09/06/2011 10:29 PM, Paul Menage wrote:
> On Tue, Sep 6, 2011 at 6:09 PM, Glauber Costa<glommer@parallels.com>  wrote:
>>
>> Can you be more specific?
>
> Maybe if you include the source to kmem_cgroup.c :-)
Yeah, *facepalm*.

I am about to send another version, just finishing writing up the docs.
But it is really simple right now, and only cares about the socket 
stuff. No reporting, nothing else. My idea is we use it as a seed and 
grow it.
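
(Since mm/kmem_cgroup.c was left out of this posting, here is a rough guess
at a minimal skeleton for it, reconstructed only from the identifiers the
rest of the patch uses - kcg_from_cgroup, sockets_populate, SUBSYS(kmem);
the real file may well look different:)

#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/kmem_cgroup.h>
#include <net/sock.h>

static struct cgroup_subsys_state *
kmem_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct kmem_cgroup *sg = kzalloc(sizeof(*sg), GFP_KERNEL);

	if (!sg)
		return ERR_PTR(-ENOMEM);
	if (cgrp->parent)
		sg->parent = kcg_from_cgroup(cgrp->parent);
	return &sg->css;
}

static void kmem_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	kfree(kcg_from_cgroup(cgrp));
}

static int kmem_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	/* let each registered protocol create its per-cgroup files */
	return sockets_populate(ss, cgrp);
}

struct cgroup_subsys kmem_subsys = {
	.name		= "kmem",
	.create		= kmem_create,
	.destroy	= kmem_destroy,
	.populate	= kmem_populate,
	.subsys_id	= kmem_subsys_id,
};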
Eric Dumazet Sept. 7, 2011, 2:38 a.m. UTC | #12
On Tuesday, September 6, 2011 at 15:12 -0700, Greg Thelen wrote:

> >>> +#define sk_sockets_allocated(sk)                               \
> >>> +({                                                             \
> >>> +       struct percpu_counter *__p;                             \
> >>> +       __p = (sk)->sk_prot->sockets_allocated(sk->sk_cgrp);    \
> >>> +       __p;                                                    \
> >>> +})
> 
> Could this be simplified as (same applies to following few macros):
> 
> static inline struct percpu_counter *sk_sockets_allocated(struct sock *sk)
> {
>         return sk->sk_prot->sockets_allocated(sk->sk_cgrp);
> }
> 

Please Greg, don't copy/paste huge sequences of code if you don't have
any more comments.

Right before sending your mail, remove all parts that we already got in
previous mails.

Thanks


Greg Thelen Sept. 7, 2011, 9:35 p.m. UTC | #13
On Tue, Sep 6, 2011 at 3:37 PM, Glauber Costa <glommer@parallels.com> wrote:
> I think memcg's usage is really all you need here. At the end of the day, it
> tells you how many pages your container has available. The whole
> point of kmem cgroup is not any kind of reservation or accounting.

The memcg does not reserve memory.  It provides upper bound limits on
memory usage.  A careful admin can configure soft_limit_in_bytes as an
approximation of a memory reservation.  But the soft limit is really
more like a reclaim target when there is global memory pressure.

> Once a container (or cgroup) reaches a number of objects *pinned* in memory
> (therefore, non-reclaimable), you won't be able to grab anything from it.
>
>> So
>> far my use cases involve a single memory limit which includes both
>> kernel and user memory.  So I would need a user space agent to poll
>> {memcg,kmem}.usage_in_bytes to apply pressure to memcg if kmem grows
>> and visa versa.
>
> Maybe not.
> If userspace memory works for you today (supposing it does), why change?

Good question.  Current upstream memcg user space memory limit does
not work for me today.  I should have made that more obvious (sorry).
See below for details.

> Right now you assign X bytes of user memory to a container, and the kernel
> memory is shared among all of them. If this works for you, kmem_cgroup won't
> change that. It just will impose limits over which
> your kernel objects can't grow.
>
> So you don't *need* a userspace agent doing this calculation, because
> fundamentally, nothing changed: I am not unbilling memory in memcg to bill
> it back in kmem_cg. Of course, once it is in, you will be able to do it in
> such a fine grained fashion if you decide to do so.
>
>> Do you foresee instantiation of multiple kmem cgroups, so that a
>> process could be added into kmem/K1 or kmem/K2?  If so do you plan on
>> supporting migration between cgroups and/or migration of kmem charge
>> between K1 to K2?
>
> Yes, each container should have its own cgroup, so at least in the use
> cases I am concerned, we will have a lot of them. But the usual lifecycle,
> is create, execute and die. Mobility between them
> is not something I am overly concerned right now.
>
>
>>>> Do you foresee the kmem cgroup growing to include reclaimable slab,
>>>> where freeing one type of memory allows for reclaim of the other?
>>>
>>> Yes, absolutely.

Now I see that you're using kmem to limit the amount of unreclaimable
kernel memory.

We have a work-in-progress patch series that adds kernel memory accounting to
memcg.  These patches allow an admin to specify a single memory limit
for a cgroup which encompasses both user memory (as upstream memcg
does) and also includes many kernel memory allocations (especially
slab, page-tables).  When kernel memory grows it puts pressure on user
memory; when user memory grows it puts pressure on reclaimable kernel
memory using registered shrinkers.  We are in the process of cleaning
up these memcg slab accounting patches.

In my uses cases there is a single memory limit that applies to both
kernel and user memory.  If a separate kmem cgroup is introduced to
manage kernel memory outside of memcg with a distinct limit, then I
would need a user space daemon which balances memory between the kmem
and memcg subsystems.  As kmem grows, this daemon would apply pressure
to memcg, and as memcg grows pressure would be applied to kmem.  As
you stated kernel memory is not necessarily reclaimable.  So such
reclaim may fail.  My resistance to this approach is that with a
single memory cgroup admins can do a better job packing a machine.  If
balancing daemons are employed then more memory would need to be
reserved and more user space cpu time would be needed to apply VM
pressure between the types of memory.

While there are people (like me) who want a combined memory usage
limit there are also people (like you) who want separate user and
kernel limiting.  I have toyed with the idea of having a per cgroup
flag that determines if kernel and user memory should be charged
against a single combined limit or if they should have separate limits.
 I have also wondered if there was a way to wire the usage of two
subsystems together, then it would also meet my needs.  But I am
not sure how to do that.
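
(A purely hypothetical sketch of what such a flag could look like as a memcg
control file, reusing the cftype read_u64/write_u64 handler signatures that
memcontrol.c already uses; the field and file name are invented here for
illustration only:)

static u64 kmem_independent_read(struct cgroup *cgrp, struct cftype *cft)
{
	return mem_cgroup_from_cont(cgrp)->independent_kmem_limit;
}

static int kmem_independent_write(struct cgroup *cgrp, struct cftype *cft,
				  u64 val)
{
	mem_cgroup_from_cont(cgrp)->independent_kmem_limit = !!val;
	return 0;
}

static struct cftype kmem_flag_files[] = {
	{
		.name		= "independent_kmem_limit",
		.read_u64	= kmem_independent_read,
		.write_u64	= kmem_independent_write,
	},
};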
Glauber Costa Sept. 8, 2011, 4:44 a.m. UTC | #14
On 09/07/2011 06:35 PM, Greg Thelen wrote:
> On Tue, Sep 6, 2011 at 3:37 PM, Glauber Costa<glommer@parallels.com>  wrote:
>> I think memcg's usage is really all you need here. At the end of the day, it
>> tells you how many pages your container has available. The whole
>> point of kmem cgroup is not any kind of reservation or accounting.
>
> The memcg does not reserve memory.  It provides upper bound limits on
> memory usage.  A careful admin can configure soft_limit_in_bytes as an
> approximation of a memory reservation.  But the soft limit is really
> more like a reclaim target when there is global memory pressure.
>
>> Once a container (or cgroup) reaches a number of objects *pinned* in memory
>> (therefore, non-reclaimable), you won't be able to grab anything from it.
>>
>>> So
>>> far my use cases involve a single memory limit which includes both
>>> kernel and user memory.  So I would need a user space agent to poll
>>> {memcg,kmem}.usage_in_bytes to apply pressure to memcg if kmem grows
>>> and visa versa.
>>
>> Maybe not.
>> If userspace memory works for you today (supposing it does), why change?
>
> Good question.  Current upstream memcg user space memory limit does
> not work for me today.  I should have made that more obvious (sorry).
> See below for details.
>
>> Right now you assign X bytes of user memory to a container, and the kernel
>> memory is shared among all of them. If this works for you, kmem_cgroup won't
>> change that. It just will impose limits over which
>> your kernel objects can't grow.
>>
>> So you don't *need* a userspace agent doing this calculation, because
>> fundamentally, nothing changed: I am not unbilling memory in memcg to bill
>> it back in kmem_cg. Of course, once it is in, you will be able to do it in
>> such a fine grained fashion if you decide to do so.
>>
>>> Do you foresee instantiation of multiple kmem cgroups, so that a
>>> process could be added into kmem/K1 or kmem/K2?  If so do you plan on
>>> supporting migration between cgroups and/or migration of kmem charge
>>> between K1 to K2?
>>
>> Yes, each container should have its own cgroup, so at least in the use
>> cases I am concerned, we will have a lot of them. But the usual lifecycle,
>> is create, execute and die. Mobility between them
>> is not something I am overly concerned right now.
>>
>>
>>>>> Do you foresee the kmem cgroup growing to include reclaimable slab,
>>>>> where freeing one type of memory allows for reclaim of the other?
>>>>
>>>> Yes, absolutely.
>
> Now I see that you're using kmem to limit the amount of unreclaimable
> kernel memory.
>
> We have a work-in-progress patch series that adds kernel memory accounting to
> memcg.  These patches allow an admin to specify a single memory limit
> for a cgroup which encompasses both user memory (as upstream memcg
> does) and also includes many kernel memory allocations (especially
> slab, page-tables).  When kernel memory grows it puts pressure on user
> memory; when user memory grows it puts pressure on reclaimable kernel
> memory using registered shrinkers.  We are in the process of cleaning
> up these memcg slab accounting patches.
 >
> In my uses cases there is a single memory limit that applies to both
> kernel and user memory.  If a separate kmem cgroup is introduced to
> manage kernel memory outside of memcg with a distinct limit, then I
> would need a user space daemon which balances memory between the kmem
> and memcg subsystems.  As kmem grows, this daemon would apply pressure
> to memcg, and as memcg grows pressure would be applied to kmem.  As
> you stated kernel memory is not necessarily reclaimable.  So such
> reclaim may fail.  My resistance to this approach is that with a
> single memory cgroup admins can do a better job packing a machine.  If
> balancing daemons are employed then more memory would need to be
> reserved and more user space cpu time would be needed to apply VM
> pressure between the types of memory.
Well, it is a way to see this. The other way to see this is that you're
proposing to move into the kernel something that really belongs in 
userspace. That's because:

With the information you provided me, I have no reason to believe that 
the kernel is in a better position to do this work. Does the kernel have access 
to any information that userspace does not, and that can't be exported? If not, 
userspace is traditionally where this sort of stuff has been done.

Using userspace CPU is no different from using kernel cpu in this 
particular case. It is all overhead, regardless of where it comes from. 
Moreover, you end up setting up a policy, instead of a mechanism. What 
should this proportion be?  Do we reclaim everything with the same 
frequency? Should we be more tolerant with a specific container?

Also, if you want to allow any flexibility in this scheme, like: "Should 
this network container be able to stress the network more, pinning more 
memory, but not other subsystems?", you end up having to touch all 
individual files anyway - probably with a userspace daemon.

Also, as you noticed yourself, kernel memory is fundamentally different 
from userspace memory. You can't just set reclaim limits, since you have 
no guarantees it will work. User memory is not a scarce resource.
Kernel memory is.

>
> While there are people (like me) who want a combined memory usage
> limit there are also people (like you) who want separate user and
> kernel limiting.

Combined excludes separate. Separate does not exclude combined.

> I have toyed with the idea of having a per cgroup
> flag that determines if kernel and user memory should be charged
> against a single combined limit or if they should have separate limits.

And then every other kind of mechanism one may think of involves a 
kernel patch, instead of a potentially simple userspace change.

>   I have also wondered if there was a way to wire the usage of two
> subsystems together, then it would also meet my needs.  But I am
> not sure how to do that.
Greg Thelen Sept. 8, 2011, 9:53 p.m. UTC | #15
On Wed, Sep 7, 2011 at 9:44 PM, Glauber Costa <glommer@parallels.com> wrote:

Thanks for your ideas and patience.

> Well, it is a way to see this. The other way to see this is that you're
> proposing to move into the kernel something that really belongs in userspace.
> That's because:
>
> With the information you provided me, I have no reason to believe that the
> kernel is in a better position to do this work. Does the kernel have access to any
> information that userspace does not, and that can't be exported? If not, userspace
> is traditionally where this sort of stuff has been done.

I think direct reclaim is a pain if user space is required to participate in
memory balancing decisions.  One thing a single memory limit solution has is the
ability to reclaim user memory to satisfy growing kernel memory needs (and vice
versa).  If a container must fit within 100M, then a single limit solution
would set the limit to 100M and never change it.  In a split limit solution a
user daemon (e.g. uswapd) would need to monitor the usage and the amount of
active memory vs inactive user memory and unreferenced kernel memory to
determine where to apply pressure.  With some more knobs such a uswapd could
attempt to keep ahead of demand.  But eventually direct reclaim would
be needed to satisfy rapid growth spikes.  Example: If the 100M container
starts with limits of 20M kmem and 80M user memory but later its kernel
memory needs grow to 70M.  With separate user and kernel memory
limits the kernel memory allocation could fail despite there being
reclaimable user pages available.  The job should have a way to
transition to memory limits to 70M+ kernel and 30M- of user.

I suppose a GFP_WAIT slab kernel page allocation could wake up user space to
perform user-assisted direct reclaim.  User space would then lower the user
limit thereby causing the kernel to direct reclaim user pages, then
the user daemon would raise the kernel limit allowing the slab allocation to
succeed.  My hunch is that this would be prone to deadlocks (what prevents
uswapd from needing even more kmem?)  I'll defer to more
experienced minds to know if user assisted direct memory reclaim has
other pitfalls.  It scares me.

Fundamentally I have no problem putting an upper bound on a cgroup's resource
usage.  This serves to contain the damage a job can do to the system and other
jobs.  My concern is about limiting the kernel's ability to trade one type of
memory for another by using different cgroups for different types of memory.

If kmem expands to include reclaimable kernel memory (e.g. dentry) then I
presume the kernel would have no way to exchange unused user pages for dentry
pages even if the user memory in the container is well below its limit.  This is
motivation for the above user assisted direct reclaim.

Do you feel the need to segregate user and kernel memory into different cgroups
with independent limits?  Or is this this just a way to create a new clean
cgroup with a simple purpose?

In some resource sharing shops customers purchase a certain amount of memory,
cpu, network, etc.  Such customers don't define how the memory is used and the
user/kernel mixture may change over time.  Can a user space reclaim daemon stay
ahead of the workload's needs?

> Using userspace CPU is no different from using kernel cpu in this particular
> case. It is all overhead, regardless of where it comes from. Moreover, you end
> up setting up a policy, instead of a mechanism. What should this
> proportion be?  Do we reclaim everything with the same frequency? Should we be
> more tolerant with a specific container?

I assume that this implies that a generic kmem cgroup usage is inferior to
separate limits for each kernel memory type to allow user space the flexibility
to choose between kernel types (udp vs tcp vs ext4 vs page_tables vs ...)?  Do
you foresee a way to provide a limit on the total amount of kmem usage by all
such types?  If a container wants to dedicate 4M for all network protocol
buffers (tcp, udp, etc.) would that require a user space daemon to balance
memory limits b/w the protocols?

> Also, if you want to allow any flexibility in this scheme, like: "Should
> this network container be able to stress the network more, pinning more
> memory, but not other subsystems?", you end up having to touch all
> individual files anyway - probably with a userspace daemon.
>
> Also, as you noticed yourself, kernel memory is fundamentally different from
> userspace memory. You can't just set reclaim limits, since you have no
> guarantees it will work. User memory is not a scarce resource.
> Kernel memory is.

I agree that kernel memory is somewhat different.  In some (I argue most)
situations containers want the ability to exchange job kmem and job umem.
Either split or combined accounting protects the system and isolates other
containers from kmem allocations of a bad job.  To me it seems natural to
indicate that job X gets Y MB of memory.  I have more trouble dividing the
Y MB of memory into dedicated slices for different types of memory.

>> While there are people (like me) who want a combined memory usage
>> limit there are also people (like you) who want separate user and
>> kernel limiting.
>
> Combined excludes separate. Separate does not exclude combined.

I agree.  I have no problem with separate accounting and separate
user-accessible pressure knobs to allow for complex policies.  My concern is
about limiting the kernel's ability to reclaim one type of memory to
fulfill the needs of another memory type (e.g. I think reclaiming clean file
pages should be possible to make room for user slab needs).  I think
memcg aware slab accounting does a good job of limiting a job's
memory allocations.
Would such slab accounting meet your needs?
Rick Jones Sept. 9, 2011, 12:18 a.m. UTC | #16
On 09/05/2011 07:35 PM, Glauber Costa wrote:
> To test for any performance impacts of this patch, I used netperf's
> TCP_RR benchmark on localhost, so we can have both recv and snd in action.
>
> Command line used was ./src/netperf -t TCP_RR -H localhost, and the
> results:
>
> Without the patch
> =================
>
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 16384  87380  1        1       10.00    26996.35
> 16384  87380
>
> With the patch
> ===============
>
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 16384  87380  1        1       10.00    27291.86
> 16384  87380

Comment about netperf TCP_RR - it can often have > 1% variability, so it 
would be a Good Idea (tm) to either run it multiple times in a row, or 
rely on the confidence intervals functionality.  Here, for example, is 
an invocation of netperf using confidence intervals and the recently 
added, related output selectors.  The options request that netperf be 
99% confident that the width of the confidence interval is 1%, and that 
it run at least 3 but no more than 30 iterations of the test (the -i 
30,3 option gives the upper and lower iteration limits, respectively).


raj@tardy:~/netperf2_trunk$ src/netperf -t TCP_RR -i 30,3 -I 99,1 -- -k 
throughput,confidence_level,confidence_interval,confidence_iteration,throughput_confid
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET 
to localhost.localdomain (127.0.0.1) port 0 AF_INET : +/-0.500% @ 99% 
conf.  : histogram : first burst 0
THROUGHPUT=55555.94
CONFIDENCE_LEVEL=99
CONFIDENCE_INTERVAL=1.000000
CONFIDENCE_ITERATION=26
THROUGHPUT_CONFID=0.984

It took 26 iterations for netperf to be 99% confident the interval width
was < 1%.  Here is a "several times in a row" run for the sake of completeness:

raj@tardy:~/netperf2_trunk$ HDR="-P 1";for i in `seq 1 10`; do netperf 
-t TCP_RR $HDR -B "iteration $i" -- -o result_brand,throughput; HDR="-P 
0"; done
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET 
to localhost.localdomain (127.0.0.1) port 0 AF_INET : first burst 0
Result Tag,Throughput
"iteration 1",55768.37
"iteration 2",55949.97
"iteration 3",55653.36
"iteration 4",55994.65
"iteration 5",54712.42
"iteration 6",55285.27
"iteration 7",55638.65
"iteration 8",55135.56
"iteration 9",56275.87
"iteration 10",55607.66

That way one can have greater confidence that one isn't accidentally 
comparing the trough of one configuration with the peak of another.

happy benchmarking,

rick jones

PS - while it may not really matter for loopback testing, where 
presumably 99 times out of 100 a single core will run at saturation, when 
running TCP_RR over a "real" network, including CPU utilization to get 
the differences in service demand is another Good Idea (tm) - 
particularly in the face of interrupt coalescing.
Glauber Costa Sept. 9, 2011, 4:17 a.m. UTC | #17
On 09/08/2011 06:53 PM, Greg Thelen wrote:
> On Wed, Sep 7, 2011 at 9:44 PM, Glauber Costa<glommer@parallels.com>  wrote:
>
> Thanks for your ideas and patience.

Likewise. It is turning out to be a very fruitful
discussion.

>> Well, it is a way to see this. The other way to see this, is that you're
>> proposing to move to the kernel, something that really belongs in userspace.
>> That's because:
>>
>> With the information you provided me, I have no reason to believe that the
>> kernel is in a better position to do this work. Does the kernel have access
>> to any information that userspace does not, and that can't be exported? If
>> not, userspace is traditionally where this sort of stuff has been done.
>
> I think direct reclaim is a pain if user space is required to participate in
> memory balancing decisions.
It depends on the decision.

> One thing a single memory limit solution has is the
> ability to reclaim user memory to satisfy growing kernel memory needs (and vise
> versa).

This works for a strict definition of the word "needs". If I *need*
more kernel memory, I'd be happy to have more. But if I just want to
screw other containers, they will be happy if I don't get what I
"need" - it can be unreclaimable. Since those are limits, they are
expected to be, in any real setup, greater than the usage people
should be generating, and yet still prevent bad usage scenarios.

> If a container must fit within 100M, then a single limit solution
> would set the limit to 100M and never change it.  In a split limit solution a
> user daemon (e.g. uswapd) would need to monitor the usage and the amount of
> active memory vs inactive user memory and unreferenced kernel memory to
> determine where to apply pressure.

Or it can just define some parameters and let the kernel do the
rest. Like for instance, a maximum proportion allowed, a maximum
proportion desired, etc.

> With some more knobs such a uswapd could
> attempt to keep ahead of demand.  But eventually direct reclaim would
> be needed to satisfy rapid growth spikes.  Example: If the 100M container
> starts with limits of 20M kmem and 80M user memory but later its kernel
> memory needs grow to 70M.  With separate user and kernel memory
> limits the kernel memory allocation could fail despite there being
> reclaimable user pages available.

No no, this is a ratio, not a *limit*. A limit is something you
should not be allowed to go over. A good limit of kernel memory for
a 100 MB container could be something like... 100 MB. But there is
more to it than that.

Risking being a bit polemic here, I think that when we do
containers, we have to view the kernel a little bit like a shared 
resource that is not accounted to anybody. It is easy to account things 
like tcp buffer, but who do you account page tables for shared pages to? 
Or pinned dentries for shared filesystems ?

Being the shared table under everybody, the kernel is more or less
like buffers inside a physical hdd, or cache lines. You know it is
there, you know it has a size, but provided you have some sane
protections, you don't really care who is using it - because in most
cases you can't.

> The job should have a way to
> transition to memory limits to 70M+ kernel and 30M- of user.

Yes, and I don't see how what I propose prevents that.

> I suppose a GFP_WAIT slab kernel page allocation could wakeup user space to
> perform user-assisted direct reclaim.  User space would then lower the user
> limit thereby causing the kernel to direct reclaim user pages, then
> the user daemon would raise the kernel limit allowing the slab allocation to
> succeed.  My hunch is that this would be prone to deadlocks (what prevents
> uswapd from needing even more kmem?)  I'll defer to more
> experienced minds to know if user assisted direct memory reclaim has
> other pitfalls.  It scares me.

Good that it scares you, it should. OTOH, userspace being
able to set parameters for it has nothing scary about it at all. A daemon in
userspace can detect that you need more kernel space memory, and
then - according to a policy you abide by - write to a file allowing
it more, or maybe not - according to that same policy. It is very
far away from "userspace driven reclaim".

> Fundamentally I have no problem putting an upper bound on a cgroup's resource
> usage.  This serves to contain the damage a job can do to the system and other
> jobs.  My concern is about limiting the kernel's ability to trade one type of
> memory for another by using different cgroups for different types of memory.
Yes, but limits have nothing to do with it.

>
> If kmem expands to include reclaimable kernel memory (e.g. dentry) then I
> presume the kernel would have no way to exchange unused user pages for dentry
> pages even if the user memory in the container is well below its limit.  This is
> motivation for the above user assisted direct reclaim.

Dentry is not always reclaimable. If it is pinned, it is
non-reclaimable. Speaking of which, would you take a look at
https://lkml.org/lkml/2011/8/14/110 ?

I am targeting dentries as well. But since it is hard to assign a
dentry to a process all the time, I am going through a different path.
I haven't entirely given up on doing it cgroup-based, however, so any
ideas are welcome =)

> Do you feel the need to segregate user and kernel memory into different cgroups
> with independent limits?  Or is this this just a way to create a new clean
> cgroup with a simple purpose?

No, I don't necessarily feel that need. I just thought it was
cleaner to have entities with different purposes in different
cgroups. If moving it to the memory controller would help you in any
way, I can just do it. 80 % of this work is independent of where a
cgroup file lives.

> In some resource sharing shops customers purchase a certain amount of memory,
> cpu, network, etc.  Such customers don't define how the memory is used and the
> user/kernel mixture may change over time.  Can a user space reclaim daemon stay
> ahead of the workloads needs?

If you think solely about limits, you don't need to. The most
sane policy is actually "I don't care what the kernel/user ratio is,
as long as the kernel never grows over X MB".

>> Using userspace CPU is no different from using kernel cpu in this particular
>> case. It is all overhead, regardless where it comes from. Moreover, you end
>> up setting up a policy, instead of a mechanism. What should be this
>> proportion?  Do we reclaim everything with the same frequency? Should we be
>> more tolerant with a specific container?
>
> I assume that this implies that a generic kmem cgroup usage is inferior to
> separate limits for each kernel memory type to allow user space the flexibility
> to choose between kernel types (udp vs tcp vs ext4 vs page_tables vs ...)?  Do
> you foresee a way to provide a limit on the total amount of kmem usage by all
> such types?  If a container wants to dedicate 4M for all network protocol
> buffers (tcp, udp, etc.) would that require a user space daemon to balance
> memory limits b/w the protocols?

Well, I am giving this some extra thought... Having separate knobs
adds flexibility, but - as usual - also complexity. For the goals I
have in mind, "kernel memory" would work just fine.

If you look carefully at the other patches in the series besides
this one, you'll see that it is just a matter of billing to kernel
memory instead of tcp memory, and then all the rest is the same.

Do you think that a single kernel-memory knob would be better for
your needs? I am willing to give it a try.
>> Also, If you want to allow any flexibility in this scheme, like: "Should
>> this network container be able to stress the network more, pinning more
>> memory, but not other subsystems?", you end up having to touch all
>> individual files anyway - probably with a userspace daemon.
>>
>> Also, as you noticed yourself, kernel memory is fundamentally different from
>> userspace memory. You can't just set reclaim limits, since you have no
>> guarantees it will work. User memory is not a scarce resource.
>> Kernel memory is.
>
> I agree that kernel memory is somewhat different.  In some (I argue most)
> situations containers want the ability to exchange job kmem and job umem.
> Either split or combined accounting protects the system and isolates other
> containers from kmem allocations of a bad job.  To me it seems natural to
> indicate that job X gets Y MB of memory.  I have more trouble dividing the
> Y MB of memory into dedicated slices for different types of memory.

I understand. And I don't think anyone doing containers
should be mandated to define a division. But a limit...
>>> While there are people (like me) who want a combined memory usage
>>> limit there are also people (like you) who want separate user and
>>> kernel limiting.
>>
>> Combined excludes separate. Separate does not exclude combined.
>
> I agree.  I have no problem with separate accounting and separate
> user-accessible pressure knobs to allow for complex policies.  My concern is
> about limiting the kernel's ability to reclaim one type of memory to
> fulfill the needs of another memory type (e.g. I think reclaiming clean file
> pages should be possible to make room for user slab needs).

I agree with your concern. It is definitely something we should not
do.

> I think
> memcg aware slab accounting does a good job of limiting a job's
> memory allocations.
> Would such slab accounting meet your needs?

Well, the slab alone, no. There are other objects - like tcp buffers
- that aren't covered by the slab. Others are usually shared among
many cgroups, and others don't really belong to anybody in
particular.

What do you think, then, about turning this into 2 files inside memcg:

  - kernel_memory_hard_limit.
  - kernel_memory_soft_limit.

tcp memory would be the one defined in /proc, except if it is
greater than any of the limits. Instead of testing for memory
allocation against kmem.tcp_allocated_memory, we'd test it against
memcg.kmem_pinned_memory.
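
To make the semantics concrete, here is a minimal sketch of what the
allocation-time test could look like with such a memcg-wide counter.
All names (kmem_pinned_memory, kmem_soft_limit, kmem_hard_limit,
kmem_pressure) are invented for illustration; this is not the code in
this patchset:

#include <linux/atomic.h>
#include <linux/types.h>

/*
 * Hypothetical sketch only - mirrors the tcp_mem semantics: crossing
 * the soft limit flags pressure, crossing the hard limit suppresses
 * the allocation.
 */
struct kmem_limits {
	atomic_long_t	kmem_pinned_memory;	/* current kernel memory */
	long		kmem_soft_limit;
	long		kmem_hard_limit;
	int		kmem_pressure;
};

static bool kmem_try_charge(struct kmem_limits *lim, long nr_bytes)
{
	long usage = atomic_long_add_return(nr_bytes,
					    &lim->kmem_pinned_memory);

	if (usage > lim->kmem_hard_limit) {
		/* hard limit: undo the charge and fail the allocation */
		atomic_long_sub(nr_bytes, &lim->kmem_pinned_memory);
		return false;
	}
	if (usage > lim->kmem_soft_limit)
		lim->kmem_pressure = 1;		/* soft limit: pressure */
	return true;
}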


Greg Thelen Sept. 9, 2011, 11:38 p.m. UTC | #18
On Thu, Sep 8, 2011 at 9:17 PM, Glauber Costa <glommer@parallels.com> wrote:
> On 09/08/2011 06:53 PM, Greg Thelen wrote:
>>
>> If a container must fit within 100M, then a single limit solution
>> would set the limit to 100M and never change it.  In a split limit
>> solution a
>> user daemon (e.g. uswapd) would need to monitor the usage and the amount
>> of
>> active memory vs inactive user memory and unreferenced kernel memory to
>> determine where to apply pressure.
>
> Or it can just define some parameters and let the kernel do the
> rest. Like for instance, a maximum proportion allowed, a maximum
> proportion desired, etc.

Agreed.

>> With some more knobs such a uswapd could
>> attempt to keep ahead of demand.  But eventually direct reclaim would
>> be needed to satisfy rapid growth spikes.  Example: If the 100M container
>> starts with limits of 20M kmem and 80M user memory but later its kernel
>> memory needs grow to 70M.  With separate user and kernel memory
>> limits the kernel memory allocation could fail despite there being
>> reclaimable user pages available.
>
> No no, this is a ratio, not a *limit*. A limit is something you
> should not be allowed to go over. A good limit of kernel memory for
> a 100 Mb container could be something like... 100 mb. But there is
> more to that.
>
> Risking being a bit polemic here, I think that when we do
> containers, we have to view the kernel a little bit like a shared resource
> that is not accounted to anybody. It is easy to account things like tcp
> buffer, but who do you account page tables for shared pages to? Or pinned
> dentries for shared filesystems ?

This is a tough challenge.  If you don't account page tables, then
fork bombs can grab lots of kernel memory.  Containing such pigs is
good.  We are working on some patches that charge page tables to the
memcg they are allocated from.  This charge is checked against the
memcg's limit, and thus the page table allocation may fail if the
memcg is over its limit.

I suggest charging dentries to the cgroup that allocated the dentry.
If containers are used for isolation, then it seems that containers
should typically not share files.  That would break isolation.  Of
course there are shared files (/bin/bash, etc), but at least those are
read-only.

> Being the shared table under everybody, the kernel is more or less
> like buffers inside a physical hdd, or cache lines. You know it is
> there, you know it has a size, but provided you have some sane
> protections, you don't really care - because in most cases you can't
> - who is using it.

I agree that it makes sense to not bother charging fixed sized
resources (struct page, etc.) to containers; they are part of the
platform.  But the set of resources that a process can allocate are
ideally charged to a cgroup to limit container memory usage (i.e.
prevent DoS attacks, isolate performance).

>> The job should have a way to
>> transition to memory limits to 70M+ kernel and 30M- of user.
>
> Yes, and I don't see how what I propose prevents that.

I don't think your current proposal prevents that.  I was thinking
about the future of the proposed kmem cgroup.  I want to make sure the
kernel has a way to apply reclaim pressure bidirectionally between
cgroup kernel memory and cgroup user memory.

>> I suppose a GFP_WAIT slab kernel page allocation could wakeup user space
>> to
>> perform user-assisted direct reclaim.  User space would then lower the
>> user
>> limit thereby causing the kernel to direct reclaim user pages, then
>> the user daemon would raise the kernel limit allowing the slab allocation
>> to
>> succeed.  My hunch is that this would be prone to deadlocks (what prevents
>> uswapd from needing even more kmem?)  I'll defer to more
>> experienced minds to know if user assisted direct memory reclaim has
>> other pitfalls.  It scares me.
>
> Good that it scares you, it should. OTOH, userspace being
> able to set parameters to it, has nothing scary at all. A daemon in
> userspace can detect that you need more kernel space memory , and
> then - according to a policy you abide to - write to a file allowing
> it more, or maybe not - according to that same policy. It is very
> far away from "userspace driven reclaim".

Agreed.  I have no problem with user space policy being used to
update kernel control files that alter reclaim.

>> If kmem expands to include reclaimable kernel memory (e.g. dentry) then I
>> presume the kernel would have no way to exchange unused user pages for
>> dentry
>> pages even if the user memory in the container is well below its limit.
>>  This is
>> motivation for the above user assisted direct reclaim.
>
> Dentry is not always reclaimable. If it is pinned, it is non
> reclaimable. Speaking of it, Would you take a look at
> https://lkml.org/lkml/2011/8/14/110 ?
>
> I am targetting dentry as well. But since it is hard to assign a
> dentry to a process all the time, going through a different path. I
> however, haven't entirely given up of doing it cgroups based, so any
> ideas are welcome =)

My hope is that dentry consumption can be effectively limited by
limiting the memory needed to allocate the dentries.  IOW: with a
memcg aware slab allocator which, when possible, charges the calling
process' cgroup.

> Well, I am giving this an extra thought... Having separate knobs
> adds flexibility, but - as usual - also complexity. For the goals I
> have in mind, "kernel memory" would work just as fine.
>
> If you look carefully at the other patches in the series besides
> this one, you'll see that it is just a matter of billing from kernel
> memory instead of tcp-memory, and then all the rest is the same.
>
> Do you think that a single kernel-memory knob would be better for
> your needs? I am willing to give it a try.

Regarding the tcp buffers we're discussing, how does an application
consume lots of buffer memory?  IOW, what would a malicious app do to
cause a growth of tcp buffer usage?  Or is this the kind of resource
that is difficult to directly exploit?

Also, how does the pressure get applied?  I see you have a
per-protocol, per-cgroup pressure setting.  Once set, how
does this pressure cause memory to be freed?  It looks like the
pressure is set when allocating memory, and later, when packets are
freed, the associated memory _might_ be freed if pressure was
previously detected.  Did I get this right?

For me the primary concern is that both user and kernel memory are
eventually charged to the same counter with an associated limit.
Per-container memory pressure is based on this composite limit.  So I
don't have a strong opinion as to how many kernel memory counters
there are so long as they also feed into the container memory usage
counter.

>> I agree that kernel memory is somewhat different.  In some (I argue most)
>> situations containers want the ability to exchange job kmem and job umem.
>> Either split or combined accounting protects the system and isolates other
>> containers from kmem allocations of a bad job.  To me it seems natural to
>> indicate that job X gets Y MB of memory.  I have more trouble dividing the
>> Y MB of memory into dedicated slices for different types of memory.
>
> I understand. And I don't think anyone doing containers
> should be mandated to define a division. But a limit...

SGTM, so long as it is an optional kernel memory limit (see below).

>> I think
>> memcg aware slab accounting does a good job of limiting a job's
>> memory allocations.
>> Would such slab accounting meet your needs?
>
> Well, the slab alone, no. There are other objects - like tcp buffers
> - that aren't covered by the slab. Others are usually shared among
> many cgroups, and others don't really belong to anybody in
> particular.

Sorry, I am dumb wrt. networking.  I thought that these tcp buffers
were allocated using slab with __alloc_skb().  Are they done in
container process context or arbitrary context?

> How do you think then, about turning this into 2 files inside memcg:
>
>  - kernel_memory_hard_limit.
>  - kernel_memory_soft_limit.
>
> tcp memory would be the one defined in /proc, except if it is
> greater than any of the limits. Instead of testing for memory
> allocation against kmem.tcp_allocated_memory, we'd test it against
> memcg.kmem_pinned_memory.

memcg control files currently include:
  memory.limit_in_bytes
  memory.soft_limit_in_bytes
  memory.usage_in_bytes

If you want a kernel memory limit, then we can introduce three new memcg APIs:
  memory.kernel_limit_in_bytes
  memory.kernel_soft_limit_in_bytes
  memory.kernel_usage_in_bytes

Any memory charged to memory.kernel_limit_in_bytes would also be
charged to memory.limit_in_bytes.  IOW, memory.limit_in_bytes would
include both user pages (as it does today) and some kernel pages.

With these new files there are two cgroup memory limits: total and
kernel.  The total limit is a combination of user and some kinds of
kernel memory.

Kernel page reclaim would occur if memory.kernel_usage_in_bytes
exceeds memory.kernel_limit_in_bytes.  There would be no need to
reclaim user pages in this situation.  If kernel usage exceeds
kernel_soft_limit_in_bytes, then the protocol pressure flags would be
set.

Memcg-wide page reclaim would occur if memory.usage_in_bytes exceeds
memory.limit_in_bytes.  This reclaim would consider either container
kernel memory or user pages associated with the container.

This would be a behavioral memcg change which would need to be
carefully considered.  Today the limit_in_bytes is just a user byte
count that does not include kernel memory.
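
As a usage illustration only - the memory.kernel_* files above are a
proposal, they do not exist in today's kernel, and the cgroup mount
point is an assumption - the 100M container example from earlier in
the thread could then be configured from userspace roughly like this:

#include <stdio.h>
#include <stdlib.h>

/* Illustration only: memory.kernel_limit_in_bytes is proposed, not existing. */
static void set_limit(const char *path, long long bytes)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(1);
	}
	fprintf(f, "%lld\n", bytes);
	fclose(f);
}

int main(void)
{
	/* total (user + some kinds of kernel) limit: 100M */
	set_limit("/cgroup/job0/memory.limit_in_bytes", 100LL << 20);
	/* proposed: cap kernel memory at 70M of that total */
	set_limit("/cgroup/job0/memory.kernel_limit_in_bytes", 70LL << 20);
	return 0;
}
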
Paul Menage Sept. 12, 2011, 5:03 a.m. UTC | #19
On Thu, Sep 8, 2011 at 9:17 PM, Glauber Costa <glommer@parallels.com> wrote:
> On 09/08/2011 06:53 PM, Greg Thelen wrote:
>>
>> On Wed, Sep 7, 2011 at 9:44 PM, Glauber Costa<glommer@parallels.com>
>>  wrote:
>>
>> Thanks for your ideas and patience.
>
> Likewise. It is turning out to be a very fruitful
> discussion.
>

This also got a fair bit of attention at LPC on Thursday - shame you
couldn't make it, but I hope you had no problems getting your visa!

I definitely think that there was no consensus reached on unified
versus split charging - but I think that we can work around that and
keep everyone happy, see below.

>
> Risking being a bit polemic here, I think that when we do
> containers, we have to view the kernel a little bit like a shared resource
> that is not accounted to anybody. It is easy to account things like tcp
> buffer, but who do you account page tables for shared pages to? Or pinned
> dentries for shared filesystems ?

I'd say that if filesystems (or page tables) are shared between
containers / cgroups, then it's fair to do first-touch accounting. But
if a filesystem is shared between multiple processes in just one
container / cgroup then it should definitely be charged to that cgroup
(which you get automatically with first-touch accounting).

It might be reasonable to also allow the sysadmin to request a more
expensive form of accounting (probably either charge-all or
equal-share), if the code to support that didn't interfere with the
performance of the normal first-touch charging mechanism, or
complicate the code too much.

On the subject of filesystems specifically, see Greg Thelen's proposal
for using bind mounts to account on a bind mount to a given cgroup -
that could apply to dentries, page tables and other kernel memory as
well as page cache.

> Good that it scares you, it should. OTOH, userspace being
> able to set parameters to it, has nothing scary at all. A daemon in
> userspace can detect that you need more kernel space memory , and
> then - according to a policy you abide to - write to a file allowing
> it more, or maybe not - according to that same policy. It is very
> far away from "userspace driven reclaim".

It can do that, but it's a bit messy when all userspace really wants
to do is say "limit the total to X GB".


> No, I don't necessarily feel that need. I just thought it was
> cleaner to have entities with different purposes in different
> cgroups. If moving it to the memory controller would help you in any
> way, I can just do it. 80 % of this work is independent of where a
> cgroup file lives.

My feeling is also that it's more appropriate for this to live in
memcg. While the ability to mount subsystems separately gives nice
flexibility, in the case of userspace versus kernel memory, I'm having
trouble envisaging a realistic situation where you'd want them mounted
on different hierarchies.

>  - kernel_memory_hard_limit.
>  - kernel_memory_soft_limit.
>
> tcp memory would be the one defined in /proc, except if it is
> greater than any of the limits. Instead of testing for memory
> allocation against kmem.tcp_allocated_memory, we'd test it against
> memcg.kmem_pinned_memory.
>

This is definitely an improvement, but I'd say it's not enough. I
think we should consider something like:

- a root cgroup file (memory.unified?) that controls whether memory
limits are unified or split.
- this file can only be modified when there are no memory cgroups
created (other than the root, which we shouldn't be charging, I think)
- the 'active' control determines whether (all) child cgroups will
have  memory.{limit,usage}_in_bytes files, or
memory.{kernel,user}_{limit,usage}_in_bytes files
- kernel memory will be charged either against 'kernel' or 'total'
depending on the value of unified
- userspace memory will be charged either against 'user' or 'total'

That way the kernel doesn't force a unified or split decision on
anyone, with I think zero performance hit and not much extra code.

You could even take this a step further. The following is probably
straying into the realm of overly-flexible, but I'll throw it into the
discussion anyway:

- define counters for various types of memory (e.g. total, user,
kernel, network, tcp, pagetables, dentries, etc)

- document what kind of memory is potentially charged against each
counter. (e.g. TCP buffers would be potentially charged against 'tcp',
'network', 'kernel' and 'total')

- have a memory.counters root cgroup file that allows the user to
specify which of the available counters are active. (only writeable
when no child cgroups, to avoid inconsistencies caused by changes
while memory is charged)

- default memory.counters to 'total'

- when memory.counters is updated, the memcg code builds the set of
targets for each chargeable entity. (e.g. if you enabled 'network' and
'total', then we'd update the charge_tcp array to contain something
like { &network_counter, &total_counter, NULL })

- charging for an object (TCP allocation, pagecache page, pagetable,
dentry, etc) just involves (trying to) charge all the counters set up
in that object's array. Since these arrays will be immutable once
child cgroups are being charged, they should be fairly well predicted.

This would give complete flexibility to userspace on the policy for
charging and reporting without incurring performance overhead for
systems that didn't care about some particular counter. (And without
complicating the code too much, I think).
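
To make the last two points concrete, here is a minimal sketch of the
per-object charge array - all names are invented for illustration and
this is not existing memcg code:

#include <linux/atomic.h>
#include <linux/types.h>

/* Hypothetical sketch of the "charge all counters in the array" idea. */
struct mem_counter {
	atomic_long_t	usage;
	long		limit;
};

/*
 * Rebuilt when memory.counters is written; e.g. enabling 'network' and
 * 'total' would make this { &network_counter, &total_counter, NULL }.
 */
static struct mem_counter *charge_tcp[4];

static bool charge_counters(struct mem_counter **targets, long nr_bytes)
{
	struct mem_counter **c;

	for (c = targets; *c; c++) {
		if (atomic_long_add_return(nr_bytes, &(*c)->usage) >
		    (*c)->limit)
			goto unwind;
	}
	return true;

unwind:
	/* undo the partial charges, including the one that went over */
	do {
		atomic_long_sub(nr_bytes, &(*c)->usage);
	} while (c-- != targets);
	return false;
}

A TCP buffer allocation would then boil down to
charge_counters(charge_tcp, size); since the array is immutable once
child cgroups are being charged, the loop stays short and well
predicted.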

Paul
Glauber Costa Sept. 12, 2011, 4:30 p.m. UTC | #20
>>> With some more knobs such a uswapd could
>>> attempt to keep ahead of demand.  But eventually direct reclaim would
>>> be needed to satisfy rapid growth spikes.  Example: If the 100M container
>>> starts with limits of 20M kmem and 80M user memory but later its kernel
>>> memory needs grow to 70M.  With separate user and kernel memory
>>> limits the kernel memory allocation could fail despite there being
>>> reclaimable user pages available.
>>
>> No no, this is a ratio, not a *limit*. A limit is something you
>> should not be allowed to go over. A good limit of kernel memory for
>> a 100 Mb container could be something like... 100 mb. But there is
>> more to that.
>>
>> Risking being a bit polemic here, I think that when we do
>> containers, we have to view the kernel a little bit like a shared resource
>> that is not accounted to anybody. It is easy to account things like tcp
>> buffer, but who do you account page tables for shared pages to? Or pinned
>> dentries for shared filesystems ?
>
> This is a tough challenge.  If you don't account page tables, then
> fork bombs can grab lots of kernel memory.  Containing such pigs is
> good.  We are working on some patches that charge page tables to the
> memcg they are allocated from.  This charge is checked against the
> memcg's limit and thus the page table allocation may fail due if over
> limit.
>
> I suggest charging dentries to the cgroup that allocated the dentry.
> If containers are used for isolation, then it seems that containers
> should typically not share files.  That would break isolation.  Of
> course there are shared files (/bin/bash, etc), but at least those are
> read-only.

You mean first allocated? I actually considered that a while ago.
Especially with page tables, to whom do you account shared ptes?
If we're doing first allocated, one way to fork bomb - although
clearly harder - is to have a cgroup mapping ptes that were already
mapped before by other cgroups. Part of them is trivial to guess;
others can be guessed more cleverly from the type of workload
you expect other containers in the box to be running.

This way you map a lot, and nothing is charged to you.
With dentries, first allocated might be possible if you assume files
are not shared, indeed... there are some shared use cases for exported
directories... hmm, but maybe there are ways to contain those cases
as well.

>
>> Being the shared table under everybody, the kernel is more or less
>> like buffers inside a physical hdd, or cache lines. You know it is
>> there, you know it has a size, but provided you have some sane
>> protections, you don't really care - because in most cases you can't
>> - who is using it.
>
> I agree that it makes sense to not bother charging fixed sized
> resources (struct page, etc.) to containers; they are part of the
> platform.  But the set of resources that a process can allocate are
> ideally charged to a cgroup to limit container memory usage (i.e.
> prevent DoS attacks, isolate performance).
>
Agreed.

>>> The job should have a way to
>>> transition to memory limits to 70M+ kernel and 30M- of user.
>>
>> Yes, and I don't see how what I propose prevents that.
>
> I don't think your current proposal prevents that.  I was thinking
> about the future of the proposed kmem cgroup.  I want to make sure the
> kernel has a way to apply reclaim pressure bidirectionally between
> cgroup kernel memory and cgroup user memory.

I see.

>>> If kmem expands to include reclaimable kernel memory (e.g. dentry) then I
>>> presume the kernel would have no way to exchange unused user pages for
>>> dentry
>>> pages even if the user memory in the container is well below its limit.
>>>   This is
>>> motivation for the above user assisted direct reclaim.
>>
>> Dentry is not always reclaimable. If it is pinned, it is non
>> reclaimable. Speaking of it, Would you take a look at
>> https://lkml.org/lkml/2011/8/14/110 ?
>>
>> I am targetting dentry as well. But since it is hard to assign a
>> dentry to a process all the time, going through a different path. I
>> however, haven't entirely given up of doing it cgroups based, so any
>> ideas are welcome =)
>
> My hope is that dentry consumption can be effectively limited by
> limiting the memory needed to allocate the dentries.  IOW: with a
> memcg aware slab allocator which, when possible, charges the calling
> process' cgroup.

If you have patches for that, even early patches, I'd like to take a 
look at them.

>> Well, I am giving this an extra thought... Having separate knobs
>> adds flexibility, but - as usual - also complexity. For the goals I
>> have in mind, "kernel memory" would work just as fine.
>>
>> If you look carefully at the other patches in the series besides
>> this one, you'll see that it is just a matter of billing from kernel
>> memory instead of tcp-memory, and then all the rest is the same.
>>
>> Do you think that a single kernel-memory knob would be better for
>> your needs? I am willing to give it a try.
>
> Regarding the tcp buffers we're discussing, how does an application
> consume lots of buffer memory?  IOW, what would a malicious app do to
> cause a grows of tcp buffer usage?  Or is this the kind of resource
> that is difficult to directly exploit?
>
> Also, how does the pressure get applied.  I see you have a
> per-protocol, per-cgroup pressure pressure setting.  Once set, how
> does this pressure cause memory to be freed?  It looks like the
> pressure is set when allocating memory and later when packets are
> freed the associated memory _might_ be freed if pressure was
> previously detected.  Did I get this right?

See below:

> For me the primary concern is that both user and kernel memory are
> eventually charged to the same counter with an associated limit.
> Per-container memory pressure is based on this composite limit.  So I
> don't have a strong opinion as to how many kernel memory counters
> there are so long as they also feed into container memory usage
> counter.

Okay.

>
>>> I think
>>> memcg aware slab accounting does a good job of limiting a job's
>>> memory allocations.
>>> Would such slab accounting meet your needs?
>>
>> Well, the slab alone, no. There are other objects - like tcp buffers
>> - that aren't covered by the slab. Others are usually shared among
>> many cgroups, and others don't really belong to anybody in
>> particular.
>
> Sorry, I am dumb wrt. networking.  I thought that these tcp buffers
> were allocated using slab with __alloc_skb().  Are they done in
> container process context or arbitrary context?

You are right. They come from the slab with alloc_skb. But consider the 
following code, from tcp.c:

         skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
         if (skb) {
                 if (sk_wmem_schedule(sk, skb->truesize)) {
                         /*
                          * Make sure that we have exactly size bytes
                          * available to the caller, no more, no less.
                          */
                         skb_reserve(skb, skb_tailroom(skb) - size);
                         return skb;
                 }
                 __kfree_skb(skb);
         } else {
		...
	}

Limiting the slab may be enough to prevent kernel memory abuse (sorry, I
should have made this clear from the start). However, even if the slab
allocation succeeds, the skb can end up being freed right away because
sk_wmem_schedule() fails - for instance because someone else
is starving the network.

Of course, my patchset also limits kernel memory usage. But I agree that 
if we do slab accounting, only the network specific part remains.

>> How do you think then, about turning this into 2 files inside memcg:
>>
>>   - kernel_memory_hard_limit.
>>   - kernel_memory_soft_limit.
>>
>> tcp memory would be the one defined in /proc, except if it is
>> greater than any of the limits. Instead of testing for memory
>> allocation against kmem.tcp_allocated_memory, we'd test it against
>> memcg.kmem_pinned_memory.
>
> memcg control files currently include:
>    memory.limit_in_bytes
>    memory.soft_limit_in_bytes
>    memory.usage_in_bytes
>
> If you want a kernel memory limit, then we can introduce two new memcg APIs:
>    memory.kernel_limit_in_bytes
>    memory.kernel_soft_limit_in_bytes
>    memory.kernel_usage_in_bytes
>
> Any memory charged to memory.kernel_limit_in_bytes would also be
> charged to memory.limit_in_bytes.  IOW, memory.limit_in_bytes would
> include both user pages (as it does today) and some kernel pages.
>
> With these new files there are two cgroup memory limits: total and
> kernel.  The total limit is a combination of user and some kinds of
> kernel memory.

Nice.

> Kernel page reclaim would occur if memory.kernel_usage_in_bytes
> exceeds memory.kernel_limit_in_bytes.  There would be no need to
> reclaim user pages in this situation.  If kernel usage exceeds
> kernel_soft_limit_in_bytes, then the protocol pressure flags would be
> set.

SGTM. ATLAIJI as well.

> Memcg-wide page reclaim would occur if memory.usage_in_bytes exceeds
> memory.limit_in_bytes.  This reclaim would consider either container
> kernel memory or user pages associated with the container.
>
> This would be a behavioral memcg change which would need to be
> carefully considered.  Today the limit_in_bytes is just a user byte
> count that does not include kernel memory.

This makes me wonder whether maybe I should just add a tcp-specific file to 
memcg (so all memory-related information goes in the same place), but
not account this memory as kernel memory. When you have slab-aware 
memcg, let it handle the accounting.

Alternatively, I could start doing basic accounting to pave the way, and 
later on this accounting gets removed in favor of the slab-based
one.

What do you think?
Glauber Costa Sept. 12, 2011, 4:57 p.m. UTC | #21
On 09/12/2011 02:03 AM, Paul Menage wrote:
> On Thu, Sep 8, 2011 at 9:17 PM, Glauber Costa<glommer@parallels.com>  wrote:
>> On 09/08/2011 06:53 PM, Greg Thelen wrote:
>>>
>>> On Wed, Sep 7, 2011 at 9:44 PM, Glauber Costa<glommer@parallels.com>
>>>   wrote:
>>>
>>> Thanks for your ideas and patience.
>>
>> Likewise. It is turning out to be a very fruitful
>> discussion.
>>
>
> This also got a fair bit of attention at LPC on Thursday - shame you
> couldn't make it, but I hope you had no problems getting your visa!

Apart from the trip to the nearest consulate and 4 hours in a line, no 
problems =) I can now legally enter the United States - so wait for me 
next time!

>
> I definitely think that there was no consensus reached on unified
> versus split charging - but I think that we can work around that and
> keep everyone happy, see below.

I think at this point there is at least consensus that this could very 
well live in memcg, right?
>
>>
>> Risking being a bit polemic here, I think that when we do
>> containers, we have to view the kernel a little bit like a shared resource
>> that is not accounted to anybody. It is easy to account things like tcp
>> buffer, but who do you account page tables for shared pages to? Or pinned
>> dentries for shared filesystems ?
>
> I'd say that if filesystems (or page tables) are shared between
> containers / cgroups, then it's fair to do first-touch accounting. But
> if a filesystem is shared between multiple processes in just one
> container / cgroup then it should definitely be charged to that cgroup
> (which you get automatically with first-touch accounting).
>
> It might be reasonable to also allow the sysadmin to request a more
> expensive form of accounting (probably either charge-all or
> equal-share), if the code to support that didn't interfere with the
> performance of the normal first-touch charging mechanism, or
> complicate the code too much.

I personally think equal share is a nightmare. But charge-all for shared 
resources is quite acceptable, even as a default, if people are not 
heavily concerned about precise accounting - in the sense that all 
containers should sum up to one (well, I am not).

> On the subject of filesystems specifically, see Greg Thelen's proposal
> for using bind mounts to account on a bind mount to a given cgroup -
> that could apply to dentries, page tables and other kernel memory as
> well as page cache.

Care to point me to it?

>> Good that it scares you, it should. OTOH, userspace being
>> able to set parameters to it, has nothing scary at all. A daemon in
>> userspace can detect that you need more kernel space memory , and
>> then - according to a policy you abide to - write to a file allowing
>> it more, or maybe not - according to that same policy. It is very
>> far away from "userspace driven reclaim".
>
> It can do that, but it's a bit messy when all userspace really wants
> to do is say "limit the total to X GB".
>
>
>> No, I don't necessarily feel that need. I just thought it was
>> cleaner to have entities with different purposes in different
>> cgroups. If moving it to the memory controller would help you in any
>> way, I can just do it. 80 % of this work is independent of where a
>> cgroup file lives.
>
> My feeling is also that it's more appropriate for this to live in
> memcg. While the ability to mount subsystems separately gives nice
> flexibility, in the case of userspace versus kernel memory, I'm having
> trouble envisaging a realistic situation where you'd want them mounted
> on different hierarchies.

Right.
>>   - kernel_memory_hard_limit.
>>   - kernel_memory_soft_limit.
>>
>> tcp memory would be the one defined in /proc, except if it is
>> greater than any of the limits. Instead of testing for memory
>> allocation against kmem.tcp_allocated_memory, we'd test it against
>> memcg.kmem_pinned_memory.
>>
>
> This is definitely an improvement, but I'd say it's not enough. I
> think we should consider something like:
One step at a time =)

>
> - a root cgroup file (memory.unified?) that controls whether memory
> limits are unified or split.
> - this file can only be modified when there are no memory cgroups
> created (other than the root, which we shouldn't be charging, I think)

Okay.

> - the 'active' control determines whether (all) child cgroups will
> have  memory.{limit,usage}_in_bytes files, or
> memory.{kernel,user}_{limit,usage}_in_bytes files
> - kernel memory will be charged either against 'kernel' or 'total'
> depending on the value of unified

You mean for display/pressure purposes, right? Internally, I think once 
we have kernel memory, we always charge it to kernel memory, regardless 
of anything else. The value in the unified field would only come into 
play when we need to read the value back.

I don't personally see a reason for not having all files present at all 
times.

> - userspace memory will be charged either against 'user' or 'total'
>
> That way the kernel doesn't force a unified or split decision on
> anyone, with I think zero performance hit and not much extra code.

Yes, I think not forcing it is not only an interesting way of settling 
this, but also an interesting feature.

> You could even take this a step further. The following is probably
> straying into the realm of overly-flexible, but I'll throw it into the
> discussion anyway:
>
> - define counters for various types of memory (e.g. total, user,
> kernel, network, tcp, pagetables, dentries, etc)

Hummm, I like it, but I also fear that in the future we'll hit some 
resource whose origin is hard to track... /me trying to think of such a
potential monster...

It is overly flexible if we're exposing these counters and expecting the 
user to do anything with them. It is perfectly fine if a single file, 
when read, displays this information as statistics.


> - document what kind of memory is potentially charged against each
> counter. (e.g. TCP buffers would be potentially charged against 'tcp',
> 'network', 'kernel' and 'total')

That, I think, is overly flexible. I think what I said above makes sense 
for very well defined entities such as the slab. If we have something that 
can be accounted to more than one general (kernel/user) and one specific 
(slab) entity, we're getting too complex.

> - have a memory.counters root cgroup file that allows the user to
> specify which of the available counters are active. (only writeable
> when no child cgroups, to avoid inconsistencies caused by changes
> while memory is charged)
>
> - default memory.counters to 'total'
>
> - when memory.counters is updated, the memcg code builds the set of
> targets for each chargeable entity. (e.g. if you enabled 'network' and
> 'total', then we'd update the charge_tcp array to contain something
> like {&network_counter,&total_counter, NULL })
>
> - charging for an object (TCP allocation, pagecache page, pagetable,
> dentry, etc) just involves (trying to) charge all the counters set up
> in that object's array. Since these arrays will be immutable once
> child cgroups are being charged, they should be fairly well predicted.
>
> This would give complete flexibility to userspace on the policy for
> charging and reporting without incurring performance overhead for
> systems that didn't care about some particular counter. (And without
> complicating the code too much, I think).

While I do think it is overly complicated, I think that a sound use case 
for this would bring it to the "complicated" realm only. It sounds like 
a nice feature if there are users for it.

Not only for containers - which tend to be generic in most cases - but 
maybe people enclosing services in isolated cgroups can come up with 
profiles that will, for instance, disallow (or allow very little) disk 
activity on a network-profiled cgroup, and vice versa.

I am adding Lennart to this discussion, since if there is anyone crazy 
enough to come up with an interesting use case for something strange, 
it'll be systemd.
Greg Thelen Sept. 13, 2011, 6:56 a.m. UTC | #22
On Mon, Sep 12, 2011 at 9:57 AM, Glauber Costa <glommer@parallels.com> wrote:
> On 09/12/2011 02:03 AM, Paul Menage wrote:
>> I definitely think that there was no consensus reached on unified
>> versus split charging - but I think that we can work around that and
>> keep everyone happy, see below.
>
> I think at this point there is at least consensus that this could very well
> live in memcg, right ?

Yes, I think it should live in memcg.

>> On the subject of filesystems specifically, see Greg Thelen's proposal
>> for using bind mounts to account on a bind mount to a given cgroup -
>> that could apply to dentries, page tables and other kernel memory as
>> well as page cache.
>
> Care to point me to it ?

http://marc.info/?t=127749867100004&r=1&w=2
Paul Menage Sept. 13, 2011, 6:09 p.m. UTC | #23
On Mon, Sep 12, 2011 at 9:57 AM, Glauber Costa <glommer@parallels.com> wrote:
>
> I think at this point there is at least consensus that this could very well
> live in memcg, right ?

Yes, it looks that way.

>> This is definitely an improvement, but I'd say it's not enough. I
>> think we should consider something like:
>
> One step at a time =)

Yes, as far as design and initial implementation goes - but the full
plan has to be figured out before anything gets committed to mainline,
given the stability guarantees that implies.

>> - the 'active' control determines whether (all) child cgroups will
>> have  memory.{limit,usage}_in_bytes files, or
>> memory.{kernel,user}_{limit,usage}_in_bytes files
>> - kernel memory will be charged either against 'kernel' or 'total'
>> depending on the value of unified
>
> You mean for display/pressure purposes, right? Internally, I think once we
> have kernel memory, we always charge it to kernel memory, regardless of
> anything else. The value in unified field will only take place when we need
> to grab this value.
>
> I don't personally see a reason for not having all files present at all
> times.

There's pretty much only one reason - avoiding the overhead of
maintaining multiple counters.

Each set of counters (user, kernel, total) will have its own locks,
contention and other overheads to keep up to date. If userspace
doesn't care about one or two of the three, then that's mostly wasted.

Now it might be that the accounting of all three can be done with
little more overhead than that required to update just a split view or
just a unified view, in which case there's much less argument against
simplifying and tracking/charging/limiting all three.

>
> It is overly flexible if we're exposing these counters and expecting the
> user to do anything with them. It is perfectly fine if a single file, when
> read, displays this information as statistics.
>

When I proposed this, I guess I was envisioning that most of the
counters (e.g. things like TCP buffers or general network buffers)
would be primarily for stats, since the admin probably only cares
about total memory usage.

The main point of this was to allow people who want to do something
like tracking/limiting TCP buffer usage specifically per-cgroup to do
so, without having any performance impact on the regular users.

Paul
Glauber Costa Sept. 13, 2011, 6:11 p.m. UTC | #24
On 09/13/2011 03:09 PM, Paul Menage wrote:
> Each set of counters (user, kernel, total) will have its own locks,
> contention and other overheads to keep up to date. If userspace
> doesn't care about one or two of the three, then that's mostly wasted.
>
> Now it might be that the accounting of all three can be done with
> little more overhead than that required to update just a split view or
> just a unified view, in which case there's much less argument against
> simplifying and tracking/charging/limiting all three.
What if they are all updated under the same lock?
The lock argument is perfectly valid for accounting vs not accounting 
kernel memory. But once it is accounted, which counter we account to, I 
think, is less of a problem.
Paul Menage Sept. 13, 2011, 6:46 p.m. UTC | #25
On Tue, Sep 13, 2011 at 11:11 AM, Glauber Costa <glommer@parallels.com> wrote:
>
> What if they are all updated under the same lock ?

Right, that would be the kind of optimization that would remove the
need for worrying about whether or not to account it. It would
probably mean creating some memcg-specific structures like
res-counters that could handle multiple values, since you'd need to
update both the kernel charge and the total charge, in this cgroup
*and* its ancestors.
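
A rough sketch of what such a multi-value counter could look like -
names are invented, this is not the existing res_counter API - with
the kernel and total charges for a cgroup and its ancestors updated
under one lock per level:

#include <linux/errno.h>
#include <linux/spinlock.h>

/* Hypothetical sketch - not the existing res_counter interface. */
struct multi_counter {
	spinlock_t		lock;
	unsigned long long	kernel_usage;
	unsigned long long	total_usage;
	unsigned long long	total_limit;
	struct multi_counter	*parent;
};

static int multi_counter_charge_kernel(struct multi_counter *cnt,
				       unsigned long long val)
{
	struct multi_counter *c;

	for (c = cnt; c; c = c->parent) {
		spin_lock(&c->lock);
		if (c->total_usage + val > c->total_limit) {
			spin_unlock(&c->lock);
			goto undo;
		}
		/* one lock per level, two values move together */
		c->kernel_usage += val;
		c->total_usage += val;
		spin_unlock(&c->lock);
	}
	return 0;

undo:
	for (; cnt != c; cnt = cnt->parent) {
		spin_lock(&cnt->lock);
		cnt->kernel_usage -= val;
		cnt->total_usage -= val;
		spin_unlock(&cnt->lock);
	}
	return -ENOMEM;
}

A per-level kernel limit check could go in the same critical section;
the point is only that keeping separate kernel and total values does
not have to mean taking more locks than a single unified counter
would.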

Paul
Glauber Costa Sept. 13, 2011, 7:08 p.m. UTC | #26
On 09/13/2011 03:46 PM, Paul Menage wrote:
> On Tue, Sep 13, 2011 at 11:11 AM, Glauber Costa<glommer@parallels.com>  wrote:
>>
>> What if they are all updated under the same lock ?
>
> Right, that would be the kind of optimization that would remove the
> need for worrying about whether or not to account it. It would
> probably mean creating some memcg-specific structures like
> res-counters that could handle multiple values, since you'd need to
> update both the kernel charge and the total charge, in this cgroup
> *and* its ancestors.
>
> Paul
If we do that, we may have to commit to an intermediary user interface - 
with controls to determine if kernel memory is billed to kernel or 
total, an enable/disable file - just to later render it pointless by a new 
optimization that we seem to agree is possible.

I think it is preferable to always assume kernel memory is accounted to 
the kernel, and when we optimize it, no changes are made to what's 
exposed to userspace.
diff mbox

Patch

Without the patch
=================

Socket Size   Request  Resp.   Elapsed  Trans.
Send   Recv   Size     Size    Time     Rate
bytes  Bytes  bytes    bytes   secs.    per sec

16384  87380  1        1       10.00    26996.35
16384  87380

With the patch
===============

Local /Remote
Socket Size   Request  Resp.   Elapsed  Trans.
Send   Recv   Size     Size    Time     Rate
bytes  Bytes  bytes    bytes   secs.    per sec

16384  87380  1        1       10.00    27291.86
16384  87380

The difference is within a one-percent range.

Nesting cgroups doesn't seem to be the dominating factor as well,
with nestings up to 10 levels not showing a significant performance
difference.

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: David S. Miller <davem@davemloft.net>
CC: Hiroyouki Kamezawa <kamezawa.hiroyu@jp.fujitsu.com>
CC: Eric W. Biederman <ebiederm@xmission.com>
---
 crypto/af_alg.c               |    7 ++-
 include/linux/cgroup_subsys.h |    4 +
 include/net/netns/ipv4.h      |    1 +
 include/net/sock.h            |   66 +++++++++++++++-
 include/net/tcp.h             |   12 ++-
 include/net/udp.h             |    3 +-
 include/trace/events/sock.h   |   10 +-
 init/Kconfig                  |   11 +++
 mm/Makefile                   |    1 +
 net/core/sock.c               |  136 +++++++++++++++++++++++++++-------
 net/decnet/af_decnet.c        |   21 +++++-
 net/ipv4/proc.c               |    8 +-
 net/ipv4/sysctl_net_ipv4.c    |   59 +++++++++++++--
 net/ipv4/tcp.c                |  164 +++++++++++++++++++++++++++++++++++-----
 net/ipv4/tcp_input.c          |   17 ++--
 net/ipv4/tcp_ipv4.c           |   27 +++++--
 net/ipv4/tcp_output.c         |    2 +-
 net/ipv4/tcp_timer.c          |    2 +-
 net/ipv4/udp.c                |   20 ++++-
 net/ipv6/tcp_ipv6.c           |   16 +++-
 net/ipv6/udp.c                |    4 +-
 net/sctp/socket.c             |   35 +++++++--
 22 files changed, 514 insertions(+), 112 deletions(-)

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index ac33d5f..df168d8 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -29,10 +29,15 @@  struct alg_type_list {
 
 static atomic_long_t alg_memory_allocated;
 
+static atomic_long_t *memory_allocated_alg(struct kmem_cgroup *sg)
+{
+	return &alg_memory_allocated;
+}
+
 static struct proto alg_proto = {
 	.name			= "ALG",
 	.owner			= THIS_MODULE,
-	.memory_allocated	= &alg_memory_allocated,
+	.memory_allocated	= memory_allocated_alg,
 	.obj_size		= sizeof(struct alg_sock),
 };
 
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ac663c1..363b8e8 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -35,6 +35,10 @@  SUBSYS(cpuacct)
 SUBSYS(mem_cgroup)
 #endif
 
+#ifdef CONFIG_CGROUP_KMEM
+SUBSYS(kmem)
+#endif
+
 /* */
 
 #ifdef CONFIG_CGROUP_DEVICE
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index d786b4f..bbd023a 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -55,6 +55,7 @@  struct netns_ipv4 {
 	int current_rt_cache_rebuild_count;
 
 	unsigned int sysctl_ping_group_range[2];
+	long sysctl_tcp_mem[3];
 
 	atomic_t rt_genid;
 	atomic_t dev_addr_genid;
diff --git a/include/net/sock.h b/include/net/sock.h
index 8e4062f..e085148 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -62,7 +62,9 @@ 
 #include <linux/atomic.h>
 #include <net/dst.h>
 #include <net/checksum.h>
+#include <linux/kmem_cgroup.h>
 
+int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
 /*
  * This structure really needs to be cleaned up.
  * Most of it is for TCP, and not used by any of
@@ -339,6 +341,7 @@  struct sock {
 #endif
 	__u32			sk_mark;
 	u32			sk_classid;
+	struct kmem_cgroup	*sk_cgrp;
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk, int bytes);
 	void			(*sk_write_space)(struct sock *sk);
@@ -786,16 +789,21 @@  struct proto {
 
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
-	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
-	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
+	/* Current allocated memory. */
+	atomic_long_t		*(*memory_allocated)(struct kmem_cgroup *sg);
+	/* Current number of sockets. */
+	struct percpu_counter	*(*sockets_allocated)(struct kmem_cgroup *sg);
+
+	int			(*init_cgroup)(struct cgroup *cgrp,
+					       struct cgroup_subsys *ss);
 	/*
 	 * Pressure flag: try to collapse.
 	 * Technical note: it is used by multiple contexts non atomically.
 	 * All the __sk_mem_schedule() is of this nature: accounting
 	 * is strict, actions are advisory and have some latency.
 	 */
-	int			*memory_pressure;
-	long			*sysctl_mem;
+	int			*(*memory_pressure)(struct kmem_cgroup *sg);
+	long			*(*prot_mem)(struct kmem_cgroup *sg);
 	int			*sysctl_wmem;
 	int			*sysctl_rmem;
 	int			max_header;
@@ -826,6 +834,56 @@  struct proto {
 #endif
 };
 
+#define sk_memory_pressure(sk)						\
+({									\
+	int *__ret = NULL;						\
+	if ((sk)->sk_prot->memory_pressure)				\
+		__ret = (sk)->sk_prot->memory_pressure(sk->sk_cgrp);	\
+	__ret;								\
+})
+
+#define sk_sockets_allocated(sk)				\
+({								\
+	struct percpu_counter *__p;				\
+	__p = (sk)->sk_prot->sockets_allocated(sk->sk_cgrp);	\
+	__p;							\
+})
+
+#define sk_memory_allocated(sk)					\
+({								\
+	atomic_long_t *__mem;					\
+	__mem = (sk)->sk_prot->memory_allocated(sk->sk_cgrp);	\
+	__mem;							\
+})
+
+#define sk_prot_mem(sk)						\
+({								\
+	long *__mem = (sk)->sk_prot->prot_mem(sk->sk_cgrp);	\
+	__mem;							\
+})
+
+#define sg_memory_pressure(prot, sg)				\
+({								\
+	int *__ret = NULL;					\
+	if (prot->memory_pressure)				\
+		__ret = (prot)->memory_pressure(sg);		\
+	__ret;							\
+})
+
+#define sg_memory_allocated(prot, sg)				\
+({								\
+	atomic_long_t *__mem;					\
+	__mem = (prot)->memory_allocated(sg);			\
+	__mem;							\
+})
+
+#define sg_sockets_allocated(prot, sg)				\
+({								\
+	struct percpu_counter *__p;				\
+	__p = (prot)->sockets_allocated(sg);			\
+	__p;							\
+})
+
 extern int proto_register(struct proto *prot, int alloc_slab);
 extern void proto_unregister(struct proto *prot);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 149a415..8e1ec4a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -230,7 +230,6 @@  extern int sysctl_tcp_fack;
 extern int sysctl_tcp_reordering;
 extern int sysctl_tcp_ecn;
 extern int sysctl_tcp_dsack;
-extern long sysctl_tcp_mem[3];
 extern int sysctl_tcp_wmem[3];
 extern int sysctl_tcp_rmem[3];
 extern int sysctl_tcp_app_win;
@@ -253,9 +252,12 @@  extern int sysctl_tcp_cookie_size;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 
-extern atomic_long_t tcp_memory_allocated;
-extern struct percpu_counter tcp_sockets_allocated;
-extern int tcp_memory_pressure;
+struct kmem_cgroup;
+extern long *tcp_sysctl_mem(struct kmem_cgroup *sg);
+struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg);
+int *memory_pressure_tcp(struct kmem_cgroup *sg);
+int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss);
+atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg);
 
 /*
  * The next routines deal with comparing 32 bit unsigned ints
@@ -286,7 +288,7 @@  static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
 	}
 
 	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
-	    atomic_long_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])
+	    atomic_long_read(sk_memory_allocated(sk)) > sk_prot_mem(sk)[2])
 		return true;
 	return false;
 }
diff --git a/include/net/udp.h b/include/net/udp.h
index 67ea6fc..0e27388 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -105,7 +105,8 @@  static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
 
 extern struct proto udp_prot;
 
-extern atomic_long_t udp_memory_allocated;
+atomic_long_t *memory_allocated_udp(struct kmem_cgroup *sg);
+long *udp_sysctl_mem(struct kmem_cgroup *sg);
 
 /* sysctl variables for udp */
 extern long sysctl_udp_mem[3];
diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index 779abb9..12a6083 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -37,7 +37,7 @@  TRACE_EVENT(sock_exceed_buf_limit,
 
 	TP_STRUCT__entry(
 		__array(char, name, 32)
-		__field(long *, sysctl_mem)
+		__field(long *, prot_mem)
 		__field(long, allocated)
 		__field(int, sysctl_rmem)
 		__field(int, rmem_alloc)
@@ -45,7 +45,7 @@  TRACE_EVENT(sock_exceed_buf_limit,
 
 	TP_fast_assign(
 		strncpy(__entry->name, prot->name, 32);
-		__entry->sysctl_mem = prot->sysctl_mem;
+		__entry->prot_mem = sk->sk_prot->prot_mem(sk->sk_cgrp);
 		__entry->allocated = allocated;
 		__entry->sysctl_rmem = prot->sysctl_rmem[0];
 		__entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
@@ -54,9 +54,9 @@  TRACE_EVENT(sock_exceed_buf_limit,
 	TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld "
 		"sysctl_rmem=%d rmem_alloc=%d",
 		__entry->name,
-		__entry->sysctl_mem[0],
-		__entry->sysctl_mem[1],
-		__entry->sysctl_mem[2],
+		__entry->prot_mem[0],
+		__entry->prot_mem[1],
+		__entry->prot_mem[2],
 		__entry->allocated,
 		__entry->sysctl_rmem,
 		__entry->rmem_alloc)
diff --git a/init/Kconfig b/init/Kconfig
index d627783..5955ac2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -690,6 +690,17 @@  config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
 	  select this option (if, for some reason, they need to disable it
 	  then swapaccount=0 does the trick).
 
+config CGROUP_KMEM
+	bool "Kernel Memory Resource Controller for Control Groups"
+	depends on CGROUPS
+	help
+	  The Kernel Memory cgroup can limit the amount of memory used by
+	  certain kernel objects in the system. These objects are
+	  fundamentally different from the entities handled by the Memory
+	  Controller, which are page-based and can be swapped. The kmem
+	  cgroup can be used to guarantee that no single group of processes
+	  ever exhausts kernel resources on its own.
+
 config CGROUP_PERF
 	bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
 	depends on PERF_EVENTS && CGROUPS
diff --git a/mm/Makefile b/mm/Makefile
index 836e416..1b1aa24 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -45,6 +45,7 @@  obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_KMEM) += kmem_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
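
The controller itself lives in the new include/linux/kmem_cgroup.h and
mm/kmem_cgroup.c, neither of which is shown in this diff. Judging only from
the fields and helpers the rest of the patch uses (tcp_max_memory,
tcp_prot_mem, tcp_memory_allocated, tcp_sockets_allocated,
tcp_memory_pressure, parent, css, kcg_from_task(), kcg_from_cgroup()), the
structure is presumably along these lines; this is a sketch, not the
patch's actual header:

	/* Sketch reconstructed from the accessors used elsewhere in this
	 * patch; the real include/linux/kmem_cgroup.h may differ. */
	struct kmem_cgroup {
		struct cgroup_subsys_state css;
		struct kmem_cgroup *parent;

		/* TCP accounting state, one instance per cgroup */
		u64 tcp_max_memory;		/* kmem.tcp_maxmem, in pages */
		long tcp_prot_mem[3];		/* per-cgroup view of tcp_mem */
		atomic_long_t tcp_memory_allocated;
		struct percpu_counter tcp_sockets_allocated;
		int tcp_memory_pressure;
	};

	static inline struct kmem_cgroup *kcg_from_cgroup(struct cgroup *cgrp)
	{
		return container_of(cgroup_subsys_state(cgrp, kmem_subsys_id),
				    struct kmem_cgroup, css);
	}

	static inline struct kmem_cgroup *kcg_from_task(struct task_struct *p)
	{
		return container_of(task_subsys_state(p, kmem_subsys_id),
				    struct kmem_cgroup, css);
	}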
diff --git a/net/core/sock.c b/net/core/sock.c
index 3449df8..2d968ea 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -134,6 +134,24 @@ 
 #include <net/tcp.h>
 #endif
 
+static DEFINE_RWLOCK(proto_list_lock);
+static LIST_HEAD(proto_list);
+
+int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	struct proto *proto;
+	int ret = 0;
+
+	read_lock(&proto_list_lock);
+	list_for_each_entry(proto, &proto_list, node) {
+		if (proto->init_cgroup)
+			ret |= proto->init_cgroup(cgrp, ss);
+	}
+	read_unlock(&proto_list_lock);
+
+	return ret;
+}
+
 /*
  * Each address family might have different locking rules, so we have
  * one slock key per address family:
@@ -1114,6 +1132,31 @@  void sock_update_classid(struct sock *sk)
 EXPORT_SYMBOL(sock_update_classid);
 #endif
 
+void sock_update_kmem_cgrp(struct sock *sk)
+{
+#ifdef CONFIG_CGROUP_KMEM
+	sk->sk_cgrp = kcg_from_task(current);
+
+	/*
+	 * We don't need to protect against anything task-related, because
+	 * we are basically stuck with the sock pointer that won't change,
+	 * even if the task that originated the socket changes cgroups.
+	 *
+	 * What we do have to guarantee is that the chain leading us to
+	 * the top level won't change under our noses. Incrementing the
+	 * reference count via cgroup_exclude_rmdir guarantees that.
+	 */
+	cgroup_exclude_rmdir(&sk->sk_cgrp->css);
+#endif
+}
+
+void sock_release_kmem_cgrp(struct sock *sk)
+{
+#ifdef CONFIG_CGROUP_KMEM
+	cgroup_release_and_wakeup_rmdir(&sk->sk_cgrp->css);
+#endif
+}
+
 /**
  *	sk_alloc - All socket objects are allocated here
  *	@net: the applicable net namespace
@@ -1139,6 +1182,7 @@  struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		atomic_set(&sk->sk_wmem_alloc, 1);
 
 		sock_update_classid(sk);
+		sock_update_kmem_cgrp(sk);
 	}
 
 	return sk;
@@ -1170,6 +1214,7 @@  static void __sk_free(struct sock *sk)
 		put_cred(sk->sk_peer_cred);
 	put_pid(sk->sk_peer_pid);
 	put_net(sock_net(sk));
+	sock_release_kmem_cgrp(sk);
 	sk_prot_free(sk->sk_prot_creator, sk);
 }
 
@@ -1287,8 +1332,8 @@  struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		sk_set_socket(newsk, NULL);
 		newsk->sk_wq = NULL;
 
-		if (newsk->sk_prot->sockets_allocated)
-			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
+		if (newsk->sk_prot->sockets_allocated)
+			percpu_counter_inc(sk_sockets_allocated(newsk));
 
 		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
 		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
@@ -1676,29 +1721,51 @@  EXPORT_SYMBOL(sk_wait_data);
  */
 int __sk_mem_schedule(struct sock *sk, int size, int kind)
 {
-	struct proto *prot = sk->sk_prot;
 	int amt = sk_mem_pages(size);
+	struct proto *prot = sk->sk_prot;
 	long allocated;
+	int *memory_pressure;
+	long *prot_mem;
+	int parent_failure = 0;
+	struct kmem_cgroup *sg;
 
 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
-	allocated = atomic_long_add_return(amt, prot->memory_allocated);
+
+	memory_pressure = sk_memory_pressure(sk);
+	prot_mem = sk_prot_mem(sk);
+
+	allocated = atomic_long_add_return(amt, sk_memory_allocated(sk));
+
+#ifdef CONFIG_CGROUP_KMEM
+	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
+		long alloc;
+		/*
+		 * Deep nestings are not the common case, and stopping the
+		 * charge midway would be complicated, so we bill all the way
+		 * up to the root and, if needed, unbill everything later.
+		 */
+		alloc = atomic_long_add_return(amt,
+					       sg_memory_allocated(prot, sg));
+		parent_failure |= (alloc > prot->prot_mem(sg)[2]);
+	}
+#endif
+
+	/* Over hard limit (we, or our parents) */
+	if (parent_failure || (allocated > prot_mem[2]))
+		goto suppress_allocation;
 
 	/* Under limit. */
-	if (allocated <= prot->sysctl_mem[0]) {
-		if (prot->memory_pressure && *prot->memory_pressure)
-			*prot->memory_pressure = 0;
+	if (allocated <= prot_mem[0]) {
+		if (memory_pressure && *memory_pressure)
+			*memory_pressure = 0;
 		return 1;
 	}
 
 	/* Under pressure. */
-	if (allocated > prot->sysctl_mem[1])
+	if (allocated > prot_mem[1])
 		if (prot->enter_memory_pressure)
 			prot->enter_memory_pressure(sk);
 
-	/* Over hard limit. */
-	if (allocated > prot->sysctl_mem[2])
-		goto suppress_allocation;
-
 	/* guarantee minimum buffer size under pressure */
 	if (kind == SK_MEM_RECV) {
 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
@@ -1712,13 +1779,13 @@  int __sk_mem_schedule(struct sock *sk, int size, int kind)
 				return 1;
 	}
 
-	if (prot->memory_pressure) {
+	if (memory_pressure) {
 		int alloc;
 
-		if (!*prot->memory_pressure)
+		if (!*memory_pressure)
 			return 1;
-		alloc = percpu_counter_read_positive(prot->sockets_allocated);
-		if (prot->sysctl_mem[2] > alloc *
+		alloc = percpu_counter_read_positive(sk_sockets_allocated(sk));
+		if (prot_mem[2] > alloc *
 		    sk_mem_pages(sk->sk_wmem_queued +
 				 atomic_read(&sk->sk_rmem_alloc) +
 				 sk->sk_forward_alloc))
@@ -1741,7 +1808,13 @@  suppress_allocation:
 
 	/* Alas. Undo changes. */
 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
-	atomic_long_sub(amt, prot->memory_allocated);
+
+	atomic_long_sub(amt, sk_memory_allocated(sk));
+
+#ifdef CONFIG_CGROUP_KMEM
+	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent)
+		atomic_long_sub(amt, sg_memory_allocated(prot, sg));
+#endif
 	return 0;
 }
 EXPORT_SYMBOL(__sk_mem_schedule);
@@ -1753,14 +1826,24 @@  EXPORT_SYMBOL(__sk_mem_schedule);
 void __sk_mem_reclaim(struct sock *sk)
 {
 	struct proto *prot = sk->sk_prot;
+	struct kmem_cgroup *sg = sk->sk_cgrp;
+	int *memory_pressure = sk_memory_pressure(sk);
 
 	atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
-		   prot->memory_allocated);
+		   sk_memory_allocated(sk));
+
+#ifdef CONFIG_CGROUP_KMEM
+	for (sg = sk->sk_cgrp->parent; sg != NULL; sg = sg->parent) {
+		atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
+						sg_memory_allocated(prot, sg));
+	}
+#endif
+
 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
 
-	if (prot->memory_pressure && *prot->memory_pressure &&
-	    (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
-		*prot->memory_pressure = 0;
+	if (memory_pressure && *memory_pressure &&
+	    (atomic_long_read(sk_memory_allocated(sk)) < sk_prot_mem(sk)[0]))
+		*memory_pressure = 0;
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
 
@@ -2252,9 +2335,6 @@  void sk_common_release(struct sock *sk)
 }
 EXPORT_SYMBOL(sk_common_release);
 
-static DEFINE_RWLOCK(proto_list_lock);
-static LIST_HEAD(proto_list);
-
 #ifdef CONFIG_PROC_FS
 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
 struct prot_inuse {
@@ -2479,13 +2559,17 @@  static char proto_method_implemented(const void *method)
 
 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
 {
+	struct kmem_cgroup *sg = kcg_from_task(current);
+
 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
 		   proto->name,
 		   proto->obj_size,
 		   sock_prot_inuse_get(seq_file_net(seq), proto),
-		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
-		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
+		   proto->memory_allocated != NULL ?
+			atomic_long_read(sg_memory_allocated(proto, sg)) : -1L,
+		   proto->memory_pressure != NULL ?
+			*sg_memory_pressure(proto, sg) ? "yes" : "no" : "NI",
 		   proto->max_header,
 		   proto->slab == NULL ? "no" : "yes",
 		   module_name(proto->owner),
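
sockets_populate() above walks the protocol list and calls each protocol's
->init_cgroup(), but its caller sits in the unshown mm/kmem_cgroup.c. Below
is a minimal sketch of how that file presumably registers the subsystem,
assuming the create/destroy/populate callbacks of this kernel generation;
only sockets_populate() and the kmem_subsys/kmem_subsys_id names implied by
SUBSYS(kmem) come from the patch, the rest is a guess:

	/* Sketch only: plausible mm/kmem_cgroup.c glue, not the patch's code. */
	static struct cgroup_subsys_state *
	kmem_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
	{
		struct kmem_cgroup *sg = kzalloc(sizeof(*sg), GFP_KERNEL);

		if (!sg)
			return ERR_PTR(-ENOMEM);
		if (cgrp->parent)
			sg->parent = kcg_from_cgroup(cgrp->parent);
		return &sg->css;
	}

	static void kmem_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
	{
		kfree(kcg_from_cgroup(cgrp));
	}

	static int kmem_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
	{
		/* Gives every registered protocol a chance to add its control
		 * files (e.g. tcp_maxmem) and init its per-cgroup counters. */
		return sockets_populate(ss, cgrp);
	}

	struct cgroup_subsys kmem_subsys = {
		.name		= "kmem",
		.create		= kmem_create,
		.destroy	= kmem_destroy,
		.populate	= kmem_populate,
		.subsys_id	= kmem_subsys_id,
	};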
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 19acd00..463b299 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -458,13 +458,28 @@  static void dn_enter_memory_pressure(struct sock *sk)
 	}
 }
 
+static atomic_long_t *memory_allocated_dn(struct kmem_cgroup *sg)
+{
+	return &decnet_memory_allocated;
+}
+
+static int *memory_pressure_dn(struct kmem_cgroup *sg)
+{
+	return &dn_memory_pressure;
+}
+
+static long *dn_sysctl_mem(struct kmem_cgroup *sg)
+{
+	return sysctl_decnet_mem;
+}
+
 static struct proto dn_proto = {
 	.name			= "NSP",
 	.owner			= THIS_MODULE,
 	.enter_memory_pressure	= dn_enter_memory_pressure,
-	.memory_pressure	= &dn_memory_pressure,
-	.memory_allocated	= &decnet_memory_allocated,
-	.sysctl_mem		= sysctl_decnet_mem,
+	.memory_pressure	= memory_pressure_dn,
+	.memory_allocated	= memory_allocated_dn,
+	.prot_mem		= dn_sysctl_mem,
 	.sysctl_wmem		= sysctl_decnet_wmem,
 	.sysctl_rmem		= sysctl_decnet_rmem,
 	.max_header		= DN_MAX_NSP_DATA_HEADER + 64,
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index b14ec7d..9c80acf 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -52,20 +52,22 @@  static int sockstat_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq->private;
 	int orphans, sockets;
+	struct kmem_cgroup *sg = kcg_from_task(current);
+	struct percpu_counter *allocated = sg_sockets_allocated(&tcp_prot, sg);
 
 	local_bh_disable();
 	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
-	sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
+	sockets = percpu_counter_sum_positive(allocated);
 	local_bh_enable();
 
 	socket_seq_show(seq);
 	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
 		   sock_prot_inuse_get(net, &tcp_prot), orphans,
 		   tcp_death_row.tw_count, sockets,
-		   atomic_long_read(&tcp_memory_allocated));
+		   atomic_long_read(sg_memory_allocated((&tcp_prot), sg)));
 	seq_printf(seq, "UDP: inuse %d mem %ld\n",
 		   sock_prot_inuse_get(net, &udp_prot),
-		   atomic_long_read(&udp_memory_allocated));
+		   atomic_long_read(sg_memory_allocated((&udp_prot), sg)));
 	seq_printf(seq, "UDPLITE: inuse %d\n",
 		   sock_prot_inuse_get(net, &udplite_prot));
 	seq_printf(seq, "RAW: inuse %d\n",
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 69fd720..5e89480 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -14,6 +14,8 @@ 
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/nsproxy.h>
+#include <linux/kmem_cgroup.h>
+#include <linux/swap.h>
 #include <net/snmp.h>
 #include <net/icmp.h>
 #include <net/ip.h>
@@ -174,6 +176,43 @@  static int proc_allowed_congestion_control(ctl_table *ctl,
 	return ret;
 }
 
+static int ipv4_tcp_mem(ctl_table *ctl, int write,
+			   void __user *buffer, size_t *lenp,
+			   loff_t *ppos)
+{
+	int ret;
+	unsigned long vec[3];
+	struct kmem_cgroup *kmem = kcg_from_task(current);
+	struct net *net = current->nsproxy->net_ns;
+	int i;
+
+	ctl_table tmp = {
+		.data = &vec,
+		.maxlen = sizeof(vec),
+		.mode = ctl->mode,
+	};
+
+	if (!write) {
+		ctl->data = &net->ipv4.sysctl_tcp_mem;
+		return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
+	}
+
+	ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < 3; i++)
+		if (vec[i] > kmem->tcp_max_memory)
+			return -EINVAL;
+
+	for (i = 0; i < 3; i++) {
+		net->ipv4.sysctl_tcp_mem[i] = vec[i];
+		kmem->tcp_prot_mem[i] = net->ipv4.sysctl_tcp_mem[i];
+	}
+
+	return 0;
+}
+
 static struct ctl_table ipv4_table[] = {
 	{
 		.procname	= "tcp_timestamps",
@@ -433,13 +472,6 @@  static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
-		.procname	= "tcp_mem",
-		.data		= &sysctl_tcp_mem,
-		.maxlen		= sizeof(sysctl_tcp_mem),
-		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax
-	},
-	{
 		.procname	= "tcp_wmem",
 		.data		= &sysctl_tcp_wmem,
 		.maxlen		= sizeof(sysctl_tcp_wmem),
@@ -721,6 +753,12 @@  static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= ipv4_ping_group_range,
 	},
+	{
+		.procname	= "tcp_mem",
+		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_mem),
+		.mode		= 0644,
+		.proc_handler	= ipv4_tcp_mem,
+	},
 	{ }
 };
 
@@ -734,6 +772,7 @@  EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
 static __net_init int ipv4_sysctl_init_net(struct net *net)
 {
 	struct ctl_table *table;
+	unsigned long limit;
 
 	table = ipv4_net_table;
 	if (!net_eq(net, &init_net)) {
@@ -769,6 +808,12 @@  static __net_init int ipv4_sysctl_init_net(struct net *net)
 
 	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
 
+	limit = nr_free_buffer_pages() / 8;
+	limit = max(limit, 128UL);
+	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
+	net->ipv4.sysctl_tcp_mem[1] = limit;
+	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
+
 	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
 			net_ipv4_ctl_path, table);
 	if (net->ipv4.ipv4_hdr == NULL)
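
For a rough sense of scale: assuming nr_free_buffer_pages() returns about
one million 4KB pages (roughly 4GB of lowmem), the per-namespace defaults
computed above work out as follows; the figures are illustrative only:

	/* Illustrative arithmetic only, for an assumed nr_free_buffer_pages()
	 * of 1,000,000 (4KB pages, ~4GB of lowmem). */
	unsigned long limit   = 1000000 / 8;	/* 125000 pages (~488MB)            */
	long tcp_mem_min      = limit / 4 * 3;	/*  93750 pages: no pressure below  */
	long tcp_mem_pressure = limit;		/* 125000 pages: pressure threshold */
	long tcp_mem_max      = tcp_mem_min * 2;/* 187500 pages (~732MB): hard cap  */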
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 46febca..e1918fa 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -266,6 +266,7 @@ 
 #include <linux/crypto.h>
 #include <linux/time.h>
 #include <linux/slab.h>
+#include <linux/nsproxy.h>
 
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -282,23 +283,12 @@  int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
-long sysctl_tcp_mem[3] __read_mostly;
 int sysctl_tcp_wmem[3] __read_mostly;
 int sysctl_tcp_rmem[3] __read_mostly;
 
-EXPORT_SYMBOL(sysctl_tcp_mem);
 EXPORT_SYMBOL(sysctl_tcp_rmem);
 EXPORT_SYMBOL(sysctl_tcp_wmem);
 
-atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
-EXPORT_SYMBOL(tcp_memory_allocated);
-
-/*
- * Current number of TCP sockets.
- */
-struct percpu_counter tcp_sockets_allocated;
-EXPORT_SYMBOL(tcp_sockets_allocated);
-
 /*
  * TCP splice context
  */
@@ -308,23 +298,157 @@  struct tcp_splice_state {
 	unsigned int flags;
 };
 
+#ifdef CONFIG_CGROUP_KMEM
 /*
  * Pressure flag: try to collapse.
  * Technical note: it is used by multiple contexts non atomically.
  * All the __sk_mem_schedule() is of this nature: accounting
  * is strict, actions are advisory and have some latency.
  */
-int tcp_memory_pressure __read_mostly;
-EXPORT_SYMBOL(tcp_memory_pressure);
-
 void tcp_enter_memory_pressure(struct sock *sk)
 {
+	struct kmem_cgroup *sg = sk->sk_cgrp;
+	if (!sg->tcp_memory_pressure) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
+		sg->tcp_memory_pressure = 1;
+	}
+}
+
+long *tcp_sysctl_mem(struct kmem_cgroup *sg)
+{
+	return sg->tcp_prot_mem;
+}
+
+atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
+{
+	return &(sg->tcp_memory_allocated);
+}
+
+static int tcp_write_maxmem(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
+	struct net *net = current->nsproxy->net_ns;
+	int i;
+
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+
+	/*
+	 * We can't allow more memory than our parent. Since this check
+	 * runs on every write, by induction there is no need to test
+	 * any ancestor other than our own parent.
+	 */
+	if (sg->parent && (val > sg->parent->tcp_max_memory))
+		val = sg->parent->tcp_max_memory;
+
+	sg->tcp_max_memory = val;
+
+	for (i = 0; i < 3; i++)
+		sg->tcp_prot_mem[i]  = min_t(long, val,
+					     net->ipv4.sysctl_tcp_mem[i]);
+
+	cgroup_unlock();
+
+	return 0;
+}
+
+static u64 tcp_read_maxmem(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
+	u64 ret;
+
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+	ret = sg->tcp_max_memory;
+
+	cgroup_unlock();
+	return ret;
+}
+
+static struct cftype tcp_files[] = {
+	{
+		.name = "tcp_maxmem",
+		.write_u64 = tcp_write_maxmem,
+		.read_u64 = tcp_read_maxmem,
+	},
+};
+
+int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+	struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
+	unsigned long limit;
+	struct net *net = current->nsproxy->net_ns;
+
+	sg->tcp_memory_pressure = 0;
+	atomic_long_set(&sg->tcp_memory_allocated, 0);
+	percpu_counter_init(&sg->tcp_sockets_allocated, 0);
+
+	limit = nr_free_buffer_pages() / 8;
+	limit = max(limit, 128UL);
+
+	if (sg->parent)
+		sg->tcp_max_memory = sg->parent->tcp_max_memory;
+	else
+		sg->tcp_max_memory = limit * 2;
+
+	sg->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
+	sg->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
+	sg->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
+
+	return cgroup_add_files(cgrp, ss, tcp_files, ARRAY_SIZE(tcp_files));
+}
+EXPORT_SYMBOL(tcp_init_cgroup);
+
+int *memory_pressure_tcp(struct kmem_cgroup *sg)
+{
+	return &sg->tcp_memory_pressure;
+}
+
+struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg)
+{
+	return &sg->tcp_sockets_allocated;
+}
+#else
+
+/* Current number of TCP sockets. */
+struct percpu_counter tcp_sockets_allocated;
+atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
+int tcp_memory_pressure;
+
+int *memory_pressure_tcp(struct kmem_cgroup *sg)
+{
+	return &tcp_memory_pressure;
+}
+
+struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg)
+{
+	return &tcp_sockets_allocated;
+}
+
+void tcp_enter_memory_pressure(struct sock *sk)
+{
 	if (!tcp_memory_pressure) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
 		tcp_memory_pressure = 1;
 	}
 }
+
+long *tcp_sysctl_mem(struct kmem_cgroup *sg)
+{
+	return init_net.ipv4.sysctl_tcp_mem;
+}
+
+atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
+{
+	return &tcp_memory_allocated;
+}
+#endif /* CONFIG_CGROUP_KMEM */
+
+EXPORT_SYMBOL(memory_pressure_tcp);
+EXPORT_SYMBOL(sockets_allocated_tcp);
 EXPORT_SYMBOL(tcp_enter_memory_pressure);
+EXPORT_SYMBOL(tcp_sysctl_mem);
+EXPORT_SYMBOL(memory_allocated_tcp);
 
 /* Convert seconds to retransmits based on initial and max timeout */
 static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
@@ -3226,7 +3350,9 @@  void __init tcp_init(void)
 
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
 
+#ifndef CONFIG_CGROUP_KMEM
 	percpu_counter_init(&tcp_sockets_allocated, 0);
+#endif
 	percpu_counter_init(&tcp_orphan_count, 0);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
@@ -3277,14 +3403,10 @@  void __init tcp_init(void)
 	sysctl_tcp_max_orphans = cnt / 2;
 	sysctl_max_syn_backlog = max(128, cnt / 256);
 
-	limit = nr_free_buffer_pages() / 8;
-	limit = max(limit, 128UL);
-	sysctl_tcp_mem[0] = limit / 4 * 3;
-	sysctl_tcp_mem[1] = limit;
-	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
-
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
-	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
+	limit = (unsigned long)init_net.ipv4.sysctl_tcp_mem[1];
+	limit <<= (PAGE_SHIFT - 7);
+
 	max_share = min(4UL*1024*1024, limit);
 
 	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
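
From userspace, the per-cgroup cap configured by the code above is just a
value written to the controller's tcp_maxmem file (visible as
kmem.tcp_maxmem). A hedged usage sketch; the mount point and container name
are assumptions, and the value is in pages, matching tcp_mem:

	/* Hypothetical usage; only the file name comes from the patch. */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/cgroup/kmem/container0/kmem.tcp_maxmem", "w");

		if (!f)
			return 1;
		/* Cap this container's TCP kernel memory at 16384 pages
		 * (64MB with 4KB pages); tcp_write_maxmem() clamps the value
		 * to the parent's limit and shrinks tcp_prot_mem[] to match. */
		fprintf(f, "%d\n", 16384);
		fclose(f);
		return 0;
	}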
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index ea0d218..c44e830 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -316,7 +316,7 @@  static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
 	/* Check #1 */
 	if (tp->rcv_ssthresh < tp->window_clamp &&
 	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
-	    !tcp_memory_pressure) {
+	    !sk_memory_pressure(sk)) {
 		int incr;
 
 		/* Check #2. Increase window, if skb with such overhead
@@ -393,15 +393,16 @@  static void tcp_clamp_window(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct proto *prot = sk->sk_prot;
 
 	icsk->icsk_ack.quick = 0;
 
-	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+	if (sk->sk_rcvbuf < prot->sysctl_rmem[2] &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
-	    !tcp_memory_pressure &&
-	    atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+	    !sk_memory_pressure(sk) &&
+	    atomic_long_read(sk_memory_allocated(sk)) < sk_prot_mem(sk)[0]) {
 		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
-				    sysctl_tcp_rmem[2]);
+				    prot->sysctl_rmem[2]);
 	}
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
 		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
@@ -4806,7 +4807,7 @@  static int tcp_prune_queue(struct sock *sk)
 
 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
 		tcp_clamp_window(sk);
-	else if (tcp_memory_pressure)
+	else if (sk_memory_pressure(sk))
 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
 
 	tcp_collapse_ofo_queue(sk);
@@ -4872,11 +4873,11 @@  static int tcp_should_expand_sndbuf(struct sock *sk)
 		return 0;
 
 	/* If we are under global TCP memory pressure, do not expand.  */
-	if (tcp_memory_pressure)
+	if (sk_memory_pressure(sk))
 		return 0;
 
 	/* If we are under soft global TCP memory pressure, do not expand.  */
-	if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+	if (atomic_long_read(sk_memory_allocated(sk)) >= sk_prot_mem(sk)[0])
 		return 0;
 
 	/* If we filled the congestion window, do not expand.  */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1c12b8e..af6c095 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1848,6 +1848,7 @@  static int tcp_v4_init_sock(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct kmem_cgroup *sg;
 
 	skb_queue_head_init(&tp->out_of_order_queue);
 	tcp_init_xmit_timers(sk);
@@ -1901,7 +1902,13 @@  static int tcp_v4_init_sock(struct sock *sk)
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
 	local_bh_disable();
-	percpu_counter_inc(&tcp_sockets_allocated);
+	percpu_counter_inc(sk_sockets_allocated(sk));
+
+#ifdef CONFIG_CGROUP_KMEM
+	for (sg = sk->sk_cgrp->parent; sg; sg = sg->parent)
+		percpu_counter_inc(sg_sockets_allocated(sk->sk_prot, sg));
+#endif
+
 	local_bh_enable();
 
 	return 0;
@@ -1910,6 +1917,7 @@  static int tcp_v4_init_sock(struct sock *sk)
 void tcp_v4_destroy_sock(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct kmem_cgroup *sg;
 
 	tcp_clear_xmit_timers(sk);
 
@@ -1957,7 +1965,11 @@  void tcp_v4_destroy_sock(struct sock *sk)
 		tp->cookie_values = NULL;
 	}
 
-	percpu_counter_dec(&tcp_sockets_allocated);
+	percpu_counter_dec(sk_sockets_allocated(sk));
+#ifdef CONFIG_CGROUP_KMEM
+	for (sg = sk->sk_cgrp->parent; sg; sg = sg->parent)
+		percpu_counter_dec(sg_sockets_allocated(sk->sk_prot, sg));
+#endif
 }
 EXPORT_SYMBOL(tcp_v4_destroy_sock);
 
@@ -2598,11 +2610,14 @@  struct proto tcp_prot = {
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
-	.sockets_allocated	= &tcp_sockets_allocated,
+	.memory_pressure	= memory_pressure_tcp,
+	.sockets_allocated	= sockets_allocated_tcp,
 	.orphan_count		= &tcp_orphan_count,
-	.memory_allocated	= &tcp_memory_allocated,
-	.memory_pressure	= &tcp_memory_pressure,
-	.sysctl_mem		= sysctl_tcp_mem,
+	.memory_allocated	= memory_allocated_tcp,
+#ifdef CONFIG_CGROUP_KMEM
+	.init_cgroup		= tcp_init_cgroup,
+#endif
+	.prot_mem		= tcp_sysctl_mem,
 	.sysctl_wmem		= sysctl_tcp_wmem,
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 882e0b0..06aeb31 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1912,7 +1912,7 @@  u32 __tcp_select_window(struct sock *sk)
 	if (free_space < (full_space >> 1)) {
 		icsk->icsk_ack.quick = 0;
 
-		if (tcp_memory_pressure)
+		if (sk_memory_pressure(sk))
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
 					       4U * tp->advmss);
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index ecd44b0..2c67617 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -261,7 +261,7 @@  static void tcp_delack_timer(unsigned long data)
 	}
 
 out:
-	if (tcp_memory_pressure)
+	if (sk_memory_pressure(sk))
 		sk_mem_reclaim(sk);
 out_unlock:
 	bh_unlock_sock(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1b5a193..6c08c65 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -120,9 +120,6 @@  EXPORT_SYMBOL(sysctl_udp_rmem_min);
 int sysctl_udp_wmem_min __read_mostly;
 EXPORT_SYMBOL(sysctl_udp_wmem_min);
 
-atomic_long_t udp_memory_allocated;
-EXPORT_SYMBOL(udp_memory_allocated);
-
 #define MAX_UDP_PORTS 65536
 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
 
@@ -1918,6 +1915,19 @@  unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 }
 EXPORT_SYMBOL(udp_poll);
 
+static atomic_long_t udp_memory_allocated;
+atomic_long_t *memory_allocated_udp(struct kmem_cgroup *sg)
+{
+	return &udp_memory_allocated;
+}
+EXPORT_SYMBOL(memory_allocated_udp);
+
+long *udp_sysctl_mem(struct kmem_cgroup *sg)
+{
+	return sysctl_udp_mem;
+}
+EXPORT_SYMBOL(udp_sysctl_mem);
+
 struct proto udp_prot = {
 	.name		   = "UDP",
 	.owner		   = THIS_MODULE,
@@ -1936,8 +1946,8 @@  struct proto udp_prot = {
 	.unhash		   = udp_lib_unhash,
 	.rehash		   = udp_v4_rehash,
 	.get_port	   = udp_v4_get_port,
-	.memory_allocated  = &udp_memory_allocated,
-	.sysctl_mem	   = sysctl_udp_mem,
+	.memory_allocated  = memory_allocated_udp,
+	.prot_mem	   = udp_sysctl_mem,
 	.sysctl_wmem	   = &sysctl_udp_wmem_min,
 	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp_sock),
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d1fb63f..0762e68 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1959,6 +1959,7 @@  static int tcp_v6_init_sock(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct kmem_cgroup *sg;
 
 	skb_queue_head_init(&tp->out_of_order_queue);
 	tcp_init_xmit_timers(sk);
@@ -2012,7 +2013,12 @@  static int tcp_v6_init_sock(struct sock *sk)
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
 	local_bh_disable();
-	percpu_counter_inc(&tcp_sockets_allocated);
+	percpu_counter_inc(sk_sockets_allocated(sk));
+#ifdef CONFIG_CGROUP_KMEM
+	for (sg = sk->sk_cgrp->parent; sg; sg = sg->parent)
+		percpu_counter_inc(sg_sockets_allocated(sk->sk_prot, sg));
+#endif
+
 	local_bh_enable();
 
 	return 0;
@@ -2221,11 +2227,11 @@  struct proto tcpv6_prot = {
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
-	.sockets_allocated	= &tcp_sockets_allocated,
-	.memory_allocated	= &tcp_memory_allocated,
-	.memory_pressure	= &tcp_memory_pressure,
+	.sockets_allocated	= sockets_allocated_tcp,
+	.memory_allocated	= memory_allocated_tcp,
+	.memory_pressure	= memory_pressure_tcp,
 	.orphan_count		= &tcp_orphan_count,
-	.sysctl_mem		= sysctl_tcp_mem,
+	.prot_mem		= tcp_sysctl_mem,
 	.sysctl_wmem		= sysctl_tcp_wmem,
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 29213b5..ef4b5b3 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1465,8 +1465,8 @@  struct proto udpv6_prot = {
 	.unhash		   = udp_lib_unhash,
 	.rehash		   = udp_v6_rehash,
 	.get_port	   = udp_v6_get_port,
-	.memory_allocated  = &udp_memory_allocated,
-	.sysctl_mem	   = sysctl_udp_mem,
+	.memory_allocated  = memory_allocated_udp,
+	.prot_mem	   = udp_sysctl_mem,
 	.sysctl_wmem	   = &sysctl_udp_wmem_min,
 	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp6_sock),
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 836aa63..1b0300d 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -119,11 +119,30 @@  static int sctp_memory_pressure;
 static atomic_long_t sctp_memory_allocated;
 struct percpu_counter sctp_sockets_allocated;
 
+static long *sctp_sysctl_mem(struct kmem_cgroup *sg)
+{
+	return sysctl_sctp_mem;
+}
+
 static void sctp_enter_memory_pressure(struct sock *sk)
 {
 	sctp_memory_pressure = 1;
 }
 
+static int *memory_pressure_sctp(struct kmem_cgroup *sg)
+{
+	return &sctp_memory_pressure;
+}
+
+static atomic_long_t *memory_allocated_sctp(struct kmem_cgroup *sg)
+{
+	return &sctp_memory_allocated;
+}
+
+static struct percpu_counter *sockets_allocated_sctp(struct kmem_cgroup *sg)
+{
+	return &sctp_sockets_allocated;
+}
 
 /* Get the sndbuf space available at the time on the association.  */
 static inline int sctp_wspace(struct sctp_association *asoc)
@@ -6831,13 +6850,13 @@  struct proto sctp_prot = {
 	.unhash      =	sctp_unhash,
 	.get_port    =	sctp_get_port,
 	.obj_size    =  sizeof(struct sctp_sock),
-	.sysctl_mem  =  sysctl_sctp_mem,
+	.prot_mem    =  sctp_sysctl_mem,
 	.sysctl_rmem =  sysctl_sctp_rmem,
 	.sysctl_wmem =  sysctl_sctp_wmem,
-	.memory_pressure = &sctp_memory_pressure,
+	.memory_pressure = memory_pressure_sctp,
 	.enter_memory_pressure = sctp_enter_memory_pressure,
-	.memory_allocated = &sctp_memory_allocated,
-	.sockets_allocated = &sctp_sockets_allocated,
+	.memory_allocated = memory_allocated_sctp,
+	.sockets_allocated = sockets_allocated_sctp,
 };
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
@@ -6863,12 +6882,12 @@  struct proto sctpv6_prot = {
 	.unhash		= sctp_unhash,
 	.get_port	= sctp_get_port,
 	.obj_size	= sizeof(struct sctp6_sock),
-	.sysctl_mem	= sysctl_sctp_mem,
+	.prot_mem	= sctp_sysctl_mem,
 	.sysctl_rmem	= sysctl_sctp_rmem,
 	.sysctl_wmem	= sysctl_sctp_wmem,
-	.memory_pressure = &sctp_memory_pressure,
+	.memory_pressure = memory_pressure_sctp,
 	.enter_memory_pressure = sctp_enter_memory_pressure,
-	.memory_allocated = &sctp_memory_allocated,
-	.sockets_allocated = &sctp_sockets_allocated,
+	.memory_allocated = memory_allocated_sctp,
+	.sockets_allocated = sockets_allocated_sctp,
 };
 #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */