Patchwork [v1,3/5] cgroup: Protect access to task_cls_classid() when built as module

login
register
mail settings
Submitter Daniel Wagner
Date Aug. 17, 2012, 2:58 p.m.
Message ID <1345215494-9181-4-git-send-email-wagi@monom.org>
Download mbox | patch
Permalink /patch/178238/
State Changes Requested
Delegated to: David Miller
Headers show

Comments

Daniel Wagner - Aug. 17, 2012, 2:58 p.m.
From: Daniel Wagner <daniel.wagner@bmw-carit.de>

The module version of task_cls_classid() checks if net_cls_subsys_id
is valid to identify when it is okay to access the controller.

Instead of relying on the subsys_id to be set, make it explicit
with a jump label.

Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Gao feng <gaofeng@cn.fujitsu.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: John Fastabend <john.r.fastabend@intel.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: netdev@vger.kernel.org
Cc: cgroups@vger.kernel.org
---
 include/net/cls_cgroup.h | 5 ++++-
 net/core/sock.c          | 5 +++++
 net/sched/cls_cgroup.c   | 9 +++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)
Neil Horman - Aug. 17, 2012, 6:28 p.m.
On Fri, Aug 17, 2012 at 04:58:12PM +0200, Daniel Wagner wrote:
> From: Daniel Wagner <daniel.wagner@bmw-carit.de>
> 
> The module version of task_cls_classid() checks if net_cls_sbusys_id
> is valid to indentify when it is okay to access the controller.
> 
> Instead relying on the subusys_id to be set, make it explicit
> with a jump label.
> 
> Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
> Cc: "David S. Miller" <davem@davemloft.net>
> Cc: Gao feng <gaofeng@cn.fujitsu.com>
> Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> Cc: John Fastabend <john.r.fastabend@intel.com>
> Cc: Li Zefan <lizefan@huawei.com>
> Cc: Neil Horman <nhorman@tuxdriver.com>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: netdev@vger.kernel.org
> Cc: cgroups@vger.kernel.org
> ---
>  include/net/cls_cgroup.h | 5 ++++-
>  net/core/sock.c          | 5 +++++
>  net/sched/cls_cgroup.c   | 9 +++++++++
>  3 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
> index 401672c..bbbd957 100644
> --- a/include/net/cls_cgroup.h
> +++ b/include/net/cls_cgroup.h
> @@ -16,6 +16,7 @@
>  #include <linux/cgroup.h>
>  #include <linux/hardirq.h>
>  #include <linux/rcupdate.h>
> +#include <linux/jump_label.h>
>  
>  #ifdef CONFIG_CGROUPS
>  struct cgroup_cls_state
> @@ -44,6 +45,8 @@ static inline u32 task_cls_classid(struct task_struct *p)
>  }
>  
>  #elif IS_MODULE(CONFIG_NET_CLS_CGROUP)
> +extern struct static_key cgroup_cls_enabled;
> +#define clscg_enabled static_key_false(&cgroup_cls_enabled)
>  
>  extern int net_cls_subsys_id;
>  
> @@ -52,7 +55,7 @@ static inline u32 task_cls_classid(struct task_struct *p)
>  	int id;
>  	u32 classid = 0;
>  
> -	if (in_interrupt())
> +	if (!clscg_enabled || in_interrupt())
>  		return 0;
>  
>  	rcu_read_lock();
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 8f67ced..8106e77 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -327,6 +327,11 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
>  EXPORT_SYMBOL(__sk_backlog_rcv);
>  
>  #if defined(CONFIG_CGROUPS)
> +#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
> +struct static_key cgroup_cls_enabled = STATIC_KEY_INIT_FALSE;
> +EXPORT_SYMBOL_GPL(cgroup_cls_enabled);
> +#endif
> +
>  #if !defined(CONFIG_NET_CLS_CGROUP)
>  int net_cls_subsys_id = -1;
>  EXPORT_SYMBOL_GPL(net_cls_subsys_id);
> diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
> index 7743ea8..0635894 100644
> --- a/net/sched/cls_cgroup.c
> +++ b/net/sched/cls_cgroup.c
> @@ -44,12 +44,21 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
>  
>  	if (cgrp->parent)
>  		cs->classid = cgrp_cls_state(cgrp->parent)->classid;
> +#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
> +	else if (!clscg_enabled)
> +                static_key_slow_inc(&cgroup_cls_enabled);
This is racy I think.  The read of the static key is atomic with other reads,
but the entire conditional is not atomic.  If two cpus were creating cgroups in
parallel, it would be possible for both to read the static key as being zero
(the second cpu would read the key before the first cpu could increment it).
> +#endif
>  
>  	return &cs->css;
>  }
>  
>  static void cgrp_destroy(struct cgroup *cgrp)
>  {
> +#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
> +	if (!cgrp->parent && clscg_enabled)
> +                static_key_slow_dec(&cgroup_cls_enabled);
Ditto here with the race above.  I think what you want is one of:

1) Use static_key_slow_[inc|dec] unconditionally 

2) Keep a separate internal counter to track the number of cgroup instances
so that you only inc the static key on the first create and dec it on the last
delete.

I would think (1) would be sufficient.  It looks like static_key_slow_inc uses
atomic_inc_not_zero to just do an inc anyway in the event that multiple inc
events are made.

Neil

> +#endif
> +
>  	kfree(cgrp_cls_state(cgrp));
>  }
>  
> -- 
> 1.7.12.rc1.16.g05a20c8
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zefan Li - Aug. 20, 2012, 12:59 a.m.
于 2012/8/18 2:28, Neil Horman 写道:
> On Fri, Aug 17, 2012 at 04:58:12PM +0200, Daniel Wagner wrote:
>> From: Daniel Wagner <daniel.wagner@bmw-carit.de>
>>
>> The module version of task_cls_classid() checks if net_cls_sbusys_id
>> is valid to indentify when it is okay to access the controller.
>>
>> Instead relying on the subusys_id to be set, make it explicit
>> with a jump label.
>>
>> Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
>> Cc: "David S. Miller" <davem@davemloft.net>
>> Cc: Gao feng <gaofeng@cn.fujitsu.com>
>> Cc: Jamal Hadi Salim <jhs@mojatatu.com>
>> Cc: John Fastabend <john.r.fastabend@intel.com>
>> Cc: Li Zefan <lizefan@huawei.com>
>> Cc: Neil Horman <nhorman@tuxdriver.com>
>> Cc: Tejun Heo <tj@kernel.org>
>> Cc: netdev@vger.kernel.org
>> Cc: cgroups@vger.kernel.org
>> ---
>>  include/net/cls_cgroup.h | 5 ++++-
>>  net/core/sock.c          | 5 +++++
>>  net/sched/cls_cgroup.c   | 9 +++++++++
>>  3 files changed, 18 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
>> index 401672c..bbbd957 100644
>> --- a/include/net/cls_cgroup.h
>> +++ b/include/net/cls_cgroup.h
>> @@ -16,6 +16,7 @@
>>  #include <linux/cgroup.h>
>>  #include <linux/hardirq.h>
>>  #include <linux/rcupdate.h>
>> +#include <linux/jump_label.h>
>>  
>>  #ifdef CONFIG_CGROUPS
>>  struct cgroup_cls_state
>> @@ -44,6 +45,8 @@ static inline u32 task_cls_classid(struct task_struct *p)
>>  }
>>  
>>  #elif IS_MODULE(CONFIG_NET_CLS_CGROUP)
>> +extern struct static_key cgroup_cls_enabled;
>> +#define clscg_enabled static_key_false(&cgroup_cls_enabled)
>>  
>>  extern int net_cls_subsys_id;
>>  
>> @@ -52,7 +55,7 @@ static inline u32 task_cls_classid(struct task_struct *p)
>>  	int id;
>>  	u32 classid = 0;
>>  
>> -	if (in_interrupt())
>> +	if (!clscg_enabled || in_interrupt())
>>  		return 0;
>>  
>>  	rcu_read_lock();
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index 8f67ced..8106e77 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -327,6 +327,11 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
>>  EXPORT_SYMBOL(__sk_backlog_rcv);
>>  
>>  #if defined(CONFIG_CGROUPS)
>> +#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
>> +struct static_key cgroup_cls_enabled = STATIC_KEY_INIT_FALSE;
>> +EXPORT_SYMBOL_GPL(cgroup_cls_enabled);
>> +#endif
>> +
>>  #if !defined(CONFIG_NET_CLS_CGROUP)
>>  int net_cls_subsys_id = -1;
>>  EXPORT_SYMBOL_GPL(net_cls_subsys_id);
>> diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
>> index 7743ea8..0635894 100644
>> --- a/net/sched/cls_cgroup.c
>> +++ b/net/sched/cls_cgroup.c
>> @@ -44,12 +44,21 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
>>  
>>  	if (cgrp->parent)
>>  		cs->classid = cgrp_cls_state(cgrp->parent)->classid;
>> +#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
>> +	else if (!clscg_enabled)
>> +                static_key_slow_inc(&cgroup_cls_enabled);
> This is racy I think.  The read of the static key is atomic with other reads,
> but the entire conditional is not atomic.  If two cpus were creating cgroups in
> parallel, it would be possible for both to read the static key as being zero
> (the second cpu would read the key before the first cpu could increment it).

static_key_slow_inc() is called only when we're creating the root cgroup, and that's
module loading.

so it's safe.

>> +#endif
>>  
>>  	return &cs->css;
>>  }
>>  
>>  static void cgrp_destroy(struct cgroup *cgrp)
>>  {
>> +#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
>> +	if (!cgrp->parent && clscg_enabled)
>> +                static_key_slow_dec(&cgroup_cls_enabled);
> Ditto here with the race above.  I think what you want is one of:

and this is called only when we're destroying the root cgroup, and that's
module unload.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman - Aug. 20, 2012, 11:08 a.m.
On Mon, Aug 20, 2012 at 08:59:37AM +0800, Li Zefan wrote:
> 于 2012/8/18 2:28, Neil Horman 写道:
> > On Fri, Aug 17, 2012 at 04:58:12PM +0200, Daniel Wagner wrote:
> >> From: Daniel Wagner <daniel.wagner@bmw-carit.de>
> >>
> >> The module version of task_cls_classid() checks if net_cls_sbusys_id
> >> is valid to indentify when it is okay to access the controller.
> >>
> >> Instead relying on the subusys_id to be set, make it explicit
> >> with a jump label.
> >>
> >> Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
> >> Cc: "David S. Miller" <davem@davemloft.net>
> >> Cc: Gao feng <gaofeng@cn.fujitsu.com>
> >> Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> >> Cc: John Fastabend <john.r.fastabend@intel.com>
> >> Cc: Li Zefan <lizefan@huawei.com>
> >> Cc: Neil Horman <nhorman@tuxdriver.com>
> >> Cc: Tejun Heo <tj@kernel.org>
> >> Cc: netdev@vger.kernel.org
> >> Cc: cgroups@vger.kernel.org
> >> ---
> >>  include/net/cls_cgroup.h | 5 ++++-
> >>  net/core/sock.c          | 5 +++++
> >>  net/sched/cls_cgroup.c   | 9 +++++++++
> >>  3 files changed, 18 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
> >> index 401672c..bbbd957 100644
> >> --- a/include/net/cls_cgroup.h
> >> +++ b/include/net/cls_cgroup.h
> >> @@ -16,6 +16,7 @@
> >>  #include <linux/cgroup.h>
> >>  #include <linux/hardirq.h>
> >>  #include <linux/rcupdate.h>
> >> +#include <linux/jump_label.h>
> >>  
> >>  #ifdef CONFIG_CGROUPS
> >>  struct cgroup_cls_state
> >> @@ -44,6 +45,8 @@ static inline u32 task_cls_classid(struct task_struct *p)
> >>  }
> >>  
> >>  #elif IS_MODULE(CONFIG_NET_CLS_CGROUP)
> >> +extern struct static_key cgroup_cls_enabled;
> >> +#define clscg_enabled static_key_false(&cgroup_cls_enabled)
> >>  
> >>  extern int net_cls_subsys_id;
> >>  
> >> @@ -52,7 +55,7 @@ static inline u32 task_cls_classid(struct task_struct *p)
> >>  	int id;
> >>  	u32 classid = 0;
> >>  
> >> -	if (in_interrupt())
> >> +	if (!clscg_enabled || in_interrupt())
> >>  		return 0;
> >>  
> >>  	rcu_read_lock();
> >> diff --git a/net/core/sock.c b/net/core/sock.c
> >> index 8f67ced..8106e77 100644
> >> --- a/net/core/sock.c
> >> +++ b/net/core/sock.c
> >> @@ -327,6 +327,11 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
> >>  EXPORT_SYMBOL(__sk_backlog_rcv);
> >>  
> >>  #if defined(CONFIG_CGROUPS)
> >> +#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
> >> +struct static_key cgroup_cls_enabled = STATIC_KEY_INIT_FALSE;
> >> +EXPORT_SYMBOL_GPL(cgroup_cls_enabled);
> >> +#endif
> >> +
> >>  #if !defined(CONFIG_NET_CLS_CGROUP)
> >>  int net_cls_subsys_id = -1;
> >>  EXPORT_SYMBOL_GPL(net_cls_subsys_id);
> >> diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
> >> index 7743ea8..0635894 100644
> >> --- a/net/sched/cls_cgroup.c
> >> +++ b/net/sched/cls_cgroup.c
> >> @@ -44,12 +44,21 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
> >>  
> >>  	if (cgrp->parent)
> >>  		cs->classid = cgrp_cls_state(cgrp->parent)->classid;
> >> +#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
> >> +	else if (!clscg_enabled)
> >> +                static_key_slow_inc(&cgroup_cls_enabled);
> > This is racy I think.  The read of the static key is atomic with other reads,
> > but the entire conditional is not atomic.  If two cpus were creating cgroups in
> > parallel, it would be possible for both to read the static key as being zero
> > (the second cpu would read the key before the first cpu could increment it).
> 
> static_key_slow_inc() is called only when we're creating the root cgroup, and that's
> module loading.
> 
> so it's safe.
> 
What makes you say that?  It appears to me that we call ss->create (and
potential ss->destroy, depending on failure conditions) from cgroup_create,
which in turn is called from cgroup_mkdir, which is called for every cgroup
instance created, not just the root cgroup.

Neil

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Wagner - Aug. 20, 2012, 11:29 a.m.
On Fri, Aug 17, 2012 at 02:28:55PM -0400, Neil Horman wrote:
> On Fri, Aug 17, 2012 at 04:58:12PM +0200, Daniel Wagner wrote:
> > From: Daniel Wagner <daniel.wagner@bmw-carit.de>
> > 
> > The module version of task_cls_classid() checks if net_cls_sbusys_id
> > is valid to indentify when it is okay to access the controller.
> > 
> > Instead relying on the subusys_id to be set, make it explicit
> > with a jump label.
> > 
> > Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
> > Cc: "David S. Miller" <davem@davemloft.net>
> > Cc: Gao feng <gaofeng@cn.fujitsu.com>
> > Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> > Cc: John Fastabend <john.r.fastabend@intel.com>
> > Cc: Li Zefan <lizefan@huawei.com>
> > Cc: Neil Horman <nhorman@tuxdriver.com>
> > Cc: Tejun Heo <tj@kernel.org>
> > Cc: netdev@vger.kernel.org
> > Cc: cgroups@vger.kernel.org
> > ---
> >  include/net/cls_cgroup.h | 5 ++++-
> >  net/core/sock.c          | 5 +++++
> >  net/sched/cls_cgroup.c   | 9 +++++++++
> >  3 files changed, 18 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
> > index 401672c..bbbd957 100644
> > --- a/include/net/cls_cgroup.h
> > +++ b/include/net/cls_cgroup.h
> > @@ -16,6 +16,7 @@
> >  #include <linux/cgroup.h>
> >  #include <linux/hardirq.h>
> >  #include <linux/rcupdate.h>
> > +#include <linux/jump_label.h>
> >  
> >  #ifdef CONFIG_CGROUPS
> >  struct cgroup_cls_state
> > @@ -44,6 +45,8 @@ static inline u32 task_cls_classid(struct task_struct *p)
> >  }
> >  
> >  #elif IS_MODULE(CONFIG_NET_CLS_CGROUP)
> > +extern struct static_key cgroup_cls_enabled;
> > +#define clscg_enabled static_key_false(&cgroup_cls_enabled)
> >  
> >  extern int net_cls_subsys_id;
> >  
> > @@ -52,7 +55,7 @@ static inline u32 task_cls_classid(struct task_struct *p)
> >  	int id;
> >  	u32 classid = 0;
> >  
> > -	if (in_interrupt())
> > +	if (!clscg_enabled || in_interrupt())
> >  		return 0;
> >  
> >  	rcu_read_lock();
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index 8f67ced..8106e77 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -327,6 +327,11 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
> >  EXPORT_SYMBOL(__sk_backlog_rcv);
> >  
> >  #if defined(CONFIG_CGROUPS)
> > +#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
> > +struct static_key cgroup_cls_enabled = STATIC_KEY_INIT_FALSE;
> > +EXPORT_SYMBOL_GPL(cgroup_cls_enabled);
> > +#endif
> > +
> >  #if !defined(CONFIG_NET_CLS_CGROUP)
> >  int net_cls_subsys_id = -1;
> >  EXPORT_SYMBOL_GPL(net_cls_subsys_id);
> > diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
> > index 7743ea8..0635894 100644
> > --- a/net/sched/cls_cgroup.c
> > +++ b/net/sched/cls_cgroup.c
> > @@ -44,12 +44,21 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
> >  
> >  	if (cgrp->parent)
> >  		cs->classid = cgrp_cls_state(cgrp->parent)->classid;
> > +#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
> > +	else if (!clscg_enabled)
> > +                static_key_slow_inc(&cgroup_cls_enabled);
> This is racy I think.  The read of the static key is atomic with other reads,
> but the entire conditional is not atomic.  If two cpus were creating cgroups in
> parallel, it would be possible for both to read the static key as being zero
> (the second cpu would read the key before the first cpu could increment it).

D'oh, That is racy.

> > +#endif
> >  
> >  	return &cs->css;
> >  }
> >  
> >  static void cgrp_destroy(struct cgroup *cgrp)
> >  {
> > +#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
> > +	if (!cgrp->parent && clscg_enabled)
> > +                static_key_slow_dec(&cgroup_cls_enabled);
> Ditto here with the race above.  I think what you want is one of:
> 
> 1) Use static_key_slow_[inc|dec] unconditionally 

While the static_key_slow_inc() case will work, I am not so sure about
the static_key_slow_dec(), e.g. we could still access inside
task_cls_classid() a destroyed container.

> 2) Keep a separate internal counter to track the number of cgroup instances
> so that you only inc the static key on the first create and dec it on the last
> delete.

If I got you right, then this would not be different from directly using
static_key_slow_[inc|dec].

> I would think (1) would be sufficent.  It looks like static_key_slow_inc uses
> atomic_inc_not_zero to just do an inc anyway in the event that multiple inc
> events are made.

Would something like this work?

static inline u32 task_cls_classid(struct task_struct *p)
{
	u32 classid;
	struct cgroup_cls_state *css;

	if (!clscg_enabled || in_interrupt())
		return 0;

	rcu_read_lock();
	css = container_of(task_subsys_state(p, net_cls_subsys_id),
			   struct cgroup_cls_state, css);
	if (css)
		classid = css->classid;
	else
		classid = 0;
	rcu_read_unlock();

	return classid;
}

Daniel
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Glauber Costa - Aug. 20, 2012, 11:33 a.m.
On 08/20/2012 03:29 PM, Daniel Wagner wrote:
>> This is racy I think.  The read of the static key is atomic with other reads,
>> > but the entire conditional is not atomic.  If two cpus were creating cgroups in
>> > parallel, it would be possible for both to read the static key as being zero
>> > (the second cpu would read the key before the first cpu could increment it).
> D'oh, That is racy.
> 

Take a look at how we solve this particular problem in memcg. By using a
pair of bits meaning "ever active" and "currently active", we can avoid
problems with the static key increments.

Nothing bad can happen by increasing the static key more than once;
the problem is that you need to somehow take note of that to make sure
you decrement it as many times as you incremented when you release it.

Two other important things to keep in mind while dealing with static
branches:

* You can't increase/decrease while holding the cgroup_lock. lockdep may
trigger, because the cgroup_lock holds the cpu_hotplug lock, that the
static branches update path will also take. (due to cpusets). Doing a
logical hotplug will stress this path, and make the problem visible.

Take a look at disarm_sock_keys() (mm/memcontrol.c) to see how we solve
this particular problem.

* If you have code in more than one call site, the update among them is
not atomic. Not sure if this one applies to your case, but it can lead
you to some very unpleasant to debug problems. We use one of those two
bits I mentioned ("currently active") to make sure objects are not
marked before all sites are already patched.

Hope our previous experience with this can help you.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman - Aug. 20, 2012, 5:03 p.m.
On Mon, Aug 20, 2012 at 01:29:38PM +0200, Daniel Wagner wrote:
> On Fri, Aug 17, 2012 at 02:28:55PM -0400, Neil Horman wrote:
> > On Fri, Aug 17, 2012 at 04:58:12PM +0200, Daniel Wagner wrote:
> > > From: Daniel Wagner <daniel.wagner@bmw-carit.de>
> > > 
> > > The module version of task_cls_classid() checks if net_cls_sbusys_id
> > > is valid to indentify when it is okay to access the controller.
> > > 
> > > Instead relying on the subusys_id to be set, make it explicit
> > > with a jump label.
> > > 
> > > Signed-off-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
> > > Cc: "David S. Miller" <davem@davemloft.net>
> > > Cc: Gao feng <gaofeng@cn.fujitsu.com>
> > > Cc: Jamal Hadi Salim <jhs@mojatatu.com>
> > > Cc: John Fastabend <john.r.fastabend@intel.com>
> > > Cc: Li Zefan <lizefan@huawei.com>
> > > Cc: Neil Horman <nhorman@tuxdriver.com>
> > > Cc: Tejun Heo <tj@kernel.org>
> > > Cc: netdev@vger.kernel.org
> > > Cc: cgroups@vger.kernel.org
> > > ---
> > >  include/net/cls_cgroup.h | 5 ++++-
> > >  net/core/sock.c          | 5 +++++
> > >  net/sched/cls_cgroup.c   | 9 +++++++++
> > >  3 files changed, 18 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
> > > index 401672c..bbbd957 100644
> > > --- a/include/net/cls_cgroup.h
> > > +++ b/include/net/cls_cgroup.h
> > > @@ -16,6 +16,7 @@
> > >  #include <linux/cgroup.h>
> > >  #include <linux/hardirq.h>
> > >  #include <linux/rcupdate.h>
> > > +#include <linux/jump_label.h>
> > >  
> > >  #ifdef CONFIG_CGROUPS
> > >  struct cgroup_cls_state
> > > @@ -44,6 +45,8 @@ static inline u32 task_cls_classid(struct task_struct *p)
> > >  }
> > >  
> > >  #elif IS_MODULE(CONFIG_NET_CLS_CGROUP)
> > > +extern struct static_key cgroup_cls_enabled;
> > > +#define clscg_enabled static_key_false(&cgroup_cls_enabled)
> > >  
> > >  extern int net_cls_subsys_id;
> > >  
> > > @@ -52,7 +55,7 @@ static inline u32 task_cls_classid(struct task_struct *p)
> > >  	int id;
> > >  	u32 classid = 0;
> > >  
> > > -	if (in_interrupt())
> > > +	if (!clscg_enabled || in_interrupt())
> > >  		return 0;
> > >  
> > >  	rcu_read_lock();
> > > diff --git a/net/core/sock.c b/net/core/sock.c
> > > index 8f67ced..8106e77 100644
> > > --- a/net/core/sock.c
> > > +++ b/net/core/sock.c
> > > @@ -327,6 +327,11 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
> > >  EXPORT_SYMBOL(__sk_backlog_rcv);
> > >  
> > >  #if defined(CONFIG_CGROUPS)
> > > +#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
> > > +struct static_key cgroup_cls_enabled = STATIC_KEY_INIT_FALSE;
> > > +EXPORT_SYMBOL_GPL(cgroup_cls_enabled);
> > > +#endif
> > > +
> > >  #if !defined(CONFIG_NET_CLS_CGROUP)
> > >  int net_cls_subsys_id = -1;
> > >  EXPORT_SYMBOL_GPL(net_cls_subsys_id);
> > > diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
> > > index 7743ea8..0635894 100644
> > > --- a/net/sched/cls_cgroup.c
> > > +++ b/net/sched/cls_cgroup.c
> > > @@ -44,12 +44,21 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
> > >  
> > >  	if (cgrp->parent)
> > >  		cs->classid = cgrp_cls_state(cgrp->parent)->classid;
> > > +#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
> > > +	else if (!clscg_enabled)
> > > +                static_key_slow_inc(&cgroup_cls_enabled);
> > This is racy I think.  The read of the static key is atomic with other reads,
> > but the entire conditional is not atomic.  If two cpus were creating cgroups in
> > parallel, it would be possible for both to read the static key as being zero
> > (the second cpu would read the key before the first cpu could increment it).
> 
> D'oh, That is racy.
> 
> > > +#endif
> > >  
> > >  	return &cs->css;
> > >  }
> > >  
> > >  static void cgrp_destroy(struct cgroup *cgrp)
> > >  {
> > > +#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
> > > +	if (!cgrp->parent && clscg_enabled)
> > > +                static_key_slow_dec(&cgroup_cls_enabled);
> > Ditto here with the race above.  I think what you want is one of:
> > 
> > 1) Use static_key_slow_[inc|dec] unconditionally 
> 
> While the static_key_slow_inc() case will work, I am not so sure about
> the static_key_slow_dec(), e.g. we could still access inside
> task_cls_classid() a destroyed container.
> 
Possibly, yes, I think.

> > 2) Keep a separate internal counter to track the number of cgroup instances
> > so that you only inc the static key on the first create and dec it on the last
> > delete.
> 
> If I got you right, than this would not be different then direclty using
> static_key_slow_[inc|dec].
> 
As long as a cgroup subsystems ->destroy method is only called when the
subsystem is being removed, then I think thats correct.  I'm not 100% sure thats
the case though.

> > I would think (1) would be sufficent.  It looks like static_key_slow_inc uses
> > atomic_inc_not_zero to just do an inc anyway in the event that multiple inc
> > events are made.
> 
> Would something like this work?
> 
I think so yes, assuming that you also make the slow_inc|dec changes

> static inline u32 task_cls_classid(struct task_struct *p)
> {
> 	u32 classid;
> 	struct cgroup_cls_state *css;
> 
> 	if (!clscg_enabled || in_interrupt())
> 		return 0;
> 
> 	rcu_read_lock();
> 	css = container_of(task_subsys_state(p, net_cls_subsys_id),
> 			   struct cgroup_cls_state, css);
> 	if (!css)
> 		classid = css->classid;
> 	else
> 		classid = 0;
> 	rcu_read_unlock();
> 
> 	return classid;
> }
> 
> Daniel
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Glauber Costa - Aug. 21, 2012, 9:12 a.m.
On 08/20/2012 09:03 PM, Neil Horman wrote:
>>> 2) Keep a separate internal counter to track the number of cgroup instances
>>> > > so that you only inc the static key on the first create and dec it on the last
>>> > > delete.
>> > 
>> > If I got you right, than this would not be different then direclty using
>> > static_key_slow_[inc|dec].
>> > 
> As long as a cgroup subsystems ->destroy method is only called when the
> subsystem is being removed, then I think thats correct.  I'm not 100% sure thats
> the case though.
> 
THAT is correct, but not the call itself. ->destroy() is called with the
cgroup_lock held, and there is a lockdep dependency created by cpuset
that prevents the cpu_hotplug lock, taken by static branch updates, to
be taken inside cgroup_lock.

So unless cpuset is fixed - which is a major work, we can't do
static_branch updates while holding the cgroup_lock.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Glauber Costa - Aug. 23, 2012, 9:12 a.m.
> 
> So there is no need to move the enabling/disabling of the static branch
> at this point to create/destroy.
> 

That will surely save you some trouble.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Wagner - Aug. 23, 2012, 9:12 a.m.
On 21.08.2012 11:12, Glauber Costa wrote:
> On 08/20/2012 09:03 PM, Neil Horman wrote:
>>>> 2) Keep a separate internal counter to track the number of cgroup instances
>>>>>> so that you only inc the static key on the first create and dec it on the last
>>>>>> delete.
>>>>
>>>> If I got you right, than this would not be different then direclty using
>>>> static_key_slow_[inc|dec].
>>>>
>> As long as a cgroup subsystems ->destroy method is only called when the
>> subsystem is being removed, then I think thats correct.  I'm not 100% sure thats
>> the case though.
>>
> THAT is correct, but not the call itself. ->destroy() is called with the
> cgroup_lock held, and there is a lockdep dependency created by cpuset
> that prevents the cpu_hotplug lock, taken by static branch updates, to
> be taken inside cgroup_lock.
>
> So unless cpuset is fixed - which is a major work, we can't do
> static_branch updates while holding the cgroup_lock.

Thanks a lot for the info on this problem. I have spent the last days 
trying to figure out how you have solved this in memcg. It looks 
complex and fragile. So I'd rather avoid enabling/disabling the static 
branch in cgrp_create()/cgrp_destroy() and do it on module init/exit 
as it is done currently with the id assignment.

Initially I wanted to optimize the task_cls_classid() path, so that we
only spend time in there when a cgroup really exists. Though as soon as the
controller is registered with the cgroup core, the code path is enabled.
That means when cls is builtin it is used all the time (even though no 
one is using it). In the module case, it is the same. task_cls_classid() 
path is used right after module init.

So there is no need to move the enabling/disabling of the static branch 
at this point to create/destroy.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index 401672c..bbbd957 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -16,6 +16,7 @@ 
 #include <linux/cgroup.h>
 #include <linux/hardirq.h>
 #include <linux/rcupdate.h>
+#include <linux/jump_label.h>
 
 #ifdef CONFIG_CGROUPS
 struct cgroup_cls_state
@@ -44,6 +45,8 @@  static inline u32 task_cls_classid(struct task_struct *p)
 }
 
 #elif IS_MODULE(CONFIG_NET_CLS_CGROUP)
+extern struct static_key cgroup_cls_enabled;
+#define clscg_enabled static_key_false(&cgroup_cls_enabled)
 
 extern int net_cls_subsys_id;
 
@@ -52,7 +55,7 @@  static inline u32 task_cls_classid(struct task_struct *p)
 	int id;
 	u32 classid = 0;
 
-	if (in_interrupt())
+	if (!clscg_enabled || in_interrupt())
 		return 0;
 
 	rcu_read_lock();
diff --git a/net/core/sock.c b/net/core/sock.c
index 8f67ced..8106e77 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -327,6 +327,11 @@  int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 EXPORT_SYMBOL(__sk_backlog_rcv);
 
 #if defined(CONFIG_CGROUPS)
+#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
+struct static_key cgroup_cls_enabled = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL_GPL(cgroup_cls_enabled);
+#endif
+
 #if !defined(CONFIG_NET_CLS_CGROUP)
 int net_cls_subsys_id = -1;
 EXPORT_SYMBOL_GPL(net_cls_subsys_id);
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 7743ea8..0635894 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -44,12 +44,21 @@  static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp)
 
 	if (cgrp->parent)
 		cs->classid = cgrp_cls_state(cgrp->parent)->classid;
+#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
+	else if (!clscg_enabled)
+                static_key_slow_inc(&cgroup_cls_enabled);
+#endif
 
 	return &cs->css;
 }
 
 static void cgrp_destroy(struct cgroup *cgrp)
 {
+#if IS_MODULE(CONFIG_NET_CLS_CGROUP)
+	if (!cgrp->parent && clscg_enabled)
+                static_key_slow_dec(&cgroup_cls_enabled);
+#endif
+
 	kfree(cgrp_cls_state(cgrp));
 }