diff mbox series

[ghak90,V5,09/10] audit: add support for containerid to network namespaces

Message ID 27473c84a274c64871cfa8e3636deaf05603c978.1552665316.git.rgb@redhat.com
State Awaiting Upstream
Delegated to: Pablo Neira
Headers show
Series audit: implement container identifier | expand

Commit Message

Richard Guy Briggs March 15, 2019, 6:29 p.m. UTC
Audit events could happen in a network namespace outside of a task
context due to packets received from the net that trigger an auditing
rule prior to being associated with a running task.  The network
namespace could be in use by multiple containers by association to the
tasks in that network namespace.  We still want a way to attribute
these events to any potential containers.  Keep a list per network
namespace to track these audit container identifiiers.

Add/increment the audit container identifier on:
- initial setting of the audit container identifier via /proc
- clone/fork call that inherits an audit container identifier
- unshare call that inherits an audit container identifier
- setns call that inherits an audit container identifier
Delete/decrement the audit container identifier on:
- an inherited audit container identifier dropped when child set
- process exit
- unshare call that drops a net namespace
- setns call that drops a net namespace

See: https://github.com/linux-audit/audit-kernel/issues/92
See: https://github.com/linux-audit/audit-testsuite/issues/64
See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
 include/linux/audit.h | 19 ++++++++++++
 kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/nsproxy.c      |  4 +++
 3 files changed, 106 insertions(+), 3 deletions(-)

Comments

Neil Horman March 18, 2019, 8:56 p.m. UTC | #1
On Fri, Mar 15, 2019 at 02:29:57PM -0400, Richard Guy Briggs wrote:
> Audit events could happen in a network namespace outside of a task
> context due to packets received from the net that trigger an auditing
> rule prior to being associated with a running task.  The network
> namespace could be in use by multiple containers by association to the
> tasks in that network namespace.  We still want a way to attribute
> these events to any potential containers.  Keep a list per network
> namespace to track these audit container identifiiers.
> 
> Add/increment the audit container identifier on:
> - initial setting of the audit container identifier via /proc
> - clone/fork call that inherits an audit container identifier
> - unshare call that inherits an audit container identifier
> - setns call that inherits an audit container identifier
> Delete/decrement the audit container identifier on:
> - an inherited audit container identifier dropped when child set
> - process exit
> - unshare call that drops a net namespace
> - setns call that drops a net namespace
> 
> See: https://github.com/linux-audit/audit-kernel/issues/92
> See: https://github.com/linux-audit/audit-testsuite/issues/64
> See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
>  include/linux/audit.h | 19 ++++++++++++
>  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
>  kernel/nsproxy.c      |  4 +++
>  3 files changed, 106 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/audit.h b/include/linux/audit.h
> index fa19fa408931..70255c2dfb9f 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -27,6 +27,7 @@
>  #include <linux/ptrace.h>
>  #include <linux/namei.h>  /* LOOKUP_* */
>  #include <uapi/linux/audit.h>
> +#include <linux/refcount.h>
>  
>  #define AUDIT_INO_UNSET ((unsigned long)-1)
>  #define AUDIT_DEV_UNSET ((dev_t)-1)
> @@ -99,6 +100,13 @@ struct audit_task_info {
>  
>  extern struct audit_task_info init_struct_audit;
>  
> +struct audit_contid {
> +	struct list_head	list;
> +	u64			id;
> +	refcount_t		refcount;
> +	struct rcu_head		rcu;
> +};
> +
>  extern int is_audit_feature_set(int which);
>  
>  extern int __init audit_register_class(int class, unsigned *list);
> @@ -202,6 +210,10 @@ static inline u64 audit_get_contid(struct task_struct *tsk)
>  }
>  
>  extern void audit_log_contid(struct audit_context *context, u64 contid);
> +extern void audit_netns_contid_add(struct net *net, u64 contid);
> +extern void audit_netns_contid_del(struct net *net, u64 contid);
> +extern void audit_switch_task_namespaces(struct nsproxy *ns,
> +					 struct task_struct *p);
>  
>  extern u32 audit_enabled;
>  #else /* CONFIG_AUDIT */
> @@ -271,6 +283,13 @@ static inline u64 audit_get_contid(struct task_struct *tsk)
>  
>  static inline void audit_log_contid(struct audit_context *context, u64 contid)
>  { }
> +static inline void audit_netns_contid_add(struct net *net, u64 contid)
> +{ }
> +static inline void audit_netns_contid_del(struct net *net, u64 contid)
> +{ }
> +static inline void audit_switch_task_namespaces(struct nsproxy *ns,
> +						struct task_struct *p)
> +{ }
>  
>  #define audit_enabled AUDIT_OFF
>  #endif /* CONFIG_AUDIT */
> diff --git a/kernel/audit.c b/kernel/audit.c
> index cf448599ef34..7fa3194f5342 100644
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -72,6 +72,7 @@
>  #include <linux/freezer.h>
>  #include <linux/pid_namespace.h>
>  #include <net/netns/generic.h>
> +#include <net/net_namespace.h>
>  
>  #include "audit.h"
>  
> @@ -99,9 +100,13 @@
>  /**
>   * struct audit_net - audit private network namespace data
>   * @sk: communication socket
> + * @contid_list: audit container identifier list
> + * @contid_list_lock audit container identifier list lock
>   */
>  struct audit_net {
>  	struct sock *sk;
> +	struct list_head contid_list;
> +	spinlock_t contid_list_lock;
>  };
>  
>  /**
> @@ -275,8 +280,11 @@ struct audit_task_info init_struct_audit = {
>  void audit_free(struct task_struct *tsk)
>  {
>  	struct audit_task_info *info = tsk->audit;
> +	struct nsproxy *ns = tsk->nsproxy;
>  
>  	audit_free_syscall(tsk);
> +	if (ns)
> +		audit_netns_contid_del(ns->net_ns, audit_get_contid(tsk));
>  	/* Freeing the audit_task_info struct must be performed after
>  	 * audit_log_exit() due to need for loginuid and sessionid.
>  	 */
> @@ -376,6 +384,73 @@ static struct sock *audit_get_sk(const struct net *net)
>  	return aunet->sk;
>  }
>  
> +void audit_netns_contid_add(struct net *net, u64 contid)
> +{
> +	struct audit_net *aunet = net_generic(net, audit_net_id);
> +	struct list_head *contid_list = &aunet->contid_list;
> +	struct audit_contid *cont;
> +
> +	if (!audit_contid_valid(contid))
> +		return;
> +	if (!aunet)
> +		return;
> +	spin_lock(&aunet->contid_list_lock);
> +	if (!list_empty(contid_list))
> +		list_for_each_entry_rcu(cont, contid_list, list)
> +			if (cont->id == contid) {
> +				refcount_inc(&cont->refcount);
> +				goto out;
> +			}
> +	cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);
> +	if (cont) {
> +		INIT_LIST_HEAD(&cont->list);
> +		cont->id = contid;
> +		refcount_set(&cont->refcount, 1);
> +		list_add_rcu(&cont->list, contid_list);
> +	}
> +out:
> +	spin_unlock(&aunet->contid_list_lock);
> +}
> +
> +void audit_netns_contid_del(struct net *net, u64 contid)
> +{
> +	struct audit_net *aunet;
> +	struct list_head *contid_list;
> +	struct audit_contid *cont = NULL;
> +
> +	if (!net)
> +		return;
> +	if (!audit_contid_valid(contid))
> +		return;
> +	aunet = net_generic(net, audit_net_id);
> +	if (!aunet)
> +		return;
> +	contid_list = &aunet->contid_list;
> +	spin_lock(&aunet->contid_list_lock);
> +	if (!list_empty(contid_list))
> +		list_for_each_entry_rcu(cont, contid_list, list)
> +			if (cont->id == contid) {
> +				if (refcount_dec_and_test(&cont->refcount)) {
> +					list_del_rcu(&cont->list);
> +					kfree_rcu(cont, rcu);
> +				}
> +				break;
> +			}
> +	spin_unlock(&aunet->contid_list_lock);
> +}
> +
> +void audit_switch_task_namespaces(struct nsproxy *ns, struct task_struct *p)
> +{
> +	u64 contid = audit_get_contid(p);
> +	struct nsproxy *new = p->nsproxy;
> +
> +	if (!audit_contid_valid(contid))
> +		return;
> +	audit_netns_contid_del(ns->net_ns, contid);
> +	if (new)
> +		audit_netns_contid_add(new->net_ns, contid);
> +}
> +
>  void audit_panic(const char *message)
>  {
>  	switch (audit_failure) {
> @@ -1619,7 +1694,6 @@ static int __net_init audit_net_init(struct net *net)
>  		.flags	= NL_CFG_F_NONROOT_RECV,
>  		.groups	= AUDIT_NLGRP_MAX,
>  	};
> -
>  	struct audit_net *aunet = net_generic(net, audit_net_id);
>  
>  	aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
> @@ -1628,7 +1702,8 @@ static int __net_init audit_net_init(struct net *net)
>  		return -ENOMEM;
>  	}
>  	aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
> -
> +	INIT_LIST_HEAD(&aunet->contid_list);
> +	spin_lock_init(&aunet->contid_list_lock);
>  	return 0;
>  }
>  
> @@ -2380,6 +2455,7 @@ int audit_set_contid(struct task_struct *task, u64 contid)
>  	uid_t uid;
>  	struct tty_struct *tty;
>  	char comm[sizeof(current->comm)];
> +	struct net *net = task->nsproxy->net_ns;
>  
>  	task_lock(task);
>  	/* Can't set if audit disabled */
> @@ -2401,8 +2477,12 @@ int audit_set_contid(struct task_struct *task, u64 contid)
>  	else if (!(thread_group_leader(task) && thread_group_empty(task)))
>  		rc = -EALREADY;
>  	read_unlock(&tasklist_lock);
> -	if (!rc)
> +	if (!rc) {
> +		if (audit_contid_valid(oldcontid))
> +			audit_netns_contid_del(net, oldcontid);
>  		task->audit->contid = contid;
> +		audit_netns_contid_add(net, contid);
> +	}
>  	task_unlock(task);
>  
>  	if (!audit_enabled)
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index f6c5d330059a..718b1201ae70 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -27,6 +27,7 @@
>  #include <linux/syscalls.h>
>  #include <linux/cgroup.h>
>  #include <linux/perf_event.h>
> +#include <linux/audit.h>
>  
>  static struct kmem_cache *nsproxy_cachep;
>  
> @@ -140,6 +141,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
>  	struct nsproxy *old_ns = tsk->nsproxy;
>  	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
>  	struct nsproxy *new_ns;
> +	u64 contid = audit_get_contid(tsk);
>  
>  	if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
>  			      CLONE_NEWPID | CLONE_NEWNET |
> @@ -167,6 +169,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
>  		return  PTR_ERR(new_ns);
>  
>  	tsk->nsproxy = new_ns;
> +	audit_netns_contid_add(new_ns->net_ns, contid);
>  	return 0;
>  }
>  
> @@ -224,6 +227,7 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
>  	ns = p->nsproxy;
>  	p->nsproxy = new;
>  	task_unlock(p);
> +	audit_switch_task_namespaces(ns, p);
>  
>  	if (ns && atomic_dec_and_test(&ns->count))
>  		free_nsproxy(ns);
> -- 
> 1.8.3.1
> 
> 
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Ondrej Mosnacek March 27, 2019, 10:42 p.m. UTC | #2
On Fri, Mar 15, 2019 at 7:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> Audit events could happen in a network namespace outside of a task
> context due to packets received from the net that trigger an auditing
> rule prior to being associated with a running task.  The network
> namespace could be in use by multiple containers by association to the
> tasks in that network namespace.  We still want a way to attribute
> these events to any potential containers.  Keep a list per network
> namespace to track these audit container identifiiers.
>
> Add/increment the audit container identifier on:
> - initial setting of the audit container identifier via /proc
> - clone/fork call that inherits an audit container identifier
> - unshare call that inherits an audit container identifier
> - setns call that inherits an audit container identifier
> Delete/decrement the audit container identifier on:
> - an inherited audit container identifier dropped when child set
> - process exit
> - unshare call that drops a net namespace
> - setns call that drops a net namespace
>
> See: https://github.com/linux-audit/audit-kernel/issues/92
> See: https://github.com/linux-audit/audit-testsuite/issues/64
> See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
>  include/linux/audit.h | 19 ++++++++++++
>  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
>  kernel/nsproxy.c      |  4 +++
>  3 files changed, 106 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/audit.h b/include/linux/audit.h
> index fa19fa408931..70255c2dfb9f 100644
> --- a/include/linux/audit.h
> +++ b/include/linux/audit.h
> @@ -27,6 +27,7 @@
>  #include <linux/ptrace.h>
>  #include <linux/namei.h>  /* LOOKUP_* */
>  #include <uapi/linux/audit.h>
> +#include <linux/refcount.h>
>
>  #define AUDIT_INO_UNSET ((unsigned long)-1)
>  #define AUDIT_DEV_UNSET ((dev_t)-1)
> @@ -99,6 +100,13 @@ struct audit_task_info {
>
>  extern struct audit_task_info init_struct_audit;
>
> +struct audit_contid {
> +       struct list_head        list;
> +       u64                     id;
> +       refcount_t              refcount;

Hm, since we only ever touch the refcount under a spinlock, I wonder
if we could just make it a regular unsigned int (we don't need the
atomicity guarantees). OTOH, refcount_t comes with some extra overflow
checking, so it's probably better to leave it as is...

> +       struct rcu_head         rcu;
> +};
> +
>  extern int is_audit_feature_set(int which);
>
>  extern int __init audit_register_class(int class, unsigned *list);
> @@ -202,6 +210,10 @@ static inline u64 audit_get_contid(struct task_struct *tsk)
>  }
>
>  extern void audit_log_contid(struct audit_context *context, u64 contid);
> +extern void audit_netns_contid_add(struct net *net, u64 contid);
> +extern void audit_netns_contid_del(struct net *net, u64 contid);
> +extern void audit_switch_task_namespaces(struct nsproxy *ns,
> +                                        struct task_struct *p);
>
>  extern u32 audit_enabled;
>  #else /* CONFIG_AUDIT */
> @@ -271,6 +283,13 @@ static inline u64 audit_get_contid(struct task_struct *tsk)
>
>  static inline void audit_log_contid(struct audit_context *context, u64 contid)
>  { }
> +static inline void audit_netns_contid_add(struct net *net, u64 contid)
> +{ }
> +static inline void audit_netns_contid_del(struct net *net, u64 contid)
> +{ }
> +static inline void audit_switch_task_namespaces(struct nsproxy *ns,
> +                                               struct task_struct *p)
> +{ }
>
>  #define audit_enabled AUDIT_OFF
>  #endif /* CONFIG_AUDIT */
> diff --git a/kernel/audit.c b/kernel/audit.c
> index cf448599ef34..7fa3194f5342 100644
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -72,6 +72,7 @@
>  #include <linux/freezer.h>
>  #include <linux/pid_namespace.h>
>  #include <net/netns/generic.h>
> +#include <net/net_namespace.h>
>
>  #include "audit.h"
>
> @@ -99,9 +100,13 @@
>  /**
>   * struct audit_net - audit private network namespace data
>   * @sk: communication socket
> + * @contid_list: audit container identifier list
> + * @contid_list_lock audit container identifier list lock
>   */
>  struct audit_net {
>         struct sock *sk;
> +       struct list_head contid_list;
> +       spinlock_t contid_list_lock;
>  };
>
>  /**
> @@ -275,8 +280,11 @@ struct audit_task_info init_struct_audit = {
>  void audit_free(struct task_struct *tsk)
>  {
>         struct audit_task_info *info = tsk->audit;
> +       struct nsproxy *ns = tsk->nsproxy;
>
>         audit_free_syscall(tsk);
> +       if (ns)
> +               audit_netns_contid_del(ns->net_ns, audit_get_contid(tsk));
>         /* Freeing the audit_task_info struct must be performed after
>          * audit_log_exit() due to need for loginuid and sessionid.
>          */
> @@ -376,6 +384,73 @@ static struct sock *audit_get_sk(const struct net *net)
>         return aunet->sk;
>  }
>
> +void audit_netns_contid_add(struct net *net, u64 contid)
> +{
> +       struct audit_net *aunet = net_generic(net, audit_net_id);
> +       struct list_head *contid_list = &aunet->contid_list;
> +       struct audit_contid *cont;
> +
> +       if (!audit_contid_valid(contid))
> +               return;
> +       if (!aunet)
> +               return;
> +       spin_lock(&aunet->contid_list_lock);
> +       if (!list_empty(contid_list))
> +               list_for_each_entry_rcu(cont, contid_list, list)
> +                       if (cont->id == contid) {
> +                               refcount_inc(&cont->refcount);
> +                               goto out;
> +                       }
> +       cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);
> +       if (cont) {
> +               INIT_LIST_HEAD(&cont->list);
> +               cont->id = contid;
> +               refcount_set(&cont->refcount, 1);
> +               list_add_rcu(&cont->list, contid_list);
> +       }
> +out:
> +       spin_unlock(&aunet->contid_list_lock);
> +}
> +
> +void audit_netns_contid_del(struct net *net, u64 contid)
> +{
> +       struct audit_net *aunet;
> +       struct list_head *contid_list;
> +       struct audit_contid *cont = NULL;
> +
> +       if (!net)
> +               return;
> +       if (!audit_contid_valid(contid))
> +               return;
> +       aunet = net_generic(net, audit_net_id);
> +       if (!aunet)
> +               return;
> +       contid_list = &aunet->contid_list;
> +       spin_lock(&aunet->contid_list_lock);
> +       if (!list_empty(contid_list))
> +               list_for_each_entry_rcu(cont, contid_list, list)
> +                       if (cont->id == contid) {
> +                               if (refcount_dec_and_test(&cont->refcount)) {
> +                                       list_del_rcu(&cont->list);
> +                                       kfree_rcu(cont, rcu);
> +                               }
> +                               break;
> +                       }
> +       spin_unlock(&aunet->contid_list_lock);
> +}
> +
> +void audit_switch_task_namespaces(struct nsproxy *ns, struct task_struct *p)
> +{
> +       u64 contid = audit_get_contid(p);
> +       struct nsproxy *new = p->nsproxy;
> +
> +       if (!audit_contid_valid(contid))
> +               return;
> +       audit_netns_contid_del(ns->net_ns, contid);
> +       if (new)
> +               audit_netns_contid_add(new->net_ns, contid);
> +}
> +
>  void audit_panic(const char *message)
>  {
>         switch (audit_failure) {
> @@ -1619,7 +1694,6 @@ static int __net_init audit_net_init(struct net *net)
>                 .flags  = NL_CFG_F_NONROOT_RECV,
>                 .groups = AUDIT_NLGRP_MAX,
>         };
> -
>         struct audit_net *aunet = net_generic(net, audit_net_id);
>
>         aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
> @@ -1628,7 +1702,8 @@ static int __net_init audit_net_init(struct net *net)
>                 return -ENOMEM;
>         }
>         aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
> -
> +       INIT_LIST_HEAD(&aunet->contid_list);
> +       spin_lock_init(&aunet->contid_list_lock);
>         return 0;
>  }
>
> @@ -2380,6 +2455,7 @@ int audit_set_contid(struct task_struct *task, u64 contid)
>         uid_t uid;
>         struct tty_struct *tty;
>         char comm[sizeof(current->comm)];
> +       struct net *net = task->nsproxy->net_ns;
>
>         task_lock(task);
>         /* Can't set if audit disabled */
> @@ -2401,8 +2477,12 @@ int audit_set_contid(struct task_struct *task, u64 contid)
>         else if (!(thread_group_leader(task) && thread_group_empty(task)))
>                 rc = -EALREADY;
>         read_unlock(&tasklist_lock);
> -       if (!rc)
> +       if (!rc) {
> +               if (audit_contid_valid(oldcontid))
> +                       audit_netns_contid_del(net, oldcontid);
>                 task->audit->contid = contid;
> +               audit_netns_contid_add(net, contid);
> +       }
>         task_unlock(task);
>
>         if (!audit_enabled)
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index f6c5d330059a..718b1201ae70 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -27,6 +27,7 @@
>  #include <linux/syscalls.h>
>  #include <linux/cgroup.h>
>  #include <linux/perf_event.h>
> +#include <linux/audit.h>
>
>  static struct kmem_cache *nsproxy_cachep;
>
> @@ -140,6 +141,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
>         struct nsproxy *old_ns = tsk->nsproxy;
>         struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
>         struct nsproxy *new_ns;
> +       u64 contid = audit_get_contid(tsk);
>
>         if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
>                               CLONE_NEWPID | CLONE_NEWNET |
> @@ -167,6 +169,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
>                 return  PTR_ERR(new_ns);
>
>         tsk->nsproxy = new_ns;
> +       audit_netns_contid_add(new_ns->net_ns, contid);
>         return 0;
>  }
>
> @@ -224,6 +227,7 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
>         ns = p->nsproxy;
>         p->nsproxy = new;
>         task_unlock(p);
> +       audit_switch_task_namespaces(ns, p);

Since we call audit_switch_task_namespaces() after task_unlock(),
could there be a potential race condition? I'm not going to dive too
much into this now, because it's getting late here, but on first look
it seems like p->nsproxy could change under our hands before we fetch
it in audit_switch_task_namespaces()...

>
>         if (ns && atomic_dec_and_test(&ns->count))
>                 free_nsproxy(ns);
> --
> 1.8.3.1
>
Richard Guy Briggs March 28, 2019, 1:12 a.m. UTC | #3
On 2019-03-27 23:42, Ondrej Mosnacek wrote:
> On Fri, Mar 15, 2019 at 7:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > Audit events could happen in a network namespace outside of a task
> > context due to packets received from the net that trigger an auditing
> > rule prior to being associated with a running task.  The network
> > namespace could be in use by multiple containers by association to the
> > tasks in that network namespace.  We still want a way to attribute
> > these events to any potential containers.  Keep a list per network
> > namespace to track these audit container identifiiers.
> >
> > Add/increment the audit container identifier on:
> > - initial setting of the audit container identifier via /proc
> > - clone/fork call that inherits an audit container identifier
> > - unshare call that inherits an audit container identifier
> > - setns call that inherits an audit container identifier
> > Delete/decrement the audit container identifier on:
> > - an inherited audit container identifier dropped when child set
> > - process exit
> > - unshare call that drops a net namespace
> > - setns call that drops a net namespace
> >
> > See: https://github.com/linux-audit/audit-kernel/issues/92
> > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > ---
> >  include/linux/audit.h | 19 ++++++++++++
> >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> >  kernel/nsproxy.c      |  4 +++
> >  3 files changed, 106 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/linux/audit.h b/include/linux/audit.h
> > index fa19fa408931..70255c2dfb9f 100644
> > --- a/include/linux/audit.h
> > +++ b/include/linux/audit.h
> > @@ -27,6 +27,7 @@
> >  #include <linux/ptrace.h>
> >  #include <linux/namei.h>  /* LOOKUP_* */
> >  #include <uapi/linux/audit.h>
> > +#include <linux/refcount.h>
> >
> >  #define AUDIT_INO_UNSET ((unsigned long)-1)
> >  #define AUDIT_DEV_UNSET ((dev_t)-1)
> > @@ -99,6 +100,13 @@ struct audit_task_info {
> >
> >  extern struct audit_task_info init_struct_audit;
> >
> > +struct audit_contid {
> > +       struct list_head        list;
> > +       u64                     id;
> > +       refcount_t              refcount;
> 
> Hm, since we only ever touch the refcount under a spinlock, I wonder
> if we could just make it a regular unsigned int (we don't need the
> atomicity guarantees). OTOH, refcount_t comes with some extra overflow
> checking, so it's probably better to leave it as is...

Since the update is done using rcu-safe methods, do we even need the
spin_lock?  Neil?  Paul?

> > +       struct rcu_head         rcu;
> > +};
> > +
> >  extern int is_audit_feature_set(int which);
> >
> >  extern int __init audit_register_class(int class, unsigned *list);
> > @@ -202,6 +210,10 @@ static inline u64 audit_get_contid(struct task_struct *tsk)
> >  }
> >
> >  extern void audit_log_contid(struct audit_context *context, u64 contid);
> > +extern void audit_netns_contid_add(struct net *net, u64 contid);
> > +extern void audit_netns_contid_del(struct net *net, u64 contid);
> > +extern void audit_switch_task_namespaces(struct nsproxy *ns,
> > +                                        struct task_struct *p);
> >
> >  extern u32 audit_enabled;
> >  #else /* CONFIG_AUDIT */
> > @@ -271,6 +283,13 @@ static inline u64 audit_get_contid(struct task_struct *tsk)
> >
> >  static inline void audit_log_contid(struct audit_context *context, u64 contid)
> >  { }
> > +static inline void audit_netns_contid_add(struct net *net, u64 contid)
> > +{ }
> > +static inline void audit_netns_contid_del(struct net *net, u64 contid)
> > +{ }
> > +static inline void audit_switch_task_namespaces(struct nsproxy *ns,
> > +                                               struct task_struct *p)
> > +{ }
> >
> >  #define audit_enabled AUDIT_OFF
> >  #endif /* CONFIG_AUDIT */
> > diff --git a/kernel/audit.c b/kernel/audit.c
> > index cf448599ef34..7fa3194f5342 100644
> > --- a/kernel/audit.c
> > +++ b/kernel/audit.c
> > @@ -72,6 +72,7 @@
> >  #include <linux/freezer.h>
> >  #include <linux/pid_namespace.h>
> >  #include <net/netns/generic.h>
> > +#include <net/net_namespace.h>
> >
> >  #include "audit.h"
> >
> > @@ -99,9 +100,13 @@
> >  /**
> >   * struct audit_net - audit private network namespace data
> >   * @sk: communication socket
> > + * @contid_list: audit container identifier list
> > + * @contid_list_lock audit container identifier list lock
> >   */
> >  struct audit_net {
> >         struct sock *sk;
> > +       struct list_head contid_list;
> > +       spinlock_t contid_list_lock;
> >  };
> >
> >  /**
> > @@ -275,8 +280,11 @@ struct audit_task_info init_struct_audit = {
> >  void audit_free(struct task_struct *tsk)
> >  {
> >         struct audit_task_info *info = tsk->audit;
> > +       struct nsproxy *ns = tsk->nsproxy;
> >
> >         audit_free_syscall(tsk);
> > +       if (ns)
> > +               audit_netns_contid_del(ns->net_ns, audit_get_contid(tsk));
> >         /* Freeing the audit_task_info struct must be performed after
> >          * audit_log_exit() due to need for loginuid and sessionid.
> >          */
> > @@ -376,6 +384,73 @@ static struct sock *audit_get_sk(const struct net *net)
> >         return aunet->sk;
> >  }
> >
> > +void audit_netns_contid_add(struct net *net, u64 contid)
> > +{
> > +       struct audit_net *aunet = net_generic(net, audit_net_id);
> > +       struct list_head *contid_list = &aunet->contid_list;
> > +       struct audit_contid *cont;
> > +
> > +       if (!audit_contid_valid(contid))
> > +               return;
> > +       if (!aunet)
> > +               return;
> > +       spin_lock(&aunet->contid_list_lock);
> > +       if (!list_empty(contid_list))
> > +               list_for_each_entry_rcu(cont, contid_list, list)
> > +                       if (cont->id == contid) {
> > +                               refcount_inc(&cont->refcount);
> > +                               goto out;
> > +                       }
> > +       cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);
> > +       if (cont) {
> > +               INIT_LIST_HEAD(&cont->list);
> > +               cont->id = contid;
> > +               refcount_set(&cont->refcount, 1);
> > +               list_add_rcu(&cont->list, contid_list);
> > +       }
> > +out:
> > +       spin_unlock(&aunet->contid_list_lock);
> > +}
> > +
> > +void audit_netns_contid_del(struct net *net, u64 contid)
> > +{
> > +       struct audit_net *aunet;
> > +       struct list_head *contid_list;
> > +       struct audit_contid *cont = NULL;
> > +
> > +       if (!net)
> > +               return;
> > +       if (!audit_contid_valid(contid))
> > +               return;
> > +       aunet = net_generic(net, audit_net_id);
> > +       if (!aunet)
> > +               return;
> > +       contid_list = &aunet->contid_list;
> > +       spin_lock(&aunet->contid_list_lock);
> > +       if (!list_empty(contid_list))
> > +               list_for_each_entry_rcu(cont, contid_list, list)
> > +                       if (cont->id == contid) {
> > +                               if (refcount_dec_and_test(&cont->refcount)) {
> > +                                       list_del_rcu(&cont->list);
> > +                                       kfree_rcu(cont, rcu);
> > +                               }
> > +                               break;
> > +                       }
> > +       spin_unlock(&aunet->contid_list_lock);
> > +}
> > +
> > +void audit_switch_task_namespaces(struct nsproxy *ns, struct task_struct *p)
> > +{
> > +       u64 contid = audit_get_contid(p);
> > +       struct nsproxy *new = p->nsproxy;
> > +
> > +       if (!audit_contid_valid(contid))
> > +               return;
> > +       audit_netns_contid_del(ns->net_ns, contid);
> > +       if (new)
> > +               audit_netns_contid_add(new->net_ns, contid);
> > +}
> > +
> >  void audit_panic(const char *message)
> >  {
> >         switch (audit_failure) {
> > @@ -1619,7 +1694,6 @@ static int __net_init audit_net_init(struct net *net)
> >                 .flags  = NL_CFG_F_NONROOT_RECV,
> >                 .groups = AUDIT_NLGRP_MAX,
> >         };
> > -
> >         struct audit_net *aunet = net_generic(net, audit_net_id);
> >
> >         aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
> > @@ -1628,7 +1702,8 @@ static int __net_init audit_net_init(struct net *net)
> >                 return -ENOMEM;
> >         }
> >         aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
> > -
> > +       INIT_LIST_HEAD(&aunet->contid_list);
> > +       spin_lock_init(&aunet->contid_list_lock);
> >         return 0;
> >  }
> >
> > @@ -2380,6 +2455,7 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> >         uid_t uid;
> >         struct tty_struct *tty;
> >         char comm[sizeof(current->comm)];
> > +       struct net *net = task->nsproxy->net_ns;
> >
> >         task_lock(task);
> >         /* Can't set if audit disabled */
> > @@ -2401,8 +2477,12 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> >         else if (!(thread_group_leader(task) && thread_group_empty(task)))
> >                 rc = -EALREADY;
> >         read_unlock(&tasklist_lock);
> > -       if (!rc)
> > +       if (!rc) {
> > +               if (audit_contid_valid(oldcontid))
> > +                       audit_netns_contid_del(net, oldcontid);
> >                 task->audit->contid = contid;
> > +               audit_netns_contid_add(net, contid);
> > +       }
> >         task_unlock(task);
> >
> >         if (!audit_enabled)
> > diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> > index f6c5d330059a..718b1201ae70 100644
> > --- a/kernel/nsproxy.c
> > +++ b/kernel/nsproxy.c
> > @@ -27,6 +27,7 @@
> >  #include <linux/syscalls.h>
> >  #include <linux/cgroup.h>
> >  #include <linux/perf_event.h>
> > +#include <linux/audit.h>
> >
> >  static struct kmem_cache *nsproxy_cachep;
> >
> > @@ -140,6 +141,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
> >         struct nsproxy *old_ns = tsk->nsproxy;
> >         struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
> >         struct nsproxy *new_ns;
> > +       u64 contid = audit_get_contid(tsk);
> >
> >         if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
> >                               CLONE_NEWPID | CLONE_NEWNET |
> > @@ -167,6 +169,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
> >                 return  PTR_ERR(new_ns);
> >
> >         tsk->nsproxy = new_ns;
> > +       audit_netns_contid_add(new_ns->net_ns, contid);
> >         return 0;
> >  }
> >
> > @@ -224,6 +227,7 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
> >         ns = p->nsproxy;
> >         p->nsproxy = new;
> >         task_unlock(p);
> > +       audit_switch_task_namespaces(ns, p);
> 
> Since we call audit_switch_task_namespaces() after task_unlock(),
> could there be a potential race condition? I'm not going to dive too
> much into this now, because it's getting late here, but on first look
> it seems like p->nsproxy could change under our hands before we fetch
> it in audit_switch_task_namespaces()...

The rules are defined in include/linux/nsproxy.h.

Since the callers (sys_setns, do_exit, copy_process error path) are all
current or handing it a dead task and we are not writing nsproxy or its
pointers, which is only allowed by current anyway, we don't need the
lock.

> >
> >         if (ns && atomic_dec_and_test(&ns->count))
> >                 free_nsproxy(ns);
> > --
> > 1.8.3.1
> 
> Ondrej Mosnacek <omosnace at redhat dot com>

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Ondrej Mosnacek March 28, 2019, 8:01 a.m. UTC | #4
On Thu, Mar 28, 2019 at 2:12 AM Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2019-03-27 23:42, Ondrej Mosnacek wrote:
> > On Fri, Mar 15, 2019 at 7:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > Audit events could happen in a network namespace outside of a task
> > > context due to packets received from the net that trigger an auditing
> > > rule prior to being associated with a running task.  The network
> > > namespace could be in use by multiple containers by association to the
> > > tasks in that network namespace.  We still want a way to attribute
> > > these events to any potential containers.  Keep a list per network
> > > namespace to track these audit container identifiiers.
> > >
> > > Add/increment the audit container identifier on:
> > > - initial setting of the audit container identifier via /proc
> > > - clone/fork call that inherits an audit container identifier
> > > - unshare call that inherits an audit container identifier
> > > - setns call that inherits an audit container identifier
> > > Delete/decrement the audit container identifier on:
> > > - an inherited audit container identifier dropped when child set
> > > - process exit
> > > - unshare call that drops a net namespace
> > > - setns call that drops a net namespace
> > >
> > > See: https://github.com/linux-audit/audit-kernel/issues/92
> > > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > ---
> > >  include/linux/audit.h | 19 ++++++++++++
> > >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > >  kernel/nsproxy.c      |  4 +++
> > >  3 files changed, 106 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/include/linux/audit.h b/include/linux/audit.h
> > > index fa19fa408931..70255c2dfb9f 100644
> > > --- a/include/linux/audit.h
> > > +++ b/include/linux/audit.h
> > > @@ -27,6 +27,7 @@
> > >  #include <linux/ptrace.h>
> > >  #include <linux/namei.h>  /* LOOKUP_* */
> > >  #include <uapi/linux/audit.h>
> > > +#include <linux/refcount.h>
> > >
> > >  #define AUDIT_INO_UNSET ((unsigned long)-1)
> > >  #define AUDIT_DEV_UNSET ((dev_t)-1)
> > > @@ -99,6 +100,13 @@ struct audit_task_info {
> > >
> > >  extern struct audit_task_info init_struct_audit;
> > >
> > > +struct audit_contid {
> > > +       struct list_head        list;
> > > +       u64                     id;
> > > +       refcount_t              refcount;
> >
> > Hm, since we only ever touch the refcount under a spinlock, I wonder
> > if we could just make it a regular unsigned int (we don't need the
> > atomicity guarantees). OTOH, refcount_t comes with some extra overflow
> > checking, so it's probably better to leave it as is...
>
> Since the update is done using rcu-safe methods, do we even need the
> spin_lock?  Neil?  Paul?
>
> > > +       struct rcu_head         rcu;
> > > +};
> > > +
> > >  extern int is_audit_feature_set(int which);
> > >
> > >  extern int __init audit_register_class(int class, unsigned *list);
> > > @@ -202,6 +210,10 @@ static inline u64 audit_get_contid(struct task_struct *tsk)
> > >  }
> > >
> > >  extern void audit_log_contid(struct audit_context *context, u64 contid);
> > > +extern void audit_netns_contid_add(struct net *net, u64 contid);
> > > +extern void audit_netns_contid_del(struct net *net, u64 contid);
> > > +extern void audit_switch_task_namespaces(struct nsproxy *ns,
> > > +                                        struct task_struct *p);
> > >
> > >  extern u32 audit_enabled;
> > >  #else /* CONFIG_AUDIT */
> > > @@ -271,6 +283,13 @@ static inline u64 audit_get_contid(struct task_struct *tsk)
> > >
> > >  static inline void audit_log_contid(struct audit_context *context, u64 contid)
> > >  { }
> > > +static inline void audit_netns_contid_add(struct net *net, u64 contid)
> > > +{ }
> > > +static inline void audit_netns_contid_del(struct net *net, u64 contid)
> > > +{ }
> > > +static inline void audit_switch_task_namespaces(struct nsproxy *ns,
> > > +                                               struct task_struct *p)
> > > +{ }
> > >
> > >  #define audit_enabled AUDIT_OFF
> > >  #endif /* CONFIG_AUDIT */
> > > diff --git a/kernel/audit.c b/kernel/audit.c
> > > index cf448599ef34..7fa3194f5342 100644
> > > --- a/kernel/audit.c
> > > +++ b/kernel/audit.c
> > > @@ -72,6 +72,7 @@
> > >  #include <linux/freezer.h>
> > >  #include <linux/pid_namespace.h>
> > >  #include <net/netns/generic.h>
> > > +#include <net/net_namespace.h>
> > >
> > >  #include "audit.h"
> > >
> > > @@ -99,9 +100,13 @@
> > >  /**
> > >   * struct audit_net - audit private network namespace data
> > >   * @sk: communication socket
> > > + * @contid_list: audit container identifier list
> > > + * @contid_list_lock audit container identifier list lock
> > >   */
> > >  struct audit_net {
> > >         struct sock *sk;
> > > +       struct list_head contid_list;
> > > +       spinlock_t contid_list_lock;
> > >  };
> > >
> > >  /**
> > > @@ -275,8 +280,11 @@ struct audit_task_info init_struct_audit = {
> > >  void audit_free(struct task_struct *tsk)
> > >  {
> > >         struct audit_task_info *info = tsk->audit;
> > > +       struct nsproxy *ns = tsk->nsproxy;
> > >
> > >         audit_free_syscall(tsk);
> > > +       if (ns)
> > > +               audit_netns_contid_del(ns->net_ns, audit_get_contid(tsk));
> > >         /* Freeing the audit_task_info struct must be performed after
> > >          * audit_log_exit() due to need for loginuid and sessionid.
> > >          */
> > > @@ -376,6 +384,73 @@ static struct sock *audit_get_sk(const struct net *net)
> > >         return aunet->sk;
> > >  }
> > >
> > > +void audit_netns_contid_add(struct net *net, u64 contid)
> > > +{
> > > +       struct audit_net *aunet = net_generic(net, audit_net_id);
> > > +       struct list_head *contid_list = &aunet->contid_list;
> > > +       struct audit_contid *cont;
> > > +
> > > +       if (!audit_contid_valid(contid))
> > > +               return;
> > > +       if (!aunet)
> > > +               return;
> > > +       spin_lock(&aunet->contid_list_lock);
> > > +       if (!list_empty(contid_list))
> > > +               list_for_each_entry_rcu(cont, contid_list, list)
> > > +                       if (cont->id == contid) {
> > > +                               refcount_inc(&cont->refcount);
> > > +                               goto out;
> > > +                       }
> > > +       cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);
> > > +       if (cont) {
> > > +               INIT_LIST_HEAD(&cont->list);
> > > +               cont->id = contid;
> > > +               refcount_set(&cont->refcount, 1);
> > > +               list_add_rcu(&cont->list, contid_list);
> > > +       }
> > > +out:
> > > +       spin_unlock(&aunet->contid_list_lock);
> > > +}
> > > +
> > > +void audit_netns_contid_del(struct net *net, u64 contid)
> > > +{
> > > +       struct audit_net *aunet;
> > > +       struct list_head *contid_list;
> > > +       struct audit_contid *cont = NULL;
> > > +
> > > +       if (!net)
> > > +               return;
> > > +       if (!audit_contid_valid(contid))
> > > +               return;
> > > +       aunet = net_generic(net, audit_net_id);
> > > +       if (!aunet)
> > > +               return;
> > > +       contid_list = &aunet->contid_list;
> > > +       spin_lock(&aunet->contid_list_lock);
> > > +       if (!list_empty(contid_list))
> > > +               list_for_each_entry_rcu(cont, contid_list, list)
> > > +                       if (cont->id == contid) {
> > > +                               if (refcount_dec_and_test(&cont->refcount)) {
> > > +                                       list_del_rcu(&cont->list);
> > > +                                       kfree_rcu(cont, rcu);
> > > +                               }
> > > +                               break;
> > > +                       }
> > > +       spin_unlock(&aunet->contid_list_lock);
> > > +}
> > > +
> > > +void audit_switch_task_namespaces(struct nsproxy *ns, struct task_struct *p)
> > > +{
> > > +       u64 contid = audit_get_contid(p);
> > > +       struct nsproxy *new = p->nsproxy;
> > > +
> > > +       if (!audit_contid_valid(contid))
> > > +               return;
> > > +       audit_netns_contid_del(ns->net_ns, contid);
> > > +       if (new)
> > > +               audit_netns_contid_add(new->net_ns, contid);
> > > +}
> > > +
> > >  void audit_panic(const char *message)
> > >  {
> > >         switch (audit_failure) {
> > > @@ -1619,7 +1694,6 @@ static int __net_init audit_net_init(struct net *net)
> > >                 .flags  = NL_CFG_F_NONROOT_RECV,
> > >                 .groups = AUDIT_NLGRP_MAX,
> > >         };
> > > -
> > >         struct audit_net *aunet = net_generic(net, audit_net_id);
> > >
> > >         aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
> > > @@ -1628,7 +1702,8 @@ static int __net_init audit_net_init(struct net *net)
> > >                 return -ENOMEM;
> > >         }
> > >         aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
> > > -
> > > +       INIT_LIST_HEAD(&aunet->contid_list);
> > > +       spin_lock_init(&aunet->contid_list_lock);
> > >         return 0;
> > >  }
> > >
> > > @@ -2380,6 +2455,7 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> > >         uid_t uid;
> > >         struct tty_struct *tty;
> > >         char comm[sizeof(current->comm)];
> > > +       struct net *net = task->nsproxy->net_ns;
> > >
> > >         task_lock(task);
> > >         /* Can't set if audit disabled */
> > > @@ -2401,8 +2477,12 @@ int audit_set_contid(struct task_struct *task, u64 contid)
> > >         else if (!(thread_group_leader(task) && thread_group_empty(task)))
> > >                 rc = -EALREADY;
> > >         read_unlock(&tasklist_lock);
> > > -       if (!rc)
> > > +       if (!rc) {
> > > +               if (audit_contid_valid(oldcontid))
> > > +                       audit_netns_contid_del(net, oldcontid);
> > >                 task->audit->contid = contid;
> > > +               audit_netns_contid_add(net, contid);
> > > +       }
> > >         task_unlock(task);
> > >
> > >         if (!audit_enabled)
> > > diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> > > index f6c5d330059a..718b1201ae70 100644
> > > --- a/kernel/nsproxy.c
> > > +++ b/kernel/nsproxy.c
> > > @@ -27,6 +27,7 @@
> > >  #include <linux/syscalls.h>
> > >  #include <linux/cgroup.h>
> > >  #include <linux/perf_event.h>
> > > +#include <linux/audit.h>
> > >
> > >  static struct kmem_cache *nsproxy_cachep;
> > >
> > > @@ -140,6 +141,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
> > >         struct nsproxy *old_ns = tsk->nsproxy;
> > >         struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
> > >         struct nsproxy *new_ns;
> > > +       u64 contid = audit_get_contid(tsk);
> > >
> > >         if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
> > >                               CLONE_NEWPID | CLONE_NEWNET |
> > > @@ -167,6 +169,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
> > >                 return  PTR_ERR(new_ns);
> > >
> > >         tsk->nsproxy = new_ns;
> > > +       audit_netns_contid_add(new_ns->net_ns, contid);
> > >         return 0;
> > >  }
> > >
> > > @@ -224,6 +227,7 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
> > >         ns = p->nsproxy;
> > >         p->nsproxy = new;
> > >         task_unlock(p);
> > > +       audit_switch_task_namespaces(ns, p);
> >
> > Since we call audit_switch_task_namespaces() after task_unlock(),
> > could there be a potential race condition? I'm not going to dive too
> > much into this now, because it's getting late here, but on first look
> > it seems like p->nsproxy could change under our hands before we fetch
> > it in audit_switch_task_namespaces()...
>
> The rules are defined in include/linux/nsproxy.h.
>
> Since the callers (sys_setns, do_exit, copy_process error path) are all
> current or handing it a dead task and we are not writing nsproxy or its
> pointers, which is only allowed by current anyway, we don't need the
> lock.

I see, so the task lock is taken during the swap only to protect
against races with other tasks reading this task's nsproxy... makes
sense. Thanks for clarifying!

The refcount/spinlock issue is not blocking (and could be addressed in
a follow-up patch later), so:

Reviewed-by: Ondrej Mosnacek <omosnace@redhat.com>

>
> > >
> > >         if (ns && atomic_dec_and_test(&ns->count))
> > >                 free_nsproxy(ns);
> > > --
> > > 1.8.3.1
> >
> > Ondrej Mosnacek <omosnace at redhat dot com>
>
> - RGB
>
> --
> Richard Guy Briggs <rgb@redhat.com>
> Sr. S/W Engineer, Kernel Security, Base Operating Systems
> Remote, Ottawa, Red Hat Canada
> IRC: rgb, SunRaycer
> Voice: +1.647.777.2635, Internal: (81) 32635

--
Ondrej Mosnacek <omosnace at redhat dot com>
Software Engineer, Security Technologies
Red Hat, Inc.
Paul Moore March 28, 2019, 3:46 p.m. UTC | #5
On Wed, Mar 27, 2019 at 9:12 PM Richard Guy Briggs <rgb@redhat.com> wrote:
>
> On 2019-03-27 23:42, Ondrej Mosnacek wrote:
> > On Fri, Mar 15, 2019 at 7:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > Audit events could happen in a network namespace outside of a task
> > > context due to packets received from the net that trigger an auditing
> > > rule prior to being associated with a running task.  The network
> > > namespace could be in use by multiple containers by association to the
> > > tasks in that network namespace.  We still want a way to attribute
> > > these events to any potential containers.  Keep a list per network
> > > namespace to track these audit container identifiiers.
> > >
> > > Add/increment the audit container identifier on:
> > > - initial setting of the audit container identifier via /proc
> > > - clone/fork call that inherits an audit container identifier
> > > - unshare call that inherits an audit container identifier
> > > - setns call that inherits an audit container identifier
> > > Delete/decrement the audit container identifier on:
> > > - an inherited audit container identifier dropped when child set
> > > - process exit
> > > - unshare call that drops a net namespace
> > > - setns call that drops a net namespace
> > >
> > > See: https://github.com/linux-audit/audit-kernel/issues/92
> > > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > ---
> > >  include/linux/audit.h | 19 ++++++++++++
> > >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > >  kernel/nsproxy.c      |  4 +++
> > >  3 files changed, 106 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/include/linux/audit.h b/include/linux/audit.h
> > > index fa19fa408931..70255c2dfb9f 100644
> > > --- a/include/linux/audit.h
> > > +++ b/include/linux/audit.h
> > > @@ -27,6 +27,7 @@
> > >  #include <linux/ptrace.h>
> > >  #include <linux/namei.h>  /* LOOKUP_* */
> > >  #include <uapi/linux/audit.h>
> > > +#include <linux/refcount.h>
> > >
> > >  #define AUDIT_INO_UNSET ((unsigned long)-1)
> > >  #define AUDIT_DEV_UNSET ((dev_t)-1)
> > > @@ -99,6 +100,13 @@ struct audit_task_info {
> > >
> > >  extern struct audit_task_info init_struct_audit;
> > >
> > > +struct audit_contid {
> > > +       struct list_head        list;
> > > +       u64                     id;
> > > +       refcount_t              refcount;
> >
> > Hm, since we only ever touch the refcount under a spinlock, I wonder
> > if we could just make it a regular unsigned int (we don't need the
> > atomicity guarantees). OTOH, refcount_t comes with some extra overflow
> > checking, so it's probably better to leave it as is...
>
> Since the update is done using rcu-safe methods, do we even need the
> spin_lock?  Neil?  Paul?

As discussed, the refcount field is protected against simultaneous
writes by the spinlock that protects additions/removals from the list
as a whole so I don't believe the refcount_t atomicity is critical in
this regard.

Where it gets tricky, and I can't say I'm 100% confident on my answer
here, is if refcount was a regular int and we wanted to access it
outside of a spinlock (to be clear, it doesn't look like this patch
currently does this).  With RCU, if refcount was a regular int
(unsigned or otherwise), I believe it would be possible for different
threads of execution to potentially see different values of refcount
(assuming one thread was adding/removing from the list).  Using a
refcount_t would protect against this, alternatively, taking the
spinlock should also protect against this.

As we all know, RCU can be tricky at times, so I may be off on the
above; if I am, please provide an explanation so I (and likely others
as well) can learn a little bit more. :)
Richard Guy Briggs March 28, 2019, 9:40 p.m. UTC | #6
On 2019-03-28 11:46, Paul Moore wrote:
> On Wed, Mar 27, 2019 at 9:12 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> >
> > On 2019-03-27 23:42, Ondrej Mosnacek wrote:
> > > On Fri, Mar 15, 2019 at 7:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > Audit events could happen in a network namespace outside of a task
> > > > context due to packets received from the net that trigger an auditing
> > > > rule prior to being associated with a running task.  The network
> > > > namespace could be in use by multiple containers by association to the
> > > > tasks in that network namespace.  We still want a way to attribute
> > > > these events to any potential containers.  Keep a list per network
> > > > namespace to track these audit container identifiiers.
> > > >
> > > > Add/increment the audit container identifier on:
> > > > - initial setting of the audit container identifier via /proc
> > > > - clone/fork call that inherits an audit container identifier
> > > > - unshare call that inherits an audit container identifier
> > > > - setns call that inherits an audit container identifier
> > > > Delete/decrement the audit container identifier on:
> > > > - an inherited audit container identifier dropped when child set
> > > > - process exit
> > > > - unshare call that drops a net namespace
> > > > - setns call that drops a net namespace
> > > >
> > > > See: https://github.com/linux-audit/audit-kernel/issues/92
> > > > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > > > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > > ---
> > > >  include/linux/audit.h | 19 ++++++++++++
> > > >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > > >  kernel/nsproxy.c      |  4 +++
> > > >  3 files changed, 106 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/include/linux/audit.h b/include/linux/audit.h
> > > > index fa19fa408931..70255c2dfb9f 100644
> > > > --- a/include/linux/audit.h
> > > > +++ b/include/linux/audit.h
> > > > @@ -27,6 +27,7 @@
> > > >  #include <linux/ptrace.h>
> > > >  #include <linux/namei.h>  /* LOOKUP_* */
> > > >  #include <uapi/linux/audit.h>
> > > > +#include <linux/refcount.h>
> > > >
> > > >  #define AUDIT_INO_UNSET ((unsigned long)-1)
> > > >  #define AUDIT_DEV_UNSET ((dev_t)-1)
> > > > @@ -99,6 +100,13 @@ struct audit_task_info {
> > > >
> > > >  extern struct audit_task_info init_struct_audit;
> > > >
> > > > +struct audit_contid {
> > > > +       struct list_head        list;
> > > > +       u64                     id;
> > > > +       refcount_t              refcount;
> > >
> > > Hm, since we only ever touch the refcount under a spinlock, I wonder
> > > if we could just make it a regular unsigned int (we don't need the
> > > atomicity guarantees). OTOH, refcount_t comes with some extra overflow
> > > checking, so it's probably better to leave it as is...
> >
> > Since the update is done using rcu-safe methods, do we even need the
> > spin_lock?  Neil?  Paul?
> 
> As discussed, the refcount field is protected against simultaneous
> writes by the spinlock that protects additions/removals from the list
> as a whole so I don't believe the refcount_t atomicity is critical in
> this regard.
> 
> Where it gets tricky, and I can't say I'm 100% confident on my answer
> here, is if refcount was a regular int and we wanted to access it
> outside of a spinlock (to be clear, it doesn't look like this patch
> currently does this).  With RCU, if refcount was a regular int
> (unsigned or otherwise), I believe it would be possible for different
> threads of execution to potentially see different values of refcount
> (assuming one thread was adding/removing from the list).  Using a
> refcount_t would protect against this, alternatively, taking the
> spinlock should also protect against this.

Ok, from the above it isn't clear to me if you are happy with the
current code or would prefer any changes, or from below that you still
need to work it through to make a pronouncement.  It sounds to me you
would be ok with *either* spinlock *or* refcount_t, but don't see the
need for both.

> As we all know, RCU can be tricky at times, so I may be off on the
> above; if I am, please provide an explanation so I (and likely others
> as well) can learn a little bit more. :)
> 
> -- 
> paul moore
> www.paul-moore.com

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore March 28, 2019, 10 p.m. UTC | #7
On Thu, Mar 28, 2019 at 5:40 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2019-03-28 11:46, Paul Moore wrote:
> > On Wed, Mar 27, 2019 at 9:12 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > >
> > > On 2019-03-27 23:42, Ondrej Mosnacek wrote:
> > > > On Fri, Mar 15, 2019 at 7:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > > Audit events could happen in a network namespace outside of a task
> > > > > context due to packets received from the net that trigger an auditing
> > > > > rule prior to being associated with a running task.  The network
> > > > > namespace could be in use by multiple containers by association to the
> > > > > tasks in that network namespace.  We still want a way to attribute
> > > > > these events to any potential containers.  Keep a list per network
> > > > > namespace to track these audit container identifiiers.
> > > > >
> > > > > Add/increment the audit container identifier on:
> > > > > - initial setting of the audit container identifier via /proc
> > > > > - clone/fork call that inherits an audit container identifier
> > > > > - unshare call that inherits an audit container identifier
> > > > > - setns call that inherits an audit container identifier
> > > > > Delete/decrement the audit container identifier on:
> > > > > - an inherited audit container identifier dropped when child set
> > > > > - process exit
> > > > > - unshare call that drops a net namespace
> > > > > - setns call that drops a net namespace
> > > > >
> > > > > See: https://github.com/linux-audit/audit-kernel/issues/92
> > > > > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > > > > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > > > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > > > ---
> > > > >  include/linux/audit.h | 19 ++++++++++++
> > > > >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > > > >  kernel/nsproxy.c      |  4 +++
> > > > >  3 files changed, 106 insertions(+), 3 deletions(-)
> > > > >
> > > > > diff --git a/include/linux/audit.h b/include/linux/audit.h
> > > > > index fa19fa408931..70255c2dfb9f 100644
> > > > > --- a/include/linux/audit.h
> > > > > +++ b/include/linux/audit.h
> > > > > @@ -27,6 +27,7 @@
> > > > >  #include <linux/ptrace.h>
> > > > >  #include <linux/namei.h>  /* LOOKUP_* */
> > > > >  #include <uapi/linux/audit.h>
> > > > > +#include <linux/refcount.h>
> > > > >
> > > > >  #define AUDIT_INO_UNSET ((unsigned long)-1)
> > > > >  #define AUDIT_DEV_UNSET ((dev_t)-1)
> > > > > @@ -99,6 +100,13 @@ struct audit_task_info {
> > > > >
> > > > >  extern struct audit_task_info init_struct_audit;
> > > > >
> > > > > +struct audit_contid {
> > > > > +       struct list_head        list;
> > > > > +       u64                     id;
> > > > > +       refcount_t              refcount;
> > > >
> > > > Hm, since we only ever touch the refcount under a spinlock, I wonder
> > > > if we could just make it a regular unsigned int (we don't need the
> > > > atomicity guarantees). OTOH, refcount_t comes with some extra overflow
> > > > checking, so it's probably better to leave it as is...
> > >
> > > Since the update is done using rcu-safe methods, do we even need the
> > > spin_lock?  Neil?  Paul?
> >
> > As discussed, the refcount field is protected against simultaneous
> > writes by the spinlock that protects additions/removals from the list
> > as a whole so I don't believe the refcount_t atomicity is critical in
> > this regard.
> >
> > Where it gets tricky, and I can't say I'm 100% confident on my answer
> > here, is if refcount was a regular int and we wanted to access it
> > outside of a spinlock (to be clear, it doesn't look like this patch
> > currently does this).  With RCU, if refcount was a regular int
> > (unsigned or otherwise), I believe it would be possible for different
> > threads of execution to potentially see different values of refcount
> > (assuming one thread was adding/removing from the list).  Using a
> > refcount_t would protect against this, alternatively, taking the
> > spinlock should also protect against this.
>
> Ok, from the above it isn't clear to me if you are happy with the
> current code or would prefer any changes, or from below that you still
> need to work it through to make a pronouncement.  It sounds to me you
> would be ok with *either* spinlock *or* refcount_t, but don't see the
> need for both.

To be fair you didn't ask if I was "happy" with the approach above,
you asked if we needed the spinlock/refcount_t.  I believe I answered
that question as comprehensively as I could, but perhaps you wanted a
hard yes or no?  In that case, since refcount_t is obviously safer, I
would stick with that for now just to limit the number of possible
failures.  If someone smarter than you or I comes along and
definitively says you are 100% safe to use an int, then go ahead and
use an int.

Beyond that, I'm still in the process of reviewing your patches, but I
haven't finished yet, so no "pronouncement" or whatever you want to
call it.
Neil Horman March 29, 2019, 2:49 p.m. UTC | #8
On Wed, Mar 27, 2019 at 09:12:02PM -0400, Richard Guy Briggs wrote:
> On 2019-03-27 23:42, Ondrej Mosnacek wrote:
> > On Fri, Mar 15, 2019 at 7:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > Audit events could happen in a network namespace outside of a task
> > > context due to packets received from the net that trigger an auditing
> > > rule prior to being associated with a running task.  The network
> > > namespace could be in use by multiple containers by association to the
> > > tasks in that network namespace.  We still want a way to attribute
> > > these events to any potential containers.  Keep a list per network
> > > namespace to track these audit container identifiiers.
> > >
> > > Add/increment the audit container identifier on:
> > > - initial setting of the audit container identifier via /proc
> > > - clone/fork call that inherits an audit container identifier
> > > - unshare call that inherits an audit container identifier
> > > - setns call that inherits an audit container identifier
> > > Delete/decrement the audit container identifier on:
> > > - an inherited audit container identifier dropped when child set
> > > - process exit
> > > - unshare call that drops a net namespace
> > > - setns call that drops a net namespace
> > >
> > > See: https://github.com/linux-audit/audit-kernel/issues/92
> > > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > ---
> > >  include/linux/audit.h | 19 ++++++++++++
> > >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > >  kernel/nsproxy.c      |  4 +++
> > >  3 files changed, 106 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/include/linux/audit.h b/include/linux/audit.h
> > > index fa19fa408931..70255c2dfb9f 100644
> > > --- a/include/linux/audit.h
> > > +++ b/include/linux/audit.h
> > > @@ -27,6 +27,7 @@
> > >  #include <linux/ptrace.h>
> > >  #include <linux/namei.h>  /* LOOKUP_* */
> > >  #include <uapi/linux/audit.h>
> > > +#include <linux/refcount.h>
> > >
> > >  #define AUDIT_INO_UNSET ((unsigned long)-1)
> > >  #define AUDIT_DEV_UNSET ((dev_t)-1)
> > > @@ -99,6 +100,13 @@ struct audit_task_info {
> > >
> > >  extern struct audit_task_info init_struct_audit;
> > >
> > > +struct audit_contid {
> > > +       struct list_head        list;
> > > +       u64                     id;
> > > +       refcount_t              refcount;
> > 
> > Hm, since we only ever touch the refcount under a spinlock, I wonder
> > if we could just make it a regular unsigned int (we don't need the
> > atomicity guarantees). OTOH, refcount_t comes with some extra overflow
> > checking, so it's probably better to leave it as is...
> 
> Since the update is done using rcu-safe methods, do we even need the
> spin_lock?  Neil?  Paul?
> 
Yes, we do.  Rcu-safe methods only apply to read side operations, we still need
traditional mutual exclusion on the write side of the operation.  That is to say
we need to protect the list against multiple writers at the same time, and for
that we need a spin lock.

Neil
Neil Horman March 29, 2019, 2:50 p.m. UTC | #9
On Thu, Mar 28, 2019 at 05:40:23PM -0400, Richard Guy Briggs wrote:
> On 2019-03-28 11:46, Paul Moore wrote:
> > On Wed, Mar 27, 2019 at 9:12 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > >
> > > On 2019-03-27 23:42, Ondrej Mosnacek wrote:
> > > > On Fri, Mar 15, 2019 at 7:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > > Audit events could happen in a network namespace outside of a task
> > > > > context due to packets received from the net that trigger an auditing
> > > > > rule prior to being associated with a running task.  The network
> > > > > namespace could be in use by multiple containers by association to the
> > > > > tasks in that network namespace.  We still want a way to attribute
> > > > > these events to any potential containers.  Keep a list per network
> > > > > namespace to track these audit container identifiiers.
> > > > >
> > > > > Add/increment the audit container identifier on:
> > > > > - initial setting of the audit container identifier via /proc
> > > > > - clone/fork call that inherits an audit container identifier
> > > > > - unshare call that inherits an audit container identifier
> > > > > - setns call that inherits an audit container identifier
> > > > > Delete/decrement the audit container identifier on:
> > > > > - an inherited audit container identifier dropped when child set
> > > > > - process exit
> > > > > - unshare call that drops a net namespace
> > > > > - setns call that drops a net namespace
> > > > >
> > > > > See: https://github.com/linux-audit/audit-kernel/issues/92
> > > > > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > > > > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > > > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > > > ---
> > > > >  include/linux/audit.h | 19 ++++++++++++
> > > > >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > > > >  kernel/nsproxy.c      |  4 +++
> > > > >  3 files changed, 106 insertions(+), 3 deletions(-)
> > > > >
> > > > > diff --git a/include/linux/audit.h b/include/linux/audit.h
> > > > > index fa19fa408931..70255c2dfb9f 100644
> > > > > --- a/include/linux/audit.h
> > > > > +++ b/include/linux/audit.h
> > > > > @@ -27,6 +27,7 @@
> > > > >  #include <linux/ptrace.h>
> > > > >  #include <linux/namei.h>  /* LOOKUP_* */
> > > > >  #include <uapi/linux/audit.h>
> > > > > +#include <linux/refcount.h>
> > > > >
> > > > >  #define AUDIT_INO_UNSET ((unsigned long)-1)
> > > > >  #define AUDIT_DEV_UNSET ((dev_t)-1)
> > > > > @@ -99,6 +100,13 @@ struct audit_task_info {
> > > > >
> > > > >  extern struct audit_task_info init_struct_audit;
> > > > >
> > > > > +struct audit_contid {
> > > > > +       struct list_head        list;
> > > > > +       u64                     id;
> > > > > +       refcount_t              refcount;
> > > >
> > > > Hm, since we only ever touch the refcount under a spinlock, I wonder
> > > > if we could just make it a regular unsigned int (we don't need the
> > > > atomicity guarantees). OTOH, refcount_t comes with some extra overflow
> > > > checking, so it's probably better to leave it as is...
> > >
> > > Since the update is done using rcu-safe methods, do we even need the
> > > spin_lock?  Neil?  Paul?
> > 
> > As discussed, the refcount field is protected against simultaneous
> > writes by the spinlock that protects additions/removals from the list
> > as a whole so I don't believe the refcount_t atomicity is critical in
> > this regard.
> > 
> > Where it gets tricky, and I can't say I'm 100% confident on my answer
> > here, is if refcount was a regular int and we wanted to access it
> > outside of a spinlock (to be clear, it doesn't look like this patch
> > currently does this).  With RCU, if refcount was a regular int
> > (unsigned or otherwise), I believe it would be possible for different
> > threads of execution to potentially see different values of refcount
> > (assuming one thread was adding/removing from the list).  Using a
> > refcount_t would protect against this, alternatively, taking the
> > spinlock should also protect against this.
> 
> Ok, from the above it isn't clear to me if you are happy with the
> current code or would prefer any changes, or from below that you still
> need to work it through to make a pronouncement.  It sounds to me you
> would be ok with *either* spinlock *or* refcount_t, but don't see the
> need for both.
> 
I'll reiterate I think we should keep the refcount type just as it is, not for
safetys sake, but for readability and convienience.

Because the refcount currently only is used on add and delete operations
(implying it is only read in paths where its also modified), we need to
guarantee atomicity against multiple parallel writes.  We already have that
guarantee because every path in which we call
refcount_set/refcount_inc/refcount_dec_and_test occurs under the protection of
the list spin lock, and so from that standpoint we don't need the additional
guarantees offered by the refcount_t type.

However, if it were to be converted to an int type, we would have to replace the
refcount_dec_and_test call with this:
if (x == 0)
   warn_on_underflow
   return
x -= 1;
if (x == 0)
   preform_operations_to_free_list_entry

I find refcount_dec_and_test to be far easier to read and maintain, and you
still have to do all of this under the protection of a spin lock, to protect
against multiple writes.  And if you ever find that you are adding a pure read
side query of the refcount, you would need to hold the spin lock there as well,
instead of just using the available refcount_read api call

Yeah, you would save a few cycles if you didn't use an atomic type here, but
we're only talking about paths from user space making system calls executing
here (no high volume per packet receive paths or anything), and these paths have
already taken a few locks (the list lock, the task lock, etc), so eliminating
this one atomic isn't going to amount to anything.  Lets leave it as it is and
buy ourselves the extra code readability.

Neil

> > As we all know, RCU can be tricky at times, so I may be off on the
> > above; if I am, please provide an explanation so I (and likely others
> > as well) can learn a little bit more. :)
> > 
> > -- 
> > paul moore
> > www.paul-moore.com
> 
> - RGB
> 
> --
> Richard Guy Briggs <rgb@redhat.com>
> Sr. S/W Engineer, Kernel Security, Base Operating Systems
> Remote, Ottawa, Red Hat Canada
> IRC: rgb, SunRaycer
> Voice: +1.647.777.2635, Internal: (81) 32635
> 
>
Neil Horman March 31, 2019, 2:11 a.m. UTC | #10
On Thu, Mar 28, 2019 at 06:00:21PM -0400, Paul Moore wrote:
> On Thu, Mar 28, 2019 at 5:40 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > On 2019-03-28 11:46, Paul Moore wrote:
> > > On Wed, Mar 27, 2019 at 9:12 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > >
> > > > On 2019-03-27 23:42, Ondrej Mosnacek wrote:
> > > > > On Fri, Mar 15, 2019 at 7:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > > > Audit events could happen in a network namespace outside of a task
> > > > > > context due to packets received from the net that trigger an auditing
> > > > > > rule prior to being associated with a running task.  The network
> > > > > > namespace could be in use by multiple containers by association to the
> > > > > > tasks in that network namespace.  We still want a way to attribute
> > > > > > these events to any potential containers.  Keep a list per network
> > > > > > namespace to track these audit container identifiiers.
> > > > > >
> > > > > > Add/increment the audit container identifier on:
> > > > > > - initial setting of the audit container identifier via /proc
> > > > > > - clone/fork call that inherits an audit container identifier
> > > > > > - unshare call that inherits an audit container identifier
> > > > > > - setns call that inherits an audit container identifier
> > > > > > Delete/decrement the audit container identifier on:
> > > > > > - an inherited audit container identifier dropped when child set
> > > > > > - process exit
> > > > > > - unshare call that drops a net namespace
> > > > > > - setns call that drops a net namespace
> > > > > >
> > > > > > See: https://github.com/linux-audit/audit-kernel/issues/92
> > > > > > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > > > > > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > > > > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > > > > ---
> > > > > >  include/linux/audit.h | 19 ++++++++++++
> > > > > >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > > > > >  kernel/nsproxy.c      |  4 +++
> > > > > >  3 files changed, 106 insertions(+), 3 deletions(-)
> > > > > >
> > > > > > diff --git a/include/linux/audit.h b/include/linux/audit.h
> > > > > > index fa19fa408931..70255c2dfb9f 100644
> > > > > > --- a/include/linux/audit.h
> > > > > > +++ b/include/linux/audit.h
> > > > > > @@ -27,6 +27,7 @@
> > > > > >  #include <linux/ptrace.h>
> > > > > >  #include <linux/namei.h>  /* LOOKUP_* */
> > > > > >  #include <uapi/linux/audit.h>
> > > > > > +#include <linux/refcount.h>
> > > > > >
> > > > > >  #define AUDIT_INO_UNSET ((unsigned long)-1)
> > > > > >  #define AUDIT_DEV_UNSET ((dev_t)-1)
> > > > > > @@ -99,6 +100,13 @@ struct audit_task_info {
> > > > > >
> > > > > >  extern struct audit_task_info init_struct_audit;
> > > > > >
> > > > > > +struct audit_contid {
> > > > > > +       struct list_head        list;
> > > > > > +       u64                     id;
> > > > > > +       refcount_t              refcount;
> > > > >
> > > > > Hm, since we only ever touch the refcount under a spinlock, I wonder
> > > > > if we could just make it a regular unsigned int (we don't need the
> > > > > atomicity guarantees). OTOH, refcount_t comes with some extra overflow
> > > > > checking, so it's probably better to leave it as is...
> > > >
> > > > Since the update is done using rcu-safe methods, do we even need the
> > > > spin_lock?  Neil?  Paul?
> > >
> > > As discussed, the refcount field is protected against simultaneous
> > > writes by the spinlock that protects additions/removals from the list
> > > as a whole so I don't believe the refcount_t atomicity is critical in
> > > this regard.
> > >
> > > Where it gets tricky, and I can't say I'm 100% confident on my answer
> > > here, is if refcount was a regular int and we wanted to access it
> > > outside of a spinlock (to be clear, it doesn't look like this patch
> > > currently does this).  With RCU, if refcount was a regular int
> > > (unsigned or otherwise), I believe it would be possible for different
> > > threads of execution to potentially see different values of refcount
> > > (assuming one thread was adding/removing from the list).  Using a
> > > refcount_t would protect against this, alternatively, taking the
> > > spinlock should also protect against this.
> >
> > Ok, from the above it isn't clear to me if you are happy with the
> > current code or would prefer any changes, or from below that you still
> > need to work it through to make a pronouncement.  It sounds to me you
> > would be ok with *either* spinlock *or* refcount_t, but don't see the
> > need for both.
> 
> To be fair you didn't ask if I was "happy" with the approach above,
> you asked if we needed the spinlock/refcount_t.  I believe I answered
> that question as comprehensively as I could, but perhaps you wanted a
> hard yes or no?  In that case, since refcount_t is obviously safer, I
> would stick with that for now just to limit the number of possible
> failures.  If someone smarter than you or I comes along and
> definitively says you are 100% safe to use an int, then go ahead and
> use an int.
> 
> Beyond that, I'm still in the process of reviewing your patches, but I
> haven't finished yet, so no "pronouncement" or whatever you want to
> call it.
> 

We definately need the spinlock, as its not meant to protect the refcount
alterations, its meant to protect parallel modifications to the list pointers.
Without the spinlock, the list can become corrupted (protecting the refcount is
just a byproduct).

Because of that byproduct, the atomicity of the refcount isn't required, and so
we could modify it to be an int or some other non-atomic ordinal type, but as I
noted, I don't think thats a good idea.  The refcount type helps denote clearly
what the variable is used for, making it more readable, and given the relative
non-performance critical path that the reference count is read/written in, and
the relatively more expensive locking we are already doing there, I think there
is more value in using the refcount to make the code legible than making
marginally more performant by altering it to be an int type.

Neil

> -- 
> paul moore
> www.paul-moore.com
>
Paul Moore April 1, 2019, 2:50 p.m. UTC | #11
On Fri, Mar 15, 2019 at 2:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> Audit events could happen in a network namespace outside of a task
> context due to packets received from the net that trigger an auditing
> rule prior to being associated with a running task.  The network
> namespace could be in use by multiple containers by association to the
> tasks in that network namespace.  We still want a way to attribute
> these events to any potential containers.  Keep a list per network
> namespace to track these audit container identifiiers.
>
> Add/increment the audit container identifier on:
> - initial setting of the audit container identifier via /proc
> - clone/fork call that inherits an audit container identifier
> - unshare call that inherits an audit container identifier
> - setns call that inherits an audit container identifier
> Delete/decrement the audit container identifier on:
> - an inherited audit container identifier dropped when child set
> - process exit
> - unshare call that drops a net namespace
> - setns call that drops a net namespace
>
> See: https://github.com/linux-audit/audit-kernel/issues/92
> See: https://github.com/linux-audit/audit-testsuite/issues/64
> See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> ---
>  include/linux/audit.h | 19 ++++++++++++
>  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
>  kernel/nsproxy.c      |  4 +++
>  3 files changed, 106 insertions(+), 3 deletions(-)

...

> diff --git a/kernel/audit.c b/kernel/audit.c
> index cf448599ef34..7fa3194f5342 100644
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -72,6 +72,7 @@
>  #include <linux/freezer.h>
>  #include <linux/pid_namespace.h>
>  #include <net/netns/generic.h>
> +#include <net/net_namespace.h>
>
>  #include "audit.h"
>
> @@ -99,9 +100,13 @@
>  /**
>   * struct audit_net - audit private network namespace data
>   * @sk: communication socket
> + * @contid_list: audit container identifier list
> + * @contid_list_lock audit container identifier list lock
>   */
>  struct audit_net {
>         struct sock *sk;
> +       struct list_head contid_list;
> +       spinlock_t contid_list_lock;
>  };
>
>  /**
> @@ -275,8 +280,11 @@ struct audit_task_info init_struct_audit = {
>  void audit_free(struct task_struct *tsk)
>  {
>         struct audit_task_info *info = tsk->audit;
> +       struct nsproxy *ns = tsk->nsproxy;
>
>         audit_free_syscall(tsk);
> +       if (ns)
> +               audit_netns_contid_del(ns->net_ns, audit_get_contid(tsk));
>         /* Freeing the audit_task_info struct must be performed after
>          * audit_log_exit() due to need for loginuid and sessionid.
>          */
> @@ -376,6 +384,73 @@ static struct sock *audit_get_sk(const struct net *net)
>         return aunet->sk;
>  }
>
> +void audit_netns_contid_add(struct net *net, u64 contid)
> +{
> +       struct audit_net *aunet = net_generic(net, audit_net_id);
> +       struct list_head *contid_list = &aunet->contid_list;
> +       struct audit_contid *cont;
> +
> +       if (!audit_contid_valid(contid))
> +               return;
> +       if (!aunet)
> +               return;

We should move the contid_list assignment below this check, or decide
that aunet is always going to valid (?) and get rid of this check
completely.

> +       spin_lock(&aunet->contid_list_lock);
> +       if (!list_empty(contid_list))

We don't need the list_empty() check here do we?  I think we can just
call list_for_each_entry_rcu(), yes?

> +               list_for_each_entry_rcu(cont, contid_list, list)
> +                       if (cont->id == contid) {
> +                               refcount_inc(&cont->refcount);
> +                               goto out;
> +                       }
> +       cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);

If you had to guess, what do you think is going to be more common:
bumping the refcount of an existing entry in the list, or adding a new
entry?  I'm asking because I always get a little nervous when doing
allocations while holding a spinlock.  Yes, you are doing it with
GFP_ATOMIC, but it still seems like something to try and avoid if this
is going to approach 50%.  However, if the new entry is rare then the
extra work of always doing the allocation before taking the lock and
then freeing it afterwards might be a bad tradeoff.

My gut feeling says we might do about as many allocations as refcount
bumps, but I could be thinking about this wrong.

Moving the allocation outside the spinlock might also open the door to
doing this as GFP_KERNEL, which is a good thing, but I haven't looked
at the callers to see if that is possible (it may not be).  That's an
exercise left to the patch author (if he hasn't done that already).

> +       if (cont) {
> +               INIT_LIST_HEAD(&cont->list);

Unless there is some guidance that INIT_LIST_HEAD() should be used
regardless, you shouldn't need to call this here since list_add_rcu()
will take care of any list.h related initialization.

> +               cont->id = contid;
> +               refcount_set(&cont->refcount, 1);
> +               list_add_rcu(&cont->list, contid_list);
> +       }
> +out:
> +       spin_unlock(&aunet->contid_list_lock);
> +}
Richard Guy Briggs April 1, 2019, 8:41 p.m. UTC | #12
On 2019-04-01 10:50, Paul Moore wrote:
> On Fri, Mar 15, 2019 at 2:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > Audit events could happen in a network namespace outside of a task
> > context due to packets received from the net that trigger an auditing
> > rule prior to being associated with a running task.  The network
> > namespace could be in use by multiple containers by association to the
> > tasks in that network namespace.  We still want a way to attribute
> > these events to any potential containers.  Keep a list per network
> > namespace to track these audit container identifiiers.
> >
> > Add/increment the audit container identifier on:
> > - initial setting of the audit container identifier via /proc
> > - clone/fork call that inherits an audit container identifier
> > - unshare call that inherits an audit container identifier
> > - setns call that inherits an audit container identifier
> > Delete/decrement the audit container identifier on:
> > - an inherited audit container identifier dropped when child set
> > - process exit
> > - unshare call that drops a net namespace
> > - setns call that drops a net namespace
> >
> > See: https://github.com/linux-audit/audit-kernel/issues/92
> > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > ---
> >  include/linux/audit.h | 19 ++++++++++++
> >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> >  kernel/nsproxy.c      |  4 +++
> >  3 files changed, 106 insertions(+), 3 deletions(-)
> 
> ...
> 
> > diff --git a/kernel/audit.c b/kernel/audit.c
> > index cf448599ef34..7fa3194f5342 100644
> > --- a/kernel/audit.c
> > +++ b/kernel/audit.c
> > @@ -72,6 +72,7 @@
> >  #include <linux/freezer.h>
> >  #include <linux/pid_namespace.h>
> >  #include <net/netns/generic.h>
> > +#include <net/net_namespace.h>
> >
> >  #include "audit.h"
> >
> > @@ -99,9 +100,13 @@
> >  /**
> >   * struct audit_net - audit private network namespace data
> >   * @sk: communication socket
> > + * @contid_list: audit container identifier list
> > + * @contid_list_lock audit container identifier list lock
> >   */
> >  struct audit_net {
> >         struct sock *sk;
> > +       struct list_head contid_list;
> > +       spinlock_t contid_list_lock;
> >  };
> >
> >  /**
> > @@ -275,8 +280,11 @@ struct audit_task_info init_struct_audit = {
> >  void audit_free(struct task_struct *tsk)
> >  {
> >         struct audit_task_info *info = tsk->audit;
> > +       struct nsproxy *ns = tsk->nsproxy;
> >
> >         audit_free_syscall(tsk);
> > +       if (ns)
> > +               audit_netns_contid_del(ns->net_ns, audit_get_contid(tsk));
> >         /* Freeing the audit_task_info struct must be performed after
> >          * audit_log_exit() due to need for loginuid and sessionid.
> >          */
> > @@ -376,6 +384,73 @@ static struct sock *audit_get_sk(const struct net *net)
> >         return aunet->sk;
> >  }
> >
> > +void audit_netns_contid_add(struct net *net, u64 contid)
> > +{
> > +       struct audit_net *aunet = net_generic(net, audit_net_id);
> > +       struct list_head *contid_list = &aunet->contid_list;
> > +       struct audit_contid *cont;
> > +
> > +       if (!audit_contid_valid(contid))
> > +               return;
> > +       if (!aunet)
> > +               return;
> 
> We should move the contid_list assignment below this check, or decide
> that aunet is always going to valid (?) and get rid of this check
> completely.

Yes, we should move the contid_list assignment below the aunet check.
At this point, testing has revlealed that audit_netns_contid_del() is
being called very infrequently (very early) with invalid net or aunet
which is why its code has been re-arranged to safeguard against it.  So
far, audit_netns_contid_add() has not had this problem, but I'd feel
safer leaving the check in and to make that check relevant (rather than
the contid_list assignment panicing) I'll move that assignment.

> > +       spin_lock(&aunet->contid_list_lock);
> > +       if (!list_empty(contid_list))
> 
> We don't need the list_empty() check here do we?  I think we can just
> call list_for_each_entry_rcu(), yes?

Yes, you appear to be correct.  The examples in auditfilter.c and
auditsc.c are stale since the original optimizations they were
protecting are no longer present, but the practice was copied by several
maintainers since.  I'll fix this and address the others seperately.

> > +               list_for_each_entry_rcu(cont, contid_list, list)
> > +                       if (cont->id == contid) {
> > +                               refcount_inc(&cont->refcount);
> > +                               goto out;
> > +                       }
> > +       cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);
> 
> If you had to guess, what do you think is going to be more common:
> bumping the refcount of an existing entry in the list, or adding a new
> entry?  I'm asking because I always get a little nervous when doing
> allocations while holding a spinlock.  Yes, you are doing it with
> GFP_ATOMIC, but it still seems like something to try and avoid if this
> is going to approach 50%.  However, if the new entry is rare then the
> extra work of always doing the allocation before taking the lock and
> then freeing it afterwards might be a bad tradeoff.
> 
> My gut feeling says we might do about as many allocations as refcount
> bumps, but I could be thinking about this wrong.

I'm sure we could find extreme use cases in both directions.  One
extreme is one task per container.  The other extreme is a distro
container where all processes for an entire O/S running in a container
just use the refcount.  The former seems more likely at this point.  We
break even at two tasks per container.

I'm willing to switch this over to alloc/dealloc and use GFP_KERNEL
which will inevitably make some kernel maintainers happier and others
cringe.  This could be a good candidate for kmem_cache since it is a
fixed size.

> Moving the allocation outside the spinlock might also open the door to
> doing this as GFP_KERNEL, which is a good thing, but I haven't looked
> at the callers to see if that is possible (it may not be).  That's an
> exercise left to the patch author (if he hasn't done that already).

Switching to GFP_KERNEL is certainly preferable.  None of the callers
will have a problem with that.

> > +       if (cont) {
> > +               INIT_LIST_HEAD(&cont->list);
> 
> Unless there is some guidance that INIT_LIST_HEAD() should be used
> regardless, you shouldn't need to call this here since list_add_rcu()
> will take care of any list.h related initialization.

I see enough examples without that suggest it isn't necessary.  I'll
remove it.

> > +               cont->id = contid;
> > +               refcount_set(&cont->refcount, 1);
> > +               list_add_rcu(&cont->list, contid_list);
> > +       }
> > +out:
> > +       spin_unlock(&aunet->contid_list_lock);
> > +}
> 
> paul moore

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Neil Horman April 2, 2019, 11:31 a.m. UTC | #13
On Mon, Apr 01, 2019 at 10:50:03AM -0400, Paul Moore wrote:
> On Fri, Mar 15, 2019 at 2:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > Audit events could happen in a network namespace outside of a task
> > context due to packets received from the net that trigger an auditing
> > rule prior to being associated with a running task.  The network
> > namespace could be in use by multiple containers by association to the
> > tasks in that network namespace.  We still want a way to attribute
> > these events to any potential containers.  Keep a list per network
> > namespace to track these audit container identifiiers.
> >
> > Add/increment the audit container identifier on:
> > - initial setting of the audit container identifier via /proc
> > - clone/fork call that inherits an audit container identifier
> > - unshare call that inherits an audit container identifier
> > - setns call that inherits an audit container identifier
> > Delete/decrement the audit container identifier on:
> > - an inherited audit container identifier dropped when child set
> > - process exit
> > - unshare call that drops a net namespace
> > - setns call that drops a net namespace
> >
> > See: https://github.com/linux-audit/audit-kernel/issues/92
> > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > ---
> >  include/linux/audit.h | 19 ++++++++++++
> >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> >  kernel/nsproxy.c      |  4 +++
> >  3 files changed, 106 insertions(+), 3 deletions(-)
> 
> ...
> 
> > diff --git a/kernel/audit.c b/kernel/audit.c
> > index cf448599ef34..7fa3194f5342 100644
> > --- a/kernel/audit.c
> > +++ b/kernel/audit.c
> > @@ -72,6 +72,7 @@
> >  #include <linux/freezer.h>
> >  #include <linux/pid_namespace.h>
> >  #include <net/netns/generic.h>
> > +#include <net/net_namespace.h>
> >
> >  #include "audit.h"
> >
> > @@ -99,9 +100,13 @@
> >  /**
> >   * struct audit_net - audit private network namespace data
> >   * @sk: communication socket
> > + * @contid_list: audit container identifier list
> > + * @contid_list_lock audit container identifier list lock
> >   */
> >  struct audit_net {
> >         struct sock *sk;
> > +       struct list_head contid_list;
> > +       spinlock_t contid_list_lock;
> >  };
> >
> >  /**
> > @@ -275,8 +280,11 @@ struct audit_task_info init_struct_audit = {
> >  void audit_free(struct task_struct *tsk)
> >  {
> >         struct audit_task_info *info = tsk->audit;
> > +       struct nsproxy *ns = tsk->nsproxy;
> >
> >         audit_free_syscall(tsk);
> > +       if (ns)
> > +               audit_netns_contid_del(ns->net_ns, audit_get_contid(tsk));
> >         /* Freeing the audit_task_info struct must be performed after
> >          * audit_log_exit() due to need for loginuid and sessionid.
> >          */
> > @@ -376,6 +384,73 @@ static struct sock *audit_get_sk(const struct net *net)
> >         return aunet->sk;
> >  }
> >
> > +void audit_netns_contid_add(struct net *net, u64 contid)
> > +{
> > +       struct audit_net *aunet = net_generic(net, audit_net_id);
> > +       struct list_head *contid_list = &aunet->contid_list;
> > +       struct audit_contid *cont;
> > +
> > +       if (!audit_contid_valid(contid))
> > +               return;
> > +       if (!aunet)
> > +               return;
> 
> We should move the contid_list assignment below this check, or decide
> that aunet is always going to valid (?) and get rid of this check
> completely.
> 
I'm not sure why that would be needed.  Finding the net_id list is an operation
of a map relating net namespaces to lists, not contids to lists.  We could do
it, sure, but since they're unrelated operations, I don't think we experience
any slowdowns from doing it this way.

> > +       spin_lock(&aunet->contid_list_lock);
> > +       if (!list_empty(contid_list))
> 
> We don't need the list_empty() check here do we?  I think we can just
> call list_for_each_entry_rcu(), yes?
> 
This is true, the list_empty check is redundant, and the for loop will fall
through if the list is empty.

> > +               list_for_each_entry_rcu(cont, contid_list, list)
> > +                       if (cont->id == contid) {
> > +                               refcount_inc(&cont->refcount);
> > +                               goto out;
> > +                       }
> > +       cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);
> 
> If you had to guess, what do you think is going to be more common:
> bumping the refcount of an existing entry in the list, or adding a new
> entry?  I'm asking because I always get a little nervous when doing
> allocations while holding a spinlock.  Yes, you are doing it with
> GFP_ATOMIC, but it still seems like something to try and avoid if this
> is going to approach 50%.  However, if the new entry is rare then the
> extra work of always doing the allocation before taking the lock and
> then freeing it afterwards might be a bad tradeoff.
> 
I think this is another way of asking, will multiple processes exist in the same
network namespace?  That is to say, will we expect a process to create a net
namespace, and then have other processes enter that namespace (thereby
triggering multiple calls to audit_netns_contid_add with the same net pointer
and cont id).  Given that the kubernetes notion of a pod is almost by definition
multiple containers sharing a network namespace, I think the answer is that we
will be strongly biased towards the refcount_inc case, rather than the kmalloc
case.  I could be wrong, but I think this is likely already in the optimized
order.

> My gut feeling says we might do about as many allocations as refcount
> bumps, but I could be thinking about this wrong.
> 
> Moving the allocation outside the spinlock might also open the door to
> doing this as GFP_KERNEL, which is a good thing, but I haven't looked
> at the callers to see if that is possible (it may not be).  That's an
> exercise left to the patch author (if he hasn't done that already).
> 
> > +       if (cont) {
> > +               INIT_LIST_HEAD(&cont->list);
> 
> Unless there is some guidance that INIT_LIST_HEAD() should be used
> regardless, you shouldn't need to call this here since list_add_rcu()
> will take care of any list.h related initialization.
> 
There is a corner case that needs it.  list_add_rcu has a check that gets
called, __list_add_valid.  Its a noop in the regular case, but if
CONFIG_DEBUG_LIST is defined, its a check to ensure that the next and prev
pointers getting passed in aren't set to detectable corrupt values.  If we pass
in garbage, we can get transient false positives on that check, so we need to
set the list pointers to known good values before hand, either by using kzalloc,
or INIT_LIST_HEAD, as has been done here.  Given that we expressly set every
field of this structure, I think this is the right approach, as it uses the list
macro to expressly set the list values to their proper state.

> > +               cont->id = contid;
> > +               refcount_set(&cont->refcount, 1);
> > +               list_add_rcu(&cont->list, contid_list);
> > +       }
> > +out:
> > +       spin_unlock(&aunet->contid_list_lock);
> > +}
> 
> -- 
> paul moore
> www.paul-moore.com
>
Paul Moore April 2, 2019, 1:31 p.m. UTC | #14
On Tue, Apr 2, 2019 at 7:32 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> On Mon, Apr 01, 2019 at 10:50:03AM -0400, Paul Moore wrote:
> > On Fri, Mar 15, 2019 at 2:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > Audit events could happen in a network namespace outside of a task
> > > context due to packets received from the net that trigger an auditing
> > > rule prior to being associated with a running task.  The network
> > > namespace could be in use by multiple containers by association to the
> > > tasks in that network namespace.  We still want a way to attribute
> > > these events to any potential containers.  Keep a list per network
> > > namespace to track these audit container identifiiers.
> > >
> > > Add/increment the audit container identifier on:
> > > - initial setting of the audit container identifier via /proc
> > > - clone/fork call that inherits an audit container identifier
> > > - unshare call that inherits an audit container identifier
> > > - setns call that inherits an audit container identifier
> > > Delete/decrement the audit container identifier on:
> > > - an inherited audit container identifier dropped when child set
> > > - process exit
> > > - unshare call that drops a net namespace
> > > - setns call that drops a net namespace
> > >
> > > See: https://github.com/linux-audit/audit-kernel/issues/92
> > > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > ---
> > >  include/linux/audit.h | 19 ++++++++++++
> > >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > >  kernel/nsproxy.c      |  4 +++
> > >  3 files changed, 106 insertions(+), 3 deletions(-)
> >
> > ...
> >
> > > diff --git a/kernel/audit.c b/kernel/audit.c
> > > index cf448599ef34..7fa3194f5342 100644
> > > --- a/kernel/audit.c
> > > +++ b/kernel/audit.c
> > > @@ -72,6 +72,7 @@
> > >  #include <linux/freezer.h>
> > >  #include <linux/pid_namespace.h>
> > >  #include <net/netns/generic.h>
> > > +#include <net/net_namespace.h>
> > >
> > >  #include "audit.h"
> > >
> > > @@ -99,9 +100,13 @@
> > >  /**
> > >   * struct audit_net - audit private network namespace data
> > >   * @sk: communication socket
> > > + * @contid_list: audit container identifier list
> > > + * @contid_list_lock audit container identifier list lock
> > >   */
> > >  struct audit_net {
> > >         struct sock *sk;
> > > +       struct list_head contid_list;
> > > +       spinlock_t contid_list_lock;
> > >  };
> > >
> > >  /**
> > > @@ -275,8 +280,11 @@ struct audit_task_info init_struct_audit = {
> > >  void audit_free(struct task_struct *tsk)
> > >  {
> > >         struct audit_task_info *info = tsk->audit;
> > > +       struct nsproxy *ns = tsk->nsproxy;
> > >
> > >         audit_free_syscall(tsk);
> > > +       if (ns)
> > > +               audit_netns_contid_del(ns->net_ns, audit_get_contid(tsk));
> > >         /* Freeing the audit_task_info struct must be performed after
> > >          * audit_log_exit() due to need for loginuid and sessionid.
> > >          */
> > > @@ -376,6 +384,73 @@ static struct sock *audit_get_sk(const struct net *net)
> > >         return aunet->sk;
> > >  }
> > >
> > > +void audit_netns_contid_add(struct net *net, u64 contid)
> > > +{
> > > +       struct audit_net *aunet = net_generic(net, audit_net_id);
> > > +       struct list_head *contid_list = &aunet->contid_list;
> > > +       struct audit_contid *cont;
> > > +
> > > +       if (!audit_contid_valid(contid))
> > > +               return;
> > > +       if (!aunet)
> > > +               return;
> >
> > We should move the contid_list assignment below this check, or decide
> > that aunet is always going to valid (?) and get rid of this check
> > completely.
> >
> I'm not sure why that would be needed.  Finding the net_id list is an operation
> of a map relating net namespaces to lists, not contids to lists.  We could do
> it, sure, but since they're unrelated operations, I don't think we experience
> any slowdowns from doing it this way.

In the first line of the function, when aunet is declared, it is also
assigned a value using net_generic():

  struct audit_net *aunet = net_generic(net, audit_net_id);

Later in the function there is check to see if aunet is NULL, yet on
the second line of the function (before the NULL check), there is this
line of code:

  struct list_head *contid_list = &aunet->contid_list;

... which could result in the dereference of a NULL pointer if aunet
is NULL.  My suggestion was either to move this assignment below the
aunet-NULL check or decide that aunet was always going to be valid
(e.g. non-NULL) and do away with the aunet-NULL check completely.
Richard has since replied that the aunet-NULL check has been
demonstrated to be necessary so the proper thing to do would be to
move the assignment.  I believe that is what Richard is planning on
doing.

> > > +       if (cont) {
> > > +               INIT_LIST_HEAD(&cont->list);
> >
> > Unless there is some guidance that INIT_LIST_HEAD() should be used
> > regardless, you shouldn't need to call this here since list_add_rcu()
> > will take care of any list.h related initialization.
>
> There is a corner case that needs it.  list_add_rcu has a check that gets
> called, __list_add_valid.  Its a noop in the regular case, but if
> CONFIG_DEBUG_LIST is defined, its a check to ensure that the next and prev
> pointers getting passed in aren't set to detectable corrupt values.  If we pass
> in garbage, we can get transient false positives on that check, so we need to
> set the list pointers to known good values before hand, either by using kzalloc,
> or INIT_LIST_HEAD, as has been done here.  Given that we expressly set every
> field of this structure, I think this is the right approach, as it uses the list
> macro to expressly set the list values to their proper state.

Good to know, thanks.
Neil Horman April 2, 2019, 2:28 p.m. UTC | #15
On Tue, Apr 02, 2019 at 09:31:49AM -0400, Paul Moore wrote:
> On Tue, Apr 2, 2019 at 7:32 AM Neil Horman <nhorman@tuxdriver.com> wrote:
> > On Mon, Apr 01, 2019 at 10:50:03AM -0400, Paul Moore wrote:
> > > On Fri, Mar 15, 2019 at 2:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > Audit events could happen in a network namespace outside of a task
> > > > context due to packets received from the net that trigger an auditing
> > > > rule prior to being associated with a running task.  The network
> > > > namespace could be in use by multiple containers by association to the
> > > > tasks in that network namespace.  We still want a way to attribute
> > > > these events to any potential containers.  Keep a list per network
> > > > namespace to track these audit container identifiiers.
> > > >
> > > > Add/increment the audit container identifier on:
> > > > - initial setting of the audit container identifier via /proc
> > > > - clone/fork call that inherits an audit container identifier
> > > > - unshare call that inherits an audit container identifier
> > > > - setns call that inherits an audit container identifier
> > > > Delete/decrement the audit container identifier on:
> > > > - an inherited audit container identifier dropped when child set
> > > > - process exit
> > > > - unshare call that drops a net namespace
> > > > - setns call that drops a net namespace
> > > >
> > > > See: https://github.com/linux-audit/audit-kernel/issues/92
> > > > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > > > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > > ---
> > > >  include/linux/audit.h | 19 ++++++++++++
> > > >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > > >  kernel/nsproxy.c      |  4 +++
> > > >  3 files changed, 106 insertions(+), 3 deletions(-)
> > >
> > > ...
> > >
> > > > diff --git a/kernel/audit.c b/kernel/audit.c
> > > > index cf448599ef34..7fa3194f5342 100644
> > > > --- a/kernel/audit.c
> > > > +++ b/kernel/audit.c
> > > > @@ -72,6 +72,7 @@
> > > >  #include <linux/freezer.h>
> > > >  #include <linux/pid_namespace.h>
> > > >  #include <net/netns/generic.h>
> > > > +#include <net/net_namespace.h>
> > > >
> > > >  #include "audit.h"
> > > >
> > > > @@ -99,9 +100,13 @@
> > > >  /**
> > > >   * struct audit_net - audit private network namespace data
> > > >   * @sk: communication socket
> > > > + * @contid_list: audit container identifier list
> > > > + * @contid_list_lock audit container identifier list lock
> > > >   */
> > > >  struct audit_net {
> > > >         struct sock *sk;
> > > > +       struct list_head contid_list;
> > > > +       spinlock_t contid_list_lock;
> > > >  };
> > > >
> > > >  /**
> > > > @@ -275,8 +280,11 @@ struct audit_task_info init_struct_audit = {
> > > >  void audit_free(struct task_struct *tsk)
> > > >  {
> > > >         struct audit_task_info *info = tsk->audit;
> > > > +       struct nsproxy *ns = tsk->nsproxy;
> > > >
> > > >         audit_free_syscall(tsk);
> > > > +       if (ns)
> > > > +               audit_netns_contid_del(ns->net_ns, audit_get_contid(tsk));
> > > >         /* Freeing the audit_task_info struct must be performed after
> > > >          * audit_log_exit() due to need for loginuid and sessionid.
> > > >          */
> > > > @@ -376,6 +384,73 @@ static struct sock *audit_get_sk(const struct net *net)
> > > >         return aunet->sk;
> > > >  }
> > > >
> > > > +void audit_netns_contid_add(struct net *net, u64 contid)
> > > > +{
> > > > +       struct audit_net *aunet = net_generic(net, audit_net_id);
> > > > +       struct list_head *contid_list = &aunet->contid_list;
> > > > +       struct audit_contid *cont;
> > > > +
> > > > +       if (!audit_contid_valid(contid))
> > > > +               return;
> > > > +       if (!aunet)
> > > > +               return;
> > >
> > > We should move the contid_list assignment below this check, or decide
> > > that aunet is always going to valid (?) and get rid of this check
> > > completely.
> > >
> > I'm not sure why that would be needed.  Finding the net_id list is an operation
> > of a map relating net namespaces to lists, not contids to lists.  We could do
> > it, sure, but since they're unrelated operations, I don't think we experience
> > any slowdowns from doing it this way.
> 
> In the first line of the function, when aunet is declared, it is also
> assigned a value using net_generic():
> 
>   struct audit_net *aunet = net_generic(net, audit_net_id);
> 
> Later in the function there is check to see if aunet is NULL, yet on
> the second line of the function (before the NULL check), there is this
> line of code:
> 
>   struct list_head *contid_list = &aunet->contid_list;
> 
> ... which could result in the dereference of a NULL pointer if aunet
> is NULL.  My suggestion was either to move this assignment below the
> aunet-NULL check or decide that aunet was always going to be valid
> (e.g. non-NULL) and do away with the aunet-NULL check completely.
> Richard has since replied that the aunet-NULL check has been
> demonstrated to be necessary so the proper thing to do would be to
> move the assignment.  I believe that is what Richard is planning on
> doing.
> 
oh, I'm sorry, you're right, I was looking at the contid tests not the list
tests.

Neil

> > > > +       if (cont) {
> > > > +               INIT_LIST_HEAD(&cont->list);
> > >
> > > Unless there is some guidance that INIT_LIST_HEAD() should be used
> > > regardless, you shouldn't need to call this here since list_add_rcu()
> > > will take care of any list.h related initialization.
> >
> > There is a corner case that needs it.  list_add_rcu has a check that gets
> > called, __list_add_valid.  Its a noop in the regular case, but if
> > CONFIG_DEBUG_LIST is defined, its a check to ensure that the next and prev
> > pointers getting passed in aren't set to detectable corrupt values.  If we pass
> > in garbage, we can get transient false positives on that check, so we need to
> > set the list pointers to known good values before hand, either by using kzalloc,
> > or INIT_LIST_HEAD, as has been done here.  Given that we expressly set every
> > field of this structure, I think this is the right approach, as it uses the list
> > macro to expressly set the list values to their proper state.
> 
> Good to know, thanks.
> 
> -- 
> paul moore
> www.paul-moore.com
>
Richard Guy Briggs April 4, 2019, 9:40 p.m. UTC | #16
On 2019-04-02 07:31, Neil Horman wrote:
> On Mon, Apr 01, 2019 at 10:50:03AM -0400, Paul Moore wrote:
> > On Fri, Mar 15, 2019 at 2:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > Audit events could happen in a network namespace outside of a task
> > > context due to packets received from the net that trigger an auditing
> > > rule prior to being associated with a running task.  The network
> > > namespace could be in use by multiple containers by association to the
> > > tasks in that network namespace.  We still want a way to attribute
> > > these events to any potential containers.  Keep a list per network
> > > namespace to track these audit container identifiiers.
> > >
> > > Add/increment the audit container identifier on:
> > > - initial setting of the audit container identifier via /proc
> > > - clone/fork call that inherits an audit container identifier
> > > - unshare call that inherits an audit container identifier
> > > - setns call that inherits an audit container identifier
> > > Delete/decrement the audit container identifier on:
> > > - an inherited audit container identifier dropped when child set
> > > - process exit
> > > - unshare call that drops a net namespace
> > > - setns call that drops a net namespace
> > >
> > > See: https://github.com/linux-audit/audit-kernel/issues/92
> > > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > ---
> > >  include/linux/audit.h | 19 ++++++++++++
> > >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > >  kernel/nsproxy.c      |  4 +++
> > >  3 files changed, 106 insertions(+), 3 deletions(-)
> > 
> > ...
> > 
> > > diff --git a/kernel/audit.c b/kernel/audit.c
> > > index cf448599ef34..7fa3194f5342 100644
> > > --- a/kernel/audit.c
> > > +++ b/kernel/audit.c
> > > @@ -72,6 +72,7 @@
> > >  #include <linux/freezer.h>
> > >  #include <linux/pid_namespace.h>
> > >  #include <net/netns/generic.h>
> > > +#include <net/net_namespace.h>
> > >
> > >  #include "audit.h"
> > >
> > > @@ -99,9 +100,13 @@
> > >  /**
> > >   * struct audit_net - audit private network namespace data
> > >   * @sk: communication socket
> > > + * @contid_list: audit container identifier list
> > > + * @contid_list_lock audit container identifier list lock
> > >   */
> > >  struct audit_net {
> > >         struct sock *sk;
> > > +       struct list_head contid_list;
> > > +       spinlock_t contid_list_lock;
> > >  };
> > >
> > >  /**
> > > @@ -275,8 +280,11 @@ struct audit_task_info init_struct_audit = {
> > >  void audit_free(struct task_struct *tsk)
> > >  {
> > >         struct audit_task_info *info = tsk->audit;
> > > +       struct nsproxy *ns = tsk->nsproxy;
> > >
> > >         audit_free_syscall(tsk);
> > > +       if (ns)
> > > +               audit_netns_contid_del(ns->net_ns, audit_get_contid(tsk));
> > >         /* Freeing the audit_task_info struct must be performed after
> > >          * audit_log_exit() due to need for loginuid and sessionid.
> > >          */
> > > @@ -376,6 +384,73 @@ static struct sock *audit_get_sk(const struct net *net)
> > >         return aunet->sk;
> > >  }
> > >
> > > +void audit_netns_contid_add(struct net *net, u64 contid)
> > > +{
> > > +       struct audit_net *aunet = net_generic(net, audit_net_id);
> > > +       struct list_head *contid_list = &aunet->contid_list;
> > > +       struct audit_contid *cont;
> > > +
> > > +       if (!audit_contid_valid(contid))
> > > +               return;
> > > +       if (!aunet)
> > > +               return;
> > 
> > We should move the contid_list assignment below this check, or decide
> > that aunet is always going to valid (?) and get rid of this check
> > completely.
> > 
> I'm not sure why that would be needed.  Finding the net_id list is an operation
> of a map relating net namespaces to lists, not contids to lists.  We could do
> it, sure, but since they're unrelated operations, I don't think we experience
> any slowdowns from doing it this way.
> 
> > > +       spin_lock(&aunet->contid_list_lock);
> > > +       if (!list_empty(contid_list))
> > 
> > We don't need the list_empty() check here do we?  I think we can just
> > call list_for_each_entry_rcu(), yes?
> > 
> This is true, the list_empty check is redundant, and the for loop will fall
> through if the list is empty.
> 
> > > +               list_for_each_entry_rcu(cont, contid_list, list)
> > > +                       if (cont->id == contid) {
> > > +                               refcount_inc(&cont->refcount);
> > > +                               goto out;
> > > +                       }
> > > +       cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);
> > 
> > If you had to guess, what do you think is going to be more common:
> > bumping the refcount of an existing entry in the list, or adding a new
> > entry?  I'm asking because I always get a little nervous when doing
> > allocations while holding a spinlock.  Yes, you are doing it with
> > GFP_ATOMIC, but it still seems like something to try and avoid if this
> > is going to approach 50%.  However, if the new entry is rare then the
> > extra work of always doing the allocation before taking the lock and
> > then freeing it afterwards might be a bad tradeoff.
> > 
> I think this is another way of asking, will multiple processes exist in the same
> network namespace?  That is to say, will we expect a process to create a net
> namespace, and then have other processes enter that namespace (thereby
> triggering multiple calls to audit_netns_contid_add with the same net pointer
> and cont id).  Given that the kubernetes notion of a pod is almost by definition
> multiple containers sharing a network namespace, I think the answer is that we
> will be strongly biased towards the refcount_inc case, rather than the kmalloc
> case.  I could be wrong, but I think this is likely already in the optimized
> order.

I had a stab at doing a GFP_KERNEL alloc before the spinlock and releasing it
after if it wasn't needed (in Patch#1 below).  I also went one step further and
converted it to kmem_cache (in Patch#2 below).

> > My gut feeling says we might do about as many allocations as refcount
> > bumps, but I could be thinking about this wrong.
> > 
> > Moving the allocation outside the spinlock might also open the door to
> > doing this as GFP_KERNEL, which is a good thing, but I haven't looked
> > at the callers to see if that is possible (it may not be).  That's an
> > exercise left to the patch author (if he hasn't done that already).

Both appear to work, but after successfully running both the contid test and
audit_testsuite, once I start to push that test system a bit harder I end up
with a deadlock warning.

I am having trouble understanding why since it happens both without and with
the kmem_cache options, so it must be another part of the code that is
triggering it.  The task_lock is being held at this point in
audit_set_contid().  I haven't tried changing this alloc over to a GFP_ATOMIC
to see if that prevents it, just as a debugging check...
At this point, I'm inclined to leave it as it was without these two patches
since it works and there doesn't seem to be an obvious best way given the
uncertainty of the potential workloads.

======================================================
WARNING: possible circular locking dependency detected
5.1.0-rc1-ghak90-audit-containerID.v6.3+ #80 Not tainted
------------------------------------------------------
kswapd0/41 is trying to acquire lock:
0000000006a8c88b (&(&p->alloc_lock)->rlock){+.+.}, at: create_task_io_context+0xe9/0x150

but task is already holding lock:
0000000010aadb74 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x40

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

-> #1 (fs_reclaim){+.+.}:
       __fs_reclaim_acquire+0x2c/0x40
       kmem_cache_alloc_trace+0x30/0x230
       audit_netns_contid_add.part.12+0xcf/0x210
       audit_set_contid+0x18f/0x480
       proc_contid_write+0x74/0x110
       vfs_write+0xad/0x1b0
       ksys_write+0x55/0xc0
       do_syscall_64+0x60/0x210
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

-> #0 (&(&p->alloc_lock)->rlock){+.+.}:
       lock_acquire+0xd4/0x1c0
       _raw_spin_lock+0x2e/0x40
       create_task_io_context+0xe9/0x150
       generic_make_request_checks+0x8b4/0x970
       generic_make_request+0xbb/0x570
       submit_bio+0x6e/0x130
       __swap_writepage+0x281/0x420
       shmem_writepage+0x1d3/0x350
       pageout.isra.54+0x1e9/0x3f0
       shrink_page_list+0xa9f/0xe60
       shrink_inactive_list+0x263/0x6e0
       shrink_node_memcg+0x376/0x7c0
       shrink_node+0xdd/0x470
       balance_pgdat+0x26c/0x570
       kswapd+0x191/0x4f0
       kthread+0xf8/0x130
       ret_from_fork+0x3a/0x50

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(fs_reclaim);
                               lock(&(&p->alloc_lock)->rlock);
                               lock(fs_reclaim);
  lock(&(&p->alloc_lock)->rlock);

 *** DEADLOCK ***

1 lock held by kswapd0/41:
 #0: 0000000010aadb74 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x40

> > paul moore

Patch#1:
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -388,7 +388,7 @@ void audit_netns_contid_add(struct net *net, u64 contid)
 {
        struct audit_net *aunet;
        struct list_head *contid_list;
-       struct audit_contid *cont;
+       struct audit_contid *cont, *newcont;
 
        if (!net)
                return;
@@ -398,18 +398,19 @@ void audit_netns_contid_add(struct net *net, u64 contid)
        if (!aunet)
                return;
        contid_list = &aunet->contid_list;
+       newcont = kmalloc(sizeof(struct audit_contid), GFP_KERNEL);
        spin_lock(&aunet->contid_list_lock);
        list_for_each_entry_rcu(cont, contid_list, list)
                if (cont->id == contid) {
                        refcount_inc(&cont->refcount);
+                       kfree(newcont);
                        goto out;
                }
-       cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);
-       if (cont) {
-               INIT_LIST_HEAD(&cont->list);
-               cont->id = contid;
-               refcount_set(&cont->refcount, 1);
-               list_add_rcu(&cont->list, contid_list);
+       if (newcont) {
+               INIT_LIST_HEAD(&newcont->list);
+               newcont->id = contid;
+               refcount_set(&newcont->refcount, 1);
+               list_add_rcu(&newcont->list, contid_list);
        }
 out:
        spin_unlock(&aunet->contid_list_lock);


Patch#2:
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -222,12 +222,16 @@ struct audit_reply {
 };
 
 static struct kmem_cache *audit_task_cache;
+static struct kmem_cache *audit_contid_cache;
 
 void __init audit_task_init(void)
 {
        audit_task_cache = kmem_cache_create("audit_task",
                                             sizeof(struct audit_task_info),
                                             0, SLAB_PANIC, NULL);
+       audit_contid_cache = kmem_cache_create("audit_contid",
+                                            sizeof(struct audit_contid),
+                                            0, SLAB_PANIC, NULL);
 }
 
 /**
@@ -388,7 +392,7 @@ void audit_netns_contid_add(struct net *net, u64 contid)
 {
        struct audit_net *aunet;
        struct list_head *contid_list;
-       struct audit_contid *cont;
+       struct audit_contid *cont, *newcont;
 
        if (!net)
                return;
@@ -398,23 +402,32 @@ void audit_netns_contid_add(struct net *net, u64 contid)
        if (!aunet)
                return;
        contid_list = &aunet->contid_list;
+       newcont = kmem_cache_alloc(audit_contid_cache, GFP_KERNEL);
        spin_lock(&aunet->contid_list_lock);
        list_for_each_entry_rcu(cont, contid_list, list)
                if (cont->id == contid) {
                        refcount_inc(&cont->refcount);
+                       kmem_cache_free(audit_contid_cache, newcont);
                        goto out;
                }
-       cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);
-       if (cont) {
-               INIT_LIST_HEAD(&cont->list);
-               cont->id = contid;
-               refcount_set(&cont->refcount, 1);
-               list_add_rcu(&cont->list, contid_list);
+       if (newcont) {
+               INIT_LIST_HEAD(&newcont->list);
+               newcont->id = contid;
+               refcount_set(&newcont->refcount, 1);
+               list_add_rcu(&newcont->list, contid_list);
        }
 out:
        spin_unlock(&aunet->contid_list_lock);
 }
 
+static void audit_contid_free(struct rcu_head *rcu)
+{
+       struct audit_contid *cont;
+
+       cont = container_of(rcu, struct audit_contid, rcu);
+       kmem_cache_free(audit_contid_cache, cont);
+}
+
 void audit_netns_contid_del(struct net *net, u64 contid)
 {
        struct audit_net *aunet;
@@ -434,7 +447,7 @@ void audit_netns_contid_del(struct net *net, u64 contid)
                if (cont->id == contid) {
                        if (refcount_dec_and_test(&cont->refcount)) {
                                list_del_rcu(&cont->list);
-                               kfree_rcu(cont, rcu);
+                               call_rcu(&cont->rcu, audit_contid_free);
                        }
                        break;
                }

- RGB

--
Richard Guy Briggs <rgb@redhat.com>
Sr. S/W Engineer, Kernel Security, Base Operating Systems
Remote, Ottawa, Red Hat Canada
IRC: rgb, SunRaycer
Voice: +1.647.777.2635, Internal: (81) 32635
Paul Moore April 5, 2019, 2:06 a.m. UTC | #17
On Thu, Apr 4, 2019 at 5:40 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> On 2019-04-02 07:31, Neil Horman wrote:
> > On Mon, Apr 01, 2019 at 10:50:03AM -0400, Paul Moore wrote:
> > > On Fri, Mar 15, 2019 at 2:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > Audit events could happen in a network namespace outside of a task
> > > > context due to packets received from the net that trigger an auditing
> > > > rule prior to being associated with a running task.  The network
> > > > namespace could be in use by multiple containers by association to the
> > > > tasks in that network namespace.  We still want a way to attribute
> > > > these events to any potential containers.  Keep a list per network
> > > > namespace to track these audit container identifiiers.
> > > >
> > > > Add/increment the audit container identifier on:
> > > > - initial setting of the audit container identifier via /proc
> > > > - clone/fork call that inherits an audit container identifier
> > > > - unshare call that inherits an audit container identifier
> > > > - setns call that inherits an audit container identifier
> > > > Delete/decrement the audit container identifier on:
> > > > - an inherited audit container identifier dropped when child set
> > > > - process exit
> > > > - unshare call that drops a net namespace
> > > > - setns call that drops a net namespace
> > > >
> > > > See: https://github.com/linux-audit/audit-kernel/issues/92
> > > > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > > > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > > ---
> > > >  include/linux/audit.h | 19 ++++++++++++
> > > >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > > >  kernel/nsproxy.c      |  4 +++
> > > >  3 files changed, 106 insertions(+), 3 deletions(-)

...

> > > > +               list_for_each_entry_rcu(cont, contid_list, list)
> > > > +                       if (cont->id == contid) {
> > > > +                               refcount_inc(&cont->refcount);
> > > > +                               goto out;
> > > > +                       }
> > > > +       cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);
> > >
> > > If you had to guess, what do you think is going to be more common:
> > > bumping the refcount of an existing entry in the list, or adding a new
> > > entry?  I'm asking because I always get a little nervous when doing
> > > allocations while holding a spinlock.  Yes, you are doing it with
> > > GFP_ATOMIC, but it still seems like something to try and avoid if this
> > > is going to approach 50%.  However, if the new entry is rare then the
> > > extra work of always doing the allocation before taking the lock and
> > > then freeing it afterwards might be a bad tradeoff.
> > >
> > I think this is another way of asking, will multiple processes exist in the same
> > network namespace?  That is to say, will we expect a process to create a net
> > namespace, and then have other processes enter that namespace (thereby
> > triggering multiple calls to audit_netns_contid_add with the same net pointer
> > and cont id).  Given that the kubernetes notion of a pod is almost by definition
> > multiple containers sharing a network namespace, I think the answer is that we
> > will be strongly biased towards the refcount_inc case, rather than the kmalloc
> > case.  I could be wrong, but I think this is likely already in the optimized
> > order.
>
> I had a stab at doing a GFP_KERNEL alloc before the spinlock and releasing it
> after if it wasn't needed (in Patch#1 below).  I also went one step further and
> converted it to kmem_cache (in Patch#2 below).
>
> > > My gut feeling says we might do about as many allocations as refcount
> > > bumps, but I could be thinking about this wrong.
> > >
> > > Moving the allocation outside the spinlock might also open the door to
> > > doing this as GFP_KERNEL, which is a good thing, but I haven't looked
> > > at the callers to see if that is possible (it may not be).  That's an
> > > exercise left to the patch author (if he hasn't done that already).
>
> Both appear to work, but after successfully running both the contid test and
> audit_testsuite, once I start to push that test system a bit harder I end up
> with a deadlock warning.
>
> I am having trouble understanding why since it happens both without and with
> the kmem_cache options, so it must be another part of the code that is
> triggering it.  The task_lock is being held at this point in
> audit_set_contid().  I haven't tried changing this alloc over to a GFP_ATOMIC
> to see if that prevents it, just as a debugging check...
> At this point, I'm inclined to leave it as it was without these two patches
> since it works and there doesn't seem to be an obvious best way given the
> uncertainty of the potential workloads.

I'm definitely not a mm expert, but I wonder if you might be able to
get this working using GFP_NOFS, but I see just now that they
recommend you *not* use GFP_NOFS; even if this did solve the problem,
it's probably not the right approach.

I'm okay with leaving it as-is with GFP_ATOMIC.  My original response
to this was more of a question about optimizing for a given use case,
having something that works is far more important.
Neil Horman April 5, 2019, 11:32 a.m. UTC | #18
On Thu, Apr 04, 2019 at 05:40:06PM -0400, Richard Guy Briggs wrote:
> On 2019-04-02 07:31, Neil Horman wrote:
> > On Mon, Apr 01, 2019 at 10:50:03AM -0400, Paul Moore wrote:
> > > On Fri, Mar 15, 2019 at 2:35 PM Richard Guy Briggs <rgb@redhat.com> wrote:
> > > > Audit events could happen in a network namespace outside of a task
> > > > context due to packets received from the net that trigger an auditing
> > > > rule prior to being associated with a running task.  The network
> > > > namespace could be in use by multiple containers by association to the
> > > > tasks in that network namespace.  We still want a way to attribute
> > > > these events to any potential containers.  Keep a list per network
> > > > namespace to track these audit container identifiiers.
> > > >
> > > > Add/increment the audit container identifier on:
> > > > - initial setting of the audit container identifier via /proc
> > > > - clone/fork call that inherits an audit container identifier
> > > > - unshare call that inherits an audit container identifier
> > > > - setns call that inherits an audit container identifier
> > > > Delete/decrement the audit container identifier on:
> > > > - an inherited audit container identifier dropped when child set
> > > > - process exit
> > > > - unshare call that drops a net namespace
> > > > - setns call that drops a net namespace
> > > >
> > > > See: https://github.com/linux-audit/audit-kernel/issues/92
> > > > See: https://github.com/linux-audit/audit-testsuite/issues/64
> > > > See: https://github.com/linux-audit/audit-kernel/wiki/RFE-Audit-Container-ID
> > > > Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
> > > > ---
> > > >  include/linux/audit.h | 19 ++++++++++++
> > > >  kernel/audit.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
> > > >  kernel/nsproxy.c      |  4 +++
> > > >  3 files changed, 106 insertions(+), 3 deletions(-)
> > > 
> > > ...
> > > 
> > > > diff --git a/kernel/audit.c b/kernel/audit.c
> > > > index cf448599ef34..7fa3194f5342 100644
> > > > --- a/kernel/audit.c
> > > > +++ b/kernel/audit.c
> > > > @@ -72,6 +72,7 @@
> > > >  #include <linux/freezer.h>
> > > >  #include <linux/pid_namespace.h>
> > > >  #include <net/netns/generic.h>
> > > > +#include <net/net_namespace.h>
> > > >
> > > >  #include "audit.h"
> > > >
> > > > @@ -99,9 +100,13 @@
> > > >  /**
> > > >   * struct audit_net - audit private network namespace data
> > > >   * @sk: communication socket
> > > > + * @contid_list: audit container identifier list
> > > > + * @contid_list_lock audit container identifier list lock
> > > >   */
> > > >  struct audit_net {
> > > >         struct sock *sk;
> > > > +       struct list_head contid_list;
> > > > +       spinlock_t contid_list_lock;
> > > >  };
> > > >
> > > >  /**
> > > > @@ -275,8 +280,11 @@ struct audit_task_info init_struct_audit = {
> > > >  void audit_free(struct task_struct *tsk)
> > > >  {
> > > >         struct audit_task_info *info = tsk->audit;
> > > > +       struct nsproxy *ns = tsk->nsproxy;
> > > >
> > > >         audit_free_syscall(tsk);
> > > > +       if (ns)
> > > > +               audit_netns_contid_del(ns->net_ns, audit_get_contid(tsk));
> > > >         /* Freeing the audit_task_info struct must be performed after
> > > >          * audit_log_exit() due to need for loginuid and sessionid.
> > > >          */
> > > > @@ -376,6 +384,73 @@ static struct sock *audit_get_sk(const struct net *net)
> > > >         return aunet->sk;
> > > >  }
> > > >
> > > > +void audit_netns_contid_add(struct net *net, u64 contid)
> > > > +{
> > > > +       struct audit_net *aunet = net_generic(net, audit_net_id);
> > > > +       struct list_head *contid_list = &aunet->contid_list;
> > > > +       struct audit_contid *cont;
> > > > +
> > > > +       if (!audit_contid_valid(contid))
> > > > +               return;
> > > > +       if (!aunet)
> > > > +               return;
> > > 
> > > We should move the contid_list assignment below this check, or decide
> > > that aunet is always going to valid (?) and get rid of this check
> > > completely.
> > > 
> > I'm not sure why that would be needed.  Finding the net_id list is an operation
> > of a map relating net namespaces to lists, not contids to lists.  We could do
> > it, sure, but since they're unrelated operations, I don't think we experience
> > any slowdowns from doing it this way.
> > 
> > > > +       spin_lock(&aunet->contid_list_lock);
> > > > +       if (!list_empty(contid_list))
> > > 
> > > We don't need the list_empty() check here do we?  I think we can just
> > > call list_for_each_entry_rcu(), yes?
> > > 
> > This is true, the list_empty check is redundant, and the for loop will fall
> > through if the list is empty.
> > 
> > > > +               list_for_each_entry_rcu(cont, contid_list, list)
> > > > +                       if (cont->id == contid) {
> > > > +                               refcount_inc(&cont->refcount);
> > > > +                               goto out;
> > > > +                       }
> > > > +       cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);
> > > 
> > > If you had to guess, what do you think is going to be more common:
> > > bumping the refcount of an existing entry in the list, or adding a new
> > > entry?  I'm asking because I always get a little nervous when doing
> > > allocations while holding a spinlock.  Yes, you are doing it with
> > > GFP_ATOMIC, but it still seems like something to try and avoid if this
> > > is going to approach 50%.  However, if the new entry is rare then the
> > > extra work of always doing the allocation before taking the lock and
> > > then freeing it afterwards might be a bad tradeoff.
> > > 
> > I think this is another way of asking, will multiple processes exist in the same
> > network namespace?  That is to say, will we expect a process to create a net
> > namespace, and then have other processes enter that namespace (thereby
> > triggering multiple calls to audit_netns_contid_add with the same net pointer
> > and cont id).  Given that the kubernetes notion of a pod is almost by definition
> > multiple containers sharing a network namespace, I think the answer is that we
> > will be strongly biased towards the refcount_inc case, rather than the kmalloc
> > case.  I could be wrong, but I think this is likely already in the optimized
> > order.
> 
> I had a stab at doing a GFP_KERNEL alloc before the spinlock and releasing it
> after if it wasn't needed (in Patch#1 below).  I also went one step further and
> converted it to kmem_cache (in Patch#2 below).
> 
> > > My gut feeling says we might do about as many allocations as refcount
> > > bumps, but I could be thinking about this wrong.
> > > 
> > > Moving the allocation outside the spinlock might also open the door to
> > > doing this as GFP_KERNEL, which is a good thing, but I haven't looked
> > > at the callers to see if that is possible (it may not be).  That's an
> > > exercise left to the patch author (if he hasn't done that already).
> 
> Both appear to work, but after successfully running both the contid test and
> audit_testsuite, once I start to push that test system a bit harder I end up
> with a deadlock warning.
> 
> I am having trouble understanding why since it happens both without and with
> the kmem_cache options, so it must be another part of the code that is
> triggering it.  The task_lock is being held at this point in
> audit_set_contid().  I haven't tried changing this alloc over to a GFP_ATOMIC
> to see if that prevents it, just as a debugging check...
> At this point, I'm inclined to leave it as it was without these two patches
> since it works and there doesn't seem to be an obvious best way given the
> uncertainty of the potential workloads.
> 
I would agree.  The trace below says that, when you use GFP_KERNEL, you are
allowing the memory subsystem to block the memory allocating task on memory
reclaim operations (effectively to sleep until kswapd reclaims sufficient memory
to grant the allocation), but kswapd wants to lock the task that is doing the
allocation with task_lock, which the task calling kmalloc(...,GFP_KEREL) already
holds, ergo, deadlock.

In short, you can't hold a tasks lock while preforming a sleeping operation
involving memory allocation. You either need to use GFP_ATOMIC, or release the
tasks lock before allocating memory with GFP_KERNEL.  Given the possible race
conditions surrounding releasing and re-aquiring the lock, I'm inclined to say
just use GFP_ATOMIC

Neil

> ======================================================
> WARNING: possible circular locking dependency detected
> 5.1.0-rc1-ghak90-audit-containerID.v6.3+ #80 Not tainted
> ------------------------------------------------------
> kswapd0/41 is trying to acquire lock:
> 0000000006a8c88b (&(&p->alloc_lock)->rlock){+.+.}, at: create_task_io_context+0xe9/0x150
> 
> but task is already holding lock:
> 0000000010aadb74 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x40
> 
> which lock already depends on the new lock.
> 
> 
> the existing dependency chain (in reverse order) is:
> 
> -> #1 (fs_reclaim){+.+.}:
>        __fs_reclaim_acquire+0x2c/0x40
>        kmem_cache_alloc_trace+0x30/0x230
>        audit_netns_contid_add.part.12+0xcf/0x210
>        audit_set_contid+0x18f/0x480
>        proc_contid_write+0x74/0x110
>        vfs_write+0xad/0x1b0
>        ksys_write+0x55/0xc0
>        do_syscall_64+0x60/0x210
>        entry_SYSCALL_64_after_hwframe+0x49/0xbe
> 
> -> #0 (&(&p->alloc_lock)->rlock){+.+.}:
>        lock_acquire+0xd4/0x1c0
>        _raw_spin_lock+0x2e/0x40
>        create_task_io_context+0xe9/0x150
>        generic_make_request_checks+0x8b4/0x970
>        generic_make_request+0xbb/0x570
>        submit_bio+0x6e/0x130
>        __swap_writepage+0x281/0x420
>        shmem_writepage+0x1d3/0x350
>        pageout.isra.54+0x1e9/0x3f0
>        shrink_page_list+0xa9f/0xe60
>        shrink_inactive_list+0x263/0x6e0
>        shrink_node_memcg+0x376/0x7c0
>        shrink_node+0xdd/0x470
>        balance_pgdat+0x26c/0x570
>        kswapd+0x191/0x4f0
>        kthread+0xf8/0x130
>        ret_from_fork+0x3a/0x50
> 
> other info that might help us debug this:
> 
>  Possible unsafe locking scenario:
> 
>        CPU0                    CPU1
>        ----                    ----
>   lock(fs_reclaim);
>                                lock(&(&p->alloc_lock)->rlock);
>                                lock(fs_reclaim);
>   lock(&(&p->alloc_lock)->rlock);
> 
>  *** DEADLOCK ***
> 
> 1 lock held by kswapd0/41:
>  #0: 0000000010aadb74 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x40
> 
> > > paul moore
> 
> Patch#1:
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -388,7 +388,7 @@ void audit_netns_contid_add(struct net *net, u64 contid)
>  {
>         struct audit_net *aunet;
>         struct list_head *contid_list;
> -       struct audit_contid *cont;
> +       struct audit_contid *cont, *newcont;
>  
>         if (!net)
>                 return;
> @@ -398,18 +398,19 @@ void audit_netns_contid_add(struct net *net, u64 contid)
>         if (!aunet)
>                 return;
>         contid_list = &aunet->contid_list;
> +       newcont = kmalloc(sizeof(struct audit_contid), GFP_KERNEL);
>         spin_lock(&aunet->contid_list_lock);
>         list_for_each_entry_rcu(cont, contid_list, list)
>                 if (cont->id == contid) {
>                         refcount_inc(&cont->refcount);
> +                       kfree(newcont);
>                         goto out;
>                 }
> -       cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);
> -       if (cont) {
> -               INIT_LIST_HEAD(&cont->list);
> -               cont->id = contid;
> -               refcount_set(&cont->refcount, 1);
> -               list_add_rcu(&cont->list, contid_list);
> +       if (newcont) {
> +               INIT_LIST_HEAD(&newcont->list);
> +               newcont->id = contid;
> +               refcount_set(&newcont->refcount, 1);
> +               list_add_rcu(&newcont->list, contid_list);
>         }
>  out:
>         spin_unlock(&aunet->contid_list_lock);
> 
> 
> Patch#2:
> --- a/kernel/audit.c
> +++ b/kernel/audit.c
> @@ -222,12 +222,16 @@ struct audit_reply {
>  };
>  
>  static struct kmem_cache *audit_task_cache;
> +static struct kmem_cache *audit_contid_cache;
>  
>  void __init audit_task_init(void)
>  {
>         audit_task_cache = kmem_cache_create("audit_task",
>                                              sizeof(struct audit_task_info),
>                                              0, SLAB_PANIC, NULL);
> +       audit_contid_cache = kmem_cache_create("audit_contid",
> +                                            sizeof(struct audit_contid),
> +                                            0, SLAB_PANIC, NULL);
>  }
>  
>  /**
> @@ -388,7 +392,7 @@ void audit_netns_contid_add(struct net *net, u64 contid)
>  {
>         struct audit_net *aunet;
>         struct list_head *contid_list;
> -       struct audit_contid *cont;
> +       struct audit_contid *cont, *newcont;
>  
>         if (!net)
>                 return;
> @@ -398,23 +402,32 @@ void audit_netns_contid_add(struct net *net, u64 contid)
>         if (!aunet)
>                 return;
>         contid_list = &aunet->contid_list;
> +       newcont = kmem_cache_alloc(audit_contid_cache, GFP_KERNEL);
>         spin_lock(&aunet->contid_list_lock);
>         list_for_each_entry_rcu(cont, contid_list, list)
>                 if (cont->id == contid) {
>                         refcount_inc(&cont->refcount);
> +                       kmem_cache_free(audit_contid_cache, newcont);
>                         goto out;
>                 }
> -       cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);
> -       if (cont) {
> -               INIT_LIST_HEAD(&cont->list);
> -               cont->id = contid;
> -               refcount_set(&cont->refcount, 1);
> -               list_add_rcu(&cont->list, contid_list);
> +       if (newcont) {
> +               INIT_LIST_HEAD(&newcont->list);
> +               newcont->id = contid;
> +               refcount_set(&newcont->refcount, 1);
> +               list_add_rcu(&newcont->list, contid_list);
>         }
>  out:
>         spin_unlock(&aunet->contid_list_lock);
>  }
>  
> +static void audit_contid_free(struct rcu_head *rcu)
> +{
> +       struct audit_contid *cont;
> +
> +       cont = container_of(rcu, struct audit_contid, rcu);
> +       kmem_cache_free(audit_contid_cache, cont);
> +}
> +
>  void audit_netns_contid_del(struct net *net, u64 contid)
>  {
>         struct audit_net *aunet;
> @@ -434,7 +447,7 @@ void audit_netns_contid_del(struct net *net, u64 contid)
>                 if (cont->id == contid) {
>                         if (refcount_dec_and_test(&cont->refcount)) {
>                                 list_del_rcu(&cont->list);
> -                               kfree_rcu(cont, rcu);
> +                               call_rcu(&cont->rcu, audit_contid_free);
>                         }
>                         break;
>                 }
> 
> - RGB
> 
> --
> Richard Guy Briggs <rgb@redhat.com>
> Sr. S/W Engineer, Kernel Security, Base Operating Systems
> Remote, Ottawa, Red Hat Canada
> IRC: rgb, SunRaycer
> Voice: +1.647.777.2635, Internal: (81) 32635
>
diff mbox series

Patch

diff --git a/include/linux/audit.h b/include/linux/audit.h
index fa19fa408931..70255c2dfb9f 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -27,6 +27,7 @@ 
 #include <linux/ptrace.h>
 #include <linux/namei.h>  /* LOOKUP_* */
 #include <uapi/linux/audit.h>
+#include <linux/refcount.h>
 
 #define AUDIT_INO_UNSET ((unsigned long)-1)
 #define AUDIT_DEV_UNSET ((dev_t)-1)
@@ -99,6 +100,13 @@  struct audit_task_info {
 
 extern struct audit_task_info init_struct_audit;
 
+struct audit_contid {
+	struct list_head	list;
+	u64			id;
+	refcount_t		refcount;
+	struct rcu_head		rcu;
+};
+
 extern int is_audit_feature_set(int which);
 
 extern int __init audit_register_class(int class, unsigned *list);
@@ -202,6 +210,10 @@  static inline u64 audit_get_contid(struct task_struct *tsk)
 }
 
 extern void audit_log_contid(struct audit_context *context, u64 contid);
+extern void audit_netns_contid_add(struct net *net, u64 contid);
+extern void audit_netns_contid_del(struct net *net, u64 contid);
+extern void audit_switch_task_namespaces(struct nsproxy *ns,
+					 struct task_struct *p);
 
 extern u32 audit_enabled;
 #else /* CONFIG_AUDIT */
@@ -271,6 +283,13 @@  static inline u64 audit_get_contid(struct task_struct *tsk)
 
 static inline void audit_log_contid(struct audit_context *context, u64 contid)
 { }
+static inline void audit_netns_contid_add(struct net *net, u64 contid)
+{ }
+static inline void audit_netns_contid_del(struct net *net, u64 contid)
+{ }
+static inline void audit_switch_task_namespaces(struct nsproxy *ns,
+						struct task_struct *p)
+{ }
 
 #define audit_enabled AUDIT_OFF
 #endif /* CONFIG_AUDIT */
diff --git a/kernel/audit.c b/kernel/audit.c
index cf448599ef34..7fa3194f5342 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -72,6 +72,7 @@ 
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
 #include <net/netns/generic.h>
+#include <net/net_namespace.h>
 
 #include "audit.h"
 
@@ -99,9 +100,13 @@ 
 /**
  * struct audit_net - audit private network namespace data
  * @sk: communication socket
+ * @contid_list: audit container identifier list
+ * @contid_list_lock audit container identifier list lock
  */
 struct audit_net {
 	struct sock *sk;
+	struct list_head contid_list;
+	spinlock_t contid_list_lock;
 };
 
 /**
@@ -275,8 +280,11 @@  struct audit_task_info init_struct_audit = {
 void audit_free(struct task_struct *tsk)
 {
 	struct audit_task_info *info = tsk->audit;
+	struct nsproxy *ns = tsk->nsproxy;
 
 	audit_free_syscall(tsk);
+	if (ns)
+		audit_netns_contid_del(ns->net_ns, audit_get_contid(tsk));
 	/* Freeing the audit_task_info struct must be performed after
 	 * audit_log_exit() due to need for loginuid and sessionid.
 	 */
@@ -376,6 +384,73 @@  static struct sock *audit_get_sk(const struct net *net)
 	return aunet->sk;
 }
 
+void audit_netns_contid_add(struct net *net, u64 contid)
+{
+	struct audit_net *aunet = net_generic(net, audit_net_id);
+	struct list_head *contid_list = &aunet->contid_list;
+	struct audit_contid *cont;
+
+	if (!audit_contid_valid(contid))
+		return;
+	if (!aunet)
+		return;
+	spin_lock(&aunet->contid_list_lock);
+	if (!list_empty(contid_list))
+		list_for_each_entry_rcu(cont, contid_list, list)
+			if (cont->id == contid) {
+				refcount_inc(&cont->refcount);
+				goto out;
+			}
+	cont = kmalloc(sizeof(struct audit_contid), GFP_ATOMIC);
+	if (cont) {
+		INIT_LIST_HEAD(&cont->list);
+		cont->id = contid;
+		refcount_set(&cont->refcount, 1);
+		list_add_rcu(&cont->list, contid_list);
+	}
+out:
+	spin_unlock(&aunet->contid_list_lock);
+}
+
+void audit_netns_contid_del(struct net *net, u64 contid)
+{
+	struct audit_net *aunet;
+	struct list_head *contid_list;
+	struct audit_contid *cont = NULL;
+
+	if (!net)
+		return;
+	if (!audit_contid_valid(contid))
+		return;
+	aunet = net_generic(net, audit_net_id);
+	if (!aunet)
+		return;
+	contid_list = &aunet->contid_list;
+	spin_lock(&aunet->contid_list_lock);
+	if (!list_empty(contid_list))
+		list_for_each_entry_rcu(cont, contid_list, list)
+			if (cont->id == contid) {
+				if (refcount_dec_and_test(&cont->refcount)) {
+					list_del_rcu(&cont->list);
+					kfree_rcu(cont, rcu);
+				}
+				break;
+			}
+	spin_unlock(&aunet->contid_list_lock);
+}
+
+void audit_switch_task_namespaces(struct nsproxy *ns, struct task_struct *p)
+{
+	u64 contid = audit_get_contid(p);
+	struct nsproxy *new = p->nsproxy;
+
+	if (!audit_contid_valid(contid))
+		return;
+	audit_netns_contid_del(ns->net_ns, contid);
+	if (new)
+		audit_netns_contid_add(new->net_ns, contid);
+}
+
 void audit_panic(const char *message)
 {
 	switch (audit_failure) {
@@ -1619,7 +1694,6 @@  static int __net_init audit_net_init(struct net *net)
 		.flags	= NL_CFG_F_NONROOT_RECV,
 		.groups	= AUDIT_NLGRP_MAX,
 	};
-
 	struct audit_net *aunet = net_generic(net, audit_net_id);
 
 	aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
@@ -1628,7 +1702,8 @@  static int __net_init audit_net_init(struct net *net)
 		return -ENOMEM;
 	}
 	aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
-
+	INIT_LIST_HEAD(&aunet->contid_list);
+	spin_lock_init(&aunet->contid_list_lock);
 	return 0;
 }
 
@@ -2380,6 +2455,7 @@  int audit_set_contid(struct task_struct *task, u64 contid)
 	uid_t uid;
 	struct tty_struct *tty;
 	char comm[sizeof(current->comm)];
+	struct net *net = task->nsproxy->net_ns;
 
 	task_lock(task);
 	/* Can't set if audit disabled */
@@ -2401,8 +2477,12 @@  int audit_set_contid(struct task_struct *task, u64 contid)
 	else if (!(thread_group_leader(task) && thread_group_empty(task)))
 		rc = -EALREADY;
 	read_unlock(&tasklist_lock);
-	if (!rc)
+	if (!rc) {
+		if (audit_contid_valid(oldcontid))
+			audit_netns_contid_del(net, oldcontid);
 		task->audit->contid = contid;
+		audit_netns_contid_add(net, contid);
+	}
 	task_unlock(task);
 
 	if (!audit_enabled)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f6c5d330059a..718b1201ae70 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -27,6 +27,7 @@ 
 #include <linux/syscalls.h>
 #include <linux/cgroup.h>
 #include <linux/perf_event.h>
+#include <linux/audit.h>
 
 static struct kmem_cache *nsproxy_cachep;
 
@@ -140,6 +141,7 @@  int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 	struct nsproxy *old_ns = tsk->nsproxy;
 	struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
 	struct nsproxy *new_ns;
+	u64 contid = audit_get_contid(tsk);
 
 	if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
 			      CLONE_NEWPID | CLONE_NEWNET |
@@ -167,6 +169,7 @@  int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 		return  PTR_ERR(new_ns);
 
 	tsk->nsproxy = new_ns;
+	audit_netns_contid_add(new_ns->net_ns, contid);
 	return 0;
 }
 
@@ -224,6 +227,7 @@  void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
 	ns = p->nsproxy;
 	p->nsproxy = new;
 	task_unlock(p);
+	audit_switch_task_namespaces(ns, p);
 
 	if (ns && atomic_dec_and_test(&ns->count))
 		free_nsproxy(ns);