diff mbox

namespaces: Use task_lock and not rcu to protect nsproxy

Message ID 1408708119-25133-2-git-send-email-rafael.tinoco@canonical.com
State New
Headers show

Commit Message

Rafael David Tinoco Aug. 22, 2014, 11:48 a.m. UTC
BugLink: https://bugs.launchpad.net/bugs/1328088

The synchronous synchronize_rcu in switch_task_namespaces makes setns
a sufficiently expensive system call that people have complained.

Upon inspection, nsproxy no longer needs rcu protection for remote reads.
Remote reads are rare.  So optimize for same process reads and writes
by switching to using task_lock instead.

This yields a simpler to understand lock, and a faster setns system call.

In particular this fixes a performance regression observed
by Rafael David Tinoco <rafael.tinoco@canonical.com>.

This is effectively a revert of Pavel Emelyanov's commit
cf7b708c8d1d7a27736771bcf4c457b332b0f818 Make access to task's nsproxy lighter
from 2007.  The race this originally fixed no longer exists as
do_notify_parent uses task_active_pid_ns(parent) instead of
parent->nsproxy.

[Conflicts]

./fs/proc_namespace.c:

* The upstream version patched a single if(!nsp || !nsp->mnt_ns) block,
  but the Trusty version uses two condition blocks to achieve the
  same result. So the rcu_read_unlock -> task_unlock substitution had
  to be done twice.

OriginalAuthor: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
(cherry-picked from commit 728dba3a39c66b3d8ac889ddbe38b5b1c264aec3 v3.17-rc1)
Signed-off-by: "Rafael David Tinoco" <rafael.tinoco@canonical.com>
---
 fs/namespace.c           |  6 +++---
 fs/proc/proc_net.c       |  4 +++-
 fs/proc_namespace.c      | 10 ++++------
 include/linux/nsproxy.h  | 16 ++++++----------
 ipc/namespace.c          |  6 +++---
 kernel/nsproxy.c         | 15 ++++-----------
 kernel/utsname.c         |  6 +++---
 net/core/net_namespace.c | 10 ++++++----
 8 files changed, 32 insertions(+), 41 deletions(-)

Comments

Tim Gardner Aug. 22, 2014, 2:05 p.m. UTC | #1
Your analysis is compelling.
Chris J Arges Aug. 22, 2014, 2:12 p.m. UTC | #2
An amazing amount of testing has been done on this, in addition to the
upstream discussions documenting the reasons behind this change.
Backport looks good.

Great work Rafael!

Signed-off-by: Chris J Arges <chris.j.arges@canonical.com>

On 08/22/2014 06:48 AM, Rafael David Tinoco wrote:
> BugLink: https://bugs.launchpad.net/bugs/1328088
> 
> The synchronous synchronize_rcu in switch_task_namespaces makes setns
> a sufficiently expensive system call that people have complained.
> 
> Upon inspection, nsproxy no longer needs rcu protection for remote reads.
> Remote reads are rare.  So optimize for same process reads and writes
> by switching to using task_lock instead.
> 
> This yields a simpler to understand lock, and a faster setns system call.
> 
> In particular this fixes a performance regression observed
> by Rafael David Tinoco <rafael.tinoco@canonical.com>.
> 
> This is effectively a revert of Pavel Emelyanov's commit
> cf7b708c8d1d7a27736771bcf4c457b332b0f818 Make access to task's nsproxy lighter
> from 2007.  The race this originally fixed no longer exists as
> do_notify_parent uses task_active_pid_ns(parent) instead of
> parent->nsproxy.
> 
> [Conflicts]
> 
> ./fs/proc_namespace.c:
> 
> * The upstream version patched a single if(!nsp || !nsp->mnt_ns) block,
>   but the Trusty version uses two condition blocks to achieve the
>   same result. So the rcu_read_unlock -> task_unlock substitution had
>   to be done twice.
> 
> OriginalAuthor: "Eric W. Biederman" <ebiederm@xmission.com>
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> (cherry-picked from commit 728dba3a39c66b3d8ac889ddbe38b5b1c264aec3 v3.17-rc1)
> Signed-off-by: "Rafael David Tinoco" <rafael.tinoco@canonical.com>
> ---
>  fs/namespace.c           |  6 +++---
>  fs/proc/proc_net.c       |  4 +++-
>  fs/proc_namespace.c      | 10 ++++------
>  include/linux/nsproxy.h  | 16 ++++++----------
>  ipc/namespace.c          |  6 +++---
>  kernel/nsproxy.c         | 15 ++++-----------
>  kernel/utsname.c         |  6 +++---
>  net/core/net_namespace.c | 10 ++++++----
>  8 files changed, 32 insertions(+), 41 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 7789c20..ee81392 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -3035,13 +3035,13 @@ static void *mntns_get(struct task_struct *task)
>  	struct mnt_namespace *ns = NULL;
>  	struct nsproxy *nsproxy;
>  
> -	rcu_read_lock();
> -	nsproxy = task_nsproxy(task);
> +	task_lock(task);
> +	nsproxy = task->nsproxy;
>  	if (nsproxy) {
>  		ns = nsproxy->mnt_ns;
>  		get_mnt_ns(ns);
>  	}
> -	rcu_read_unlock();
> +	task_unlock(task);
>  
>  	return ns;
>  }
> diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
> index 4677bb7..a63af3e 100644
> --- a/fs/proc/proc_net.c
> +++ b/fs/proc/proc_net.c
> @@ -113,9 +113,11 @@ static struct net *get_proc_task_net(struct inode *dir)
>  	rcu_read_lock();
>  	task = pid_task(proc_pid(dir), PIDTYPE_PID);
>  	if (task != NULL) {
> -		ns = task_nsproxy(task);
> +		task_lock(task);
> +		ns = task->nsproxy;
>  		if (ns != NULL)
>  			net = get_net(ns->net_ns);
> +		task_unlock(task);
>  	}
>  	rcu_read_unlock();
>  
> diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
> index 439406e..b68cecc 100644
> --- a/fs/proc_namespace.c
> +++ b/fs/proc_namespace.c
> @@ -232,22 +232,20 @@ static int mounts_open_common(struct inode *inode, struct file *file,
>  	if (!task)
>  		goto err;
>  
> -	rcu_read_lock();
> -	nsp = task_nsproxy(task);
> +	task_lock(task);
> +	nsp = task->nsproxy;
>  	if (!nsp) {
> -		rcu_read_unlock();
> +		task_unlock(task);
>  		put_task_struct(task);
>  		goto err;
>  	}
>  	ns = nsp->mnt_ns;
>  	if (!ns) {
> -		rcu_read_unlock();
> +		task_unlock(task);
>  		put_task_struct(task);
>  		goto err;
>  	}
>  	get_mnt_ns(ns);
> -	rcu_read_unlock();
> -	task_lock(task);
>  	if (!task->fs) {
>  		task_unlock(task);
>  		put_task_struct(task);
> diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
> index b4ec59d..35fa08f 100644
> --- a/include/linux/nsproxy.h
> +++ b/include/linux/nsproxy.h
> @@ -40,32 +40,28 @@ extern struct nsproxy init_nsproxy;
>   * the namespaces access rules are:
>   *
>   *  1. only current task is allowed to change tsk->nsproxy pointer or
> - *     any pointer on the nsproxy itself
> + *     any pointer on the nsproxy itself.  Current must hold the task_lock
> + *     when changing tsk->nsproxy.
>   *
>   *  2. when accessing (i.e. reading) current task's namespaces - no
>   *     precautions should be taken - just dereference the pointers
>   *
>   *  3. the access to other task namespaces is performed like this
> - *     rcu_read_lock();
> - *     nsproxy = task_nsproxy(tsk);
> + *     task_lock(task);
> + *     nsproxy = task->nsproxy;
>   *     if (nsproxy != NULL) {
>   *             / *
>   *               * work with the namespaces here
>   *               * e.g. get the reference on one of them
>   *               * /
>   *     } / *
> - *         * NULL task_nsproxy() means that this task is
> + *         * NULL task->nsproxy means that this task is
>   *         * almost dead (zombie)
>   *         * /
> - *     rcu_read_unlock();
> + *     task_unlock(task);
>   *
>   */
>  
> -static inline struct nsproxy *task_nsproxy(struct task_struct *tsk)
> -{
> -	return rcu_dereference(tsk->nsproxy);
> -}
> -
>  int copy_namespaces(unsigned long flags, struct task_struct *tsk);
>  void exit_task_namespaces(struct task_struct *tsk);
>  void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
> diff --git a/ipc/namespace.c b/ipc/namespace.c
> index 59451c1..b54468e 100644
> --- a/ipc/namespace.c
> +++ b/ipc/namespace.c
> @@ -154,11 +154,11 @@ static void *ipcns_get(struct task_struct *task)
>  	struct ipc_namespace *ns = NULL;
>  	struct nsproxy *nsproxy;
>  
> -	rcu_read_lock();
> -	nsproxy = task_nsproxy(task);
> +	task_lock(task);
> +	nsproxy = task->nsproxy;
>  	if (nsproxy)
>  		ns = get_ipc_ns(nsproxy->ipc_ns);
> -	rcu_read_unlock();
> +	task_unlock(task);
>  
>  	return ns;
>  }
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 8e78110..ef42d0a 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
>  
>  	might_sleep();
>  
> +	task_lock(p);
>  	ns = p->nsproxy;
> +	p->nsproxy = new;
> +	task_unlock(p);
>  
> -	rcu_assign_pointer(p->nsproxy, new);
> -
> -	if (ns && atomic_dec_and_test(&ns->count)) {
> -		/*
> -		 * wait for others to get what they want from this nsproxy.
> -		 *
> -		 * cannot release this nsproxy via the call_rcu() since
> -		 * put_mnt_ns() will want to sleep
> -		 */
> -		synchronize_rcu();
> +	if (ns && atomic_dec_and_test(&ns->count))
>  		free_nsproxy(ns);
> -	}
>  }
>  
>  void exit_task_namespaces(struct task_struct *p)
> diff --git a/kernel/utsname.c b/kernel/utsname.c
> index fd39312..883aaaa 100644
> --- a/kernel/utsname.c
> +++ b/kernel/utsname.c
> @@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task)
>  	struct uts_namespace *ns = NULL;
>  	struct nsproxy *nsproxy;
>  
> -	rcu_read_lock();
> -	nsproxy = task_nsproxy(task);
> +	task_lock(task);
> +	nsproxy = task->nsproxy;
>  	if (nsproxy) {
>  		ns = nsproxy->uts_ns;
>  		get_uts_ns(ns);
>  	}
> -	rcu_read_unlock();
> +	task_unlock(task);
>  
>  	return ns;
>  }
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index 7c8ffd9..826bf80 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -373,9 +373,11 @@ struct net *get_net_ns_by_pid(pid_t pid)
>  	tsk = find_task_by_vpid(pid);
>  	if (tsk) {
>  		struct nsproxy *nsproxy;
> -		nsproxy = task_nsproxy(tsk);
> +		task_lock(tsk);
> +		nsproxy = tsk->nsproxy;
>  		if (nsproxy)
>  			net = get_net(nsproxy->net_ns);
> +		task_unlock(tsk);
>  	}
>  	rcu_read_unlock();
>  	return net;
> @@ -632,11 +634,11 @@ static void *netns_get(struct task_struct *task)
>  	struct net *net = NULL;
>  	struct nsproxy *nsproxy;
>  
> -	rcu_read_lock();
> -	nsproxy = task_nsproxy(task);
> +	task_lock(task);
> +	nsproxy = task->nsproxy;
>  	if (nsproxy)
>  		net = get_net(nsproxy->net_ns);
> -	rcu_read_unlock();
> +	task_unlock(task);
>  
>  	return net;
>  }
>
diff mbox

Patch

diff --git a/fs/namespace.c b/fs/namespace.c
index 7789c20..ee81392 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3035,13 +3035,13 @@  static void *mntns_get(struct task_struct *task)
 	struct mnt_namespace *ns = NULL;
 	struct nsproxy *nsproxy;
 
-	rcu_read_lock();
-	nsproxy = task_nsproxy(task);
+	task_lock(task);
+	nsproxy = task->nsproxy;
 	if (nsproxy) {
 		ns = nsproxy->mnt_ns;
 		get_mnt_ns(ns);
 	}
-	rcu_read_unlock();
+	task_unlock(task);
 
 	return ns;
 }
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 4677bb7..a63af3e 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -113,9 +113,11 @@  static struct net *get_proc_task_net(struct inode *dir)
 	rcu_read_lock();
 	task = pid_task(proc_pid(dir), PIDTYPE_PID);
 	if (task != NULL) {
-		ns = task_nsproxy(task);
+		task_lock(task);
+		ns = task->nsproxy;
 		if (ns != NULL)
 			net = get_net(ns->net_ns);
+		task_unlock(task);
 	}
 	rcu_read_unlock();
 
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 439406e..b68cecc 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -232,22 +232,20 @@  static int mounts_open_common(struct inode *inode, struct file *file,
 	if (!task)
 		goto err;
 
-	rcu_read_lock();
-	nsp = task_nsproxy(task);
+	task_lock(task);
+	nsp = task->nsproxy;
 	if (!nsp) {
-		rcu_read_unlock();
+		task_unlock(task);
 		put_task_struct(task);
 		goto err;
 	}
 	ns = nsp->mnt_ns;
 	if (!ns) {
-		rcu_read_unlock();
+		task_unlock(task);
 		put_task_struct(task);
 		goto err;
 	}
 	get_mnt_ns(ns);
-	rcu_read_unlock();
-	task_lock(task);
 	if (!task->fs) {
 		task_unlock(task);
 		put_task_struct(task);
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index b4ec59d..35fa08f 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -40,32 +40,28 @@  extern struct nsproxy init_nsproxy;
  * the namespaces access rules are:
  *
  *  1. only current task is allowed to change tsk->nsproxy pointer or
- *     any pointer on the nsproxy itself
+ *     any pointer on the nsproxy itself.  Current must hold the task_lock
+ *     when changing tsk->nsproxy.
  *
  *  2. when accessing (i.e. reading) current task's namespaces - no
  *     precautions should be taken - just dereference the pointers
  *
  *  3. the access to other task namespaces is performed like this
- *     rcu_read_lock();
- *     nsproxy = task_nsproxy(tsk);
+ *     task_lock(task);
+ *     nsproxy = task->nsproxy;
  *     if (nsproxy != NULL) {
  *             / *
  *               * work with the namespaces here
  *               * e.g. get the reference on one of them
  *               * /
  *     } / *
- *         * NULL task_nsproxy() means that this task is
+ *         * NULL task->nsproxy means that this task is
  *         * almost dead (zombie)
  *         * /
- *     rcu_read_unlock();
+ *     task_unlock(task);
  *
  */
 
-static inline struct nsproxy *task_nsproxy(struct task_struct *tsk)
-{
-	return rcu_dereference(tsk->nsproxy);
-}
-
 int copy_namespaces(unsigned long flags, struct task_struct *tsk);
 void exit_task_namespaces(struct task_struct *tsk);
 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 59451c1..b54468e 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -154,11 +154,11 @@  static void *ipcns_get(struct task_struct *task)
 	struct ipc_namespace *ns = NULL;
 	struct nsproxy *nsproxy;
 
-	rcu_read_lock();
-	nsproxy = task_nsproxy(task);
+	task_lock(task);
+	nsproxy = task->nsproxy;
 	if (nsproxy)
 		ns = get_ipc_ns(nsproxy->ipc_ns);
-	rcu_read_unlock();
+	task_unlock(task);
 
 	return ns;
 }
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8e78110..ef42d0a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -204,20 +204,13 @@  void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
 
 	might_sleep();
 
+	task_lock(p);
 	ns = p->nsproxy;
+	p->nsproxy = new;
+	task_unlock(p);
 
-	rcu_assign_pointer(p->nsproxy, new);
-
-	if (ns && atomic_dec_and_test(&ns->count)) {
-		/*
-		 * wait for others to get what they want from this nsproxy.
-		 *
-		 * cannot release this nsproxy via the call_rcu() since
-		 * put_mnt_ns() will want to sleep
-		 */
-		synchronize_rcu();
+	if (ns && atomic_dec_and_test(&ns->count))
 		free_nsproxy(ns);
-	}
 }
 
 void exit_task_namespaces(struct task_struct *p)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index fd39312..883aaaa 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -93,13 +93,13 @@  static void *utsns_get(struct task_struct *task)
 	struct uts_namespace *ns = NULL;
 	struct nsproxy *nsproxy;
 
-	rcu_read_lock();
-	nsproxy = task_nsproxy(task);
+	task_lock(task);
+	nsproxy = task->nsproxy;
 	if (nsproxy) {
 		ns = nsproxy->uts_ns;
 		get_uts_ns(ns);
 	}
-	rcu_read_unlock();
+	task_unlock(task);
 
 	return ns;
 }
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 7c8ffd9..826bf80 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -373,9 +373,11 @@  struct net *get_net_ns_by_pid(pid_t pid)
 	tsk = find_task_by_vpid(pid);
 	if (tsk) {
 		struct nsproxy *nsproxy;
-		nsproxy = task_nsproxy(tsk);
+		task_lock(tsk);
+		nsproxy = tsk->nsproxy;
 		if (nsproxy)
 			net = get_net(nsproxy->net_ns);
+		task_unlock(tsk);
 	}
 	rcu_read_unlock();
 	return net;
@@ -632,11 +634,11 @@  static void *netns_get(struct task_struct *task)
 	struct net *net = NULL;
 	struct nsproxy *nsproxy;
 
-	rcu_read_lock();
-	nsproxy = task_nsproxy(task);
+	task_lock(task);
+	nsproxy = task->nsproxy;
 	if (nsproxy)
 		net = get_net(nsproxy->net_ns);
-	rcu_read_unlock();
+	task_unlock(task);
 
 	return net;
 }