[REVIEW][PATCH] namespaces: Use task_lock and not rcu to protect nsproxy

Rafael David Tinoco rafael.tinoco at canonical.com
Mon Aug 11 11:40:28 UTC 2014


For those this might concern,

This patch works for me and is running in a big production environment for more 
then 2 weeks with no problems. Performance regression (observed after commit 
#911af505 in http://people.canonical.com/~inaddy/lp1328088/charts/{250,500,750,
1000,1250,1500,1750}.html) does not exist anymore. 

Changing locking mechanism from rcu_read_lock to task_lock makes network namespaces
creation less sensible to _rcu implementation_ changes and fixed observed regression.

Thank you very much.

-Rafael

On Jul 29, 2014, at 10:21 PM, Eric W. Biederman <ebiederm at xmission.com> wrote:

> 
> The synchronous syncrhonize_rcu in switch_task_namespaces makes setns
> a sufficiently expensive system call that people have complained.
> 
> Upon inspect nsproxy no longer needs rcu protection for remote reads.
> remote reads are rare.  So optimize for same process reads and write
> by switching using rask_lock instead.
> 
> This yields a simpler to understand lock, and a faster setns system call.
> 
> In particular this fixes a performance regression observed
> by Rafael David Tinoco <rafael.tinoco at canonical.com>.
> 
> This is effectively a revert of Pavel Emelyanov's commit
> cf7b708c8d1d7a27736771bcf4c457b332b0f818 Make access to task's nsproxy lighter
> from 2007.  The race this originialy fixed no longer exists as
> do_notify_parent uses task_active_pid_ns(parent) instead of
> parent->nsproxy.
> 
> Signed-off-by: "Eric W. Biederman" <ebiederm at xmission.com>
> ---
> fs/namespace.c           |  6 +++---
> fs/proc/proc_net.c       |  4 +++-
> fs/proc_namespace.c      |  8 +++-----
> include/linux/nsproxy.h  | 16 ++++++----------
> ipc/namespace.c          |  6 +++---
> kernel/nsproxy.c         | 15 ++++-----------
> kernel/utsname.c         |  6 +++---
> net/core/net_namespace.c | 10 ++++++----
> 8 files changed, 31 insertions(+), 40 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 182bc41cd887..7187d01329c3 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -2972,13 +2972,13 @@ static void *mntns_get(struct task_struct *task)
> 	struct mnt_namespace *ns = NULL;
> 	struct nsproxy *nsproxy;
> 
> -	rcu_read_lock();
> -	nsproxy = task_nsproxy(task);
> +	task_lock(task);
> +	nsproxy = task->nsproxy;
> 	if (nsproxy) {
> 		ns = nsproxy->mnt_ns;
> 		get_mnt_ns(ns);
> 	}
> -	rcu_read_unlock();
> +	task_unlock(task);
> 
> 	return ns;
> }
> diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
> index 4677bb7dc7c2..a63af3e0a612 100644
> --- a/fs/proc/proc_net.c
> +++ b/fs/proc/proc_net.c
> @@ -113,9 +113,11 @@ static struct net *get_proc_task_net(struct inode *dir)
> 	rcu_read_lock();
> 	task = pid_task(proc_pid(dir), PIDTYPE_PID);
> 	if (task != NULL) {
> -		ns = task_nsproxy(task);
> +		task_lock(task);
> +		ns = task->nsproxy;
> 		if (ns != NULL)
> 			net = get_net(ns->net_ns);
> +		task_unlock(task);
> 	}
> 	rcu_read_unlock();
> 
> diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
> index 1a81373947f3..73ca1740d839 100644
> --- a/fs/proc_namespace.c
> +++ b/fs/proc_namespace.c
> @@ -232,17 +232,15 @@ static int mounts_open_common(struct inode *inode, struct file *file,
> 	if (!task)
> 		goto err;
> 
> -	rcu_read_lock();
> -	nsp = task_nsproxy(task);
> +	task_lock(task);
> +	nsp = task->nsproxy;
> 	if (!nsp || !nsp->mnt_ns) {
> -		rcu_read_unlock();
> +		task_unlock(task);
> 		put_task_struct(task);
> 		goto err;
> 	}
> 	ns = nsp->mnt_ns;
> 	get_mnt_ns(ns);
> -	rcu_read_unlock();
> -	task_lock(task);
> 	if (!task->fs) {
> 		task_unlock(task);
> 		put_task_struct(task);
> diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
> index b4ec59d159ac..35fa08fd7739 100644
> --- a/include/linux/nsproxy.h
> +++ b/include/linux/nsproxy.h
> @@ -40,32 +40,28 @@ extern struct nsproxy init_nsproxy;
>  * the namespaces access rules are:
>  *
>  *  1. only current task is allowed to change tsk->nsproxy pointer or
> - *     any pointer on the nsproxy itself
> + *     any pointer on the nsproxy itself.  Current must hold the task_lock
> + *     when changing tsk->nsproxy.
>  *
>  *  2. when accessing (i.e. reading) current task's namespaces - no
>  *     precautions should be taken - just dereference the pointers
>  *
>  *  3. the access to other task namespaces is performed like this
> - *     rcu_read_lock();
> - *     nsproxy = task_nsproxy(tsk);
> + *     task_lock(task);
> + *     nsproxy = task->nsproxy;
>  *     if (nsproxy != NULL) {
>  *             / *
>  *               * work with the namespaces here
>  *               * e.g. get the reference on one of them
>  *               * /
>  *     } / *
> - *         * NULL task_nsproxy() means that this task is
> + *         * NULL task->nsproxy means that this task is
>  *         * almost dead (zombie)
>  *         * /
> - *     rcu_read_unlock();
> + *     task_unlock(task);
>  *
>  */
> 
> -static inline struct nsproxy *task_nsproxy(struct task_struct *tsk)
> -{
> -	return rcu_dereference(tsk->nsproxy);
> -}
> -
> int copy_namespaces(unsigned long flags, struct task_struct *tsk);
> void exit_task_namespaces(struct task_struct *tsk);
> void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
> diff --git a/ipc/namespace.c b/ipc/namespace.c
> index 59451c1e214d..b54468e48e32 100644
> --- a/ipc/namespace.c
> +++ b/ipc/namespace.c
> @@ -154,11 +154,11 @@ static void *ipcns_get(struct task_struct *task)
> 	struct ipc_namespace *ns = NULL;
> 	struct nsproxy *nsproxy;
> 
> -	rcu_read_lock();
> -	nsproxy = task_nsproxy(task);
> +	task_lock(task);
> +	nsproxy = task->nsproxy;
> 	if (nsproxy)
> 		ns = get_ipc_ns(nsproxy->ipc_ns);
> -	rcu_read_unlock();
> +	task_unlock(task);
> 
> 	return ns;
> }
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 8e7811086b82..ef42d0ab3115 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
> 
> 	might_sleep();
> 
> +	task_lock(p);
> 	ns = p->nsproxy;
> +	p->nsproxy = new;
> +	task_unlock(p);
> 
> -	rcu_assign_pointer(p->nsproxy, new);
> -
> -	if (ns && atomic_dec_and_test(&ns->count)) {
> -		/*
> -		 * wait for others to get what they want from this nsproxy.
> -		 *
> -		 * cannot release this nsproxy via the call_rcu() since
> -		 * put_mnt_ns() will want to sleep
> -		 */
> -		synchronize_rcu();
> +	if (ns && atomic_dec_and_test(&ns->count))
> 		free_nsproxy(ns);
> -	}
> }
> 
> void exit_task_namespaces(struct task_struct *p)
> diff --git a/kernel/utsname.c b/kernel/utsname.c
> index fd393124e507..883aaaa7de8a 100644
> --- a/kernel/utsname.c
> +++ b/kernel/utsname.c
> @@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task)
> 	struct uts_namespace *ns = NULL;
> 	struct nsproxy *nsproxy;
> 
> -	rcu_read_lock();
> -	nsproxy = task_nsproxy(task);
> +	task_lock(task);
> +	nsproxy = task->nsproxy;
> 	if (nsproxy) {
> 		ns = nsproxy->uts_ns;
> 		get_uts_ns(ns);
> 	}
> -	rcu_read_unlock();
> +	task_unlock(task);
> 
> 	return ns;
> }
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index 85b62691f4f2..7c6b51a58968 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -373,9 +373,11 @@ struct net *get_net_ns_by_pid(pid_t pid)
> 	tsk = find_task_by_vpid(pid);
> 	if (tsk) {
> 		struct nsproxy *nsproxy;
> -		nsproxy = task_nsproxy(tsk);
> +		task_lock(tsk);
> +		nsproxy = tsk->nsproxy;
> 		if (nsproxy)
> 			net = get_net(nsproxy->net_ns);
> +		task_unlock(tsk);
> 	}
> 	rcu_read_unlock();
> 	return net;
> @@ -632,11 +634,11 @@ static void *netns_get(struct task_struct *task)
> 	struct net *net = NULL;
> 	struct nsproxy *nsproxy;
> 
> -	rcu_read_lock();
> -	nsproxy = task_nsproxy(task);
> +	task_lock(task);
> +	nsproxy = task->nsproxy;
> 	if (nsproxy)
> 		net = get_net(nsproxy->net_ns);
> -	rcu_read_unlock();
> +	task_unlock(task);
> 
> 	return net;
> }
> -- 
> 1.9.1
> 



More information about the Containers mailing list