[REVIEW][PATCH] namespaces: Use task_lock and not rcu to protect nsproxy
Rafael David Tinoco
rafael.tinoco at canonical.com
Mon Aug 11 11:40:28 UTC 2014
For those this might concern,
This patch works for me and is running in a big production environment for more
then 2 weeks with no problems. Performance regression (observed after commit
#911af505 in http://people.canonical.com/~inaddy/lp1328088/charts/{250,500,750,
1000,1250,1500,1750}.html) does not exist anymore.
Changing locking mechanism from rcu_read_lock to task_lock makes network namespaces
creation less sensible to _rcu implementation_ changes and fixed observed regression.
Thank you very much.
-Rafael
On Jul 29, 2014, at 10:21 PM, Eric W. Biederman <ebiederm at xmission.com> wrote:
>
> The synchronous syncrhonize_rcu in switch_task_namespaces makes setns
> a sufficiently expensive system call that people have complained.
>
> Upon inspect nsproxy no longer needs rcu protection for remote reads.
> remote reads are rare. So optimize for same process reads and write
> by switching using rask_lock instead.
>
> This yields a simpler to understand lock, and a faster setns system call.
>
> In particular this fixes a performance regression observed
> by Rafael David Tinoco <rafael.tinoco at canonical.com>.
>
> This is effectively a revert of Pavel Emelyanov's commit
> cf7b708c8d1d7a27736771bcf4c457b332b0f818 Make access to task's nsproxy lighter
> from 2007. The race this originialy fixed no longer exists as
> do_notify_parent uses task_active_pid_ns(parent) instead of
> parent->nsproxy.
>
> Signed-off-by: "Eric W. Biederman" <ebiederm at xmission.com>
> ---
> fs/namespace.c | 6 +++---
> fs/proc/proc_net.c | 4 +++-
> fs/proc_namespace.c | 8 +++-----
> include/linux/nsproxy.h | 16 ++++++----------
> ipc/namespace.c | 6 +++---
> kernel/nsproxy.c | 15 ++++-----------
> kernel/utsname.c | 6 +++---
> net/core/net_namespace.c | 10 ++++++----
> 8 files changed, 31 insertions(+), 40 deletions(-)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 182bc41cd887..7187d01329c3 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -2972,13 +2972,13 @@ static void *mntns_get(struct task_struct *task)
> struct mnt_namespace *ns = NULL;
> struct nsproxy *nsproxy;
>
> - rcu_read_lock();
> - nsproxy = task_nsproxy(task);
> + task_lock(task);
> + nsproxy = task->nsproxy;
> if (nsproxy) {
> ns = nsproxy->mnt_ns;
> get_mnt_ns(ns);
> }
> - rcu_read_unlock();
> + task_unlock(task);
>
> return ns;
> }
> diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
> index 4677bb7dc7c2..a63af3e0a612 100644
> --- a/fs/proc/proc_net.c
> +++ b/fs/proc/proc_net.c
> @@ -113,9 +113,11 @@ static struct net *get_proc_task_net(struct inode *dir)
> rcu_read_lock();
> task = pid_task(proc_pid(dir), PIDTYPE_PID);
> if (task != NULL) {
> - ns = task_nsproxy(task);
> + task_lock(task);
> + ns = task->nsproxy;
> if (ns != NULL)
> net = get_net(ns->net_ns);
> + task_unlock(task);
> }
> rcu_read_unlock();
>
> diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
> index 1a81373947f3..73ca1740d839 100644
> --- a/fs/proc_namespace.c
> +++ b/fs/proc_namespace.c
> @@ -232,17 +232,15 @@ static int mounts_open_common(struct inode *inode, struct file *file,
> if (!task)
> goto err;
>
> - rcu_read_lock();
> - nsp = task_nsproxy(task);
> + task_lock(task);
> + nsp = task->nsproxy;
> if (!nsp || !nsp->mnt_ns) {
> - rcu_read_unlock();
> + task_unlock(task);
> put_task_struct(task);
> goto err;
> }
> ns = nsp->mnt_ns;
> get_mnt_ns(ns);
> - rcu_read_unlock();
> - task_lock(task);
> if (!task->fs) {
> task_unlock(task);
> put_task_struct(task);
> diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
> index b4ec59d159ac..35fa08fd7739 100644
> --- a/include/linux/nsproxy.h
> +++ b/include/linux/nsproxy.h
> @@ -40,32 +40,28 @@ extern struct nsproxy init_nsproxy;
> * the namespaces access rules are:
> *
> * 1. only current task is allowed to change tsk->nsproxy pointer or
> - * any pointer on the nsproxy itself
> + * any pointer on the nsproxy itself. Current must hold the task_lock
> + * when changing tsk->nsproxy.
> *
> * 2. when accessing (i.e. reading) current task's namespaces - no
> * precautions should be taken - just dereference the pointers
> *
> * 3. the access to other task namespaces is performed like this
> - * rcu_read_lock();
> - * nsproxy = task_nsproxy(tsk);
> + * task_lock(task);
> + * nsproxy = task->nsproxy;
> * if (nsproxy != NULL) {
> * / *
> * * work with the namespaces here
> * * e.g. get the reference on one of them
> * * /
> * } / *
> - * * NULL task_nsproxy() means that this task is
> + * * NULL task->nsproxy means that this task is
> * * almost dead (zombie)
> * * /
> - * rcu_read_unlock();
> + * task_unlock(task);
> *
> */
>
> -static inline struct nsproxy *task_nsproxy(struct task_struct *tsk)
> -{
> - return rcu_dereference(tsk->nsproxy);
> -}
> -
> int copy_namespaces(unsigned long flags, struct task_struct *tsk);
> void exit_task_namespaces(struct task_struct *tsk);
> void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
> diff --git a/ipc/namespace.c b/ipc/namespace.c
> index 59451c1e214d..b54468e48e32 100644
> --- a/ipc/namespace.c
> +++ b/ipc/namespace.c
> @@ -154,11 +154,11 @@ static void *ipcns_get(struct task_struct *task)
> struct ipc_namespace *ns = NULL;
> struct nsproxy *nsproxy;
>
> - rcu_read_lock();
> - nsproxy = task_nsproxy(task);
> + task_lock(task);
> + nsproxy = task->nsproxy;
> if (nsproxy)
> ns = get_ipc_ns(nsproxy->ipc_ns);
> - rcu_read_unlock();
> + task_unlock(task);
>
> return ns;
> }
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 8e7811086b82..ef42d0ab3115 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
>
> might_sleep();
>
> + task_lock(p);
> ns = p->nsproxy;
> + p->nsproxy = new;
> + task_unlock(p);
>
> - rcu_assign_pointer(p->nsproxy, new);
> -
> - if (ns && atomic_dec_and_test(&ns->count)) {
> - /*
> - * wait for others to get what they want from this nsproxy.
> - *
> - * cannot release this nsproxy via the call_rcu() since
> - * put_mnt_ns() will want to sleep
> - */
> - synchronize_rcu();
> + if (ns && atomic_dec_and_test(&ns->count))
> free_nsproxy(ns);
> - }
> }
>
> void exit_task_namespaces(struct task_struct *p)
> diff --git a/kernel/utsname.c b/kernel/utsname.c
> index fd393124e507..883aaaa7de8a 100644
> --- a/kernel/utsname.c
> +++ b/kernel/utsname.c
> @@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task)
> struct uts_namespace *ns = NULL;
> struct nsproxy *nsproxy;
>
> - rcu_read_lock();
> - nsproxy = task_nsproxy(task);
> + task_lock(task);
> + nsproxy = task->nsproxy;
> if (nsproxy) {
> ns = nsproxy->uts_ns;
> get_uts_ns(ns);
> }
> - rcu_read_unlock();
> + task_unlock(task);
>
> return ns;
> }
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index 85b62691f4f2..7c6b51a58968 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -373,9 +373,11 @@ struct net *get_net_ns_by_pid(pid_t pid)
> tsk = find_task_by_vpid(pid);
> if (tsk) {
> struct nsproxy *nsproxy;
> - nsproxy = task_nsproxy(tsk);
> + task_lock(tsk);
> + nsproxy = tsk->nsproxy;
> if (nsproxy)
> net = get_net(nsproxy->net_ns);
> + task_unlock(tsk);
> }
> rcu_read_unlock();
> return net;
> @@ -632,11 +634,11 @@ static void *netns_get(struct task_struct *task)
> struct net *net = NULL;
> struct nsproxy *nsproxy;
>
> - rcu_read_lock();
> - nsproxy = task_nsproxy(task);
> + task_lock(task);
> + nsproxy = task->nsproxy;
> if (nsproxy)
> net = get_net(nsproxy->net_ns);
> - rcu_read_unlock();
> + task_unlock(task);
>
> return net;
> }
> --
> 1.9.1
>
More information about the Containers
mailing list