[PATCH] linux-cr: nested pid namespaces (v3)

Louis Rilling Louis.Rilling at kerlabs.com
Tue Mar 23 00:14:32 PDT 2010


On Tue, Mar 23, 2010 at 12:18:39AM -0500, Serge E. Hallyn wrote:
> Support checkpoint and restart of tasks in nested pid namespaces.
> We keep the original single pids_array to minimize memory
> allocations.  The pids array entries are augmented with a pidns
> depth (relative to the container init's pidns), and an "rpid" which
> is the pid in the checkpointer's pidns (or 0 if no valid pid exists).
> The rpid will be used by userspace to gather more information (like
> /proc/$$/mountinfo) after the kernel sys_checkpoint.  If any tasks
> are in nested pid namespace, another single array will hold all of
> the vpids.  At restart those are used by userspace to determine how
> to call eclone().  Kernel ignores them.
> 
> This patch also adds 'rpid' to struct ckpt_hdr_pids, which is not
> needed for nested pid_ns support, but will be needed for the
> userspace checkpointer to gather additional information (i.e.
> /proc/pid/mountinfo) after sys_checkpoint() completes.
> 
> Changelog:
>   Mar 22:
> 	Use Louis Rilling's smarter ckpt_vpids algorithm
> 	verbatim, to handle pid_ns depths > CKPT_HDR_PIDS_CHUNK,
> 	as well as fix an apparent bug in my original code.
> 
> 	As Louis suggested, use task_active_pid_ns() rather than
> 	task->nsproxy->pid_ns.  In fact it's a must, bc the
> 	checkpointed task may be dead and have NULL
> 	task->nsproxy->pid_ns.

Hm, if the task can be dead, then there is a much bigger issue:
task->nsproxy itself is NULL. Or did I miss something?

To me, the real reason is to anticipate pid namespace unsharing. That,
together with setns(), will require reconsidering much of the namespace
C/R logic, IMHO. For instance, a checkpoint could be initiated by a foreign
task that has entered the container, leak detection would have to take such
foreign tasks into account (see example below), etc.

> 
> 	Oren: Add spinlock for nsproxy->pidns; use
> 	ckpt_read_consume() to consume vpids; and use __s32 instead
> 	of ckpt_vpid struct
> 
> Signed-off-by: Serge E. Hallyn <serue at us.ibm.com>
> ---
>  checkpoint/checkpoint.c          |  123 ++++++++++++++++++++++++++++++++++----
>  checkpoint/process.c             |   22 ++++++-
>  checkpoint/restart.c             |   43 ++++++++++++-
>  checkpoint/sys.c                 |    2 +
>  include/linux/checkpoint.h       |    2 +-
>  include/linux/checkpoint_hdr.h   |   14 ++++
>  include/linux/checkpoint_types.h |    3 +
>  kernel/nsproxy.c                 |    9 ++-
>  8 files changed, 195 insertions(+), 23 deletions(-)
> 
> diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> index f27af41..fd61a80 100644
> --- a/checkpoint/checkpoint.c
> +++ b/checkpoint/checkpoint.c
> @@ -27,6 +27,7 @@
>  #include <linux/deferqueue.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
> +#include <linux/pid_namespace.h>
>  
>  /* unique checkpoint identifier (FIXME: should be per-container ?) */
>  static atomic_t ctx_count = ATOMIC_INIT(0);
> @@ -242,6 +243,7 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>  	struct task_struct *root = ctx->root_task;
>  	struct nsproxy *nsproxy;
>  	int ret = 0;
> +	struct pid_namespace *pidns;
>  
>  	ckpt_debug("check %d\n", task_pid_nr_ns(t, ctx->root_nsproxy->pid_ns));
>  
> @@ -293,10 +295,15 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>  		_ckpt_err(ctx, -EPERM, "%(T)Nested net_ns unsupported\n");
>  		ret = -EPERM;
>  	}
> -	/* no support for >1 private pidns */
> -	if (nsproxy->pid_ns != ctx->root_nsproxy->pid_ns) {
> -		_ckpt_err(ctx, -EPERM, "%(T)Nested pid_ns unsupported\n");
> -		ret = -EPERM;
> +	/* pidns must be descendent of root_nsproxy */
> +	pidns = nsproxy->pid_ns;

If the pid namespace has been unshared, task_active_pid_ns(t) should be
checked here instead of t->nsproxy->pid_ns: we can't checkpoint a foreign task.

Thanks,

Louis

> +	while (pidns != ctx->root_nsproxy->pid_ns) {
> +		if (pidns == &init_pid_ns) {
> +			ret = -EPERM;
> +			_ckpt_err(ctx, ret, "%(T)stranger pid_ns\n");
> +			break;
> +		}
> +		pidns = pidns->parent;
>  	}
>  	rcu_read_unlock();
>  
> @@ -305,15 +312,19 @@ static int may_checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
>  
>  #define CKPT_HDR_PIDS_CHUNK	256
>  
> +/*
> + * Write the pids in ctx->root_nsproxy->pidns.  This info is
> + * needed at restart to unambiguously dereference tasks.
> + */
>  static int checkpoint_pids(struct ckpt_ctx *ctx)
>  {
>  	struct ckpt_pids *h;
> -	struct pid_namespace *ns;
> +	struct pid_namespace *root_pidns;
>  	struct task_struct *task;
>  	struct task_struct **tasks_arr;
>  	int nr_tasks, n, pos = 0, ret = 0;
>  
> -	ns = ctx->root_nsproxy->pid_ns;
> +	root_pidns = ctx->root_nsproxy->pid_ns;
>  	tasks_arr = ctx->tasks_arr;
>  	nr_tasks = ctx->nr_tasks;
>  	BUG_ON(nr_tasks <= 0);
> @@ -331,15 +342,21 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
>  	do {
>  		rcu_read_lock();
>  		for (n = 0; n < min(nr_tasks, CKPT_HDR_PIDS_CHUNK); n++) {
> +			struct pid_namespace *task_pidns;
>  			task = tasks_arr[pos];
>  
> -			h[n].vpid = task_pid_nr_ns(task, ns);
> -			h[n].vtgid = task_tgid_nr_ns(task, ns);
> -			h[n].vpgid = task_pgrp_nr_ns(task, ns);
> -			h[n].vsid = task_session_nr_ns(task, ns);
> -			h[n].vppid = task_tgid_nr_ns(task->real_parent, ns);
> +			h[n].vpid = task_pid_nr_ns(task, root_pidns);
> +			h[n].vtgid = task_tgid_nr_ns(task, root_pidns);
> +			h[n].vpgid = task_pgrp_nr_ns(task, root_pidns);
> +			h[n].vsid = task_session_nr_ns(task, root_pidns);
> +			h[n].vppid = task_tgid_nr_ns(task->real_parent,
> +					root_pidns);
> +			task_pidns = task_active_pid_ns(task);
> +			h[n].rpid = task_pid_vnr(task);
> +			h[n].depth = task_pidns->level - root_pidns->level;
>  			ckpt_debug("task[%d]: vpid %d vtgid %d parent %d\n",
>  				   pos, h[n].vpid, h[n].vtgid, h[n].vppid);
> +			ctx->nr_vpids += h[n].depth;
>  			pos++;
>  		}
>  		rcu_read_unlock();
> @@ -356,6 +373,71 @@ static int checkpoint_pids(struct ckpt_ctx *ctx)
>  	return ret;
>  }
>  
> +static int checkpoint_vpids(struct ckpt_ctx *ctx)
> +{
> +	__s32 *h;  /* vpid array */
> +	struct pid_namespace *root_pidns, *task_pidns = NULL, *active_pidns;
> +	struct task_struct *task;
> +	int ret, nr_tasks = ctx->nr_tasks;
> +	int tidx = 0, /* index into task array */
> +		hidx = 0, /* pids written into current __s32 chunk */
> +		vidx = 0; /* vpid index for current task */
> +
> +	root_pidns = ctx->root_nsproxy->pid_ns;
> +	nr_tasks = ctx->nr_tasks;
> +
> +	ret = ckpt_write_obj_type(ctx, NULL,
> +				  sizeof(*h) * ctx->nr_vpids,
> +				  CKPT_HDR_BUFFER);
> +	if (ret < 0)
> +		return ret;
> +
> +	h = ckpt_hdr_get(ctx, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	do {
> +		rcu_read_lock();
> +		while (tidx < nr_tasks && hidx < CKPT_HDR_PIDS_CHUNK) {
> +			int nsdelta;
> +
> +			task = ctx->tasks_arr[tidx];
> +			active_pidns = task_active_pid_ns(task);
> +			nsdelta = active_pidns->level - root_pidns->level;
> +			if (hidx + nsdelta - vidx > CKPT_HDR_PIDS_CHUNK)
> +				/*
> +				 * We will release rcu before recording the
> +				 * remaining vpids, but neither task nor its
> +				 * pid can disappear.
> +				 */
> +				nsdelta = CKPT_HDR_PIDS_CHUNK - hidx + vidx;
> +
> +			if (vidx == 0)
> +				task_pidns = active_pidns;
> +			for (; vidx < nsdelta; vidx++) {
> +				h[hidx] = task_pid_nr_ns(task, task_pidns);
> +				hidx++;
> +				task_pidns = task_pidns->parent;
> +			}
> +
> +			if (task_pidns == root_pidns) {
> +				tidx++;
> +				vidx = 0;
> +			}
> +		}
> +		rcu_read_unlock();
> +
> +		ret = ckpt_kwrite(ctx, h, hidx * sizeof(*h));
> +		if (ret < 0)
> +			break;
> +
> +		hidx = 0;
> +	} while (tidx < nr_tasks);
> +
> +	_ckpt_hdr_put(ctx, h, sizeof(*h) * CKPT_HDR_PIDS_CHUNK);
> +	return ret;
> +}
> +
>  static int collect_objects(struct ckpt_ctx *ctx)
>  {
>  	int n, ret = 0;
> @@ -466,6 +548,7 @@ static int build_tree(struct ckpt_ctx *ctx)
>  static int checkpoint_tree(struct ckpt_ctx *ctx)
>  {
>  	struct ckpt_hdr_tree *h;
> +	struct ckpt_hdr_vpids *hvpids;
>  	int ret;
>  
>  	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TREE);
> @@ -480,7 +563,23 @@ static int checkpoint_tree(struct ckpt_ctx *ctx)
>  		return ret;
>  
>  	ret = checkpoint_pids(ctx);
> -	return ret;
> +	if (ret < 0)
> +		return ret;
> +
> +	hvpids = ckpt_hdr_get_type(ctx, sizeof(*hvpids), CKPT_HDR_VPIDS);
> +	if (!hvpids)
> +		return -ENOMEM;
> +
> +	hvpids->nr_vpids = ctx->nr_vpids;
> +
> +	ret = ckpt_write_obj(ctx, &hvpids->h);
> +	ckpt_hdr_put(ctx, hvpids);
> +	if (ret < 0)
> +		return ret;
> +	if (ctx->nr_vpids == 0)
> +		return 0;
> +
> +	return checkpoint_vpids(ctx);
>  }
>  
>  static struct task_struct *get_freezer_task(struct task_struct *root_task)
> diff --git a/checkpoint/process.c b/checkpoint/process.c
> index f917112..602ba9f 100644
> --- a/checkpoint/process.c
> +++ b/checkpoint/process.c
> @@ -22,7 +22,7 @@
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
>  #include <linux/syscalls.h>
> -
> +#include <linux/pid_namespace.h>
>  
>  pid_t ckpt_pid_nr(struct ckpt_ctx *ctx, struct pid *pid)
>  {
> @@ -51,7 +51,7 @@ struct pid *_ckpt_find_pgrp(struct ckpt_ctx *ctx, pid_t pgid)
>  		 * Find the owner process of this pgid (it must exist
>  		 * if pgrp exists). It must be a thread group leader.
>  		 */
> -		pgrp = find_vpid(pgid);
> +		pgrp = find_pid_ns(pgid, ctx->root_nsproxy->pid_ns);
>  		p = pid_task(pgrp, PIDTYPE_PID);
>  		if (!p || !thread_group_leader(p))
>  			return NULL;
> @@ -561,6 +561,13 @@ static int restore_task_struct(struct ckpt_ctx *ctx)
>  	return ret;
>  }
>  
> +/*
> + * restart is currently serialized, but if/when that changes we want
> + * to make sure that setting nsproxy->pidns in restore_task_ns() is only
> + * done once.  That's what checkpoint_nslock is for
> + */
> +DEFINE_SPINLOCK(checkpoint_nslock);
> +
>  static int restore_task_ns(struct ckpt_ctx *ctx)
>  {
>  	struct ckpt_hdr_task_ns *h;
> @@ -578,6 +585,10 @@ static int restore_task_ns(struct ckpt_ctx *ctx)
>  	}
>  
>  	if (nsproxy != task_nsproxy(current)) {
> +		spin_lock(&checkpoint_nslock);
> +		if (!nsproxy->pid_ns)
> +			nsproxy->pid_ns = get_pid_ns(current->nsproxy->pid_ns);
> +		spin_unlock(&checkpoint_nslock);
>  		get_nsproxy(nsproxy);
>  		switch_task_namespaces(current, nsproxy);
>  	}
> @@ -829,8 +840,8 @@ static int restore_task_pgid(struct ckpt_ctx *ctx)
>  
>  	pgid = ctx->pids_arr[ctx->active_pid].vpgid;
>  
> -	if (pgid == task_pgrp_vnr(task))  /* nothing to do */
> -		return 0;
> +	if (pgid == task_pgrp_nr_ns(task, ctx->root_nsproxy->pid_ns))
> +		return 0;  /* nothing to do */
>  
>  	if (task->signal->leader)  /* (2) */
>  		return -EINVAL;
> @@ -850,6 +861,9 @@ static int restore_task_pgid(struct ckpt_ctx *ctx)
>  	if (ctx->uflags & RESTART_TASKSELF)
>  		ret = 0;
>  
> +	if (ret < 0)
> +		ckpt_err(ctx, ret, "setting pgid\n");
> +
>  	return ret;
>  }
>  
> diff --git a/checkpoint/restart.c b/checkpoint/restart.c
> index 6a9644d..c25ce88 100644
> --- a/checkpoint/restart.c
> +++ b/checkpoint/restart.c
> @@ -23,6 +23,7 @@
>  #include <asm/syscall.h>
>  #include <linux/elf.h>
>  #include <linux/deferqueue.h>
> +#include <linux/pid_namespace.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
>  
> @@ -760,6 +761,33 @@ static int restore_read_tree(struct ckpt_ctx *ctx)
>  	return ret;
>  }
>  
> +/*
> + * read all the vpids - we don't actually care about them,
> + * userspace did
> + */
> +static int restore_slurp_vpids(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_hdr_vpids *h;
> +	int size, ret;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_VPIDS);
> +	if (IS_ERR(h))
> +		return PTR_ERR(h);
> +	ctx->nr_vpids = h->nr_vpids;
> +	ckpt_hdr_put(ctx, h);
> +
> +	if (!ctx->nr_vpids)
> +		return 0;
> +
> +	size = sizeof(__s32) * ctx->nr_vpids;
> +	if (size < 0)		/* overflow ? */
> +		return -EINVAL;
> +
> +	ret = ckpt_read_consume(ctx, size + sizeof(struct ckpt_hdr),
> +				CKPT_HDR_BUFFER);
> +	return ret;
> +}
> +
>  static inline int all_tasks_activated(struct ckpt_ctx *ctx)
>  {
>  	return (ctx->active_pid == ctx->nr_pids);
> @@ -848,7 +876,8 @@ static int restore_activate_next(struct ckpt_ctx *ctx)
>  		pid = get_active_pid(ctx);
>  
>  		rcu_read_lock();
> -		task = find_task_by_pid_ns(pid, ctx->root_nsproxy->pid_ns);
> +		task = find_task_by_pid_ns(pid,
> +					task_active_pid_ns(ctx->root_task));
>  		/* target task must have same restart context */
>  		if (task && task->checkpoint_ctx == ctx)
>  			wake_up_process(task);
> @@ -870,7 +899,7 @@ static int restore_activate_next(struct ckpt_ctx *ctx)
>  
>  static int wait_task_active(struct ckpt_ctx *ctx)
>  {
> -	pid_t pid = task_pid_vnr(current);
> +	pid_t pid = task_pid_nr_ns(current, task_active_pid_ns(ctx->root_task));
>  	int ret;
>  
>  	ckpt_debug("pid %d waiting\n", pid);
> @@ -886,7 +915,8 @@ static int wait_task_active(struct ckpt_ctx *ctx)
>  
>  static int wait_task_sync(struct ckpt_ctx *ctx)
>  {
> -	ckpt_debug("pid %d syncing\n", task_pid_vnr(current));
> +	ckpt_debug("pid %d syncing\n",
> +		task_pid_nr_ns(current, task_active_pid_ns(ctx->root_task)));
>  	wait_event_interruptible(ctx->waitq, ckpt_test_complete(ctx));
>  	ckpt_debug("task sync done (errno %d)\n", ctx->errno);
>  	if (ckpt_test_error(ctx))
> @@ -1160,7 +1190,7 @@ static struct task_struct *choose_root_task(struct ckpt_ctx *ctx, pid_t pid)
>  
>  	read_lock(&tasklist_lock);
>  	list_for_each_entry(task, &current->children, sibling) {
> -		if (task_pid_vnr(task) == pid) {
> +		if (task_pid_nr_ns(task, ctx->coord_pidns) == pid) {
>  			get_task_struct(task);
>  			ctx->root_task = task;
>  			ctx->root_pid = pid;
> @@ -1237,6 +1267,11 @@ static int do_restore_coord(struct ckpt_ctx *ctx, pid_t pid)
>  	if (ret < 0)
>  		return ret;
>  
> +	ret = restore_slurp_vpids(ctx);
> +	ckpt_debug("read vpids %d\n", ret);
> +	if (ret < 0)
> +		return ret;
> +
>  	if ((ctx->uflags & RESTART_TASKSELF) && ctx->nr_pids != 1)
>  		return -EINVAL;
>  
> diff --git a/checkpoint/sys.c b/checkpoint/sys.c
> index 9e9df9b..45d3e7a 100644
> --- a/checkpoint/sys.c
> +++ b/checkpoint/sys.c
> @@ -22,6 +22,7 @@
>  #include <linux/capability.h>
>  #include <linux/checkpoint.h>
>  #include <linux/deferqueue.h>
> +#include <linux/pid_namespace.h>
>  
>  /*
>   * ckpt_unpriv_allowed - sysctl controlled.
> @@ -276,6 +277,7 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
>  	ctx->uflags = uflags;
>  	ctx->kflags = kflags;
>  	ctx->ktime_begin = ktime_get();
> +	ctx->coord_pidns = get_pid_ns(current->nsproxy->pid_ns);
>  
>  	atomic_set(&ctx->refcount, 0);
>  	INIT_LIST_HEAD(&ctx->pgarr_list);
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index 792b523..e860bf5 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -10,7 +10,7 @@
>   *  distribution for more details.
>   */
>  
> -#define CHECKPOINT_VERSION  5
> +#define CHECKPOINT_VERSION  6
>  
>  /* checkpoint user flags */
>  #define CHECKPOINT_SUBTREE	0x1
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 41412d1..20c90b3 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -117,6 +117,8 @@ enum {
>  #define CKPT_HDR_GROUPINFO CKPT_HDR_GROUPINFO
>  	CKPT_HDR_TASK_CREDS,
>  #define CKPT_HDR_TASK_CREDS CKPT_HDR_TASK_CREDS
> +	CKPT_HDR_VPIDS,
> +#define CKPT_HDR_VPIDS CKPT_HDR_VPIDS
>  
>  	/* 201-299: reserved for arch-dependent */
>  
> @@ -327,11 +329,23 @@ struct ckpt_hdr_tree {
>  } __attribute__((aligned(8)));
>  
>  struct ckpt_pids {
> +	/* These pids are in the root_nsproxy's pid ns */
>  	__s32 vpid;
>  	__s32 vppid;
>  	__s32 vtgid;
>  	__s32 vpgid;
>  	__s32 vsid;
> +	/* rpid is the real pid, in checkpointer's pidns.  This is
> +	 * so checkpointer in userspace can get more info about the
> +	 * task (i.e. /proc/pid/mountinfo) */
> +	__s32 rpid;
> +	__s32 depth; /* pid namespace depth relative to container init */
> +} __attribute__((aligned(8)));
> +
> +/* number of vpids */
> +struct ckpt_hdr_vpids {
> +	struct ckpt_hdr h;
> +	__s32 nr_vpids;
>  } __attribute__((aligned(8)));
>  
>  /* pids */
> diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
> index ecd3e91..2fb79cf 100644
> --- a/include/linux/checkpoint_types.h
> +++ b/include/linux/checkpoint_types.h
> @@ -72,6 +72,9 @@ struct ckpt_ctx {
>  	struct task_struct **tasks_arr; /* array of all tasks [checkpoint] */
>  	int nr_tasks;                   /* size of tasks array */
>  
> +	int nr_vpids;
> +	struct pid_namespace *coord_pidns;	/* coordinator pid_ns */
> +
>  	/* [multi-process restart] */
>  	struct ckpt_pids *pids_arr;	/* array of all pids [restart] */
>  	int nr_pids;			/* size of pids array */
> diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
> index 0da0d83..6d86240 100644
> --- a/kernel/nsproxy.c
> +++ b/kernel/nsproxy.c
> @@ -364,8 +364,13 @@ static struct nsproxy *do_restore_ns(struct ckpt_ctx *ctx)
>  		get_net(net_ns);
>  		nsproxy->net_ns = net_ns;
>  
> -		get_pid_ns(current->nsproxy->pid_ns);
> -		nsproxy->pid_ns = current->nsproxy->pid_ns;
> +		/*
> +		 * The pid_ns will get assigned the first time that we
> +		 * assign the nsproxy to a task.  The task had unshared
> +		 * its pid_ns in userspace before calling restart, and
> +		 * we want to keep using that pid_ns.
> +		 */
> +		nsproxy->pid_ns = NULL;
>  	}
>   out:
>  	if (ret < 0)
> -- 
> 1.6.1
> 
> _______________________________________________
> Containers mailing list
> Containers at lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 197 bytes
Desc: Digital signature
Url : http://lists.linux-foundation.org/pipermail/containers/attachments/20100323/048e1a55/attachment.pgp 


More information about the Containers mailing list