[PATCH 5/7] clone: Introduce the CLONE_CHILD_USEPID functionality

Matt Helsley matthltc at us.ibm.com
Fri Jul 22 16:08:48 PDT 2011


On Thu, Jul 21, 2011 at 11:04:59AM -0500, Serge E. Hallyn wrote:
> Quoting Pavel Emelyanov (xemul at parallels.com):
> > The respective flag for clone() makes the latter take the desired
> > pid of a new process from the child_tidptr. The given pid is used as
> > the pid for the pid namespace the parent is currently running in.
> > 
> > Needed badly for restoring a process.
> > 
> > Signed-off-by: Pavel Emelyanov <xemul at parallels.com>
> 
> How do you intend to eventually support multiple pid namespaces?
> I don't mind not supporting them now, but if the interface
> cannot be extended to support it, I think that's a simple NACK.
> 
> IIRC Eric Biederman (explicitly cc:d) in the past has advocated using
> /proc/sys/pid_max games to specify a pid.  That is actually
> usable cross-namespace, though only serially.  Do you think that
> will be too cumbersome?

Didn't that already get NACK'd by others?

Regardless, I'm not a fan of that method. If we're going to add
an interface to enable doing something then it's best if we can
do it simply and directly -- not play games with some apparent
tunable in order to "covertly" get the pids we want.

> 
> There is also the clone_with_pid() syscall from Suka.  It
> accompanied the in-kernel checkpoint/restart patchset, but should
> be perfectly usable without it.  Is there a good reason not to
> pursue that?

aka eclone. It enabled extending the clone flags in the future. Suka
did substantial work making sure it worked for multiple architectures
(clone is rather special that way). It's already been tested.

Furthermore, Oren's user-cr code already creates a tree of
multiple processes with correct pids, pid namespaces, session ids,
and process group ids in userspace using only eclone. So we know eclone
can handle all of that today because there's code to do it.

So I think it would be better to incorporate the eclone patch set
unless, as you say, Pavel can see a good reason not to.

Cheers,
	-Matt Helsley

> 
> thanks,
> -serge
> 
> > ---
> >  include/linux/pid.h   |    2 +-
> >  include/linux/sched.h |    1 +
> >  kernel/fork.c         |   10 ++++++-
> >  kernel/pid.c          |   70 +++++++++++++++++++++++++++++++++++-------------
> >  4 files changed, 62 insertions(+), 21 deletions(-)
> > 
> > diff --git a/include/linux/pid.h b/include/linux/pid.h
> > index cdced84..de772ab 100644
> > --- a/include/linux/pid.h
> > +++ b/include/linux/pid.h
> > @@ -119,7 +119,7 @@ extern struct pid *find_get_pid(int nr);
> >  extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
> >  int next_pidmap(struct pid_namespace *pid_ns, unsigned int last);
> >  
> > -extern struct pid *alloc_pid(struct pid_namespace *ns);
> > +extern struct pid *alloc_pid(struct pid_namespace *ns, int pid);
> >  extern void free_pid(struct pid *pid);
> >  
> >  /*
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index 781abd1..5b6c1e2 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -23,6 +23,7 @@
> >  #define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */
> >  /* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
> >     and is now available for re-use. */
> > +#define CLONE_CHILD_USEPID	0x02000000	/* use the given pid */
> >  #define CLONE_NEWUTS		0x04000000	/* New utsname group? */
> >  #define CLONE_NEWIPC		0x08000000	/* New ipcs */
> >  #define CLONE_NEWUSER		0x10000000	/* New user namespace */
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index e7548de..f30fbdb 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -1183,8 +1183,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> >  		goto bad_fork_cleanup_io;
> >  
> >  	if (pid != &init_struct_pid) {
> > +		int want_pid = 0;
> > +
> > +		if (clone_flags & CLONE_CHILD_USEPID) {
> > +			retval = get_user(want_pid, child_tidptr);
> > +			if (retval)
> > +				goto bad_fork_cleanup_io;
> > +		}
> > +
> >  		retval = -ENOMEM;
> > -		pid = alloc_pid(p->nsproxy->pid_ns);
> > +		pid = alloc_pid(p->nsproxy->pid_ns, want_pid);
> >  		if (!pid)
> >  			goto bad_fork_cleanup_io;
> >  	}
> > diff --git a/kernel/pid.c b/kernel/pid.c
> > index 57a8346..69ae1be 100644
> > --- a/kernel/pid.c
> > +++ b/kernel/pid.c
> > @@ -159,11 +159,55 @@ static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
> >  	} while ((prev != last_write) && (pid_before(base, last_write, pid)));
> >  }
> >  
> > -static int alloc_pidmap(struct pid_namespace *pid_ns)
> > +static int alloc_pidmap_page(struct pidmap *map)
> > +{
> > +	if (unlikely(!map->page)) {
> > +		void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
> > +		/*
> > +		 * Free the page if someone raced with us
> > +		 * installing it:
> > +		 */
> > +		spin_lock_irq(&pidmap_lock);
> > +		if (!map->page) {
> > +			map->page = page;
> > +			page = NULL;
> > +		}
> > +		spin_unlock_irq(&pidmap_lock);
> > +		kfree(page);
> > +		if (unlikely(!map->page))
> > +			return -ENOMEM;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static int set_pidmap(struct pid_namespace *pid_ns, int pid)
> > +{
> > +	int offset;
> > +	struct pidmap *map;
> > +
> > +	offset = pid & BITS_PER_PAGE_MASK;
> > +	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
> > +
> > +	if (alloc_pidmap_page(map) < 0)
> > +		return -ENOMEM;
> > +
> > +	if (!test_and_set_bit(offset, map->page)) {
> > +		atomic_dec(&map->nr_free);
> > +		return pid;
> > +	}
> > +
> > +	return -EBUSY;
> > +}
> > +
> > +static int alloc_pidmap(struct pid_namespace *pid_ns, int desired_pid)
> >  {
> >  	int i, offset, max_scan, pid, last = pid_ns->last_pid;
> >  	struct pidmap *map;
> >  
> > +	if (desired_pid)
> > +		return set_pidmap(pid_ns, desired_pid);
> > +
> >  	pid = last + 1;
> >  	if (pid >= pid_max)
> >  		pid = RESERVED_PIDS;
> > @@ -176,22 +220,9 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
> >  	 */
> >  	max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
> >  	for (i = 0; i <= max_scan; ++i) {
> > -		if (unlikely(!map->page)) {
> > -			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
> > -			/*
> > -			 * Free the page if someone raced with us
> > -			 * installing it:
> > -			 */
> > -			spin_lock_irq(&pidmap_lock);
> > -			if (!map->page) {
> > -				map->page = page;
> > -				page = NULL;
> > -			}
> > -			spin_unlock_irq(&pidmap_lock);
> > -			kfree(page);
> > -			if (unlikely(!map->page))
> > -				break;
> > -		}
> > +		if (alloc_pidmap_page(map) < 0)
> > +			break;
> > +
> >  		if (likely(atomic_read(&map->nr_free))) {
> >  			do {
> >  				if (!test_and_set_bit(offset, map->page)) {
> > @@ -277,7 +308,7 @@ void free_pid(struct pid *pid)
> >  	call_rcu(&pid->rcu, delayed_put_pid);
> >  }
> >  
> > -struct pid *alloc_pid(struct pid_namespace *ns)
> > +struct pid *alloc_pid(struct pid_namespace *ns, int this_ns_pid)
> >  {
> >  	struct pid *pid;
> >  	enum pid_type type;
> > @@ -291,13 +322,14 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> >  
> >  	tmp = ns;
> >  	for (i = ns->level; i >= 0; i--) {
> > -		nr = alloc_pidmap(tmp);
> > +		nr = alloc_pidmap(tmp, this_ns_pid);
> >  		if (nr < 0)
> >  			goto out_free;
> >  
> >  		pid->numbers[i].nr = nr;
> >  		pid->numbers[i].ns = tmp;
> >  		tmp = tmp->parent;
> > +		this_ns_pid = 0;
> >  	}
> >  
> >  	get_pid_ns(ns);
> > -- 
> > 1.5.5.6
> > _______________________________________________
> > Containers mailing list
> > Containers at lists.linux-foundation.org
> > https://lists.linux-foundation.org/mailman/listinfo/containers
> _______________________________________________
> Containers mailing list
> Containers at lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers


More information about the Containers mailing list