[PATCH user-cr 2/2] add nsexeccwp to test clone-with-pids

Nathan Lynch ntl at pobox.com
Mon Nov 16 15:18:42 PST 2009


On Mon, 2009-11-16 at 12:26 -0600, Serge E. Hallyn wrote:
> Quoting Nathan Lynch (ntl at pobox.com):
> > On Mon, 2009-11-16 at 05:12 -0600, Serge E. Hallyn wrote:
> > > Quoting Nathan Lynch (ntl at pobox.com):
> > > > On Thu, 2009-11-12 at 23:24 -0600, serue at us.ibm.com wrote:
> > > > > +	if (use_clone) {
> > > > > +		int stacksize = 4*getpagesize();
> > > > > +		void *stack = malloc(stacksize);
> > > > > +
> > > > > +		if (!stack) {
> > > > > +			perror("malloc");
> > > > > +			return -1;
> > > > > +		}
> > > > > +
> > > > > +		printf("about to clone with %lx\n", flags);
> > > > > +		if (chosen_pid)
> > > > > +			printf("Will choose pid %d\n", chosen_pid);
> > > > > +		flags |= SIGCHLD;
> > > > > +		pid = clone_with_pids(do_child, stack, flags, &pid_set,
> > > > > +					(void *)argv);
> > > > 
> > > > The stack argument should be adjusted with the usual stack += stacksize
> > > > - 1 or similar, right?
> > > 
> > > the clone_with_pids() helper in user-cr/clone_s390x.c (and IIRC the
> > > x86 one by Suka also) does this implicitly, by doing:
> > > 
> > > 	s = child_stack;
> > > 	*--s = arg;
> > > 	*--s = fn;
> > > 	child_stack -= 16
> > 
> > That's setting up arguments for the function to run in the child, and
> > afaict that code assumes the value of child_stack is the _end_ of the
> > stack region.
> 
> Yes.
> 
> > The code I quoted above is passing the beginning of the
> > region (the return value from malloc).
> 
> Holy cow, that was a snafu in my switching to sending (stack_base,stack_size)
> for the previous version, and then back again.  It was meant to send
> stack_base+stack_size now.

Okay, here's the violence I've committed against your code to get eclone
working on powerpc (tested 32-bit userspace against 64-bit kernel).

./nsexeccwp -z 300 /bin/bash -c 'echo $$'
[debugging cruft elided]
300

At this point this is meant for discussion, not for inclusion.  I made
some changes that will certainly break the build for other
architectures.

Note that the generic code initializes clone_args with the true stack
base and size and passes that to the architecture code.  The
architecture code (e.g. clone_ppc.c) is responsible for calculating the
stack pointer to pass to the kernel.  It is also responsible for
clearing clone_args.child_stack_size and updating
clone_args.child_stack, adjusting for alignment and arguments if
appropriate.  In this way, we can accommodate ia64 and parisc (which
cannot simply be handed a top-of-stack pointer) and keep platform
details in platform-specific code.


 clone_ppc.c  |   54 +++++++++++++++++++++++++++++++++++
 clone_ppc_.S |   88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 eclone.h     |   25 ++++++++++++++++
 nsexeccwp.c  |   42 ++++++++++++----------------
 4 files changed, 182 insertions(+), 27 deletions(-)

diff --git a/clone_ppc.c b/clone_ppc.c
index 49797fd..9e19fae 100644
--- a/clone_ppc.c
+++ b/clone_ppc.c
@@ -10,14 +10,25 @@
 
 #define _GNU_SOURCE
 
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
 #include <unistd.h>
 #include <errno.h>
 #include <sys/types.h>
 #include <sys/syscall.h>
 #include <asm/unistd.h>
 
+#include "eclone.h"
+
 struct target_pid_set;
 
+struct pid_set {
+	size_t nr_pids;
+	pid_t *pids;
+};
+
+
 extern int __clone_with_pids(int (*fn)(void *arg),
 			     void *child_stack ,
 			     int flags,
@@ -56,3 +67,46 @@ int clone_with_pids(int (*fn)(void *), void *child_stack, int flags,
 }
 
 #endif
+
+extern int __eclone(int (*fn)(void *arg),
+		    void *child_sp,
+		    int flags,
+		    void *fn_arg,
+		    struct clone_args *args,
+		    size_t args_size,
+		    pid_t *pids);
+
+int eclone(int (*fn)(void *), void *fn_arg, int clone_flags_low,
+	   struct clone_args *clone_args, pid_t *pids)
+{
+	struct clone_args my_args;
+	unsigned long child_sp;
+	int newpid;
+
+	if (clone_args->child_stack)
+		child_sp = clone_args->child_stack +
+			clone_args->child_stack_size - 1;
+	else
+		child_sp = 0;
+
+	my_args = *clone_args;
+	my_args.child_stack = child_sp;
+	my_args.child_stack_size = 0;
+
+	printf("%s: child_sp = %p\n", __func__, (void *)child_sp);
+
+	newpid = __eclone(fn,
+			  (void *)child_sp,
+			  clone_flags_low,
+			  fn_arg,
+			  &my_args,
+			  sizeof(my_args),
+			  pids);
+
+	if (newpid < 0) {
+		errno = -newpid;
+		newpid = -1;
+	}
+
+	return newpid;
+}
diff --git a/clone_ppc_.S b/clone_ppc_.S
index cb3e053..b777b2d 100644
--- a/clone_ppc_.S
+++ b/clone_ppc_.S
@@ -11,6 +11,14 @@
 #include <asm/unistd.h>
 #include "powerpc_asm.h"
 
+#ifndef __NR_clone_with_pids
+#define __NR_clone_with_pids	325
+#endif
+
+#ifndef __NR_eclone
+#define __NR_eclone	323
+#endif
+
 /* int [r3] clone_with_pids(int (*fn)(void *arg) [r3],
  *                          void *child_stack [r4],
  *                          int flags [r5],
@@ -29,10 +37,10 @@
 .globl __clone_with_pids
 __clone_with_pids:
 
-/* No argument validation. */
+	/* No argument validation. */
 
-/* Set up parent's stack frame. */
-stwu	r1,-32(r1)
+	/* Set up parent's stack frame. */
+	stwu	r1,-32(r1)
 
 	/* Save non-volatiles (r28-r31) which we plan to use. */
 	stmw	r28,16(r1)
@@ -88,3 +96,77 @@ parent:
 	neg	r3,r3
 	blr
 
+/* int [r3] __eclone(int (*fn)(void *arg) [r3],
+ *                   void *child_sp [r4],
+ *                   int flags [r5],
+ *                   void *fn_arg [r6],
+ *                   struct clone_args *args [r7],
+ *                   size_t args_size [r8],
+ *                   pid_t *pids [r9]);
+ * Creates a child task with the pids specified by pids.
+ * Returns to parent only; child execution and exit are handled here.
+ * On error, returns negated errno.  On success, returns the pid of the child
+ * created.
+ */
+
+.globl __eclone
+__eclone:
+
+	/* No argument validation. */
+
+	/* Set up parent's stack frame. */
+	stwu	r1,-32(r1)
+
+	/* Save non-volatiles (r28-r31) which we plan to use. */
+	stmw	r28,16(r1)
+
+	/* Set up child's stack frame. */
+	clrrwi	r4,r4,4
+	li	r0,0
+	stw	r0,-16(r4)
+
+	/* Save fn, stack pointer, flags, and fn_arg across system call. */
+	mr	r28,r3
+	mr	r29,r4
+	mr	r30,r5
+	mr	r31,r6
+
+	/* Set up arguments for system call. */
+	mr	r3,r5	/* flags */
+	mr	r4,r7	/* clone_args */
+	mr	r5,r8	/* clone_args' size */
+	mr	r6,r9	/* pids */
+
+	/* Do the system call */
+	li	r0,__NR_eclone
+	sc
+
+	/* Parent or child? */
+	cmpwi	cr1,r3,0
+	crandc	4*cr1+eq,4*cr1+eq,4*cr0+so
+	bne	cr1,eclone_parent
+
+	/* Child. Call fn. */
+	mtctr	r28
+	mr 	r3,r31
+	bctrl
+
+	/* Assume result of fn in r3 and exit. */
+	li	r0,__NR_exit
+	sc
+
+eclone_parent:
+	/* Restore non-volatiles. */
+	lmw	r28,16(r1)
+
+	addi	r1,r1,32
+
+	/* Return to caller on success. */
+	bnslr
+
+	/* Handle error.  Negate the return value to signal an error
+	 * to the caller, which must set errno.
+	 */
+	neg	r3,r3
+	blr
+
diff --git a/eclone.h b/eclone.h
new file mode 100644
index 0000000..601a621
--- /dev/null
+++ b/eclone.h
@@ -0,0 +1,25 @@
+#ifndef _ECLONE_H_
+#define _ECLONE_H_
+
+#include <stdint.h>
+
+struct clone_args {
+	uint64_t clone_flags_high;
+	uint64_t child_stack;
+	uint64_t child_stack_size;
+	uint64_t parent_tid_ptr;
+	uint64_t child_tid_ptr;
+
+	uint32_t nr_pids;
+
+	uint32_t reserved0;
+	uint64_t reserved1;
+};
+
+/* arch-dependent code implements this interface */
+extern int eclone(int (*fn)(void *), void *fn_arg,
+		  int clone_flags_low,
+		  struct clone_args *clone_args,
+		  pid_t *pids);
+
+#endif
diff --git a/nsexeccwp.c b/nsexeccwp.c
index a71d9a4..b80b78e 100644
--- a/nsexeccwp.c
+++ b/nsexeccwp.c
@@ -17,29 +17,13 @@
 #include <sys/wait.h>
 
 #include "clone.h"
+#include "eclone.h"
 
 struct pid_set {
 	int num_pids;
 	pid_t *pids;
 };
 
-typedef unsigned long long u64;
-typedef unsigned int u32;
-typedef int pid_t;
-struct clone_args {
-	u64 clone_flags_high;
-
-	u64 child_stack_base;
-	u64 child_stack_size;
-
-	u64 parent_tid_ptr;
-	u64 child_tid_ptr;
-
-	u32 nr_pids;
-
-	u32 reserved0;
-	u64 reserved1;
-};
 /* (until it's supported by libc) from clone_ARCH.c */
 extern int clone_with_pids(int (*fn)(void *), void *child_stack, int flags,
 			   struct pid_set *target_pids, void *arg);
@@ -210,6 +194,9 @@ int do_child(void *vargv)
 {
 	char **argv = (char **)vargv;
 
+	printf("%s(%p)/%lu\n", __func__, vargv, (unsigned long)getpid());
+	fflush(NULL);
+
 	if (check_newcgrp())
 		return 1;
 
@@ -237,6 +224,7 @@ void write_pid(char *pid_file, int pid)
 
 int main(int argc, char *argv[])
 {
+	int i;
 	int c;
 	unsigned long flags = 0, eflags = 0;
 	char ttyname[256];
@@ -244,11 +232,8 @@ int main(int argc, char *argv[])
 	int ret, use_clone = 0;
 	int pid;
 	char *pid_file = NULL;
-	struct pid_set pid_set;
-	int chosen_pid = 0;
-
-	pid_set.num_pids = 1;
-	pid_set.pids = &chosen_pid;
+	size_t nr_pids = 1;
+	pid_t chosen_pid = 0;
 
 	procname = basename(argv[0]);
 
@@ -287,6 +272,9 @@ int main(int argc, char *argv[])
 	argv = &argv[optind];
 	argc = argc - optind;
 
+	for (i = 0; i < argc; i++)
+		printf("argv[%d] = '%s'\n", i, argv[i]);
+
 	if (do_newcgrp) {
 		ret = pipe(pipefd);
 		if (ret) {
@@ -297,6 +285,7 @@ int main(int argc, char *argv[])
 	}
 
 	if (use_clone) {
+		struct clone_args clone_args;
 		int stacksize = 4*getpagesize();
 		void *stack = malloc(stacksize);
 
@@ -305,12 +294,17 @@ int main(int argc, char *argv[])
 			return -1;
 		}
 
+		memset(&clone_args, 0, sizeof(clone_args));
+		clone_args.child_stack = (unsigned long)stack;
+		clone_args.child_stack_size = stacksize;
+		clone_args.nr_pids = nr_pids;
+
 		printf("about to clone with %lx\n", flags);
 		if (chosen_pid)
 			printf("Will choose pid %d\n", chosen_pid);
+		printf("argv = %p\n", argv);
 		flags |= SIGCHLD;
-		pid = clone_with_pids(do_child, stack, flags, &pid_set,
-					(void *)argv);
+		pid = eclone(do_child, argv, flags, &clone_args, &chosen_pid);
 		if (pid == -1) {
 			perror("clone");
 			return -1;




More information about the Containers mailing list