How much of a mess does OpenVZ make? ;) Was: What can OpenVZ do?

Ying Han yinghan at google.com
Thu Mar 12 23:19:43 PDT 2009


Thank you Sukadev for your comments. I will try to clean up my patch
and repost it.

--Ying

On Thu, Mar 12, 2009 at 10:34 PM, Sukadev Bhattiprolu
<sukadev at linux.vnet.ibm.com> wrote:
> Ying Han [yinghan at google.com] wrote:
> | Hi Serge:
> | I made a patch based on Oren's tree recently which implement a new
> | syscall clone_with_pid. I tested with checkpoint/restart process tree
> | and it works as expected.
>
> Yes, I think we had a version of clone() with pid a while ago.
>
> But it would be easier to review if you break it up into smaller
> patches. and remove the unnecessary diffs in this patch like...
>
>
> | This patch has some hack in it which i made a copy of libc's clone and
> | made modifications of passing one more argument(pid number). I will
> | try to clean up the code and do more testing.
> |
> | New syscall clone_with_pid
> | Implement a new syscall which clone a thread with a preselected pid number.
> |
> | clone_with_pid(child_func, child_stack + CHILD_STACK - 16,
> |                       CLONE_WITH_PID|SIGCHLD, pid, NULL);
> |
> | Signed-off-by: Ying Han <yinghan at google.com>
> |
> | diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
> | index 87803da..b5a1b03 100644
> | --- a/arch/x86/include/asm/syscalls.h
> | +++ b/arch/x86/include/asm/syscalls.h
> | @@ -26,6 +26,7 @@ asmlinkage int sys_fork(struct pt_regs);
> |  asmlinkage int sys_clone(struct pt_regs);
> |  asmlinkage int sys_vfork(struct pt_regs);
> |  asmlinkage int sys_execve(struct pt_regs);
> | +asmlinkage int sys_clone_with_pid(struct pt_regs);
> |
> |  /* kernel/signal_32.c */
> |  asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
> | diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32
> | index a5f9e09..f10ca0e 100644
> | --- a/arch/x86/include/asm/unistd_32.h
> | +++ b/arch/x86/include/asm/unistd_32.h
> | @@ -340,6 +340,7 @@
> |  #define __NR_inotify_init1   332
> |  #define __NR_checkpoint              333
> |  #define __NR_restart         334
> | +#define __NR_clone_with_pid  335
> |
> |  #ifdef __KERNEL__
> |
> | diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
> | index 0a1302f..88ae634 100644
> | --- a/arch/x86/kernel/process_32.c
> | +++ b/arch/x86/kernel/process_32.c
> | @@ -8,7 +8,6 @@
> |  /*
> |   * This file handles the architecture-dependent parts of process handling..
> |   */
> | -
>
> these
>
> |  #include <stdarg.h>
> |
> |  #include <linux/cpu.h>
> | @@ -652,6 +651,28 @@ asmlinkage int sys_clone(struct pt_regs regs)
> |       return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
> |  }
> |
> | +/**
> | + * sys_clone_with_pid - clone a thread with pre-select pid number.
> | + */
> | +asmlinkage int sys_clone_with_pid(struct pt_regs regs)
> | +{
> | +     unsigned long clone_flags;
> | +     unsigned long newsp;
> | +     int __user *parent_tidptr, *child_tidptr;
> | +     pid_t pid_nr;
> | +
> | +     clone_flags = regs.bx;
> | +     newsp = regs.cx;
> | +     parent_tidptr = (int __user *)regs.dx;
> | +     child_tidptr = (int __user *)regs.di;
> | +     pid_nr = regs.bp;
> | +
> | +     if (!newsp)
> | +             newsp = regs.sp;
> | +     return do_fork(clone_flags, newsp, &regs, pid_nr, parent_tidptr,
> | +                     child_tidptr);
> | +}
> | +
> |  /*
> |   * This is trivial, and on the face of it looks like it
> |   * could equally well be done in user mode.
> | diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_tabl
> | index 5543136..5191117 100644
> | --- a/arch/x86/kernel/syscall_table_32.S
> | +++ b/arch/x86/kernel/syscall_table_32.S
> | @@ -334,3 +334,4 @@ ENTRY(sys_call_table)
> |       .long sys_inotify_init1
> |       .long sys_checkpoint
> |       .long sys_restart
> | +     .long sys_clone_with_pid
> | diff --git a/arch/x86/mm/checkpoint.c b/arch/x86/mm/checkpoint.c
> | index 50bde9a..a4aee65 100644
> | --- a/arch/x86/mm/checkpoint.c
> | +++ b/arch/x86/mm/checkpoint.c
> | @@ -7,7 +7,6 @@
> |   *  License.  See the file COPYING in the main directory of the Linux
> |   *  distribution for more details.
> |   */
> | -
> |  #include <asm/desc.h>
> |  #include <asm/i387.h>
> |
> | diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
> | index 64155de..b7de611 100644
> | --- a/checkpoint/checkpoint.c
> | +++ b/checkpoint/checkpoint.c
> | @@ -8,6 +8,7 @@
> |   *  distribution for more details.
> |   */
> |
> | +#define DEBUG
> |  #include <linux/version.h>
> |  #include <linux/sched.h>
> |  #include <linux/ptrace.h>
> | @@ -564,3 +565,4 @@ int do_checkpoint(struct cr_ctx *ctx, pid_t pid)
> |   out:
> |       return ret;
> |  }
> | +
> | diff --git a/checkpoint/ckpt_file.c b/checkpoint/ckpt_file.c
> | index e3097ac..a8c5ad5 100644
> | --- a/checkpoint/ckpt_file.c
> | +++ b/checkpoint/ckpt_file.c
> | @@ -7,7 +7,7 @@
> |   *  License.  See the file COPYING in the main directory of the Linux
> |   *  distribution for more details.
> |   */
> | -
> | +#define DEBUG
> |  #include <linux/kernel.h>
> |  #include <linux/sched.h>
> |  #include <linux/file.h>
> | diff --git a/checkpoint/ckpt_mem.c b/checkpoint/ckpt_mem.c
> | index 4925ff2..ca5840b 100644
> | --- a/checkpoint/ckpt_mem.c
> | +++ b/checkpoint/ckpt_mem.c
> | @@ -7,7 +7,7 @@
> |   *  License.  See the file COPYING in the main directory of the Linux
> |   *  distribution for more details.
> |   */
> | -
> | +#define DEBUG
> |  #include <linux/kernel.h>
> |  #include <linux/sched.h>
> |  #include <linux/slab.h>
> | diff --git a/checkpoint/restart.c b/checkpoint/restart.c
> | index 7ec4de4..30e43c2 100644
> | --- a/checkpoint/restart.c
> | +++ b/checkpoint/restart.c
> | @@ -8,6 +8,7 @@
> |   *  distribution for more details.
> |   */
> |
> | +#define DEBUG
> |  #include <linux/version.h>
> |  #include <linux/sched.h>
> |  #include <linux/wait.h>
> | @@ -242,7 +243,7 @@ static int cr_read_task_struct(struct cr_ctx *ctx)
> |               memcpy(t->comm, buf, min(hh->task_comm_len, TASK_COMM_LEN));
> |       }
> |       kfree(buf);
> | -
> | +     pr_debug("read task %s\n", t->comm);
> |       /* FIXME: restore remaining relevant task_struct fields */
> |   out:
> |       cr_hbuf_put(ctx, sizeof(*hh));
> | diff --git a/checkpoint/rstr_file.c b/checkpoint/rstr_file.c
> | index f44b081..755e40e 100644
> | --- a/checkpoint/rstr_file.c
> | +++ b/checkpoint/rstr_file.c
> | @@ -7,7 +7,7 @@
> |   *  License.  See the file COPYING in the main directory of the Linux
> |   *  distribution for more details.
> |   */
> | -
> | +#define DEBUG
> |  #include <linux/kernel.h>
> |  #include <linux/sched.h>
> |  #include <linux/fs.h>
> | diff --git a/checkpoint/rstr_mem.c b/checkpoint/rstr_mem.c
> | index 4d5ce1a..8330468 100644
> | --- a/checkpoint/rstr_mem.c
> | +++ b/checkpoint/rstr_mem.c
> | @@ -7,7 +7,7 @@
> |   *  License.  See the file COPYING in the main directory of the Linux
> |   *  distribution for more details.
> |   */
> | -
> | +#define DEBUG
> |  #include <linux/kernel.h>
> |  #include <linux/sched.h>
> |  #include <linux/fcntl.h>
> | diff --git a/checkpoint/sys.c b/checkpoint/sys.c
> | index f26b0c6..d1a5394 100644
> | --- a/checkpoint/sys.c
> | +++ b/checkpoint/sys.c
> | @@ -7,7 +7,7 @@
> |   *  License.  See the file COPYING in the main directory of the Linux
> |   *  distribution for more details.
> |   */
> | -
> | +#define DEBUG
> |  #include <linux/sched.h>
> |  #include <linux/nsproxy.h>
> |  #include <linux/kernel.h>
> | @@ -263,7 +263,6 @@ asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned
> |               return PTR_ERR(ctx);
> |
> |       ret = do_checkpoint(ctx, pid);
> | -
> |       if (!ret)
> |               ret = ctx->crid;
> |
> | @@ -304,3 +303,4 @@ asmlinkage long sys_restart(int crid, int fd, unsigned lon
> |       cr_ctx_put(ctx);
> |       return ret;
> |  }
> | +
> | diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> | index 217cf6e..bc2c202 100644
> | --- a/include/linux/checkpoint.h
> | +++ b/include/linux/checkpoint.h
> | @@ -114,7 +114,6 @@ extern int cr_write_files(struct cr_ctx *ctx, struct task_
> |  extern int do_restart(struct cr_ctx *ctx, pid_t pid);
> |  extern int cr_read_mm(struct cr_ctx *ctx);
> |  extern int cr_read_files(struct cr_ctx *ctx);
> | -
> |  #ifdef pr_fmt
> |  #undef pr_fmt
> |  #endif
> | diff --git a/include/linux/pid.h b/include/linux/pid.h
> | index d7e98ff..86e2f61 100644
> | --- a/include/linux/pid.h
> | +++ b/include/linux/pid.h
> | @@ -119,7 +119,7 @@ extern struct pid *find_get_pid(int nr);
> |  extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
> |  int next_pidmap(struct pid_namespace *pid_ns, int last);
> |
> | -extern struct pid *alloc_pid(struct pid_namespace *ns);
> | +extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t pid_nr);
> |  extern void free_pid(struct pid *pid);
> |
> |  /*
> | diff --git a/include/linux/sched.h b/include/linux/sched.h
> | index 0150e90..7fb4e28 100644
> | --- a/include/linux/sched.h
> | +++ b/include/linux/sched.h
> | @@ -28,6 +28,7 @@
> |  #define CLONE_NEWPID         0x20000000      /* New pid namespace */
> |  #define CLONE_NEWNET         0x40000000      /* New network namespace */
> |  #define CLONE_IO             0x80000000      /* Clone io context */
> | +#define CLONE_WITH_PID               0x00001000      /* Clone with pre-select PID */
> |
> |  /*
> |   * Scheduling policies
> | diff --git a/kernel/exit.c b/kernel/exit.c
> | index 2d8be7e..4baf651 100644
> | --- a/kernel/exit.c
> | +++ b/kernel/exit.c
> | @@ -3,7 +3,7 @@
> |   *
> |   *  Copyright (C) 1991, 1992  Linus Torvalds
> |   */
> | -
> | +#define DEBUG
> |  #include <linux/mm.h>
> |  #include <linux/slab.h>
> |  #include <linux/interrupt.h>
> | @@ -1676,6 +1676,7 @@ static long do_wait(enum pid_type type, struct pid *pid,
> |       DECLARE_WAITQUEUE(wait, current);
> |       struct task_struct *tsk;
> |       int retval;
> | +     int level;
>
> and this (level is not used).
> |
> |       trace_sched_process_wait(pid);
> |
> | @@ -1708,7 +1709,6 @@ repeat:
> |                       retval = tsk_result;
> |                       goto end;
> |               }
> | -
> |               if (options & __WNOTHREAD)
> |                       break;
> |               tsk = next_thread(tsk);
> | @@ -1817,7 +1817,6 @@ asmlinkage long sys_wait4(pid_t upid, int __user *stat_a
> |               type = PIDTYPE_PID;
> |               pid = find_get_pid(upid);
> |       }
> | -
> |       ret = do_wait(type, pid, options | WEXITED, NULL, stat_addr, ru);
> |       put_pid(pid);
> |
> | diff --git a/kernel/fork.c b/kernel/fork.c
> | index 085ce56..262ae1e 100644
> | --- a/kernel/fork.c
> | +++ b/kernel/fork.c
> | @@ -10,7 +10,7 @@
> |   * Fork is rather simple, once you get the hang of it, but the memory
> |   * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
> |   */
> | -
> | +#define DEBUG
> |  #include <linux/slab.h>
> |  #include <linux/init.h>
> |  #include <linux/unistd.h>
> | @@ -959,10 +959,19 @@ static struct task_struct *copy_process(unsigned long cl
> |       int retval;
> |       struct task_struct *p;
> |       int cgroup_callbacks_done = 0;
> | +     pid_t clone_pid = stack_size;
> |
> |       if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
> |               return ERR_PTR(-EINVAL);
> |
> | +     /* We only allow the clone_with_pid when a new pid namespace is
> | +      * created. FIXME: how to restrict it.
>
> Not sure why CLONE_NEWPID is required to set pid_nr. In fact with CLONE_NEWPID,
> by definition, pid_nr should be 1. Also, what happens if a container has
> more than one process - where the second process has a pid_nr > 2 ?
>


More information about the Containers mailing list