[PATCH 1/2] Run dump pipe in container's namespace

Zhao Lei zhaolei at cn.fujitsu.com
Wed Mar 16 09:23:18 UTC 2016


In the current system, when core_pattern is set to a pipe, both the pipe
program and the program's output are located in the host's filesystem.

For example, when we set the following core_pattern:
 # echo "|/my_dump_pipe %s %c %p %u %g %t e" >/proc/sys/kernel/core_pattern
and trigger a segmentation fault in a container, my_dump_pipe is looked up
in the host's filesystem, and it writes the coredump into the host's
filesystem too.

In a privileged container, a user can crash the host system with the
following commands:
 # # In a container
 # echo "|/bin/dd of=/boot/vmlinuz" >/proc/sys/kernel/core_pattern
 # make_dump

Operations in a container should not change the host's environment; the
container should use core_pattern as its own private setting.
In detail, the core dump action should:
1: Search for the pipe program in the container's fs namespace.
2: Run the pipe program in the container's fs namespace, so that the
   coredump is written there.

This patch fixes the above problem by running the pipe program in the
user process's context instead of in a kthread.

Test:
 # ################
 # # In host's system
 # ################
 #
 # ulimit -c 1024000
 # echo "|/dump_pipe" >/proc/sys/kernel/core_pattern
 # cat /dump_pipe
 #!/bin/sh
 cat >/tmp/host_dump_$1_$2_$3_$4_$5_$6
 # rm -f /tmp/*dump*
 # ./make_dump
 Segmentation fault (core dumped)
 # ls -l /tmp/*dump*
 -rw-r--r-- 1 root root 331776 Mar 16 16:57 /tmp/host_dump______
 #
 # lxc-start -n vm01
 #
 # ################
 # # In guest's system:
 # ################
 #
 # cat /proc/sys/kernel/core_pattern
 |/dump_pipe
 # cat /dump_pipe
 #!/bin/sh
 cat >/tmp/guest_dump_$1_$2_$3_$4_$5_$6
 # rm -f /tmp/*dump*
 # ./make_dump
 Segmentation fault (core dumped)
 # ls -l /tmp/*dump*
 -rw-r--r--    1 root     root       331776 Mar 16 09:02 /tmp/guest_dump______
 #

Signed-off-by: Zhao Lei <zhaolei at cn.fujitsu.com>
---
 arch/x86/kernel/process_32.c |  5 +--
 arch/x86/kernel/process_64.c |  5 +--
 fs/coredump.c                | 76 +++++++++++++++++++++++++++-----------------
 include/linux/sched.h        |  5 +--
 kernel/fork.c                | 24 ++++++++------
 5 files changed, 69 insertions(+), 46 deletions(-)

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 9f95091..2b1862e 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -130,7 +130,8 @@ void release_thread(struct task_struct *dead_task)
 }
 
 int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
-	unsigned long arg, struct task_struct *p, unsigned long tls)
+	unsigned long arg, struct task_struct *p, unsigned long tls,
+	int return_to_kernel)
 {
 	struct pt_regs *childregs = task_pt_regs(p);
 	struct task_struct *tsk;
@@ -140,7 +141,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 	p->thread.sp0 = (unsigned long) (childregs+1);
 	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 
-	if (unlikely(p->flags & PF_KTHREAD)) {
+	if (unlikely(p->flags & PF_KTHREAD) || return_to_kernel) {
 		/* kernel thread */
 		memset(childregs, 0, sizeof(struct pt_regs));
 		p->thread.ip = (unsigned long) ret_from_kernel_thread;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b9d99e0..de05bc0 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -153,7 +153,8 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls)
 }
 
 int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
-		unsigned long arg, struct task_struct *p, unsigned long tls)
+		unsigned long arg, struct task_struct *p, unsigned long tls,
+		int return_to_kernel)
 {
 	int err;
 	struct pt_regs *childregs;
@@ -173,7 +174,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 	savesegment(ds, p->thread.ds);
 	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 
-	if (unlikely(p->flags & PF_KTHREAD)) {
+	if (unlikely(p->flags & PF_KTHREAD) || return_to_kernel) {
 		/* kernel thread */
 		memset(childregs, 0, sizeof(struct pt_regs));
 		childregs->sp = (unsigned long)childregs;
diff --git a/fs/coredump.c b/fs/coredump.c
index 9ea87e9..6287f00 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -496,33 +496,50 @@ static void wait_for_dump_helpers(struct file *file)
 	pipe_unlock(pipe);
 }
 
-/*
- * umh_pipe_setup
- * helper function to customize the process used
- * to collect the core in userspace.  Specifically
- * it sets up a pipe and installs it as fd 0 (stdin)
- * for the process.  Returns 0 on success, or
- * PTR_ERR on failure.
- * Note that it also sets the core limit to 1.  This
- * is a special value that we use to trap recursive
- * core dumps
- */
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+struct pipeprg_data {
+	char **argv;
+	struct coredump_params *cp;
+};
+
+static int fork_callback(void *data)
 {
+	struct pipeprg_data *ppd = (struct pipeprg_data *)data;
 	struct file *files[2];
-	struct coredump_params *cp = (struct coredump_params *)info->data;
-	int err = create_pipe_files(files, 0);
-	if (err)
-		return err;
+	int ret;
+
+	/*
+	 * Sets up a pipe and installs it as fd 0 (stdin)
+	 * for the process.
+	 */
+	ret = create_pipe_files(files, 0);
+	if (ret)
+		do_exit(0);
 
-	cp->file = files[1];
+	ppd->cp->file = files[1];
 
-	err = replace_fd(0, files[0], 0);
+	ret = replace_fd(0, files[0], 0);
 	fput(files[0]);
-	/* and disallow core files too */
+	if (ret < 0)
+		do_exit(0);
+
+	/*
+	 * Sets the core limit to 1.  This
+	 * is a special value that we use to trap recursive
+	 * core dumps
+	 */
 	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
 
-	return err;
+	set_fs(KERNEL_DS);
+	ret = do_execve(getname_kernel(ppd->argv[0]),
+			(const char __user *const __user *)ppd->argv,
+			(const char __user *const __user *)NULL);
+	if (ret) {
+		printk(KERN_WARNING "execute pipe program failed: %s ret=%d\n",
+		       ppd->argv[0], ret);
+		do_exit(0);
+	}
+
+	return ret;
 }
 
 void do_coredump(const siginfo_t *siginfo)
@@ -551,6 +568,8 @@ void do_coredump(const siginfo_t *siginfo)
 		 */
 		.mm_flags = mm->flags,
 	};
+	struct pipeprg_data ppd;
+	pid_t pid;
 
 	audit_core_dumps(siginfo->si_signo);
 
@@ -586,7 +605,6 @@ void do_coredump(const siginfo_t *siginfo)
 	if (ispipe) {
 		int dump_count;
 		char **helper_argv;
-		struct subprocess_info *sub_info;
 
 		if (ispipe < 0) {
 			printk(KERN_WARNING "format_corename failed\n");
@@ -633,19 +651,17 @@ void do_coredump(const siginfo_t *siginfo)
 			goto fail_dropcount;
 		}
 
-		retval = -ENOMEM;
-		sub_info = call_usermodehelper_setup(helper_argv[0],
-						helper_argv, NULL, GFP_KERNEL,
-						umh_pipe_setup, NULL, &cprm);
-		if (sub_info)
-			retval = call_usermodehelper_exec(sub_info,
-							  UMH_WAIT_EXEC);
+		ppd.argv = helper_argv;
+		ppd.cp = &cprm;
 
+		pid = _do_fork(CLONE_VFORK, (unsigned long)fork_callback,
+			       (unsigned long)&ppd, NULL, NULL, 0, 1);
 		argv_free(helper_argv);
-		if (retval) {
+		if (pid < 0) {
 			printk(KERN_INFO "Core dump to |%s pipe failed\n",
 			       cn.corename);
-			goto close_fail;
+			retval = pid;
+			goto fail_dropcount;
 		}
 	} else {
 		struct inode *inode;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a10494a..1647319 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2612,7 +2612,7 @@ extern void mm_release(struct task_struct *, struct mm_struct *);
 
 #ifdef CONFIG_HAVE_COPY_THREAD_TLS
 extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
-			struct task_struct *, unsigned long);
+			struct task_struct *, unsigned long, int);
 #else
 extern int copy_thread(unsigned long, unsigned long, unsigned long,
 			struct task_struct *);
@@ -2644,7 +2644,8 @@ extern int do_execveat(int, struct filename *,
 		       const char __user * const __user *,
 		       const char __user * const __user *,
 		       int);
-extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
+extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *,
+		     int __user *, unsigned long, int);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/kernel/fork.c b/kernel/fork.c
index 2e391c7..643a09b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1245,7 +1245,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 					int __user *child_tidptr,
 					struct pid *pid,
 					int trace,
-					unsigned long tls)
+					unsigned long tls,
+					int return_to_kernel)
 {
 	int retval;
 	struct task_struct *p;
@@ -1451,7 +1452,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	retval = copy_io(clone_flags, p);
 	if (retval)
 		goto bad_fork_cleanup_namespaces;
-	retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
+	retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls,
+				 return_to_kernel);
 	if (retval)
 		goto bad_fork_cleanup_io;
 
@@ -1673,7 +1675,7 @@ static inline void init_idle_pids(struct pid_link *links)
 struct task_struct *fork_idle(int cpu)
 {
 	struct task_struct *task;
-	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
+	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, 0);
 	if (!IS_ERR(task)) {
 		init_idle_pids(task->pids);
 		init_idle(task, cpu);
@@ -1693,7 +1695,8 @@ long _do_fork(unsigned long clone_flags,
 	      unsigned long stack_size,
 	      int __user *parent_tidptr,
 	      int __user *child_tidptr,
-	      unsigned long tls)
+	      unsigned long tls,
+	      int return_to_kernel)
 {
 	struct task_struct *p;
 	int trace = 0;
@@ -1718,7 +1721,7 @@ long _do_fork(unsigned long clone_flags,
 	}
 
 	p = copy_process(clone_flags, stack_start, stack_size,
-			 child_tidptr, NULL, trace, tls);
+			 child_tidptr, NULL, trace, tls, return_to_kernel);
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
@@ -1769,7 +1772,7 @@ long do_fork(unsigned long clone_flags,
 	      int __user *child_tidptr)
 {
 	return _do_fork(clone_flags, stack_start, stack_size,
-			parent_tidptr, child_tidptr, 0);
+			parent_tidptr, child_tidptr, 0, 0);
 }
 #endif
 
@@ -1779,14 +1782,14 @@ long do_fork(unsigned long clone_flags,
 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 {
 	return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
-		(unsigned long)arg, NULL, NULL, 0);
+		(unsigned long)arg, NULL, NULL, 0, 0);
 }
 
 #ifdef __ARCH_WANT_SYS_FORK
 SYSCALL_DEFINE0(fork)
 {
 #ifdef CONFIG_MMU
-	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
+	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0, 0);
 #else
 	/* can not support in nommu mode */
 	return -EINVAL;
@@ -1798,7 +1801,7 @@ SYSCALL_DEFINE0(fork)
 SYSCALL_DEFINE0(vfork)
 {
 	return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
-			0, NULL, NULL, 0);
+			0, NULL, NULL, 0, 0);
 }
 #endif
 
@@ -1826,7 +1829,8 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 		 unsigned long, tls)
 #endif
 {
-	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
+	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr,
+			tls, 0);
 }
 #endif
 
-- 
1.8.5.1





More information about the Containers mailing list