[PATCH 6/7] proc: Introduce the /proc/<pid>/dump file

Kirill A. Shutemov kirill at shutemov.name
Sat Jul 16 15:57:09 PDT 2011


On Fri, Jul 15, 2011 at 05:47:44PM +0400, Pavel Emelyanov wrote:
> An image read from file contains task's registers and information
> about its VM. Later this image can be execve-ed causing recreation
> of the previously read task state.
> 
> The file format is my own, very simple. Introduced to make the code
> as simple as possible. Better file format (if any) is to be discussed.

I think the file format should be per-binfmt, similar to a core dump, so for an
ELF binary the image would be ELF. The core dumper code could be reused in some way.

> Signed-off-by: Pavel Emelyanov <xemul at parallels.com>
> 
> ---
>  fs/proc/Kconfig            |    8 +
>  fs/proc/Makefile           |    1 +
>  fs/proc/base.c             |    3 +
>  fs/proc/img_dump.c         |  397 ++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/binfmt_img.h |   87 ++++++++++
>  include/linux/proc_fs.h    |    2 +
>  6 files changed, 498 insertions(+), 0 deletions(-)
>  create mode 100644 fs/proc/img_dump.c
>  create mode 100644 include/linux/binfmt_img.h
> 
> diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
> index 15af622..c64bf75 100644
> --- a/fs/proc/Kconfig
> +++ b/fs/proc/Kconfig
> @@ -67,3 +67,11 @@ config PROC_PAGE_MONITOR
>  	  /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
>  	  /proc/kpagecount, and /proc/kpageflags. Disabling these
>            interfaces will reduce the size of the kernel by approximately 4kb.
> +
> +config PROC_IMG
> +	default y
> +	depends on PROC_FS

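Shouldn't this also depend on X86_64? The dump code below reads the x86-64
registers (r8-r15) and segment selectors directly.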

> +	bool "Enable /proc/<pid>/dump file"
> +	help
> +	  Say Y here if you want to be able to produce checkpoint-restore images
> +	  for tasks via proc
> diff --git a/fs/proc/Makefile b/fs/proc/Makefile
> index df434c5..3a59cb1 100644
> --- a/fs/proc/Makefile
> +++ b/fs/proc/Makefile
> @@ -27,3 +27,4 @@ proc-$(CONFIG_PROC_VMCORE)	+= vmcore.o
>  proc-$(CONFIG_PROC_DEVICETREE)	+= proc_devtree.o
>  proc-$(CONFIG_PRINTK)	+= kmsg.o
>  proc-$(CONFIG_PROC_PAGE_MONITOR)	+= page.o
> +proc-$(CONFIG_PROC_IMG) += img_dump.o
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 633af12..c01438f 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -3044,6 +3044,9 @@ static const struct pid_entry tgid_base_stuff[] = {
>  #endif
>  	INF("cmdline",    S_IRUGO, proc_pid_cmdline),
>  	ONE("stat",       S_IRUGO, proc_tgid_stat),
> +#ifdef CONFIG_PROC_IMG
> +	REG("dump",	  S_IRUSR|S_IWUSR, proc_pid_dump_operations),
> +#endif

Why is this writable (S_IWUSR)? proc_pid_dump_operations below only provides
.read, with no .write handler.

>  	ONE("statm",      S_IRUGO, proc_pid_statm),
>  	REG("maps",       S_IRUGO, proc_maps_operations),
>  #ifdef CONFIG_NUMA
> diff --git a/fs/proc/img_dump.c b/fs/proc/img_dump.c
> new file mode 100644
> index 0000000..7fa52ef
> --- /dev/null
> +++ b/fs/proc/img_dump.c
> @@ -0,0 +1,397 @@
> +#include <linux/proc_fs.h>
> +#include <linux/sched.h>
> +#include <linux/uaccess.h>
> +#include <linux/binfmt_img.h>
> +#include <linux/mm.h>
> +#include <linux/mman.h>
> +#include <linux/highmem.h>
> +#include <linux/types.h>
> +#include "internal.h"
> +
> +static int img_dump_buffer(char __user *ubuf, size_t size, void *buf, int len, int pos)
> +{
> +	int ret;
> +	static size_t dumped = 0;
> +
> +	len -= pos;
> +	if (len > size)
> +		len = size;
> +
> +	ret = copy_to_user(ubuf, buf + pos, len);
> +	if (ret)
> +		return -EFAULT;
> +
> +	dumped += len;
> +	return len;
> +}
> +
> +static int img_dump_header(char __user *buf, size_t size, int pos)
> +{
> +	struct binfmt_img_header hdr;
> +
> +	hdr.magic = BINFMT_IMG_MAGIC;
> +	hdr.version = BINFMT_IMG_VERS_0;
> +
> +	return img_dump_buffer(buf, size, &hdr, sizeof(hdr), pos);
> +}
> +
> +static __u16 encode_segment(unsigned short seg)
> +{
> +	if (seg == 0)
> +		return CKPT_X86_SEG_NULL;
> +	BUG_ON((seg & 3) != 3);
> +
> +	if (seg == __USER_CS)
> +		return CKPT_X86_SEG_USER64_CS;
> +	if (seg == __USER_DS)
> +		return CKPT_X86_SEG_USER64_DS;
> +#ifdef CONFIG_COMPAT
> +	if (seg == __USER32_CS)
> +		return CKPT_X86_SEG_USER32_CS;
> +	if (seg == __USER32_DS)
> +		return CKPT_X86_SEG_USER32_DS;
> +#endif
> +
> +	if (seg & 4)
> +		return CKPT_X86_SEG_LDT | (seg >> 3);
> +
> +	seg >>= 3;
> +	if (GDT_ENTRY_TLS_MIN <= seg && seg <= GDT_ENTRY_TLS_MAX)
> +		return CKPT_X86_SEG_TLS | (seg - GDT_ENTRY_TLS_MIN);
> +
> +	printk(KERN_ERR "c/r: (decode) bad segment %#hx\n", seg);
> +	BUG();
> +}
> +
> +static __u64 encode_tls(struct desc_struct *d)
> +{
> +	return ((__u64)d->a << 32) + d->b;
> +}
> +
> +static int img_dump_regs(struct task_struct *p, char __user *buf, size_t size, int pos)
> +{
> +	struct binfmt_regs_image regi;
> +	struct pt_regs *regs;
> +	int i;
> +
> +	regs = task_pt_regs(p);
> +
> +	regi.r15 = regs->r15;
> +	regi.r14 = regs->r14;
> +	regi.r13 = regs->r13;
> +	regi.r12 = regs->r12;
> +	regi.r11 = regs->r11;
> +	regi.r10 = regs->r10;
> +	regi.r9 = regs->r9;
> +	regi.r8 = regs->r8;
> +	regi.ax = regs->ax;
> +	regi.orig_ax = regs->orig_ax;
> +	regi.bx = regs->bx;
> +	regi.cx = regs->cx;
> +	regi.dx = regs->dx;
> +	regi.si = regs->si;
> +	regi.di = regs->di;
> +	regi.ip = regs->ip;
> +	regi.flags = regs->flags;
> +	regi.bp = regs->bp;
> +	regi.sp = regs->sp;
> +
> +	/* segments */
> +	regi.gsindex = encode_segment(p->thread.gsindex);
> +	regi.fsindex = encode_segment(p->thread.fsindex);
> +	regi.cs = encode_segment(regs->cs);
> +	regi.ss = encode_segment(regs->ss);
> +	regi.ds = encode_segment(p->thread.ds);
> +	regi.es = encode_segment(p->thread.es);
> +
> +	BUILD_BUG_ON(GDT_ENTRY_TLS_ENTRIES != CKPT_TLS_ENTRIES);
> +	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
> +		regi.tls[i] = encode_tls(&p->thread.tls_array[i]);
> +
> +	if (p->thread.gsindex)
> +		regi.gs = 0;
> +	else
> +		regi.gs = p->thread.gs;
> +
> +	if (p->thread.fsindex)
> +		regi.fs = 0;
> +	else
> +		regi.fs = p->thread.fs;
> +
> +	return img_dump_buffer(buf, size, &regi, sizeof(regi), pos);
> +}
> +
> +static int img_dump_mm(struct mm_struct *mm, char __user *buf, size_t size, int pos)
> +{
> +	struct binfmt_mm_image mmi;
> +
> +	mmi.flags = mm->flags;
> +	mmi.def_flags = mm->def_flags;
> +	mmi.start_code = mm->start_code;
> +	mmi.end_code = mm->end_code;
> +	mmi.start_data = mm->start_data;
> +	mmi.end_data = mm->end_data;
> +	mmi.start_brk = mm->start_brk;
> +	mmi.brk = mm->brk;
> +	mmi.start_stack = mm->start_stack;
> +	mmi.arg_start = mm->arg_start;
> +	mmi.arg_end = mm->arg_end;
> +	mmi.env_start = mm->env_start;
> +	mmi.env_end = mm->env_end;
> +	mmi.exe_fd = 0;
> +
> +	return img_dump_buffer(buf, size, &mmi, sizeof(mmi), pos);
> +}
> +
> +static int img_dump_vma(struct vm_area_struct *vma, char __user *buf, size_t size, int pos)
> +{
> +	struct binfmt_vma_image vmai;
> +
> +	if (vma == NULL) {
> +		memset(&vmai, 0, sizeof(vmai));
> +		goto dumpit;
> +	}
> +
> +	printk("Dumping vma %016lx-%016lx %p/%p\n", vma->vm_start, vma->vm_end, vma, vma->vm_mm);
> +
> +	vmai.fd = 0;
> +	vmai.prot = 0;
> +	if (vma->vm_flags & VM_READ)
> +		vmai.prot |= PROT_READ;
> +	if (vma->vm_flags & VM_WRITE)
> +		vmai.prot |= PROT_WRITE;
> +	if (vma->vm_flags & VM_EXEC)
> +		vmai.prot |= PROT_EXEC;
> +
> +	vmai.flags = 0;
> +	if (vma->vm_file == NULL)
> +		vmai.flags |= MAP_ANONYMOUS;
> +	if (vma->vm_flags & VM_MAYSHARE)
> +		vmai.flags |= MAP_SHARED;
> +	else
> +		vmai.flags |= MAP_PRIVATE;
> +
> +	vmai.start = vma->vm_start;
> +	vmai.end = vma->vm_end;
> +	vmai.pgoff = vma->vm_pgoff;
> +
> +dumpit:
> +	return img_dump_buffer(buf, size, &vmai, sizeof(vmai), pos);
> +}
> +
> +static int img_dump_page(unsigned long addr, void *data, char __user *buf, size_t size, int pos)
> +{
> +	struct binfmt_page_image pgi;
> +	int ret = 0, tmp;
> +
> +	pgi.vaddr = addr;
> +
> +	if (pos < sizeof(pgi)) {
> +		tmp = img_dump_buffer(buf, size, &pgi, sizeof(pgi), pos);
> +		if (tmp < 0)
> +			return tmp;
> +
> +		ret = tmp;
> +		if (size <= ret)
> +			return ret;
> +
> +		buf += ret;
> +		size -= ret;
> +		pos = 0;
> +	} else
> +		pos -= sizeof(pgi);
> +
> +	tmp = img_dump_buffer(buf, size, data, PAGE_SIZE, pos);
> +	if (tmp < 0)
> +		return tmp;
> +
> +	return ret + tmp;
> +}
> +
> +static inline int is_private_vma(struct vm_area_struct *vma)
> +{
> +	if (vma->vm_file == NULL)
> +		return 1;
> +	if (!(vma->vm_flags & VM_SHARED))
> +		return 1;
> +	return 0;
> +}
> +
> +static ssize_t do_produce_dump(struct task_struct *p, char __user *buf,
> +		size_t size, loff_t *ppos)
> +{
> +	size_t img_pos = 0, img_ppos;
> +	size_t produced = 0;
> +	int len;
> +	loff_t pos = *ppos;
> +	struct mm_struct *mm;
> +	struct vm_area_struct *vma;
> +
> +#define move_pos();	do {	\
> +		buf += len;	\
> +		produced += len;\
> +		size -= len;	\
> +		pos += len;	\
> +	} while (0)
> +
> +#define seek_pos(__size);	do {	\
> +		img_ppos = img_pos;	\
> +		img_pos += (__size);	\
> +	} while (0)
> +
> +	/* header */
> +	seek_pos(sizeof(struct binfmt_img_header));
> +	if (pos < img_pos) {
> +		len = img_dump_header(buf, size, pos - img_ppos);
> +		if (len < 0)
> +			goto err;
> +
> +		move_pos();
> +		if (size == 0)
> +			goto out;
> +	}
> +
> +	/* registers */
> +	seek_pos(sizeof(struct binfmt_regs_image));
> +	if (pos < img_pos) {
> +		len = img_dump_regs(p, buf, size, pos - img_ppos);
> +		if (len < 0)
> +			goto err;
> +
> +		move_pos();
> +		if (size == 0)
> +			goto out;
> +	}
> +
> +	/* memory */
> +	mm = get_task_mm(p);
> +	if (mm == NULL)
> +		return -EACCES;
> +
> +	down_read(&mm->mmap_sem);
> +
> +	seek_pos(sizeof(struct binfmt_mm_image));
> +	if (pos < img_pos) {
> +		len = img_dump_mm(mm, buf, size, pos - img_ppos);
> +		if (len < 0)
> +			goto err_mm;
> +
> +		move_pos();
> +		if (size == 0)
> +			goto out_mm;
> +	}
> +
> +	vma = mm->mmap;
> +	while (1) {
> +		seek_pos(sizeof(struct binfmt_vma_image));
> +		if (pos < img_pos) {
> +			len = img_dump_vma(vma, buf, size, pos - img_ppos);
> +			if (len < 0)
> +				goto err_mm;
> +
> +			move_pos();
> +			if (size == 0)
> +				goto out_mm;
> +		}
> +
> +		if (vma == NULL)
> +			break;
> +
> +		vma = vma->vm_next;
> +	}
> +
> +	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
> +		/* slow and stupid */
> +		unsigned long addr;
> +		struct page *page;
> +		void *pg_data;
> +
> +		if (!is_private_vma(vma))
> +			continue;
> +
> +		for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
> +			page = follow_page(vma, addr, FOLL_FORCE | FOLL_DUMP | FOLL_GET);
> +			if (page == NULL)
> +				continue;
> +			if (IS_ERR(page)) /* huh? */
> +				continue;
> +
> +			seek_pos(sizeof(struct binfmt_page_image) + PAGE_SIZE);
> +			if (pos < img_pos) {
> +				pg_data = kmap(page);
> +				len = img_dump_page(addr, pg_data, buf, size, pos - img_ppos);
> +				kunmap(page);
> +
> +				if (len < 0) {
> +					put_page(page);
> +					goto err_mm;
> +				}
> +
> +				move_pos();
> +				if (size == 0) {
> +					put_page(page);
> +					goto out_mm;
> +				}
> +			}
> +
> +			put_page(page);
> +		}
> +	}
> +
> +	seek_pos(sizeof(struct binfmt_page_image));
> +	if (pos < img_pos) {
> +		struct binfmt_page_image zero;
> +
> +		memset(&zero, 0, sizeof(zero));
> +		len = img_dump_buffer(buf, size, &zero, sizeof(zero), pos - img_ppos);
> +		if (len < 0)
> +			goto err;
> +
> +		move_pos();
> +	}
> +
> +out_mm:
> +	up_read(&mm->mmap_sem);
> +	mmput(mm);
> +out:
> +	*ppos = pos;
> +	return produced;
> +
> +err_mm:
> +	up_read(&mm->mmap_sem);
> +	mmput(mm);
> +err:
> +	return len;
> +}
> +
> +static ssize_t img_dump_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
> +{
> +	struct task_struct *p;
> +
> +	p = get_proc_task(file->f_dentry->d_inode);
> +	if (p == NULL)
> +		return -ESRCH;
> +
> +	if (!(p->state & TASK_STOPPED)) {
> +		put_task_struct(p);
> +		return -EINVAL;
> +	}
> +
> +	return do_produce_dump(p, buf, size, ppos);
> +}
> +
> +static int img_dump_open(struct inode *inode, struct file *filp)
> +{
> +	return 0;
> +}
> +
> +static int img_dump_release(struct inode *inode, struct file *filp)
> +{
> +	return 0;
> +}
> +
> +const struct file_operations proc_pid_dump_operations = {
> +	.open		= img_dump_open,
> +	.read		= img_dump_read,
> +	.release	= img_dump_release,
> +};
> diff --git a/include/linux/binfmt_img.h b/include/linux/binfmt_img.h
> new file mode 100644
> index 0000000..a4293af
> --- /dev/null
> +++ b/include/linux/binfmt_img.h
> @@ -0,0 +1,87 @@
> +#ifndef __BINFMT_IMG_H__
> +#define __BINFMT_IMG_H__
> +
> +#include <linux/types.h>
> +
> +struct binfmt_img_header {
> +	__u32	magic;
> +	__u32	version;
> +};
> +
> +#define CKPT_TLS_ENTRIES	3
> +
> +struct binfmt_regs_image {
> +	__u64 r15;
> +	__u64 r14;
> +	__u64 r13;
> +	__u64 r12;
> +	__u64 r11;
> +	__u64 r10;
> +	__u64 r9;
> +	__u64 r8;
> +	__u64 ax;
> +	__u64 orig_ax;
> +	__u64 bx;
> +	__u64 cx;
> +	__u64 dx;
> +	__u64 si;
> +	__u64 di;
> +	__u64 ip;
> +	__u64 flags;
> +	__u64 bp;
> +	__u64 sp;
> +
> +	__u64 gs;
> +	__u64 fs;
> +	__u64 tls[CKPT_TLS_ENTRIES];
> +	__u16 gsindex;
> +	__u16 fsindex;
> +	__u16 cs;
> +	__u16 ss;
> +	__u16 ds;
> +	__u16 es;
> +};
> +
> +#define CKPT_X86_SEG_NULL       0
> +#define CKPT_X86_SEG_USER32_CS  1
> +#define CKPT_X86_SEG_USER32_DS  2
> +#define CKPT_X86_SEG_USER64_CS  3
> +#define CKPT_X86_SEG_USER64_DS  4
> +#define CKPT_X86_SEG_TLS        0x4000
> +#define CKPT_X86_SEG_LDT        0x8000
> +
> +struct binfmt_mm_image {
> +	__u64	flags;
> +	__u64	def_flags;
> +	__u64	start_code;
> +	__u64	end_code;
> +	__u64	start_data;
> +	__u64	end_data;
> +	__u64	start_brk;
> +	__u64	brk;
> +	__u64	start_stack;
> +	__u64	arg_start;
> +	__u64	arg_end;
> +	__u64	env_start;
> +	__u64	env_end;
> +	__u32	exe_fd;
> +};
> +
> +struct binfmt_vma_image {
> +	__u32	prot;
> +	__u32	flags;
> +	__u32	pad;
> +	__u32	fd;
> +	__u64	start;
> +	__u64	end;
> +	__u64	pgoff;
> +};
> +
> +struct binfmt_page_image {
> +	__u64	vaddr;
> +};
> +
> +#define BINFMT_IMG_MAGIC	0xa75b8d43
> +#define BINFMT_IMG_VERS_0	0x00000100
> +
> +#endif
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index c779c74..686b374 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -102,6 +102,8 @@ struct vmcore {
>  
>  #ifdef CONFIG_PROC_FS
>  
> +extern const struct file_operations proc_pid_dump_operations;
> +
>  extern void proc_root_init(void);
>  
>  void proc_flush_task(struct task_struct *task);
> -- 
> 1.5.5.6
> _______________________________________________
> Containers mailing list
> Containers at lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

-- 
 Kirill A. Shutemov


More information about the Containers mailing list