[PATCH 6/7] proc: Introduce the /proc/<pid>/dump file

Pavel Emelyanov xemul at parallels.com
Fri Jul 15 06:47:44 PDT 2011


An image read from this file contains the task's registers and information
about its VM. Later this image can be execve-ed, which recreates the task
state that was previously read.

The file format is my own and very simple; it was introduced to keep the code
as simple as possible. A better file format (if any) is open for discussion.

Signed-off-by: Pavel Emelyanov <xemul at parallels.com>

---
 fs/proc/Kconfig            |    8 +
 fs/proc/Makefile           |    1 +
 fs/proc/base.c             |    3 +
 fs/proc/img_dump.c         |  397 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/binfmt_img.h |   87 ++++++++++
 include/linux/proc_fs.h    |    2 +
 6 files changed, 498 insertions(+), 0 deletions(-)
 create mode 100644 fs/proc/img_dump.c
 create mode 100644 include/linux/binfmt_img.h

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 15af622..c64bf75 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -67,3 +67,11 @@ config PROC_PAGE_MONITOR
 	  /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
 	  /proc/kpagecount, and /proc/kpageflags. Disabling these
           interfaces will reduce the size of the kernel by approximately 4kb.
+
+config PROC_IMG
+	default y
+	depends on PROC_FS
+	bool "Enable /proc/<pid>/dump file"
+	help
+	  Say Y here if you want to be able to produce checkpoint-restore
+	  images of tasks by reading /proc/<pid>/dump.
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index df434c5..3a59cb1 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -27,3 +27,4 @@ proc-$(CONFIG_PROC_VMCORE)	+= vmcore.o
 proc-$(CONFIG_PROC_DEVICETREE)	+= proc_devtree.o
 proc-$(CONFIG_PRINTK)	+= kmsg.o
 proc-$(CONFIG_PROC_PAGE_MONITOR)	+= page.o
+proc-$(CONFIG_PROC_IMG) += img_dump.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 633af12..c01438f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3044,6 +3044,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #endif
 	INF("cmdline",    S_IRUGO, proc_pid_cmdline),
 	ONE("stat",       S_IRUGO, proc_tgid_stat),
+#ifdef CONFIG_PROC_IMG
+	REG("dump",	  S_IRUSR|S_IWUSR, proc_pid_dump_operations),
+#endif
 	ONE("statm",      S_IRUGO, proc_pid_statm),
 	REG("maps",       S_IRUGO, proc_maps_operations),
 #ifdef CONFIG_NUMA
diff --git a/fs/proc/img_dump.c b/fs/proc/img_dump.c
new file mode 100644
index 0000000..7fa52ef
--- /dev/null
+++ b/fs/proc/img_dump.c
@@ -0,0 +1,397 @@
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/binfmt_img.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+#include <linux/types.h>
+#include "internal.h"
+
+/*
+ * Copy the not-yet-consumed tail of one kernel-side record to userspace.
+ *
+ * @ubuf: destination in the reader's buffer
+ * @size: room left in @ubuf
+ * @buf:  start of the kernel-side record
+ * @len:  total length of the record
+ * @pos:  offset inside the record at which this read resumes
+ *
+ * Returns the number of bytes copied (at most @size), -EINVAL if @pos does
+ * not lie inside the record, or -EFAULT if the user buffer is not writable.
+ */
+static int img_dump_buffer(char __user *ubuf, size_t size, void *buf, int len, int pos)
+{
+	/*
+	 * A @pos outside [0, len] would make the remaining length negative,
+	 * and the signed/unsigned comparison below would then turn it into
+	 * a copy from outside of @buf.
+	 */
+	if (pos < 0 || pos > len)
+		return -EINVAL;
+
+	len -= pos;
+	if ((size_t)len > size)
+		len = size;
+
+	if (copy_to_user(ubuf, buf + pos, len))
+		return -EFAULT;
+
+	return len;
+}
+
+/*
+ * Emit the image header (magic + format version), resuming at byte @pos
+ * of the header. Returns what img_dump_buffer() returns.
+ */
+static int img_dump_header(char __user *buf, size_t size, int pos)
+{
+	struct binfmt_img_header img_hdr = {
+		.magic		= BINFMT_IMG_MAGIC,
+		.version	= BINFMT_IMG_VERS_0,
+	};
+
+	return img_dump_buffer(buf, size, &img_hdr, sizeof(img_hdr), pos);
+}
+
+/*
+ * Translate a segment selector into the CKPT_X86_SEG_* encoding used in
+ * the image.
+ *
+ * A zero selector maps to CKPT_X86_SEG_NULL, the well-known user code/data
+ * selectors map to fixed constants, LDT selectors (bit 2 set) map to
+ * CKPT_X86_SEG_LDT | index, and GDT TLS selectors map to
+ * CKPT_X86_SEG_TLS | (index - GDT_ENTRY_TLS_MIN).
+ *
+ * NOTE(review): any other selector, or a non-zero one whose RPL is not 3,
+ * hits BUG()/BUG_ON(); confirm callers can never pass such a value.
+ */
+static __u16 encode_segment(unsigned short seg)
+{
+	unsigned short idx;
+
+	if (seg == 0)
+		return CKPT_X86_SEG_NULL;
+	BUG_ON((seg & 3) != 3);
+
+	if (seg == __USER_CS)
+		return CKPT_X86_SEG_USER64_CS;
+	if (seg == __USER_DS)
+		return CKPT_X86_SEG_USER64_DS;
+#ifdef CONFIG_COMPAT
+	if (seg == __USER32_CS)
+		return CKPT_X86_SEG_USER32_CS;
+	if (seg == __USER32_DS)
+		return CKPT_X86_SEG_USER32_DS;
+#endif
+
+	if (seg & 4)
+		return CKPT_X86_SEG_LDT | (seg >> 3);
+
+	/* keep @seg intact so the diagnostic below shows the real selector */
+	idx = seg >> 3;
+	if (GDT_ENTRY_TLS_MIN <= idx && idx <= GDT_ENTRY_TLS_MAX)
+		return CKPT_X86_SEG_TLS | (idx - GDT_ENTRY_TLS_MIN);
+
+	printk(KERN_ERR "c/r: (encode) bad segment %#hx\n", seg);
+	BUG();
+}
+
+/* Pack a descriptor into one 64-bit word: ->a in the high half, ->b added below it. */
+static __u64 encode_tls(struct desc_struct *d)
+{
+	__u64 packed = (__u64)d->a << 32;
+
+	packed += d->b;
+	return packed;
+}
+
+/*
+ * Emit the register record of task @p, resuming at byte @pos of the record.
+ *
+ * General purpose registers come from the task's saved pt_regs; segment
+ * selectors are stored in the CKPT_X86_SEG_* encoding; the fs/gs base is
+ * stored only when the matching selector index is zero.
+ *
+ * Returns what img_dump_buffer() returns.
+ */
+static int img_dump_regs(struct task_struct *p, char __user *buf, size_t size, int pos)
+{
+	struct binfmt_regs_image regi;
+	struct pt_regs *regs;
+	int i;
+
+	/*
+	 * The whole structure is copied to userspace, including any bytes
+	 * that no assignment below touches, so clear it first to avoid
+	 * leaking kernel stack contents.
+	 */
+	memset(&regi, 0, sizeof(regi));
+
+	regs = task_pt_regs(p);
+
+	regi.r15 = regs->r15;
+	regi.r14 = regs->r14;
+	regi.r13 = regs->r13;
+	regi.r12 = regs->r12;
+	regi.r11 = regs->r11;
+	regi.r10 = regs->r10;
+	regi.r9 = regs->r9;
+	regi.r8 = regs->r8;
+	regi.ax = regs->ax;
+	regi.orig_ax = regs->orig_ax;
+	regi.bx = regs->bx;
+	regi.cx = regs->cx;
+	regi.dx = regs->dx;
+	regi.si = regs->si;
+	regi.di = regs->di;
+	regi.ip = regs->ip;
+	regi.flags = regs->flags;
+	regi.bp = regs->bp;
+	regi.sp = regs->sp;
+
+	/* segments */
+	regi.gsindex = encode_segment(p->thread.gsindex);
+	regi.fsindex = encode_segment(p->thread.fsindex);
+	regi.cs = encode_segment(regs->cs);
+	regi.ss = encode_segment(regs->ss);
+	regi.ds = encode_segment(p->thread.ds);
+	regi.es = encode_segment(p->thread.es);
+
+	BUILD_BUG_ON(GDT_ENTRY_TLS_ENTRIES != CKPT_TLS_ENTRIES);
+	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+		regi.tls[i] = encode_tls(&p->thread.tls_array[i]);
+
+	/* a base is only recorded when the selector index is zero */
+	if (p->thread.gsindex)
+		regi.gs = 0;
+	else
+		regi.gs = p->thread.gs;
+
+	if (p->thread.fsindex)
+		regi.fs = 0;
+	else
+		regi.fs = p->thread.fs;
+
+	return img_dump_buffer(buf, size, &regi, sizeof(regi), pos);
+}
+
+/*
+ * Emit the mm record (flags and the code/data/brk/stack/arg/env layout
+ * markers of @mm), resuming at byte @pos of the record.
+ *
+ * Returns what img_dump_buffer() returns.
+ */
+static int img_dump_mm(struct mm_struct *mm, char __user *buf, size_t size, int pos)
+{
+	struct binfmt_mm_image mmi;
+
+	/*
+	 * The record is copied to userspace as a whole; clear it so that
+	 * bytes not covered by a field assignment do not expose kernel
+	 * stack contents.
+	 */
+	memset(&mmi, 0, sizeof(mmi));
+
+	mmi.flags = mm->flags;
+	mmi.def_flags = mm->def_flags;
+	mmi.start_code = mm->start_code;
+	mmi.end_code = mm->end_code;
+	mmi.start_data = mm->start_data;
+	mmi.end_data = mm->end_data;
+	mmi.start_brk = mm->start_brk;
+	mmi.brk = mm->brk;
+	mmi.start_stack = mm->start_stack;
+	mmi.arg_start = mm->arg_start;
+	mmi.arg_end = mm->arg_end;
+	mmi.env_start = mm->env_start;
+	mmi.env_end = mm->env_end;
+	mmi.exe_fd = 0;
+
+	return img_dump_buffer(buf, size, &mmi, sizeof(mmi), pos);
+}
+
+/*
+ * Emit one VMA record, resuming at byte @pos of the record.
+ *
+ * A NULL @vma produces an all-zero record, which terminates the VMA list
+ * in the image. For a real VMA the protection bits are derived from
+ * vm_flags and the mapping flags from vm_file / VM_MAYSHARE.
+ *
+ * Returns what img_dump_buffer() returns.
+ */
+static int img_dump_vma(struct vm_area_struct *vma, char __user *buf, size_t size, int pos)
+{
+	struct binfmt_vma_image vmai;
+
+	/*
+	 * Always start from a zeroed record: it is the terminator for a NULL
+	 * @vma, and for a real one it keeps the never-assigned ->pad field
+	 * from carrying kernel stack bytes to userspace. ->fd, ->prot and
+	 * ->flags therefore start out as 0 as well.
+	 */
+	memset(&vmai, 0, sizeof(vmai));
+	if (vma == NULL)
+		goto dumpit;
+
+	printk("Dumping vma %016lx-%016lx %p/%p\n", vma->vm_start, vma->vm_end, vma, vma->vm_mm);
+
+	if (vma->vm_flags & VM_READ)
+		vmai.prot |= PROT_READ;
+	if (vma->vm_flags & VM_WRITE)
+		vmai.prot |= PROT_WRITE;
+	if (vma->vm_flags & VM_EXEC)
+		vmai.prot |= PROT_EXEC;
+
+	if (vma->vm_file == NULL)
+		vmai.flags |= MAP_ANONYMOUS;
+	if (vma->vm_flags & VM_MAYSHARE)
+		vmai.flags |= MAP_SHARED;
+	else
+		vmai.flags |= MAP_PRIVATE;
+
+	vmai.start = vma->vm_start;
+	vmai.end = vma->vm_end;
+	vmai.pgoff = vma->vm_pgoff;
+
+dumpit:
+	return img_dump_buffer(buf, size, &vmai, sizeof(vmai), pos);
+}
+
+/*
+ * Emit one page entry: a binfmt_page_image carrying @addr followed by
+ * PAGE_SIZE bytes taken from @data. @pos is the offset inside this
+ * combined entry at which the read resumes.
+ *
+ * Returns the total number of bytes copied, or a negative error from
+ * img_dump_buffer().
+ */
+static int img_dump_page(unsigned long addr, void *data, char __user *buf, size_t size, int pos)
+{
+	struct binfmt_page_image entry_hdr = { .vaddr = addr };
+	int hdr_bytes = 0, data_bytes;
+
+	if (pos >= sizeof(entry_hdr)) {
+		/* the entry header was consumed by an earlier read */
+		pos -= sizeof(entry_hdr);
+	} else {
+		hdr_bytes = img_dump_buffer(buf, size, &entry_hdr, sizeof(entry_hdr), pos);
+		if (hdr_bytes < 0)
+			return hdr_bytes;
+
+		/* user buffer exhausted by the header alone */
+		if (size <= hdr_bytes)
+			return hdr_bytes;
+
+		buf += hdr_bytes;
+		size -= hdr_bytes;
+		pos = 0;
+	}
+
+	data_bytes = img_dump_buffer(buf, size, data, PAGE_SIZE, pos);
+	if (data_bytes < 0)
+		return data_bytes;
+
+	return hdr_bytes + data_bytes;
+}
+
+/* Returns 1 for a VMA with no backing file or without VM_SHARED, 0 otherwise. */
+static inline int is_private_vma(struct vm_area_struct *vma)
+{
+	return vma->vm_file == NULL || !(vma->vm_flags & VM_SHARED);
+}
+
+/*
+ * Produce the slice of task @p's image that starts at *@ppos and is at
+ * most @size bytes long, writing it to the user buffer @buf.
+ *
+ * The image is generated on the fly on every call, in this order:
+ *   header, registers, mm record, one record per VMA followed by an
+ *   all-zero VMA record, one entry (binfmt_page_image + PAGE_SIZE data)
+ *   per present page of every private VMA, and finally an all-zero
+ *   binfmt_page_image.
+ *
+ * img_pos tracks the image offset just past the record being considered
+ * and img_ppos the offset of its start; a record is emitted only when the
+ * current read position lies before img_pos.
+ *
+ * Returns the number of bytes produced and advances *@ppos on success.
+ * On failure returns a negative errno and leaves *@ppos untouched.
+ */
+static ssize_t do_produce_dump(struct task_struct *p, char __user *buf,
+		size_t size, loff_t *ppos)
+{
+	size_t img_pos = 0, img_ppos;
+	size_t produced = 0;
+	int len;
+	loff_t pos = *ppos;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+
+/* account for @len bytes just handed to the reader */
+#define move_pos()	do {	\
+		buf += len;	\
+		produced += len;\
+		size -= len;	\
+		pos += len;	\
+	} while (0)
+
+/* step the image cursor over a record of @__size bytes */
+#define seek_pos(__size)	do {	\
+		img_ppos = img_pos;	\
+		img_pos += (__size);	\
+	} while (0)
+
+	/* header */
+	seek_pos(sizeof(struct binfmt_img_header));
+	if (pos < img_pos) {
+		len = img_dump_header(buf, size, pos - img_ppos);
+		if (len < 0)
+			goto err;
+
+		move_pos();
+		if (size == 0)
+			goto out;
+	}
+
+	/* registers */
+	seek_pos(sizeof(struct binfmt_regs_image));
+	if (pos < img_pos) {
+		len = img_dump_regs(p, buf, size, pos - img_ppos);
+		if (len < 0)
+			goto err;
+
+		move_pos();
+		if (size == 0)
+			goto out;
+	}
+
+	/* memory */
+	mm = get_task_mm(p);
+	if (mm == NULL)
+		return -EACCES;
+
+	down_read(&mm->mmap_sem);
+
+	seek_pos(sizeof(struct binfmt_mm_image));
+	if (pos < img_pos) {
+		len = img_dump_mm(mm, buf, size, pos - img_ppos);
+		if (len < 0)
+			goto err_mm;
+
+		move_pos();
+		if (size == 0)
+			goto out_mm;
+	}
+
+	/* VMA records; the final pass with vma == NULL emits the terminator */
+	vma = mm->mmap;
+	while (1) {
+		seek_pos(sizeof(struct binfmt_vma_image));
+		if (pos < img_pos) {
+			len = img_dump_vma(vma, buf, size, pos - img_ppos);
+			if (len < 0)
+				goto err_mm;
+
+			move_pos();
+			if (size == 0)
+				goto out_mm;
+		}
+
+		if (vma == NULL)
+			break;
+
+		vma = vma->vm_next;
+	}
+
+	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+		/* slow and stupid */
+		unsigned long addr;
+		struct page *page;
+		void *pg_data;
+
+		if (!is_private_vma(vma))
+			continue;
+
+		for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+			page = follow_page(vma, addr, FOLL_FORCE | FOLL_DUMP | FOLL_GET);
+			if (page == NULL)
+				continue;
+			if (IS_ERR(page)) /* huh? */
+				continue;
+
+			seek_pos(sizeof(struct binfmt_page_image) + PAGE_SIZE);
+			if (pos < img_pos) {
+				pg_data = kmap(page);
+				len = img_dump_page(addr, pg_data, buf, size, pos - img_ppos);
+				kunmap(page);
+
+				if (len < 0) {
+					put_page(page);
+					goto err_mm;
+				}
+
+				move_pos();
+				if (size == 0) {
+					put_page(page);
+					goto out_mm;
+				}
+			}
+
+			/* drop the reference taken by FOLL_GET */
+			put_page(page);
+		}
+	}
+
+	/* all-zero page record terminates the page list */
+	seek_pos(sizeof(struct binfmt_page_image));
+	if (pos < img_pos) {
+		struct binfmt_page_image zero;
+
+		memset(&zero, 0, sizeof(zero));
+		len = img_dump_buffer(buf, size, &zero, sizeof(zero), pos - img_ppos);
+		if (len < 0)
+			goto err_mm;	/* mmap_sem and the mm reference are still held */
+
+		move_pos();
+	}
+
+out_mm:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out:
+	*ppos = pos;
+	return produced;
+
+err_mm:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+err:
+	return len;
+}
+#undef move_pos
+#undef seek_pos
+
+/*
+ * read() handler for /proc/<pid>/dump.
+ *
+ * Looks up the task behind the proc inode and, provided it is stopped,
+ * hands the request to do_produce_dump().
+ *
+ * Returns the number of bytes produced, -ESRCH if the task is gone,
+ * -EINVAL if the task is not in TASK_STOPPED, or the error returned by
+ * do_produce_dump().
+ */
+static ssize_t img_dump_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
+{
+	struct task_struct *p;
+	ssize_t ret;
+
+	p = get_proc_task(file->f_dentry->d_inode);
+	if (p == NULL)
+		return -ESRCH;
+
+	if (!(p->state & TASK_STOPPED)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = do_produce_dump(p, buf, size, ppos);
+out:
+	/* get_proc_task() took a reference; drop it on every path */
+	put_task_struct(p);
+	return ret;
+}
+
+/*
+ * File operations for /proc/<pid>/dump.
+ *
+ * No per-open state is kept, so no ->open/->release hooks are installed:
+ * the VFS treats a missing ->open as success and skips a missing
+ * ->release, which is what the former empty stubs amounted to.
+ */
+const struct file_operations proc_pid_dump_operations = {
+	.read		= img_dump_read,
+};
diff --git a/include/linux/binfmt_img.h b/include/linux/binfmt_img.h
new file mode 100644
index 0000000..a4293af
--- /dev/null
+++ b/include/linux/binfmt_img.h
@@ -0,0 +1,87 @@
+#ifndef __BINFMT_IMG_H__
+#define __BINFMT_IMG_H__
+
+#include <linux/types.h>
+
+struct binfmt_img_header {	/* first record of a dump image */
+	__u32	magic;		/* written as BINFMT_IMG_MAGIC */
+	__u32	version;	/* written as BINFMT_IMG_VERS_0 */
+};
+
+#define CKPT_TLS_ENTRIES	3	/* must equal GDT_ENTRY_TLS_ENTRIES; img_dump_regs() enforces it with BUILD_BUG_ON */
+
+/*
+ * Register record of a dump image; follows the header.
+ *
+ * The segment fields hold CKPT_X86_SEG_* encoded selectors, not raw
+ * selector values. The trailing pad makes the tail padding (six __u16
+ * fields after 64-bit members) an explicit part of the format instead of
+ * leaving it to the compiler.
+ */
+struct binfmt_regs_image {
+	__u64 r15;
+	__u64 r14;
+	__u64 r13;
+	__u64 r12;
+	__u64 r11;
+	__u64 r10;
+	__u64 r9;
+	__u64 r8;
+	__u64 ax;
+	__u64 orig_ax;
+	__u64 bx;
+	__u64 cx;
+	__u64 dx;
+	__u64 si;
+	__u64 di;
+	__u64 ip;
+	__u64 flags;
+	__u64 bp;
+	__u64 sp;
+
+	__u64 gs;
+	__u64 fs;
+	__u64 tls[CKPT_TLS_ENTRIES];
+	__u16 gsindex;
+	__u16 fsindex;
+	__u16 cs;
+	__u16 ss;
+	__u16 ds;
+	__u16 es;
+	__u32 pad;	/* explicit tail padding up to a multiple of 8 bytes */
+};
+
+#define CKPT_X86_SEG_NULL       0	/* selector was 0 */
+#define CKPT_X86_SEG_USER32_CS  1	/* selector was __USER32_CS */
+#define CKPT_X86_SEG_USER32_DS  2	/* selector was __USER32_DS */
+#define CKPT_X86_SEG_USER64_CS  3	/* selector was __USER_CS */
+#define CKPT_X86_SEG_USER64_DS  4	/* selector was __USER_DS */
+#define CKPT_X86_SEG_TLS        0x4000	/* OR-ed with the GDT index relative to GDT_ENTRY_TLS_MIN */
+#define CKPT_X86_SEG_LDT        0x8000	/* OR-ed with (selector >> 3) */
+
+/*
+ * Memory descriptor record of a dump image; follows the register record.
+ * Fields mirror the same-named members of the dumped mm. The trailing pad
+ * makes the tail padding after the 32-bit exe_fd an explicit part of the
+ * format.
+ */
+struct binfmt_mm_image {
+	__u64	flags;
+	__u64	def_flags;
+	__u64	start_code;
+	__u64	end_code;
+	__u64	start_data;
+	__u64	end_data;
+	__u64	start_brk;
+	__u64	brk;
+	__u64	start_stack;
+	__u64	arg_start;
+	__u64	arg_end;
+	__u64	env_start;
+	__u64	env_end;
+	__u32	exe_fd;		/* the dumper always writes 0 here */
+	__u32	pad;		/* explicit tail padding up to a multiple of 8 bytes */
+};
+
+struct binfmt_vma_image {	/* one record per VMA; an all-zero record ends the list */
+	__u32	prot;	/* PROT_READ/WRITE/EXEC derived from vm_flags */
+	__u32	flags;	/* MAP_ANONYMOUS plus MAP_SHARED or MAP_PRIVATE */
+	__u32	pad;	/* filler; the dumper stores nothing meaningful here */
+	__u32	fd;	/* the dumper always writes 0 here */
+	__u64	start;	/* vm_start */
+	__u64	end;	/* vm_end */
+	__u64	pgoff;	/* vm_pgoff */
+};
+
+struct binfmt_page_image {	/* precedes PAGE_SIZE bytes of page data; an all-zero record ends the page list */
+	__u64	vaddr;	/* virtual address of the dumped page */
+};
+
+#define BINFMT_IMG_MAGIC	0xa75b8d43	/* stored in binfmt_img_header.magic */
+#define BINFMT_IMG_VERS_0	0x00000100	/* stored in binfmt_img_header.version */
+
+#endif
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index c779c74..686b374 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -102,6 +102,8 @@ struct vmcore {
 
 #ifdef CONFIG_PROC_FS
 
+extern const struct file_operations proc_pid_dump_operations;
+
 extern void proc_root_init(void);
 
 void proc_flush_task(struct task_struct *task);
-- 
1.5.5.6


More information about the Containers mailing list