[PATCH 2/2] c/r: x86-64: checkpoint/restart implementation

Oren Laadan orenl at cs.columbia.edu
Sun Dec 6 12:31:09 PST 2009


Support for checkpoint and restart for X86_64 architecture.
Partly based on Alexey's work.

 Checkpoint          Restart
 (app/arch)         (app/arch)
--------------------------------
  64/x86-64	->  64/x86-64	  works
  32/x86-64	->  32/x86-64	  ?
  32/x86-64	->  32/x86-32	  ?
  32/x86-32	->  32/x86-64	  ?

Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
---
 arch/x86/Kconfig                      |    2 +-
 arch/x86/include/asm/checkpoint_hdr.h |    6 +
 arch/x86/include/asm/syscalls.h       |    6 +
 arch/x86/include/asm/unistd_64.h      |    4 +
 arch/x86/kernel/Makefile              |    2 +
 arch/x86/kernel/checkpoint_64.c       |  251 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/entry_64.S            |    5 +
 include/linux/checkpoint_hdr.h        |    2 +
 8 files changed, 277 insertions(+), 1 deletions(-)
 create mode 100644 arch/x86/kernel/checkpoint_64.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 69d6077..f6260f5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -88,7 +88,7 @@ config HAVE_LATENCYTOP_SUPPORT
 
 config CHECKPOINT_SUPPORT
 	bool
-	default y if X86_32
+	default y
 
 config MMU
 	def_bool y
diff --git a/arch/x86/include/asm/checkpoint_hdr.h b/arch/x86/include/asm/checkpoint_hdr.h
index 65511ca..0033bfe 100644
--- a/arch/x86/include/asm/checkpoint_hdr.h
+++ b/arch/x86/include/asm/checkpoint_hdr.h
@@ -36,6 +36,10 @@
 #include <asm/processor.h>
 #endif
 
+#ifdef CONFIG_X86_64
+#define CKPT_ARCH_ID	CKPT_ARCH_X86_64
+#endif
+
 #ifdef CONFIG_X86_32
 #define CKPT_ARCH_ID	CKPT_ARCH_X86_32
 #endif
@@ -135,6 +139,8 @@ struct ckpt_hdr_cpu {
 #define CKPT_X86_SEG_NULL	0
 #define CKPT_X86_SEG_USER32_CS	1
 #define CKPT_X86_SEG_USER32_DS	2
+#define CKPT_X86_SEG_USER64_CS	3
+#define CKPT_X86_SEG_USER64_DS	4
 #define CKPT_X86_SEG_TLS	0x4000	/* 0100 0000 0000 00xx */
 #define CKPT_X86_SEG_LDT	0x8000	/* 100x xxxx xxxx xxxx */
 
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 1079447..063cdd0 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -88,6 +88,12 @@ asmlinkage long sys_execve(char __user *, char __user * __user *,
 			   struct pt_regs *);
 long sys_arch_prctl(int, unsigned long);
 
+/* kernel/checkpoint_64.c */
+#ifdef CONFIG_CHECKPOINT
+asmlinkage long sys_restart(pid_t pid, int fd, unsigned long flags, int logfd,
+			    struct pt_regs *regs);
+#endif
+
 /* kernel/signal.c */
 asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
 				struct pt_regs *);
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index d2ffc89..c360707 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,10 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 #define __NR_eclone                   		299
 __SYSCALL(__NR_eclone, stub_eclone)
+#define __NR_checkpoint                   	300
+__SYSCALL(__NR_checkpoint, sys_checkpoint)
+#define __NR_restart                   		301
+__SYSCALL(__NR_restart, stub_restart)
 
 
 #ifndef __NO_STUBS
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2821fd6..ded0ee2 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -138,4 +138,6 @@ ifeq ($(CONFIG_X86_64),y)
 
 	obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o
 	obj-y				+= vsmp_64.o
+
+	obj-$(CONFIG_CHECKPOINT)	+= checkpoint_64.o
 endif
diff --git a/arch/x86/kernel/checkpoint_64.c b/arch/x86/kernel/checkpoint_64.c
new file mode 100644
index 0000000..3901a53
--- /dev/null
+++ b/arch/x86/kernel/checkpoint_64.c
@@ -0,0 +1,251 @@
+/*
+ *  Checkpoint/restart - architecture specific support for x86_64
+ *
+ *  Copyright (C) 2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/elf.h>
+
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+
+/*
+ * sys_restart needs pt_regs so restart can rewrite the saved user state;
+ * @regs is unused here: the other arguments go to do_sys_restart() as is.
+ */
+asmlinkage long sys_restart(pid_t pid, int fd, unsigned long flags, int logfd,
+			    struct pt_regs *regs)
+{
+	return do_sys_restart(pid, fd, flags, logfd);
+}
+
+/* helpers to encode/decode/validate segments */
+
+/* check_segment: return 1 if @seg is a valid encoded segment, 0 if not */
+int check_segment(__u16 seg)
+{
+	switch (seg) {
+	case CKPT_X86_SEG_NULL:
+	case CKPT_X86_SEG_USER64_CS:
+	case CKPT_X86_SEG_USER64_DS:
+#ifdef CONFIG_COMPAT
+	case CKPT_X86_SEG_USER32_CS:
+	case CKPT_X86_SEG_USER32_DS:
+#endif
+		return 1;
+	}
+
+	/* TLS: low bits hold an index relative to GDT_ENTRY_TLS_MIN */
+	if (seg & CKPT_X86_SEG_TLS)
+		return (seg & ~CKPT_X86_SEG_TLS) <=
+			GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN;
+
+	/* LDT: low bits hold the descriptor index, at most 0x1fff */
+	if (seg & CKPT_X86_SEG_LDT)
+		return (seg & ~CKPT_X86_SEG_LDT) <= 0x1fff;
+
+	return 0;
+}
+
+/* encode_segment: map a user segment selector to its CKPT_X86_SEG_* form */
+__u16 encode_segment(unsigned short seg)
+{
+	unsigned short index = seg >> 3;	/* descriptor table index */
+
+	if (seg == 0)
+		return CKPT_X86_SEG_NULL;
+	BUG_ON((seg & 3) != 3);		/* only user-mode (RPL 3) selectors */
+	if (seg == __USER_CS)
+		return CKPT_X86_SEG_USER64_CS;
+	if (seg == __USER_DS)
+		return CKPT_X86_SEG_USER64_DS;
+#ifdef CONFIG_COMPAT
+	if (seg == __USER32_CS)
+		return CKPT_X86_SEG_USER32_CS;
+	if (seg == __USER32_DS)
+		return CKPT_X86_SEG_USER32_DS;
+#endif
+	if (seg & 4)			/* TI bit set: LDT selector */
+		return CKPT_X86_SEG_LDT | index;
+	if (GDT_ENTRY_TLS_MIN <= index && index <= GDT_ENTRY_TLS_MAX)
+		return CKPT_X86_SEG_TLS | (index - GDT_ENTRY_TLS_MIN);
+
+	/* report the unmodified selector; this is the encode path */
+	printk(KERN_ERR "c/r: (encode) bad segment %#hx\n", seg);
+	BUG();
+}
+
+/* decode_segment: inverse of encode_segment(); BUG()s on unknown values */
+unsigned short decode_segment(__u16 seg)
+{
+	switch (seg) {
+	case CKPT_X86_SEG_NULL:
+		return 0;
+	case CKPT_X86_SEG_USER64_CS:
+		return __USER_CS;
+	case CKPT_X86_SEG_USER64_DS:
+		return __USER_DS;
+#ifdef CONFIG_COMPAT
+	case CKPT_X86_SEG_USER32_CS:
+		return __USER32_CS;
+	case CKPT_X86_SEG_USER32_DS:
+		return __USER32_DS;
+#endif
+	}
+
+	if (seg & CKPT_X86_SEG_TLS) {
+		seg = GDT_ENTRY_TLS_MIN + (seg & ~CKPT_X86_SEG_TLS);
+		return (seg << 3) | 3;
+	}
+	if (seg & CKPT_X86_SEG_LDT)
+		return ((seg & ~CKPT_X86_SEG_LDT) << 3) | 7;
+	BUG();
+}
+
+/* save_cpu_regs: copy the register state of task @t into header @h */
+void save_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	struct pt_regs *regs = task_pt_regs(t);
+	unsigned long _ds, _es, _fs, _gs;
+
+	h->r15 = regs->r15;
+	h->r14 = regs->r14;
+	h->r13 = regs->r13;
+	h->r12 = regs->r12;
+	h->r11 = regs->r11;
+	h->r10 = regs->r10;
+	h->r9 = regs->r9;
+	h->r8 = regs->r8;
+
+	h->bp = regs->bp;
+	h->bx = regs->bx;
+	h->ax = regs->ax;
+	h->cx = regs->cx;
+	h->dx = regs->dx;
+	h->si = regs->si;
+	h->di = regs->di;
+	h->orig_ax = regs->orig_ax;
+	h->ip = regs->ip;
+
+	h->flags = regs->flags;
+	h->sp = regs->sp;
+
+	h->cs = encode_segment(regs->cs);
+	h->ss = encode_segment(regs->ss);
+
+	/*
+	 * in process context (checkpoint from within a container) DS, ES,
+	 * FS, GS are read from the hardware; else from the thread structure
+	 */
+	if (t == current) {
+		savesegment(ds, _ds);
+		savesegment(es, _es);
+		savesegment(fs, _fs);
+		savesegment(gs, _gs);
+	} else {
+		_ds = t->thread.ds;
+		_es = t->thread.es;
+		_fs = t->thread.fsindex;
+		_gs = t->thread.gsindex;
+	}
+	h->ds = encode_segment(_ds);
+	h->es = encode_segment(_es);
+	h->fsindex = encode_segment(_fs);
+	h->gsindex = encode_segment(_gs);
+
+	/* h->fs and h->gs are filled in only when TIF_IA32 is not set */
+	if (!test_tsk_thread_flag(t, TIF_IA32)) {
+		h->fs = t->thread.fs;
+		h->gs = t->thread.gs;
+	}
+
+	/*
+	 * for checkpoint in process context (from within a container),
+	 * the actual syscall is taking place at this very moment; so
+	 * we (optimistically) substitute the future return value (0) of
+	 * this syscall into ax, so that upon restart it will succeed
+	 * (or it will endlessly retry checkpoint...)
+	 */
+	if (t == current) {
+		BUG_ON(h->orig_ax < 0);	/* NOTE(review): dead if unsigned */
+		h->ax = 0;
+	}
+}
+
+/* load_cpu_regs: restore @t's registers from @h; -EINVAL on a bad segment */
+int load_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	struct thread_struct *thread = &t->thread;
+	struct pt_regs *regs = task_pt_regs(t);
+
+	if (h->cs == CKPT_X86_SEG_NULL)
+		return -EINVAL;
+	if (!check_segment(h->cs) || !check_segment(h->ds) ||
+	    !check_segment(h->es) || !check_segment(h->ss) ||
+	    !check_segment(h->fsindex) || !check_segment(h->gsindex))
+		return -EINVAL;
+
+#ifdef CONFIG_COMPAT
+	if (test_tsk_thread_flag(t, TIF_IA32) &&
+	    (!check_segment(h->fs) || !check_segment(h->gs)))
+		return -EINVAL;
+#endif
+
+	regs->r15 = h->r15;
+	regs->r14 = h->r14;
+	regs->r13 = h->r13;
+	regs->r12 = h->r12;
+	regs->r11 = h->r11;
+	regs->r10 = h->r10;
+	regs->r9 = h->r9;
+	regs->r8 = h->r8;
+
+	regs->bp = h->bp;
+	regs->bx = h->bx;
+	regs->ax = h->ax;
+	regs->cx = h->cx;
+	regs->dx = h->dx;
+	regs->si = h->si;
+	regs->di = h->di;
+	regs->orig_ax = h->orig_ax;
+	regs->ip = h->ip;
+	/* NOTE(review): h->flags is saved at checkpoint but not restored */
+
+	regs->sp = h->sp;
+	thread->usersp = h->sp;
+
+	preempt_disable();
+	regs->cs = decode_segment(h->cs);
+	regs->ss = decode_segment(h->ss);
+	thread->ds = decode_segment(h->ds);
+	thread->es = decode_segment(h->es);
+	thread->fsindex = decode_segment(h->fsindex);
+	thread->gsindex = decode_segment(h->gsindex);
+
+	/* restore fs/gs for non-IA32 tasks, with or without CONFIG_COMPAT */
+	if (!test_tsk_thread_flag(t, TIF_IA32)) {
+		thread->fs = h->fs;
+		thread->gs = h->gs;
+	}
+
+	/* XXX - unsure if this is really needed ... */
+	loadsegment(fs, thread->fsindex);
+	if (thread->fs)
+		wrmsrl(MSR_FS_BASE, thread->fs);
+	load_gs_index(thread->gsindex);
+	if (thread->gs)
+		wrmsrl(MSR_KERNEL_GS_BASE, thread->gs);
+
+	preempt_enable();
+
+	return 0;
+}
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 6d60cd1..e692193 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -699,6 +699,11 @@ END(\label)
 	PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
 	PTREGSCALL stub_iopl, sys_iopl, %rsi
 	PTREGSCALL stub_eclone, sys_eclone, %r8
+#ifdef CONFIG_CHECKPOINT
+	PTREGSCALL stub_restart, sys_restart, %r8
+#else
+	PTREGSCALL stub_restart, sys_ni_syscall, %r8
+#endif
 
 ENTRY(ptregscall_common)
 	DEFAULT_FRAME 1 8	/* offset 8: return address */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 4e57d37..6468fa9 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -195,6 +195,8 @@ enum {
 #define CKPT_ARCH_PPC32 CKPT_ARCH_PPC32
 	CKPT_ARCH_PPC64,
 #define CKPT_ARCH_PPC64 CKPT_ARCH_PPC64
+	CKPT_ARCH_X86_64,
+#define CKPT_ARCH_X86_64 CKPT_ARCH_X86_64
 };
 
 /* shared objrects (objref) */
-- 
1.6.3.3



More information about the Containers mailing list