[PATCH v21 024/100] c/r: x86-64: checkpoint/restart implementation

Oren Laadan orenl at cs.columbia.edu
Sat May 1 07:15:06 PDT 2010


Support for checkpoint and restart for X86_32 architecture.
Partly based on Alexey's work.

Support for 32bit on 64bit and fixes from Serge Hallyn.

 Checkpoint          Restart
 (app/arch)         (app/arch/program*)
---------------------------------------
  64/x86-64	->  64/x86-64	  works
  32/x86-64	->  32/x86-64	  works
  32/x86-64	->  32/x86-32	  ?
  32/x86-32	->  32/x86-64	  ?

  32/x86-64	->  32/x86-32	  ?
  32/x86-32	->  32/x86-64	  ?

(*) "program" indicates the bit-ness of 'restart' executable.

Changelog[v21]:
  - Do not include checkpoint_hdr.h explicitly
Changelog[v19-rc3]:
  - Rebase to kernel 2.6.33
  - [Serge Hallyn] Changes to fs/gs register handling
  - [Serge Hallyn] Allow 32-bit restart of 64-bit and vice versa
  - [Serge Hallyn] Only allow 'restart' with same bit-ness as image.

Cc: x86 at kernel.org
Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
Signed-off-by: Serge Hallyn <serue at us.ibm.com>
---
 arch/x86/Kconfig                      |    2 +-
 arch/x86/include/asm/checkpoint_hdr.h |    6 +
 arch/x86/include/asm/unistd_64.h      |    4 +
 arch/x86/kernel/Makefile              |    2 +
 arch/x86/kernel/checkpoint.c          |   16 +++
 arch/x86/kernel/checkpoint_64.c       |  240 +++++++++++++++++++++++++++++++++
 arch/x86/kernel/entry_64.S            |    7 +
 include/linux/checkpoint_hdr.h        |    2 +
 8 files changed, 278 insertions(+), 1 deletions(-)
 create mode 100644 arch/x86/kernel/checkpoint_64.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0874484..335a4b3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -95,7 +95,7 @@ config HAVE_LATENCYTOP_SUPPORT
 
 config CHECKPOINT_SUPPORT
 	bool
-	default y if X86_32
+	default y
 
 config MMU
 	def_bool y
diff --git a/arch/x86/include/asm/checkpoint_hdr.h b/arch/x86/include/asm/checkpoint_hdr.h
index e6cfc99..6f600dd 100644
--- a/arch/x86/include/asm/checkpoint_hdr.h
+++ b/arch/x86/include/asm/checkpoint_hdr.h
@@ -36,6 +36,10 @@
 #include <asm/processor.h>
 #endif
 
+#ifdef CONFIG_X86_64
+#define CKPT_ARCH_ID	CKPT_ARCH_X86_64
+#endif
+
 #ifdef CONFIG_X86_32
 #define CKPT_ARCH_ID	CKPT_ARCH_X86_32
 #endif
@@ -106,6 +110,8 @@ struct ckpt_hdr_cpu {
 #define CKPT_X86_SEG_NULL	0
 #define CKPT_X86_SEG_USER32_CS	1
 #define CKPT_X86_SEG_USER32_DS	2
+#define CKPT_X86_SEG_USER64_CS	3
+#define CKPT_X86_SEG_USER64_DS	4
 #define CKPT_X86_SEG_TLS	0x4000	/* 0100 0000 0000 00xx */
 #define CKPT_X86_SEG_LDT	0x8000	/* 100x xxxx xxxx xxxx */
 
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 1cd16af..2b162e1 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -665,6 +665,10 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 __SYSCALL(__NR_recvmmsg, sys_recvmmsg)
 #define __NR_eclone				300
 __SYSCALL(__NR_eclone, stub_eclone)
+#define __NR_checkpoint				301
+__SYSCALL(__NR_checkpoint, stub_checkpoint)
+#define __NR_restart				302
+__SYSCALL(__NR_restart, stub_restart)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2078d1b..916a7e1 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -138,4 +138,6 @@ ifeq ($(CONFIG_X86_64),y)
 
 	obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o
 	obj-y				+= vsmp_64.o
+
+	obj-$(CONFIG_CHECKPOINT)	+= checkpoint_64.o
 endif
diff --git a/arch/x86/kernel/checkpoint.c b/arch/x86/kernel/checkpoint.c
index 015401a..3976318 100644
--- a/arch/x86/kernel/checkpoint.c
+++ b/arch/x86/kernel/checkpoint.c
@@ -249,6 +249,22 @@ int restore_thread(struct ckpt_ctx *ctx)
 	load_TLS(thread, cpu);
 	put_cpu();
 
+	{
+		int pre, post;
+		/*
+		 * Eventually we'd like to support mixed-bit restart, but for
+		 * now don't pretend to.
+		 */
+		pre = test_thread_flag(TIF_IA32);
+		post = !!(h->thread_info_flags & _TIF_IA32);
+		if (pre != post) {
+			ret = -EINVAL;
+			ckpt_err(ctx, ret, "%d-bit restarting %d-bit\n",
+				64 >> pre, 64 >> post);
+			goto out;
+		}
+	}
+
 	/* TODO: restore TIF flags as necessary (e.g. TIF_NOTSC) */
 
 	ret = 0;
diff --git a/arch/x86/kernel/checkpoint_64.c b/arch/x86/kernel/checkpoint_64.c
new file mode 100644
index 0000000..cac52f3
--- /dev/null
+++ b/arch/x86/kernel/checkpoint_64.c
@@ -0,0 +1,240 @@
+/*
+ *  Checkpoint/restart - architecture specific support for x86_64
+ *
+ *  Copyright (C) 2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/elf.h>
+
+#include <linux/checkpoint.h>
+
+/* helpers to encode/decode/validate segments */
+
+int check_segment(__u16 seg)
+{
+	int ret = 0;
+
+	switch (seg) {
+	case CKPT_X86_SEG_NULL:
+	case CKPT_X86_SEG_USER64_CS:
+	case CKPT_X86_SEG_USER64_DS:
+#ifdef CONFIG_COMPAT
+	case CKPT_X86_SEG_USER32_CS:
+	case CKPT_X86_SEG_USER32_DS:
+#endif
+		return 1;
+	}
+	if (seg & CKPT_X86_SEG_TLS) {
+		seg &= ~CKPT_X86_SEG_TLS;
+		if (seg <= GDT_ENTRY_TLS_MAX - GDT_ENTRY_TLS_MIN)
+			ret = 1;
+	} else if (seg & CKPT_X86_SEG_LDT) {
+		seg &= ~CKPT_X86_SEG_LDT;
+		if (seg <= 0x1fff)
+			ret = 1;
+	}
+	return ret;
+}
+
+__u16 encode_segment(unsigned short seg)
+{
+	if (seg == 0)
+		return CKPT_X86_SEG_NULL;
+	BUG_ON((seg & 3) != 3);
+
+	if (seg == __USER_CS)
+		return CKPT_X86_SEG_USER64_CS;
+	if (seg == __USER_DS)
+		return CKPT_X86_SEG_USER64_DS;
+#ifdef CONFIG_COMPAT
+	if (seg == __USER32_CS)
+		return CKPT_X86_SEG_USER32_CS;
+	if (seg == __USER32_DS)
+		return CKPT_X86_SEG_USER32_DS;
+#endif
+
+	if (seg & 4)
+		return CKPT_X86_SEG_LDT | (seg >> 3);
+
+	seg >>= 3;
+	if (GDT_ENTRY_TLS_MIN <= seg && seg <= GDT_ENTRY_TLS_MAX)
+		return CKPT_X86_SEG_TLS | (seg - GDT_ENTRY_TLS_MIN);
+
+	printk(KERN_ERR "c/r: (decode) bad segment %#hx\n", seg);
+	BUG();
+}
+
+unsigned short decode_segment(__u16 seg)
+{
+	if (seg == CKPT_X86_SEG_NULL)
+		return 0;
+
+	if (seg == CKPT_X86_SEG_USER64_CS)
+		return __USER_CS;
+	if (seg == CKPT_X86_SEG_USER64_DS)
+		return __USER_DS;
+#ifdef CONFIG_COMPAT
+	if (seg == CKPT_X86_SEG_USER32_CS)
+		return __USER32_CS;
+	if (seg == CKPT_X86_SEG_USER32_DS)
+		return __USER32_DS;
+#endif
+
+	if (seg & CKPT_X86_SEG_TLS) {
+		seg &= ~CKPT_X86_SEG_TLS;
+		return ((GDT_ENTRY_TLS_MIN + seg) << 3) | 3;
+	}
+	if (seg & CKPT_X86_SEG_LDT) {
+		seg &= ~CKPT_X86_SEG_LDT;
+		return (seg << 3) | 7;
+	}
+	BUG();
+}
+
+void save_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	struct pt_regs *regs = task_pt_regs(t);
+	unsigned long _ds, _es, _fs, _gs;
+
+	h->r15 = regs->r15;
+	h->r14 = regs->r14;
+	h->r13 = regs->r13;
+	h->r12 = regs->r12;
+	h->r11 = regs->r11;
+	h->r10 = regs->r10;
+	h->r9 = regs->r9;
+	h->r8 = regs->r8;
+
+	h->bp = regs->bp;
+	h->bx = regs->bx;
+	h->ax = regs->ax;
+	h->cx = regs->cx;
+	h->dx = regs->dx;
+	h->si = regs->si;
+	h->di = regs->di;
+	h->orig_ax = regs->orig_ax;
+	h->ip = regs->ip;
+
+	h->flags = regs->flags;
+	h->sp = regs->sp;
+
+	/*
+	 * for checkpoint in process context (from within a container)
+	 * DS, ES, FS, GS registers should be saved from the hardware;
+	 * otherwise they are already saved on the thread structure
+	 */
+
+	h->cs = encode_segment(regs->cs);
+	h->ss = encode_segment(regs->ss);
+
+	if (t == current) {
+		savesegment(ds, _ds);
+		savesegment(es, _es);
+		savesegment(fs, _fs);
+		savesegment(gs, _gs);
+		rdmsrl(MSR_FS_BASE, h->fs);
+		rdmsrl(MSR_KERNEL_GS_BASE, h->gs);
+	} else {
+		_ds = t->thread.ds;
+		_es = t->thread.es;
+		_fs = t->thread.fsindex;
+		_gs = t->thread.gsindex;
+		h->fs = t->thread.fs;
+		h->gs = t->thread.gs;
+	}
+	h->ds = encode_segment(_ds);
+	h->es = encode_segment(_es);
+	h->fsindex = encode_segment(_fs);
+	h->gsindex = encode_segment(_gs);
+
+	/* see comment in __switch_to() */
+	if (_fs)
+		h->fs = 0;
+	if (_gs)
+		h->gs = 0;
+
+	/*
+	 * for checkpoint in process context (from within a container),
+	 * the actual syscall is taking place at this very moment; so
+	 * we (optimistically) subtitute the future return value (0) of
+	 * this syscall into the orig_eax, so that upon restart it will
+	 * succeed (or it will endlessly retry checkpoint...)
+	 */
+	if (t == current) {
+		BUG_ON(h->orig_ax < 0);
+		h->ax = 0;
+	}
+}
+
+int load_cpu_regs(struct ckpt_hdr_cpu *h, struct task_struct *t)
+{
+	struct thread_struct *thread = &t->thread;
+	struct pt_regs *regs = task_pt_regs(t);
+
+	if (h->cs == CKPT_X86_SEG_NULL)
+		return -EINVAL;
+	if (!check_segment(h->cs) || !check_segment(h->ds) ||
+	    !check_segment(h->es) || !check_segment(h->ss) ||
+	    !check_segment(h->fsindex) || !check_segment(h->gsindex))
+		return -EINVAL;
+
+	regs->r15 = h->r15;
+	regs->r14 = h->r14;
+	regs->r13 = h->r13;
+	regs->r12 = h->r12;
+	regs->r11 = h->r11;
+	regs->r10 = h->r10;
+	regs->r9 = h->r9;
+	regs->r8 = h->r8;
+
+	regs->bp = h->bp;
+	regs->bx = h->bx;
+	regs->ax = h->ax;
+	regs->cx = h->cx;
+	regs->dx = h->dx;
+	regs->si = h->si;
+	regs->di = h->di;
+	regs->orig_ax = h->orig_ax;
+	regs->ip = h->ip;
+
+	regs->sp = h->sp;
+	thread->usersp = h->sp;
+
+	preempt_disable();
+
+	regs->cs = decode_segment(h->cs);
+	regs->ss = decode_segment(h->ss);
+	thread->ds = decode_segment(h->ds);
+	thread->es = decode_segment(h->es);
+	thread->fsindex = decode_segment(h->fsindex);
+	thread->gsindex = decode_segment(h->gsindex);
+
+	thread->fs = h->fs;
+	thread->gs = h->gs;
+
+	/* XXX - unsure is this really needed ... */
+	loadsegment(fs, thread->fsindex);
+	if (thread->fs)
+		wrmsrl(MSR_FS_BASE, thread->fs);
+	load_gs_index(thread->gsindex);
+	/*
+	 * when we switch to user-space, the MSR_KERNEL_GS_BASE
+	 * will be moved back to MSR_GS_BASE.
+	 * http://lists.openwall.net/linux-kernel/2008/11/18/340
+	 */
+	if (thread->gs)
+		wrmsrl(MSR_KERNEL_GS_BASE, thread->gs);
+
+	preempt_enable();
+
+	return 0;
+}
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 216681e..c2ece28 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -699,6 +699,13 @@ END(\label)
 	PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
 	PTREGSCALL stub_iopl, sys_iopl, %rsi
 	PTREGSCALL stub_eclone, sys_eclone, %r8
+#ifdef CONFIG_CHECKPOINT
+	PTREGSCALL stub_checkpoint, sys_checkpoint, %r8
+	PTREGSCALL stub_restart, sys_restart, %r8
+#else
+	PTREGSCALL stub_checkpoint, sys_ni_syscall, %r8
+	PTREGSCALL stub_restart, sys_ni_syscall, %r8
+#endif
 
 ENTRY(ptregscall_common)
 	DEFAULT_FRAME 1 8	/* offset 8: return address */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index a0189de..ecc4afb 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -90,6 +90,8 @@ enum {
 enum {
 	CKPT_ARCH_X86_32 = 1,
 #define CKPT_ARCH_X86_32 CKPT_ARCH_X86_32
+	CKPT_ARCH_X86_64,
+#define CKPT_ARCH_X86_64 CKPT_ARCH_X86_64
 };
 
 /* kernel constants */
-- 
1.6.3.3



More information about the Containers mailing list