[PATCH 3/4] c/r: checkpoint/restart of anonymous hugetlb mappings

Oren Laadan orenl at cs.columbia.edu
Mon Jan 10 18:11:28 PST 2011


Support checkpoint and restore of both private and shared
hugepage-backed mappings established via mmap(MAP_HUGETLB).  Introduce
APIs for checkpoint and restart of individual huge pages which are to
be used by the sysv SHM_HUGETLB c/r code.

Original patch posted by Nathan Lynch <ntl at pobox.com>.

Changelog[v23-rc1]:
  - Modified to reuse existing code in mm/checkpoint.c (specifically
  checkpoint_memory_contents() and restore_memory_contents())
  - Merge patch that adds the necessary plumbing to checkpoint
  open hugetlbfs files.
  - Merge patch that removes VM_HUGETLB from CKPT_VMA_NOT_SUPPORTED

Cc: Nathan Lynch <ntl at pobox.com>
Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
---
 include/linux/checkpoint.h     |    3 +-
 include/linux/checkpoint_hdr.h |   16 ++++
 include/linux/hugetlb.h        |   34 +++++++++
 ipc/checkpoint_shm.c           |    2 +-
 mm/checkpoint.c                |   82 ++++++++++++++++------
 mm/hugetlb.c                   |  157 ++++++++++++++++++++++++++++++++++++++++
 mm/shmem.c                     |    2 +-
 7 files changed, 272 insertions(+), 24 deletions(-)

diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 6da31c5..51298d4 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -300,7 +300,8 @@ extern int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
 extern int checkpoint_memory_contents(struct ckpt_ctx *ctx,
 				      struct vm_area_struct *vma,
 				      struct file *file);
-extern int restore_memory_contents(struct ckpt_ctx *ctx, struct file *file);
+extern int restore_memory_contents(struct ckpt_ctx *ctx,
+				   struct file *file, int huge);
 
 
 #define CKPT_VMA_NOT_SUPPORTED	\
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index f7e233d..b7a7406 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -169,6 +169,8 @@ enum {
 #define CKPT_HDR_VMA CKPT_HDR_VMA
 	CKPT_HDR_PGARR,
 #define CKPT_HDR_PGARR CKPT_HDR_PGARR
+	CKPT_HDR_HPAGE,
+#define CKPT_HDR_HPAGE CKPT_HDR_HPAGE
 	CKPT_HDR_MM_CONTEXT,
 #define CKPT_HDR_MM_CONTEXT CKPT_HDR_MM_CONTEXT
 
@@ -922,6 +924,10 @@ enum vma_type {
 #define CKPT_VMA_SHM_IPC CKPT_VMA_SHM_IPC
 	CKPT_VMA_SHM_IPC_SKIP,	/* shared sysvipc (skip contents) */
 #define CKPT_VMA_SHM_IPC_SKIP CKPT_VMA_SHM_IPC_SKIP
+	CKPT_VMA_HUGETLB,
+#define CKPT_VMA_HUGETLB CKPT_VMA_HUGETLB
+	CKPT_VMA_HUGETLB_SKIP,
+#define CKPT_VMA_HUGETLB_SKIP CKPT_VMA_HUGETLB_SKIP
 };
 
 /* vma descriptor */
@@ -946,6 +952,16 @@ struct ckpt_hdr_pgarr {
 	__u64 nr_pages;		/* number of pages to saved */
 } __attribute__((aligned(8)));
 
+/* huge page */
+struct ckpt_hdr_hpage {
+	struct ckpt_hdr h;
+	union {
+		__u64 vaddr;
+		__u64 index;
+	};
+	__u16 shift;
+} __attribute__((aligned(8)));
+
 /* signals */
 struct ckpt_sigset {
 	__u8 sigset[CKPT_ARCH_NSIG / 8];
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 943c76b..a0aabe1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -43,6 +43,13 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
 						int acctflags);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
+#ifdef CONFIG_CHECKPOINT
+int checkpoint_dump_hugetlb(struct ckpt_ctx *ctx, struct page *page);
+int restore_read_hugetlb(struct ckpt_ctx *ctx, struct page *page);
+struct page *consider_hugetlb_private_page(struct vm_area_struct *vma,
+					   unsigned long addr);
+#endif
+
 int dequeue_hwpoisoned_huge_page(struct page *page);
 void copy_huge_page(struct page *dst, struct page *src);
 
@@ -114,6 +121,22 @@ static inline void copy_huge_page(struct page *dst, struct page *src)
 #define HPAGE_SIZE	PAGE_SIZE
 #endif
 
+#ifdef CONFIG_CHECKPOINT
+static inline int checkpoint_dump_hugetlb(struct ckpt_ctx *ctx, struct page *page)
+{
+	return -ENOSYS;
+}
+static inline int restore_read_hugetlb(struct ckpt_ctx *ctx, struct page *page)
+{
+	return -ENOSYS;
+}
+static inline struct page *consider_hugetlb_private_page(struct vm_area_struct *vma,
+							 unsigned long addr)
+{
+	return ERR_PTR(-ENOSYS);
+}
+#endif
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 
 #define HUGETLB_ANON_FILE "anon_hugepage"
@@ -332,4 +355,15 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
 #define hstate_index_to_shift(index) 0
 #endif
 
+#ifdef CONFIG_CHECKPOINT
+#ifdef CONFIG_HUGETLB_PAGE
+struct ckpt_ctx;
+struct ckpt_hdr_vma;
+extern int hugetlb_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+			   struct ckpt_hdr_vma *h);
+#else
+#define hugetlb_restore NULL
+#endif
+#endif
+
 #endif /* _LINUX_HUGETLB_H */
diff --git a/ipc/checkpoint_shm.c b/ipc/checkpoint_shm.c
index acfb79b..05ba5cf 100644
--- a/ipc/checkpoint_shm.c
+++ b/ipc/checkpoint_shm.c
@@ -294,7 +294,7 @@ int restore_ipc_shm(struct ckpt_ctx *ctx, struct ipc_namespace *ns)
 	ret = ckpt_obj_insert(ctx, file, h->objref, CKPT_OBJ_FILE);
 	if (ret < 0)
 		goto fput;
-	ret = restore_memory_contents(ctx, file);
+	ret = restore_memory_contents(ctx, file, 0);
 fput:
 	fput(file);
 
diff --git a/mm/checkpoint.c b/mm/checkpoint.c
index 8b40f4d..1c50f62 100644
--- a/mm/checkpoint.c
+++ b/mm/checkpoint.c
@@ -25,6 +25,7 @@
 #include <linux/proc_fs.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
+#include <linux/hugetlb.h>
 #include <linux/checkpoint.h>
 
 /*
@@ -240,7 +241,7 @@ static struct page *consider_private_page(struct vm_area_struct *vma,
  */
 static struct page *consider_shared_page(struct file *file, unsigned long idx)
 {
-	struct ino *inode = file->f_dentfy->d_inode;
+	struct inode *ino = file->f_dentry->d_inode;
 	struct page *page = NULL;
 	int ret;
 
@@ -288,20 +289,24 @@ static struct page *consider_shared_page(struct file *file, unsigned long idx)
  */
 static int vma_fill_pgarr(struct ckpt_ctx *ctx,
 			  struct vm_area_struct *vma, struct file *file,
-			  unsigned long *start, unsigned long end)
+			  int huge, unsigned long *start, unsigned long end)
 {
 	unsigned long addr = *start;
 	struct ckpt_pgarr *pgarr;
 	struct inode *inode;
+	unsigned long pagesize;
 	int nr_used;
 	int cnt = 0;
 
 	BUG_ON(file && vma);
 
-	if (vma)
+	if (vma) {
 		down_read(&vma->vm_mm->mmap_sem);
-	else
+		pagesize = vma_kernel_pagesize(vma);
+	} else {
 		inode = file->f_dentry->d_inode;
+		pagesize = 1;
+	}
 
 	do {
 		pgarr = pgarr_current(ctx);
@@ -315,10 +320,14 @@ static int vma_fill_pgarr(struct ckpt_ctx *ctx,
 		while (addr < end) {
 			struct page *page;
 
-			if (vma)
+			if (vma && !huge)  /* vma && !huge */
 				page = consider_private_page(vma, addr);
-			else
+			else if (vma)      /* vma && huge */
+				page = consider_hugetlb_private_page(vma, addr);
+			else if (!huge)    /* !vma && !huge */
 				page = consider_shared_page(file, addr);
+			else               /* !vma && huge */
+				page = ERR_PTR(-EINVAL);
 
 			if (IS_ERR(page)) {
 				cnt = PTR_ERR(page);
@@ -333,10 +342,7 @@ static int vma_fill_pgarr(struct ckpt_ctx *ctx,
 				pgarr->nr_used++;
 			}
 
-			if (vma)
-				addr += PAGE_SIZE;
-			else
-				addr++;
+			addr += pagesize;
 
 			if (pgarr_is_full(pgarr))
 				break;
@@ -368,10 +374,13 @@ int checkpoint_dump_page(struct ckpt_ctx *ctx, struct page *page)
  * vma_dump_pages - dump pages listed in the ctx page-array chain
  * @ctx - checkpoint context
  * @total - total number of pages
+ * @huge - indicates hugetlb pages
+ * @pagesize - page size
  *
  * First dump all virtual addresses, followed by the contents of all pages
  */
-static int vma_dump_pages(struct ckpt_ctx *ctx, int total)
+static int vma_dump_pages(struct ckpt_ctx *ctx, int total,
+			  int huge, unsigned long pagesize)
 {
 	struct ckpt_pgarr *pgarr;
 	int i, ret = 0;
@@ -379,7 +388,7 @@ static int vma_dump_pages(struct ckpt_ctx *ctx, int total)
 	if (!total)
 		return 0;
 
-	i =  total * (sizeof(unsigned long) + PAGE_SIZE);
+	i =  total * (sizeof(unsigned long) + pagesize);
 	ret = ckpt_write_obj_type(ctx, NULL, i, CKPT_HDR_BUFFER);
 	if (ret < 0)
 		return ret;
@@ -393,7 +402,12 @@ static int vma_dump_pages(struct ckpt_ctx *ctx, int total)
 
 	list_for_each_entry_reverse(pgarr, &ctx->pgarr_list, list) {
 		for (i = 0; i < pgarr->nr_used; i++) {
-			ret = checkpoint_dump_page(ctx, pgarr->pages[i]);
+			if (!huge)
+				ret = checkpoint_dump_page(ctx,
+							   pgarr->pages[i]);
+			else
+				ret = checkpoint_dump_hugetlb(ctx,
+							   pgarr->pages[i]);
 			if (ret < 0)
 				return ret;
 		}
@@ -418,14 +432,20 @@ int checkpoint_memory_contents(struct ckpt_ctx *ctx,
 {
 	struct ckpt_hdr_pgarr *h;
 	unsigned long addr, end;
+	unsigned long pagesize;
 	int cnt, ret;
+	int huge;
 
 	BUG_ON(vma && file);
 
 	if (vma) {
-		addr = vma->vm_start;
+		huge = is_vm_hugetlb_page(vma);
+		pagesize = vma_kernel_pagesize(vma);
 		end = vma->vm_end;
+		addr = vma->vm_start;
 	} else {
+		huge = 0;
+		pagesize = PAGE_SIZE;
 		end = PAGE_ALIGN(i_size_read(file->f_dentry->d_inode))
 			>> PAGE_CACHE_SHIFT;
 		addr = 0;
@@ -455,7 +475,7 @@ int checkpoint_memory_contents(struct ckpt_ctx *ctx,
 	 */
 
 	while (addr < end) {
-		cnt = vma_fill_pgarr(ctx, vma, file, &addr, end);
+		cnt = vma_fill_pgarr(ctx, vma, file, huge, &addr, end);
 		if (cnt == 0)
 			break;
 		else if (cnt < 0)
@@ -473,7 +493,7 @@ int checkpoint_memory_contents(struct ckpt_ctx *ctx,
 		if (ret < 0)
 			return ret;
 
-		ret = vma_dump_pages(ctx, cnt);
+		ret = vma_dump_pages(ctx, cnt, huge, pagesize);
 		if (ret < 0)
 			return ret;
 
@@ -905,8 +925,10 @@ static struct page *bring_shared_page(unsigned long idx, struct inode *ino)
 /**
  * read_pages_contents - read in data of pages in page-array chain
  * @ctx - restart context
+ * @file - associated file (mapped or ipc)
+ * @huge - hugetlb flag
  */
-static int read_pages_contents(struct ckpt_ctx *ctx, struct file *file)
+static int read_pages_contents(struct ckpt_ctx *ctx, struct file *file, int huge)
 {
 	struct ckpt_pgarr *pgarr;
 	unsigned long *vaddrs;
@@ -932,7 +954,11 @@ static int read_pages_contents(struct ckpt_ctx *ctx, struct file *file)
 			if (IS_ERR(page))
 				return PTR_ERR(page);
 
-			ret = restore_read_page(ctx, page);
+			if (!huge)
+				ret = restore_read_page(ctx, page);
+			else
+				ret = restore_read_hugetlb(ctx, page);
+
 			page_cache_release(page);
 
 			if (ret < 0)
@@ -953,7 +979,7 @@ static int read_pages_contents(struct ckpt_ctx *ctx, struct file *file)
  * these steps until reaching a header specifying "0" pages, which marks
  * the end of the contents.
  */
-int restore_memory_contents(struct ckpt_ctx *ctx, struct file *file)
+int restore_memory_contents(struct ckpt_ctx *ctx, struct file *file, int huge)
 {
 	struct ckpt_hdr_pgarr *h;
 	unsigned long nr_pages;
@@ -980,7 +1006,7 @@ int restore_memory_contents(struct ckpt_ctx *ctx, struct file *file)
 		ret = read_pages_vaddrs(ctx, nr_pages);
 		if (ret < 0)
 			break;
-		ret = read_pages_contents(ctx, file);
+		ret = read_pages_contents(ctx, file, huge);
 		if (ret < 0)
 			break;
 		pgarr_reset_all(ctx);
@@ -1030,6 +1056,8 @@ static unsigned long calc_map_flags_bits(unsigned long orig_vm_flags)
 		vm_flags |= MAP_PRIVATE;
 	if (orig_vm_flags & VM_NORESERVE)
 		vm_flags |= MAP_NORESERVE;
+	if (orig_vm_flags & VM_HUGETLB)
+		vm_flags |= MAP_HUGETLB;
 
 	return vm_flags;
 }
@@ -1094,7 +1122,7 @@ int private_vma_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
 	if (IS_ERR((void *) addr))
 		return PTR_ERR((void *) addr);
 
-	return restore_memory_contents(ctx, NULL);
+	return restore_memory_contents(ctx, NULL, 0);
 }
 
 /**
@@ -1189,6 +1217,18 @@ static struct restore_vma_ops restore_vma_ops[] = {
 		.vma_type = CKPT_VMA_SHM_IPC_SKIP,
 		.restore = ipcshm_restore,
 	},
+	/* hugetlb */
+	{
+		.vma_name = "HUGETLB",
+		.vma_type = CKPT_VMA_HUGETLB,
+		.restore = hugetlb_restore,
+	},
+	/* hugetlb (skip) */
+	{
+		.vma_name = "HUGETLB (SKIP)",
+		.vma_type = CKPT_VMA_HUGETLB_SKIP,
+		.restore = hugetlb_restore,
+	},
 };
 
 /**
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8585524..44e4e0a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -8,6 +8,9 @@
 #include <linux/mm.h>
 #include <linux/seq_file.h>
 #include <linux/sysctl.h>
+#include <linux/checkpoint.h>
+#include <linux/file.h>
+#include <linux/mman.h>
 #include <linux/highmem.h>
 #include <linux/mmu_notifier.h>
 #include <linux/nodemask.h>
@@ -2129,10 +2132,164 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return 0;
 }
 
+#ifdef CONFIG_CHECKPOINT
+struct page *consider_hugetlb_private_page(struct vm_area_struct *vma,
+					   unsigned long addr)
+{
+	struct page *page;
+	int ret, nr = 1;
+
+	ret = follow_hugetlb_page(vma->vm_mm, vma, &page, NULL,
+				  &addr, &nr, 1, FOLL_DUMP | FOLL_GET);
+	if (ret == -EFAULT)	/* reported to the caller as "no page", not as an error */
+		return NULL;
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	return page;
+}
+
+int checkpoint_dump_hugetlb(struct ckpt_ctx *ctx, struct page *head)
+{
+	unsigned int nr_pages;
+	struct page *page;
+	int ret = 0;
+	int i;
+
+	nr_pages = pages_per_huge_page(page_hstate(head));
+	page = head;
+
+	for (i = 0; i < nr_pages; i++) {
+		void *ptr;
+
+		ptr = kmap_atomic(page, KM_USER1);
+		copy_page(ctx->scratch_page, ptr);
+		kunmap_atomic(ptr, KM_USER1);
+		ret = ckpt_kwrite(ctx, ctx->scratch_page, PAGE_SIZE);
+		if (ret < 0)
+			break;
+
+		page = mem_map_next(page, head, i + 1);
+	}
+
+	return ret;
+}
+
+static int hugetlb_vm_op_checkpoint(struct ckpt_ctx *ctx, struct vm_area_struct *vma)
+{
+	enum vma_type vma_type;
+	int ino_objref;
+	int ret, first;
+
+	BUG_ON(!(vma->vm_flags & VM_HUGETLB));
+	BUG_ON(!vma->vm_file);
+
+	ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
+	if (ret < 0)
+		return ret;
+
+	ino_objref = ckpt_obj_lookup_add(ctx, vma->vm_file->f_dentry->d_inode,
+					 CKPT_OBJ_INODE, &first);
+	if (ino_objref < 0)
+		return ino_objref;
+
+	vma_type = (first ? CKPT_VMA_HUGETLB : CKPT_VMA_HUGETLB_SKIP);
+
+	ret = generic_vma_checkpoint(ctx, vma, vma_type, 0, ino_objref);
+	if (ret)
+		return ret;
+
+	if (vma_type == CKPT_VMA_HUGETLB)
+		ret = checkpoint_memory_contents(ctx, vma, NULL);
+
+	return ret;
+}
+
+int restore_read_hugetlb(struct ckpt_ctx *ctx, struct page *head)
+{
+	unsigned int nr_pages;
+	struct page *page;
+	int ret = 0;
+	int i;
+
+	nr_pages = pages_per_huge_page(page_hstate(head));
+	page = head;
+
+	for (i = 0; i < nr_pages; i++) {
+		void *ptr;
+
+		ret = ckpt_kread(ctx, ctx->scratch_page, PAGE_SIZE);
+		if (ret < 0)
+			break;
+
+		ptr = kmap_atomic(page, KM_USER1);
+		copy_page(ptr, ctx->scratch_page);
+		kunmap_atomic(ptr, KM_USER1);
+
+		page = mem_map_next(page, head, i + 1);
+	}
+
+	return ret;
+}
+
+int hugetlb_restore(struct ckpt_ctx *ctx, struct mm_struct *mm,
+		    struct ckpt_hdr_vma *hdr)
+{
+	unsigned long addr;
+	struct file *file;
+	int ret = 0;
+
+	if (!(hdr->vm_flags & (VM_HUGETLB)))
+		return -EINVAL;
+
+	file = ckpt_obj_try_fetch(ctx, hdr->ino_objref, CKPT_OBJ_FILE);
+	if (PTR_ERR(file) == -EINVAL)
+		file = NULL;
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* To do: don't assume same default_hstate on source and destination */
+	if (!file) {
+		struct user_struct *user = NULL;
+		unsigned long len;
+
+		if (hdr->vma_type != CKPT_VMA_HUGETLB)
+			return -EINVAL;
+
+		/* see sys_mmap_pgoff */
+		len = hdr->vm_end - hdr->vm_start;
+		len = ALIGN(len, huge_page_size(&default_hstate));
+		file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
+					  &user, HUGETLB_ANONHUGE_INODE);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+		ret = ckpt_obj_insert(ctx, file, hdr->ino_objref, CKPT_OBJ_FILE);
+		if (ret < 0)
+			goto out;
+	} else {
+		if (hdr->vma_type != CKPT_VMA_HUGETLB_SKIP)
+			return -EINVAL;
+		get_file(file);
+	}
+
+	addr = generic_vma_restore(mm, file, hdr);
+	if (IS_ERR((void *)addr))
+		ret = PTR_ERR((void *)addr);
+	else if (hdr->vma_type == CKPT_VMA_HUGETLB)
+		ret = restore_memory_contents(ctx, file, 1);
+out:
+	fput(file);
+	return ret;
+}
+#endif /* CONFIG_CHECKPOINT */
+
 const struct vm_operations_struct hugetlb_vm_ops = {
 	.fault = hugetlb_vm_op_fault,
 	.open = hugetlb_vm_op_open,
 	.close = hugetlb_vm_op_close,
+#ifdef CONFIG_CHECKPOINT
+	.checkpoint = hugetlb_vm_op_checkpoint,
+#endif
 };
 
 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
diff --git a/mm/shmem.c b/mm/shmem.c
index cf018ba..7649368 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2486,7 +2486,7 @@ int shmem_restore(struct ckpt_ctx *ctx,
 		return PTR_ERR((void *) addr);
 
 	if (h->vma_type == CKPT_VMA_SHM_ANON)
-		ret = restore_memory_contents(ctx, file);
+		ret = restore_memory_contents(ctx, file, 0);
  out:
 	fput(file);
 	return ret;
-- 
1.7.1



More information about the Containers mailing list