[PATCH 2/2] [RFC] Add checkpoint/restart support for epoll files.

Matt Helsley matthltc at us.ibm.com
Wed Aug 19 22:17:44 PDT 2009


Save/restore epoll items during checkpoint/restart respectively.

Tests for the cr_tests suite to follow. Tests pass on i386.

TODOs (search the patch for "TODO") that could probably use some
comments:

What to do when there's a "possible checkpoint obj leak"? (Search the
	patch for this string to see the places I mean.)

Ensure get_current_user will be correct (a userns question/issue?).

kmalloc failures should be handled more gracefully than by simply
	erroring out, because epoll is designed to poll many thousands
	of file descriptors.
	This seems to be a more general problem with some of the
	ckpt_hdr* functions than an epoll-specific one, but...

Pick better errnos for some cases.

Signed-off-by: Matt Helsley <matthltc at us.ibm.com>
Cc: Oren Laadan <orenl at librato.com>
---
 checkpoint/files.c               |   35 +++++
 checkpoint/restart.c             |    2 +-
 fs/eventpoll.c                   |  280 +++++++++++++++++++++++++++++++++++++-
 include/linux/checkpoint.h       |    1 +
 include/linux/checkpoint_hdr.h   |   14 ++
 include/linux/checkpoint_types.h |    2 +
 include/linux/eventpoll.h        |   14 ++-
 7 files changed, 345 insertions(+), 3 deletions(-)

diff --git a/checkpoint/files.c b/checkpoint/files.c
index 204055b..8f86dcc 100644
--- a/checkpoint/files.c
+++ b/checkpoint/files.c
@@ -21,6 +21,8 @@
 #include <linux/syscalls.h>
 #include <linux/checkpoint.h>
 #include <linux/checkpoint_hdr.h>
+#include <linux/deferqueue.h>
+#include <linux/eventpoll.h>
 #include <net/sock.h>
 
 
@@ -289,11 +291,24 @@ static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
 		goto out;
 
 	ckpt_debug("nfds %d\n", nfds);
+	ctx->files_deferq = deferqueue_create();
+	if (!ctx->files_deferq) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	for (n = 0; n < nfds; n++) {
 		ret = checkpoint_file_desc(ctx, files, fdtable[n]);
 		if (ret < 0)
 			break;
 	}
+	if (!ret) {
+		ret = deferqueue_run(ctx->files_deferq);
+		if (ret > 0) {
+			pr_warning("c/r: files deferqueue had %d entries\n", ret);
+			ret = 0;
+		}
+	}
+	deferqueue_destroy(ctx->files_deferq);
  out:
 	kfree(fdtable);
 	return ret;
@@ -572,6 +587,14 @@ static struct restore_file_ops restore_file_ops[] = {
 		.file_type = CKPT_FILE_SOCKET,
 		.restore = sock_file_restore,
 	},
+#ifdef CONFIG_EPOLL
+	/* epoll */
+	{
+		.file_name = "EPOLL",
+		.file_type = CKPT_FILE_EPOLL,
+		.restore = ep_file_restore,
+	},
+#endif
 };
 
 static struct file *do_restore_file(struct ckpt_ctx *ctx)
@@ -692,11 +715,28 @@ static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
 	if (ret < 0)
 		goto out;
 
+	/*
+	 * Work that needs the complete file table (e.g. epoll items, which
+	 * refer to other files by objref) is queued here and run below.
+	 */
+	ctx->files_deferq = deferqueue_create();
+	if (!ctx->files_deferq) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	for (i = 0; i < h->fdt_nfds; i++) {
 		ret = restore_file_desc(ctx);
 		if (ret < 0)
 			break;
 	}
+	if (!ret) {
+		ret = deferqueue_run(ctx->files_deferq);
+		if (ret > 0) {
+			pr_warning("c/r: files deferqueue had %d entries\n", ret);
+			ret = 0;
+		}
+	}
+	deferqueue_destroy(ctx->files_deferq);
  out:
 	ckpt_hdr_put(ctx, h);
 	if (!ret) {
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 4fdae78..3a7d914 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -164,7 +164,7 @@ int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len)
  *
  * Return: new buffer allocated on success, error pointer otherwise
  */
-static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
+void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
 {
 	struct ckpt_hdr hh;
 	struct ckpt_hdr *h;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 085c5c0..7f7070f 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -671,10 +671,21 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 	return pollflags != -1 ? pollflags : 0;
 }
 
+#ifdef CONFIG_CHECKPOINT
+static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file);
+static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file);
+#else
+/* Without c/r support both file_operations hooks below are left unset. */
+#define ep_eventpoll_checkpoint NULL
+#define ep_file_collect NULL
+#endif
+
 /* File callbacks that implement the eventpoll file behaviour */
 static const struct file_operations eventpoll_fops = {
 	.release	= ep_eventpoll_release,
-	.poll		= ep_eventpoll_poll
+	.poll		= ep_eventpoll_poll,
+	.checkpoint	= ep_eventpoll_checkpoint,
+	.collect	= ep_file_collect,
 };
 
 /* Fast test to see if the file is an evenpoll file */
@@ -1413,6 +1422,348 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
 
 #endif /* HAVE_SET_RESTORE_SIGMASK */
 
+#ifdef CONFIG_CHECKPOINT
+#include <linux/checkpoint.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/deferqueue.h>
+
+/*
+ * ep_file_collect - collect the files watched by an epoll file
+ * @ctx: checkpoint context
+ * @file: file being collected; ignored unless it is an epoll file
+ *
+ * Walks the epoll set under ep->mtx and hands every watched file to
+ * ckpt_obj_collect(), stopping at the first negative result.
+ *
+ * Returns 0 for non-epoll files and empty sets, otherwise the result of
+ * the last ckpt_obj_collect() call.
+ */
+static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
+{
+	struct rb_node *rbp;
+	struct eventpoll *ep;
+	int ret = 0;
+
+	if (!is_file_epoll(file))
+		return 0;
+
+	ep = file->private_data;
+	mutex_lock(&ep->mtx);
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		struct epitem *epi;
+
+		epi = rb_entry(rbp, struct epitem, rbn);
+		ret = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
+		if (ret < 0)
+			break;
+	}
+	mutex_unlock(&ep->mtx);
+	return ret;
+}
+
+/*
+ * Payload queued on ctx->files_deferq by ep_eventpoll_checkpoint() and
+ * consumed by ep_items_checkpoint().
+ */
+struct epoll_deferq_entry {
+	struct ckpt_ctx *ctx;
+	struct file *epfile;
+};
+
+/*
+ * ep_items_checkpoint - deferred work: write the items of one epoll file
+ * @data: struct epoll_deferq_entry queued by ep_eventpoll_checkpoint()
+ *
+ * Emits one CKPT_HDR_FILE_EPOLL_ITEMS record holding the objref of the
+ * epoll file and, per watched file: fd, objref, event mask and user data.
+ *
+ * Returns 0 on success, -ENOMEM if the record cannot be allocated,
+ * -ENODEV if the epoll file or a watched file has no objref, -EBUSY if
+ * the set changed size while it was being saved, or the error from
+ * ckpt_write_obj().
+ */
+static int ep_items_checkpoint(void *data)
+{
+	struct epoll_deferq_entry *ep_dq_entry = data;
+	struct ckpt_ctx *ctx = ep_dq_entry->ctx;
+	struct file *file = ep_dq_entry->epfile;
+	struct eventpoll *ep = file->private_data;
+	struct ckpt_eventpoll_items *h;
+	struct rb_node *rbp;
+	int num_items = 0, i = 0, ret = 0;
+
+	/* Count the items first so the record can be sized. */
+	mutex_lock(&ep->mtx);
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp))
+		num_items++;
+	mutex_unlock(&ep->mtx);
+
+	/* TODO likely allocation failure when lots of epoll items */
+	h = ckpt_hdr_get_type(ctx,
+			      sizeof(*h) + num_items * sizeof(h->items[0]),
+			      CKPT_HDR_FILE_EPOLL_ITEMS);
+	if (!h)
+		return -ENOMEM;
+
+	h->num_items = num_items;
+	h->epfile_objref = ckpt_obj_lookup(ctx, file, CKPT_OBJ_FILE);
+	if (h->epfile_objref <= 0) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	/*
+	 * ep->mtx was dropped for the allocation, so the set may no longer
+	 * match the count taken above. Never fill more than the num_items
+	 * slots that were allocated.
+	 */
+	mutex_lock(&ep->mtx);
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {
+		struct epitem *epi;
+		int objref;
+
+		if (i >= num_items) {
+			ret = -EBUSY;
+			break;
+		}
+		epi = rb_entry(rbp, struct epitem, rbn);
+		objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
+		if (objref <= 0) {
+			/* TODO error -- possible checkpoint obj leak */
+			ret = -ENODEV;
+			break;
+		}
+		h->items[i].fd = epi->ffd.fd;
+		h->items[i].file_objref = objref;
+		h->items[i].events = epi->event.events;
+		h->items[i].data = epi->event.data;
+	}
+	mutex_unlock(&ep->mtx);
+
+	/* Fewer items than counted leaves unfilled slots in the record. */
+	if (!ret && i != num_items)
+		ret = -EBUSY;
+	if (!ret)
+		ret = ckpt_write_obj(ctx, &h->h);
+out:
+	ckpt_hdr_put(ctx, &h->h);
+	return ret;
+}
+
+/*
+ * ep_eventpoll_checkpoint - file_operations checkpoint hook for epoll files
+ * @ctx: checkpoint context
+ * @file: the epoll file
+ *
+ * Writes the generic CKPT_HDR_FILE record (f_type CKPT_FILE_EPOLL) and
+ * queues ep_items_checkpoint() on ctx->files_deferq to save the items.
+ *
+ * Returns -ENOMEM if the header cannot be allocated, otherwise the result
+ * of the first failing step or of deferqueue_add().
+ */
+static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
+{
+	struct ckpt_hdr_file *h;
+	struct epoll_deferq_entry ep_dq_entry;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
+	if (!h)
+		return -ENOMEM;
+	h->f_type = CKPT_FILE_EPOLL;
+	ret = checkpoint_file_common(ctx, file, h);
+	if (ret < 0)
+		goto out;
+	ret = ckpt_write_obj(ctx, &h->h);
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * Defer saving the epoll items until all of the ffd.file pointers
+	 * have an objref; after the file table has been checkpointed.
+	 * NOTE(review): the entry lives on this stack frame, so this assumes
+	 * deferqueue_add() copies sizeof(ep_dq_entry) bytes -- confirm.
+	 */
+	ep_dq_entry.ctx = ctx;
+	ep_dq_entry.epfile = file;
+	ret = deferqueue_add(ctx->files_deferq, &ep_dq_entry,
+			     sizeof(ep_dq_entry), ep_items_checkpoint, NULL);
+out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/*
+ * ep_items_restore - deferred work: re-create the items of one epoll file
+ * @data: points at the struct ckpt_ctx pointer queued by ep_file_restore()
+ *
+ * Reads one CKPT_HDR_FILE_EPOLL_ITEMS record, validates it, looks up the
+ * epoll file by objref and inserts each item much like EPOLL_CTL_ADD.
+ *
+ * Returns 0 on success or a negative errno. Insertion stops at the first
+ * failure; items inserted before it are not removed here.
+ */
+static int ep_items_restore(void *data)
+{
+	struct ckpt_ctx *ctx = *((struct ckpt_ctx **)data);
+	struct ckpt_eventpoll_items *h;
+	struct eventpoll *ep;
+	struct file *epfile;
+	int ret, i, remaining_watches;
+
+	/*
+	 * TODO possible kmalloc failure due to too many watches.
+	 */
+	h = ckpt_read_obj(ctx, 0,
+			  sizeof(*h) + max_user_watches*sizeof(h->items[0]));
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = -EINVAL;
+	if ((h->h.type != CKPT_HDR_FILE_EPOLL_ITEMS) ||
+	    (h->h.len < sizeof(*h)))
+		goto out;
+
+	/* Make sure the items match the size we expect */
+	if (h->num_items != ((h->h.len - sizeof(*h)) / sizeof(h->items[0])))
+		goto out;
+
+	epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE);
+	if (IS_ERR(epfile)) {
+		ret = PTR_ERR(epfile);
+		goto out;
+	}
+	ret = -ENOMSG;
+	if (!is_file_epoll(epfile))
+		goto out;
+
+	ep = epfile->private_data;
+
+	/*
+	 * num_items is unsigned: reject a negative budget explicitly so the
+	 * comparison does not convert it to a huge unsigned value.
+	 */
+	ret = -ENOSPC;
+	remaining_watches = (max_user_watches -
+			     atomic_read(&ep->user->epoll_watches));
+	if (remaining_watches < 0 || h->num_items > remaining_watches)
+		goto out;
+
+	ret = 0;
+	/* Restore the epoll items/watches */
+	for (i = 0; !ret && i < h->num_items; i++) {
+		/*
+		 * Loop body like multiple epoll_ctl(ep, ADD, event)
+		 * calls except we've already done much of the checking.
+		 */
+		struct epoll_event epev;
+		struct epitem *epi;
+		struct file *tfile;
+
+		epev.events = h->items[i].events;
+		epev.data = h->items[i].data;
+
+		/* Get the file* for the target file */
+		if (h->items[i].file_objref <= 0) {
+			ret = -EINVAL;
+			break;
+		}
+
+		tfile = ckpt_obj_fetch(ctx, h->items[i].file_objref,
+				       CKPT_OBJ_FILE);
+		if (IS_ERR(tfile)) {
+			ret = PTR_ERR(tfile);
+			break;
+		}
+
+		/* The target file must support poll */
+		if (!tfile->f_op || !tfile->f_op->poll) {
+			ret = -EPERM;
+			break;
+		}
+
+		/* Cannot add an epoll file descriptor inside itself. */
+		if (epfile == tfile) {
+			ret = -EINVAL;
+			break;
+		}
+
+		mutex_lock(&ep->mtx);
+		epi = ep_find(ep, tfile, h->items[i].fd);
+		if (!epi) {
+			epev.events |= POLLERR | POLLHUP;
+			ret = ep_insert(ep, &epev, tfile, h->items[i].fd);
+		} else {
+			ret = -EEXIST;
+		}
+		mutex_unlock(&ep->mtx);
+	}
+out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/*
+ * ep_file_restore - re-create an epoll file from its CKPT_HDR_FILE record
+ * @ctx: restart context
+ * @h: file header; must describe a CKPT_FILE_EPOLL file
+ *
+ * Creates an empty epoll file and queues ep_items_restore() on
+ * ctx->files_deferq so its set is refilled later.
+ *
+ * Returns the file, holding the reference taken by fget(), or an
+ * ERR_PTR(). The descriptor used to create the file is closed again
+ * before returning.
+ *
+ * TODO confirm that get_current_user() has been restored
+ */
+struct file *ep_file_restore(struct ckpt_ctx *ctx,
+			     struct ckpt_hdr_file *h)
+{
+	struct file *epfile;
+	int epfd, ret;
+
+	if (h->h.type != CKPT_HDR_FILE ||
+	    h->h.len  != sizeof(*h) ||
+	    h->f_type != CKPT_FILE_EPOLL)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * TODO Normally h->f_flags contains flags that epoll_create() won't
+	 * accept. Right now we pass only those flags it will accept here
+	 * and restore the rest during the "common" file restore. Check
+	 * to make sure we're not missing anything.
+	 */
+	epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
+	if (epfd < 0)
+		return ERR_PTR(epfd);
+	epfile = fget(epfd);
+	if (!epfile)
+		return ERR_PTR(-ENOENT); /* TODO pick better error? */
+
+	/*
+	 * Now we have the file and file descriptor but the epoll set is empty.
+	 * Defer restoring the epoll set until we encounter its corresponding
+	 * items. Note that this effectively counts the number of
+	 * ckpt_eventpoll_items blocks we should expect -- we rely on the
+	 * epfile_objref of those blocks to associate them with the proper
+	 * file.
+	 */
+	ret = restore_file_common(ctx, epfile, h);
+	if (ret >= 0)
+		ret = deferqueue_add(ctx->files_deferq, &ctx, sizeof(ctx),
+				     ep_items_restore, NULL);
+	if (ret < 0) {
+		/* Drop the fget() reference; the caller only sees the error. */
+		fput(epfile);
+		epfile = ERR_PTR(ret);
+	}
+	sys_close(epfd);
+	return epfile;
+}
+
+#endif /* CONFIG_CHECKPOINT */
+
 static int __init eventpoll_init(void)
 {
 	struct sysinfo si;
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 761cad5..053c6c0 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -69,6 +69,7 @@ extern int _ckpt_read_obj_type(struct ckpt_ctx *ctx,
 extern int _ckpt_read_nbuffer(struct ckpt_ctx *ctx, void *ptr, int len);
 extern int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
 extern int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len);
+extern void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max);
 extern void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type);
 extern void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int len, int type);
 extern int ckpt_read_payload(struct ckpt_ctx *ctx,
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 4d5c22a..3a3e530 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -76,6 +76,7 @@ enum {
 	CKPT_HDR_FILE_NAME,
 	CKPT_HDR_FILE,
 	CKPT_HDR_PIPE_BUF,
+	CKPT_HDR_FILE_EPOLL_ITEMS, /* Follows file-table */
 
 	CKPT_HDR_MM = 401,
 	CKPT_HDR_VMA,
@@ -342,6 +343,7 @@ enum file_type {
 	CKPT_FILE_PIPE,
 	CKPT_FILE_FIFO,
 	CKPT_FILE_SOCKET,
+	CKPT_FILE_EPOLL,
 	CKPT_FILE_MAX
 };
 
@@ -426,6 +428,24 @@ struct ckpt_hdr_file_socket {
 	struct ckpt_hdr_socket socket;
 } __attribute__((aligned(8)));
 
+/*
+ * Items of one epoll file (CKPT_HDR_FILE_EPOLL_ITEMS record).
+ * @epfile_objref identifies the epoll file; @num_items entries follow.
+ * Each entry is padded to 8 bytes so that its size does not depend on
+ * how the ABI aligns __u64 (4 bytes on i386, 8 on x86-64).
+ */
+struct ckpt_eventpoll_items {
+	struct ckpt_hdr h;
+	__s32  epfile_objref;
+	__u32  num_items;
+	struct {
+		__u64 data;		/* epoll_event.data */
+		__u32 fd;		/* watched descriptor number */
+		__s32 file_objref;	/* objref of the watched file */
+		__u32 events;		/* epoll_event.events mask */
+	} __attribute__((aligned(8))) items[0];
+} __attribute__((aligned(8)));
+
 struct ckpt_hdr_utsns {
 	struct ckpt_hdr h;
 	char sysname[__NEW_UTS_LEN + 1];
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
index e98251b..51cdd0c 100644
--- a/include/linux/checkpoint_types.h
+++ b/include/linux/checkpoint_types.h
@@ -48,6 +48,8 @@ struct ckpt_ctx {
 
 	struct ckpt_obj_hash *obj_hash;	/* repository for shared objects */
 	struct deferqueue_head *deferqueue;	/* queue of deferred work */
+	struct deferqueue_head *files_deferq; /* deferred work to do after
+						 saving file table */
 
 	struct path fs_mnt;     /* container root (FIXME) */
 
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index f6856a5..ff3de38 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -95,11 +95,28 @@ static inline void eventpoll_release(struct file *file)
 	eventpoll_release_file(file);
 }
 
+#ifdef CONFIG_CHECKPOINT
+#include <linux/checkpoint_hdr.h>
+extern struct file *ep_file_restore(struct ckpt_ctx *ctx,
+				    struct ckpt_hdr_file *h);
+#endif
 #else
 
 static inline void eventpoll_init_file(struct file *file) {}
 static inline void eventpoll_release(struct file *file) {}
-
+#ifdef CONFIG_CHECKPOINT
+#include <linux/checkpoint_hdr.h>
+#include <linux/err.h>
+/*
+ * Without CONFIG_EPOLL there is nothing to restore. Fail the way the real
+ * ep_file_restore() does, with an ERR_PTR(), rather than returning NULL.
+ */
+static inline struct file *ep_file_restore(struct ckpt_ctx *ctx,
+					   struct ckpt_hdr_file *ptr)
+{
+	return ERR_PTR(-ENOSYS);
+}
+#endif
 #endif
 
 #endif /* #ifdef __KERNEL__ */
-- 
1.5.6.3



More information about the Containers mailing list