[PATCH 2/2] [RFC] Add checkpoint/restart support for epoll files.

Serge E. Hallyn serue at us.ibm.com
Mon Aug 24 14:27:25 PDT 2009


Quoting Matt Helsley (matthltc at us.ibm.com):
> Save/restore epoll items during checkpoint/restart respectively.
> 
> Tests for the cr_tests suite to follow. Tests pass on i386.
> 
> TODOs (search the patch for "TODO") that could probably use some
> comments:
> 
> What to do when there's a "possible checkpoint obj leak"? (search patch
> 	for this string to see what I'm talking about)
> 
> Ensure get_current_user will be correct (a userns question/issue?).
> 
> kmalloc failures should be dealt with more kindly than just error-out
> 	because epoll is made to poll many thousands of file
> 	descriptors.
> 	This seems like a more general problem with some of the
> 	ckpt_hdr* functions than an epoll problem but...
> 
> Pick better errnos for some cases.
> 
> Signed-off-by: Matt Helsley <matthltc at us.ibm.com>
> Cc: Oren Laadan <orenl at librato.com>
> ---
>  checkpoint/files.c               |   35 +++++
>  checkpoint/restart.c             |    2 +-
>  fs/eventpoll.c                   |  280 +++++++++++++++++++++++++++++++++++++-
>  include/linux/checkpoint.h       |    1 +
>  include/linux/checkpoint_hdr.h   |   14 ++
>  include/linux/checkpoint_types.h |    2 +
>  include/linux/eventpoll.h        |   14 ++-
>  7 files changed, 345 insertions(+), 3 deletions(-)
> 
> diff --git a/checkpoint/files.c b/checkpoint/files.c
> index 204055b..8f86dcc 100644
> --- a/checkpoint/files.c
> +++ b/checkpoint/files.c
> @@ -21,6 +21,8 @@
>  #include <linux/syscalls.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
> +#include <linux/deferqueue.h>
> +#include <linux/eventpoll.h>
>  #include <net/sock.h>
> 
> 
> @@ -289,11 +291,24 @@ static int do_checkpoint_file_table(struct ckpt_ctx *ctx,
>  		goto out;
> 
>  	ckpt_debug("nfds %d\n", nfds);
> +	ctx->files_deferq = deferqueue_create();
> +	if (!ctx->files_deferq) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
>  	for (n = 0; n < nfds; n++) {
>  		ret = checkpoint_file_desc(ctx, files, fdtable[n]);
>  		if (ret < 0)
>  			break;
>  	}
> +	if (!ret) {
> +		ret = deferqueue_run(ctx->files_deferq);
> +		if (ret > 0) {
> +			pr_warning("c/r: files deferqueue had %d entries\n", ret);
> +			ret = 0;
> +		}
> +	}
> +	deferqueue_destroy(ctx->files_deferq);
>   out:
>  	kfree(fdtable);
>  	return ret;
> @@ -572,6 +587,14 @@ static struct restore_file_ops restore_file_ops[] = {
>  		.file_type = CKPT_FILE_SOCKET,
>  		.restore = sock_file_restore,
>  	},
> +#ifdef CONFIG_EPOLL
> +	/* epoll */
> +	{
> +		.file_name = "EPOLL",
> +		.file_type = CKPT_FILE_EPOLL,
> +		.restore = ep_file_restore,
> +	},
> +#endif
>  };
> 
>  static struct file *do_restore_file(struct ckpt_ctx *ctx)
> @@ -692,11 +715,23 @@ static struct files_struct *do_restore_file_table(struct ckpt_ctx *ctx)
>  	if (ret < 0)
>  		goto out;
> 
> +	ret = -ENOMEM;
> +	ctx->files_deferq = deferqueue_create();
> +	if (!ctx->files_deferq)
> +		goto out;
>  	for (i = 0; i < h->fdt_nfds; i++) {
>  		ret = restore_file_desc(ctx);
>  		if (ret < 0)
>  			break;
>  	}
> +	if (!ret) {
> +		ret = deferqueue_run(ctx->files_deferq);
> +		if (ret > 0) {
> +			pr_warning("c/r: files deferqueue had %d entries\n", ret);
> +			ret = 0;
> +		}
> +	}
> +	deferqueue_destroy(ctx->files_deferq);
>   out:
>  	ckpt_hdr_put(ctx, h);
>  	if (!ret) {
> diff --git a/checkpoint/restart.c b/checkpoint/restart.c
> index 4fdae78..3a7d914 100644
> --- a/checkpoint/restart.c
> +++ b/checkpoint/restart.c
> @@ -164,7 +164,7 @@ int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len)
>   *
>   * Return: new buffer allocated on success, error pointer otherwise
>   */
> -static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
> +void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
>  {
>  	struct ckpt_hdr hh;
>  	struct ckpt_hdr *h;
> diff --git a/fs/eventpoll.c b/fs/eventpoll.c
> index 085c5c0..7f7070f 100644
> --- a/fs/eventpoll.c
> +++ b/fs/eventpoll.c
> @@ -671,10 +671,19 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
>  	return pollflags != -1 ? pollflags : 0;
>  }
> 
> +#ifdef CONFIG_CHECKPOINT
> +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file);
> +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file);
> +#else
> +#define ep_eventpoll_checkpoint NULL

Don't forget ep_file_collect here.

> +#endif
> +
>  /* File callbacks that implement the eventpoll file behaviour */
>  static const struct file_operations eventpoll_fops = {
>  	.release	= ep_eventpoll_release,
> -	.poll		= ep_eventpoll_poll
> +	.poll		= ep_eventpoll_poll,
> +	.checkpoint 	= ep_eventpoll_checkpoint,
> +	.collect 	= ep_file_collect,
>  };
> 
>  /* Fast test to see if the file is an evenpoll file */
> @@ -1413,6 +1422,275 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
> 
>  #endif /* HAVE_SET_RESTORE_SIGMASK */
> 
> +#ifdef CONFIG_CHECKPOINT
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +#include <linux/deferqueue.h>
> +
> +static int ep_file_collect(struct ckpt_ctx *ctx, struct file *file)
> +{
> +	struct rb_node *rbp;
> +	struct eventpoll *ep;
> +	int ret = 0;
> +
> +	if (!is_file_epoll(file))
> +		return 0;
> +
> +	ep = file->private_data;
> +	mutex_lock(&ep->mtx);
> +	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
> +		struct epitem *epi;
> +
> +		epi = rb_entry(rbp, struct epitem, rbn);
> +		ret = ckpt_obj_collect(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> +		if (ret < 0)
> +			break;
> +	}
> +	mutex_unlock(&ep->mtx);
> +	return ret;
> +}
> +
> +struct epoll_deferq_entry {
> +	struct ckpt_ctx *ctx;
> +	struct file *epfile;
> +};
> +
> +static int ep_items_checkpoint(void *data)
> +{
> +	struct epoll_deferq_entry *ep_dq_entry = data;
> +	struct ckpt_ctx *ctx;
> +	struct file *file;
> +	struct ckpt_eventpoll_items *h;
> +	struct rb_node *rbp;
> +	struct eventpoll *ep;
> +	int i, ret = -ENOMEM;
> +
> +	file = ep_dq_entry->epfile;
> +	ctx = ep_dq_entry->ctx;
> +
> +	ep = file->private_data;
> +	mutex_lock(&ep->mtx);
> +	for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {}
> +	mutex_unlock(&ep->mtx);
> +
> +	/* TODO likely allocation failure when lots of epoll items */
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h) + i*sizeof(h->items[0]),
> +			      CKPT_HDR_FILE_EPOLL_ITEMS);
> +	if (!h)
> +		goto out;
> +
> +	ret = -ENODEV;
> +	h->num_items = i;
> +	h->epfile_objref = ckpt_obj_lookup(ctx, file, CKPT_OBJ_FILE);
> +	if (h->epfile_objref <= 0)
> +		goto out;
> +
> +	ret = 0;
> +	mutex_lock(&ep->mtx);
> +	for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {
> +		struct epitem *epi;
> +		int objref;
> +
> +		epi = rb_entry(rbp, struct epitem, rbn);
> +		objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
> +		if (objref <= 0) {
> +			/* TODO error -- possible checkpoint obj leak */
> +			ret = -ENODEV;
> +			break;
> +		}
> +		h->items[i].fd = epi->ffd.fd;
> +		h->items[i].file_objref = objref;
> +		h->items[i].events = epi->event.events;
> +		h->items[i].data = epi->event.data;
> +	}
> +	mutex_unlock(&ep->mtx);
> +	if (h && !ret)
> +		ret = ckpt_write_obj(ctx, &h->h);
> +	if (!ret && (i != h->num_items)) {
> +		/* TODO error -- possible checkpoint obj leak */
> +	}
> +out:
> +	if (h)
> +		ckpt_hdr_put(ctx, &h->h);
> +	return ret;
> +}
> +
> +static int ep_eventpoll_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> +{
> +	struct ckpt_hdr_file *h;
> +	struct epoll_deferq_entry ep_dq_entry;
> +	int ret = -ENOMEM;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
> +	if (!h)
> +		goto out_print;
> +	h->f_type = CKPT_FILE_EPOLL;
> +	ret = checkpoint_file_common(ctx, file, h);
> +	if (ret < 0)
> +		goto out;
> +	ret = ckpt_write_obj(ctx, &h->h);
> +	if (ret < 0)
> +		goto out;
> +
> +	/*
> +	 * Defer saving the epoll items until all of the ffd.file pointers
> +	 * have an objref; after the file table has been checkpointed.
> +	 */
> +	ep_dq_entry.ctx = ctx;
> +	ep_dq_entry.epfile = file;
> +	ret = deferqueue_add(ctx->files_deferq, &ep_dq_entry,
> +			     sizeof(ep_dq_entry), ep_items_checkpoint, NULL);
> +out:
> +	ckpt_hdr_put(ctx, h);
> +out_print:
> +	return ret;
> +}
> +
> +static int ep_items_restore(void *data)
> +{
> +	struct ckpt_ctx *ctx = *((struct ckpt_ctx**)data);
> +	struct ckpt_eventpoll_items *h;
> +	struct eventpoll *ep;
> +	struct file *epfile = NULL;
> +	int ret, i = 0, remaining_watches;
> +
> +	/*
> +	 * TODO possible kmalloc failure due to too many watches.
> +	 */
> +	h = ckpt_read_obj(ctx, 0,
> +			  sizeof(*h) + max_user_watches*sizeof(h->items[0]));
> +	if (IS_ERR(h))
> +		return PTR_ERR(h);
> +
> +	ret = -EINVAL;
> +	if ((h->h.type != CKPT_HDR_FILE_EPOLL_ITEMS) ||
> +	    (h->h.len < sizeof(*h)))
> +		goto out;
> +
> +	/* Make sure the items match the size we expect */
> +	if (h->num_items != ((h->h.len - sizeof(*h)) / sizeof(h->items[0])))
> +		goto out;
> +
> +	epfile = ckpt_obj_fetch(ctx, h->epfile_objref, CKPT_OBJ_FILE);
> +	if (IS_ERR(epfile)) {
> +		ret = PTR_ERR(epfile);
> +		goto out;
> +	}
> +	ret = -ENOMSG;
> +	if (!is_file_epoll(epfile))
> +		goto out;
> +
> +	ep = epfile->private_data;
> +
> +	ret = -ENOSPC;
> +	remaining_watches = (max_user_watches -
> +			     atomic_read(&ep->user->epoll_watches));
> +	if (h->num_items > remaining_watches)
> +		goto out;
> +
> +	ret = 0;
> +	/* Restore the epoll items/watches */
> +	for (i = 0; !ret && i < h->num_items; i++) {
> +		/*
> +		 * Loop body like multiple epoll_ctl(ep, ADD, event)
> +		 * calls except we've already done much of the checking.
> +		 */
> +		struct epoll_event epev;
> +		struct epitem *epi;
> +		struct file *tfile;
> +
> +		epev.events = h->items[i].events;
> +		epev.data = h->items[i].data;
> +
> +		/* Get the file* for the target file */
> +		if (h->items[i].file_objref <= 0) {
> +			ret = -EINVAL;
> +			break;
> +		}
> +
> +		tfile = ckpt_obj_fetch(ctx, h->items[i].file_objref,
> +				       CKPT_OBJ_FILE);
> +		if (IS_ERR(tfile)) {
> +			ret = PTR_ERR(tfile);
> +			break;
> +		}
> +
> +		/* The target file must support poll */
> +		if (!tfile->f_op || !tfile->f_op->poll) {
> +			ret = -EPERM;
> +			break;
> +		}
> +
> +		/* Cannot add an epoll file descriptor inside itself. */
> +		if (epfile == tfile) {
> +			ret = -EINVAL;
> +			break;
> +		}
> +
> +		mutex_lock(&ep->mtx);
> +		epi = ep_find(ep, tfile, h->items[i].fd);
> +		if (!epi) {
> +			epev.events |= POLLERR | POLLHUP;
> +			ret = ep_insert(ep, &epev, tfile, h->items[i].fd);
> +		} else
> +			ret = -EEXIST;
> +		mutex_unlock(&ep->mtx);
> +	}
> +out:
> +	ckpt_hdr_put(ctx, h);
> +	return ret;
> +}
> +
> +/* TODO confirm that get_current_user() has been restored */

Not sure what you mean.  At this point, the task's credentials
are still the ones used when it called sys_restart().  We won't
update them until the end of the restart operation.  However the
normal file restore operations should have reset the file->f_cred
to the checkpointed value - including hierarchical user namespaces.

> +struct file* ep_file_restore(struct ckpt_ctx *ctx,
> +			     struct ckpt_hdr_file *h)
> +{
> +	struct file *epfile;
> +	int epfd, ret;
> +
> +	if (h->h.type != CKPT_HDR_FILE ||
> +	    h->h.len  != sizeof(*h) ||
> +	    h->f_type != CKPT_FILE_EPOLL)
> +		return ERR_PTR(-EINVAL);
> +
> +	/*
> +	 * TODO Normally h->f_flags contains flags that epoll_create() won't
> +	 * accept. Right now we pass only those flags it will accept here
> +	 * and restore the rest during the "common" file restore. Check
> +	 * to make sure we're not missing anything.
> +	 */
> +	epfd = sys_epoll_create1(h->f_flags & EPOLL_CLOEXEC);
> +	if (epfd < 0)
> +		return ERR_PTR(epfd);
> +	epfile = fget(epfd);
> +	if (!epfile)
> +		return ERR_PTR(-ENOENT); /* TODO pick better error? */
> +
> +	ret = restore_file_common(ctx, epfile, h);
> +	if (ret < 0)
> +		goto fput_out;
> +
> +	/*
> +	 * Now we have the file and file descriptor but the epoll set is empty.
> +	 * Defer restoring the epoll set until we encounter its corresponding
> +	 * items. Note that this effectively counts the number of
> +	 * ckpt_eventpoll_items blocks we should expect -- we rely on the
> +	 * epfile_objref of those blocks to associate them with the proper
> +	 * file.
> +	 */
> +	ret = deferqueue_add(ctx->files_deferq, &ctx, sizeof(ctx),
> +			     ep_items_restore, NULL);
> +	if (ret < 0) {
> +fput_out:
> +		fput(epfile);
> +		epfile = ERR_PTR(ret);
> +	}
> +	sys_close(epfd);
> +	return epfile;
> +}
> +
> +#endif /* CONFIG_CHECKPOINT */
> +


More information about the Containers mailing list