[RFC v14-rc2][PATCH 17/29] Checkpoint open pipes

Serge E. Hallyn serue at us.ibm.com
Wed Apr 1 12:47:26 PDT 2009


Quoting Oren Laadan (orenl at cs.columbia.edu):
> A pipe is essentially a double-headed inode with a buffer attached to
> it. We checkpoint the pipe buffer only once, as soon as we hit one
> side of the pipe, regardless whether it is read- or write- end.
> 
> To checkpoint a file descriptor that refers to a pipe (either end), we
> first lookup the inode in the hash table:
> 
> If not found, it is the first encounter of this pipe. Besides the file
> descriptor, we also (a) save the pipe data, and (b) register the pipe
> inode in the hash. We save the 'objref' of the inode 'in ->fd_objref'
> of the file descriptor. The file descriptor type becomes CR_FD_PIPE.
> 
> If found, it is the second encounter of this pipe, namely, as we hit
> the other end of the same pipe. In this case we need only record the
> reference ('objref') to the inode that we had saved before, and the
> file descriptor type is changed to CR_FD_OBJREF.
> 
> The type CR_FD_PIPE will indicate to the kernel to create a new pipe;
> since both ends are created at the same time, one end will be used,
> and the other end will be deposited in the hash table for later use.
> The type CR_FD_OBJREF will indicate that the corresponding file
> descriptor is already setup and registered in the hash using the
> '->fd_objref' that it had been assigned.
> 
> The format of the pipe data is as follows:
> 
> struct cr_hdr_fd_pipe {
>        __u32 nr_bufs;
> }
> 
> cr_hdr + cr_hdr_fd_ent
> 	cr_hdr + cr_hdr_fd_data
> 		cr_hdr + cr_hdr_fd_pipe		-> # buffers
> 			cr_hdr + cr_hdr_buffer	-> 1st buffer
> 			cr_hdr + cr_hdr_buffer	-> 2nd buffer
> 			cr_hdr + cr_hdr_buffer	-> 3rd buffer
> 			...
> 
> Changelog[v14]:
>   - Use 'fd_type' instead of 'hh->fd_objref' in cr_write_fd_data()
>   - Revert change to pr_debug(), back to cr_debug()
>   - Discard the 'h.parent' field
>   - Check whether calls to cr_hbuf_get() fail
>   - Test that a pipe's inode != ctx->file's inode to prevent deadlock
> 
> Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>

Acked-by: Serge Hallyn <serue at us.ibm.com>

But:

> ---
>  checkpoint/ckpt_file.c         |    2 +
>  fs/pipe.c                      |  113 ++++++++++++++++++++++++++++++++++++++++
>  include/linux/checkpoint_hdr.h |    8 +++-
>  3 files changed, 122 insertions(+), 1 deletions(-)
> 
> diff --git a/checkpoint/ckpt_file.c b/checkpoint/ckpt_file.c
> index 0fe68bf..dd26b3d 100644
> --- a/checkpoint/ckpt_file.c
> +++ b/checkpoint/ckpt_file.c
> @@ -12,6 +12,7 @@
>  #include <linux/sched.h>
>  #include <linux/file.h>
>  #include <linux/fdtable.h>
> +#include <linux/pipe_fs_i.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
> 
> @@ -72,6 +73,7 @@ int cr_scan_fds(struct files_struct *files, int **fdtable)
>  	return n;
>  }
> 
> +
>  static int cr_write_file_generic(struct cr_ctx *ctx, struct file *file,
>  				 struct cr_hdr_file *hh)
>  {
> diff --git a/fs/pipe.c b/fs/pipe.c
> index 14f502b..0c3f391 100644
> --- a/fs/pipe.c
> +++ b/fs/pipe.c
> @@ -22,6 +22,9 @@
>  #include <asm/uaccess.h>
>  #include <asm/ioctls.h>
> 
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +
>  /*
>   * We use a start+len construction, which provides full use of the 
>   * allocated memory.
> @@ -771,6 +774,113 @@ pipe_rdwr_open(struct inode *inode, struct file *filp)
>  	return 0;
>  }
> 
> +/* cr_write_pipebuf - dump contents of a pipe/fifo (assume i_mutex taken) */
> +static int cr_write_pipebuf(struct cr_ctx *ctx, struct pipe_inode_info *pipe)
> +{
> +	struct cr_hdr h;
> +	void *kbuf, *addr;
> +	int i, ret = 0;
> +
> +	kbuf = (void *) __get_free_page(GFP_KERNEL);
> +	if (!kbuf)
> +		return -ENOMEM;
> +
> +	/* this is a simplified fs/pipe.c:read_pipe() */

pipe_read() actually :)

> +
> +	for (i = 0; i < pipe->nrbufs; i++) {
> +		int nn = (pipe->curbuf + i) & (PIPE_BUFFERS-1);
> +		struct pipe_buffer *pbuf = pipe->bufs + nn;
> +		const struct pipe_buf_operations *ops = pbuf->ops;
> +
> +		ret = ops->confirm(pipe, pbuf);
> +		if (ret < 0)
> +			break;

not that it seems to matter, but pipe_read() returns error
also if ret > 0.

> +
> +		addr = ops->map(pipe, pbuf, 1);
> +		memcpy(kbuf, addr + pbuf->offset, pbuf->len);
> +		ops->unmap(pipe, pbuf, addr);
> +
> +		h.type = CR_HDR_BUFFER;
> +		h.len = pbuf->len;
> +
> +		ret = cr_write_obj(ctx, &h, kbuf);
> +		if (ret < 0)
> +			break;
> +	}
> +
> +	free_page((unsigned long) kbuf);
> +	return ret;
> +}
> +
> +/* cr_write_pipe - dump pipe (assume i_mutex taken) */
> +static int cr_write_pipe(struct cr_ctx *ctx, struct inode *inode)
> +{
> +	struct cr_hdr h;
> +	struct cr_hdr_fd_pipe *hh;
> +	struct pipe_inode_info *pipe = inode->i_pipe;
> +	int ret;
> +
> +	h.type = CR_HDR_FD_PIPE;
> +	h.len = sizeof(*hh);
> +
> +	hh = cr_hbuf_get(ctx, sizeof(*hh));
> +	if (!hh)
> +		return -ENOMEM;
> +
> +	hh->nr_bufs = pipe->nrbufs;
> +
> +	ret = cr_write_obj(ctx, &h, hh);
> +	cr_hbuf_put(ctx, sizeof(*hh));
> +	if (ret < 0)
> +		return ret;
> +
> +	return cr_write_pipebuf(ctx, pipe);
> +}
> +
> +static int pipe_file_checkpoint(struct cr_ctx *ctx,
> +				struct file *file, struct cr_hdr_file *hh)
> +{
> +	struct cr_hdr h;
> +	struct inode *inode = file->f_dentry->d_inode;
> +	int new, objref;
> +	int ret;
> +
> +	/*
> +	 * We take the inode's mutex and later will call vfs_write(),
> +	 * which also takes an inode's mutex. To avoid deadlock, make
> +	 * sure that the two inodes are distinct.
> +	 */
> +	if (ctx->file->f_dentry->d_inode == inode) {
> +		pr_warning("c/r: writing to pipe that is checkpointed "
> +			   "may result in a deadlock ... aborting\n");
> +		return -EDEADLK;
> +	}
> +
> +	h.type = CR_HDR_FILE;
> +	h.len = sizeof(*hh);
> +
> +	new = cr_obj_add_ptr(ctx, inode, &objref, CR_OBJ_INODE, 0);
> +	cr_debug("objref %d inode %p new %d\n", objref, inode, new);
> +	if (new < 0)
> +		return new;
> +
> +	hh->fd_type = (new ? CR_FD_PIPE : CR_FD_OBJREF);

The git commit msg has a good explanation, but it's worth a comment
in the code as well, that on first instance we call it
CR_FD_PIPE and second time CR_FD_OBJREF.

> +	hh->fd_objref = objref;
> +
> +	ret = cr_write_obj(ctx, &h, hh);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (new) {
> +		mutex_lock(&inode->i_mutex);
> +		ret = cr_write_pipe(ctx, inode);
> +		mutex_unlock(&inode->i_mutex);
> +	}
> +
> +	return ret;
> +}
> +
> +
>  /*
>   * The file_operations structs are not static because they
>   * are also used in linux/fs/fifo.c to do operations on FIFOs.
> @@ -787,6 +897,7 @@ const struct file_operations read_pipefifo_fops = {
>  	.open		= pipe_read_open,
>  	.release	= pipe_read_release,
>  	.fasync		= pipe_read_fasync,
> +	.checkpoint	= pipe_file_checkpoint,
>  };
> 
>  const struct file_operations write_pipefifo_fops = {
> @@ -799,6 +910,7 @@ const struct file_operations write_pipefifo_fops = {
>  	.open		= pipe_write_open,
>  	.release	= pipe_write_release,
>  	.fasync		= pipe_write_fasync,
> +	.checkpoint	= pipe_file_checkpoint,
>  };
> 
>  const struct file_operations rdwr_pipefifo_fops = {
> @@ -812,6 +924,7 @@ const struct file_operations rdwr_pipefifo_fops = {
>  	.open		= pipe_rdwr_open,
>  	.release	= pipe_rdwr_release,
>  	.fasync		= pipe_rdwr_fasync,
> +	.checkpoint	= pipe_file_checkpoint,
>  };
> 
>  struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 9ad845d..ce5d880 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -57,6 +57,7 @@ enum {
>  	CR_HDR_FD_TABLE = 301,
>  	CR_HDR_FD_ENT,
>  	CR_HDR_FILE,
> +	CR_HDR_FD_PIPE,
> 
>  	CR_HDR_TAIL = 5001
>  };
> @@ -152,7 +153,8 @@ struct cr_hdr_fd_ent {
>  /* fd types */
>  enum  fd_type {
>  	CR_FD_OBJREF = 1,
> -	CR_FD_GENERIC
> +	CR_FD_GENERIC,
> +	CR_FD_PIPE,
>  };
> 
>  struct cr_hdr_file {
> @@ -165,4 +167,8 @@ struct cr_hdr_file {
>  	__u64 f_version;
>  } __attribute__((aligned(8)));
> 
> +struct cr_hdr_fd_pipe {
> +	__s32 nr_bufs;
> +} __attribute__((aligned(8)));
> +
>  #endif /* _CHECKPOINT_CKPT_HDR_H_ */
> -- 
> 1.5.4.3


More information about the Containers mailing list