[RFC v2][PATCH 8/9] File descriprtors - dump state

Louis Rilling Louis.Rilling at kerlabs.com
Thu Aug 21 04:06:14 PDT 2008


On Wed, Aug 20, 2008 at 11:07:16PM -0400, Oren Laadan wrote:
>
> Dump the files_struct of a task with 'struct cr_hdr_files', followed by
> all open file descriptors. Since FDs can be shared, they are assigned a
> tag and registered in the object hash.
>
> For each open FD there is a 'struct cr_hdr_fd_ent' with the FD, its tag
> and its close-on-exec property. If the FD is to be saved (first time)
> then this is followed by a 'struct cr_hdr_fd_data' with the FD state.
> Then will come the next FD and so on.
>
> This patch only handles basic FDs - regular files, directories and also
> symbolic links.

[...]

> diff --git a/checkpoint/ckpt_file.c b/checkpoint/ckpt_file.c
> new file mode 100644
> index 0000000..18faaf1
> --- /dev/null
> +++ b/checkpoint/ckpt_file.c
> @@ -0,0 +1,234 @@
> +/*
> + *  Checkpoint file descriptors
> + *
> + *  Copyright (C) 2008 Oren Laadan
> + *
> + *  This file is subject to the terms and conditions of the GNU General Public
> + *  License.  See the file COPYING in the main directory of the Linux
> + *  distribution for more details.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/sched.h>
> +#include <linux/file.h>
> +#include <linux/fdtable.h>
> +
> +#include "ckpt.h"
> +#include "ckpt_hdr.h"
> +#include "ckpt_file.h"
> +
> +#define CR_DEFAULT_FDTABLE  128
> +
> +/**
> + * cr_scan_fds - scan file table and construct array of open fds
> + * @files: files_struct pointer
> + * @fdtable: (output) array of open fds
> + * @return: the number of open fds found
> + *
> + * Allocates the file descriptors array (*fdtable), caller should free
> + */
> +int cr_scan_fds(struct files_struct *files, int **fdtable)
> +{
> +	int i, j, n, max;
> +	struct fdtable *fdt;
> +	int *fdlist;
> +
> +	max = CR_DEFAULT_FDTABLE;
> +
> + repeat:
> +	fdlist = kmalloc(max * sizeof(*fdlist), GFP_KERNEL);
> +	if (!fdlist)
> +		return -ENOMEM;
> +
> +	j = 0;
> +	n = 0;
> +
> +	spin_lock(&files->file_lock);
> +	fdt = files_fdtable(files);
> +	for (;;) {
> +		unsigned long set;
> +		i = j * __NFDBITS;
> +		if (i >= fdt->max_fds)
> +			break;
> +		set = fdt->open_fds->fds_bits[j++];

Why not simply use fcheck_files(files, n) and check if the result is not NULL?

> +		while (set) {
> +			if (set & 1) {
> +				if (unlikely(n == max)) {
> +					spin_unlock(&files->file_lock);
> +					kfree(fdlist);
> +					max *= 2;
> +					if (max < 0)	/* overflow ? */
> +						return -EMFILE;
> +					goto repeat;
> +				}
> +				fdlist[n++] = i;
> +			}
> +			i++;
> +			set >>= 1;
> +		}
> +	}
> +	spin_unlock(&files->file_lock);
> +
> +	*fdtable = fdlist;
> +	return n;
> +}
> +
> +/* cr_write_fd_data - dump the state of a given file pointer */
> +static int cr_write_fd_data(struct cr_ctx *ctx, struct file *file, int ptag)
> +{
> +	struct cr_hdr h;
> +	struct cr_hdr_fd_data *hh = ctx->hbuf;
> +	struct dentry *dent = file->f_dentry;
> +	struct inode *inode = dent->d_inode;
> +	char *fname;
> +	int flen, how, ret;
> +
> +	h.type = CR_HDR_FD_DATA;
> +	h.len = sizeof(*hh);
> +	h.ptag = ptag;
> +
> +	BUG_ON(!inode);
> +
> +	flen = PAGE_SIZE;
> +	fname = cr_fill_fname(&file->f_path, ctx->vfsroot, ctx->tbuf, &flen);
> +	if (IS_ERR(fname))
> +		return PTR_ERR(fname);
> +
> +	hh->f_flags = file->f_flags;
> +	hh->f_mode = file->f_mode;
> +	hh->f_pos = file->f_pos;
> +	hh->f_uid = file->f_uid;
> +	hh->f_gid = file->f_gid;
> +	hh->f_version = file->f_version;
> +	/* FIX: need also file->f_owner */
> +
> +	switch(inode->i_mode & S_IFMT) {
> +	case S_IFREG:
> +		how = CR_FD_FILE;
> +		break;
> +	case S_IFDIR:
> +		how = CR_FD_DIR;
> +		break;
> +	case S_IFLNK:
> +		how = CR_FD_LINK;
> +		break;
> +	default:
> +		return -EBADF;
> +	}
> +
> +	/* FIX: check if the file/dir/link is unlinked */
> +
> +	BUG_ON(!flen);
> +
> +	ret = cr_write_obj(ctx, &h, hh);
> +	if (!ret && flen)
> +		ret = cr_write_str(ctx, fname, flen);
> +
> +	return ret;
> +}
> +
> +/**
> + * cr_write_fd_ent - dump the state of a given file descriptor
> + * @ctx: checkpoint context
> + * @files: files_struct pointer
> + * @fd: file descriptor
> + * + * Save the state of the file descriptor; look up the actual file 
> pointer
> + * in the hash table, and if found save the matching tag, otherwise call
> + * cr_write_fd_data to dump the file pointer too.
> + */
> +static int
> +cr_write_fd_ent(struct cr_ctx *ctx, struct files_struct *files, int fd)
> +{
> +	struct cr_hdr h;
> +	struct cr_hdr_fd_ent *hh = ctx->hbuf;
> +	struct file *file = NULL;
> +	struct fdtable *fdt;
> +	int coe, tag, ret;
> +
> +	/* make sure hh->fd (that is of type __u16) doesn't overflow */
> +	if (fd > USHORT_MAX) {
> +		pr_warning("CR: open files table too big (%d)\n", USHORT_MAX);
> +		return -EMFILE;
> +	}
> +
> +	rcu_read_lock();
> +	fdt = files_fdtable(files);
> +	if (fd < fdt->max_fds)
> +		file = rcu_dereference(fdt->fd[fd]);

You should probably use fcheck_files() instead of copying its code here. I agree
that this means dereferencing files->fdt a second time below, but is it so
performance critical?

> +	if (file) {
> +		coe = FD_ISSET(fd, fdt->close_on_exec);
> +		get_file(file);
> +	}
> +	rcu_read_unlock();
> +
> +	/* sanity check (although this shouldn't happen) */
> +	if (unlikely(!file))
> +		return -EBADF;
> +
> +	ret = cr_obj_add_ptr(ctx, (void *) file, &tag, CR_OBJ_FILE, 0);
> +	cr_debug("fd %d tag %d file %p c-o-e %d)\n", fd, tag, file, coe);
> +
> +	if (ret >= 0) {
> +		int new = ret;
> +
> +		h.type = CR_HDR_FD_ENT;
> +		h.len = sizeof(*hh);
> +		h.ptag = 0;
> + +		hh->tag = tag;
    ^
    |

> +		hh->fd = fd;
> +		hh->close_on_exec = coe;
> +
> +		ret = cr_write_obj(ctx, &h, hh);
> +
> +		/* new==1 if-and-only-if file was new and added to hash */
> +		if (!ret && new)
> +			ret = cr_write_fd_data(ctx, file, tag);
> +	}
> +
> +	fput(file);
> +	return ret;
> +}
> +
> +int cr_write_files(struct cr_ctx *ctx, struct task_struct *t)
> +{
> +	struct cr_hdr h;
> +	struct cr_hdr_files *hh = ctx->hbuf;
> +	struct files_struct *files;
> +	int *fdtable;
> +	int nfds, n, ret;
> +
> +	h.type = CR_HDR_FILES;
> +	h.len = sizeof(*hh);
> +	h.ptag = task_pid_vnr(t);
> +
> +	files = get_files_struct(t);
> +
> +	nfds = cr_scan_fds(files, &fdtable);
> +	if (nfds < 0) {
> +		ret = nfds;
> +		goto out;
> +	}
> + +	hh->tag = 0;
    ^
    |

> +	hh->nfds = nfds;
> +
> +	ret = cr_write_obj(ctx, &h, hh);
> +	if (ret < 0)
> +		goto clean;
> +
> +	cr_debug("nfds %d\n", nfds);
> +	for (n = 0; n < nfds; n++) {
> +		ret = cr_write_fd_ent(ctx, files, n);
> +		if (ret < 0)
> +			break;
> +	}
> +
> + clean:
> +	kfree(fdtable);
> + out:
> +	put_files_struct(files);
> +
> +	return ret;
> +}

[...]

> diff --git a/checkpoint/ckpt_hdr.h b/checkpoint/ckpt_hdr.h
> index a3919cf..a8a37db 100644
> --- a/checkpoint/ckpt_hdr.h
> +++ b/checkpoint/ckpt_hdr.h

[...]

> @@ -114,4 +125,24 @@ struct cr_hdr_vma {
>
>  } __attribute__ ((aligned (8)));
>
> +struct cr_hdr_files {
> +	__u32 tag;	/* sharing identifier */
> +	__u32 nfds;
> +} __attribute__ ((aligned (8)));
> +
> +struct cr_hdr_fd_ent {
> +	__u32 tag;
> +	__u16 fd;
> +	__u16 close_on_exec;
> +} __attribute__ ((aligned (8)));
> +
> +struct cr_hdr_fd_data {
> +	__u16 how;
> +	__u16 f_mode;
> +	__u32 f_flags;
> +	__u32 f_uid, f_gid;

We are not at a 64bits boundary here. Should add one __32 padding or reorder the
fields.

> +	__u64 f_pos;
> +	__u64 f_version;
> +} __attribute__ ((aligned (8)));
> +
>  #endif /* _CHECKPOINT_CKPT_HDR_H_ */
> -- 
> 1.5.4.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo at vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

-- 
Dr Louis Rilling			Kerlabs
Skype: louis.rilling			Batiment Germanium
Phone: (+33|0) 6 80 89 08 23		80 avenue des Buttes de Coesmes
http://www.kerlabs.com/			35700 Rennes
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
Url : http://lists.linux-foundation.org/pipermail/containers/attachments/20080821/21903d35/attachment.pgp 


More information about the Containers mailing list