[PATCH] c/r: Add AF_UNIX support (v3)

Oren Laadan orenl at cs.columbia.edu
Wed Jun 24 19:30:59 PDT 2009



Dan Smith wrote:
> This patch adds basic checkpoint/restart support for AF_UNIX sockets.  It
> has been tested with a single and multiple processes, and with data inflight
> at the time of checkpoint.  It supports socketpair()s, path-based, and
> abstract sockets.
> 
> Changes in v3:
>   - Move sock_file_checkpoint() above sock_file_restore()
>   - Change __sock_file_*() functions to do_sock_file_*()
>   - Adjust some of the struct cr_hdr_socket alignment
>   - Improve the sock_copy_buffers() algorithm to avoid locking the source
>     queue for the entire operation
>   - Fix alignment in the socket header struct(s)
>   - Move the per-protocol structure (ckpt_hdr_socket_un) out of the
>     common socket header and read/write it separately
>   - Fix missing call to sock_cptrst() in restore path
>   - Break out the socket joining into another function
>   - Fix failure to restore the socket address thus fixing getname()
>   - Check the state values on restart
>   - Fix case of state being TCP_CLOSE, which allows dgram sockets to be
>     properly connected (if appropriate) to their peer and maintain the
>     sockaddr for getname() operation
>   - Fix restoring a listening socket that has been unlink()'d
>   - Fix checkpointing sockets with an in-flight FD-passing SKB.  Fail
>     with EBUSY.
>   - Fix checkpointing listening sockets with an unaccepted connection.
>     Fail with EBUSY.
> 
> Changes in v2:
>   - Change GFP_KERNEL to GFP_ATOMIC in sock_copy_buffers() (this seems
>     to be rather common in other uses of skb_copy())
>   - Move the ckpt_hdr_socket structure definition to linux/socket.h
>   - Fix whitespace issue
>   - Move sock_file_checkpoint() to net/socket.c for symmetry
> 
> Cc: Oren Laaden <orenl at cs.columbia.edu>
> Cc: Alexey Dobriyan <adobriyan at gmail.com>
> Signed-off-by: Dan Smith <danms at us.ibm.com>
> ---
>  checkpoint/files.c             |    7 +
>  checkpoint/objhash.c           |   27 +++
>  include/linux/checkpoint_hdr.h |   13 +
>  include/linux/socket.h         |   59 +++++
>  include/net/sock.h             |    9 +
>  net/Makefile                   |    2 +
>  net/checkpoint.c               |  493 ++++++++++++++++++++++++++++++++++++++++
>  net/socket.c                   |   84 +++++++
>  8 files changed, 694 insertions(+), 0 deletions(-)
>  create mode 100644 net/checkpoint.c
> 
> diff --git a/checkpoint/files.c b/checkpoint/files.c
> index b264e40..bb2cca0 100644
> --- a/checkpoint/files.c
> +++ b/checkpoint/files.c
> @@ -21,6 +21,7 @@
>  #include <linux/syscalls.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
> +#include <net/sock.h>
>  
>  
>  /**************************************************************************
> @@ -440,6 +441,12 @@ static struct restore_file_ops restore_file_ops[] = {
>  		.file_type = CKPT_FILE_PIPE,
>  		.restore = pipe_file_restore,
>  	},
> +	/* socket */
> +	{
> +		.file_name = "SOCKET",
> +		.file_type = CKPT_FILE_SOCKET,
> +		.restore = sock_file_restore,
> +	},
>  };
>  
>  static struct file *do_restore_file(struct ckpt_ctx *ctx)
> diff --git a/checkpoint/objhash.c b/checkpoint/objhash.c
> index 045a920..7819e5e 100644
> --- a/checkpoint/objhash.c
> +++ b/checkpoint/objhash.c
> @@ -19,6 +19,7 @@
>  #include <linux/ipc_namespace.h>
>  #include <linux/checkpoint.h>
>  #include <linux/checkpoint_hdr.h>
> +#include <net/sock.h>
>  
>  struct ckpt_obj;
>  struct ckpt_obj_ops;
> @@ -177,6 +178,22 @@ static int obj_ipc_ns_users(void *ptr)
>  	return atomic_read(&((struct ipc_namespace *) ptr)->count);
>  }
>  
> +static int obj_sock_grab(void *ptr)
> +{
> +	sock_hold((struct sock *) ptr);
> +	return 0;
> +}
> +
> +static void obj_sock_drop(void *ptr)
> +{
> +	sock_put((struct sock *) ptr);
> +}
> +
> +static int obj_sock_users(void *ptr)
> +{
> +	return atomic_read(&((struct sock *) ptr)->sk_refcnt);
> +}
> +
>  static struct ckpt_obj_ops ckpt_obj_ops[] = {
>  	/* ignored object */
>  	{
> @@ -254,6 +271,16 @@ static struct ckpt_obj_ops ckpt_obj_ops[] = {
>  		.checkpoint = checkpoint_bad,
>  		.restore = restore_bad,
>  	},
> +	/* sock object */
> +	{
> +		.obj_name = "SOCKET",
> +		.obj_type = CKPT_OBJ_SOCK,
> +		.ref_drop = obj_sock_drop,
> +		.ref_grab = obj_sock_grab,
> +		.ref_users = obj_sock_users,
> +		.checkpoint = sock_file_checkpoint,
> +		.restore = sock_file_restore,
> +	},
>  };
>  
>  
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index cd427d8..60870df 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -76,6 +76,12 @@ enum {
>  	CKPT_HDR_IPC_MSG_MSG,
>  	CKPT_HDR_IPC_SEM,
>  
> + 	CKPT_HDR_FD_SOCKET = 601,
> + 	CKPT_HDR_SOCKET,
> + 	CKPT_HDR_SOCKET_BUFFERS,
> + 	CKPT_HDR_SOCKET_BUFFER,
> +	CKPT_HDR_SOCKET_UN,
> +
>  	CKPT_HDR_TAIL = 9001,
>  
>  	CKPT_HDR_ERROR = 9999,
> @@ -103,6 +109,7 @@ enum obj_type {
>  	CKPT_OBJ_NS,
>  	CKPT_OBJ_UTS_NS,
>  	CKPT_OBJ_IPC_NS,
> +	CKPT_OBJ_SOCK,
>  	CKPT_OBJ_MAX
>  };
>  
> @@ -225,6 +232,7 @@ enum file_type {
>  	CKPT_FILE_IGNORE = 0,
>  	CKPT_FILE_GENERIC,
>  	CKPT_FILE_PIPE,
> +	CKPT_FILE_SOCKET,
>  	CKPT_FILE_MAX
>  };
>  
> @@ -248,6 +256,11 @@ struct ckpt_hdr_file_pipe {
>  	__s32 pipe_objref;
>  } __attribute__((aligned(8)));
>  
> +struct ckpt_hdr_file_socket {
> +	struct ckpt_hdr_file common;
> +	__u16 family;
> +} __attribute__((aligned(8)));
> +
>  struct ckpt_hdr_file_pipe_state {
>  	struct ckpt_hdr h;
>  	__s32 pipe_len;
> diff --git a/include/linux/socket.h b/include/linux/socket.h
> index 421afb4..3b5be70 100644
> --- a/include/linux/socket.h
> +++ b/include/linux/socket.h
> @@ -23,6 +23,7 @@ struct __kernel_sockaddr_storage {
>  #include <linux/uio.h>			/* iovec support		*/
>  #include <linux/types.h>		/* pid_t			*/
>  #include <linux/compiler.h>		/* __user			*/
> +#include <linux/checkpoint_hdr.h>	/* ckpt_hdr			*/
>  
>  #ifdef __KERNEL__
>  # ifdef CONFIG_PROC_FS
> @@ -323,5 +324,63 @@ extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *ka
>  extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
>  
>  #endif
> +
> +struct ckpt_hdr_socket_un {
> +	struct ckpt_hdr h;
> +	__u32 this;
> +	__u32 peer;
> +	__u8 linked;
> +} __attribute__ ((aligned(8)));
> +
> +struct ckpt_hdr_socket {
> +	struct ckpt_hdr h;
> +
> +	struct ckpt_socket { /* struct socket */
> +		__u64 flags;
> +		__u8 state;
> +	} socket __attribute__ ((aligned(8)));
> +
> +	struct ckpt_sock_common { /* struct sock_common */
> +		__u32 bound_dev_if;
> +		__u16 family;
> +		__u8 state;
> +		__u8 reuse;
> +	} sock_common __attribute__ ((aligned(8)));
> +
> +	struct ckpt_sock { /* struct sock */
> +		__u64 rcvlowat;
> +		__u64 rcvtimeo;
> +		__u64 sndtimeo;
> +		__u64 flags;
> +		__u64 lingertime;
> +
> +		__u32 err;
> +		__u32 err_soft;
> +		__u32 priority;
> +		__s32 rcvbuf;
> +		__s32 sndbuf;
> +		__u16 type;
> +		__u16 backlog;
> +
> +		__u8 protocol;
> +		__u8 state;
> +		__u8 shutdown;
> +		__u8 userlocks;
> +		__u8 no_check;
> +	} sock __attribute__ ((aligned(8)));
> +
> +	/* common to all supported families */
> +	__u32 laddr_len;
> +	__u32 raddr_len;
> +	struct sockaddr laddr;
> +	struct sockaddr raddr;
> +
> +} __attribute__ ((aligned(8)));
> +
> +struct ckpt_hdr_socket_buffer {
> +	struct ckpt_hdr h;
> +	__u32 skb_count;
> +} __attribute__ ((aligned(8)));
> +
>  #endif /* not kernel and not glibc */
>  #endif /* _LINUX_SOCKET_H */
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 4bb1ff9..a8b6af1 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1482,4 +1482,13 @@ extern int sysctl_optmem_max;
>  extern __u32 sysctl_wmem_default;
>  extern __u32 sysctl_rmem_default;
>  
> +/* Checkpoint/Restart Functions */
> +struct ckpt_ctx;
> +struct ckpt_hdr_socket;
> +extern int sock_file_checkpoint(struct ckpt_ctx *, void *);
> +extern void *sock_file_restore(struct ckpt_ctx *);
> +extern struct socket *do_sock_file_restore(struct ckpt_ctx *,
> +					   struct ckpt_hdr_socket *);
> +extern int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file);
> +
>  #endif	/* _SOCK_H */
> diff --git a/net/Makefile b/net/Makefile
> index 9e00a55..c226ed1 100644
> --- a/net/Makefile
> +++ b/net/Makefile
> @@ -65,3 +65,5 @@ ifeq ($(CONFIG_NET),y)
>  obj-$(CONFIG_SYSCTL)		+= sysctl_net.o
>  endif
>  obj-$(CONFIG_WIMAX)		+= wimax/
> +
> +obj-$(CONFIG_CHECKPOINT)	+= checkpoint.o
> diff --git a/net/checkpoint.c b/net/checkpoint.c
> new file mode 100644
> index 0000000..fd47485
> --- /dev/null
> +++ b/net/checkpoint.c
> @@ -0,0 +1,493 @@
> +/*
> + *  Copyright 2009 IBM Corporation
> + *
> + *  Author: Dan Smith <danms at us.ibm.com>
> + *
> + *  This program is free software; you can redistribute it and/or
> + *  modify it under the terms of the GNU General Public License as
> + *  published by the Free Software Foundation, version 2 of the
> + *  License.
> + */
> +
> +#include <linux/socket.h>
> +#include <linux/mount.h>
> +#include <linux/file.h>
> +
> +#include <net/af_unix.h>
> +#include <net/tcp_states.h>
> +
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +#include <linux/namei.h>
> +
> +/* Size of an empty struct sockaddr_un */
> +#define UNIX_LEN_EMPTY 2
> +
> +static int sock_copy_buffers(struct sk_buff_head *from, struct sk_buff_head *to)
> +{
> +	int count = 0;
> +	struct sk_buff *skb;
> +
> +	skb_queue_walk(from, skb) {
> +		struct sk_buff *tmp;
> +
> +		tmp = dev_alloc_skb(skb->len);
> +		if (!tmp)
> +			return -ENOMEM;
> +
> +		spin_lock(&from->lock);
> +		skb_morph(tmp, skb);
> +		spin_unlock(&from->lock);
> +
> +		skb_queue_tail(to, tmp);
> +		count++;
> +	}
> +
> +	return count;
> +}
> +
> +static int __sock_write_buffers(struct ckpt_ctx *ctx,
> +				struct sk_buff_head *queue)
> +{
> +	struct sk_buff *skb;
> +	int ret = 0;
> +
> +	skb_queue_walk(queue, skb) {
> +		if (UNIXCB(skb).fp) {
> +			ckpt_debug("unsupported fd-passing skb found\n");
> +			return -EBUSY;
> +		}
> +
> +		ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
> +					  CKPT_HDR_SOCKET_BUFFER);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static int sock_write_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	struct sk_buff_head tmpq;
> +	int ret = -ENOMEM;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
> +	if (!h)
> +		goto out;
> +
> +	skb_queue_head_init(&tmpq);
> +
> +	h->skb_count = sock_copy_buffers(queue, &tmpq);
> +	if (h->skb_count < 0) {
> +		ret = h->skb_count;
> +		goto out;
> +	}
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (!ret)
> +		ret = __sock_write_buffers(ctx, &tmpq);
> +
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	__skb_queue_purge(&tmpq);
> +
> +	return ret;
> +}
> +
> +static int sock_un_checkpoint(struct ckpt_ctx *ctx,
> +			      struct sock *sock,
> +			      struct ckpt_hdr_socket *h)
> +{
> +	struct unix_sock *sk = unix_sk(sock);
> +	struct unix_sock *pr = unix_sk(sk->peer);
> +	struct ckpt_hdr_socket_un *un;
> +	int new;
> +	int ret = -ENOMEM;
> +
> +	if ((sock->sk_state == TCP_LISTEN) &&
> +	    !skb_queue_empty(&sock->sk_receive_queue)) {
> +		ckpt_debug("listening socket has unaccepted peers");
> +		return -EBUSY;
> +	}
> +
> +	un = ckpt_hdr_get_type(ctx, sizeof(*un), CKPT_HDR_SOCKET_UN);
> +	if (!un)
> +		goto out;
> +
> +	un->linked = sk->dentry && (sk->dentry->d_inode->i_nlink > 0);
> +
> +	un->this = ckpt_obj_lookup_add(ctx, sk, CKPT_OBJ_SOCK, &new);
> +	if (un->this < 0)
> +		goto out;
> +
> +	if (sk->peer)
> +		un->peer = ckpt_obj_lookup_add(ctx, pr, CKPT_OBJ_SOCK, &new);
> +	else
> +		un->peer = 0;
> +
> +	if (un->peer < 0) {
> +		ret = un->peer;
> +		goto out;
> +	}
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) un);
> + out:
> +	return ret;
> +}
> +
> +static int sock_cptrst(struct ckpt_ctx *ctx,
> +		       struct sock *sock,
> +		       struct ckpt_hdr_socket *h,
> +		       int op)
> +{
> +	if (sock->sk_socket) {
> +		CKPT_COPY(op, h->socket.flags, sock->sk_socket->flags);
> +		CKPT_COPY(op, h->socket.state, sock->sk_socket->state);
> +	}
> +
> +	CKPT_COPY(op, h->sock_common.reuse, sock->sk_reuse);
> +	CKPT_COPY(op, h->sock_common.bound_dev_if, sock->sk_bound_dev_if);
> +	CKPT_COPY(op, h->sock_common.family, sock->sk_family);
> +

Value of some fields below needs to be verified/checked:

> +	CKPT_COPY(op, h->sock.shutdown, sock->sk_shutdown);

Allow on SHUTDOWN_MASK (because in some places the code tests a flag
and in others a value - so it could be that {RCV,SND}_SHUTDOWN are
true, but != SHUTDOWN_MASK, which is unexpected by the code). Also
for future extensibility.

> +	CKPT_COPY(op, h->sock.userlocks, sock->sk_userlocks)

Here too, allow possible flags only.

> +	CKPT_COPY(op, h->sock.no_check, sock->sk_no_check);
> +	CKPT_COPY(op, h->sock.protocol, sock->sk_protocol);
> +	CKPT_COPY(op, h->sock.err, sock->sk_err);
> +	CKPT_COPY(op, h->sock.err_soft, sock->sk_err_soft);

I'm unsure if we need ensure that an error value is in range...
can it do harm to pass an out-of-range value back to the caller ?

> +	CKPT_COPY(op, h->sock.priority, sock->sk_priority);
> +	CKPT_COPY(op, h->sock.rcvlowat, sock->sk_rcvlowat);

This is 'int' in struct sock, but assignment may make it negative ?

> +	CKPT_COPY(op, h->sock.backlog, sock->sk_max_ack_backlog);

Same.

> +	CKPT_COPY(op, h->sock.rcvtimeo, sock->sk_rcvtimeo);
> +	CKPT_COPY(op, h->sock.sndtimeo, sock->sk_sndtimeo);

These two too. Perhaps refactor and use sock_set_timeout() ?

> +	CKPT_COPY(op, h->sock.rcvbuf, sock->sk_rcvbuf);
> +	CKPT_COPY(op, h->sock.sndbuf, sock->sk_sndbuf);

These should be checked for sign-ness, limited in magnitude (e.g.
<= sysctl_wmem_max), checked against a minimum (see setting it
in net/core/sock.c), and finally perhaps correlated with a flag
against sk_userlocks ?

> +	CKPT_COPY(op, h->sock.flags, sock->sk_flags);
> +	CKPT_COPY(op, h->sock.lingertime, sock->sk_lingertime);

Perhaps here too ?

> +	CKPT_COPY(op, h->sock.type, sock->sk_type);
> +	CKPT_COPY(op, h->sock.state, sock->sk_state);
> +

Do we need the tests below also during checkpoint ?

Perhaps place all the sanity tests in a separate routine that
is called from here as in:

	if (op == CKPT_RST)
		return sock_validate_data(...);
	else
		return 0;

> +	if ((h->socket.state == SS_CONNECTED) &&
> +	    (h->sock.state != TCP_ESTABLISHED)) {
> +		ckpt_debug("socket/sock in inconsistent state: %i/%i\n",
> +			   h->socket.state, h->sock.state);
> +		return -EINVAL;
> +	} else if ((h->socket.state == SS_UNCONNECTED) &&
> +		   (h->sock.state == TCP_ESTABLISHED)) {
> +		ckpt_debug("socket/sock in inconsistent state: %i/%i\n",
> +			   h->socket.state, h->sock.state);
> +		return -EINVAL;
> +	} else if ((h->sock.state < TCP_ESTABLISHED) ||
> +		   (h->sock.state >= TCP_MAX_STATES)) {
> +		ckpt_debug("sock in invalid state: %i\n", h->sock.state);
> +		return -EINVAL;
> +	} else if ((h->socket.state < SS_FREE) ||
> +		   (h->socket.state > SS_DISCONNECTING)) {
> +		ckpt_debug("socket in invalid state: %i\n", h->socket.state);
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +int do_sock_file_checkpoint(struct ckpt_ctx *ctx, struct file *file)
> +{
> +	struct socket *socket = file->private_data;
> +	struct sock *sock = socket->sk;
> +	struct ckpt_hdr_socket *h;
> +	int ret = 0;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	h->laddr_len = sizeof(h->laddr);
> +	h->raddr_len = sizeof(h->raddr);
> +
> +	if (socket->ops->getname(socket, &h->laddr, &h->laddr_len, 0)) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	if ((sock->sk_state != TCP_LISTEN) &&
> +	    (socket->ops->getname(socket, &h->raddr, &h->raddr_len, 1))) {
> +		if (sock->sk_type != SOCK_DGRAM) {
> +			ret = -EINVAL;
> +			goto out;
> +		}

I suspect this fails for !SOCK_DGRAM that isn't connected, e.g. newly
created.

> +		h->raddr_len = 0;

Hmmm.. do you need to ensure that this value is consistent with the
value of ->peer ?  (at restart)

> +	}
> +
> +	sock_cptrst(ctx, sock, h, CKPT_CPT);
> +
> +	if (sock->sk_family == AF_UNIX) {
> +		ret = sock_un_checkpoint(ctx, sock, h);
> +		if (ret)
> +			goto out;
> +	} else {
> +		ckpt_debug("unsupported socket type %i\n",
> +			   sock->sk_family);

I'd suggest a ckpt_write_err() here (and in other key places) to
explain to the user what caused the checkpoint to fail.

> +		ret = EINVAL;
> +		goto out;
> +	}
> +
> +	ret = sock_write_buffers(ctx, &sock->sk_receive_queue);
> +	if (ret)
> +		goto out;
> +
> +	ret = sock_write_buffers(ctx, &sock->sk_write_queue);
> +	if (ret)
> +		goto out;

Any reason to write the buffers of a listening socket ?  they only
contain not-yet-accepted sockest, which aren't supported anyway.

And while there is not support for not-yet-accepted sockets, maybe
add an explicit test (if LISTEN and has skb's) and fail with an
error, as well as call ckpt_write_err().

> +
> +	/* FIXME: write out-of-order queue for TCP */

I'd prefer failing if such data exists...

> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ret;
> +}
> +
> +static int sock_read_buffer(struct ckpt_ctx *ctx,
> +			    struct sock *sock,
> +			    struct sk_buff **skb)
> +{
> +	struct ckpt_hdr *h;
> +	int ret = 0;
> +	int len;
> +
> +	h = ckpt_read_buf_type(ctx, SKB_MAX_ALLOC, CKPT_HDR_SOCKET_BUFFER);
> +	if (IS_ERR(h))
> +		return PTR_ERR(h);
> +
> +	len = h->len - sizeof(*h);
> +
> +	*skb = sock_alloc_send_skb(sock, len, MSG_DONTWAIT, &ret);
> +	if (*skb == NULL) {
> +		ret = ENOMEM;
> +		goto out;
> +	}
> +
> +	memcpy(skb_put(*skb, len), (char *)(h + 1), len);

Can we avoid this memory copy ?
Perhaps add _ckpt_read_hdr() and _ckpt_read_payload() helpers ?

> + out:
> +	ckpt_hdr_put(ctx, h);
> +	return ret;
> +}
> +
> +static int sock_read_buffers(struct ckpt_ctx *ctx,
> +			     struct sock *sock,
> +			     struct sk_buff_head *queue)
> +{
> +	struct ckpt_hdr_socket_buffer *h;
> +	int ret = 0;
> +	int i;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFERS);
> +	if (IS_ERR(h)) {
> +		ret = PTR_ERR(h);
> +		goto out;
> +	}
> +
> +	for (i = 0; i < h->skb_count; i++) {
> +		struct sk_buff *skb = NULL;
> +
> +		ret = sock_read_buffer(ctx, sock, &skb);
> +		if (ret)
> +			break;
> +
> +		skb_queue_tail(queue, skb);
> +	}

Make sure that total length is within limits, or this is an opening
for DoS.

> + out:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ret;
> +}
> +
> +static struct unix_address *sock_un_makeaddr(struct sockaddr_un *sun_addr,
> +					     unsigned len)
> +{
> +	struct unix_address *addr;
> +
> +	addr = kmalloc(sizeof(*addr) + len, GFP_KERNEL);
> +	if (!addr)
> +		return ERR_PTR(ENOMEM);

I checked below, but you don't examine @len before calling this
function. So a DoS is possible by exhausting kernel memory with
these kmalloc()s.

> +
> +	memcpy(addr->name, sun_addr, len);
> +	addr->len = len;
> +	atomic_set(&addr->refcnt, 1);
> +
> +	return addr;
> +}
> +
> +static int sock_un_join(struct sock *a,
> +			struct sock *b,
> +			struct ckpt_hdr_socket *h)
> +{
> +	struct unix_address *addr;
> +
> +	sock_hold(a);
> +	sock_hold(b);
> +
> +	unix_sk(a)->peer = b;
> +	unix_sk(b)->peer = a;
> +
> +	a->sk_peercred.pid = task_tgid_vnr(current);
> +	current_euid_egid(&a->sk_peercred.uid,
> +			  &a->sk_peercred.gid);
> +
> +	b->sk_peercred.pid = task_tgid_vnr(current);
> +	current_euid_egid(&b->sk_peercred.uid,
> +			  &b->sk_peercred.gid);
> +

Hmmm... a socket that is connected may also be bind(), so I'm
not sure the test below is always correct ?

Also, this is the place to verify that the {l,r}addr_len make
sense, to prevent the DoS mentioned above.

> +	if (h->laddr_len == UNIX_LEN_EMPTY)
> +		addr = sock_un_makeaddr((struct sockaddr_un *)&h->raddr,
> +					h->raddr_len);
> +	else
> +		addr = sock_un_makeaddr((struct sockaddr_un *)&h->laddr,
> +					h->laddr_len);
> +	if (IS_ERR(addr))
> +	    return PTR_ERR(addr);
> +
> +	atomic_inc(&addr->refcnt); /* Held by both ends */
> +	unix_sk(a)->addr = unix_sk(b)->addr = addr;

A thought: do we want to restore sharing of addresses at restart ?
For instance, all sockets accepted from a single listening socket
share their local addresses.

> +
> +	return 0;
> +}
> +
> +static int sock_un_unlink(const char *name)
> +{
> +	struct path spath;
> +	struct path ppath;
> +	int ret;
> +
> +	ret = kern_path(name, 0, &spath);
> +	if (ret)
> +		return ret;
> +
> +	ret = kern_path(name, LOOKUP_PARENT, &ppath);
> +	if (ret)
> +		goto out_s;
> +
> +	if (!spath.dentry) {
> +		ckpt_debug("No dentry found for %s\n", name);
> +		ret = -ENOENT;
> +		goto out_p;
> +	}
> +
> +	if (!ppath.dentry || !ppath.dentry->d_inode) {
> +		ckpt_debug("No inode for parent of %s\n", name);
> +		ret = -ENOENT;
> +		goto out_p;
> +	}
> +
> +	ret = vfs_unlink(ppath.dentry->d_inode, spath.dentry);
> + out_p:
> +	path_put(&ppath);
> + out_s:
> +	path_put(&spath);
> +
> +	return ret;
> +}
> +
> +static int sock_un_restart(struct ckpt_ctx *ctx,
> +			   struct ckpt_hdr_socket *h,
> +			   struct socket *socket)
> +{
> +	struct sock *peer;
> +	struct ckpt_hdr_socket_un *un;
> +	int ret = 0;
> +
> +	un = ckpt_read_obj_type(ctx, sizeof(*un), CKPT_HDR_SOCKET_UN);
> +	if (IS_ERR(un))
> +		return PTR_ERR(un);
> +
> +	if (un->peer < 0) {
> +		ret = -EINVAL;
> +		goto out;
> +	}
> +
> +	peer = ckpt_obj_fetch(ctx, un->peer, CKPT_OBJ_SOCK);
> +
> +	if ((h->sock.state == TCP_ESTABLISHED) ||
> +	    (h->sock.state == TCP_CLOSE)) {

Here below, @peer may hold ERR_PTR(-EINVAL) meaning not found,
or ERR_PTR(-ENOMSG) meaning type mismatch and bad input. You
need to test for the latter.

> +		if (!IS_ERR(peer)) {
> +			/* We're last, so join with peer */
> +			struct sock *this = socket->sk;
> +
> +			ret = sock_un_join(this, peer, h);
> +		} else {

So this should be:
		} else if (PTR_ERR(peer) == -EINVAL) {

> +			/* We're first, so add our socket and wait for peer */
> +			ckpt_obj_insert(ctx, socket->sk, un->this,
> +					CKPT_OBJ_SOCK);
		} else {
			ret = PTR_ERR(peer);
> +		}

Also, need to check ret value of ckpt_obj_insert().

> +
> +	} else if (h->sock.state == TCP_LISTEN) {
> +		ret = socket->ops->bind(socket,
> +					(struct sockaddr *)&h->laddr,
> +					h->laddr_len);

Three comments:

1) Non-listening sockets may also need bind(). Especially SOCK_DGRAM.

2) What happens if s1 is checkpointed before s2 in this scenario:
	s1 = socket(); bind(s1, addr); unlink(addr);
	s2 = socket(); bind(s1, addr);

I'd suggest that a socket that is did bind() and that corresponding
inode was later unlink()ed, will not be re-bind at all. After all,
it is unreachable in terms of connect() syscall.

3) If the sockets path is relative, you can't blindly rely on the socket
address reported by ->getname(), because the original bind() occured
_relative_ to the task's current-working-directory at the time of the
socket() syscall.

The cwd of the restarting task may differ. Moreover, the resulting
path: cwd + sockaddr may exceed 108 bytes. Lastly, getsockname()
reports the original, relative pathname anyways.

So if the address doesn't begin with a '/', you need to also save
the cwd and on restart, go there before caling bind().

> +		if (ret < 0)
> +			goto out;
> +
> +		ret = socket->ops->listen(socket, h->sock.backlog);
> +		if (ret < 0)
> +			goto out;
> +
> +		/* We can unlink this blindly because we just created it
> +		 * above and would have failed already without proper
> +		 * permissions
> +		 */
> +		if (!un->linked) {
> +			struct sockaddr_un *sun =
> +				(struct sockaddr_un *)&h->laddr;
> +			ret = sock_un_unlink(sun->sun_path);

Here, too, need to be aware of the cwd of the restarting task.

> +		}
> +	} else
> +		ckpt_debug("unsupported UNIX socket state %i\n", h->sock.state);
> + out:
> +	ckpt_hdr_put(ctx, un);
> +	return ret;
> +}
> +
> +struct socket *do_sock_file_restore(struct ckpt_ctx *ctx,
> +				    struct ckpt_hdr_socket *h)
> +{
> +	struct socket *socket;
> +	int ret;
> +
> +	ret = sock_create(h->sock_common.family, h->sock.type, 0, &socket);
> +	if (ret < 0)
> +		return ERR_PTR(ret);
> +
> +	if (h->sock_common.family == AF_UNIX) {
> +		ret = sock_un_restart(ctx, h, socket);
> +		ckpt_debug("sock_un_restart: %i\n", ret);
> +	} else {
> +		ckpt_debug("unsupported family %i\n", h->sock_common.family);
> +		ret = -EINVAL;
> +	}
> +
> +	if (ret)
> +		goto out;
> +
> +	sock_cptrst(ctx, socket->sk, h, CKPT_RST);

Check return value ?

> +
> +	ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_receive_queue);
> +	if (ret)
> +		goto out;
> +
> +	ret = sock_read_buffers(ctx, socket->sk, &socket->sk->sk_write_queue);
> +	if (ret)
> +		goto out;

Only read buffers if really need to - i.e. not listening socket,
not closed socket.

> + out:
> +	if (ret) {
> +		sock_release(socket);
> +		socket = ERR_PTR(ret);
> +	}
> +
> +	return socket;
> +}
> +
> diff --git a/net/socket.c b/net/socket.c
> index 791d71a..be8c562 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -96,6 +96,9 @@
>  #include <net/sock.h>
>  #include <linux/netfilter.h>
>  
> +#include <linux/checkpoint.h>
> +#include <linux/checkpoint_hdr.h>
> +
>  static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
>  static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
>  			 unsigned long nr_segs, loff_t pos);
> @@ -140,6 +143,9 @@ static const struct file_operations socket_file_ops = {
>  	.sendpage =	sock_sendpage,
>  	.splice_write = generic_splice_sendpage,
>  	.splice_read =	sock_splice_read,
> +#ifdef CONFIG_CHECKPOINT
> +	.checkpoint =   sock_file_checkpoint,
> +#endif
>  };
>  
>  /*
> @@ -415,6 +421,84 @@ int sock_map_fd(struct socket *sock, int flags)
>  	return fd;
>  }
>  
> +int sock_file_checkpoint(struct ckpt_ctx *ctx, void *ptr)
> +{
> +	struct ckpt_hdr_file_socket *h;
> +	int ret;
> +	struct file *file = ptr;
> +
> +	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_FILE);
> +	if (!h)
> +		return -ENOMEM;
> +
> +	h->common.f_type = CKPT_FILE_SOCKET;
> +
> +	ret = checkpoint_file_common(ctx, file, &h->common);
> +	if (ret < 0)
> +		goto out;
> +	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
> +	if (ret < 0)
> +		goto out;
> +
> +	ret = do_sock_file_checkpoint(ctx, file);
> + out:
> +	ckpt_hdr_put(ctx, h);
> +	return ret;
> +}
> +
> +static struct file *sock_alloc_attach_fd(struct socket *socket)
> +{
> +	struct file *file;
> +	int err;
> +
> +	file = get_empty_filp();
> +	if (!file)
> +		return ERR_PTR(ENOMEM);
> +
> +	err = sock_attach_fd(socket, file, 0);
> +	if (err < 0) {
> +		put_filp(file);
> +		file = ERR_PTR(err);
> +	}
> +
> +	return file;
> +}
> +
> +void *sock_file_restore(struct ckpt_ctx *ctx)
> +{
> +	struct ckpt_hdr_socket *h = NULL;
> +	struct socket *socket = NULL;
> +	struct file *file = NULL;
> +	int err;
> +
> +	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET);
> +	if (IS_ERR(h))
> +		return h;
> +
> +	socket = do_sock_file_restore(ctx, h);
> +	if (IS_ERR(socket)) {
> +		err = PTR_ERR(socket);
> +		goto err_put;
> +	}
> +
> +	file = sock_alloc_attach_fd(socket);
> +	if (IS_ERR(file)) {
> +		err = PTR_ERR(file);
> +		goto err_release;
> +	}
> +
> +	ckpt_hdr_put(ctx, h);
> +
> +	return file;
> +
> + err_release:
> +	sock_release(socket);
> + err_put:
> +	ckpt_hdr_put(ctx, h);
> +
> +	return ERR_PTR(err);
> +}
> +
>  static struct socket *sock_from_file(struct file *file, int *err)
>  {
>  	if (file->f_op == &socket_file_ops)

Ok ... waiting for next round :)

Oren.


More information about the Containers mailing list