[PATCH 2/4] [RFC] Add c/r support for connected INET sockets (v5)

Serge E. Hallyn serue at us.ibm.com
Tue Nov 17 20:34:51 PST 2009


Quoting Dan Smith (danms at us.ibm.com):
> This patch adds basic support for C/R of open INET sockets.  I think that
> all the important bits of the TCP and ICSK socket structures are saved,
> but I think there is still some additional IPv6 stuff that needs to be
> handled.
> 
> With this patch applied, the following script can be used to demonstrate
> the functionality:
> 
>   https://lists.linux-foundation.org/pipermail/containers/2009-October/021239.html
> 
> It shows that this enables migration of a sendmail process with open
> connections from one machine to another without dropping them.
> 
> We probably need comments from the netdev people about the quality of
> sanity checking we do on the values in the ckpt_hdr_socket_inet
> structure on restart.
> 
> Note that this doesn't address lingering sockets yet.
> 
> Changes in v5:
>  - Change ckpt_write_err() to ckpt_err()
> 
> Changes in v4:
>  - Use the new socket buffer restore functions introduced in the
>    previous patch
>  - Move listen_sockets list under the restart items in ckpt_ctx
>  - Rename RESTART_SOCK_LISTENONLY to RESTART_CONN_RESET
> 
> Changes in v3:
>  - Prevent restart from allowing a bind on a <1024 port unless the
>    user is granted that capability
>  - Add some sanity checking in the inet_precheck() function to make sure
>    the values read from the checkpoint image are within acceptable ranges
>  - Check the result of sock_restore_header_info() and fail if needed
> 
> Changes in v2:
>  - Restore saddr, rcv_saddr, daddr, sport, and dport from the sockaddr
>    structure instead of saving them separately
>  - Fix 'sock' naming in sock_cptrst()
>  - Don't take the queue lock before skb_queue_tail() since it is
>    done for us
>  - Allow "listen only" restore behavior if RESTART_SOCK_LISTENONLY
>    flag is specified on sys_restart()
>  - Pull the implementation of the list of listening sockets back into
>    this patch
>  - Fix dangling printk
>  - Add some comments around the parent/child restore logic
> 
> Cc: netdev at vger.kernel.org
> Acked-by: Oren Laadan <orenl at librato.com>
> Signed-off-by: Dan Smith <danms at us.ibm.com>

Acked-by: Serge Hallyn <serue at us.ibm.com>

> ---
>  checkpoint/sys.c                 |    4 +
>  include/linux/checkpoint.h       |    5 +-
>  include/linux/checkpoint_hdr.h   |   95 +++++++++
>  include/linux/checkpoint_types.h |    1 +
>  net/checkpoint.c                 |   27 ++--
>  net/ipv4/checkpoint.c            |  391 ++++++++++++++++++++++++++++++++++----
>  6 files changed, 473 insertions(+), 50 deletions(-)
> 
> diff --git a/checkpoint/sys.c b/checkpoint/sys.c
> index 9f9e825..baed891 100644
> --- a/checkpoint/sys.c
> +++ b/checkpoint/sys.c
> @@ -244,6 +244,8 @@ static void ckpt_ctx_free(struct ckpt_ctx *ctx)
> 
>  	kfree(ctx->pids_arr);
> 
> +	sock_listening_list_free(&ctx->listen_sockets);
> +
>  	kfree(ctx);
>  }
> 
> @@ -274,6 +276,8 @@ static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
> 
>  	mutex_init(&ctx->msg_mutex);
> 
> +	INIT_LIST_HEAD(&ctx->listen_sockets);
> +
>  	err = -EBADF;
>  	ctx->file = fget(fd);
>  	if (!ctx->file)
> diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
> index 0eff43e..ddc9aa0 100644
> --- a/include/linux/checkpoint.h
> +++ b/include/linux/checkpoint.h
> @@ -20,6 +20,7 @@
>  #define RESTART_FROZEN		0x2
>  #define RESTART_GHOST		0x4
>  #define RESTART_KEEP_LSM	0x8
> +#define RESTART_CONN_RESET      0x10
> 
>  /* misc user visible */
>  #define CHECKPOINT_FD_NONE	-1
> @@ -53,7 +54,8 @@
>  	(RESTART_TASKSELF | \
>  	 RESTART_FROZEN | \
>  	 RESTART_KEEP_LSM | \
> -	 RESTART_GHOST)
> +	 RESTART_GHOST | \
> +	 RESTART_CONN_RESET)
>  #define CKPT_LSM_INFO_LEN 200
>  #define CKPT_LSM_STRING_MAX 1024
> 
> @@ -105,6 +107,7 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
>  			      struct sockaddr *loc, unsigned *loc_len,
>  			      struct sockaddr *rem, unsigned *rem_len);
>  struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx);
> +void sock_listening_list_free(struct list_head *head);
> 
>  /* ckpt kflags */
>  #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
> diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
> index 787cf89..d1a93e3 100644
> --- a/include/linux/checkpoint_hdr.h
> +++ b/include/linux/checkpoint_hdr.h
> @@ -15,6 +15,7 @@
>  #include <linux/socket.h>
>  #include <linux/un.h>
>  #include <linux/in.h>
> +#include <linux/in6.h>
>  #else
>  #include <sys/types.h>
>  #include <linux/types.h>
> @@ -625,6 +626,100 @@ struct ckpt_hdr_socket_unix {
> 
>  struct ckpt_hdr_socket_inet {
>  	struct ckpt_hdr h;
> +	__u32 daddr;
> +	__u32 rcv_saddr;
> +	__u32 saddr;
> +	__u16 dport;
> +	__u16 num;
> +	__u16 sport;
> +	__s16 uc_ttl;
> +	__u16 cmsg_flags;
> +
> +	struct {
> +		__u64 timeout;
> +		__u32 ato;
> +		__u32 lrcvtime;
> +		__u16 last_seg_size;
> +		__u16 rcv_mss;
> +		__u8 pending;
> +		__u8 quick;
> +		__u8 pingpong;
> +		__u8 blocked;
> +	} icsk_ack __attribute__ ((aligned(8)));
> +
> +	/* FIXME: Skipped opt, tos, multicast, cork settings */
> +
> +	struct {
> +		__u32 rcv_nxt;
> +		__u32 copied_seq;
> +		__u32 rcv_wup;
> +		__u32 snd_nxt;
> +		__u32 snd_una;
> +		__u32 snd_sml;
> +		__u32 rcv_tstamp;
> +		__u32 lsndtime;
> +
> +		__u32 snd_wl1;
> +		__u32 snd_wnd;
> +		__u32 max_window;
> +		__u32 mss_cache;
> +		__u32 window_clamp;
> +		__u32 rcv_ssthresh;
> +		__u32 frto_highmark;
> +
> +		__u32 srtt;
> +		__u32 mdev;
> +		__u32 mdev_max;
> +		__u32 rttvar;
> +		__u32 rtt_seq;
> +
> +		__u32 packets_out;
> +		__u32 retrans_out;
> +
> +		__u32 snd_up;
> +		__u32 rcv_wnd;
> +		__u32 write_seq;
> +		__u32 pushed_seq;
> +		__u32 lost_out;
> +		__u32 sacked_out;
> +		__u32 fackets_out;
> +		__u32 tso_deferred;
> +		__u32 bytes_acked;
> +
> +		__s32 lost_cnt_hint;
> +		__u32 retransmit_high;
> +
> +		__u32 lost_retrans_low;
> +
> +		__u32 prior_ssthresh;
> +		__u32 high_seq;
> +
> +		__u32 retrans_stamp;
> +		__u32 undo_marker;
> +		__s32 undo_retrans;
> +		__u32 total_retrans;
> +
> +		__u32 urg_seq;
> +		__u32 keepalive_time;
> +		__u32 keepalive_intvl;
> +
> +		__u16 urg_data;
> +		__u16 advmss;
> +		__u8 frto_counter;
> +		__u8 nonagle;
> +
> +		__u8 ecn_flags;
> +		__u8 reordering;
> +
> +		__u8 keepalive_probes;
> +	} tcp __attribute__ ((aligned(8)));
> +
> +	struct {
> +		struct in6_addr saddr;
> +		struct in6_addr rcv_saddr;
> +		struct in6_addr daddr;
> +	} inet6 __attribute__ ((aligned(8)));
> +
>  	__u32 laddr_len;
>  	__u32 raddr_len;
>  	struct sockaddr_in laddr;
> diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
> index 77f8592..79c9c09 100644
> --- a/include/linux/checkpoint_types.h
> +++ b/include/linux/checkpoint_types.h
> @@ -82,6 +82,7 @@ struct ckpt_ctx {
>  	wait_queue_head_t waitq;	/* waitqueue for restarting tasks */
>  	wait_queue_head_t ghostq;	/* waitqueue for ghost tasks */
>  	struct cred *realcred, *ecred;	/* tmp storage for cred at restart */
> +	struct list_head listen_sockets;/* listening parent sockets */
> 
>  	struct ckpt_stats stats;	/* statistics */
> 
> diff --git a/net/checkpoint.c b/net/checkpoint.c
> index 49d9a2f..aba1497 100644
> --- a/net/checkpoint.c
> +++ b/net/checkpoint.c
> @@ -324,6 +324,7 @@ static int __sock_write_skb(struct ckpt_ctx *ctx,
> 
>  static int __sock_write_buffers(struct ckpt_ctx *ctx,
>  				struct sk_buff_head *queue,
> +				uint16_t family,
>  				int dst_objref)
>  {
>  	struct sk_buff *skb;
> @@ -336,11 +337,11 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
>  			return -EBUSY;
>  		}
> 
> -		/* The other ancillary messages are always present
> -		 * unlike descriptors.  Even though we can't detect
> -		 * them and fail the checkpoint, we're not at risk
> -		 * because we don't save out (or restore) the control
> -		 * information contained in the skb.
> +		/* The other ancillary messages UNIX are always
> +		 * present unlike descriptors.  Even though we can't
> +		 * detect them and fail the checkpoint, we're not at
> +		 * risk because we don't restore the control
> +		 * information in the UNIX code.
>  		 */
> 
>  		ret = __sock_write_skb(ctx, skb, dst_objref);
> @@ -353,6 +354,7 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
> 
>  static int sock_write_buffers(struct ckpt_ctx *ctx,
>  			      struct sk_buff_head *queue,
> +			      uint16_t family,
>  			      int dst_objref)
>  {
>  	struct ckpt_hdr_socket_queue *h;
> @@ -372,7 +374,7 @@ static int sock_write_buffers(struct ckpt_ctx *ctx,
>  	h->skb_count = ret;
>  	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
>  	if (!ret)
> -		ret = __sock_write_buffers(ctx, &tmpq, dst_objref);
> +		ret = __sock_write_buffers(ctx, &tmpq, family, dst_objref);
> 
>   out:
>  	ckpt_hdr_put(ctx, h);
> @@ -394,12 +396,14 @@ int sock_deferred_write_buffers(void *data)
>  		return dst_objref;
>  	}
> 
> -	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue, dst_objref);
> +	ret = sock_write_buffers(ctx, &dq->sk->sk_receive_queue,
> +				 dq->sk->sk_family, dst_objref);
>  	ckpt_debug("write recv buffers: %i\n", ret);
>  	if (ret < 0)
>  		return ret;
> 
> -	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue, dst_objref);
> +	ret = sock_write_buffers(ctx, &dq->sk->sk_write_queue,
> +				 dq->sk->sk_family, dst_objref);
>  	ckpt_debug("write send buffers: %i\n", ret);
> 
>  	return ret;
> @@ -924,10 +928,9 @@ struct sock *do_sock_restore(struct ckpt_ctx *ctx)
>  		goto err;
> 
>  	if ((h->sock_common.family == AF_INET) &&
> -	    (h->sock.state != TCP_LISTEN)) {
> -		/* Temporary hack to enable restore of TCP_LISTEN sockets
> -		 * while forcing anything else to a closed state
> -		 */
> +	    (h->sock.state != TCP_LISTEN) &&
> +	    (ctx->uflags & RESTART_CONN_RESET)) {
> +		ckpt_debug("Forcing open socket closed\n");
>  		sock->sk->sk_state = TCP_CLOSE;
>  		sock->state = SS_UNCONNECTED;
>  	}
> diff --git a/net/ipv4/checkpoint.c b/net/ipv4/checkpoint.c
> index 9cbbf5e..3e20cc9 100644
> --- a/net/ipv4/checkpoint.c
> +++ b/net/ipv4/checkpoint.c
> @@ -17,6 +17,7 @@
>  #include <linux/deferqueue.h>
>  #include <net/tcp_states.h>
>  #include <net/tcp.h>
> +#include <net/ipv6.h>
> 
>  struct dq_sock {
>  	struct ckpt_ctx *ctx;
> @@ -28,6 +29,236 @@ struct dq_buffers {
>  	struct sock *sk;
>  };
> 
> +struct listen_item {
> +	struct sock *sk;
> +	struct list_head list;
> +};
> +
> +void sock_listening_list_free(struct list_head *head)
> +{
> +	struct listen_item *item, *tmp;
> +
> +	list_for_each_entry_safe(item, tmp, head, list) {
> +		list_del(&item->list);
> +		kfree(item);
> +	}
> +}
> +
> +static int sock_listening_list_add(struct ckpt_ctx *ctx, struct sock *sk)
> +{
> +	struct listen_item *item;
> +
> +	item = kmalloc(sizeof(*item), GFP_KERNEL);
> +	if (!item)
> +		return -ENOMEM;
> +
> +	item->sk = sk;
> +	list_add(&item->list, &ctx->listen_sockets);
> +
> +	return 0;
> +}
> +
> +static struct sock *sock_get_parent(struct ckpt_ctx *ctx, struct sock *sk)
> +{
> +	struct listen_item *item;
> +
> +	list_for_each_entry(item, &ctx->listen_sockets, list) {
> +		if (inet_sk(sk)->sport == inet_sk(item->sk)->sport)
> +			return item->sk;
> +	}
> +
> +	return NULL;
> +}
> +
> +static int sock_hash_parent(void *data)
> +{
> +	struct dq_sock *dq = (struct dq_sock *)data;
> +	struct sock *parent;
> +
> +	ckpt_debug("INET post-restart hash\n");
> +
> +	dq->sk->sk_prot->hash(dq->sk);
> +
> +	/* If there is a listening socket with the same source port,
> +	 * then become a child of that socket [we are the result of an
> +	 * accept()].  Otherwise hash ourselves directly in [we are
> +	 * the result of a connect()]
> +	 */
> +
> +	parent = sock_get_parent(dq->ctx, dq->sk);
> +	if (parent) {
> +		inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
> +		local_bh_disable();
> +		__inet_inherit_port(parent, dq->sk);
> +		local_bh_enable();
> +	} else {
> +		inet_sk(dq->sk)->num = 0;
> +		inet_hash_connect(&tcp_death_row, dq->sk);
> +		inet_sk(dq->sk)->num = ntohs(inet_sk(dq->sk)->sport);
> +	}
> +
> +	return 0;
> +}
> +
> +static int sock_defer_hash(struct ckpt_ctx *ctx, struct sock *sock)
> +{
> +	struct dq_sock dq;
> +
> +	dq.sk = sock;
> +	dq.ctx = ctx;
> +
> +	return deferqueue_add(ctx->deferqueue, &dq, sizeof(dq),
> +			      sock_hash_parent, NULL);
> +}
> +
> +static int sock_inet_tcp_cptrst(struct ckpt_ctx *ctx,
> +				struct tcp_sock *sk,
> +				struct ckpt_hdr_socket_inet *hh,
> +				int op)
> +{
> +	CKPT_COPY(op, hh->tcp.rcv_nxt, sk->rcv_nxt);
> +	CKPT_COPY(op, hh->tcp.copied_seq, sk->copied_seq);
> +	CKPT_COPY(op, hh->tcp.rcv_wup, sk->rcv_wup);
> +	CKPT_COPY(op, hh->tcp.snd_nxt, sk->snd_nxt);
> +	CKPT_COPY(op, hh->tcp.snd_una, sk->snd_una);
> +	CKPT_COPY(op, hh->tcp.snd_sml, sk->snd_sml);
> +	CKPT_COPY(op, hh->tcp.rcv_tstamp, sk->rcv_tstamp);
> +	CKPT_COPY(op, hh->tcp.lsndtime, sk->lsndtime);
> +
> +	CKPT_COPY(op, hh->tcp.snd_wl1, sk->snd_wl1);
> +	CKPT_COPY(op, hh->tcp.snd_wnd, sk->snd_wnd);
> +	CKPT_COPY(op, hh->tcp.max_window, sk->max_window);
> +	CKPT_COPY(op, hh->tcp.mss_cache, sk->mss_cache);
> +	CKPT_COPY(op, hh->tcp.window_clamp, sk->window_clamp);
> +	CKPT_COPY(op, hh->tcp.rcv_ssthresh, sk->rcv_ssthresh);
> +	CKPT_COPY(op, hh->tcp.frto_highmark, sk->frto_highmark);
> +	CKPT_COPY(op, hh->tcp.advmss, sk->advmss);
> +	CKPT_COPY(op, hh->tcp.frto_counter, sk->frto_counter);
> +	CKPT_COPY(op, hh->tcp.nonagle, sk->nonagle);
> +
> +	CKPT_COPY(op, hh->tcp.srtt, sk->srtt);
> +	CKPT_COPY(op, hh->tcp.mdev, sk->mdev);
> +	CKPT_COPY(op, hh->tcp.mdev_max, sk->mdev_max);
> +	CKPT_COPY(op, hh->tcp.rttvar, sk->rttvar);
> +	CKPT_COPY(op, hh->tcp.rtt_seq, sk->rtt_seq);
> +
> +	CKPT_COPY(op, hh->tcp.packets_out, sk->packets_out);
> +	CKPT_COPY(op, hh->tcp.retrans_out, sk->retrans_out);
> +
> +	CKPT_COPY(op, hh->tcp.urg_data, sk->urg_data);
> +	CKPT_COPY(op, hh->tcp.ecn_flags, sk->ecn_flags);
> +	CKPT_COPY(op, hh->tcp.reordering, sk->reordering);
> +	CKPT_COPY(op, hh->tcp.snd_up, sk->snd_up);
> +
> +	CKPT_COPY(op, hh->tcp.keepalive_probes, sk->keepalive_probes);
> +
> +	CKPT_COPY(op, hh->tcp.rcv_wnd, sk->rcv_wnd);
> +	CKPT_COPY(op, hh->tcp.write_seq, sk->write_seq);
> +	CKPT_COPY(op, hh->tcp.pushed_seq, sk->pushed_seq);
> +	CKPT_COPY(op, hh->tcp.lost_out, sk->lost_out);
> +	CKPT_COPY(op, hh->tcp.sacked_out, sk->sacked_out);
> +	CKPT_COPY(op, hh->tcp.fackets_out, sk->fackets_out);
> +	CKPT_COPY(op, hh->tcp.tso_deferred, sk->tso_deferred);
> +	CKPT_COPY(op, hh->tcp.bytes_acked, sk->bytes_acked);
> +
> +	CKPT_COPY(op, hh->tcp.lost_cnt_hint, sk->lost_cnt_hint);
> +	CKPT_COPY(op, hh->tcp.retransmit_high, sk->retransmit_high);
> +
> +	CKPT_COPY(op, hh->tcp.lost_retrans_low, sk->lost_retrans_low);
> +
> +	CKPT_COPY(op, hh->tcp.prior_ssthresh, sk->prior_ssthresh);
> +	CKPT_COPY(op, hh->tcp.high_seq, sk->high_seq);
> +
> +	CKPT_COPY(op, hh->tcp.retrans_stamp, sk->retrans_stamp);
> +	CKPT_COPY(op, hh->tcp.undo_marker, sk->undo_marker);
> +	CKPT_COPY(op, hh->tcp.undo_retrans, sk->undo_retrans);
> +	CKPT_COPY(op, hh->tcp.total_retrans, sk->total_retrans);
> +
> +	CKPT_COPY(op, hh->tcp.urg_seq, sk->urg_seq);
> +	CKPT_COPY(op, hh->tcp.keepalive_time, sk->keepalive_time);
> +	CKPT_COPY(op, hh->tcp.keepalive_intvl, sk->keepalive_intvl);
> +
> +	if (!skb_queue_empty(&sk->ucopy.prequeue))
> +		printk("PREQUEUE!\n");
> +
> +	return 0;
> +}
> +
> +static int sock_inet_restore_addrs(struct inet_sock *inet,
> +				   struct ckpt_hdr_socket_inet *hh)
> +{
> +	inet->daddr = hh->raddr.sin_addr.s_addr;
> +	inet->saddr = hh->laddr.sin_addr.s_addr;
> +	inet->rcv_saddr = inet->saddr;
> +
> +	inet->dport = hh->raddr.sin_port;
> +	inet->sport = hh->laddr.sin_port;
> +
> +	return 0;
> +}
> +
> +static int sock_inet_cptrst(struct ckpt_ctx *ctx,
> +			    struct sock *sk,
> +			    struct ckpt_hdr_socket_inet *hh,
> +			    int op)
> +{
> +	struct inet_sock *inet = inet_sk(sk);
> +	struct inet_connection_sock *icsk = inet_csk(sk);
> +	int ret;
> +
> +	if (op == CKPT_CPT) {
> +		CKPT_COPY(op, hh->daddr, inet->daddr);
> +		CKPT_COPY(op, hh->rcv_saddr, inet->rcv_saddr);
> +		CKPT_COPY(op, hh->dport, inet->dport);
> +		CKPT_COPY(op, hh->saddr, inet->saddr);
> +		CKPT_COPY(op, hh->sport, inet->sport);
> +	} else {
> +		ret = sock_inet_restore_addrs(inet, hh);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	CKPT_COPY(op, hh->num, inet->num);
> +	CKPT_COPY(op, hh->uc_ttl, inet->uc_ttl);
> +	CKPT_COPY(op, hh->cmsg_flags, inet->cmsg_flags);
> +
> +	CKPT_COPY(op, hh->icsk_ack.pending, icsk->icsk_ack.pending);
> +	CKPT_COPY(op, hh->icsk_ack.quick, icsk->icsk_ack.quick);
> +	CKPT_COPY(op, hh->icsk_ack.pingpong, icsk->icsk_ack.pingpong);
> +	CKPT_COPY(op, hh->icsk_ack.blocked, icsk->icsk_ack.blocked);
> +	CKPT_COPY(op, hh->icsk_ack.ato, icsk->icsk_ack.ato);
> +	CKPT_COPY(op, hh->icsk_ack.timeout, icsk->icsk_ack.timeout);
> +	CKPT_COPY(op, hh->icsk_ack.lrcvtime, icsk->icsk_ack.lrcvtime);
> +	CKPT_COPY(op,
> +		  hh->icsk_ack.last_seg_size, icsk->icsk_ack.last_seg_size);
> +	CKPT_COPY(op, hh->icsk_ack.rcv_mss, icsk->icsk_ack.rcv_mss);
> +
> +	if (sk->sk_protocol == IPPROTO_TCP)
> +		ret = sock_inet_tcp_cptrst(ctx, tcp_sk(sk), hh, op);
> +	else if (sk->sk_protocol == IPPROTO_UDP)
> +		ret = 0;
> +	else {
> +		ret = -EINVAL;
> +		ckpt_err(ctx, ret, "unknown socket protocol %d",
> +			 sk->sk_protocol);
> +	}
> +
> +	if (sk->sk_family == AF_INET6) {
> +		struct ipv6_pinfo *inet6 = inet6_sk(sk);
> +		if (op == CKPT_CPT) {
> +			ipv6_addr_copy(&hh->inet6.saddr, &inet6->saddr);
> +			ipv6_addr_copy(&hh->inet6.rcv_saddr, &inet6->rcv_saddr);
> +			ipv6_addr_copy(&hh->inet6.daddr, &inet6->daddr);
> +		} else {
> +			ipv6_addr_copy(&inet6->saddr, &hh->inet6.saddr);
> +			ipv6_addr_copy(&inet6->rcv_saddr, &hh->inet6.rcv_saddr);
> +			ipv6_addr_copy(&inet6->daddr, &hh->inet6.daddr);
> +		}
> +	}
> +
> +	return ret;
> +}
> +
>  int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
>  {
>  	struct ckpt_hdr_socket_inet *in;
> @@ -43,6 +274,10 @@ int inet_checkpoint(struct ckpt_ctx *ctx, struct socket *sock)
>  	if (ret)
>  		goto out;
> 
> +	ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_CPT);
> +	if (ret < 0)
> +		goto out;
> +
>  	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) in);
>   out:
>  	ckpt_hdr_put(ctx, in);
> @@ -55,51 +290,22 @@ int inet_collect(struct ckpt_ctx *ctx, struct socket *sock)
>  	return ckpt_obj_collect(ctx, sock->sk, CKPT_OBJ_SOCK);
>  }
> 
> -static int inet_read_buffer(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
> +static int inet_read_buffer(struct ckpt_ctx *ctx,
> +			    struct sk_buff_head *queue)
>  {
> -	struct ckpt_hdr_socket_buffer *h;
> -	int len;
> -	int ret;
>  	struct sk_buff *skb = NULL;
> 
> -	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
> -	if (IS_ERR(h))
> -		return PTR_ERR(h);
> -
> -	len = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
> -	if (len < 0) {
> -		ret = len;
> -		goto out;
> -	} else if (len > SKB_MAX_ALLOC) {
> -		ckpt_debug("Socket buffer too big (%i > %lu)",
> -			   len, SKB_MAX_ALLOC);
> -		ret = -ENOSPC;
> -		goto out;
> -	}
> -
> -	skb = alloc_skb(len, GFP_KERNEL);
> -	if (!skb) {
> -		ret = -ENOMEM;
> -		goto out;
> -	}
> -
> -	ret = ckpt_kread(ctx, skb_put(skb, len), len);
> -	if (ret < 0)
> -		goto out;
> +	skb = sock_restore_skb(ctx);
> +	if (IS_ERR(skb))
> +		return PTR_ERR(skb);
> 
> -	spin_lock(&queue->lock);
>  	skb_queue_tail(queue, skb);
> -	spin_unlock(&queue->lock);
> - out:
> -	ckpt_hdr_put(ctx, h);
> -
> -	if ((ret < 0) && skb)
> -		kfree_skb(skb);
> 
> -	return ret;
> +	return skb->len;
>  }
> 
> -static int inet_read_buffers(struct ckpt_ctx *ctx, struct sk_buff_head *queue)
> +static int inet_read_buffers(struct ckpt_ctx *ctx,
> +			     struct sk_buff_head *queue)
>  {
>  	struct ckpt_hdr_socket_queue *h;
>  	int ret = 0;
> @@ -162,6 +368,19 @@ static int inet_defer_restore_buffers(struct ckpt_ctx *ctx, struct sock *sk)
> 
>  static int inet_precheck(struct socket *sock, struct ckpt_hdr_socket_inet *in)
>  {
> +	__u8 icsk_ack_mask = ICSK_ACK_SCHED | ICSK_ACK_TIMER |
> +		ICSK_ACK_PUSHED | ICSK_ACK_PUSHED2;
> +	__u16 urg_mask = TCP_URG_VALID | TCP_URG_NOTYET | TCP_URG_READ;
> +	__u8 nonagle_mask = TCP_NAGLE_OFF | TCP_NAGLE_CORK | TCP_NAGLE_PUSH;
> +	__u8 ecn_mask = TCP_ECN_OK | TCP_ECN_QUEUE_CWR | TCP_ECN_DEMAND_CWR;
> +
> +	if ((htons(in->laddr.sin_port) < PROT_SOCK) &&
> +	    !capable(CAP_NET_BIND_SERVICE)) {
> +		ckpt_debug("unable to bind to port %hu\n",
> +			   htons(in->laddr.sin_port));
> +		return -EINVAL;
> +	}
> +
>  	if (in->laddr_len > sizeof(struct sockaddr_in)) {
>  		ckpt_debug("laddr_len is too big\n");
>  		return -EINVAL;
> @@ -172,6 +391,77 @@ static int inet_precheck(struct socket *sock, struct ckpt_hdr_socket_inet *in)
>  		return -EINVAL;
>  	}
> 
> +	/* Set ato to the default */
> +	in->icsk_ack.ato = TCP_ATO_MIN;
> +
> +	/* No quick acks are scheduled after a restart */
> +	in->icsk_ack.quick = 0;
> +
> +	if (in->icsk_ack.pending & ~icsk_ack_mask) {
> +		ckpt_debug("invalid pending flags 0x%x\n",
> +			   in->icsk_ack.pending & ~icsk_ack_mask);
> +		return -EINVAL;
> +	}
> +
> +	if (in->icsk_ack.pingpong > 1) {
> +		ckpt_debug("invalid icsk_ack.pingpong value\n");
> +		return -EINVAL;
> +	}
> +
> +	if (in->icsk_ack.blocked > 1) {
> +		ckpt_debug("invalid icsk_ack.blocked value\n");
> +		return -EINVAL;
> +	}
> +
> +	/* do_tcp_setsockopt() quietly makes this coercion */
> +	if (in->tcp.window_clamp < (SOCK_MIN_RCVBUF / 2))
> +		in->tcp.window_clamp = SOCK_MIN_RCVBUF / 2;
> +	else if (in->tcp.window_clamp > 65535U) {
> +		ckpt_debug("invalid window_clamp value\n");
> +		return -EINVAL;
> +	}
> +
> +	if (in->tcp.rcv_ssthresh > (4U * in->tcp.advmss))
> +		in->tcp.rcv_ssthresh = 4U * in->tcp.advmss;
> +
> +	/* These will all be recalculated on the next call to
> +	 * tcp_rtt_estimator()
> +	 */
> +	in->tcp.srtt = in->tcp.mdev = in->tcp.mdev_max = 0;
> +	in->tcp.rttvar = in->tcp.rtt_seq = 0;
> +
> +	/* Might want to set packets_out to zero ? */
> +
> +	if (in->tcp.rcv_wnd > MAX_TCP_WINDOW)
> +		in->tcp.rcv_wnd = MAX_TCP_WINDOW;
> +
> +	if (in->tcp.keepalive_intvl > MAX_TCP_KEEPINTVL) {
> +		ckpt_debug("keepalive_intvl %i out of range\n",
> +			   in->tcp.keepalive_intvl);
> +		return -EINVAL;
> +	}
> +
> +	if (in->tcp.keepalive_probes > MAX_TCP_KEEPCNT) {
> +		ckpt_debug("Invalid keepalive_probes value %i\n",
> +			   in->tcp.keepalive_probes);
> +		return -EINVAL;
> +	}
> +
> +	if (in->tcp.urg_data & ~urg_mask) {
> +		ckpt_debug("Invalid urg_data value\n");
> +		return -EINVAL;
> +	}
> +
> +	if (in->tcp.nonagle & ~nonagle_mask) {
> +		ckpt_debug("Invalid nonagle value\n");
> +		return -EINVAL;
> +	}
> +
> +	if (in->tcp.ecn_flags & ~ecn_mask) {
> +		ckpt_debug("Invalid ecn_flags value\n");
> +		return -EINVAL;
> +	}
> +
>  	return 0;
>  }
> 
> @@ -209,8 +499,35 @@ int inet_restore(struct ckpt_ctx *ctx,
>  			ckpt_debug("inet listen: %i\n", ret);
>  			if (ret < 0)
>  				goto out;
> +
> +			/* We are a listening socket, so add ourselves
> +			 * to the list of parent sockets.  This will
> +			 * allow our children to find us later and
> +			 * link up
> +			 */
> +
> +			ret = sock_listening_list_add(ctx, sock->sk);
> +			if (ret < 0)
> +				goto out;
>  		}
>  	} else {
> +		ret = sock_inet_cptrst(ctx, sock->sk, in, CKPT_RST);
> +		if (ret)
> +			goto out;
> +
> +		if ((h->sock.state == TCP_ESTABLISHED) &&
> +		    (h->sock.protocol == IPPROTO_TCP)) {
> +			/* A connected socket that was spawned from an
> +			 * accept() needs to be hashed with its parent
> +			 * listening socket in order to receive
> +			 * traffic on the original port.  Since we may
> +			 * not have restarted the parent yet, we defer
> +			 * this until later when we know we have all
> +			 * the listening sockets accounted for.
> +			 */
> +			ret = sock_defer_hash(ctx, sock->sk);
> +		}
> +
>  		if (!sock_flag(sock->sk, SOCK_DEAD))
>  			ret = inet_defer_restore_buffers(ctx, sock->sk);
>  	}
> -- 
> 1.6.2.5
> 
> _______________________________________________
> Containers mailing list
> Containers at lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers


More information about the Containers mailing list