[PATCH 1/4] Unify skb read/write functions and fix for fragmented buffers (v2)

Dan Smith danms at us.ibm.com
Tue Nov 17 07:26:27 PST 2009


The INET code often creates socket buffers by attaching fragments instead
of writing to the linear region.  This extends the skb write functions
to write out the linear and fragment regions of an skb, and adds a
function to be used by others wishing to restore an skb in the same way.
This also includes the header-mark-setting bits from a previous patch.

Changes in v2:
 - Change the length variables in ckpt_hdr_socket_buffer to u32
 - Check frag number against MAX_SKB_FRAGS

Signed-off-by: Dan Smith <danms at us.ibm.com>
---
 include/linux/checkpoint.h     |    1 +
 include/linux/checkpoint_hdr.h |   11 ++
 net/checkpoint.c               |  258 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 247 insertions(+), 23 deletions(-)

diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index 550f6e8..0eff43e 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -104,6 +104,7 @@ extern int ckpt_sock_getnames(struct ckpt_ctx *ctx,
 			      struct socket *socket,
 			      struct sockaddr *loc, unsigned *loc_len,
 			      struct sockaddr *rem, unsigned *rem_len);
+struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx);
 
 /* ckpt kflags */
 #define ckpt_set_ctx_kflag(__ctx, __kflag)  \
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index 667b7aa..787cf89 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -594,8 +594,19 @@ struct ckpt_hdr_socket_queue {
 
 struct ckpt_hdr_socket_buffer {
 	struct ckpt_hdr h;
+	__u32 transport_header; /* Transport header offset from skb->head */
+	__u32 network_header; /* Network header offset from skb->head */
+	__u32 mac_header; /* MAC header offset from skb->head */
+	__u32 lin_len; /* Length of linear data (skb->tail - skb->head) */
+	__u32 frg_len; /* Length of fragment data (skb->data_len) */
+	__u32 skb_len; /* Length of skb (adjusted) */
+	__u32 hdr_len; /* Length of skipped header (skb->data - skb->head) */
+	__u32 mac_len; /* Copy of skb->mac_len */
 	__s32 sk_objref;
 	__s32 pr_objref;
+	__u16 protocol; /* NOTE(review): not set or read by the skb helpers in this patch */
+	__u16 nr_frags; /* Number of page fragments (skb_shinfo(skb)->nr_frags) */
+	__u8 cb[48]; /* Copy of skb->cb; assumes sizeof(skb->cb) == 48 -- TODO confirm */
 };
 
 #define CKPT_UNIX_LINKED 1
diff --git a/net/checkpoint.c b/net/checkpoint.c
index 7e8e137..49d9a2f 100644
--- a/net/checkpoint.c
+++ b/net/checkpoint.c
@@ -17,9 +17,11 @@
 #include <linux/syscalls.h>
 #include <linux/sched.h>
 #include <linux/fs_struct.h>
+#include <linux/highmem.h>
 
 #include <net/af_unix.h>
 #include <net/tcp_states.h>
+#include <net/tcp.h>
 
 #include <linux/deferqueue.h>
 #include <linux/checkpoint.h>
@@ -88,6 +90,238 @@ static int sock_copy_buffers(struct sk_buff_head *from,
 	return -EAGAIN;
 }
 
+/*
+ * sock_record_header_info - describe the layout of @skb in @h
+ * @skb: socket buffer being checkpointed
+ * @h: checkpoint header to fill in
+ *
+ * Records the skb lengths, the header positions (as offsets from
+ * skb->head), the control block and the number of page fragments.
+ *
+ * NOTE(review): h->protocol is not filled in here.
+ */
+static void sock_record_header_info(struct sk_buff *skb,
+				    struct ckpt_hdr_socket_buffer *h)
+{
+	h->mac_len = skb->mac_len;
+	h->skb_len = skb->len;
+	h->hdr_len = skb->data - skb->head;
+	/*
+	 * skb->tail is an offset rather than a pointer when
+	 * NET_SKBUFF_DATA_USES_OFFSET is defined, so use the accessor
+	 * instead of subtracting skb->head from it directly.
+	 */
+	h->lin_len = skb_tail_pointer(skb) - skb->head;
+	h->frg_len = skb->data_len;
+
+	/*
+	 * The header accessors return pointers in both skb layouts, so
+	 * the same subtraction yields an offset from skb->head either way.
+	 */
+	h->transport_header = skb_transport_header(skb) - skb->head;
+	h->network_header = skb_network_header(skb) - skb->head;
+	h->mac_header = skb_mac_header(skb) - skb->head;
+
+	memcpy(h->cb, skb->cb, sizeof(skb->cb));
+	h->nr_frags = skb_shinfo(skb)->nr_frags;
+}
+
+/*
+ * sock_restore_header_info - apply the layout saved in @h to @skb
+ * @skb: freshly restored socket buffer
+ * @h: header read from the checkpoint image (untrusted)
+ *
+ * Validates the saved header offsets and lengths, then sets the
+ * transport/network/mac headers, mac_len, the control block, the data
+ * pointer and the length of @skb.
+ *
+ * Returns 0 on success, or -EINVAL if @h is inconsistent, in which case
+ * @skb has not been modified.
+ */
+int sock_restore_header_info(struct sk_buff *skb,
+			     struct ckpt_hdr_socket_buffer *h)
+{
+	/* Compare by subtraction so a huge mac_header cannot wrap the sum */
+	if (h->mac_header > h->network_header ||
+	    h->network_header - h->mac_header != h->mac_len) {
+		ckpt_debug("skb mac_header %u+%u != network header %u\n",
+			   h->mac_header, h->mac_len, h->network_header);
+		return -EINVAL;
+	}
+
+	if (h->network_header > h->lin_len) {
+		ckpt_debug("skb network header %u > linear length %u\n",
+			   h->network_header, h->lin_len);
+		return -EINVAL;
+	}
+
+	if (h->transport_header > h->lin_len) {
+		ckpt_debug("skb transport header %u > linear length %u\n",
+			   h->transport_header, h->lin_len);
+		return -EINVAL;
+	}
+
+	/* The data pointer below must stay inside the linear region */
+	if (h->hdr_len > h->lin_len) {
+		ckpt_debug("skb header length %u > linear length %u\n",
+			   h->hdr_len, h->lin_len);
+		return -EINVAL;
+	}
+
+	if (h->skb_len > SKB_MAX_ALLOC) {
+		ckpt_debug("skb total length %u larger than max of %lu\n",
+			   h->skb_len, SKB_MAX_ALLOC);
+		return -EINVAL;
+	}
+
+	/*
+	 * NOTE(review): the saved offsets are relative to skb->head while
+	 * skb_set_*_header() take offsets from skb->data, so this relies
+	 * on skb->data == skb->head at this point -- confirm for callers.
+	 */
+	skb_set_transport_header(skb, h->transport_header);
+	skb_set_network_header(skb, h->network_header);
+	skb_set_mac_header(skb, h->mac_header);
+	skb->mac_len = h->mac_len;
+
+	/* FIXME: This should probably be sanitized per-protocol to
+	 * make sure nothing bad happens if it is hijacked.  For the
+	 * current set of protocols that we restore this way, the data
+	 * contained within is not very risky (flags and sequence
+	 * numbers) but could still be evaluated from a
+	 * could-the-user-have-set-these-flags point of view.
+	 */
+	memcpy(skb->cb, h->cb, sizeof(skb->cb));
+
+	/* Skip the saved headroom using the value from the image */
+	skb->data = skb->head + h->hdr_len;
+	/* NOTE(review): skb_len is only range-checked, not cross-checked
+	 * against the linear and fragment lengths actually restored.
+	 */
+	skb->len = h->skb_len;
+
+	return 0;
+}
+
+/*
+ * sock_restore_skb_frag - read one page fragment and attach it to @skb
+ * @ctx: restart context to read from
+ * @skb: buffer receiving the fragment
+ * @frag_idx: index of the fragment slot to fill
+ *
+ * Reads a CKPT_HDR_BUFFER object of at most PAGE_SIZE bytes into a
+ * newly allocated page and adds that page to @skb as fragment
+ * @frag_idx.
+ *
+ * Returns the fragment length on success or a negative errno.
+ */
+static int sock_restore_skb_frag(struct ckpt_ctx *ctx,
+				 struct sk_buff *skb,
+				 int frag_idx)
+{
+	int ret;
+	int fraglen;
+	struct page *page;
+	void *buf;
+
+	/* With no buffer the result is used as the payload length; the
+	 * payload itself is read into the page below.
+	 */
+	fraglen = _ckpt_read_obj_type(ctx, NULL, 0, CKPT_HDR_BUFFER);
+	if (fraglen < 0)
+		return fraglen;
+
+	if (fraglen > PAGE_SIZE) {
+		ckpt_debug("skb frag size %i > PAGE_SIZE\n", fraglen);
+		return -EINVAL;
+	}
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	buf = kmap(page);
+	ret = ckpt_kread(ctx, buf, fraglen);
+	kunmap(page);
+
+	if (ret) {
+		ckpt_debug("failed to read fragment: %i\n", ret);
+		__free_page(page);
+		/* Report the real errno; only an unexpected positive
+		 * result is mapped to -EINVAL.
+		 */
+		return ret < 0 ? ret : -EINVAL;
+	}
+
+	ckpt_debug("read %i for fragment %i\n", fraglen, frag_idx);
+	skb_add_rx_frag(skb, frag_idx, page, 0, fraglen);
+
+	return fraglen;
+}
+
+/*
+ * sock_restore_skb - rebuild a socket buffer from the checkpoint image
+ * @ctx: restart context to read from
+ *
+ * Reads a CKPT_HDR_SOCKET_BUFFER header, then the linear region, then
+ * h->nr_frags page fragments, and finally applies the saved header
+ * layout to the new skb.
+ *
+ * Returns the new skb on success or an ERR_PTR() on failure.
+ */
+struct sk_buff *sock_restore_skb(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_socket_buffer *h;
+	struct sk_buff *skb = NULL;
+	int i;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
+	if (IS_ERR(h))
+		return ERR_CAST(h);
+
+	if (h->lin_len > SKB_MAX_ALLOC) {
+		ckpt_debug("socket linear buffer too big (%u > %lu)\n",
+			   h->lin_len, SKB_MAX_ALLOC);
+		ret = -ENOSPC;
+		goto out;
+	} else if (h->frg_len > SKB_MAX_ALLOC) {
+		ckpt_debug("socket frag size too big (%u > %lu)\n",
+			   h->frg_len, SKB_MAX_ALLOC);
+		ret = -ENOSPC;
+		goto out;
+	} else if (h->nr_frags > MAX_SKB_FRAGS) {
+		/* A completely full fragment array is still valid */
+		ckpt_debug("socket frag count too big (%u > %lu)\n",
+			   h->nr_frags, MAX_SKB_FRAGS);
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	skb = alloc_skb(h->lin_len, GFP_KERNEL);
+	if (!skb) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = _ckpt_read_obj_type(ctx, skb_put(skb, h->lin_len),
+				  h->lin_len, CKPT_HDR_BUFFER);
+	ckpt_debug("read linear skb length %u: %i\n", h->lin_len, ret);
+	if (ret < 0)
+		goto out;
+
+	/* h->frg_len is used as a running count of bytes still expected */
+	for (i = 0; i < h->nr_frags; i++) {
+		ret = sock_restore_skb_frag(ctx, skb, i);
+		ckpt_debug("read skb frag %i/%i: %i\n",
+			   i + 1, h->nr_frags, ret);
+		if (ret < 0)
+			goto out;
+		/* Refuse to let the unsigned remainder wrap around */
+		if ((u32)ret > h->frg_len) {
+			ckpt_debug("skb frag %i exceeds remaining %u\n",
+				   ret, h->frg_len);
+			ret = -EINVAL;
+			goto out;
+		}
+		h->frg_len -= ret;
+	}
+
+	if (h->frg_len != 0) {
+		ckpt_debug("length %u remaining after reading frags\n",
+			   h->frg_len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Propagate validation failures instead of returning an skb whose
+	 * saved layout was silently not applied.
+	 * NOTE(review): confirm that images from existing checkpoints
+	 * (e.g. buffers with no mac header set) pass this validation.
+	 */
+	ret = sock_restore_header_info(skb, h);
+
+ out:
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0) {
+		kfree_skb(skb);
+		skb = ERR_PTR(ret);
+	}
+
+	return skb;
+}
+
+/*
+ * __sock_write_skb - write one socket buffer to the checkpoint image
+ * @ctx: checkpoint context
+ * @skb: buffer to save
+ * @dst_objref: stored as pr_objref; when > 0 the owning socket
+ *              (skb->sk) is checkpointed as well and referenced
+ *              through sk_objref
+ *
+ * Emits a CKPT_HDR_SOCKET_BUFFER header, followed by one
+ * CKPT_HDR_BUFFER object for the linear region (starting at skb->head)
+ * and one CKPT_HDR_BUFFER object per page fragment.
+ *
+ * Returns a negative errno on failure, a non-negative value otherwise.
+ */
+static int __sock_write_skb(struct ckpt_ctx *ctx,
+			    struct sk_buff *skb,
+			    int dst_objref)
+{
+	struct ckpt_hdr_socket_buffer *h;
+	int ret = 0;
+	int i;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
+	if (!h)
+		return -ENOMEM;
+
+	if (dst_objref > 0) {
+		BUG_ON(!skb->sk);
+		ret = checkpoint_obj(ctx, skb->sk, CKPT_OBJ_SOCK);
+		if (ret < 0)
+			goto out;
+		h->sk_objref = ret;
+		h->pr_objref = dst_objref;
+	} else {
+		/* Give the unused references a defined value so that
+		 * stale header contents never reach the image.
+		 */
+		h->sk_objref = 0;
+		h->pr_objref = 0;
+	}
+
+	sock_record_header_info(skb, h);
+
+	ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
+	if (ret < 0)
+		goto out;
+
+	/* The linear region is saved from skb->head, headroom included */
+	ret = ckpt_write_obj_type(ctx, skb->head, h->lin_len, CKPT_HDR_BUFFER);
+	ckpt_debug("writing skb linear region %u: %i\n", h->lin_len, ret);
+	if (ret < 0)
+		goto out;
+
+	/* The header is already written; h->frg_len now only tracks how
+	 * many fragment bytes are still to be emitted.
+	 */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+		u8 *vaddr = kmap(frag->page);
+
+		ckpt_debug("writing buffer fragment %i/%i (%i)\n",
+			   i + 1, h->nr_frags, frag->size);
+		ret = ckpt_write_obj_type(ctx, vaddr + frag->page_offset,
+					  frag->size, CKPT_HDR_BUFFER);
+		kunmap(frag->page);
+		h->frg_len -= frag->size;
+		if (ret < 0)
+			goto out;
+	}
+
+	/* Fires if the page fragments did not account for all of
+	 * skb->data_len.
+	 */
+	WARN_ON(h->frg_len != 0);
+
+ out:
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
 static int __sock_write_buffers(struct ckpt_ctx *ctx,
 				struct sk_buff_head *queue,
 				int dst_objref)
@@ -95,13 +329,8 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 	struct sk_buff *skb;
 
 	skb_queue_walk(queue, skb) {
-		struct ckpt_hdr_socket_buffer *h;
 		int ret = 0;
 
-		/* FIXME: This could be a false positive for non-unix
-		 *        buffers, so add a type check here in the
-		 *        future
-		 */
 		if (UNIXCB(skb).fp) {
 			ckpt_err(ctx, -EBUSY, "%(T)af_unix: pass fd\n");
 			return -EBUSY;
@@ -113,25 +342,8 @@ static int __sock_write_buffers(struct ckpt_ctx *ctx,
 		 * because we don't save out (or restore) the control
 		 * information contained in the skb.
 		 */
-		h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_SOCKET_BUFFER);
-		if (!h)
-			return -ENOMEM;
-
-		BUG_ON(!skb->sk);
-		ret = checkpoint_obj(ctx, skb->sk, CKPT_OBJ_SOCK);
-		if (ret < 0)
-			goto end;
-		h->sk_objref = ret;
-		h->pr_objref = dst_objref;
-
-		ret = ckpt_write_obj(ctx, (struct ckpt_hdr *) h);
-		if (ret < 0)
-			goto end;
 
-		ret = ckpt_write_obj_type(ctx, skb->data, skb->len,
-					  CKPT_HDR_BUFFER);
-	end:
-		ckpt_hdr_put(ctx, h);
+		ret = __sock_write_skb(ctx, skb, dst_objref);
 		if (ret < 0)
 			return ret;
 	}
-- 
1.6.2.5



More information about the Containers mailing list