[PATCH v2 6/9] per-cgroup tcp buffers control

Glauber Costa glommer at parallels.com
Tue Sep 6 21:23:16 PDT 2011


With all the infrastructure in place, this patch implements
per-cgroup control for tcp memory pressure handling.

Signed-off-by: Glauber Costa <glommer at parallels.com>
CC: David S. Miller <davem at davemloft.net>
CC: Hiroyuki Kamezawa <kamezawa.hiroyu at jp.fujitsu.com>
CC: Eric W. Biederman <ebiederm at xmission.com>
---
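Everything here resolves TCP's accounting state through the struct
proto callbacks keyed by the socket's cgroup, instead of touching
file-scope globals. As a minimal sketch of what a call site looks
like (the helper names are illustrative only, not part of the patch;
sk->sk_cgrp comes from an earlier patch in this series):

	/* Illustrative helpers: resolve per-cgroup state through the
	 * struct proto accessors instead of the old globals.
	 */
	static inline long sk_prot_mem_limit(const struct sock *sk, int index)
	{
		/* prot_mem() returns the cgroup's private copy of tcp_mem */
		return sk->sk_prot->prot_mem(sk->sk_cgrp)[index];
	}

	static inline bool sk_has_memory_pressure(const struct sock *sk)
	{
		/* memory_pressure() points at the cgroup's flag */
		return *sk->sk_prot->memory_pressure(sk->sk_cgrp) != 0;
	}

The indirection keeps the !CONFIG_CGROUP_KMEM case simple: there the
same accessors just return the old globals.
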
 include/linux/kmem_cgroup.h |    7 ++++
 include/net/sock.h          |   10 ++++++-
 mm/kmem_cgroup.c            |   10 ++++++-
 net/core/sock.c             |   18 +++++++++++
 net/ipv4/tcp.c              |   67 +++++++++++++++++++++++++++++++++++++-----
 5 files changed, 102 insertions(+), 10 deletions(-)

diff --git a/include/linux/kmem_cgroup.h b/include/linux/kmem_cgroup.h
index d983ba8..89ad0a1 100644
--- a/include/linux/kmem_cgroup.h
+++ b/include/linux/kmem_cgroup.h
@@ -23,6 +23,13 @@
 struct kmem_cgroup {
 	struct cgroup_subsys_state css;
 	struct kmem_cgroup *parent;
+
+#ifdef CONFIG_INET
+	int tcp_memory_pressure;
+	atomic_long_t tcp_memory_allocated;
+	struct percpu_counter tcp_sockets_allocated;
+	long tcp_prot_mem[3];
+#endif
 };
 
 
diff --git a/include/net/sock.h b/include/net/sock.h
index ab65640..91424e3 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -64,6 +64,7 @@
 #include <net/dst.h>
 #include <net/checksum.h>
 
+int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
 /*
  * This structure really needs to be cleaned up.
  * Most of it is for TCP, and not used by any of
@@ -814,7 +815,14 @@ struct proto {
 	int			*(*memory_pressure)(struct kmem_cgroup *sg);
 	/* Pointer to the per-cgroup version of the sysctl_mem field */
 	long			*(*prot_mem)(struct kmem_cgroup *sg);
-
+	/*
+	 * cgroup-specific initialization function. Called once for each
+	 * protocol that implements it, from the cgroup's populate function.
+	 * This function has to set up any files the protocol wants to
+	 * appear in the kmem cgroup filesystem.
+	 */
+	int			(*init_cgroup)(struct cgroup *cgrp,
+					       struct cgroup_subsys *ss);
 	int			*sysctl_wmem;
 	int			*sysctl_rmem;
 	int			max_header;
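
The init_cgroup callback above is filled in per protocol. As a sketch
of how tcp_prot is expected to pick up the new entry (the actual
hookup lives elsewhere in the series, and the accessor field names
are assumptions based on the functions this patch defines):

	/* Sketch only: not part of this patch. */
	struct proto tcp_prot = {
		.name			= "TCP",
		/* ... existing fields left unchanged ... */
		.enter_memory_pressure	= tcp_enter_memory_pressure,
		.memory_pressure	= memory_pressure_tcp,
		.sockets_allocated	= sockets_allocated_tcp,
		.memory_allocated	= memory_allocated_tcp,
		.prot_mem		= tcp_sysctl_mem,
		.init_cgroup		= tcp_init_cgroup,
	};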
diff --git a/mm/kmem_cgroup.c b/mm/kmem_cgroup.c
index 7950e69..5e53d66 100644
--- a/mm/kmem_cgroup.c
+++ b/mm/kmem_cgroup.c
@@ -17,16 +17,24 @@
 #include <linux/cgroup.h>
 #include <linux/slab.h>
 #include <linux/kmem_cgroup.h>
+#include <net/sock.h>
 
 static int kmem_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
-	return 0;
+	int ret = 0;
+#ifdef CONFIG_NET
+	ret = sockets_populate(ss, cgrp);
+#endif
+	return ret;
 }
 
 static void
 kmem_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 	struct kmem_cgroup *cg = kcg_from_cgroup(cgrp);
+#ifdef CONFIG_INET
+	percpu_counter_destroy(&cg->tcp_sockets_allocated);
+#endif
 	kfree(cg);
 }
 
diff --git a/net/core/sock.c b/net/core/sock.c
index ead9c02..9d833cf 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -134,6 +134,24 @@
 #include <net/tcp.h>
 #endif
 
+static DEFINE_RWLOCK(proto_list_lock);
+static LIST_HEAD(proto_list);
+
+int sockets_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	struct proto *proto;
+	int ret = 0;
+
+	read_lock(&proto_list_lock);
+	list_for_each_entry(proto, &proto_list, node) {
+		if (proto->init_cgroup)
+			ret |= proto->init_cgroup(cgrp, ss);
+	}
+	read_unlock(&proto_list_lock);
+
+	return ret;
+}
+
 /*
  * Each address family might have different locking rules, so we have
  * one slock key per address family:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 76f03ed..0725dc4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -289,13 +289,6 @@ int sysctl_tcp_rmem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_tcp_rmem);
 EXPORT_SYMBOL(sysctl_tcp_wmem);
 
-atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
-
-/*
- * Current number of TCP sockets.
- */
-struct percpu_counter tcp_sockets_allocated;
-
 /*
  * TCP splice context
  */
@@ -305,13 +298,68 @@ struct tcp_splice_state {
 	unsigned int flags;
 };
 
+#ifdef CONFIG_CGROUP_KMEM
 /*
  * Pressure flag: try to collapse.
  * Technical note: it is used by multiple contexts non-atomically.
  * All of __sk_mem_schedule() is of this nature: accounting
  * is strict, actions are advisory and have some latency.
  */
-int tcp_memory_pressure __read_mostly;
+void tcp_enter_memory_pressure(struct sock *sk)
+{
+	struct kmem_cgroup *sg = sk->sk_cgrp;
+	if (!sg->tcp_memory_pressure) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
+		sg->tcp_memory_pressure = 1;
+	}
+}
+
+long *tcp_sysctl_mem(struct kmem_cgroup *sg)
+{
+	return sg->tcp_prot_mem;
+}
+
+atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
+{
+	return &(sg->tcp_memory_allocated);
+}
+
+int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+	struct kmem_cgroup *sg = kcg_from_cgroup(cgrp);
+	unsigned long limit;
+	struct net *net = current->nsproxy->net_ns;
+
+	sg->tcp_memory_pressure = 0;
+	atomic_long_set(&sg->tcp_memory_allocated, 0);
+	percpu_counter_init(&sg->tcp_sockets_allocated, 0);
+
+	limit = nr_free_buffer_pages() / 8;
+	limit = max(limit, 128UL);
+
+	sg->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
+	sg->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
+	sg->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
+
+	return 0;
+}
+EXPORT_SYMBOL(tcp_init_cgroup);
+
+int *memory_pressure_tcp(struct kmem_cgroup *sg)
+{
+	return &sg->tcp_memory_pressure;
+}
+
+struct percpu_counter *sockets_allocated_tcp(struct kmem_cgroup *sg)
+{
+	return &sg->tcp_sockets_allocated;
+}
+#else
+
+/* Current number of TCP sockets. */
+struct percpu_counter tcp_sockets_allocated;
+atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
+int tcp_memory_pressure;
 
 int *memory_pressure_tcp(struct kmem_cgroup *sg)
 {
@@ -340,6 +388,7 @@ atomic_long_t *memory_allocated_tcp(struct kmem_cgroup *sg)
 {
 	return &tcp_memory_allocated;
 }
+#endif /* CONFIG_CGROUP_KMEM */
 
 EXPORT_SYMBOL(memory_pressure_tcp);
 EXPORT_SYMBOL(sockets_allocated_tcp);
@@ -3247,7 +3296,9 @@ void __init tcp_init(void)
 
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
 
+#ifndef CONFIG_CGROUP_KMEM
 	percpu_counter_init(&tcp_sockets_allocated, 0);
+#endif
 	percpu_counter_init(&tcp_orphan_count, 0);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
-- 
1.7.6
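
A note on how the pressure flag gets raised: the generic socket code
invokes the protocol's enter_memory_pressure callback once allocations
cross the pressure threshold in prot_mem[], so with this patch only
the offending cgroup is marked. Roughly, from __sk_mem_schedule()
(reconstructed from mainline of this era, so treat it as a sketch):

	/* Over the "pressure" threshold: flag the socket's cgroup. */
	if (sk->sk_prot->enter_memory_pressure)
		sk->sk_prot->enter_memory_pressure(sk);

With CONFIG_CGROUP_KMEM the flag lives in the socket's kmem_cgroup;
without it, the fallback globals keep the old system-wide behaviour.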


