[RFC][PATCH] memory cgroup enhancements updated [10/10] NUMA aware account

KAMEZAWA Hiroyuki kamezawa.hiroyu at jp.fujitsu.com
Fri Oct 19 02:37:30 PDT 2007


This patch is a trial at making the memory cgroup statistics NUMA-aware.

 * divide the per-cpu stat counters by node id (nid).
 * add nid information to page_cgroup.
 * because the init routine now has to call kmalloc(), early_init is set to 0.

This is just a trial at this stage. Any comments are welcome.
It works well on my fake NUMA system.
I think we can add a "numastat" file based on this (see the sketch below).
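
For reference, a rough sketch of what such a numastat reader could look
like on top of the per-node counters. This is not part of the patch below;
the function name, the output format, and the cgroup file hookup are
placeholders.

/*
 * Illustrative sketch only -- not part of the patch below. Aggregate the
 * per-cpu counters by node and print one line per node, numastat style.
 */
static int mem_control_numa_stat_show(struct seq_file *m,
					struct mem_cgroup *mem_cont)
{
	struct mem_cgroup_stat *stat = &mem_cont->stat;
	int node;

	for_each_node_state(node, N_POSSIBLE) {
		unsigned int cpu;
		int i;

		seq_printf(m, "node %d", node);
		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
			s64 val = 0;

			for_each_possible_cpu(cpu)
				val += stat->cpustat[cpu]->count[node][i];
			val *= mem_cgroup_stat_desc[i].unit;
			seq_printf(m, " %s=%lld",
					mem_cgroup_stat_desc[i].msg, val);
		}
		seq_putc(m, '\n');
	}
	return 0;
}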

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu at jp.fujitsu.com>


 mm/memcontrol.c |   97 ++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 71 insertions(+), 26 deletions(-)

Index: devel-2.6.23-mm1/mm/memcontrol.c
===================================================================
--- devel-2.6.23-mm1.orig/mm/memcontrol.c
+++ devel-2.6.23-mm1/mm/memcontrol.c
@@ -29,6 +29,7 @@
 #include <linux/spinlock.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
+#include <linux/vmalloc.h>
 
 #include <asm/uaccess.h>
 
@@ -59,12 +60,18 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_NSTATS,
 };
 
+#ifndef CONFIG_NUMA
 struct mem_cgroup_stat_cpu {
-	s64 count[MEM_CGROUP_STAT_NSTATS];
+	s64 count[1][MEM_CGROUP_STAT_NSTATS];
 } ____cacheline_aligned_in_smp;
+#else
+struct mem_cgroup_stat_cpu {
+	s64 count[MAX_NUMNODES][MEM_CGROUP_STAT_NSTATS];
+} ____cacheline_aligned_in_smp;
+#endif
 
 struct mem_cgroup_stat {
-	struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
+	struct mem_cgroup_stat_cpu *cpustat[NR_CPUS];
 };
 
 /*
@@ -72,28 +79,28 @@ struct mem_cgroup_stat {
  * MUST be called under preempt_disable().
  */
 static inline void __mem_cgroup_stat_add(struct mem_cgroup_stat *stat,
-                enum mem_cgroup_stat_index idx, int val)
+		enum mem_cgroup_stat_index idx, int nid, int val)
 {
 	int cpu = smp_processor_id();
 #ifdef CONFIG_PREEMPT
 	VM_BUG_ON(preempt_count() == 0);
 #endif
-	stat->cpustat[cpu].count[idx] += val;
+	stat->cpustat[cpu]->count[nid][idx] += val;
 }
 
 static inline void mem_cgroup_stat_inc(struct mem_cgroup_stat *stat,
-		enum mem_cgroup_stat_index idx)
+		enum mem_cgroup_stat_index idx, int nid)
 {
 	preempt_disable();
-	__mem_cgroup_stat_add(stat, idx, 1);
+	__mem_cgroup_stat_add(stat, idx, nid, 1);
 	preempt_enable();
 }
 
 static inline void mem_cgroup_stat_dec(struct mem_cgroup_stat *stat,
-		enum mem_cgroup_stat_index idx)
+		enum mem_cgroup_stat_index idx, int nid)
 {
 	preempt_disable();
-	__mem_cgroup_stat_add(stat, idx, -1);
+	__mem_cgroup_stat_add(stat, idx, nid, -1);
 	preempt_enable();
 }
 
@@ -149,6 +156,7 @@ struct page_cgroup {
 	struct list_head lru;		/* per cgroup LRU list */
 	struct page *page;
 	struct mem_cgroup *mem_cgroup;
+	int	nid;
 	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
 					/* mapped and cached states     */
 	int	 flags;
@@ -169,21 +177,23 @@ enum {
  * We have to modify several values at charge/uncharge..
  */
 static inline void
-mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, int charge)
+mem_cgroup_charge_statistics(struct mem_cgroup *mem, int nid,
+				int flags, int charge)
 {
 	int val = (charge)? 1 : -1;
 	struct mem_cgroup_stat *stat = &mem->stat;
 	preempt_disable();
 
 	if (flags & PCGF_PAGECACHE)
-		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_PAGECACHE, val);
+		__mem_cgroup_stat_add(stat,
+				MEM_CGROUP_STAT_PAGECACHE, nid, val);
 	else
-		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_RSS, val);
+		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_RSS, nid, val);
 
 	if (flags & PCGF_ACTIVE)
-		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_ACTIVE, val);
+		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_ACTIVE, nid, val);
 	else
-		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_INACTIVE, val);
+		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_INACTIVE, nid, val);
 
 	preempt_enable();
 }
@@ -315,8 +325,10 @@ static void __mem_cgroup_move_lists(stru
 	if (moved) {
 		struct mem_cgroup_stat *stat = &mem->stat;
 		preempt_disable();
-		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_ACTIVE, moved);
-		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_INACTIVE, -moved);
+		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_ACTIVE,
+							pc->nid, moved);
+		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_INACTIVE,
+					pc->nid, -moved);
 		preempt_enable();
 	}
 	if (active) {
@@ -493,10 +505,10 @@ retry:
 		bool is_atomic = gfp_mask & GFP_ATOMIC;
 		if (is_cache)
 			mem_cgroup_stat_inc(&mem->stat,
-				MEM_CGROUP_STAT_FAIL_CACHE);
+				MEM_CGROUP_STAT_FAIL_CACHE, page_to_nid(page));
 		else
 			mem_cgroup_stat_inc(&mem->stat,
-				MEM_CGROUP_STAT_FAIL_RSS);
+				MEM_CGROUP_STAT_FAIL_RSS, page_to_nid(page));
 		/*
 		 * We cannot reclaim under GFP_ATOMIC, fail the charge
 		 */
@@ -537,6 +549,7 @@ noreclaim:
 	atomic_set(&pc->ref_cnt, 1);
 	pc->mem_cgroup = mem;
 	pc->page = page;
+	pc->nid = page_to_nid(page);
 	if (is_cache)
 		pc->flags = PCGF_PAGECACHE | PCGF_ACTIVE;
 	else
@@ -554,7 +567,7 @@ noreclaim:
 	}
 
 	/* Update statistics vector */
-	mem_cgroup_charge_statistics(mem, pc->flags, true);
+	mem_cgroup_charge_statistics(mem, pc->nid, pc->flags, true);
 
 	spin_lock_irqsave(&mem->lru_lock, flags);
 	list_add(&pc->lru, &mem->active_list);
@@ -621,7 +634,8 @@ void mem_cgroup_uncharge(struct page_cgr
 			spin_lock_irqsave(&mem->lru_lock, flags);
 			list_del_init(&pc->lru);
 			spin_unlock_irqrestore(&mem->lru_lock, flags);
-			mem_cgroup_charge_statistics(mem, pc->flags, false);
+			mem_cgroup_charge_statistics(mem, pc->nid, pc->flags,
+						false);
 			kfree(pc);
 		}
 	}
@@ -657,6 +671,7 @@ void mem_cgroup_end_migration(struct pag
 void mem_cgroup_page_migration(struct page *page, struct page *newpage)
 {
 	struct page_cgroup *pc;
+	struct mem_cgroup *mem;
 retry:
 	pc = page_get_page_cgroup(page);
 	if (!pc)
@@ -664,6 +679,11 @@ retry:
 	if (clear_page_cgroup(page, pc) != pc)
 		goto retry;
 	pc->page = newpage;
+	pc->nid = page_to_nid(newpage);
+	mem = pc->mem_cgroup;
+	mem_cgroup_charge_statistics(mem, page_to_nid(page), pc->flags, false);
+	mem_cgroup_charge_statistics(mem,
+				page_to_nid(newpage), pc->flags, true);
 	lock_page_cgroup(newpage);
 	page_assign_page_cgroup(newpage, pc);
 	unlock_page_cgroup(newpage);
@@ -697,7 +717,8 @@ retry:
 			css_put(&mem->css);
 			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			list_del_init(&pc->lru);
-			mem_cgroup_charge_statistics(mem, pc->flags, false);
+			mem_cgroup_charge_statistics(mem, pc->nid, pc->flags,
+					false);
 			kfree(pc);
 		} else 	/* being uncharged ? ...do relax */
 			break;
@@ -872,13 +893,16 @@ static int mem_control_stat_show(struct 
 	struct mem_cgroup_stat *stat = &mem_cont->stat;
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
+	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 		unsigned int cpu;
+		int node;
 		s64 val;
 
 		val = 0;
-		for (cpu = 0; cpu < NR_CPUS; cpu++)
-			val += stat->cpustat[cpu].count[i];
+		for_each_possible_cpu(cpu)
+			for_each_node_state(node, N_POSSIBLE)
+				val += stat->cpustat[cpu]->count[node][i];
+
 		val *= mem_cgroup_stat_desc[i].unit;
 		seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg, val);
 	}
@@ -941,12 +965,14 @@ static struct cgroup_subsys_state *
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	struct mem_cgroup *mem;
+	int cpu;
 
 	if (unlikely((cont->parent) == NULL)) {
 		mem = &init_mem_cgroup;
 		init_mm.mem_cgroup = mem;
-	} else
+	} else {
 		mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
+	}
 
 	if (mem == NULL)
 		return NULL;
@@ -956,6 +982,17 @@ mem_cgroup_create(struct cgroup_subsys *
 	INIT_LIST_HEAD(&mem->inactive_list);
 	spin_lock_init(&mem->lru_lock);
 	mem->control_type = MEM_CGROUP_TYPE_ALL;
+
+	for_each_possible_cpu(cpu) {
+		int nid = cpu_to_node(cpu);
+		struct mem_cgroup_stat_cpu *mcsc;
+		if (sizeof(*mcsc) < PAGE_SIZE)
+			mcsc = kmalloc_node(sizeof(*mcsc), GFP_KERNEL, nid);
+		else
+			mcsc = vmalloc_node(sizeof(*mcsc), nid);
+		memset(mcsc, 0, sizeof(*mcsc));
+		mem->stat.cpustat[cpu] = mcsc;
+	}
 	return &mem->css;
 }
 
@@ -969,7 +1006,15 @@ static void mem_cgroup_pre_destroy(struc
 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 				struct cgroup *cont)
 {
-	kfree(mem_cgroup_from_cont(cont));
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		if (sizeof(struct mem_cgroup_stat_cpu) < PAGE_SIZE)
+			kfree(mem->stat.cpustat[cpu]);
+		else
+			vfree(mem->stat.cpustat[cpu]);
+	}
+	kfree(mem);
 }
 
 static int mem_cgroup_populate(struct cgroup_subsys *ss,
@@ -1021,5 +1066,5 @@ struct cgroup_subsys mem_cgroup_subsys =
 	.destroy = mem_cgroup_destroy,
 	.populate = mem_cgroup_populate,
 	.attach = mem_cgroup_move_task,
-	.early_init = 1,
+	.early_init = 0,
 };
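
Known rough edges in this trial: mem_cgroup_create() does not check the
per-cpu stat allocation for failure, and the kmalloc_node()/vmalloc_node()
size check is duplicated in create/destroy. A possible shape for a
follow-up cleanup (helper names are placeholders, not in the patch above):

/*
 * Illustrative only -- keep the size decision in one place and check the
 * allocation result, which the trial code above does not do yet.
 */
static struct mem_cgroup_stat_cpu *mem_cgroup_stat_cpu_alloc(int nid)
{
	struct mem_cgroup_stat_cpu *mcsc;

	if (sizeof(*mcsc) < PAGE_SIZE)
		mcsc = kmalloc_node(sizeof(*mcsc), GFP_KERNEL, nid);
	else
		mcsc = vmalloc_node(sizeof(*mcsc), nid);
	if (mcsc)
		memset(mcsc, 0, sizeof(*mcsc));
	return mcsc;
}

static void mem_cgroup_stat_cpu_free(struct mem_cgroup_stat_cpu *mcsc)
{
	if (sizeof(*mcsc) < PAGE_SIZE)
		kfree(mcsc);
	else
		vfree(mcsc);
}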


