[RFC] memory controller : background reclaim and avoid excessive locking [4/5] borrow resource

KAMEZAWA Hiroyuki kamezawa.hiroyu at jp.fujitsu.com
Thu Feb 14 00:35:04 PST 2008


One of the contended locks is counter->lock.

Currently, counter->usage is updated in units of PAGE_SIZE. This patch
changes the charge granularity to PAGE_SIZE * borrow_factor and caches
the surplus ("borrow") in a per-cpu area. The factor is set through the
new per-cgroup "bulkratio" file (in pages); when it is 0, the default,
charging stays precise. The cached borrow is drained back to the global
counter when reclaim starts to struggle and on force_empty.

This reduces the number of times counter->lock has to be taken.
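
To make the pattern concrete, below is a minimal user-space sketch of the
idea (this is not the kernel code; borrow_charge(), BORROW_UNIT and the
per-thread "borrow" variable are invented for illustration). A shared
counter protected by a lock is charged in whole borrow-unit chunks, and
each CPU, modelled here by a thread-local variable, consumes from its
local surplus before it has to take the lock again.

/*
 * Simplified illustration of per-cpu borrowing, not kernel code.
 * The shared counter is charged in BORROW_UNIT chunks; the local
 * surplus is then consumed without taking the lock.
 */
#include <pthread.h>
#include <stdio.h>

#define PAGE_SIZE	4096L
#define BORROW_UNIT	(PAGE_SIZE * 16)	/* a "bulkratio" of 16 pages */

static pthread_mutex_t counter_lock = PTHREAD_MUTEX_INITIALIZER;
static long usage;				/* like res_counter.usage */
static long limit = 1024 * PAGE_SIZE;		/* like res_counter.limit */

static __thread long borrow;			/* like the per-cpu BORROW stat */

/* Charge 'size' bytes; take the lock only when the local surplus runs out. */
static int borrow_charge(long size)
{
	int ret = 0;

	if (borrow >= size) {			/* fast path, no lock */
		borrow -= size;
		return 0;
	}
	pthread_mutex_lock(&counter_lock);
	if (usage + BORROW_UNIT <= limit) {	/* charge a whole unit at once */
		usage += BORROW_UNIT;
		borrow += BORROW_UNIT - size;
	} else if (usage + size <= limit) {	/* precise fallback near the limit */
		usage += size;
	} else {
		ret = -1;			/* over limit */
	}
	pthread_mutex_unlock(&counter_lock);
	return ret;
}

int main(void)
{
	long i, failed = 0;

	for (i = 0; i < 2048; i++)
		if (borrow_charge(PAGE_SIZE))
			failed++;
	printf("failed charges: %ld, usage: %ld pages\n",
	       failed, usage / PAGE_SIZE);
	return 0;
}

The trade-off is the same as in the patch: usage can stay above what tasks
really hold by a few borrow units per CPU, which is why the borrow is
drained back to the global counter when reclaim keeps failing and in
mem_cgroup_force_empty().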


Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu at jp.fujitsu.com>

Index: linux-2.6.24-mm1/mm/memcontrol.c
===================================================================
--- linux-2.6.24-mm1.orig/mm/memcontrol.c
+++ linux-2.6.24-mm1/mm/memcontrol.c
@@ -47,7 +47,8 @@ enum mem_cgroup_stat_index {
 	 */
 	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as rss */
-
+	MEM_CGROUP_STAT_BORROW,	   /* per-cpu amount of resource borrowed
+					from the global counter */
 	MEM_CGROUP_STAT_NSTATS,
 };
 
@@ -134,6 +135,9 @@ struct mem_cgroup {
 	struct mem_cgroup_lru_info info;
 
 	int	prev_priority;	/* for recording reclaim priority */
+
+	int	borrow_unit;	  /* size of unit for borrowing resource */
+
 	/*
 	 * statistics.
 	 */
@@ -611,6 +615,92 @@ unsigned long mem_cgroup_isolate_pages(u
 	return nr_taken;
 }
 
+/* FIXME?  we assume that size is always PAGE_SIZE. */
+
+static int mem_cgroup_borrow_and_charge(struct mem_cgroup *mem, int size)
+{
+	unsigned long flags;
+	int ret;
+
+	ret = 0;
+
+	local_irq_save(flags);
+	if (mem->borrow_unit) {
+		int cpu;
+		s64 *bwp;
+		cpu = smp_processor_id();
+		bwp = &mem->stat.cpustat[cpu].count[MEM_CGROUP_STAT_BORROW];
+		if (*bwp > size) {
+			*bwp -= size;
+			goto out;
+		}
+		/* try to charge */
+		ret = res_counter_charge(&mem->res, mem->borrow_unit);
+		if (!ret) { /* success */
+			*bwp += (mem->borrow_unit - size);
+			goto out;
+		}
+	}
+	spin_lock(&mem->res.lock);
+	ret = res_counter_charge_locked(&mem->res, size);
+	spin_unlock(&mem->res.lock);
+out:
+	local_irq_restore(flags);
+	return ret;
+}
+
+static void mem_cgroup_return_and_uncharge(struct mem_cgroup *mem, int size)
+{
+	unsigned long flags;
+	int uncharge_size = 0;
+
+	local_irq_save(flags);
+	if (mem->borrow_unit) {
+		int limit = mem->borrow_unit * 2;
+		int cpu;
+		s64 *bwp;
+		cpu = smp_processor_id();
+		bwp = &mem->stat.cpustat[cpu].count[MEM_CGROUP_STAT_BORROW];
+		*bwp += size;
+		if (*bwp > limit) {
+			uncharge_size = *bwp - mem->borrow_unit;
+			*bwp = mem->borrow_unit;
+		}
+	} else
+		uncharge_size = size;
+
+	if (uncharge_size) {
+		spin_lock(&mem->res.lock);
+		res_counter_uncharge_locked(&mem->res, uncharge_size);
+		spin_unlock(&mem->res.lock);
+	}
+	local_irq_restore(flags);
+
+	return;
+}
+
+static void drain_local_borrow(void *data)
+{
+	int cpu;
+	int borrow;
+	unsigned long flags;
+	struct mem_cgroup *mem = data;
+
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+	borrow = mem->stat.cpustat[cpu].count[MEM_CGROUP_STAT_BORROW];
+	mem->stat.cpustat[cpu].count[MEM_CGROUP_STAT_BORROW] = 0;
+	spin_lock(&mem->res.lock);
+	res_counter_uncharge_locked(&mem->res, borrow);
+	spin_unlock(&mem->res.lock);
+	local_irq_restore(flags);
+}
+
+static void drain_all_borrow(struct mem_cgroup *mem)
+{
+	on_each_cpu(drain_local_borrow, mem, 0, 1);
+}
+
 /*
  * Charge the memory controller for page usage.
  * Return
@@ -681,7 +771,7 @@ retry:
 	 * If we created the page_cgroup, we should free it on exceeding
 	 * the cgroup limit.
 	 */
-	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
+	while (mem_cgroup_borrow_and_charge(mem, PAGE_SIZE)) {
 		int ret;
 		if (!(gfp_mask & __GFP_WAIT))
 			goto out;
@@ -709,6 +799,8 @@ retry:
 		if (res_counter_check_under_limit(&mem->res))
 			continue;
 
+		if (nr_retries < MEM_CGROUP_RECLAIM_RETRIES)
+			drain_all_borrow(mem);
 		if (!nr_retries--) {
 			mem_cgroup_out_of_memory(mem, gfp_mask);
 			goto out;
@@ -805,7 +897,7 @@ void mem_cgroup_uncharge(struct page_cgr
 		if (clear_page_cgroup(page, pc) == pc) {
 			mem = pc->mem_cgroup;
 			css_put(&mem->css);
-			res_counter_uncharge(&mem->res, PAGE_SIZE);
+			mem_cgroup_return_and_uncharge(mem, PAGE_SIZE);
 			spin_lock_irqsave(&mz->lru_lock, flags);
 			__mem_cgroup_remove_list(pc);
 			spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -1005,6 +1097,7 @@ int mem_cgroup_force_empty(struct mem_cg
 				/* drop all page_cgroup in inactive_list */
 				mem_cgroup_force_empty_list(mem, mz, 0);
 			}
+		drain_all_borrow(mem);
 	}
 	ret = 0;
 out:
@@ -1109,12 +1202,29 @@ static u64 mem_throttle_read(struct cgro
 	return (u64)mem->throttle.limit;
 }
 
+static int mem_bulkratio_write(struct cgroup *cont, struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	int unit = val * PAGE_SIZE;
+	if (unit > (PAGE_SIZE << (MAX_ORDER/2)))
+		return -EINVAL;
+	mem->borrow_unit = unit;
+	return 0;
+}
+
+static u64 mem_bulkratio_read(struct cgroup *cont, struct cftype *cft)
+{
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	return (u64)(mem->borrow_unit/PAGE_SIZE);
+}
+
 static const struct mem_cgroup_stat_desc {
 	const char *msg;
 	u64 unit;
 } mem_cgroup_stat_desc[] = {
 	[MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
 	[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
+	[MEM_CGROUP_STAT_BORROW] = { "borrow", 1, },
 };
 
 static int mem_control_stat_show(struct seq_file *m, void *arg)
@@ -1205,6 +1315,11 @@ static struct cftype mem_cgroup_files[] 
 		.read_uint  = mem_throttle_read,
 	},
 	{
+		.name = "bulkratio",
+		.write_uint = mem_bulkratio_write,
+		.read_uint = mem_bulkratio_read,
+	},
+	{
 		.name = "stat",
 		.open = mem_control_stat_open,
 	},
@@ -1279,6 +1394,8 @@ mem_cgroup_create(struct cgroup_subsys *
 	mem->throttle.limit = 10240; /* maybe enough big for no throttle */
 	atomic_set(&mem->throttle.reclaimers, 0);
 
+	mem->borrow_unit = 0; /* charge precisely (no borrowing) by default */
+
 	return &mem->css;
 free_out:
 	for_each_node_state(node, N_POSSIBLE)


