[PATCH 3/7] page_cgroup: provide a generic page tracking infrastructure

Gui Jianfeng guijianfeng at cn.fujitsu.com
Thu Apr 23 19:11:09 PDT 2009


Andrea Righi wrote:
> Dirty pages in the page cache can be processed asynchronously by kernel
> threads (pdflush) using a writeback policy. For this reason the real
> writes to the underlying block devices occur in a different IO context
> with respect to the task that originally generated the dirty pages
> involved in the IO operation. This makes the tracking and throttling of
> writeback IO more complicated than for synchronous IO.
> 
> The page_cgroup infrastructure, currently available only for the memory
> cgroup controller, can be used to store the owner of each page and
> properly track the writeback IO. This information is encoded in
> page_cgroup->flags.

  You encode the id in page_cgroup->flags; if a cgroup gets removed, IMHO,
  you should clear the corresponding id from the flags of its pages.
  One more thing: if a task moves from one cgroup to another, the id in
  flags also needs to be updated.
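
  Something like the following is what I have in mind. This is only a
  sketch: the helper name and the place it would be called from (cgroup
  removal, task attach) are hypothetical and not part of this patch; it
  only relies on the lookup/lock/id helpers introduced here.

	#include <linux/page_cgroup.h>

	/*
	 * Hypothetical helper: drop a stale owner id from a page's
	 * page_cgroup, e.g. when the owning cgroup is removed or the
	 * task that dirtied the page has moved to another cgroup.
	 */
	static void page_cgroup_clear_owner(struct page *page,
					    unsigned long old_id)
	{
		struct page_cgroup *pc = lookup_page_cgroup(page);

		if (unlikely(!pc))
			return;

		lock_page_cgroup(pc);
		/* only reset pages still tagged with the stale id */
		if (page_cgroup_get_id(pc) == old_id)
			page_cgroup_set_id(pc, 0);	/* 0 == no owner */
		unlock_page_cgroup(pc);
	}

  The cgroup removal path (and the attach callback for a moved task) would
  then have to find the pages still tagged with the old id and call
  something like this on each of them.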

> 
> An owner can be identified using a generic ID number, and the following
> interfaces are provided to store and retrieve this information:
> 
>   unsigned long page_cgroup_get_owner(struct page *page);
>   int page_cgroup_set_owner(struct page *page, unsigned long id);
>   int page_cgroup_copy_owner(struct page *npage, struct page *opage);
> 
> The io-throttle controller uses the cgroup css_id() as the owner's ID
> number.
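
  Just to illustrate the intended usage: the dirtier tags the page with its
  css id, and the writeback path reads it back later. A minimal sketch
  follows; the iothrottle_* helper names are made up here, only css_id(),
  page_cgroup_set_owner() and page_cgroup_get_owner() come from the cgroup
  code and this patch.

	#include <linux/cgroup.h>
	#include <linux/page_cgroup.h>

	/* tag a page with the css id of the cgroup that is dirtying it */
	static int iothrottle_tag_page(struct page *page,
				       struct cgroup_subsys_state *css)
	{
		/* returns -ENOENT if the page has no page_cgroup */
		return page_cgroup_set_owner(page, css_id(css));
	}

	/* in the writeback path, recover the owner of the page */
	static unsigned long iothrottle_page_owner(struct page *page)
	{
		/* 0 means the owner could not be retrieved */
		return page_cgroup_get_owner(page);
	}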
> 
> A big part of this code is taken from Ryo and Hirokazu's bio-cgroup
> controller (http://people.valinux.co.jp/~ryov/bio-cgroup/).
> 
> Signed-off-by: Andrea Righi <righi.andrea at gmail.com>
> Signed-off-by: Hirokazu Takahashi <taka at valinux.co.jp>
> Signed-off-by: Ryo Tsuruta <ryov at valinux.co.jp>
> ---
>  include/linux/memcontrol.h  |    6 +++
>  include/linux/mmzone.h      |    4 +-
>  include/linux/page_cgroup.h |   33 +++++++++++++-
>  init/Kconfig                |    4 ++
>  mm/Makefile                 |    3 +-
>  mm/memcontrol.c             |    6 +++
>  mm/page_cgroup.c            |   95 ++++++++++++++++++++++++++++++++++++++-----
>  7 files changed, 135 insertions(+), 16 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 18146c9..f3e0e64 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -37,6 +37,8 @@ struct mm_struct;
>   * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
>   */
>  
> +extern void __init_mem_page_cgroup(struct page_cgroup *pc);
> +
>  extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
>  				gfp_t gfp_mask);
>  /* for swap handling */
> @@ -120,6 +122,10 @@ extern bool mem_cgroup_oom_called(struct task_struct *task);
>  #else /* CONFIG_CGROUP_MEM_RES_CTLR */
>  struct mem_cgroup;
>  
> +static inline void __init_mem_page_cgroup(struct page_cgroup *pc)
> +{
> +}
> +
>  static inline int mem_cgroup_newpage_charge(struct page *page,
>  					struct mm_struct *mm, gfp_t gfp_mask)
>  {
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 186ec6a..b178eb9 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -607,7 +607,7 @@ typedef struct pglist_data {
>  	int nr_zones;
>  #ifdef CONFIG_FLAT_NODE_MEM_MAP	/* means !SPARSEMEM */
>  	struct page *node_mem_map;
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +#ifdef CONFIG_PAGE_TRACKING
>  	struct page_cgroup *node_page_cgroup;
>  #endif
>  #endif
> @@ -958,7 +958,7 @@ struct mem_section {
>  
>  	/* See declaration of similar field in struct zone */
>  	unsigned long *pageblock_flags;
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +#ifdef CONFIG_PAGE_TRACKING
>  	/*
>  	 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
>  	 * section. (see memcontrol.h/page_cgroup.h about this.)
> diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
> index 7339c7b..f24d081 100644
> --- a/include/linux/page_cgroup.h
> +++ b/include/linux/page_cgroup.h
> @@ -1,7 +1,7 @@
>  #ifndef __LINUX_PAGE_CGROUP_H
>  #define __LINUX_PAGE_CGROUP_H
>  
> -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +#ifdef CONFIG_PAGE_TRACKING
>  #include <linux/bit_spinlock.h>
>  /*
>   * Page Cgroup can be considered as an extended mem_map.
> @@ -12,11 +12,38 @@
>   */
>  struct page_cgroup {
>  	unsigned long flags;
> -	struct mem_cgroup *mem_cgroup;
>  	struct page *page;
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +	struct mem_cgroup *mem_cgroup;
>  	struct list_head lru;		/* per cgroup LRU list */
> +#endif
>  };
>  
> +/*
> + * use lower 16 bits for flags and reserve the rest for the page tracking id
> + */
> +#define PAGE_TRACKING_ID_SHIFT	(16)
> +#define PAGE_TRACKING_ID_BITS \
> +		(8 * sizeof(unsigned long) - PAGE_TRACKING_ID_SHIFT)
> +
> +/* NOTE: must be called with the page_cgroup lock held */
> +static inline unsigned long page_cgroup_get_id(struct page_cgroup *pc)
> +{
> +	return pc->flags >> PAGE_TRACKING_ID_SHIFT;
> +}
> +
> +/* NOTE: must be called with the page_cgroup lock held */
> +static inline void page_cgroup_set_id(struct page_cgroup *pc, unsigned long id)
> +{
> +	WARN_ON(id >= (1UL << PAGE_TRACKING_ID_BITS));
> +	pc->flags &= (1UL << PAGE_TRACKING_ID_SHIFT) - 1;
> +	pc->flags |= (unsigned long)(id << PAGE_TRACKING_ID_SHIFT);
> +}
> +
> +unsigned long page_cgroup_get_owner(struct page *page);
> +int page_cgroup_set_owner(struct page *page, unsigned long id);
> +int page_cgroup_copy_owner(struct page *npage, struct page *opage);
> +
>  void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat);
>  void __init page_cgroup_init(void);
>  struct page_cgroup *lookup_page_cgroup(struct page *page);
> @@ -71,7 +98,7 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc)
>  	bit_spin_unlock(PCG_LOCK, &pc->flags);
>  }
>  
> -#else /* CONFIG_CGROUP_MEM_RES_CTLR */
> +#else /* CONFIG_PAGE_TRACKING */
>  struct page_cgroup;
>  
>  static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
> diff --git a/init/Kconfig b/init/Kconfig
> index 7be4d38..5428ac7 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -569,6 +569,7 @@ config CGROUP_MEM_RES_CTLR
>  	bool "Memory Resource Controller for Control Groups"
>  	depends on CGROUPS && RESOURCE_COUNTERS
>  	select MM_OWNER
> +	select PAGE_TRACKING
>  	help
>  	  Provides a memory resource controller that manages both anonymous
>  	  memory and page cache. (See Documentation/cgroups/memory.txt)
> @@ -611,6 +612,9 @@ endif # CGROUPS
>  config MM_OWNER
>  	bool
>  
> +config PAGE_TRACKING
> +	bool
> +
>  config SYSFS_DEPRECATED
>  	bool
>  
> diff --git a/mm/Makefile b/mm/Makefile
> index ec73c68..b94e074 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -37,4 +37,5 @@ else
>  obj-$(CONFIG_SMP) += allocpercpu.o
>  endif
>  obj-$(CONFIG_QUICKLIST) += quicklist.o
> -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
> +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
> +obj-$(CONFIG_PAGE_TRACKING) += page_cgroup.o
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e44fb0f..69d1c31 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2524,6 +2524,12 @@ struct cgroup_subsys mem_cgroup_subsys = {
>  	.use_id = 1,
>  };
>  
> +void __meminit __init_mem_page_cgroup(struct page_cgroup *pc)
> +{
> +	pc->mem_cgroup = NULL;
> +	INIT_LIST_HEAD(&pc->lru);
> +}
> +
>  #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
>  
>  static int __init disable_swap_account(char *s)
> diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
> index 791905c..b3b394c 100644
> --- a/mm/page_cgroup.c
> +++ b/mm/page_cgroup.c
> @@ -3,6 +3,7 @@
>  #include <linux/bootmem.h>
>  #include <linux/bit_spinlock.h>
>  #include <linux/page_cgroup.h>
> +#include <linux/blk-io-throttle.h>
>  #include <linux/hash.h>
>  #include <linux/slab.h>
>  #include <linux/memory.h>
> @@ -14,9 +15,8 @@ static void __meminit
>  __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
>  {
>  	pc->flags = 0;
> -	pc->mem_cgroup = NULL;
>  	pc->page = pfn_to_page(pfn);
> -	INIT_LIST_HEAD(&pc->lru);
> +	__init_mem_page_cgroup(pc);
>  }
>  static unsigned long total_usage;
>  
> @@ -74,7 +74,7 @@ void __init page_cgroup_init(void)
>  
>  	int nid, fail;
>  
> -	if (mem_cgroup_disabled())
> +	if (mem_cgroup_disabled() && iothrottle_disabled())
>  		return;
>  
>  	for_each_online_node(nid)  {
> @@ -83,12 +83,13 @@ void __init page_cgroup_init(void)
>  			goto fail;
>  	}
>  	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
> -	printk(KERN_INFO "please try cgroup_disable=memory option if you"
> -	" don't want\n");
> +	printk(KERN_INFO
> +		"try cgroup_disable=memory,blockio option if you don't want\n");
>  	return;
>  fail:
>  	printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
> -	printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
> +	printk(KERN_CRIT
> +		"try cgroup_disable=memory,blockio boot option\n");
>  	panic("Out of memory");
>  }
>  
> @@ -243,12 +244,85 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
>  
>  #endif
>  
> +/**
> + * page_cgroup_get_owner() - get the owner ID of a page
> + * @page:	the page whose owner we want to find
> + *
> + * Returns the owner ID of the page; 0 means that the owner cannot be
> + * retrieved.
> + **/
> +unsigned long page_cgroup_get_owner(struct page *page)
> +{
> +	struct page_cgroup *pc;
> +	unsigned long ret;
> +
> +	pc = lookup_page_cgroup(page);
> +	if (unlikely(!pc))
> +		return 0;
> +
> +	lock_page_cgroup(pc);
> +	ret = page_cgroup_get_id(pc);
> +	unlock_page_cgroup(pc);
> +	return ret;
> +}
> +
> +/**
> + * page_cgroup_set_owner() - set the owner ID of a page
> + * @page:	the page we want to tag
> + * @id:		the ID number that will be associated to page
> + *
> + * Returns 0 if the owner is correctly associated to the page. Returns a
> + * negative value in case of failure.
> + **/
> +int page_cgroup_set_owner(struct page *page, unsigned long id)
> +{
> +	struct page_cgroup *pc;
> +
> +	pc = lookup_page_cgroup(page);
> +	if (unlikely(!pc))
> +		return -ENOENT;
> +
> +	lock_page_cgroup(pc);
> +	page_cgroup_set_id(pc, id);
> +	unlock_page_cgroup(pc);
> +	return 0;
> +}
> +
> +/**
> + * page_cgroup_copy_owner() - copy the owner ID of a page into another page
> + * @npage:	the page where we want to copy the owner
> + * @opage:	the page from which we want to copy the ID
> + *
> + * Returns 0 if the owner is correctly associated to npage. Returns a negative
> + * value in case of failure.
> + **/
> +int page_cgroup_copy_owner(struct page *npage, struct page *opage)
> +{
> +	struct page_cgroup *npc, *opc;
> +	unsigned long id;
> +
> +	npc = lookup_page_cgroup(npage);
> +	if (unlikely(!npc))
> +		return -ENOENT;
> +	opc = lookup_page_cgroup(opage);
> +	if (unlikely(!opc))
> +		return -ENOENT;
> +	lock_page_cgroup(opc);
> +	lock_page_cgroup(npc);
> +	id = page_cgroup_get_id(opc);
> +	page_cgroup_set_id(npc, id);
> +	unlock_page_cgroup(npc);
> +	unlock_page_cgroup(opc);
> +
> +	return 0;
> +}
> +
>  void __init page_cgroup_init(void)
>  {
>  	unsigned long pfn;
>  	int fail = 0;
>  
> -	if (mem_cgroup_disabled())
> +	if (mem_cgroup_disabled() && iothrottle_disabled())
>  		return;
>  
>  	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
> @@ -257,14 +331,15 @@ void __init page_cgroup_init(void)
>  		fail = init_section_page_cgroup(pfn);
>  	}
>  	if (fail) {
> -		printk(KERN_CRIT "try cgroup_disable=memory boot option\n");
> +		printk(KERN_CRIT
> +			"try cgroup_disable=memory,blockio boot option\n");
>  		panic("Out of memory");
>  	} else {
>  		hotplug_memory_notifier(page_cgroup_callback, 0);
>  	}
>  	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
> -	printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
> -	" want\n");
> +	printk(KERN_INFO
> +		"try cgroup_disable=memory,blockio option if you don't want\n");
>  }
>  
>  void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)

-- 
Regards
Gui Jianfeng


