[PATCH v4 cgroup/for-3.7] cgroup: mark subsystems with broken hierarchy support and whine if cgroups are nested for them

Thu Sep 13 19:27:26 UTC 2012

On Thu 13-09-12 12:20:58, Tejun Heo wrote:
> Currently, cgroup hierarchy support is a mess.  cpu related subsystems
> behave correctly - configuration, accounting and control on a parent
> properly cover its children.  blkio and freezer completely ignore
> hierarchy and treat all cgroups as if they're directly under the root
> cgroup.  Others show yet different behaviors.
> 
> These differing interpretations of cgroup hierarchy make using cgroup
> confusing and it impossible to co-mount controllers into the same
> hierarchy and obtain sane behavior.
> 
> Eventually, we want full hierarchy support from all subsystems and
> probably a unified hierarchy.  Users using separate hierarchies
> expecting completely different behaviors depending on the mounted
> subsystem is deterimental to making any progress on this front.
> 
> This patch adds cgroup_subsys.broken_hierarchy and sets it to %true
> for controllers which are lacking in hierarchy support.  The goal of
> this patch is two-fold.
> 
> * Move users away from using hierarchy on currently non-hierarchical
>   subsystems, so that implementing proper hierarchy support on those
>   doesn't surprise them.
> 
> * Keep track of which controllers are broken how and nudge the
>   subsystems to implement proper hierarchy support.
> 
> For now, start with a single warning message.  We can whine louder
> later on.
> 
> v2: Fixed a typo spotted by Michal. Warning message updated.
> 
> v3: Updated memcg part so that it doesn't generate warning in the
>     cases where .use_hierarchy=false doesn't make the behavior
>     different from root.use_hierarchy=true.  Fixed a typo spotted by
>     Glauber.
> 
> v4: Check ->broken_hierarchy after cgroup creation is complete so that
>     ->create() can affect the result per Michal.  Dropped unnecessary
>     memcg root handling per Michal.
> 
> Signed-off-by: Tejun Heo <tj at kernel.org>
> Cc: Michal Hocko <mhocko at suse.cz>
> Cc: Li Zefan <lizefan at huawei.com>
> Cc: Glauber Costa <glommer at parallels.com>
> Cc: Peter Zijlstra <peterz at infradead.org>
> Cc: Paul Turner <pjt at google.com>
> Cc: Johannes Weiner <hannes at cmpxchg.org>
> Cc: Thomas Graf <tgraf at suug.ch>
> Cc: Serge E. Hallyn <serue at us.ibm.com>
> Cc: Vivek Goyal <vgoyal at redhat.com>
> Cc: Paul Mackerras <paulus at samba.org>
> Cc: Ingo Molnar <mingo at redhat.com>
> Cc: Arnaldo Carvalho de Melo <acme at ghostprotocols.net>
> Cc: Neil Horman <nhorman at tuxdriver.com>
> Cc: Aneesh Kumar K.V <aneesh.kumar at linux.vnet.ibm.com>

Acked-by: Michal Hocko <mhocko at suse.cz>

Thanks!

> ---
>  block/blk-cgroup.c        |    8 ++++++++
>  include/linux/cgroup.h    |   15 +++++++++++++++
>  kernel/cgroup.c           |   12 +++++++++++-
>  kernel/cgroup_freezer.c   |    8 ++++++++
>  kernel/events/core.c      |    7 +++++++
>  mm/memcontrol.c           |    7 +++++++
>  net/core/netprio_cgroup.c |   12 +++++++++++-
>  net/sched/cls_cgroup.c    |    9 +++++++++
>  security/device_cgroup.c  |    9 +++++++++
>  9 files changed, 85 insertions(+), 2 deletions(-)
> 
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -737,6 +737,14 @@ struct cgroup_subsys blkio_subsys = {
>  	.subsys_id = blkio_subsys_id,
>  	.base_cftypes = blkcg_files,
>  	.module = THIS_MODULE,
> +
> +	/*
> +	 * blkio subsystem is utterly broken in terms of hierarchy support.
> +	 * It treats all cgroups equally regardless of where they're
> +	 * located in the hierarchy - all cgroups are treated as if they're
> +	 * right below the root.  Fix it and remove the following.
> +	 */
> +	.broken_hierarchy = true,
>  };
>  EXPORT_SYMBOL_GPL(blkio_subsys);
>  
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -503,6 +503,21 @@ struct cgroup_subsys {
>  	 */
>  	bool __DEPRECATED_clear_css_refs;
>  
> +	/*
> +	 * If %false, this subsystem is properly hierarchical -
> +	 * configuration, resource accounting and restriction on a parent
> +	 * cgroup cover those of its children.  If %true, hierarchy support
> +	 * is broken in some ways - some subsystems ignore hierarchy
> +	 * completely while others are only implemented half-way.
> +	 *
> +	 * It's now disallowed to create nested cgroups if the subsystem is
> +	 * broken and cgroup core will emit a warning message on such
> +	 * cases.  Eventually, all subsystems will be made properly
> +	 * hierarchical and this will go away.
> +	 */
> +	bool broken_hierarchy;
> +	bool warned_broken_hierarchy;
> +
>  #define MAX_CGROUP_TYPE_NAMELEN 32
>  	const char *name;
>  
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -4075,8 +4075,9 @@ static long cgroup_create(struct cgroup 
>  		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
>  
>  	for_each_subsys(root, ss) {
> -		struct cgroup_subsys_state *css = ss->create(cgrp);
> +		struct cgroup_subsys_state *css;
>  
> +		css = ss->create(cgrp);
>  		if (IS_ERR(css)) {
>  			err = PTR_ERR(css);
>  			goto err_destroy;
> @@ -4090,6 +4091,15 @@ static long cgroup_create(struct cgroup 
>  		/* At error, ->destroy() callback has to free assigned ID. */
>  		if (clone_children(parent) && ss->post_clone)
>  			ss->post_clone(cgrp);
> +
> +		if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
> +		    parent->parent) {
> +			pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
> +				   current->comm, current->pid, ss->name);
> +			if (!strcmp(ss->name, "memory"))
> +				pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
> +			ss->warned_broken_hierarchy = true;
> +		}
>  	}
>  
>  	list_add(&cgrp->sibling, &cgrp->parent->children);
> --- a/kernel/cgroup_freezer.c
> +++ b/kernel/cgroup_freezer.c
> @@ -373,4 +373,12 @@ struct cgroup_subsys freezer_subsys = {
>  	.can_attach	= freezer_can_attach,
>  	.fork		= freezer_fork,
>  	.base_cftypes	= files,
> +
> +	/*
> +	 * freezer subsys doesn't handle hierarchy at all.  Frozen state
> +	 * should be inherited through the hierarchy - if a parent is
> +	 * frozen, all its children should be frozen.  Fix it and remove
> +	 * the following.
> +	 */
> +	.broken_hierarchy = true,
>  };
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -7285,5 +7285,12 @@ struct cgroup_subsys perf_subsys = {
>  	.destroy	= perf_cgroup_destroy,
>  	.exit		= perf_cgroup_exit,
>  	.attach		= perf_cgroup_attach,
> +
> +	/*
> +	 * perf_event cgroup doesn't handle nesting correctly.
> +	 * ctx->nr_cgroups adjustments should be propagated through the
> +	 * cgroup hierarchy.  Fix it and remove the following.
> +	 */
> +	.broken_hierarchy = true,
>  };
>  #endif /* CONFIG_CGROUP_PERF */
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -4973,6 +4973,13 @@ mem_cgroup_create(struct cgroup *cont)
>  	} else {
>  		res_counter_init(&memcg->res, NULL);
>  		res_counter_init(&memcg->memsw, NULL);
> +		/*
> +		 * Deeper hierachy with use_hierarchy == false doesn't make
> +		 * much sense so let cgroup subsystem know about this
> +		 * unfortunate state in our controller.
> +		 */
> +		if (parent && parent != root_mem_cgroup)
> +			mem_cgroup_subsys.broken_hierarchy = true;
>  	}
>  	memcg->last_scanned_node = MAX_NUMNODES;
>  	INIT_LIST_HEAD(&memcg->oom_notify);
> --- a/net/core/netprio_cgroup.c
> +++ b/net/core/netprio_cgroup.c
> @@ -330,7 +330,17 @@ struct cgroup_subsys net_prio_subsys = {
>  	.subsys_id	= net_prio_subsys_id,
>  #endif
>  	.base_cftypes	= ss_files,
> -	.module		= THIS_MODULE
> +	.module		= THIS_MODULE,
> +
> +	/*
> +	 * net_prio has artificial limit on the number of cgroups and
> +	 * disallows nesting making it impossible to co-mount it with other
> +	 * hierarchical subsystems.  Remove the artificially low PRIOIDX_SZ
> +	 * limit and properly nest configuration such that children follow
> +	 * their parents' configurations by default and are allowed to
> +	 * override and remove the following.
> +	 */
> +	.broken_hierarchy = true,
>  };
>  
>  static int netprio_device_event(struct notifier_block *unused,
> --- a/net/sched/cls_cgroup.c
> +++ b/net/sched/cls_cgroup.c
> @@ -82,6 +82,15 @@ struct cgroup_subsys net_cls_subsys = {
>  #endif
>  	.base_cftypes	= ss_files,
>  	.module		= THIS_MODULE,
> +
> +	/*
> +	 * While net_cls cgroup has the rudimentary hierarchy support of
> +	 * inheriting the parent's classid on cgroup creation, it doesn't
> +	 * properly propagates config changes in ancestors to their
> +	 * descendents.  A child should follow the parent's configuration
> +	 * but be allowed to override it.  Fix it and remove the following.
> +	 */
> +	.broken_hierarchy = true,
>  };
>  
>  struct cls_cgroup_head {
> --- a/security/device_cgroup.c
> +++ b/security/device_cgroup.c
> @@ -457,6 +457,15 @@ struct cgroup_subsys devices_subsys = {
>  	.destroy = devcgroup_destroy,
>  	.subsys_id = devices_subsys_id,
>  	.base_cftypes = dev_cgroup_files,
> +
> +	/*
> +	 * While devices cgroup has the rudimentary hierarchy support which
> +	 * checks the parent's restriction, it doesn't properly propagates
> +	 * config changes in ancestors to their descendents.  A child
> +	 * should only be allowed to add more restrictions to the parent's
> +	 * configuration.  Fix it and remove the following.
> +	 */
> +	.broken_hierarchy = true,
>  };
>  
>  int __devcgroup_inode_permission(struct inode *inode, int mask)

-- 
Michal Hocko
SUSE Labs