[PATCH v3 2/3] fs: introduce uid/gid shifting bind mount

Christoph Hellwig hch at infradead.org
Tue Feb 18 22:33:13 UTC 2020


On Mon, Feb 17, 2020 at 12:53:06PM -0800, James Bottomley wrote:
> 
> v3: added a bind mount base shift at the request of Serge Hallyn
> ---
>  fs/attr.c             | 127 +++++++++++++++++++++++++++++++++++++++++---------
>  fs/exec.c             |   3 +-
>  fs/inode.c            |  10 ++--
>  fs/internal.h         |   2 +
>  fs/mount.h            |   1 +
>  fs/namei.c            | 112 +++++++++++++++++++++++++++++++++++++-------
>  fs/namespace.c        |   5 ++
>  fs/open.c             |  25 +++++++++-
>  fs/posix_acl.c        |   4 +-
>  fs/stat.c             |  32 +++++++++++--
>  include/linux/cred.h  |  12 +++++
>  include/linux/mount.h |   4 +-
>  include/linux/sched.h |   5 ++
>  kernel/capability.c   |   9 +++-
>  kernel/cred.c         |  20 ++++++++
>  15 files changed, 317 insertions(+), 54 deletions(-)
> 
> diff --git a/fs/attr.c b/fs/attr.c
> index 11201ab7e3b1..d7c5883a4b4c 100644
> --- a/fs/attr.c
> +++ b/fs/attr.c
> @@ -18,14 +18,26 @@
>  #include <linux/evm.h>
>  #include <linux/ima.h>
>  
> +#include "internal.h"
> +#include "mount.h"
> +
>  static bool chown_ok(const struct inode *inode, kuid_t uid)
>  {
> +	kuid_t i_uid = inode->i_uid;
> +
> +	if (cred_is_shifted()) {
> +		struct mount *m = real_mount(current->mnt);
> +
> +		i_uid = KUIDT_INIT(from_kuid(m->mnt_userns, i_uid));
> +		i_uid = make_kuid(current_user_ns(), __kuid_val(i_uid));
> +	}
> +
>  	if (uid_eq(current_fsuid(), inode->i_uid) &&
> -	    uid_eq(uid, inode->i_uid))
> +	    uid_eq(uid, i_uid))
>  		return true;
>  	if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
>  		return true;
> -	if (uid_eq(inode->i_uid, INVALID_UID) &&
> +	if (uid_eq(i_uid, INVALID_UID) &&
>  	    ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
>  		return true;
>  	return false;
> @@ -33,17 +45,40 @@ static bool chown_ok(const struct inode *inode, kuid_t uid)
>  
>  static bool chgrp_ok(const struct inode *inode, kgid_t gid)
>  {
> +	kgid_t i_gid = inode->i_gid;
> +	kuid_t i_uid = inode->i_uid;
> +
> +	if (cred_is_shifted()) {
> +		struct mount *m = real_mount(current->mnt);
> +		struct user_namespace *ns = current_user_ns();
> +
> +		i_uid = KUIDT_INIT(from_kuid(m->mnt_userns, i_uid));
> +		i_uid = make_kuid(ns, __kuid_val(i_uid));
> +		i_gid = KGIDT_INIT(from_kgid(m->mnt_userns, i_gid));
> +		i_gid = make_kgid(ns, __kgid_val(i_gid));
> +	}
>  	if (uid_eq(current_fsuid(), inode->i_uid) &&
> -	    (in_group_p(gid) || gid_eq(gid, inode->i_gid)))
> +	    (in_group_p(gid) || gid_eq(gid, i_gid)))
>  		return true;
>  	if (capable_wrt_inode_uidgid(inode, CAP_CHOWN))
>  		return true;
> -	if (gid_eq(inode->i_gid, INVALID_GID) &&
> +	if (gid_eq(i_gid, INVALID_GID) &&
>  	    ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
>  		return true;
>  	return false;
>  }
>  
> +int in_group_p_shifted(kgid_t grp)
> +{
> +	if (cred_is_shifted()) {
> +		struct mount *m = real_mount(current->mnt);
> +
> +		grp = KGIDT_INIT(from_kgid(m->mnt_userns, grp));
> +		grp = make_kgid(current_user_ns(), __kgid_val(grp));
> +	}
> +	return in_group_p(grp);
> +}
> +
>  /**
>   * setattr_prepare - check if attribute changes to a dentry are allowed
>   * @dentry:	dentry to check
> @@ -89,9 +124,10 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr)
>  	if (ia_valid & ATTR_MODE) {
>  		if (!inode_owner_or_capable(inode))
>  			return -EPERM;
> +
>  		/* Also check the setgid bit! */
> -		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
> -				inode->i_gid) &&
> +		if (!in_group_p_shifted((ia_valid & ATTR_GID) ? attr->ia_gid :
> +					inode->i_gid) &&
>  		    !capable_wrt_inode_uidgid(inode, CAP_FSETID))
>  			attr->ia_mode &= ~S_ISGID;
>  	}
> @@ -192,7 +228,7 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
>  	if (ia_valid & ATTR_MODE) {
>  		umode_t mode = attr->ia_mode;
>  
> -		if (!in_group_p(inode->i_gid) &&
> +		if (!in_group_p_shifted(inode->i_gid) &&
>  		    !capable_wrt_inode_uidgid(inode, CAP_FSETID))
>  			mode &= ~S_ISGID;
>  		inode->i_mode = mode;
> @@ -200,6 +236,23 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
>  }
>  EXPORT_SYMBOL(setattr_copy);
>  
> +void cred_shift(kuid_t *uid, kgid_t *gid)
> +{
> +	if (cred_is_shifted()) {
> +		struct user_namespace *ns = current_user_ns();
> +		struct mount *m = real_mount(current->mnt);
> +
> +		if (uid) {
> +			*uid = KUIDT_INIT(from_kuid(m->mnt_userns, *uid));
> +			*uid = make_kuid(ns, __kuid_val(*uid));
> +		}
> +		if (gid) {
> +			*gid = KGIDT_INIT(from_kgid(m->mnt_userns, *gid));
> +			*gid = make_kgid(ns, __kgid_val(*gid));
> +		}
> +	}
> +}
> +
>  /**
>   * notify_change - modify attributes of a filesytem object
>   * @dentry:	object affected
> @@ -229,6 +282,9 @@ int notify_change(const struct path *path, struct iattr * attr,
>  	int error;
>  	struct timespec64 now;
>  	unsigned int ia_valid = attr->ia_valid;
> +	const struct cred *cred;
> +	kuid_t i_uid = inode->i_uid;
> +	kgid_t i_gid = inode->i_gid;
>  
>  	WARN_ON_ONCE(!inode_is_locked(inode));
>  
> @@ -237,18 +293,30 @@ int notify_change(const struct path *path, struct iattr * attr,
>  			return -EPERM;
>  	}
>  
> +	cred = change_userns_creds(path);
> +	if (cred) {
> +		struct mount *m = real_mount(path->mnt);
> +
> +		attr->ia_uid = KUIDT_INIT(from_kuid(m->mnt_ns->user_ns, attr->ia_uid));
> +		attr->ia_uid = make_kuid(m->mnt_userns, __kuid_val(attr->ia_uid));
> +		attr->ia_gid = KGIDT_INIT(from_kgid(m->mnt_ns->user_ns, attr->ia_gid));
> +		attr->ia_gid = make_kgid(m->mnt_userns, __kgid_val(attr->ia_gid));
> +	}
> +
>  	/*
>  	 * If utimes(2) and friends are called with times == NULL (or both
>  	 * times are UTIME_NOW), then we need to check for write permission
>  	 */
>  	if (ia_valid & ATTR_TOUCH) {
> -		if (IS_IMMUTABLE(inode))
> -			return -EPERM;
> +		if (IS_IMMUTABLE(inode)) {
> +			error = -EPERM;
> +			goto err;
> +		}
>  
>  		if (!inode_owner_or_capable(inode)) {
>  			error = inode_permission(inode, MAY_WRITE);
>  			if (error)
> -				return error;
> +				goto err;
>  		}
>  	}
>  
> @@ -274,7 +342,7 @@ int notify_change(const struct path *path, struct iattr * attr,
>  	if (ia_valid & ATTR_KILL_PRIV) {
>  		error = security_inode_need_killpriv(dentry);
>  		if (error < 0)
> -			return error;
> +			goto err;
>  		if (error == 0)
>  			ia_valid = attr->ia_valid &= ~ATTR_KILL_PRIV;
>  	}
> @@ -305,34 +373,49 @@ int notify_change(const struct path *path, struct iattr * attr,
>  			attr->ia_mode &= ~S_ISGID;
>  		}
>  	}
> -	if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
> -		return 0;
> +	if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID))) {
> +		error = 0;
> +		goto err;
> +	}
>  
>  	/*
>  	 * Verify that uid/gid changes are valid in the target
>  	 * namespace of the superblock.
>  	 */
> +	error = -EOVERFLOW;
>  	if (ia_valid & ATTR_UID &&
>  	    !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid))
> -		return -EOVERFLOW;
> +		goto err;
> +
>  	if (ia_valid & ATTR_GID &&
>  	    !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
> -		return -EOVERFLOW;
> +		goto err;
>  
>  	/* Don't allow modifications of files with invalid uids or
>  	 * gids unless those uids & gids are being made valid.
>  	 */
> -	if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid))
> -		return -EOVERFLOW;
> -	if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid))
> -		return -EOVERFLOW;
> +	if (cred_is_shifted()) {
> +		struct user_namespace *ns = current_user_ns();
> +		struct mount *m = real_mount(current->mnt);
> +
> +		i_uid = KUIDT_INIT(from_kuid(m->mnt_userns, i_uid));
> +		i_uid = make_kuid(ns, __kuid_val(i_uid));
> +		i_gid = KGIDT_INIT(from_kgid(m->mnt_userns, i_gid));
> +		i_gid = make_kgid(ns, __kgid_val(i_gid));
> +	}
> +
> +	if (!(ia_valid & ATTR_UID) && !uid_valid(i_uid))
> +		goto err;
> +
> +	if (!(ia_valid & ATTR_GID) && !gid_valid(i_gid))
> +		goto err;
>  
>  	error = security_inode_setattr(dentry, attr);
>  	if (error)
> -		return error;
> +		goto err;
>  	error = try_break_deleg(inode, delegated_inode);
>  	if (error)
> -		return error;
> +		goto err;
>  
>  	if (inode->i_op->setattr)
>  		error = inode->i_op->setattr(dentry, attr);
> @@ -345,6 +428,8 @@ int notify_change(const struct path *path, struct iattr * attr,
>  		evm_inode_post_setattr(dentry, ia_valid);
>  	}
>  
> + err:
> +	revert_userns_creds(cred);
>  	return error;
>  }
>  EXPORT_SYMBOL(notify_change);
> diff --git a/fs/exec.c b/fs/exec.c
> index db17be51b112..926bab39ed45 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1543,13 +1543,14 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
>  
>  	/* Be careful if suid/sgid is set */
>  	inode_lock(inode);
> -
>  	/* reload atomically mode/uid/gid now that lock held */
>  	mode = inode->i_mode;
>  	uid = inode->i_uid;
>  	gid = inode->i_gid;
>  	inode_unlock(inode);
>  
> +	cred_shift(&uid, &gid);
> +
>  	/* We ignore suid/sgid if there are no mappings for them in the ns */
>  	if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
>  		 !kgid_has_mapping(bprm->cred->user_ns, gid))
> diff --git a/fs/inode.c b/fs/inode.c
> index be14d3fcbee1..ae75b6396786 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -2064,7 +2064,7 @@ void inode_init_owner(struct inode *inode, const struct inode *dir,
>  		if (S_ISDIR(mode))
>  			mode |= S_ISGID;
>  		else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
> -			 !in_group_p(inode->i_gid) &&
> +			 !in_group_p_shifted(inode->i_gid) &&
>  			 !capable_wrt_inode_uidgid(dir, CAP_FSETID))
>  			mode &= ~S_ISGID;
>  	} else
> @@ -2083,12 +2083,16 @@ EXPORT_SYMBOL(inode_init_owner);
>  bool inode_owner_or_capable(const struct inode *inode)
>  {
>  	struct user_namespace *ns;
> +	kuid_t uid = inode->i_uid;
>  
> -	if (uid_eq(current_fsuid(), inode->i_uid))
> +	if (uid_eq(current_fsuid(), uid))
>  		return true;
>  
>  	ns = current_user_ns();
> -	if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER))
> +
> +	cred_shift(&uid, NULL);
> +
> +	if (kuid_has_mapping(ns, uid) && ns_capable(ns, CAP_FOWNER))
>  		return true;
>  	return false;
>  }
> diff --git a/fs/internal.h b/fs/internal.h
> index 80d89ddb9b28..d2adcdb3eb2e 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -73,6 +73,8 @@ long do_symlinkat(const char __user *oldname, int newdfd,
>  		  const char __user *newname);
>  int do_linkat(int olddfd, const char __user *oldname, int newdfd,
>  	      const char __user *newname, int flags);
> +const struct cred *change_userns_creds(const struct path *p);
> +void revert_userns_creds(const struct cred *cred);
>  
>  /*
>   * namespace.c
> diff --git a/fs/mount.h b/fs/mount.h
> index 711a4093e475..c3bfc6ced4c7 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -72,6 +72,7 @@ struct mount {
>  	int mnt_expiry_mark;		/* true if marked for expiry */
>  	struct hlist_head mnt_pins;
>  	struct hlist_head mnt_stuck_children;
> +	struct user_namespace *mnt_userns; /* mapping for underlying mount uid/gid */
>  } __randomize_layout;
>  
>  #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
> diff --git a/fs/namei.c b/fs/namei.c
> index 531ac55c7e67..369bd18c7330 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -124,6 +124,42 @@
>  
>  #define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
>  
> +const struct cred *change_userns_creds(const struct path *p)
> +{
> +	struct mount *m = real_mount(p->mnt);
> +
> +	if ((p->mnt->mnt_flags & MNT_SHIFT) == 0)
> +		return NULL;
> +
> +	if (current->nsproxy->mnt_ns->user_ns != m->mnt_ns->user_ns)
> +		return NULL;
> +
> +	if (current->mnt != p->mnt) {
> +		struct cred *cred;
> +		struct user_namespace *user_ns = m->mnt_ns->user_ns;
> +		kuid_t fsuid = current->cred->fsuid;
> +		kgid_t fsgid = current->cred->fsgid;
> +
> +		if (current->mnt_cred)
> +			put_cred(current->mnt_cred);
> +		cred = prepare_creds();
> +		fsuid = KUIDT_INIT(from_kuid(user_ns, fsuid));
> +		fsgid = KGIDT_INIT(from_kgid(user_ns, fsgid));
> +		cred->fsuid = make_kuid(m->mnt_userns, __kuid_val(fsuid));
> +		cred->fsgid = make_kgid(m->mnt_userns, __kgid_val(fsgid));
> +		current->mnt = p->mnt; /* no reference needed */
> +		current->mnt_cred = cred;
> +	}
> +	return override_creds(current->mnt_cred);
> +}
> +
> +void revert_userns_creds(const struct cred *cred)
> +{
> +	if (!cred)
> +		return;
> +	revert_creds(cred);
> +}
> +
>  struct filename *
>  getname_flags(const char __user *filename, int flags, int *empty)
>  {
> @@ -303,7 +339,7 @@ static int acl_permission_check(struct inode *inode, int mask)
>  				return error;
>  		}
>  
> -		if (in_group_p(inode->i_gid))
> +		if (in_group_p_shifted(inode->i_gid))
>  			mode >>= 3;
>  	}
>  
> @@ -366,7 +402,6 @@ int generic_permission(struct inode *inode, int mask)
>  	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
>  		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
>  			return 0;
> -
>  	return -EACCES;
>  }
>  EXPORT_SYMBOL(generic_permission);
> @@ -1897,6 +1932,7 @@ static int walk_component(struct nameidata *nd, int flags)
>  	struct inode *inode;
>  	unsigned seq;
>  	int err;
> +	const struct cred *cred;
>  	/*
>  	 * "." and ".." are special - ".." especially so because it has
>  	 * to be able to know about the current root directory and
> @@ -1908,25 +1944,31 @@ static int walk_component(struct nameidata *nd, int flags)
>  			put_link(nd);
>  		return err;
>  	}
> +	cred = change_userns_creds(&nd->path);
>  	err = lookup_fast(nd, &path, &inode, &seq);
>  	if (unlikely(err <= 0)) {
>  		if (err < 0)
> -			return err;
> +			goto out;
>  		path.dentry = lookup_slow(&nd->last, nd->path.dentry,
>  					  nd->flags);
> -		if (IS_ERR(path.dentry))
> -			return PTR_ERR(path.dentry);
> +		if (IS_ERR(path.dentry)) {
> +			err = PTR_ERR(path.dentry);
> +			goto out;
> +		}
>  
>  		path.mnt = nd->path.mnt;
>  		err = follow_managed(&path, nd);
>  		if (unlikely(err < 0))
> -			return err;
> +			goto out;
>  
>  		seq = 0;	/* we are already out of RCU mode */
>  		inode = d_backing_inode(path.dentry);
>  	}
>  
> -	return step_into(nd, &path, flags, inode, seq);
> +	err = step_into(nd, &path, flags, inode, seq);
> + out:
> +	revert_userns_creds(cred);
> +	return err;
>  }
>  
>  /*
> @@ -2180,8 +2222,10 @@ static int link_path_walk(const char *name, struct nameidata *nd)
>  	for(;;) {
>  		u64 hash_len;
>  		int type;
> +		const struct cred *cred = change_userns_creds(&nd->path);
>  
>  		err = may_lookup(nd);
> +		revert_userns_creds(cred);
>  		if (err)
>  			return err;
>  
> @@ -2373,12 +2417,17 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
>  static const char *trailing_symlink(struct nameidata *nd)
>  {
>  	const char *s;
> +	const struct cred *cred = change_userns_creds(&nd->path);
>  	int error = may_follow_link(nd);
> -	if (unlikely(error))
> -		return ERR_PTR(error);
> +	if (unlikely(error)) {
> +		s = ERR_PTR(error);
> +		goto out;
> +	}
>  	nd->flags |= LOOKUP_PARENT;
>  	nd->stack[0].name = NULL;
>  	s = get_link(nd);
> + out:
> +	revert_userns_creds(cred);
>  	return s ? s : "";
>  }
>  
> @@ -3343,6 +3392,7 @@ static int do_last(struct nameidata *nd,
>  	struct inode *inode;
>  	struct path path;
>  	int error;
> +	const struct cred *cred = change_userns_creds(&nd->path);
>  
>  	nd->flags &= ~LOOKUP_PARENT;
>  	nd->flags |= op->intent;
> @@ -3350,7 +3400,7 @@ static int do_last(struct nameidata *nd,
>  	if (nd->last_type != LAST_NORM) {
>  		error = handle_dots(nd, nd->last_type);
>  		if (unlikely(error))
> -			return error;
> +			goto err;
>  		goto finish_open;
>  	}
>  
> @@ -3363,7 +3413,7 @@ static int do_last(struct nameidata *nd,
>  			goto finish_lookup;
>  
>  		if (error < 0)
> -			return error;
> +			goto err;
>  
>  		BUG_ON(nd->inode != dir->d_inode);
>  		BUG_ON(nd->flags & LOOKUP_RCU);
> @@ -3376,12 +3426,14 @@ static int do_last(struct nameidata *nd,
>  		 */
>  		error = complete_walk(nd);
>  		if (error)
> -			return error;
> +			goto err;
>  
>  		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
>  		/* trailing slashes? */
> -		if (unlikely(nd->last.name[nd->last.len]))
> -			return -EISDIR;
> +		if (unlikely(nd->last.name[nd->last.len])) {
> +			error = -EISDIR;
> +			goto err;
> +		}
>  	}
>  
>  	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
> @@ -3437,7 +3489,7 @@ static int do_last(struct nameidata *nd,
>  
>  	error = follow_managed(&path, nd);
>  	if (unlikely(error < 0))
> -		return error;
> +		goto err;
>  
>  	/*
>  	 * create/update audit record if it already exists.
> @@ -3446,7 +3498,8 @@ static int do_last(struct nameidata *nd,
>  
>  	if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) {
>  		path_to_nameidata(&path, nd);
> -		return -EEXIST;
> +		error = -EEXIST;
> +		goto err;
>  	}
>  
>  	seq = 0;	/* out of RCU mode, so the value doesn't matter */
> @@ -3454,12 +3507,12 @@ static int do_last(struct nameidata *nd,
>  finish_lookup:
>  	error = step_into(nd, &path, 0, inode, seq);
>  	if (unlikely(error))
> -		return error;
> +		goto err;
>  finish_open:
>  	/* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
>  	error = complete_walk(nd);
>  	if (error)
> -		return error;
> +		goto err;
>  	audit_inode(nd->name, nd->path.dentry, 0);
>  	if (open_flag & O_CREAT) {
>  		error = -EISDIR;
> @@ -3501,6 +3554,8 @@ static int do_last(struct nameidata *nd,
>  	}
>  	if (got_write)
>  		mnt_drop_write(nd->path.mnt);
> + err:
> +	revert_userns_creds(cred);
>  	return error;
>  }
>  
> @@ -3819,6 +3874,7 @@ long do_mknodat(int dfd, const char __user *filename, umode_t mode,
>  	struct path path;
>  	int error;
>  	unsigned int lookup_flags = 0;
> +	const struct cred *cred;
>  
>  	error = may_mknod(mode);
>  	if (error)
> @@ -3828,6 +3884,7 @@ long do_mknodat(int dfd, const char __user *filename, umode_t mode,
>  	if (IS_ERR(dentry))
>  		return PTR_ERR(dentry);
>  
> +	cred = change_userns_creds(&path);
>  	if (!IS_POSIXACL(path.dentry->d_inode))
>  		mode &= ~current_umask();
>  	error = security_path_mknod(&path, dentry, mode, dev);
> @@ -3849,6 +3906,7 @@ long do_mknodat(int dfd, const char __user *filename, umode_t mode,
>  	}
>  out:
>  	done_path_create(&path, dentry);
> +	revert_userns_creds(cred);
>  	if (retry_estale(error, lookup_flags)) {
>  		lookup_flags |= LOOKUP_REVAL;
>  		goto retry;
> @@ -3899,18 +3957,21 @@ long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
>  	struct path path;
>  	int error;
>  	unsigned int lookup_flags = LOOKUP_DIRECTORY;
> +	const struct cred *cred;
>  
>  retry:
>  	dentry = user_path_create(dfd, pathname, &path, lookup_flags);
>  	if (IS_ERR(dentry))
>  		return PTR_ERR(dentry);
>  
> +	cred = change_userns_creds(&path);
>  	if (!IS_POSIXACL(path.dentry->d_inode))
>  		mode &= ~current_umask();
>  	error = security_path_mkdir(&path, dentry, mode);
>  	if (!error)
>  		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
>  	done_path_create(&path, dentry);
> +	revert_userns_creds(cred);
>  	if (retry_estale(error, lookup_flags)) {
>  		lookup_flags |= LOOKUP_REVAL;
>  		goto retry;
> @@ -3977,12 +4038,14 @@ long do_rmdir(int dfd, const char __user *pathname)
>  	struct qstr last;
>  	int type;
>  	unsigned int lookup_flags = 0;
> +	const struct cred *cred;
>  retry:
>  	name = filename_parentat(dfd, getname(pathname), lookup_flags,
>  				&path, &last, &type);
>  	if (IS_ERR(name))
>  		return PTR_ERR(name);
>  
> +	cred = change_userns_creds(&path);
>  	switch (type) {
>  	case LAST_DOTDOT:
>  		error = -ENOTEMPTY;
> @@ -4018,6 +4081,7 @@ long do_rmdir(int dfd, const char __user *pathname)
>  	inode_unlock(path.dentry->d_inode);
>  	mnt_drop_write(path.mnt);
>  exit1:
> +	revert_userns_creds(cred);
>  	path_put(&path);
>  	putname(name);
>  	if (retry_estale(error, lookup_flags)) {
> @@ -4107,11 +4171,13 @@ long do_unlinkat(int dfd, struct filename *name)
>  	struct inode *inode = NULL;
>  	struct inode *delegated_inode = NULL;
>  	unsigned int lookup_flags = 0;
> +	const struct cred *cred;
>  retry:
>  	name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
>  	if (IS_ERR(name))
>  		return PTR_ERR(name);
>  
> +	cred = change_userns_creds(&path);
>  	error = -EISDIR;
>  	if (type != LAST_NORM)
>  		goto exit1;
> @@ -4149,6 +4215,7 @@ long do_unlinkat(int dfd, struct filename *name)
>  	}
>  	mnt_drop_write(path.mnt);
>  exit1:
> +	revert_userns_creds(cred);
>  	path_put(&path);
>  	if (retry_estale(error, lookup_flags)) {
>  		lookup_flags |= LOOKUP_REVAL;
> @@ -4213,6 +4280,7 @@ long do_symlinkat(const char __user *oldname, int newdfd,
>  	struct dentry *dentry;
>  	struct path path;
>  	unsigned int lookup_flags = 0;
> +	const struct cred *cred;
>  
>  	from = getname(oldname);
>  	if (IS_ERR(from))
> @@ -4223,6 +4291,7 @@ long do_symlinkat(const char __user *oldname, int newdfd,
>  	if (IS_ERR(dentry))
>  		goto out_putname;
>  
> +	cred = change_userns_creds(&path);
>  	error = security_path_symlink(&path, dentry, from->name);
>  	if (!error)
>  		error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
> @@ -4231,6 +4300,7 @@ long do_symlinkat(const char __user *oldname, int newdfd,
>  		lookup_flags |= LOOKUP_REVAL;
>  		goto retry;
>  	}
> +	revert_userns_creds(cred);
>  out_putname:
>  	putname(from);
>  	return error;
> @@ -4344,6 +4414,7 @@ int do_linkat(int olddfd, const char __user *oldname, int newdfd,
>  	struct inode *delegated_inode = NULL;
>  	int how = 0;
>  	int error;
> +	const struct cred *cred;
>  
>  	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
>  		return -EINVAL;
> @@ -4371,6 +4442,7 @@ int do_linkat(int olddfd, const char __user *oldname, int newdfd,
>  	if (IS_ERR(new_dentry))
>  		goto out;
>  
> +	cred = change_userns_creds(&new_path);
>  	error = -EXDEV;
>  	if (old_path.mnt != new_path.mnt)
>  		goto out_dput;
> @@ -4382,6 +4454,7 @@ int do_linkat(int olddfd, const char __user *oldname, int newdfd,
>  		goto out_dput;
>  	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
>  out_dput:
> +	revert_userns_creds(cred);
>  	done_path_create(&new_path, new_dentry);
>  	if (delegated_inode) {
>  		error = break_deleg_wait(&delegated_inode);
> @@ -4601,6 +4674,7 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
>  	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
>  	bool should_retry = false;
>  	int error;
> +	const struct cred *cred;
>  
>  	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
>  		return -EINVAL;
> @@ -4630,6 +4704,7 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
>  		goto exit1;
>  	}
>  
> +	cred = change_userns_creds(&new_path);
>  	error = -EXDEV;
>  	if (old_path.mnt != new_path.mnt)
>  		goto exit2;
> @@ -4714,6 +4789,7 @@ static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
>  	}
>  	mnt_drop_write(old_path.mnt);
>  exit2:
> +	revert_userns_creds(cred);
>  	if (retry_estale(error, lookup_flags))
>  		should_retry = true;
>  	path_put(&new_path);
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 69fb23ae3d8f..4720647588ab 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -200,6 +200,8 @@ static struct mount *alloc_vfsmnt(const char *name)
>  		mnt->mnt_writers = 0;
>  #endif
>  
> +		mnt->mnt_userns = get_user_ns(&init_user_ns);
> +
>  		INIT_HLIST_NODE(&mnt->mnt_hash);
>  		INIT_LIST_HEAD(&mnt->mnt_child);
>  		INIT_LIST_HEAD(&mnt->mnt_mounts);
> @@ -1044,6 +1046,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
>  	mnt->mnt.mnt_root = dget(root);
>  	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
>  	mnt->mnt_parent = mnt;
> +	put_user_ns(mnt->mnt_userns);
> +	mnt->mnt_userns = get_user_ns(old->mnt_userns);
>  	lock_mount_hash();
>  	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
>  	unlock_mount_hash();
> @@ -1102,6 +1106,7 @@ static void cleanup_mnt(struct mount *mnt)
>  	dput(mnt->mnt.mnt_root);
>  	deactivate_super(mnt->mnt.mnt_sb);
>  	mnt_free_id(mnt);
> +	put_user_ns(mnt->mnt_userns);
>  	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
>  }
>  
> diff --git a/fs/open.c b/fs/open.c
> index db6758b9636a..d27b90dce64d 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -456,11 +456,13 @@ int ksys_chdir(const char __user *filename)
>  	struct path path;
>  	int error;
>  	unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
> +	const struct cred *cred;
>  retry:
>  	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
>  	if (error)
>  		goto out;
>  
> +	cred = change_userns_creds(&path);
>  	error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
>  	if (error)
>  		goto dput_and_out;
> @@ -468,6 +470,7 @@ int ksys_chdir(const char __user *filename)
>  	set_fs_pwd(current->fs, &path);
>  
>  dput_and_out:
> +	revert_userns_creds(cred);
>  	path_put(&path);
>  	if (retry_estale(error, lookup_flags)) {
>  		lookup_flags |= LOOKUP_REVAL;
> @@ -486,11 +489,13 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
>  {
>  	struct fd f = fdget_raw(fd);
>  	int error;
> +	const struct cred *cred;
>  
>  	error = -EBADF;
>  	if (!f.file)
>  		goto out;
>  
> +	cred = change_userns_creds(&f.file->f_path);
>  	error = -ENOTDIR;
>  	if (!d_can_lookup(f.file->f_path.dentry))
>  		goto out_putf;
> @@ -499,6 +504,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
>  	if (!error)
>  		set_fs_pwd(current->fs, &f.file->f_path);
>  out_putf:
> +	revert_userns_creds(cred);
>  	fdput(f);
>  out:
>  	return error;
> @@ -547,11 +553,13 @@ static int chmod_common(const struct path *path, umode_t mode)
>  	struct inode *inode = path->dentry->d_inode;
>  	struct inode *delegated_inode = NULL;
>  	struct iattr newattrs;
> +	const struct cred *cred;
>  	int error;
>  
> +	cred = change_userns_creds(path);
>  	error = mnt_want_write(path->mnt);
>  	if (error)
> -		return error;
> +		goto out;
>  retry_deleg:
>  	inode_lock(inode);
>  	error = security_path_chmod(path, mode);
> @@ -568,6 +576,8 @@ static int chmod_common(const struct path *path, umode_t mode)
>  			goto retry_deleg;
>  	}
>  	mnt_drop_write(path->mnt);
> + out:
> +	revert_userns_creds(cred);
>  	return error;
>  }
>  
> @@ -666,6 +676,7 @@ int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
>  	struct path path;
>  	int error = -EINVAL;
>  	int lookup_flags;
> +	const struct cred *cred;
>  
>  	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
>  		goto out;
> @@ -677,12 +688,14 @@ int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
>  	error = user_path_at(dfd, filename, lookup_flags, &path);
>  	if (error)
>  		goto out;
> +	cred = change_userns_creds(&path);
>  	error = mnt_want_write(path.mnt);
>  	if (error)
>  		goto out_release;
>  	error = chown_common(&path, user, group);
>  	mnt_drop_write(path.mnt);
>  out_release:
> +	revert_userns_creds(cred);
>  	path_put(&path);
>  	if (retry_estale(error, lookup_flags)) {
>  		lookup_flags |= LOOKUP_REVAL;
> @@ -713,10 +726,12 @@ int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
>  {
>  	struct fd f = fdget(fd);
>  	int error = -EBADF;
> +	const struct cred *cred;
>  
>  	if (!f.file)
>  		goto out;
>  
> +	cred = change_userns_creds(&f.file->f_path);
>  	error = mnt_want_write_file(f.file);
>  	if (error)
>  		goto out_fput;
> @@ -724,6 +739,7 @@ int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
>  	error = chown_common(&f.file->f_path, user, group);
>  	mnt_drop_write_file(f.file);
>  out_fput:
> +	revert_userns_creds(cred);
>  	fdput(f);
>  out:
>  	return error;
> @@ -911,8 +927,13 @@ EXPORT_SYMBOL(file_path);
>   */
>  int vfs_open(const struct path *path, struct file *file)
>  {
> +	int ret;
> +	const struct cred *cred = change_userns_creds(path);
> +
>  	file->f_path = *path;
> -	return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
> +	ret = do_dentry_open(file, d_backing_inode(path->dentry), NULL);
> +	revert_userns_creds(cred);
> +	return ret;
>  }
>  
>  struct file *dentry_open(const struct path *path, int flags,
> diff --git a/fs/posix_acl.c b/fs/posix_acl.c
> index 249672bf54fe..ff777110f3da 100644
> --- a/fs/posix_acl.c
> +++ b/fs/posix_acl.c
> @@ -364,7 +364,7 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want)
>                                          goto mask;
>  				break;
>                          case ACL_GROUP_OBJ:
> -                                if (in_group_p(inode->i_gid)) {
> +				if (in_group_p_shifted(inode->i_gid)) {
>  					found = 1;
>  					if ((pa->e_perm & want) == want)
>  						goto mask;
> @@ -655,7 +655,7 @@ int posix_acl_update_mode(struct inode *inode, umode_t *mode_p,
>  		return error;
>  	if (error == 0)
>  		*acl = NULL;
> -	if (!in_group_p(inode->i_gid) &&
> +	if (!in_group_p_shifted(inode->i_gid) &&
>  	    !capable_wrt_inode_uidgid(inode, CAP_FSETID))
>  		mode &= ~S_ISGID;
>  	*mode_p = mode;
> diff --git a/fs/stat.c b/fs/stat.c
> index 030008796479..634b8d13ed51 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -22,6 +22,7 @@
>  #include <asm/unistd.h>
>  
>  #include "internal.h"
> +#include "mount.h"
>  
>  /**
>   * generic_fillattr - Fill in the basic attributes from the inode struct
> @@ -50,6 +51,23 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
>  }
>  EXPORT_SYMBOL(generic_fillattr);
>  
> +static void shift_check(struct vfsmount *mnt, struct kstat *stat)
> +{
> +	struct mount *m = real_mount(mnt);
> +	struct user_namespace *user_ns = m->mnt_ns->user_ns;
> +
> +	if ((mnt->mnt_flags & MNT_SHIFT) == 0)
> +		return;
> +
> +	if (current->nsproxy->mnt_ns->user_ns != m->mnt_ns->user_ns)
> +		return;
> +
> +	stat->uid = KUIDT_INIT(from_kuid(m->mnt_userns, stat->uid));
> +	stat->uid = make_kuid(user_ns, __kuid_val(stat->uid));
> +	stat->gid = KGIDT_INIT(from_kgid(m->mnt_userns, stat->gid));
> +	stat->gid = make_kgid(user_ns, __kgid_val(stat->gid));
> +}
> +
>  /**
>   * vfs_getattr_nosec - getattr without security checks
>   * @path: file to get attributes from
> @@ -67,6 +85,7 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
>  		      u32 request_mask, unsigned int query_flags)
>  {
>  	struct inode *inode = d_backing_inode(path->dentry);
> +	int ret;
>  
>  	memset(stat, 0, sizeof(*stat));
>  	stat->result_mask |= STATX_BASIC_STATS;
> @@ -79,12 +98,17 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
>  	if (IS_AUTOMOUNT(inode))
>  		stat->attributes |= STATX_ATTR_AUTOMOUNT;
>  
> +	ret = 0;
>  	if (inode->i_op->getattr)
> -		return inode->i_op->getattr(path, stat, request_mask,
> -					    query_flags);
> +		ret = inode->i_op->getattr(path, stat, request_mask,
> +					   query_flags);
> +	else
> +		generic_fillattr(inode, stat);
>  
> -	generic_fillattr(inode, stat);
> -	return 0;
> +	if (!ret)
> +		shift_check(path->mnt, stat);
> +
> +	return ret;
>  }
>  EXPORT_SYMBOL(vfs_getattr_nosec);
>  
> diff --git a/include/linux/cred.h b/include/linux/cred.h
> index 18639c069263..d29638617844 100644
> --- a/include/linux/cred.h
> +++ b/include/linux/cred.h
> @@ -59,6 +59,7 @@ extern struct group_info *groups_alloc(int);
>  extern void groups_free(struct group_info *);
>  
>  extern int in_group_p(kgid_t);
> +extern int in_group_p_shifted(kgid_t);

How do I know when to use in_group_p_shifted() vs in_group_p()?
What about the various other callers of in_group_p() in individual filesystems?


More information about the Containers mailing list