[PATCH review 1/6] mnt: Track which mounts use a dentry as root.
Nikolay Borisov
kernel at kyup.com
Fri Aug 7 10:46:44 UTC 2015
On 08/04/2015 12:26 AM, Eric W. Biederman wrote:
>
> This is needed infrastructure for better handling of when files
> or directories are moved out from under the root of a bind mount.
>
> Signed-off-by: "Eric W. Biederman" <ebiederm at xmission.com>
> ---
> fs/mount.h | 7 +++
> fs/namespace.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++--
> include/linux/dcache.h | 7 +++
> 3 files changed, 130 insertions(+), 4 deletions(-)
>
> diff --git a/fs/mount.h b/fs/mount.h
> index 14db05d424f7..e8f22970fe59 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -27,6 +27,12 @@ struct mountpoint {
> int m_count;
> };
>
> +struct mountroot {
> + struct hlist_node r_hash;
> + struct dentry *r_dentry;
> + struct hlist_head r_list;
> +};
> +
> struct mount {
> struct hlist_node mnt_hash;
> struct mount *mnt_parent;
> @@ -55,6 +61,7 @@ struct mount {
> struct mnt_namespace *mnt_ns; /* containing namespace */
> struct mountpoint *mnt_mp; /* where is it mounted */
> struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
> + struct hlist_node mnt_mr_list; /* list mounts with the same mountroot */
> #ifdef CONFIG_FSNOTIFY
> struct hlist_head mnt_fsnotify_marks;
> __u32 mnt_fsnotify_mask;
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 2b8aa15fd6df..2ce987af9afa 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -31,6 +31,8 @@ static unsigned int m_hash_mask __read_mostly;
> static unsigned int m_hash_shift __read_mostly;
> static unsigned int mp_hash_mask __read_mostly;
> static unsigned int mp_hash_shift __read_mostly;
> +static unsigned int mr_hash_mask __read_mostly;
> +static unsigned int mr_hash_shift __read_mostly;
>
> static __initdata unsigned long mhash_entries;
> static int __init set_mhash_entries(char *str)
> @@ -52,6 +54,16 @@ static int __init set_mphash_entries(char *str)
> }
> __setup("mphash_entries=", set_mphash_entries);
>
> +static __initdata unsigned long mrhash_entries;
> +static int __init set_mrhash_entries(char *str)
> +{
> + if (!str)
> + return 0;
> + mrhash_entries = simple_strtoul(str, &str, 0);
Nit: Is there a particular reason for using simple_strtoul() here rather than
the kstrto*() family of functions, which report malformed input via an error
return?
> + return 1;
> +}
> +__setup("mrhash_entries=", set_mrhash_entries);
> +
> static u64 event;
> static DEFINE_IDA(mnt_id_ida);
> static DEFINE_IDA(mnt_group_ida);
> @@ -61,6 +73,7 @@ static int mnt_group_start = 1;
>
> static struct hlist_head *mount_hashtable __read_mostly;
> static struct hlist_head *mountpoint_hashtable __read_mostly;
> +static struct hlist_head *mountroot_hashtable __read_mostly;
> static struct kmem_cache *mnt_cache __read_mostly;
> static DECLARE_RWSEM(namespace_sem);
>
> @@ -93,6 +106,13 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry)
> return &mountpoint_hashtable[tmp & mp_hash_mask];
> }
>
> +static inline struct hlist_head *mr_hash(struct dentry *dentry)
> +{
> + unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
> + tmp = tmp + (tmp >> mr_hash_shift);
> + return &mountroot_hashtable[tmp & mr_hash_mask];
> +}
> +
> /*
> * allocation is serialized by namespace_sem, but we need the spinlock to
> * serialize with freeing.
> @@ -234,6 +254,7 @@ static struct mount *alloc_vfsmnt(const char *name)
> INIT_LIST_HEAD(&mnt->mnt_slave_list);
> INIT_LIST_HEAD(&mnt->mnt_slave);
> INIT_HLIST_NODE(&mnt->mnt_mp_list);
> + INIT_HLIST_NODE(&mnt->mnt_mr_list);
> #ifdef CONFIG_FSNOTIFY
> INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
> #endif
> @@ -779,6 +800,77 @@ static void put_mountpoint(struct mountpoint *mp)
> }
> }
>
> +static struct mountroot *lookup_mountroot(struct dentry *dentry)
> +{
> + struct hlist_head *chain = mr_hash(dentry);
> + struct mountroot *mr;
> +
> + hlist_for_each_entry(mr, chain, r_hash) {
> + if (mr->r_dentry == dentry)
> + return mr;
> + }
> + return NULL;
> +}
> +
> +static int mnt_set_root(struct mount *mnt, struct dentry *root)
> +{
> + struct mountroot *mr = NULL;
> +
> + read_seqlock_excl(&mount_lock);
> + if (d_mountroot(root))
> + mr = lookup_mountroot(root);
> + if (!mr) {
> + struct mountroot *new;
> + read_sequnlock_excl(&mount_lock);
> +
> + new = kmalloc(sizeof(struct mountroot), GFP_KERNEL);
> + if (!new)
> + return -ENOMEM;
> +
> + read_seqlock_excl(&mount_lock);
> + mr = lookup_mountroot(root);
> + if (mr) {
> + kfree(new);
> + } else {
> + struct hlist_head *chain = mr_hash(root);
> +
> + mr = new;
> + mr->r_dentry = root;
> + INIT_HLIST_HEAD(&mr->r_list);
> + hlist_add_head(&mr->r_hash, chain);
> +
> + spin_lock(&root->d_lock);
> + root->d_flags |= DCACHE_MOUNTROOT;
> + spin_unlock(&root->d_lock);
> + }
> + }
> + mnt->mnt.mnt_root = root;
> + hlist_add_head(&mnt->mnt_mr_list, &mr->r_list);
> + read_sequnlock_excl(&mount_lock);
> +
> + return 0;
> +}
> +
> +static void mnt_put_root(struct mount *mnt)
> +{
> + struct dentry *root = mnt->mnt.mnt_root;
> + struct mountroot *mr;
> +
> + read_seqlock_excl(&mount_lock);
> + mr = lookup_mountroot(root);
> + BUG_ON(!mr);
> + hlist_del(&mnt->mnt_mr_list);
> + if (hlist_empty(&mr->r_list)) {
> + hlist_del(&mr->r_hash);
> + spin_lock(&root->d_lock);
> + root->d_flags &= ~DCACHE_MOUNTROOT;
> + spin_unlock(&root->d_lock);
> + kfree(mr);
> + }
> + read_sequnlock_excl(&mount_lock);
> + dput(root);
> +}
> +
> static inline int check_mnt(struct mount *mnt)
> {
> return mnt->mnt_ns == current->nsproxy->mnt_ns;
> @@ -934,6 +1026,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
> {
> struct mount *mnt;
> struct dentry *root;
> + int err;
>
> if (!type)
> return ERR_PTR(-ENODEV);
> @@ -952,8 +1045,16 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
> return ERR_CAST(root);
> }
>
> - mnt->mnt.mnt_root = root;
> mnt->mnt.mnt_sb = root->d_sb;
> + err = mnt_set_root(mnt, root);
> + if (err) {
> + dput(root);
> + deactivate_super(mnt->mnt.mnt_sb);
> + mnt_free_id(mnt);
> + free_vfsmnt(mnt);
> + return ERR_PTR(err);
> + }
> +
> mnt->mnt_mountpoint = mnt->mnt.mnt_root;
> mnt->mnt_parent = mnt;
> lock_mount_hash();
> @@ -985,6 +1086,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
> goto out_free;
> }
>
> + err = mnt_set_root(mnt, root);
> + if (err)
> + goto out_free;
> +
> mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
> /* Don't allow unprivileged users to change mount flags */
> if (flag & CL_UNPRIVILEGED) {
> @@ -1010,7 +1115,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
>
> atomic_inc(&sb->s_active);
> mnt->mnt.mnt_sb = sb;
> - mnt->mnt.mnt_root = dget(root);
> + dget(root);
> mnt->mnt_mountpoint = mnt->mnt.mnt_root;
> mnt->mnt_parent = mnt;
> lock_mount_hash();
> @@ -1063,7 +1168,7 @@ static void cleanup_mnt(struct mount *mnt)
> if (unlikely(mnt->mnt_pins.first))
> mnt_pin_kill(mnt);
> fsnotify_vfsmount_delete(&mnt->mnt);
> - dput(mnt->mnt.mnt_root);
> + mnt_put_root(mnt);
> deactivate_super(mnt->mnt.mnt_sb);
> mnt_free_id(mnt);
> call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
> @@ -3120,14 +3225,21 @@ void __init mnt_init(void)
> mphash_entries, 19,
> 0,
> &mp_hash_shift, &mp_hash_mask, 0, 0);
> + mountroot_hashtable = alloc_large_system_hash("Mountroot-cache",
> + sizeof(struct hlist_head),
> + mrhash_entries, 19,
> + 0,
> + &mr_hash_shift, &mr_hash_mask, 0, 0);
>
> - if (!mount_hashtable || !mountpoint_hashtable)
> + if (!mount_hashtable || !mountpoint_hashtable || !mountroot_hashtable)
> panic("Failed to allocate mount hash table\n");
>
> for (u = 0; u <= m_hash_mask; u++)
> INIT_HLIST_HEAD(&mount_hashtable[u]);
> for (u = 0; u <= mp_hash_mask; u++)
> INIT_HLIST_HEAD(&mountpoint_hashtable[u]);
> + for (u = 0; u <= mr_hash_mask; u++)
> + INIT_HLIST_HEAD(&mountroot_hashtable[u]);
>
> kernfs_init();
>
> diff --git a/include/linux/dcache.h b/include/linux/dcache.h
> index d67ae119cf4e..52a5e6915f58 100644
> --- a/include/linux/dcache.h
> +++ b/include/linux/dcache.h
> @@ -228,6 +228,8 @@ struct dentry_operations {
> #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */
> #define DCACHE_OP_SELECT_INODE 0x02000000 /* Unioned entry: dcache op selects inode */
>
> +#define DCACHE_MOUNTROOT 0x04000000 /* Root of a vfsmount */
> +
> extern seqlock_t rename_lock;
>
> /*
> @@ -404,6 +406,11 @@ static inline bool d_mountpoint(const struct dentry *dentry)
> return dentry->d_flags & DCACHE_MOUNTED;
> }
>
> +static inline bool d_mountroot(const struct dentry *dentry)
> +{
> + return dentry->d_flags & DCACHE_MOUNTROOT;
> +}
> +
> /*
> * Directory cache entry type accessor functions.
> */
>
More information about the Containers
mailing list