[PATCH 1/9] exec_path 1/9: introduce ->exec_path and switch /proc/*/exe

Matt Helsley matthltc at us.ibm.com
Thu Jun 4 00:55:32 PDT 2009


On Thu, Jun 04, 2009 at 03:04:22AM +0400, Alexey Dobriyan wrote:
> On Sun, May 31, 2009 at 03:19:53PM -0700, Andrew Morton wrote:
> > On Mon, 1 Jun 2009 01:54:27 +0400 Alexey Dobriyan <adobriyan at gmail.com> wrote:
> > 
> > > And BTW, there is something unnatural when executable path is attached
> > > to mm_struct(!) not task_struct,
> > 
> > mm_struct is the central object for a heavyweight process.  All threads
> > within that process share the same executable path (don't they?) so
> > attaching the executable path to the mm seems OK to me.
> 
> OK, let's try this:
> 
> 
> [PATCH 1/9] exec_path 1/9: introduce ->exec_path and switch /proc/*/exe
> 
> ->exec_path marks executable which is associated with running task.
> Binfmt loader decides which executable is such and can, in theory,
> assign anything. Unlike the current status quo, where the first VM_EXECUTABLE
> mapping sort of marks the running executable.
> 
> If an executable unmaps all its VM_EXECUTABLE mappings, /proc/*/exe ceases
> to exist, ick! And userspace can't even use MAP_EXECUTABLE.

Surprising but intentional and unavoidable. More below..

> 
> Tasks which aren't created by running clone(2) and execve(2)
> (read: kernel threads) get empty ->exec_path and
> 
> ->exec_path is copied on clone(2) and put at do_exit() time.

Doesn't this pin the vfs mount of the executable for the lifetime of
the task?
 
That was one of Al Viro's objections to early revisions of the exe_file
patches. It's the reason the exe_file patches kept track of the number of
VM_EXECUTABLE mappings with num_exe_file_vmas.

I've cc'd Al so he can confirm/deny my recollection of this. Basically
some programs need to be able to umount the filesystem that backs their
executables. Being able to unmap these regions was effectively a
userspace API for unpinning these mounts. I needed to preserve that API,
hence the VMA ugliness of exe_file that you object to with the exe_file
patches.

I think patches 2-7 look great and could be adapted to use exe_file instead
of ->exec_path.

Cheers,
	-Matt Helsley

> 
> ->exec_path is going to replace struct mm_struct::exe_file et al
> and allows to remove VM_EXECUTABLE flag while keeping readlink("/proc/*/exe")
> without loop over all VMAs.
> 
> Signed-off-by: Alexey Dobriyan <adobriyan at gmail.com>
> ---
>  fs/binfmt_aout.c      |    1 +
>  fs/binfmt_elf.c       |    1 +
>  fs/binfmt_elf_fdpic.c |    1 +
>  fs/binfmt_flat.c      |    1 +
>  fs/binfmt_som.c       |    1 +
>  fs/proc/base.c        |   38 ++++++++++++++------------------------
>  include/linux/sched.h |   25 +++++++++++++++++++++++++
>  kernel/exit.c         |    1 +
>  kernel/fork.c         |    2 ++
>  9 files changed, 47 insertions(+), 24 deletions(-)
> 
> diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
> index b639dcf..a19b185 100644
> --- a/fs/binfmt_aout.c
> +++ b/fs/binfmt_aout.c
> @@ -379,6 +379,7 @@ beyond_if:
>  	regs->gp = ex.a_gpvalue;
>  #endif
>  	start_thread(regs, ex.a_entry, current->mm->start_stack);
> +	set_task_exec_path(current, &bprm->file->f_path);
>  	return 0;
>  }
> 
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 40381df..b815bfc 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -999,6 +999,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
>  #endif
> 
>  	start_thread(regs, elf_entry, bprm->p);
> +	set_task_exec_path(current, &bprm->file->f_path);
>  	retval = 0;
>  out:
>  	kfree(loc);
> diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
> index fdb66fa..f545504 100644
> --- a/fs/binfmt_elf_fdpic.c
> +++ b/fs/binfmt_elf_fdpic.c
> @@ -1185,6 +1185,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
>  		seg++;
>  	}
> 
> +	set_task_exec_path(current, &file->f_path);
>  	return 0;
>  }
> 
> diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
> index 697f6b5..a16f977 100644
> --- a/fs/binfmt_flat.c
> +++ b/fs/binfmt_flat.c
> @@ -798,6 +798,7 @@ static int load_flat_file(struct linux_binprm * bprm,
>  			libinfo->lib_list[id].start_brk) +	/* start brk */
>  			stack_len);
> 
> +	set_task_exec_path(current, &bprm->file->f_path);
>  	return 0;
>  err:
>  	return ret;
> diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
> index eff74b9..6c56262 100644
> --- a/fs/binfmt_som.c
> +++ b/fs/binfmt_som.c
> @@ -174,6 +174,7 @@ static int map_som_binary(struct file *file,
>  	up_write(&current->mm->mmap_sem);
>  	if (retval > 0 || retval < -1024)
>  		retval = 0;
> +	set_task_exec_path(current, &bprm->file->f_path);
>  out:
>  	set_fs(old_fs);
>  	return retval;
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 3326bbf..dc4ee6a 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -201,6 +201,20 @@ static int proc_root_link(struct inode *inode, struct path *path)
>  	return result;
>  }
> 
> +static int proc_exe_link(struct inode *inode, struct path *path)
> +{
> +	struct task_struct *tsk;
> +
> +	tsk = get_proc_task(inode);
> +	if (!tsk)
> +		return -ENOENT;
> +	get_task_exec_path(tsk, path);
> +	put_task_struct(tsk);
> +	if (!path->mnt || !path->dentry)
> +		return -ENOENT;
> +	return 0;
> +}
> +
>  /*
>   * Return zero if current may access user memory in @task, -error if not.
>   */
> @@ -1302,30 +1316,6 @@ void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
>  	newmm->exe_file = get_mm_exe_file(oldmm);
>  }
> 
> -static int proc_exe_link(struct inode *inode, struct path *exe_path)
> -{
> -	struct task_struct *task;
> -	struct mm_struct *mm;
> -	struct file *exe_file;
> -
> -	task = get_proc_task(inode);
> -	if (!task)
> -		return -ENOENT;
> -	mm = get_task_mm(task);
> -	put_task_struct(task);
> -	if (!mm)
> -		return -ENOENT;
> -	exe_file = get_mm_exe_file(mm);
> -	mmput(mm);
> -	if (exe_file) {
> -		*exe_path = exe_file->f_path;
> -		path_get(&exe_file->f_path);
> -		fput(exe_file);
> -		return 0;
> -	} else
> -		return -ENOENT;
> -}
> -
>  static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
>  {
>  	struct inode *inode = dentry->d_inode;
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index b4c38bc..6b2dd01 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1265,6 +1265,12 @@ struct task_struct {
>  #endif
>  /* CPU-specific state of this task */
>  	struct thread_struct thread;
> +	/*
> +	 * Executable, binfmt loader wants to associate with task
> +	 * (read: execve(2) argument).
> +	 * Empty, if concept isn't applicable, e. g. kernel thread.
> +	 */
> +	struct path exec_path;
>  /* filesystem information */
>  	struct fs_struct *fs;
>  /* open file information */
> @@ -2403,6 +2409,25 @@ static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
> 
>  #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
> 
> +static inline void get_task_exec_path(struct task_struct *tsk, struct path *path)
> +{
> +	task_lock(tsk);
> +	path_get(&tsk->exec_path);
> +	*path = tsk->exec_path;
> +	task_unlock(tsk);
> +}
> +
> +static inline void set_task_exec_path(struct task_struct *tsk, struct path *path)
> +{
> +	struct path old_path;
> +
> +	path_get(path);
> +	task_lock(tsk);
> +	old_path = tsk->exec_path;
> +	tsk->exec_path = *path;
> +	task_unlock(tsk);
> +	path_put(&old_path);
> +}
>  #endif /* __KERNEL__ */
> 
>  #endif
> diff --git a/kernel/exit.c b/kernel/exit.c
> index abf9cf3..8e70b54 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -962,6 +962,7 @@ NORET_TYPE void do_exit(long code)
> 
>  	exit_sem(tsk);
>  	exit_files(tsk);
> +	set_task_exec_path(tsk, &(struct path){ .mnt = NULL, .dentry = NULL });
>  	exit_fs(tsk);
>  	check_stack_usage();
>  	exit_thread();
> diff --git a/kernel/fork.c b/kernel/fork.c
> index b9e2edd..c0ee931 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1191,6 +1191,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>  	cgroup_fork_callbacks(p);
>  	cgroup_callbacks_done = 1;
> 
> +	get_task_exec_path(current, &p->exec_path);
> +
>  	/* Need tasklist lock for parent etc handling! */
>  	write_lock_irq(&tasklist_lock);
> 


More information about the Containers mailing list