[PATCH] c/r: do not hold mmap_sem while checkpointing vma's

Matt Helsley matthltc at us.ibm.com
Mon Oct 26 13:52:36 PDT 2009


On Sun, Oct 25, 2009 at 06:23:29PM -0400, Oren Laadan wrote:
> This patch modifies the memory checkpoint code to _not_ hold the
> mmap_sem while dumping out the vma's.
> 
> The problem with holding the mmap_sem is that it first takes the
> mmap_sem and then takes the file's inode semaphore. This violates the
> normal locking order, e,g, when taking a page fault during a copyout,
> which is inode sem and then the mmap_sem.
> 
> Normally this reverse locking order won't cause a lockup because a the
> output file for the checkpoint image isn't used by the checkpointee.
> However, there a couple of cases where it may be a problem, e.g. when
> some async-IO happens to complete and triggers a page fault at the
> wrong time.
> 
> This fixes complaints from the lockdep about this reverse ordering.
> 
> Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
> ---
>  checkpoint/memory.c |  133 ++++++++++++++++++++++++++++++++++++---------------
>  1 files changed, 94 insertions(+), 39 deletions(-)
> 
> diff --git a/checkpoint/memory.c b/checkpoint/memory.c
> index 0da948f..656614c 100644
> --- a/checkpoint/memory.c
> +++ b/checkpoint/memory.c
> @@ -644,11 +644,80 @@ static int anonymous_checkpoint(struct ckpt_ctx *ctx,
>  	return private_vma_checkpoint(ctx, vma, CKPT_VMA_ANON, 0);
>  }
> 
> +static int checkpoint_vmas(struct ckpt_ctx *ctx, struct mm_struct *mm)
> +{
> +	struct vm_area_struct *vma, *next;
> +	int map_count = 0;
> +	int ret = 0;
> +
> +	vma = kzalloc(sizeof(*vma), GFP_KERNEL);
> +	if (!vma)
> +		return -ENOMEM;
> +
> +	/*
> +	 * Must not hold mm->mmap_sem when writing to image file, so
> +	 * can't simply traverse the vma list. Instead, use find_vma()
> +	 * to get the @next and make a local "copy" of it.
> +	 */
> +	while (1) {
> +		down_read(&mm->mmap_sem);
> +		next = find_vma(mm, vma->vm_end);
> +		if (!next) {
> +			up_read(&mm->mmap_sem);
> +			break;
> +		}
> +		if (vma->vm_file)
> +			fput(vma->vm_file);
> +		*vma = *next;
> +		if (vma->vm_file)
> +			get_file(vma->vm_file);
> +		up_read(&mm->mmap_sem);
> +
> +		map_count++;
> +
> +		ckpt_debug("vma %#lx-%#lx flags %#lx\n",
> +			 vma->vm_start, vma->vm_end, vma->vm_flags);
> +
> +		if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
> +			ckpt_write_err(ctx, "TE", "vma: bad flags (%#lx)\n",
> +				       -ENOSYS, vma->vm_flags);
> +			ret = -ENOSYS;
> +			break;
> +		}
> +
> +		if (!vma->vm_ops)
> +			ret = anonymous_checkpoint(ctx, vma);
> +		else if (vma->vm_ops->checkpoint)
> +			ret = (*vma->vm_ops->checkpoint)(ctx, vma);
> +		else
> +			ret = -ENOSYS;
> +		if (ret < 0) {
> +			ckpt_write_err(ctx, "TE", "vma: failed", ret);
> +			break;
> +		}
> +		/*
> +		 * The file was collected, but not always checkpointed;
> +		 * be safe and mark as visited to appease leak detection
> +		 */
> +		if (vma->vm_file && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
> +			ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
> +			if (ret < 0)
> +				break;
> +		}
> +	}
> +
> +	if (vma->vm_file)
> +		fput(vma->vm_file);
> +
> +	kfree(vma);
> +
> +	return ret < 0 ? ret : map_count;
> +}
> +
>  static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
>  {
>  	struct ckpt_hdr_mm *h;
> -	struct vm_area_struct *vma;
> -	int exe_objref = 0;
> +	struct file *exe_file = NULL;
>  	int ret;
> 
>  	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_MM);
> @@ -674,14 +743,23 @@ static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
> 
>  	h->map_count = mm->map_count;
> 
> -	/* checkpoint the ->exe_file */
> -	if (mm->exe_file) {
> -		exe_objref = checkpoint_obj(ctx, mm->exe_file, CKPT_OBJ_FILE);
> -		if (exe_objref < 0) {
> -			ret = exe_objref;
> +	if (mm->exe_file) {  /* checkpoint the ->exe_file */
> +		exe_file = mm->exe_file;
> +		get_file(exe_file);
> +	}
> +
> +	/*
> +	 * Drop mm->mmap_sem before writing data to checkpoint image
> +	 * to avoid reverse locking order (inode must come before mm).
> +	 */
> +	up_read(&mm->mmap_sem);
> +
> +	if (exe_file) {
> +		h->exe_objref = checkpoint_obj(ctx, exe_file, CKPT_OBJ_FILE);
> +		if (h->exe_objref < 0) {
> +			ret = h->exe_objref;
>  			goto out;
>  		}
> -		h->exe_objref = exe_objref;
>  	}
> 
>  	ret = ckpt_write_obj(ctx, &h->h);
> @@ -692,40 +770,17 @@ static int do_checkpoint_mm(struct ckpt_ctx *ctx, struct mm_struct *mm)
>  	if (ret < 0)
>  		return ret;
> 
> -	/* write the vma's */
> -	for (vma = mm->mmap; vma; vma = vma->vm_next) {
> -		ckpt_debug("vma %#lx-%#lx flags %#lx\n",
> -			 vma->vm_start, vma->vm_end, vma->vm_flags);
> -		if (vma->vm_flags & CKPT_VMA_NOT_SUPPORTED) {
> -			ckpt_write_err(ctx, "TE", "vma: bad flags (%#lx)\n",
> -				       -ENOSYS, vma->vm_flags);
> -			return -ENOSYS;
> -		}
> -		if (!vma->vm_ops)
> -			ret = anonymous_checkpoint(ctx, vma);
> -		else if (vma->vm_ops->checkpoint)
> -			ret = (*vma->vm_ops->checkpoint)(ctx, vma);
> -		else
> -			ret = -ENOSYS;
> -		if (ret < 0) {
> -			ckpt_write_err(ctx, "TE", "vma: failed", ret);
> -			goto out;
> -		}
> -		/*
> -		 * The file was collected, but not always checkpointed;
> -		 * be safe and mark as visited to appease leak detection
> -		 */
> -		if (vma->vm_file && !(ctx->uflags & CHECKPOINT_SUBTREE)) {
> -			ret = ckpt_obj_visit(ctx, vma->vm_file, CKPT_OBJ_FILE);
> -			if (ret < 0)
> -				goto out;
> -		}
> -	}
> +	ret = checkpoint_vmas(ctx, mm);
> +	if (ret != h->map_count && ret >= 0)
> +		ret = -EBUSY; /* checkpoint mm leak */
> +	if (ret < 0)
> +		goto out;
> 
>  	ret = checkpoint_mm_context(ctx, mm);
>   out:
> +	if (exe_file)
> +		fput(exe_file);
>  	ckpt_hdr_put(ctx, h);
> -	up_read(&mm->mmap_sem);
>  	return ret;
>  }
> 
> @@ -1288,9 +1343,9 @@ static struct mm_struct *do_restore_mm(struct ckpt_ctx *ctx)
>  		}
>  		set_mm_exe_file(mm, file);
>  	}
> +	up_write(&mm->mmap_sem);
> 
>  	ret = _ckpt_read_buffer(ctx, mm->saved_auxv, sizeof(mm->saved_auxv));
> -	up_write(&mm->mmap_sem);
>  	if (ret < 0)
>  		goto out;
> 
> -- 

At least in the restart path it's interesting to see how Alexey did it
without mmap_sem, at least for part of it:

http://patchwork.kernel.org/patch/25337/

(search for kstate_restore_mm_struct())

Is that a feasible and more-suitable approach for the initial portions
of mm restore?

Cheers,
	-Matt Helsley


More information about the Containers mailing list