[PATCH] c/r: fix "scheduling in atomic" while restoring ipc shm

Oren Laadan orenl at cs.columbia.edu
Wed Feb 24 18:53:31 PST 2010


Hi Nikita,

Thanks for the report and the analysis. It actually helped to
pinpoint a couple of other minor issues in the code. This patch
should fix all of these.

Oren.


Oren Laadan wrote:
> """
>   While playing with checkpoint-restart code, version
>   several-commits-before-0.19, we have faced "scheduling in atomic" issue.
>   ...
>   So restore_ipc_shm() calls ipc_lock() and then restore_memory_contents().
>   Inside ipc_lock(), a spinlock is taken.
>   Inside restore_memory_contents(), checkpoint data is read, that results
>   in vfs_read() and a schedule somewhere below.
>   ...
>   Another related bug: if load_ipc_shm_hdr() fails in line 257, control
>   is transfered to mutex: label with negative ret value; ipc_unlock()
>   is not called on this path.
> """
> 
> Actually, the error could have occurred with other sleeping functions,
> for example, security_restore_obj().
> 
> The fix is to simply not take ipc_lock(). this relies on the fact that
> IPC is always restored to a brand new ipc namespace which is only
> accessible to the restarting process(es):
> 
> 1) The ipc object (id) could not have been deleted between its creation
>   and taking the rw_mutex.
> 
> 2) No unauthorized task will attempt to gain access to it, so it is
>   safe to do away with ipc_lock(). This is useful because we can call
>   functions that sleep.
> 
> 3) Likewise, we only restore the security bits further below, so it is
>   safe to ignore a security (theoretical only!) race.
> 
> If (and when) we allow to restore the ipc state within the parent's
> namespace, we will need to re-examine this.
> 
> While at it, there is also a cleanup to not restore perm->{id,key,seq}
> or the shm->shm_segsz, as they are all set upon creation properly.
> 
> Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
> Reported-by: "Nikita V. Youshchenko" <yoush at cs.msu.su>
> ---
>  ipc/checkpoint.c     |    9 ++----
>  ipc/checkpoint_msg.c |   35 +++++++++++++++++-------
>  ipc/checkpoint_sem.c |   33 ++++++++++++++++------
>  ipc/checkpoint_shm.c |   74 +++++++++++++++++++++++++++++++++-----------------
>  4 files changed, 101 insertions(+), 50 deletions(-)
> 
> diff --git a/ipc/checkpoint.c b/ipc/checkpoint.c
> index 4322dea..06027c2 100644
> --- a/ipc/checkpoint.c
> +++ b/ipc/checkpoint.c
> @@ -203,9 +203,6 @@ int restore_load_ipc_perms(struct ckpt_ctx *ctx,
>  
>  	/* FIX: verify the ->mode field makes sense */
>  
> -	perm->id = h->id;
> -	perm->key = h->key;
> -
>  	if (!validate_created_perms(h))
>  		return -EPERM;
>  	perm->uid = h->uid;
> @@ -213,10 +210,10 @@ int restore_load_ipc_perms(struct ckpt_ctx *ctx,
>  	perm->cuid = h->cuid;
>  	perm->cgid = h->cgid;
>  	perm->mode = h->mode;
> -	perm->seq = h->seq;
>  
> -	return security_restore_obj(ctx, (void *)perm, CKPT_SECURITY_IPC,
> -				   h->sec_ref);
> +	return security_restore_obj(ctx, (void *)perm,
> +				    CKPT_SECURITY_IPC,
> +				    h->sec_ref);
>  }
>  
>  static int restore_ipc_any(struct ckpt_ctx *ctx, struct ipc_namespace *ipc_ns,
> diff --git a/ipc/checkpoint_msg.c b/ipc/checkpoint_msg.c
> index 61b3d78..073a893 100644
> --- a/ipc/checkpoint_msg.c
> +++ b/ipc/checkpoint_msg.c
> @@ -306,7 +306,7 @@ static int restore_msg_contents(struct ckpt_ctx *ctx, struct list_head *queue,
>  int restore_ipc_msg(struct ckpt_ctx *ctx, struct ipc_namespace *ns)
>  {
>  	struct ckpt_hdr_ipc_msg *h;
> -	struct kern_ipc_perm *perms;
> +	struct kern_ipc_perm *ipc;
>  	struct msg_queue *msq;
>  	struct ipc_ids *msg_ids = &ns->ids[IPC_MSG_IDS];
>  	struct list_head messages;
> @@ -344,12 +344,26 @@ int restore_ipc_msg(struct ckpt_ctx *ctx, struct ipc_namespace *ns)
>  
>  	down_write(&msg_ids->rw_mutex);
>  
> -	/* we are the sole owners/users of this ipc_ns, it can't go away */
> -	perms = ipc_lock(msg_ids, h->perms.id);
> -	BUG_ON(IS_ERR(perms));	/* ipc_ns is private to us */
> +	/*
> +	 * We are the sole owners/users of this brand new ipc-ns, so:
> +	 *
> +	 * 1) The msgid could not have been deleted between its creation
> +	 *   and taking the rw_mutex above.
> +	 * 2) No unauthorized task will attempt to gain access to it,
> +	 *   so it is safe to do away with ipc_lock(). This is useful
> +	 *   because we can call functions that sleep.
> +	 * 3) Likewise, we only restore the security bits further below,
> +	 *   so it is safe to ignore this (theoretical only!) race.
> +	 *
> +	 * If/when we allow to restore the ipc state within the parent's
> +	 * ipc-ns, we will need to re-examine this.
> +	 */
> +
> +	ipc = idr_find(&msg_ids->ipcs_idr, h->perms.id);
> +	BUG_ON(!ipc);
>  
> -	msq = container_of(perms, struct msg_queue, q_perm);
> -	BUG_ON(!list_empty(&msq->q_messages));	/* ipc_ns is private to us */
> +	msq = container_of(ipc, struct msg_queue, q_perm);
> +	BUG_ON(!list_empty(&msq->q_messages));
>  
>  	/* attach queued messages we read before */
>  	list_splice_init(&messages, &msq->q_messages);
> @@ -362,13 +376,14 @@ int restore_ipc_msg(struct ckpt_ctx *ctx, struct ipc_namespace *ns)
>  	msq->q_qnum = h->q_qnum;
>  
>  	ret = load_ipc_msg_hdr(ctx, h, msq);
> -
>  	if (ret < 0) {
>  		ckpt_debug("msq: need to remove (%d)\n", ret);
> -		freeque(ns, perms);
> -	} else
> -		ipc_unlock(perms);
> +		ipc_lock_by_ptr(&msq->q_perm);
> +		freeque(ns, ipc);
> +		ipc_unlock(ipc);
> +	}
>  	up_write(&msg_ids->rw_mutex);
> +
>   out:
>  	free_msg_list(&messages);  /* no-op if all ok, else cleanup msgs */
>  	ckpt_hdr_put(ctx, h);
> diff --git a/ipc/checkpoint_sem.c b/ipc/checkpoint_sem.c
> index 395c84d..78c1932 100644
> --- a/ipc/checkpoint_sem.c
> +++ b/ipc/checkpoint_sem.c
> @@ -166,7 +166,7 @@ static struct sem *restore_sem_array(struct ckpt_ctx *ctx, int nsems)
>  int restore_ipc_sem(struct ckpt_ctx *ctx, struct ipc_namespace *ns)
>  {
>  	struct ckpt_hdr_ipc_sem *h;
> -	struct kern_ipc_perm *perms;
> +	struct kern_ipc_perm *ipc;
>  	struct sem_array *sem;
>  	struct sem *sma = NULL;
>  	struct ipc_ids *sem_ids = &ns->ids[IPC_SEM_IDS];
> @@ -201,19 +201,34 @@ int restore_ipc_sem(struct ckpt_ctx *ctx, struct ipc_namespace *ns)
>  
>  	down_write(&sem_ids->rw_mutex);
>  
> -	/* we are the sole owners/users of this ipc_ns, it can't go away */
> -	perms = ipc_lock(sem_ids, h->perms.id);
> -	BUG_ON(IS_ERR(perms));  /* ipc_ns is private to us */
> -
> -	sem = container_of(perms, struct sem_array, sem_perm);
> +	/*
> +	 * We are the sole owners/users of this brand new ipc-ns, so:
> +	 *
> +	 * 1) The semid could not have been deleted between its creation
> +	 *   and taking the rw_mutex above.
> +	 * 2) No unauthorized task will attempt to gain access to it,
> +	 *   so it is safe to do away with ipc_lock(). This is useful
> +	 *   because we can call functions that sleep.
> +	 * 3) Likewise, we only restore the security bits further below,
> +	 *   so it is safe to ignore this (theoretical only!) race.
> +	 *
> +	 * If/when we allow to restore the ipc state within the parent's
> +	 * ipc-ns, we will need to re-examine this.
> +	 */
> +
> +	ipc = idr_find(&sem_ids->ipcs_idr, h->perms.id);
> +	BUG_ON(!ipc);
> +
> +	sem = container_of(ipc, struct sem_array, sem_perm);
>  	memcpy(sem->sem_base, sma, sem->sem_nsems * sizeof(*sma));
>  
>  	ret = load_ipc_sem_hdr(ctx, h, sem);
>  	if (ret < 0) {
> +		ipc_lock_by_ptr(&sem->sem_perm);
>  		ckpt_debug("sem: need to remove (%d)\n", ret);
> -		freeary(ns, perms);
> -	} else
> -		ipc_unlock(perms);
> +		freeary(ns, ipc);
> +		ipc_unlock(ipc);
> +	}
>  	up_write(&sem_ids->rw_mutex);
>   out:
>  	kfree(sma);
> diff --git a/ipc/checkpoint_shm.c b/ipc/checkpoint_shm.c
> index 01091d9..6329245 100644
> --- a/ipc/checkpoint_shm.c
> +++ b/ipc/checkpoint_shm.c
> @@ -144,18 +144,27 @@ struct dq_ipcshm_del {
>  	int id;
>  };
>  
> -static int ipc_shm_delete(void *data)
> +static int _ipc_shm_delete(struct ipc_namespace *ns, int id)
>  {
> -	struct dq_ipcshm_del *dq = (struct dq_ipcshm_del *) data;
>  	mm_segment_t old_fs;
>  	int ret;
>  
>  	old_fs = get_fs();
>  	set_fs(get_ds());
> -	ret = shmctl_down(dq->ipcns, dq->id, IPC_RMID, NULL, 0);
> +	ret = shmctl_down(ns, id, IPC_RMID, NULL, 0);
>  	set_fs(old_fs);
>  
> +	return ret;
> +}
> +
> +static int ipc_shm_delete(void *data)
> +{
> +	struct dq_ipcshm_del *dq = (struct dq_ipcshm_del *) data;
> +	int ret;
> +
> +	ret = _ipc_shm_delete(dq->ipcns, dq->id);
>  	put_ipc_ns(dq->ipcns);
> +
>  	return ret;
>  }
>  
> @@ -175,7 +184,6 @@ static int load_ipc_shm_hdr(struct ckpt_ctx *ctx,
>  	if (h->shm_cprid < 0 || h->shm_lprid < 0)
>  		return -EINVAL;
>  
> -	shp->shm_segsz = h->shm_segsz;
>  	shp->shm_atim = h->shm_atim;
>  	shp->shm_dtim = h->shm_dtim;
>  	shp->shm_ctim = h->shm_ctim;
> @@ -188,7 +196,7 @@ static int load_ipc_shm_hdr(struct ckpt_ctx *ctx,
>  int restore_ipc_shm(struct ckpt_ctx *ctx, struct ipc_namespace *ns)
>  {
>  	struct ckpt_hdr_ipc_shm *h;
> -	struct kern_ipc_perm *perms;
> +	struct kern_ipc_perm *ipc;
>  	struct shmid_kernel *shp;
>  	struct ipc_ids *shm_ids = &ns->ids[IPC_SHM_IDS];
>  	struct file *file;
> @@ -213,6 +221,14 @@ int restore_ipc_shm(struct ckpt_ctx *ctx, struct ipc_namespace *ns)
>  	if (h->flags & SHM_HUGETLB)	/* FIXME: support SHM_HUGETLB */
>  		goto out;
>  
> +	shmflag = h->flags | h->perms.mode | IPC_CREAT | IPC_EXCL;
> +	ckpt_debug("shm: do_shmget size %lld flag %#x id %d\n",
> +		 h->shm_segsz, shmflag, h->perms.id);
> +	ret = do_shmget(ns, h->perms.key, h->shm_segsz, shmflag, h->perms.id);
> +	ckpt_debug("shm: do_shmget ret %d\n", ret);
> +	if (ret < 0)
> +		goto out;
> +
>  	/*
>  	 * SHM_DEST means that the shm is to be deleted after creation.
>  	 * However, deleting before it's actually attached is quite silly.
> @@ -228,29 +244,36 @@ int restore_ipc_shm(struct ckpt_ctx *ctx, struct ipc_namespace *ns)
>  		dq.ipcns = ns;
>  		get_ipc_ns(dq.ipcns);
>  
> -		/* XXX can safely use put_ipc_ns() as dtor, see above */
>  		ret = deferqueue_add(ctx->deferqueue, &dq, sizeof(dq),
>  				     (deferqueue_func_t) ipc_shm_delete,
> -				     (deferqueue_func_t) put_ipc_ns);
> -		if (ret < 0)
> +				     (deferqueue_func_t) ipc_shm_delete);
> +		if (ret < 0) {
> +			ipc_shm_delete((void *) &dq);
>  			goto out;
> +		}
>  	}
>  
> -	shmflag = h->flags | h->perms.mode | IPC_CREAT | IPC_EXCL;
> -	ckpt_debug("shm: do_shmget size %lld flag %#x id %d\n",
> -		 h->shm_segsz, shmflag, h->perms.id);
> -	ret = do_shmget(ns, h->perms.key, h->shm_segsz, shmflag, h->perms.id);
> -	ckpt_debug("shm: do_shmget ret %d\n", ret);
> -	if (ret < 0)
> -		goto out;
> -
>  	down_write(&shm_ids->rw_mutex);
>  
> -	/* we are the sole owners/users of this ipc_ns, it can't go away */
> -	perms = ipc_lock(shm_ids, h->perms.id);
> -	BUG_ON(IS_ERR(perms));  /* ipc_ns is private to us */
> +	/*
> +	 * We are the sole owners/users of this brand new ipc-ns, so:
> +	 *
> +	 * 1) The shmid could not have been deleted between its creation
> +	 *   and taking the rw_mutex above.
> +	 * 2) No unauthorized task will attempt to gain access to it,
> +	 *   so it is safe to do away with ipc_lock(). This is useful
> +	 *   because we can call functions that sleep.
> +	 * 3) Likewise, we only restore the security bits further below,
> +	 *   so it is safe to ignore this (theoretical only!) race.
> +	 *
> +	 * If/when we allow to restore the ipc state within the parent's
> +	 * ipc-ns, we will need to re-examine this.
> +	 */
> +
> +	ipc = idr_find(&shm_ids->ipcs_idr, h->perms.id);
> +	BUG_ON(!ipc);
>  
> -	shp = container_of(perms, struct shmid_kernel, shm_perm);
> +	shp = container_of(ipc, struct shmid_kernel, shm_perm);
>  	file = shp->shm_file;
>  	get_file(file);
>  
> @@ -265,12 +288,13 @@ int restore_ipc_shm(struct ckpt_ctx *ctx, struct ipc_namespace *ns)
>  	ret = restore_memory_contents(ctx, file->f_dentry->d_inode);
>   mutex:
>  	fput(file);
> -	if (ret < 0) {
> -		ckpt_debug("shm: need to remove (%d)\n", ret);
> -		do_shm_rmid(ns, perms);
> -	} else
> -		ipc_unlock(perms);
>  	up_write(&shm_ids->rw_mutex);
> +
> +	/* discard this shmid if error and deferqueue wasn't set */
> +	if (ret < 0 && !(h->perms.mode & SHM_DEST)) {
> +		ckpt_debug("shm: need to remove (%d)\n", ret);
> +		_ipc_shm_delete(ns, h->perms.id);
> +	}
>   out:
>  	ckpt_hdr_put(ctx, h);
>  	return ret;


More information about the Containers mailing list