[PATCH 1/3] seccomp: Return from SECCOMP_IOCTL_NOTIF_RECV when children are gone

Sargun Dhillon sargun at sargun.me
Wed Nov 4 11:18:02 UTC 2020


On Mon, Nov 02, 2020 at 09:37:04PM +0100, Jann Horn wrote:
> At the moment, the seccomp notifier API is hard to use without combining
> it with APIs like poll() or epoll(); if all target processes have gone
> away, the polling APIs will raise an error indication on the file
> descriptor, but SECCOMP_IOCTL_NOTIF_RECV will keep blocking indefinitely.
> 
> This usability problem was discovered when Michael Kerrisk wrote a
> manpage for this API.
> 
> To fix it, get rid of the semaphore logic and let SECCOMP_IOCTL_NOTIF_RECV
> behave as follows:
> 
> If O_NONBLOCK is set, SECCOMP_IOCTL_NOTIF_RECV always returns
> immediately, no matter whether a notification is available or not.
> 
> If O_NONBLOCK is unset, SECCOMP_IOCTL_NOTIF_RECV blocks until either a
> notification is delivered to userspace or all users of the filter have
> gone away.
> 
> To avoid subtle breakage from eventloop-style code that doesn't set
> O_NONBLOCK, set O_NONBLOCK by default - userspace can clear it if it
> wants blocking behavior, and if blocking-style code forgets to do so,
> that will be much more obvious than the breakage we'd get the other way
> around.
> This also means that UAPI breakage from this change should be limited to
> blocking users of the API, of which, to my knowledge, there are none so far
> (outside of in-tree sample and selftest code, which this patch adjusts - in
> particular the code in samples/ has to change a bunch).
> 
> This should be backported because future userspace code might otherwise not
> work properly on old kernels.
> 
> Cc: stable at vger.kernel.org
> Reported-by: Michael Kerrisk <mtk.manpages at gmail.com>
> Signed-off-by: Jann Horn <jannh at google.com>
> ---
>  kernel/seccomp.c                              | 62 +++++++++++++------
>  samples/seccomp/user-trap.c                   | 16 +++++
>  tools/testing/selftests/seccomp/seccomp_bpf.c | 21 +++++++
>  3 files changed, 79 insertions(+), 20 deletions(-)
> 
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 8ad7a293255a..b3730740515f 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -43,6 +43,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/anon_inodes.h>
>  #include <linux/lockdep.h>
> +#include <linux/freezer.h>
>  
>  /*
>   * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
> @@ -138,7 +139,6 @@ struct seccomp_kaddfd {
>   * @notifications: A list of struct seccomp_knotif elements.
>   */
>  struct notification {
> -	struct semaphore request;
>  	u64 next_id;
>  	struct list_head notifications;
>  };
> @@ -863,7 +863,6 @@ static int seccomp_do_user_notification(int this_syscall,
>  	list_add(&n.list, &match->notif->notifications);
>  	INIT_LIST_HEAD(&n.addfd);
>  
> -	up(&match->notif->request);
>  	wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
>  	mutex_unlock(&match->notify_lock);
>  
> @@ -1179,9 +1178,10 @@ find_notification(struct seccomp_filter *filter, u64 id)
>  
>  
>  static long seccomp_notify_recv(struct seccomp_filter *filter,
> -				void __user *buf)
> +				void __user *buf, bool blocking)
>  {
>  	struct seccomp_knotif *knotif = NULL, *cur;
> +	DEFINE_WAIT(wait);
>  	struct seccomp_notif unotif;
>  	ssize_t ret;
>  
> @@ -1194,11 +1194,9 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
>  
>  	memset(&unotif, 0, sizeof(unotif));
>  
> -	ret = down_interruptible(&filter->notif->request);
> -	if (ret < 0)
> -		return ret;
> -
>  	mutex_lock(&filter->notify_lock);
> +
> +retry:
>  	list_for_each_entry(cur, &filter->notif->notifications, list) {
>  		if (cur->state == SECCOMP_NOTIFY_INIT) {
>  			knotif = cur;
> @@ -1206,14 +1204,40 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
>  		}
>  	}
>  
> -	/*
> -	 * If we didn't find a notification, it could be that the task was
> -	 * interrupted by a fatal signal between the time we were woken and
> -	 * when we were able to acquire the rw lock.
> -	 */
>  	if (!knotif) {
> -		ret = -ENOENT;
> -		goto out;
> +		if (!blocking) {
> +			if (refcount_read(&filter->users) == 0)
> +				ret = -ENOTCONN;
> +			else
> +				ret = -ENOENT;
> +			goto out;
> +		}
> +
> +		/* This has to happen before checking &filter->users. */
> +		prepare_to_wait(&filter->wqh, &wait, TASK_INTERRUPTIBLE);

Isn't there a subtle race condition here, that if we are in blocking mode, and 
we get a notification here, we wont be woken up? since we enter the wait queue
after checking to see if there are pending notifications, and in that period
the wake_up_poll could have been called?

I very well might be missing something interesting about the semantics of
freezable_schedule.

shouldn't it read something like:

retry:
	mutex_lock(...);
	ret = prepare_to_wait_event(&filter->wqh, &wait, TASK_INTERRUPTIBLE);
	if (ret)
		goto out;
	list_for_each_entry(cur, &filter->notif->notifications, list) {
		if (cur->state == SECCOMP_NOTIFY_INIT) {
			knotif = cur;
			break;
		}
	}

	/* Maybe this should be before we iterate over the list of outstanding notifcations? */
	if (refcount_read(&filter->users) == 0) {
		ret = -ENOTCONN;
		goto out;
	}

	if (!knotif) {
		if (!blocking) {
			ret = -ENOENT;
			goto out;
		}
		mutex_unlock(...);
		freezable_schedule();
		goto retry;
	}


out:
	finish_wait(&filter->wqh, &wait);
	mutex_unlock(...);
> +
> +		/*
> +		 * If all users of the filter are gone, throw an error
> +		 * instead of pointlessly continuing to block.
> +		 */
> +		if (refcount_read(&filter->users) == 0) {
> +			finish_wait(&filter->wqh, &wait);
> +			ret = -ENOTCONN;
> +			goto out;
> +		}
> +
> +		/* No notifications - wait for one... */
> +		mutex_unlock(&filter->notify_lock);
> +		freezable_schedule();
> +		finish_wait(&filter->wqh, &wait);
> +
> +		/* ... and retry */
> +		mutex_lock(&filter->notify_lock);
> +		if (signal_pending(current)) {
> +			ret = -EINTR;
> +			goto out;
> +		}
> +		goto retry;
>  	}
>  
>  	unotif.id = knotif->id;
> @@ -1237,10 +1261,8 @@ static long seccomp_notify_recv(struct seccomp_filter *filter,
>  		 */
>  		mutex_lock(&filter->notify_lock);
>  		knotif = find_notification(filter, unotif.id);
> -		if (knotif) {
> +		if (knotif)
>  			knotif->state = SECCOMP_NOTIFY_INIT;
> -			up(&filter->notif->request);
> -		}
>  		mutex_unlock(&filter->notify_lock);
>  	}
>  
> @@ -1416,11 +1438,12 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
>  {
>  	struct seccomp_filter *filter = file->private_data;
>  	void __user *buf = (void __user *)arg;
> +	bool blocking = !(file->f_flags & O_NONBLOCK);
>  
>  	/* Fixed-size ioctls */
>  	switch (cmd) {
>  	case SECCOMP_IOCTL_NOTIF_RECV:
> -		return seccomp_notify_recv(filter, buf);
> +		return seccomp_notify_recv(filter, buf, blocking);
>  	case SECCOMP_IOCTL_NOTIF_SEND:
>  		return seccomp_notify_send(filter, buf);
>  	case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
> @@ -1483,12 +1506,11 @@ static struct file *init_listener(struct seccomp_filter *filter)
>  	if (!filter->notif)
>  		goto out;
>  
> -	sema_init(&filter->notif->request, 0);
>  	filter->notif->next_id = get_random_u64();
>  	INIT_LIST_HEAD(&filter->notif->notifications);
>  
>  	ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
> -				 filter, O_RDWR);
> +				 filter, O_RDWR|O_NONBLOCK);
>  	if (IS_ERR(ret))
>  		goto out_notif;
>  
> diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c
> index 20291ec6489f..b9e666f15998 100644
> --- a/samples/seccomp/user-trap.c
> +++ b/samples/seccomp/user-trap.c
> @@ -198,6 +198,17 @@ static int handle_req(struct seccomp_notif *req,
>  	return ret;
>  }
>  
> +static void set_blocking(int fd)
> +{
> +	int file_status_flags = fcntl(fd, F_GETFL);
> +
> +	file_status_flags &= ~O_NONBLOCK;
> +	if (fcntl(fd, F_SETFL, file_status_flags)) {
> +		perror("F_SETFL");
> +		exit(1);
> +	}
> +}
> +
>  int main(void)
>  {
>  	int sk_pair[2], ret = 1, status, listener;
> @@ -274,6 +285,8 @@ int main(void)
>  	if (listener < 0)
>  		goto out_kill;
>  
> +	set_blocking(listener);
> +
>  	/*
>  	 * Fork a task to handle the requests. This isn't strictly necessary,
>  	 * but it makes the particular writing of this sample easier, since we
> @@ -307,6 +320,9 @@ int main(void)
>  		while (1) {
>  			memset(req, 0, sizes.seccomp_notif);
>  			if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
> +				if (errno == ENOTCONN)
> +					exit(0);
> +
>  				perror("ioctl recv");
>  				goto out_resp;
>  			}
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 4a180439ee9e..5318f9cb1aec 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -295,6 +295,13 @@ static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
>  #endif
>  }
>  
> +#define set_blocking(fd) ({					\
> +	int file_status_flags;					\
> +	file_status_flags = fcntl(fd, F_GETFL);			\
> +	file_status_flags &= ~O_NONBLOCK;			\
> +	ASSERT_EQ(fcntl(fd, F_SETFL, file_status_flags), 0);	\
> +})
> +
>  /* Have TH_LOG report actual location filecmp() is used. */
>  #define filecmp(pid1, pid2, fd1, fd2)	({		\
>  	int _ret;					\
> @@ -3422,6 +3429,8 @@ TEST(user_notification_kill_in_middle)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	/*
>  	 * Check that nothing bad happens when we kill the task in the middle
>  	 * of a syscall.
> @@ -3476,6 +3485,8 @@ TEST(user_notification_signal)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -3583,6 +3594,8 @@ TEST(user_notification_child_pid_ns)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -3623,6 +3636,8 @@ TEST(user_notification_sibling_pid_ns)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -3691,6 +3706,8 @@ TEST(user_notification_fault_recv)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -3972,6 +3989,8 @@ TEST(user_notification_addfd)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> @@ -4099,6 +4118,8 @@ TEST(user_notification_addfd_rlimit)
>  				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
>  	ASSERT_GE(listener, 0);
>  
> +	set_blocking(listener);
> +
>  	pid = fork();
>  	ASSERT_GE(pid, 0);
>  
> 
> base-commit: 4525c8781ec0701ce824e8bd379ae1b129e26568
> -- 
> 2.29.1.341.ge80a0c044ae-goog
> 


More information about the Containers mailing list