[PATCH v8 1/2] seccomp: add a return code to trap to userspace

Tycho Andersen tycho at tycho.ws
Thu Nov 29 23:08:26 UTC 2018


On Mon, Oct 29, 2018 at 04:40:30PM -0600, Tycho Andersen wrote:
> +	resp.id = req.id;
> +	resp.error = -512; /* -ERESTARTSYS */
> +	resp.val = 0;
> +
> +	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);

So, it turns out this *doesn't* work, and the reason this test was
passing is because of poor hygiene on my part.

Per the documentation in include/linux/errno.h,

/*
 * These should never be seen by user programs.  To return one of ERESTART*
 * codes, signal_pending() MUST be set.  Note that ptrace can observe these
 * at syscall exit tracing, but they will never be left for the debugged user
 * process to see.
 */
#define ERESTARTSYS     512

So basically, if you respond with -ERESTARTSYS with no signal pending, you'll
leak it to userspace. It turns out this is already possible with
SECCOMP_RET_TRAP (and probably ptrace alone, although I didn't try it out),
see the program below.

The question is: do we care? If so, it seems like we may need to handle the
-ERESTARTSYS-style cases even when there is no signal pending. If we don't,
there's precedent for us to just do the same thing as what happens for
SECCOMP_RET_TRACE, but we should probably at least fix the docs.

Tycho


#include <stdio.h>
#include <stddef.h>
#include <stdbool.h>
#include <unistd.h>
#include <stdlib.h>
#include <errno.h>
#include <signal.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <sys/ptrace.h>
#include <sys/ioctl.h>
#include <inttypes.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <linux/ptrace.h>
#include <linux/elf.h>
#include <pthread.h>


static int filter_syscall(int syscall_nr)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, syscall_nr, 0, 1),
		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_TRACE),
		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
	};

	struct sock_fprog bpf_prog = {
		.len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
		.filter = filter,
	};

	int ret;

	ret = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, 0, &bpf_prog);
	if (ret < 0) {
		perror("prctl failed");
		return -1;
	}

	return ret;
}

typedef struct {
        uint64_t        r15;
        uint64_t        r14;
        uint64_t        r13;
        uint64_t        r12;
        uint64_t        bp;
        uint64_t        bx;
        uint64_t        r11;
        uint64_t        r10;
        uint64_t        r9;
        uint64_t        r8;
        uint64_t        ax;
        uint64_t        cx;
        uint64_t        dx;
        uint64_t        si;
        uint64_t        di;
        uint64_t        orig_ax;
        uint64_t        ip;
        uint64_t        cs;
        uint64_t        flags;
        uint64_t        sp;
        uint64_t        ss;
        uint64_t        fs_base;
        uint64_t        gs_base;
        uint64_t        ds;
        uint64_t        es;
        uint64_t        fs;
        uint64_t        gs;
} user_regs_struct64;

int main(int argc, char **argv)
{
	pid_t pid;
	user_regs_struct64 regs;
	struct iovec iov = {.iov_base = &regs, .iov_len = sizeof(regs)};
	int status;

	pid = fork();

	if (pid == 0) {
		if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
			perror("signal");
			exit(1);
		}

		if (filter_syscall(__NR_getpid) < 0)
			exit(1);

		/* i'm lazy, so sue me :) */
		sleep(1);

		errno = 0;
		pid = syscall(__NR_getpid);

		/*
		 * we get:
		 *   getpid(): -1, errno: 512
		 * probably should get
		 *   getpid(): <pid> errno: 0
		 */
		printf("getpid(): %d, errno: %d\n", pid, errno);
		exit(errno);
	}

	if (ptrace(PTRACE_ATTACH, pid, NULL, 0) < 0) {
		perror("ptrace attach");
		goto out;
	}

	if (waitpid(pid, NULL, 0) != pid) {
		perror("waitpid");
		goto out;
	}

	if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACESECCOMP) < 0) {
		perror("ptrace setoptions");
		goto out;
	}

	if (ptrace(PTRACE_CONT, pid, NULL, 0) != 0) {
		perror("ptrace cont");
		goto out;
	}

	if (waitpid(pid, &status, 0) != pid) {
		perror("wait for trace");
		goto out;
	}

	if (status >> 8 != (SIGTRAP | (PTRACE_EVENT_SECCOMP<<8))) {
		printf("got bad trap event?\n");
		goto out;
	}

	if (ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov) < 0) {
		perror("getregset");
		goto out;
	}

	/*
	 * Tell the syscall to restart. Per include/linux/errno.h this should
	 * only be set when signal_pending() is set. But we just won't send
	 * any signals to the process, and we'll see this in userspace.
	 */
	regs.ax = -512; /* -ERESTARTSYS */

	/*
	 * This makes the this_syscall < 0 check in __seccomp_filter()
	 * trigger, so we skip the syscall and return whatever is in ax
	 */
	regs.orig_ax = -512; /* -ERESTARTSYS */

	if (ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov) < 0) {
		perror("setregset");
		goto out;
	}

	if (ptrace(PTRACE_CONT, pid, NULL, 0) < 0) {
		perror("cont after setregset");
		goto out;
	}

	while (1) {
		if (waitpid(pid, &status, 0) != pid) {
			perror("wait for death");
			goto out;
		}

		if (!WIFSTOPPED(status)) {
			break;
		}

		if (ptrace(PTRACE_CONT, pid, NULL, 0) < 0) {
			perror("cont after setregset");
			goto out;
		}

	}

	if (WIFSIGNALED(status)) {
		printf("didn't exit: %d\n", WTERMSIG(status));
		return 1;
	}

	if (WEXITSTATUS(status)) {
		printf("exited: %d\n", WEXITSTATUS(status));
		return 1;
	}

	return 0;
out:
	kill(pid, SIGKILL);
	return 1;
}


More information about the Containers mailing list