[PATCH net-next 1/3] bpf, seccomp: Add eBPF filter capabilities

Tue Feb 13 20:34:20 UTC 2018

On Tue, Feb 13, 2018 at 7:42 AM, Sargun Dhillon <sargun at sargun.me> wrote:
> From: Sargun Dhillon <sargun at netflix.com>
>
> This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant
> to be used for seccomp filters as an alternative to cBPF filters. The
> program type has relatively limited capabilities in terms of helpers,
> but that can be extended later on.
>
> It also introduces a new mechanism to attach these filters via the
> prctl and seccomp syscalls -- SECCOMP_MODE_FILTER_EXTENDED, and
> SECCOMP_SET_MODE_FILTER_EXTENDED respectively.
>
> Signed-off-by: Sargun Dhillon <sargun at sargun.me>
> ---
>  arch/Kconfig                 |   7 ++
>  include/linux/bpf_types.h    |   3 +
>  include/uapi/linux/bpf.h     |   2 +
>  include/uapi/linux/seccomp.h |  15 +++--
>  kernel/bpf/syscall.c         |   1 +
>  kernel/seccomp.c             | 148 +++++++++++++++++++++++++++++++++++++------
>  6 files changed, 150 insertions(+), 26 deletions(-)
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 76c0b54443b1..db675888577c 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -401,6 +401,13 @@ config SECCOMP_FILTER
>
>           See Documentation/prctl/seccomp_filter.txt for details.
>
> +config SECCOMP_FILTER_EXTENDED
> +       bool "Extended BPF seccomp filters"
> +       depends on SECCOMP_FILTER && BPF_SYSCALL
> +       help
> +         Enables seccomp filters to be written in eBPF, as opposed
> +         to just cBPF filters.
> +
>  config HAVE_GCC_PLUGINS
>         bool
>         help
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index 19b8349a3809..945c65c4e461 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
>  #ifdef CONFIG_CGROUP_BPF
>  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
>  #endif
> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
> +BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp)
> +#endif
>
>  BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
>  BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index db6bdc375126..5f96cb7ed954 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1,3 +1,4 @@
> +
>  /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
>  /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
>   *
> @@ -133,6 +134,7 @@ enum bpf_prog_type {
>         BPF_PROG_TYPE_SOCK_OPS,
>         BPF_PROG_TYPE_SK_SKB,
>         BPF_PROG_TYPE_CGROUP_DEVICE,
> +       BPF_PROG_TYPE_SECCOMP,
>  };
>
>  enum bpf_attach_type {
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index 2a0bd9dd104d..7da8b39f2a6a 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -7,14 +7,17 @@
>
>
>  /* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
> -#define SECCOMP_MODE_DISABLED  0 /* seccomp is not in use. */
> -#define SECCOMP_MODE_STRICT    1 /* uses hard-coded filter. */
> -#define SECCOMP_MODE_FILTER    2 /* uses user-supplied filter. */
> +#define SECCOMP_MODE_DISABLED          0 /* seccomp is not in use. */
> +#define SECCOMP_MODE_STRICT            1 /* uses hard-coded filter. */
> +#define SECCOMP_MODE_FILTER            2 /* uses user-supplied filter. */
> +#define SECCOMP_MODE_FILTER_EXTENDED   3 /* uses eBPF filter from fd */

This MODE flag isn't needed: it's still using a filter, and the
interface changes should be sufficient with
SECCOMP_SET_MODE_FILTER_EXTENDED below.

>  /* Valid operations for seccomp syscall. */
> -#define SECCOMP_SET_MODE_STRICT                0
> -#define SECCOMP_SET_MODE_FILTER                1
> -#define SECCOMP_GET_ACTION_AVAIL       2
> +#define SECCOMP_SET_MODE_STRICT                        0
> +#define SECCOMP_SET_MODE_FILTER                        1
> +#define SECCOMP_GET_ACTION_AVAIL               2
> +#define SECCOMP_SET_MODE_FILTER_EXTENDED       3

It seems like this should be a FILTER flag, not a syscall op change?

> +
>
>  /* Valid flags for SECCOMP_SET_MODE_FILTER */
>  #define SECCOMP_FILTER_FLAG_TSYNC      1
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index e24aa3241387..86d6ec8b916d 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1202,6 +1202,7 @@ static int bpf_prog_load(union bpf_attr *attr)
>
>         if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
>             type != BPF_PROG_TYPE_CGROUP_SKB &&
> +           type != BPF_PROG_TYPE_SECCOMP &&
>             !capable(CAP_SYS_ADMIN))
>                 return -EPERM;

So only init_ns-CAP_SYS_ADMIN would be able to use seccomp eBPF?

> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 940fa408a288..b30dd25c1cb8 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -37,6 +37,7 @@
>  #include <linux/security.h>
>  #include <linux/tracehook.h>
>  #include <linux/uaccess.h>
> +#include <linux/bpf.h>
>
>  /**
>   * struct seccomp_filter - container for seccomp BPF programs
> @@ -367,17 +368,6 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
>
>         BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
>
> -       /*
> -        * Installing a seccomp filter requires that the task has
> -        * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
> -        * This avoids scenarios where unprivileged tasks can affect the
> -        * behavior of privileged children.
> -        */
> -       if (!task_no_new_privs(current) &&
> -           security_capable_noaudit(current_cred(), current_user_ns(),
> -                                    CAP_SYS_ADMIN) != 0)
> -               return ERR_PTR(-EACCES);
> -
>         /* Allocate a new seccomp_filter */
>         sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
>         if (!sfilter)
> @@ -423,6 +413,48 @@ seccomp_prepare_user_filter(const char __user *user_filter)
>         return filter;
>  }
>
> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
> +/**
> + * seccomp_prepare_extended_filter - prepares a user-supplied eBPF fd
> + * @user_filter: pointer to the user data containing an fd.
> + *
> + * Returns 0 on success and non-zero otherwise.
> + */
> +static struct seccomp_filter *
> +seccomp_prepare_extended_filter(const char __user *user_fd)
> +{
> +       struct seccomp_filter *sfilter;
> +       struct bpf_prog *fp;
> +       int fd;
> +
> +       /* Fetch the fd from userspace */
> +       if (get_user(fd, (int __user *)user_fd))
> +               return ERR_PTR(-EFAULT);
> +
> +       /* Allocate a new seccomp_filter */
> +       sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
> +       if (!sfilter)
> +               return ERR_PTR(-ENOMEM);
> +
> +       fp = bpf_prog_get_type(fd, BPF_PROG_TYPE_SECCOMP);
> +       if (IS_ERR(fp)) {
> +               kfree(sfilter);
> +               return ERR_CAST(fp);
> +       }
> +
> +       sfilter->prog = fp;
> +       refcount_set(&sfilter->usage, 1);
> +
> +       return sfilter;
> +}
> +#else
> +static struct seccomp_filter *
> +seccomp_prepare_extended_filter(const char __user *filter_fd)
> +{
> +       return ERR_PTR(-EINVAL);
> +}
> +#endif
> +
>  /**
>   * seccomp_attach_filter: validate and attach filter
>   * @flags:  flags to change filter behavior
> @@ -492,7 +524,10 @@ void get_seccomp_filter(struct task_struct *tsk)
>  static inline void seccomp_filter_free(struct seccomp_filter *filter)
>  {
>         if (filter) {
> -               bpf_prog_destroy(filter->prog);
> +               if (bpf_prog_was_classic(filter->prog))
> +                       bpf_prog_destroy(filter->prog);
> +               else
> +                       bpf_prog_put(filter->prog);
>                 kfree(filter);
>         }
>  }
> @@ -842,18 +877,36 @@ static long seccomp_set_mode_strict(void)
>   * Returns 0 on success or -EINVAL on failure.
>   */
>  static long seccomp_set_mode_filter(unsigned int flags,
> -                                   const char __user *filter)
> +                                   const char __user *filter,
> +                                   unsigned long filter_type)

I think this can just live in flags?

>  {
> -       const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
> +       /* We use SECCOMP_MODE_FILTER for both eBPF and cBPF filters */
> +       const unsigned long filter_mode = SECCOMP_MODE_FILTER;
>         struct seccomp_filter *prepared = NULL;
>         long ret = -EINVAL;
>
>         /* Validate flags. */
>         if (flags & ~SECCOMP_FILTER_FLAG_MASK)
>                 return -EINVAL;
> +       /*
> +        * Installing a seccomp filter requires that the task has
> +        * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
> +        * This avoids scenarios where unprivileged tasks can affect the
> +        * behavior of privileged children.
> +        */
> +       if (!task_no_new_privs(current) &&
> +           security_capable_noaudit(current_cred(), current_user_ns(),
> +                                    CAP_SYS_ADMIN) != 0)
> +               return -EACCES;

This changes the order of checks -- before, too-large filters would
get EINVAL even if they lacked the needed permissions. As long as this
doesn't break anything in the real world, it should be fine, but I
might want to instead create a perm-check function and just call it in
both functions. (And likely write a self-test that checks this order,
if it doesn't already exist.)

>
>         /* Prepare the new filter before holding any locks. */
> -       prepared = seccomp_prepare_user_filter(filter);
> +       if (filter_type == SECCOMP_SET_MODE_FILTER_EXTENDED)
> +               prepared = seccomp_prepare_extended_filter(filter);
> +       else if (filter_type == SECCOMP_SET_MODE_FILTER)
> +               prepared = seccomp_prepare_user_filter(filter);
> +       else
> +               return -EINVAL;
> +
>         if (IS_ERR(prepared))
>                 return PTR_ERR(prepared);
>
> @@ -867,7 +920,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
>
>         spin_lock_irq(&current->sighand->siglock);
>
> -       if (!seccomp_may_assign_mode(seccomp_mode))
> +       if (!seccomp_may_assign_mode(filter_mode))
>                 goto out;
>
>         ret = seccomp_attach_filter(flags, prepared);
> @@ -876,7 +929,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
>         /* Do not free the successfully attached filter. */
>         prepared = NULL;
>
> -       seccomp_assign_mode(current, seccomp_mode);
> +       seccomp_assign_mode(current, filter_mode);

With a filter flag, the above hunks don't need to be changed, for example.

>  out:
>         spin_unlock_irq(&current->sighand->siglock);
>         if (flags & SECCOMP_FILTER_FLAG_TSYNC)
> @@ -926,7 +979,9 @@ static long do_seccomp(unsigned int op, unsigned int flags,
>                         return -EINVAL;
>                 return seccomp_set_mode_strict();
>         case SECCOMP_SET_MODE_FILTER:
> -               return seccomp_set_mode_filter(flags, uargs);
> +               return seccomp_set_mode_filter(flags, uargs, op);
> +       case SECCOMP_SET_MODE_FILTER_EXTENDED:
> +               return seccomp_set_mode_filter(flags, uargs, op);

And this isn't needed, since it would be passed as a flag.

>         case SECCOMP_GET_ACTION_AVAIL:
>                 if (flags != 0)
>                         return -EINVAL;
> @@ -969,6 +1024,10 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
>                 op = SECCOMP_SET_MODE_FILTER;
>                 uargs = filter;
>                 break;
> +       case SECCOMP_MODE_FILTER_EXTENDED:
> +               op = SECCOMP_SET_MODE_FILTER_EXTENDED;
> +               uargs = filter;
> +               break;

Same.

>         default:
>                 return -EINVAL;
>         }
> @@ -1040,8 +1099,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
>         if (IS_ERR(filter))
>                 return PTR_ERR(filter);
>
> -       fprog = filter->prog->orig_prog;
> -       if (!fprog) {
> +       if (!bpf_prog_was_classic(filter->prog)) {
>                 /* This must be a new non-cBPF filter, since we save
>                  * every cBPF filter's orig_prog above when
>                  * CONFIG_CHECKPOINT_RESTORE is enabled.
> @@ -1050,6 +1108,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
>                 goto out;
>         }
>
> +       fprog = filter->prog->orig_prog;

I wonder if it would be easier to review to split eBPF install from
the eBPF "get filter" changes as separate patches?

>         ret = fprog->len;
>         if (!data)
>                 goto out;
> @@ -1239,6 +1298,55 @@ static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
>         return 0;
>  }
>
> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
> +static bool seccomp_is_valid_access(int off, int size,
> +                                   enum bpf_access_type type,
> +                                   struct bpf_insn_access_aux *info)
> +{
> +       if (type != BPF_READ)
> +               return false;
> +
> +       if (off < 0 || off + size > sizeof(struct seccomp_data))
> +               return false;
> +
> +       switch (off) {
> +       case bpf_ctx_range_till(struct seccomp_data, args[0], args[5]):
> +               return (size == sizeof(__u64));
> +       case bpf_ctx_range(struct seccomp_data, nr):
> +               return (size == FIELD_SIZEOF(struct seccomp_data, nr));
> +       case bpf_ctx_range(struct seccomp_data, arch):
> +               return (size == FIELD_SIZEOF(struct seccomp_data, arch));
> +       case bpf_ctx_range(struct seccomp_data, instruction_pointer):
> +               return (size == FIELD_SIZEOF(struct seccomp_data,
> +                                            instruction_pointer));
> +       }
> +
> +       return false;
> +}
> +
> +static const struct bpf_func_proto *
> +seccomp_func_proto(enum bpf_func_id func_id)
> +{
> +       switch (func_id) {
> +       case BPF_FUNC_get_current_uid_gid:
> +               return &bpf_get_current_uid_gid_proto;
> +       case BPF_FUNC_trace_printk:
> +               if (capable(CAP_SYS_ADMIN))
> +                       return bpf_get_trace_printk_proto();
> +       default:
> +               return NULL;
> +       }
> +}

This makes me so uncomfortable. :) Why is uid/gid needed? Why add
printk support here? (And why is it CAP_SYS_ADMIN checked if the
entire filter is CAP_SYS_ADMIN checked before being attached?)

> +
> +const struct bpf_prog_ops seccomp_prog_ops = {
> +};
> +
> +const struct bpf_verifier_ops seccomp_verifier_ops = {
> +       .get_func_proto         = seccomp_func_proto,
> +       .is_valid_access        = seccomp_is_valid_access,
> +};
> +#endif /* CONFIG_SECCOMP_FILTER_EXTENDED */
> +
>  static struct ctl_path seccomp_sysctl_path[] = {
>         { .procname = "kernel", },
>         { .procname = "seccomp", },
> --
> 2.14.1
>

-Kees

-- 
Kees Cook
Pixel Security