[RFC] Add BPF_PROG_TYPE_CGROUP_IOCTL

Message ID 20201007152355.2446741-1-Kenny.Ho@amd.com
State RFC
Delegated to: BPF Maintainers
Series [RFC] Add BPF_PROG_TYPE_CGROUP_IOCTL

Commit Message

Kenny Ho Oct. 7, 2020, 3:23 p.m. UTC
This is a skeleton implementation to invite comments and generate
discussion around the idea of introducing a bpf-cgroup program type to
control ioctl access.  This is modelled after
BPF_PROG_TYPE_CGROUP_DEVICE.  The premise is to allow system admins to
write bpf programs that block some ioctl access, potentially in
conjunction with data collected by other bpf programs, stored in bpf
maps and protected with bpf_spin_lock.

For example, one bpf program could accumulate resource usage
statistics while a second bpf program of type BPF_PROG_TYPE_CGROUP_IOCTL
blocks ioctl access to that resource once the stats stored in a bpf map
reach a certain threshold.
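
As a rough illustration (not part of this patch; the "cgroup/ioctl"
section name, cmd number and threshold below are hypothetical
placeholders), such a gating program might look like:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define MY_DEVICE_IOCTL  0x1234            /* placeholder ioctl cmd number */
#define USAGE_THRESHOLD  1000000ULL        /* placeholder budget */

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 1);
        __type(key, __u32);
        __type(value, __u64);              /* usage accumulated by another bpf prog */
} usage_map SEC(".maps");

SEC("cgroup/ioctl")                        /* hypothetical section name */
int block_when_over_budget(struct bpf_cgroup_ioctl_ctx *ctx)
{
        __u32 key = 0;
        __u64 *used = bpf_map_lookup_elem(&usage_map, &key);

        if (ctx->cmd != MY_DEVICE_IOCTL)
                return 1;                  /* allow unrelated ioctls */
        if (used && *used > USAGE_THRESHOLD)
                return 0;                  /* deny: over budget */
        return 1;                          /* allow */
}

char LICENSE[] SEC("license") = "GPL";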

Like BPF_PROG_TYPE_CGROUP_DEVICE, the default is permissive (i.e.,
ioctls are not blocked if no bpf program is present for the cgroup) in
order to maintain the current interface behaviour when this
functionality is unused.

Performance impact on ioctl calls is minimal since bpf's in-kernel
verifier ensures that attached bpf programs cannot crash and always
terminate quickly.

TODOs:
- correct usage of the verifier
- tooling
- samples
- device drivers may provide helper functions that take
bpf_cgroup_ioctl_ctx and return something more useful for a specific
device

Signed-off-by: Kenny Ho <Kenny.Ho@amd.com>
---
 fs/ioctl.c                 |  5 +++
 include/linux/bpf-cgroup.h | 14 ++++++++
 include/linux/bpf_types.h  |  2 ++
 include/uapi/linux/bpf.h   |  8 +++++
 kernel/bpf/cgroup.c        | 66 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c       |  7 ++++
 kernel/bpf/verifier.c      |  1 +
 7 files changed, 103 insertions(+)

Comments

Kenny Ho Nov. 2, 2020, 7:23 p.m. UTC | #1
Adding a few more emails from get_maintainer.pl and bumping this
thread since there haven't been any comments so far.  Is this too
crazy?  Am I missing something fundamental?

Regards,
Kenny


On Wed, Oct 7, 2020 at 11:24 AM Kenny Ho <Kenny.Ho@amd.com> wrote:
>
> This is a skeleton implementation to invite comments and generate
> discussion around the idea of introducing a bpf-cgroup program type to
> control ioctl access.  This is modelled after
> BPF_PROG_TYPE_CGROUP_DEVICE.  The premise is to allow system admins to
> write bpf programs to block some ioctl access, potentially in conjunction
> with data collected by other bpf programs stored in some bpf maps and
> with bpf_spin_lock.
>
> For example, a bpf program has been accumulating resource usaging
> statistic and a second bpf program of BPF_PROG_TYPE_CGROUP_IOCTL would
> block access to previously mentioned resource via ioctl when the stats
> stored in a bpf map reaches certain threshold.
>
> Like BPF_PROG_TYPE_CGROUP_DEVICE, the default is permissive (i.e.,
> ioctls are not blocked if no bpf program is present for the cgroup.) to
> maintain current interface behaviour when this functionality is unused.
>
> Performance impact to ioctl calls is minimal as bpf's in-kernel verifier
> ensure attached bpf programs cannot crash and always terminate quickly.
>
> TODOs:
> - correct usage of the verifier
> - toolings
> - samples
> - device driver may provide helper functions that take
> bpf_cgroup_ioctl_ctx and return something more useful for specific
> device
>
> Signed-off-by: Kenny Ho <Kenny.Ho@amd.com>
> ---
>  fs/ioctl.c                 |  5 +++
>  include/linux/bpf-cgroup.h | 14 ++++++++
>  include/linux/bpf_types.h  |  2 ++
>  include/uapi/linux/bpf.h   |  8 +++++
>  kernel/bpf/cgroup.c        | 66 ++++++++++++++++++++++++++++++++++++++
>  kernel/bpf/syscall.c       |  7 ++++
>  kernel/bpf/verifier.c      |  1 +
>  7 files changed, 103 insertions(+)
>
> diff --git a/fs/ioctl.c b/fs/ioctl.c
> index 4e6cc0a7d69c..a3925486d417 100644
> --- a/fs/ioctl.c
> +++ b/fs/ioctl.c
> @@ -19,6 +19,7 @@
>  #include <linux/falloc.h>
>  #include <linux/sched/signal.h>
>  #include <linux/fiemap.h>
> +#include <linux/cgroup.h>
>
>  #include "internal.h"
>
> @@ -45,6 +46,10 @@ long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>         if (!filp->f_op->unlocked_ioctl)
>                 goto out;
>
> +       error = BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg);
> +       if (error)
> +               goto out;
> +
>         error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
>         if (error == -ENOIOCTLCMD)
>                 error = -ENOTTY;
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 64f367044e25..a5f0b0a8f82b 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -134,6 +134,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
>  int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
>                                       short access, enum bpf_attach_type type);
>
> +int __cgroup_bpf_check_ioctl_permission(struct file *filp, unsigned int cmd, unsigned long arg,
> +                                       enum bpf_attach_type type);
> +
>  int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
>                                    struct ctl_table *table, int write,
>                                    void **buf, size_t *pcount, loff_t *ppos,
> @@ -346,6 +349,16 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
>         __ret;                                                                 \
>  })
>
> +#define BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg)                            \
> +({                                                                           \
> +       int __ret = 0;                                                        \
> +       if (cgroup_bpf_enabled)                                               \
> +               __ret = __cgroup_bpf_check_ioctl_permission(filp, cmd, arg,   \
> +                                                           BPF_CGROUP_IOCTL);\
> +                                                                             \
> +       __ret;                                                                \
> +})
> +
>  int cgroup_bpf_prog_attach(const union bpf_attr *attr,
>                            enum bpf_prog_type ptype, struct bpf_prog *prog);
>  int cgroup_bpf_prog_detach(const union bpf_attr *attr,
> @@ -429,6 +442,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
>                                        optlen, max_optlen, retval) ({ retval; })
>  #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
>                                        kernel_optval) ({ 0; })
> +#define BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg) ({ 0; })
>
>  #define for_each_cgroup_storage_type(stype) for (; false; )
>
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index a52a5688418e..3055e7e4918c 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -56,6 +56,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl,
>               struct bpf_sysctl, struct bpf_sysctl_kern)
>  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt,
>               struct bpf_sockopt, struct bpf_sockopt_kern)
> +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_IOCTL, cg_ioctl,
> +             struct bpf_cgroup_ioctl_ctx, struct bpf_cgroup_ioctl_ctx)
>  #endif
>  #ifdef CONFIG_BPF_LIRC_MODE2
>  BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index b6238b2209b7..6a908e13d3a3 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -197,6 +197,7 @@ enum bpf_prog_type {
>         BPF_PROG_TYPE_EXT,
>         BPF_PROG_TYPE_LSM,
>         BPF_PROG_TYPE_SK_LOOKUP,
> +       BPF_PROG_TYPE_CGROUP_IOCTL,
>  };
>
>  enum bpf_attach_type {
> @@ -238,6 +239,7 @@ enum bpf_attach_type {
>         BPF_XDP_CPUMAP,
>         BPF_SK_LOOKUP,
>         BPF_XDP,
> +       BPF_CGROUP_IOCTL,
>         __MAX_BPF_ATTACH_TYPE
>  };
>
> @@ -4276,6 +4278,12 @@ struct bpf_cgroup_dev_ctx {
>         __u32 minor;
>  };
>
> +struct bpf_cgroup_ioctl_ctx {
> +       __u64 filp;
> +       __u32 cmd;
> +       __u32 arg;
> +};
> +
>  struct bpf_raw_tracepoint_args {
>         __u64 args[0];
>  };
> diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
> index 83ff127ef7ae..0958bae3b0b7 100644
> --- a/kernel/bpf/cgroup.c
> +++ b/kernel/bpf/cgroup.c
> @@ -1203,6 +1203,72 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
>         .is_valid_access        = cgroup_dev_is_valid_access,
>  };
>
> +int __cgroup_bpf_check_ioctl_permission(struct file *filp, unsigned int cmd, unsigned long arg,
> +                                     enum bpf_attach_type type)
> +{
> +       struct cgroup *cgrp;
> +       struct bpf_cgroup_ioctl_ctx ctx = {
> +               .filp = filp,
> +               .cmd = cmd,
> +               .arg = arg,
> +       };
> +       int allow = 1;
> +
> +       rcu_read_lock();
> +       cgrp = task_dfl_cgroup(current);
> +       allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
> +                                  BPF_PROG_RUN);
> +       rcu_read_unlock();
> +
> +       return !allow;
> +}
> +
> +static const struct bpf_func_proto *
> +cgroup_ioctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> +{
> +       return cgroup_base_func_proto(func_id, prog);
> +}
> +
> +static bool cgroup_ioctl_is_valid_access(int off, int size,
> +                                      enum bpf_access_type type,
> +                                      const struct bpf_prog *prog,
> +                                      struct bpf_insn_access_aux *info)
> +{
> +       const int size_default = sizeof(__u32);
> +
> +       if (type == BPF_WRITE)
> +               return false;
> +
> +       if (off < 0 || off + size > sizeof(struct bpf_cgroup_ioctl_ctx))
> +               return false;
> +       /* The verifier guarantees that size > 0. */
> +       if (off % size != 0)
> +               return false;
> +
> +       switch (off) {
> +       case bpf_ctx_range(struct bpf_cgroup_ioctl_ctx, filp):
> +               bpf_ctx_record_field_size(info, size_default);
> +               if (!bpf_ctx_narrow_access_ok(off, size, size_default))
> +                       return false;
> +               break;
> +       case bpf_ctx_range(struct bpf_cgroup_ioctl_ctx, cmd):
> +       case bpf_ctx_range(struct bpf_cgroup_ioctl_ctx, arg):
> +       default:
> +               if (size != size_default)
> +                       return false;
> +       }
> +
> +       return true;
> +}
> +
> +const struct bpf_prog_ops cg_ioctl_prog_ops = {
> +};
> +
> +const struct bpf_verifier_ops cg_ioctl_verifier_ops = {
> +       .get_func_proto         = cgroup_ioctl_func_proto,
> +       .is_valid_access        = cgroup_ioctl_is_valid_access,
> +};
> +
>  /**
>   * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
>   *
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 86299a292214..6984a62c96f4 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -2054,6 +2054,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
>         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
>         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
>         case BPF_PROG_TYPE_CGROUP_SYSCTL:
> +       case BPF_PROG_TYPE_CGROUP_IOCTL:
>         case BPF_PROG_TYPE_SOCK_OPS:
>         case BPF_PROG_TYPE_EXT: /* extends any prog */
>                 return true;
> @@ -2806,6 +2807,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
>                 return BPF_PROG_TYPE_SOCK_OPS;
>         case BPF_CGROUP_DEVICE:
>                 return BPF_PROG_TYPE_CGROUP_DEVICE;
> +       case BPF_CGROUP_IOCTL:
> +               return BPF_PROG_TYPE_CGROUP_IOCTL;
>         case BPF_SK_MSG_VERDICT:
>                 return BPF_PROG_TYPE_SK_MSG;
>         case BPF_SK_SKB_STREAM_PARSER:
> @@ -2878,6 +2881,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
>         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
>         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
>         case BPF_PROG_TYPE_CGROUP_SYSCTL:
> +       case BPF_PROG_TYPE_CGROUP_IOCTL:
>         case BPF_PROG_TYPE_SOCK_OPS:
>                 ret = cgroup_bpf_prog_attach(attr, ptype, prog);
>                 break;
> @@ -2915,6 +2919,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
>         case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
>         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
>         case BPF_PROG_TYPE_CGROUP_SYSCTL:
> +       case BPF_PROG_TYPE_CGROUP_IOCTL:
>         case BPF_PROG_TYPE_SOCK_OPS:
>                 return cgroup_bpf_prog_detach(attr, ptype);
>         default:
> @@ -2958,6 +2963,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
>         case BPF_CGROUP_SYSCTL:
>         case BPF_CGROUP_GETSOCKOPT:
>         case BPF_CGROUP_SETSOCKOPT:
> +       case BPF_CGROUP_IOCTL:
>                 return cgroup_bpf_prog_query(attr, uattr);
>         case BPF_LIRC_MODE2:
>                 return lirc_prog_query(attr, uattr);
> @@ -3914,6 +3920,7 @@ static int link_create(union bpf_attr *attr)
>         case BPF_PROG_TYPE_CGROUP_DEVICE:
>         case BPF_PROG_TYPE_CGROUP_SYSCTL:
>         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
> +       case BPF_PROG_TYPE_CGROUP_IOCTL:
>                 ret = cgroup_bpf_link_attach(attr, prog);
>                 break;
>         case BPF_PROG_TYPE_TRACING:
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index ef938f17b944..af68f463e828 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -7419,6 +7419,7 @@ static int check_return_code(struct bpf_verifier_env *env)
>         case BPF_PROG_TYPE_CGROUP_DEVICE:
>         case BPF_PROG_TYPE_CGROUP_SYSCTL:
>         case BPF_PROG_TYPE_CGROUP_SOCKOPT:
> +       case BPF_PROG_TYPE_CGROUP_IOCTL:
>                 break;
>         case BPF_PROG_TYPE_RAW_TRACEPOINT:
>                 if (!env->prog->aux->attach_btf_id)
> --
> 2.25.1
>
Alexei Starovoitov Nov. 3, 2020, 5:32 a.m. UTC | #2
On Mon, Nov 02, 2020 at 02:23:02PM -0500, Kenny Ho wrote:
> Adding a few more emails from get_maintainer.pl and bumping this
> thread since there hasn't been any comments so far.  Is this too
> crazy?  Am I missing something fundamental?

sorry for the delay. Missed it earlier. Feel free to ping the mailing list
sooner next time.

> On Wed, Oct 7, 2020 at 11:24 AM Kenny Ho <Kenny.Ho@amd.com> wrote:
> >
> > This is a skeleton implementation to invite comments and generate
> > discussion around the idea of introducing a bpf-cgroup program type to
> > control ioctl access.  This is modelled after
> > BPF_PROG_TYPE_CGROUP_DEVICE.  The premise is to allow system admins to
> > write bpf programs to block some ioctl access, potentially in conjunction
> > with data collected by other bpf programs stored in some bpf maps and
> > with bpf_spin_lock.
> >
> > For example, a bpf program has been accumulating resource usaging
> > statistic and a second bpf program of BPF_PROG_TYPE_CGROUP_IOCTL would
> > block access to previously mentioned resource via ioctl when the stats
> > stored in a bpf map reaches certain threshold.
> >
> > Like BPF_PROG_TYPE_CGROUP_DEVICE, the default is permissive (i.e.,
> > ioctls are not blocked if no bpf program is present for the cgroup.) to
> > maintain current interface behaviour when this functionality is unused.
> >
> > Performance impact to ioctl calls is minimal as bpf's in-kernel verifier
> > ensure attached bpf programs cannot crash and always terminate quickly.
> >
> > TODOs:
> > - correct usage of the verifier
> > - toolings
> > - samples
> > - device driver may provide helper functions that take
> > bpf_cgroup_ioctl_ctx and return something more useful for specific
> > device
> >
> > Signed-off-by: Kenny Ho <Kenny.Ho@amd.com>
...
> > @@ -45,6 +46,10 @@ long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> >         if (!filp->f_op->unlocked_ioctl)
> >                 goto out;
> >
> > +       error = BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg);
> > +       if (error)
> > +               goto out;
> > +

That's a bit problematic, since we have bpf_lsm now.
Could you use security_file_ioctl hook and do the same filtering there?
It's not cgroup based though. Is it a concern?
If cgroup scoping is really necessary then it's probably better
to add it to bpf_lsm. Then all hooks will become cgroup aware.
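
For reference, security_file_ioctl() sees the same (filp, cmd, arg)
triple as vfs_ioctl(), so a minimal sketch of that route (assuming
CONFIG_BPF_LSM with the "bpf" LSM enabled; the cmd value below is just
a placeholder) would be roughly:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

SEC("lsm/file_ioctl")
int BPF_PROG(restrict_dev_ioctl, struct file *file, unsigned int cmd,
             unsigned long arg)
{
        if (cmd == 0x1234)        /* placeholder device ioctl cmd */
                return -1;        /* -EPERM: reject the ioctl */
        return 0;                 /* allow */
}

char LICENSE[] SEC("license") = "GPL";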
Kenny Ho Nov. 3, 2020, 5:39 a.m. UTC | #3
Thanks for the reply.  Cgroup awareness is desired because the intent
is to use this for resource management as well (potentially along with
other cgroup-controlled resources).  I will dig into bpf_lsm and learn
more about it.

Regards,
Kenny


On Tue, Nov 3, 2020 at 12:32 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Mon, Nov 02, 2020 at 02:23:02PM -0500, Kenny Ho wrote:
> > Adding a few more emails from get_maintainer.pl and bumping this
> > thread since there hasn't been any comments so far.  Is this too
> > crazy?  Am I missing something fundamental?
>
> sorry for delay. Missed it earlier. Feel free to ping the mailing list
> sooner next time.
>
> > On Wed, Oct 7, 2020 at 11:24 AM Kenny Ho <Kenny.Ho@amd.com> wrote:
> > >
> > > This is a skeleton implementation to invite comments and generate
> > > discussion around the idea of introducing a bpf-cgroup program type to
> > > control ioctl access.  This is modelled after
> > > BPF_PROG_TYPE_CGROUP_DEVICE.  The premise is to allow system admins to
> > > write bpf programs to block some ioctl access, potentially in conjunction
> > > with data collected by other bpf programs stored in some bpf maps and
> > > with bpf_spin_lock.
> > >
> > > For example, a bpf program has been accumulating resource usaging
> > > statistic and a second bpf program of BPF_PROG_TYPE_CGROUP_IOCTL would
> > > block access to previously mentioned resource via ioctl when the stats
> > > stored in a bpf map reaches certain threshold.
> > >
> > > Like BPF_PROG_TYPE_CGROUP_DEVICE, the default is permissive (i.e.,
> > > ioctls are not blocked if no bpf program is present for the cgroup.) to
> > > maintain current interface behaviour when this functionality is unused.
> > >
> > > Performance impact to ioctl calls is minimal as bpf's in-kernel verifier
> > > ensure attached bpf programs cannot crash and always terminate quickly.
> > >
> > > TODOs:
> > > - correct usage of the verifier
> > > - toolings
> > > - samples
> > > - device driver may provide helper functions that take
> > > bpf_cgroup_ioctl_ctx and return something more useful for specific
> > > device
> > >
> > > Signed-off-by: Kenny Ho <Kenny.Ho@amd.com>
> ...
> > > @@ -45,6 +46,10 @@ long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
> > >         if (!filp->f_op->unlocked_ioctl)
> > >                 goto out;
> > >
> > > +       error = BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg);
> > > +       if (error)
> > > +               goto out;
> > > +
>
> That's a bit problematic, since we have bpf_lsm now.
> Could you use security_file_ioctl hook and do the same filtering there?
> It's not cgroup based though. Is it a concern?
> If cgroup scoping is really necessary then it's probably better
> to add it to bpf_lsm. Then all hooks will become cgroup aware.
Alexei Starovoitov Nov. 3, 2020, 5:42 a.m. UTC | #4
On Mon, Nov 2, 2020 at 9:39 PM Kenny Ho <y2kenny@gmail.com> wrote:
>
> Thanks for the reply.

pls don't top post.

> Cgroup awareness is desired because the intent
> is to use this for resource management as well (potentially along with
> other cgroup controlled resources.)  I will dig into bpf_lsm and learn
> more about it.

Also consider that bpf_lsm hooks have a way to get cgroup-id without
being explicitly scoped. So the bpf program can be made cgroup aware.
It's just not as convenient as attaching a prog to cgroup+hook at once.
For prototyping the existing bpf_lsm facility should be enough.
So please try to follow this route and please share more details about
the use case.
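
For prototyping, the self-scoping could be as small as the following
fragment inside such an lsm/file_ioctl program (exact cgroup-id match
only, no hierarchy; how target_cgroup_id gets populated is left to the
loader):

const volatile __u64 target_cgroup_id;  /* filled in by the loader before load */

        /* at the top of the lsm/file_ioctl program body: */
        if (bpf_get_current_cgroup_id() != target_cgroup_id)
                return 0;       /* task is not in the cgroup of interest: allow */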
Kenny Ho Nov. 3, 2020, 7:19 p.m. UTC | #5
On Tue, Nov 3, 2020 at 12:43 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Mon, Nov 2, 2020 at 9:39 PM Kenny Ho <y2kenny@gmail.com> wrote:
> pls don't top post.
My apology.

> > Cgroup awareness is desired because the intent
> > is to use this for resource management as well (potentially along with
> > other cgroup controlled resources.)  I will dig into bpf_lsm and learn
> > more about it.
>
> Also consider that bpf_lsm hooks have a way to get cgroup-id without
> being explicitly scoped. So the bpf program can be made cgroup aware.
> It's just not as convenient as attaching a prog to cgroup+hook at once.
> For prototyping the existing bpf_lsm facility should be enough.
> So please try to follow this route and please share more details about
> the use case.

Ok.  I will take a look and see if that is sufficient.  My
understanding of bpf-cgroup is that it not only makes attaching a prog
to a cgroup easier but also facilitates hierarchical calling of the
attached progs, which might be useful if users want to manage gpu
resources with bpf-cgroup along with other cgroup resources (like
cpu/mem/io, etc.)

About the use case.  The high-level motivation here is to provide the
ability to subdivide/share a GPU via cgroups/containers in a way that
is similar to other resources like CPU and memory.  Users have been
requesting this type of functionality because GPU compute can get
expensive and they want to maximize utilization to get the most bang
for their buck.  A traditional way to do this is via
SRIOV/virtualization, but that often means time-sharing the GPU as a
whole unit.  That is useful for some applications but not others due
to the flushing and added latency.  We also have a study that
identified various GPU compute application types.  These types can
benefit from more asymmetrical/granular sharing of the GPU (for
example, some applications are compute bound while others are memory
bound and can benefit from having more VRAM).

I have been trying to add a cgroup subsystem for the drm subsystem for
this purpose but I ran into two challenges.  First, the composition of
a GPU and how some of the subcomponents (like VRAM or shader
engines/compute units) can be shared are very much vendor-specific, so
we have been unable to arrive at a common interface across all vendors.
Second, because of this and the variety of places a GPU can end up in
(smartphone, PC, server, HPC), there is also no agreement on how
exactly a GPU should be shared.  The best way forward appears to be
simply providing hooks for users to define how and what they want to
share via a bpf program.

From what I can tell so far (I am still learning), there are multiple
pieces that need to fall into place for bpf-cgroup to work for this use
case.  First, there is resource limit enforcement, which is the
motivation for this RFC (I will look into bpf_lsm as the path
forward).  I have also been thinking about instrumenting the drm
subsystem with a new BPF program type that has various attach types
across the drm subsystem, but I am not sure if this is allowed (this
one is more for resource usage monitoring).  Another thing I have been
considering is having the gpu driver provide bpf helper functions for
bpf programs to modify drm driver internals.  That was the reason I
asked about the potential for BTF support in kernel modules a couple
of months ago (and Andrii Nakryiko mentioned that it is being worked
on).

Please feel free to ask more questions if any of the above is unclear.
Feedback is always welcome.

Regards,
Kenny
Alexei Starovoitov Nov. 3, 2020, 9:04 p.m. UTC | #6
On Tue, Nov 03, 2020 at 02:19:22PM -0500, Kenny Ho wrote:
> On Tue, Nov 3, 2020 at 12:43 AM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> > On Mon, Nov 2, 2020 at 9:39 PM Kenny Ho <y2kenny@gmail.com> wrote:
> > pls don't top post.
> My apology.
> 
> > > Cgroup awareness is desired because the intent
> > > is to use this for resource management as well (potentially along with
> > > other cgroup controlled resources.)  I will dig into bpf_lsm and learn
> > > more about it.
> >
> > Also consider that bpf_lsm hooks have a way to get cgroup-id without
> > being explicitly scoped. So the bpf program can be made cgroup aware.
> > It's just not as convenient as attaching a prog to cgroup+hook at once.
> > For prototyping the existing bpf_lsm facility should be enough.
> > So please try to follow this route and please share more details about
> > the use case.
> 
> Ok.  I will take a look and see if that is sufficient.  My
> understanding of bpf-cgroup is that it not only makes attaching prog
> to cgroup easier but it also facilitates hierarchical calling of
> attached progs which might be useful if users wants to manage gpu
> resources with bpf cgroup along with other cgroup resources (like
> cpu/mem/io, etc.)

Right. Hierarchical cgroup-bpf logic cannot be replicated inside
the program. If you're relying on cgv2 hierarchy to containerize
applications then what I suggested earlier won't work indeed.

> About the use case.  The high level motivation here is to provide the
> ability to subdivide/share a GPU via cgroups/containers in a way that
> is similar to other resources like CPU and memory.  Users have been
> requesting this type of functionality because GPU compute can get
> expensive and they want to maximize the utilization to get the most
> bang for their bucks.  A traditional way to do this is via
> SRIOV/virtualization but that often means time sharing the GPU as a
> whole unit.  That is useful for some applications but not others due
> to the flushing and added latency.  We also have a study that
> identified various GPU compute application types.  These types can
> benefit from more asymmetrical/granular sharing of the GPU (for
> example some applications are compute bound while others can be memory
> bound that can benefit from having more VRAM.)
> 
> I have been trying to add a cgroup subsystem for the drm subsystem for
> this purpose but I ran into two challenges.  First, the composition of
> a GPU and how some of the subcomponents (like VRAM or shader
> engines/compute units) can be shared are very much vendor specific so
> we are unable to arrive at a common interface across all vendors.
> Because of this and the variety of places a GPU can go into
> (smartphone, PC, server, HPC), there is also no agreement on how
> exactly a GPU should be shared.  The best way forward appears to
> simply provide hooks for users to define how and what they want to
> share via a bpf program.

Thank you for sharing the details. It certainly helps.

> From what I can tell so far (I am still learning), there are multiple
> pieces that need to fall in place for bpf-cgroup to work for this use
> case.  First there is resource limit enforcement, which is the
> motivation for this RFC (I will look into bpf_lsm as the path
> forward.)  I have also been thinking about instrumenting the drm
> subsystem with a new BPF program type and have various attach types
> across the drm subsystem but I am not sure if this is allowed (this
> one is more for resource usage monitoring.)  Another thing I have been
> considering is to have the gpu driver provide bpf helper functions for
> bpf programs to modify drm driver internals.  That was the reason I
> asked about the potential of BTF support for kernel modules a couple
> of months ago (and Andrii Nakryiko mentioned that it is being worked
> on.)

Sounds like either bpf_lsm needs to be made aware of cgv2 (which would
be a great thing to have regardless) or cgroup-bpf needs a drm/gpu specific hook.
I think generic ioctl hook is too broad for this use case.
I suspect drm/gpu internal state would be easier to access inside
bpf program if the hook is next to gpu/drm. At ioctl level there is 'file'.
It's probably too abstract for the things you want to do.
Like how VRAM/shader/etc can be accessed through file?
Probably possible through a bunch of lookups and dereferences, but
if the hook is custom to GPU that info is likely readily available.
Then such cgroup-bpf check would be suitable in execution paths where
ioctl-based hook would be too slow.
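
(To make the "bunch of lookups" concrete, for a DRM card/render node
the chain from the ioctl-level file would be roughly the following;
only after the last, entirely driver-specific cast does VRAM/shader
state become reachable.)

        /* illustrative only; field names as of this kernel */
        struct drm_file   *file_priv   = filp->private_data;   /* set by drm_open() */
        struct drm_device *ddev        = file_priv->minor->dev;
        void              *driver_priv = ddev->dev_private;    /* e.g. struct amdgpu_device */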
Kenny Ho Nov. 3, 2020, 10:57 p.m. UTC | #7
On Tue, Nov 3, 2020 at 4:04 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Nov 03, 2020 at 02:19:22PM -0500, Kenny Ho wrote:
> > On Tue, Nov 3, 2020 at 12:43 AM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > > On Mon, Nov 2, 2020 at 9:39 PM Kenny Ho <y2kenny@gmail.com> wrote:
>
> Sounds like either bpf_lsm needs to be made aware of cgv2 (which would
> be a great thing to have regardless) or cgroup-bpf needs a drm/gpu specific hook.
> I think generic ioctl hook is too broad for this use case.
> I suspect drm/gpu internal state would be easier to access inside
> bpf program if the hook is next to gpu/drm. At ioctl level there is 'file'.
> It's probably too abstract for the things you want to do.
> Like how VRAM/shader/etc can be accessed through file?
> Probably possible through a bunch of lookups and dereferences, but
> if the hook is custom to GPU that info is likely readily available.
> Then such cgroup-bpf check would be suitable in execution paths where
> ioctl-based hook would be too slow.
Just to clarify, when you say drm-specific hook, did you mean just a
unique attach_type or a unique prog_type+attach_type combination?  (I
am still a bit fuzzy on when a new prog type is needed vs a new attach
type.  I think a prog type is associated with a unique type of context
that the bpf prog will get, but I could be missing some nuances.)

When I was thinking of doing an ioctl-wide hook, the file would be the
device file, and the thinking was to have a helper function provided by
device drivers to further disambiguate.  For our (AMD's) driver, we
have a bunch of ioctls for set/get/create/destroy
(https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c#L1763)
so the bpf prog can make the decision after the disambiguation.  For
example, we have an ioctl called "kfd_ioctl_set_cu_mask."  You can
think of cu_mask like cpumask but for the cores/compute-unit inside a
GPU.  The ioctl hook would get the file, the bpf prog would call a
helper function from the amdgpu driver that returns some data structure
specific to the driver, and then the bpf prog could decide whether or
not to gate the ioctl.  From what you are saying, it sounds like this
kind of back-and-forth lookup and dereferencing should be avoided for
performance reasons?
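
To make that concrete, the rough shape I had in mind is below; the
helper, the struct and the constant are all made up for illustration
and do not exist anywhere today:

SEC("cgroup/ioctl")
int gate_cu_mask(struct bpf_cgroup_ioctl_ctx *ctx)
{
        struct amdkfd_ioctl_info info = {};        /* hypothetical driver-defined struct */

        /* hypothetical helper exported by amdgpu/amdkfd to do the disambiguation */
        if (bpf_amdkfd_ioctl_info(ctx->filp, ctx->cmd, &info, sizeof(info)) < 0)
                return 1;                          /* not ours / cannot tell: allow */

        if (info.nr == AMDKFD_IOC_NR_SET_CU_MASK)  /* hypothetical constant */
                return 0;                          /* deny CU-mask changes for this cgroup */
        return 1;
}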

Having a DRM-specific hook is certainly an alternative.  I just wasn't
sure what level of trade-off between abstract and generic is
acceptable.  I am guessing a new BPF_PROG_TYPE_CGROUP_AMDGPU is
probably too specific?  But it sounds like BPF_PROG_TYPE_CGROUP_DRM may
be ok?

Regards,
Kenny
Alexei Starovoitov Nov. 3, 2020, 11:28 p.m. UTC | #8
On Tue, Nov 03, 2020 at 05:57:47PM -0500, Kenny Ho wrote:
> On Tue, Nov 3, 2020 at 4:04 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Nov 03, 2020 at 02:19:22PM -0500, Kenny Ho wrote:
> > > On Tue, Nov 3, 2020 at 12:43 AM Alexei Starovoitov
> > > <alexei.starovoitov@gmail.com> wrote:
> > > > On Mon, Nov 2, 2020 at 9:39 PM Kenny Ho <y2kenny@gmail.com> wrote:
> >
> > Sounds like either bpf_lsm needs to be made aware of cgv2 (which would
> > be a great thing to have regardless) or cgroup-bpf needs a drm/gpu specific hook.
> > I think generic ioctl hook is too broad for this use case.
> > I suspect drm/gpu internal state would be easier to access inside
> > bpf program if the hook is next to gpu/drm. At ioctl level there is 'file'.
> > It's probably too abstract for the things you want to do.
> > Like how VRAM/shader/etc can be accessed through file?
> > Probably possible through a bunch of lookups and dereferences, but
> > if the hook is custom to GPU that info is likely readily available.
> > Then such cgroup-bpf check would be suitable in execution paths where
> > ioctl-based hook would be too slow.
> Just to clarify, when you say drm specific hook, did you mean just a
> unique attach_type or a unique prog_type+attach_type combination?  (I
> am still a bit fuzzy on when a new prog type is needed vs a new attach
> type.  I think prog type is associated with a unique type of context
> that the bpf prog will get but I could be missing some nuances.)
> 
> When I was thinking of doing an ioctl wide hook, the file would be the
> device file and the thinking was to have a helper function provided by
> device drivers to further disambiguate.  For our (AMD's) driver, we
> have a bunch of ioctls for set/get/create/destroy
> (https://elixir.bootlin.com/linux/latest/source/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c#L1763)
> so the bpf prog can make the decision after the disambiguation.  For
> example, we have an ioctl called "kfd_ioctl_set_cu_mask."  You can

Thanks for the pointer.
That's one monster ioctl. So much copy_from_user.
A BPF prog would need to be sleepable to be able to examine the args in such depth.
After a quick glance at the code I would put a new hook into
kfd_ioctl() right before
retcode = func(filep, process, kdata);
At this point kdata has already been copied from user space
and usize, which is cmd specific, is known.
So the bpf prog wouldn't need to copy that data again.
That will save one copy.
To drill into the details of kfd_ioctl_set_cu_mask() the prog would
need to be sleepable to do a second copy_from_user of cu_mask.
At least it's not that big.
Yes, the attachment point will be amd driver specific,
but the program doesn't need to be.
It can be a generic tracing prog that is augmented to use BTF.
Something like a writeable tracepoint with BTF support would do.
So on the bpf side there will be a minimal amount of changes.
And in the driver you'll add one or a few writeable tracepoints,
and the result of the tracepoint will gate the
retcode = func(filep, process, kdata);
call in kfd_ioctl().
The writeable tracepoint would need to be cgroup-bpf based.
So that's the only tricky part: BPF infra doesn't have a
cgroup+tracepoint scheme. It's probably going to be useful
in other cases like this. See trace_nbd_send_request.
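
Roughly, the driver-side shape would be something like the following
(struct, field and tracepoint names are invented here; modeled on how
nbd declares its writeable tracepoint, where the first argument is the
writable buffer):

        /* in kfd_ioctl(), right before dispatch; kdata is already copied in */
        struct kfd_ioctl_gate gate = {        /* hypothetical struct: cmd + writable verdict */
                .cmd   = cmd,
                .allow = 1,                   /* the attached bpf prog may clear this */
        };

        /* hypothetical writeable tracepoint, declared like trace_nbd_send_request
         * with sizeof(struct kfd_ioctl_gate) as the writable buffer size
         */
        trace_kfd_ioctl_gate(&gate, cmd, kdata);
        if (gate.allow)
                retcode = func(filep, process, kdata);
        else
                retcode = -EPERM;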

Patch

diff --git a/fs/ioctl.c b/fs/ioctl.c
index 4e6cc0a7d69c..a3925486d417 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -19,6 +19,7 @@ 
 #include <linux/falloc.h>
 #include <linux/sched/signal.h>
 #include <linux/fiemap.h>
+#include <linux/cgroup.h>
 
 #include "internal.h"
 
@@ -45,6 +46,10 @@  long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	if (!filp->f_op->unlocked_ioctl)
 		goto out;
 
+	error = BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg);
+	if (error)
+		goto out;
+
 	error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
 	if (error == -ENOIOCTLCMD)
 		error = -ENOTTY;
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 64f367044e25..a5f0b0a8f82b 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -134,6 +134,9 @@  int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 				      short access, enum bpf_attach_type type);
 
+int __cgroup_bpf_check_ioctl_permission(struct file *filp, unsigned int cmd, unsigned long arg,
+				        enum bpf_attach_type type);
+
 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 				   struct ctl_table *table, int write,
 				   void **buf, size_t *pcount, loff_t *ppos,
@@ -346,6 +349,16 @@  int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 	__ret;								       \
 })
 
+#define BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg)       	              \
+({									      \
+	int __ret = 0;							      \
+	if (cgroup_bpf_enabled)						      \
+		__ret = __cgroup_bpf_check_ioctl_permission(filp, cmd, arg,   \
+							    BPF_CGROUP_IOCTL);\
+									      \
+	__ret;								      \
+})
+
 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
 			   enum bpf_prog_type ptype, struct bpf_prog *prog);
 int cgroup_bpf_prog_detach(const union bpf_attr *attr,
@@ -429,6 +442,7 @@  static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 				       optlen, max_optlen, retval) ({ retval; })
 #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \
 				       kernel_optval) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_IOCTL(filp, cmd, arg) ({ 0; })
 
 #define for_each_cgroup_storage_type(stype) for (; false; )
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index a52a5688418e..3055e7e4918c 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -56,6 +56,8 @@  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl,
 	      struct bpf_sysctl, struct bpf_sysctl_kern)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt,
 	      struct bpf_sockopt, struct bpf_sockopt_kern)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_IOCTL, cg_ioctl,
+	      struct bpf_cgroup_ioctl_ctx, struct bpf_cgroup_ioctl_ctx)
 #endif
 #ifdef CONFIG_BPF_LIRC_MODE2
 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b6238b2209b7..6a908e13d3a3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -197,6 +197,7 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_EXT,
 	BPF_PROG_TYPE_LSM,
 	BPF_PROG_TYPE_SK_LOOKUP,
+	BPF_PROG_TYPE_CGROUP_IOCTL,
 };
 
 enum bpf_attach_type {
@@ -238,6 +239,7 @@  enum bpf_attach_type {
 	BPF_XDP_CPUMAP,
 	BPF_SK_LOOKUP,
 	BPF_XDP,
+	BPF_CGROUP_IOCTL,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -4276,6 +4278,12 @@  struct bpf_cgroup_dev_ctx {
 	__u32 minor;
 };
 
+struct bpf_cgroup_ioctl_ctx {
+	__u64 filp;
+	__u32 cmd;
+	__u32 arg;
+};
+
 struct bpf_raw_tracepoint_args {
 	__u64 args[0];
 };
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 83ff127ef7ae..0958bae3b0b7 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1203,6 +1203,72 @@  const struct bpf_verifier_ops cg_dev_verifier_ops = {
 	.is_valid_access	= cgroup_dev_is_valid_access,
 };
 
+int __cgroup_bpf_check_ioctl_permission(struct file *filp, unsigned int cmd, unsigned long arg,
+				      enum bpf_attach_type type)
+{
+	struct cgroup *cgrp;
+	struct bpf_cgroup_ioctl_ctx ctx = {
+		.filp = filp,
+		.cmd = cmd,
+		.arg = arg,
+	};
+	int allow = 1;
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(current);
+	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
+				   BPF_PROG_RUN);
+	rcu_read_unlock();
+
+	return !allow;
+}
+
+static const struct bpf_func_proto *
+cgroup_ioctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return cgroup_base_func_proto(func_id, prog);
+}
+
+static bool cgroup_ioctl_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       const struct bpf_prog *prog,
+				       struct bpf_insn_access_aux *info)
+{
+	const int size_default = sizeof(__u32);
+
+	if (type == BPF_WRITE)
+		return false;
+
+	if (off < 0 || off + size > sizeof(struct bpf_cgroup_ioctl_ctx))
+		return false;
+	/* The verifier guarantees that size > 0. */
+	if (off % size != 0)
+		return false;
+
+	switch (off) {
+	case bpf_ctx_range(struct bpf_cgroup_ioctl_ctx, filp):
+		bpf_ctx_record_field_size(info, size_default);
+		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
+			return false;
+		break;
+	case bpf_ctx_range(struct bpf_cgroup_ioctl_ctx, cmd):
+	case bpf_ctx_range(struct bpf_cgroup_ioctl_ctx, arg):
+	default:
+		if (size != size_default)
+			return false;
+	}
+
+	return true;
+}
+
+const struct bpf_prog_ops cg_ioctl_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_ioctl_verifier_ops = {
+	.get_func_proto		= cgroup_ioctl_func_proto,
+	.is_valid_access	= cgroup_ioctl_is_valid_access,
+};
+
 /**
  * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
  *
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 86299a292214..6984a62c96f4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2054,6 +2054,7 @@  static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_CGROUP_IOCTL:
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_EXT: /* extends any prog */
 		return true;
@@ -2806,6 +2807,8 @@  attach_type_to_prog_type(enum bpf_attach_type attach_type)
 		return BPF_PROG_TYPE_SOCK_OPS;
 	case BPF_CGROUP_DEVICE:
 		return BPF_PROG_TYPE_CGROUP_DEVICE;
+	case BPF_CGROUP_IOCTL:
+		return BPF_PROG_TYPE_CGROUP_IOCTL;
 	case BPF_SK_MSG_VERDICT:
 		return BPF_PROG_TYPE_SK_MSG;
 	case BPF_SK_SKB_STREAM_PARSER:
@@ -2878,6 +2881,7 @@  static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_CGROUP_IOCTL:
 	case BPF_PROG_TYPE_SOCK_OPS:
 		ret = cgroup_bpf_prog_attach(attr, ptype, prog);
 		break;
@@ -2915,6 +2919,7 @@  static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
+	case BPF_PROG_TYPE_CGROUP_IOCTL:
 	case BPF_PROG_TYPE_SOCK_OPS:
 		return cgroup_bpf_prog_detach(attr, ptype);
 	default:
@@ -2958,6 +2963,7 @@  static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_SYSCTL:
 	case BPF_CGROUP_GETSOCKOPT:
 	case BPF_CGROUP_SETSOCKOPT:
+	case BPF_CGROUP_IOCTL:
 		return cgroup_bpf_prog_query(attr, uattr);
 	case BPF_LIRC_MODE2:
 		return lirc_prog_query(attr, uattr);
@@ -3914,6 +3920,7 @@  static int link_create(union bpf_attr *attr)
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+	case BPF_PROG_TYPE_CGROUP_IOCTL:
 		ret = cgroup_bpf_link_attach(attr, prog);
 		break;
 	case BPF_PROG_TYPE_TRACING:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ef938f17b944..af68f463e828 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7419,6 +7419,7 @@  static int check_return_code(struct bpf_verifier_env *env)
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+	case BPF_PROG_TYPE_CGROUP_IOCTL:
 		break;
 	case BPF_PROG_TYPE_RAW_TRACEPOINT:
 		if (!env->prog->aux->attach_btf_id)