
[bpf-next,11/13] bpf: libbpf: Add STRUCT_OPS support

Message ID 20191214004803.1653618-1-kafai@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Series Introduce BPF STRUCT_OPS

Commit Message

Martin KaFai Lau Dec. 14, 2019, 12:48 a.m. UTC
This patch adds BPF STRUCT_OPS support to libbpf.

The only sec_name convention is SEC("struct_ops") to identify the
struct ops implemented in BPF, e.g.
SEC("struct_ops")
struct tcp_congestion_ops dctcp = {
	.init           = (void *)dctcp_init,  /* <-- a bpf_prog */
	/* ... some more func ptrs ... */
	.name           = "bpf_dctcp",
};

In the bpf_object__open phase, libbpf will look for the "struct_ops"
elf section and find out which btf-type the "struct_ops" is
implementing.  Note that the btf-type here refers to a type in the
bpf_prog.o's btf.  It will then collect (through SHT_REL) the bpf
progs that the func ptrs refer to.

In the bpf_object__load phase, the prepare_struct_ops() will load
the btf_vmlinux and obtain the corresponding kernel's btf-type.
With the kernel's btf-type, it can then set the prog->type,
prog->attach_btf_id and the prog->expected_attach_type.  Thus,
the prog's properties do not rely on its section name.

Currently, the bpf_prog's btf-type ==> btf_vmlinux's btf-type matching
process is as simple as: member-name match + btf-kind match + size match.
If any of these matching conditions fails, libbpf will reject.
The currently supported target is "struct tcp_congestion_ops", most of
whose members are function pointers.
The member ordering of the bpf_prog's btf-type can be different from
the btf_vmlinux's btf-type.
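
As a rough illustration (this is a hand-written sketch, not the actual
libbpf code), the matching rule for one member could look like the
following, expressed with libbpf's public BTF helpers and the
find_member_by_name() helper added by this patch; the real
prepare_struct_ops() does more work, e.g. resolving func ptrs and
recording their kernel offsets:

	static bool st_ops_member_matches(const struct btf *prog_btf,
					  const struct btf_member *pm,
					  const struct btf *kern_btf,
					  const struct btf_type *kern_t)
	{
		/* member-name match: ordering may differ, so look up by name */
		const char *name = btf__name_by_offset(prog_btf, pm->name_off);
		const struct btf_member *km =
			find_member_by_name(kern_btf, kern_t, name);

		if (!km)
			return false;

		/* btf-kind match */
		if (btf_kind(btf__type_by_id(prog_btf, pm->type)) !=
		    btf_kind(btf__type_by_id(kern_btf, km->type)))
			return false;

		/* size match */
		return btf__resolve_size(prog_btf, pm->type) ==
		       btf__resolve_size(kern_btf, km->type);
	}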

Once the prog's properties are all set,
libbpf will proceed to load all the progs.

After that, register_struct_ops() will create a map, finalize the
map-value by populating it with the prog-fd, and then register this
"struct_ops" to the kernel by updating the map-value to the map.

By default, libbpf does not unregister the struct_ops from the kernel
during bpf_object__close().  This can be changed by setting the new
"unreg_st_ops" option in bpf_object_open_opts.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 tools/lib/bpf/bpf.c           |  10 +-
 tools/lib/bpf/bpf.h           |   5 +-
 tools/lib/bpf/libbpf.c        | 599 +++++++++++++++++++++++++++++++++-
 tools/lib/bpf/libbpf.h        |   3 +-
 tools/lib/bpf/libbpf_probes.c |   2 +
 5 files changed, 612 insertions(+), 7 deletions(-)

Comments

Andrii Nakryiko Dec. 18, 2019, 3:07 a.m. UTC | #1
On Fri, Dec 13, 2019 at 4:48 PM Martin KaFai Lau <kafai@fb.com> wrote:
>
> [ ... ]

This looks pretty good to me. The two big things are exposing struct_ops
as a real struct bpf_map, so that users can interact with it using
libbpf APIs, as well as splitting struct_ops map creation and
registration. bpf_object__load() should only make sure all maps are
created and progs are loaded/verified, but none of the BPF programs can
be called yet. Then attach is the phase where registration happens.



[...]

>  LIBBPF_API int
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 27d5f7ecba32..ffb5cdd7db5a 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -67,6 +67,10 @@
>
>  #define __printf(a, b) __attribute__((format(printf, a, b)))
>
> +static struct btf *bpf_core_find_kernel_btf(void);

this is not CO-RE specific anymore, we should probably just rename it
to bpf_find_kernel_btf

> +static struct bpf_program *bpf_object__find_prog_by_idx(struct bpf_object *obj,
> +                                                       int idx);
> +
>  static int __base_pr(enum libbpf_print_level level, const char *format,
>                      va_list args)
>  {
> @@ -128,6 +132,8 @@ void libbpf_print(enum libbpf_print_level level, const char *format, ...)
>  # define LIBBPF_ELF_C_READ_MMAP ELF_C_READ
>  #endif
>
> +#define BPF_STRUCT_OPS_SEC_NAME "struct_ops"

This is a special ELF section recognized by libbpf, so similarly to
".maps" (and ".kconfig", which I'm renaming from ".extern"), I think
this should be ".struct_ops" (or I'd even drop underscore and go with
".structops", but not insisting).

> +
>  static inline __u64 ptr_to_u64(const void *ptr)
>  {
>         return (__u64) (unsigned long) ptr;
> @@ -233,6 +239,32 @@ struct bpf_map {
>         bool reused;
>  };
>
> +struct bpf_struct_ops {
> +       const char *var_name;
> +       const char *tname;
> +       const struct btf_type *type;
> +       struct bpf_program **progs;
> +       __u32 *kern_func_off;
> +       /* e.g. struct tcp_congestion_ops in bpf_prog's btf format */
> +       void *data;
> +       /* e.g. struct __bpf_tcp_congestion_ops in btf_vmlinux's btf

Using the __bpf_ prefix for these struct_ops-specific types is a bit too
generic (e.g., for raw_tp stuff Alexei used btf_trace_). So maybe make
it btf_ops_ or btf_structops_?


> +        * format.
> +        * struct __bpf_tcp_congestion_ops {
> +        *      [... some other kernel fields ...]
> +        *      struct tcp_congestion_ops data;
> +        * }
> +        * kern_vdata in the sizeof(struct __bpf_tcp_congestion_ops).

Comment isn't very clear... do you mean that the data pointed to by
kern_vdata is sizeof(...) bytes?

> +        * prepare_struct_ops() will populate the "data" into
> +        * "kern_vdata".
> +        */
> +       void *kern_vdata;
> +       __u32 type_id;
> +       __u32 kern_vtype_id;
> +       __u32 kern_vtype_size;
> +       int fd;
> +       bool unreg;

This unreg flag (and default behavior to not unregister) is bothering
me a bit... Shouldn't this be controlled by the map's lifetime, at least?
E.g., if no one pins that map, then struct_ops should be unregistered
on map destruction. If an application wants to keep BPF programs
attached, it should make sure to pin the map before the userspace part
exits. Is this problematic in any way?
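
For reference, a sketch of that pinning flow, assuming the struct_ops
is exposed as a regular bpf_map (as suggested further down) and an
arbitrary BPFFS path:

	struct bpf_map *map;

	map = bpf_object__find_map_by_name(obj, "dctcp");
	if (!map || bpf_map__pin(map, "/sys/fs/bpf/dctcp"))
		return -1;
	/* without the pin, closing the last map FD would unregister */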

> +};
> +
>  struct bpf_secdata {
>         void *rodata;
>         void *data;
> @@ -251,6 +283,7 @@ struct bpf_object {
>         size_t nr_maps;
>         size_t maps_cap;
>         struct bpf_secdata sections;
> +       struct bpf_struct_ops st_ops;

These bpf_struct_ops are strictly belonging to that special struct_ops
map, right? So I'd say we should change struct bpf_map to contain
per-map extra piece of info. We can combine that with current mmaped
pointer for internal maps;


struct bpf_map {
    ...
    union {
        void *mmaped;
        struct bpf_struct_ops *st_ops;
    };
};

That way those special maps can have extra piece of information
specific to that special map's type.


>
>         bool loaded;
>         bool has_pseudo_calls;
> @@ -270,6 +303,7 @@ struct bpf_object {
>                 Elf_Data *data;
>                 Elf_Data *rodata;
>                 Elf_Data *bss;
> +               Elf_Data *st_ops_data;
>                 size_t strtabidx;
>                 struct {
>                         GElf_Shdr shdr;
> @@ -282,6 +316,7 @@ struct bpf_object {
>                 int data_shndx;
>                 int rodata_shndx;
>                 int bss_shndx;
> +               int st_ops_shndx;
>         } efile;
>         /*
>          * All loaded bpf_object is linked in a list, which is
> @@ -509,6 +544,508 @@ static __u32 get_kernel_version(void)
>         return KERNEL_VERSION(major, minor, patch);
>  }
>
> +static int bpf_object__register_struct_ops(struct bpf_object *obj)
> +{
> +       struct bpf_create_map_attr map_attr = {};
> +       struct bpf_struct_ops *st_ops;
> +       const char *tname;
> +       __u32 i, zero = 0;
> +       int fd, err;
> +
> +       st_ops = &obj->st_ops;
> +       if (!st_ops->kern_vdata)
> +               return 0;

this shouldn't happen, right? I'd drop the check or return error at least.

> +
> +       tname = st_ops->tname;
> +       for (i = 0; i < btf_vlen(st_ops->type); i++) {
> +               struct bpf_program *prog = st_ops->progs[i];
> +               void *kern_data;
> +               int prog_fd;
> +
> +               if (!prog)
> +                       continue;
> +
> +               prog_fd = bpf_program__nth_fd(prog, 0);

nit: just bpf_program__fd(prog)

> +               if (prog_fd < 0) {
> +                       pr_warn("struct_ops register %s: prog %s is not loaded\n",
> +                               tname, prog->name);
> +                       return -EINVAL;
> +               }

This is a redundant check; register_struct_ops will not be called if any
program loading fails.

> +
> +               kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
> +               *(unsigned long *)kern_data = prog_fd;
> +       }
> +
> +       map_attr.map_type = BPF_MAP_TYPE_STRUCT_OPS;
> +       map_attr.key_size = sizeof(unsigned int);
> +       map_attr.value_size = st_ops->kern_vtype_size;
> +       map_attr.max_entries = 1;
> +       map_attr.btf_fd = btf__fd(obj->btf);
> +       map_attr.btf_vmlinux_value_type_id = st_ops->kern_vtype_id;
> +       map_attr.name = st_ops->var_name;
> +
> +       fd = bpf_create_map_xattr(&map_attr);

we should try to reuse bpf_object__init_internal_map(). This will add
a struct bpf_map which users can iterate over and look up by name, etc.
We had a similar discussion when Daniel was adding global data maps,
and we conclusively decided that these special maps have to be
represented in libbpf as struct bpf_map as well.
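
The user-visible upside would be that the struct_ops map shows up
through the regular map APIs, e.g. (sketch):

	struct bpf_map *map;

	bpf_object__for_each_map(map, obj)
		printf("map: %s, fd: %d\n",
		       bpf_map__name(map), bpf_map__fd(map));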

> +       if (fd < 0) {
> +               err = -errno;
> +               pr_warn("struct_ops register %s: Error in creating struct_ops map\n",
> +                       tname);
> +               return err;
> +       }
> +
> +       err = bpf_map_update_elem(fd, &zero, st_ops->kern_vdata, 0);

This is what "activates" struct_ops, so this has to happen outside of
load, load shouldn't trigger execution of BPF programs. So something
like bpf_map__attach_struct_ops() or we if introduce new concept for
struct_ops: bpf_struct_ops__attach(), which can be called explicitly
by user of automatically from skeletons <skeleton>__attach().
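
A sketch of what that split could look like from the user side;
bpf_map__attach_struct_ops() does not exist at this point, and
returning a bpf_link is just a guess following the existing attach
convention:

	struct bpf_link *link;
	int err;

	/* load: maps created, progs loaded/verified, nothing registered yet */
	err = bpf_object__load(obj);
	if (err)
		return err;

	/* attach: registration (the map-value update) happens here */
	link = bpf_map__attach_struct_ops(map);
	if (libbpf_get_error(link))
		return -1;
	/* ... */
	bpf_link__destroy(link);	/* would unregister */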


> +       if (err) {
> +               err = -errno;
> +               close(fd);
> +               pr_warn("struct_ops register %s: Error in updating struct_ops map\n",
> +                       tname);
> +               return err;
> +       }
> +
> +       st_ops->fd = fd;
> +
> +       return 0;
> +}
> +
> +static int bpf_struct_ops__unregister(struct bpf_struct_ops *st_ops)
> +{
> +       if (st_ops->fd != -1) {
> +               __u32 zero = 0;
> +               int err = 0;
> +
> +               if (bpf_map_delete_elem(st_ops->fd, &zero))
> +                       err = -errno;
> +               zclose(st_ops->fd);
> +
> +               return err;
> +       }
> +
> +       return 0;
> +}
> +
> +static const struct btf_type *
> +resolve_ptr(const struct btf *btf, __u32 id, __u32 *res_id);
> +static const struct btf_type *
> +resolve_func_ptr(const struct btf *btf, __u32 id, __u32 *res_id);
> +
> +static const struct btf_member *
> +find_member_by_offset(const struct btf_type *t, __u32 offset)

nit: find_member_by_bit_offset (offset -> bit_offset)?

> +{
> +       struct btf_member *m;
> +       int i;
> +
> +       for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) {
> +               if (btf_member_bit_offset(t, i) == offset)
> +                       return m;
> +       }
> +
> +       return NULL;
> +}
> +
> +static const struct btf_member *
> +find_member_by_name(const struct btf *btf, const struct btf_type *t,
> +                   const char *name)
> +{
> +       struct btf_member *m;
> +       int i;
> +
> +       for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) {
> +               if (!strcmp(btf__name_by_offset(btf, m->name_off), name))
> +                       return m;
> +       }
> +
> +       return NULL;
> +}
> +
> +#define STRUCT_OPS_VALUE_PREFIX "__bpf_"
> +#define STRUCT_OPS_VALUE_PREFIX_LEN (sizeof(STRUCT_OPS_VALUE_PREFIX) - 1)
> +
> +static int
> +bpf_struct_ops__get_kern_types(const struct btf *btf, const char *tname,

nit: there is no "bpf_struct_ops" object in libbpf and this is not its
method, so it's a violation of libbpf's naming convention; please
consider renaming to something like "find_struct_ops_kern_types"

> +                              const struct btf_type **type, __u32 *type_id,
> +                              const struct btf_type **vtype, __u32 *vtype_id,
> +                              const struct btf_member **data_member)
> +{
> +       const struct btf_type *kern_type, *kern_vtype;
> +       const struct btf_member *kern_data_member;
> +       __s32 kern_vtype_id, kern_type_id;
> +       char vtname[128] = STRUCT_OPS_VALUE_PREFIX;
> +       __u32 i;
> +
> +       kern_type_id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT);
> +       if (kern_type_id < 0) {
> +               pr_warn("struct_ops prepare: struct %s is not found in kernel BTF\n",
> +                       tname);
> +               return -ENOTSUP;

just return kern_type_id (pass through btf__find_by_name_kind's
result). Same below.
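
i.e., the quoted check would become something like:

	kern_type_id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT);
	if (kern_type_id < 0) {
		pr_warn("struct_ops prepare: struct %s is not found in kernel BTF\n",
			tname);
		return kern_type_id;	/* pass through the original error */
	}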

> +       }
> +       kern_type = btf__type_by_id(btf, kern_type_id);
> +
> +       /* Find the corresponding "map_value" type that will be used
> +        * in map_update(BPF_MAP_TYPE_STRUCT_OPS).  For example,
> +        * find "struct __bpf_tcp_congestion_ops" from the btf_vmlinux.
> +        */
> +       strncat(vtname + STRUCT_OPS_VALUE_PREFIX_LEN, tname,
> +               sizeof(vtname) - STRUCT_OPS_VALUE_PREFIX_LEN - 1);
> +       kern_vtype_id = btf__find_by_name_kind(btf, vtname,
> +                                              BTF_KIND_STRUCT);
> +       if (kern_vtype_id < 0) {
> +               pr_warn("struct_ops prepare: struct %s is not found in kernel BTF\n",
> +                       vtname);
> +               return -ENOTSUP;
> +       }
> +       kern_vtype = btf__type_by_id(btf, kern_vtype_id);
> +
> +       /* Find "struct tcp_congestion_ops" from
> +        * struct __bpf_tcp_congestion_ops {
> +        *      [ ... ]
> +        *      struct tcp_congestion_ops data;
> +        * }
> +        */
> +       for (i = 0, kern_data_member = btf_members(kern_vtype);
> +            i < btf_vlen(kern_vtype);
> +            i++, kern_data_member++) {

nit: multi-line for is kind of ugly, maybe move kern_data_member
assignment out of for?
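
e.g., a sketch of the suggested restructuring:

	kern_data_member = btf_members(kern_vtype);
	for (i = 0; i < btf_vlen(kern_vtype); i++, kern_data_member++) {
		if (kern_data_member->type == kern_type_id)
			break;
	}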

> +               if (kern_data_member->type == kern_type_id)
> +                       break;
> +       }
> +       if (i == btf_vlen(kern_vtype)) {
> +               pr_warn("struct_ops prepare: struct %s data is not found in struct %s\n",
> +                       tname, vtname);
> +               return -EINVAL;
> +       }
> +

[...]

>  static int bpf_object__init_btf(struct bpf_object *obj,
> @@ -1689,6 +2257,9 @@ static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps,
>                         } else if (strcmp(name, ".rodata") == 0) {
>                                 obj->efile.rodata = data;
>                                 obj->efile.rodata_shndx = idx;
> +                       } else if (strcmp(name, BPF_STRUCT_OPS_SEC_NAME) == 0) {
> +                               obj->efile.st_ops_data = data;
> +                               obj->efile.st_ops_shndx = idx;
>                         } else {
>                                 pr_debug("skip section(%d) %s\n", idx, name);
>                         }
> @@ -1698,7 +2269,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps,
>                         int sec = sh.sh_info; /* points to other section */
>
>                         /* Only do relo for section with exec instructions */
> -                       if (!section_have_execinstr(obj, sec)) {
> +                       if (!section_have_execinstr(obj, sec) &&
> +                           !strstr(name, BPF_STRUCT_OPS_SEC_NAME)) {

why substring match?

>                                 pr_debug("skip relo %s(%d) for section(%d)\n",
>                                          name, idx, sec);
>                                 continue;

[...]
Martin KaFai Lau Dec. 18, 2019, 7:03 a.m. UTC | #2
On Tue, Dec 17, 2019 at 07:07:23PM -0800, Andrii Nakryiko wrote:
> On Fri, Dec 13, 2019 at 4:48 PM Martin KaFai Lau <kafai@fb.com> wrote:
> >
> >
> > [ ... ]
> 
> This looks pretty good to me. The two big things are exposing struct_ops
> as a real struct bpf_map, so that users can interact with it using
> libbpf APIs, as well as splitting struct_ops map creation and
> registration. bpf_object__load() should only make sure all maps are
> created and progs are loaded/verified, but none of the BPF programs can
> be called yet. Then attach is the phase where registration happens.
Thanks for the review.

[ ... ]

> > [ ... ]
> > +struct bpf_struct_ops {
> > +       const char *var_name;
> > +       const char *tname;
> > +       const struct btf_type *type;
> > +       struct bpf_program **progs;
> > +       __u32 *kern_func_off;
> > +       /* e.g. struct tcp_congestion_ops in bpf_prog's btf format */
> > +       void *data;
> > +       /* e.g. struct __bpf_tcp_congestion_ops in btf_vmlinux's btf
> 
> Using the __bpf_ prefix for these struct_ops-specific types is a bit too
> generic (e.g., for raw_tp stuff Alexei used btf_trace_). So maybe make
> it btf_ops_ or btf_structops_?
Is it a concern about name collision?

The prefix pick is to use a more representative name.
struct_ops uses many bpf pieces, and btf is one of them.
Very soon, all new code will depend on BTF, and the btf_ prefix
could become generic also.

Unlike tracepoint, there is no non-btf version of struct_ops.

> 
> 
> > +        * format.
> > +        * struct __bpf_tcp_congestion_ops {
> > +        *      [... some other kernel fields ...]
> > +        *      struct tcp_congestion_ops data;
> > +        * }
> > +        * kern_vdata in the sizeof(struct __bpf_tcp_congestion_ops).
> 
> Comment isn't very clear.. do you mean that data pointed to by
> kern_vdata is of sizeof(...) bytes?
> 
> > +        * prepare_struct_ops() will populate the "data" into
> > +        * "kern_vdata".
> > +        */
> > +       void *kern_vdata;
> > +       __u32 type_id;
> > +       __u32 kern_vtype_id;
> > +       __u32 kern_vtype_size;
> > +       int fd;
> > +       bool unreg;
> 
> This unreg flag (and default behavior to not unregister) is bothering
> me a bit.. Shouldn't this be controlled by map's lifetime, at least.
> E.g., if no one pins that map - then struct_ops should be unregistered
> on map destruction. If application wants to keep BPF programs
> attached, it should make sure to pin map, before userspace part exits?
> Is this problematic in any way?
I don't think it should be in the struct_ops case.  I think of the
struct_ops map as a set of progs "attached" to a subsystem (tcp_cong
in this case), and these map-progs stay (or keep attached) until they
are detached, like any other attached bpf_prog that keeps running
without caring whether the bpf_prog is pinned or not.

About the "bool unreg;", the default can be changed to true if
it makes more sense.

[ ... ]

> 
> > +
> > +               kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
> > +               *(unsigned long *)kern_data = prog_fd;
> > +       }
> > +
> > +       map_attr.map_type = BPF_MAP_TYPE_STRUCT_OPS;
> > +       map_attr.key_size = sizeof(unsigned int);
> > +       map_attr.value_size = st_ops->kern_vtype_size;
> > +       map_attr.max_entries = 1;
> > +       map_attr.btf_fd = btf__fd(obj->btf);
> > +       map_attr.btf_vmlinux_value_type_id = st_ops->kern_vtype_id;
> > +       map_attr.name = st_ops->var_name;
> > +
> > +       fd = bpf_create_map_xattr(&map_attr);
> 
> we should try to reuse bpf_object__init_internal_map(). This will add
> a struct bpf_map which users can iterate over and look up by name, etc.
> We had a similar discussion when Daniel was adding global data maps,
> and we conclusively decided that these special maps have to be
> represented in libbpf as struct bpf_map as well.
I will take a look.

> 
> > +       if (fd < 0) {
> > +               err = -errno;
> > +               pr_warn("struct_ops register %s: Error in creating struct_ops map\n",
> > +                       tname);
> > +               return err;
> > +       }
> > +
> > +       err = bpf_map_update_elem(fd, &zero, st_ops->kern_vdata, 0);
Martin KaFai Lau Dec. 18, 2019, 7:20 a.m. UTC | #3
On Tue, Dec 17, 2019 at 11:03:45PM -0800, Martin Lau wrote:
> On Tue, Dec 17, 2019 at 07:07:23PM -0800, Andrii Nakryiko wrote:
> > On Fri, Dec 13, 2019 at 4:48 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > >
> > > [ ... ]
> > > +       /* e.g. struct tcp_congestion_ops in bpf_prog's btf format */
> > > +       void *data;
> > > +       /* e.g. struct __bpf_tcp_congestion_ops in btf_vmlinux's btf
> > 
> > Using the __bpf_ prefix for these struct_ops-specific types is a bit too
> > generic (e.g., for raw_tp stuff Alexei used btf_trace_). So maybe make
> > it btf_ops_ or btf_structops_?
> Is it a concern about name collision?
> 
> The prefix pick is to use a more representative name.
> struct_ops uses many bpf pieces, and btf is one of them.
> Very soon, all new code will depend on BTF, and the btf_ prefix
> could become generic also.
> 
> Unlike tracepoint, there is no non-btf version of struct_ops.
Maybe bpf_struct_ops_?

It was my early pick, but it read quite weird:
bpf_[struct]_<ops>_[tcp_congestion]_<ops>.

Hence, I went with __bpf_<actual-name-of-the-kernel-struct> in this series.
Andrii Nakryiko Dec. 18, 2019, 4:34 p.m. UTC | #4
On Tue, Dec 17, 2019 at 11:03 PM Martin Lau <kafai@fb.com> wrote:
>
> On Tue, Dec 17, 2019 at 07:07:23PM -0800, Andrii Nakryiko wrote:
> > On Fri, Dec 13, 2019 at 4:48 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > >
> > > [ ... ]
> > > +       /* e.g. struct tcp_congestion_ops in bpf_prog's btf format */
> > > +       void *data;
> > > +       /* e.g. struct __bpf_tcp_congestion_ops in btf_vmlinux's btf
> >
> > Using the __bpf_ prefix for these struct_ops-specific types is a bit too
> > generic (e.g., for raw_tp stuff Alexei used btf_trace_). So maybe make
> > it btf_ops_ or btf_structops_?
> Is it a concern about name collision?
>
> The prefix pick is to use a more representative name.
> struct_ops uses many bpf pieces, and btf is one of them.
> Very soon, all new code will depend on BTF, and the btf_ prefix
> could become generic also.
>
> Unlike tracepoint, there is no non-btf version of struct_ops.

Not so much name collision as being able to immediately recognize
that it's used to provide type information for struct_ops. Think about
some automated tooling parsing vmlinux BTF and trying to create some
derivative types for those btf_trace_xxx and __bpf_xxx types. Having a
unique prefix that identifies what kind of type-providing struct it is
is very useful for building a generic tool like that, while __bpf_
doesn't indicate in any way that it's for struct_ops.

> > [ ... ]
> >
> > This unreg flag (and default behavior to not unregister) is bothering
> > me a bit... Shouldn't this be controlled by the map's lifetime, at least?
> > E.g., if no one pins that map, then struct_ops should be unregistered
> > on map destruction. If an application wants to keep BPF programs
> > attached, it should make sure to pin the map before the userspace part
> > exits. Is this problematic in any way?
> I don't think it should be in the struct_ops case.  I think of the
> struct_ops map as a set of progs "attached" to a subsystem (tcp_cong
> in this case), and these map-progs stay (or keep attached) until they
> are detached, like any other attached bpf_prog that keeps running
> without caring whether the bpf_prog is pinned or not.

I'll let someone else comment on how this behaves for cgroup, xdp,
etc., but for tracing, for example, we have FD-based BPF links, which
will detach the program automatically when the FD is closed. I think
the idea is to extend this to other types of BPF programs as well, so
there is no risk of leaving some stray BPF program running after an
unintended crash of the userspace program. When an application
explicitly needs a BPF program to outlive its userspace control app,
then this can be achieved by pinning the map/program in BPFFS.

>
> About the "bool unreg;", the default can be changed to true if
> it makes more sense.
>
> [ ... ]
Andrii Nakryiko Dec. 18, 2019, 4:36 p.m. UTC | #5
On Tue, Dec 17, 2019 at 11:20 PM Martin Lau <kafai@fb.com> wrote:
>
> On Tue, Dec 17, 2019 at 11:03:45PM -0800, Martin Lau wrote:
> > On Tue, Dec 17, 2019 at 07:07:23PM -0800, Andrii Nakryiko wrote:
> > > On Fri, Dec 13, 2019 at 4:48 PM Martin KaFai Lau <kafai@fb.com> wrote:
> > > >
> > > > [ ... ]
> > > > +       /* e.g. struct tcp_congestion_ops in bpf_prog's btf format */
> > > > +       void *data;
> > > > +       /* e.g. struct __bpf_tcp_congestion_ops in btf_vmlinux's btf
> > >
> > > Using the __bpf_ prefix for these struct_ops-specific types is a bit too
> > > generic (e.g., for raw_tp stuff Alexei used btf_trace_). So maybe make
> > > it btf_ops_ or btf_structops_?
> > Is it a concern about name collision?
> >
> > The prefix pick is to use a more representative name.
> > struct_ops uses many bpf pieces, and btf is one of them.
> > Very soon, all new code will depend on BTF, and the btf_ prefix
> > could become generic also.
> >
> > Unlike tracepoint, there is no non-btf version of struct_ops.
> Maybe bpf_struct_ops_?
>
> It was my early pick, but it read quite weird:
> bpf_[struct]_<ops>_[tcp_congestion]_<ops>.
>
> Hence, I went with __bpf_<actual-name-of-the-kernel-struct> in this series.

bpf_struct_ops_ is much better, IMO, but given this struct serves only
the purpose of providing type information to the kernel, I think
btf_struct_ops_ is more justified.
And this <ops>_xxx_<ops> duplication doesn't bother me at all, again,
because it's not directly used in C code. But believe me, having a
unique prefix is so good, even in the simplest case of grepping through
vmlinux.h.
Martin KaFai Lau Dec. 18, 2019, 5:33 p.m. UTC | #6
On Wed, Dec 18, 2019 at 08:34:25AM -0800, Andrii Nakryiko wrote:
> On Tue, Dec 17, 2019 at 11:03 PM Martin Lau <kafai@fb.com> wrote:
> >
> > [ ... ]
> > >
> > > This unreg flag (and default behavior to not unregister) is bothering
> > > me a bit... Shouldn't this be controlled by the map's lifetime, at least?
> > > E.g., if no one pins that map, then struct_ops should be unregistered
> > > on map destruction. If an application wants to keep BPF programs
> > > attached, it should make sure to pin the map before the userspace part
> > > exits. Is this problematic in any way?
> > I don't think it should be in the struct_ops case.  I think of the
> > struct_ops map as a set of progs "attached" to a subsystem (tcp_cong
> > in this case), and these map-progs stay (or keep attached) until they
> > are detached, like any other attached bpf_prog that keeps running
> > without caring whether the bpf_prog is pinned or not.
> 
> I'll let someone else comment on how this behaves for cgroup, xdp,
> etc., but for tracing, for example, we have FD-based BPF links, which
> will detach the program automatically when the FD is closed. I think
> the idea is to extend this to other types of BPF programs as well, so
> there is no risk of leaving some stray BPF program running after an
> unintended
Like xdp_prog, struct_ops does not have an fd-based link yet.
This link can be created for struct_ops, xdp_prog, and others later.
I don't see a conflict here.

> crash of the userspace program. When an application explicitly needs
> a BPF program to outlive its userspace control app, then this can be
> achieved by pinning the map/program in BPFFS.
If the concern is about not leaving struct_ops behind, let's assume
there is no "detach" and a struct_ops only goes away when the very
last userspace handle (FD/pin) of the map goes away.  What would be an
easy way to remove bpf_cubic from the system then:

[root@arch-fb-vm1 bpf]# sysctl -a | egrep congestion
    net.ipv4.tcp_allowed_congestion_control = reno cubic bpf_cubic
    net.ipv4.tcp_available_congestion_control = reno bic cubic bpf_cubic
    net.ipv4.tcp_congestion_control = bpf_cubic

Andrii Nakryiko Dec. 18, 2019, 6:14 p.m. UTC | #7
On Wed, Dec 18, 2019 at 9:34 AM Martin Lau <kafai@fb.com> wrote:
>
> On Wed, Dec 18, 2019 at 08:34:25AM -0800, Andrii Nakryiko wrote:
> > On Tue, Dec 17, 2019 at 11:03 PM Martin Lau <kafai@fb.com> wrote:
> > >
> > > [ ... ]
> >
> > I'll let someone else comment on how this behaves for cgroup, xdp,
> > etc., but for tracing, for example, we have FD-based BPF links, which
> > will detach the program automatically when the FD is closed. I think
> > the idea is to extend this to other types of BPF programs as well, so
> > there is no risk of leaving some stray BPF program running after an
> > unintended
> Like xdp_prog, struct_ops does not have an fd-based link yet.
> This link can be created for struct_ops, xdp_prog, and others later.
> I don't see a conflict here.

My point was that the default behavior should be conservative: free up
resources automatically on process exit, unless specifically pinned by
the user.
But this discussion made me realize that we are missing one thing from
the general bpf_link framework. See below.

>
> > crash of the userspace program. When an application explicitly needs
> > a BPF program to outlive its userspace control app, then this can be
> > achieved by pinning the map/program in BPFFS.
> If the concern is about not leaving struct_ops behind, let's assume
> there is no "detach" and a struct_ops only goes away when the very
> last userspace handle (FD/pin) of the map goes away.  What would be an
> easy way to remove bpf_cubic from the system then:

Yeah, I think this "last map FD close frees up resources/detaches" is
a good behavior.

Where we do have a problem is with bpf_link__destroy() unconditionally
also detaching whatever was attached (tracepoint, kprobe, or whatever
was done to create the bpf_link in the first place). Now,
bpf_link__destroy() has to be called by the user (or skeleton) to at
least free up malloc()'ed structs. But it appears that it's not always
desirable that the underlying BPF program gets detached upon bpf_link
destruction. I think this will be the case for XDP and others as well.

I think the good and generic way to go about this is to have a general
concept of destroying the link without detaching BPF programs. E.g.,
what if we had a new API call `void bpf_link__unlink()`, which would
mark the link as not requiring the underlying BPF program to be
detached. When bpf_link__destroy() is called later, it will just free
the resources allocated to maintain the bpf_link itself, but won't
detach any BPF programs/resources.

With this, the user will have to explicitly specify that they don't
want to detach even when the skeleton/link is destroyed. If we get
consensus on this, I can add support for it to all the existing
bpf_links and you can build on that?
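
To make the proposed semantics concrete, here is a minimal usage
sketch. bpf_link__unlink() is only the proposed name and doesn't exist
yet; bpf_program__attach_kprobe() and libbpf_get_error() are existing
libbpf calls, and the probed kernel function name is just an example:

	struct bpf_link *link;

	link = bpf_program__attach_kprobe(prog, false /* !retprobe */,
					  "tcp_init_sock" /* example */);
	if (libbpf_get_error(link))
		return -1;

	/* opt out of auto-detach: the underlying BPF program stays
	 * attached even after the link (and the process) goes away
	 */
	bpf_link__unlink(link);

	/* with the opt-out above, this only frees the malloc()'ed
	 * bpf_link state and no longer detaches the program
	 */
	bpf_link__destroy(link);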

>
> [root@arch-fb-vm1 bpf]# sysctl -a | egrep congestion
>     net.ipv4.tcp_allowed_congestion_control = reno cubic bpf_cubic
>     net.ipv4.tcp_available_congestion_control = reno bic cubic bpf_cubic
>     net.ipv4.tcp_congestion_control = bpf_cubic
>
> >
> > >
> > > About the "bool unreg;", the default can be changed to true if
> > > it makes more sense.
> > >
Martin KaFai Lau Dec. 18, 2019, 8:19 p.m. UTC | #8
On Wed, Dec 18, 2019 at 10:14:04AM -0800, Andrii Nakryiko wrote:
[ ... ]
> 
> Where we do have a problem is with bpf_link__destroy() unconditionally
> also detaching whatever was attached (tracepoint, kprobe, or whatever
> was done to create the bpf_link in the first place). Now,
> bpf_link__destroy() has to be called by the user (or skeleton) to at
> least free up malloc()'ed structs. But it appears that it's not always
> desirable that the underlying BPF program gets detached upon bpf_link
> destruction. I think this will be the case for XDP and others as well.
> 
> I think the good and generic way to go about this is to have a general
> concept of destroying the link without detaching BPF programs. E.g.,
> what if we had a new API call `void bpf_link__unlink()`, which would
> mark the link as not requiring the underlying BPF program to be
> detached. When bpf_link__destroy() is called later, it will just free
> the resources allocated to maintain the bpf_link itself, but won't
> detach any BPF programs/resources.
> 
> With this, the user will have to explicitly specify that they don't
> want to detach even when the skeleton/link is destroyed. If we get
> consensus on this, I can add support for it to all the existing
> bpf_links and you can build on that?
Keeping the current struct_ops unreg mechanism (i.e.
bpf_struct_ops__unregister(), to be renamed) and
having a way to opt-out sounds good to me.  Thanks.
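
For context: with this patch, unregistering boils down to deleting the
single element (key 0) of the struct_ops map, which is what
bpf_struct_ops__unregister() in the patch does. A minimal sketch from
the user side, assuming the map was pinned at a hypothetical BPFFS
path:

	__u32 zero = 0;
	int map_fd;

	map_fd = bpf_obj_get("/sys/fs/bpf/bpf_cubic"); /* hypothetical pin */
	if (map_fd < 0)
		return -errno;

	/* deleting the only element unregisters the struct_ops
	 * (e.g. bpf_cubic) from the kernel
	 */
	if (bpf_map_delete_elem(map_fd, &zero))
		return -errno;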
Toke Høiland-Jørgensen Dec. 19, 2019, 8:53 a.m. UTC | #9
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> [ ... ]
>
> Where we do have a problem is with bpf_link__destroy() unconditionally
> also detaching whatever was attached (tracepoint, kprobe, or whatever
> was done to create the bpf_link in the first place). Now,
> bpf_link__destroy() has to be called by the user (or skeleton) to at
> least free up malloc()'ed structs. But it appears that it's not always
> desirable that the underlying BPF program gets detached upon bpf_link
> destruction. I think this will be the case for XDP and others as well.

For XDP the model has thus far been "once attached, the program stays
until explicitly detached". Changing that would certainly be surprising,
so I agree that splitting the API is best (not that I'm sure how many
XDP programs will end up using that API, but that's a different
concern)...

-Toke
Andrii Nakryiko Dec. 19, 2019, 8:49 p.m. UTC | #10
On Thu, Dec 19, 2019 at 12:54 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> [ ... ]
>
> For XDP the model has thus far been "once attached, the program stays
> until explicitly detached". Changing that would certainly be surprising,
> so I agree that splitting the API is best (not that I'm sure how many
> XDP programs will end up using that API, but that's a different
> concern)...

This would be a new FD-based API for XDP; I don't think we can change
the existing API. But I think the default behavior should still be to
auto-detach, unless explicitly "pinned" in whatever way. That would
prevent surprising "leakage" of BPF programs for unsuspecting users.

>
> -Toke
>
Toke Høiland-Jørgensen Dec. 20, 2019, 10:16 a.m. UTC | #11
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> [ ... ]
>
> This would be a new FD-based API for XDP; I don't think we can change
> the existing API. But I think the default behavior should still be to
> auto-detach, unless explicitly "pinned" in whatever way. That would
> prevent surprising "leakage" of BPF programs for unsuspecting users.

But why do we need a new API for attaching XDP programs? Also, what are
the use cases where it makes sense to have this kind of "transient" XDP
program? The only one I can think about is something like xdpdump, which
moves packets to userspace (and should stop doing that when the
userspace listener goes away). But with bpf-to-bpf tracing, xdpdump
won't actually be an XDP program, so what's left? The system firewall
rules don't go away when the program that installed them exits either;
why should an XDP program?

-Toke
Andrii Nakryiko Dec. 20, 2019, 5:34 p.m. UTC | #12
On Fri, Dec 20, 2019 at 2:16 AM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> [ ... ]
>
> But why do we need a new API for attaching XDP programs? Also, what are
> the use cases where it makes sense to have this kind of "transient" XDP
> program? The only one I can think about is something like xdpdump, which

During development, for instance, when your buggy userspace program
crashes? I think by default all those attached BPF programs should be
auto-detachable, if possible. That's the direction that worked out
really well with kprobes/tracepoints/perf_events. Previously, using
the old APIs, you'd attach a kprobe and if userspace didn't clean up,
that kprobe would stay attached in the system, consuming resources
without users noticing (which is especially critical in production).
Switching to an auto-detachable FD-based interface greatly improved
that experience. I think this is a good model going forward.

In practice, for production use cases, it will be just a trivial piece
of code to keep it attached:

struct bpf_link *xdp_link = bpf_program__attach_xdp(...);
bpf_link__disconnect(xdp_link); /* now if userspace program crashes,
                                 * xdp BPF program will stay connected */
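
A slightly fuller sketch of both behaviors under the same assumptions
(bpf_program__attach_xdp() and bpf_link__disconnect() are proposed
names here, not existing libbpf API; libbpf_get_error() is the usual
error check):

	struct bpf_link *link;

	link = bpf_program__attach_xdp(prog, ifindex); /* proposed API */
	if (libbpf_get_error(link))
		return -1;

	if (keep_attached)
		/* opt out: the XDP program survives link destruction */
		bpf_link__disconnect(link);

	/* default: detaches the XDP program on destroy; after
	 * disconnect(), this only frees the bpf_link's memory
	 */
	bpf_link__destroy(link);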

> moves packets to userspace (and should stop doing that when the
> userspace listener goes away). But with bpf-to-bpf tracing, xdpdump
> won't actually be an XDP program, so what's left? The system firewall
> rules don't go away when the program that installed them exits either;
> why should an XDP program?

See above; I'm not saying that it shouldn't be possible to keep it
attached. I'm just arguing it's not a good default, because it can
catch developers off guard and cause problems, especially in
production environments. In the end, it is a resource leak, unless you
want and expect it.

>
> -Toke
>

Patch

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 98596e15390f..ebb9d7066173 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -95,7 +95,11 @@  int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
 	attr.btf_key_type_id = create_attr->btf_key_type_id;
 	attr.btf_value_type_id = create_attr->btf_value_type_id;
 	attr.map_ifindex = create_attr->map_ifindex;
-	attr.inner_map_fd = create_attr->inner_map_fd;
+	if (attr.map_type == BPF_MAP_TYPE_STRUCT_OPS)
+		attr.btf_vmlinux_value_type_id =
+			create_attr->btf_vmlinux_value_type_id;
+	else
+		attr.inner_map_fd = create_attr->inner_map_fd;
 
 	return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
 }
@@ -228,7 +232,9 @@  int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 	memset(&attr, 0, sizeof(attr));
 	attr.prog_type = load_attr->prog_type;
 	attr.expected_attach_type = load_attr->expected_attach_type;
-	if (attr.prog_type == BPF_PROG_TYPE_TRACING) {
+	if (attr.prog_type == BPF_PROG_TYPE_STRUCT_OPS) {
+		attr.attach_btf_id = load_attr->attach_btf_id;
+	} else if (attr.prog_type == BPF_PROG_TYPE_TRACING) {
 		attr.attach_btf_id = load_attr->attach_btf_id;
 		attr.attach_prog_fd = load_attr->attach_prog_fd;
 	} else {
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 3c791fa8e68e..1ddbf7f33b83 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -48,7 +48,10 @@  struct bpf_create_map_attr {
 	__u32 btf_key_type_id;
 	__u32 btf_value_type_id;
 	__u32 map_ifindex;
-	__u32 inner_map_fd;
+	union {
+		__u32 inner_map_fd;
+		__u32 btf_vmlinux_value_type_id;
+	};
 };
 
 LIBBPF_API int
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 27d5f7ecba32..ffb5cdd7db5a 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -67,6 +67,10 @@ 
 
 #define __printf(a, b)	__attribute__((format(printf, a, b)))
 
+static struct btf *bpf_core_find_kernel_btf(void);
+static struct bpf_program *bpf_object__find_prog_by_idx(struct bpf_object *obj,
+							int idx);
+
 static int __base_pr(enum libbpf_print_level level, const char *format,
 		     va_list args)
 {
@@ -128,6 +132,8 @@  void libbpf_print(enum libbpf_print_level level, const char *format, ...)
 # define LIBBPF_ELF_C_READ_MMAP ELF_C_READ
 #endif
 
+#define BPF_STRUCT_OPS_SEC_NAME "struct_ops"
+
 static inline __u64 ptr_to_u64(const void *ptr)
 {
 	return (__u64) (unsigned long) ptr;
@@ -233,6 +239,32 @@  struct bpf_map {
 	bool reused;
 };
 
+struct bpf_struct_ops {
+	const char *var_name;
+	const char *tname;
+	const struct btf_type *type;
+	struct bpf_program **progs;
+	__u32 *kern_func_off;
+	/* e.g. struct tcp_congestion_ops in bpf_prog's btf format */
+	void *data;
+	/* e.g. struct __bpf_tcp_congestion_ops in btf_vmlinux's btf
+	 * format.
+	 * struct __bpf_tcp_congestion_ops {
+	 *	[... some other kernel fields ...]
+	 *	struct tcp_congestion_ops data;
+	 * }
+	 * kern_vdata in the sizeof(struct __bpf_tcp_congestion_ops).
+	 * prepare_struct_ops() will populate the "data" into
+	 * "kern_vdata".
+	 */
+	void *kern_vdata;
+	__u32 type_id;
+	__u32 kern_vtype_id;
+	__u32 kern_vtype_size;
+	int fd;
+	bool unreg;
+};
+
 struct bpf_secdata {
 	void *rodata;
 	void *data;
@@ -251,6 +283,7 @@  struct bpf_object {
 	size_t nr_maps;
 	size_t maps_cap;
 	struct bpf_secdata sections;
+	struct bpf_struct_ops st_ops;
 
 	bool loaded;
 	bool has_pseudo_calls;
@@ -270,6 +303,7 @@  struct bpf_object {
 		Elf_Data *data;
 		Elf_Data *rodata;
 		Elf_Data *bss;
+		Elf_Data *st_ops_data;
 		size_t strtabidx;
 		struct {
 			GElf_Shdr shdr;
@@ -282,6 +316,7 @@  struct bpf_object {
 		int data_shndx;
 		int rodata_shndx;
 		int bss_shndx;
+		int st_ops_shndx;
 	} efile;
 	/*
 	 * All loaded bpf_object is linked in a list, which is
@@ -509,6 +544,508 @@  static __u32 get_kernel_version(void)
 	return KERNEL_VERSION(major, minor, patch);
 }
 
+static int bpf_object__register_struct_ops(struct bpf_object *obj)
+{
+	struct bpf_create_map_attr map_attr = {};
+	struct bpf_struct_ops *st_ops;
+	const char *tname;
+	__u32 i, zero = 0;
+	int fd, err;
+
+	st_ops = &obj->st_ops;
+	if (!st_ops->kern_vdata)
+		return 0;
+
+	tname = st_ops->tname;
+	for (i = 0; i < btf_vlen(st_ops->type); i++) {
+		struct bpf_program *prog = st_ops->progs[i];
+		void *kern_data;
+		int prog_fd;
+
+		if (!prog)
+			continue;
+
+		prog_fd = bpf_program__nth_fd(prog, 0);
+		if (prog_fd < 0) {
+			pr_warn("struct_ops register %s: prog %s is not loaded\n",
+				tname, prog->name);
+			return -EINVAL;
+		}
+
+		kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i];
+		*(unsigned long *)kern_data = prog_fd;
+	}
+
+	map_attr.map_type = BPF_MAP_TYPE_STRUCT_OPS;
+	map_attr.key_size = sizeof(unsigned int);
+	map_attr.value_size = st_ops->kern_vtype_size;
+	map_attr.max_entries = 1;
+	map_attr.btf_fd = btf__fd(obj->btf);
+	map_attr.btf_vmlinux_value_type_id = st_ops->kern_vtype_id;
+	map_attr.name = st_ops->var_name;
+
+	fd = bpf_create_map_xattr(&map_attr);
+	if (fd < 0) {
+		err = -errno;
+		pr_warn("struct_ops register %s: Error in creating struct_ops map\n",
+			tname);
+		return err;
+	}
+
+	err = bpf_map_update_elem(fd, &zero, st_ops->kern_vdata, 0);
+	if (err) {
+		err = -errno;
+		close(fd);
+		pr_warn("struct_ops register %s: Error in updating struct_ops map\n",
+			tname);
+		return err;
+	}
+
+	st_ops->fd = fd;
+
+	return 0;
+}
+
+static int bpf_struct_ops__unregister(struct bpf_struct_ops *st_ops)
+{
+	if (st_ops->fd != -1) {
+		__u32 zero = 0;
+		int err = 0;
+
+		if (bpf_map_delete_elem(st_ops->fd, &zero))
+			err = -errno;
+		zclose(st_ops->fd);
+
+		return err;
+	}
+
+	return 0;
+}
+
+static const struct btf_type *
+resolve_ptr(const struct btf *btf, __u32 id, __u32 *res_id);
+static const struct btf_type *
+resolve_func_ptr(const struct btf *btf, __u32 id, __u32 *res_id);
+
+static const struct btf_member *
+find_member_by_offset(const struct btf_type *t, __u32 offset)
+{
+	struct btf_member *m;
+	int i;
+
+	for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) {
+		if (btf_member_bit_offset(t, i) == offset)
+			return m;
+	}
+
+	return NULL;
+}
+
+static const struct btf_member *
+find_member_by_name(const struct btf *btf, const struct btf_type *t,
+		    const char *name)
+{
+	struct btf_member *m;
+	int i;
+
+	for (i = 0, m = btf_members(t); i < btf_vlen(t); i++, m++) {
+		if (!strcmp(btf__name_by_offset(btf, m->name_off), name))
+			return m;
+	}
+
+	return NULL;
+}
+
+#define STRUCT_OPS_VALUE_PREFIX "__bpf_"
+#define STRUCT_OPS_VALUE_PREFIX_LEN (sizeof(STRUCT_OPS_VALUE_PREFIX) - 1)
+
+static int
+bpf_struct_ops__get_kern_types(const struct btf *btf, const char *tname,
+			       const struct btf_type **type, __u32 *type_id,
+			       const struct btf_type **vtype, __u32 *vtype_id,
+			       const struct btf_member **data_member)
+{
+	const struct btf_type *kern_type, *kern_vtype;
+	const struct btf_member *kern_data_member;
+	__s32 kern_vtype_id, kern_type_id;
+	char vtname[128] = STRUCT_OPS_VALUE_PREFIX;
+	__u32 i;
+
+	kern_type_id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT);
+	if (kern_type_id < 0) {
+		pr_warn("struct_ops prepare: struct %s is not found in kernel BTF\n",
+			tname);
+		return -ENOTSUP;
+	}
+	kern_type = btf__type_by_id(btf, kern_type_id);
+
+	/* Find the corresponding "map_value" type that will be used
+	 * in map_update(BPF_MAP_TYPE_STRUCT_OPS).  For example,
+	 * find "struct __bpf_tcp_congestion_ops" from the btf_vmlinux.
+	 */
+	strncat(vtname + STRUCT_OPS_VALUE_PREFIX_LEN, tname,
+		sizeof(vtname) - STRUCT_OPS_VALUE_PREFIX_LEN - 1);
+	kern_vtype_id = btf__find_by_name_kind(btf, vtname,
+					       BTF_KIND_STRUCT);
+	if (kern_vtype_id < 0) {
+		pr_warn("struct_ops prepare: struct %s is not found in kernel BTF\n",
+			vtname);
+		return -ENOTSUP;
+	}
+	kern_vtype = btf__type_by_id(btf, kern_vtype_id);
+
+	/* Find "struct tcp_congestion_ops" from
+	 * struct __bpf_tcp_congestion_ops {
+	 *	[ ... ]
+	 *	struct tcp_congestion_ops data;
+	 * }
+	 */
+	for (i = 0, kern_data_member = btf_members(kern_vtype);
+	     i < btf_vlen(kern_vtype);
+	     i++, kern_data_member++) {
+		if (kern_data_member->type == kern_type_id)
+			break;
+	}
+	if (i == btf_vlen(kern_vtype)) {
+		pr_warn("struct_ops prepare: struct %s data is not found in struct %s\n",
+			tname, vtname);
+		return -EINVAL;
+	}
+
+	*type = kern_type;
+	*type_id = kern_type_id;
+	*vtype = kern_vtype;
+	*vtype_id = kern_vtype_id;
+	*data_member = kern_data_member;
+
+	return 0;
+}
+
+static int bpf_object__prepare_struct_ops(struct bpf_object *obj)
+{
+	const struct btf_member *member, *kern_member, *kern_data_member;
+	const struct btf_type *type, *kern_type, *kern_vtype;
+	__u32 i, kern_type_id, kern_vtype_id, kern_data_off;
+	struct bpf_struct_ops *st_ops;
+	void *data, *kern_data;
+	const struct btf *btf;
+	struct btf *kern_btf;
+	const char *tname;
+	int err;
+
+	st_ops = &obj->st_ops;
+	if (!st_ops->data)
+		return 0;
+
+	btf = obj->btf;
+	type = st_ops->type;
+	tname = st_ops->tname;
+
+	kern_btf = bpf_core_find_kernel_btf();
+	if (IS_ERR(kern_btf))
+		return PTR_ERR(kern_btf);
+
+	err = bpf_struct_ops__get_kern_types(kern_btf, tname,
+					     &kern_type, &kern_type_id,
+					     &kern_vtype, &kern_vtype_id,
+					     &kern_data_member);
+	if (err)
+		goto done;
+
+	pr_debug("struct_ops prepare %s: type_id:%u kern_type_id:%u kern_vtype_id:%u\n",
+		 tname, st_ops->type_id, kern_type_id, kern_vtype_id);
+
+	kern_data_off = kern_data_member->offset / 8;
+	st_ops->kern_vtype_size = kern_vtype->size;
+	st_ops->kern_vtype_id = kern_vtype_id;
+
+	st_ops->kern_vdata = calloc(1, st_ops->kern_vtype_size);
+	if (!st_ops->kern_vdata) {
+		err = -ENOMEM;
+		goto done;
+	}
+
+	data = st_ops->data;
+	kern_data = st_ops->kern_vdata + kern_data_off;
+
+	err = -ENOTSUP;
+	for (i = 0, member = btf_members(type); i < btf_vlen(type);
+	     i++, member++) {
+		const struct btf_type *mtype, *kern_mtype;
+		__u32 mtype_id, kern_mtype_id;
+		void *mdata, *kern_mdata;
+		__s64 msize, kern_msize;
+		__u32 moff, kern_moff;
+		__u32 kern_member_idx;
+		const char *mname;
+
+		mname = btf__name_by_offset(btf, member->name_off);
+		kern_member = find_member_by_name(kern_btf, kern_type, mname);
+		if (!kern_member) {
+			pr_warn("struct_ops prepare %s: Cannot find member %s in kernel BTF\n",
+				tname, mname);
+			goto done;
+		}
+
+		kern_member_idx = kern_member - btf_members(kern_type);
+		if (btf_member_bitfield_size(type, i) ||
+		    btf_member_bitfield_size(kern_type, kern_member_idx)) {
+			pr_warn("struct_ops prepare %s: bitfield %s is not supported\n",
+				tname, mname);
+			goto done;
+		}
+
+		moff = member->offset / 8;
+		kern_moff = kern_member->offset / 8;
+
+		mdata = data + moff;
+		kern_mdata = kern_data + kern_moff;
+
+		mtype_id = member->type;
+		kern_mtype_id = kern_member->type;
+
+		mtype = resolve_ptr(btf, mtype_id, NULL);
+		kern_mtype = resolve_ptr(kern_btf, kern_mtype_id, NULL);
+		if (mtype && kern_mtype) {
+			struct bpf_program *prog;
+
+			if (!btf_is_func_proto(mtype) ||
+			    !btf_is_func_proto(kern_mtype)) {
+				pr_warn("struct_ops prepare %s: non func ptr %s is not supported\n",
+					tname, mname);
+				goto done;
+			}
+
+			prog = st_ops->progs[i];
+			if (!prog) {
+				pr_debug("struct_ops prepare %s: func ptr %s is not set\n",
+					 tname, mname);
+				continue;
+			}
+
+			if (prog->type != BPF_PROG_TYPE_UNSPEC ||
+			    prog->attach_btf_id) {
+				pr_warn("struct_ops prepare %s: cannot use prog %s with attach_btf_id %d for func ptr %s\n",
+					tname, prog->name, prog->attach_btf_id, mname);
+				err = -EINVAL;
+				goto done;
+			}
+
+			prog->type = BPF_PROG_TYPE_STRUCT_OPS;
+			prog->attach_btf_id = kern_type_id;
+			/* expected_attach_type is the member index */
+			prog->expected_attach_type = kern_member_idx;
+
+			st_ops->kern_func_off[i] = kern_data_off + kern_moff;
+
+			pr_debug("struct_ops prepare %s: func ptr %s is set to prog %s from data(+%u) to kern_data(+%u)\n",
+				 tname, mname, prog->name, moff, kern_moff);
+
+			continue;
+		}
+
+		mtype_id = btf__resolve_type(btf, mtype_id);
+		kern_mtype_id = btf__resolve_type(kern_btf, kern_mtype_id);
+		if (mtype_id < 0 || kern_mtype_id < 0) {
+			pr_warn("struct_ops prepare %s: Cannot resolve the type for %s\n",
+				tname, mname);
+			goto done;
+		}
+
+		mtype = btf__type_by_id(btf, mtype_id);
+		kern_mtype = btf__type_by_id(kern_btf, kern_mtype_id);
+		if (BTF_INFO_KIND(mtype->info) !=
+		    BTF_INFO_KIND(kern_mtype->info)) {
+			pr_warn("struct_ops prepare %s: Unmatched member type %s %u != %u(kernel)\n",
+				tname, mname,
+				BTF_INFO_KIND(mtype->info),
+				BTF_INFO_KIND(kern_mtype->info));
+			goto done;
+		}
+
+		msize = btf__resolve_size(btf, mtype_id);
+		kern_msize = btf__resolve_size(kern_btf, kern_mtype_id);
+		if (msize < 0 || kern_msize < 0 || msize != kern_msize) {
+			pr_warn("struct_ops prepare %s: Error in size of member %s: %zd != %zd(kernel)\n",
+				tname, mname,
+				(ssize_t)msize, (ssize_t)kern_msize);
+			goto done;
+		}
+
+		pr_debug("struct_ops prepare %s: copy %s %u bytes from data(+%u) to kern_data(+%u)\n",
+			 tname, mname, (unsigned int)msize,
+			 moff, kern_moff);
+		memcpy(kern_mdata, mdata, msize);
+	}
+
+	err = 0;
+
+done:
+	/* On error, bpf_object__unload() will free
+	 * st_ops->kern_vdata.
+	 */
+	btf__free(kern_btf);
+	return err;
+}
+
+static int bpf_object__collect_struct_ops_reloc(struct bpf_object *obj,
+						GElf_Shdr *shdr,
+						Elf_Data *data)
+{
+	const struct btf_member *member;
+	struct bpf_struct_ops *st_ops;
+	struct bpf_program *prog;
+	const char *name, *tname;
+	unsigned int shdr_idx;
+	const struct btf *btf;
+	Elf_Data *symbols;
+	unsigned int moff;
+	GElf_Sym sym;
+	GElf_Rel rel;
+	int i, nrels;
+
+	symbols = obj->efile.symbols;
+	btf = obj->btf;
+	st_ops = &obj->st_ops;
+	tname = st_ops->tname;
+
+	nrels = shdr->sh_size / shdr->sh_entsize;
+	for (i = 0; i < nrels; i++) {
+		if (!gelf_getrel(data, i, &rel)) {
+			pr_warn("struct_ops reloc %s: failed to get %d reloc\n",
+				tname, i);
+			return -LIBBPF_ERRNO__FORMAT;
+		}
+
+		if (!gelf_getsym(symbols, GELF_R_SYM(rel.r_info), &sym)) {
+			pr_warn("struct_ops reloc %s: symbol %" PRIx64 " not found\n",
+				tname, GELF_R_SYM(rel.r_info));
+			return -LIBBPF_ERRNO__FORMAT;
+		}
+
+		name = elf_strptr(obj->efile.elf, obj->efile.strtabidx,
+				  sym.st_name) ? : "<?>";
+
+		pr_debug("%s relo for %lld value %lld name %d (\'%s\')\n",
+			 tname,
+			 (long long) (rel.r_info >> 32),
+			 (long long) sym.st_value, sym.st_name, name);
+
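+		/* r_offset is the byte offset of the func ptr within
+		 * the struct_ops variable; BTF member offsets are in
+		 * bits, hence the "moff * 8" lookup below.
+		 */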
+		shdr_idx = sym.st_shndx;
+		moff = rel.r_offset;
+		pr_debug("struct_ops reloc %s: moff=%u, shdr_idx=%u\n",
+			 tname, moff, shdr_idx);
+
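+		/* A reserved section index (>= SHN_LORESERVE) cannot be
+		 * mapped back to a prog section, which is the case for
+		 * non-static functions.
+		 */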
+		if (shdr_idx >= SHN_LORESERVE) {
+			pr_warn("struct_ops reloc %s: moff=%u shdr_idx=%u unsupported non-static function\n",
+				tname, moff, shdr_idx);
+			return -LIBBPF_ERRNO__RELOC;
+		}
+
+		member = find_member_by_offset(st_ops->type, moff * 8);
+		if (!member) {
+			pr_warn("struct_ops reloc %s: cannot find member at moff=%u\n",
+				tname, moff);
+			return -EINVAL;
+		}
+		name = btf__name_by_offset(btf, member->name_off);
+
+		if (!resolve_func_ptr(btf, member->type, NULL)) {
+			pr_warn("struct_ops reloc %s: cannot relocate non func ptr %s\n",
+				tname, name);
+			return -EINVAL;
+		}
+
+		prog = bpf_object__find_prog_by_idx(obj, shdr_idx);
+		if (!prog) {
+			pr_warn("struct_ops reloc %s: cannot find prog at shdr_idx %u to relocate func ptr %s\n",
+				tname, shdr_idx, name);
+			return -EINVAL;
+		}
+
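+		/* Remember the prog here.  bpf_object__prepare_struct_ops()
+		 * will set its type and attach_btf_id at load time.
+		 */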
+		st_ops->progs[member - btf_members(st_ops->type)] = prog;
+	}
+
+	return 0;
+}
+
+static int bpf_object__init_struct_ops(struct bpf_object *obj)
+{
+	const struct btf_type *type, *datasec;
+	const struct btf_var_secinfo *vsi;
+	struct bpf_struct_ops *st_ops;
+	const char *tname, *var_name;
+	__s32 type_id, datasec_id;
+	const struct btf *btf;
+
+	if (obj->efile.st_ops_shndx == -1)
+		return 0;
+
+	btf = obj->btf;
+	st_ops = &obj->st_ops;
+	datasec_id = btf__find_by_name_kind(btf, BPF_STRUCT_OPS_SEC_NAME,
+					    BTF_KIND_DATASEC);
+	if (datasec_id < 0) {
+		pr_warn("struct_ops init: DATASEC %s not found\n",
+			BPF_STRUCT_OPS_SEC_NAME);
+		return -EINVAL;
+	}
+
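+	/* Only one struct_ops variable per object is supported for
+	 * now, so the DATASEC must contain exactly one VAR.
+	 */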
+	datasec = btf__type_by_id(btf, datasec_id);
+	if (btf_vlen(datasec) != 1) {
+		pr_warn("struct_ops init: multiple VAR in DATASEC %s\n",
+			BPF_STRUCT_OPS_SEC_NAME);
+		return -ENOTSUP;
+	}
+	vsi = btf_var_secinfos(datasec);
+
+	type = btf__type_by_id(obj->btf, vsi->type);
+	if (!btf_is_var(type)) {
+		pr_warn("struct_ops init: vsi->type %u is not a VAR\n",
+			vsi->type);
+		return -EINVAL;
+	}
+	var_name = btf__name_by_offset(obj->btf, type->name_off);
+
+	type_id = btf__resolve_type(obj->btf, vsi->type);
+	if (type_id < 0) {
+		pr_warn("struct_ops init: Cannot resolve var type_id %u in DATASEC %s\n",
+			vsi->type, BPF_STRUCT_OPS_SEC_NAME);
+		return -EINVAL;
+	}
+
+	type = btf__type_by_id(obj->btf, type_id);
+	tname = btf__name_by_offset(obj->btf, type->name_off);
+	if (!btf_is_struct(type)) {
+		pr_warn("struct_ops init: %s is not a struct\n", tname);
+		return -EINVAL;
+	}
+
+	if (type->size != obj->efile.st_ops_data->d_size) {
+		pr_warn("struct_ops init: %s unmatched size %u (BTF DATASEC) != %zu (ELF)\n",
+			tname, type->size, obj->efile.st_ops_data->d_size);
+		return -EINVAL;
+	}
+
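+	/* One entry per member.  progs[] is calloc'ed so that the
+	 * slots of members without a bpf prog stay NULL.
+	 */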
+	st_ops->data = malloc(type->size);
+	st_ops->progs = calloc(btf_vlen(type), sizeof(*st_ops->progs));
+	st_ops->kern_func_off = malloc(btf_vlen(type) *
+				       sizeof(*st_ops->kern_func_off));
+	/* bpf_object__close() will take care of freeing these */
+	if (!st_ops->data || !st_ops->progs || !st_ops->kern_func_off)
+		return -ENOMEM;
+	memcpy(st_ops->data, obj->efile.st_ops_data->d_buf, type->size);
+
+	st_ops->tname = tname;
+	st_ops->type = type;
+	st_ops->type_id = type_id;
+	st_ops->var_name = var_name;
+
+	pr_debug("struct_ops init: %s found. type_id:%u\n", tname, type_id);
+
+	return 0;
+}
+
 static struct bpf_object *bpf_object__new(const char *path,
 					  const void *obj_buf,
 					  size_t obj_buf_sz,
@@ -550,6 +1087,9 @@  static struct bpf_object *bpf_object__new(const char *path,
 	obj->efile.data_shndx = -1;
 	obj->efile.rodata_shndx = -1;
 	obj->efile.bss_shndx = -1;
+	obj->efile.st_ops_shndx = -1;
+
+	obj->st_ops.fd = -1;
 
 	obj->kern_version = get_kernel_version();
 	obj->loaded = false;
@@ -572,6 +1112,7 @@  static void bpf_object__elf_finish(struct bpf_object *obj)
 	obj->efile.data = NULL;
 	obj->efile.rodata = NULL;
 	obj->efile.bss = NULL;
+	obj->efile.st_ops_data = NULL;
 
 	zfree(&obj->efile.reloc_sects);
 	obj->efile.nr_reloc_sects = 0;
@@ -757,6 +1298,9 @@  int bpf_object__section_size(const struct bpf_object *obj, const char *name,
 	} else if (!strcmp(name, ".rodata")) {
 		if (obj->efile.rodata)
 			*size = obj->efile.rodata->d_size;
+	} else if (!strcmp(name, BPF_STRUCT_OPS_SEC_NAME)) {
+		if (obj->efile.st_ops_data)
+			*size = obj->efile.st_ops_data->d_size;
 	} else {
 		ret = bpf_object_search_section_size(obj, name, &d_size);
 		if (!ret)
@@ -1060,6 +1604,30 @@  skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id)
 	return t;
 }
 
+static const struct btf_type *
+resolve_ptr(const struct btf *btf, __u32 id, __u32 *res_id)
+{
+	const struct btf_type *t;
+
+	t = skip_mods_and_typedefs(btf, id, NULL);
+	if (!btf_is_ptr(t))
+		return NULL;
+
+	return skip_mods_and_typedefs(btf, t->type, res_id);
+}
+
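+/* like resolve_ptr(), but only succeeds if the pointee is a FUNC_PROTO */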
+static const struct btf_type *
+resolve_func_ptr(const struct btf *btf, __u32 id, __u32 *res_id)
+{
+	const struct btf_type *t;
+
+	t = resolve_ptr(btf, id, res_id);
+	if (t && btf_is_func_proto(t))
+		return t;
+
+	return NULL;
+}
+
 /*
  * Fetch integer attribute of BTF map definition. Such attributes are
  * represented using a pointer to an array, in which dimensionality of array
@@ -1509,7 +2077,7 @@  static void bpf_object__sanitize_btf_ext(struct bpf_object *obj)
 
 static bool bpf_object__is_btf_mandatory(const struct bpf_object *obj)
 {
-	return obj->efile.btf_maps_shndx >= 0;
+	return obj->efile.btf_maps_shndx >= 0 || obj->efile.st_ops_shndx >= 0;
 }
 
 static int bpf_object__init_btf(struct bpf_object *obj,
@@ -1689,6 +2257,9 @@  static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps,
 			} else if (strcmp(name, ".rodata") == 0) {
 				obj->efile.rodata = data;
 				obj->efile.rodata_shndx = idx;
+			} else if (strcmp(name, BPF_STRUCT_OPS_SEC_NAME) == 0) {
+				obj->efile.st_ops_data = data;
+				obj->efile.st_ops_shndx = idx;
 			} else {
 				pr_debug("skip section(%d) %s\n", idx, name);
 			}
@@ -1698,7 +2269,8 @@  static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps,
 			int sec = sh.sh_info; /* points to other section */
 
 			/* Only do relo for section with exec instructions */
-			if (!section_have_execinstr(obj, sec)) {
+			if (!section_have_execinstr(obj, sec) &&
+			    !strstr(name, BPF_STRUCT_OPS_SEC_NAME)) {
 				pr_debug("skip relo %s(%d) for section(%d)\n",
 					 name, idx, sec);
 				continue;
@@ -1735,6 +2307,8 @@  static int bpf_object__elf_collect(struct bpf_object *obj, bool relaxed_maps,
 		err = bpf_object__sanitize_and_load_btf(obj);
 	if (!err)
 		err = bpf_object__init_prog_names(obj);
+	if (!err)
+		err = bpf_object__init_struct_ops(obj);
 	return err;
 }
 
@@ -3700,6 +4274,13 @@  static int bpf_object__collect_reloc(struct bpf_object *obj)
 			return -LIBBPF_ERRNO__INTERNAL;
 		}
 
+		if (idx == obj->efile.st_ops_shndx) {
+			err = bpf_object__collect_struct_ops_reloc(obj, shdr, data);
+			if (err)
+				return err;
+			continue;
+		}
+
 		prog = bpf_object__find_prog_by_idx(obj, idx);
 		if (!prog) {
 			pr_warn("relocation failed: no section(%d)\n", idx);
@@ -3734,7 +4315,9 @@  load_program(struct bpf_program *prog, struct bpf_insn *insns, int insns_cnt,
 	load_attr.insns = insns;
 	load_attr.insns_cnt = insns_cnt;
 	load_attr.license = license;
-	if (prog->type == BPF_PROG_TYPE_TRACING) {
+	if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
+		load_attr.attach_btf_id = prog->attach_btf_id;
+	} else if (prog->type == BPF_PROG_TYPE_TRACING) {
 		load_attr.attach_prog_fd = prog->attach_prog_fd;
 		load_attr.attach_btf_id = prog->attach_btf_id;
 	} else {
@@ -3952,6 +4535,7 @@  __bpf_object__open(const char *path, const void *obj_buf, size_t obj_buf_sz,
 	if (IS_ERR(obj))
 		return obj;
 
+	obj->st_ops.unreg = OPTS_GET(opts, unreg_st_ops, false);
 	obj->relaxed_core_relocs = OPTS_GET(opts, relaxed_core_relocs, false);
 	relaxed_maps = OPTS_GET(opts, relaxed_maps, false);
 	pin_root_path = OPTS_GET(opts, pin_root_path, NULL);
@@ -4077,6 +4661,10 @@  int bpf_object__unload(struct bpf_object *obj)
 	for (i = 0; i < obj->nr_programs; i++)
 		bpf_program__unload(&obj->programs[i]);
 
+	if (obj->st_ops.unreg)
+		bpf_struct_ops__unregister(&obj->st_ops);
+	zfree(&obj->st_ops.kern_vdata);
+
 	return 0;
 }
 
@@ -4100,7 +4688,9 @@  int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
 
 	CHECK_ERR(bpf_object__create_maps(obj), err, out);
 	CHECK_ERR(bpf_object__relocate(obj, attr->target_btf_path), err, out);
+	CHECK_ERR(bpf_object__prepare_struct_ops(obj), err, out);
 	CHECK_ERR(bpf_object__load_progs(obj, attr->log_level), err, out);
+	CHECK_ERR(bpf_object__register_struct_ops(obj), err, out);
 
 	return 0;
 out:
@@ -4690,6 +5280,9 @@  void bpf_object__close(struct bpf_object *obj)
 			bpf_program__exit(&obj->programs[i]);
 	}
 	zfree(&obj->programs);
+	zfree(&obj->st_ops.data);
+	zfree(&obj->st_ops.progs);
+	zfree(&obj->st_ops.kern_func_off);
 
 	list_del(&obj->list);
 	free(obj);
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 0dbf4bfba0c4..db255fce4948 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -109,8 +109,9 @@  struct bpf_object_open_opts {
 	 */
 	const char *pin_root_path;
 	__u32 attach_prog_fd;
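+	/* unregister the struct_ops from the kernel during
+	 * bpf_object__close() (it stays registered by default)
+	 */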
+	bool unreg_st_ops;
 };
-#define bpf_object_open_opts__last_field attach_prog_fd
+#define bpf_object_open_opts__last_field unreg_st_ops
 
 LIBBPF_API struct bpf_object *bpf_object__open(const char *path);
 LIBBPF_API struct bpf_object *
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index a9eb8b322671..7f06942e9574 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -103,6 +103,7 @@  probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_CGROUP_SOCKOPT:
 	case BPF_PROG_TYPE_TRACING:
+	case BPF_PROG_TYPE_STRUCT_OPS:
 	default:
 		break;
 	}
@@ -251,6 +252,7 @@  bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
 	case BPF_MAP_TYPE_XSKMAP:
 	case BPF_MAP_TYPE_SOCKHASH:
 	case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY:
+	case BPF_MAP_TYPE_STRUCT_OPS:
 	default:
 		break;
 	}