Message ID | 159076798566.1387573.8417040652693679408.stgit@firesoul |
---|---|
State | RFC |
Delegated to: | BPF Maintainers |
Headers | show |
Series | bpf: dynamic map-value config layout via BTF | expand |
Jesper Dangaard Brouer <brouer@redhat.com> writes: > The devmap map-value can be read from BPF-prog side, and could be used for a > storage area per device. This could e.g. contain info on headers that need > to be added when packet egress this device. > > This patchset adds a dynamic storage member to struct bpf_devmap_val. More > importantly the struct bpf_devmap_val is made dynamic via leveraging and > requiring BTF for struct sizes above 4. The only mandatory struct member is > 'ifindex' with a fixed offset of zero. > > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> > --- > kernel/bpf/devmap.c | 216 ++++++++++++++++++++++++++++++++++++++++++++------- > 1 file changed, 185 insertions(+), 31 deletions(-) > > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c > index 4ab67b2d8159..9cf2dadcc0fe 100644 > --- a/kernel/bpf/devmap.c > +++ b/kernel/bpf/devmap.c > @@ -48,6 +48,7 @@ > #include <net/xdp.h> > #include <linux/filter.h> > #include <trace/events/xdp.h> > +#include <linux/btf.h> > > #define DEV_CREATE_FLAG_MASK \ > (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) > @@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue { > unsigned int count; > }; > > -/* DEVMAP values */ > +/* DEVMAP map-value layout. > + * > + * The struct data-layout of map-value is a configuration interface. > + * BPF-prog side have read-only access to this memory. > + * > + * The layout might be different than below, because some struct members are > + * optional. This is made dynamic by requiring userspace provides an BTF > + * description of the struct layout, when creating the BPF-map. Struct names > + * are important and part of API, as BTF use these names to identify members. > + */ > struct bpf_devmap_val { > - __u32 ifindex; /* device index */ > + __u32 ifindex; /* device index - mandatory */ > union { > int fd; /* prog fd on map write */ > __u32 id; /* prog id on map read */ > } bpf_prog; > + struct { > + /* This 'storage' member is meant as a dynamically sized area, > + * that BPF developer can redefine. As other members are added > + * overtime, this area can shrink, as size can be regained by > + * not using members above. Add new members above this struct. > + */ > + unsigned char data[24]; > + } storage; Why is this needed? Userspace already passes in the value_size, so why can't the kernel just use the BTF to pick out the values it cares about and let the rest be up to userspace? > }; > > struct bpf_dtab_netdev { > @@ -79,10 +97,18 @@ struct bpf_dtab_netdev { > struct bpf_devmap_val val; > }; > > +struct bpf_devmap_val_cfg { > + struct { > + int ifindex; > + int bpf_prog; > + } btf_offset; > +}; > + > struct bpf_dtab { > struct bpf_map map; > struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ > struct list_head list; > + struct bpf_devmap_val_cfg cfg; > > /* these are only used for DEVMAP_HASH type maps */ > struct hlist_head *dev_index_head; > @@ -116,20 +142,24 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, > > static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) > { > - __u32 valsize = attr->value_size; > u64 cost = 0; > int err; > > - /* check sanity of attributes. 2 value sizes supported: > - * 4 bytes: ifindex > - * 8 bytes: ifindex + prog fd > - */ > + /* Value contents validated in dev_map_check_btf */ > if (attr->max_entries == 0 || attr->key_size != 4 || > - (valsize != offsetofend(struct bpf_devmap_val, ifindex) && > - valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || > + attr->value_size > sizeof(struct bpf_devmap_val) || > attr->map_flags & ~DEV_CREATE_FLAG_MASK) > return -EINVAL; > > + /* Enforce BTF for userspace, unless dealing with legacy kABI */ > + if (attr->value_size != 4 && > + (!attr->btf_key_type_id || !attr->btf_value_type_id)) > + return -EOPNOTSUPP; > + > + /* Mark BTF offset's as invalid */ > + dtab->cfg.btf_offset.ifindex = -1; > + dtab->cfg.btf_offset.bpf_prog = -1; > + > /* Lookup returns a pointer straight to dev->ifindex, so make sure the > * verifier prevents writes from the BPF side > */ > @@ -199,6 +229,119 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) > return &dtab->map; > } > > +struct expect { > + u8 btf_kind; > + bool mandatory; > + int bit_offset; > + int size; > + const char *name; > +}; > + > +static int btf_find_expect_layout_offset(const struct btf *btf, > + const struct btf_type *value_type, > + const struct expect *layout) > +{ > + const struct btf_member *member; > + u32 i, off = -ENOENT; > + > + for_each_member(i, value_type, member) { > + const struct btf_type *member_type; > + const char *member_name; > + > + member_type = btf_type_skip_modifiers(btf, member->type, NULL); > + if (BTF_INFO_KIND(member_type->info) != layout->btf_kind) { > + continue; > + } > + > + member_name = btf_name_by_offset(btf, member->name_off); > + if (!member_name) > + return -EINVAL; > + > + if (strcmp(layout->name, member_name)) > + continue; > + > + if (layout->size > 0 && // btf_type_has_size(member_type) && > + member_type->size != layout->size) > + continue; > + > + off = btf_member_bit_offset(value_type, member); > + if (layout->bit_offset > 0 && > + layout->bit_offset != off) { > + off = -ENOENT; > + continue; > + } Won't this enforced offset cause problems for extensibility? Say we introduce a third struct member that the kernel understands, e.g. another u32 with expect->bit_offset=64. That would mean you can no longer skip members, since that would make any subsequent offset tests fail? Or am I misunderstanding how this is supposed to work? > + > + return off; > + } > + return off; > +} > + > +/* Expected BTF layout that match struct bpf_devmap_val */ > +static const struct expect layout[] = { > + {BTF_KIND_INT, true, 0, 4, "ifindex"}, > + {BTF_KIND_UNION, false, 32, 4, "bpf_prog"}, > + {BTF_KIND_STRUCT, false, -1, -1, "storage"} > +}; > + > +static int dev_map_check_btf(const struct bpf_map *map, > + const struct btf *btf, > + const struct btf_type *key_type, > + const struct btf_type *value_type) > +{ > + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > + u32 found_members_cnt = 0; > + u32 int_data; > + int off; > + u32 i; > + > + /* Validate KEY type and size */ > + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) > + return -EOPNOTSUPP; > + > + int_data = *(u32 *)(key_type + 1); > + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0) > + return -EOPNOTSUPP; > + > + /* Validate VALUE have layout that match/map-to struct bpf_devmap_val > + * - With a flexible size of member 'storage'. > + */ > + > + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT) > + return -EOPNOTSUPP; > + > + /* Struct/union members in BTF must not exceed (max) expected members */ > + if (btf_type_vlen(value_type) > ARRAY_SIZE(layout)) > + return -E2BIG; > + > + for (i = 0; i < ARRAY_SIZE(layout); i++) { > + off = btf_find_expect_layout_offset(btf, value_type, &layout[i]); > + > + if (off < 0 && layout[i].mandatory) > + return -EUCLEAN; > + > + if (off >= 0) > + found_members_cnt++; > + > + /* Transfer layout config to map */ > + switch (i) { > + case 0: > + dtab->cfg.btf_offset.ifindex = off; > + break; > + case 1: > + dtab->cfg.btf_offset.bpf_prog = off; > + break; > + default: > + break; > + } > + } > + > + /* Detect if BTF/vlen have members that were not found */ > + if (btf_type_vlen(value_type) > found_members_cnt) > + return -E2BIG; > + > + return 0; > +} > + > static void dev_map_free(struct bpf_map *map) > { > struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > @@ -601,42 +744,53 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) > return ret; > } > > +static inline bool map_value_has_bpf_prog(const struct bpf_dtab *dtab) > +{ > + return dtab->cfg.btf_offset.bpf_prog >= 0; > +} > + > static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, > - struct bpf_dtab *dtab, > + struct bpf_map *map, > struct bpf_devmap_val *val, > unsigned int idx) > { > + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > struct bpf_prog *prog = NULL; > struct bpf_dtab_netdev *dev; > > - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, > + dev = kzalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, > dtab->map.numa_node); > if (!dev) > return ERR_PTR(-ENOMEM); > > + /* Member: ifindex is mandatory, both BTF and kABI */ > dev->dev = dev_get_by_index(net, val->ifindex); > if (!dev->dev) > goto err_out; > > - if (val->bpf_prog.fd >= 0) { > - prog = bpf_prog_get_type_dev(val->bpf_prog.fd, > - BPF_PROG_TYPE_XDP, false); > - if (IS_ERR(prog)) > - goto err_put_dev; > - if (prog->expected_attach_type != BPF_XDP_DEVMAP) > - goto err_put_prog; > + /* Member: bpf_prog union is optional, but have fixed offset if exist */ > + if (map_value_has_bpf_prog(dtab)) { > + if (val->bpf_prog.fd >= 0) { > + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, > + BPF_PROG_TYPE_XDP, false); > + if (IS_ERR(prog)) > + goto err_put_dev; > + if (prog->expected_attach_type != BPF_XDP_DEVMAP) > + goto err_put_prog; > + } > + if (prog) { > + dev->xdp_prog = prog; > + val->bpf_prog.id = prog->aux->id; > + } else { > + dev->xdp_prog = NULL; > + val->bpf_prog.id = 0; > + } > } > - > dev->idx = idx; > dev->dtab = dtab; > - if (prog) { > - dev->xdp_prog = prog; > - dev->val.bpf_prog.id = prog->aux->id; > - } else { > - dev->xdp_prog = NULL; > - dev->val.bpf_prog.id = 0; > - } > - dev->val.ifindex = val->ifindex; > + > + /* After adjustment copy map value to get storage area */ > + memcpy(&dev->val, val, map->value_size); > > return dev; > err_put_prog: > @@ -672,7 +826,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, > if (val.bpf_prog.fd != -1) > return -EINVAL; > } else { > - dev = __dev_map_alloc_node(net, dtab, &val, i); > + dev = __dev_map_alloc_node(net, map, &val, i); > if (IS_ERR(dev)) > return PTR_ERR(dev); > } > @@ -717,7 +871,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, > if (old_dev && (map_flags & BPF_NOEXIST)) > goto out_err; > > - dev = __dev_map_alloc_node(net, dtab, &val, idx); > + dev = __dev_map_alloc_node(net, map, &val, idx); > if (IS_ERR(dev)) { > err = PTR_ERR(dev); > goto out_err; > @@ -762,7 +916,7 @@ const struct bpf_map_ops dev_map_ops = { > .map_lookup_elem = dev_map_lookup_elem, > .map_update_elem = dev_map_update_elem, > .map_delete_elem = dev_map_delete_elem, > - .map_check_btf = map_check_no_btf, > + .map_check_btf = dev_map_check_btf, > }; > > const struct bpf_map_ops dev_map_hash_ops = { > @@ -772,7 +926,7 @@ const struct bpf_map_ops dev_map_hash_ops = { > .map_lookup_elem = dev_map_hash_lookup_elem, > .map_update_elem = dev_map_hash_update_elem, > .map_delete_elem = dev_map_hash_delete_elem, > - .map_check_btf = map_check_no_btf, > + .map_check_btf = dev_map_check_btf, > }; > > static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
On Fri, May 29, 2020 at 8:59 AM Jesper Dangaard Brouer <brouer@redhat.com> wrote: > > The devmap map-value can be read from BPF-prog side, and could be used for a > storage area per device. This could e.g. contain info on headers that need If BPF program needs a storage area per device, why can't it just use a separate map or just plain array (both keyed by ifindex) to store whatever it needs per-device? It's not clear why this flexibility and complexity is needed from the description above. > to be added when packet egress this device. > > This patchset adds a dynamic storage member to struct bpf_devmap_val. More > importantly the struct bpf_devmap_val is made dynamic via leveraging and > requiring BTF for struct sizes above 4. The only mandatory struct member is > 'ifindex' with a fixed offset of zero. > > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> > --- > kernel/bpf/devmap.c | 216 ++++++++++++++++++++++++++++++++++++++++++++------- > 1 file changed, 185 insertions(+), 31 deletions(-) > [...]
On Sat, 30 May 2020 00:19:50 -0700 Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > On Fri, May 29, 2020 at 8:59 AM Jesper Dangaard Brouer > <brouer@redhat.com> wrote: > > > > The devmap map-value can be read from BPF-prog side, and could be used for a > > storage area per device. This could e.g. contain info on headers that need > > If BPF program needs a storage area per device, why can't it just use > a separate map or just plain array (both keyed by ifindex) to store > whatever it needs per-device? It's not clear why this flexibility and > complexity is needed from the description above. Sorry I though it was obvious, it is for performance reasons and to reduce the number of maps needed. We do a lookup in the devmap anyhow, thus this memory will be cache-hot. Doing another lookup in a separate map, which is not guaranteed to be cache-hot, will be wasting cycles. > > to be added when packet egress this device. > > > > This patchset adds a dynamic storage member to struct bpf_devmap_val. More > > importantly the struct bpf_devmap_val is made dynamic via leveraging and > > requiring BTF for struct sizes above 4. The only mandatory struct member is > > 'ifindex' with a fixed offset of zero. > > > > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> > > --- > > kernel/bpf/devmap.c | 216 ++++++++++++++++++++++++++++++++++++++++++++------- > > 1 file changed, 185 insertions(+), 31 deletions(-) > > > > [...] >
On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote: > + > +/* Expected BTF layout that match struct bpf_devmap_val */ > +static const struct expect layout[] = { > + {BTF_KIND_INT, true, 0, 4, "ifindex"}, > + {BTF_KIND_UNION, false, 32, 4, "bpf_prog"}, > + {BTF_KIND_STRUCT, false, -1, -1, "storage"} > +}; > + > +static int dev_map_check_btf(const struct bpf_map *map, > + const struct btf *btf, > + const struct btf_type *key_type, > + const struct btf_type *value_type) > +{ > + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > + u32 found_members_cnt = 0; > + u32 int_data; > + int off; > + u32 i; > + > + /* Validate KEY type and size */ > + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) > + return -EOPNOTSUPP; > + > + int_data = *(u32 *)(key_type + 1); > + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0) > + return -EOPNOTSUPP; > + > + /* Validate VALUE have layout that match/map-to struct bpf_devmap_val > + * - With a flexible size of member 'storage'. > + */ > + > + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT) > + return -EOPNOTSUPP; > + > + /* Struct/union members in BTF must not exceed (max) expected members */ > + if (btf_type_vlen(value_type) > ARRAY_SIZE(layout)) > + return -E2BIG; > + > + for (i = 0; i < ARRAY_SIZE(layout); i++) { > + off = btf_find_expect_layout_offset(btf, value_type, &layout[i]); > + > + if (off < 0 && layout[i].mandatory) > + return -EUCLEAN; > + > + if (off >= 0) > + found_members_cnt++; > + > + /* Transfer layout config to map */ > + switch (i) { > + case 0: > + dtab->cfg.btf_offset.ifindex = off; > + break; > + case 1: > + dtab->cfg.btf_offset.bpf_prog = off; > + break; > + default: > + break; > + } > + } > + > + /* Detect if BTF/vlen have members that were not found */ > + if (btf_type_vlen(value_type) > found_members_cnt) > + return -E2BIG; > + > + return 0; > +} This layout validation looks really weird to me. That layout[] array sort of complements BTF to describe the data, but double describe of the layout feels like hack. I'm with Andrii here. Separate array indexed by ifindex or global array without map_lookup() can be used with good performance. I don't think such devamp extension is necessary.
On Mon, 1 Jun 2020 14:30:12 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote: > > + > > +/* Expected BTF layout that match struct bpf_devmap_val */ > > +static const struct expect layout[] = { > > + {BTF_KIND_INT, true, 0, 4, "ifindex"}, > > + {BTF_KIND_UNION, false, 32, 4, "bpf_prog"}, > > + {BTF_KIND_STRUCT, false, -1, -1, "storage"} > > +}; > > + > > +static int dev_map_check_btf(const struct bpf_map *map, > > + const struct btf *btf, > > + const struct btf_type *key_type, > > + const struct btf_type *value_type) > > +{ > > + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > > + u32 found_members_cnt = 0; > > + u32 int_data; > > + int off; > > + u32 i; > > + > > + /* Validate KEY type and size */ > > + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) > > + return -EOPNOTSUPP; > > + > > + int_data = *(u32 *)(key_type + 1); > > + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0) > > + return -EOPNOTSUPP; > > + > > + /* Validate VALUE have layout that match/map-to struct bpf_devmap_val > > + * - With a flexible size of member 'storage'. > > + */ > > + > > + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT) > > + return -EOPNOTSUPP; > > + > > + /* Struct/union members in BTF must not exceed (max) expected members */ > > + if (btf_type_vlen(value_type) > ARRAY_SIZE(layout)) > > + return -E2BIG; > > + > > + for (i = 0; i < ARRAY_SIZE(layout); i++) { > > + off = btf_find_expect_layout_offset(btf, value_type, &layout[i]); > > + > > + if (off < 0 && layout[i].mandatory) > > + return -EUCLEAN; > > + > > + if (off >= 0) > > + found_members_cnt++; > > + > > + /* Transfer layout config to map */ > > + switch (i) { > > + case 0: > > + dtab->cfg.btf_offset.ifindex = off; > > + break; > > + case 1: > > + dtab->cfg.btf_offset.bpf_prog = off; > > + break; > > + default: > > + break; > > + } > > + } > > + > > + /* Detect if BTF/vlen have members that were not found */ > > + if (btf_type_vlen(value_type) > found_members_cnt) > > + return -E2BIG; > > + > > + return 0; > > +} > > This layout validation looks really weird to me. > That layout[] array sort of complements BTF to describe the data, > but double describe of the layout feels like hack. This is the kind of feedback I'm looking for. I want to make the map-value more dynamic. It seems so old school to keep extending the map-value with a size and fixed binary layout, when we have BTF available. I'm open to input on how to better verify/parse/desc the expected BTF layout for kernel-code side. The patch demonstrates that this is possible, I'm open for changes. E.g. devmap is now extended with a bpf_prog, but most end-users will not be using this feature. Today they can use value_size=4 to avoid using this field. When we extend map-value again, then end-users are force into providing 'bpf_prog.fd' if they want to use the newer options. In this patch end-users don't need to provide 'bpf_prog' if they don't use it. Via BTF we can see this struct member can be skipped.
On Fri, 29 May 2020 18:39:40 +0200 Toke Høiland-Jørgensen <toke@redhat.com> wrote: > Jesper Dangaard Brouer <brouer@redhat.com> writes: > > > The devmap map-value can be read from BPF-prog side, and could be used for a > > storage area per device. This could e.g. contain info on headers that need > > to be added when packet egress this device. > > > > This patchset adds a dynamic storage member to struct bpf_devmap_val. More > > importantly the struct bpf_devmap_val is made dynamic via leveraging and > > requiring BTF for struct sizes above 4. The only mandatory struct member is > > 'ifindex' with a fixed offset of zero. > > > > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> > > --- > > kernel/bpf/devmap.c | 216 ++++++++++++++++++++++++++++++++++++++++++++------- > > 1 file changed, 185 insertions(+), 31 deletions(-) > > > > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c > > index 4ab67b2d8159..9cf2dadcc0fe 100644 [...] > > @@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue { > > unsigned int count; > > }; > > > > -/* DEVMAP values */ > > +/* DEVMAP map-value layout. > > + * > > + * The struct data-layout of map-value is a configuration interface. > > + * BPF-prog side have read-only access to this memory. > > + * > > + * The layout might be different than below, because some struct members are > > + * optional. This is made dynamic by requiring userspace provides an BTF > > + * description of the struct layout, when creating the BPF-map. Struct names > > + * are important and part of API, as BTF use these names to identify members. > > + */ > > struct bpf_devmap_val { > > - __u32 ifindex; /* device index */ > > + __u32 ifindex; /* device index - mandatory */ > > union { > > int fd; /* prog fd on map write */ > > __u32 id; /* prog id on map read */ > > } bpf_prog; > > + struct { > > + /* This 'storage' member is meant as a dynamically sized area, > > + * that BPF developer can redefine. As other members are added > > + * overtime, this area can shrink, as size can be regained by > > + * not using members above. Add new members above this struct. > > + */ > > + unsigned char data[24]; > > + } storage; > > Why is this needed? Userspace already passes in the value_size, so why > can't the kernel just use the BTF to pick out the values it cares about > and let the rest be up to userspace? The kernel cannot just ignore unknown struct members, due to forward compatibility. An older kernel that sees a new struct member, cannot know what this struct member is used for. Thus, later I'm rejecting map creation if I detect members kernel doesn't know about. This means, that I need to create a named area (e.g. named 'storage') that users can define their own layout within. This might be difficult to comprehend for other kernel developers, because usually we create forward compatibility via walking the binary struct and then assume that if an unknown area (in end-of-struct) contains zeros, then it means end-user isn't using that unknown feature. This doesn't work when the default value, as in this exact case, need to be minus-1 do describe "unused" as this is a file descriptor. Forward compatibility is different here. If the end-user include the member in their BTF description, that means they intend to use it. Thus, kernel need to reject map-create if it sees unknown members.
Jesper Dangaard Brouer <brouer@redhat.com> writes: > On Fri, 29 May 2020 18:39:40 +0200 > Toke Høiland-Jørgensen <toke@redhat.com> wrote: > >> Jesper Dangaard Brouer <brouer@redhat.com> writes: >> >> > The devmap map-value can be read from BPF-prog side, and could be used for a >> > storage area per device. This could e.g. contain info on headers that need >> > to be added when packet egress this device. >> > >> > This patchset adds a dynamic storage member to struct bpf_devmap_val. More >> > importantly the struct bpf_devmap_val is made dynamic via leveraging and >> > requiring BTF for struct sizes above 4. The only mandatory struct member is >> > 'ifindex' with a fixed offset of zero. >> > >> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> >> > --- >> > kernel/bpf/devmap.c | 216 ++++++++++++++++++++++++++++++++++++++++++++------- >> > 1 file changed, 185 insertions(+), 31 deletions(-) >> > >> > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c >> > index 4ab67b2d8159..9cf2dadcc0fe 100644 > [...] >> > @@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue { >> > unsigned int count; >> > }; >> > >> > -/* DEVMAP values */ >> > +/* DEVMAP map-value layout. >> > + * >> > + * The struct data-layout of map-value is a configuration interface. >> > + * BPF-prog side have read-only access to this memory. >> > + * >> > + * The layout might be different than below, because some struct members are >> > + * optional. This is made dynamic by requiring userspace provides an BTF >> > + * description of the struct layout, when creating the BPF-map. Struct names >> > + * are important and part of API, as BTF use these names to identify members. >> > + */ >> > struct bpf_devmap_val { >> > - __u32 ifindex; /* device index */ >> > + __u32 ifindex; /* device index - mandatory */ >> > union { >> > int fd; /* prog fd on map write */ >> > __u32 id; /* prog id on map read */ >> > } bpf_prog; >> > + struct { >> > + /* This 'storage' member is meant as a dynamically sized area, >> > + * that BPF developer can redefine. As other members are added >> > + * overtime, this area can shrink, as size can be regained by >> > + * not using members above. Add new members above this struct. >> > + */ >> > + unsigned char data[24]; >> > + } storage; >> >> Why is this needed? Userspace already passes in the value_size, so why >> can't the kernel just use the BTF to pick out the values it cares about >> and let the rest be up to userspace? > > The kernel cannot just ignore unknown struct members, due to forward > compatibility. An older kernel that sees a new struct member, cannot > know what this struct member is used for. Thus, later I'm rejecting > map creation if I detect members kernel doesn't know about. > > This means, that I need to create a named area (e.g. named 'storage') > that users can define their own layout within. > > This might be difficult to comprehend for other kernel developers, > because usually we create forward compatibility via walking the binary > struct and then assume that if an unknown area (in end-of-struct) > contains zeros, then it means end-user isn't using that unknown feature. > This doesn't work when the default value, as in this exact case, need > to be minus-1 do describe "unused" as this is a file descriptor. > > Forward compatibility is different here. If the end-user include the > member in their BTF description, that means they intend to use it. > Thus, kernel need to reject map-create if it sees unknown members. Ah, right, of course. You could still allow such a "user-defined" member to be any size userspace likes, though, couldn't you? -Toke
On Tue, 02 Jun 2020 11:23:24 +0200 Toke Høiland-Jørgensen <toke@redhat.com> wrote: > Jesper Dangaard Brouer <brouer@redhat.com> writes: > > > On Fri, 29 May 2020 18:39:40 +0200 > > Toke Høiland-Jørgensen <toke@redhat.com> wrote: > > > >> Jesper Dangaard Brouer <brouer@redhat.com> writes: > >> > >> > The devmap map-value can be read from BPF-prog side, and could be used for a > >> > storage area per device. This could e.g. contain info on headers that need > >> > to be added when packet egress this device. > >> > > >> > This patchset adds a dynamic storage member to struct bpf_devmap_val. More > >> > importantly the struct bpf_devmap_val is made dynamic via leveraging and > >> > requiring BTF for struct sizes above 4. The only mandatory struct member is > >> > 'ifindex' with a fixed offset of zero. > >> > > >> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> > >> > --- > >> > kernel/bpf/devmap.c | 216 ++++++++++++++++++++++++++++++++++++++++++++------- > >> > 1 file changed, 185 insertions(+), 31 deletions(-) > >> > > >> > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c > >> > index 4ab67b2d8159..9cf2dadcc0fe 100644 > > [...] > >> > @@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue { > >> > unsigned int count; > >> > }; > >> > > >> > -/* DEVMAP values */ > >> > +/* DEVMAP map-value layout. > >> > + * > >> > + * The struct data-layout of map-value is a configuration interface. > >> > + * BPF-prog side have read-only access to this memory. > >> > + * > >> > + * The layout might be different than below, because some struct members are > >> > + * optional. This is made dynamic by requiring userspace provides an BTF > >> > + * description of the struct layout, when creating the BPF-map. Struct names > >> > + * are important and part of API, as BTF use these names to identify members. > >> > + */ > >> > struct bpf_devmap_val { > >> > - __u32 ifindex; /* device index */ > >> > + __u32 ifindex; /* device index - mandatory */ > >> > union { > >> > int fd; /* prog fd on map write */ > >> > __u32 id; /* prog id on map read */ > >> > } bpf_prog; > >> > + struct { > >> > + /* This 'storage' member is meant as a dynamically sized area, > >> > + * that BPF developer can redefine. As other members are added > >> > + * overtime, this area can shrink, as size can be regained by > >> > + * not using members above. Add new members above this struct. > >> > + */ > >> > + unsigned char data[24]; > >> > + } storage; > >> > >> Why is this needed? Userspace already passes in the value_size, so why > >> can't the kernel just use the BTF to pick out the values it cares about > >> and let the rest be up to userspace? > > > > The kernel cannot just ignore unknown struct members, due to forward > > compatibility. An older kernel that sees a new struct member, cannot > > know what this struct member is used for. Thus, later I'm rejecting > > map creation if I detect members kernel doesn't know about. > > > > This means, that I need to create a named area (e.g. named 'storage') > > that users can define their own layout within. > > > > This might be difficult to comprehend for other kernel developers, > > because usually we create forward compatibility via walking the binary > > struct and then assume that if an unknown area (in end-of-struct) > > contains zeros, then it means end-user isn't using that unknown feature. > > This doesn't work when the default value, as in this exact case, need > > to be minus-1 do describe "unused" as this is a file descriptor. > > > > Forward compatibility is different here. If the end-user include the > > member in their BTF description, that means they intend to use it. > > Thus, kernel need to reject map-create if it sees unknown members. > > Ah, right, of course. You could still allow such a "user-defined" member > to be any size userspace likes, though, couldn't you? Yes. In this implementation the "user-defined" member 'storage' do have variable size (and can be non-existing). Do you mean that I have limited the total size of the struct to be 32 bytes? (Which is true, and that can also be made dynamic, but I was trying to limit the scope of patch. It is hard enough to wrap head around the binary struct from userspace is becoming dynamic)
On Tue, Jun 2, 2020 at 12:00 AM Jesper Dangaard Brouer <brouer@redhat.com> wrote: > > On Mon, 1 Jun 2020 14:30:12 -0700 > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote: > > > + > > > +/* Expected BTF layout that match struct bpf_devmap_val */ > > > +static const struct expect layout[] = { > > > + {BTF_KIND_INT, true, 0, 4, "ifindex"}, > > > + {BTF_KIND_UNION, false, 32, 4, "bpf_prog"}, > > > + {BTF_KIND_STRUCT, false, -1, -1, "storage"} > > > +}; > > > + > > > +static int dev_map_check_btf(const struct bpf_map *map, > > > + const struct btf *btf, > > > + const struct btf_type *key_type, > > > + const struct btf_type *value_type) > > > +{ > > > + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > > > + u32 found_members_cnt = 0; > > > + u32 int_data; > > > + int off; > > > + u32 i; > > > + > > > + /* Validate KEY type and size */ > > > + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) > > > + return -EOPNOTSUPP; > > > + > > > + int_data = *(u32 *)(key_type + 1); > > > + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0) > > > + return -EOPNOTSUPP; > > > + > > > + /* Validate VALUE have layout that match/map-to struct bpf_devmap_val > > > + * - With a flexible size of member 'storage'. > > > + */ > > > + > > > + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT) > > > + return -EOPNOTSUPP; > > > + > > > + /* Struct/union members in BTF must not exceed (max) expected members */ > > > + if (btf_type_vlen(value_type) > ARRAY_SIZE(layout)) > > > + return -E2BIG; > > > + > > > + for (i = 0; i < ARRAY_SIZE(layout); i++) { > > > + off = btf_find_expect_layout_offset(btf, value_type, &layout[i]); > > > + > > > + if (off < 0 && layout[i].mandatory) > > > + return -EUCLEAN; > > > + > > > + if (off >= 0) > > > + found_members_cnt++; > > > + > > > + /* Transfer layout config to map */ > > > + switch (i) { > > > + case 0: > > > + dtab->cfg.btf_offset.ifindex = off; > > > + break; > > > + case 1: > > > + dtab->cfg.btf_offset.bpf_prog = off; > > > + break; > > > + default: > > > + break; > > > + } > > > + } > > > + > > > + /* Detect if BTF/vlen have members that were not found */ > > > + if (btf_type_vlen(value_type) > found_members_cnt) > > > + return -E2BIG; > > > + > > > + return 0; > > > +} > > > > This layout validation looks really weird to me. > > That layout[] array sort of complements BTF to describe the data, > > but double describe of the layout feels like hack. > > This is the kind of feedback I'm looking for. I want to make the > map-value more dynamic. It seems so old school to keep extending the > map-value with a size and fixed binary layout, when we have BTF > available. I'm open to input on how to better verify/parse/desc the > expected BTF layout for kernel-code side. > > The patch demonstrates that this is possible, I'm open for changes. > E.g. devmap is now extended with a bpf_prog, but most end-users will > not be using this feature. Today they can use value_size=4 to avoid > using this field. When we extend map-value again, then end-users are > force into providing 'bpf_prog.fd' if they want to use the newer > options. In this patch end-users don't need to provide 'bpf_prog' if > they don't use it. Via BTF we can see this struct member can be skipped. I think 'struct bpf_devmap_val' should be in uapi/bpf.h. That's what it is and it will be extended with new fields at the end just like all other structs in uapi/bpf.h I don't think BTF can become a substitute for uapi where uapi struct has to have all fields defined and backwards supported by the kernel. BTF is for flexible structs where fields may disappear. BTF is there to define a meaning of a binary blob. 'struct bpf_devmap_val' is not such thing. It's very much known with fixed fields and fixed meaning.
On Tue, 2 Jun 2020 11:27:03 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > On Tue, Jun 2, 2020 at 12:00 AM Jesper Dangaard Brouer > <brouer@redhat.com> wrote: > > > > On Mon, 1 Jun 2020 14:30:12 -0700 > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > > > On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote: > > > > + > > > > +/* Expected BTF layout that match struct bpf_devmap_val */ > > > > +static const struct expect layout[] = { > > > > + {BTF_KIND_INT, true, 0, 4, "ifindex"}, > > > > + {BTF_KIND_UNION, false, 32, 4, "bpf_prog"}, > > > > + {BTF_KIND_STRUCT, false, -1, -1, "storage"} > > > > +}; > > > > + > > > > +static int dev_map_check_btf(const struct bpf_map *map, > > > > + const struct btf *btf, > > > > + const struct btf_type *key_type, > > > > + const struct btf_type *value_type) > > > > +{ > > > > + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > > > > + u32 found_members_cnt = 0; > > > > + u32 int_data; > > > > + int off; > > > > + u32 i; > > > > + > > > > + /* Validate KEY type and size */ > > > > + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) > > > > + return -EOPNOTSUPP; > > > > + > > > > + int_data = *(u32 *)(key_type + 1); > > > > + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0) > > > > + return -EOPNOTSUPP; > > > > + > > > > + /* Validate VALUE have layout that match/map-to struct bpf_devmap_val > > > > + * - With a flexible size of member 'storage'. > > > > + */ > > > > + > > > > + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT) > > > > + return -EOPNOTSUPP; > > > > + > > > > + /* Struct/union members in BTF must not exceed (max) expected members */ > > > > + if (btf_type_vlen(value_type) > ARRAY_SIZE(layout)) > > > > + return -E2BIG; > > > > + > > > > + for (i = 0; i < ARRAY_SIZE(layout); i++) { > > > > + off = btf_find_expect_layout_offset(btf, value_type, &layout[i]); > > > > + > > > > + if (off < 0 && layout[i].mandatory) > > > > + return -EUCLEAN; > > > > + > > > > + if (off >= 0) > > > > + found_members_cnt++; > > > > + > > > > + /* Transfer layout config to map */ > > > > + switch (i) { > > > > + case 0: > > > > + dtab->cfg.btf_offset.ifindex = off; > > > > + break; > > > > + case 1: > > > > + dtab->cfg.btf_offset.bpf_prog = off; > > > > + break; > > > > + default: > > > > + break; > > > > + } > > > > + } > > > > + > > > > + /* Detect if BTF/vlen have members that were not found */ > > > > + if (btf_type_vlen(value_type) > found_members_cnt) > > > > + return -E2BIG; > > > > + > > > > + return 0; > > > > +} > > > > > > This layout validation looks really weird to me. > > > That layout[] array sort of complements BTF to describe the data, > > > but double describe of the layout feels like hack. > > > > This is the kind of feedback I'm looking for. I want to make the > > map-value more dynamic. It seems so old school to keep extending the > > map-value with a size and fixed binary layout, when we have BTF > > available. I'm open to input on how to better verify/parse/desc the > > expected BTF layout for kernel-code side. > > > > The patch demonstrates that this is possible, I'm open for changes. > > E.g. devmap is now extended with a bpf_prog, but most end-users will > > not be using this feature. Today they can use value_size=4 to avoid > > using this field. When we extend map-value again, then end-users are > > force into providing 'bpf_prog.fd' if they want to use the newer > > options. In this patch end-users don't need to provide 'bpf_prog' if > > they don't use it. Via BTF we can see this struct member can be skipped. > > I think 'struct bpf_devmap_val' should be in uapi/bpf.h. I disagree. > That's what it is and it will be extended with new fields at the end > just like all other structs in uapi/bpf.h This only works when new fields added will be zero, meaning that default value of zero means the feature is not used. In this specific case devmap adds a file-descriptor field, that have to be -1 for the feature to be unused. Thus, when programs gets compiled with this new UAPI header, they will start to fail, because they try to map-insert file-descriptor zero. > I don't think BTF can become a substitute for uapi > where uapi struct has to have all fields defined and backwards supported > by the kernel. > BTF is for flexible structs where fields may disappear. Then BTF is perfect for this, as e.g. I want to remove field/member 'ifindex' for the HASH-variant of devmap, and instead use the key as the ifindex. > BTF is there to define a meaning of a binary blob. > 'struct bpf_devmap_val' is not such thing. It's very much known with > fixed fields and fixed meaning.
On Wed, Jun 03, 2020 at 11:11:58AM +0200, Jesper Dangaard Brouer wrote: > On Tue, 2 Jun 2020 11:27:03 -0700 > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > On Tue, Jun 2, 2020 at 12:00 AM Jesper Dangaard Brouer > > <brouer@redhat.com> wrote: > > > > > > On Mon, 1 Jun 2020 14:30:12 -0700 > > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > > > > > On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote: > > > > > + > > > > > +/* Expected BTF layout that match struct bpf_devmap_val */ > > > > > +static const struct expect layout[] = { > > > > > + {BTF_KIND_INT, true, 0, 4, "ifindex"}, > > > > > + {BTF_KIND_UNION, false, 32, 4, "bpf_prog"}, > > > > > + {BTF_KIND_STRUCT, false, -1, -1, "storage"} > > > > > +}; > > > > > + > > > > > +static int dev_map_check_btf(const struct bpf_map *map, > > > > > + const struct btf *btf, > > > > > + const struct btf_type *key_type, > > > > > + const struct btf_type *value_type) > > > > > +{ > > > > > + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); > > > > > + u32 found_members_cnt = 0; > > > > > + u32 int_data; > > > > > + int off; > > > > > + u32 i; > > > > > + > > > > > + /* Validate KEY type and size */ > > > > > + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) > > > > > + return -EOPNOTSUPP; > > > > > + > > > > > + int_data = *(u32 *)(key_type + 1); > > > > > + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0) > > > > > + return -EOPNOTSUPP; > > > > > + > > > > > + /* Validate VALUE have layout that match/map-to struct bpf_devmap_val > > > > > + * - With a flexible size of member 'storage'. > > > > > + */ > > > > > + > > > > > + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT) > > > > > + return -EOPNOTSUPP; > > > > > + > > > > > + /* Struct/union members in BTF must not exceed (max) expected members */ > > > > > + if (btf_type_vlen(value_type) > ARRAY_SIZE(layout)) > > > > > + return -E2BIG; > > > > > + > > > > > + for (i = 0; i < ARRAY_SIZE(layout); i++) { > > > > > + off = btf_find_expect_layout_offset(btf, value_type, &layout[i]); > > > > > + > > > > > + if (off < 0 && layout[i].mandatory) > > > > > + return -EUCLEAN; > > > > > + > > > > > + if (off >= 0) > > > > > + found_members_cnt++; > > > > > + > > > > > + /* Transfer layout config to map */ > > > > > + switch (i) { > > > > > + case 0: > > > > > + dtab->cfg.btf_offset.ifindex = off; > > > > > + break; > > > > > + case 1: > > > > > + dtab->cfg.btf_offset.bpf_prog = off; > > > > > + break; > > > > > + default: > > > > > + break; > > > > > + } > > > > > + } > > > > > + > > > > > + /* Detect if BTF/vlen have members that were not found */ > > > > > + if (btf_type_vlen(value_type) > found_members_cnt) > > > > > + return -E2BIG; > > > > > + > > > > > + return 0; > > > > > +} > > > > > > > > This layout validation looks really weird to me. > > > > That layout[] array sort of complements BTF to describe the data, > > > > but double describe of the layout feels like hack. > > > > > > This is the kind of feedback I'm looking for. I want to make the > > > map-value more dynamic. It seems so old school to keep extending the > > > map-value with a size and fixed binary layout, when we have BTF > > > available. I'm open to input on how to better verify/parse/desc the > > > expected BTF layout for kernel-code side. > > > > > > The patch demonstrates that this is possible, I'm open for changes. > > > E.g. devmap is now extended with a bpf_prog, but most end-users will > > > not be using this feature. Today they can use value_size=4 to avoid > > > using this field. When we extend map-value again, then end-users are > > > force into providing 'bpf_prog.fd' if they want to use the newer > > > options. In this patch end-users don't need to provide 'bpf_prog' if > > > they don't use it. Via BTF we can see this struct member can be skipped. > > > > I think 'struct bpf_devmap_val' should be in uapi/bpf.h. > > I disagree. > > > That's what it is and it will be extended with new fields at the end > > just like all other structs in uapi/bpf.h > > This only works when new fields added will be zero, meaning that > default value of zero means the feature is not used. In this specific > case devmap adds a file-descriptor field, that have to be -1 for the > feature to be unused. > > Thus, when programs gets compiled with this new UAPI header, they will > start to fail, because they try to map-insert file-descriptor zero. No, because there is size that has to be specified. There are plenty of other uapi structs that have non-zero values in a newly added fields. > > > I don't think BTF can become a substitute for uapi > > where uapi struct has to have all fields defined and backwards supported > > by the kernel. > > BTF is for flexible structs where fields may disappear. > > Then BTF is perfect for this, as e.g. I want to remove field/member > 'ifindex' for the HASH-variant of devmap, and instead use the key as > the ifindex. nack to that.
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 4ab67b2d8159..9cf2dadcc0fe 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -48,6 +48,7 @@ #include <net/xdp.h> #include <linux/filter.h> #include <trace/events/xdp.h> +#include <linux/btf.h> #define DEV_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) @@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue { unsigned int count; }; -/* DEVMAP values */ +/* DEVMAP map-value layout. + * + * The struct data-layout of map-value is a configuration interface. + * BPF-prog side have read-only access to this memory. + * + * The layout might be different than below, because some struct members are + * optional. This is made dynamic by requiring userspace provides an BTF + * description of the struct layout, when creating the BPF-map. Struct names + * are important and part of API, as BTF use these names to identify members. + */ struct bpf_devmap_val { - __u32 ifindex; /* device index */ + __u32 ifindex; /* device index - mandatory */ union { int fd; /* prog fd on map write */ __u32 id; /* prog id on map read */ } bpf_prog; + struct { + /* This 'storage' member is meant as a dynamically sized area, + * that BPF developer can redefine. As other members are added + * overtime, this area can shrink, as size can be regained by + * not using members above. Add new members above this struct. + */ + unsigned char data[24]; + } storage; }; struct bpf_dtab_netdev { @@ -79,10 +97,18 @@ struct bpf_dtab_netdev { struct bpf_devmap_val val; }; +struct bpf_devmap_val_cfg { + struct { + int ifindex; + int bpf_prog; + } btf_offset; +}; + struct bpf_dtab { struct bpf_map map; struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ struct list_head list; + struct bpf_devmap_val_cfg cfg; /* these are only used for DEVMAP_HASH type maps */ struct hlist_head *dev_index_head; @@ -116,20 +142,24 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { - __u32 valsize = attr->value_size; u64 cost = 0; int err; - /* check sanity of attributes. 2 value sizes supported: - * 4 bytes: ifindex - * 8 bytes: ifindex + prog fd - */ + /* Value contents validated in dev_map_check_btf */ if (attr->max_entries == 0 || attr->key_size != 4 || - (valsize != offsetofend(struct bpf_devmap_val, ifindex) && - valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || + attr->value_size > sizeof(struct bpf_devmap_val) || attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; + /* Enforce BTF for userspace, unless dealing with legacy kABI */ + if (attr->value_size != 4 && + (!attr->btf_key_type_id || !attr->btf_value_type_id)) + return -EOPNOTSUPP; + + /* Mark BTF offset's as invalid */ + dtab->cfg.btf_offset.ifindex = -1; + dtab->cfg.btf_offset.bpf_prog = -1; + /* Lookup returns a pointer straight to dev->ifindex, so make sure the * verifier prevents writes from the BPF side */ @@ -199,6 +229,119 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) return &dtab->map; } +struct expect { + u8 btf_kind; + bool mandatory; + int bit_offset; + int size; + const char *name; +}; + +static int btf_find_expect_layout_offset(const struct btf *btf, + const struct btf_type *value_type, + const struct expect *layout) +{ + const struct btf_member *member; + u32 i, off = -ENOENT; + + for_each_member(i, value_type, member) { + const struct btf_type *member_type; + const char *member_name; + + member_type = btf_type_skip_modifiers(btf, member->type, NULL); + if (BTF_INFO_KIND(member_type->info) != layout->btf_kind) { + continue; + } + + member_name = btf_name_by_offset(btf, member->name_off); + if (!member_name) + return -EINVAL; + + if (strcmp(layout->name, member_name)) + continue; + + if (layout->size > 0 && // btf_type_has_size(member_type) && + member_type->size != layout->size) + continue; + + off = btf_member_bit_offset(value_type, member); + if (layout->bit_offset > 0 && + layout->bit_offset != off) { + off = -ENOENT; + continue; + } + + return off; + } + return off; +} + +/* Expected BTF layout that match struct bpf_devmap_val */ +static const struct expect layout[] = { + {BTF_KIND_INT, true, 0, 4, "ifindex"}, + {BTF_KIND_UNION, false, 32, 4, "bpf_prog"}, + {BTF_KIND_STRUCT, false, -1, -1, "storage"} +}; + +static int dev_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + u32 found_members_cnt = 0; + u32 int_data; + int off; + u32 i; + + /* Validate KEY type and size */ + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EOPNOTSUPP; + + int_data = *(u32 *)(key_type + 1); + if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0) + return -EOPNOTSUPP; + + /* Validate VALUE have layout that match/map-to struct bpf_devmap_val + * - With a flexible size of member 'storage'. + */ + + if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT) + return -EOPNOTSUPP; + + /* Struct/union members in BTF must not exceed (max) expected members */ + if (btf_type_vlen(value_type) > ARRAY_SIZE(layout)) + return -E2BIG; + + for (i = 0; i < ARRAY_SIZE(layout); i++) { + off = btf_find_expect_layout_offset(btf, value_type, &layout[i]); + + if (off < 0 && layout[i].mandatory) + return -EUCLEAN; + + if (off >= 0) + found_members_cnt++; + + /* Transfer layout config to map */ + switch (i) { + case 0: + dtab->cfg.btf_offset.ifindex = off; + break; + case 1: + dtab->cfg.btf_offset.bpf_prog = off; + break; + default: + break; + } + } + + /* Detect if BTF/vlen have members that were not found */ + if (btf_type_vlen(value_type) > found_members_cnt) + return -E2BIG; + + return 0; +} + static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); @@ -601,42 +744,53 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) return ret; } +static inline bool map_value_has_bpf_prog(const struct bpf_dtab *dtab) +{ + return dtab->cfg.btf_offset.bpf_prog >= 0; +} + static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, - struct bpf_dtab *dtab, + struct bpf_map *map, struct bpf_devmap_val *val, unsigned int idx) { + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, + dev = kzalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); + /* Member: ifindex is mandatory, both BTF and kABI */ dev->dev = dev_get_by_index(net, val->ifindex); if (!dev->dev) goto err_out; - if (val->bpf_prog.fd >= 0) { - prog = bpf_prog_get_type_dev(val->bpf_prog.fd, - BPF_PROG_TYPE_XDP, false); - if (IS_ERR(prog)) - goto err_put_dev; - if (prog->expected_attach_type != BPF_XDP_DEVMAP) - goto err_put_prog; + /* Member: bpf_prog union is optional, but have fixed offset if exist */ + if (map_value_has_bpf_prog(dtab)) { + if (val->bpf_prog.fd >= 0) { + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, + BPF_PROG_TYPE_XDP, false); + if (IS_ERR(prog)) + goto err_put_dev; + if (prog->expected_attach_type != BPF_XDP_DEVMAP) + goto err_put_prog; + } + if (prog) { + dev->xdp_prog = prog; + val->bpf_prog.id = prog->aux->id; + } else { + dev->xdp_prog = NULL; + val->bpf_prog.id = 0; + } } - dev->idx = idx; dev->dtab = dtab; - if (prog) { - dev->xdp_prog = prog; - dev->val.bpf_prog.id = prog->aux->id; - } else { - dev->xdp_prog = NULL; - dev->val.bpf_prog.id = 0; - } - dev->val.ifindex = val->ifindex; + + /* After adjustment copy map value to get storage area */ + memcpy(&dev->val, val, map->value_size); return dev; err_put_prog: @@ -672,7 +826,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (val.bpf_prog.fd != -1) return -EINVAL; } else { - dev = __dev_map_alloc_node(net, dtab, &val, i); + dev = __dev_map_alloc_node(net, map, &val, i); if (IS_ERR(dev)) return PTR_ERR(dev); } @@ -717,7 +871,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, if (old_dev && (map_flags & BPF_NOEXIST)) goto out_err; - dev = __dev_map_alloc_node(net, dtab, &val, idx); + dev = __dev_map_alloc_node(net, map, &val, idx); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out_err; @@ -762,7 +916,7 @@ const struct bpf_map_ops dev_map_ops = { .map_lookup_elem = dev_map_lookup_elem, .map_update_elem = dev_map_update_elem, .map_delete_elem = dev_map_delete_elem, - .map_check_btf = map_check_no_btf, + .map_check_btf = dev_map_check_btf, }; const struct bpf_map_ops dev_map_hash_ops = { @@ -772,7 +926,7 @@ const struct bpf_map_ops dev_map_hash_ops = { .map_lookup_elem = dev_map_hash_lookup_elem, .map_update_elem = dev_map_hash_update_elem, .map_delete_elem = dev_map_hash_delete_elem, - .map_check_btf = map_check_no_btf, + .map_check_btf = dev_map_check_btf, }; static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
The devmap map-value can be read from BPF-prog side, and could be used for a storage area per device. This could e.g. contain info on headers that need to be added when packet egress this device. This patchset adds a dynamic storage member to struct bpf_devmap_val. More importantly the struct bpf_devmap_val is made dynamic via leveraging and requiring BTF for struct sizes above 4. The only mandatory struct member is 'ifindex' with a fixed offset of zero. Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> --- kernel/bpf/devmap.c | 216 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 185 insertions(+), 31 deletions(-)