diff mbox series

[bpf-next,RFC,2/3] bpf: devmap dynamic map-value storage area based on BTF

Message ID 159076798566.1387573.8417040652693679408.stgit@firesoul
State RFC
Delegated to: BPF Maintainers
Headers show
Series bpf: dynamic map-value config layout via BTF | expand

Commit Message

Jesper Dangaard Brouer May 29, 2020, 3:59 p.m. UTC
The devmap map-value can be read from BPF-prog side, and could be used for a
storage area per device. This could e.g. contain info on headers that need
to be added when packet egress this device.

This patchset adds a dynamic storage member to struct bpf_devmap_val. More
importantly the struct bpf_devmap_val is made dynamic via leveraging and
requiring BTF for struct sizes above 4. The only mandatory struct member is
'ifindex' with a fixed offset of zero.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 kernel/bpf/devmap.c |  216 ++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 185 insertions(+), 31 deletions(-)

Comments

Toke Høiland-Jørgensen May 29, 2020, 4:39 p.m. UTC | #1
Jesper Dangaard Brouer <brouer@redhat.com> writes:

> The devmap map-value can be read from BPF-prog side, and could be used for a
> storage area per device. This could e.g. contain info on headers that need
> to be added when packet egress this device.
>
> This patchset adds a dynamic storage member to struct bpf_devmap_val. More
> importantly the struct bpf_devmap_val is made dynamic via leveraging and
> requiring BTF for struct sizes above 4. The only mandatory struct member is
> 'ifindex' with a fixed offset of zero.
>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>  kernel/bpf/devmap.c |  216 ++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 185 insertions(+), 31 deletions(-)
>
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 4ab67b2d8159..9cf2dadcc0fe 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -48,6 +48,7 @@
>  #include <net/xdp.h>
>  #include <linux/filter.h>
>  #include <trace/events/xdp.h>
> +#include <linux/btf.h>
>  
>  #define DEV_CREATE_FLAG_MASK \
>  	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
> @@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue {
>  	unsigned int count;
>  };
>  
> -/* DEVMAP values */
> +/* DEVMAP map-value layout.
> + *
> + * The struct data-layout of map-value is a configuration interface.
> + * BPF-prog side have read-only access to this memory.
> + *
> + * The layout might be different than below, because some struct members are
> + * optional.  This is made dynamic by requiring userspace provides an BTF
> + * description of the struct layout, when creating the BPF-map. Struct names
> + * are important and part of API, as BTF use these names to identify members.
> + */
>  struct bpf_devmap_val {
> -	__u32 ifindex;   /* device index */
> +	__u32 ifindex;   /* device index - mandatory */
>  	union {
>  		int   fd;  /* prog fd on map write */
>  		__u32 id;  /* prog id on map read */
>  	} bpf_prog;
> +	struct {
> +		/* This 'storage' member is meant as a dynamically sized area,
> +		 * that BPF developer can redefine.  As other members are added
> +		 * overtime, this area can shrink, as size can be regained by
> +		 * not using members above. Add new members above this struct.
> +		 */
> +		unsigned char data[24];
> +	} storage;

Why is this needed? Userspace already passes in the value_size, so why
can't the kernel just use the BTF to pick out the values it cares about
and let the rest be up to userspace?

>  };
>  
>  struct bpf_dtab_netdev {
> @@ -79,10 +97,18 @@ struct bpf_dtab_netdev {
>  	struct bpf_devmap_val val;
>  };
>  
> +struct bpf_devmap_val_cfg {
> +	struct {
> +		int ifindex;
> +		int bpf_prog;
> +	} btf_offset;
> +};
> +
>  struct bpf_dtab {
>  	struct bpf_map map;
>  	struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
>  	struct list_head list;
> +	struct bpf_devmap_val_cfg cfg;
>  
>  	/* these are only used for DEVMAP_HASH type maps */
>  	struct hlist_head *dev_index_head;
> @@ -116,20 +142,24 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
>  
>  static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
>  {
> -	__u32 valsize = attr->value_size;
>  	u64 cost = 0;
>  	int err;
>  
> -	/* check sanity of attributes. 2 value sizes supported:
> -	 * 4 bytes: ifindex
> -	 * 8 bytes: ifindex + prog fd
> -	 */
> +	/* Value contents validated in dev_map_check_btf */
>  	if (attr->max_entries == 0 || attr->key_size != 4 ||
> -	    (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
> -	     valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
> +	    attr->value_size > sizeof(struct bpf_devmap_val) ||
>  	    attr->map_flags & ~DEV_CREATE_FLAG_MASK)
>  		return -EINVAL;
>  
> +	/* Enforce BTF for userspace, unless dealing with legacy kABI */
> +	if (attr->value_size != 4 &&
> +	    (!attr->btf_key_type_id || !attr->btf_value_type_id))
> +		return -EOPNOTSUPP;
> +
> +	/* Mark BTF offset's as invalid */
> +	dtab->cfg.btf_offset.ifindex  = -1;
> +	dtab->cfg.btf_offset.bpf_prog = -1;
> +
>  	/* Lookup returns a pointer straight to dev->ifindex, so make sure the
>  	 * verifier prevents writes from the BPF side
>  	 */
> @@ -199,6 +229,119 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>  	return &dtab->map;
>  }
>  
> +struct expect {
> +	u8 btf_kind;
> +	bool mandatory;
> +	int bit_offset;
> +	int size;
> +	const char *name;
> +};
> +
> +static int btf_find_expect_layout_offset(const struct btf *btf,
> +					 const struct btf_type *value_type,
> +					 const struct expect *layout)
> +{
> +	const struct btf_member *member;
> +	u32 i, off = -ENOENT;
> +
> +	for_each_member(i, value_type, member) {
> +		const struct btf_type *member_type;
> +		const char *member_name;
> +
> +		member_type = btf_type_skip_modifiers(btf, member->type, NULL);
> +		if (BTF_INFO_KIND(member_type->info) != layout->btf_kind) {
> +			continue;
> +		}
> +
> +		member_name = btf_name_by_offset(btf, member->name_off);
> +		if (!member_name)
> +			return -EINVAL;
> +
> +		if (strcmp(layout->name, member_name))
> +			continue;
> +
> +		if (layout->size > 0 &&  // btf_type_has_size(member_type) &&
> +		    member_type->size != layout->size)
> +			continue;
> +
> +		off = btf_member_bit_offset(value_type, member);
> +		if (layout->bit_offset > 0 &&
> +		    layout->bit_offset != off) {
> +			off = -ENOENT;
> +			continue;
> +		}

Won't this enforced offset cause problems for extensibility? Say we
introduce a third struct member that the kernel understands, e.g.
another u32 with expect->bit_offset=64. That would mean you can no
longer skip members, since that would make any subsequent offset tests
fail? Or am I misunderstanding how this is supposed to work?

> +
> +		return off;
> +	}
> +	return off;
> +}
> +
> +/* Expected BTF layout that match struct bpf_devmap_val */
> +static const struct expect layout[] = {
> +	{BTF_KIND_INT,		true,	 0,	 4,	"ifindex"},
> +	{BTF_KIND_UNION,	false,	32,	 4,	"bpf_prog"},
> +	{BTF_KIND_STRUCT,	false,	-1,	-1,	"storage"}
> +};
> +
> +static int dev_map_check_btf(const struct bpf_map *map,
> +			     const struct btf *btf,
> +			     const struct btf_type *key_type,
> +			     const struct btf_type *value_type)
> +{
> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +	u32 found_members_cnt = 0;
> +	u32 int_data;
> +	int off;
> +	u32 i;
> +
> +	/* Validate KEY type and size */
> +	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> +		return -EOPNOTSUPP;
> +
> +	int_data = *(u32 *)(key_type + 1);
> +	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0)
> +		return -EOPNOTSUPP;
> +
> +	/* Validate VALUE have layout that match/map-to struct bpf_devmap_val
> +	 * - With a flexible size of member 'storage'.
> +	 */
> +
> +	if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT)
> +		return -EOPNOTSUPP;
> +
> +	/* Struct/union members in BTF must not exceed (max) expected members */
> +	if (btf_type_vlen(value_type) > ARRAY_SIZE(layout))
> +			return -E2BIG;
> +
> +	for (i = 0; i < ARRAY_SIZE(layout); i++) {
> +		off = btf_find_expect_layout_offset(btf, value_type, &layout[i]);
> +
> +		if (off < 0 && layout[i].mandatory)
> +			return -EUCLEAN;
> +
> +		if (off >= 0)
> +			found_members_cnt++;
> +
> +		/* Transfer layout config to map */
> +		switch (i) {
> +		case 0:
> +			dtab->cfg.btf_offset.ifindex = off;
> +			break;
> +		case 1:
> +			dtab->cfg.btf_offset.bpf_prog = off;
> +			break;
> +		default:
> +			break;
> +		}
> +	}
> +
> +	/* Detect if BTF/vlen have members that were not found */
> +	if (btf_type_vlen(value_type) > found_members_cnt)
> +		return -E2BIG;
> +
> +	return 0;
> +}
> +
>  static void dev_map_free(struct bpf_map *map)
>  {
>  	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> @@ -601,42 +744,53 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
>  	return ret;
>  }
>  
> +static inline bool map_value_has_bpf_prog(const struct bpf_dtab *dtab)
> +{
> +	return dtab->cfg.btf_offset.bpf_prog >= 0;
> +}
> +
>  static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
> -						    struct bpf_dtab *dtab,
> +						    struct bpf_map *map,
>  						    struct bpf_devmap_val *val,
>  						    unsigned int idx)
>  {
> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>  	struct bpf_prog *prog = NULL;
>  	struct bpf_dtab_netdev *dev;
>  
> -	dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
> +	dev = kzalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
>  			   dtab->map.numa_node);
>  	if (!dev)
>  		return ERR_PTR(-ENOMEM);
>  
> +	/* Member: ifindex is mandatory, both BTF and kABI */
>  	dev->dev = dev_get_by_index(net, val->ifindex);
>  	if (!dev->dev)
>  		goto err_out;
>  
> -	if (val->bpf_prog.fd >= 0) {
> -		prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
> -					     BPF_PROG_TYPE_XDP, false);
> -		if (IS_ERR(prog))
> -			goto err_put_dev;
> -		if (prog->expected_attach_type != BPF_XDP_DEVMAP)
> -			goto err_put_prog;
> +	/* Member: bpf_prog union is optional, but have fixed offset if exist */
> +	if (map_value_has_bpf_prog(dtab)) {
> +		if (val->bpf_prog.fd >= 0) {
> +			prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
> +						     BPF_PROG_TYPE_XDP, false);
> +			if (IS_ERR(prog))
> +				goto err_put_dev;
> +			if (prog->expected_attach_type != BPF_XDP_DEVMAP)
> +				goto err_put_prog;
> +		}
> +		if (prog) {
> +			dev->xdp_prog = prog;
> +			val->bpf_prog.id = prog->aux->id;
> +		} else {
> +			dev->xdp_prog = NULL;
> +			val->bpf_prog.id = 0;
> +		}
>  	}
> -
>  	dev->idx = idx;
>  	dev->dtab = dtab;
> -	if (prog) {
> -		dev->xdp_prog = prog;
> -		dev->val.bpf_prog.id = prog->aux->id;
> -	} else {
> -		dev->xdp_prog = NULL;
> -		dev->val.bpf_prog.id = 0;
> -	}
> -	dev->val.ifindex = val->ifindex;
> +
> +	/* After adjustment copy map value to get storage area */
> +	memcpy(&dev->val, val, map->value_size);
>  
>  	return dev;
>  err_put_prog:
> @@ -672,7 +826,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
>  		if (val.bpf_prog.fd != -1)
>  			return -EINVAL;
>  	} else {
> -		dev = __dev_map_alloc_node(net, dtab, &val, i);
> +		dev = __dev_map_alloc_node(net, map, &val, i);
>  		if (IS_ERR(dev))
>  			return PTR_ERR(dev);
>  	}
> @@ -717,7 +871,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
>  	if (old_dev && (map_flags & BPF_NOEXIST))
>  		goto out_err;
>  
> -	dev = __dev_map_alloc_node(net, dtab, &val, idx);
> +	dev = __dev_map_alloc_node(net, map, &val, idx);
>  	if (IS_ERR(dev)) {
>  		err = PTR_ERR(dev);
>  		goto out_err;
> @@ -762,7 +916,7 @@ const struct bpf_map_ops dev_map_ops = {
>  	.map_lookup_elem = dev_map_lookup_elem,
>  	.map_update_elem = dev_map_update_elem,
>  	.map_delete_elem = dev_map_delete_elem,
> -	.map_check_btf = map_check_no_btf,
> +	.map_check_btf = dev_map_check_btf,
>  };
>  
>  const struct bpf_map_ops dev_map_hash_ops = {
> @@ -772,7 +926,7 @@ const struct bpf_map_ops dev_map_hash_ops = {
>  	.map_lookup_elem = dev_map_hash_lookup_elem,
>  	.map_update_elem = dev_map_hash_update_elem,
>  	.map_delete_elem = dev_map_hash_delete_elem,
> -	.map_check_btf = map_check_no_btf,
> +	.map_check_btf = dev_map_check_btf,
>  };
>  
>  static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
Andrii Nakryiko May 30, 2020, 7:19 a.m. UTC | #2
On Fri, May 29, 2020 at 8:59 AM Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
>
> The devmap map-value can be read from BPF-prog side, and could be used for a
> storage area per device. This could e.g. contain info on headers that need

If BPF program needs a storage area per device, why can't it just use
a separate map or just plain array (both keyed by ifindex) to store
whatever it needs per-device? It's not clear why this flexibility and
complexity is needed from the description above.

> to be added when packet egress this device.
>
> This patchset adds a dynamic storage member to struct bpf_devmap_val. More
> importantly the struct bpf_devmap_val is made dynamic via leveraging and
> requiring BTF for struct sizes above 4. The only mandatory struct member is
> 'ifindex' with a fixed offset of zero.
>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>  kernel/bpf/devmap.c |  216 ++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 185 insertions(+), 31 deletions(-)
>

[...]
Jesper Dangaard Brouer May 30, 2020, 2:36 p.m. UTC | #3
On Sat, 30 May 2020 00:19:50 -0700
Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:

> On Fri, May 29, 2020 at 8:59 AM Jesper Dangaard Brouer
> <brouer@redhat.com> wrote:
> >
> > The devmap map-value can be read from BPF-prog side, and could be used for a
> > storage area per device. This could e.g. contain info on headers that need  
> 
> If BPF program needs a storage area per device, why can't it just use
> a separate map or just plain array (both keyed by ifindex) to store
> whatever it needs per-device? It's not clear why this flexibility and
> complexity is needed from the description above.

Sorry, I thought it was obvious: it is for performance reasons and to
reduce the number of maps needed.  We do a lookup in the devmap anyhow,
thus this memory will be cache-hot.  Doing another lookup in a separate
map, which is not guaranteed to be cache-hot, will be wasting cycles.

> > to be added when packet egress this device.
> >
> > This patchset adds a dynamic storage member to struct bpf_devmap_val. More
> > importantly the struct bpf_devmap_val is made dynamic via leveraging and
> > requiring BTF for struct sizes above 4. The only mandatory struct member is
> > 'ifindex' with a fixed offset of zero.
> >
> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > ---
> >  kernel/bpf/devmap.c |  216 ++++++++++++++++++++++++++++++++++++++++++++-------
> >  1 file changed, 185 insertions(+), 31 deletions(-)
> >  
> 
> [...]
>
Alexei Starovoitov June 1, 2020, 9:30 p.m. UTC | #4
On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote:
> +
> +/* Expected BTF layout that match struct bpf_devmap_val */
> +static const struct expect layout[] = {
> +	{BTF_KIND_INT,		true,	 0,	 4,	"ifindex"},
> +	{BTF_KIND_UNION,	false,	32,	 4,	"bpf_prog"},
> +	{BTF_KIND_STRUCT,	false,	-1,	-1,	"storage"}
> +};
> +
> +static int dev_map_check_btf(const struct bpf_map *map,
> +			     const struct btf *btf,
> +			     const struct btf_type *key_type,
> +			     const struct btf_type *value_type)
> +{
> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +	u32 found_members_cnt = 0;
> +	u32 int_data;
> +	int off;
> +	u32 i;
> +
> +	/* Validate KEY type and size */
> +	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> +		return -EOPNOTSUPP;
> +
> +	int_data = *(u32 *)(key_type + 1);
> +	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0)
> +		return -EOPNOTSUPP;
> +
> +	/* Validate VALUE have layout that match/map-to struct bpf_devmap_val
> +	 * - With a flexible size of member 'storage'.
> +	 */
> +
> +	if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT)
> +		return -EOPNOTSUPP;
> +
> +	/* Struct/union members in BTF must not exceed (max) expected members */
> +	if (btf_type_vlen(value_type) > ARRAY_SIZE(layout))
> +			return -E2BIG;
> +
> +	for (i = 0; i < ARRAY_SIZE(layout); i++) {
> +		off = btf_find_expect_layout_offset(btf, value_type, &layout[i]);
> +
> +		if (off < 0 && layout[i].mandatory)
> +			return -EUCLEAN;
> +
> +		if (off >= 0)
> +			found_members_cnt++;
> +
> +		/* Transfer layout config to map */
> +		switch (i) {
> +		case 0:
> +			dtab->cfg.btf_offset.ifindex = off;
> +			break;
> +		case 1:
> +			dtab->cfg.btf_offset.bpf_prog = off;
> +			break;
> +		default:
> +			break;
> +		}
> +	}
> +
> +	/* Detect if BTF/vlen have members that were not found */
> +	if (btf_type_vlen(value_type) > found_members_cnt)
> +		return -E2BIG;
> +
> +	return 0;
> +}

This layout validation looks really weird to me.
That layout[] array sort of complements BTF to describe the data,
but double describe of the layout feels like hack.
I'm with Andrii here. A separate array indexed by ifindex, or a global array
without map_lookup(), can be used with good performance.
I don't think such a devmap extension is necessary.
Jesper Dangaard Brouer June 2, 2020, 7 a.m. UTC | #5
On Mon, 1 Jun 2020 14:30:12 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote:
> > +
> > +/* Expected BTF layout that match struct bpf_devmap_val */
> > +static const struct expect layout[] = {
> > +	{BTF_KIND_INT,		true,	 0,	 4,	"ifindex"},
> > +	{BTF_KIND_UNION,	false,	32,	 4,	"bpf_prog"},
> > +	{BTF_KIND_STRUCT,	false,	-1,	-1,	"storage"}
> > +};
> > +
> > +static int dev_map_check_btf(const struct bpf_map *map,
> > +			     const struct btf *btf,
> > +			     const struct btf_type *key_type,
> > +			     const struct btf_type *value_type)
> > +{
> > +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> > +	u32 found_members_cnt = 0;
> > +	u32 int_data;
> > +	int off;
> > +	u32 i;
> > +
> > +	/* Validate KEY type and size */
> > +	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> > +		return -EOPNOTSUPP;
> > +
> > +	int_data = *(u32 *)(key_type + 1);
> > +	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0)
> > +		return -EOPNOTSUPP;
> > +
> > +	/* Validate VALUE have layout that match/map-to struct bpf_devmap_val
> > +	 * - With a flexible size of member 'storage'.
> > +	 */
> > +
> > +	if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT)
> > +		return -EOPNOTSUPP;
> > +
> > +	/* Struct/union members in BTF must not exceed (max) expected members */
> > +	if (btf_type_vlen(value_type) > ARRAY_SIZE(layout))
> > +			return -E2BIG;
> > +
> > +	for (i = 0; i < ARRAY_SIZE(layout); i++) {
> > +		off = btf_find_expect_layout_offset(btf, value_type, &layout[i]);
> > +
> > +		if (off < 0 && layout[i].mandatory)
> > +			return -EUCLEAN;
> > +
> > +		if (off >= 0)
> > +			found_members_cnt++;
> > +
> > +		/* Transfer layout config to map */
> > +		switch (i) {
> > +		case 0:
> > +			dtab->cfg.btf_offset.ifindex = off;
> > +			break;
> > +		case 1:
> > +			dtab->cfg.btf_offset.bpf_prog = off;
> > +			break;
> > +		default:
> > +			break;
> > +		}
> > +	}
> > +
> > +	/* Detect if BTF/vlen have members that were not found */
> > +	if (btf_type_vlen(value_type) > found_members_cnt)
> > +		return -E2BIG;
> > +
> > +	return 0;
> > +}  
> 
> This layout validation looks really weird to me.
> That layout[] array sort of complements BTF to describe the data,
> but double describe of the layout feels like hack.

This is the kind of feedback I'm looking for.  I want to make the
map-value more dynamic.  It seems so old school to keep extending the
map-value with a size and fixed binary layout, when we have BTF
available.  I'm open to input on how to better verify/parse/describe the
expected BTF layout on the kernel-code side.

The patch demonstrates that this is possible; I'm open to changes.
E.g. devmap is now extended with a bpf_prog, but most end-users will
not be using this feature. Today they can use value_size=4 to avoid
using this field. When we extend the map-value again, end-users are
forced into providing 'bpf_prog.fd' if they want to use the newer
options.  In this patch end-users don't need to provide 'bpf_prog' if
they don't use it. Via BTF we can see that this struct member can be skipped.
Jesper Dangaard Brouer June 2, 2020, 8:59 a.m. UTC | #6
On Fri, 29 May 2020 18:39:40 +0200
Toke Høiland-Jørgensen <toke@redhat.com> wrote:

> Jesper Dangaard Brouer <brouer@redhat.com> writes:
> 
> > The devmap map-value can be read from BPF-prog side, and could be used for a
> > storage area per device. This could e.g. contain info on headers that need
> > to be added when packet egress this device.
> >
> > This patchset adds a dynamic storage member to struct bpf_devmap_val. More
> > importantly the struct bpf_devmap_val is made dynamic via leveraging and
> > requiring BTF for struct sizes above 4. The only mandatory struct member is
> > 'ifindex' with a fixed offset of zero.
> >
> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > ---
> >  kernel/bpf/devmap.c |  216 ++++++++++++++++++++++++++++++++++++++++++++-------
> >  1 file changed, 185 insertions(+), 31 deletions(-)
> >
> > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> > index 4ab67b2d8159..9cf2dadcc0fe 100644
[...]
> > @@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue {
> >  	unsigned int count;
> >  };
> >  
> > -/* DEVMAP values */
> > +/* DEVMAP map-value layout.
> > + *
> > + * The struct data-layout of map-value is a configuration interface.
> > + * BPF-prog side have read-only access to this memory.
> > + *
> > + * The layout might be different than below, because some struct members are
> > + * optional.  This is made dynamic by requiring userspace provides an BTF
> > + * description of the struct layout, when creating the BPF-map. Struct names
> > + * are important and part of API, as BTF use these names to identify members.
> > + */
> >  struct bpf_devmap_val {
> > -	__u32 ifindex;   /* device index */
> > +	__u32 ifindex;   /* device index - mandatory */
> >  	union {
> >  		int   fd;  /* prog fd on map write */
> >  		__u32 id;  /* prog id on map read */
> >  	} bpf_prog;
> > +	struct {
> > +		/* This 'storage' member is meant as a dynamically sized area,
> > +		 * that BPF developer can redefine.  As other members are added
> > +		 * overtime, this area can shrink, as size can be regained by
> > +		 * not using members above. Add new members above this struct.
> > +		 */
> > +		unsigned char data[24];
> > +	} storage;  
> 
> Why is this needed? Userspace already passes in the value_size, so why
> can't the kernel just use the BTF to pick out the values it cares about
> and let the rest be up to userspace?

The kernel cannot just ignore unknown struct members, due to forward
compatibility. An older kernel that sees a new struct member, cannot
know what this struct member is used for.  Thus, later I'm rejecting
map creation if I detect members kernel doesn't know about.

This means, that I need to create a named area (e.g. named 'storage')
that users can define their own layout within.

This might be difficult to comprehend for other kernel developers,
because usually we create forward compatibility via walking the binary
struct and then assume that if an unknown area (in end-of-struct)
contains zeros, then it means end-user isn't using that unknown feature.
This doesn't work when the default value, as in this exact case, needs
to be minus-1 to describe "unused", as this is a file descriptor.

Forward compatibility is different here.  If the end-user includes the
member in their BTF description, that means they intend to use it.
Thus, the kernel needs to reject map-create if it sees unknown members.
Toke Høiland-Jørgensen June 2, 2020, 9:23 a.m. UTC | #7
Jesper Dangaard Brouer <brouer@redhat.com> writes:

> On Fri, 29 May 2020 18:39:40 +0200
> Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
>> Jesper Dangaard Brouer <brouer@redhat.com> writes:
>> 
>> > The devmap map-value can be read from BPF-prog side, and could be used for a
>> > storage area per device. This could e.g. contain info on headers that need
>> > to be added when packet egress this device.
>> >
>> > This patchset adds a dynamic storage member to struct bpf_devmap_val. More
>> > importantly the struct bpf_devmap_val is made dynamic via leveraging and
>> > requiring BTF for struct sizes above 4. The only mandatory struct member is
>> > 'ifindex' with a fixed offset of zero.
>> >
>> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
>> > ---
>> >  kernel/bpf/devmap.c |  216 ++++++++++++++++++++++++++++++++++++++++++++-------
>> >  1 file changed, 185 insertions(+), 31 deletions(-)
>> >
>> > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
>> > index 4ab67b2d8159..9cf2dadcc0fe 100644
> [...]
>> > @@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue {
>> >  	unsigned int count;
>> >  };
>> >  
>> > -/* DEVMAP values */
>> > +/* DEVMAP map-value layout.
>> > + *
>> > + * The struct data-layout of map-value is a configuration interface.
>> > + * BPF-prog side have read-only access to this memory.
>> > + *
>> > + * The layout might be different than below, because some struct members are
>> > + * optional.  This is made dynamic by requiring userspace provides an BTF
>> > + * description of the struct layout, when creating the BPF-map. Struct names
>> > + * are important and part of API, as BTF use these names to identify members.
>> > + */
>> >  struct bpf_devmap_val {
>> > -	__u32 ifindex;   /* device index */
>> > +	__u32 ifindex;   /* device index - mandatory */
>> >  	union {
>> >  		int   fd;  /* prog fd on map write */
>> >  		__u32 id;  /* prog id on map read */
>> >  	} bpf_prog;
>> > +	struct {
>> > +		/* This 'storage' member is meant as a dynamically sized area,
>> > +		 * that BPF developer can redefine.  As other members are added
>> > +		 * overtime, this area can shrink, as size can be regained by
>> > +		 * not using members above. Add new members above this struct.
>> > +		 */
>> > +		unsigned char data[24];
>> > +	} storage;  
>> 
>> Why is this needed? Userspace already passes in the value_size, so why
>> can't the kernel just use the BTF to pick out the values it cares about
>> and let the rest be up to userspace?
>
> The kernel cannot just ignore unknown struct members, due to forward
> compatibility. An older kernel that sees a new struct member, cannot
> know what this struct member is used for.  Thus, later I'm rejecting
> map creation if I detect members kernel doesn't know about.
>
> This means, that I need to create a named area (e.g. named 'storage')
> that users can define their own layout within.
>
> This might be difficult to comprehend for other kernel developers,
> because usually we create forward compatibility via walking the binary
> struct and then assume that if an unknown area (in end-of-struct)
> contains zeros, then it means end-user isn't using that unknown feature.
> This doesn't work when the default value, as in this exact case, need
> to be minus-1 do describe "unused" as this is a file descriptor.
>
> Forward compatibility is different here.  If the end-user include the
> member in their BTF description, that means they intend to use it.
> Thus, kernel need to reject map-create if it sees unknown members.

Ah, right, of course. You could still allow such a "user-defined" member
to be any size userspace likes, though, couldn't you?

-Toke
Jesper Dangaard Brouer June 2, 2020, 10:01 a.m. UTC | #8
On Tue, 02 Jun 2020 11:23:24 +0200
Toke Høiland-Jørgensen <toke@redhat.com> wrote:

> Jesper Dangaard Brouer <brouer@redhat.com> writes:
> 
> > On Fri, 29 May 2020 18:39:40 +0200
> > Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >  
> >> Jesper Dangaard Brouer <brouer@redhat.com> writes:
> >>   
> >> > The devmap map-value can be read from BPF-prog side, and could be used for a
> >> > storage area per device. This could e.g. contain info on headers that need
> >> > to be added when packet egress this device.
> >> >
> >> > This patchset adds a dynamic storage member to struct bpf_devmap_val. More
> >> > importantly the struct bpf_devmap_val is made dynamic via leveraging and
> >> > requiring BTF for struct sizes above 4. The only mandatory struct member is
> >> > 'ifindex' with a fixed offset of zero.
> >> >
> >> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> >> > ---
> >> >  kernel/bpf/devmap.c |  216 ++++++++++++++++++++++++++++++++++++++++++++-------
> >> >  1 file changed, 185 insertions(+), 31 deletions(-)
> >> >
> >> > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> >> > index 4ab67b2d8159..9cf2dadcc0fe 100644  
> > [...]  
> >> > @@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue {
> >> >  	unsigned int count;
> >> >  };
> >> >  
> >> > -/* DEVMAP values */
> >> > +/* DEVMAP map-value layout.
> >> > + *
> >> > + * The struct data-layout of map-value is a configuration interface.
> >> > + * BPF-prog side have read-only access to this memory.
> >> > + *
> >> > + * The layout might be different than below, because some struct members are
> >> > + * optional.  This is made dynamic by requiring userspace provides an BTF
> >> > + * description of the struct layout, when creating the BPF-map. Struct names
> >> > + * are important and part of API, as BTF use these names to identify members.
> >> > + */
> >> >  struct bpf_devmap_val {
> >> > -	__u32 ifindex;   /* device index */
> >> > +	__u32 ifindex;   /* device index - mandatory */
> >> >  	union {
> >> >  		int   fd;  /* prog fd on map write */
> >> >  		__u32 id;  /* prog id on map read */
> >> >  	} bpf_prog;
> >> > +	struct {
> >> > +		/* This 'storage' member is meant as a dynamically sized area,
> >> > +		 * that BPF developer can redefine.  As other members are added
> >> > +		 * overtime, this area can shrink, as size can be regained by
> >> > +		 * not using members above. Add new members above this struct.
> >> > +		 */
> >> > +		unsigned char data[24];
> >> > +	} storage;    
> >> 
> >> Why is this needed? Userspace already passes in the value_size, so why
> >> can't the kernel just use the BTF to pick out the values it cares about
> >> and let the rest be up to userspace?  
> >
> > The kernel cannot just ignore unknown struct members, due to forward
> > compatibility. An older kernel that sees a new struct member, cannot
> > know what this struct member is used for.  Thus, later I'm rejecting
> > map creation if I detect members kernel doesn't know about.
> >
> > This means, that I need to create a named area (e.g. named 'storage')
> > that users can define their own layout within.
> >
> > This might be difficult to comprehend for other kernel developers,
> > because usually we create forward compatibility via walking the binary
> > struct and then assume that if an unknown area (in end-of-struct)
> > contains zeros, then it means end-user isn't using that unknown feature.
> > This doesn't work when the default value, as in this exact case, need
> > to be minus-1 do describe "unused" as this is a file descriptor.
> >
> > Forward compatibility is different here.  If the end-user include the
> > member in their BTF description, that means they intend to use it.
> > Thus, kernel need to reject map-create if it sees unknown members.  
> 
> Ah, right, of course. You could still allow such a "user-defined" member
> to be any size userspace likes, though, couldn't you?

Yes.  In this implementation the "user-defined" member 'storage' does have
variable size (and can be non-existent).  Do you mean that I have
limited the total size of the struct to be 32 bytes?
(Which is true, and that can also be made dynamic, but I was trying to
limit the scope of the patch.  It is hard enough to wrap one's head around
the binary struct from userspace becoming dynamic.)
Alexei Starovoitov June 2, 2020, 6:27 p.m. UTC | #9
On Tue, Jun 2, 2020 at 12:00 AM Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
>
> On Mon, 1 Jun 2020 14:30:12 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
> > On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote:
> > > +
> > > +/* Expected BTF layout that match struct bpf_devmap_val */
> > > +static const struct expect layout[] = {
> > > +   {BTF_KIND_INT,          true,    0,      4,     "ifindex"},
> > > +   {BTF_KIND_UNION,        false,  32,      4,     "bpf_prog"},
> > > +   {BTF_KIND_STRUCT,       false,  -1,     -1,     "storage"}
> > > +};
> > > +
> > > +static int dev_map_check_btf(const struct bpf_map *map,
> > > +                        const struct btf *btf,
> > > +                        const struct btf_type *key_type,
> > > +                        const struct btf_type *value_type)
> > > +{
> > > +   struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> > > +   u32 found_members_cnt = 0;
> > > +   u32 int_data;
> > > +   int off;
> > > +   u32 i;
> > > +
> > > +   /* Validate KEY type and size */
> > > +   if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> > > +           return -EOPNOTSUPP;
> > > +
> > > +   int_data = *(u32 *)(key_type + 1);
> > > +   if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0)
> > > +           return -EOPNOTSUPP;
> > > +
> > > +   /* Validate VALUE have layout that match/map-to struct bpf_devmap_val
> > > +    * - With a flexible size of member 'storage'.
> > > +    */
> > > +
> > > +   if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT)
> > > +           return -EOPNOTSUPP;
> > > +
> > > +   /* Struct/union members in BTF must not exceed (max) expected members */
> > > +   if (btf_type_vlen(value_type) > ARRAY_SIZE(layout))
> > > +                   return -E2BIG;
> > > +
> > > +   for (i = 0; i < ARRAY_SIZE(layout); i++) {
> > > +           off = btf_find_expect_layout_offset(btf, value_type, &layout[i]);
> > > +
> > > +           if (off < 0 && layout[i].mandatory)
> > > +                   return -EUCLEAN;
> > > +
> > > +           if (off >= 0)
> > > +                   found_members_cnt++;
> > > +
> > > +           /* Transfer layout config to map */
> > > +           switch (i) {
> > > +           case 0:
> > > +                   dtab->cfg.btf_offset.ifindex = off;
> > > +                   break;
> > > +           case 1:
> > > +                   dtab->cfg.btf_offset.bpf_prog = off;
> > > +                   break;
> > > +           default:
> > > +                   break;
> > > +           }
> > > +   }
> > > +
> > > +   /* Detect if BTF/vlen have members that were not found */
> > > +   if (btf_type_vlen(value_type) > found_members_cnt)
> > > +           return -E2BIG;
> > > +
> > > +   return 0;
> > > +}
> >
> > This layout validation looks really weird to me.
> > That layout[] array sort of complements BTF to describe the data,
> > but double describe of the layout feels like hack.
>
> This is the kind of feedback I'm looking for.  I want to make the
> map-value more dynamic.  It seems so old school to keep extending the
> map-value with a size and fixed binary layout, when we have BTF
> available.  I'm open to input on how to better verify/parse/desc the
> expected BTF layout for kernel-code side.
>
> The patch demonstrates that this is possible, I'm open for changes.
> E.g. devmap is now extended with a bpf_prog, but most end-users will
> not be using this feature. Today they can use value_size=4 to avoid
> using this field. When we extend map-value again, then end-users are
> force into providing 'bpf_prog.fd' if they want to use the newer
> options.  In this patch end-users don't need to provide 'bpf_prog' if
> they don't use it. Via BTF we can see this struct member can be skipped.

I think 'struct bpf_devmap_val' should be in uapi/bpf.h.
That's what it is and it will be extended with new fields at the end
just like all other structs in uapi/bpf.h
I don't think BTF can become a substitute for uapi
where uapi struct has to have all fields defined and backwards supported
by the kernel.
BTF is for flexible structs where fields may disappear.
BTF is there to define a meaning of a binary blob.
'struct bpf_devmap_val' is not such a thing. It's very much known, with
fixed fields and fixed meaning.
Jesper Dangaard Brouer June 3, 2020, 9:11 a.m. UTC | #10
On Tue, 2 Jun 2020 11:27:03 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Tue, Jun 2, 2020 at 12:00 AM Jesper Dangaard Brouer
> <brouer@redhat.com> wrote:
> >
> > On Mon, 1 Jun 2020 14:30:12 -0700
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >  
> > > On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote:  
> > > > +
> > > > +/* Expected BTF layout that match struct bpf_devmap_val */
> > > > +static const struct expect layout[] = {
> > > > +   {BTF_KIND_INT,          true,    0,      4,     "ifindex"},
> > > > +   {BTF_KIND_UNION,        false,  32,      4,     "bpf_prog"},
> > > > +   {BTF_KIND_STRUCT,       false,  -1,     -1,     "storage"}
> > > > +};
> > > > +
> > > > +static int dev_map_check_btf(const struct bpf_map *map,
> > > > +                        const struct btf *btf,
> > > > +                        const struct btf_type *key_type,
> > > > +                        const struct btf_type *value_type)
> > > > +{
> > > > +   struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> > > > +   u32 found_members_cnt = 0;
> > > > +   u32 int_data;
> > > > +   int off;
> > > > +   u32 i;
> > > > +
> > > > +   /* Validate KEY type and size */
> > > > +   if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> > > > +           return -EOPNOTSUPP;
> > > > +
> > > > +   int_data = *(u32 *)(key_type + 1);
> > > > +   if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0)
> > > > +           return -EOPNOTSUPP;
> > > > +
> > > > +   /* Validate VALUE have layout that match/map-to struct bpf_devmap_val
> > > > +    * - With a flexible size of member 'storage'.
> > > > +    */
> > > > +
> > > > +   if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT)
> > > > +           return -EOPNOTSUPP;
> > > > +
> > > > +   /* Struct/union members in BTF must not exceed (max) expected members */
> > > > +   if (btf_type_vlen(value_type) > ARRAY_SIZE(layout))
> > > > +                   return -E2BIG;
> > > > +
> > > > +   for (i = 0; i < ARRAY_SIZE(layout); i++) {
> > > > +           off = btf_find_expect_layout_offset(btf, value_type, &layout[i]);
> > > > +
> > > > +           if (off < 0 && layout[i].mandatory)
> > > > +                   return -EUCLEAN;
> > > > +
> > > > +           if (off >= 0)
> > > > +                   found_members_cnt++;
> > > > +
> > > > +           /* Transfer layout config to map */
> > > > +           switch (i) {
> > > > +           case 0:
> > > > +                   dtab->cfg.btf_offset.ifindex = off;
> > > > +                   break;
> > > > +           case 1:
> > > > +                   dtab->cfg.btf_offset.bpf_prog = off;
> > > > +                   break;
> > > > +           default:
> > > > +                   break;
> > > > +           }
> > > > +   }
> > > > +
> > > > +   /* Detect if BTF/vlen have members that were not found */
> > > > +   if (btf_type_vlen(value_type) > found_members_cnt)
> > > > +           return -E2BIG;
> > > > +
> > > > +   return 0;
> > > > +}  
> > >
> > > This layout validation looks really weird to me.
> > > That layout[] array sort of complements BTF to describe the data,
> > > but double describe of the layout feels like hack.  
> >
> > This is the kind of feedback I'm looking for.  I want to make the
> > map-value more dynamic.  It seems so old school to keep extending the
> > map-value with a size and fixed binary layout, when we have BTF
> > available.  I'm open to input on how to better verify/parse/desc the
> > expected BTF layout for kernel-code side.
> >
> > The patch demonstrates that this is possible, I'm open for changes.
> > E.g. devmap is now extended with a bpf_prog, but most end-users will
> > not be using this feature. Today they can use value_size=4 to avoid
> > using this field. When we extend map-value again, then end-users are
> > force into providing 'bpf_prog.fd' if they want to use the newer
> > options.  In this patch end-users don't need to provide 'bpf_prog' if
> > they don't use it. Via BTF we can see this struct member can be skipped.  
> 
> I think 'struct bpf_devmap_val' should be in uapi/bpf.h.

I disagree.

> That's what it is and it will be extended with new fields at the end
> just like all other structs in uapi/bpf.h

This only works when newly added fields default to zero, meaning that a
value of zero means the feature is not used.  In this specific
case devmap adds a file-descriptor field, which has to be -1 for the
feature to be unused.

Thus, when programs get compiled with this new UAPI header, they will
start to fail, because they try to map-insert file-descriptor zero.


> I don't think BTF can become a substitute for uapi
> where uapi struct has to have all fields defined and backwards supported
> by the kernel.
> BTF is for flexible structs where fields may disappear.

Then BTF is perfect for this, as e.g. I want to remove field/member
'ifindex' for the HASH-variant of devmap, and instead use the key as
the ifindex.


> BTF is there to define a meaning of a binary blob.
> 'struct bpf_devmap_val' is not such thing. It's very much known with
> fixed fields and fixed meaning.
Alexei Starovoitov June 3, 2020, 4:20 p.m. UTC | #11
On Wed, Jun 03, 2020 at 11:11:58AM +0200, Jesper Dangaard Brouer wrote:
> On Tue, 2 Jun 2020 11:27:03 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> > On Tue, Jun 2, 2020 at 12:00 AM Jesper Dangaard Brouer
> > <brouer@redhat.com> wrote:
> > >
> > > On Mon, 1 Jun 2020 14:30:12 -0700
> > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > >  
> > > > On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote:  
> > > > > +
> > > > > +/* Expected BTF layout that match struct bpf_devmap_val */
> > > > > +static const struct expect layout[] = {
> > > > > +   {BTF_KIND_INT,          true,    0,      4,     "ifindex"},
> > > > > +   {BTF_KIND_UNION,        false,  32,      4,     "bpf_prog"},
> > > > > +   {BTF_KIND_STRUCT,       false,  -1,     -1,     "storage"}
> > > > > +};
> > > > > +
> > > > > +static int dev_map_check_btf(const struct bpf_map *map,
> > > > > +                        const struct btf *btf,
> > > > > +                        const struct btf_type *key_type,
> > > > > +                        const struct btf_type *value_type)
> > > > > +{
> > > > > +   struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> > > > > +   u32 found_members_cnt = 0;
> > > > > +   u32 int_data;
> > > > > +   int off;
> > > > > +   u32 i;
> > > > > +
> > > > > +   /* Validate KEY type and size */
> > > > > +   if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> > > > > +           return -EOPNOTSUPP;
> > > > > +
> > > > > +   int_data = *(u32 *)(key_type + 1);
> > > > > +   if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0)
> > > > > +           return -EOPNOTSUPP;
> > > > > +
> > > > > +   /* Validate VALUE have layout that match/map-to struct bpf_devmap_val
> > > > > +    * - With a flexible size of member 'storage'.
> > > > > +    */
> > > > > +
> > > > > +   if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT)
> > > > > +           return -EOPNOTSUPP;
> > > > > +
> > > > > +   /* Struct/union members in BTF must not exceed (max) expected members */
> > > > > +   if (btf_type_vlen(value_type) > ARRAY_SIZE(layout))
> > > > > +                   return -E2BIG;
> > > > > +
> > > > > +   for (i = 0; i < ARRAY_SIZE(layout); i++) {
> > > > > +           off = btf_find_expect_layout_offset(btf, value_type, &layout[i]);
> > > > > +
> > > > > +           if (off < 0 && layout[i].mandatory)
> > > > > +                   return -EUCLEAN;
> > > > > +
> > > > > +           if (off >= 0)
> > > > > +                   found_members_cnt++;
> > > > > +
> > > > > +           /* Transfer layout config to map */
> > > > > +           switch (i) {
> > > > > +           case 0:
> > > > > +                   dtab->cfg.btf_offset.ifindex = off;
> > > > > +                   break;
> > > > > +           case 1:
> > > > > +                   dtab->cfg.btf_offset.bpf_prog = off;
> > > > > +                   break;
> > > > > +           default:
> > > > > +                   break;
> > > > > +           }
> > > > > +   }
> > > > > +
> > > > > +   /* Detect if BTF/vlen have members that were not found */
> > > > > +   if (btf_type_vlen(value_type) > found_members_cnt)
> > > > > +           return -E2BIG;
> > > > > +
> > > > > +   return 0;
> > > > > +}  
> > > >
> > > > This layout validation looks really weird to me.
> > > > That layout[] array sort of complements BTF to describe the data,
> > > > but double describe of the layout feels like hack.  
> > >
> > > This is the kind of feedback I'm looking for.  I want to make the
> > > map-value more dynamic.  It seems so old school to keep extending the
> > > map-value with a size and fixed binary layout, when we have BTF
> > > available.  I'm open to input on how to better verify/parse/desc the
> > > expected BTF layout for kernel-code side.
> > >
> > > The patch demonstrates that this is possible, I'm open for changes.
> > > E.g. devmap is now extended with a bpf_prog, but most end-users will
> > > not be using this feature. Today they can use value_size=4 to avoid
> > > using this field. When we extend map-value again, then end-users are
> > > force into providing 'bpf_prog.fd' if they want to use the newer
> > > options.  In this patch end-users don't need to provide 'bpf_prog' if
> > > they don't use it. Via BTF we can see this struct member can be skipped.  
> > 
> > I think 'struct bpf_devmap_val' should be in uapi/bpf.h.
> 
> I disagree.
>
> > That's what it is and it will be extended with new fields at the end
> > just like all other structs in uapi/bpf.h
> 
> This only works when new fields added will be zero, meaning that
> default value of zero means the feature is not used.  In this specific
> case devmap adds a file-descriptor field, that have to be -1 for the
> feature to be unused.
> 
> Thus, when programs gets compiled with this new UAPI header, they will
> start to fail, because they try to map-insert file-descriptor zero.

No, because there is a size that has to be specified.
There are plenty of other uapi structs that have non-zero values
in newly added fields.

> 
> > I don't think BTF can become a substitute for uapi
> > where uapi struct has to have all fields defined and backwards supported
> > by the kernel.
> > BTF is for flexible structs where fields may disappear.
> 
> Then BTF is perfect for this, as e.g. I want to remove field/member
> 'ifindex' for the HASH-variant of devmap, and instead use the key as
> the ifindex.

nack to that.
diff mbox series

Patch

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 4ab67b2d8159..9cf2dadcc0fe 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,6 +48,7 @@ 
 #include <net/xdp.h>
 #include <linux/filter.h>
 #include <trace/events/xdp.h>
+#include <linux/btf.h>
 
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -60,13 +61,30 @@  struct xdp_dev_bulk_queue {
 	unsigned int count;
 };
 
-/* DEVMAP values */
+/* DEVMAP map-value layout.
+ *
+ * The struct data-layout of map-value is a configuration interface.
+ * BPF-prog side has read-only access to this memory.
+ *
+ * The layout might be different than below, because some struct members are
+ * optional.  This is made dynamic by requiring that userspace provide a BTF
+ * description of the struct layout, when creating the BPF-map. Struct names
+ * are important and part of API, as BTF uses these names to identify members.
+ */
 struct bpf_devmap_val {
-	__u32 ifindex;   /* device index */
+	__u32 ifindex;   /* device index - mandatory */
 	union {
 		int   fd;  /* prog fd on map write */
 		__u32 id;  /* prog id on map read */
 	} bpf_prog;
+	struct {
+		/* This 'storage' member is meant as a dynamically sized area,
+		 * that BPF developer can redefine.  As other members are added
+		 * over time, this area can shrink, as size can be regained by
+		 * not using members above. Add new members above this struct.
+		 */
+		unsigned char data[24];
+	} storage;
 };
 
 struct bpf_dtab_netdev {
@@ -79,10 +97,18 @@  struct bpf_dtab_netdev {
 	struct bpf_devmap_val val;
 };
 
+struct bpf_devmap_val_cfg {
+	struct {
+		int ifindex;
+		int bpf_prog;
+	} btf_offset;
+};
+
 struct bpf_dtab {
 	struct bpf_map map;
 	struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
 	struct list_head list;
+	struct bpf_devmap_val_cfg cfg;
 
 	/* these are only used for DEVMAP_HASH type maps */
 	struct hlist_head *dev_index_head;
@@ -116,20 +142,24 @@  static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
 
 static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
 {
-	__u32 valsize = attr->value_size;
 	u64 cost = 0;
 	int err;
 
-	/* check sanity of attributes. 2 value sizes supported:
-	 * 4 bytes: ifindex
-	 * 8 bytes: ifindex + prog fd
-	 */
+	/* Value contents validated in dev_map_check_btf */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
-	     valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
+	    attr->value_size > sizeof(struct bpf_devmap_val) ||
 	    attr->map_flags & ~DEV_CREATE_FLAG_MASK)
 		return -EINVAL;
 
+	/* Enforce BTF for userspace, unless dealing with legacy kABI */
+	if (attr->value_size != 4 &&
+	    (!attr->btf_key_type_id || !attr->btf_value_type_id))
+		return -EOPNOTSUPP;
+
+	/* Mark BTF offsets as invalid */
+	dtab->cfg.btf_offset.ifindex  = -1;
+	dtab->cfg.btf_offset.bpf_prog = -1;
+
 	/* Lookup returns a pointer straight to dev->ifindex, so make sure the
 	 * verifier prevents writes from the BPF side
 	 */
@@ -199,6 +229,119 @@  static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	return &dtab->map;
 }
 
+struct expect {
+	u8 btf_kind;
+	bool mandatory;
+	int bit_offset;
+	int size;
+	const char *name;
+};
+
+static int btf_find_expect_layout_offset(const struct btf *btf,
+					 const struct btf_type *value_type,
+					 const struct expect *layout)
+{
+	const struct btf_member *member;
+	u32 i, off = -ENOENT;
+
+	for_each_member(i, value_type, member) {
+		const struct btf_type *member_type;
+		const char *member_name;
+
+		member_type = btf_type_skip_modifiers(btf, member->type, NULL);
+		if (BTF_INFO_KIND(member_type->info) != layout->btf_kind) {
+			continue;
+		}
+
+		member_name = btf_name_by_offset(btf, member->name_off);
+		if (!member_name)
+			return -EINVAL;
+
+		if (strcmp(layout->name, member_name))
+			continue;
+
+		if (layout->size > 0 &&  // btf_type_has_size(member_type) &&
+		    member_type->size != layout->size)
+			continue;
+
+		off = btf_member_bit_offset(value_type, member);
+		if (layout->bit_offset > 0 &&
+		    layout->bit_offset != off) {
+			off = -ENOENT;
+			continue;
+		}
+
+		return off;
+	}
+	return off;
+}
+
+/* Expected BTF layout that matches struct bpf_devmap_val */
+static const struct expect layout[] = {
+	{BTF_KIND_INT,		true,	 0,	 4,	"ifindex"},
+	{BTF_KIND_UNION,	false,	32,	 4,	"bpf_prog"},
+	{BTF_KIND_STRUCT,	false,	-1,	-1,	"storage"}
+};
+
+static int dev_map_check_btf(const struct bpf_map *map,
+			     const struct btf *btf,
+			     const struct btf_type *key_type,
+			     const struct btf_type *value_type)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	u32 found_members_cnt = 0;
+	u32 int_data;
+	int off;
+	u32 i;
+
+	/* Validate KEY type and size */
+	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+		return -EOPNOTSUPP;
+
+	int_data = *(u32 *)(key_type + 1);
+	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0)
+		return -EOPNOTSUPP;
+
+	/* Validate that VALUE has a layout that maps to struct bpf_devmap_val
+	 * - With a flexible size of member 'storage'.
+	 */
+
+	if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT)
+		return -EOPNOTSUPP;
+
+	/* Struct/union members in BTF must not exceed (max) expected members */
+	if (btf_type_vlen(value_type) > ARRAY_SIZE(layout))
+			return -E2BIG;
+
+	for (i = 0; i < ARRAY_SIZE(layout); i++) {
+		off = btf_find_expect_layout_offset(btf, value_type, &layout[i]);
+
+		if (off < 0 && layout[i].mandatory)
+			return -EUCLEAN;
+
+		if (off >= 0)
+			found_members_cnt++;
+
+		/* Transfer layout config to map */
+		switch (i) {
+		case 0:
+			dtab->cfg.btf_offset.ifindex = off;
+			break;
+		case 1:
+			dtab->cfg.btf_offset.bpf_prog = off;
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Detect if BTF/vlen have members that were not found */
+	if (btf_type_vlen(value_type) > found_members_cnt)
+		return -E2BIG;
+
+	return 0;
+}
+
 static void dev_map_free(struct bpf_map *map)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@@ -601,42 +744,53 @@  static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
 	return ret;
 }
 
+static inline bool map_value_has_bpf_prog(const struct bpf_dtab *dtab)
+{
+	return dtab->cfg.btf_offset.bpf_prog >= 0;
+}
+
 static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
-						    struct bpf_dtab *dtab,
+						    struct bpf_map *map,
 						    struct bpf_devmap_val *val,
 						    unsigned int idx)
 {
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_prog *prog = NULL;
 	struct bpf_dtab_netdev *dev;
 
-	dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
+	dev = kzalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
 			   dtab->map.numa_node);
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
 
+	/* Member: ifindex is mandatory, both BTF and kABI */
 	dev->dev = dev_get_by_index(net, val->ifindex);
 	if (!dev->dev)
 		goto err_out;
 
-	if (val->bpf_prog.fd >= 0) {
-		prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
-					     BPF_PROG_TYPE_XDP, false);
-		if (IS_ERR(prog))
-			goto err_put_dev;
-		if (prog->expected_attach_type != BPF_XDP_DEVMAP)
-			goto err_put_prog;
+	/* Member: bpf_prog union is optional, but has a fixed offset if present */
+	if (map_value_has_bpf_prog(dtab)) {
+		if (val->bpf_prog.fd >= 0) {
+			prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
+						     BPF_PROG_TYPE_XDP, false);
+			if (IS_ERR(prog))
+				goto err_put_dev;
+			if (prog->expected_attach_type != BPF_XDP_DEVMAP)
+				goto err_put_prog;
+		}
+		if (prog) {
+			dev->xdp_prog = prog;
+			val->bpf_prog.id = prog->aux->id;
+		} else {
+			dev->xdp_prog = NULL;
+			val->bpf_prog.id = 0;
+		}
 	}
-
 	dev->idx = idx;
 	dev->dtab = dtab;
-	if (prog) {
-		dev->xdp_prog = prog;
-		dev->val.bpf_prog.id = prog->aux->id;
-	} else {
-		dev->xdp_prog = NULL;
-		dev->val.bpf_prog.id = 0;
-	}
-	dev->val.ifindex = val->ifindex;
+
+	/* After adjustment copy map value to get storage area */
+	memcpy(&dev->val, val, map->value_size);
 
 	return dev;
 err_put_prog:
@@ -672,7 +826,7 @@  static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
 		if (val.bpf_prog.fd != -1)
 			return -EINVAL;
 	} else {
-		dev = __dev_map_alloc_node(net, dtab, &val, i);
+		dev = __dev_map_alloc_node(net, map, &val, i);
 		if (IS_ERR(dev))
 			return PTR_ERR(dev);
 	}
@@ -717,7 +871,7 @@  static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
 	if (old_dev && (map_flags & BPF_NOEXIST))
 		goto out_err;
 
-	dev = __dev_map_alloc_node(net, dtab, &val, idx);
+	dev = __dev_map_alloc_node(net, map, &val, idx);
 	if (IS_ERR(dev)) {
 		err = PTR_ERR(dev);
 		goto out_err;
@@ -762,7 +916,7 @@  const struct bpf_map_ops dev_map_ops = {
 	.map_lookup_elem = dev_map_lookup_elem,
 	.map_update_elem = dev_map_update_elem,
 	.map_delete_elem = dev_map_delete_elem,
-	.map_check_btf = map_check_no_btf,
+	.map_check_btf = dev_map_check_btf,
 };
 
 const struct bpf_map_ops dev_map_hash_ops = {
@@ -772,7 +926,7 @@  const struct bpf_map_ops dev_map_hash_ops = {
 	.map_lookup_elem = dev_map_hash_lookup_elem,
 	.map_update_elem = dev_map_hash_update_elem,
 	.map_delete_elem = dev_map_hash_delete_elem,
-	.map_check_btf = map_check_no_btf,
+	.map_check_btf = dev_map_check_btf,
 };
 
 static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,