diff mbox series

[bpf-next,1/5] bpf: Handle 8-byte values in DEVMAP and DEVMAP_HASH

Message ID 20200527010905.48135-2-dsahern@kernel.org
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series bpf: Add support for XDP programs in DEVMAP entries | expand

Commit Message

David Ahern May 27, 2020, 1:09 a.m. UTC
Add support to DEVMAP and DEVMAP_HASH to support 8-byte values as a
<device index, program id> pair. To do this, a new struct is needed in
bpf_dtab_netdev to hold the values to return on lookup.

Signed-off-by: David Ahern <dsahern@kernel.org>
---
 kernel/bpf/devmap.c | 56 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 13 deletions(-)

Comments

Jesper Dangaard Brouer May 27, 2020, 10:26 a.m. UTC | #1
On Tue, 26 May 2020 19:09:01 -0600
David Ahern <dsahern@kernel.org> wrote:

> Add support to DEVMAP and DEVMAP_HASH to support 8-byte values as a
> <device index, program id> pair. To do this, a new struct is needed in
> bpf_dtab_netdev to hold the values to return on lookup.
> 
> Signed-off-by: David Ahern <dsahern@kernel.org>
> ---
>  kernel/bpf/devmap.c | 56 ++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 43 insertions(+), 13 deletions(-)
> 
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index a51d9fb7a359..95db6d8beebc 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -60,12 +60,22 @@ struct xdp_dev_bulk_queue {
>  	unsigned int count;
>  };
>  
> +/* devmap value can be dev index or dev index + prog fd/id */
> +struct dev_map_ext_val {
> +	u32 ifindex;	/* must be first for compat with 4-byte values */
> +	union {
> +		int prog_fd;  /* prog fd on write */
> +		u32 prog_id;  /* prog id on read */
> +	};
> +};

This smells like a BTF structure.
Andrii BTF does support union's right?


>  struct bpf_dtab_netdev {
>  	struct net_device *dev; /* must be first member, due to tracepoint */
>  	struct hlist_node index_hlist;
>  	struct bpf_dtab *dtab;
>  	struct rcu_head rcu;
>  	unsigned int idx;
> +	struct dev_map_ext_val val;
>  };
>  
>  struct bpf_dtab {
> @@ -108,9 +118,13 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
>  	u64 cost = 0;
>  	int err;
>  
> -	/* check sanity of attributes */
> +	/* check sanity of attributes. 2 value sizes supported:
> +	 * 4 bytes: ifindex
> +	 * 8 bytes: ifindex + prog fd
> +	 */
>  	if (attr->max_entries == 0 || attr->key_size != 4 ||
> -	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
> +	    (attr->value_size != 4 && attr->value_size != 8) ||

IMHO we really need to leverage BTF here, as I'm sure we need to do more
extensions, and this size matching will get more and more unmaintainable.

With BTF in place, dumping the map via bpftool, will also make the
fields "self-documenting".

I will try to implement something that uses BTF for this case (and cpumap).
David Ahern May 27, 2020, 1:56 p.m. UTC | #2
On 5/27/20 4:26 AM, Jesper Dangaard Brouer wrote:
>> @@ -108,9 +118,13 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
>>  	u64 cost = 0;
>>  	int err;
>>  
>> -	/* check sanity of attributes */
>> +	/* check sanity of attributes. 2 value sizes supported:
>> +	 * 4 bytes: ifindex
>> +	 * 8 bytes: ifindex + prog fd
>> +	 */
>>  	if (attr->max_entries == 0 || attr->key_size != 4 ||
>> -	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
>> +	    (attr->value_size != 4 && attr->value_size != 8) ||
> 
> IMHO we really need to leverage BTF here, as I'm sure we need to do more
> extensions, and this size matching will get more and more unmaintainable.
> 
> With BTF in place, dumping the map via bpftool, will also make the
> fields "self-documenting".
> 
> I will try to implement something that uses BTF for this case (and cpumap).
> 

as mentioned in a past response, BTF does not make any fields special
and this code should not assume it either. You need to know precisely
which 4 bytes is the program fd that needs to be looked up, and that
AFAIK is beyond the scope of BTF.
David Ahern May 27, 2020, 2:27 p.m. UTC | #3
On 5/27/20 4:26 AM, Jesper Dangaard Brouer wrote:
> IMHO we really need to leverage BTF here, as I'm sure we need to do more
> extensions, and this size matching will get more and more unmaintainable.
> 
> With BTF in place, dumping the map via bpftool, will also make the
> fields "self-documenting".

furthermore, the kernel is changing the value - an fd is passed in and
an id is returned. I do not see how any of this fits into BTF.
Toke Høiland-Jørgensen May 27, 2020, 2:57 p.m. UTC | #4
David Ahern <dsahern@gmail.com> writes:

> On 5/27/20 4:26 AM, Jesper Dangaard Brouer wrote:
>>> @@ -108,9 +118,13 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
>>>  	u64 cost = 0;
>>>  	int err;
>>>  
>>> -	/* check sanity of attributes */
>>> +	/* check sanity of attributes. 2 value sizes supported:
>>> +	 * 4 bytes: ifindex
>>> +	 * 8 bytes: ifindex + prog fd
>>> +	 */
>>>  	if (attr->max_entries == 0 || attr->key_size != 4 ||
>>> -	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
>>> +	    (attr->value_size != 4 && attr->value_size != 8) ||
>> 
>> IMHO we really need to leverage BTF here, as I'm sure we need to do more
>> extensions, and this size matching will get more and more unmaintainable.
>> 
>> With BTF in place, dumping the map via bpftool, will also make the
>> fields "self-documenting".
>> 
>> I will try to implement something that uses BTF for this case (and cpumap).
>> 
>
> as mentioned in a past response, BTF does not make any fields special
> and this code should not assume it either.

Either way you're creating a contract where the kernel says "first four
bytes is the ifindex, second four bytes is the fd/id". BTF just makes
that explicit, and allows userspace to declare that it agrees this is
what the fields should mean. And gives us more flexibility when
extending the API later than just adding stuff at the end and looking at
the size...

> You need to know precisely which 4 bytes is the program fd that needs
> to be looked up, and that AFAIK is beyond the scope of BTF.

Exactly - BTF is a way for userspace to explicitly tell the kernel "I am
going to put the fd into these four bytes of the value field", instead
of the kernel implicitly assuming it's always bytes 5-8.

-Toke
David Ahern May 27, 2020, 3:24 p.m. UTC | #5
On 5/27/20 8:57 AM, Toke Høiland-Jørgensen wrote:
> 
> Either way you're creating a contract where the kernel says "first four
> bytes is the ifindex, second four bytes is the fd/id". BTF just makes
> that explicit, and allows userspace to declare that it agrees this is
> what the fields should mean. And gives us more flexibility when
> extending the API later than just adding stuff at the end and looking at
> the size...
> 
>> You need to know precisely which 4 bytes is the program fd that needs
>> to be looked up, and that AFAIK is beyond the scope of BTF.
> 
> Exactly - BTF is a way for userspace to explicitly tell the kernel "I am
> going to put the fd into these four bytes of the value field", instead
> of the kernel implicitly assuming it's always bytes 5-8.
> 

Really, I should define that struct in uapi/linux/bpf.h. The ifindex
field has special meaning: the kernel needs to convert it to a
net_device. The prog_fd field has special meaning: it should map to a
bpf program.

This use case is not in BTF's scope. But, prove me wrong. Ideas are
cheap; code is hard. Show me the code that implements your idea.
Jesper Dangaard Brouer May 27, 2020, 3:30 p.m. UTC | #6
On Wed, 27 May 2020 08:27:36 -0600
David Ahern <dsahern@gmail.com> wrote:

> On 5/27/20 4:26 AM, Jesper Dangaard Brouer wrote:
> > IMHO we really need to leverage BTF here, as I'm sure we need to do more
> > extensions, and this size matching will get more and more unmaintainable.
> > 
> > With BTF in place, dumping the map via bpftool, will also make the
> > fields "self-documenting".  
> 
> furthermore, the kernel is changing the value - an fd is passed in and
> an id is returned. I do not see how any of this fits into BTF.

It can, as BTF actually support union's (I just tested that).

For the sake of end-users understanding this, I do wonder if it is
better to define the struct without the union, and have longer names
that will be part of BTF description, e.g (dumped via bpftool):

 struct dev_map_ext_val {
        u32 ifindex;
        int bpf_prog_fd_write;
        u32 bpf_prog_id_read;
 };

But a union would also work (also tested via BPF loading and BTF dumpinmg):

 struct dev_map_ext_val {
        u32 ifindex;
        union {
                int bpf_prog_fd_write;
                u32 bpf_prog_id_read;
        };
 };
David Ahern May 27, 2020, 6:38 p.m. UTC | #7
On 5/27/20 9:30 AM, Jesper Dangaard Brouer wrote:
> On Wed, 27 May 2020 08:27:36 -0600
> David Ahern <dsahern@gmail.com> wrote:
> 
>> On 5/27/20 4:26 AM, Jesper Dangaard Brouer wrote:
>>> IMHO we really need to leverage BTF here, as I'm sure we need to do more
>>> extensions, and this size matching will get more and more unmaintainable.
>>>
>>> With BTF in place, dumping the map via bpftool, will also make the
>>> fields "self-documenting".  
>>
>> furthermore, the kernel is changing the value - an fd is passed in and
>> an id is returned. I do not see how any of this fits into BTF.
> 
> It can, as BTF actually support union's (I just tested that).
> 

You are trying to force an arbitrary kernel API with BTF, and it adds no
value over a static struct definition that grows as new capability is added.

DEVMAPs (and CPUMAP) are not like other maps - the fields in the entries
have meaning to *the kernel*. That means the kernel needs to know the
fields and what they represent - be it a device index, a program fd
converted to an id, a cpuid, or any future extension. If you add 'u32
foo' to this interface, you still need kernel code to understand 'foo'
maps to resource 'bar' via lookup function 'f'. You can not do this
dynamically. BTF does not make sense here.

Furthermore, if the kernel does not understand a field then it better
return an error code - EINVAL so the user knows expected functionality
is not supported by that kernel.

> 
> But a union would also work (also tested via BPF loading and BTF dumpinmg):
> 
>  struct dev_map_ext_val {
>         u32 ifindex;
>         union {
>                 int bpf_prog_fd_write;
>                 u32 bpf_prog_id_read;
>         };
>  };
> 

I prefer the union and without the verb; comments convey the same
meaning. The fd has no meaning beyond the process that created the entry
so it does not need to be kept around bloating the interface.

For v2, I moved the struct to uapi/linux/bpf.h and added comments.
diff mbox series

Patch

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index a51d9fb7a359..95db6d8beebc 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -60,12 +60,22 @@  struct xdp_dev_bulk_queue {
 	unsigned int count;
 };
 
+/* devmap value can be dev index or dev index + prog fd/id */
+struct dev_map_ext_val {
+	u32 ifindex;	/* must be first for compat with 4-byte values */
+	union {
+		int prog_fd;  /* prog fd on write */
+		u32 prog_id;  /* prog id on read */
+	};
+};
+
 struct bpf_dtab_netdev {
 	struct net_device *dev; /* must be first member, due to tracepoint */
 	struct hlist_node index_hlist;
 	struct bpf_dtab *dtab;
 	struct rcu_head rcu;
 	unsigned int idx;
+	struct dev_map_ext_val val;
 };
 
 struct bpf_dtab {
@@ -108,9 +118,13 @@  static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
 	u64 cost = 0;
 	int err;
 
-	/* check sanity of attributes */
+	/* check sanity of attributes. 2 value sizes supported:
+	 * 4 bytes: ifindex
+	 * 8 bytes: ifindex + prog fd
+	 */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
+	    (attr->value_size != 4 && attr->value_size != 8) ||
+	    attr->map_flags & ~DEV_CREATE_FLAG_MASK)
 		return -EINVAL;
 
 	/* Lookup returns a pointer straight to dev->ifindex, so make sure the
@@ -472,18 +486,15 @@  int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
 static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
-	struct net_device *dev = obj ? obj->dev : NULL;
 
-	return dev ? &dev->ifindex : NULL;
+	return obj ? &obj->val : NULL;
 }
 
 static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
 								*(u32 *)key);
-	struct net_device *dev = obj ? obj->dev : NULL;
-
-	return dev ? &dev->ifindex : NULL;
+	return obj ? &obj->val : NULL;
 }
 
 static void __dev_map_entry_free(struct rcu_head *rcu)
@@ -552,15 +563,18 @@  static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
 		return ERR_PTR(-ENOMEM);
 
 	dev->dev = dev_get_by_index(net, ifindex);
-	if (!dev->dev) {
-		kfree(dev);
-		return ERR_PTR(-EINVAL);
-	}
+	if (!dev->dev)
+		goto err_out;
 
 	dev->idx = idx;
 	dev->dtab = dtab;
 
+	dev->val.ifindex = ifindex;
+
 	return dev;
+err_out:
+	kfree(dev);
+	return ERR_PTR(-EINVAL);
 }
 
 static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
@@ -568,8 +582,16 @@  static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *dev, *old_dev;
-	u32 ifindex = *(u32 *)value;
 	u32 i = *(u32 *)key;
+	u32 ifindex;
+
+	if (map->value_size == 4) {
+		ifindex = *(u32 *)value;
+	} else {
+		struct dev_map_ext_val *val = value;
+
+		ifindex = val->ifindex;
+	}
 
 	if (unlikely(map_flags > BPF_EXIST))
 		return -EINVAL;
@@ -609,10 +631,18 @@  static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_dtab_netdev *dev, *old_dev;
-	u32 ifindex = *(u32 *)value;
 	u32 idx = *(u32 *)key;
 	unsigned long flags;
 	int err = -EEXIST;
+	u32 ifindex;
+
+	if (map->value_size == 4) {
+		ifindex = *(u32 *)value;
+	} else {
+		struct dev_map_ext_val *val = value;
+
+		ifindex = val->ifindex;
+	}
 
 	if (unlikely(map_flags > BPF_EXIST || !ifindex))
 		return -EINVAL;