diff mbox series

[RFC,bpf-next,2/6] net: xdp: RX meta data infrastructure

Message ID 20180627024615.17856-3-saeedm@mellanox.com
State RFC, archived
Delegated to: David Miller
Headers show
Series XDP RX device meta data acceleration (WIP) | expand

Commit Message

Saeed Mahameed June 27, 2018, 2:46 a.m. UTC
The idea from this patch is to define a well known structure for XDP meta
data fields format and offset placement inside the xdp data meta buffer.

For that driver will need some static information to know what to
provide and where, enters struct xdp_md_info and xdp_md_info_arr:

struct xdp_md_info {
       __u16 present:1;
       __u16 offset:15; /* offset from data_meta in xdp_md buffer */
};

/* XDP meta data offsets info array
 * present bit describes if a meta data is or will be present in xdp_md buff
 * offset describes where a meta data is or should be placed in xdp_md buff
 *
 * Kernel builds this array using xdp_md_info_build helper on demand.
 * User space builds it statically in the xdp program.
 */
typedef struct xdp_md_info xdp_md_info_arr[XDP_DATA_META_MAX];

Offsets in xdp_md_info_arr are always in ascending order and only for
requested meta data:
example : flags = XDP_FLAGS_META_HASH | XDP_FLAGS_META_VLAN;

xdp_md_info_arr mdi = {
	[XDP_DATA_META_HASH] = {.offset = 0, .present = 1},
	[XDP_DATA_META_MARK] = {.offset = 0, .present = 0},
	[XDP_DATA_META_VLAN] = {.offset = sizeof(struct xdp_md_hash), .present = 1},
	[XDP_DATA_META_CSUM] = {.offset = 0, .present = 0},
}

i.e: hash fields will always appear first then the vlan for every
xdp_md.

Once requested to provide xdp meta data, device driver will use a pre-built
xdp_md_info_arr which was built via xdp_md_info_build on xdp setup,
xdp_md_info_arr will tell the driver what is the offset of each meta data.

This patch also provides helper functions for the device drivers to store
meta data into xdp_buff, and helper function for XDP programs to fetch
specific xdp meta data from xdp_md buffer.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 include/linux/netdevice.h    |   1 +
 include/net/xdp.h            |   6 ++
 include/uapi/linux/bpf.h     | 114 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/if_link.h |  16 +++--
 net/core/dev.c               |  21 +++++++
 5 files changed, 152 insertions(+), 6 deletions(-)

Comments

Jesper Dangaard Brouer June 27, 2018, 2:15 p.m. UTC | #1
On Tue, 26 Jun 2018 19:46:11 -0700
Saeed Mahameed <saeedm@dev.mellanox.co.il> wrote:

> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 2deea7166a34..afe302613ae1 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -138,6 +138,12 @@ xdp_set_data_meta_invalid(struct xdp_buff *xdp)
>  	xdp->data_meta = xdp->data + 1;
>  }
>  
> +static __always_inline void
> +xdp_reset_data_meta(struct xdp_buff *xdp)
> +{
> +	xdp->data_meta = xdp->data_hard_start;
> +}

This is WRONG ... it should be:

 xdp->data_meta = xdp->data;
Saeed Mahameed June 27, 2018, 5:55 p.m. UTC | #2
On Wed, 2018-06-27 at 16:15 +0200, Jesper Dangaard Brouer wrote:
> On Tue, 26 Jun 2018 19:46:11 -0700
> Saeed Mahameed <saeedm@dev.mellanox.co.il> wrote:
> 
> > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > index 2deea7166a34..afe302613ae1 100644
> > --- a/include/net/xdp.h
> > +++ b/include/net/xdp.h
> > @@ -138,6 +138,12 @@ xdp_set_data_meta_invalid(struct xdp_buff
> > *xdp)
> >  	xdp->data_meta = xdp->data + 1;
> >  }
> >  
> > +static __always_inline void
> > +xdp_reset_data_meta(struct xdp_buff *xdp)
> > +{
> > +	xdp->data_meta = xdp->data_hard_start;
> > +}
> 
> This is WRONG ... it should be:
> 
>  xdp->data_meta = xdp->data;
> 

maybe the name of the function is not suitable for the use case.
i need to set xdp->data_meta = xdp->data in the driver to start storing
meta data.
Daniel Borkmann July 2, 2018, 8:01 a.m. UTC | #3
On 06/27/2018 07:55 PM, Saeed Mahameed wrote:
> On Wed, 2018-06-27 at 16:15 +0200, Jesper Dangaard Brouer wrote:
>> On Tue, 26 Jun 2018 19:46:11 -0700
>> Saeed Mahameed <saeedm@dev.mellanox.co.il> wrote:
>>
>>> diff --git a/include/net/xdp.h b/include/net/xdp.h
>>> index 2deea7166a34..afe302613ae1 100644
>>> --- a/include/net/xdp.h
>>> +++ b/include/net/xdp.h
>>> @@ -138,6 +138,12 @@ xdp_set_data_meta_invalid(struct xdp_buff
>>> *xdp)
>>>  	xdp->data_meta = xdp->data + 1;
>>>  }
>>>  
>>> +static __always_inline void
>>> +xdp_reset_data_meta(struct xdp_buff *xdp)
>>> +{
>>> +	xdp->data_meta = xdp->data_hard_start;
>>> +}
>>
>> This is WRONG ... it should be:
>>
>>  xdp->data_meta = xdp->data;
> 
> maybe the name of the function is not suitable for the use case.
> i need to set xdp->data_meta = xdp->data in the driver to start storing
> meta data.

The xdp_set_data_meta_invalid() is a straight forward way for XDP drivers
to tell they do not support xdp->data_meta, since setting xdp->data + 1 will
fail the checks for direct (meta) packet access in the BPF code and at the
same time bpf_xdp_adjust_meta() will know to bail out with error when program
attempts to make headroom for meta data that driver cannot handle later on.

So later setting 'xdp->data_meta = xdp->data' to enable it is as setting any
other of the initializers in xdp_buff, and done so today in the i40e, ixgbe,
ixgbevf and nfp drivers. (Theoretically it wouldn't have to be exactly set to
xdp->data, but anything <= xdp->data works, if the driver would prepend info
from hw in front of it that program can then use or later override.)

Thanks,
Daniel
Alexei Starovoitov July 3, 2018, 11:01 p.m. UTC | #4
On Tue, Jun 26, 2018 at 07:46:11PM -0700, Saeed Mahameed wrote:
> The idea from this patch is to define a well known structure for XDP meta
> data fields format and offset placement inside the xdp data meta buffer.
> 
> For that driver will need some static information to know what to
> provide and where, enters struct xdp_md_info and xdp_md_info_arr:
> 
> struct xdp_md_info {
>        __u16 present:1;
>        __u16 offset:15; /* offset from data_meta in xdp_md buffer */
> };
> 
> /* XDP meta data offsets info array
>  * present bit describes if a meta data is or will be present in xdp_md buff
>  * offset describes where a meta data is or should be placed in xdp_md buff
>  *
>  * Kernel builds this array using xdp_md_info_build helper on demand.
>  * User space builds it statically in the xdp program.
>  */
> typedef struct xdp_md_info xdp_md_info_arr[XDP_DATA_META_MAX];
> 
> Offsets in xdp_md_info_arr are always in ascending order and only for
> requested meta data:
> example : flags = XDP_FLAGS_META_HASH | XDP_FLAGS_META_VLAN;
> 
> xdp_md_info_arr mdi = {
> 	[XDP_DATA_META_HASH] = {.offset = 0, .present = 1},
> 	[XDP_DATA_META_MARK] = {.offset = 0, .present = 0},
> 	[XDP_DATA_META_VLAN] = {.offset = sizeof(struct xdp_md_hash), .present = 1},
> 	[XDP_DATA_META_CSUM] = {.offset = 0, .present = 0},
> }
> 
> i.e: hash fields will always appear first then the vlan for every
> xdp_md.
> 
> Once requested to provide xdp meta data, device driver will use a pre-built
> xdp_md_info_arr which was built via xdp_md_info_build on xdp setup,
> xdp_md_info_arr will tell the driver what is the offset of each meta data.
> 
> This patch also provides helper functions for the device drivers to store
> meta data into xdp_buff, and helper function for XDP programs to fetch
> specific xdp meta data from xdp_md buffer.
> 
> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
...
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 59b19b6a40d7..e320e7421565 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2353,6 +2353,120 @@ struct xdp_md {
>  	__u32 rx_queue_index;  /* rxq->queue_index  */
>  };
>  
> +enum {
> +	XDP_DATA_META_HASH = 0,
> +	XDP_DATA_META_MARK,
> +	XDP_DATA_META_VLAN,
> +	XDP_DATA_META_CSUM_COMPLETE,
> +
> +	XDP_DATA_META_MAX,
> +};
> +
> +struct xdp_md_hash {
> +	__u32 hash;
> +	__u8  type;
> +} __attribute__((aligned(4)));
> +
> +struct xdp_md_mark {
> +	__u32 mark;
> +} __attribute__((aligned(4)));
> +
> +struct xdp_md_vlan {
> +	__u16 vlan;
> +} __attribute__((aligned(4)));
> +
> +struct xdp_md_csum {
> +	__u16 csum;
> +} __attribute__((aligned(4)));
> +
> +static const __u16 xdp_md_size[XDP_DATA_META_MAX] = {
> +	sizeof(struct xdp_md_hash), /* XDP_DATA_META_HASH */
> +	sizeof(struct xdp_md_mark), /* XDP_DATA_META_MARK */
> +	sizeof(struct xdp_md_vlan), /* XDP_DATA_META_VLAN */
> +	sizeof(struct xdp_md_csum), /* XDP_DATA_META_CSUM_COMPLETE */
> +};
> +
> +struct xdp_md_info {
> +	__u16 present:1;
> +	__u16 offset:15; /* offset from data_meta in xdp_md buffer */
> +};
> +
> +/* XDP meta data offsets info array
> + * present bit describes if a meta data is or will be present in xdp_md buff
> + * offset describes where a meta data is or should be placed in xdp_md buff
> + *
> + * Kernel builds this array using xdp_md_info_build helper on demand.
> + * User space builds it statically in the xdp program.
> + */
> +typedef struct xdp_md_info xdp_md_info_arr[XDP_DATA_META_MAX];
> +
> +static __always_inline __u8
> +xdp_data_meta_present(xdp_md_info_arr mdi, int meta_data)
> +{
> +	return mdi[meta_data].present;
> +}
> +
> +static __always_inline void
> +xdp_data_meta_set_hash(xdp_md_info_arr mdi, void *data_meta, __u32 hash, __u8 htype)
> +{
> +	struct xdp_md_hash *hash_md;
> +
> +	hash_md = (struct xdp_md_hash *)((char*)data_meta + mdi[XDP_DATA_META_HASH].offset);
> +	hash_md->hash = hash;
> +	hash_md->type = htype;
> +}
> +
> +static __always_inline struct xdp_md_hash *
> +xdp_data_meta_get_hash(xdp_md_info_arr mdi, void *data_meta)
> +{
> +	return (struct xdp_md_hash *)((char*)data_meta + mdi[XDP_DATA_META_HASH].offset);
> +}

I'm afraid this is not scalable uapi.
This looks very much mlx specific. Every NIC vendor cannot add its own
struct definitions into uapi/bpf.h. It doesn't scale.
Also doing 15 bit bitfield extraction using bpf instructions is not that simple.
mlx should consider doing plain u8 or u16 fields in firmware instead.
The metadata that "hw" provides is a joint work of actual asic and firmware.
I suspect the format can and will change with different firmware.
Baking this stuff into uapi/bpf.h is not an option.
How about we make driver+firmware provide a BTF definition of metadata that they
can provide? There can be multiple definitions of such structs.
Then in userpsace we can have BTF->plain C converter.
(bpftool practically ready to do that already).
Then the programmer can take such generated C definition, add it to .h and include
it in their programs. llvm will compile the whole thing and will include BTF
of maps, progs and this md struct in the target elf file.
During loading the kernel can check that BTF in elf is matching one-to-one
to what driver+firmware are saying they support.
No ambiguity and no possibility of mistake, since offsets and field names
are verified.
Every driver can have their own BTF for md and their own special features.
We can try to standardize the names (like vlan and csum), so xdp programs
can stay relatively portable across NICs.
Such api will address exposing asic+firmware metadata to the xdp program.
Once we tackle this problem, we'll think how to do the backward config
(to do firmware reconfig for specific BTF definition of md supplied by the prog).
What people think?
Saeed Mahameed July 3, 2018, 11:52 p.m. UTC | #5
On Mon, 2018-07-02 at 10:01 +0200, Daniel Borkmann wrote:
> On 06/27/2018 07:55 PM, Saeed Mahameed wrote:
> > On Wed, 2018-06-27 at 16:15 +0200, Jesper Dangaard Brouer wrote:
> > > On Tue, 26 Jun 2018 19:46:11 -0700
> > > Saeed Mahameed <saeedm@dev.mellanox.co.il> wrote:
> > > 
> > > > diff --git a/include/net/xdp.h b/include/net/xdp.h
> > > > index 2deea7166a34..afe302613ae1 100644
> > > > --- a/include/net/xdp.h
> > > > +++ b/include/net/xdp.h
> > > > @@ -138,6 +138,12 @@ xdp_set_data_meta_invalid(struct xdp_buff
> > > > *xdp)
> > > >  	xdp->data_meta = xdp->data + 1;
> > > >  }
> > > >  
> > > > +static __always_inline void
> > > > +xdp_reset_data_meta(struct xdp_buff *xdp)
> > > > +{
> > > > +	xdp->data_meta = xdp->data_hard_start;
> > > > +}
> > > 
> > > This is WRONG ... it should be:
> > > 
> > >  xdp->data_meta = xdp->data;
> > 
> > maybe the name of the function is not suitable for the use case.
> > i need to set xdp->data_meta = xdp->data in the driver to start
> > storing
> > meta data.
> 
> The xdp_set_data_meta_invalid() is a straight forward way for XDP
> drivers
> to tell they do not support xdp->data_meta, since setting xdp->data +
> 1 will
> fail the checks for direct (meta) packet access in the BPF code and
> at the
> same time bpf_xdp_adjust_meta() will know to bail out with error when
> program
> attempts to make headroom for meta data that driver cannot handle
> later on.
> 
> So later setting 'xdp->data_meta = xdp->data' to enable it is as
> setting any
> other of the initializers in xdp_buff, and done so today in the i40e,
> ixgbe,
> ixgbevf and nfp drivers. (Theoretically it wouldn't have to be
> exactly set to
> xdp->data, but anything <= xdp->data works, if the driver would
> prepend info
> from hw in front of it that program can then use or later override.)
> 

Right ! As jesper pointer out as well, 
driver should set: xdp->data_meta = xdp->data
and then adjust it to the offset of the first meta data hint.
Saeed Mahameed July 4, 2018, 12:57 a.m. UTC | #6
On Tue, 2018-07-03 at 16:01 -0700, Alexei Starovoitov wrote:
> On Tue, Jun 26, 2018 at 07:46:11PM -0700, Saeed Mahameed wrote:
> > The idea from this patch is to define a well known structure for
> > XDP meta
> > data fields format and offset placement inside the xdp data meta
> > buffer.
> > 
> > For that driver will need some static information to know what to
> > provide and where, enters struct xdp_md_info and xdp_md_info_arr:
> > 
> > struct xdp_md_info {
> >        __u16 present:1;
> >        __u16 offset:15; /* offset from data_meta in xdp_md buffer
> > */
> > };
> > 
> > /* XDP meta data offsets info array
> >  * present bit describes if a meta data is or will be present in
> > xdp_md buff
> >  * offset describes where a meta data is or should be placed in
> > xdp_md buff
> >  *
> >  * Kernel builds this array using xdp_md_info_build helper on
> > demand.
> >  * User space builds it statically in the xdp program.
> >  */
> > typedef struct xdp_md_info xdp_md_info_arr[XDP_DATA_META_MAX];
> > 
> > Offsets in xdp_md_info_arr are always in ascending order and only
> > for
> > requested meta data:
> > example : flags = XDP_FLAGS_META_HASH | XDP_FLAGS_META_VLAN;
> > 
> > xdp_md_info_arr mdi = {
> > 	[XDP_DATA_META_HASH] = {.offset = 0, .present = 1},
> > 	[XDP_DATA_META_MARK] = {.offset = 0, .present = 0},
> > 	[XDP_DATA_META_VLAN] = {.offset = sizeof(struct xdp_md_hash),
> > .present = 1},
> > 	[XDP_DATA_META_CSUM] = {.offset = 0, .present = 0},
> > }
> > 
> > i.e: hash fields will always appear first then the vlan for every
> > xdp_md.
> > 
> > Once requested to provide xdp meta data, device driver will use a
> > pre-built
> > xdp_md_info_arr which was built via xdp_md_info_build on xdp setup,
> > xdp_md_info_arr will tell the driver what is the offset of each
> > meta data.
> > 
> > This patch also provides helper functions for the device drivers to
> > store
> > meta data into xdp_buff, and helper function for XDP programs to
> > fetch
> > specific xdp meta data from xdp_md buffer.
> > 
> > Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
> 
> ...
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 59b19b6a40d7..e320e7421565 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -2353,6 +2353,120 @@ struct xdp_md {
> >  	__u32 rx_queue_index;  /* rxq->queue_index  */
> >  };
> >  
> > +enum {
> > +	XDP_DATA_META_HASH = 0,
> > +	XDP_DATA_META_MARK,
> > +	XDP_DATA_META_VLAN,
> > +	XDP_DATA_META_CSUM_COMPLETE,
> > +
> > +	XDP_DATA_META_MAX,
> > +};
> > +
> > +struct xdp_md_hash {
> > +	__u32 hash;
> > +	__u8  type;
> > +} __attribute__((aligned(4)));
> > +
> > +struct xdp_md_mark {
> > +	__u32 mark;
> > +} __attribute__((aligned(4)));
> > +
> > +struct xdp_md_vlan {
> > +	__u16 vlan;
> > +} __attribute__((aligned(4)));
> > +
> > +struct xdp_md_csum {
> > +	__u16 csum;
> > +} __attribute__((aligned(4)));
> > +
> > +static const __u16 xdp_md_size[XDP_DATA_META_MAX] = {
> > +	sizeof(struct xdp_md_hash), /* XDP_DATA_META_HASH */
> > +	sizeof(struct xdp_md_mark), /* XDP_DATA_META_MARK */
> > +	sizeof(struct xdp_md_vlan), /* XDP_DATA_META_VLAN */
> > +	sizeof(struct xdp_md_csum), /* XDP_DATA_META_CSUM_COMPLETE
> > */
> > +};
> > +
> > +struct xdp_md_info {
> > +	__u16 present:1;
> > +	__u16 offset:15; /* offset from data_meta in xdp_md buffer
> > */
> > +};
> > +
> > +/* XDP meta data offsets info array
> > + * present bit describes if a meta data is or will be present in
> > xdp_md buff
> > + * offset describes where a meta data is or should be placed in
> > xdp_md buff
> > + *
> > + * Kernel builds this array using xdp_md_info_build helper on
> > demand.
> > + * User space builds it statically in the xdp program.
> > + */
> > +typedef struct xdp_md_info xdp_md_info_arr[XDP_DATA_META_MAX];
> > +
> > +static __always_inline __u8
> > +xdp_data_meta_present(xdp_md_info_arr mdi, int meta_data)
> > +{
> > +	return mdi[meta_data].present;
> > +}
> > +
> > +static __always_inline void
> > +xdp_data_meta_set_hash(xdp_md_info_arr mdi, void *data_meta, __u32
> > hash, __u8 htype)
> > +{
> > +	struct xdp_md_hash *hash_md;
> > +
> > +	hash_md = (struct xdp_md_hash *)((char*)data_meta +
> > mdi[XDP_DATA_META_HASH].offset);
> > +	hash_md->hash = hash;
> > +	hash_md->type = htype;
> > +}
> > +
> > +static __always_inline struct xdp_md_hash *
> > +xdp_data_meta_get_hash(xdp_md_info_arr mdi, void *data_meta)
> > +{
> > +	return (struct xdp_md_hash *)((char*)data_meta +
> > mdi[XDP_DATA_META_HASH].offset);
> > +}
> 
> I'm afraid this is not scalable uapi.
> This looks very much mlx specific. Every NIC vendor cannot add its
> own
> struct definitions into uapi/bpf.h. It doesn't scale.

No, this is not mlx specific, all you can find here is standard
hash/csum/vlan/flow mark, fields as described in SKB, we want them to
be well defined so you can construct SKBs with these standard values
from xdp_md with no surprises.

I agree this is not scalable, but this patch is not about arbitrary
vendor specific hints, with unknown sizes and shapes, i understand the
demand but we do need the notion of standard hints as well in order to
allow (driver/fw/hw)=>(xdp program)=>(stack) data flow, since the three
of those entities must agree on some well defined hints.

Anyway we discussed this in the last iovisor meeting and we agreed that
along with the standard hints suggested by this patch we want to allow
arbitrary hw specific hints, the question is how to have both.

> Also doing 15 bit bitfield extraction using bpf instructions is not
> that simple.
> mlx should consider doing plain u8 or u16 fields in firmware instead.
> The metadata that "hw" provides is a joint work of actual asic and
> firmware.

Right, this would be a huge flexibility improvement for future hw.
what about current HW, the driver can always translate the current
format to whatever format we decide on.

> I suspect the format can and will change with different firmware.
> Baking this stuff into uapi/bpf.h is not an option.

Yes but we still need to have a well defined structures for standard
hints, for the hints to flow into the stack as well. current and future
HW/FW/driver must respect the well known format of specific hints, such
as hash/csum/vlan, etc..

> How about we make driver+firmware provide a BTF definition of
> metadata that they
> can provide? There can be multiple definitions of such structs.
> Then in userpsace we can have BTF->plain C converter.
> (bpftool practically ready to do that already).
> Then the programmer can take such generated C definition, add it to
> .h and include
> it in their programs. llvm will compile the whole thing and will
> include BTF
> of maps, progs and this md struct in the target elf file.
> During loading the kernel can check that BTF in elf is matching one-
> to-one
> to what driver+firmware are saying they support.

Just thinking out loud, can't we do this at program load ? just run a
setup function in the xdp program to load nic md BTF definition into
the elf section ?

> No ambiguity and no possibility of mistake, since offsets and field
> names
> are verified.

But what about the dynamic nature of this feature ? Sometimes you only
want HW/Driver to provide a subset of whatever the HW can provide and
save md buffer for other stuff.

Yes a well defined format is favorable here, but we need to make sure
there is no computational overhead in data path just to extract each
field! for example if i want to know what is the offset of the hash
will i need to go parse (for every packet) the whole BTF definition of
metadata just to find the offset of type=hash ?

> Every driver can have their own BTF for md and their own special
> features.
> We can try to standardize the names (like vlan and csum), so xdp
> programs
> can stay relatively portable across NICs.

Yes this is a must.

> Such api will address exposing asic+firmware metadata to the xdp
> program.
> Once we tackle this problem, we'll think how to do the backward
> config
> (to do firmware reconfig for specific BTF definition of md supplied
> by the prog).
> What people think?
> 

For legacy HW, we can do it already in the driver, provide whatever the
prog requested, its only a matter of translation to the BTF format in
the driver xdp setup and pushing the values accordingly into the md
offsets on data path.

Question: how can you share the md BTF from the driver/HW with the xdp
program ?
Daniel Borkmann July 4, 2018, 7:51 a.m. UTC | #7
On 07/04/2018 02:57 AM, Saeed Mahameed wrote:
> On Tue, 2018-07-03 at 16:01 -0700, Alexei Starovoitov wrote:
[...]
>> How about we make driver+firmware provide a BTF definition of
>> metadata that they
>> can provide? There can be multiple definitions of such structs.
>> Then in userpsace we can have BTF->plain C converter.
>> (bpftool practically ready to do that already).
>> Then the programmer can take such generated C definition, add it to
>> .h and include
>> it in their programs. llvm will compile the whole thing and will
>> include BTF
>> of maps, progs and this md struct in the target elf file.
>> During loading the kernel can check that BTF in elf is matching one-
>> to-one
>> to what driver+firmware are saying they support.

I do like the above idea of utilizing BTF for this, seems like a good fit.

> Just thinking out loud, can't we do this at program load ? just run a
> setup function in the xdp program to load nic md BTF definition into
> the elf section ?
> 
>> No ambiguity and no possibility of mistake, since offsets and field
>> names
>> are verified.
> 
> But what about the dynamic nature of this feature ? Sometimes you only
> want HW/Driver to provide a subset of whatever the HW can provide and
> save md buffer for other stuff.
> 
> Yes a well defined format is favorable here, but we need to make sure
> there is no computational overhead in data path just to extract each
> field! for example if i want to know what is the offset of the hash
> will i need to go parse (for every packet) the whole BTF definition of
> metadata just to find the offset of type=hash ?

I don't think this would be the case that you'd need to walk BTF in fast
path here. In the ideal case, the only thing that driver would need to do
in fast path would be to set proper xdp->data_meta offset and _that_ would
be it. For the rest, program would know how to access the data since it's
already aware of it from BTF definition the driver provided. Other drivers
which would be less flexible on that regard would internally prep the buffer
based on the progs needs more or less similar as in mlx5e_xdp_fill_data_meta(),
but it would be really up to the driver how to handle this internally. The
BTF it would check at XDP setup time to do the configuration needed in the
driver. Verifier would only check BTF, pass it along for XDP setup, prog
rewrites in verifier aren't even needed since LLVM compiled everything
already.

>> Every driver can have their own BTF for md and their own special
>> features.
>> We can try to standardize the names (like vlan and csum), so xdp
>> programs
>> can stay relatively portable across NICs.
> 
> Yes this is a must.

Agree, there needs to be a basic common set that would be provided by
every XDP aware driver.

>> Such api will address exposing asic+firmware metadata to the xdp
>> program.
>> Once we tackle this problem, we'll think how to do the backward
>> config
>> (to do firmware reconfig for specific BTF definition of md supplied
>> by the prog).
>> What people think?
> 
> For legacy HW, we can do it already in the driver, provide whatever the
> prog requested, its only a matter of translation to the BTF format in
> the driver xdp setup and pushing the values accordingly into the md
> offsets on data path.
> 
> Question: how can you share the md BTF from the driver/HW with the xdp
> program ?
I think this would likely be a new query as in XDP_QUERY_META_BTF
implemented in ndo_bpf callback and then exported e.g. via bpf(2)
or netlink such that bpftool can generate BTF -> C from there for the
program to include later in compilation.

Thanks,
Daniel
Jakub Kicinski July 5, 2018, 5:18 p.m. UTC | #8
On Wed, 4 Jul 2018 09:51:54 +0200, Daniel Borkmann wrote:
> On 07/04/2018 02:57 AM, Saeed Mahameed wrote:
> > On Tue, 2018-07-03 at 16:01 -0700, Alexei Starovoitov wrote:  
> >> How about we make driver+firmware provide a BTF definition of
> >> metadata that they
> >> can provide? There can be multiple definitions of such structs.
> >> Then in userpsace we can have BTF->plain C converter.
> >> (bpftool practically ready to do that already).
> >> Then the programmer can take such generated C definition, add it to
> >> .h and include
> >> it in their programs. llvm will compile the whole thing and will
> >> include BTF
> >> of maps, progs and this md struct in the target elf file.
> >> During loading the kernel can check that BTF in elf is matching one-
> >> to-one
> >> to what driver+firmware are saying they support.  
> 
> I do like the above idea of utilizing BTF for this, seems like a good fit.
>
> > Just thinking out loud, can't we do this at program load ? just run a
> > setup function in the xdp program to load nic md BTF definition into
> > the elf section ?
> >   
> >> No ambiguity and no possibility of mistake, since offsets and field
> >> names
> >> are verified.  
> > 
> > But what about the dynamic nature of this feature ? Sometimes you only
> > want HW/Driver to provide a subset of whatever the HW can provide and
> > save md buffer for other stuff.
> > 
> > Yes a well defined format is favorable here, but we need to make sure
> > there is no computational overhead in data path just to extract each
> > field! for example if i want to know what is the offset of the hash
> > will i need to go parse (for every packet) the whole BTF definition of
> > metadata just to find the offset of type=hash ?  
> 
> I don't think this would be the case that you'd need to walk BTF in fast
> path here. In the ideal case, the only thing that driver would need to do
> in fast path would be to set proper xdp->data_meta offset and _that_ would
> be it. For the rest, program would know how to access the data since it's
> already aware of it from BTF definition the driver provided. Other drivers
> which would be less flexible on that regard would internally prep the buffer
> based on the progs needs more or less similar as in mlx5e_xdp_fill_data_meta(),
> but it would be really up to the driver how to handle this internally. The
> BTF it would check at XDP setup time to do the configuration needed in the
> driver. Verifier would only check BTF, pass it along for XDP setup, prog
> rewrites in verifier aren't even needed since LLVM compiled everything
> already.

I don't think we should force drivers to place such meta data in the
buffer.  The moment that happens we loose the "zero-touch" abilities
Jesper was trying to achieve.

Besides what happens to the meta data after XDP is finished.  We really
need the ability to communicate the modified fields further to the
stack.  With meta data in the buffer we don't really know if the
information place there after XDP finishes is still valid or did the
program overwrite it with something completely different.

I'm also not 100% on board with the argument that "future" FW can
reshuffle things whatever way it wants to.  Is the assumption that
future ASICs/FW will be designed to always use the "blessed" BTF
format?  Or will it be reconfigurable at runtime?

> >> Every driver can have their own BTF for md and their own special
> >> features.
> >> We can try to standardize the names (like vlan and csum), so xdp
> >> programs
> >> can stay relatively portable across NICs.  
> > 
> > Yes this is a must.  
> 
> Agree, there needs to be a basic common set that would be provided by
> every XDP aware driver.

I'm sorry to bring this up again, but can we really not let drivers
define their own "get_XYZ/set_XYZ" helpers, and link those to the
program at attachment time?  Sure we'd have to create a new copy of the
program for each driver it's used with, but is that really a problem?
That'd have the lowest impact on the performance and complexity of the
driver fast path.  The BTF solution already has all the same problems
WRT tail calls and not being sure which driver the program is attached
to.

> >> Such api will address exposing asic+firmware metadata to the xdp
> >> program.
> >> Once we tackle this problem, we'll think how to do the backward
> >> config
> >> (to do firmware reconfig for specific BTF definition of md supplied
> >> by the prog).
> >> What people think?  
> > 
> > For legacy HW, we can do it already in the driver, provide whatever the
> > prog requested, its only a matter of translation to the BTF format in
> > the driver xdp setup and pushing the values accordingly into the md
> > offsets on data path.
> > 
> > Question: how can you share the md BTF from the driver/HW with the xdp
> > program ?
> 
> I think this would likely be a new query as in XDP_QUERY_META_BTF
> implemented in ndo_bpf callback and then exported e.g. via bpf(2)
> or netlink such that bpftool can generate BTF -> C from there for the
> program to include later in compilation.
Alexei Starovoitov July 6, 2018, 4:30 p.m. UTC | #9
On Thu, Jul 05, 2018 at 10:18:23AM -0700, Jakub Kicinski wrote:
> 
> I'm also not 100% on board with the argument that "future" FW can
> reshuffle things whatever way it wants to.  Is the assumption that
> future ASICs/FW will be designed to always use the "blessed" BTF
> format?  Or will it be reconfigurable at runtime?

let's table configuration of metadata aside for a second.

Describing metedata layout in BTF allows NICs to disclose everything
NIC has to users in a standard and generic way.
Whether firmware is reconfigurable on the fly or has to reflashed
and hw powercycled to have new md layout (and corresponding BTF description)
is a separate discussion.
Saeed's proposal introduces the concept of 'offset' inside 'struct xdp_md_info'
to reach 'hash' value in metadata.
Essentially it's a run-time way to access 'hash' instead of build-time.
So bpf program would need two loads to read csum or hash field instead of one.
With BTF the layout of metadata is known to the program at build-time.

To reiterate the proposal:
- driver+firmware keep layout of the metadata in BTF format (either in the driver
  or driver can read it from firmware)
- 'bpftool read-metadata-desc eth0 > md_desc.h' command will query the driver and
  generate normal C header file based on BTF in the given NIC
- user does #include "md_desc.h" and bpf program can access md->csum or md->hash
  with direct single load out of metadata area in front of the packet
- llvm compiles bpf program and records how program is doing this md->csum accesses
  in BTF format as well (the compiler will be keeping such records
  for __sk_buff and all other structs too, but that's separate discussion)
- during sys_bpf(prog_load) the kernel checks (via supplied BTF) that the way the program
  accesses metadata (and other structs) matches BTF from the driver,
  so no surprises if driver+firmware got updated, but program is not recompiled
- every NIC can have their own layout of metadata and its own meaning of the fields,
  but would be good to standardize at least a few common fields like hash

Once this is working we can do more cool things with BTF.
Like doing offset rewriting at program load time similar to what we plan
to do for tracing. Tracing programs will be doing 'task->pid' access
and the kernel will adjust offsetof(struct task_struct, pid) during program load
depending on BTF for the kernel.
The same trick we can do for networking metadata.
The program will contain load instruction md->hash that will get automatically
adjusted to proper offset depending on BTF of 'hash' field in the NIC.
For now I'm proposing _not_ to go that far with offset rewriting and start
with simple steps described above.
Waskiewicz Jr, Peter July 6, 2018, 8:44 p.m. UTC | #10
On Fri, 2018-07-06 at 09:30 -0700, Alexei Starovoitov wrote:
> On Thu, Jul 05, 2018 at 10:18:23AM -0700, Jakub Kicinski wrote:
> > 
> > I'm also not 100% on board with the argument that "future" FW can
> > reshuffle things whatever way it wants to.  Is the assumption that
> > future ASICs/FW will be designed to always use the "blessed" BTF
> > format?  Or will it be reconfigurable at runtime?
> 
> let's table configuration of metadata aside for a second.

I agree that this should/could be NIC-specific and shouldn't weigh on
the metadata interface between the drivers and XDP layer.

> Describing metedata layout in BTF allows NICs to disclose everything
> NIC has to users in a standard and generic way.
> Whether firmware is reconfigurable on the fly or has to reflashed
> and hw powercycled to have new md layout (and corresponding BTF
> description)
> is a separate discussion.
> Saeed's proposal introduces the concept of 'offset' inside 'struct
> xdp_md_info'
> to reach 'hash' value in metadata.
> Essentially it's a run-time way to access 'hash' instead of build-
> time.
> So bpf program would need two loads to read csum or hash field
> instead of one.
> With BTF the layout of metadata is known to the program at build-
> time.
> 
> To reiterate the proposal:
> - driver+firmware keep layout of the metadata in BTF format (either
> in the driver
>   or driver can read it from firmware)
> - 'bpftool read-metadata-desc eth0 > md_desc.h' command will query
> the driver and
>   generate normal C header file based on BTF in the given NIC
> - user does #include "md_desc.h" and bpf program can access md->csum
> or md->hash
>   with direct single load out of metadata area in front of the packet

This piece is where I'd like to discuss more.  When we discussed this
in Seoul, the initial proposal was a static struct that we'd try to
hammer out a common layout between the interested parties.  That
obviously wasn't going to scale, and we wanted to pursue something more
dynamic.  But I thought the goal was the XDP/eBPF program wouldn't want
to care what the underlying device is, and could just ask for metadata
that it's interested in.  With this approach, your eBPF program is now
bound/tied to the NIC/driver, and if you switched to a differen
NIC/driver combo, then you'd have to rewrite part of your eBPF program
to comprehend that.  I thought we were trying to avoid that.

Our proposed approach (still working on something ready to RFC) is to
provide a method for the eBPF program to send a struct of requested
hints down to the driver on load.  If the driver can provide the hints,
then that'd be how they'd be laid out in the metadata.  If it can't
provide them, we'd probably reject the program loading, or discuss
providing a software fallback (I know this is an area of contention).

I suppose we could get there with the rewriting mechanism described
below, but that'd be a tough sell to set a bit of ABI for metadata,
then change it to be potentially dynamic at runtime in the future.

-PJ
Jakub Kicinski July 6, 2018, 9:33 p.m. UTC | #11
On Fri, 6 Jul 2018 09:30:42 -0700, Alexei Starovoitov wrote:
> On Thu, Jul 05, 2018 at 10:18:23AM -0700, Jakub Kicinski wrote:
> > 
> > I'm also not 100% on board with the argument that "future" FW can
> > reshuffle things whatever way it wants to.  Is the assumption that
> > future ASICs/FW will be designed to always use the "blessed" BTF
> > format?  Or will it be reconfigurable at runtime?  
> 
> let's table configuration of metadata aside for a second.
> 
> Describing metedata layout in BTF allows NICs to disclose everything
> NIC has to users in a standard and generic way.
> Whether firmware is reconfigurable on the fly or has to reflashed
> and hw powercycled to have new md layout (and corresponding BTF description)
> is a separate discussion.
> Saeed's proposal introduces the concept of 'offset' inside 'struct xdp_md_info'
> to reach 'hash' value in metadata.
> Essentially it's a run-time way to access 'hash' instead of build-time.
> So bpf program would need two loads to read csum or hash field instead of one.
> With BTF the layout of metadata is known to the program at build-time.
> 
> To reiterate the proposal:
> - driver+firmware keep layout of the metadata in BTF format (either in the driver
>   or driver can read it from firmware)
> - 'bpftool read-metadata-desc eth0 > md_desc.h' command will query the driver and
>   generate normal C header file based on BTF in the given NIC
> - user does #include "md_desc.h" and bpf program can access md->csum or md->hash
>   with direct single load out of metadata area in front of the packet
> - llvm compiles bpf program and records how program is doing this md->csum accesses
>   in BTF format as well (the compiler will be keeping such records
>   for __sk_buff and all other structs too, but that's separate discussion)
> - during sys_bpf(prog_load) the kernel checks (via supplied BTF) that the way the program
>   accesses metadata (and other structs) matches BTF from the driver,
>   so no surprises if driver+firmware got updated, but program is not recompiled
> - every NIC can have their own layout of metadata and its own meaning of the fields,
>   but would be good to standardize at least a few common fields like hash

Can I expose HW descriptors this way, though, or is the proposal to
copy this data into the packet buffer?

> Once this is working we can do more cool things with BTF.
> Like doing offset rewriting at program load time similar to what we plan
> to do for tracing. Tracing programs will be doing 'task->pid' access
> and the kernel will adjust offsetof(struct task_struct, pid) during program load
> depending on BTF for the kernel.
> The same trick we can do for networking metadata.
> The program will contain load instruction md->hash that will get automatically
> adjusted to proper offset depending on BTF of 'hash' field in the NIC.
> For now I'm proposing _not_ to go that far with offset rewriting and start
> with simple steps described above.

Why? :(  Could we please go with the rewrite/driver helpers instead of
impacting fast paths of the drivers yet again?  This rewrite should be
easier than task->pid, because we have the synthetic user space struct
xdp_md definition.

The flexibility and power of BPF rewrites/JITing is at our disposal to
deliver maximum performance..
Alexei Starovoitov July 6, 2018, 11:38 p.m. UTC | #12
On Fri, Jul 06, 2018 at 08:44:24PM +0000, Waskiewicz Jr, Peter wrote:
> On Fri, 2018-07-06 at 09:30 -0700, Alexei Starovoitov wrote:
> > On Thu, Jul 05, 2018 at 10:18:23AM -0700, Jakub Kicinski wrote:
> > > 
> > > I'm also not 100% on board with the argument that "future" FW can
> > > reshuffle things whatever way it wants to.  Is the assumption that
> > > future ASICs/FW will be designed to always use the "blessed" BTF
> > > format?  Or will it be reconfigurable at runtime?
> > 
> > let's table configuration of metadata aside for a second.
> 
> I agree that this should/could be NIC-specific and shouldn't weigh on
> the metadata interface between the drivers and XDP layer.
> 
> > Describing metedata layout in BTF allows NICs to disclose everything
> > NIC has to users in a standard and generic way.
> > Whether firmware is reconfigurable on the fly or has to reflashed
> > and hw powercycled to have new md layout (and corresponding BTF
> > description)
> > is a separate discussion.
> > Saeed's proposal introduces the concept of 'offset' inside 'struct
> > xdp_md_info'
> > to reach 'hash' value in metadata.
> > Essentially it's a run-time way to access 'hash' instead of build-
> > time.
> > So bpf program would need two loads to read csum or hash field
> > instead of one.
> > With BTF the layout of metadata is known to the program at build-
> > time.
> > 
> > To reiterate the proposal:
> > - driver+firmware keep layout of the metadata in BTF format (either
> > in the driver
> >   or driver can read it from firmware)
> > - 'bpftool read-metadata-desc eth0 > md_desc.h' command will query
> > the driver and
> >   generate normal C header file based on BTF in the given NIC
> > - user does #include "md_desc.h" and bpf program can access md->csum
> > or md->hash
> >   with direct single load out of metadata area in front of the packet
> 
> This piece is where I'd like to discuss more.  When we discussed this
> in Seoul, the initial proposal was a static struct that we'd try to
> hammer out a common layout between the interested parties.  That
> obviously wasn't going to scale, and we wanted to pursue something more
> dynamic.  But I thought the goal was the XDP/eBPF program wouldn't want
> to care what the underlying device is, and could just ask for metadata
> that it's interested in.  With this approach, your eBPF program is now
> bound/tied to the NIC/driver, and if you switched to a differen
> NIC/driver combo, then you'd have to rewrite part of your eBPF program
> to comprehend that.  I thought we were trying to avoid that.

It looks to me that NICs have a lot more differences instead of common pieces.
If we focus the architecture on making common things as generic
as possible we miss the bigger picture of covering distinct
and unique features that hw provides.

In the last email when I said "would be good to standardize at least a few common fields"
I meant that it make sense only at the later phases.
I don't think we should put things into uapi upfront.
Otherwise we will end up with likely useless fields.
Like vlan field. Surely a lot of nics can store it into metadata, but why?
bpf program can easily read it from the packet.
What's the benefit of adding it to md?
This metadata concept we're discussing in the context of program acceleration.
If metadata doesn't give perf benefits it should not be done by
firmware, it should not be supported by the driver and certainly
should not be part of uapi.
Hence I'd like to see pure BTF description without any common/standard
fields across the drivers and figure out what (if anything) is useful
to accelerate xdp programs (and af_xdp in the future).

> Our proposed approach (still working on something ready to RFC) is to
> provide a method for the eBPF program to send a struct of requested
> hints down to the driver on load.  If the driver can provide the hints,
> then that'd be how they'd be laid out in the metadata.  If it can't
> provide them, we'd probably reject the program loading, or discuss
> providing a software fallback (I know this is an area of contention).

I don't think that approach can work.
My .02 is the only useful acceleration feature out if intel nics is
an ability to parse the packet and report it to the xdp program
as a node id in the parse graph. As far as I know that is pretty unique
to intel. Configuration of that is a different matter.
Other things like hash are interesting, but we will quickly get
into rss spec definition if we go standardization route now instead
of later phases.
Therefore I'd rather let firmware+driver define its own BTF description
that says here is 'hash' of the packet hw can provide and it's fine
that this hash may be over different tuples depending on the nic
and even version of the firmware in that nic.
We need to see performance numbers and benefits for real world
xdp programs before drilling into specific semantics of the hash.

> I suppose we could get there with the rewriting mechanism described
> below, but that'd be a tough sell to set a bit of ABI for metadata,
> then change it to be potentially dynamic at runtime in the future.

Hmm. I don't see how future offset rewriting will break abi.
It looks to me as natural extension that wouldn't break any apps
that would be written for specific nic with given BTF description.
With rewritting in place some progs would become portable. That's all.
Alexei Starovoitov July 6, 2018, 11:42 p.m. UTC | #13
On Fri, Jul 06, 2018 at 02:33:58PM -0700, Jakub Kicinski wrote:
> On Fri, 6 Jul 2018 09:30:42 -0700, Alexei Starovoitov wrote:
> > On Thu, Jul 05, 2018 at 10:18:23AM -0700, Jakub Kicinski wrote:
> > > 
> > > I'm also not 100% on board with the argument that "future" FW can
> > > reshuffle things whatever way it wants to.  Is the assumption that
> > > future ASICs/FW will be designed to always use the "blessed" BTF
> > > format?  Or will it be reconfigurable at runtime?  
> > 
> > let's table configuration of metadata aside for a second.
> > 
> > Describing metedata layout in BTF allows NICs to disclose everything
> > NIC has to users in a standard and generic way.
> > Whether firmware is reconfigurable on the fly or has to reflashed
> > and hw powercycled to have new md layout (and corresponding BTF description)
> > is a separate discussion.
> > Saeed's proposal introduces the concept of 'offset' inside 'struct xdp_md_info'
> > to reach 'hash' value in metadata.
> > Essentially it's a run-time way to access 'hash' instead of build-time.
> > So bpf program would need two loads to read csum or hash field instead of one.
> > With BTF the layout of metadata is known to the program at build-time.
> > 
> > To reiterate the proposal:
> > - driver+firmware keep layout of the metadata in BTF format (either in the driver
> >   or driver can read it from firmware)
> > - 'bpftool read-metadata-desc eth0 > md_desc.h' command will query the driver and
> >   generate normal C header file based on BTF in the given NIC
> > - user does #include "md_desc.h" and bpf program can access md->csum or md->hash
> >   with direct single load out of metadata area in front of the packet
> > - llvm compiles bpf program and records how program is doing this md->csum accesses
> >   in BTF format as well (the compiler will be keeping such records
> >   for __sk_buff and all other structs too, but that's separate discussion)
> > - during sys_bpf(prog_load) the kernel checks (via supplied BTF) that the way the program
> >   accesses metadata (and other structs) matches BTF from the driver,
> >   so no surprises if driver+firmware got updated, but program is not recompiled
> > - every NIC can have their own layout of metadata and its own meaning of the fields,
> >   but would be good to standardize at least a few common fields like hash
> 
> Can I expose HW descriptors this way, though, or is the proposal to
> copy this data into the packet buffer?

That crossed my mind too. We can use BTF to describe HW descriptors too,
but I don't think it would buy us anything. AF_XDP approach is better.

> > Once this is working we can do more cool things with BTF.
> > Like doing offset rewriting at program load time similar to what we plan
> > to do for tracing. Tracing programs will be doing 'task->pid' access
> > and the kernel will adjust offsetof(struct task_struct, pid) during program load
> > depending on BTF for the kernel.
> > The same trick we can do for networking metadata.
> > The program will contain load instruction md->hash that will get automatically
> > adjusted to proper offset depending on BTF of 'hash' field in the NIC.
> > For now I'm proposing _not_ to go that far with offset rewriting and start
> > with simple steps described above.
> 
> Why? :(  Could we please go with the rewrite/driver helpers instead of
> impacting fast paths of the drivers yet again?  This rewrite should be
> easier than task->pid, because we have the synthetic user space struct
> xdp_md definition.

I don't understand 'impact fast path yet again' concern.
If NIC has certain metadata today, just derscribe what it has in BTF
and supply the actual per-packet md to xdp program as-is.
No changes for fast path at all.
Future rewritting will be done by the bpf/xdp core with zero
changes to the driver. All driver has to do is to provide BTF.
Waskiewicz Jr, Peter July 6, 2018, 11:49 p.m. UTC | #14
On Fri, 2018-07-06 at 16:38 -0700, Alexei Starovoitov wrote:
> On Fri, Jul 06, 2018 at 08:44:24PM +0000, Waskiewicz Jr, Peter wrote:
> > On Fri, 2018-07-06 at 09:30 -0700, Alexei Starovoitov wrote:
> > > On Thu, Jul 05, 2018 at 10:18:23AM -0700, Jakub Kicinski wrote:
> > > > 
> > > > I'm also not 100% on board with the argument that "future" FW
> > > > can
> > > > reshuffle things whatever way it wants to.  Is the assumption
> > > > that
> > > > future ASICs/FW will be designed to always use the "blessed"
> > > > BTF
> > > > format?  Or will it be reconfigurable at runtime?
> > > 
> > > let's table configuration of metadata aside for a second.
> > 
> > I agree that this should/could be NIC-specific and shouldn't weigh
> > on
> > the metadata interface between the drivers and XDP layer.
> > 
> > > Describing metedata layout in BTF allows NICs to disclose
> > > everything
> > > NIC has to users in a standard and generic way.
> > > Whether firmware is reconfigurable on the fly or has to reflashed
> > > and hw powercycled to have new md layout (and corresponding BTF
> > > description)
> > > is a separate discussion.
> > > Saeed's proposal introduces the concept of 'offset' inside
> > > 'struct
> > > xdp_md_info'
> > > to reach 'hash' value in metadata.
> > > Essentially it's a run-time way to access 'hash' instead of
> > > build-
> > > time.
> > > So bpf program would need two loads to read csum or hash field
> > > instead of one.
> > > With BTF the layout of metadata is known to the program at build-
> > > time.
> > > 
> > > To reiterate the proposal:
> > > - driver+firmware keep layout of the metadata in BTF format
> > > (either
> > > in the driver
> > >   or driver can read it from firmware)
> > > - 'bpftool read-metadata-desc eth0 > md_desc.h' command will
> > > query
> > > the driver and
> > >   generate normal C header file based on BTF in the given NIC
> > > - user does #include "md_desc.h" and bpf program can access md-
> > > >csum
> > > or md->hash
> > >   with direct single load out of metadata area in front of the
> > > packet
> > 
> > This piece is where I'd like to discuss more.  When we discussed
> > this
> > in Seoul, the initial proposal was a static struct that we'd try to
> > hammer out a common layout between the interested parties.  That
> > obviously wasn't going to scale, and we wanted to pursue something
> > more
> > dynamic.  But I thought the goal was the XDP/eBPF program wouldn't
> > want
> > to care what the underlying device is, and could just ask for
> > metadata
> > that it's interested in.  With this approach, your eBPF program is
> > now
> > bound/tied to the NIC/driver, and if you switched to a differen
> > NIC/driver combo, then you'd have to rewrite part of your eBPF
> > program
> > to comprehend that.  I thought we were trying to avoid that.
> 
> It looks to me that NICs have a lot more differences instead of
> common pieces.
> If we focus the architecture on making common things as generic
> as possible we miss the bigger picture of covering distinct
> and unique features that hw provides.
> 
> In the last email when I said "would be good to standardize at least
> a few common fields"
> I meant that it make sense only at the later phases.
> I don't think we should put things into uapi upfront.
> Otherwise we will end up with likely useless fields.
> Like vlan field. Surely a lot of nics can store it into metadata, but
> why?
> bpf program can easily read it from the packet.
> What's the benefit of adding it to md?
> This metadata concept we're discussing in the context of program
> acceleration.
> If metadata doesn't give perf benefits it should not be done by
> firmware, it should not be supported by the driver and certainly
> should not be part of uapi.
> Hence I'd like to see pure BTF description without any
> common/standard
> fields across the drivers and figure out what (if anything) is useful
> to accelerate xdp programs (and af_xdp in the future).

I guess I didn't write my response very well before, apologies.  I
actually really like the BTF description and the dynamic-ness of it. 
It solves lots of problems and keeps things out of the UAPI.

What I was getting at was using the BTF description and dumping a
header file to be used in the program to be loaded.  If we have an
application developer include that header, it will be the Intel BTF
description, or Mellanox BTF description, etc.  I'm fine with that, but
it makes your program not as portable, since you'd need to get a
different header and recompile it if you change the NIC.  Unless I'm
missing something.

> 
> > Our proposed approach (still working on something ready to RFC) is
> > to
> > provide a method for the eBPF program to send a struct of requested
> > hints down to the driver on load.  If the driver can provide the
> > hints,
> > then that'd be how they'd be laid out in the metadata.  If it can't
> > provide them, we'd probably reject the program loading, or discuss
> > providing a software fallback (I know this is an area of
> > contention).
> 
> I don't think that approach can work.
> My .02 is the only useful acceleration feature out if intel nics is
> an ability to parse the packet and report it to the xdp program
> as a node id in the parse graph. As far as I know that is pretty
> unique
> to intel. Configuration of that is a different matter.
> Other things like hash are interesting, but we will quickly get
> into rss spec definition if we go standardization route now instead
> of later phases.
> Therefore I'd rather let firmware+driver define its own BTF
> description
> that says here is 'hash' of the packet hw can provide and it's fine
> that this hash may be over different tuples depending on the nic
> and even version of the firmware in that nic.
> We need to see performance numbers and benefits for real world
> xdp programs before drilling into specific semantics of the hash.

I agree that "useful" fields should be targeted.  I can see hash
getting more interesting though if one can specify what parts of the n-
tuples get included in the hash, or if we want hashes for inner packets
or outer headers if things are tunneled.  But that can be revisited
down the road.

I also agree that changing the configuration of the underlying parser
or pipeline to generate the metadata is a completely separate
discussion.

> 
> > I suppose we could get there with the rewriting mechanism described
> > below, but that'd be a tough sell to set a bit of ABI for metadata,
> > then change it to be potentially dynamic at runtime in the future.
> 
> Hmm. I don't see how future offset rewriting will break abi.
> It looks to me as natural extension that wouldn't break any apps
> that would be written for specific nic with given BTF description.
> With rewritting in place some progs would become portable. That's
> all.

What I meant here was if anything was put into the UAPI, then changing
to something more dynamic might make transitioning more challenging. 
And I can see now what the rewriting can do to help my previous
thoughts on the BTF description locking a program to a specific NIC. 
But I'd need to play with it to be convinced about the portability.

-PJ
Jakub Kicinski July 7, 2018, 12:08 a.m. UTC | #15
On Fri, 6 Jul 2018 16:42:51 -0700, Alexei Starovoitov wrote:
> On Fri, Jul 06, 2018 at 02:33:58PM -0700, Jakub Kicinski wrote:
> > On Fri, 6 Jul 2018 09:30:42 -0700, Alexei Starovoitov wrote:  
> > > On Thu, Jul 05, 2018 at 10:18:23AM -0700, Jakub Kicinski wrote:  
> > > > 
> > > > I'm also not 100% on board with the argument that "future" FW can
> > > > reshuffle things whatever way it wants to.  Is the assumption that
> > > > future ASICs/FW will be designed to always use the "blessed" BTF
> > > > format?  Or will it be reconfigurable at runtime?    
> > > 
> > > let's table configuration of metadata aside for a second.
> > > 
> > > Describing metedata layout in BTF allows NICs to disclose everything
> > > NIC has to users in a standard and generic way.
> > > Whether firmware is reconfigurable on the fly or has to reflashed
> > > and hw powercycled to have new md layout (and corresponding BTF description)
> > > is a separate discussion.
> > > Saeed's proposal introduces the concept of 'offset' inside 'struct xdp_md_info'
> > > to reach 'hash' value in metadata.
> > > Essentially it's a run-time way to access 'hash' instead of build-time.
> > > So bpf program would need two loads to read csum or hash field instead of one.
> > > With BTF the layout of metadata is known to the program at build-time.
> > > 
> > > To reiterate the proposal:
> > > - driver+firmware keep layout of the metadata in BTF format (either in the driver
> > >   or driver can read it from firmware)
> > > - 'bpftool read-metadata-desc eth0 > md_desc.h' command will query the driver and
> > >   generate normal C header file based on BTF in the given NIC
> > > - user does #include "md_desc.h" and bpf program can access md->csum or md->hash
> > >   with direct single load out of metadata area in front of the packet
> > > - llvm compiles bpf program and records how program is doing this md->csum accesses
> > >   in BTF format as well (the compiler will be keeping such records
> > >   for __sk_buff and all other structs too, but that's separate discussion)
> > > - during sys_bpf(prog_load) the kernel checks (via supplied BTF) that the way the program
> > >   accesses metadata (and other structs) matches BTF from the driver,
> > >   so no surprises if driver+firmware got updated, but program is not recompiled
> > > - every NIC can have their own layout of metadata and its own meaning of the fields,
> > >   but would be good to standardize at least a few common fields like hash  
> > 
> > Can I expose HW descriptors this way, though, or is the proposal to
> > copy this data into the packet buffer?  
> 
> That crossed my mind too. We can use BTF to describe HW descriptors too,
> but I don't think it would buy us anything. AF_XDP approach is better.
>
> > > Once this is working we can do more cool things with BTF.
> > > Like doing offset rewriting at program load time similar to what we plan
> > > to do for tracing. Tracing programs will be doing 'task->pid' access
> > > and the kernel will adjust offsetof(struct task_struct, pid) during program load
> > > depending on BTF for the kernel.
> > > The same trick we can do for networking metadata.
> > > The program will contain load instruction md->hash that will get automatically
> > > adjusted to proper offset depending on BTF of 'hash' field in the NIC.
> > > For now I'm proposing _not_ to go that far with offset rewriting and start
> > > with simple steps described above.  
> > 
> > Why? :(  Could we please go with the rewrite/driver helpers instead of
> > impacting fast paths of the drivers yet again?  This rewrite should be
> > easier than task->pid, because we have the synthetic user space struct
> > xdp_md definition.  
> 
> I don't understand 'impact fast path yet again' concern.
> If NIC has certain metadata today, just derscribe what it has in BTF
> and supply the actual per-packet md to xdp program as-is.
> No changes for fast path at all.
> Future rewritting will be done by the bpf/xdp core with zero
> changes to the driver. All driver has to do is to provide BTF.

I'm confused.  AFAIK most *existing* NICs have the metadata in the
"descriptor", i.e. not in the packet buffer.  So if the NIC just
describes what it has, and there is no data shuffling/copying
(performance) then we have to expose the descriptor, no?

In case of NFP half of the metadata is in the packet buffer, half in
the descriptor.

Maybe I'm conflating your proposal with Saeed's.  In your proposal
would we still use the overload of metadata prepend (as in struct
xdp_md::data_meta, bpf_xdp_adjust_meta() etc.) or are we adding a
pointer to a new driver-defined struct inside struct xdp_md?

Thanks for bearing with me!
Jakub Kicinski July 7, 2018, 12:40 a.m. UTC | #16
On Fri, 6 Jul 2018 23:49:55 +0000, Waskiewicz Jr, Peter wrote:
> On Fri, 2018-07-06 at 16:38 -0700, Alexei Starovoitov wrote:
> > On Fri, Jul 06, 2018 at 08:44:24PM +0000, Waskiewicz Jr, Peter wrote:  
> > > On Fri, 2018-07-06 at 09:30 -0700, Alexei Starovoitov wrote:  
> > > > On Thu, Jul 05, 2018 at 10:18:23AM -0700, Jakub Kicinski wrote:  
> > > > > 
> > > > > I'm also not 100% on board with the argument that "future" FW
> > > > > can
> > > > > reshuffle things whatever way it wants to.  Is the assumption
> > > > > that
> > > > > future ASICs/FW will be designed to always use the "blessed"
> > > > > BTF
> > > > > format?  Or will it be reconfigurable at runtime?  
> > > > 
> > > > let's table configuration of metadata aside for a second.  
> > > 
> > > I agree that this should/could be NIC-specific and shouldn't weigh
> > > on
> > > the metadata interface between the drivers and XDP layer.
> > >   
> > > > Describing metedata layout in BTF allows NICs to disclose
> > > > everything
> > > > NIC has to users in a standard and generic way.
> > > > Whether firmware is reconfigurable on the fly or has to reflashed
> > > > and hw powercycled to have new md layout (and corresponding BTF
> > > > description)
> > > > is a separate discussion.
> > > > Saeed's proposal introduces the concept of 'offset' inside
> > > > 'struct
> > > > xdp_md_info'
> > > > to reach 'hash' value in metadata.
> > > > Essentially it's a run-time way to access 'hash' instead of
> > > > build-
> > > > time.
> > > > So bpf program would need two loads to read csum or hash field
> > > > instead of one.
> > > > With BTF the layout of metadata is known to the program at build-
> > > > time.
> > > > 
> > > > To reiterate the proposal:
> > > > - driver+firmware keep layout of the metadata in BTF format
> > > > (either
> > > > in the driver
> > > >   or driver can read it from firmware)
> > > > - 'bpftool read-metadata-desc eth0 > md_desc.h' command will
> > > > query
> > > > the driver and
> > > >   generate normal C header file based on BTF in the given NIC
> > > > - user does #include "md_desc.h" and bpf program can access md-  
> > > > >csum  
> > > > or md->hash
> > > >   with direct single load out of metadata area in front of the
> > > > packet  
> > > 
> > > This piece is where I'd like to discuss more.  When we discussed
> > > this
> > > in Seoul, the initial proposal was a static struct that we'd try to
> > > hammer out a common layout between the interested parties.  That
> > > obviously wasn't going to scale, and we wanted to pursue something
> > > more
> > > dynamic.  But I thought the goal was the XDP/eBPF program wouldn't
> > > want
> > > to care what the underlying device is, and could just ask for
> > > metadata
> > > that it's interested in.  With this approach, your eBPF program is
> > > now
> > > bound/tied to the NIC/driver, and if you switched to a differen
> > > NIC/driver combo, then you'd have to rewrite part of your eBPF
> > > program
> > > to comprehend that.  I thought we were trying to avoid that.  
> > 
> > It looks to me that NICs have a lot more differences instead of
> > common pieces.
> > If we focus the architecture on making common things as generic
> > as possible we miss the bigger picture of covering distinct
> > and unique features that hw provides.
> > 
> > In the last email when I said "would be good to standardize at least
> > a few common fields"
> > I meant that it make sense only at the later phases.
> > I don't think we should put things into uapi upfront.
> > Otherwise we will end up with likely useless fields.
> > Like vlan field. Surely a lot of nics can store it into metadata, but
> > why?

FWIW vlan is extracted from the packet because of cBPF uABI.  tcpdump
breaks badly if VLAN header is in the packet buffer.  It may not be
entirely without merit to expose the field for writing, perhaps that can
simplify some EVPN use cases?  But that's most likely off-topic.

> > bpf program can easily read it from the packet.
> > What's the benefit of adding it to md?
> > This metadata concept we're discussing in the context of program
> > acceleration.
> > If metadata doesn't give perf benefits it should not be done by
> > firmware, it should not be supported by the driver and certainly
> > should not be part of uapi.
> > Hence I'd like to see pure BTF description without any
> > common/standard
> > fields across the drivers and figure out what (if anything) is useful
> > to accelerate xdp programs (and af_xdp in the future).  
> 
> I guess I didn't write my response very well before, apologies.  I
> actually really like the BTF description and the dynamic-ness of it. 
> It solves lots of problems and keeps things out of the UAPI.
> 
> What I was getting at was using the BTF description and dumping a
> header file to be used in the program to be loaded.  If we have an
> application developer include that header, it will be the Intel BTF
> description, or Mellanox BTF description, etc.  I'm fine with that, but
> it makes your program not as portable, since you'd need to get a
> different header and recompile it if you change the NIC.  Unless I'm
> missing something.

Could we just say that at the start we expose only existing SKB fields
(csum, hash, mark) and the meaning of the is the same as in the SKB?
That covers a lot of use cases and provides clear semantics (at least
to the driver developer) while naturally allowing to communicate the
fields from XDP to the stack with XDP_PASS?  It covers all Saeed did 
in his RFC, for example.
Alexei Starovoitov July 7, 2018, 12:45 a.m. UTC | #17
On Fri, Jul 06, 2018 at 11:49:55PM +0000, Waskiewicz Jr, Peter wrote:
> On Fri, 2018-07-06 at 16:38 -0700, Alexei Starovoitov wrote:
> > On Fri, Jul 06, 2018 at 08:44:24PM +0000, Waskiewicz Jr, Peter wrote:
> > > On Fri, 2018-07-06 at 09:30 -0700, Alexei Starovoitov wrote:
> > > > On Thu, Jul 05, 2018 at 10:18:23AM -0700, Jakub Kicinski wrote:
> > > > > 
> > > > > I'm also not 100% on board with the argument that "future" FW
> > > > > can
> > > > > reshuffle things whatever way it wants to.  Is the assumption
> > > > > that
> > > > > future ASICs/FW will be designed to always use the "blessed"
> > > > > BTF
> > > > > format?  Or will it be reconfigurable at runtime?
> > > > 
> > > > let's table configuration of metadata aside for a second.
> > > 
> > > I agree that this should/could be NIC-specific and shouldn't weigh
> > > on
> > > the metadata interface between the drivers and XDP layer.
> > > 
> > > > Describing metedata layout in BTF allows NICs to disclose
> > > > everything
> > > > NIC has to users in a standard and generic way.
> > > > Whether firmware is reconfigurable on the fly or has to reflashed
> > > > and hw powercycled to have new md layout (and corresponding BTF
> > > > description)
> > > > is a separate discussion.
> > > > Saeed's proposal introduces the concept of 'offset' inside
> > > > 'struct
> > > > xdp_md_info'
> > > > to reach 'hash' value in metadata.
> > > > Essentially it's a run-time way to access 'hash' instead of
> > > > build-
> > > > time.
> > > > So bpf program would need two loads to read csum or hash field
> > > > instead of one.
> > > > With BTF the layout of metadata is known to the program at build-
> > > > time.
> > > > 
> > > > To reiterate the proposal:
> > > > - driver+firmware keep layout of the metadata in BTF format
> > > > (either
> > > > in the driver
> > > >   or driver can read it from firmware)
> > > > - 'bpftool read-metadata-desc eth0 > md_desc.h' command will
> > > > query
> > > > the driver and
> > > >   generate normal C header file based on BTF in the given NIC
> > > > - user does #include "md_desc.h" and bpf program can access md-
> > > > >csum
> > > > or md->hash
> > > >   with direct single load out of metadata area in front of the
> > > > packet
> > > 
> > > This piece is where I'd like to discuss more.  When we discussed
> > > this
> > > in Seoul, the initial proposal was a static struct that we'd try to
> > > hammer out a common layout between the interested parties.  That
> > > obviously wasn't going to scale, and we wanted to pursue something
> > > more
> > > dynamic.  But I thought the goal was the XDP/eBPF program wouldn't
> > > want
> > > to care what the underlying device is, and could just ask for
> > > metadata
> > > that it's interested in.  With this approach, your eBPF program is
> > > now
> > > bound/tied to the NIC/driver, and if you switched to a differen
> > > NIC/driver combo, then you'd have to rewrite part of your eBPF
> > > program
> > > to comprehend that.  I thought we were trying to avoid that.
> > 
> > It looks to me that NICs have a lot more differences instead of
> > common pieces.
> > If we focus the architecture on making common things as generic
> > as possible we miss the bigger picture of covering distinct
> > and unique features that hw provides.
> > 
> > In the last email when I said "would be good to standardize at least
> > a few common fields"
> > I meant that it make sense only at the later phases.
> > I don't think we should put things into uapi upfront.
> > Otherwise we will end up with likely useless fields.
> > Like vlan field. Surely a lot of nics can store it into metadata, but
> > why?
> > bpf program can easily read it from the packet.
> > What's the benefit of adding it to md?
> > This metadata concept we're discussing in the context of program
> > acceleration.
> > If metadata doesn't give perf benefits it should not be done by
> > firmware, it should not be supported by the driver and certainly
> > should not be part of uapi.
> > Hence I'd like to see pure BTF description without any
> > common/standard
> > fields across the drivers and figure out what (if anything) is useful
> > to accelerate xdp programs (and af_xdp in the future).
> 
> I guess I didn't write my response very well before, apologies.  I
> actually really like the BTF description and the dynamic-ness of it. 
> It solves lots of problems and keeps things out of the UAPI.
> 
> What I was getting at was using the BTF description and dumping a
> header file to be used in the program to be loaded.  If we have an
> application developer include that header, it will be the Intel BTF
> description, or Mellanox BTF description, etc.  I'm fine with that, but
> it makes your program not as portable, since you'd need to get a
> different header and recompile it if you change the NIC.  Unless I'm
> missing something.

Correct. It makes programs not portable.
That is phase one to figure out what's useful.
Rewriting of offsets and figuring out which fields should be
standardized in uapi is a second phase to make those programs
actually portable.

Once it becomes obvious that feature X of vendor Y is useful
to accelerate particular well known production xdp application
I suspect other vendors will find a way to tweak their firmware
to provide the same feature. Right now vendor <-> customer discussions
are somewhat baseless. Vendors believe that what they have in firmware
is useful, but sw owners may be don't see it this way and don't have
a way to try it and cannot prove one way or the other.
This will help us get unstuck: expose features that nics have
while not baking anything into uapi
Alexei Starovoitov July 7, 2018, 12:53 a.m. UTC | #18
On Fri, Jul 06, 2018 at 05:08:47PM -0700, Jakub Kicinski wrote:
> On Fri, 6 Jul 2018 16:42:51 -0700, Alexei Starovoitov wrote:
> > On Fri, Jul 06, 2018 at 02:33:58PM -0700, Jakub Kicinski wrote:
> > > On Fri, 6 Jul 2018 09:30:42 -0700, Alexei Starovoitov wrote:  
> > > > On Thu, Jul 05, 2018 at 10:18:23AM -0700, Jakub Kicinski wrote:  
> > > > > 
> > > > > I'm also not 100% on board with the argument that "future" FW can
> > > > > reshuffle things whatever way it wants to.  Is the assumption that
> > > > > future ASICs/FW will be designed to always use the "blessed" BTF
> > > > > format?  Or will it be reconfigurable at runtime?    
> > > > 
> > > > let's table configuration of metadata aside for a second.
> > > > 
> > > > Describing metedata layout in BTF allows NICs to disclose everything
> > > > NIC has to users in a standard and generic way.
> > > > Whether firmware is reconfigurable on the fly or has to reflashed
> > > > and hw powercycled to have new md layout (and corresponding BTF description)
> > > > is a separate discussion.
> > > > Saeed's proposal introduces the concept of 'offset' inside 'struct xdp_md_info'
> > > > to reach 'hash' value in metadata.
> > > > Essentially it's a run-time way to access 'hash' instead of build-time.
> > > > So bpf program would need two loads to read csum or hash field instead of one.
> > > > With BTF the layout of metadata is known to the program at build-time.
> > > > 
> > > > To reiterate the proposal:
> > > > - driver+firmware keep layout of the metadata in BTF format (either in the driver
> > > >   or driver can read it from firmware)
> > > > - 'bpftool read-metadata-desc eth0 > md_desc.h' command will query the driver and
> > > >   generate normal C header file based on BTF in the given NIC
> > > > - user does #include "md_desc.h" and bpf program can access md->csum or md->hash
> > > >   with direct single load out of metadata area in front of the packet
> > > > - llvm compiles bpf program and records how program is doing this md->csum accesses
> > > >   in BTF format as well (the compiler will be keeping such records
> > > >   for __sk_buff and all other structs too, but that's separate discussion)
> > > > - during sys_bpf(prog_load) the kernel checks (via supplied BTF) that the way the program
> > > >   accesses metadata (and other structs) matches BTF from the driver,
> > > >   so no surprises if driver+firmware got updated, but program is not recompiled
> > > > - every NIC can have their own layout of metadata and its own meaning of the fields,
> > > >   but would be good to standardize at least a few common fields like hash  
> > > 
> > > Can I expose HW descriptors this way, though, or is the proposal to
> > > copy this data into the packet buffer?  
> > 
> > That crossed my mind too. We can use BTF to describe HW descriptors too,
> > but I don't think it would buy us anything. AF_XDP approach is better.
> >
> > > > Once this is working we can do more cool things with BTF.
> > > > Like doing offset rewriting at program load time similar to what we plan
> > > > to do for tracing. Tracing programs will be doing 'task->pid' access
> > > > and the kernel will adjust offsetof(struct task_struct, pid) during program load
> > > > depending on BTF for the kernel.
> > > > The same trick we can do for networking metadata.
> > > > The program will contain load instruction md->hash that will get automatically
> > > > adjusted to proper offset depending on BTF of 'hash' field in the NIC.
> > > > For now I'm proposing _not_ to go that far with offset rewriting and start
> > > > with simple steps described above.  
> > > 
> > > Why? :(  Could we please go with the rewrite/driver helpers instead of
> > > impacting fast paths of the drivers yet again?  This rewrite should be
> > > easier than task->pid, because we have the synthetic user space struct
> > > xdp_md definition.  
> > 
> > I don't understand 'impact fast path yet again' concern.
> > If NIC has certain metadata today, just derscribe what it has in BTF
> > and supply the actual per-packet md to xdp program as-is.
> > No changes for fast path at all.
> > Future rewritting will be done by the bpf/xdp core with zero
> > changes to the driver. All driver has to do is to provide BTF.
> 
> I'm confused.  AFAIK most *existing* NICs have the metadata in the
> "descriptor", i.e. not in the packet buffer.  So if the NIC just
> describes what it has, and there is no data shuffling/copying
> (performance) then we have to expose the descriptor, no?

which piece of sw put that data into desciptor ?
I bet it's firmware. It could have stored it into pre-packet data, no?
I'd like to avoid _all_ copies.
Right now xdp program can only see a pointer to packet and pre-packet.
If we need another pointer to a piece of the packet descriptor,
that's also fine. Both pre-packet metadata and pieces of descriptor
can be described in BTF.
I'd like to push back on firmware folks that should be listening
to feedback from driver folks and kernel stack instead of saying
'here is hw spec that firmware provides'. Firmware is software.
It can change and should be open to change by the community
with proper maintainership.
Alexei Starovoitov July 7, 2018, 1 a.m. UTC | #19
On Fri, Jul 06, 2018 at 05:40:43PM -0700, Jakub Kicinski wrote:
> 
> Could we just say that at the start we expose only existing SKB fields
> (csum, hash, mark) and the meaning of the is the same as in the SKB?

what would be the meaning of 'hash' ? Over which fields?
Does it support inner and outer packets? How about udp encap (vxlan and friends) ?
Same question of csum... tcp only? how about crc for sctp?
What is 'mark' ?
Jakub Kicinski July 7, 2018, 1:20 a.m. UTC | #20
On Fri, 6 Jul 2018 18:00:13 -0700, Alexei Starovoitov wrote:
> On Fri, Jul 06, 2018 at 05:40:43PM -0700, Jakub Kicinski wrote:
> > 
> > Could we just say that at the start we expose only existing SKB fields
> > (csum, hash, mark) and the meaning of the is the same as in the SKB?  
> 
> what would be the meaning of 'hash' ? Over which fields?
> Does it support inner and outer packets? How about udp encap (vxlan and friends) ?

We don't seem to need to answer that for the rest of the stack, no?  We
can expose the "hash type" field as well if that's *really* necessary.

> Same question of csum... tcp only? 

Shouldn't we just stick to CHECKSUM_COMPLETE?

> how about crc for sctp?

That's harder to answer.  Can cls_bpf access such info?

> What is 'mark' ?

Same thing it would be on an skb.  Most likely set with an offloaded TC
rule?
David Miller July 7, 2018, 1:27 a.m. UTC | #21
From: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Date: Fri, 6 Jul 2018 09:30:42 -0700

> Like doing offset rewriting at program load time similar to what we plan
> to do for tracing. Tracing programs will be doing 'task->pid' access
> and the kernel will adjust offsetof(struct task_struct, pid) during program load
> depending on BTF for the kernel.
> The same trick we can do for networking metadata.
> The program will contain load instruction md->hash that will get automatically
> adjusted to proper offset depending on BTF of 'hash' field in the NIC.
> For now I'm proposing _not_ to go that far with offset rewriting and start
> with simple steps described above.

+1
David Miller July 7, 2018, 1:37 a.m. UTC | #22
From: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Date: Fri, 6 Jul 2018 17:53:18 -0700

> I'd like to push back on firmware folks that should be listening
> to feedback from driver folks and kernel stack instead of saying
> 'here is hw spec that firmware provides'. Firmware is software.
> It can change and should be open to change by the community
> with proper maintainership.

+1
Jakub Kicinski July 7, 2018, 1:44 a.m. UTC | #23
On Fri, 6 Jul 2018 17:53:18 -0700, Alexei Starovoitov wrote:
> On Fri, Jul 06, 2018 at 05:08:47PM -0700, Jakub Kicinski wrote:
> > On Fri, 6 Jul 2018 16:42:51 -0700, Alexei Starovoitov wrote:  
> > > On Fri, Jul 06, 2018 at 02:33:58PM -0700, Jakub Kicinski wrote:  
> > > > On Fri, 6 Jul 2018 09:30:42 -0700, Alexei Starovoitov wrote:    
> > > > > On Thu, Jul 05, 2018 at 10:18:23AM -0700, Jakub Kicinski wrote:    
> > > > > > 
> > > > > > I'm also not 100% on board with the argument that "future" FW can
> > > > > > reshuffle things whatever way it wants to.  Is the assumption that
> > > > > > future ASICs/FW will be designed to always use the "blessed" BTF
> > > > > > format?  Or will it be reconfigurable at runtime?      
> > > > > 
> > > > > let's table configuration of metadata aside for a second.
> > > > > 
> > > > > Describing metedata layout in BTF allows NICs to disclose everything
> > > > > NIC has to users in a standard and generic way.
> > > > > Whether firmware is reconfigurable on the fly or has to reflashed
> > > > > and hw powercycled to have new md layout (and corresponding BTF description)
> > > > > is a separate discussion.
> > > > > Saeed's proposal introduces the concept of 'offset' inside 'struct xdp_md_info'
> > > > > to reach 'hash' value in metadata.
> > > > > Essentially it's a run-time way to access 'hash' instead of build-time.
> > > > > So bpf program would need two loads to read csum or hash field instead of one.
> > > > > With BTF the layout of metadata is known to the program at build-time.
> > > > > 
> > > > > To reiterate the proposal:
> > > > > - driver+firmware keep layout of the metadata in BTF format (either in the driver
> > > > >   or driver can read it from firmware)
> > > > > - 'bpftool read-metadata-desc eth0 > md_desc.h' command will query the driver and
> > > > >   generate normal C header file based on BTF in the given NIC
> > > > > - user does #include "md_desc.h" and bpf program can access md->csum or md->hash
> > > > >   with direct single load out of metadata area in front of the packet
> > > > > - llvm compiles bpf program and records how program is doing this md->csum accesses
> > > > >   in BTF format as well (the compiler will be keeping such records
> > > > >   for __sk_buff and all other structs too, but that's separate discussion)
> > > > > - during sys_bpf(prog_load) the kernel checks (via supplied BTF) that the way the program
> > > > >   accesses metadata (and other structs) matches BTF from the driver,
> > > > >   so no surprises if driver+firmware got updated, but program is not recompiled
> > > > > - every NIC can have their own layout of metadata and its own meaning of the fields,
> > > > >   but would be good to standardize at least a few common fields like hash    
> > > > 
> > > > Can I expose HW descriptors this way, though, or is the proposal to
> > > > copy this data into the packet buffer?    
> > > 
> > > That crossed my mind too. We can use BTF to describe HW descriptors too,
> > > but I don't think it would buy us anything. AF_XDP approach is better.
> > >  
> > > > > Once this is working we can do more cool things with BTF.
> > > > > Like doing offset rewriting at program load time similar to what we plan
> > > > > to do for tracing. Tracing programs will be doing 'task->pid' access
> > > > > and the kernel will adjust offsetof(struct task_struct, pid) during program load
> > > > > depending on BTF for the kernel.
> > > > > The same trick we can do for networking metadata.
> > > > > The program will contain load instruction md->hash that will get automatically
> > > > > adjusted to proper offset depending on BTF of 'hash' field in the NIC.
> > > > > For now I'm proposing _not_ to go that far with offset rewriting and start
> > > > > with simple steps described above.    
> > > > 
> > > > Why? :(  Could we please go with the rewrite/driver helpers instead of
> > > > impacting fast paths of the drivers yet again?  This rewrite should be
> > > > easier than task->pid, because we have the synthetic user space struct
> > > > xdp_md definition.    
> > > 
> > > I don't understand 'impact fast path yet again' concern.
> > > If NIC has certain metadata today, just derscribe what it has in BTF
> > > and supply the actual per-packet md to xdp program as-is.
> > > No changes for fast path at all.
> > > Future rewritting will be done by the bpf/xdp core with zero
> > > changes to the driver. All driver has to do is to provide BTF.  
> > 
> > I'm confused.  AFAIK most *existing* NICs have the metadata in the
> > "descriptor", i.e. not in the packet buffer.  So if the NIC just
> > describes what it has, and there is no data shuffling/copying
> > (performance) then we have to expose the descriptor, no?  
> 
> which piece of sw put that data into desciptor ?
> I bet it's firmware. It could have stored it into pre-packet data, no?
> I'd like to avoid _all_ copies.
> Right now xdp program can only see a pointer to packet and pre-packet.
> If we need another pointer to a piece of the packet descriptor,
> that's also fine. Both pre-packet metadata and pieces of descriptor
> can be described in BTF.

Okay, if we expose another pointer then it's possible to avoid copies.

But please keep in mind that descriptors are very compact, there is
a lot of interdependencies between fields and the fields can shift
depending on the type of packet.  HW/FW guys always quote the 64B
packet performance as a reason why things can't be simple.  We can't
consume 50% of PCIe bandwidth to DMA the metadata alone.

> I'd like to push back on firmware folks that should be listening
> to feedback from driver folks and kernel stack instead of saying
> 'here is hw spec that firmware provides'. Firmware is software.
> It can change and should be open to change by the community
> with proper maintainership.

I think the way to influence FW/HW is to provide a strong and well
justified standard, and if we just expose raw HW data we are doing 
just the opposite.  We can claim BTF driver provides is not uABI, but
I will personally no longer feel comfortable with modifying descriptor
formats (and I've done that for the NFP in the past - grep for
chained_metadata_format).
Alexei Starovoitov July 7, 2018, 2:38 a.m. UTC | #24
On Fri, Jul 06, 2018 at 06:20:47PM -0700, Jakub Kicinski wrote:
> On Fri, 6 Jul 2018 18:00:13 -0700, Alexei Starovoitov wrote:
> > On Fri, Jul 06, 2018 at 05:40:43PM -0700, Jakub Kicinski wrote:
> > > 
> > > Could we just say that at the start we expose only existing SKB fields
> > > (csum, hash, mark) and the meaning of the is the same as in the SKB?  
> > 
> > what would be the meaning of 'hash' ? Over which fields?
> > Does it support inner and outer packets? How about udp encap (vxlan and friends) ?
> 
> We don't seem to need to answer that for the rest of the stack, no?  We
> can expose the "hash type" field as well if that's *really* necessary.
> 
> > Same question of csum... tcp only? 
> 
> Shouldn't we just stick to CHECKSUM_COMPLETE?

I don't yet see how checksum_complete and 'hash just like stack'
helps to accelerate xdp programs like ddos and load balancer.
'hash like stack' may help to accelerate networking stack itself,
but that's very different discussion. There is no xdp in such case.
No programs and no metadata. If some NICs can do
'hash like stack' and demonstrate performance improvements
for regular tcp/ip processing with netperf in user space
that would definitely be interesting and standardizing such
hash from the drivers into upper layers would warrant its own patches,
but seems orthogonal to this xdp hints discussion.
Alexei Starovoitov July 7, 2018, 2:51 a.m. UTC | #25
On Fri, Jul 06, 2018 at 06:44:24PM -0700, Jakub Kicinski wrote:
> On Fri, 6 Jul 2018 17:53:18 -0700, Alexei Starovoitov wrote:
> > On Fri, Jul 06, 2018 at 05:08:47PM -0700, Jakub Kicinski wrote:
> > > On Fri, 6 Jul 2018 16:42:51 -0700, Alexei Starovoitov wrote:  
> > > > On Fri, Jul 06, 2018 at 02:33:58PM -0700, Jakub Kicinski wrote:  
> > > > > On Fri, 6 Jul 2018 09:30:42 -0700, Alexei Starovoitov wrote:    
> > > > > > On Thu, Jul 05, 2018 at 10:18:23AM -0700, Jakub Kicinski wrote:    
> > > > > > > 
> > > > > > > I'm also not 100% on board with the argument that "future" FW can
> > > > > > > reshuffle things whatever way it wants to.  Is the assumption that
> > > > > > > future ASICs/FW will be designed to always use the "blessed" BTF
> > > > > > > format?  Or will it be reconfigurable at runtime?      
> > > > > > 
> > > > > > let's table configuration of metadata aside for a second.
> > > > > > 
> > > > > > Describing metedata layout in BTF allows NICs to disclose everything
> > > > > > NIC has to users in a standard and generic way.
> > > > > > Whether firmware is reconfigurable on the fly or has to reflashed
> > > > > > and hw powercycled to have new md layout (and corresponding BTF description)
> > > > > > is a separate discussion.
> > > > > > Saeed's proposal introduces the concept of 'offset' inside 'struct xdp_md_info'
> > > > > > to reach 'hash' value in metadata.
> > > > > > Essentially it's a run-time way to access 'hash' instead of build-time.
> > > > > > So bpf program would need two loads to read csum or hash field instead of one.
> > > > > > With BTF the layout of metadata is known to the program at build-time.
> > > > > > 
> > > > > > To reiterate the proposal:
> > > > > > - driver+firmware keep layout of the metadata in BTF format (either in the driver
> > > > > >   or driver can read it from firmware)
> > > > > > - 'bpftool read-metadata-desc eth0 > md_desc.h' command will query the driver and
> > > > > >   generate normal C header file based on BTF in the given NIC
> > > > > > - user does #include "md_desc.h" and bpf program can access md->csum or md->hash
> > > > > >   with direct single load out of metadata area in front of the packet
> > > > > > - llvm compiles bpf program and records how program is doing this md->csum accesses
> > > > > >   in BTF format as well (the compiler will be keeping such records
> > > > > >   for __sk_buff and all other structs too, but that's separate discussion)
> > > > > > - during sys_bpf(prog_load) the kernel checks (via supplied BTF) that the way the program
> > > > > >   accesses metadata (and other structs) matches BTF from the driver,
> > > > > >   so no surprises if driver+firmware got updated, but program is not recompiled
> > > > > > - every NIC can have their own layout of metadata and its own meaning of the fields,
> > > > > >   but would be good to standardize at least a few common fields like hash    
> > > > > 
> > > > > Can I expose HW descriptors this way, though, or is the proposal to
> > > > > copy this data into the packet buffer?    
> > > > 
> > > > That crossed my mind too. We can use BTF to describe HW descriptors too,
> > > > but I don't think it would buy us anything. AF_XDP approach is better.
> > > >  
> > > > > > Once this is working we can do more cool things with BTF.
> > > > > > Like doing offset rewriting at program load time similar to what we plan
> > > > > > to do for tracing. Tracing programs will be doing 'task->pid' access
> > > > > > and the kernel will adjust offsetof(struct task_struct, pid) during program load
> > > > > > depending on BTF for the kernel.
> > > > > > The same trick we can do for networking metadata.
> > > > > > The program will contain load instruction md->hash that will get automatically
> > > > > > adjusted to proper offset depending on BTF of 'hash' field in the NIC.
> > > > > > For now I'm proposing _not_ to go that far with offset rewriting and start
> > > > > > with simple steps described above.    
> > > > > 
> > > > > Why? :(  Could we please go with the rewrite/driver helpers instead of
> > > > > impacting fast paths of the drivers yet again?  This rewrite should be
> > > > > easier than task->pid, because we have the synthetic user space struct
> > > > > xdp_md definition.    
> > > > 
> > > > I don't understand 'impact fast path yet again' concern.
> > > > If NIC has certain metadata today, just derscribe what it has in BTF
> > > > and supply the actual per-packet md to xdp program as-is.
> > > > No changes for fast path at all.
> > > > Future rewritting will be done by the bpf/xdp core with zero
> > > > changes to the driver. All driver has to do is to provide BTF.  
> > > 
> > > I'm confused.  AFAIK most *existing* NICs have the metadata in the
> > > "descriptor", i.e. not in the packet buffer.  So if the NIC just
> > > describes what it has, and there is no data shuffling/copying
> > > (performance) then we have to expose the descriptor, no?  
> > 
> > which piece of sw put that data into desciptor ?
> > I bet it's firmware. It could have stored it into pre-packet data, no?
> > I'd like to avoid _all_ copies.
> > Right now xdp program can only see a pointer to packet and pre-packet.
> > If we need another pointer to a piece of the packet descriptor,
> > that's also fine. Both pre-packet metadata and pieces of descriptor
> > can be described in BTF.
> 
> Okay, if we expose another pointer then it's possible to avoid copies.
> 
> But please keep in mind that descriptors are very compact, there is
> a lot of interdependencies between fields and the fields can shift
> depending on the type of packet.  HW/FW guys always quote the 64B
> packet performance as a reason why things can't be simple.  We can't
> consume 50% of PCIe bandwidth to DMA the metadata alone.

Certainly. With l4 load balancer we're after short packet performance too.

> > I'd like to push back on firmware folks that should be listening
> > to feedback from driver folks and kernel stack instead of saying
> > 'here is hw spec that firmware provides'. Firmware is software.
> > It can change and should be open to change by the community
> > with proper maintainership.
> 
> I think the way to influence FW/HW is to provide a strong and well
> justified standard, and if we just expose raw HW data we are doing 
> just the opposite.  We can claim BTF driver provides is not uABI, but
> I will personally no longer feel comfortable with modifying descriptor
> formats (and I've done that for the NFP in the past - grep for
> chained_metadata_format).

I think AF_XDP is doing what you're asking for. It defined a standard
descriptor format. Metadata for the packet is use case specific.
What is useful for l4 load balancer could be a waste of pcie bandwidth
for regular tcp/ip. I'm not proposing to have it always on.

What I'm proposing with BTF is instead of doing nfp_net_parse_meta()
for this chained format in the nfp driver expose it to xdp program.
Let program deal with it when it wants to. Would that work?
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fc8b6ce48a0f..172363310ade 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -838,6 +838,7 @@  struct netdev_bpf {
 			u32 flags;
 			struct bpf_prog *prog;
 			struct netlink_ext_ack *extack;
+			xdp_md_info_arr md_info;
 		};
 		/* XDP_QUERY_PROG */
 		struct {
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 2deea7166a34..afe302613ae1 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -138,6 +138,12 @@  xdp_set_data_meta_invalid(struct xdp_buff *xdp)
 	xdp->data_meta = xdp->data + 1;
 }
 
+static __always_inline void
+xdp_reset_data_meta(struct xdp_buff *xdp)
+{
+	xdp->data_meta = xdp->data_hard_start;
+}
+
 static __always_inline bool
 xdp_data_meta_unsupported(const struct xdp_buff *xdp)
 {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 59b19b6a40d7..e320e7421565 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2353,6 +2353,120 @@  struct xdp_md {
 	__u32 rx_queue_index;  /* rxq->queue_index  */
 };
 
+enum {
+	XDP_DATA_META_HASH = 0,
+	XDP_DATA_META_MARK,
+	XDP_DATA_META_VLAN,
+	XDP_DATA_META_CSUM_COMPLETE,
+
+	XDP_DATA_META_MAX,
+};
+
+struct xdp_md_hash {
+	__u32 hash;
+	__u8  type;
+} __attribute__((aligned(4)));
+
+struct xdp_md_mark {
+	__u32 mark;
+} __attribute__((aligned(4)));
+
+struct xdp_md_vlan {
+	__u16 vlan;
+} __attribute__((aligned(4)));
+
+struct xdp_md_csum {
+	__u16 csum;
+} __attribute__((aligned(4)));
+
+static const __u16 xdp_md_size[XDP_DATA_META_MAX] = {
+	sizeof(struct xdp_md_hash), /* XDP_DATA_META_HASH */
+	sizeof(struct xdp_md_mark), /* XDP_DATA_META_MARK */
+	sizeof(struct xdp_md_vlan), /* XDP_DATA_META_VLAN */
+	sizeof(struct xdp_md_csum), /* XDP_DATA_META_CSUM_COMPLETE */
+};
+
+struct xdp_md_info {
+	__u16 present:1;
+	__u16 offset:15; /* offset from data_meta in xdp_md buffer */
+};
+
+/* XDP meta data offsets info array
+ * present bit describes if a meta data is or will be present in xdp_md buff
+ * offset describes where a meta data is or should be placed in xdp_md buff
+ *
+ * Kernel builds this array using xdp_md_info_build helper on demand.
+ * User space builds it statically in the xdp program.
+ */
+typedef struct xdp_md_info xdp_md_info_arr[XDP_DATA_META_MAX];
+
+static __always_inline __u8
+xdp_data_meta_present(xdp_md_info_arr mdi, int meta_data)
+{
+	return mdi[meta_data].present;
+}
+
+static __always_inline void
+xdp_data_meta_set_hash(xdp_md_info_arr mdi, void *data_meta, __u32 hash, __u8 htype)
+{
+	struct xdp_md_hash *hash_md;
+
+	hash_md = (struct xdp_md_hash *)((char*)data_meta + mdi[XDP_DATA_META_HASH].offset);
+	hash_md->hash = hash;
+	hash_md->type = htype;
+}
+
+static __always_inline struct xdp_md_hash *
+xdp_data_meta_get_hash(xdp_md_info_arr mdi, void *data_meta)
+{
+	return (struct xdp_md_hash *)((char*)data_meta + mdi[XDP_DATA_META_HASH].offset);
+}
+
+static __always_inline void
+xdp_data_meta_set_mark(xdp_md_info_arr mdi, void *data_meta, __u32 mark)
+{
+	struct xdp_md_mark *mark_md;
+
+	mark_md = (struct xdp_md_mark *)((char*)data_meta + mdi[XDP_DATA_META_MARK].offset);
+	mark_md->mark = mark;
+}
+
+static __always_inline struct xdp_md_mark *
+xdp_data_meta_get_mark(xdp_md_info_arr mdi, void *data_meta)
+{
+	return (struct xdp_md_mark *)((char*)data_meta + mdi[XDP_DATA_META_MARK].offset);
+}
+
+static __always_inline void
+xdp_data_meta_set_vlan(xdp_md_info_arr mdi, void *data_meta, __u16 vlan_tci)
+{
+	struct xdp_md_vlan *vlan_md;
+
+	vlan_md = (struct xdp_md_vlan *)((char*)data_meta + mdi[XDP_DATA_META_VLAN].offset);
+	vlan_md->vlan = vlan_tci;
+}
+
+static __always_inline struct xdp_md_vlan *
+xdp_data_meta_get_vlan(xdp_md_info_arr mdi, void *data_meta)
+{
+	return (struct xdp_md_vlan *)((char*)data_meta + mdi[XDP_DATA_META_VLAN].offset);
+}
+
+static __always_inline void
+xdp_data_meta_set_csum_complete(xdp_md_info_arr mdi, void *data_meta, __u16 csum)
+{
+	struct xdp_md_csum *csum_md;
+
+	csum_md = (struct xdp_md_csum *)((char*)data_meta + mdi[XDP_DATA_META_CSUM_COMPLETE].offset);
+	csum_md->csum = csum;
+}
+
+static __always_inline struct xdp_md_csum *
+xdp_data_meta_get_csum_complete(xdp_md_info_arr mdi, void *data_meta)
+{
+	return (struct xdp_md_csum *)((char*)data_meta + mdi[XDP_DATA_META_CSUM_COMPLETE].offset);
+}
+
 enum sk_action {
 	SK_DROP = 0,
 	SK_PASS,
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index dfb1e26bacef..5bdc07bfad35 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -4,6 +4,7 @@ 
 
 #include <linux/types.h>
 #include <linux/netlink.h>
+#include <linux/bpf.h>
 
 /* This struct should be in sync with struct rtnl_link_stats64 */
 struct rtnl_link_stats {
@@ -913,13 +914,16 @@  enum {
 					 XDP_FLAGS_HW_MODE)
 
 /* TODO : add new netlink xdp u64 meta_flags
- * for meta data only
+ * for meta data only & build according XDP_DATA_META_* enums
  */
-#define XDP_FLAGS_META_HASH		(1U << 16)
-#define XDP_FLAGS_META_MARK		(1U << 17)
-#define XDP_FLAGS_META_VLAN		(1U << 18)
-#define XDP_FLAGS_META_CSUM_COMPLETE	(1U << 19)
-#define XDP_FLAGS_META_ALL		(XDP_FLAGS_META_HASH      | \
+
+#define XDP_FLAGS_META_SHIFT            (16)
+#define XDP_FLAG_META(FLAG)             ((1U << XDP_DATA_META_##FLAG) << XDP_FLAGS_META_SHIFT)
+#define XDP_FLAGS_META_HASH             XDP_FLAG_META(HASH)
+#define XDP_FLAGS_META_MARK             XDP_FLAG_META(MARK)
+#define XDP_FLAGS_META_VLAN             XDP_FLAG_META(VLAN)
+#define XDP_FLAGS_META_CSUM_COMPLETE    XDP_FLAG_META(CSUM_COMPLETE)
+#define XDP_FLAGS_META_ALL              (XDP_FLAGS_META_HASH      | \
 					 XDP_FLAGS_META_MARK      | \
 					 XDP_FLAGS_META_VLAN      | \
 					 XDP_FLAGS_META_CSUM_COMPLETE)
diff --git a/net/core/dev.c b/net/core/dev.c
index 8a5cc2c731ec..141dd6632b00 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7310,6 +7310,25 @@  static bool __dev_xdp_meta_supported(struct net_device *dev,
 	return ((xdp.meta_flags & meta_flags) == meta_flags);
 }
 
+static void xdp_md_info_build(xdp_md_info_arr mdi, u32 xdp_flags_meta)
+{
+	u16 offset = 0;
+	int i;
+
+	for (i = 0; i < XDP_DATA_META_MAX; i++) {
+		u32 meta_data_flag = BIT(i) << XDP_FLAGS_META_SHIFT;
+
+		if (!(meta_data_flag & xdp_flags_meta)) {
+			mdi[i].present = 0;
+			continue;
+		}
+
+		mdi[i].present = 1;
+		mdi[i].offset = offset;
+		offset += xdp_md_size[i];
+	}
+}
+
 static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
 			   struct netlink_ext_ack *extack, u32 flags,
 			   struct bpf_prog *prog)
@@ -7325,6 +7344,8 @@  static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
 	xdp.flags = flags;
 	xdp.prog = prog;
 
+	xdp_md_info_build(xdp.md_info, flags & XDP_FLAGS_META_ALL);
+
 	return bpf_op(dev, &xdp);
 }