
[v2,net-next,6/9] bpf: helpers: add bpf_xdp_adjust_mb_header helper

Message ID b7475687bb09aac6ec051596a8ccbb311a54cb8a.1599165031.git.lorenzo@kernel.org
State Changes Requested
Delegated to: BPF Maintainers
Series mvneta: introduce XDP multi-buffer support

Commit Message

Lorenzo Bianconi Sept. 3, 2020, 8:58 p.m. UTC
Introduce bpf_xdp_adjust_mb_header helper in order to adjust frame
headers moving *offset* bytes from/to the second buffer to/from the
first one.
This helper can be used to move headers when the hw DMA SG is not able
to copy all the headers in the first fragment and split header and data
pages.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 include/uapi/linux/bpf.h       | 25 ++++++++++++----
 net/core/filter.c              | 54 ++++++++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h | 26 ++++++++++++----
 3 files changed, 95 insertions(+), 10 deletions(-)

Comments

Alexei Starovoitov Sept. 4, 2020, 1:09 a.m. UTC | #1
On Thu, Sep 03, 2020 at 10:58:50PM +0200, Lorenzo Bianconi wrote:
> Introduce bpf_xdp_adjust_mb_header helper in order to adjust frame
> headers moving *offset* bytes from/to the second buffer to/from the
> first one.
> This helper can be used to move headers when the hw DMA SG is not able
> to copy all the headers in the first fragment and split header and data
> pages.
> 
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  include/uapi/linux/bpf.h       | 25 ++++++++++++----
>  net/core/filter.c              | 54 ++++++++++++++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h | 26 ++++++++++++----
>  3 files changed, 95 insertions(+), 10 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 8dda13880957..c4a6d245619c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3571,11 +3571,25 @@ union bpf_attr {
>   *		value.
>   *
>   * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
> - * 	Description
> - * 		Read *size* bytes from user space address *user_ptr* and store
> - * 		the data in *dst*. This is a wrapper of copy_from_user().
> - * 	Return
> - * 		0 on success, or a negative error in case of failure.
> + *	Description
> + *		Read *size* bytes from user space address *user_ptr* and store
> + *		the data in *dst*. This is a wrapper of copy_from_user().
> + *
> + * long bpf_xdp_adjust_mb_header(struct xdp_buff *xdp_md, int offset)

botched rebase?
Alexei Starovoitov Sept. 4, 2020, 1:13 a.m. UTC | #2
On Thu, Sep 03, 2020 at 10:58:50PM +0200, Lorenzo Bianconi wrote:
> +BPF_CALL_2(bpf_xdp_adjust_mb_header, struct  xdp_buff *, xdp,
> +	   int, offset)
> +{
> +	void *data_hard_end, *data_end;
> +	struct skb_shared_info *sinfo;
> +	int frag_offset, frag_len;
> +	u8 *addr;
> +
> +	if (!xdp->mb)
> +		return -EOPNOTSUPP;
> +
> +	sinfo = xdp_get_shared_info_from_buff(xdp);
> +
> +	frag_len = skb_frag_size(&sinfo->frags[0]);
> +	if (offset > frag_len)
> +		return -EINVAL;
> +
> +	frag_offset = skb_frag_off(&sinfo->frags[0]);
> +	data_end = xdp->data_end + offset;
> +
> +	if (offset < 0 && (-offset > frag_offset ||
> +			   data_end < xdp->data + ETH_HLEN))
> +		return -EINVAL;
> +
> +	data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
> +	if (data_end > data_hard_end)
> +		return -EINVAL;
> +
> +	addr = page_address(skb_frag_page(&sinfo->frags[0])) + frag_offset;
> +	if (offset > 0) {
> +		memcpy(xdp->data_end, addr, offset);
> +	} else {
> +		memcpy(addr + offset, xdp->data_end + offset, -offset);
> +		memset(xdp->data_end + offset, 0, -offset);
> +	}
> +
> +	skb_frag_size_sub(&sinfo->frags[0], offset);
> +	skb_frag_off_add(&sinfo->frags[0], offset);
> +	xdp->data_end = data_end;
> +
> +	return 0;
> +}

wait a sec. Are you saying that multi buffer XDP actually should be skb based?
If that's what mvneta driver is doing that's fine, but that is not a
reasonable requirement to put on all other drivers.
John Fastabend Sept. 4, 2020, 6:47 a.m. UTC | #3
Lorenzo Bianconi wrote:
> Introduce bpf_xdp_adjust_mb_header helper in order to adjust frame
> headers moving *offset* bytes from/to the second buffer to/from the
> first one.
> This helper can be used to move headers when the hw DMA SG is not able
> to copy all the headers in the first fragment and split header and data
> pages.
> 
> Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> ---
>  include/uapi/linux/bpf.h       | 25 ++++++++++++----
>  net/core/filter.c              | 54 ++++++++++++++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h | 26 ++++++++++++----
>  3 files changed, 95 insertions(+), 10 deletions(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 8dda13880957..c4a6d245619c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3571,11 +3571,25 @@ union bpf_attr {
>   *		value.
>   *
>   * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
> - * 	Description
> - * 		Read *size* bytes from user space address *user_ptr* and store
> - * 		the data in *dst*. This is a wrapper of copy_from_user().
> - * 	Return
> - * 		0 on success, or a negative error in case of failure.
> + *	Description
> + *		Read *size* bytes from user space address *user_ptr* and store
> + *		the data in *dst*. This is a wrapper of copy_from_user().
> + *
> + * long bpf_xdp_adjust_mb_header(struct xdp_buff *xdp_md, int offset)
> + *	Description
> + *		Adjust frame headers moving *offset* bytes from/to the second
> + *		buffer to/from the first one. This helper can be used to move
> + *		headers when the hw DMA SG does not copy all the headers in
> + *		the first fragment.

This is confusing to read. Does this mean I can "move bytes to the second
buffer from the first one" or "move bytes from the second buffer to the first
one"? And what are frame headers? I'm sure I can read the code below and work
it out, but users reading the header file should be able to parse this.

Also we want to be able to read all data not just headers. Reading the
payload of a TCP packet is equally important for many l7 load balancers.

> + *
> + *		A call to this helper is susceptible to change the underlying
> + *		packet buffer. Therefore, at load time, all checks on pointers
> + *		previously done by the verifier are invalidated and must be
> + *		performed again, if the helper is used in combination with
> + *		direct packet access.
> + *
> + *	Return
> + *		0 on success, or a negative error in case of failure.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -3727,6 +3741,7 @@ union bpf_attr {
>  	FN(inode_storage_delete),	\
>  	FN(d_path),			\
>  	FN(copy_from_user),		\
> +	FN(xdp_adjust_mb_header),	\
>  	/* */
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 47eef9a0be6a..ae6b10cf062d 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3475,6 +3475,57 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
>  	.arg2_type	= ARG_ANYTHING,
>  };
>  
> +BPF_CALL_2(bpf_xdp_adjust_mb_header, struct  xdp_buff *, xdp,
> +	   int, offset)
> +{
> +	void *data_hard_end, *data_end;
> +	struct skb_shared_info *sinfo;
> +	int frag_offset, frag_len;
> +	u8 *addr;
> +
> +	if (!xdp->mb)
> +		return -EOPNOTSUPP;
> +
> +	sinfo = xdp_get_shared_info_from_buff(xdp);
> +
> +	frag_len = skb_frag_size(&sinfo->frags[0]);
> +	if (offset > frag_len)
> +		return -EINVAL;

What if we want data in frags[1] and so on.

> +
> +	frag_offset = skb_frag_off(&sinfo->frags[0]);
> +	data_end = xdp->data_end + offset;
> +
> +	if (offset < 0 && (-offset > frag_offset ||
> +			   data_end < xdp->data + ETH_HLEN))
> +		return -EINVAL;
> +
> +	data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
> +	if (data_end > data_hard_end)
> +		return -EINVAL;
> +
> +	addr = page_address(skb_frag_page(&sinfo->frags[0])) + frag_offset;
> +	if (offset > 0) {
> +		memcpy(xdp->data_end, addr, offset);

But this could get expensive for large amounts of data? And it is also
limiting because we require the room to do the copy. Presumably
the reason we have frags[1] is because the normal page or half
page is in use?

> +	} else {
> +		memcpy(addr + offset, xdp->data_end + offset, -offset);
> +		memset(xdp->data_end + offset, 0, -offset);
> +	}
> +
> +	skb_frag_size_sub(&sinfo->frags[0], offset);
> +	skb_frag_off_add(&sinfo->frags[0], offset);
> +	xdp->data_end = data_end;
> +
> +	return 0;
> +}

So overall I don't see the point of copying bytes from one frag to
another. Create an API that adjusts the data pointers; then
copies are avoided and manipulating frags is not needed.

Also, and even more concerning, I think this API requires the
driver to populate shinfo. If we use TX_REDIRECT a lot or TX_XMIT
this means we need to populate shinfo when it's probably never
used. If our driver is smart, L2/L3 headers are in the readable
data and prefetched. Writing frags into the end of a page is likely
not free.

Did you benchmark this?

In general users of this API should know the bytes they want
to fetch. Use an API like this,

  bpf_xdp_adjust_bytes(xdp, start, end)

Where xdp is the frame, start is the first byte the user wants
and end is the last byte. Then usually you can skip the entire
copy part and just move the xdp pointers around. The ugly case
is if the user puts start/end across a frag boundary and a copy
is needed. In that case maybe we use end as a hint and not a
hard requirement.

The use case I see is I read L2/L3/L4 headers and I need the
first N bytes of the payload. I know where the payload starts
and I know how many bytes I need so,

  bpf_xdp_adjust_bytes(xdp, payload_offset, bytes_needed);

Then hopefully that is all in one frag. If its not I'll need
to do a second helper call. Still nicer than forcing drivers
to populate this shinfo I think. If you think I'm wrong try
a benchmark. Benchmarks aside I get stuck when data_end and
data_hard_end are too close together.
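
For illustration, a minimal sketch of how an XDP program might use such a
proposed helper (bpf_xdp_adjust_bytes() does not exist, so its declaration,
return convention and the 64-byte window below are assumptions only):

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <bpf/bpf_helpers.h>

/* placeholder declaration for the proposed, not yet existing, helper */
static long (*bpf_xdp_adjust_bytes)(struct xdp_md *xdp, __u32 start,
				    __u32 end) = (void *)0;

SEC("xdp")
int l7_lb(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	/* assume fixed-size L2/L3/L4 headers for brevity */
	int payload_off = sizeof(struct ethhdr) + sizeof(struct iphdr) +
			  sizeof(struct tcphdr);

	if (data + payload_off > data_end)
		return XDP_PASS;

	/* ask for the first 64 payload bytes; end is only a hint */
	if (bpf_xdp_adjust_bytes(ctx, payload_off, payload_off + 64) < 0)
		return XDP_PASS;

	/* all pointers are invalidated by the helper: reload and re-check */
	data = (void *)(long)ctx->data;
	data_end = (void *)(long)ctx->data_end;
	/* ... parse the payload to pick a backend ... */

	return XDP_TX;
}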

Thanks,
John
Lorenzo Bianconi Sept. 4, 2020, 7:50 a.m. UTC | #4
> On Thu, Sep 03, 2020 at 10:58:50PM +0200, Lorenzo Bianconi wrote:
> > +BPF_CALL_2(bpf_xdp_adjust_mb_header, struct  xdp_buff *, xdp,
> > +	   int, offset)
> > +{
> > +	void *data_hard_end, *data_end;
> > +	struct skb_shared_info *sinfo;
> > +	int frag_offset, frag_len;
> > +	u8 *addr;
> > +
> > +	if (!xdp->mb)
> > +		return -EOPNOTSUPP;
> > +
> > +	sinfo = xdp_get_shared_info_from_buff(xdp);
> > +
> > +	frag_len = skb_frag_size(&sinfo->frags[0]);
> > +	if (offset > frag_len)
> > +		return -EINVAL;
> > +
> > +	frag_offset = skb_frag_off(&sinfo->frags[0]);
> > +	data_end = xdp->data_end + offset;
> > +
> > +	if (offset < 0 && (-offset > frag_offset ||
> > +			   data_end < xdp->data + ETH_HLEN))
> > +		return -EINVAL;
> > +
> > +	data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
> > +	if (data_end > data_hard_end)
> > +		return -EINVAL;
> > +
> > +	addr = page_address(skb_frag_page(&sinfo->frags[0])) + frag_offset;
> > +	if (offset > 0) {
> > +		memcpy(xdp->data_end, addr, offset);
> > +	} else {
> > +		memcpy(addr + offset, xdp->data_end + offset, -offset);
> > +		memset(xdp->data_end + offset, 0, -offset);
> > +	}
> > +
> > +	skb_frag_size_sub(&sinfo->frags[0], offset);
> > +	skb_frag_off_add(&sinfo->frags[0], offset);
> > +	xdp->data_end = data_end;
> > +
> > +	return 0;
> > +}
> 
> wait a sec. Are you saying that multi buffer XDP actually should be skb based?
> If that's what mvneta driver is doing that's fine, but that is not a
> reasonable requirement to put on all other drivers.

I did not get what you mean here. The xdp multi-buffer layout uses the skb_shared_info
at the end of the first buffer to link subsequent frames [0], and we rely on the skb_frag*
utilities to set/read the offset and length of the subsequent buffers.
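
For reference, a rough sketch of the layout being described (assuming, as in
the earlier patches of this series, that xdp_get_shared_info_from_buff()
simply casts the hard end of the first buffer; shared_info alignment is
omitted):

/* First buffer, xdp->frame_sz bytes in total:
 *
 *   headroom | data ... data_end | tailroom | skb_shared_info
 *                                            ^
 *                                  xdp_data_hard_end(xdp)
 *
 * so the frags[] describing the subsequent buffers live in the tail
 * of the first buffer.
 */
static inline struct skb_shared_info *
xdp_get_shared_info_from_buff(struct xdp_buff *xdp)
{
	return (struct skb_shared_info *)xdp_data_hard_end(xdp);
}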

Regards,
Lorenzo

[0] http://people.redhat.com/lbiancon/conference/NetDevConf2020-0x14/add-xdp-on-driver.html - XDP multi-buffers section
Lorenzo Bianconi Sept. 4, 2020, 8:07 a.m. UTC | #5
On Sep 03, Alexei Starovoitov wrote:
> On Thu, Sep 03, 2020 at 10:58:50PM +0200, Lorenzo Bianconi wrote:
> > Introduce bpf_xdp_adjust_mb_header helper in order to adjust frame
> > headers moving *offset* bytes from/to the second buffer to/from the
> > first one.
> > This helper can be used to move headers when the hw DMA SG is not able
> > to copy all the headers in the first fragment and split header and data
> > pages.
> > 
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> >  include/uapi/linux/bpf.h       | 25 ++++++++++++----
> >  net/core/filter.c              | 54 ++++++++++++++++++++++++++++++++++
> >  tools/include/uapi/linux/bpf.h | 26 ++++++++++++----
> >  3 files changed, 95 insertions(+), 10 deletions(-)
> > 
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 8dda13880957..c4a6d245619c 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -3571,11 +3571,25 @@ union bpf_attr {
> >   *		value.
> >   *
> >   * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
> > - * 	Description
> > - * 		Read *size* bytes from user space address *user_ptr* and store
> > - * 		the data in *dst*. This is a wrapper of copy_from_user().
> > - * 	Return
> > - * 		0 on success, or a negative error in case of failure.
> > + *	Description
> > + *		Read *size* bytes from user space address *user_ptr* and store
> > + *		the data in *dst*. This is a wrapper of copy_from_user().
> > + *
> > + * long bpf_xdp_adjust_mb_header(struct xdp_buff *xdp_md, int offset)
> 
> botched rebase?

Yes, sorry. I will fix in v3.

Regards,
Lorenzo
Lorenzo Bianconi Sept. 4, 2020, 9:45 a.m. UTC | #6
> Lorenzo Bianconi wrote:
> > Introduce bpf_xdp_adjust_mb_header helper in order to adjust frame
> > headers moving *offset* bytes from/to the second buffer to/from the
> > first one.
> > This helper can be used to move headers when the hw DMA SG is not able
> > to copy all the headers in the first fragment and split header and data
> > pages.
> > 
> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > ---
> >  include/uapi/linux/bpf.h       | 25 ++++++++++++----
> >  net/core/filter.c              | 54 ++++++++++++++++++++++++++++++++++
> >  tools/include/uapi/linux/bpf.h | 26 ++++++++++++----
> >  3 files changed, 95 insertions(+), 10 deletions(-)
> > 
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 8dda13880957..c4a6d245619c 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -3571,11 +3571,25 @@ union bpf_attr {
> >   *		value.
> >   *
> >   * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
> > - * 	Description
> > - * 		Read *size* bytes from user space address *user_ptr* and store
> > - * 		the data in *dst*. This is a wrapper of copy_from_user().
> > - * 	Return
> > - * 		0 on success, or a negative error in case of failure.
> > + *	Description
> > + *		Read *size* bytes from user space address *user_ptr* and store
> > + *		the data in *dst*. This is a wrapper of copy_from_user().
> > + *
> > + * long bpf_xdp_adjust_mb_header(struct xdp_buff *xdp_md, int offset)
> > + *	Description
> > + *		Adjust frame headers moving *offset* bytes from/to the second
> > + *		buffer to/from the first one. This helper can be used to move
> > + *		headers when the hw DMA SG does not copy all the headers in
> > + *		the first fragment.

+ Eric to the discussion

> 
> This is confusing to read. Does this mean I can "move bytes to the second
> buffer from the first one" or "move bytes from the second buffer to the first
> one" And what are frame headers? I'm sure I can read below code and work
> it out, but users reading the header file should be able to parse this.

Our main goal with this helper is to fix the use-case where we request the hw
to put L2/L3/L4 headers (and all the TCP options) in the first fragment and TCP
data starting from the second fragment (header split), but for some reason
the hw puts the TCP options in the second fragment (if we understood correctly,
this issue was raised by Eric @ NetDevConf 0x14).
bpf_xdp_adjust_mb_header() can fix this use-case by moving bytes from the second
fragment to the first one (offset > 0) or from the first buffer to the second one
(offset < 0).
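
As a rough illustration of that use-case, a minimal sketch of an XDP program
pulling the missing TCP option bytes back into the first buffer (parsing is
simplified, IP options are not handled, and the program assumes the helper
declaration generated from this patch's uapi update):

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("xdp")
int fixup_header_split(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;
	struct iphdr *iph = (void *)(eth + 1);
	struct tcphdr *tcp = (void *)(iph + 1);
	int missing;

	if ((void *)(tcp + 1) > data_end ||
	    eth->h_proto != bpf_htons(ETH_P_IP) ||
	    iph->protocol != IPPROTO_TCP || iph->ihl != 5)
		return XDP_PASS;

	/* TCP option bytes the hw left in the second buffer */
	missing = (void *)tcp + tcp->doff * 4 - data_end;
	if (missing > 0)
		/* offset > 0: move bytes from buff1 into buff0 */
		bpf_xdp_adjust_mb_header(ctx, missing);

	/* ctx->data/data_end must be reloaded before further accesses */
	return XDP_PASS;
}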

> 
> Also we want to be able to read all data not just headers. Reading the
> payload of a TCP packet is equally important for many l7 load balancers.
> 

In order to avoid slowing down performance, we require that the eBPF sandbox can
read/write only buff0 in an xdp multi-buffer. The xdp program can only
perform some restricted changes to buff<n> (n >= 1) (e.g. what we did in
bpf_xdp_adjust_mb_header()).
You can find the xdp multi-buff design principles here [0][1]

[0] https://github.com/xdp-project/xdp-project/blob/master/areas/core/xdp-multi-buffer01-design.org
[1] http://people.redhat.com/lbiancon/conference/NetDevConf2020-0x14/add-xdp-on-driver.html - XDP multi-buffers section (slide 40)

> > + *
> > + *		A call to this helper is susceptible to change the underlying
> > + *		packet buffer. Therefore, at load time, all checks on pointers
> > + *		previously done by the verifier are invalidated and must be
> > + *		performed again, if the helper is used in combination with
> > + *		direct packet access.
> > + *
> > + *	Return
> > + *		0 on success, or a negative error in case of failure.
> >   */
> >  #define __BPF_FUNC_MAPPER(FN)		\
> >  	FN(unspec),			\
> > @@ -3727,6 +3741,7 @@ union bpf_attr {
> >  	FN(inode_storage_delete),	\
> >  	FN(d_path),			\
> >  	FN(copy_from_user),		\
> > +	FN(xdp_adjust_mb_header),	\
> >  	/* */
> >  
> >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 47eef9a0be6a..ae6b10cf062d 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -3475,6 +3475,57 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
> >  	.arg2_type	= ARG_ANYTHING,
> >  };
> >  
> > +BPF_CALL_2(bpf_xdp_adjust_mb_header, struct  xdp_buff *, xdp,
> > +	   int, offset)
> > +{
> > +	void *data_hard_end, *data_end;
> > +	struct skb_shared_info *sinfo;
> > +	int frag_offset, frag_len;
> > +	u8 *addr;
> > +
> > +	if (!xdp->mb)
> > +		return -EOPNOTSUPP;
> > +
> > +	sinfo = xdp_get_shared_info_from_buff(xdp);
> > +
> > +	frag_len = skb_frag_size(&sinfo->frags[0]);
> > +	if (offset > frag_len)
> > +		return -EINVAL;
> 
> What if we want data in frags[1] and so on.
> 
> > +
> > +	frag_offset = skb_frag_off(&sinfo->frags[0]);
> > +	data_end = xdp->data_end + offset;
> > +
> > +	if (offset < 0 && (-offset > frag_offset ||
> > +			   data_end < xdp->data + ETH_HLEN))
> > +		return -EINVAL;
> > +
> > +	data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
> > +	if (data_end > data_hard_end)
> > +		return -EINVAL;
> > +
> > +	addr = page_address(skb_frag_page(&sinfo->frags[0])) + frag_offset;
> > +	if (offset > 0) {
> > +		memcpy(xdp->data_end, addr, offset);
> 
> But this could get expensive for large amount of data? And also
> limiting because we require the room to do the copy. Presumably
> the reason we have fargs[1] is because the normal page or half
> page is in use?
> 
> > +	} else {
> > +		memcpy(addr + offset, xdp->data_end + offset, -offset);
> > +		memset(xdp->data_end + offset, 0, -offset);
> > +	}
> > +
> > +	skb_frag_size_sub(&sinfo->frags[0], offset);
> > +	skb_frag_off_add(&sinfo->frags[0], offset);
> > +	xdp->data_end = data_end;
> > +
> > +	return 0;
> > +}
> 
> So overall I don't see the point of copying bytes from one frag to
> another. Create an API that adjusts the data pointers and then
> copies are avoided and manipulating frags is not needed.

please see above.

> 
> Also and even more concerning I think this API requires the
> driver to populate shinfo. If we use TX_REDIRECT a lot or TX_XMIT
> this means we need to populate shinfo when its probably not ever
> used. If our driver is smart L2/L3 headers are in the readable
> data and prefetched. Writing frags into the end of a page is likely
> not free.

Sorry, I did not get what you mean by "populate shinfo" here. We need to
properly set the shared_info in order to create the xdp multi-buff.
Apart from header split, please consider that the main use-cases for
xdp multi-buff are XDP with TSO and Jumbo frames.

> 
> Did you benchmark this?

will do, I need to understand if we can use tiny buffers in mvneta.

> 
> In general users of this API should know the bytes they want
> to fetch. Use an API like this,
> 
>   bpf_xdp_adjust_bytes(xdp, start, end)
> 
> Where xdp is the frame, start is the first byte the user wants
> and end is the last byte. Then usually you can skip the entire
> copy part and just move the xdp pointesr around. The ugly case
> is if the user puts start/end across a frag boundary and a copy
> is needed. In that case maybe we use end as a hint and not a
> hard requirement.
> 
> The use case I see is I read L2/L3/L4 headers and I need the
> first N bytes of the payload. I know where the payload starts
> and I know how many bytes I need so,
> 
>   bpf_xdp_adjust_bytes(xdp, payload_offset, bytes_needed);
> 
> Then hopefully that is all in one frag. If its not I'll need
> to do a second helper call. Still nicer than forcing drivers
> to populate this shinfo I think. If you think I'm wrong try
> a benchmark. Benchmarks aside I get stuck when data_end and
> data_hard_end are too close together.

IIUC what you mean here is to expose the L2/L3/L4 headers + some data to
the ebpf program to perform something like L7 load-balancing, right?
Let's consider the Jumbo frames use-case (so the data are split in multiple
buffers). I can see two issues here:
- since in XDP we can't linearize the buffer if start and end are on
  different pages (like we do in bpf_msg_pull_data()), do we end up in the
  case where the requested data must all be in buff0?
- if start and end are in buff<2>, we should report the fragment number to the
  ebpf program to "fetch" the data. Are we exposing too many low-level details
  to user-space?

Regards,
Lorenzo

> 
> Thanks,
> John
Jesper Dangaard Brouer Sept. 4, 2020, 1:52 p.m. UTC | #7
On Fri, 4 Sep 2020 09:50:31 +0200
Lorenzo Bianconi <lorenzo@kernel.org> wrote:

> > On Thu, Sep 03, 2020 at 10:58:50PM +0200, Lorenzo Bianconi wrote:  
> > > +BPF_CALL_2(bpf_xdp_adjust_mb_header, struct  xdp_buff *, xdp,
> > > +	   int, offset)
> > > +{
> > > +	void *data_hard_end, *data_end;
> > > +	struct skb_shared_info *sinfo;
> > > +	int frag_offset, frag_len;
> > > +	u8 *addr;
> > > +
> > > +	if (!xdp->mb)
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	sinfo = xdp_get_shared_info_from_buff(xdp);
> > > +
> > > +	frag_len = skb_frag_size(&sinfo->frags[0]);
> > > +	if (offset > frag_len)
> > > +		return -EINVAL;
> > > +
> > > +	frag_offset = skb_frag_off(&sinfo->frags[0]);
> > > +	data_end = xdp->data_end + offset;
> > > +
> > > +	if (offset < 0 && (-offset > frag_offset ||
> > > +			   data_end < xdp->data + ETH_HLEN))
> > > +		return -EINVAL;
> > > +
> > > +	data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
> > > +	if (data_end > data_hard_end)
> > > +		return -EINVAL;
> > > +
> > > +	addr = page_address(skb_frag_page(&sinfo->frags[0])) + frag_offset;
> > > +	if (offset > 0) {
> > > +		memcpy(xdp->data_end, addr, offset);
> > > +	} else {
> > > +		memcpy(addr + offset, xdp->data_end + offset, -offset);
> > > +		memset(xdp->data_end + offset, 0, -offset);
> > > +	}
> > > +
> > > +	skb_frag_size_sub(&sinfo->frags[0], offset);
> > > +	skb_frag_off_add(&sinfo->frags[0], offset);
> > > +	xdp->data_end = data_end;
> > > +
> > > +	return 0;
> > > +}  
> > 
> > wait a sec. Are you saying that multi buffer XDP actually should be skb based?
> > If that's what mvneta driver is doing that's fine, but that is not a
> > reasonable requirement to put on all other drivers.  
> 
> I did not got what you mean here. The xdp multi-buffer layout uses
> the skb_shared_info at the end of the first buffer to link subsequent
> frames [0] and we rely on skb_frag* utilities to set/read offset and
> length of subsequent buffers.

Yes, for now the same layout as "skb_shared_info" is "reused", but I
think we should think of this as "xdp_shared_info" instead, as how it
is used for XDP is going to diverge from SKBs.  We already discussed (in
a conf call) that we could store the total len of "frags" here, to
simplify the other helper.

Using the skb_frag_* helper functions is misleading, and will make it
more difficult to diverge from how SKBs handle frags.  What about
introducing xdp_frag_* wrappers? (what do others think?)


> 
> [0] http://people.redhat.com/lbiancon/conference/NetDevConf2020-0x14/add-xdp-on-driver.html
>     - XDP multi-buffers section (slide 40)
Lorenzo Bianconi Sept. 4, 2020, 2:27 p.m. UTC | #8
> On Fri, 4 Sep 2020 09:50:31 +0200
> Lorenzo Bianconi <lorenzo@kernel.org> wrote:
> 
> > > On Thu, Sep 03, 2020 at 10:58:50PM +0200, Lorenzo Bianconi wrote:  
> > > > +BPF_CALL_2(bpf_xdp_adjust_mb_header, struct  xdp_buff *, xdp,
> > > > +	   int, offset)
> > > > +{
> > > > +	void *data_hard_end, *data_end;
> > > > +	struct skb_shared_info *sinfo;
> > > > +	int frag_offset, frag_len;
> > > > +	u8 *addr;
> > > > +
> > > > +	if (!xdp->mb)
> > > > +		return -EOPNOTSUPP;
> > > > +
> > > > +	sinfo = xdp_get_shared_info_from_buff(xdp);
> > > > +
> > > > +	frag_len = skb_frag_size(&sinfo->frags[0]);
> > > > +	if (offset > frag_len)
> > > > +		return -EINVAL;
> > > > +
> > > > +	frag_offset = skb_frag_off(&sinfo->frags[0]);
> > > > +	data_end = xdp->data_end + offset;
> > > > +
> > > > +	if (offset < 0 && (-offset > frag_offset ||
> > > > +			   data_end < xdp->data + ETH_HLEN))
> > > > +		return -EINVAL;
> > > > +
> > > > +	data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
> > > > +	if (data_end > data_hard_end)
> > > > +		return -EINVAL;
> > > > +
> > > > +	addr = page_address(skb_frag_page(&sinfo->frags[0])) + frag_offset;
> > > > +	if (offset > 0) {
> > > > +		memcpy(xdp->data_end, addr, offset);
> > > > +	} else {
> > > > +		memcpy(addr + offset, xdp->data_end + offset, -offset);
> > > > +		memset(xdp->data_end + offset, 0, -offset);
> > > > +	}
> > > > +
> > > > +	skb_frag_size_sub(&sinfo->frags[0], offset);
> > > > +	skb_frag_off_add(&sinfo->frags[0], offset);
> > > > +	xdp->data_end = data_end;
> > > > +
> > > > +	return 0;
> > > > +}  
> > > 
> > > wait a sec. Are you saying that multi buffer XDP actually should be skb based?
> > > If that's what mvneta driver is doing that's fine, but that is not a
> > > reasonable requirement to put on all other drivers.  
> > 
> > I did not got what you mean here. The xdp multi-buffer layout uses
> > the skb_shared_info at the end of the first buffer to link subsequent
> > frames [0] and we rely on skb_frag* utilities to set/read offset and
> > length of subsequent buffers.
> 
> Yes, for now the same layout as "skb_shared_info" is "reuse", but I
> think we should think of this as "xdp_shared_info" instead, as how it
> is used for XDP is going to divert from SKBs.  We already discussed (in
> conf call) that we could store the total len of "frags" here, to
> simplify the other helper.

I like this approach; at the end of the first fragment we can have something like:

struct xdp_shared_info {
	skb_frag_t frags[16];
	int n_frags;	/* number of entries used in frags[] */
	int frag_len;	/* total length of all fragments */
};

or do you prefer not to use the skb_frag_t struct?

> 
> Using the skb_frag_* helper functions are misleading, and will make it
> more difficult to divert from how SKB handle frags.  What about
> introducing xdp_frag_* wrappers? (what do others think?)

I am fine with having some dedicated helpers.
Anyway, we need to construct the xdp_buff {} when receiving the dma descriptors.
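
For concreteness, a hypothetical sketch of such wrappers (names are
illustrative only), assuming the xdp_shared_info layout above so that the
cached total length stays in sync:

static inline unsigned int xdp_frag_size(const skb_frag_t *frag)
{
	return skb_frag_size(frag);
}

static inline void xdp_frag_size_add(struct xdp_shared_info *xdp_sinfo,
				     skb_frag_t *frag, int delta)
{
	skb_frag_size_add(frag, delta);
	xdp_sinfo->frag_len += delta;	/* update the cached total length */
}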

Regards,
Lorenzo

> 
> 
> > 
> > [0] http://people.redhat.com/lbiancon/conference/NetDevConf2020-0x14/add-xdp-on-driver.html
> >     - XDP multi-buffers section (slide 40)
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer
>
John Fastabend Sept. 4, 2020, 3:23 p.m. UTC | #9
Lorenzo Bianconi wrote:
> > Lorenzo Bianconi wrote:
> > > Introduce bpf_xdp_adjust_mb_header helper in order to adjust frame
> > > headers moving *offset* bytes from/to the second buffer to/from the
> > > first one.
> > > This helper can be used to move headers when the hw DMA SG is not able
> > > to copy all the headers in the first fragment and split header and data
> > > pages.
> > > 
> > > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
> > > ---
> > >  include/uapi/linux/bpf.h       | 25 ++++++++++++----
> > >  net/core/filter.c              | 54 ++++++++++++++++++++++++++++++++++
> > >  tools/include/uapi/linux/bpf.h | 26 ++++++++++++----
> > >  3 files changed, 95 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index 8dda13880957..c4a6d245619c 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -3571,11 +3571,25 @@ union bpf_attr {
> > >   *		value.
> > >   *
> > >   * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
> > > - * 	Description
> > > - * 		Read *size* bytes from user space address *user_ptr* and store
> > > - * 		the data in *dst*. This is a wrapper of copy_from_user().
> > > - * 	Return
> > > - * 		0 on success, or a negative error in case of failure.
> > > + *	Description
> > > + *		Read *size* bytes from user space address *user_ptr* and store
> > > + *		the data in *dst*. This is a wrapper of copy_from_user().
> > > + *
> > > + * long bpf_xdp_adjust_mb_header(struct xdp_buff *xdp_md, int offset)
> > > + *	Description
> > > + *		Adjust frame headers moving *offset* bytes from/to the second
> > > + *		buffer to/from the first one. This helper can be used to move
> > > + *		headers when the hw DMA SG does not copy all the headers in
> > > + *		the first fragment.
> 
> + Eric to the discussion
> 
> > 
> > This is confusing to read. Does this mean I can "move bytes to the second
> > buffer from the first one" or "move bytes from the second buffer to the first
> > one" And what are frame headers? I'm sure I can read below code and work
> > it out, but users reading the header file should be able to parse this.
> 
> Our main goal with this helper is to fix the use-case where we request the hw
> to put L2/L3/L4 headers (and all the TCP options) in the first fragment and TCP
> data starting from the second fragment (headers split) but for some reasons
> the hw puts the TCP options in the second fragment (if we understood correctly
> this issue has been introduced by Eric @ NetDevConf 0x14).
> bpf_xdp_adjust_mb_header() can fix this use-case moving bytes from the second fragment
> to the first one (offset > 0) or from the first buffer to the second one (offset < 0).

Ah OK, so the description needs the information about how to use offset; then it
would have been clear, I think. Something like that last line, "moving bytes from
the second fragment ....".

So this is to fix up header-split for RX zerocopy? Add that to the commit
message then.

> 
> > 
> > Also we want to be able to read all data not just headers. Reading the
> > payload of a TCP packet is equally important for many l7 load balancers.
> > 
> 
> In order to avoid to slow down performances we require that eBPF sandbox can
> read/write only buff0 in a xdp multi-buffer. The xdp program can only
> perform some restricted changes to buff<n> (n >= 1) (e.g. what we did in
> bpf_xdp_adjust_mb_header()).
> You can find the xdp multi-buff design principles here [0][1]
> 
> [0] https://github.com/xdp-project/xdp-project/blob/master/areas/core/xdp-multi-buffer01-design.org
> [1] http://people.redhat.com/lbiancon/conference/NetDevConf2020-0x14/add-xdp-on-driver.html - XDP multi-buffers section (slide 40)
> 
> > > + *
> > > + *		A call to this helper is susceptible to change the underlying
> > > + *		packet buffer. Therefore, at load time, all checks on pointers
> > > + *		previously done by the verifier are invalidated and must be
> > > + *		performed again, if the helper is used in combination with
> > > + *		direct packet access.
> > > + *
> > > + *	Return
> > > + *		0 on success, or a negative error in case of failure.
> > >   */
> > >  #define __BPF_FUNC_MAPPER(FN)		\
> > >  	FN(unspec),			\
> > > @@ -3727,6 +3741,7 @@ union bpf_attr {
> > >  	FN(inode_storage_delete),	\
> > >  	FN(d_path),			\
> > >  	FN(copy_from_user),		\
> > > +	FN(xdp_adjust_mb_header),	\
> > >  	/* */
> > >  
> > >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> > > diff --git a/net/core/filter.c b/net/core/filter.c
> > > index 47eef9a0be6a..ae6b10cf062d 100644
> > > --- a/net/core/filter.c
> > > +++ b/net/core/filter.c
> > > @@ -3475,6 +3475,57 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
> > >  	.arg2_type	= ARG_ANYTHING,
> > >  };
> > >  
> > > +BPF_CALL_2(bpf_xdp_adjust_mb_header, struct  xdp_buff *, xdp,
> > > +	   int, offset)
> > > +{
> > > +	void *data_hard_end, *data_end;
> > > +	struct skb_shared_info *sinfo;
> > > +	int frag_offset, frag_len;
> > > +	u8 *addr;
> > > +
> > > +	if (!xdp->mb)
> > > +		return -EOPNOTSUPP;

Not required for this patch necessarily, but I think it would be a better user
experience if instead of EOPNOTSUPP here we did the header split. This
would allocate a frag and copy the bytes around as needed. Yes, it might
be slow if you don't have a frag free in the driver, but if the user wants to
do header split and their hardware can't do it we would have a way out.

I guess it could be an improvement for later though.


> > > +
> > > +	sinfo = xdp_get_shared_info_from_buff(xdp);
> > > +
> > > +	frag_len = skb_frag_size(&sinfo->frags[0]);
> > > +	if (offset > frag_len)
> > > +		return -EINVAL;
> > 
> > What if we want data in frags[1] and so on.
> > 
> > > +
> > > +	frag_offset = skb_frag_off(&sinfo->frags[0]);
> > > +	data_end = xdp->data_end + offset;
> > > +
> > > +	if (offset < 0 && (-offset > frag_offset ||
> > > +			   data_end < xdp->data + ETH_HLEN))
> > > +		return -EINVAL;
> > > +
> > > +	data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
> > > +	if (data_end > data_hard_end)
> > > +		return -EINVAL;
> > > +
> > > +	addr = page_address(skb_frag_page(&sinfo->frags[0])) + frag_offset;
> > > +	if (offset > 0) {
> > > +		memcpy(xdp->data_end, addr, offset);
> > 
> > But this could get expensive for large amount of data? And also
> > limiting because we require the room to do the copy. Presumably
> > the reason we have fargs[1] is because the normal page or half
> > page is in use?
> > 
> > > +	} else {
> > > +		memcpy(addr + offset, xdp->data_end + offset, -offset);
> > > +		memset(xdp->data_end + offset, 0, -offset);
> > > +	}
> > > +
> > > +	skb_frag_size_sub(&sinfo->frags[0], offset);
> > > +	skb_frag_off_add(&sinfo->frags[0], offset);
> > > +	xdp->data_end = data_end;
> > > +
> > > +	return 0;
> > > +}
> > 
> > So overall I don't see the point of copying bytes from one frag to
> > another. Create an API that adjusts the data pointers and then
> > copies are avoided and manipulating frags is not needed.
> 
> please see above.

OK, it makes more sense with the context. It doesn't have much, if anything,
to do with making data visible to the BPF program. This is about
changing the layout of the frags list.

How/when does the header split go wrong on the mvneta device? I guess
this is to fix a real bug/issue, not some theoretical one? An example
in the commit message would make this concrete. Something like,
"When using RX zerocopy to mmap data into a userspace application, if
a packet with [all these wild headers] is received rx zerocopy breaks
because header split puts headers X in the data frag, confusing apps".

> 
> > 
> > Also and even more concerning I think this API requires the
> > driver to populate shinfo. If we use TX_REDIRECT a lot or TX_XMIT
> > this means we need to populate shinfo when its probably not ever
> > used. If our driver is smart L2/L3 headers are in the readable
> > data and prefetched. Writing frags into the end of a page is likely
> > not free.
> 
> Sorry I did not get what you mean with "populate shinfo" here. We need to
> properly set shared_info in order to create the xdp multi-buff.
> Apart of header splits, please consider the main uses-cases for
> xdp multi-buff are XDP with TSO and Jumbo frames.

The use case I have in mind is an XDP_TX or XDP_REDIRECT load balancer.
I won't know this at the driver level, and now I'll have to write into
the back of every page with this shinfo just in case. If header
split is working I should never need to even touch the page outside
the first N bytes that were DMAd and in cache with DDIO. So it's extra
overhead for something that is unlikely to happen in the LB case.

If you take the simplest possible program that just returns XDP_TX
and run a pkt generator against it, I believe (haven't run any
tests) that you will see overhead now just from populating this
shinfo. I think it needs to be done only when it's needed, e.g. when the
user makes this helper call or we need to build the skb and populate
the frags there.

I think a smart driver will just keep the frags list in whatever
form it has them (rx descriptors?) and push them over to the
tx descriptors without having to do extra work with frag lists.

> 
> > 
> > Did you benchmark this?
> 
> will do, I need to understand if we can use tiny buffers in mvneta.

Why tiny buffers? How does mvneta lay out the frags when doing
header split? Can we just benchmark what mvneta is doing at the
end of this patch series?

Also, can you try the basic XDP_TX case mentioned above?
I don't want this to degrade existing use cases if at all
possible.

> 
> > 
> > In general users of this API should know the bytes they want
> > to fetch. Use an API like this,
> > 
> >   bpf_xdp_adjust_bytes(xdp, start, end)
> > 
> > Where xdp is the frame, start is the first byte the user wants
> > and end is the last byte. Then usually you can skip the entire
> > copy part and just move the xdp pointesr around. The ugly case
> > is if the user puts start/end across a frag boundary and a copy
> > is needed. In that case maybe we use end as a hint and not a
> > hard requirement.
> > 
> > The use case I see is I read L2/L3/L4 headers and I need the
> > first N bytes of the payload. I know where the payload starts
> > and I know how many bytes I need so,
> > 
> >   bpf_xdp_adjust_bytes(xdp, payload_offset, bytes_needed);
> > 
> > Then hopefully that is all in one frag. If its not I'll need
> > to do a second helper call. Still nicer than forcing drivers
> > to populate this shinfo I think. If you think I'm wrong try
> > a benchmark. Benchmarks aside I get stuck when data_end and
> > data_hard_end are too close together.
> 
> IIUC what you mean here is to expose L2/L3/L4 headers + some data to
> the ebpf program to perform like L7 load-balancing, right?

Correct, but with the extra context I see that in this patch you are trying
to build an XDP-controlled header split. This seems like a different
use case from mine.

> Let's consider the Jumbo frames use-case (so the data are split in multiple
> buffers). I can see to issues here:
> - since in XDP we can't linearize the buffer if start and end are on the
>   different pages (like we do in bpf_msg_pull_data()), are we ending up
>   in the case where requested data are all in buff0? 

In this case I would expect that either the helper returns how many bytes
were pulled in, maybe just (start, end_of_frag), or the user can find
it from the data_end pointer. Here end is just a hint.

> - if  start and end are in buff<2>, we should report the fragment number to the
>   ebpf program to "fetch" the data. Are we exposing too low-level details to
>   user-space?

Why do you need the frag number? Just say I want bytes (X,Y); if that
happens to be in buff<2>, let the helper find it.

I think having a helper to read/write any bytes is important and
necessary, but the helper implemented in this patch is something
else. I get naming is hard; what if we called it xdp_header_split()?

> 
> Regards,
> Lorenzo
> 
> > 
> > Thanks,
> > John
Lorenzo Bianconi Sept. 6, 2020, 1:36 p.m. UTC | #10
> Lorenzo Bianconi wrote:
> > > Lorenzo Bianconi wrote:

[...]

> > > > + *	Description
> > > > + *		Adjust frame headers moving *offset* bytes from/to the second
> > > > + *		buffer to/from the first one. This helper can be used to move
> > > > + *		headers when the hw DMA SG does not copy all the headers in
> > > > + *		the first fragment.
> > 
> > + Eric to the discussion
> > 
> > > 
> > > This is confusing to read. Does this mean I can "move bytes to the second
> > > buffer from the first one" or "move bytes from the second buffer to the first
> > > one" And what are frame headers? I'm sure I can read below code and work
> > > it out, but users reading the header file should be able to parse this.
> > 
> > Our main goal with this helper is to fix the use-case where we request the hw
> > to put L2/L3/L4 headers (and all the TCP options) in the first fragment and TCP
> > data starting from the second fragment (headers split) but for some reasons
> > the hw puts the TCP options in the second fragment (if we understood correctly
> > this issue has been introduced by Eric @ NetDevConf 0x14).
> > bpf_xdp_adjust_mb_header() can fix this use-case moving bytes from the second fragment
> > to the first one (offset > 0) or from the first buffer to the second one (offset < 0).
> 
> Ah OK, so the description needs the information about how to use offset then it
> would have been clear I think. Something like that last line "moving bytes from
> the second fragment ...."

ack, right. I will do in v3.

> 
> So this is to fixup header-spit for RX zerocopy? Add that to the commit
> message then.

Right. I will improve comments in v3.

> 
> > 
> > > 
> > > Also we want to be able to read all data not just headers. Reading the
> > > payload of a TCP packet is equally important for many l7 load balancers.
> > > 
> > 
> > In order to avoid to slow down performances we require that eBPF sandbox can
> > read/write only buff0 in a xdp multi-buffer. The xdp program can only
> > perform some restricted changes to buff<n> (n >= 1) (e.g. what we did in
> > bpf_xdp_adjust_mb_header()).
> > You can find the xdp multi-buff design principles here [0][1]
> > 
> > [0] https://github.com/xdp-project/xdp-project/blob/master/areas/core/xdp-multi-buffer01-design.org
> > [1] http://people.redhat.com/lbiancon/conference/NetDevConf2020-0x14/add-xdp-on-driver.html - XDP multi-buffers section (slide 40)
> > 
> > > > + *
> > > > + *		A call to this helper is susceptible to change the underlying
> > > > + *		packet buffer. Therefore, at load time, all checks on pointers
> > > > + *		previously done by the verifier are invalidated and must be
> > > > + *		performed again, if the helper is used in combination with
> > > > + *		direct packet access.
> > > > + *
> > > > + *	Return
> > > > + *		0 on success, or a negative error in case of failure.
> > > >   */
> > > >  #define __BPF_FUNC_MAPPER(FN)		\
> > > >  	FN(unspec),			\
> > > > @@ -3727,6 +3741,7 @@ union bpf_attr {
> > > >  	FN(inode_storage_delete),	\
> > > >  	FN(d_path),			\
> > > >  	FN(copy_from_user),		\
> > > > +	FN(xdp_adjust_mb_header),	\
> > > >  	/* */
> > > >  
> > > >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> > > > diff --git a/net/core/filter.c b/net/core/filter.c
> > > > index 47eef9a0be6a..ae6b10cf062d 100644
> > > > --- a/net/core/filter.c
> > > > +++ b/net/core/filter.c
> > > > @@ -3475,6 +3475,57 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
> > > >  	.arg2_type	= ARG_ANYTHING,
> > > >  };
> > > >  
> > > > +BPF_CALL_2(bpf_xdp_adjust_mb_header, struct  xdp_buff *, xdp,
> > > > +	   int, offset)
> > > > +{
> > > > +	void *data_hard_end, *data_end;
> > > > +	struct skb_shared_info *sinfo;
> > > > +	int frag_offset, frag_len;
> > > > +	u8 *addr;
> > > > +
> > > > +	if (!xdp->mb)
> > > > +		return -EOPNOTSUPP;
> 
> Not required for this patch necessarily but I think it would be better user
> experience if instead of EOPNOTSUPP here we did the header split. This
> would allocate a frag and copy the bytes around as needed. Yes it might
> be slow if you don't have a frag free in the driver, but if user wants to
> do header split and their hardware can't do it we would have a way out.
> 
> I guess it could be an improvement for later though.

I do not have a strong opinion on this; I did it this way to respect the rule "we
do not allocate memory for XDP".

@Jesper, David: thoughts?

> 
> 
> > > > +
> > > > +	sinfo = xdp_get_shared_info_from_buff(xdp);
> > > > +
> > > > +	frag_len = skb_frag_size(&sinfo->frags[0]);
> > > > +	if (offset > frag_len)
> > > > +		return -EINVAL;
> > > 
> > > What if we want data in frags[1] and so on.
> > > 
> > > > +
> > > > +	frag_offset = skb_frag_off(&sinfo->frags[0]);
> > > > +	data_end = xdp->data_end + offset;
> > > > +
> > > > +	if (offset < 0 && (-offset > frag_offset ||
> > > > +			   data_end < xdp->data + ETH_HLEN))
> > > > +		return -EINVAL;
> > > > +
> > > > +	data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
> > > > +	if (data_end > data_hard_end)
> > > > +		return -EINVAL;
> > > > +
> > > > +	addr = page_address(skb_frag_page(&sinfo->frags[0])) + frag_offset;
> > > > +	if (offset > 0) {
> > > > +		memcpy(xdp->data_end, addr, offset);
> > > 
> > > But this could get expensive for large amount of data? And also
> > > limiting because we require the room to do the copy. Presumably
> > > the reason we have fargs[1] is because the normal page or half
> > > page is in use?
> > > 
> > > > +	} else {
> > > > +		memcpy(addr + offset, xdp->data_end + offset, -offset);
> > > > +		memset(xdp->data_end + offset, 0, -offset);
> > > > +	}
> > > > +
> > > > +	skb_frag_size_sub(&sinfo->frags[0], offset);
> > > > +	skb_frag_off_add(&sinfo->frags[0], offset);
> > > > +	xdp->data_end = data_end;
> > > > +
> > > > +	return 0;
> > > > +}
> > > 
> > > So overall I don't see the point of copying bytes from one frag to
> > > another. Create an API that adjusts the data pointers and then
> > > copies are avoided and manipulating frags is not needed.
> > 
> > please see above.
> 
> OK it makes more sense with the context. It doesn't have much if anything
> to do about making data visible to the BPF program. This is about
> changing the layout of the frags list.

correct.

> 
> How/when does the header split go wrong on the mvneta device? I guess
> this is to fix a real bug/issue not some theoritical one? An example
> in the commit message would make this concrete. Soemthing like,
> "When using RX zerocopy to mmap data into userspace application if
> a packet with [all these wild headers] is received rx zerocopy breaks
> because header split puts headers X in the data frag confusing apps".

This issue does not occur with mvneta since the driver is not capable of
performing header split AFAIK. The helper has been introduced to cover the
"issue" reported by Eric in his NetDevConf presentation. In order to test the
helper I modified the mvneta rx napi loop in a controlled way (this patch can't
be sent upstream, it is for testing only :)).
I will improve the commit message in v3.

> 
> > 
> > > 
> > > Also and even more concerning I think this API requires the
> > > driver to populate shinfo. If we use TX_REDIRECT a lot or TX_XMIT
> > > this means we need to populate shinfo when its probably not ever
> > > used. If our driver is smart L2/L3 headers are in the readable
> > > data and prefetched. Writing frags into the end of a page is likely
> > > not free.
> > 
> > Sorry I did not get what you mean with "populate shinfo" here. We need to
> > properly set shared_info in order to create the xdp multi-buff.
> > Apart of header splits, please consider the main uses-cases for
> > xdp multi-buff are XDP with TSO and Jumbo frames.
> 
> The use case I have in mind is a XDP_TX or XDP_REDIRECT load balancer.
> I wont know this at the driver level and now I'll have to write into
> the back of every page with this shinfo just in case. If header
> split is working I should never need to even touch the page outside
> the first N bytes that were DMAd and in cache with DDIO. So its extra
> overhead for something that is unlikely to happen in the LB case.

So far the skb_shared_info is constructed in mvneta only if the hw splits
the received data into multiple buffers (so if the MTU is greater than 1 PAGE,
please see the comments below). Moreover, the shared_info is present only in the
first buffer.

> 
> If you take the simplest possible program that just returns XDP_TX
> and run a pkt generator against it. I believe (haven't run any
> tests) that you will see overhead now just from populating this
> shinfo. I think it needs to only be done when its needed e.g. when
> user makes this helper call or we need to build the skb and populate
> the frags there.

sure, I will carry out some tests.

> 
> I think a smart driver will just keep the frags list in whatever
> form it has them (rx descriptors?) and push them over to the
> tx descriptors without having to do extra work with frag lists.

I think there are many use-cases where we want to have this info available in
xdp_buff/xdp_frame. E.g. let's consider the following Jumbo frame example:
- MTU > 1 PAGE (so the driver will split the received data into multiple rx
  descriptors)
- the driver performs an XDP_REDIRECT to a veth or cpumap

Relying on the proposed architecture, we could enable GRO in veth or cpumap,
I guess, since we can build a non-linear skb from the xdp multi-buff, right?
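
As a rough sketch of that direction, inside a hypothetical xdp-to-skb
conversion function (assuming the skb linear part has already been built
from buff0; error handling and truesize accounting are simplified):

	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
	int i;

	/* attach the remaining buffers as frags of the new skb */
	for (i = 0; i < sinfo->nr_frags; i++)
		skb_add_rx_frag(skb, i, skb_frag_page(&sinfo->frags[i]),
				skb_frag_off(&sinfo->frags[i]),
				skb_frag_size(&sinfo->frags[i]),
				PAGE_SIZE);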

> 
> > 
> > > 
> > > Did you benchmark this?
> > 
> > will do, I need to understand if we can use tiny buffers in mvneta.
> 
> Why tiny buffers? How does mvneta layout the frags when doing
> header split? Can we just benchmark what mvneta is doing at the
> end of this patch series?

for the moment mvneta can split the received data when the previous buffer is
full (e.g. when we the first page is completely written). I want to explore if
I can set a tiny buffer (e.g. 128B) as max received buffer to run some performance
tests and have some "comparable" results respect to the ones I got when I added XDP
support to mvneta.

> 
> Also can you try the basic XDP_TX case mentioned above.
> I don't want this to degrade existing use cases if at all
> possible.

sure, will do.

> 
> > 
> > > 
> > > In general users of this API should know the bytes they want
> > > to fetch. Use an API like this,
> > > 
> > >   bpf_xdp_adjust_bytes(xdp, start, end)
> > > 
> > > Where xdp is the frame, start is the first byte the user wants
> > > and end is the last byte. Then usually you can skip the entire
> > > copy part and just move the xdp pointesr around. The ugly case
> > > is if the user puts start/end across a frag boundary and a copy
> > > is needed. In that case maybe we use end as a hint and not a
> > > hard requirement.
> > > 
> > > The use case I see is I read L2/L3/L4 headers and I need the
> > > first N bytes of the payload. I know where the payload starts
> > > and I know how many bytes I need so,
> > > 
> > >   bpf_xdp_adjust_bytes(xdp, payload_offset, bytes_needed);
> > > 
> > > Then hopefully that is all in one frag. If its not I'll need
> > > to do a second helper call. Still nicer than forcing drivers
> > > to populate this shinfo I think. If you think I'm wrong try
> > > a benchmark. Benchmarks aside I get stuck when data_end and
> > > data_hard_end are too close together.
> > 
> > IIUC what you mean here is to expose L2/L3/L4 headers + some data to
> > the ebpf program to perform like L7 load-balancing, right?
> 
> Correct, but with extra context I see in this patch you are trying
> to build an XDP controlled header split. This seems like a different
> use case from mine.

I agree.

> 
> > Let's consider the Jumbo frames use-case (so the data are split in multiple
> > buffers). I can see to issues here:
> > - since in XDP we can't linearize the buffer if start and end are on the
> >   different pages (like we do in bpf_msg_pull_data()), are we ending up
> >   in the case where requested data are all in buff0? 
> 
> In this case I would expect either the helper returns how many bytes
> were pulled in, maybe just (start, end_of_frag) or user can find
> it from data_end pointer. Here end is just a hint.
> 
> > - if  start and end are in buff<2>, we should report the fragment number to the
> >   ebpf program to "fetch" the data. Are we exposing too low-level details to
> >   user-space?
> 
> Why do you need the frag number? Just say I want bytes (X,Y) if that
> happens to be on buff<2> let the helper find it.
> 
> I think having a helper to read/write any bytes is important and
> necessary, but the helper implemented in this patch is something
> else. I get naming is hard what if we called it xdp_header_split().

ack, sure. I will fix it in v3.

Regards,
Lorenzo

> 
> > 
> > Regards,
> > Lorenzo
> > 
> > > 
> > > Thanks,
> > > John
> 
>
John Fastabend Sept. 8, 2020, 7:57 p.m. UTC | #11
Lorenzo Bianconi wrote:
> > Lorenzo Bianconi wrote:
> > > > Lorenzo Bianconi wrote:
> 
> [...]
> 
> > > > > + *	Description
> > > > > + *		Adjust frame headers moving *offset* bytes from/to the second
> > > > > + *		buffer to/from the first one. This helper can be used to move
> > > > > + *		headers when the hw DMA SG does not copy all the headers in
> > > > > + *		the first fragment.
> > > 
> > > + Eric to the discussion
> > > 

[...]

> > > > > +BPF_CALL_2(bpf_xdp_adjust_mb_header, struct  xdp_buff *, xdp,
> > > > > +	   int, offset)
> > > > > +{
> > > > > +	void *data_hard_end, *data_end;
> > > > > +	struct skb_shared_info *sinfo;
> > > > > +	int frag_offset, frag_len;
> > > > > +	u8 *addr;
> > > > > +
> > > > > +	if (!xdp->mb)
> > > > > +		return -EOPNOTSUPP;
> > 
> > Not required for this patch necessarily but I think it would be better user
> > experience if instead of EOPNOTSUPP here we did the header split. This
> > would allocate a frag and copy the bytes around as needed. Yes it might
> > be slow if you don't have a frag free in the driver, but if user wants to
> > do header split and their hardware can't do it we would have a way out.
> > 
> > I guess it could be an improvement for later though.
> 
> I have no a strong opinion on this, I did it in this way to respect the rule "we
> do not allocate memory for XDP".
> 
> @Jesper, David: thoughts?

Consider adding a flags field to the helper so we could do this later with
a flag. Then users who want the alloc can set the flag and get it.
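
Something like the following uapi shape, for instance (the flag name is
illustrative only):

long bpf_xdp_adjust_mb_header(struct xdp_md *xdp_md, int offset, __u64 flags);

/* hypothetical flag: allow the helper to allocate a frag and perform the
 * header split itself when the hw did not
 */
#define BPF_MB_HEADER_ALLOC_FRAG	(1ULL << 0)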

[...]

> 
> > 
> > How/when does the header split go wrong on the mvneta device? I guess
> > this is to fix a real bug/issue not some theoritical one? An example
> > in the commit message would make this concrete. Soemthing like,
> > "When using RX zerocopy to mmap data into userspace application if
> > a packet with [all these wild headers] is received rx zerocopy breaks
> > because header split puts headers X in the data frag confusing apps".
> 
> This issue does not occur with mvneta since the driver is not capable of
> performing header split AFAIK. The helper has been introduced to cover the
> "issue" reported by Eric in his NetDevConf presentation. In order to test the
> helper I modified the mventa rx napi loop in a controlled way (this patch can't
> be sent upstream, it is for testing only :))
> I will improve commit message in v3.

Ah OK, so really there are no users for the helper then; IMO just drop
the patch until we have a user.

> 
> > 
> > > 
> > > > 
> > > > Also and even more concerning I think this API requires the
> > > > driver to populate shinfo. If we use TX_REDIRECT a lot or TX_XMIT
> > > > this means we need to populate shinfo when its probably not ever
> > > > used. If our driver is smart L2/L3 headers are in the readable
> > > > data and prefetched. Writing frags into the end of a page is likely
> > > > not free.
> > > 
> > > Sorry I did not get what you mean with "populate shinfo" here. We need to
> > > properly set shared_info in order to create the xdp multi-buff.
> > > Apart of header splits, please consider the main uses-cases for
> > > xdp multi-buff are XDP with TSO and Jumbo frames.
> > 
> > The use case I have in mind is a XDP_TX or XDP_REDIRECT load balancer.
> > I wont know this at the driver level and now I'll have to write into
> > the back of every page with this shinfo just in case. If header
> > split is working I should never need to even touch the page outside
> > the first N bytes that were DMAd and in cache with DDIO. So its extra
> > overhead for something that is unlikely to happen in the LB case.
> 
> So far the skb_shared_info in constructed in mvneta only if the hw splits
> the received data in multiple buffers (so if the MTU is greater than 1 PAGE,
> please see comments below). Moreover the shared_info is present only in the
> first buffer.

Still in a normal L2/L3/L4 use case I expect all the headers you
need to be in the first buffer so its unlikely for use cases that
send most traffic via XDP_TX for example to ever need the extra
info. In these cases I think you are paying some penalty for
having to do the work of populating the shinfo. Maybe its measurable
maybe not I'm not sure.

Also if we make it required for multi-buffer then we also need
the shinfo on 40gbps or 100gbps nics and now even small costs
matter.

> 
> > 
> > If you take the simplest possible program that just returns XDP_TX
> > and run a pkt generator against it. I believe (haven't run any
> > tests) that you will see overhead now just from populating this
> > shinfo. I think it needs to only be done when its needed e.g. when
> > user makes this helper call or we need to build the skb and populate
> > the frags there.
> 
> sure, I will carry out some tests.

Thanks!

> 
> > 
> > I think a smart driver will just keep the frags list in whatever
> > form it has them (rx descriptors?) and push them over to the
> > tx descriptors without having to do extra work with frag lists.
> 
> I think there are many use-cases where we want to have this info available in
> xdp_buff/xdp_frame. E.g: let's consider the following Jumbo frame example:
> - MTU > 1 PAGE (so the driver will split the received data in multiple rx
>   descriptors)
> - the driver performs a XDP_REDIRECT to a veth or cpumap
> 
> Relying on the proposed architecture we could enable GRO in veth or cpumap I
> guess since we can build a non-linear skb from the xdp multi-buff, right?

I'm not disputing there are use-cases. But, I'm trying to see if we
can cover those without introducing additional latency in other
cases. Hence the extra benchmarks request ;)
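
For the veth/cpumap GRO case mentioned above, a rough sketch (not code from
this series; the function name is illustrative) of how a consumer could
attach the fragments described by the xdp shared_info to a freshly built skb,
assuming the linear part has already been copied in:

static void xdp_mb_frags_to_skb(struct sk_buff *skb, struct xdp_buff *xdp)
{
	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
	int i;

	if (!xdp->mb)
		return;		/* single-buffer frame, the linear skb is enough */

	for (i = 0; i < sinfo->nr_frags; i++) {
		skb_frag_t *frag = &sinfo->frags[i];

		/* skb_add_rx_frag() updates skb->len, data_len and truesize;
		 * PAGE_SIZE as truesize is an approximation here.
		 */
		skb_add_rx_frag(skb, i, skb_frag_page(frag),
				skb_frag_off(frag), skb_frag_size(frag),
				PAGE_SIZE);
	}
}

Passing the resulting non-linear skb to napi_gro_receive() is what would let
GRO coalesce jumbo frames redirected to veth or cpumap.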

> 
> > 
> > > 
> > > > 
> > > > Did you benchmark this?
> > > 
> > > will do, I need to understand if we can use tiny buffers in mvneta.
> > 
> > Why tiny buffers? How does mvneta layout the frags when doing
> > header split? Can we just benchmark what mvneta is doing at the
> > end of this patch series?
> 
> for the moment mvneta can split the received data when the previous buffer is
> full (e.g. when the first page is completely written). I want to explore if
> I can set a tiny buffer (e.g. 128B) as max received buffer to run some performance
> tests and have some "comparable" results with respect to the ones I got when I added XDP
> support to mvneta.

OK would be great.

> 
> > 
> > Also can you try the basic XDP_TX case mentioned above.
> > I don't want this to degrade existing use cases if at all
> > possible.
> 
> sure, will do.

Thanks!
Lorenzo Bianconi Sept. 8, 2020, 9:31 p.m. UTC | #12
> Lorenzo Bianconi wrote:
> > > Lorenzo Bianconi wrote:
> > > > > Lorenzo Bianconi wrote:
> > 
> > [...]
> > 
> > > > > > + *	Description
> > > > > > + *		Adjust frame headers moving *offset* bytes from/to the second
> > > > > > + *		buffer to/from the first one. This helper can be used to move
> > > > > > + *		headers when the hw DMA SG does not copy all the headers in
> > > > > > + *		the first fragment.
> > > > 
> > > > + Eric to the discussion
> > > > 
> 
> [...]
> 
> > > > > > +BPF_CALL_2(bpf_xdp_adjust_mb_header, struct  xdp_buff *, xdp,
> > > > > > +	   int, offset)
> > > > > > +{
> > > > > > +	void *data_hard_end, *data_end;
> > > > > > +	struct skb_shared_info *sinfo;
> > > > > > +	int frag_offset, frag_len;
> > > > > > +	u8 *addr;
> > > > > > +
> > > > > > +	if (!xdp->mb)
> > > > > > +		return -EOPNOTSUPP;
> > > 
> > > Not required for this patch necessarily but I think it would be better user
> > > experience if instead of EOPNOTSUPP here we did the header split. This
> > > would allocate a frag and copy the bytes around as needed. Yes it might
> > > be slow if you don't have a frag free in the driver, but if user wants to
> > > do header split and their hardware can't do it we would have a way out.
> > > 
> > > I guess it could be an improvement for later though.
> > 
> > I have no strong opinion on this; I did it this way to respect the rule "we
> > do not allocate memory for XDP".
> > 
> > @Jesper, David: thoughts?
> 
> Consider adding a flags field to the helper so we could do this later with
> a flag. Then users who want the alloc can set the flag and get it.
> 
> [...]
> 
> > 
> > > 
> > > How/when does the header split go wrong on the mvneta device? I guess
> > > this is to fix a real bug/issue not some theoretical one? An example
> > > in the commit message would make this concrete. Something like,
> > > "When using RX zerocopy to mmap data into userspace application if
> > > a packet with [all these wild headers] is received rx zerocopy breaks
> > > because header split puts headers X in the data frag confusing apps".
> > 
> > This issue does not occur with mvneta since the driver is not capable of
> > performing header split AFAIK. The helper has been introduced to cover the
> > "issue" reported by Eric in his NetDevConf presentation. In order to test the
> > helper I modified the mvneta rx napi loop in a controlled way (this patch can't
> > be sent upstream, it is for testing only :))
> > I will improve commit message in v3.
> 
> Ah ok, so really there are no users for the helper then. IMO just drop
> the patch until we have a user.

I agree, this helper is not strictly related to the series. I added it in v2
to provide another example of using the xdp_buff mb bit.

> 
> > 
> > > 
> > > > 
> > > > > 
> > > > > Also and even more concerning I think this API requires the
> > > > > driver to populate shinfo. If we use TX_REDIRECT a lot or TX_XMIT
> > > > > this means we need to populate shinfo when its probably not ever
> > > > > used. If our driver is smart L2/L3 headers are in the readable
> > > > > data and prefetched. Writing frags into the end of a page is likely
> > > > > not free.
> > > > 
> > > > Sorry I did not get what you mean with "populate shinfo" here. We need to
> > > > properly set shared_info in order to create the xdp multi-buff.
> > > > Apart from header splits, please consider the main use-cases for
> > > > xdp multi-buff are XDP with TSO and Jumbo frames.
> > > 
> > > The use case I have in mind is a XDP_TX or XDP_REDIRECT load balancer.
> > > I wont know this at the driver level and now I'll have to write into
> > > the back of every page with this shinfo just in case. If header
> > > split is working I should never need to even touch the page outside
> > > the first N bytes that were DMAd and in cache with DDIO. So its extra
> > > overhead for something that is unlikely to happen in the LB case.
> > 
> > So far the skb_shared_info is constructed in mvneta only if the hw splits
> > the received data in multiple buffers (so if the MTU is greater than 1 PAGE,
> > please see comments below). Moreover the shared_info is present only in the
> > first buffer.
> 
> Still in a normal L2/L3/L4 use case I expect all the headers you
> need to be in the first buffer so its unlikely for use cases that
> send most traffic via XDP_TX for example to ever need the extra
> info. In these cases I think you are paying some penalty for
> having to do the work of populating the shinfo. Maybe its measurable
> maybe not I'm not sure.
> 
> Also if we make it required for multi-buffer then we also need
> the shinfo on 40gbps or 100gbps nics and now even small costs
> matter.

I realize now that I used the word "split" in an unclear way here, I
apologize for that.
What I mean is not related to "header" split; I am referring to the case
where the hw is configured with a given rx buffer size (e.g. 1 PAGE) and we
have set a higher MTU/max receive size (e.g. 9K).
In this case the hw will "split" the received jumbo frame over multiple rx
buffers/descriptors. By populating the "xdp_shared_info" we forward this
layout info to the eBPF sandbox and to a remote driver/cpu.
Please note this use case is not currently covered by XDP, so if we develop
it in a proper way I guess we should not get any performance hit for the
legacy single-buffer mode, since we will not populate the shared_info for it
(I think you refer to the "legacy" use-case in your "normal L2/L3/L4"
example, right?).
Anyway I will run some tests to verify that performance for the single-buffer
use-case is not hit.
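
To make the layout described above concrete, here is a minimal sketch of the
rx-side bookkeeping; the real code lives in the mvneta patches of this
series, and the function name here is illustrative:

static void rx_add_mb_frag(struct xdp_buff *xdp, struct page *page,
			   unsigned int offset, unsigned int len)
{
	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
	skb_frag_t *frag;

	if (!xdp->mb) {
		/* First extra buffer of this frame: start the frag array
		 * in the shared_info at the end of the head buffer.
		 */
		sinfo->nr_frags = 0;
		xdp->mb = 1;
	}

	/* A real driver must also cap nr_frags at MAX_SKB_FRAGS. */
	frag = &sinfo->frags[sinfo->nr_frags++];
	__skb_frag_set_page(frag, page);
	skb_frag_off_set(frag, offset);
	skb_frag_size_set(frag, len);
}

Only multi-buffer frames pay this cost; for frames that fit the first rx
buffer xdp->mb stays 0 and the shared_info is never touched, which is what
the single-buffer benchmarks below are meant to confirm.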

Regards,
Lorenzo

Lorenzo Bianconi Sept. 9, 2020, 8:51 p.m. UTC | #13
> > Lorenzo Bianconi wrote:
> > > > Lorenzo Bianconi wrote:
> > > > > > Lorenzo Bianconi wrote:
> > > 
> > > [...]
> > > 
> > > > > > > + *	Description
> > > > > > > + *		Adjust frame headers moving *offset* bytes from/to the second
> > > > > > > + *		buffer to/from the first one. This helper can be used to move
> > > > > > > + *		headers when the hw DMA SG does not copy all the headers in
> > > > > > > + *		the first fragment.
> > > > > 
> > > > > + Eric to the discussion
> > > > > 
> > 
> > [...]
> > 

[...]

> > 
> > Still in a normal L2/L3/L4 use case I expect all the headers you
> > need to be in the first buffer so its unlikely for use cases that
> > send most traffic via XDP_TX for example to ever need the extra
> > info. In these cases I think you are paying some penalty for
> > having to do the work of populating the shinfo. Maybe its measurable
> > maybe not I'm not sure.
> > 
> > Also if we make it required for multi-buffer then we also need
> > the shinfo on 40gbps or 100gbps nics and now even small costs
> > matter.
> 
> I realize now that I used the word "split" in an unclear way here, I
> apologize for that.
> What I mean is not related to "header" split; I am referring to the case
> where the hw is configured with a given rx buffer size (e.g. 1 PAGE) and we
> have set a higher MTU/max receive size (e.g. 9K).
> In this case the hw will "split" the received jumbo frame over multiple rx
> buffers/descriptors. By populating the "xdp_shared_info" we forward this
> layout info to the eBPF sandbox and to a remote driver/cpu.
> Please note this use case is not currently covered by XDP, so if we develop
> it in a proper way I guess we should not get any performance hit for the
> legacy single-buffer mode, since we will not populate the shared_info for it
> (I think you refer to the "legacy" use-case in your "normal L2/L3/L4"
> example, right?).
> Anyway I will run some tests to verify that performance for the single-buffer
> use-case is not hit.
> 
> Regards,
> Lorenzo

I carried out some performance measurements on my Espressobin to check whether
the XDP "single buffer" use-case is degraded by introducing xdp multi-buff
support. Each test was carried out sending ~900 Kpps (pkt length 64B). The rx
buffer size was set to 1 PAGE (default value).
The results are roughly the same:

commit: f2ca673d2cd5 "net: mvneta: fix use of state->speed"
==========================================================
- XDP-DROP: ~ 740 Kpps
- XDP-TX: ~ 286 Kpps
- XDP-PASS + tc drop: ~ 219.5 Kpps

xdp multi-buff:
===============
- XDP-DROP: ~ 739-740 Kpps
- XDP-TX: ~ 285 Kpps
- XDP-PASS + tc drop: ~ 223 Kpps

I will add these results to v3 cover letter.

Regards,
Lorenzo

diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8dda13880957..c4a6d245619c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3571,11 +3571,25 @@  union bpf_attr {
  *		value.
  *
  * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
- * 	Description
- * 		Read *size* bytes from user space address *user_ptr* and store
- * 		the data in *dst*. This is a wrapper of copy_from_user().
- * 	Return
- * 		0 on success, or a negative error in case of failure.
+ *	Description
+ *		Read *size* bytes from user space address *user_ptr* and store
+ *		the data in *dst*. This is a wrapper of copy_from_user().
+ *
+ * long bpf_xdp_adjust_mb_header(struct xdp_buff *xdp_md, int offset)
+ *	Description
+ *		Adjust frame headers moving *offset* bytes from/to the second
+ *		buffer to/from the first one. This helper can be used to move
+ *		headers when the hw DMA SG does not copy all the headers in
+ *		the first fragment.
+ *
+ *		A call to this helper is susceptible to change the underlying
+ *		packet buffer. Therefore, at load time, all checks on pointers
+ *		previously done by the verifier are invalidated and must be
+ *		performed again, if the helper is used in combination with
+ *		direct packet access.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3727,6 +3741,7 @@  union bpf_attr {
 	FN(inode_storage_delete),	\
 	FN(d_path),			\
 	FN(copy_from_user),		\
+	FN(xdp_adjust_mb_header),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/net/core/filter.c b/net/core/filter.c
index 47eef9a0be6a..ae6b10cf062d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3475,6 +3475,57 @@  static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_xdp_adjust_mb_header, struct  xdp_buff *, xdp,
+	   int, offset)
+{
+	void *data_hard_end, *data_end;
+	struct skb_shared_info *sinfo;
+	int frag_offset, frag_len;
+	u8 *addr;
+
+	if (!xdp->mb)
+		return -EOPNOTSUPP;
+
+	sinfo = xdp_get_shared_info_from_buff(xdp);
+
+	frag_len = skb_frag_size(&sinfo->frags[0]);
+	if (offset > frag_len)
+		return -EINVAL;
+
+	frag_offset = skb_frag_off(&sinfo->frags[0]);
+	data_end = xdp->data_end + offset;
+
+	if (offset < 0 && (-offset > frag_offset ||
+			   data_end < xdp->data + ETH_HLEN))
+		return -EINVAL;
+
+	data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
+	if (data_end > data_hard_end)
+		return -EINVAL;
+
+	addr = page_address(skb_frag_page(&sinfo->frags[0])) + frag_offset;
+	if (offset > 0) {
+		memcpy(xdp->data_end, addr, offset);
+	} else {
+		memcpy(addr + offset, xdp->data_end + offset, -offset);
+		memset(xdp->data_end + offset, 0, -offset);
+	}
+
+	skb_frag_size_sub(&sinfo->frags[0], offset);
+	skb_frag_off_add(&sinfo->frags[0], offset);
+	xdp->data_end = data_end;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_mb_header_proto = {
+	.func		= bpf_xdp_adjust_mb_header,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
 {
 	void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
@@ -6505,6 +6556,7 @@  bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_msg_push_data ||
 	    func == bpf_msg_pop_data ||
 	    func == bpf_xdp_adjust_tail ||
+	    func == bpf_xdp_adjust_mb_header ||
 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
 	    func == bpf_lwt_seg6_store_bytes ||
 	    func == bpf_lwt_seg6_adjust_srh ||
@@ -6835,6 +6887,8 @@  xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_xdp_redirect_map_proto;
 	case BPF_FUNC_xdp_adjust_tail:
 		return &bpf_xdp_adjust_tail_proto;
+	case BPF_FUNC_xdp_adjust_mb_header:
+		return &bpf_xdp_adjust_mb_header_proto;
 	case BPF_FUNC_fib_lookup:
 		return &bpf_xdp_fib_lookup_proto;
 #ifdef CONFIG_INET
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 8dda13880957..392d52a2ecef 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3407,6 +3407,7 @@  union bpf_attr {
  *		A non-negative value equal to or less than *size* on success,
  *		or a negative error in case of failure.
  *
+ *
  * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags)
  *	Description
  *		Load header option.  Support reading a particular TCP header
@@ -3571,11 +3572,25 @@  union bpf_attr {
  *		value.
  *
  * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr)
- * 	Description
- * 		Read *size* bytes from user space address *user_ptr* and store
- * 		the data in *dst*. This is a wrapper of copy_from_user().
- * 	Return
- * 		0 on success, or a negative error in case of failure.
+ *	Description
+ *		Read *size* bytes from user space address *user_ptr* and store
+ *		the data in *dst*. This is a wrapper of copy_from_user().
+ *
+ * long bpf_xdp_adjust_mb_header(struct xdp_buff *xdp_md, int offset)
+ *	Description
+ *		Adjust frame headers moving *offset* bytes from/to the second
+ *		buffer to/from the first one. This helper can be used to move
+ *		headers when the hw DMA SG does not copy all the headers in
+ *		the first fragment.
+ *
+ *		A call to this helper is susceptible to change the underlying
+ *		packet buffer. Therefore, at load time, all checks on pointers
+ *		previously done by the verifier are invalidated and must be
+ *		performed again, if the helper is used in combination with
+ *		direct packet access.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3727,6 +3742,7 @@  union bpf_attr {
 	FN(inode_storage_delete),	\
 	FN(d_path),			\
 	FN(copy_from_user),		\
+	FN(xdp_adjust_mb_header),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper