diff mbox series

[RFC,v4,bpf-next,03/11] xdp: Add xdp_txq_info to xdp_buff

Message ID 20200227032013.12385-4-dsahern@kernel.org
State RFC
Delegated to: BPF Maintainers
Headers show
Series Add support for XDP in egress path | expand

Commit Message

David Ahern Feb. 27, 2020, 3:20 a.m. UTC
From: David Ahern <dahern@digitalocean.com>

Add xdp_txq_info as the Tx counterpart to xdp_rxq_info. At the
moment only the device is added. Other fields (queue_index)
can be added as use cases arise.

From a UAPI perspective, egress_ifindex is a union with ingress_ifindex
since only one applies based on where the program is attached.

Signed-off-by: David Ahern <dahern@digitalocean.com>
---
 include/net/xdp.h        |  5 +++++
 include/uapi/linux/bpf.h |  6 ++++--
 net/core/filter.c        | 27 +++++++++++++++++++--------
 3 files changed, 28 insertions(+), 10 deletions(-)

Comments

Jesper Dangaard Brouer Feb. 27, 2020, 8 a.m. UTC | #1
On Wed, 26 Feb 2020 20:20:05 -0700
David Ahern <dsahern@kernel.org> wrote:

> From: David Ahern <dahern@digitalocean.com>
> 
> Add xdp_txq_info as the Tx counterpart to xdp_rxq_info. At the
> moment only the device is added. Other fields (queue_index)
> can be added as use cases arise.
> 
> From a UAPI perspective, egress_ifindex is a union with ingress_ifindex
> since only one applies based on where the program is attached.
> 
> Signed-off-by: David Ahern <dahern@digitalocean.com>
> ---
>  include/net/xdp.h        |  5 +++++
>  include/uapi/linux/bpf.h |  6 ++++--
>  net/core/filter.c        | 27 +++++++++++++++++++--------
>  3 files changed, 28 insertions(+), 10 deletions(-)
> 
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 40c6d3398458..5584b9db86fe 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -63,6 +63,10 @@ struct xdp_rxq_info {
>  	struct xdp_mem_info mem;
>  } ____cacheline_aligned; /* perf critical, avoid false-sharing */
>  
> +struct xdp_txq_info {
> +	struct net_device *dev;
> +};
> +
>  struct xdp_buff {
>  	void *data;
>  	void *data_end;
> @@ -70,6 +74,7 @@ struct xdp_buff {
>  	void *data_hard_start;
>  	unsigned long handle;
>  	struct xdp_rxq_info *rxq;
> +	struct xdp_txq_info *txq;
>  };
>  
>  struct xdp_frame {
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 7850f8683b81..5e3f8aefad41 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3334,8 +3334,10 @@ struct xdp_md {
>  	__u32 data;
>  	__u32 data_end;
>  	__u32 data_meta;
> -	/* Below access go through struct xdp_rxq_info */
> -	__u32 ingress_ifindex; /* rxq->dev->ifindex */
> +	union {
> +		__u32 ingress_ifindex; /* rxq->dev->ifindex */
> +		__u32 egress_ifindex;  /* txq->dev->ifindex */
> +	};

Are we sure it is wise to "union share" (struct) xdp_md as the
XDP-context in the XDP programs, with different expected_attach_type?
As this allows the XDP-programmer to code an EGRESS program that access
ctx->ingress_ifindex, this will under the hood be translated to
ctx->egress_ifindex, because from the compilers-PoV this will just be an
offset.

We are setting up the XDP-programmer for a long debugging session, as
she will be expecting to read 'ingress_ifindex', but will be getting
'egress_ifindex'.  (As the compiler cannot warn her, and it is also
correct seen from the verifier).


>  	__u32 rx_queue_index;  /* rxq->queue_index  */

So, the TX program can still read 'rx_queue_index', is this wise?
(It should be easy to catch below and reject).


>  };
>  
> diff --git a/net/core/filter.c b/net/core/filter.c
> index c7cc98c55621..d1c65dccd671 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -7716,14 +7716,25 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
>  				      offsetof(struct xdp_buff, data_end));
>  		break;
>  	case offsetof(struct xdp_md, ingress_ifindex):
> -		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
> -				      si->dst_reg, si->src_reg,
> -				      offsetof(struct xdp_buff, rxq));
> -		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
> -				      si->dst_reg, si->dst_reg,
> -				      offsetof(struct xdp_rxq_info, dev));
> -		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
> -				      offsetof(struct net_device, ifindex));
> +		if (prog->expected_attach_type == BPF_XDP_EGRESS) {
> +			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq),
> +					      si->dst_reg, si->src_reg,
> +					      offsetof(struct xdp_buff, txq));
> +			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev),
> +					      si->dst_reg, si->dst_reg,
> +					      offsetof(struct xdp_txq_info, dev));
> +			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
> +					      offsetof(struct net_device, ifindex));
> +		} else {
> +			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
> +					      si->dst_reg, si->src_reg,
> +					      offsetof(struct xdp_buff, rxq));
> +			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
> +					      si->dst_reg, si->dst_reg,
> +					      offsetof(struct xdp_rxq_info, dev));
> +			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
> +					      offsetof(struct net_device, ifindex));
> +		}
>  		break;
>  	case offsetof(struct xdp_md, rx_queue_index):
>  		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),

We can catch and disallow access to rx_queue_index from expected_attach_type
BPF_XDP_EGRESS, here.  But then we are adding more code to handle/separate
egress from normal RX/ingress.
Toke Høiland-Jørgensen Feb. 27, 2020, 11:58 a.m. UTC | #2
Jesper Dangaard Brouer <brouer@redhat.com> writes:

> On Wed, 26 Feb 2020 20:20:05 -0700
> David Ahern <dsahern@kernel.org> wrote:
>
>> From: David Ahern <dahern@digitalocean.com>
>> 
>> Add xdp_txq_info as the Tx counterpart to xdp_rxq_info. At the
>> moment only the device is added. Other fields (queue_index)
>> can be added as use cases arise.
>> 
>> From a UAPI perspective, egress_ifindex is a union with ingress_ifindex
>> since only one applies based on where the program is attached.
>> 
>> Signed-off-by: David Ahern <dahern@digitalocean.com>
>> ---
>>  include/net/xdp.h        |  5 +++++
>>  include/uapi/linux/bpf.h |  6 ++++--
>>  net/core/filter.c        | 27 +++++++++++++++++++--------
>>  3 files changed, 28 insertions(+), 10 deletions(-)
>> 
>> diff --git a/include/net/xdp.h b/include/net/xdp.h
>> index 40c6d3398458..5584b9db86fe 100644
>> --- a/include/net/xdp.h
>> +++ b/include/net/xdp.h
>> @@ -63,6 +63,10 @@ struct xdp_rxq_info {
>>  	struct xdp_mem_info mem;
>>  } ____cacheline_aligned; /* perf critical, avoid false-sharing */
>>  
>> +struct xdp_txq_info {
>> +	struct net_device *dev;
>> +};
>> +
>>  struct xdp_buff {
>>  	void *data;
>>  	void *data_end;
>> @@ -70,6 +74,7 @@ struct xdp_buff {
>>  	void *data_hard_start;
>>  	unsigned long handle;
>>  	struct xdp_rxq_info *rxq;
>> +	struct xdp_txq_info *txq;
>>  };
>>  
>>  struct xdp_frame {
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 7850f8683b81..5e3f8aefad41 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -3334,8 +3334,10 @@ struct xdp_md {
>>  	__u32 data;
>>  	__u32 data_end;
>>  	__u32 data_meta;
>> -	/* Below access go through struct xdp_rxq_info */
>> -	__u32 ingress_ifindex; /* rxq->dev->ifindex */
>> +	union {
>> +		__u32 ingress_ifindex; /* rxq->dev->ifindex */
>> +		__u32 egress_ifindex;  /* txq->dev->ifindex */
>> +	};
>
> Are we sure it is wise to "union share" (struct) xdp_md as the
> XDP-context in the XDP programs, with different expected_attach_type?
> As this allows the XDP-programmer to code an EGRESS program that access
> ctx->ingress_ifindex, this will under the hood be translated to
> ctx->egress_ifindex, because from the compilers-PoV this will just be an
> offset.
>
> We are setting up the XDP-programmer for a long debugging session, as
> she will be expecting to read 'ingress_ifindex', but will be getting
> 'egress_ifindex'.  (As the compiler cannot warn her, and it is also
> correct seen from the verifier).

+1 on this; also, an egress program may want to actually know which
ingress iface the packet was first received on. So why not just keep
both fields? Since ifindex 0 is invalid anyway, the field could just be
0 when it isn't known (e.g., egress ifindex on RX, or ingress ifindex if
it comes from the stack)?

>>  	__u32 rx_queue_index;  /* rxq->queue_index  */
>
> So, the TX program can still read 'rx_queue_index', is this wise?

Why shouldn't it be able to (as well as ingress ifindex)?

-Toke
David Ahern Feb. 27, 2020, 8:44 p.m. UTC | #3
On 2/27/20 1:00 AM, Jesper Dangaard Brouer wrote:
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 7850f8683b81..5e3f8aefad41 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -3334,8 +3334,10 @@ struct xdp_md {
>>  	__u32 data;
>>  	__u32 data_end;
>>  	__u32 data_meta;
>> -	/* Below access go through struct xdp_rxq_info */
>> -	__u32 ingress_ifindex; /* rxq->dev->ifindex */
>> +	union {
>> +		__u32 ingress_ifindex; /* rxq->dev->ifindex */
>> +		__u32 egress_ifindex;  /* txq->dev->ifindex */
>> +	};
> 
> Are we sure it is wise to "union share" (struct) xdp_md as the
> XDP-context in the XDP programs, with different expected_attach_type?
> As this allows the XDP-programmer to code an EGRESS program that access
> ctx->ingress_ifindex, this will under the hood be translated to
> ctx->egress_ifindex, because from the compilers-PoV this will just be an
> offset.
> 
> We are setting up the XDP-programmer for a long debugging session, as
> she will be expecting to read 'ingress_ifindex', but will be getting
> 'egress_ifindex'.  (As the compiler cannot warn her, and it is also
> correct seen from the verifier).

It both cases it means the device handling the packet. ingress_ifindex
== device handling the Rx, egress_ifindex == device handling the Tx.
Really, it is syntactic sugar for program writers. It would have been
better had xdp_md only called it ifindex from the beginning.

> 
> 
>>  	__u32 rx_queue_index;  /* rxq->queue_index  */
> 
> So, the TX program can still read 'rx_queue_index', is this wise?
> (It should be easy to catch below and reject).

See patch 2.

In time I expect rx_queue_index to be a union with tx_queue_index for
the same reasons as the ifindex.
David Ahern Feb. 28, 2020, 3:01 a.m. UTC | #4
On 2/27/20 4:58 AM, Toke Høiland-Jørgensen wrote:
>  also, an egress program may want to actually know which
> ingress iface the packet was first received on. So why not just keep
> both fields? Since ifindex 0 is invalid anyway, the field could just be
> 0 when it isn't known (e.g., egress ifindex on RX, or ingress ifindex if
> it comes from the stack)?

Today, the ingress device is lost in the conversion from xdp_buff to
xdp_frame. The plumbing needed to keep that information is beyond the
scope of this set.

I am open to making the UAPI separate entries if there is a real reason
for it. Do you have a specific use case? I am not aware of any situation
where a packet queued up for Tx on a device would want to know the
ingress device. At that point it is kind of irrelevant; the packet is
about to hit the "wire". Further, it would only apply to XDP_redirected
frames which could be only a limited set in the ways that a packet can
end up at a device for Tx. I suspect the flow is more relevant than the
device. When you factor on other details - e.g., bonds, vlans - the
ingress device is not the full story. Perhaps the metadata area is more
appropriate than the overhead of managing that information in the kernel
code.
Toke Høiland-Jørgensen Feb. 28, 2020, 10:07 a.m. UTC | #5
David Ahern <dahern@digitalocean.com> writes:

> On 2/27/20 1:00 AM, Jesper Dangaard Brouer wrote:
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index 7850f8683b81..5e3f8aefad41 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -3334,8 +3334,10 @@ struct xdp_md {
>>>  	__u32 data;
>>>  	__u32 data_end;
>>>  	__u32 data_meta;
>>> -	/* Below access go through struct xdp_rxq_info */
>>> -	__u32 ingress_ifindex; /* rxq->dev->ifindex */
>>> +	union {
>>> +		__u32 ingress_ifindex; /* rxq->dev->ifindex */
>>> +		__u32 egress_ifindex;  /* txq->dev->ifindex */
>>> +	};
>> 
>> Are we sure it is wise to "union share" (struct) xdp_md as the
>> XDP-context in the XDP programs, with different expected_attach_type?
>> As this allows the XDP-programmer to code an EGRESS program that access
>> ctx->ingress_ifindex, this will under the hood be translated to
>> ctx->egress_ifindex, because from the compilers-PoV this will just be an
>> offset.
>> 
>> We are setting up the XDP-programmer for a long debugging session, as
>> she will be expecting to read 'ingress_ifindex', but will be getting
>> 'egress_ifindex'.  (As the compiler cannot warn her, and it is also
>> correct seen from the verifier).
>
> It both cases it means the device handling the packet. ingress_ifindex
> == device handling the Rx, egress_ifindex == device handling the Tx.
> Really, it is syntactic sugar for program writers. It would have been
> better had xdp_md only called it ifindex from the beginning.

Telling users that they are doing it wrong is not going to make their
debugging session any less frustrating :)

If we keep rx_ifindex a separate field we can unambiguously reject a TX
program that tries to access it, *and* we keep the option of allowing
access to it later if it does turn out to be useful. IMO that is worth
the four extra bytes.

-Toke
Toke Høiland-Jørgensen Feb. 28, 2020, 10:10 a.m. UTC | #6
David Ahern <dahern@digitalocean.com> writes:

> On 2/27/20 4:58 AM, Toke Høiland-Jørgensen wrote:
>>  also, an egress program may want to actually know which
>> ingress iface the packet was first received on. So why not just keep
>> both fields? Since ifindex 0 is invalid anyway, the field could just be
>> 0 when it isn't known (e.g., egress ifindex on RX, or ingress ifindex if
>> it comes from the stack)?
>
> Today, the ingress device is lost in the conversion from xdp_buff to
> xdp_frame. The plumbing needed to keep that information is beyond the
> scope of this set.
>
> I am open to making the UAPI separate entries if there is a real reason
> for it. Do you have a specific use case?

I was thinking it could be a nice shorthand for whether a packet comes
from the local stack or was forwarded (0 == stack). But no, I don't have
a concrete application where this is useful. However, if we define it as
a union we lose the ability to change our mind. Together with the
debugability issue I just replied with to your other email, I think it
would be better to expend the four bytes keep them as separate fields,
but still restrict access to the RX ifindex for now.

-Toke
Jesper Dangaard Brouer Feb. 28, 2020, 10:41 a.m. UTC | #7
On Fri, 28 Feb 2020 11:07:23 +0100
Toke Høiland-Jørgensen <toke@redhat.com> wrote:

> David Ahern <dahern@digitalocean.com> writes:
> 
> > On 2/27/20 1:00 AM, Jesper Dangaard Brouer wrote:  
> >>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >>> index 7850f8683b81..5e3f8aefad41 100644
> >>> --- a/include/uapi/linux/bpf.h
> >>> +++ b/include/uapi/linux/bpf.h
> >>> @@ -3334,8 +3334,10 @@ struct xdp_md {
> >>>  	__u32 data;
> >>>  	__u32 data_end;
> >>>  	__u32 data_meta;
> >>> -	/* Below access go through struct xdp_rxq_info */
> >>> -	__u32 ingress_ifindex; /* rxq->dev->ifindex */
> >>> +	union {
> >>> +		__u32 ingress_ifindex; /* rxq->dev->ifindex */
> >>> +		__u32 egress_ifindex;  /* txq->dev->ifindex */
> >>> +	};  
> >> 
> >> Are we sure it is wise to "union share" (struct) xdp_md as the
> >> XDP-context in the XDP programs, with different expected_attach_type?
> >> As this allows the XDP-programmer to code an EGRESS program that access
> >> ctx->ingress_ifindex, this will under the hood be translated to
> >> ctx->egress_ifindex, because from the compilers-PoV this will just be an
> >> offset.
> >> 
> >> We are setting up the XDP-programmer for a long debugging session, as
> >> she will be expecting to read 'ingress_ifindex', but will be getting
> >> 'egress_ifindex'.  (As the compiler cannot warn her, and it is also
> >> correct seen from the verifier).  
> >
> > It both cases it means the device handling the packet. ingress_ifindex
> > == device handling the Rx, egress_ifindex == device handling the Tx.
> > Really, it is syntactic sugar for program writers. It would have been
> > better had xdp_md only called it ifindex from the beginning.  
> 
> Telling users that they are doing it wrong is not going to make their
> debugging session any less frustrating :)
> 
> If we keep rx_ifindex a separate field we can unambiguously reject a TX
> program that tries to access it, *and* we keep the option of allowing
> access to it later if it does turn out to be useful. IMO that is worth
> the four extra bytes.

I agree. We need unambiguously to help the program writer.

This is the wrong kind of 'syntactic sugar'.  If you want a straight
'ifindex', that translates to the running_ifindex, when you need to add
a new member 'ifindex' that does this rewriting based on attach type.
diff mbox series

Patch

diff --git a/include/net/xdp.h b/include/net/xdp.h
index 40c6d3398458..5584b9db86fe 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -63,6 +63,10 @@  struct xdp_rxq_info {
 	struct xdp_mem_info mem;
 } ____cacheline_aligned; /* perf critical, avoid false-sharing */
 
+struct xdp_txq_info {
+	struct net_device *dev;
+};
+
 struct xdp_buff {
 	void *data;
 	void *data_end;
@@ -70,6 +74,7 @@  struct xdp_buff {
 	void *data_hard_start;
 	unsigned long handle;
 	struct xdp_rxq_info *rxq;
+	struct xdp_txq_info *txq;
 };
 
 struct xdp_frame {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7850f8683b81..5e3f8aefad41 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3334,8 +3334,10 @@  struct xdp_md {
 	__u32 data;
 	__u32 data_end;
 	__u32 data_meta;
-	/* Below access go through struct xdp_rxq_info */
-	__u32 ingress_ifindex; /* rxq->dev->ifindex */
+	union {
+		__u32 ingress_ifindex; /* rxq->dev->ifindex */
+		__u32 egress_ifindex;  /* txq->dev->ifindex */
+	};
 	__u32 rx_queue_index;  /* rxq->queue_index  */
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index c7cc98c55621..d1c65dccd671 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7716,14 +7716,25 @@  static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(struct xdp_buff, data_end));
 		break;
 	case offsetof(struct xdp_md, ingress_ifindex):
-		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
-				      si->dst_reg, si->src_reg,
-				      offsetof(struct xdp_buff, rxq));
-		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
-				      si->dst_reg, si->dst_reg,
-				      offsetof(struct xdp_rxq_info, dev));
-		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
-				      offsetof(struct net_device, ifindex));
+		if (prog->expected_attach_type == BPF_XDP_EGRESS) {
+			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq),
+					      si->dst_reg, si->src_reg,
+					      offsetof(struct xdp_buff, txq));
+			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev),
+					      si->dst_reg, si->dst_reg,
+					      offsetof(struct xdp_txq_info, dev));
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+					      offsetof(struct net_device, ifindex));
+		} else {
+			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
+					      si->dst_reg, si->src_reg,
+					      offsetof(struct xdp_buff, rxq));
+			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
+					      si->dst_reg, si->dst_reg,
+					      offsetof(struct xdp_rxq_info, dev));
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+					      offsetof(struct net_device, ifindex));
+		}
 		break;
 	case offsetof(struct xdp_md, rx_queue_index):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),