Message ID | 20200227032013.12385-4-dsahern@kernel.org |
---|---|
State | RFC |
Delegated to: | BPF Maintainers |
Headers | show |
Series | Add support for XDP in egress path | expand |
On Wed, 26 Feb 2020 20:20:05 -0700 David Ahern <dsahern@kernel.org> wrote: > From: David Ahern <dahern@digitalocean.com> > > Add xdp_txq_info as the Tx counterpart to xdp_rxq_info. At the > moment only the device is added. Other fields (queue_index) > can be added as use cases arise. > > From a UAPI perspective, egress_ifindex is a union with ingress_ifindex > since only one applies based on where the program is attached. > > Signed-off-by: David Ahern <dahern@digitalocean.com> > --- > include/net/xdp.h | 5 +++++ > include/uapi/linux/bpf.h | 6 ++++-- > net/core/filter.c | 27 +++++++++++++++++++-------- > 3 files changed, 28 insertions(+), 10 deletions(-) > > diff --git a/include/net/xdp.h b/include/net/xdp.h > index 40c6d3398458..5584b9db86fe 100644 > --- a/include/net/xdp.h > +++ b/include/net/xdp.h > @@ -63,6 +63,10 @@ struct xdp_rxq_info { > struct xdp_mem_info mem; > } ____cacheline_aligned; /* perf critical, avoid false-sharing */ > > +struct xdp_txq_info { > + struct net_device *dev; > +}; > + > struct xdp_buff { > void *data; > void *data_end; > @@ -70,6 +74,7 @@ struct xdp_buff { > void *data_hard_start; > unsigned long handle; > struct xdp_rxq_info *rxq; > + struct xdp_txq_info *txq; > }; > > struct xdp_frame { > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index 7850f8683b81..5e3f8aefad41 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -3334,8 +3334,10 @@ struct xdp_md { > __u32 data; > __u32 data_end; > __u32 data_meta; > - /* Below access go through struct xdp_rxq_info */ > - __u32 ingress_ifindex; /* rxq->dev->ifindex */ > + union { > + __u32 ingress_ifindex; /* rxq->dev->ifindex */ > + __u32 egress_ifindex; /* txq->dev->ifindex */ > + }; Are we sure it is wise to "union share" (struct) xdp_md as the XDP-context in the XDP programs, with different expected_attach_type? As this allows the XDP-programmer to code an EGRESS program that access ctx->ingress_ifindex, this will under the hood be translated to ctx->egress_ifindex, because from the compilers-PoV this will just be an offset. We are setting up the XDP-programmer for a long debugging session, as she will be expecting to read 'ingress_ifindex', but will be getting 'egress_ifindex'. (As the compiler cannot warn her, and it is also correct seen from the verifier). > __u32 rx_queue_index; /* rxq->queue_index */ So, the TX program can still read 'rx_queue_index', is this wise? (It should be easy to catch below and reject). > }; > > diff --git a/net/core/filter.c b/net/core/filter.c > index c7cc98c55621..d1c65dccd671 100644 > --- a/net/core/filter.c > +++ b/net/core/filter.c > @@ -7716,14 +7716,25 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, > offsetof(struct xdp_buff, data_end)); > break; > case offsetof(struct xdp_md, ingress_ifindex): > - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), > - si->dst_reg, si->src_reg, > - offsetof(struct xdp_buff, rxq)); > - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), > - si->dst_reg, si->dst_reg, > - offsetof(struct xdp_rxq_info, dev)); > - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, > - offsetof(struct net_device, ifindex)); > + if (prog->expected_attach_type == BPF_XDP_EGRESS) { > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), > + si->dst_reg, si->src_reg, > + offsetof(struct xdp_buff, txq)); > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), > + si->dst_reg, si->dst_reg, > + offsetof(struct xdp_txq_info, dev)); > + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, > + offsetof(struct net_device, ifindex)); > + } else { > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), > + si->dst_reg, si->src_reg, > + offsetof(struct xdp_buff, rxq)); > + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), > + si->dst_reg, si->dst_reg, > + offsetof(struct xdp_rxq_info, dev)); > + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, > + offsetof(struct net_device, ifindex)); > + } > break; > case offsetof(struct xdp_md, rx_queue_index): > *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), We can catch and disallow access to rx_queue_index from expected_attach_type BPF_XDP_EGRESS, here. But then we are adding more code to handle/separate egress from normal RX/ingress.
Jesper Dangaard Brouer <brouer@redhat.com> writes: > On Wed, 26 Feb 2020 20:20:05 -0700 > David Ahern <dsahern@kernel.org> wrote: > >> From: David Ahern <dahern@digitalocean.com> >> >> Add xdp_txq_info as the Tx counterpart to xdp_rxq_info. At the >> moment only the device is added. Other fields (queue_index) >> can be added as use cases arise. >> >> From a UAPI perspective, egress_ifindex is a union with ingress_ifindex >> since only one applies based on where the program is attached. >> >> Signed-off-by: David Ahern <dahern@digitalocean.com> >> --- >> include/net/xdp.h | 5 +++++ >> include/uapi/linux/bpf.h | 6 ++++-- >> net/core/filter.c | 27 +++++++++++++++++++-------- >> 3 files changed, 28 insertions(+), 10 deletions(-) >> >> diff --git a/include/net/xdp.h b/include/net/xdp.h >> index 40c6d3398458..5584b9db86fe 100644 >> --- a/include/net/xdp.h >> +++ b/include/net/xdp.h >> @@ -63,6 +63,10 @@ struct xdp_rxq_info { >> struct xdp_mem_info mem; >> } ____cacheline_aligned; /* perf critical, avoid false-sharing */ >> >> +struct xdp_txq_info { >> + struct net_device *dev; >> +}; >> + >> struct xdp_buff { >> void *data; >> void *data_end; >> @@ -70,6 +74,7 @@ struct xdp_buff { >> void *data_hard_start; >> unsigned long handle; >> struct xdp_rxq_info *rxq; >> + struct xdp_txq_info *txq; >> }; >> >> struct xdp_frame { >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index 7850f8683b81..5e3f8aefad41 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h >> @@ -3334,8 +3334,10 @@ struct xdp_md { >> __u32 data; >> __u32 data_end; >> __u32 data_meta; >> - /* Below access go through struct xdp_rxq_info */ >> - __u32 ingress_ifindex; /* rxq->dev->ifindex */ >> + union { >> + __u32 ingress_ifindex; /* rxq->dev->ifindex */ >> + __u32 egress_ifindex; /* txq->dev->ifindex */ >> + }; > > Are we sure it is wise to "union share" (struct) xdp_md as the > XDP-context in the XDP programs, with different expected_attach_type? > As this allows the XDP-programmer to code an EGRESS program that access > ctx->ingress_ifindex, this will under the hood be translated to > ctx->egress_ifindex, because from the compilers-PoV this will just be an > offset. > > We are setting up the XDP-programmer for a long debugging session, as > she will be expecting to read 'ingress_ifindex', but will be getting > 'egress_ifindex'. (As the compiler cannot warn her, and it is also > correct seen from the verifier). +1 on this; also, an egress program may want to actually know which ingress iface the packet was first received on. So why not just keep both fields? Since ifindex 0 is invalid anyway, the field could just be 0 when it isn't known (e.g., egress ifindex on RX, or ingress ifindex if it comes from the stack)? >> __u32 rx_queue_index; /* rxq->queue_index */ > > So, the TX program can still read 'rx_queue_index', is this wise? Why shouldn't it be able to (as well as ingress ifindex)? -Toke
On 2/27/20 1:00 AM, Jesper Dangaard Brouer wrote: >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >> index 7850f8683b81..5e3f8aefad41 100644 >> --- a/include/uapi/linux/bpf.h >> +++ b/include/uapi/linux/bpf.h >> @@ -3334,8 +3334,10 @@ struct xdp_md { >> __u32 data; >> __u32 data_end; >> __u32 data_meta; >> - /* Below access go through struct xdp_rxq_info */ >> - __u32 ingress_ifindex; /* rxq->dev->ifindex */ >> + union { >> + __u32 ingress_ifindex; /* rxq->dev->ifindex */ >> + __u32 egress_ifindex; /* txq->dev->ifindex */ >> + }; > > Are we sure it is wise to "union share" (struct) xdp_md as the > XDP-context in the XDP programs, with different expected_attach_type? > As this allows the XDP-programmer to code an EGRESS program that access > ctx->ingress_ifindex, this will under the hood be translated to > ctx->egress_ifindex, because from the compilers-PoV this will just be an > offset. > > We are setting up the XDP-programmer for a long debugging session, as > she will be expecting to read 'ingress_ifindex', but will be getting > 'egress_ifindex'. (As the compiler cannot warn her, and it is also > correct seen from the verifier). It both cases it means the device handling the packet. ingress_ifindex == device handling the Rx, egress_ifindex == device handling the Tx. Really, it is syntactic sugar for program writers. It would have been better had xdp_md only called it ifindex from the beginning. > > >> __u32 rx_queue_index; /* rxq->queue_index */ > > So, the TX program can still read 'rx_queue_index', is this wise? > (It should be easy to catch below and reject). See patch 2. In time I expect rx_queue_index to be a union with tx_queue_index for the same reasons as the ifindex.
On 2/27/20 4:58 AM, Toke Høiland-Jørgensen wrote: > also, an egress program may want to actually know which > ingress iface the packet was first received on. So why not just keep > both fields? Since ifindex 0 is invalid anyway, the field could just be > 0 when it isn't known (e.g., egress ifindex on RX, or ingress ifindex if > it comes from the stack)? Today, the ingress device is lost in the conversion from xdp_buff to xdp_frame. The plumbing needed to keep that information is beyond the scope of this set. I am open to making the UAPI separate entries if there is a real reason for it. Do you have a specific use case? I am not aware of any situation where a packet queued up for Tx on a device would want to know the ingress device. At that point it is kind of irrelevant; the packet is about to hit the "wire". Further, it would only apply to XDP_redirected frames which could be only a limited set in the ways that a packet can end up at a device for Tx. I suspect the flow is more relevant than the device. When you factor on other details - e.g., bonds, vlans - the ingress device is not the full story. Perhaps the metadata area is more appropriate than the overhead of managing that information in the kernel code.
David Ahern <dahern@digitalocean.com> writes: > On 2/27/20 1:00 AM, Jesper Dangaard Brouer wrote: >>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h >>> index 7850f8683b81..5e3f8aefad41 100644 >>> --- a/include/uapi/linux/bpf.h >>> +++ b/include/uapi/linux/bpf.h >>> @@ -3334,8 +3334,10 @@ struct xdp_md { >>> __u32 data; >>> __u32 data_end; >>> __u32 data_meta; >>> - /* Below access go through struct xdp_rxq_info */ >>> - __u32 ingress_ifindex; /* rxq->dev->ifindex */ >>> + union { >>> + __u32 ingress_ifindex; /* rxq->dev->ifindex */ >>> + __u32 egress_ifindex; /* txq->dev->ifindex */ >>> + }; >> >> Are we sure it is wise to "union share" (struct) xdp_md as the >> XDP-context in the XDP programs, with different expected_attach_type? >> As this allows the XDP-programmer to code an EGRESS program that access >> ctx->ingress_ifindex, this will under the hood be translated to >> ctx->egress_ifindex, because from the compilers-PoV this will just be an >> offset. >> >> We are setting up the XDP-programmer for a long debugging session, as >> she will be expecting to read 'ingress_ifindex', but will be getting >> 'egress_ifindex'. (As the compiler cannot warn her, and it is also >> correct seen from the verifier). > > It both cases it means the device handling the packet. ingress_ifindex > == device handling the Rx, egress_ifindex == device handling the Tx. > Really, it is syntactic sugar for program writers. It would have been > better had xdp_md only called it ifindex from the beginning. Telling users that they are doing it wrong is not going to make their debugging session any less frustrating :) If we keep rx_ifindex a separate field we can unambiguously reject a TX program that tries to access it, *and* we keep the option of allowing access to it later if it does turn out to be useful. IMO that is worth the four extra bytes. -Toke
David Ahern <dahern@digitalocean.com> writes: > On 2/27/20 4:58 AM, Toke Høiland-Jørgensen wrote: >> also, an egress program may want to actually know which >> ingress iface the packet was first received on. So why not just keep >> both fields? Since ifindex 0 is invalid anyway, the field could just be >> 0 when it isn't known (e.g., egress ifindex on RX, or ingress ifindex if >> it comes from the stack)? > > Today, the ingress device is lost in the conversion from xdp_buff to > xdp_frame. The plumbing needed to keep that information is beyond the > scope of this set. > > I am open to making the UAPI separate entries if there is a real reason > for it. Do you have a specific use case? I was thinking it could be a nice shorthand for whether a packet comes from the local stack or was forwarded (0 == stack). But no, I don't have a concrete application where this is useful. However, if we define it as a union we lose the ability to change our mind. Together with the debugability issue I just replied with to your other email, I think it would be better to expend the four bytes keep them as separate fields, but still restrict access to the RX ifindex for now. -Toke
On Fri, 28 Feb 2020 11:07:23 +0100 Toke Høiland-Jørgensen <toke@redhat.com> wrote: > David Ahern <dahern@digitalocean.com> writes: > > > On 2/27/20 1:00 AM, Jesper Dangaard Brouer wrote: > >>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > >>> index 7850f8683b81..5e3f8aefad41 100644 > >>> --- a/include/uapi/linux/bpf.h > >>> +++ b/include/uapi/linux/bpf.h > >>> @@ -3334,8 +3334,10 @@ struct xdp_md { > >>> __u32 data; > >>> __u32 data_end; > >>> __u32 data_meta; > >>> - /* Below access go through struct xdp_rxq_info */ > >>> - __u32 ingress_ifindex; /* rxq->dev->ifindex */ > >>> + union { > >>> + __u32 ingress_ifindex; /* rxq->dev->ifindex */ > >>> + __u32 egress_ifindex; /* txq->dev->ifindex */ > >>> + }; > >> > >> Are we sure it is wise to "union share" (struct) xdp_md as the > >> XDP-context in the XDP programs, with different expected_attach_type? > >> As this allows the XDP-programmer to code an EGRESS program that access > >> ctx->ingress_ifindex, this will under the hood be translated to > >> ctx->egress_ifindex, because from the compilers-PoV this will just be an > >> offset. > >> > >> We are setting up the XDP-programmer for a long debugging session, as > >> she will be expecting to read 'ingress_ifindex', but will be getting > >> 'egress_ifindex'. (As the compiler cannot warn her, and it is also > >> correct seen from the verifier). > > > > It both cases it means the device handling the packet. ingress_ifindex > > == device handling the Rx, egress_ifindex == device handling the Tx. > > Really, it is syntactic sugar for program writers. It would have been > > better had xdp_md only called it ifindex from the beginning. > > Telling users that they are doing it wrong is not going to make their > debugging session any less frustrating :) > > If we keep rx_ifindex a separate field we can unambiguously reject a TX > program that tries to access it, *and* we keep the option of allowing > access to it later if it does turn out to be useful. IMO that is worth > the four extra bytes. I agree. We need unambiguously to help the program writer. This is the wrong kind of 'syntactic sugar'. If you want a straight 'ifindex', that translates to the running_ifindex, when you need to add a new member 'ifindex' that does this rewriting based on attach type.
diff --git a/include/net/xdp.h b/include/net/xdp.h index 40c6d3398458..5584b9db86fe 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -63,6 +63,10 @@ struct xdp_rxq_info { struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ +struct xdp_txq_info { + struct net_device *dev; +}; + struct xdp_buff { void *data; void *data_end; @@ -70,6 +74,7 @@ struct xdp_buff { void *data_hard_start; unsigned long handle; struct xdp_rxq_info *rxq; + struct xdp_txq_info *txq; }; struct xdp_frame { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7850f8683b81..5e3f8aefad41 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3334,8 +3334,10 @@ struct xdp_md { __u32 data; __u32 data_end; __u32 data_meta; - /* Below access go through struct xdp_rxq_info */ - __u32 ingress_ifindex; /* rxq->dev->ifindex */ + union { + __u32 ingress_ifindex; /* rxq->dev->ifindex */ + __u32 egress_ifindex; /* txq->dev->ifindex */ + }; __u32 rx_queue_index; /* rxq->queue_index */ }; diff --git a/net/core/filter.c b/net/core/filter.c index c7cc98c55621..d1c65dccd671 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7716,14 +7716,25 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(struct xdp_buff, data_end)); break; case offsetof(struct xdp_md, ingress_ifindex): - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), - si->dst_reg, si->src_reg, - offsetof(struct xdp_buff, rxq)); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), - si->dst_reg, si->dst_reg, - offsetof(struct xdp_rxq_info, dev)); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - offsetof(struct net_device, ifindex)); + if (prog->expected_attach_type == BPF_XDP_EGRESS) { + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, txq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_txq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + } else { + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_rxq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + } break; case offsetof(struct xdp_md, rx_queue_index): *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),