diff mbox

[RFC,1/5] bpf: add PHYS_DEV prog type for early driver filter

Message ID 1459560118-5582-2-git-send-email-bblanco@plumgrid.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Brenden Blanco April 2, 2016, 1:21 a.m. UTC
Add a new bpf prog type that is intended to run in early stages of the
packet rx path. Only minimal packet metadata will be available, hence a new
context type, struct xdp_metadata, is exposed to userspace. So far only
expose the readable packet length, and only in read mode.

The PHYS_DEV name is chosen to represent that the program is meant only
for physical adapters, rather than all netdevs.

While the user visible struct is new, the underlying context must be
implemented as a minimal skb in order for the packet load_* instructions
to work. The skb filled in by the driver must have skb->len, skb->head,
and skb->data set, and skb->data_len == 0.

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 include/uapi/linux/bpf.h |  5 ++++
 kernel/bpf/verifier.c    |  1 +
 net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 74 insertions(+)

Comments

Tom Herbert April 2, 2016, 4:39 p.m. UTC | #1
On Fri, Apr 1, 2016 at 9:21 PM, Brenden Blanco <bblanco@plumgrid.com> wrote:
> Add a new bpf prog type that is intended to run in early stages of the
> packet rx path. Only minimal packet metadata will be available, hence a new
> context type, struct xdp_metadata, is exposed to userspace. So far only
> expose the readable packet length, and only in read mode.
>
This would eventually be a generic abstraction of receive descriptors?

> The PHYS_DEV name is chosen to represent that the program is meant only
> for physical adapters, rather than all netdevs.
>
Is there a hard restriction that this could only work with physical devices?

> While the user visible struct is new, the underlying context must be
> implemented as a minimal skb in order for the packet load_* instructions
> to work. The skb filled in by the driver must have skb->len, skb->head,
> and skb->data set, and skb->data_len == 0.
>
> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
> ---
>  include/uapi/linux/bpf.h |  5 ++++
>  kernel/bpf/verifier.c    |  1 +
>  net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 74 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 924f537..b8a4ef2 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -92,6 +92,7 @@ enum bpf_prog_type {
>         BPF_PROG_TYPE_KPROBE,
>         BPF_PROG_TYPE_SCHED_CLS,
>         BPF_PROG_TYPE_SCHED_ACT,
> +       BPF_PROG_TYPE_PHYS_DEV,
>  };
>
>  #define BPF_PSEUDO_MAP_FD      1
> @@ -367,6 +368,10 @@ struct __sk_buff {
>         __u32 tc_classid;
>  };
>
> +struct xdp_metadata {
> +       __u32 len;
> +};
> +
>  struct bpf_tunnel_key {
>         __u32 tunnel_id;
>         union {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 2e08f8e..804ca70 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1340,6 +1340,7 @@ static bool may_access_skb(enum bpf_prog_type type)
>         case BPF_PROG_TYPE_SOCKET_FILTER:
>         case BPF_PROG_TYPE_SCHED_CLS:
>         case BPF_PROG_TYPE_SCHED_ACT:
> +       case BPF_PROG_TYPE_PHYS_DEV:
>                 return true;
>         default:
>                 return false;
> diff --git a/net/core/filter.c b/net/core/filter.c
> index b7177d0..c417db6 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2018,6 +2018,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
>         }
>  }
>
> +static const struct bpf_func_proto *
> +phys_dev_func_proto(enum bpf_func_id func_id)
> +{
> +       return sk_filter_func_proto(func_id);
> +}
> +
>  static bool __is_valid_access(int off, int size, enum bpf_access_type type)
>  {
>         /* check bounds */
> @@ -2073,6 +2079,36 @@ static bool tc_cls_act_is_valid_access(int off, int size,
>         return __is_valid_access(off, size, type);
>  }
>
> +static bool __is_valid_xdp_access(int off, int size,
> +                                 enum bpf_access_type type)
> +{
> +       if (off < 0 || off >= sizeof(struct xdp_metadata))
> +               return false;
> +
> +       if (off % size != 0)
> +               return false;
> +
> +       if (size != 4)
> +               return false;
> +
> +       return true;
> +}
> +
> +static bool phys_dev_is_valid_access(int off, int size,
> +                                    enum bpf_access_type type)
> +{
> +       if (type == BPF_WRITE)
> +               return false;
> +
> +       switch (off) {
> +       case offsetof(struct xdp_metadata, len):
> +               break;
> +       default:
> +               return false;
> +       }
> +       return __is_valid_xdp_access(off, size, type);
> +}
> +
>  static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
>                                       int src_reg, int ctx_off,
>                                       struct bpf_insn *insn_buf,
> @@ -2210,6 +2246,26 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
>         return insn - insn_buf;
>  }
>
> +static u32 bpf_phys_dev_convert_ctx_access(enum bpf_access_type type,
> +                                          int dst_reg, int src_reg,
> +                                          int ctx_off,
> +                                          struct bpf_insn *insn_buf,
> +                                          struct bpf_prog *prog)
> +{
> +       struct bpf_insn *insn = insn_buf;
> +
> +       switch (ctx_off) {
> +       case offsetof(struct xdp_metadata, len):
> +               BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
> +
> +               *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
> +                                     offsetof(struct sk_buff, len));
> +               break;
> +       }
> +
> +       return insn - insn_buf;
> +}
> +
>  static const struct bpf_verifier_ops sk_filter_ops = {
>         .get_func_proto = sk_filter_func_proto,
>         .is_valid_access = sk_filter_is_valid_access,
> @@ -2222,6 +2278,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
>         .convert_ctx_access = bpf_net_convert_ctx_access,
>  };
>
> +static const struct bpf_verifier_ops phys_dev_ops = {
> +       .get_func_proto = phys_dev_func_proto,
> +       .is_valid_access = phys_dev_is_valid_access,
> +       .convert_ctx_access = bpf_phys_dev_convert_ctx_access,
> +};
> +
>  static struct bpf_prog_type_list sk_filter_type __read_mostly = {
>         .ops = &sk_filter_ops,
>         .type = BPF_PROG_TYPE_SOCKET_FILTER,
> @@ -2237,11 +2299,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = {
>         .type = BPF_PROG_TYPE_SCHED_ACT,
>  };
>
> +static struct bpf_prog_type_list phys_dev_type __read_mostly = {
> +       .ops = &phys_dev_ops,
> +       .type = BPF_PROG_TYPE_PHYS_DEV,
> +};
> +
>  static int __init register_sk_filter_ops(void)
>  {
>         bpf_register_prog_type(&sk_filter_type);
>         bpf_register_prog_type(&sched_cls_type);
>         bpf_register_prog_type(&sched_act_type);
> +       bpf_register_prog_type(&phys_dev_type);
>
>         return 0;
>  }
> --
> 2.8.0
>
Brenden Blanco April 3, 2016, 7:02 a.m. UTC | #2
On Sat, Apr 02, 2016 at 12:39:45PM -0400, Tom Herbert wrote:
> On Fri, Apr 1, 2016 at 9:21 PM, Brenden Blanco <bblanco@plumgrid.com> wrote:
> > Add a new bpf prog type that is intended to run in early stages of the
> > packet rx path. Only minimal packet metadata will be available, hence a new
> > context type, struct xdp_metadata, is exposed to userspace. So far only
> > expose the readable packet length, and only in read mode.
> >
> This would eventually be a generic abstraction of receive descriptors?
Exactly. For instance, maybe let the hw's hash be available.
> 
> > The PHYS_DEV name is chosen to represent that the program is meant only
> > for physical adapters, rather than all netdevs.
> >
> Is there a hard restriction that this could only work with physical devices?
I suppose not, but there wouldn't be much use case as compared to tc
ingress, no? Since I was imagining that this new hook would be more
restricted in functionality due to operating on descriptors rather than
with a full skb, I tried to think of an appropriate name.

If you think that this hook would spread, then a better name is needed.
> 
[...]
Daniel Borkmann April 4, 2016, 8:49 a.m. UTC | #3
On 04/02/2016 03:21 AM, Brenden Blanco wrote:
> Add a new bpf prog type that is intended to run in early stages of the
> packet rx path. Only minimal packet metadata will be available, hence a new
> context type, struct xdp_metadata, is exposed to userspace. So far only
> expose the readable packet length, and only in read mode.
>
> The PHYS_DEV name is chosen to represent that the program is meant only
> for physical adapters, rather than all netdevs.
>
> While the user visible struct is new, the underlying context must be
> implemented as a minimal skb in order for the packet load_* instructions
> to work. The skb filled in by the driver must have skb->len, skb->head,
> and skb->data set, and skb->data_len == 0.
>
> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
> ---
>   include/uapi/linux/bpf.h |  5 ++++
>   kernel/bpf/verifier.c    |  1 +
>   net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
>   3 files changed, 74 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 924f537..b8a4ef2 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -92,6 +92,7 @@ enum bpf_prog_type {
>   	BPF_PROG_TYPE_KPROBE,
>   	BPF_PROG_TYPE_SCHED_CLS,
>   	BPF_PROG_TYPE_SCHED_ACT,
> +	BPF_PROG_TYPE_PHYS_DEV,
>   };
>
>   #define BPF_PSEUDO_MAP_FD	1
> @@ -367,6 +368,10 @@ struct __sk_buff {
>   	__u32 tc_classid;
>   };
>
> +struct xdp_metadata {
> +	__u32 len;
> +};

Should this consistently be called 'xdp' or rather 'phys dev',
because currently it's a mixture of both everywhere?

>   struct bpf_tunnel_key {
>   	__u32 tunnel_id;
>   	union {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 2e08f8e..804ca70 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1340,6 +1340,7 @@ static bool may_access_skb(enum bpf_prog_type type)
>   	case BPF_PROG_TYPE_SOCKET_FILTER:
>   	case BPF_PROG_TYPE_SCHED_CLS:
>   	case BPF_PROG_TYPE_SCHED_ACT:
> +	case BPF_PROG_TYPE_PHYS_DEV:
>   		return true;
>   	default:
>   		return false;
> diff --git a/net/core/filter.c b/net/core/filter.c
> index b7177d0..c417db6 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2018,6 +2018,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
>   	}
>   }
>
> +static const struct bpf_func_proto *
> +phys_dev_func_proto(enum bpf_func_id func_id)
> +{
> +	return sk_filter_func_proto(func_id);

Do you plan to support bpf_skb_load_bytes() as well? I like using
this API especially when dealing with larger chunks (>4 bytes) to
load into stack memory, plus content is kept in network byte order.

What about other helpers such as bpf_skb_store_bytes() et al that
work on skbs. Do you intend to reuse them as is and thus populate
the per cpu skb with needed fields (faking linear data), or do you
see larger obstacles that prevent for this?

Thanks,
Daniel
Jesper Dangaard Brouer April 4, 2016, 1:07 p.m. UTC | #4
On Mon, 04 Apr 2016 10:49:09 +0200 Daniel Borkmann <daniel@iogearbox.net> wrote:

> On 04/02/2016 03:21 AM, Brenden Blanco wrote:
> > Add a new bpf prog type that is intended to run in early stages of the
> > packet rx path. Only minimal packet metadata will be available, hence a new
> > context type, struct xdp_metadata, is exposed to userspace. So far only
> > expose the readable packet length, and only in read mode.
> >
> > The PHYS_DEV name is chosen to represent that the program is meant only
> > for physical adapters, rather than all netdevs.
> >
> > While the user visible struct is new, the underlying context must be
> > implemented as a minimal skb in order for the packet load_* instructions
> > to work. The skb filled in by the driver must have skb->len, skb->head,
> > and skb->data set, and skb->data_len == 0.
> >
[...]
> 
> Do you plan to support bpf_skb_load_bytes() as well? I like using
> this API especially when dealing with larger chunks (>4 bytes) to
> load into stack memory, plus content is kept in network byte order.
> 
> What about other helpers such as bpf_skb_store_bytes() et al that
> work on skbs. Do you intent to reuse them as is and thus populate
> the per cpu skb with needed fields (faking linear data), or do you
> see larger obstacles that prevent for this?

Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send
to users of this API.

The whole idea is that an SKB is NOT allocated yet, and not needed at
this level.  If we start supporting calling underlying SKB functions,
then we will end-up in the same place (performance wise).
Daniel Borkmann April 4, 2016, 1:36 p.m. UTC | #5
On 04/04/2016 03:07 PM, Jesper Dangaard Brouer wrote:
> On Mon, 04 Apr 2016 10:49:09 +0200 Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 04/02/2016 03:21 AM, Brenden Blanco wrote:
>>> Add a new bpf prog type that is intended to run in early stages of the
>>> packet rx path. Only minimal packet metadata will be available, hence a new
>>> context type, struct xdp_metadata, is exposed to userspace. So far only
>>> expose the readable packet length, and only in read mode.
>>>
>>> The PHYS_DEV name is chosen to represent that the program is meant only
>>> for physical adapters, rather than all netdevs.
>>>
>>> While the user visible struct is new, the underlying context must be
>>> implemented as a minimal skb in order for the packet load_* instructions
>>> to work. The skb filled in by the driver must have skb->len, skb->head,
>>> and skb->data set, and skb->data_len == 0.
>>>
> [...]
>>
>> Do you plan to support bpf_skb_load_bytes() as well? I like using
>> this API especially when dealing with larger chunks (>4 bytes) to
>> load into stack memory, plus content is kept in network byte order.
>>
>> What about other helpers such as bpf_skb_store_bytes() et al that
>> work on skbs. Do you intent to reuse them as is and thus populate
>> the per cpu skb with needed fields (faking linear data), or do you
>> see larger obstacles that prevent for this?
>
> Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send
> to users of this API.
>
> The hole idea is that an SKB is NOT allocated yet, and not needed at
> this level.  If we start supporting calling underlying SKB functions,
> then we will end-up in the same place (performance wise).

I'm talking about the current skb-related BPF helper functions we have,
so the question is how much of that code we can reuse under
these constraints (obviously things like the tunnel helpers are a different
story) and if that trade-off is acceptable for us. I'm also thinking
that, for example, if you need to parse the packet data anyway for a drop
verdict, you might as well pass some meta data (that is set in the real
skb later on) for those packets that go up the stack.
Tom Herbert April 4, 2016, 2:09 p.m. UTC | #6
On Mon, Apr 4, 2016 at 10:36 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> On 04/04/2016 03:07 PM, Jesper Dangaard Brouer wrote:
>>
>> On Mon, 04 Apr 2016 10:49:09 +0200 Daniel Borkmann <daniel@iogearbox.net>
>> wrote:
>>>
>>> On 04/02/2016 03:21 AM, Brenden Blanco wrote:
>>>>
>>>> Add a new bpf prog type that is intended to run in early stages of the
>>>> packet rx path. Only minimal packet metadata will be available, hence a
>>>> new
>>>> context type, struct xdp_metadata, is exposed to userspace. So far only
>>>> expose the readable packet length, and only in read mode.
>>>>
>>>> The PHYS_DEV name is chosen to represent that the program is meant only
>>>> for physical adapters, rather than all netdevs.
>>>>
>>>> While the user visible struct is new, the underlying context must be
>>>> implemented as a minimal skb in order for the packet load_* instructions
>>>> to work. The skb filled in by the driver must have skb->len, skb->head,
>>>> and skb->data set, and skb->data_len == 0.
>>>>
>> [...]
>>>
>>>
>>> Do you plan to support bpf_skb_load_bytes() as well? I like using
>>> this API especially when dealing with larger chunks (>4 bytes) to
>>> load into stack memory, plus content is kept in network byte order.
>>>
>>> What about other helpers such as bpf_skb_store_bytes() et al that
>>> work on skbs. Do you intent to reuse them as is and thus populate
>>> the per cpu skb with needed fields (faking linear data), or do you
>>> see larger obstacles that prevent for this?
>>
>>
>> Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send
>> to users of this API.
>>
>> The hole idea is that an SKB is NOT allocated yet, and not needed at
>> this level.  If we start supporting calling underlying SKB functions,
>> then we will end-up in the same place (performance wise).
>
>
> I'm talking about the current skb-related BPF helper functions we have,
> so the question is how much from that code we have we can reuse under
> these constraints (obviously things like the tunnel helpers are a different
> story) and if that trade-off is acceptable for us. I'm also thinking
> that, for example, if you need to parse the packet data anyway for a drop
> verdict, you might as well pass some meta data (that is set in the real
> skb later on) for those packets that go up the stack.

Right, the meta data in this case is an abstracted receive descriptor.
This would include items that we get in a device receive descriptor
(computed checksum, hash, VLAN tag). This is purposely a small
restricted data structure. I'm hoping we can minimize the size of this
to not much more than 32 bytes (including pointers to data and
linkage).

How this translates to an skb to maintain compatibility with BPF is an
interesting question. One other consideration is that skb's are kernel
specific, we should be able to use the same BPF filter program in
userspace over DPDK for instance-- so an skb interface as the packet
abstraction might not be the right model...

Tom
Eric Dumazet April 4, 2016, 2:33 p.m. UTC | #7
On Mon, 2016-04-04 at 15:07 +0200, Jesper Dangaard Brouer wrote:

> Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send
> to users of this API.
> 
> The hole idea is that an SKB is NOT allocated yet, and not needed at
> this level.  If we start supporting calling underlying SKB functions,
> then we will end-up in the same place (performance wise).

A BPF program can access many skb fields.

If you plan to support BPF, your fake skb needs to be populated like a
real one. Looks like some code will be replicated in all drivers that
want this facility...

Or accept (document ?) that some BPF instructions are just not there.
(hash, queue_mapping ...)
Jesper Dangaard Brouer April 4, 2016, 3:12 p.m. UTC | #8
On Mon, 4 Apr 2016 11:09:57 -0300
Tom Herbert <tom@herbertland.com> wrote:

> On Mon, Apr 4, 2016 at 10:36 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> > On 04/04/2016 03:07 PM, Jesper Dangaard Brouer wrote:  
> >>
> >> On Mon, 04 Apr 2016 10:49:09 +0200 Daniel Borkmann <daniel@iogearbox.net>
> >> wrote:  
> >>>
> >>> On 04/02/2016 03:21 AM, Brenden Blanco wrote:  
> >>>>
> >>>> Add a new bpf prog type that is intended to run in early stages of the
> >>>> packet rx path. Only minimal packet metadata will be available, hence a
> >>>> new
> >>>> context type, struct xdp_metadata, is exposed to userspace. So far only
> >>>> expose the readable packet length, and only in read mode.
> >>>>
> >>>> The PHYS_DEV name is chosen to represent that the program is meant only
> >>>> for physical adapters, rather than all netdevs.
> >>>>
> >>>> While the user visible struct is new, the underlying context must be
> >>>> implemented as a minimal skb in order for the packet load_* instructions
> >>>> to work. The skb filled in by the driver must have skb->len, skb->head,
> >>>> and skb->data set, and skb->data_len == 0.
> >>>>  
> >> [...]  
> >>>
> >>>
> >>> Do you plan to support bpf_skb_load_bytes() as well? I like using
> >>> this API especially when dealing with larger chunks (>4 bytes) to
> >>> load into stack memory, plus content is kept in network byte order.
> >>>
> >>> What about other helpers such as bpf_skb_store_bytes() et al that
> >>> work on skbs. Do you intent to reuse them as is and thus populate
> >>> the per cpu skb with needed fields (faking linear data), or do you
> >>> see larger obstacles that prevent for this?  
> >>
> >>
> >> Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send
> >> to users of this API.
> >>
> >> The hole idea is that an SKB is NOT allocated yet, and not needed at
> >> this level.  If we start supporting calling underlying SKB functions,
> >> then we will end-up in the same place (performance wise).  
> >
> >
> > I'm talking about the current skb-related BPF helper functions we have,
> > so the question is how much from that code we have we can reuse under
> > these constraints (obviously things like the tunnel helpers are a different
> > story) and if that trade-off is acceptable for us. I'm also thinking
> > that, for example, if you need to parse the packet data anyway for a drop
> > verdict, you might as well pass some meta data (that is set in the real
> > skb later on) for those packets that go up the stack.  
> 
> Right, the meta data in this case is an abstracted receive descriptor.
> This would include items that we get in a device receive descriptor
> (computed checksum, hash, VLAN tag). This is purposely a small
> restricted data structure. I'm hoping we can minimize the size of this
> to not much more than 32 bytes (including pointers to data and
> linkage).

I agree.
 
> How this translates to skb to maintain compatibility is with BPF
> interesting question. One other consideration is that skb's are kernel
> specific, we should be able to use the same BPF filter program in
> userspace over DPDK for instance-- so an skb interface as the packet
> abstraction might not be the right model...

I agree.  I don't think reusing the SKB data structure is the right
model.  We should drop the SKB pointer from the API.

As Tom also points out, making the BPF interface independent of the SKB
meta-data structure, would also make the eBPF program more generally
applicable.
Edward Cree April 4, 2016, 3:18 p.m. UTC | #9
On 04/04/16 15:33, Eric Dumazet wrote:
> On Mon, 2016-04-04 at 15:07 +0200, Jesper Dangaard Brouer wrote:
> 
>> Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send
>> to users of this API.
>>
>> The hole idea is that an SKB is NOT allocated yet, and not needed at
>> this level.  If we start supporting calling underlying SKB functions,
>> then we will end-up in the same place (performance wise).
> 
> A BPF program can access many skb fields.
> 
> If you plan to support BPF, your fake skb needs to be populated like a
> real one. Looks like some code will be replicated in all drivers that
> want this facility...
> 
> Or accept (document ?) that some BPF instructions are just not there.
> (hash, queue_mapping ...)

If these progs are eventually going to get pushed down into supporting
hardware, many skb things won't make sense at all at that level.  I would
suggest that anything hardware wouldn't reasonably have available should
be "just not there"; I suspect that'll lead you to the right API for early
driver filter as well.  And it probably won't look much like an skb.

-Ed
Brenden Blanco April 4, 2016, 3:29 p.m. UTC | #10
On Mon, Apr 04, 2016 at 05:12:27PM +0200, Jesper Dangaard Brouer wrote:
> On Mon, 4 Apr 2016 11:09:57 -0300
> Tom Herbert <tom@herbertland.com> wrote:
> 
> > On Mon, Apr 4, 2016 at 10:36 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> > > On 04/04/2016 03:07 PM, Jesper Dangaard Brouer wrote:  
> > >>
> > >> On Mon, 04 Apr 2016 10:49:09 +0200 Daniel Borkmann <daniel@iogearbox.net>
> > >> wrote:  
> > >>>
> > >>> On 04/02/2016 03:21 AM, Brenden Blanco wrote:  
> > >>>>
> > >>>> Add a new bpf prog type that is intended to run in early stages of the
> > >>>> packet rx path. Only minimal packet metadata will be available, hence a
> > >>>> new
> > >>>> context type, struct xdp_metadata, is exposed to userspace. So far only
> > >>>> expose the readable packet length, and only in read mode.
> > >>>>
> > >>>> The PHYS_DEV name is chosen to represent that the program is meant only
> > >>>> for physical adapters, rather than all netdevs.
> > >>>>
> > >>>> While the user visible struct is new, the underlying context must be
> > >>>> implemented as a minimal skb in order for the packet load_* instructions
> > >>>> to work. The skb filled in by the driver must have skb->len, skb->head,
> > >>>> and skb->data set, and skb->data_len == 0.
> > >>>>  
> > >> [...]  
> > >>>
> > >>>
> > >>> Do you plan to support bpf_skb_load_bytes() as well? I like using
> > >>> this API especially when dealing with larger chunks (>4 bytes) to
> > >>> load into stack memory, plus content is kept in network byte order.
> > >>>
> > >>> What about other helpers such as bpf_skb_store_bytes() et al that
> > >>> work on skbs. Do you intent to reuse them as is and thus populate
> > >>> the per cpu skb with needed fields (faking linear data), or do you
> > >>> see larger obstacles that prevent for this?  
> > >>
> > >>
> > >> Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send
> > >> to users of this API.
> > >>
> > >> The hole idea is that an SKB is NOT allocated yet, and not needed at
> > >> this level.  If we start supporting calling underlying SKB functions,
> > >> then we will end-up in the same place (performance wise).  
> > >
> > >
> > > I'm talking about the current skb-related BPF helper functions we have,
> > > so the question is how much from that code we have we can reuse under
> > > these constraints (obviously things like the tunnel helpers are a different
> > > story) and if that trade-off is acceptable for us. I'm also thinking
> > > that, for example, if you need to parse the packet data anyway for a drop
> > > verdict, you might as well pass some meta data (that is set in the real
> > > skb later on) for those packets that go up the stack.  
> > 
> > Right, the meta data in this case is an abstracted receive descriptor.
> > This would include items that we get in a device receive descriptor
> > (computed checksum, hash, VLAN tag). This is purposely a small
> > restricted data structure. I'm hoping we can minimize the size of this
> > to not much more than 32 bytes (including pointers to data and
> > linkage).
> 
> I agree.
>  
> > How this translates to skb to maintain compatibility is with BPF
> > interesting question. One other consideration is that skb's are kernel
> > specific, we should be able to use the same BPF filter program in
> > userspace over DPDK for instance-- so an skb interface as the packet
> > abstraction might not be the right model...
> 
> I agree.  I don't think reusing the SKB data structure is the right
> model.  We should drop the SKB pointer from the API.
> 
> As Tom also points out, making the BPF interface independent of the SKB
> meta-data structure, would also make the eBPF program more generally
> applicable.
The initial approach that I tried went down this path. Alexei advised
that I use the pseudo skb, and in the future the API between drivers and
bpf can change to adopt non-skb context. The only user facing ABIs in
this patchset are the IFLA, the xdp_metadata struct, and the name of the
new enum.

The reason to use a pseudo skb for now is that there will be a fair
amount of churn to get bpf jit and interpreter to understand non-skb
context in the bpf_load_pointer() code. I don't see the need for
requiring that for this patchset, as it will be internal-only change
if/when we use something else.
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   Author of http://www.iptv-analyzer.org
>   LinkedIn: http://www.linkedin.com/in/brouer
John Fastabend April 4, 2016, 4:07 p.m. UTC | #11
On 16-04-04 08:29 AM, Brenden Blanco wrote:
> On Mon, Apr 04, 2016 at 05:12:27PM +0200, Jesper Dangaard Brouer wrote:
>> On Mon, 4 Apr 2016 11:09:57 -0300
>> Tom Herbert <tom@herbertland.com> wrote:
>>
>>> On Mon, Apr 4, 2016 at 10:36 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>>>> On 04/04/2016 03:07 PM, Jesper Dangaard Brouer wrote:  
>>>>>
>>>>> On Mon, 04 Apr 2016 10:49:09 +0200 Daniel Borkmann <daniel@iogearbox.net>
>>>>> wrote:  
>>>>>>
>>>>>> On 04/02/2016 03:21 AM, Brenden Blanco wrote:  
>>>>>>>
>>>>>>> Add a new bpf prog type that is intended to run in early stages of the
>>>>>>> packet rx path. Only minimal packet metadata will be available, hence a
>>>>>>> new
>>>>>>> context type, struct xdp_metadata, is exposed to userspace. So far only
>>>>>>> expose the readable packet length, and only in read mode.
>>>>>>>
>>>>>>> The PHYS_DEV name is chosen to represent that the program is meant only
>>>>>>> for physical adapters, rather than all netdevs.
>>>>>>>
>>>>>>> While the user visible struct is new, the underlying context must be
>>>>>>> implemented as a minimal skb in order for the packet load_* instructions
>>>>>>> to work. The skb filled in by the driver must have skb->len, skb->head,
>>>>>>> and skb->data set, and skb->data_len == 0.
>>>>>>>  
>>>>> [...]  
>>>>>>
>>>>>>
>>>>>> Do you plan to support bpf_skb_load_bytes() as well? I like using
>>>>>> this API especially when dealing with larger chunks (>4 bytes) to
>>>>>> load into stack memory, plus content is kept in network byte order.
>>>>>>
>>>>>> What about other helpers such as bpf_skb_store_bytes() et al that
>>>>>> work on skbs. Do you intent to reuse them as is and thus populate
>>>>>> the per cpu skb with needed fields (faking linear data), or do you
>>>>>> see larger obstacles that prevent for this?  
>>>>>
>>>>>
>>>>> Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send
>>>>> to users of this API.
>>>>>
>>>>> The hole idea is that an SKB is NOT allocated yet, and not needed at
>>>>> this level.  If we start supporting calling underlying SKB functions,
>>>>> then we will end-up in the same place (performance wise).  
>>>>
>>>>
>>>> I'm talking about the current skb-related BPF helper functions we have,
>>>> so the question is how much from that code we have we can reuse under
>>>> these constraints (obviously things like the tunnel helpers are a different
>>>> story) and if that trade-off is acceptable for us. I'm also thinking
>>>> that, for example, if you need to parse the packet data anyway for a drop
>>>> verdict, you might as well pass some meta data (that is set in the real
>>>> skb later on) for those packets that go up the stack.  
>>>
>>> Right, the meta data in this case is an abstracted receive descriptor.
>>> This would include items that we get in a device receive descriptor
>>> (computed checksum, hash, VLAN tag). This is purposely a small
>>> restricted data structure. I'm hoping we can minimize the size of this
>>> to not much more than 32 bytes (including pointers to data and
>>> linkage).
>>
>> I agree.
>>  
>>> How this translates to skb to maintain compatibility is with BPF
>>> interesting question. One other consideration is that skb's are kernel
>>> specific, we should be able to use the same BPF filter program in
>>> userspace over DPDK for instance-- so an skb interface as the packet
>>> abstraction might not be the right model...
>>
>> I agree.  I don't think reusing the SKB data structure is the right
>> model.  We should drop the SKB pointer from the API.
>>
>> As Tom also points out, making the BPF interface independent of the SKB
>> meta-data structure, would also make the eBPF program more generally
>> applicable.
> The initial approach that I tried went down this path. Alexei advised
> that I use the pseudo skb, and in the future the API between drivers and
> bpf can change to adopt non-skb context. The only user facing ABIs in
> this patchset are the IFLA, the xdp_metadata struct, and the name of the
> new enum.
> 
> The reason to use a pseudo skb for now is that there will be a fair
> amount of churn to get bpf jit and interpreter to understand non-skb
> context in the bpf_load_pointer() code. I don't see the need for
> requiring that for this patchset, as it will be internal-only change
> if/when we use something else.

Another option would be to have per driver JIT code to patch up the
skb read/loads with descriptor reads and metadata. From a strictly
performance stand point it should be better than pseudo skbs.

.John
Brenden Blanco April 4, 2016, 4:17 p.m. UTC | #12
On Mon, Apr 04, 2016 at 09:07:03AM -0700, John Fastabend wrote:
> On 16-04-04 08:29 AM, Brenden Blanco wrote:
> > On Mon, Apr 04, 2016 at 05:12:27PM +0200, Jesper Dangaard Brouer wrote:
> >> On Mon, 4 Apr 2016 11:09:57 -0300
> >> Tom Herbert <tom@herbertland.com> wrote:
> >>
> >>> On Mon, Apr 4, 2016 at 10:36 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>>> On 04/04/2016 03:07 PM, Jesper Dangaard Brouer wrote:  
> >>>>>
> >>>>> On Mon, 04 Apr 2016 10:49:09 +0200 Daniel Borkmann <daniel@iogearbox.net>
> >>>>> wrote:  
> >>>>>>
> >>>>>> On 04/02/2016 03:21 AM, Brenden Blanco wrote:  
> >>>>>>>
> >>>>>>> Add a new bpf prog type that is intended to run in early stages of the
> >>>>>>> packet rx path. Only minimal packet metadata will be available, hence a
> >>>>>>> new
> >>>>>>> context type, struct xdp_metadata, is exposed to userspace. So far only
> >>>>>>> expose the readable packet length, and only in read mode.
> >>>>>>>
> >>>>>>> The PHYS_DEV name is chosen to represent that the program is meant only
> >>>>>>> for physical adapters, rather than all netdevs.
> >>>>>>>
> >>>>>>> While the user visible struct is new, the underlying context must be
> >>>>>>> implemented as a minimal skb in order for the packet load_* instructions
> >>>>>>> to work. The skb filled in by the driver must have skb->len, skb->head,
> >>>>>>> and skb->data set, and skb->data_len == 0.
> >>>>>>>  
> >>>>> [...]  
> >>>>>>
> >>>>>>
> >>>>>> Do you plan to support bpf_skb_load_bytes() as well? I like using
> >>>>>> this API especially when dealing with larger chunks (>4 bytes) to
> >>>>>> load into stack memory, plus content is kept in network byte order.
> >>>>>>
> >>>>>> What about other helpers such as bpf_skb_store_bytes() et al that
> >>>>>> work on skbs. Do you intent to reuse them as is and thus populate
> >>>>>> the per cpu skb with needed fields (faking linear data), or do you
> >>>>>> see larger obstacles that prevent for this?  
> >>>>>
> >>>>>
> >>>>> Argh... maybe the minimal pseudo/fake SKB is the wrong "signal" to send
> >>>>> to users of this API.
> >>>>>
> >>>>> The whole idea is that an SKB is NOT allocated yet, and not needed at
> >>>>> this level.  If we start supporting calling underlying SKB functions,
> >>>>> then we will end-up in the same place (performance wise).  
> >>>>
> >>>>
> >>>> I'm talking about the current skb-related BPF helper functions we have,
> >>>> so the question is how much from that code we have we can reuse under
> >>>> these constraints (obviously things like the tunnel helpers are a different
> >>>> story) and if that trade-off is acceptable for us. I'm also thinking
> >>>> that, for example, if you need to parse the packet data anyway for a drop
> >>>> verdict, you might as well pass some meta data (that is set in the real
> >>>> skb later on) for those packets that go up the stack.  
> >>>
> >>> Right, the meta data in this case is an abstracted receive descriptor.
> >>> This would include items that we get in a device receive descriptor
> >>> (computed checksum, hash, VLAN tag). This is purposely a small
> >>> restricted data structure. I'm hoping we can minimize the size of this
> >>> to not much more than 32 bytes (including pointers to data and
> >>> linkage).
> >>
> >> I agree.
> >>  
> >>> How this translates to skb to maintain compatibility is with BPF
> >>> interesting question. One other consideration is that skb's are kernel
> >>> specific, we should be able to use the same BPF filter program in
> >>> userspace over DPDK for instance-- so an skb interface as the packet
> >>> abstraction might not be the right model...
> >>
> >> I agree.  I don't think reusing the SKB data structure is the right
> >> model.  We should drop the SKB pointer from the API.
> >>
> >> As Tom also points out, making the BPF interface independent of the SKB
> >> meta-data structure, would also make the eBPF program more generally
> >> applicable.
> > The initial approach that I tried went down this path. Alexei advised
> > that I use the pseudo skb, and in the future the API between drivers and
> > bpf can change to adopt non-skb context. The only user facing ABIs in
> > this patchset are the IFLA, the xdp_metadata struct, and the name of the
> > new enum.
> > 
> > The reason to use a pseudo skb for now is that there will be a fair
> > amount of churn to get bpf jit and interpreter to understand non-skb
> > context in the bpf_load_pointer() code. I don't see the need for
> > requiring that for this patchset, as it will be internal-only change
> > if/when we use something else.
> 
> Another option would be to have per driver JIT code to patch up the
> skb read/loads with descriptor reads and metadata. From a strictly
> performance stand point it should be better than pseudo skbs.

I considered (and implemented) this as well, but there my problem was
that I needed to inform the bpf() syscall at BPF_PROG_LOAD time which
ifindex to look at for fixups, so I had to add a new ifindex field to
bpf_attr. Then during verification I had to use a new ndo to get the
driver-specific offsets for its particular descriptor format. It seemed
kludgy.
> 
> .John
Alexei Starovoitov April 4, 2016, 8 p.m. UTC | #13
On Mon, Apr 04, 2016 at 09:17:22AM -0700, Brenden Blanco wrote:
> > >>
> > >> As Tom also points out, making the BPF interface independent of the SKB
> > >> meta-data structure, would also make the eBPF program more generally
> > >> applicable.
> > > The initial approach that I tried went down this path. Alexei advised
> > > that I use the pseudo skb, and in the future the API between drivers and
> > > bpf can change to adopt non-skb context. The only user facing ABIs in
> > > this patchset are the IFLA, the xdp_metadata struct, and the name of the
> > > new enum.

Exactly. That the most important part of this rfc.
Right now redirect to different queue, batching, prefetch and tons of
other code are missing. We have to plan the whole project, so we can
incrementally add features without breaking abi.
So new IFLA, xdp_metadata struct and enum for bpf return codes are
the main things to agree on.

> > > The reason to use a pseudo skb for now is that there will be a fair
> > > amount of churn to get bpf jit and interpreter to understand non-skb
> > > context in the bpf_load_pointer() code. I don't see the need for
> > > requiring that for this patchset, as it will be internal-only change
> > > if/when we use something else.
> > 
> > Another option would be to have per driver JIT code to patch up the
> > skb read/loads with descriptor reads and metadata. From a strictly
> > performance stand point it should be better than pseudo skbs.

Per-driver pre/post JIT-like phase is actually on the table. It may
still be needed. If we can avoid it while keeping high performance, great.
 
> I considered (and implemented) this as well, but there my problem was
> that I needed to inform the bpf() syscall at BPF_PROG_LOAD time which
> ifindex to look at for fixups, so I had to add a new ifindex field to
> bpf_attr. Then during verification I had to use a new ndo to get the
> driver-specific offsets for its particular descriptor format. It seemed
> kludgy.

Another reason for going with 'pseudo skb' structure was to reuse
load_byte/half/word instructions in bpf interpreter as-is.
Right now these instructions have to see in-kernel
'struct sk_buff' as context (note we have mirror __sk_buff
for user space), so to use load_byte for bpf_prog_type_phys_dev
we have to give real 'struct sk_buff' to interpreter with
data, head, len, data_len fields initialized, so that
interpreter 'just works'.
The potential fix would be to add phys_dev specific load_byte/word
instructions. Then we can drop all the legacy negative offset
stuff that <1% uses, but it slows down everyone.
We can also drop byteswap that load_word does (which turned out
to be confusing and often programs do 2nd byteswap to go
back to cpu endianness).
And if we do it smart, we can drop length check as well,
then new_load_byte will actually be single load byte cpu instruction.
We can drop length check when offset is constant in the verifier
and that constant is less than 64, since all packets are larger.
As seen in 'perf report' from patch 5:
  3.32%  ksoftirqd/1    [kernel.vmlinux]  [k] sk_load_byte_positive_offset
this is 14Mpps and 4 assembler instructions in the above function
are consuming 3% of the cpu. Making new_load_byte to be single
x86 insn would be really cool.

Of course, there are other pieces to accelerate:
 12.71%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_alloc_frags
  6.87%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_free_frag
  4.20%  ksoftirqd/1    [kernel.vmlinux]  [k] get_page_from_freelist
  4.09%  swapper        [mlx4_en]         [k] mlx4_en_process_rx_cq
and I think Jesper's work on batch allocation is going to help that a lot.
Thomas Graf April 4, 2016, 10:04 p.m. UTC | #14
On 04/04/16 at 01:00pm, Alexei Starovoitov wrote:
> Exactly. That the most important part of this rfc.
> Right now redirect to different queue, batching, prefetch and tons of
> other code are missing. We have to plan the whole project, so we can
> incrementally add features without breaking abi.
> So new IFLA, xdp_metadata struct and enum for bpf return codes are
> the main things to agree on.

+1
This is the most important statement in this thread so far. A plan
that gets us from this RFC series to a functional forwarding engine
with redirect and load/write is essential. [...]

> Another reason for going with 'pseudo skb' structure was to reuse
> load_byte/half/word instructions in bpf interpreter as-is.
> Right now these instructions have to see in-kernel
> 'struct sk_buff' as context (note we have mirror __sk_buff
> for user space), so to use load_byte for bpf_prog_type_phys_dev
> we have to give real 'struct sk_buff' to interpter with
> data, head, len, data_len fields initialized, so that
> interpreter 'just works'.
> The potential fix would be to add phys_dev specific load_byte/word
> instructions. Then we can drop all the legacy negative offset
> stuff that <1% uses, but it slows down everyone.
> We can also drop byteswap that load_word does (which turned out
> to be confusing and often programs do 2nd byteswap to go
> back to cpu endiannes).

[...] I would really like to see a common set of helpers which
applies to both cls_bpf and phys_dev. Given the existing skb based
helpers cannot be overloaded, at least the phys_dev helpers should
be made to work in cls_bpf context as well to allow for some
portability. Otherwise we'll end up with half a dozen flavours of
BPF which are all incompatible.

> And if we do it smart, we can drop length check as well,
> then new_load_byte will actually be single load byte cpu instruction.
> We can drop length check when offset is constant in the verfier
> and that constant is less than 64, since all packets are larger.
> As seen in 'perf report' from patch 5:
>   3.32%  ksoftirqd/1    [kernel.vmlinux]  [k] sk_load_byte_positive_offset
> this is 14Mpps and 4 assembler instructions in the above function
> are consuming 3% of the cpu. Making new_load_byte to be single
> x86 insn would be really cool.

Neat.
Thomas Graf April 4, 2016, 10:07 p.m. UTC | #15
On 04/03/16 at 12:02am, Brenden Blanco wrote:
> On Sat, Apr 02, 2016 at 12:39:45PM -0400, Tom Herbert wrote:
> > Is there a hard restriction that this could only work with physical devices?
> I suppose not, but there wouldn't be much use case as compared to tc
> ingress, no? Since I was imagining that this new hook would be more
> restricted in functionality due to operating on descriptors rather than
> with a full skb, I tried to think of an appropriate name.
> 
> If you think that this hook would spread, then a better name is needed.

The thing that comes to mind is that this prog type makes it easier to
implement batched processing of packets in BPF which would require major
surgery across the tc layer to make it available in cls_bpf. Batched
processing will be beneficial for software devices as well.
Alexei Starovoitov April 5, 2016, 2:25 a.m. UTC | #16
On Tue, Apr 05, 2016 at 12:04:39AM +0200, Thomas Graf wrote:
> On 04/04/16 at 01:00pm, Alexei Starovoitov wrote:
> > Exactly. That the most important part of this rfc.
> > Right now redirect to different queue, batching, prefetch and tons of
> > other code are mising. We have to plan the whole project, so we can
> > incrementally add features without breaking abi.
> > So new IFLA, xdp_metadata struct and enum for bpf return codes are
> > the main things to agree on.
> 
> +1
> This is the most important statement in this thread so far. A plan
> that gets us from this RFC series to a functional forwarding engine
> with redirect and load/write is essential. [...]

exactly. I think the next step 2 is to figure out the redirect return code
and 'rewiring' of the rx dma buffer into tx ring and auto-batching.
As this rfc showed even when using standard page alloc/free the performance
is hitting 10Gbps hw limit and not being cpu bounded, so recycling of
the pages and avoiding map/unmap will come at step 3.
Batching is necessary even for basic redirect, since ringing doorbell
for every tx buffer is not an option.

> [...] I would really like to see a common set of helpers which
> applies to both cls_bpf and phys_dev. Given the existing skb based
> helpers cannot be overloaded, at least the phys_dev helpers should
> be made to work in cls_bpf context as well to allow for some
> portability. Otherwise we'll end up with half a dozen flavours of
> BPF which are all incompatible.

The helpers can be 'overloaded'. In my upcoming patches for
bpf+tracepoints the bpf_perf_event_output() helper is different
depending on program type (kprobe vs tracepoint), but logically
it looks exactly the same from program point of view and
BPF_FUNC_id is reused.
So for cls_bpf vs bpf_phys_dev we can have the same bpf_csum_diff()
helper which will have different internal implementation depending
on program type.
Jesper Dangaard Brouer April 5, 2016, 8:11 a.m. UTC | #17
On Mon, 4 Apr 2016 19:25:08 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Tue, Apr 05, 2016 at 12:04:39AM +0200, Thomas Graf wrote:
> > On 04/04/16 at 01:00pm, Alexei Starovoitov wrote:  
> > > Exactly. That the most important part of this rfc.
> > > Right now redirect to different queue, batching, prefetch and tons of
> > > other code are mising. We have to plan the whole project, so we can
> > > incrementally add features without breaking abi.
> > > So new IFLA, xdp_metadata struct and enum for bpf return codes are
> > > the main things to agree on.  
> > 
> > +1
> > This is the most important statement in this thread so far. A plan
> > that gets us from this RFC series to a functional forwarding engine
> > with redirect and load/write is essential. [...]  

+1 agreed, I love to see the energy in this thread! :-)
 
> exactly. I think the next step 2 is to figure out the redirect return code
> and 'rewiring' of the rx dma buffer into tx ring and auto-batching.
>
> As this rfc showed even when using standard page alloc/free the performance
> is hitting 10Gbps hw limit and not being cpu bounded, so recycling of
> the pages and avoiding map/unmap will come at step 3.

Yes, I've also noticed the standard page alloc/free performance is
slowing us down.  I will be working on identifying (and measuring) page
allocator bottlenecks.

For the early drop case, we should be able to hack the driver to,
recycle the page directly (and even avoid the DMA unmap).  But for the
TX (forward) case, we would need some kind of page-pool cache API to
recycle the page through (also useful for normal netstack usage).
  I'm interested in implementing a generic page-pool cache mechanism,
and I plan to bring up this topic at the MM-summit in 2 weeks.  Input
are welcome... but as Alexei says this is likely a step 3 project.


> Batching is necessary even for basic redirect, since ringing doorbell
> for every tx buffer is not an option.

Yes, we know TX batching is essential for performance.  If we create a
RX bundle/batch step (in the driver) then eBPF forward step can work on
a RX-bundle and build up TX-bundle(s) (per TX device), that can TX bulk
send these.  Notice that I propose building TX-bundles, not sending
each individual packet and flushing tail/doorbell, because I want to
maximize icache efficiency of the eBPF program.
Jesper Dangaard Brouer April 5, 2016, 8:19 a.m. UTC | #18
On Tue, 5 Apr 2016 00:07:14 +0200 Thomas Graf <tgraf@suug.ch> wrote:

> On 04/03/16 at 12:02am, Brenden Blanco wrote:
> > On Sat, Apr 02, 2016 at 12:39:45PM -0400, Tom Herbert wrote:  
> > > Is there a hard restriction that this could only work with physical devices?  
> > I suppose not, but there wouldn't be much use case as compared to tc
> > ingress, no? Since I was imagining that this new hook would be more
> > restricted in functionality due to operating on descriptors rather than
> > with a full skb, I tried to think of an appropriate name.
> > 
> > If you think that this hook would spread, then a better name is needed.  
> 
> The thing that comes to mind is that this prog type makes it easier to
> implement batched processing of packets in BPF which would require major
> surgery across the tc layer to make it available in cls_bpf. Batched
> processing will be beneficial for software devices as well.

+1 agreed, I would really like to see batched processing of packets in
BPF, for this packet type, designed in from the start.
Jesper Dangaard Brouer April 5, 2016, 9:29 a.m. UTC | #19
On Mon, 4 Apr 2016 13:00:34 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> As seen in 'perf report' from patch 5:
>   3.32%  ksoftirqd/1    [kernel.vmlinux]  [k] sk_load_byte_positive_offset
> this is 14Mpps and 4 assembler instructions in the above function
> are consuming 3% of the cpu.

At this level we also need to take into account the cost/overhead of a
function call.  Which I've measured to between 5-7 cycles, part of my
time_bench_sample[1] test.

> Making new_load_byte to be single  x86 insn would be really cool.
> 
> Of course, there are other pieces to accelerate:
>  12.71%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_alloc_frags
>   6.87%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_free_frag
>   4.20%  ksoftirqd/1    [kernel.vmlinux]  [k] get_page_from_freelist
>   4.09%  swapper        [mlx4_en]         [k] mlx4_en_process_rx_cq
> and I think Jesper's work on batch allocation is going to help that a lot.

Actually, it looks like all of this "overhead" comes from the page
alloc/free (+ dma unmap/map). We would need a page-pool recycle
mechanism to solve/remove this overhead.  For the early drop case we
might be able to hack recycle the page directly in the driver (and also
avoid dma_unmap/map cycle).


[1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/time_bench_sample.c
Alexei Starovoitov April 5, 2016, 10:06 p.m. UTC | #20
On Tue, Apr 05, 2016 at 11:29:05AM +0200, Jesper Dangaard Brouer wrote:
> > 
> > Of course, there are other pieces to accelerate:
> >  12.71%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_alloc_frags
> >   6.87%  ksoftirqd/1    [mlx4_en]         [k] mlx4_en_free_frag
> >   4.20%  ksoftirqd/1    [kernel.vmlinux]  [k] get_page_from_freelist
> >   4.09%  swapper        [mlx4_en]         [k] mlx4_en_process_rx_cq
> > and I think Jesper's work on batch allocation is going to help that a lot.
> 
> Actually, it looks like all of this "overhead" comes from the page
> alloc/free (+ dma unmap/map). We would need a page-pool recycle
> mechanism to solve/remove this overhead.  For the early drop case we
> might be able to hack recycle the page directly in the driver (and also
> avoid dma_unmap/map cycle).

Exactly. A cache of allocated and mapped pages will help a lot both drop
and redirect use cases. After tx completion we can recycle still-mapped
page into the cache (need to make sure to map them PCI_DMA_BIDIRECTIONAL)
and rx can refill the ring with it. For load balancer steady state
we won't have any calls to page allocator and dma.
Being able to do cheap percpu pool like this is a huge advantage
that any kernel bypass cannot have. I'm pretty sure it will be
possible to avoid local_cmpxchg as well.
diff mbox

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 924f537..b8a4ef2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -92,6 +92,7 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_KPROBE,
 	BPF_PROG_TYPE_SCHED_CLS,
 	BPF_PROG_TYPE_SCHED_ACT,
+	BPF_PROG_TYPE_PHYS_DEV,
 };
 
 #define BPF_PSEUDO_MAP_FD	1
@@ -367,6 +368,10 @@  struct __sk_buff {
 	__u32 tc_classid;
 };
 
+struct xdp_metadata {
+	__u32 len;
+};
+
 struct bpf_tunnel_key {
 	__u32 tunnel_id;
 	union {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2e08f8e..804ca70 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1340,6 +1340,7 @@  static bool may_access_skb(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
+	case BPF_PROG_TYPE_PHYS_DEV:
 		return true;
 	default:
 		return false;
diff --git a/net/core/filter.c b/net/core/filter.c
index b7177d0..c417db6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2018,6 +2018,12 @@  tc_cls_act_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static const struct bpf_func_proto *
+phys_dev_func_proto(enum bpf_func_id func_id)
+{
+	return sk_filter_func_proto(func_id);
+}
+
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
 	/* check bounds */
@@ -2073,6 +2079,36 @@  static bool tc_cls_act_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static bool __is_valid_xdp_access(int off, int size,
+				  enum bpf_access_type type)
+{
+	if (off < 0 || off >= sizeof(struct xdp_metadata))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	if (size != 4)
+		return false;
+
+	return true;
+}
+
+static bool phys_dev_is_valid_access(int off, int size,
+				     enum bpf_access_type type)
+{
+	if (type == BPF_WRITE)
+		return false;
+
+	switch (off) {
+	case offsetof(struct xdp_metadata, len):
+		break;
+	default:
+		return false;
+	}
+	return __is_valid_xdp_access(off, size, type);
+}
+
 static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 				      int src_reg, int ctx_off,
 				      struct bpf_insn *insn_buf,
@@ -2210,6 +2246,26 @@  static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 	return insn - insn_buf;
 }
 
+static u32 bpf_phys_dev_convert_ctx_access(enum bpf_access_type type,
+					   int dst_reg, int src_reg,
+					   int ctx_off,
+					   struct bpf_insn *insn_buf,
+					   struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (ctx_off) {
+	case offsetof(struct xdp_metadata, len):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+				      offsetof(struct sk_buff, len));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static const struct bpf_verifier_ops sk_filter_ops = {
 	.get_func_proto = sk_filter_func_proto,
 	.is_valid_access = sk_filter_is_valid_access,
@@ -2222,6 +2278,12 @@  static const struct bpf_verifier_ops tc_cls_act_ops = {
 	.convert_ctx_access = bpf_net_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops phys_dev_ops = {
+	.get_func_proto = phys_dev_func_proto,
+	.is_valid_access = phys_dev_is_valid_access,
+	.convert_ctx_access = bpf_phys_dev_convert_ctx_access,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops = &sk_filter_ops,
 	.type = BPF_PROG_TYPE_SOCKET_FILTER,
@@ -2237,11 +2299,17 @@  static struct bpf_prog_type_list sched_act_type __read_mostly = {
 	.type = BPF_PROG_TYPE_SCHED_ACT,
 };
 
+static struct bpf_prog_type_list phys_dev_type __read_mostly = {
+	.ops = &phys_dev_ops,
+	.type = BPF_PROG_TYPE_PHYS_DEV,
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
 	bpf_register_prog_type(&sched_cls_type);
 	bpf_register_prog_type(&sched_act_type);
+	bpf_register_prog_type(&phys_dev_type);
 
 	return 0;
 }