diff mbox

[RFC,v2,1/5] bpf: add PHYS_DEV prog type for early driver filter

Message ID 1460090930-11219-1-git-send-email-bblanco@plumgrid.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Brenden Blanco April 8, 2016, 4:48 a.m. UTC
Add a new bpf prog type that is intended to run in early stages of the
packet rx path. Only minimal packet metadata will be available, hence a
new context type, struct bpf_phys_dev_md, is exposed to userspace. So
far only expose the readable packet length, and only in read mode.

The PHYS_DEV name is chosen to represent that the program is meant only
for physical adapters, rather than all netdevs.

While the user visible struct is new, the underlying context must be
implemented as a minimal skb in order for the packet load_* instructions
to work. The skb filled in by the driver must have skb->len, skb->head,
and skb->data set, and skb->data_len == 0.

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 include/uapi/linux/bpf.h | 14 ++++++++++
 kernel/bpf/verifier.c    |  1 +
 net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+)

Comments

Jesper Dangaard Brouer April 8, 2016, 10:36 a.m. UTC | #1
On Thu,  7 Apr 2016 21:48:46 -0700
Brenden Blanco <bblanco@plumgrid.com> wrote:

> Add a new bpf prog type that is intended to run in early stages of the
> packet rx path. Only minimal packet metadata will be available, hence a
> new context type, struct bpf_phys_dev_md, is exposed to userspace. So
> far only expose the readable packet length, and only in read mode.
> 
> The PHYS_DEV name is chosen to represent that the program is meant only
> for physical adapters, rather than all netdevs.
> 
> While the user visible struct is new, the underlying context must be
> implemented as a minimal skb in order for the packet load_* instructions
> to work. The skb filled in by the driver must have skb->len, skb->head,
> and skb->data set, and skb->data_len == 0.
> 
> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
> ---
>  include/uapi/linux/bpf.h | 14 ++++++++++
>  kernel/bpf/verifier.c    |  1 +
>  net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 83 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 70eda5a..3018d83 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -93,6 +93,7 @@ enum bpf_prog_type {
>  	BPF_PROG_TYPE_SCHED_CLS,
>  	BPF_PROG_TYPE_SCHED_ACT,
>  	BPF_PROG_TYPE_TRACEPOINT,
> +	BPF_PROG_TYPE_PHYS_DEV,
>  };
>  
>  #define BPF_PSEUDO_MAP_FD	1
> @@ -368,6 +369,19 @@ struct __sk_buff {
>  	__u32 tc_classid;
>  };
>  
> +/* user return codes for PHYS_DEV prog type */
> +enum bpf_phys_dev_action {
> +	BPF_PHYS_DEV_DROP,
> +	BPF_PHYS_DEV_OK,
> +};

I can imagine these extra return codes:

 BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
 BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
 BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */

The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
which we can look at when we get that far...

For the "MODIFIED" case, people caring about checksumming, might want
to voice their concern if we want additional info or return codes,
indicating if software need to recalc CSUM (or e.g. if only IP-pseudo
hdr was modified).


> +/* user accessible metadata for PHYS_DEV packet hook
> + * new fields must be added to the end of this structure
> + */
> +struct bpf_phys_dev_md {
> +	__u32 len;
> +};

This is userspace visible.  I can easily imagine this structure will get
extended.  How does a userspace eBPF program know that the struct got
extended? (bet you got some scheme, normally I would add a "version" as
first elem).

BTW, how and where is this "struct bpf_phys_dev_md" allocated?


>  struct bpf_tunnel_key {
>  	__u32 tunnel_id;
>  	union {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 58792fe..877542d 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1344,6 +1344,7 @@ static bool may_access_skb(enum bpf_prog_type type)
>  	case BPF_PROG_TYPE_SOCKET_FILTER:
>  	case BPF_PROG_TYPE_SCHED_CLS:
>  	case BPF_PROG_TYPE_SCHED_ACT:
> +	case BPF_PROG_TYPE_PHYS_DEV:
>  		return true;
>  	default:
>  		return false;
> diff --git a/net/core/filter.c b/net/core/filter.c
> index e8486ba..4f73fb9 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2021,6 +2021,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
>  	}
>  }
>  
> +static const struct bpf_func_proto *
> +phys_dev_func_proto(enum bpf_func_id func_id)
> +{
> +	return sk_filter_func_proto(func_id);
> +}
> +
>  static bool __is_valid_access(int off, int size, enum bpf_access_type type)
>  {
>  	/* check bounds */
> @@ -2076,6 +2082,36 @@ static bool tc_cls_act_is_valid_access(int off, int size,
>  	return __is_valid_access(off, size, type);
>  }
>  
> +static bool __is_valid_phys_dev_access(int off, int size,
> +				       enum bpf_access_type type)
> +{
> +	if (off < 0 || off >= sizeof(struct bpf_phys_dev_md))
> +		return false;
> +
> +	if (off % size != 0)
> +		return false;
> +
> +	if (size != 4)
> +		return false;
> +
> +	return true;
> +}
> +
> +static bool phys_dev_is_valid_access(int off, int size,
> +				     enum bpf_access_type type)
> +{
> +	if (type == BPF_WRITE)
> +		return false;
> +
> +	switch (off) {
> +	case offsetof(struct bpf_phys_dev_md, len):
> +		break;
> +	default:
> +		return false;
> +	}
> +	return __is_valid_phys_dev_access(off, size, type);
> +}
> +
>  static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
>  				      int src_reg, int ctx_off,
>  				      struct bpf_insn *insn_buf,
> @@ -2213,6 +2249,26 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
>  	return insn - insn_buf;
>  }
>  
> +static u32 bpf_phys_dev_convert_ctx_access(enum bpf_access_type type,
> +					   int dst_reg, int src_reg,
> +					   int ctx_off,
> +					   struct bpf_insn *insn_buf,
> +					   struct bpf_prog *prog)
> +{
> +	struct bpf_insn *insn = insn_buf;
> +
> +	switch (ctx_off) {
> +	case offsetof(struct bpf_phys_dev_md, len):
> +		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
> +
> +		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
> +				      offsetof(struct sk_buff, len));
> +		break;
> +	}
> +
> +	return insn - insn_buf;
> +}
> +
>  static const struct bpf_verifier_ops sk_filter_ops = {
>  	.get_func_proto = sk_filter_func_proto,
>  	.is_valid_access = sk_filter_is_valid_access,
> @@ -2225,6 +2281,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
>  	.convert_ctx_access = bpf_net_convert_ctx_access,
>  };
>  
> +static const struct bpf_verifier_ops phys_dev_ops = {
> +	.get_func_proto = phys_dev_func_proto,
> +	.is_valid_access = phys_dev_is_valid_access,
> +	.convert_ctx_access = bpf_phys_dev_convert_ctx_access,
> +};
> +
>  static struct bpf_prog_type_list sk_filter_type __read_mostly = {
>  	.ops = &sk_filter_ops,
>  	.type = BPF_PROG_TYPE_SOCKET_FILTER,
> @@ -2240,11 +2302,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = {
>  	.type = BPF_PROG_TYPE_SCHED_ACT,
>  };
>  
> +static struct bpf_prog_type_list phys_dev_type __read_mostly = {
> +	.ops = &phys_dev_ops,
> +	.type = BPF_PROG_TYPE_PHYS_DEV,
> +};
> +
>  static int __init register_sk_filter_ops(void)
>  {
>  	bpf_register_prog_type(&sk_filter_type);
>  	bpf_register_prog_type(&sched_cls_type);
>  	bpf_register_prog_type(&sched_act_type);
> +	bpf_register_prog_type(&phys_dev_type);
>  
>  	return 0;
>  }
Daniel Borkmann April 8, 2016, 11:09 a.m. UTC | #2
On 04/08/2016 12:36 PM, Jesper Dangaard Brouer wrote:
> On Thu,  7 Apr 2016 21:48:46 -0700
> Brenden Blanco <bblanco@plumgrid.com> wrote:
>
>> Add a new bpf prog type that is intended to run in early stages of the
>> packet rx path. Only minimal packet metadata will be available, hence a
>> new context type, struct bpf_phys_dev_md, is exposed to userspace. So
>> far only expose the readable packet length, and only in read mode.
>>
>> The PHYS_DEV name is chosen to represent that the program is meant only
>> for physical adapters, rather than all netdevs.
>>
>> While the user visible struct is new, the underlying context must be
>> implemented as a minimal skb in order for the packet load_* instructions
>> to work. The skb filled in by the driver must have skb->len, skb->head,
>> and skb->data set, and skb->data_len == 0.
>>
>> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
>> ---
>>   include/uapi/linux/bpf.h | 14 ++++++++++
>>   kernel/bpf/verifier.c    |  1 +
>>   net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 83 insertions(+)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 70eda5a..3018d83 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -93,6 +93,7 @@ enum bpf_prog_type {
>>   	BPF_PROG_TYPE_SCHED_CLS,
>>   	BPF_PROG_TYPE_SCHED_ACT,
>>   	BPF_PROG_TYPE_TRACEPOINT,
>> +	BPF_PROG_TYPE_PHYS_DEV,
>>   };
>>
>>   #define BPF_PSEUDO_MAP_FD	1
>> @@ -368,6 +369,19 @@ struct __sk_buff {
>>   	__u32 tc_classid;
>>   };
>>
>> +/* user return codes for PHYS_DEV prog type */
>> +enum bpf_phys_dev_action {
>> +	BPF_PHYS_DEV_DROP,
>> +	BPF_PHYS_DEV_OK,
>> +};
>
> I can imagine these extra return codes:
>
>   BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
>   BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
>   BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
>
> The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> which we can look at when we get that far...

I'd probably let the tcpdump case be handled by the rest of the stack.
Forwarding could be to some txqueue or perhaps directly to a virtual net
device e.g. the veth end sitting in a container (where fake skb would
need to be promoted to a real one). Or, perhaps instead of veth end,
directly demuxed into a target socket queue in that container ...
Alternatively for tcpdump use-case, you could also do sampling on the
bpf_phy_dev with eBPF maps.

> For the "MODIFIED" case, people caring about checksumming, might want
> to voice their concern if we want additional info or return codes,
> indicating if software need to recalc CSUM (or e.g. if only IP-pseudo
> hdr was modified).
>
>> +/* user accessible metadata for PHYS_DEV packet hook
>> + * new fields must be added to the end of this structure
>> + */
>> +struct bpf_phys_dev_md {
>> +	__u32 len;
>> +};
>
> This is userspace visible.  I can easily imagine this structure will get
> extended.  How does a userspace eBPF program know that the struct got
> extended? (bet you got some scheme, normally I would add a "version" as
> first elem).
>
> BTW, how and where is this "struct bpf_phys_dev_md" allocated?

It isn't, see bpf_phys_dev_convert_ctx_access(). At load time the verifier
will convert/rewrite this based on offsetof() to a real access of the per
cpu sk_buff, that's the only purpose.

Cheers,
Daniel
Jesper Dangaard Brouer April 8, 2016, 12:33 p.m. UTC | #3
On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> > +/* user return codes for PHYS_DEV prog type */
> > +enum bpf_phys_dev_action {
> > +	BPF_PHYS_DEV_DROP,
> > +	BPF_PHYS_DEV_OK,
> > +};  
> 
> I can imagine these extra return codes:
> 
>  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
>  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
>  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> 
> The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> which we can look at when we get that far...

I want to point out something which is quite FUNDAMENTAL, for
understanding these return codes (and network stack).


At driver RX time, the network stack basically has two ways of
building an SKB, which is sent up the stack.

Option-A (fastest): The packet page is writable. The SKB can be
allocated and skb->data/head can point directly to the page.  And
we place/write skb_shared_info in the end/tail-room. (This is done by
calling build_skb()).

Option-B (slower): The packet page is read-only.  The SKB cannot point
skb->data/head directly to the page, because skb_shared_info need to be
written into skb->end (slightly hidden via skb_shinfo() casting).  To
get around this, a separate piece of memory is allocated (speedup by
__alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
be written. (This is done when calling netdev/napi_alloc_skb()).
  Drivers then need to copy over packet headers, and assign + adjust
skb_shinfo(skb)->frags[0] offset to skip copied headers.


Unfortunately most drivers use option-B.  Due to the cost of calling the
page allocator.  It is only slightly more expensive to get a larger
compound page from the page allocator, which then can be partitioned into
page-fragments, thus amortizing the page alloc cost.  Unfortunately the
cost is added later, when constructing the SKB.
 Another reason for option-B, is that archs with expensive IOMMU
requirements (like PowerPC), don't need to dma_unmap on every packet,
but only on the compound page level.

Side-note: Most drivers have a "copy-break" optimization.  Especially
for option-B, when copying header data anyhow. For small packet, one
might as well free (or recycle) the RX page, if header size fits into
the newly allocated memory (for skb_shared_info).


For the early filter drop (DDoS use-case), it does not matter that the
packet-page is read-only.

BUT for the future XDP (eXpress Data Path) use-case it does matter.  If
we ever want to see speeds comparable to DPDK, then drivers
need to implement option-A, as this allows forwarding at the packet-page
level.

I hope my future page-pool facility can remove/hide the cost of calling
the page allocator.


Back to the return codes, thus:
-------------------------------
BPF_PHYS_DEV_SHARED requires driver use option-B, when constructing
the SKB, and treat packet data as read-only.

BPF_PHYS_DEV_MODIFIED requires driver to provide a writable packet-page.
Brenden Blanco April 8, 2016, 4:48 p.m. UTC | #4
On Fri, Apr 08, 2016 at 01:09:30PM +0200, Daniel Borkmann wrote:
> On 04/08/2016 12:36 PM, Jesper Dangaard Brouer wrote:
> >On Thu,  7 Apr 2016 21:48:46 -0700
> >Brenden Blanco <bblanco@plumgrid.com> wrote:
> >
> >>Add a new bpf prog type that is intended to run in early stages of the
> >>packet rx path. Only minimal packet metadata will be available, hence a
> >>new context type, struct bpf_phys_dev_md, is exposed to userspace. So
> >>far only expose the readable packet length, and only in read mode.
> >>
> >>The PHYS_DEV name is chosen to represent that the program is meant only
> >>for physical adapters, rather than all netdevs.
> >>
> >>While the user visible struct is new, the underlying context must be
> >>implemented as a minimal skb in order for the packet load_* instructions
> >>to work. The skb filled in by the driver must have skb->len, skb->head,
> >>and skb->data set, and skb->data_len == 0.
> >>
> >>Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
> >>---
> >>  include/uapi/linux/bpf.h | 14 ++++++++++
> >>  kernel/bpf/verifier.c    |  1 +
> >>  net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
> >>  3 files changed, 83 insertions(+)
> >>
> >>diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >>index 70eda5a..3018d83 100644
> >>--- a/include/uapi/linux/bpf.h
> >>+++ b/include/uapi/linux/bpf.h
> >>@@ -93,6 +93,7 @@ enum bpf_prog_type {
> >>  	BPF_PROG_TYPE_SCHED_CLS,
> >>  	BPF_PROG_TYPE_SCHED_ACT,
> >>  	BPF_PROG_TYPE_TRACEPOINT,
> >>+	BPF_PROG_TYPE_PHYS_DEV,
> >>  };
> >>
> >>  #define BPF_PSEUDO_MAP_FD	1
> >>@@ -368,6 +369,19 @@ struct __sk_buff {
> >>  	__u32 tc_classid;
> >>  };
> >>
> >>+/* user return codes for PHYS_DEV prog type */
> >>+enum bpf_phys_dev_action {
> >>+	BPF_PHYS_DEV_DROP,
> >>+	BPF_PHYS_DEV_OK,
> >>+};
> >
> >I can imagine these extra return codes:
> >
> >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
> >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> >
> >The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> >which we can look at when we get that far...
> 
> I'd probably let the tcpdump case be handled by the rest of the stack.
> Forwarding could be to some txqueue or perhaps directly to a virtual net
> device e.g. the veth end sitting in a container (where fake skb would
> need to be promoted to a real one). Or, perhaps instead of veth end,
> directly demuxed into a target socket queue in that container ...
> Alternatively for tcpdump use-case, you could also do sampling on the
> bpf_phy_dev with eBPF maps.
+1, there is plenty of infrastructure to deal with this already.
> 
> >For the "MODIFIED" case, people caring about checksumming, might want
> >to voice their concern if we want additional info or return codes,
> >indicating if software need to recalc CSUM (or e.g. if only IP-pseudo
> >hdr was modified).
> >
> >>+/* user accessible metadata for PHYS_DEV packet hook
> >>+ * new fields must be added to the end of this structure
> >>+ */
> >>+struct bpf_phys_dev_md {
> >>+	__u32 len;
> >>+};
> >
> >This is userspace visible.  I can easily imagine this structure will get
> >extended.  How does a userspace eBPF program know that the struct got
> >extended? (bet you got some scheme, normally I would add a "version" as
> >first elem).
Since fields are only ever added to the end, programs that access beyond
the struct as understood by the running kernel will be rejected. The
struct ordering is a hard requirement. We've gone through quite a few
upgrades of this style of struct and not had any issues that I am aware of.
> >
> >BTW, how and where is this "struct bpf_phys_dev_md" allocated?
> 
> It isn't, see bpf_phys_dev_convert_ctx_access(). At load time the verifier
> will convert/rewrite this based on offsetof() to a real access of the per
> cpu sk_buff, that's the only purpose.
> 
> Cheers,
> Daniel
Brenden Blanco April 8, 2016, 5:02 p.m. UTC | #5
On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> 
> On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> 
> > > +/* user return codes for PHYS_DEV prog type */
> > > +enum bpf_phys_dev_action {
> > > +	BPF_PHYS_DEV_DROP,
> > > +	BPF_PHYS_DEV_OK,
> > > +};  
> > 
> > I can imagine these extra return codes:
> > 
> >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
> >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> > 
> > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > which we can look at when we get that far...
> 
> I want to point out something which is quite FUNDAMENTAL, for
> understanding these return codes (and network stack).
> 
> 
> At driver RX time, the network stack basically have two ways of
> building an SKB, which is send up the stack.
> 
> Option-A (fastest): The packet page is writable. The SKB can be
> allocated and skb->data/head can point directly to the page.  And
> we place/write skb_shared_info in the end/tail-room. (This is done by
> calling build_skb()).
> 
> Option-B (slower): The packet page is read-only.  The SKB cannot point
> skb->data/head directly to the page, because skb_shared_info need to be
> written into skb->end (slightly hidden via skb_shinfo() casting).  To
> get around this, a separate piece of memory is allocated (speedup by
> __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> be written. (This is done when calling netdev/napi_alloc_skb()).
>   Drivers then need to copy over packet headers, and assign + adjust
> skb_shinfo(skb)->frags[0] offset to skip copied headers.
> 
> 
> Unfortunately most drivers use option-B.  Due to cost of calling the
> page allocator.  It is only slightly most expensive to get a larger
> compound page from the page allocator, which then can be partitioned into
> page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> cost is added later, when constructing the SKB.
>  Another reason for option-B, is that archs with expensive IOMMU
> requirements (like PowerPC), don't need to dma_unmap on every packet,
> but only on the compound page level.
> 
> Side-note: Most drivers have a "copy-break" optimization.  Especially
> for option-B, when copying header data anyhow. For small packet, one
> might as well free (or recycle) the RX page, if header size fits into
> the newly allocated memory (for skb_shared_info).
> 
> 
> For the early filter drop (DDoS use-case), it does not matter that the
> packet-page is read-only.
> 
> BUT for the future XDP (eXpress Data Path) use-case it does matter.  If
> we ever want to see speeds comparable to DPDK, then drivers to
> need to implement option-A, as this allow forwarding at the packet-page
> level.
> 
> I hope, my future page-pool facility can remove/hide the cost calling
> the page allocator.
> 
Can't wait! This will open up a lot of doors.
> 
> Back to the return codes, thus:
> -------------------------------
> BPF_PHYS_DEV_SHARED requires driver use option-B, when constructing
> the SKB, and treat packet data as read-only.
> 
> BPF_PHYS_DEV_MODIFIED requires driver to provide a writable packet-page.
I understand the driver/hw requirement, but the codes themselves I think
need some tweaking. For instance, if the packet is both modified and
forwarded, should the flags be ORed together? Or is the need for this
return code made obsolete if the driver knows ahead of time via struct
bpf_prog flags that the prog intends to modify the packet, and can set
up the page accordingly?
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   Author of http://www.iptv-analyzer.org
>   LinkedIn: http://www.linkedin.com/in/brouer
Alexei Starovoitov April 8, 2016, 5:26 p.m. UTC | #6
On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> 
> On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> 
> > > +/* user return codes for PHYS_DEV prog type */
> > > +enum bpf_phys_dev_action {
> > > +	BPF_PHYS_DEV_DROP,
> > > +	BPF_PHYS_DEV_OK,
> > > +};  
> > 
> > I can imagine these extra return codes:
> > 
> >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
> >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> > 
> > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > which we can look at when we get that far...
> 
> I want to point out something which is quite FUNDAMENTAL, for
> understanding these return codes (and network stack).
> 
> 
> At driver RX time, the network stack basically have two ways of
> building an SKB, which is send up the stack.
> 
> Option-A (fastest): The packet page is writable. The SKB can be
> allocated and skb->data/head can point directly to the page.  And
> we place/write skb_shared_info in the end/tail-room. (This is done by
> calling build_skb()).
> 
> Option-B (slower): The packet page is read-only.  The SKB cannot point
> skb->data/head directly to the page, because skb_shared_info need to be
> written into skb->end (slightly hidden via skb_shinfo() casting).  To
> get around this, a separate piece of memory is allocated (speedup by
> __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> be written. (This is done when calling netdev/napi_alloc_skb()).
>   Drivers then need to copy over packet headers, and assign + adjust
> skb_shinfo(skb)->frags[0] offset to skip copied headers.
> 
> 
> Unfortunately most drivers use option-B.  Due to cost of calling the
> page allocator.  It is only slightly most expensive to get a larger
> compound page from the page allocator, which then can be partitioned into
> page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> cost is added later, when constructing the SKB.
>  Another reason for option-B, is that archs with expensive IOMMU
> requirements (like PowerPC), don't need to dma_unmap on every packet,
> but only on the compound page level.
> 
> Side-note: Most drivers have a "copy-break" optimization.  Especially
> for option-B, when copying header data anyhow. For small packet, one
> might as well free (or recycle) the RX page, if header size fits into
> the newly allocated memory (for skb_shared_info).

I think you guys are going into overdesign territory, so
. nack on read-only pages
. nack on copy-break approach
. nack on per-ring programs
. nack on modified/stolen/shared return codes

The whole thing must be dead simple to use. Above is not simple by any means.
The programs must see writeable pages only and return codes:
drop, pass to stack, redirect to xmit.
If program wishes to modify packets before passing it to stack, it
shouldn't need to deal with different return values.
No special things to deal with small or large packets. No header splits.
Program must not be aware of any such things.
Drivers can use DMA_BIDIRECTIONAL to allow received page to be
modified by the program and immediately sent to xmit. 
No dma map/unmap/sync per packet. If some odd architectures/dma setups
cannot do it, then XDP will not be applicable there.
We are not going to sacrifice performance for generality.
Jesper Dangaard Brouer April 8, 2016, 7:05 p.m. UTC | #7
On Fri, 8 Apr 2016 10:02:00 -0700
Brenden Blanco <bblanco@plumgrid.com> wrote:

> On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> > 
> > On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> >   
> > > > +/* user return codes for PHYS_DEV prog type */
> > > > +enum bpf_phys_dev_action {
> > > > +	BPF_PHYS_DEV_DROP,
> > > > +	BPF_PHYS_DEV_OK,
> > > > +};    
> > > 
> > > I can imagine these extra return codes:
> > > 
> > >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> > >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
> > >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> > > 
> > > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > > which we can look at when we get that far...  
> > 
> > I want to point out something which is quite FUNDAMENTAL, for
> > understanding these return codes (and network stack).
> > 
> > 
> > At driver RX time, the network stack basically have two ways of
> > building an SKB, which is send up the stack.
> > 
> > Option-A (fastest): The packet page is writable. The SKB can be
> > allocated and skb->data/head can point directly to the page.  And
> > we place/write skb_shared_info in the end/tail-room. (This is done by
> > calling build_skb()).
> > 
> > Option-B (slower): The packet page is read-only.  The SKB cannot point
> > skb->data/head directly to the page, because skb_shared_info need to be
> > written into skb->end (slightly hidden via skb_shinfo() casting).  To
> > get around this, a separate piece of memory is allocated (speedup by
> > __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> > be written. (This is done when calling netdev/napi_alloc_skb()).
> >   Drivers then need to copy over packet headers, and assign + adjust
> > skb_shinfo(skb)->frags[0] offset to skip copied headers.
> > 
> > 
> > Unfortunately most drivers use option-B.  Due to cost of calling the
> > page allocator.  It is only slightly most expensive to get a larger
> > compound page from the page allocator, which then can be partitioned into
> > page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> > cost is added later, when constructing the SKB.
> >  Another reason for option-B, is that archs with expensive IOMMU
> > requirements (like PowerPC), don't need to dma_unmap on every packet,
> > but only on the compound page level.
> > 
> > Side-note: Most drivers have a "copy-break" optimization.  Especially
> > for option-B, when copying header data anyhow. For small packet, one
> > might as well free (or recycle) the RX page, if header size fits into
> > the newly allocated memory (for skb_shared_info).
> > 
> > 
> > For the early filter drop (DDoS use-case), it does not matter that the
> > packet-page is read-only.
> > 
> > BUT for the future XDP (eXpress Data Path) use-case it does matter.  If
> > we ever want to see speeds comparable to DPDK, then drivers to
> > need to implement option-A, as this allow forwarding at the packet-page
> > level.
> > 
> > I hope, my future page-pool facility can remove/hide the cost calling
> > the page allocator.
> >   
> Can't wait! This will open up a lot of doors.
>

If you talk about the page-pool, then it is just one piece of the
puzzle, not the silver bullet ;-)

> > 
> > Back to the return codes, thus:
> > -------------------------------
> > BPF_PHYS_DEV_SHARED requires driver use option-B, when constructing
> > the SKB, and treat packet data as read-only.
> > 
> > BPF_PHYS_DEV_MODIFIED requires driver to provide a writable packet-page.  
>
> I understand the driver/hw requirement, but the codes themselves I think
> need some tweaking.

I'm very open to changing these return codes. I'm just trying to open
up the discussion.


> For instance, if the packet is both modified and forwarded, should
> the flags be ORed together? 

I didn't see these as bit-flags. I assumed that if you want to forward
the packet, then you need to steal it (BPF_PHYS_DEV_STOLEN) and cannot
return it to the stack.

I'm open to changing this to bit-flags, BUT we just have to take care
not to introduce too many things we need to check, due to performance
issues.


> Or is the need for this return code made obsolete if the driver knows
> ahead of time via struct bpf_prog flags that the prog intends to
> modify the packet, and can set up the page accordingly?

Yes, maybe we can drop the modified (BPF_PHYS_DEV_MODIFIED) return code.
I was just thinking this could be used to indicate if the checksum
would need to be recalculated.  If the usual checksum people don't
care, we should drop this indication.

Think about it performance wise... if we know the program _can_ modify
(but don't know if it did so), then we would have mark the SKB to the
stack as the checksum needed to be recalculated, always...
Jesper Dangaard Brouer April 8, 2016, 8:08 p.m. UTC | #8
On Fri, 8 Apr 2016 10:26:53 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> > 
> > On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> >   
> > > > +/* user return codes for PHYS_DEV prog type */
> > > > +enum bpf_phys_dev_action {
> > > > +	BPF_PHYS_DEV_DROP,
> > > > +	BPF_PHYS_DEV_OK,
> > > > +};    
> > > 
> > > I can imagine these extra return codes:
> > > 
> > >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> > >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
> > >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> > > 
> > > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > > which we can look at when we get that far...  
> > 
> > I want to point out something which is quite FUNDAMENTAL, for
> > understanding these return codes (and network stack).
> > 
> > 
> > At driver RX time, the network stack basically have two ways of
> > building an SKB, which is send up the stack.
> > 
> > Option-A (fastest): The packet page is writable. The SKB can be
> > allocated and skb->data/head can point directly to the page.  And
> > we place/write skb_shared_info in the end/tail-room. (This is done by
> > calling build_skb()).
> > 
> > Option-B (slower): The packet page is read-only.  The SKB cannot point
> > skb->data/head directly to the page, because skb_shared_info need to be
> > written into skb->end (slightly hidden via skb_shinfo() casting).  To
> > get around this, a separate piece of memory is allocated (speedup by
> > __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> > be written. (This is done when calling netdev/napi_alloc_skb()).
> >   Drivers then need to copy over packet headers, and assign + adjust
> > skb_shinfo(skb)->frags[0] offset to skip copied headers.
> > 
> > 
> > Unfortunately most drivers use option-B.  Due to cost of calling the
> > page allocator.  It is only slightly most expensive to get a larger
> > compound page from the page allocator, which then can be partitioned into
> > page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> > cost is added later, when constructing the SKB.
> >  Another reason for option-B, is that archs with expensive IOMMU
> > requirements (like PowerPC), don't need to dma_unmap on every packet,
> > but only on the compound page level.
> > 
> > Side-note: Most drivers have a "copy-break" optimization.  Especially
> > for option-B, when copying header data anyhow. For small packet, one
> > might as well free (or recycle) the RX page, if header size fits into
> > the newly allocated memory (for skb_shared_info).  
> 
> I think you guys are going into overdesign territory, so
> . nack on read-only pages

Unfortunately you cannot just ignore or nack read-only pages. They are
a fact in the current drivers.

Most drivers today (at-least the ones we care about) only deliver
read-only pages.  If you don't accept read-only pages day-1, then you
first have to rewrite a lot of drivers... and that will stall the
project!  How will you deal with this fact?

The early drop filter use-case in this patchset, can ignore read-only
pages.  But ABI wise we need to deal with the future case where we do
need/require writeable pages.  A simple need-writable-pages flag in the
API could help us move forward.


> . nack on copy-break approach

Copy-break can be ignored.  It sort of happens at a higher level in the
driver. (Eric likely wants/cares that this happens for local socket delivery).


> . nack on per-ring programs

Hmmm... I don't see it as a lot more complicated to attach the program
to the ring.  But maybe we can extend the API later, and thus postpone that
discussion.

> . nack on modified/stolen/shared return codes
> 
> The whole thing must be dead simple to use. Above is not simple by any means.

Maybe you missed that the above was a description of how the current
network stack handles this, which is not simple... which is the root of
the whole performance issue.


> The programs must see writeable pages only and return codes:
> drop, pass to stack, redirect to xmit.
> If program wishes to modify packets before passing it to stack, it
> shouldn't need to deal with different return values.

> No special things to deal with small or large packets. No header splits.
> Program must not be aware of any such things.

I agree on this.  This layer only deals with packets at the page level,
single packets stored in continuous memory.


> Drivers can use DMA_BIDIRECTIONAL to allow received page to be
> modified by the program and immediately sent to xmit.

We just have to verify that DMA_BIDIRECTIONAL does not add extra
overhead (DMA-API-HOWTO.txt explicitly states that it likely does,
but I'd like to verify this with a micro benchmark)

> No dma map/unmap/sync per packet. If some odd architectures/dma setups
> cannot do it, then XDP will not be applicable there.

I do like the idea of rejecting XDP eBPF programs when the DMA
setup is not compatible, or if the driver does not implement e.g.
writable DMA pages.

Customers wanting this feature will then go buy the NIC which supports
this feature.  There is nothing more motivating for NIC vendors than
seeing customers buy the competitor's hardware. And it only requires a
driver change to get this market...


> We are not going to sacrifice performance for generality.

Agree.
Alexei Starovoitov April 8, 2016, 9:34 p.m. UTC | #9
On Fri, Apr 08, 2016 at 10:08:08PM +0200, Jesper Dangaard Brouer wrote:
> On Fri, 8 Apr 2016 10:26:53 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> > On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> > > 
> > > On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> > >   
> > > > > +/* user return codes for PHYS_DEV prog type */
> > > > > +enum bpf_phys_dev_action {
> > > > > +	BPF_PHYS_DEV_DROP,
> > > > > +	BPF_PHYS_DEV_OK,
> > > > > +};    
> > > > 
> > > > I can imagine these extra return codes:
> > > > 
> > > >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> > > >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
> > > >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> > > > 
> > > > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > > > which we can look at when we get that far...  
> > > 
> > > I want to point out something which is quite FUNDAMENTAL, for
> > > understanding these return codes (and network stack).
> > > 
> > > 
> > > At driver RX time, the network stack basically have two ways of
> > > building an SKB, which is send up the stack.
> > > 
> > > Option-A (fastest): The packet page is writable. The SKB can be
> > > allocated and skb->data/head can point directly to the page.  And
> > > we place/write skb_shared_info in the end/tail-room. (This is done by
> > > calling build_skb()).
> > > 
> > > Option-B (slower): The packet page is read-only.  The SKB cannot point
> > > skb->data/head directly to the page, because skb_shared_info need to be
> > > written into skb->end (slightly hidden via skb_shinfo() casting).  To
> > > get around this, a separate piece of memory is allocated (speedup by
> > > __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> > > be written. (This is done when calling netdev/napi_alloc_skb()).
> > >   Drivers then need to copy over packet headers, and assign + adjust
> > > skb_shinfo(skb)->frags[0] offset to skip copied headers.
> > > 
> > > 
> > > Unfortunately most drivers use option-B.  Due to cost of calling the
> > > page allocator.  It is only slightly most expensive to get a larger
> > > compound page from the page allocator, which then can be partitioned into
> > > page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> > > cost is added later, when constructing the SKB.
> > >  Another reason for option-B, is that archs with expensive IOMMU
> > > requirements (like PowerPC), don't need to dma_unmap on every packet,
> > > but only on the compound page level.
> > > 
> > > Side-note: Most drivers have a "copy-break" optimization.  Especially
> > > for option-B, when copying header data anyhow. For small packet, one
> > > might as well free (or recycle) the RX page, if header size fits into
> > > the newly allocated memory (for skb_shared_info).  
> > 
> > I think you guys are going into overdesign territory, so
> > . nack on read-only pages
> 
> Unfortunately you cannot just ignore or nack read-only pages. They are
> a fact in the current drivers.
> 
> Most drivers today (at-least the ones we care about) only deliver
> read-only pages.  If you don't accept read-only pages day-1, then you
> first have to rewrite a lot of drivers... and that will stall the
> project!  How will you deal with this fact?
> 
> The early drop filter use-case in this patchset, can ignore read-only
> pages.  But ABI wise we need to deal with the future case where we do
> need/require writeable pages.  A simple need-writable pages in the API
> could help us move forward.

the program should never need to worry about whether dma buffer is
writeable or not. Complicating drivers, api, abi, usability
for the single use case of fast packet drop is not acceptable.
XDP is not going to be a fit for all drivers and all architectures.
That is the crucial 'performance vs generality' aspect of the design.
All kernel-bypasses are taking advantage of specific architecture.
We have to take advantage of it as well. If it doesn't fit
powerpc with iommu, so be it. XDP will return -enotsupp.
That is fundamental point. We have to cut such corners and avoid
all cases where unnecessary generality hurts performance.
Read-only pages is clearly such thing.

> > The whole thing must be dead simple to use. Above is not simple by any means.
> 
> Maybe you missed that the above was a description of how the current
> network stack handles this, which is not simple... which is root of the
> hole performance issue.

Disagree. The stack has copy-break, gro, gso and everything else because
it's serving _host_ use case. XDP is packet forwarder use case.
The requirements are completely different. Ex. the host needs gso
in the core and drivers. It needs to deliver data all the way
to user space and back. That is hard and that's where complexity
comes from. For packet forwarder none of it is needed. So saying,
look we have this complexity, so XDP needs it too, is a flawed argument.
The kernel is serving host and applications.
XDP is pure packet-in/packet-out framework to achieve better
performance than kernel-bypass, since kernel is the right
place to do it. It has clean access to interrupts, per-cpu,
scheduler, device registers and so on.
Though there are only two broad use cases packet drop and forward,
they cover a ton of real cases: firewalls, dos prevention,
load balancer, nat, etc. In other words mostly stateless.
As soon as packet needs to be queued somewhere we have to
instantiate skb and pass it to the stack.
So no queues in XDP and no 'stolen' and 'shared' return codes.
The program always runs to completion with single packet.
There is no header vs payload split. There is no header
from program point of view. It's raw bytes in dma buffer.

> I do like the idea of rejecting XDP eBPF programs based on the DMA
> setup is not compatible, or if the driver does not implement e.g.
> writable DMA pages.

exactly.

> Customers wanting this feature will then go buy the NIC which support
> this feature.  There is nothing more motivating for NIC vendors seeing
> customers buying the competitors hardware. And it only require a driver
> change to get this market...

exactly.
Tom Herbert April 9, 2016, 11:17 a.m. UTC | #10
On Fri, Apr 8, 2016 at 1:48 AM, Brenden Blanco <bblanco@plumgrid.com> wrote:
> Add a new bpf prog type that is intended to run in early stages of the
> packet rx path. Only minimal packet metadata will be available, hence a
> new context type, struct bpf_phys_dev_md, is exposed to userspace. So
> far only expose the readable packet length, and only in read mode.
>
> The PHYS_DEV name is chosen to represent that the program is meant only
> for physical adapters, rather than all netdevs.
>
> While the user visible struct is new, the underlying context must be
> implemented as a minimal skb in order for the packet load_* instructions
> to work. The skb filled in by the driver must have skb->len, skb->head,
> and skb->data set, and skb->data_len == 0.
>
> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
> ---
>  include/uapi/linux/bpf.h | 14 ++++++++++
>  kernel/bpf/verifier.c    |  1 +
>  net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 83 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 70eda5a..3018d83 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -93,6 +93,7 @@ enum bpf_prog_type {
>         BPF_PROG_TYPE_SCHED_CLS,
>         BPF_PROG_TYPE_SCHED_ACT,
>         BPF_PROG_TYPE_TRACEPOINT,
> +       BPF_PROG_TYPE_PHYS_DEV,
>  };
>
>  #define BPF_PSEUDO_MAP_FD      1
> @@ -368,6 +369,19 @@ struct __sk_buff {
>         __u32 tc_classid;
>  };
>
> +/* user return codes for PHYS_DEV prog type */
> +enum bpf_phys_dev_action {
> +       BPF_PHYS_DEV_DROP,
> +       BPF_PHYS_DEV_OK,

I don't like OK. Maybe this should mean LOCAL. We also need FORWARD (not sure
how to handle GRO yet).

I would suggest that we format the return code as code:subcode, where
the above are codes. subcode is relevant to major code. For instance
in forwarding the subcodes indicate a forwarding instruction (maybe a
queue). DROP subcodes might answer why.

> +};
> +
> +/* user accessible metadata for PHYS_DEV packet hook
> + * new fields must be added to the end of this structure
> + */
> +struct bpf_phys_dev_md {
> +       __u32 len;
> +};
> +

The naming is verbose and too tied to specific implementation. The
meta data structure is just a generic abstraction of a receive
descriptor, it is not specific BPF or physical devices. For instance,
the BPF programs for this should be exportable to a device, userspace,
other OSes, etc. Also, inevitably someone will have a reason to use an
alternate filtering than BPF based on same interfaces abstraction.

So for the above I suggest we simply name the structure xdp_md.
Similarly, drop codes can be XDP_DROP, etc. Also, these should be in a
new header say xdp.h which will also be in uapi.

One other API issue is how to deal with encapsulation. In this case a
header may be prepended to the packet, I assume there are BPF helper
functions and we don't need to return a new length or start?

Tom

>  struct bpf_tunnel_key {
>         __u32 tunnel_id;
>         union {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 58792fe..877542d 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1344,6 +1344,7 @@ static bool may_access_skb(enum bpf_prog_type type)
>         case BPF_PROG_TYPE_SOCKET_FILTER:
>         case BPF_PROG_TYPE_SCHED_CLS:
>         case BPF_PROG_TYPE_SCHED_ACT:
> +       case BPF_PROG_TYPE_PHYS_DEV:
>                 return true;
>         default:
>                 return false;
> diff --git a/net/core/filter.c b/net/core/filter.c
> index e8486ba..4f73fb9 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2021,6 +2021,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
>         }
>  }
>
> +static const struct bpf_func_proto *
> +phys_dev_func_proto(enum bpf_func_id func_id)
> +{
> +       return sk_filter_func_proto(func_id);
> +}
> +
>  static bool __is_valid_access(int off, int size, enum bpf_access_type type)
>  {
>         /* check bounds */
> @@ -2076,6 +2082,36 @@ static bool tc_cls_act_is_valid_access(int off, int size,
>         return __is_valid_access(off, size, type);
>  }
>
> +static bool __is_valid_phys_dev_access(int off, int size,
> +                                      enum bpf_access_type type)
> +{
> +       if (off < 0 || off >= sizeof(struct bpf_phys_dev_md))
> +               return false;
> +
> +       if (off % size != 0)
> +               return false;
> +
> +       if (size != 4)
> +               return false;
> +
> +       return true;
> +}
> +
> +static bool phys_dev_is_valid_access(int off, int size,
> +                                    enum bpf_access_type type)
> +{
> +       if (type == BPF_WRITE)
> +               return false;
> +
> +       switch (off) {
> +       case offsetof(struct bpf_phys_dev_md, len):
> +               break;
> +       default:
> +               return false;
> +       }
> +       return __is_valid_phys_dev_access(off, size, type);
> +}
> +
>  static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
>                                       int src_reg, int ctx_off,
>                                       struct bpf_insn *insn_buf,
> @@ -2213,6 +2249,26 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
>         return insn - insn_buf;
>  }
>
> +static u32 bpf_phys_dev_convert_ctx_access(enum bpf_access_type type,
> +                                          int dst_reg, int src_reg,
> +                                          int ctx_off,
> +                                          struct bpf_insn *insn_buf,
> +                                          struct bpf_prog *prog)
> +{
> +       struct bpf_insn *insn = insn_buf;
> +
> +       switch (ctx_off) {
> +       case offsetof(struct bpf_phys_dev_md, len):
> +               BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
> +
> +               *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
> +                                     offsetof(struct sk_buff, len));
> +               break;
> +       }
> +
> +       return insn - insn_buf;
> +}
> +
>  static const struct bpf_verifier_ops sk_filter_ops = {
>         .get_func_proto = sk_filter_func_proto,
>         .is_valid_access = sk_filter_is_valid_access,
> @@ -2225,6 +2281,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
>         .convert_ctx_access = bpf_net_convert_ctx_access,
>  };
>
> +static const struct bpf_verifier_ops phys_dev_ops = {
> +       .get_func_proto = phys_dev_func_proto,
> +       .is_valid_access = phys_dev_is_valid_access,
> +       .convert_ctx_access = bpf_phys_dev_convert_ctx_access,
> +};
> +
>  static struct bpf_prog_type_list sk_filter_type __read_mostly = {
>         .ops = &sk_filter_ops,
>         .type = BPF_PROG_TYPE_SOCKET_FILTER,
> @@ -2240,11 +2302,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = {
>         .type = BPF_PROG_TYPE_SCHED_ACT,
>  };
>
> +static struct bpf_prog_type_list phys_dev_type __read_mostly = {
> +       .ops = &phys_dev_ops,
> +       .type = BPF_PROG_TYPE_PHYS_DEV,
> +};
> +
>  static int __init register_sk_filter_ops(void)
>  {
>         bpf_register_prog_type(&sk_filter_type);
>         bpf_register_prog_type(&sched_cls_type);
>         bpf_register_prog_type(&sched_act_type);
> +       bpf_register_prog_type(&phys_dev_type);
>
>         return 0;
>  }
> --
> 2.8.0
>
Tom Herbert April 9, 2016, 11:29 a.m. UTC | #11
On Fri, Apr 8, 2016 at 6:34 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Fri, Apr 08, 2016 at 10:08:08PM +0200, Jesper Dangaard Brouer wrote:
>> On Fri, 8 Apr 2016 10:26:53 -0700
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>
>> > On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
>> > >
>> > > On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
>> > >
>> > > > > +/* user return codes for PHYS_DEV prog type */
>> > > > > +enum bpf_phys_dev_action {
>> > > > > +     BPF_PHYS_DEV_DROP,
>> > > > > +     BPF_PHYS_DEV_OK,
>> > > > > +};
>> > > >
>> > > > I can imagine these extra return codes:
>> > > >
>> > > >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
>> > > >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
>> > > >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
>> > > >
>> > > > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
>> > > > which we can look at when we get that far...
>> > >
>> > > I want to point out something which is quite FUNDAMENTAL, for
>> > > understanding these return codes (and network stack).
>> > >
>> > >
>> > > At driver RX time, the network stack basically have two ways of
>> > > building an SKB, which is send up the stack.
>> > >
>> > > Option-A (fastest): The packet page is writable. The SKB can be
>> > > allocated and skb->data/head can point directly to the page.  And
>> > > we place/write skb_shared_info in the end/tail-room. (This is done by
>> > > calling build_skb()).
>> > >
>> > > Option-B (slower): The packet page is read-only.  The SKB cannot point
>> > > skb->data/head directly to the page, because skb_shared_info need to be
>> > > written into skb->end (slightly hidden via skb_shinfo() casting).  To
>> > > get around this, a separate piece of memory is allocated (speedup by
>> > > __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
>> > > be written. (This is done when calling netdev/napi_alloc_skb()).
>> > >   Drivers then need to copy over packet headers, and assign + adjust
>> > > skb_shinfo(skb)->frags[0] offset to skip copied headers.
>> > >
>> > >
>> > > Unfortunately most drivers use option-B.  Due to cost of calling the
>> > > page allocator.  It is only slightly most expensive to get a larger
>> > > compound page from the page allocator, which then can be partitioned into
>> > > page-fragments, thus amortizing the page alloc cost.  Unfortunately the
>> > > cost is added later, when constructing the SKB.
>> > >  Another reason for option-B, is that archs with expensive IOMMU
>> > > requirements (like PowerPC), don't need to dma_unmap on every packet,
>> > > but only on the compound page level.
>> > >
>> > > Side-note: Most drivers have a "copy-break" optimization.  Especially
>> > > for option-B, when copying header data anyhow. For small packet, one
>> > > might as well free (or recycle) the RX page, if header size fits into
>> > > the newly allocated memory (for skb_shared_info).
>> >
>> > I think you guys are going into overdesign territory, so
>> > . nack on read-only pages
>>
>> Unfortunately you cannot just ignore or nack read-only pages. They are
>> a fact in the current drivers.
>>
>> Most drivers today (at-least the ones we care about) only deliver
>> read-only pages.  If you don't accept read-only pages day-1, then you
>> first have to rewrite a lot of drivers... and that will stall the
>> project!  How will you deal with this fact?
>>
>> The early drop filter use-case in this patchset, can ignore read-only
>> pages.  But ABI wise we need to deal with the future case where we do
>> need/require writeable pages.  A simple need-writable pages in the API
>> could help us move forward.
>
> the program should never need to worry about whether dma buffer is
> writeable or not. Complicating drivers, api, abi, usability
> for the single use case of fast packet drop is not acceptable.
> XDP is not going to be a fit for all drivers and all architectures.
> That is cruicial 'performance vs generality' aspect of the design.
> All kernel-bypasses are taking advantage of specific architecture.
> We have to take advantage of it as well. If it doesn't fit
> powerpc with iommu, so be it. XDP will return -enotsupp.
> That is fundamental point. We have to cut such corners and avoid
> all cases where unnecessary generality hurts performance.
> Read-only pages is clearly such thing.
>
+1. Forwarding which will be a common application almost always
requires modification (decrement TTL), and header data split has
always been a weak feature since the device has to have some arbitrary
rules about what headers needs to be split out (either implements
protocol specific parsing or some fixed length).

>> > The whole thing must be dead simple to use. Above is not simple by any means.
>>
>> Maybe you missed that the above was a description of how the current
>> network stack handles this, which is not simple... which is root of the
>> hole performance issue.
>
> Disagree. The stack has copy-break, gro, gso and everything else because
> it's serving _host_ use case. XDP is packet forwarder use case.
> The requirements are completely different. Ex. the host needs gso
> in the core and drivers. It needs to deliver data all the way
> to user space and back. That is hard and that's where complexity
> comes from. For packet forwarder none of it is needed. So saying,
> look we have this complexity, so XDP needs it too, is flawed argument.
> The kernel is serving host and applications.
> XDP is pure packet-in/packet-out framework to achieve better
> performance than kernel-bypass, since kernel is the right
> place to do it. It has clean access to interrupts, per-cpu,
> scheduler, device registers and so on.
> Though there are only two broad use cases packet drop and forward,
> they cover a ton of real cases: firewalls, dos prevention,
> load balancer, nat, etc. In other words mostly stateless.
> As soon as packet needs to be queued somewhere we have to
> instantiate skb and pass it to the stack.
> So no queues in XDP and no 'stolen' and 'shared' return codes.
> The program always runs to completion with single packet.
> There is no header vs payload split. There is no header
> from program point of view. It's raw bytes in dma buffer.
>
Exactly. We are rethinking the low level data path for performance. An
all encompassing solution that covers ever existing driver model only
results in complexity which is what makes things "slow" in the first
place. Drivers need to change to implement XDP, but the model is as
simple as we can make it-- for instance we are putting very little
requirements on device features.

Tom


>> I do like the idea of rejecting XDP eBPF programs based on the DMA
>> setup is not compatible, or if the driver does not implement e.g.
>> writable DMA pages.
>
> exactly.
>
>> Customers wanting this feature will then go buy the NIC which support
>> this feature.  There is nothing more motivating for NIC vendors seeing
>> customers buying the competitors hardware. And it only require a driver
>> change to get this market...
>
> exactly.
>
Jesper Dangaard Brouer April 9, 2016, 12:27 p.m. UTC | #12
On Sat, 9 Apr 2016 08:17:04 -0300
Tom Herbert <tom@herbertland.com> wrote:

> One other API issue is how to deal with encapsulation. In this case a
> header may be prepended to the packet, I assume there are BPF helper
> functions and we don't need to return a new length or start?

That reminds me.  Do the BPF program need to know the head-room, then?
Tom Herbert April 9, 2016, 1:17 p.m. UTC | #13
On Sat, Apr 9, 2016 at 9:27 AM, Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
> On Sat, 9 Apr 2016 08:17:04 -0300
> Tom Herbert <tom@herbertland.com> wrote:
>
>> One other API issue is how to deal with encapsulation. In this case a
>> header may be prepended to the packet, I assume there are BPF helper
>> functions and we don't need to return a new length or start?
>
> That reminds me.  Do the BPF program need to know the head-room, then?
>
Right, that is basically my question. Can we have a helper function in
BPF that will prepend n bytes to the buffer? (I don't think we want
expose a notion of headroom).

Tom

> --
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   Author of http://www.iptv-analyzer.org
>   LinkedIn: http://www.linkedin.com/in/brouer
Jamal Hadi Salim April 9, 2016, 3:29 p.m. UTC | #14
On 16-04-09 07:29 AM, Tom Herbert wrote:

> +1. Forwarding which will be a common application almost always
> requires modification (decrement TTL), and header data split has
> always been a weak feature since the device has to have some arbitrary
> rules about what headers needs to be split out (either implements
> protocol specific parsing or some fixed length).

Then this is sensible. I was cruising the threads and
confused by your earlier emails Tom because you talked
about XPS etc. It sounded like the idea evolved into putting
the whole freaking stack on bpf.
If this is _forwarding only_ it maybe useful to look at
Alexey's old code in particular the DMA bits;
he built his own lookup algorithm but sounds like bpf is
a much better fit today.

cheers,
jamal
Alexei Starovoitov April 9, 2016, 5 p.m. UTC | #15
On Sat, Apr 09, 2016 at 08:17:04AM -0300, Tom Herbert wrote:
> >
> > +/* user return codes for PHYS_DEV prog type */
> > +enum bpf_phys_dev_action {
> > +       BPF_PHYS_DEV_DROP,
> > +       BPF_PHYS_DEV_OK,
> 
> I don't like OK. Maybe this mean LOCAL. We also need FORWARD (not sure
> how to handle GRO yet).
> 
> I would suggest that we format the return code as code:subcode, where
> the above are codes. subcode is relevant to major code. For instance
> in forwarding the subcodes indicate a forwarding instruction (maybe a
> queue). DROP subcodes might answer why.

for tc redirect we use hidden percpu variable to pass additional
info together with return code. The cost of it is extra bpf_redirect() call.
Here we can do better and embed such info for xmit,
but subcodes for drop are a slippery slope, since they add concepts
to the design that are not going to be used by everyone.
If necessary bpf programs can count drop reasons internally.
Drops due to protocol!=ipv6 or drops due to ip frags present
will be program internal reasons. No need to expose them in api.

We need to get xmit part implemented first and see how it looks
before deciding on this part of api.
Right now I think we do not need tx queue number actually.
The prog should just return 'XMIT' and xdp(driver) side will decide
which tx queue to use.

> One other API issue is how to deal with encapsulation. In this case a
> header may be prepended to the packet, I assume there are BPF helper
> functions and we don't need to return a new length or start?

a bit of history:
for tc+bpf we've been trying to come up with clean helpers to do
header push/pop and it was very difficult, since skb keeps a ton
of metadata about header offsets, csum offsets, encap flag, etc
we've lost the count on number of different approaches we've
implemented and discarded.
For XDP there is no such issue.
Likely we'll have single bpf_packet_change(ctx, off, len) helper
that will grow(len) or trim(-len) bytes at offset(off) in the packet.
ctx->len will be adjusted automatically by the helper.
The headroom, tailroom will not be exposed and will never be known
to the bpf side. It's up to the helper and the driver to decide how to
insert N bytes at offset M. If the driver reserved headroom in dma
buffer, it can grow into it, if not it can grow tail and move
the whole packet. For performance reasons we obviously want some
headroom in dma buffer, but it's not exposed to bpf.

But it could be that directly adjusting ctx->len and ctx->data is faster.
For cls_bpf ctx->data is hidden and packet access is done via
special instructions and helpers. For XDP we can hopefully do better
and do packet access with direct loads. I outlined that plan
in the previous thread.
Alexei Starovoitov April 9, 2016, 5:26 p.m. UTC | #16
On Sat, Apr 09, 2016 at 11:29:18AM -0400, Jamal Hadi Salim wrote:
> On 16-04-09 07:29 AM, Tom Herbert wrote:
> 
> >+1. Forwarding which will be a common application almost always
> >requires modification (decrement TTL), and header data split has
> >always been a weak feature since the device has to have some arbitrary
> >rules about what headers needs to be split out (either implements
> >protocol specific parsing or some fixed length).
> 
> Then this is sensible. I was cruising the threads and
> confused by your earlier emails Tom because you talked
> about XPS etc. It sounded like the idea evolved into putting
> the whole freaking stack on bpf.

yeah, no stack, no queues in bpf.

> If this is _forwarding only_ it maybe useful to look at
> Alexey's old code in particular the DMA bits;
> he built his own lookup algorithm but sounds like bpf is
> a much better fit today.

a link to these old bits?

Just to be clear: this rfc is not the only thing we're considering.
In particular huawei guys did a monster effort to improve performance
in this area as well. We'll try to blend all the code together and
pick what's the best.
Thomas Graf April 10, 2016, 7:55 a.m. UTC | #17
On 04/09/16 at 10:26am, Alexei Starovoitov wrote:
> On Sat, Apr 09, 2016 at 11:29:18AM -0400, Jamal Hadi Salim wrote:
> > If this is _forwarding only_ it maybe useful to look at
> > Alexey's old code in particular the DMA bits;
> > he built his own lookup algorithm but sounds like bpf is
> > a much better fit today.
> 
> a link to these old bits?
> 
> Just to be clear: this rfc is not the only thing we're considering.
> In particular huawei guys did a monster effort to improve performance
> in this area as well. We'll try to blend all the code together and
> pick what's the best.

What's the plan on opening the discussion on this? Can we get a peek?
Is it an alternative to XDP and the driver hook? Different architecture
or just different implementation? I understood it as another pseudo
skb model with a path on converting to real skbs for stack processing.

I really like the current proposal by Brenden for its simplicity and
targeted compatibility with cls_bpf.
Jamal Hadi Salim April 10, 2016, 1:07 p.m. UTC | #18
On 16-04-09 01:26 PM, Alexei Starovoitov wrote:

>
> yeah, no stack, no queues in bpf.

Thanks.

>
>> If this is _forwarding only_ it may be useful to look at
>> Alexey's old code in particular the DMA bits;
>> he built his own lookup algorithm but sounds like bpf is
>> a much better fit today.
>
> a link to these old bits?
>

Dang. Trying to remember the exact name (I think it has been gone for at
least 10 years now). I know it is not CONFIG_NET_FASTROUTE although
it could have been that depending on the driver (tulip had some
nice DMA properties - which by today's standards would be considered
primitive ;->).
+Cc Robert and Alexey (Trying to figure out name of driver based
routing code that DMAed from ingress to egress port)

> Just to be clear: this rfc is not the only thing we're considering.
> In particular huawei guys did a monster effort to improve performance
> in this area as well. We'll try to blend all the code together and
> pick what's the best.
>

Sounds very interesting.

cheers,
jamal
Tom Herbert April 10, 2016, 4:53 p.m. UTC | #19
On Sun, Apr 10, 2016 at 12:55 AM, Thomas Graf <tgraf@suug.ch> wrote:
> On 04/09/16 at 10:26am, Alexei Starovoitov wrote:
>> On Sat, Apr 09, 2016 at 11:29:18AM -0400, Jamal Hadi Salim wrote:
>> > If this is _forwarding only_ it may be useful to look at
>> > Alexey's old code in particular the DMA bits;
>> > he built his own lookup algorithm but sounds like bpf is
>> > a much better fit today.
>>
>> a link to these old bits?
>>
>> Just to be clear: this rfc is not the only thing we're considering.
>> In particular huawei guys did a monster effort to improve performance
>> in this area as well. We'll try to blend all the code together and
>> pick what's the best.
>
> What's the plan on opening the discussion on this? Can we get a peek?
> Is it an alternative to XDP and the driver hook? Different architecture
> or just different implementation? I understood it as another pseudo
> skb model with a path on converting to real skbs for stack processing.
>
We started discussions about this in IOvisor. The Huawei project is
called ceth (Common Ethernet). It is essentially a layer called
directly from drivers intended for fast path forwarding and network
virtualization. They have put quite a bit of effort into buffer
management and other parts of the infrastructure, much of which we
would like to leverage in XDP. The code is currently on GitHub; we will
ask them to make it generally accessible.

The programmability part, essentially BPF, should be part of a common
solution. We can define the necessary interfaces independently of the
underlying infrastructure which is really the only way we can do this
if we want the BPF programs to be portable across different
platforms-- in Linux, userspace, HW, etc.

Tom

> I really like the current proposal by Brenden for its simplicity and
> targeted compatibility with cls_bpf.
Jamal Hadi Salim April 10, 2016, 6:09 p.m. UTC | #20
On 16-04-10 12:53 PM, Tom Herbert wrote:

> We started discussions about this in IOvisor. The Huawei project is
> called ceth (Common Ethernet). It is essentially a layer called
> directly from drivers intended for fast path forwarding and network
> virtualization. They have put quite a bit of effort into buffer
> management and other parts of the infrastructure, much of which we
> would like to leverage in XDP. The code is currently in github, will
> ask them to make it generally accessible.
>

Can't seem to find any info on it on the googles.
If it is forwarding, then it should at least make use of the Linux
control APIs, I hope.

cheers,
jamal
diff mbox

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 70eda5a..3018d83 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -93,6 +93,7 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_SCHED_CLS,
 	BPF_PROG_TYPE_SCHED_ACT,
 	BPF_PROG_TYPE_TRACEPOINT,
+	BPF_PROG_TYPE_PHYS_DEV,
 };
 
 #define BPF_PSEUDO_MAP_FD	1
@@ -368,6 +369,19 @@  struct __sk_buff {
 	__u32 tc_classid;
 };
 
+/* user return codes for PHYS_DEV prog type */
+enum bpf_phys_dev_action {
+	BPF_PHYS_DEV_DROP,
+	BPF_PHYS_DEV_OK,
+};
+
+/* user accessible metadata for PHYS_DEV packet hook
+ * new fields must be added to the end of this structure
+ */
+struct bpf_phys_dev_md {
+	__u32 len;
+};
+
 struct bpf_tunnel_key {
 	__u32 tunnel_id;
 	union {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 58792fe..877542d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1344,6 +1344,7 @@  static bool may_access_skb(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
+	case BPF_PROG_TYPE_PHYS_DEV:
 		return true;
 	default:
 		return false;
diff --git a/net/core/filter.c b/net/core/filter.c
index e8486ba..4f73fb9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2021,6 +2021,12 @@  tc_cls_act_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static const struct bpf_func_proto *
+phys_dev_func_proto(enum bpf_func_id func_id)
+{
+	return sk_filter_func_proto(func_id);
+}
+
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
 	/* check bounds */
@@ -2076,6 +2082,36 @@  static bool tc_cls_act_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static bool __is_valid_phys_dev_access(int off, int size,
+				       enum bpf_access_type type)
+{
+	if (off < 0 || off >= sizeof(struct bpf_phys_dev_md))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	if (size != 4)
+		return false;
+
+	return true;
+}
+
+static bool phys_dev_is_valid_access(int off, int size,
+				     enum bpf_access_type type)
+{
+	if (type == BPF_WRITE)
+		return false;
+
+	switch (off) {
+	case offsetof(struct bpf_phys_dev_md, len):
+		break;
+	default:
+		return false;
+	}
+	return __is_valid_phys_dev_access(off, size, type);
+}
+
 static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 				      int src_reg, int ctx_off,
 				      struct bpf_insn *insn_buf,
@@ -2213,6 +2249,26 @@  static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 	return insn - insn_buf;
 }
 
+static u32 bpf_phys_dev_convert_ctx_access(enum bpf_access_type type,
+					   int dst_reg, int src_reg,
+					   int ctx_off,
+					   struct bpf_insn *insn_buf,
+					   struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (ctx_off) {
+	case offsetof(struct bpf_phys_dev_md, len):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+				      offsetof(struct sk_buff, len));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static const struct bpf_verifier_ops sk_filter_ops = {
 	.get_func_proto = sk_filter_func_proto,
 	.is_valid_access = sk_filter_is_valid_access,
@@ -2225,6 +2281,12 @@  static const struct bpf_verifier_ops tc_cls_act_ops = {
 	.convert_ctx_access = bpf_net_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops phys_dev_ops = {
+	.get_func_proto = phys_dev_func_proto,
+	.is_valid_access = phys_dev_is_valid_access,
+	.convert_ctx_access = bpf_phys_dev_convert_ctx_access,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops = &sk_filter_ops,
 	.type = BPF_PROG_TYPE_SOCKET_FILTER,
@@ -2240,11 +2302,17 @@  static struct bpf_prog_type_list sched_act_type __read_mostly = {
 	.type = BPF_PROG_TYPE_SCHED_ACT,
 };
 
+static struct bpf_prog_type_list phys_dev_type __read_mostly = {
+	.ops = &phys_dev_ops,
+	.type = BPF_PROG_TYPE_PHYS_DEV,
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
 	bpf_register_prog_type(&sched_cls_type);
 	bpf_register_prog_type(&sched_act_type);
+	bpf_register_prog_type(&phys_dev_type);
 
 	return 0;
 }