diff mbox series

[RFC,bpf-next,v2,1/9] net: introduce __init_skb{, _data, _shinfo} helpers

Message ID 20190319221948.170441-2-sdf@google.com
State RFC
Delegated to: BPF Maintainers
Headers show
Series net: flow_dissector: trigger BPF hook when called from eth_get_headlen | expand

Commit Message

Stanislav Fomichev March 19, 2019, 10:19 p.m. UTC
__init_skb is essentially a version of __build_skb which accepts skb as
an argument (instead of doing kmem_cache_alloc to allocate it).

__init_skb_shinfo initializes shinfo.

__init_skb_data initializes skb data pointers.

No functional changes.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 include/linux/skbuff.h | 14 ++++++++++
 net/core/skbuff.c      | 60 +++++++++++++++++-------------------------
 2 files changed, 38 insertions(+), 36 deletions(-)

Comments

Alexei Starovoitov March 21, 2019, 3:39 a.m. UTC | #1
On Tue, Mar 19, 2019 at 03:19:40PM -0700, Stanislav Fomichev wrote:
> __init_skb is essentially a version of __build_skb which accepts skb as
> an argument (instead of doing kmem_cache_alloc to allocate it).
> 
> __init_skb_shinfo initializes shinfo.
> 
> __init_skb_data initializes skb data pointers.
> 
> No functional changes.
> 
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
>  include/linux/skbuff.h | 14 ++++++++++
>  net/core/skbuff.c      | 60 +++++++++++++++++-------------------------
>  2 files changed, 38 insertions(+), 36 deletions(-)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 9027a8c4219f..e8c1d5b97f96 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -4399,5 +4399,19 @@ static inline __wsum lco_csum(struct sk_buff *skb)
>  	return csum_partial(l4_hdr, csum_start - l4_hdr, partial);
>  }
>  
> +static inline void __init_skb_data(struct sk_buff *skb, u8 *data,
> +				   unsigned int size)
> +{
> +	/* Account for allocated memory : skb + skb->head */
> +	skb->truesize = SKB_TRUESIZE(size);
> +	refcount_set(&skb->users, 1);
> +	skb->head = data;
> +	skb->data = data;
> +	skb_reset_tail_pointer(skb);
> +	skb->end = skb->tail + size;
> +	skb->mac_header = (typeof(skb->mac_header))~0U;
> +	skb->transport_header = (typeof(skb->transport_header))~0U;
> +}
> +
>  #endif	/* __KERNEL__ */
>  #endif	/* _LINUX_SKBUFF_H */
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 2415d9cb9b89..b413354ee709 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -160,6 +160,26 @@ static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
>   *
>   */
>  
> +static inline void __init_skb(struct sk_buff *skb, u8 *data, unsigned int size)
> +{
> +	/* Only clear those fields we need to clear, not those that we will
> +	 * actually initialize below. Hence, don't put any more fields after
> +	 * the tail pointer in struct sk_buff!
> +	 */
> +	memset(skb, 0, offsetof(struct sk_buff, tail));
> +	__init_skb_data(skb, data, size);
> +}
> +
> +static inline void __init_skb_shinfo(struct sk_buff *skb)
> +{
> +	struct skb_shared_info *shinfo;
> +
> +	/* make sure we initialize shinfo sequentially */
> +	shinfo = skb_shinfo(skb);
> +	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
> +	atomic_set(&shinfo->dataref, 1);
> +}
> +
>  /**
>   *	__alloc_skb	-	allocate a network buffer
>   *	@size: size to allocate
> @@ -181,7 +201,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
>  			    int flags, int node)
>  {
>  	struct kmem_cache *cache;
> -	struct skb_shared_info *shinfo;
>  	struct sk_buff *skb;
>  	u8 *data;
>  	bool pfmemalloc;
> @@ -215,27 +234,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
>  	size = SKB_WITH_OVERHEAD(ksize(data));
>  	prefetchw(data + size);
>  
> -	/*
> -	 * Only clear those fields we need to clear, not those that we will
> -	 * actually initialise below. Hence, don't put any more fields after
> -	 * the tail pointer in struct sk_buff!
> -	 */
> -	memset(skb, 0, offsetof(struct sk_buff, tail));
> -	/* Account for allocated memory : skb + skb->head */
> -	skb->truesize = SKB_TRUESIZE(size);
> +	__init_skb(skb, data, size);
> +	__init_skb_shinfo(skb);
>  	skb->pfmemalloc = pfmemalloc;
> -	refcount_set(&skb->users, 1);
> -	skb->head = data;
> -	skb->data = data;
> -	skb_reset_tail_pointer(skb);
> -	skb->end = skb->tail + size;
> -	skb->mac_header = (typeof(skb->mac_header))~0U;
> -	skb->transport_header = (typeof(skb->transport_header))~0U;
> -
> -	/* make sure we initialize shinfo sequentially */
> -	shinfo = skb_shinfo(skb);
> -	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
> -	atomic_set(&shinfo->dataref, 1);
>  
>  	if (flags & SKB_ALLOC_FCLONE) {
>  		struct sk_buff_fclones *fclones;
> @@ -277,7 +278,6 @@ EXPORT_SYMBOL(__alloc_skb);
>   */
>  struct sk_buff *__build_skb(void *data, unsigned int frag_size)
>  {
> -	struct skb_shared_info *shinfo;
>  	struct sk_buff *skb;
>  	unsigned int size = frag_size ? : ksize(data);
>  
> @@ -287,20 +287,8 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
>  
>  	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>  
> -	memset(skb, 0, offsetof(struct sk_buff, tail));
> -	skb->truesize = SKB_TRUESIZE(size);
> -	refcount_set(&skb->users, 1);
> -	skb->head = data;
> -	skb->data = data;
> -	skb_reset_tail_pointer(skb);
> -	skb->end = skb->tail + size;
> -	skb->mac_header = (typeof(skb->mac_header))~0U;
> -	skb->transport_header = (typeof(skb->transport_header))~0U;
> -
> -	/* make sure we initialize shinfo sequentially */
> -	shinfo = skb_shinfo(skb);
> -	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
> -	atomic_set(&shinfo->dataref, 1);
> +	__init_skb(skb, data, size);
> +	__init_skb_shinfo(skb);

I think you need to convince Dave and Eric that
above surgery is necessary to do the hack in patch 6 with
+static DEFINE_PER_CPU(struct sk_buff, bpf_flow_skb);

I think the better option it to introduce new prog type that works
without skb. I think it can be pretty close to shape and form to xdp.
Eric Dumazet March 21, 2019, 4:44 a.m. UTC | #2
On 03/20/2019 08:39 PM, Alexei Starovoitov wrote:

> I think you need to convince Dave and Eric that
> above surgery is necessary to do the hack in patch 6 with
> +static DEFINE_PER_CPU(struct sk_buff, bpf_flow_skb);
> 

Yes, this is a huge code churn.

Honestly I believe we are going too far in this series.

> I think the better option it to introduce new prog type that works
> without skb. I think it can be pretty close to shape and form to xdp.
>
Willem de Bruijn March 21, 2019, 1:58 p.m. UTC | #3
On Thu, Mar 21, 2019 at 12:45 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
>
> On 03/20/2019 08:39 PM, Alexei Starovoitov wrote:
>
> > I think you need to convince Dave and Eric that
> > above surgery is necessary to do the hack in patch 6 with
> > +static DEFINE_PER_CPU(struct sk_buff, bpf_flow_skb);
> >
>
> Yes, this is a huge code churn.

It touches a lot of lines. But it deduplicates logic that might become
out of sync if we don't do that now.

> Honestly I believe we are going too far in this series.
>
> > I think the better option it to introduce new prog type that works
> > without skb. I think it can be pretty close to shape and form to xdp.
> >

That's an interesting alternative. The existing sample dissector
does not access any skb fields aside from vlan. The interface could
be superseded with one that has a more constrained context, like xdp.

When parsing for headlen in the device driver rx path, a program could
conceivably in the same pass also compute an rxhash in case the
device did not supply an L4 one. And perhaps some hash (the same?)
to speed up GRO flow lookup. This would require a few extra fields in
bpf_flow_keys.
Stanislav Fomichev March 21, 2019, 3:44 p.m. UTC | #4
On 03/21, Willem de Bruijn wrote:
> On Thu, Mar 21, 2019 at 12:45 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> >
> >
> > On 03/20/2019 08:39 PM, Alexei Starovoitov wrote:
> >
> > > I think you need to convince Dave and Eric that
> > > above surgery is necessary to do the hack in patch 6 with
> > > +static DEFINE_PER_CPU(struct sk_buff, bpf_flow_skb);
> > >
> >
> > Yes, this is a huge code churn.
> 
> It touches a lot of lines. But it deduplicates logic that might become
> out of sync if we don't do that now.
Yeah, I remove a bunch of copy-paste with this change.

> > Honestly I believe we are going too far in this series.
> >
> > > I think the better option it to introduce new prog type that works
> > > without skb. I think it can be pretty close to shape and form to xdp.
> > >
> 
> That's an interesting alternative. The existing sample dissector
> does not access any skb fields aside from vlan. The interface could
> be superseded with one that has a more constrained context, like xdp.
> 
> When parsing for headlen in the device driver rx path, a program could
> conceivably in the same pass also compute an rxhash in case the
> device did not supply an L4 one. And perhaps some hash (the same?)
> to speed up GRO flow lookup. This would require a few extra fields in
> bpf_flow_keys.
With the xdp-like interface, what would be the end goal? Two different
program types, one triggering from eth_get_headlen, the other from
existing __skb_flow_dissect? Or we can use xdp-like interface everywhere
and just pass the first fragment in __skb_flow_dissect (that should be
enough, in theory, to get everything we want)?

If we end up with two interfaces, that can be confusing to the users. Which
one to use? Why? Why write essentially the same program twice?

If we can agree that we switch everything to xdp-like, do we deprecate the
skb-one?
Alexei Starovoitov March 21, 2019, 4 p.m. UTC | #5
On Thu, Mar 21, 2019 at 08:44:33AM -0700, Stanislav Fomichev wrote:
> 
> If we can agree that we switch everything to xdp-like, do we deprecate the
> skb-one?

This whole discussion that has been going on for a long time is an indication
that the initial bpf flow dissector concept was not thought through
and I regret merging it in the first place.
Adding more hacks on top of it with fake skbs is not going to make it any better.
Since it's been in the official release we cannot remove it now.
Willem de Bruijn March 21, 2019, 4:13 p.m. UTC | #6
On Thu, Mar 21, 2019 at 12:01 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Mar 21, 2019 at 08:44:33AM -0700, Stanislav Fomichev wrote:
> >
> > If we can agree that we switch everything to xdp-like, do we deprecate the
> > skb-one?
>
> This whole discussion that have been going on for long time is an indication
> that initial bpf flow dissector concept was not thought through
> and I regret on merging it in the first place.
> Adding more hacks on top of it with fake skbs is not going to make it any better.
> Since it's been in the official release we cannot remove it now.

This patch set addresses the only open issue.

That said, if direction is towards an alternative interface, then it would
make sense for the new interface to supplant the existing one for all
use-cases, even if that existing one cannot be removed.

Essentially a BPF_PROG_TYPE_FLOW_DISSECTOR_RAW that
takes a simpler context than skb. And either that or a program of
type BPF_PROG_TYPE_FLOW_DISSECTOR can be attached in
skb_flow_dissector_bpf_prog_attach, but not both.
Alexei Starovoitov March 21, 2019, 8:56 p.m. UTC | #7
On Thu, Mar 21, 2019 at 9:13 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> On Thu, Mar 21, 2019 at 12:01 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Thu, Mar 21, 2019 at 08:44:33AM -0700, Stanislav Fomichev wrote:
> > >
> > > If we can agree that we switch everything to xdp-like, do we deprecate the
> > > skb-one?
> >
> > This whole discussion that have been going on for long time is an indication
> > that initial bpf flow dissector concept was not thought through
> > and I regret on merging it in the first place.
> > Adding more hacks on top of it with fake skbs is not going to make it any better.
> > Since it's been in the official release we cannot remove it now.
>
> This patch set addresses the only open issue.
>
> That said, if direction is towards an alternative interface, then it would
> make sense for the new interface to supplant the existing one for all
> use-cases, even if that existing one cannot be removed.
>
> Essentially a BPF_PROG_TYPE_FLOW_DISSECTOR_RAW that
> takes a simpler context than skb. And either that or a program of
> type BPF_PROG_TYPE_FLOW_DISSECTOR can be attached in
> skb_flow_dissector_bpf_prog_attach, but not both.

another idea is to keep 'struct __sk_buff' as a context,
but have kernel side to be something like struct xdp_buff
and disallow access to fields of __sk_buff depending on
attach_type.
and do different ctx rewrite for __sk_buff->foo
depending on attach_type.
Stanislav Fomichev March 21, 2019, 9:13 p.m. UTC | #8
On 03/21, Alexei Starovoitov wrote:
> On Thu, Mar 21, 2019 at 9:13 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > On Thu, Mar 21, 2019 at 12:01 PM Alexei Starovoitov
> > <alexei.starovoitov@gmail.com> wrote:
> > >
> > > On Thu, Mar 21, 2019 at 08:44:33AM -0700, Stanislav Fomichev wrote:
> > > >
> > > > If we can agree that we switch everything to xdp-like, do we deprecate the
> > > > skb-one?
> > >
> > > This whole discussion that have been going on for long time is an indication
> > > that initial bpf flow dissector concept was not thought through
> > > and I regret on merging it in the first place.
> > > Adding more hacks on top of it with fake skbs is not going to make it any better.
> > > Since it's been in the official release we cannot remove it now.
> >
> > This patch set addresses the only open issue.
> >
> > That said, if direction is towards an alternative interface, then it would
> > make sense for the new interface to supplant the existing one for all
> > use-cases, even if that existing one cannot be removed.
> >
> > Essentially a BPF_PROG_TYPE_FLOW_DISSECTOR_RAW that
> > takes a simpler context than skb. And either that or a program of
> > type BPF_PROG_TYPE_FLOW_DISSECTOR can be attached in
> > skb_flow_dissector_bpf_prog_attach, but not both.
> 
> another idea is to keep 'struct __sk_buff' as a context,
> but have kernel side to be something like struct xdp_buff
> and disallow access to fields of __sk_buff depending on
> attach_type.
> and do different ctx rewrite for __sk_buff->foo
> depending on attach_type.
That's a great idea! The only problem is that in this case, I guess, we
need another attach_type and need to ask users to explicitly attach to it.

What if we go a bit further: always have xdp_buff-like struct in the kernel?
For the real skb it would have data..data_end point to linear packet data (or
the first fragment). We would always prohibit access to most __sk_buff
fields (like in patch #5 from this series). We'd have to write our
own bpf_skb_load_bytes, but that doesn't sound like a big issue.
diff mbox series

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9027a8c4219f..e8c1d5b97f96 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4399,5 +4399,19 @@  static inline __wsum lco_csum(struct sk_buff *skb)
 	return csum_partial(l4_hdr, csum_start - l4_hdr, partial);
 }
 
+static inline void __init_skb_data(struct sk_buff *skb, u8 *data,
+				   unsigned int size)
+{
+	/* Account for allocated memory : skb + skb->head */
+	skb->truesize = SKB_TRUESIZE(size);
+	refcount_set(&skb->users, 1);
+	skb->head = data;
+	skb->data = data;
+	skb_reset_tail_pointer(skb);
+	skb->end = skb->tail + size;
+	skb->mac_header = (typeof(skb->mac_header))~0U;
+	skb->transport_header = (typeof(skb->transport_header))~0U;
+}
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2415d9cb9b89..b413354ee709 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -160,6 +160,26 @@  static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
  *
  */
 
+static inline void __init_skb(struct sk_buff *skb, u8 *data, unsigned int size)
+{
+	/* Only clear those fields we need to clear, not those that we will
+	 * actually initialize below. Hence, don't put any more fields after
+	 * the tail pointer in struct sk_buff!
+	 */
+	memset(skb, 0, offsetof(struct sk_buff, tail));
+	__init_skb_data(skb, data, size);
+}
+
+static inline void __init_skb_shinfo(struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo;
+
+	/* make sure we initialize shinfo sequentially */
+	shinfo = skb_shinfo(skb);
+	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+	atomic_set(&shinfo->dataref, 1);
+}
+
 /**
  *	__alloc_skb	-	allocate a network buffer
  *	@size: size to allocate
@@ -181,7 +201,6 @@  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 			    int flags, int node)
 {
 	struct kmem_cache *cache;
-	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
 	bool pfmemalloc;
@@ -215,27 +234,9 @@  struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	size = SKB_WITH_OVERHEAD(ksize(data));
 	prefetchw(data + size);
 
-	/*
-	 * Only clear those fields we need to clear, not those that we will
-	 * actually initialise below. Hence, don't put any more fields after
-	 * the tail pointer in struct sk_buff!
-	 */
-	memset(skb, 0, offsetof(struct sk_buff, tail));
-	/* Account for allocated memory : skb + skb->head */
-	skb->truesize = SKB_TRUESIZE(size);
+	__init_skb(skb, data, size);
+	__init_skb_shinfo(skb);
 	skb->pfmemalloc = pfmemalloc;
-	refcount_set(&skb->users, 1);
-	skb->head = data;
-	skb->data = data;
-	skb_reset_tail_pointer(skb);
-	skb->end = skb->tail + size;
-	skb->mac_header = (typeof(skb->mac_header))~0U;
-	skb->transport_header = (typeof(skb->transport_header))~0U;
-
-	/* make sure we initialize shinfo sequentially */
-	shinfo = skb_shinfo(skb);
-	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
-	atomic_set(&shinfo->dataref, 1);
 
 	if (flags & SKB_ALLOC_FCLONE) {
 		struct sk_buff_fclones *fclones;
@@ -277,7 +278,6 @@  EXPORT_SYMBOL(__alloc_skb);
  */
 struct sk_buff *__build_skb(void *data, unsigned int frag_size)
 {
-	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	unsigned int size = frag_size ? : ksize(data);
 
@@ -287,20 +287,8 @@  struct sk_buff *__build_skb(void *data, unsigned int frag_size)
 
 	size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
-	memset(skb, 0, offsetof(struct sk_buff, tail));
-	skb->truesize = SKB_TRUESIZE(size);
-	refcount_set(&skb->users, 1);
-	skb->head = data;
-	skb->data = data;
-	skb_reset_tail_pointer(skb);
-	skb->end = skb->tail + size;
-	skb->mac_header = (typeof(skb->mac_header))~0U;
-	skb->transport_header = (typeof(skb->transport_header))~0U;
-
-	/* make sure we initialize shinfo sequentially */
-	shinfo = skb_shinfo(skb);
-	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
-	atomic_set(&shinfo->dataref, 1);
+	__init_skb(skb, data, size);
+	__init_skb_shinfo(skb);
 
 	return skb;
 }