[net-next,02/13] sk_buff: add skb extension infrastructure

Message ID 20181210145006.19098-3-fw@strlen.de
State Changes Requested, archived
Delegated to: David Miller
Series sk_buff: add extension infrastructure

Commit Message

Florian Westphal Dec. 10, 2018, 2:49 p.m. UTC
The (out-of-tree) Multipath-TCP implementation needs a significant amount
of extra space in the skb control buffer.

Increasing skb->cb[] size in mainline is a non-starter for memory and
performance reasons (e.g. an increase in cb size also moves several
frequently-accessed fields to other cache lines).

One approach that might work for MPTCP is to extend skb_shared_info instead
of sk_buff.  However, this comes with other drawbacks: it either needs
special skb allocation to make sure there is enough space for such an
'extended shinfo' at the end of the data buffer (which would make this only
usable for the MPTCP tx path), or such a change would increase the size of
skb_shared_info.

This work adds an extension infrastructure for sk_buff:
1. extension memory is released when the sk_buff is free'd.
2. data is shared after cloning an skb.

This is also how xfrm and bridge netfilter skb-associated data
(skb->sp and skb->nf_bridge) is handled.

This series removes the nf_bridge pointer and makes bridge netfilter
use the extension infrastructure instead.

Two new members are added to sk_buff:
1. 'active_extensions' byte (filling a hole), telling which extensions
   have been allocated for the skb.
2. extension pointer, located at the end of the sk_buff.
   If active_extensions is 0, its content is undefined.

The 'nf_bridge' pointer is removed, i.e. sk_buff size remains the same.

This adds extra code to skb clone and free paths (to deal with
refcount/free of extension area) but replaces the existing code that
deals with skb->nf_bridge.

This patch only adds the basic infrastructure, the nf_bridge conversion
is added in the next patch.

Conversion of skb->sp (ipsec/xfrm secpath) to an skb extension is planned
as a followup.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/skbuff.h | 119 +++++++++++++++++++++++++++++++++++-
 net/Kconfig            |   3 +
 net/core/skbuff.c      | 133 +++++++++++++++++++++++++++++++++++++++++
 net/ipv4/ip_output.c   |   1 +
 net/ipv6/ip6_output.c  |   1 +
 5 files changed, 256 insertions(+), 1 deletion(-)
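
As a rough usage sketch (not part of this patch; SKB_EXT_FOO, struct foo_ext
and the corresponding skb_ext_type_len[] entry are made up for illustration),
a subsystem built on this infrastructure would do something like:

	struct foo_ext {
		u32 seq;
	};

	/* writer: allocate (or COW) the extension area and return a pointer
	 * to this extension's slot; NULL on allocation failure.
	 */
	struct foo_ext *fext = skb_ext_add(skb, SKB_EXT_FOO);
	if (!fext)
		return -ENOMEM;
	fext->seq = 42;

	/* reader: NULL if the extension was never added to this skb */
	fext = skb_ext_find(skb, SKB_EXT_FOO);

	/* drop it again: this never fails, it clears the id bit and frees
	 * the extension area once no extension is active anymore.
	 */
	skb_ext_del(skb, SKB_EXT_FOO);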

Comments

Mat Martineau Dec. 12, 2018, 12:07 a.m. UTC | #1
On Mon, 11 Dec 2018, Florian Westphal wrote:

...

> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index b1831a5ca173..d715736eb734 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h

...

> @@ -3896,6 +3906,113 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct)
>                 atomic_inc(&nfct->use);
>  }
>  #endif
> +
> +#ifdef CONFIG_SKB_EXTENSIONS
> +enum skb_ext_id {
> +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
> +       SKB_EXT_BRIDGE_NF,
> +#endif
> +       SKB_EXT_NUM, /* must be last */
> +};

Could enum skb_ext_id always be defined, with none of the SKB_EXT_* values 
conditionally excluded? In combination with some alternate function 
definitions below (see later comments) I think this could reduce the need 
for CONFIG_SKB_EXTENSIONS preprocessor conditionals throughout the net 
code.

> +
> +/**
> + *     struct skb_ext - sk_buff extensions
> + *     @refcnt: 1 on allocation, deallocated on 0
> + *     @offset: offset to add to @data to obtain extension address
> + *     @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units
> + *     @data: start of extension data, variable sized
> + *
> + *     Note: offsets/lengths are stored in chunks of 8 bytes, this allows
> + *     to use 'u8' types while allowing up to 2kb worth of extension data.
> + */
> +struct skb_ext {
> +       refcount_t refcnt;
> +       u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */
> +       u8 chunks;              /* same */
> +       char data[0] __aligned(8);
> +};
> +
> +void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
> +void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id);
> +void __skb_ext_free(struct skb_ext *ext);
> +
> +static inline void __skb_ext_put(struct skb_ext *ext)
> +{
> +       if (ext && refcount_dec_and_test(&ext->refcnt))
> +               __skb_ext_free(ext);
> +}
> +
> +static inline void skb_ext_put(struct sk_buff *skb)
> +{
> +       if (skb->active_extensions)
> +               __skb_ext_put(skb->extensions);
> +}
> +
> +static inline void skb_ext_get(struct sk_buff *skb)
> +{
> +       if (skb->active_extensions) {
> +               struct skb_ext *ext = skb->extensions;
> +
> +               if (ext)
> +                       refcount_inc(&ext->refcnt);
> +       }
> +}
> +
> +static inline void __skb_ext_copy(struct sk_buff *dst,
> +                                 const struct sk_buff *src)
> +{
> +       dst->active_extensions = src->active_extensions;
> +
> +       if (src->active_extensions) {
> +               struct skb_ext *ext = src->extensions;
> +
> +               if (ext)
> +                       refcount_inc(&ext->refcnt);
> +               dst->extensions = ext;
> +       }
> +}
> +
> +static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src)
> +{
> +       skb_ext_put(dst);
> +       __skb_ext_copy(dst, src);
> +}
> +
> +static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i)
> +{
> +       return !!ext->offset[i];
> +}
> +
> +static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id)
> +{
> +       return skb->active_extensions & (1 << id);
> +}
> +
> +static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
> +{
> +       if (skb_ext_exist(skb, id))
> +               __skb_ext_del(skb, id);
> +}
> +
> +static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id)
> +{
> +       if (skb_ext_exist(skb, id)) {
> +               struct skb_ext *ext = skb->extensions;
> +
> +               if (ext && __skb_ext_exist(ext, id))
> +                       return (void *)ext + (ext->offset[id] << 3);
> +       }
> +
> +       return NULL;
> +}
> +#else
> +static inline void skb_ext_put(struct sk_buff *skb) {}
> +static inline void skb_ext_get(struct sk_buff *skb) {}
> +static inline void skb_ext_del(struct sk_buff *skb, int unused) {}
> +static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
> +static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}

For the !CONFIG_SKB_EXTENSIONS case, an alternate definition of 
skb_ext_exist() that always returns false would be useful to reduce the 
need for preprocessor conditionals. A similar skb_ext_find() that always 
returns NULL might also be helpful.
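
Something along these lines for the #else branch would do (a sketch of the
suggestion, not code from the patch):

	static inline bool skb_ext_exist(const struct sk_buff *skb, int unused)
	{
		return false;
	}

	static inline void *skb_ext_find(const struct sk_buff *skb, int unused)
	{
		return NULL;
	}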

> +#endif /* CONFIG_SKB_EXTENSIONS */
> +
>  #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
>  static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge)
>  {

Thanks,

--
Mat Martineau
Intel OTC
Florian Westphal Dec. 12, 2018, 12:11 a.m. UTC | #2
Mat Martineau <mathew.j.martineau@linux.intel.com> wrote:
> 
> On Mon, 11 Dec 2018, Florian Westphal wrote:
> 
> ...
> 
> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > index b1831a5ca173..d715736eb734 100644
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> 
> ...
> 
> > @@ -3896,6 +3906,113 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct)
> >                 atomic_inc(&nfct->use);
> >  }
> >  #endif
> > +
> > +#ifdef CONFIG_SKB_EXTENSIONS
> > +enum skb_ext_id {
> > +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
> > +       SKB_EXT_BRIDGE_NF,
> > +#endif
> > +       SKB_EXT_NUM, /* must be last */
> > +};
> 
> Could enum skb_ext_id always be defined, with none of the SKB_EXT_* values
> conditionally excluded?

Yes, this is certainly possible.

> In combination with some alternate function
> definitions below (see later comments) I think this could reduce the need
> for CONFIG_SKB_EXTENSIONS preprocessor conditionals throughout the net code.

Perhaps.  I'll give this a shot.

> > +static inline void skb_ext_del(struct sk_buff *skb, int unused) {}
> > +static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
> > +static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
> 
> For the !CONFIG_SKB_EXTENSIONS case, an alternate definition of
> skb_ext_exist() that always returns false would be useful to reduce the need
> for preprocessor conditionals. A similar skb_ext_find() that always returns
> NULL might also be helpful.

I'll do this and double-check that gcc removes the branches accordingly.

Thanks,
Florian
Florian Westphal Dec. 12, 2018, 11:59 a.m. UTC | #3
Florian Westphal <fw@strlen.de> wrote:
> Mat Martineau <mathew.j.martineau@linux.intel.com> wrote:
> > > +#ifdef CONFIG_SKB_EXTENSIONS
> > > +enum skb_ext_id {
> > > +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
> > > +       SKB_EXT_BRIDGE_NF,
> > > +#endif
> > > +       SKB_EXT_NUM, /* must be last */
> > > +};
> > 
> > Could enum skb_ext_id always be defined, with none of the SKB_EXT_* values
> > conditionally excluded?
> 
> Yes, this is certainly possible.

Did this today, did not like the result (see below).

> > In combination with some alternate function
> > definitions below (see later comments) I think this could reduce the need
> > for CONFIG_SKB_EXTENSIONS preprocessor conditionals throughout the net code.
> 
> Perhaps.  I'll give this a shot.
> 
> > > +static inline void skb_ext_del(struct sk_buff *skb, int unused) {}
> > > +static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
> > > +static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
> > 
> > For the !CONFIG_SKB_EXTENSIONS case, an alternate definition of
> > skb_ext_exist() that always returns false would be useful to reduce the need
> > for preprocessor conditionals. A similar skb_ext_find() that always returns
> > NULL might also be helpful.
> 
> I'll do this and double-check that gcc removes the branches accordingly.

The problem is that the alternate skb_ext_exist() becomes bloated and
instead gets the #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)/XFRM, etc.
junk.

Otherwise, gcc can't remove the branch in case of e.g. CONFIG_XFRM=y,
BRIDGE_NF=n.

With the current approach, the SKB_EXT_BRIDGE_NF id is hidden, i.e. all
calls to skb_ext_add()/skb_ext_exist()/skb_ext_del() with SKB_EXT_BRIDGE_NF
as the id result in a build error unless they appear in
bridge-netfilter-only sections of the code (this is intentional).
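
I.e., something like this (sketch):

	#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
		struct nf_bridge_info *nf_bridge;

		/* fine here; outside such a section SKB_EXT_BRIDGE_NF is not
		 * declared at all, so a stray call is a build error instead
		 * of silently compiled dead code.
		 */
		nf_bridge = skb_ext_add(skb, SKB_EXT_BRIDGE_NF);
	#endif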

So, no extra code is generated.
The empty stubs are there only in case no subsystem requires the
extension infrastructure, and in that case, there should be no calls
to skb_ext_exist() in the first place.

The gist is that CONFIG_SKB_EXTENSIONS=y can still mean that all the
'remove bridge' or 'add secpath' etc. calls are not needed, but that's
something the compiler can't know unless the called function explicitly
considers this in the inlined section.

Unfortunately, the existing 'id not defined in the enum' approach seems
the best way to ensure that.

If you have a better idea, let me know.
Willem de Bruijn Dec. 12, 2018, 2:44 p.m. UTC | #4
On Mon, Dec 10, 2018 at 11:21 AM Florian Westphal <fw@strlen.de> wrote:
>
> The (out-of-tree) Multipath-TCP implementation needs a significant amount
> of extra space in the skb control buffer.
>
> Increasing skb->cb[] size in mainline is a non-starter for memory and
> and performance reasons (f.e. increase in cb size also moves several
> frequently-accessed fields to other cache lines).
>
> One approach that might work for MPTCP is to extend skb_shared_info instead
> of sk_buff.  However, this comes with other drawbacks, e.g.  it either
> needs special skb allocation to make sure there is enough space for such
> 'extended shinfo' at the end of data buffer (which would make this only
> useable for the MPTCP tx path) or such a change would increase size of
> skb_shared_info.
>
> This work adds an extension infrastructure for sk_buff:
> 1. extension memory is released when the sk_buff is free'd.
> 2. data is shared after cloning an skb.
>
> This is also how xfrm and bridge netfilter skb-associated data
> (skb->sp and skb->nf_bridge) is handled.
>
> This series removes the nf_bridge pointer and makes bridge netfilter
> use the extension infrastructure instead.
>
> Two new members are added to sk_buff:
> 1. 'active_extensions' byte (filling a hole), telling which extensions
>    have been allocated for the skb.
> 2. extension pointer, located at the end of the sk_buff.
>    If active_extensions is 0, its content is undefined.
>
> The 'nf_bridge' pointer is removed, i.e. sk_buff size remains the same.
>
> This adds extra code to skb clone and free paths (to deal with
> refcount/free of extension area) but replaces the existing code that
> deals with skb->nf_bridge.
>
> This patch only adds the basic infrastructure, the nf_bridge conversion
> is added in the next patch.
>
> Conversion of skb->sp (ipsec/xfrm secpath) to an skb extension is planned
> as a followup.
>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  include/linux/skbuff.h | 119 +++++++++++++++++++++++++++++++++++-
>  net/Kconfig            |   3 +
>  net/core/skbuff.c      | 133 +++++++++++++++++++++++++++++++++++++++++
>  net/ipv4/ip_output.c   |   1 +
>  net/ipv6/ip6_output.c  |   1 +
>  5 files changed, 256 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index b1831a5ca173..d715736eb734 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -245,6 +245,7 @@ struct iov_iter;
>  struct napi_struct;
>  struct bpf_prog;
>  union bpf_attr;
> +struct skb_ext;
>
>  #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
>  struct nf_conntrack {
> @@ -636,6 +637,7 @@ typedef unsigned char *sk_buff_data_t;
>   *     @queue_mapping: Queue mapping for multiqueue devices
>   *     @xmit_more: More SKBs are pending for this queue
>   *     @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
> + *     @active_extensions: active extensions (skb_ext_id types)
>   *     @ndisc_nodetype: router type (from link layer)
>   *     @ooo_okay: allow the mapping of a socket to a queue to be changed
>   *     @l4_hash: indicate hash is a canonical 4-tuple hash over transport
> @@ -665,6 +667,7 @@ typedef unsigned char *sk_buff_data_t;
>   *     @data: Data head pointer
>   *     @truesize: Buffer size
>   *     @users: User count - see {datagram,tcp}.c
> + *     @extensions: allocated extensions, valid if active_extensions is nonzero
>   */
>
>  struct sk_buff {
> @@ -747,7 +750,9 @@ struct sk_buff {
>                                 head_frag:1,
>                                 xmit_more:1,
>                                 pfmemalloc:1;
> -
> +#ifdef CONFIG_SKB_EXTENSIONS
> +       __u8                    active_extensions;
> +#endif

This byte could be saved by moving the bits to the first byte of the
new extension. The likely cold cacheline now needs to be checked, but
only if the pointer is non-NULL. If exclusively using accessor
functions to access the struct, even this can be avoided if encoding
the first 3 or 7 active extensions in the pointer itself.
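
A rough sketch of the pointer-tagging variant (purely illustrative, and it
assumes the extension allocation is at least 8-byte aligned, so only the
first three ids fit in the low bits):

	/* low bits of skb->extensions double as the 'active' bitmap */
	static inline struct skb_ext *skb_ext_ptr(const struct sk_buff *skb)
	{
		return (struct skb_ext *)((unsigned long)skb->extensions & ~7UL);
	}

	static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id)
	{
		return (unsigned long)skb->extensions & (1UL << id);
	}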

>         /* fields enclosed in headers_start/headers_end are copied
>          * using a single memcpy() in __copy_skb_header()
>          */
> @@ -869,6 +874,11 @@ struct sk_buff {
>                                 *data;
>         unsigned int            truesize;
>         refcount_t              users;
> +
> +#ifdef CONFIG_SKB_EXTENSIONS
> +       /* only useable after checking ->active_extensions != 0 */
> +       struct skb_ext          *extensions;
> +#endif
>  };
Florian Westphal Dec. 12, 2018, 3:40 p.m. UTC | #5
Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
> > +#ifdef CONFIG_SKB_EXTENSIONS
> > +       __u8                    active_extensions;
> > +#endif
> 
> This byte could be saved by moving the bits to the first byte of the
> new extension.

I tried to do this, but could not resolve the following problem:
- extensions a and b are active
- skb is cloned
- extension a is to be disabled

In this patch series, we can just clear the 'a' bit
from clone->active_extensions.

But if it's part of the extension itself, we would first need to
be able to duplicate the extension blob before we can disable
the extension, which means that an attempt to disable an extension
could now fail with -ENOMEM.  I think disabling should always work.
(It's not a problem at the moment for either secpath or bridge,
 as both use distinct pointers in sk_buff.)

> The likely cold cacheline now needs to be checked, but
> only if the pointer is non-NULL.

The upside of the 'active_extensions' member is that we can keep the
pointer in an uninitialized state for the active_extensions == 0 case,
i.e., when allocating a new skb, skb->extensions is not initialized.

> If exclusively using accessor
> functions to access the struct, even this can be avoided if encoding
> the first 3 or 7 active extensions in the pointer itself.

Yes, that would work, but it requires initialising the pointer on skb
allocation plus a few tricks to align the allocation so we have three bits
to use on arches where kmalloc doesn't guarantee 8-byte-aligned pointers.

We would also need a 'plan b' in place when extension number 4 arrives.
Willem de Bruijn Dec. 12, 2018, 3:45 p.m. UTC | #6
On Wed, Dec 12, 2018 at 10:40 AM Florian Westphal <fw@strlen.de> wrote:
>
> Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote:
> > > +#ifdef CONFIG_SKB_EXTENSIONS
> > > +       __u8                    active_extensions;
> > > +#endif
> >
> > This byte could be saved by moving the bits to the first byte of the
> > new extension.
>
> I tried to do this, but could not resolve following problem:
> - extensions a and b are active
> - skb is cloned
> - extension a is to be disabled
>
> In this patch series, we can just clear the 'a' bit from
> from clone->active_extensions.
>
> But if its part of the extension itself, we must first need to
> be able to duplicate the extension blob before we can disable
> the extension, which means that attempt to disable an extension
> would now fail on -ENOMEM.  I think disabling should always work.

Absolutely. I certainly overlooked that complicating factor.

> (its not a problem at the moment for either secpath or bridge
>  as both use distinct pointers in sk_buff).
>
> > The likely cold cacheline now needs to be checked, but
> > only if the pointer is non-NULL.
>
> The upside of the 'active_extension' member is that we can keep the
> pointer in uninitialized state for the active_extensions == 0 case,
> i.e., when allocating a new skb skb->extensions is not initialized.
>
> > If exclusively using accessor
> > functions to access the struct, even this can be avoided if encoding
> > the first 3 or 7 active extensions in the pointer itself.
>
> Yes, that would work, but requires to init the pointer on skb allocation
> plus a few tricks to align the allocation so we have three bits to use
> on arches where kmalloc doesn't guarantee 8-byte-aligned pointers.
>
> Would also need a 'plan b' in place when extension number 4 arrives.

My assumed fallback was the first byte(s) of the extension, but as you
point out above, we cannot rely on that.

Okay, in that case the current solution is definitely preferable.

Ack also on the zeroing and alignment. Thanks for pointing out the
various pros and cons.
Mat Martineau Dec. 12, 2018, 4:59 p.m. UTC | #7
On Wed, 12 Dec 2018, Florian Westphal wrote:

> Florian Westphal <fw@strlen.de> wrote:
>> Mat Martineau <mathew.j.martineau@linux.intel.com> wrote:
>>>> +#ifdef CONFIG_SKB_EXTENSIONS
>>>> +enum skb_ext_id {
>>>> +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
>>>> +       SKB_EXT_BRIDGE_NF,
>>>> +#endif
>>>> +       SKB_EXT_NUM, /* must be last */
>>>> +};
>>>
>>> Could enum skb_ext_id always be defined, with none of the SKB_EXT_* values
>>> conditionally excluded?
>>
>> Yes, this is certainly possible.
>
> Did this today, did not like the result (see below).
>
>>> In combination with some alternate function
>>> definitions below (see later comments) I think this could reduce the need
>>> for CONFIG_SKB_EXTENSIONS preprocessor conditionals throughout the net code.
>>
>> Perhaps.  I'll give this a shot.
>>
>>>> +static inline void skb_ext_del(struct sk_buff *skb, int unused) {}
>>>> +static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
>>>> +static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
>>>
>>> For the !CONFIG_SKB_EXTENSIONS case, an alternate definition of
>>> skb_ext_exist() that always returns false would be useful to reduce the need
>>> for preprocessor conditionals. A similar skb_ext_find() that always returns
>>> NULL might also be helpful.
>>
>> I'll do this and double-check that gcc removes the branches accordingly.
>
> Problem is that the alternate skb_ext_exist() becomes bloated, and
> instead gets the #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)/XFRM, etc.
> junk.
>
> Otherwise, gcc can't remove the branch in case e.g. CONFIG_XFRM=y
> BRIDGE_NF=n.
>

Thanks for looking at this. You're definitely right, there will be cases 
where some CONFIG_SKB_EXTENSIONS users will be enabled and not others, and 
that needs to be handled cleanly.

> If you have a better idea, let me know.
>

I think the best thing is to keep the skb_ext_* functions you have, and 
CONFIG_SKB_EXTENSIONS-dependent features like XFRM/BRIDGE_NF/etc. can define 
alternate inline functions or macros that call skb_ext_exist() if they 
benefit from that.
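
For instance something like this (sketch; the wrapper name is made up):

	#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	static inline struct nf_bridge_info *nf_bridge_info_get(const struct sk_buff *skb)
	{
		return skb_ext_find(skb, SKB_EXT_BRIDGE_NF);
	}
	#else
	static inline struct nf_bridge_info *nf_bridge_info_get(const struct sk_buff *skb)
	{
		return NULL;
	}
	#endif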

--
Mat Martineau
Intel OTC
Eric Dumazet Dec. 12, 2018, 5:23 p.m. UTC | #8
On 12/10/2018 06:49 AM, Florian Westphal wrote:
> The (out-of-tree) Multipath-TCP implementation needs a significant amount
> of extra space in the skb control buffer.

Which skbs ? Input or output path ?

> 
> Increasing skb->cb[] size in mainline is a non-starter for memory and
> and performance reasons (f.e. increase in cb size also moves several
> frequently-accessed fields to other cache lines).
> 
> One approach that might work for MPTCP is to extend skb_shared_info instead
> of sk_buff.  However, this comes with other drawbacks, e.g.  it either
> needs special skb allocation to make sure there is enough space for such
> 'extended shinfo' at the end of data buffer (which would make this only
> useable for the MPTCP tx path) or such a change would increase size of
> skb_shared_info.
> 
> This work adds an extension infrastructure for sk_buff:
> 1. extension memory is released when the sk_buff is free'd.
> 2. data is shared after cloning an skb.
> 

This seems to add atomic increments and decrements all over the place,
and code bloat, for a very precise reason:

      skb->cb[] is too small.

We do not want to increase skb->cb[] for two reasons, the first one being the killer.

1) we clear it at skb allocation, and copy it at skb cloning.
2) extra memory cost.

Why can't we have another skb->cb2[] field that is not cleared/copied by skb functions at all ?

Each layer using skb->cb2[] would be responsible to fully manage it.

I do not know what the MPTCP needs are, but I doubt adding XX bytes to the
skb will have a serious memory cost impact for any MPTCP users.
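
A sketch of that alternative (the cb2[] size is picked arbitrarily here):

	/* in struct sk_buff, next to cb[]: */
	char			cb[48] __aligned(8);

	/* unlike cb[], cb2[] would be neither cleared at allocation time
	 * nor copied on clone; the layer using it manages it completely.
	 */
	char			cb2[32] __aligned(8);
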
Stephen Suryaputra Dec. 12, 2018, 6:16 p.m. UTC | #9
On Mon, Dec 10, 2018 at 11:20 AM Florian Westphal <fw@strlen.de> wrote:

> +#ifdef CONFIG_SKB_EXTENSIONS
> +enum skb_ext_id {
> +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
> +       SKB_EXT_BRIDGE_NF,
> +#endif
> +       SKB_EXT_NUM, /* must be last */
> +};

How about when a proprietary extension is desired? There are cases
where our system has to pass special metadata from input to output in
skbs.
Florian Westphal Dec. 12, 2018, 6:38 p.m. UTC | #10
Stephen Suryaputra <ssuryaextr@gmail.com> wrote:
> On Mon, Dec 10, 2018 at 11:20 AM Florian Westphal <fw@strlen.de> wrote:
> 
> > +#ifdef CONFIG_SKB_EXTENSIONS
> > +enum skb_ext_id {
> > +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
> > +       SKB_EXT_BRIDGE_NF,
> > +#endif
> > +       SKB_EXT_NUM, /* must be last */
> > +};
> 
> How about when proprietary extensions is desired? There are cases
> where our system has to pass special metadata from input to output in
> skbs.

Sorry, I don't care about out-of-tree work.
Florian Westphal Dec. 12, 2018, 6:44 p.m. UTC | #11
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On 12/10/2018 06:49 AM, Florian Westphal wrote:
> > The (out-of-tree) Multipath-TCP implementation needs a significant amount
> > of extra space in the skb control buffer.
> 
> Which skbs ? Input or output path ?

Both.

> > This work adds an extension infrastructure for sk_buff:
> > 1. extension memory is released when the sk_buff is free'd.
> > 2. data is shared after cloning an skb.
> > 
> 
> This seems additional atomic increments and decrements all over the places,
> and code bloat for a very precise reason :

No, it replaces the atomic dec/inc of nf_bridge and secpath.
If the skb passes neither bridge netfilter nor ipsec, no atomic op is added.

If it passes either one, one atomic inc and dec is done, just like in the
current kernel (skb->sp or skb->nf_bridge refcount).

If it passes both (unlikely but possible) it's now one instead of two,
or two (if one operation happens in a different netns for instance, due to
skb scrubbing).

>       skb->cb[] is too small.
> 
> We do not want to increase skb->cb[] for two reasons, the first one being the killer.
> 
> 1) we clear it at skb allocation, and copy it at skb cloning.

Right.

> 2) extra memory cost.
> 
> Why can't we have another skb->cb2[] field that is not cleared/copied by skb functions at all ?
> 
> Each layer using skb->cb2[] would be responsible to fully manage it.

We could do that, it's probably enough for MPTCP's needs.
This would keep the nf_bridge and secpath pointers as-is and increase
skb truesize.

If you prefer that, ok, but I don't see why we can't unify them behind
a single layer?
Eric Dumazet Dec. 12, 2018, 8:17 p.m. UTC | #12
On 12/12/2018 10:44 AM, Florian Westphal wrote:

> We could do that, its probably enough for mptcp needs.
> This would keep nf_bridge and secpath pointers as-is and increase
> skb truesize.
> 
> If you prefer that, ok, but I don't see why we can't unify them behind
> a single layer?
> 

Well, for a start we do not use nf_bridge or secpath.

XDP is all about not unifying because unifying has a cost.

Do we really want to slow down the stack just because MPTCP is coming ?
Florian Westphal Dec. 12, 2018, 8:52 p.m. UTC | #13
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > If you prefer that, ok, but I don't see why we can't unify them behind
> > a single layer?
> 
> Well, for a start we do not use nf_brifge or secpath.

Then the extension framework isn't built and the result
is exactly the same as before these patches: the helpers
turn into empty inline stubs.

If it's built, sk_buff size is reduced by up to one pointer,
and the added one isn't initialised at allocation time.

The sk_buff layout with the extension patches is almost the same
as with XFRM=n BRIDGE_NF=n (because the nf_bridge and sp pointers
get removed); there is only one additional pointer at the end,
not inited at alloc time.  The other new member fills a hole.

> XDP is all about not unifying because unifying has a cost.
> 
> Do we really want to slow down the stack just because MPTCP is coming ?

I don't want to slow down the stack.  All places that gain a helper call
do so instead of the nf_bridge/secpath one.  They only expand into code
if at least one of secpath or nf_bridge is built in.
Both struct secpath and nf_bridge have their refcounts removed, so no
additional reference counts are added.

If you still think this proposal is a bad idea, ok, let me know
and I will stop working on it.

MPTCP can then just add skb->cb2[], but neither nf_bridge nor secpath can
use it.

If not, I will send another iteration that just allocates the entire
extension space when the first extension is added; it simplifies allocation
handling a little.
David Miller Dec. 13, 2018, 12:38 a.m. UTC | #14
From: Stephen Suryaputra <ssuryaextr@gmail.com>
Date: Wed, 12 Dec 2018 13:16:57 -0500

> On Mon, Dec 10, 2018 at 11:20 AM Florian Westphal <fw@strlen.de> wrote:
> 
>> +#ifdef CONFIG_SKB_EXTENSIONS
>> +enum skb_ext_id {
>> +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
>> +       SKB_EXT_BRIDGE_NF,
>> +#endif
>> +       SKB_EXT_NUM, /* must be last */
>> +};
> 
> How about when proprietary extensions is desired? There are cases
> where our system has to pass special metadata from input to output in
> skbs.

Proprietary extensions will absolutely not be taken into consideration
when designing upstream features in the Linux kernel.
Eric Dumazet Dec. 13, 2018, 5:40 a.m. UTC | #15
On 12/12/2018 12:52 PM, Florian Westphal wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>> If you prefer that, ok, but I don't see why we can't unify them behind
>>> a single layer?
>>
>> Well, for a start we do not use nf_brifge or secpath.
> 
> Then the extension framework isn't built and the result
> is exactly the same as before these patches: helpers
> turn into empty inline stubs.
> 
> If its built, sk_buff size is reduced by up to one pointer,
> and the added one isn't initialised at allocation time.
> 
> sk_buff layout with the extension patches is almost same
> as with XFRM=n BRIDGE_NF=n (because nf_bridge and sp pointers
> get removed), there is only one additional pointer at the end,
> not inited at alloc time.  The other new member fills a hole.
> 
>> XDP is all about not unifying because unifying has a cost.
>>
>> Do we really want to slow down the stack just because MPTCP is coming ?
> 
> I don't want to slow down the stack.  All places that gain a helper call
> do so instead of the nf_bridge/secpath one.  They only expand into code
> if at least one of secpath or nf_bridge are built in.
> Both struct secpath and nf_bridge have their refcounts removed, so no
> additional reference counts are added.
> 
> If you still think this is proposal is a bad idea, ok, let me know
> and I will stop working on this.

Certainly not a bad idea.

> 
> MPTCP can then just add skb->cb2[], but neither nf_bridge nor secpath can
> use it.
> 
> If not, I will send another iteration that just allocates the entire
> extension space if first extension is added, it simplifies allocation
> handling a little.
> 

I am still unsure of the added extra costs, but for a start, TCP xmit clones
the skbs. 

Does it mean an additional pair of atomic operations if the skb comes from an MPTCP-enabled flow?

The clone is used in the lower layers (IP, qdisc, device xmit), so I do not see why a clone
should share the extdata from the master.
Florian Westphal Dec. 13, 2018, 9:27 a.m. UTC | #16
Eric Dumazet <eric.dumazet@gmail.com> wrote:

[ CC Christoph, Mathew, Peter ]

> > If not, I will send another iteration that just allocates the entire
> > extension space if first extension is added, it simplifies allocation
> > handling a little.
> > 
> 
> I am still unsure of the added extra costs, but for a start, TCP xmit clones
> the skbs. 

Yes.

> Does it mean an additional pair of atomic operations if the skb comes from MPTCP enabled flow ?

Depends.

If it's going to be used as I expect, then the extension could be
discarded after the DSS mapping has been written to the TCP option
space, i.e. before cloning occurs.

So skb_clone would not cause a refcount increase, as the skb
does not have an extension (anymore) by that time.

If it's preserved because it has to be used again
after the cloning, then you are right, and we get the refcount_inc() cost.

As for the freeing, this currently has an atomic op:

static inline void __skb_ext_put(struct skb_ext *ext)
{
	if (ext && refcount_dec_and_test(&ext->refcnt))
		__skb_ext_free(ext);
}

static inline void skb_ext_put(struct sk_buff *skb)
{
	if (skb->active_extensions)
		__skb_ext_put(skb->extensions);
}

I think this could be avoided by using the same trick as the
fclones->fclone_ref one in kfree_skbmem():

static inline void skb_ext_put(struct sk_buff *skb)
{
	if (skb->active_extensions)
		__skb_ext_put(skb->extensions);
}

void __skb_ext_put(struct skb_ext *ext)
{
	if (refcount_read(&ext->refcnt) == 1)
		goto free_now;

	if (!refcount_dec_and_test(&ext->refcnt))
		return;
free_now:
#ifdef CONFIG_XFRM
        if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
		skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
#endif
	kfree(ext);
}

I think this could work: if the refcount is 1, then there is no clone, i.e.
nothing else can inc/dec the reference after the "== 1" check is true.

I will change it to above version.

> The clone is used in the lower layers (IP , qdisc, device xmit), so I do not see why a clone
> should share the extdata from the master.

For TCP, that's true.  But there are other places that could clone, e.g.
when a bridge has to flood-forward.

At least in the bridge case the 'preserve on clone' is needed, else required
information is missing from the cloned skb.
Eric Dumazet Dec. 13, 2018, 10:18 a.m. UTC | #17
On 12/13/2018 01:27 AM, Florian Westphal wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> [ CC Christoph, Mathew, Peter ]
> 
>>> If not, I will send another iteration that just allocates the entire
>>> extension space if first extension is added, it simplifies allocation
>>> handling a little.
>>>
>>
>> I am still unsure of the added extra costs, but for a start, TCP xmit clones
>> the skbs. 
> 
> Yes.
> 
>> Does it mean an additional pair of atomic operations if the skb comes from MPTCP enabled flow ?
> 
> Depends.
> 
> If its going to be used as I expect, then the extension could be
> discarded after the DSS mapping has been written to the tcp option
> space, i.e. before cloning occurs.

I do not see how this would work without also discarding the needed info
on the master skb.

> 
> So skb_clone would not cause a refcount increase, as the skb
> does not have extension (anymore) by that time.
> 
> If its presevered because it has to be used again
> after the cloning, then you are right, and we get refcont_inc() cost.
> 
> As for the freeing, this currently has an atomic op:
> 
> static inline void __skb_ext_put(struct skb_ext *ext)
> {
> 	if (ext && refcount_dec_and_test(&ext->refcnt))
> 		__skb_ext_free(ext);
> }
> 
> static inline void skb_ext_put(struct sk_buff *skb)
> {
> 	if (skb->active_extensions)
> 		__skb_ext_put(skb->extensions);
> }
> 
> I think this could be avoided by using same trick as
> fclones->fclone_ref one in kfree_skbmem():
> 
> static inline void skb_ext_put(struct sk_buff *skb)
> {
> 	if (skb->active_extensions)
> 		__skb_ext_put(skb->extensions);
> }
> 
> void __skb_ext_put(struct skb_ext *ext)
> {
> 	if (refcount_read(&ext->refcnt) == 1)
> 		goto free_now;
> 
> 	if (!refcount_dec_and_test(&ext->refcnt))
> 		return;
> free_now:
> #ifdef CONFIG_XFRM
>         if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
> 		skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
> #endif
> 	kfree(ext);
> }
> 
> I think this could work, if refcount is 1, then there is no clone, i.e.
> nothing else can inc/dec reference after the "== 1" check is true.
> 
> I will change it to above version.
> 
>> The clone is used in the lower layers (IP , qdisc, device xmit), so I do not see why a clone
>> should share the extdata from the master.
> 
> For TCP, thats true.  But there are other places that could clone, e.g.
> when bridge has to flood-forward.
> 

So you propose a mechanism that forces a preserve on clone, based on existing needs
for bridging.

> At least in bridge case the 'preseve on clone' is needed, else required
> information is missing from the cloned skb.
> 

We need something where MPTCP info does not need to be propagated all the way to the NIC...

This skb extension is an incentive for adding more sticky things in the skbs
to violate layering of networking stacks :/
Florian Westphal Dec. 13, 2018, 10:39 a.m. UTC | #18
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > If its going to be used as I expect, then the extension could be
> > discarded after the DSS mapping has been written to the tcp option
> > space, i.e. before cloning occurs.
> 
> I do not see how this would work, without also discarding on the master skb
> the needed info.

Ok, so let's assume for now that this would result in one atomic_inc/dec due
to cloning for skbs coming from an MPTCP socket.

But I don't see why this would have to be the case.

> > For TCP, thats true.  But there are other places that could clone, e.g.
> > when bridge has to flood-forward.
> > 
>
> So you propose a mechanism that forces a preserve on clone, base on existing needs
> for bridging.

secpath does the same thing:

static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
...
#ifdef CONFIG_XFRM
        new->sp                 = secpath_get(old->sp);
#endif
...

So I am not proposing anything new.

> > At least in bridge case the 'preseve on clone' is needed, else required
> > information is missing from the cloned skb.
> > 
> 
> We need something where MPTCP info does not need to be propagated all the way to the NIC...

That's what's done in the MPTCP out-of-tree implementation, but I don't
think it's needed.

It could just delete the extension before ->queue_xmit() AFAIU.

> This skb extension is an incentive for adding more sticky things in the skbs
> to violate layering of networking stacks :/

8-(

Where do you see "layering violations"?
Eric Dumazet Dec. 13, 2018, 10:58 a.m. UTC | #19
On 12/13/2018 02:39 AM, Florian Westphal wrote:
>
> Thats whats done in the MPTCP out-of-tree implementation, but I don't
> think its needed.
> 
> It could just delete the extension before ->queue_xmit() AFAIU.

So, cloning would do a refcount_inc(), and deleting the extension would do a refcount_dec_and_test()?

That is what I called an extra pair of atomic operations.
Florian Westphal Dec. 13, 2018, 11:03 a.m. UTC | #20
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> 
> On 12/13/2018 02:39 AM, Florian Westphal wrote:
> >
> > Thats whats done in the MPTCP out-of-tree implementation, but I don't
> > think its needed.
> > 
> > It could just delete the extension before ->queue_xmit() AFAIU.
> 
> So, cloning would do an refcount_inc(), and deleting the extension would do an refcount_dec_and_test() ?
> 
> That is what I called an extra pair of atomic operations.

If it replaces the current mptcp skb->private out-of-tree storage 1:1, then
yes.
Eric Dumazet Dec. 13, 2018, 11:16 a.m. UTC | #21
On 12/13/2018 03:03 AM, Florian Westphal wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>
>>
>> On 12/13/2018 02:39 AM, Florian Westphal wrote:
>>>
>>> Thats whats done in the MPTCP out-of-tree implementation, but I don't
>>> think its needed.
>>>
>>> It could just delete the extension before ->queue_xmit() AFAIU.
>>
>> So, cloning would do an refcount_inc(), and deleting the extension would do an refcount_dec_and_test() ?
>>
>> That is what I called an extra pair of atomic operations.
> 
> If it replaces 1:1 current mptcp skb->private out-of-tree storage, then
> yes.
> 

One day I will write a book on the number of atomic operations done on a TCP sendmsg() system call :/
Florian Westphal Dec. 13, 2018, 11:44 a.m. UTC | #22
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On 12/13/2018 03:03 AM, Florian Westphal wrote:
> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >> So, cloning would do an refcount_inc(), and deleting the extension would do an refcount_dec_and_test() ?
> >>
> >> That is what I called an extra pair of atomic operations.
> > 
> > If it replaces 1:1 current mptcp skb->private out-of-tree storage, then
> > yes.
> > 
> 
> One day I will write a book on the number of atomic operations done on a TCP sendmsg() system call :/

Just to clarify: the mptcp skb->private out-of-tree storage is maintained
using atomic ops too, so if it's converted 1:1 nothing changes in the
*mptcp tree*.

If it's possible to re-arrange the out-of-tree code so that it doesn't need
that (only used while the skb is not cloned), then the same applies to the
extension conversion.

So this is, as far as *this series* is concerned, a "future problem" that
may or may not exist.
Christoph Paasch Dec. 13, 2018, 5 p.m. UTC | #23
On 13/12/18 - 11:39:18, Florian Westphal wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > > If its going to be used as I expect, then the extension could be
> > > discarded after the DSS mapping has been written to the tcp option
> > > space, i.e. before cloning occurs.
> > 
> > I do not see how this would work, without also discarding on the master skb
> > the needed info.
> 
> Ok, so lets assume this would result in one atomic_inc/dec due to clone
> for now for skbs coming from mptcp socket.
> 
> But I don't see why this would have to be.
> 
> > > For TCP, thats true.  But there are other places that could clone, e.g.
> > > when bridge has to flood-forward.
> > > 
> >
> > So you propose a mechanism that forces a preserve on clone, base on existing needs
> > for bridging.
> 
> secpath does the same thing:
> 
> static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
> {
> ...
> #ifdef CONFIG_XFRM
>         new->sp                 = secpath_get(old->sp);
> #endif
> ...
> 
> So I am not proposing anything new.
> 
> > > At least in bridge case the 'preseve on clone' is needed, else required
> > > information is missing from the cloned skb.
> > > 
> > 
> > We need something where MPTCP info does not need to be propagated all the way to the NIC...
> 
> Thats whats done in the MPTCP out-of-tree implementation, but I don't
> think its needed.

Yes, it indeed does not need to go all the way down to the NIC.

The info basically "just" needs to be propagated from the MPTCP-layer down
to the TCP-option space. Thus, it needs to remain on the skbs that are
sitting in the TCP-subflow's send-queue and rexmit tree as we need it when
retransmitting.

In tcp_transmit_skb, the clone is done at the beginning. Thus, we could for
example not inc the refcount on the clone and simply pass a pointer to the
original skb to tcp_established_options.

That way the DSS option stays within the MPTCP/TCP layer and does not
make it down to the NIC.


Christoph

> 
> It could just delete the extension before ->queue_xmit() AFAIU.
> 
> > This skb extension is an incentive for adding more sticky things in the skbs
> > to violate layering of networking stacks :/
> 
> 8-(
> 
> Where do you see "layering violations"?

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b1831a5ca173..d715736eb734 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -245,6 +245,7 @@  struct iov_iter;
 struct napi_struct;
 struct bpf_prog;
 union bpf_attr;
+struct skb_ext;
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 struct nf_conntrack {
@@ -636,6 +637,7 @@  typedef unsigned char *sk_buff_data_t;
  *	@queue_mapping: Queue mapping for multiqueue devices
  *	@xmit_more: More SKBs are pending for this queue
  *	@pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
+ *	@active_extensions: active extensions (skb_ext_id types)
  *	@ndisc_nodetype: router type (from link layer)
  *	@ooo_okay: allow the mapping of a socket to a queue to be changed
  *	@l4_hash: indicate hash is a canonical 4-tuple hash over transport
@@ -665,6 +667,7 @@  typedef unsigned char *sk_buff_data_t;
  *	@data: Data head pointer
  *	@truesize: Buffer size
  *	@users: User count - see {datagram,tcp}.c
+ *	@extensions: allocated extensions, valid if active_extensions is nonzero
  */
 
 struct sk_buff {
@@ -747,7 +750,9 @@  struct sk_buff {
 				head_frag:1,
 				xmit_more:1,
 				pfmemalloc:1;
-
+#ifdef CONFIG_SKB_EXTENSIONS
+	__u8			active_extensions;
+#endif
 	/* fields enclosed in headers_start/headers_end are copied
 	 * using a single memcpy() in __copy_skb_header()
 	 */
@@ -869,6 +874,11 @@  struct sk_buff {
 				*data;
 	unsigned int		truesize;
 	refcount_t		users;
+
+#ifdef CONFIG_SKB_EXTENSIONS
+	/* only useable after checking ->active_extensions != 0 */
+	struct skb_ext		*extensions;
+#endif
 };
 
 #ifdef __KERNEL__
@@ -3896,6 +3906,113 @@  static inline void nf_conntrack_get(struct nf_conntrack *nfct)
 		atomic_inc(&nfct->use);
 }
 #endif
+
+#ifdef CONFIG_SKB_EXTENSIONS
+enum skb_ext_id {
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+	SKB_EXT_BRIDGE_NF,
+#endif
+	SKB_EXT_NUM, /* must be last */
+};
+
+/**
+ *	struct skb_ext - sk_buff extensions
+ *	@refcnt: 1 on allocation, deallocated on 0
+ *	@offset: offset to add to @data to obtain extension address
+ *	@chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units
+ *	@data: start of extension data, variable sized
+ *
+ *	Note: offsets/lengths are stored in chunks of 8 bytes, this allows
+ *	to use 'u8' types while allowing up to 2kb worth of extension data.
+ */
+struct skb_ext {
+	refcount_t refcnt;
+	u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */
+	u8 chunks;		/* same */
+	char data[0] __aligned(8);
+};
+
+void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
+void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id);
+void __skb_ext_free(struct skb_ext *ext);
+
+static inline void __skb_ext_put(struct skb_ext *ext)
+{
+	if (ext && refcount_dec_and_test(&ext->refcnt))
+		__skb_ext_free(ext);
+}
+
+static inline void skb_ext_put(struct sk_buff *skb)
+{
+	if (skb->active_extensions)
+		__skb_ext_put(skb->extensions);
+}
+
+static inline void skb_ext_get(struct sk_buff *skb)
+{
+	if (skb->active_extensions) {
+		struct skb_ext *ext = skb->extensions;
+
+		if (ext)
+			refcount_inc(&ext->refcnt);
+	}
+}
+
+static inline void __skb_ext_copy(struct sk_buff *dst,
+				  const struct sk_buff *src)
+{
+	dst->active_extensions = src->active_extensions;
+
+	if (src->active_extensions) {
+		struct skb_ext *ext = src->extensions;
+
+		if (ext)
+			refcount_inc(&ext->refcnt);
+		dst->extensions = ext;
+	}
+}
+
+static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src)
+{
+	skb_ext_put(dst);
+	__skb_ext_copy(dst, src);
+}
+
+static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i)
+{
+	return !!ext->offset[i];
+}
+
+static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id)
+{
+	return skb->active_extensions & (1 << id);
+}
+
+static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
+{
+	if (skb_ext_exist(skb, id))
+		__skb_ext_del(skb, id);
+}
+
+static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id)
+{
+	if (skb_ext_exist(skb, id)) {
+		struct skb_ext *ext = skb->extensions;
+
+		if (ext && __skb_ext_exist(ext, id))
+			return (void *)ext + (ext->offset[id] << 3);
+	}
+
+	return NULL;
+}
+#else
+static inline void skb_ext_put(struct sk_buff *skb) {}
+static inline void skb_ext_get(struct sk_buff *skb) {}
+static inline void skb_ext_del(struct sk_buff *skb, int unused) {}
+static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
+static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
+#endif /* CONFIG_SKB_EXTENSIONS */
+
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge)
 {
diff --git a/net/Kconfig b/net/Kconfig
index f235edb593ba..93b291292860 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -51,6 +51,9 @@  config NET_INGRESS
 config NET_EGRESS
 	bool
 
+config SKB_EXTENSIONS
+	bool
+
 menu "Networking options"
 
 source "net/packet/Kconfig"
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 40552547c69a..e36461d03d4c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -617,6 +617,7 @@  void skb_release_head_state(struct sk_buff *skb)
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	nf_bridge_put(skb->nf_bridge);
 #endif
+	skb_ext_put(skb);
 }
 
 /* Free everything but the sk_buff shell. */
@@ -796,6 +797,7 @@  static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->dev		= old->dev;
 	memcpy(new->cb, old->cb, sizeof(old->cb));
 	skb_dst_copy(new, old);
+	__skb_ext_copy(new, old);
 #ifdef CONFIG_XFRM
 	new->sp			= secpath_get(old->sp);
 #endif
@@ -5554,3 +5556,134 @@  void skb_condense(struct sk_buff *skb)
 	 */
 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 }
+
+#ifdef CONFIG_SKB_EXTENSIONS
+#define SKB_EXT_ALIGN_VALUE	8
+#define SKB_EXT_CHUNKSIZEOF(x)	(ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)
+
+static const u8 skb_ext_type_len[] = {
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+	[SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
+#endif
+};
+
+static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
+{
+	return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
+}
+
+static struct skb_ext *skb_ext_cow(unsigned int len,
+				   struct skb_ext *old)
+{
+	struct skb_ext *new = kmalloc(len, GFP_ATOMIC);
+
+	if (!new)
+		return NULL;
+
+	if (!old) {
+		memset(new->offset, 0, sizeof(new->offset));
+		refcount_set(&new->refcnt, 1);
+		return new;
+	}
+
+	memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
+	refcount_set(&new->refcnt, 1);
+	__skb_ext_put(old);
+	return new;
+}
+
+static __always_inline unsigned int skb_ext_total_length(void)
+{
+	return 0 +
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+		skb_ext_type_len[SKB_EXT_BRIDGE_NF] +
+#endif
+		0;
+}
+
+/**
+ * skb_ext_add - allocate space for given extension, COW if needed
+ * @skb: buffer
+ * @id: extension to allocate space for
+ *
+ * Allocates enough space for the given extension.
+ * If the extension is already present, a pointer to that extension
+ * is returned.
+ *
+ * If the skb was cloned, COW applies and the returned memory can be
+ * modified without changing the extension space of cloned buffers.
+ *
+ * Returns pointer to the extension or NULL on allocation failure.
+ */
+void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
+{
+	struct skb_ext *new, *old = NULL;
+	unsigned int newlen, newoff;
+	bool cow_needed = true;
+
+	BUILD_BUG_ON(SKB_EXT_NUM >= 8);
+	BUILD_BUG_ON(skb_ext_total_length() > 255);
+
+	if (skb->active_extensions) {
+		old = skb->extensions;
+
+		cow_needed = refcount_read(&old->refcnt) > 1;
+
+		if (__skb_ext_exist(old, id)) {
+			if (!cow_needed) {
+				new = old;
+				goto set_active;
+			}
+
+			/* extension was allocated previously and it
+			 * might be used by a cloned skb. COW needed.
+			 */
+			new = skb_ext_cow(old->chunks * SKB_EXT_ALIGN_VALUE, old);
+			if (!new)
+				return NULL;
+
+			skb->extensions = new;
+			goto set_active;
+		}
+		newoff = old->chunks;
+	} else {
+		newoff = SKB_EXT_CHUNKSIZEOF(*new);
+	}
+
+	newlen = newoff + skb_ext_type_len[id];
+
+	if (cow_needed)
+		new = skb_ext_cow(newlen * SKB_EXT_ALIGN_VALUE, old);
+	else
+		new = krealloc(old, newlen * SKB_EXT_ALIGN_VALUE, GFP_ATOMIC);
+	if (!new)
+		return NULL;
+
+	new->offset[id] = newoff;
+	new->chunks = newlen;
+	skb->extensions = new;
+set_active:
+	skb->active_extensions |= 1 << id;
+	return skb_ext_get_ptr(new, id);
+}
+EXPORT_SYMBOL(skb_ext_add);
+
+void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
+{
+	struct skb_ext *ext;
+
+	skb->active_extensions &= ~(1 << id);
+	if (skb->active_extensions == 0) {
+		ext = skb->extensions;
+		skb->extensions = NULL;
+		__skb_ext_put(ext);
+	}
+}
+EXPORT_SYMBOL(__skb_ext_del);
+
+void __skb_ext_free(struct skb_ext *ext)
+{
+	kfree(ext);
+}
+EXPORT_SYMBOL(__skb_ext_free);
+#endif /* CONFIG_SKB_EXTENSIONS */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ab6618036afe..c80188875f39 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -533,6 +533,7 @@  static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->tc_index = from->tc_index;
 #endif
 	nf_copy(to, from);
+	skb_ext_copy(to, from);
 #if IS_ENABLED(CONFIG_IP_VS)
 	to->ipvs_property = from->ipvs_property;
 #endif
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 9d55ee33b7f9..703a8e801c5c 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -581,6 +581,7 @@  static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->tc_index = from->tc_index;
 #endif
 	nf_copy(to, from);
+	skb_ext_copy(to, from);
 	skb_copy_secmark(to, from);
 }