diff mbox

[net-next] net: adjust skb->truesize in pskb_expand_head()

Message ID 1485476480.5145.194.camel@edumazet-glaptop3.roam.corp.google.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Eric Dumazet Jan. 27, 2017, 12:21 a.m. UTC
From: Eric Dumazet <edumazet@google.com>

Slava Shwartsman reported a warning in skb_try_coalesce(), when we
detect skb->truesize is completely wrong.

In his case, issue came from IPv6 reassembly coping with malicious
datagrams, that forced various pskb_may_pull() to reallocate a bigger
skb->head than the one allocated by NIC driver before entering GRO
layer.

Current code does not change skb->truesize, leaving this burden to
callers if they care enough.

Blindly changing skb->truesize in pskb_expand_head() is not
easy, as some producers might track skb->truesize, for example
in xmit path for back pressure feedback (sk->sk_wmem_alloc)

We can detect the cases where it should be safe to change
skb->truesize :

1) skb is not attached to a socket.
2) If it is attached to a socket, destructor is sock_edemux()

My audit gave only two callers doing their own skb->truesize
manipulation.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Slava Shwartsman <slavash@mellanox.com>
---
 net/core/skbuff.c        |   14 +++++++++++---
 net/netlink/af_netlink.c |    8 +++-----
 net/wireless/util.c      |    2 --
 3 files changed, 14 insertions(+), 10 deletions(-)

Comments

kernel test robot Jan. 27, 2017, 2:22 a.m. UTC | #1
Hi Eric,

[auto build test WARNING on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Eric-Dumazet/net-adjust-skb-truesize-in-pskb_expand_head/20170127-082517
config: i386-randconfig-x0-01270914 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All warnings (new ones prefixed by >>):

   In file included from include/uapi/linux/stddef.h:1:0,
                    from include/linux/stddef.h:4,
                    from include/uapi/linux/posix_types.h:4,
                    from include/uapi/linux/types.h:13,
                    from include/linux/types.h:5,
                    from include/linux/list.h:4,
                    from include/linux/module.h:9,
                    from net/core/skbuff.c:41:
   net/core/skbuff.c: In function 'pskb_expand_head':
   net/core/skbuff.c:1265:37: error: 'sock_edemux' undeclared (first use in this function)
     if (!skb->sk || skb->destructor == sock_edemux)
                                        ^
   include/linux/compiler.h:149:30: note: in definition of macro '__trace_if'
     if (__builtin_constant_p(!!(cond)) ? !!(cond) :   \
                                 ^~~~
>> net/core/skbuff.c:1265:2: note: in expansion of macro 'if'
     if (!skb->sk || skb->destructor == sock_edemux)
     ^~
   net/core/skbuff.c:1265:37: note: each undeclared identifier is reported only once for each function it appears in
     if (!skb->sk || skb->destructor == sock_edemux)
                                        ^
   include/linux/compiler.h:149:30: note: in definition of macro '__trace_if'
     if (__builtin_constant_p(!!(cond)) ? !!(cond) :   \
                                 ^~~~
>> net/core/skbuff.c:1265:2: note: in expansion of macro 'if'
     if (!skb->sk || skb->destructor == sock_edemux)
     ^~

vim +/if +1265 net/core/skbuff.c

  1249		skb->end      = size;
  1250		off           = nhead;
  1251	#else
  1252		skb->end      = skb->head + size;
  1253	#endif
  1254		skb->tail	      += off;
  1255		skb_headers_offset_update(skb, nhead);
  1256		skb->cloned   = 0;
  1257		skb->hdr_len  = 0;
  1258		skb->nohdr    = 0;
  1259		atomic_set(&skb_shinfo(skb)->dataref, 1);
  1260	
  1261		/* It is not generally safe to change skb->truesize.
  1262		 * For the moment, we really care of rx path, or
  1263		 * when skb is orphaned (not attached to a socket)
  1264		 */
> 1265		if (!skb->sk || skb->destructor == sock_edemux)
  1266			skb->truesize += size - osize;
  1267	
  1268		return 0;
  1269	
  1270	nofrags:
  1271		kfree(data);
  1272	nodata:
  1273		return -ENOMEM;

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
David Laight Jan. 27, 2017, 10:49 a.m. UTC | #2
From: Eric Dumazet

> Sent: 27 January 2017 00:21

> Slava Shwartsman reported a warning in skb_try_coalesce(), when we

> detect skb->truesize is completely wrong.

> 

> In his case, issue came from IPv6 reassembly coping with malicious

> datagrams, that forced various pskb_may_pull() to reallocate a bigger

> skb->head than the one allocated by NIC driver before entering GRO

> layer.

> 

> Current code does not change skb->truesize, leaving this burden to

> callers if they care enough.

> 

> Blindly changing skb->truesize in pskb_expand_head() is not

> easy, as some producers might track skb->truesize, for example

> in xmit path for back pressure feedback (sk->sk_wmem_alloc)

> 

> We can detect the cases where it should be safe to change

> skb->truesize :

> 

> 1) skb is not attached to a socket.

> 2) If it is attached to a socket, destructor is sock_edemux()

...
>  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

>  		     gfp_t gfp_mask)

>  {

> +	int i, osize = skb_end_offset(skb);

> +	int size = osize + nhead + ntail;

>  	long off;

> +	u8 *data;

> 

>  	BUG_ON(nhead < 0);

> 

> @@ -1257,6 +1257,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

>  	skb->hdr_len  = 0;

>  	skb->nohdr    = 0;

>  	atomic_set(&skb_shinfo(skb)->dataref, 1);

> +

> +	/* It is not generally safe to change skb->truesize.

> +	 * For the moment, we really care of rx path, or

> +	 * when skb is orphaned (not attached to a socket)

> +	 */

> +	if (!skb->sk || skb->destructor == sock_edemux)

> +		skb->truesize += size - osize;


That calculation doesn't look right to me at all.
Isn't 'truesize' supposed to reflect the amount of memory allocated to the skb.
So you need the difference between the size of the new and old memory blocks.

I'm also guessing that extra headroom can be generated by stealing unused tailroom.

	David
Eric Dumazet Jan. 27, 2017, 2:44 p.m. UTC | #3
On Fri, 2017-01-27 at 10:49 +0000, David Laight wrote:
> From: Eric Dumazet
> > Sent: 27 January 2017 00:21
> > Slava Shwartsman reported a warning in skb_try_coalesce(), when we
> > detect skb->truesize is completely wrong.
> > 
> > In his case, issue came from IPv6 reassembly coping with malicious
> > datagrams, that forced various pskb_may_pull() to reallocate a bigger
> > skb->head than the one allocated by NIC driver before entering GRO
> > layer.
> > 
> > Current code does not change skb->truesize, leaving this burden to
> > callers if they care enough.
> > 
> > Blindly changing skb->truesize in pskb_expand_head() is not
> > easy, as some producers might track skb->truesize, for example
> > in xmit path for back pressure feedback (sk->sk_wmem_alloc)
> > 
> > We can detect the cases where it should be safe to change
> > skb->truesize :
> > 
> > 1) skb is not attached to a socket.
> > 2) If it is attached to a socket, destructor is sock_edemux()
> ...
> >  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> >  		     gfp_t gfp_mask)
> >  {
> > +	int i, osize = skb_end_offset(skb);
> > +	int size = osize + nhead + ntail;
> >  	long off;
> > +	u8 *data;
> > 
> >  	BUG_ON(nhead < 0);
> > 
> > @@ -1257,6 +1257,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
> >  	skb->hdr_len  = 0;
> >  	skb->nohdr    = 0;
> >  	atomic_set(&skb_shinfo(skb)->dataref, 1);
> > +
> > +	/* It is not generally safe to change skb->truesize.
> > +	 * For the moment, we really care of rx path, or
> > +	 * when skb is orphaned (not attached to a socket)
> > +	 */
> > +	if (!skb->sk || skb->destructor == sock_edemux)
> > +		skb->truesize += size - osize;
> 
> That calculation doesn't look right to me at all.
> Isn't 'truesize' supposed to reflect the amount of memory allocated to the skb.
> So you need the difference between the size of the new and old memory blocks.
> 

Well, please take a look at the code, because I believe I did exactly
that.

> I'm also guessing that extra headroom can be generated by stealing unused tailroom.

This is already done.

Quoting
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=87fb4b7b533073eeeaed0b6bf7c2328995f6c075

    At skb alloc phase, we put skb_shared_info struct at the exact end of
    skb head, to allow a better use of memory (lowering number of
    reallocations), since kmalloc() gives us power-of-two memory blocks.
David Laight Jan. 27, 2017, 3:46 p.m. UTC | #4
From: Eric Dumazet [mailto:eric.dumazet@gmail.com]

> Sent: 27 January 2017 14:44

> On Fri, 2017-01-27 at 10:49 +0000, David Laight wrote:

> > From: Eric Dumazet

> > > Sent: 27 January 2017 00:21

> > > Slava Shwartsman reported a warning in skb_try_coalesce(), when we

> > > detect skb->truesize is completely wrong.

> > >

> > > In his case, issue came from IPv6 reassembly coping with malicious

> > > datagrams, that forced various pskb_may_pull() to reallocate a bigger

> > > skb->head than the one allocated by NIC driver before entering GRO

> > > layer.

> > >

> > > Current code does not change skb->truesize, leaving this burden to

> > > callers if they care enough.

> > >

> > > Blindly changing skb->truesize in pskb_expand_head() is not

> > > easy, as some producers might track skb->truesize, for example

> > > in xmit path for back pressure feedback (sk->sk_wmem_alloc)

> > >

> > > We can detect the cases where it should be safe to change

> > > skb->truesize :

> > >

> > > 1) skb is not attached to a socket.

> > > 2) If it is attached to a socket, destructor is sock_edemux()

> > ...

> > >  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

> > >  		     gfp_t gfp_mask)

> > >  {

> > > +	int i, osize = skb_end_offset(skb);

> > > +	int size = osize + nhead + ntail;

> > >  	long off;

> > > +	u8 *data;

> > >

> > >  	BUG_ON(nhead < 0);

> > >

> > > @@ -1257,6 +1257,14 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,

> > >  	skb->hdr_len  = 0;

> > >  	skb->nohdr    = 0;

> > >  	atomic_set(&skb_shinfo(skb)->dataref, 1);

> > > +

> > > +	/* It is not generally safe to change skb->truesize.

> > > +	 * For the moment, we really care of rx path, or

> > > +	 * when skb is orphaned (not attached to a socket)

> > > +	 */

> > > +	if (!skb->sk || skb->destructor == sock_edemux)

> > > +		skb->truesize += size - osize;

> >

> > That calculation doesn't look right to me at all.

> > Isn't 'truesize' supposed to reflect the amount of memory allocated to the skb.

> > So you need the difference between the size of the new and old memory blocks.

> >

> 

> Well, please take a look at the code, because I believe I did exactly

> that.


Reads code ...
My confusion is that the call is specifying the number of EXTRA bytes of head/tail
room rather than the number of bytes needed.

> > I'm also guessing that extra headroom can be generated by stealing unused tailroom.

> 

> This is already done.

> 

> Quoting

> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=87fb4b7b533073eeeaed0b6bf7c

> 2328995f6c075

> 

>     At skb alloc phase, we put skb_shared_info struct at the exact end of

>     skb head, to allow a better use of memory (lowering number of

>     reallocations), since kmalloc() gives us power-of-two memory blocks.


It looks as though that just makes all the 'spare' space tailroom.
My guess is that headroom is needed more often than tailroom.
It doesn't look as though the amount of tailroom that has actually been requested
is saved either (nor headroom for that matter).

I was thinking that pskb_expand_head(skb, 16, -16, ...) could be implemented
(mostly) with memmove().

Hmmm.... Also if a caller asks for 3 extra bytes of headroom you really want to
allocate 8 extra bytes so that the memcpy() is aligned.

	David
Eric Dumazet Jan. 27, 2017, 4:14 p.m. UTC | #5
On Fri, 2017-01-27 at 15:46 +0000, David Laight wrote:

> Reads code ...
> My confusion is that the call is specifying the number of EXTRA bytes of head/tail
> room rather than the number of bytes needed.

And the fact that @size is changed in existing code (so not visible in
patch diff) to

size = SKB_WITH_OVERHEAD(ksize(data));


> I was thinking that pskb_expand_head(skb, 16, -16, ...) could be implemented
> (mostly) with memmove().

Yes, might worth doing that if one day a caller tries that ;)

In the meantime, not worth adding code that wont be reached.

Thanks.
David Laight Jan. 27, 2017, 5:24 p.m. UTC | #6
From: Eric Dumazet

> Sent: 27 January 2017 14:44

...
> > I'm also guessing that extra headroom can be generated by stealing unused tailroom.

> 

> This is already done.

> 

> Quoting

> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=87fb4b7b533073eeeaed0b6bf7c

> 2328995f6c075

> 

>     At skb alloc phase, we put skb_shared_info struct at the exact end of

>     skb head, to allow a better use of memory (lowering number of

>     reallocations), since kmalloc() gives us power-of-two memory blocks.


Does that actually have the expected effect?

Allocate an skb for 512 bytes, copy in some data with 64 bytes of headroom.
This is (probably) a 1k memory block with skb_shared_info at the end.

Some code needs to add a header that doesn't fit, calls pskb_expand_head()
to get another 4 bytes.
Since the existing amount of 'tailroom' must be kept kmalloc(1024+4) is called.
This allocates a 2k memory block, again skb_shared_info is put at the end.
So the headroom has been increased by 4 bytes and the tailroom by 1020.

Another layer needs to add another header.
The memory block becomes 4k large.

What have I missed?

	David
Eric Dumazet Jan. 27, 2017, 6:16 p.m. UTC | #7
On Fri, 2017-01-27 at 17:24 +0000, David Laight wrote:
> From: Eric Dumazet
> > Sent: 27 January 2017 14:44
> ...
> > > I'm also guessing that extra headroom can be generated by stealing unused tailroom.
> > 
> > This is already done.
> > 
> > Quoting
> > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=87fb4b7b533073eeeaed0b6bf7c
> > 2328995f6c075
> > 
> >     At skb alloc phase, we put skb_shared_info struct at the exact end of
> >     skb head, to allow a better use of memory (lowering number of
> >     reallocations), since kmalloc() gives us power-of-two memory blocks.
> 
> Does that actually have the expected effect?
> 
> Allocate an skb for 512 bytes, copy in some data with 64 bytes of headroom.
> This is (probably) a 1k memory block with skb_shared_info at the end.
> 
> Some code needs to add a header that doesn't fit, calls pskb_expand_head()
> to get another 4 bytes.
> Since the existing amount of 'tailroom' must be kept kmalloc(1024+4) is called.
> This allocates a 2k memory block, again skb_shared_info is put at the end.
> So the headroom has been increased by 4 bytes and the tailroom by 1020.
> 
> Another layer needs to add another header.
> The memory block becomes 4k large.
> 
> What have I missed?

We try hard to pre-allocate enough headroom.

Because copies are expensive.

Look for MAX_HEADER, MAX_TCP_HEADER, and things like that.

For the tail, we add 128 bytes of extra tail when __pskb_pull_tail()
wants to expand skb->head.

If you believe you found a use case where we do stupid reallocations,
please fix the caller.

pskb_expand_head() should never be called, really.

Only for some pathological/malicious traffic we have to, eventually.
diff mbox

Patch

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f8dbe4a7ab46a9196c6683ce5c9c14d3d99d..6cd59da7ec583260748b9c45b99a824bcc61 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1192,10 +1192,10 @@  EXPORT_SYMBOL(__pskb_copy_fclone);
 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		     gfp_t gfp_mask)
 {
-	int i;
-	u8 *data;
-	int size = nhead + skb_end_offset(skb) + ntail;
+	int i, osize = skb_end_offset(skb);
+	int size = osize + nhead + ntail;
 	long off;
+	u8 *data;
 
 	BUG_ON(nhead < 0);
 
@@ -1257,6 +1257,14 @@  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	skb->hdr_len  = 0;
 	skb->nohdr    = 0;
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
+
+	/* It is not generally safe to change skb->truesize.
+	 * For the moment, we really care of rx path, or
+	 * when skb is orphaned (not attached to a socket)
+	 */
+	if (!skb->sk || skb->destructor == sock_edemux)
+		skb->truesize += size - osize;
+
 	return 0;
 
 nofrags:
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index edcc1e19ad532641f51f6809b8c90d1e3770..7b73c7c161a9680b8691a712c31073b77896 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1210,11 +1210,9 @@  static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
 		skb = nskb;
 	}
 
-	if (!pskb_expand_head(skb, 0, -delta,
-			      (allocation & ~__GFP_DIRECT_RECLAIM) |
-			      __GFP_NOWARN | __GFP_NORETRY))
-		skb->truesize -= delta;
-
+	pskb_expand_head(skb, 0, -delta,
+			 (allocation & ~__GFP_DIRECT_RECLAIM) |
+			 __GFP_NOWARN | __GFP_NORETRY);
 	return skb;
 }
 
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 1b9296882dcd6a0b585dfd604a30807e7f26..68e5f2ecee1aa22f17ab9a55eb566124e585 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -618,8 +618,6 @@  int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
 
 		if (pskb_expand_head(skb, head_need, 0, GFP_ATOMIC))
 			return -ENOMEM;
-
-		skb->truesize += head_need;
 	}
 
 	if (encaps_data) {