diff mbox

[2/2] tcp: cleanup tcp_try_coalesce

Message ID 20120503033901.5482.27183.stgit@gitlad.jf.intel.com
State Rejected, archived
Delegated to: David Miller
Headers show

Commit Message

Duyck, Alexander H May 3, 2012, 3:39 a.m. UTC
This change is mostly meant to help improve the readability of
tcp_try_coalesce.  I had a few issues since there were several points when
the code would test for a conditional, fail, then succeed on another
conditional take some action, and then follow a goto back into the previous
conditional.  I just tore all of that apart and made the whole thing one
linear flow with a single goto.

Also there were multiple ways of computing the delta, the one for head_frag
made the least amount of sense to me since we were only dropping the
sk_buff so I have updated the logic for the stolen head case so that delta
is only truesize - sizeof(skb_buff), and for the case where we are dropping
the head as well it is truesize - SKB_TRUESIZE(skb_end_pointer - head).
This way we can also account for the head_frag with headlen == 0.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---

 net/ipv4/tcp_input.c |   80 +++++++++++++++++++++++++++-----------------------
 1 files changed, 43 insertions(+), 37 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Eric Dumazet May 3, 2012, 4:06 a.m. UTC | #1
On Wed, 2012-05-02 at 20:39 -0700, Alexander Duyck wrote:
> This change is mostly meant to help improve the readability of
> tcp_try_coalesce.  I had a few issues since there were several points when
> the code would test for a conditional, fail, then succeed on another
> conditional take some action, and then follow a goto back into the previous
> conditional.  I just tore all of that apart and made the whole thing one
> linear flow with a single goto.
> 
> Also there were multiple ways of computing the delta, the one for head_frag
> made the least amount of sense to me since we were only dropping the
> sk_buff so I have updated the logic for the stolen head case so that delta
> is only truesize - sizeof(skb_buff), and for the case where we are dropping
> the head as well it is truesize - SKB_TRUESIZE(skb_end_pointer - head).
> This way we can also account for the head_frag with headlen == 0.
> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
> ---
> 

Sorry I prefer you dont touch this code like this.

The truesize bits must stay as is, since it'll track drivers that lies
about truesize.

>  net/ipv4/tcp_input.c |   80 +++++++++++++++++++++++++++-----------------------
>  1 files changed, 43 insertions(+), 37 deletions(-)
> 
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index c6f78e2..23bc3ff 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4548,62 +4548,68 @@ static bool tcp_try_coalesce(struct sock *sk,
>  	int i, delta, len = from->len;
>  
>  	*fragstolen = false;
> +
>  	if (tcp_hdr(from)->fin || skb_cloned(to))
>  		return false;
> +
>  	if (len <= skb_tailroom(to)) {
>  		BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
> -merge:
> -		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
> -		TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
> -		TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
> -		return true;
> +		goto merge;
>  	}
>  
>  	if (skb_has_frag_list(to) || skb_has_frag_list(from))
>  		return false;
>  
> -	if (skb_headlen(from) == 0 &&
> -	    (skb_shinfo(to)->nr_frags +
> -	     skb_shinfo(from)->nr_frags <= MAX_SKB_FRAGS)) {
> -		WARN_ON_ONCE(from->head_frag);
> -		delta = from->truesize - ksize(from->head) -
> -			SKB_DATA_ALIGN(sizeof(struct sk_buff));


This delta was done on purpose. We want the ksize()

> -
> -		WARN_ON_ONCE(delta < len);
> -copyfrags:
> -		memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
> -		       skb_shinfo(from)->frags,
> -		       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
> -		skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
> -
> -		if (skb_cloned(from))
> -			for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
> -				skb_frag_ref(from, i);
> -		else
> -			skb_shinfo(from)->nr_frags = 0;
> -
> -		to->truesize += delta;
> -		atomic_add(delta, &sk->sk_rmem_alloc);
> -		sk_mem_charge(sk, delta);
> -		to->len += len;
> -		to->data_len += len;
> -		goto merge;
> -	}
> -	if (from->head_frag && !skb_cloned(from)) {
> +	if (skb_headlen(from) != 0) {
>  		struct page *page;
>  		unsigned int offset;
>  
> -		if (skb_shinfo(to)->nr_frags + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
> +		if (skb_shinfo(to)->nr_frags +
> +		    skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
> +			return false;
> +
> +		if (!from->head_frag || skb_cloned(from))
>  			return false;
> +
> +		delta = from->truesize - sizeof(struct sk_buff);



> +
>  		page = virt_to_head_page(from->head);
>  		offset = from->data - (unsigned char *)page_address(page);
> +
>  		skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
>  				   page, offset, skb_headlen(from));
>  		*fragstolen = true;
> -		delta = len; /* we dont know real truesize... */
> -		goto copyfrags;
> +	} else {
> +		if (skb_shinfo(to)->nr_frags +
> +		    skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
> +			return false;
> +
> +		delta = from->truesize -
> +			SKB_TRUESIZE(skb_end_pointer(from) - from->head);

No... SKB_TRUESIZE() doesnt account of power-of-two roundings in
kmalloc()

>  	}
> -	return false;
> +
> +	memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
> +	       skb_shinfo(from)->frags,
> +	       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
> +	skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
> +
> +	if (!skb_cloned(from))
> +		skb_shinfo(from)->nr_frags = 0;
> +

You break the code here... we had an else.

> +	for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
> +		skb_frag_ref(from, i);
> +
> +	to->truesize += delta;
> +	atomic_add(delta, &sk->sk_rmem_alloc);
> +	sk_mem_charge(sk, delta);
> +	to->len += len;
> +	to->data_len += len;
> +
> +merge:
> +	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
> +	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
> +	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
> +	return true;
>  }
>  
>  static void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
> 

Really this patch is too hard to review.

Thanks


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander H Duyck May 3, 2012, 4:58 a.m. UTC | #2
On 5/2/2012 9:06 PM, Eric Dumazet wrote:
> On Wed, 2012-05-02 at 20:39 -0700, Alexander Duyck wrote:
>> This change is mostly meant to help improve the readability of
>> tcp_try_coalesce.  I had a few issues since there were several points when
>> the code would test for a conditional, fail, then succeed on another
>> conditional take some action, and then follow a goto back into the previous
>> conditional.  I just tore all of that apart and made the whole thing one
>> linear flow with a single goto.
>>
>> Also there were multiple ways of computing the delta, the one for head_frag
>> made the least amount of sense to me since we were only dropping the
>> sk_buff so I have updated the logic for the stolen head case so that delta
>> is only truesize - sizeof(skb_buff), and for the case where we are dropping
>> the head as well it is truesize - SKB_TRUESIZE(skb_end_pointer - head).
>> This way we can also account for the head_frag with headlen == 0.
>>
>> Signed-off-by: Alexander Duyck<alexander.h.duyck@intel.com>
>> Cc: Eric Dumazet<edumazet@google.com>
>> Cc: Jeff Kirsher<jeffrey.t.kirsher@intel.com>
>> ---
>>
> Sorry I prefer you dont touch this code like this.
>
> The truesize bits must stay as is, since it'll track drivers that lies
> about truesize.

I can understand that.  But from what I can tell the only way they can 
lie about truesize is to actually change the value itself.  The code I 
modified was just tightening things up so we didn't do things like use 
the length to track the truesize like we were in the original code.  
 From what I can tell the new code should have been more accurate.

>>   net/ipv4/tcp_input.c |   80 +++++++++++++++++++++++++++-----------------------
>>   1 files changed, 43 insertions(+), 37 deletions(-)
>>
>> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
>> index c6f78e2..23bc3ff 100644
>> --- a/net/ipv4/tcp_input.c
>> +++ b/net/ipv4/tcp_input.c
>> @@ -4548,62 +4548,68 @@ static bool tcp_try_coalesce(struct sock *sk,
>>   	int i, delta, len = from->len;
>>
>>   	*fragstolen = false;
>> +
>>   	if (tcp_hdr(from)->fin || skb_cloned(to))
>>   		return false;
>> +
>>   	if (len<= skb_tailroom(to)) {
>>   		BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
>> -merge:
>> -		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
>> -		TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
>> -		TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
>> -		return true;
>> +		goto merge;
>>   	}
>>
>>   	if (skb_has_frag_list(to) || skb_has_frag_list(from))
>>   		return false;
>>
>> -	if (skb_headlen(from) == 0&&
>> -	    (skb_shinfo(to)->nr_frags +
>> -	     skb_shinfo(from)->nr_frags<= MAX_SKB_FRAGS)) {
>> -		WARN_ON_ONCE(from->head_frag);
>> -		delta = from->truesize - ksize(from->head) -
>> -			SKB_DATA_ALIGN(sizeof(struct sk_buff));
>
> This delta was done on purpose. We want the ksize()
The question I have is how can you get into a case where the ksize is 
different from the end offset plus the aligned size of skb_shared_info?  
From what I can tell it looks like the only place we can lie is if we 
use build_skb with the frag_size option, and in that case we are using a 
page, not kmalloc memory.  Otherwise in all other cases __alloc_skb or 
build_skb is using ksize(skb->head) - SKB_DATA_ALIGN(struct 
skb_shared_info) to set the end pointer, so reversing that should give 
us the same value as ksize(skb->head).

>> -
>> -		WARN_ON_ONCE(delta<  len);
>> -copyfrags:
>> -		memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
>> -		       skb_shinfo(from)->frags,
>> -		       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
>> -		skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
>> -
>> -		if (skb_cloned(from))
>> -			for (i = 0; i<  skb_shinfo(from)->nr_frags; i++)
>> -				skb_frag_ref(from, i);
>> -		else
>> -			skb_shinfo(from)->nr_frags = 0;
>> -
>> -		to->truesize += delta;
>> -		atomic_add(delta,&sk->sk_rmem_alloc);
>> -		sk_mem_charge(sk, delta);
>> -		to->len += len;
>> -		to->data_len += len;
>> -		goto merge;
>> -	}
>> -	if (from->head_frag&&  !skb_cloned(from)) {
>> +	if (skb_headlen(from) != 0) {
>>   		struct page *page;
>>   		unsigned int offset;
>>
>> -		if (skb_shinfo(to)->nr_frags + skb_shinfo(from)->nr_frags>= MAX_SKB_FRAGS)
>> +		if (skb_shinfo(to)->nr_frags +
>> +		    skb_shinfo(from)->nr_frags>= MAX_SKB_FRAGS)
>> +			return false;
>> +
>> +		if (!from->head_frag || skb_cloned(from))
>>   			return false;
>> +
>> +		delta = from->truesize - sizeof(struct sk_buff);
>>
It looks like you were thinking of something here since the quote lines 
were broken.  This was the piece I saw in the original code that caught 
my eye as probably being wrong.  We are letting packets using head_frag 
just report length as the truesize.  This is why I favored using the end 
offset.  In the case of us using the head_frag we should only be 
deducting SKB_DATA_ALIGN(sizeof(struct sk_buff)) (I just relized the 
line above is incorrect), and in the case of us dropping the head_frag 
as well we should be able to also deduct the size of the shared_info and 
everything up to the offset of end.

>> +
>>   		page = virt_to_head_page(from->head);
>>   		offset = from->data - (unsigned char *)page_address(page);
>> +
>>   		skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
>>   				   page, offset, skb_headlen(from));
>>   		*fragstolen = true;
>> -		delta = len; /* we dont know real truesize... */
>> -		goto copyfrags;
>> +	} else {
>> +		if (skb_shinfo(to)->nr_frags +
>> +		    skb_shinfo(from)->nr_frags>  MAX_SKB_FRAGS)
>> +			return false;
>> +
>> +		delta = from->truesize -
>> +			SKB_TRUESIZE(skb_end_pointer(from) - from->head);
> No... SKB_TRUESIZE() doesnt account of power-of-two roundings in
> kmalloc()
You used ksize in alloc_skb or build_skb to set the end pointer.  All I 
am doing by using the end pointer is getting the value you had already 
extracted.  The end pointer should be unmovable once it is set so I 
wouldn't think it would be an issue to use it to compute the truesize 
for the section including the sk_buff and skb->head.


>>   	}
>> -	return false;
>> +
>> +	memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
>> +	       skb_shinfo(from)->frags,
>> +	       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
>> +	skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
>> +
>> +	if (!skb_cloned(from))
>> +		skb_shinfo(from)->nr_frags = 0;
>> +
> You break the code here... we had an else.
Yes and no.  I just realized that I didn't need the else because the for 
loop below does nothing if nr_frags is 0.  I suspect gcc is probably 
smart enough to just skip the loop if the skb is cloned.  I should 
probably have added a one line comment explaining that above the loop.

>> +	for (i = 0; i<  skb_shinfo(from)->nr_frags; i++)
>> +		skb_frag_ref(from, i);
>> +
>> +	to->truesize += delta;
>> +	atomic_add(delta,&sk->sk_rmem_alloc);
>> +	sk_mem_charge(sk, delta);
>> +	to->len += len;
>> +	to->data_len += len;
>> +
>> +merge:
>> +	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
>> +	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
>> +	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
>> +	return true;
>>   }
>>
>>   static void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
>>
> Really this patch is too hard to review.
Yeah, it is a bit difficult.  After Dave pulled your patches it ended up 
pushing most of the changes into this one.  I'll see if I can break this 
back up into smaller bits.  I just needed to flush the patches I had so 
I could get some feedback and stop talking in circles about the other patch.

Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet May 3, 2012, 5:19 a.m. UTC | #3
On Wed, 2012-05-02 at 21:58 -0700, Alexander Duyck wrote:
> The question I have is how can you get into a case where the ksize is 
> different from the end offset plus the aligned size of skb_shared_info?  
> From what I can tell it looks like the only place we can lie is if we 
> use build_skb with the frag_size option, and in that case we are using a 
> page, not kmalloc memory.  Otherwise in all other cases __alloc_skb or 
> build_skb is using ksize(skb->head) - SKB_DATA_ALIGN(struct 
> skb_shared_info) to set the end pointer, so reversing that should give 
> us the same value as ksize(skb->head).


Right after skb is allocated (build_skb() or other skb_alloc...
variants), truesize is correct by construction.

Then drivers add fragments and can make truesize smaller than reality.

And Intel drivers are known to abuse truesize.

My last patch against iwlwifi is still waiting to make its way into
official tree.

http://www.spinics.net/lists/netdev/msg192629.html



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller May 3, 2012, 5:25 a.m. UTC | #4
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 03 May 2012 07:19:33 +0200

> My last patch against iwlwifi is still waiting to make its way into
> official tree.
> 
> http://www.spinics.net/lists/netdev/msg192629.html

John, please rectify this situation.

The Intel Wireless folks said they would test it, but that was more
than a month ago.

It's not acceptable to let bug fixes rot for that long, I don't care
what their special internal testing procedure is.

If they give you further pushback, please just ignore them and apply
Eric's fix directly.

Thank you.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander H Duyck May 3, 2012, 5:41 a.m. UTC | #5
On 5/2/2012 10:19 PM, Eric Dumazet wrote:
> On Wed, 2012-05-02 at 21:58 -0700, Alexander Duyck wrote:
>> The question I have is how can you get into a case where the ksize is
>> different from the end offset plus the aligned size of skb_shared_info?
>>  From what I can tell it looks like the only place we can lie is if we
>> use build_skb with the frag_size option, and in that case we are using a
>> page, not kmalloc memory.  Otherwise in all other cases __alloc_skb or
>> build_skb is using ksize(skb->head) - SKB_DATA_ALIGN(struct
>> skb_shared_info) to set the end pointer, so reversing that should give
>> us the same value as ksize(skb->head).
>
> Right after skb is allocated (build_skb() or other skb_alloc...
> variants), truesize is correct by construction.
>
> Then drivers add fragments and can make truesize smaller than reality.
>
> And Intel drivers are known to abuse truesize.
>
> My last patch against iwlwifi is still waiting to make its way into
> official tree.
>
> http://www.spinics.net/lists/netdev/msg192629.html
I think the part that has me confused is how being more precise about 
removing from truesize gets in the way of detecting abuses of truesize.  
It seems like it should be more as good as or better then the original 
approach of just using skb->len.

Then again we might just be talking in circles again.  I have things 
broken out into 3 patches now that are much more readable.  I will email 
them out in an hour or so once I do some quick tests to verify they are 
building and don't break anything.

Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller May 3, 2012, 5:50 a.m. UTC | #6
From: Alexander Duyck <alexander.duyck@gmail.com>
Date: Wed, 02 May 2012 22:41:36 -0700

> I think the part that has me confused is how being more precise about
> removing from truesize gets in the way of detecting abuses of
> truesize.  It seems like it should be more as good as or better then
> the original approach of just using skb->len.

You can only trim from truesize if you can be absolutely certain that
you have removed any reference in the fraglist, or the page'd head, to
the entire "block" of data.

If the skb still refers to even just one byte in a particular block,
we must still charge the entire block in the truesize.

For example, NIU has three pools of power-of-2 blocks of data it
maintainers and the device pulls from to build incoming packets.

So if the chip used one of the 2048 byte buffers, we charge the entire
2048 bytes even of the packet is much smaller.

Conversely this means we cannot trim the 2048 part of the truesize of
that SKB unless we had some mechanism to know for certain 1) what the
block size of the underlying data is and 2) that we've removed all
references to that.

Currently this is not really possible, so we therefore defer truesize
adjustments.

Behaving otherwise is dangerous, because then we'd potentially end up
with a lot of memory used, but not actually accounted for by anyone.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander H Duyck May 3, 2012, 7:08 a.m. UTC | #7
On 5/2/2012 10:50 PM, David Miller wrote:
> From: Alexander Duyck<alexander.duyck@gmail.com>
> Date: Wed, 02 May 2012 22:41:36 -0700
>
>> I think the part that has me confused is how being more precise about
>> removing from truesize gets in the way of detecting abuses of
>> truesize.  It seems like it should be more as good as or better then
>> the original approach of just using skb->len.
> You can only trim from truesize if you can be absolutely certain that
> you have removed any reference in the fraglist, or the page'd head, to
> the entire "block" of data.
>
> If the skb still refers to even just one byte in a particular block,
> we must still charge the entire block in the truesize.
I get that, and that is what my code was doing that the old code wasn't 
doing.  That is why I am confused.  I should have another patch ready in 
an hour or so that should help to make my position clear.

> For example, NIU has three pools of power-of-2 blocks of data it
> maintainers and the device pulls from to build incoming packets.
>
> So if the chip used one of the 2048 byte buffers, we charge the entire
> 2048 bytes even of the packet is much smaller.
>
> Conversely this means we cannot trim the 2048 part of the truesize of
> that SKB unless we had some mechanism to know for certain 1) what the
> block size of the underlying data is and 2) that we've removed all
> references to that.
>
> Currently this is not really possible, so we therefore defer truesize
> adjustments.
This is true for the frags, but not for the head buffer allocated with 
__alloc_skb or build_skb.  For either of these we set both truesize and 
the end pointer offset based on the ksize(data).  The truesize is the 
value plus SKB_DATA_ALIGNED(sizeof(struct sk_buff)), and the end pointer 
offset is the value minus SKB_DATA_ALIGNED(sizeof(struct 
skb_shared_info)).  So in the case where we are removing the sk_buff and 
skb->head, I am subtracting the end pointer offset plus the aligned 
shared info and sk_buff values from skb->truesize to get the size of 
just the paged region.

> Behaving otherwise is dangerous, because then we'd potentially end up
> with a lot of memory used, but not actually accounted for by anyone.
I fully agree that is why I was surprised when I was told not to use the 
values for skb->truesize that were computed back when we allocated the 
skb to determine the truesize to remove from the delta when we were 
removing it.

Thanks,

Alex
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John W. Linville May 3, 2012, 3:14 p.m. UTC | #8
On Thu, May 03, 2012 at 01:25:02AM -0400, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Thu, 03 May 2012 07:19:33 +0200
> 
> > My last patch against iwlwifi is still waiting to make its way into
> > official tree.
> > 
> > http://www.spinics.net/lists/netdev/msg192629.html
> 
> John, please rectify this situation.
> 
> The Intel Wireless folks said they would test it, but that was more
> than a month ago.
> 
> It's not acceptable to let bug fixes rot for that long, I don't care
> what their special internal testing procedure is.
> 
> If they give you further pushback, please just ignore them and apply
> Eric's fix directly.
> 
> Thank you.

I imagine that this somehow got lost in the shuffle during the
merge window.  That doesn't excuse it, of course.

It has waited long enough already, so I'll just go ahead and take it.

John
Guy, Wey-Yi May 3, 2012, 3:24 p.m. UTC | #9
Hi John,

On Thu, 2012-05-03 at 11:14 -0400, John W. Linville wrote:
> On Thu, May 03, 2012 at 01:25:02AM -0400, David Miller wrote:
> > From: Eric Dumazet <eric.dumazet@gmail.com>
> > Date: Thu, 03 May 2012 07:19:33 +0200
> > 
> > > My last patch against iwlwifi is still waiting to make its way into
> > > official tree.
> > > 
> > > http://www.spinics.net/lists/netdev/msg192629.html
> > 
> > John, please rectify this situation.
> > 
> > The Intel Wireless folks said they would test it, but that was more
> > than a month ago.
> > 
> > It's not acceptable to let bug fixes rot for that long, I don't care
> > what their special internal testing procedure is.
> > 
> > If they give you further pushback, please just ignore them and apply
> > Eric's fix directly.
> > 
> > Thank you.
> 
> I imagine that this somehow got lost in the shuffle during the
> merge window.  That doesn't excuse it, of course.
> 
> It has waited long enough already, so I'll just go ahead and take it.
> 
it is my mistake to lost this patch during the iwlwifi re-factor work,
the patch is no longer apply and I ask Eric to rebase the patch.

Sorry again for the mistake

Thanks
Wey


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
John W. Linville May 3, 2012, 5:07 p.m. UTC | #10
On Thu, May 03, 2012 at 08:24:19AM -0700, Guy, Wey-Yi wrote:
> Hi John,
> 
> On Thu, 2012-05-03 at 11:14 -0400, John W. Linville wrote:
> > On Thu, May 03, 2012 at 01:25:02AM -0400, David Miller wrote:
> > > From: Eric Dumazet <eric.dumazet@gmail.com>
> > > Date: Thu, 03 May 2012 07:19:33 +0200
> > > 
> > > > My last patch against iwlwifi is still waiting to make its way into
> > > > official tree.
> > > > 
> > > > http://www.spinics.net/lists/netdev/msg192629.html
> > > 
> > > John, please rectify this situation.
> > > 
> > > The Intel Wireless folks said they would test it, but that was more
> > > than a month ago.
> > > 
> > > It's not acceptable to let bug fixes rot for that long, I don't care
> > > what their special internal testing procedure is.
> > > 
> > > If they give you further pushback, please just ignore them and apply
> > > Eric's fix directly.
> > > 
> > > Thank you.
> > 
> > I imagine that this somehow got lost in the shuffle during the
> > merge window.  That doesn't excuse it, of course.
> > 
> > It has waited long enough already, so I'll just go ahead and take it.
> > 
> it is my mistake to lost this patch during the iwlwifi re-factor work,
> the patch is no longer apply and I ask Eric to rebase the patch.
> 
> Sorry again for the mistake

Well, it seems like a fix needed for 3.4.  And, the patch applies there.

It does cause some merge breakage in wireless-testing (and presumably
in linux-next).  I'll attach the commit diff for the wireless-testing
merge fixup I did, for review and/or as a peace offering to sfr... :-)

Please take a look at the result in wireless-testing and let me know
if it is broken...thanks!

John

---

commit 22a101d22ad3296b55d87e92c4a94548aaf6f595
Merge: 20ef730 ed90542
Author: John W. Linville <linville@tuxdriver.com>
Date:   Thu May 3 12:58:38 2012 -0400

    Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless
    
    Conflicts:
    	drivers/net/wireless/iwlwifi/iwl-agn-rx.c
    	drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c
    	drivers/net/wireless/iwlwifi/iwl-trans.h

diff --cc drivers/net/wireless/iwlwifi/iwl-agn-rx.c
index f941223,2247460..ff758fe
--- a/drivers/net/wireless/iwlwifi/iwl-agn-rx.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn-rx.c
@@@ -752,15 -787,25 +751,25 @@@ static void iwlagn_pass_packet_to_mac80
  	    iwlagn_set_decrypted_flag(priv, hdr, ampdu_status, stats))
  		return;
  
- 	skb = dev_alloc_skb(128);
+ 	/* Dont use dev_alloc_skb(), we'll have enough headroom once
+ 	 * ieee80211_hdr pulled.
+ 	 */
+ 	skb = alloc_skb(128, GFP_ATOMIC);
  	if (!skb) {
- 		IWL_ERR(priv, "dev_alloc_skb failed\n");
+ 		IWL_ERR(priv, "alloc_skb failed\n");
  		return;
  	}
+ 	hdrlen = min_t(unsigned int, len, skb_tailroom(skb));
+ 	memcpy(skb_put(skb, hdrlen), hdr, hdrlen);
+ 	fraglen = len - hdrlen;
+ 
+ 	if (fraglen) {
 -		int offset = (void *)hdr + hdrlen - rxb_addr(rxb);
++		int offset = (void *)hdr + hdrlen - rxb_addr(rxb)
++				+ rxb_offset(rxb);
  
- 	offset = (void *)hdr - rxb_addr(rxb) + rxb_offset(rxb);
- 	p = rxb_steal_page(rxb);
- 	skb_add_rx_frag(skb, 0, p, offset, len, len);
+ 		skb_add_rx_frag(skb, 0, rxb_steal_page(rxb), offset,
+ 				fraglen, rxb->truesize);
+ 	}
 -	iwl_update_stats(priv, false, fc, len);
  
  	/*
  	* Wake any queues that were stopped due to a passive channel tx
diff --cc drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c
index d2239aa,aa7aea1..08517d3
--- a/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c
+++ b/drivers/net/wireless/iwlwifi/iwl-trans-pcie-rx.c
@@@ -373,89 -374,72 +373,90 @@@ static void iwl_rx_handle_rxbuf(struct 
  	if (WARN_ON(!rxb))
  		return;
  
 -	rxcb.truesize = PAGE_SIZE << hw_params(trans).rx_page_order;
 -	dma_unmap_page(trans->dev, rxb->page_dma,
 -		       rxcb.truesize,
 -		       DMA_FROM_DEVICE);
 -
 -	rxcb._page = rxb->page;
 -	pkt = rxb_addr(&rxcb);
 -
 -	IWL_DEBUG_RX(trans, "%s, 0x%02x\n",
 -		     get_cmd_string(pkt->hdr.cmd), pkt->hdr.cmd);
 +	dma_unmap_page(trans->dev, rxb->page_dma, max_len, DMA_FROM_DEVICE);
  
 +	while (offset + sizeof(u32) + sizeof(struct iwl_cmd_header) < max_len) {
 +		struct iwl_rx_packet *pkt;
 +		struct iwl_device_cmd *cmd;
 +		u16 sequence;
 +		bool reclaim;
 +		int index, cmd_index, err, len;
 +		struct iwl_rx_cmd_buffer rxcb = {
 +			._offset = offset,
 +			._page = rxb->page,
 +			._page_stolen = false,
++			.truesize = max_len,
 +		};
  
 -	len = le32_to_cpu(pkt->len_n_flags) & FH_RSCSR_FRAME_SIZE_MSK;
 -	len += sizeof(u32); /* account for status word */
 -	trace_iwlwifi_dev_rx(trans->dev, pkt, len);
 +		pkt = rxb_addr(&rxcb);
  
 -	/* Reclaim a command buffer only if this packet is a response
 -	 *   to a (driver-originated) command.
 -	 * If the packet (e.g. Rx frame) originated from uCode,
 -	 *   there is no command buffer to reclaim.
 -	 * Ucode should set SEQ_RX_FRAME bit if ucode-originated,
 -	 *   but apparently a few don't get set; catch them here. */
 -	reclaim = !(pkt->hdr.sequence & SEQ_RX_FRAME);
 -	if (reclaim) {
 -		int i;
 +		if (pkt->len_n_flags == cpu_to_le32(FH_RSCSR_FRAME_INVALID))
 +			break;
  
 -		for (i = 0; i < trans_pcie->n_no_reclaim_cmds; i++) {
 -			if (trans_pcie->no_reclaim_cmds[i] == pkt->hdr.cmd) {
 -				reclaim = false;
 -				break;
 +		IWL_DEBUG_RX(trans, "cmd at offset %d: %s (0x%.2x)\n",
 +			rxcb._offset,
 +			trans_pcie_get_cmd_string(trans_pcie, pkt->hdr.cmd),
 +			pkt->hdr.cmd);
 +
 +		len = le32_to_cpu(pkt->len_n_flags) & FH_RSCSR_FRAME_SIZE_MSK;
 +		len += sizeof(u32); /* account for status word */
 +		trace_iwlwifi_dev_rx(trans->dev, pkt, len);
 +
 +		/* Reclaim a command buffer only if this packet is a response
 +		 *   to a (driver-originated) command.
 +		 * If the packet (e.g. Rx frame) originated from uCode,
 +		 *   there is no command buffer to reclaim.
 +		 * Ucode should set SEQ_RX_FRAME bit if ucode-originated,
 +		 *   but apparently a few don't get set; catch them here. */
 +		reclaim = !(pkt->hdr.sequence & SEQ_RX_FRAME);
 +		if (reclaim) {
 +			int i;
 +
 +			for (i = 0; i < trans_pcie->n_no_reclaim_cmds; i++) {
 +				if (trans_pcie->no_reclaim_cmds[i] ==
 +							pkt->hdr.cmd) {
 +					reclaim = false;
 +					break;
 +				}
  			}
  		}
 -	}
  
 -	sequence = le16_to_cpu(pkt->hdr.sequence);
 -	index = SEQ_TO_INDEX(sequence);
 -	cmd_index = get_cmd_index(&txq->q, index);
 +		sequence = le16_to_cpu(pkt->hdr.sequence);
 +		index = SEQ_TO_INDEX(sequence);
 +		cmd_index = get_cmd_index(&txq->q, index);
  
 -	if (reclaim)
 -		cmd = txq->cmd[cmd_index];
 -	else
 -		cmd = NULL;
 +		if (reclaim)
 +			cmd = txq->entries[cmd_index].cmd;
 +		else
 +			cmd = NULL;
  
 -	err = iwl_op_mode_rx(trans->op_mode, &rxcb, cmd);
 +		err = iwl_op_mode_rx(trans->op_mode, &rxcb, cmd);
  
 -	/*
 -	 * XXX: After here, we should always check rxcb._page
 -	 * against NULL before touching it or its virtual
 -	 * memory (pkt). Because some rx_handler might have
 -	 * already taken or freed the pages.
 -	 */
 +		/*
 +		 * After here, we should always check rxcb._page_stolen,
 +		 * if it is true then one of the handlers took the page.
 +		 */
  
 -	if (reclaim) {
 -		/* Invoke any callbacks, transfer the buffer to caller,
 -		 * and fire off the (possibly) blocking
 -		 * iwl_trans_send_cmd()
 -		 * as we reclaim the driver command queue */
 -		if (rxcb._page)
 -			iwl_tx_cmd_complete(trans, &rxcb, err);
 -		else
 -			IWL_WARN(trans, "Claim null rxb?\n");
 +		if (reclaim) {
 +			/* Invoke any callbacks, transfer the buffer to caller,
 +			 * and fire off the (possibly) blocking
 +			 * iwl_trans_send_cmd()
 +			 * as we reclaim the driver command queue */
 +			if (!rxcb._page_stolen)
 +				iwl_tx_cmd_complete(trans, &rxcb, err);
 +			else
 +				IWL_WARN(trans, "Claim null rxb?\n");
 +		}
 +
 +		page_stolen |= rxcb._page_stolen;
 +		offset += ALIGN(len, FH_RSCSR_FRAME_ALIGN);
  	}
  
 -	/* page was stolen from us */
 -	if (rxcb._page == NULL)
 +	/* page was stolen from us -- free our reference */
 +	if (page_stolen) {
 +		__free_pages(rxb->page, trans_pcie->rx_page_order);
  		rxb->page = NULL;
 +	}
  
  	/* Reuse the page if possible. For notification packets and
  	 * SKBs that fail to Rx correctly, add them back into the
diff --cc drivers/net/wireless/iwlwifi/iwl-trans.h
index 7018d31,fdf9788..79a1e7a
--- a/drivers/net/wireless/iwlwifi/iwl-trans.h
+++ b/drivers/net/wireless/iwlwifi/iwl-trans.h
@@@ -256,8 -260,7 +256,9 @@@ static inline void iwl_free_resp(struc
  
  struct iwl_rx_cmd_buffer {
  	struct page *_page;
 +	int _offset;
 +	bool _page_stolen;
+ 	unsigned int truesize;
  };
  
  static inline void *rxb_addr(struct iwl_rx_cmd_buffer *r)
Guy, Wey-Yi May 3, 2012, 8:21 p.m. UTC | #11
Hi John,

On Thu, 2012-05-03 at 13:07 -0400, John W. Linville wrote:
> On Thu, May 03, 2012 at 08:24:19AM -0700, Guy, Wey-Yi wrote:
> > Hi John,
> > 
> > On Thu, 2012-05-03 at 11:14 -0400, John W. Linville wrote:
> > > On Thu, May 03, 2012 at 01:25:02AM -0400, David Miller wrote:
> > > > From: Eric Dumazet <eric.dumazet@gmail.com>
> > > > Date: Thu, 03 May 2012 07:19:33 +0200
> > > > 
> > > > > My last patch against iwlwifi is still waiting to make its way into
> > > > > official tree.
> > > > > 
> > > > > http://www.spinics.net/lists/netdev/msg192629.html
> > > > 
> > > > John, please rectify this situation.
> > > > 
> > > > The Intel Wireless folks said they would test it, but that was more
> > > > than a month ago.
> > > > 
> > > > It's not acceptable to let bug fixes rot for that long, I don't care
> > > > what their special internal testing procedure is.
> > > > 
> > > > If they give you further pushback, please just ignore them and apply
> > > > Eric's fix directly.
> > > > 
> > > > Thank you.
> > > 
> > > I imagine that this somehow got lost in the shuffle during the
> > > merge window.  That doesn't excuse it, of course.
> > > 
> > > It has waited long enough already, so I'll just go ahead and take it.
> > > 
> > it is my mistake to lost this patch during the iwlwifi re-factor work,
> > the patch is no longer apply and I ask Eric to rebase the patch.
> > 
> > Sorry again for the mistake
> 
> Well, it seems like a fix needed for 3.4.  And, the patch applies there.
> 
> It does cause some merge breakage in wireless-testing (and presumably
> in linux-next).  I'll attach the commit diff for the wireless-testing
> merge fixup I did, for review and/or as a peace offering to sfr... :-)
> 
> Please take a look at the result in wireless-testing and let me know
> if it is broken...thanks!
> 
Looks good to me, thanks very much for helping this.

Wey

> 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c6f78e2..23bc3ff 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4548,62 +4548,68 @@  static bool tcp_try_coalesce(struct sock *sk,
 	int i, delta, len = from->len;
 
 	*fragstolen = false;
+
 	if (tcp_hdr(from)->fin || skb_cloned(to))
 		return false;
+
 	if (len <= skb_tailroom(to)) {
 		BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
-merge:
-		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
-		TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
-		TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
-		return true;
+		goto merge;
 	}
 
 	if (skb_has_frag_list(to) || skb_has_frag_list(from))
 		return false;
 
-	if (skb_headlen(from) == 0 &&
-	    (skb_shinfo(to)->nr_frags +
-	     skb_shinfo(from)->nr_frags <= MAX_SKB_FRAGS)) {
-		WARN_ON_ONCE(from->head_frag);
-		delta = from->truesize - ksize(from->head) -
-			SKB_DATA_ALIGN(sizeof(struct sk_buff));
-
-		WARN_ON_ONCE(delta < len);
-copyfrags:
-		memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
-		       skb_shinfo(from)->frags,
-		       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
-		skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
-
-		if (skb_cloned(from))
-			for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
-				skb_frag_ref(from, i);
-		else
-			skb_shinfo(from)->nr_frags = 0;
-
-		to->truesize += delta;
-		atomic_add(delta, &sk->sk_rmem_alloc);
-		sk_mem_charge(sk, delta);
-		to->len += len;
-		to->data_len += len;
-		goto merge;
-	}
-	if (from->head_frag && !skb_cloned(from)) {
+	if (skb_headlen(from) != 0) {
 		struct page *page;
 		unsigned int offset;
 
-		if (skb_shinfo(to)->nr_frags + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+		if (skb_shinfo(to)->nr_frags +
+		    skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+			return false;
+
+		if (!from->head_frag || skb_cloned(from))
 			return false;
+
+		delta = from->truesize - sizeof(struct sk_buff);
+
 		page = virt_to_head_page(from->head);
 		offset = from->data - (unsigned char *)page_address(page);
+
 		skb_fill_page_desc(to, skb_shinfo(to)->nr_frags,
 				   page, offset, skb_headlen(from));
 		*fragstolen = true;
-		delta = len; /* we dont know real truesize... */
-		goto copyfrags;
+	} else {
+		if (skb_shinfo(to)->nr_frags +
+		    skb_shinfo(from)->nr_frags > MAX_SKB_FRAGS)
+			return false;
+
+		delta = from->truesize -
+			SKB_TRUESIZE(skb_end_pointer(from) - from->head);
 	}
-	return false;
+
+	memcpy(skb_shinfo(to)->frags + skb_shinfo(to)->nr_frags,
+	       skb_shinfo(from)->frags,
+	       skb_shinfo(from)->nr_frags * sizeof(skb_frag_t));
+	skb_shinfo(to)->nr_frags += skb_shinfo(from)->nr_frags;
+
+	if (!skb_cloned(from))
+		skb_shinfo(from)->nr_frags = 0;
+
+	for (i = 0; i < skb_shinfo(from)->nr_frags; i++)
+		skb_frag_ref(from, i);
+
+	to->truesize += delta;
+	atomic_add(delta, &sk->sk_rmem_alloc);
+	sk_mem_charge(sk, delta);
+	to->len += len;
+	to->data_len += len;
+
+merge:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+	TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
+	TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
+	return true;
 }
 
 static void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)