tcp: splice as many packets as possible at once

Message ID 4967CBB9.1060403@cosmosbay.com
State RFC, archived
Delegated to: David Miller

Commit Message

Eric Dumazet Jan. 9, 2009, 10:12 p.m. UTC
Willy Tarreau wrote:
> On Fri, Jan 09, 2009 at 10:24:00PM +0100, Willy Tarreau wrote:
>> On Fri, Jan 09, 2009 at 09:51:17PM +0100, Eric Dumazet wrote:
>> (...)
>>>> Also, in your second mail, you're saying that your change
>>>> might return more data than requested by the user. I can't
>>>> find why, could you please explain to me, as I'm still quite
>>>> ignorant in this area ?
>>> Well, I just tested various user programs and indeed got this
>>> strange result :
>>>
>>> Here I call splice() with len=1000 (0x3e8), and you can see
>>> it gives a result of 1460 at the second call.
> 
> OK finally I could reproduce it and found why we have this. It's
> expected in fact.
> 
> The problem when we loop in tcp_read_sock() is that tss->len is
> not decremented by the number of bytes read; that is done
> only in tcp_splice_read(), which is outer.
> 
> The solution I found was to do just like the other callers, which means
> use desc->count to keep the remaining number of bytes we want to
> read. In fact, tcp_read_sock() is designed to use that one as a stop
> condition, which explains why you first had to hide it.
> 
> Now with the attached patch as a replacement for my previous one,
> both issues are solved :
>   - I splice 1000 bytes if I ask to do so
>   - I splice as much as possible if available (typically 23 kB).
> 
> My observed performance is still at the top of earlier results,
> and IMHO that way of counting bytes makes sense for an actor called
> from tcp_read_sock().
> 
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 35bcddf..51ff3aa 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -522,8 +522,12 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
>  				unsigned int offset, size_t len)
>  {
>  	struct tcp_splice_state *tss = rd_desc->arg.data;
> +	int ret;
>  
> -	return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags);
> +	ret = skb_splice_bits(skb, offset, tss->pipe, rd_desc->count, tss->flags);
> +	if (ret > 0)
> +		rd_desc->count -= ret;
> +	return ret;
>  }
>  
>  static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
> @@ -531,6 +535,7 @@ static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
>  	/* Store TCP splice context information in read_descriptor_t. */
>  	read_descriptor_t rd_desc = {
>  		.arg.data = tss,
> +		.count = tss->len,
>  	};
>  
>  	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
> 

OK, I came to a different patch. Please check other tcp_read_sock() callers in tree :)
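
(Editorial aside, not part of the original mail: a minimal userspace
sketch of the behaviour discussed above -- asking splice() for 1000
bytes from a TCP socket and printing what each call actually returns.
The connected socket descriptor is an assumption.)

/* Minimal sketch: splice from an already-connected TCP socket into a
 * pipe, 1000 bytes per call. Before the fix discussed here, a call
 * could return 1460 even though only 1000 bytes were requested. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static ssize_t drain_to_pipe(int sock)
{
	int pfd[2];
	ssize_t ret, total = 0;

	if (pipe(pfd) < 0)
		return -1;

	for (;;) {
		ret = splice(sock, NULL, pfd[1], NULL, 1000, SPLICE_F_MOVE);
		if (ret <= 0)
			break;
		printf("splice() returned %zd\n", ret);
		total += ret;
		/* a real program would drain pfd[0] here (e.g. splice
		 * it onward) so the pipe does not fill up and block */
	}
	close(pfd[0]);
	close(pfd[1]);
	return total;
}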




Comments

Willy Tarreau Jan. 9, 2009, 10:17 p.m. UTC | #1
On Fri, Jan 09, 2009 at 11:12:09PM +0100, Eric Dumazet wrote:
> [...]
> OK, I came to a different patch. Please check other tcp_read_sock() callers in tree :)

I've seen the other callers, but they all use desc->count for their own
purpose. That's how I understood what it was used for :-)

I think it's better not to change the API here and to use tcp_read_sock()
the way it's supposed to be used. Also, the fewer parameters to the function,
the better.

However I'm OK with the !timeo before release_sock/lock_sock. I just
don't know if we can put the rest of the if above or not. I don't
know what changes we're supposed to collect by doing release_sock/
lock_sock before the if().

Regards,
Willy

Evgeniy Polyakov Jan. 9, 2009, 10:42 p.m. UTC | #2
On Fri, Jan 09, 2009 at 11:17:44PM +0100, Willy Tarreau (w@1wt.eu) wrote:
> However I'm OK with the !timeo before release_sock/lock_sock. I just
> don't know if we can put the rest of the if above or not. I don't
> know what changes we're supposed to collect by doing release_sock/
> lock_sock before the if().

Not to interrupt the discussion, but for clarification: that
release_sock/lock_sock is used to process the backlog accumulated while
the socket was locked. And while dropping the additional pair before the
final release is OK, moving this itself should be thought about twice.
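
(Editorial aside: a sketch of the locking convention Evgeniy refers
to, assuming the usual 2.6-era lock_sock()/release_sock() semantics;
this is illustrative, not a quote from net/ipv4/tcp.c.)

/* While a process owns the socket lock, packets arriving in softirq
 * context are parked on sk->sk_backlog instead of being processed. */
lock_sock(sk);		/* take ownership; rx now queues on the backlog */

/* ... pull data off sk->sk_receive_queue, e.g. via tcp_read_sock() ... */

release_sock(sk);	/* replays the accumulated backlog packets */
lock_sock(sk);		/* re-take ownership and look at the queues again */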
Eric Dumazet Jan. 9, 2009, 10:45 p.m. UTC | #3
Willy Tarreau wrote:
>> [...]
>> OK, I came to a different patch. Please check other tcp_read_sock() callers in tree :)
> 
> I've seen the other callers, but they all use desc->count for their own
> purpose. That's how I understood what it was used for :-)

Ah yes, I reread your patch and you are right.

> 
> I think it's better not to change the API here and to use tcp_read_sock()
> the way it's supposed to be used. Also, the fewer parameters to the function,
> the better.
> 
> However I'm OK with the !timeo before release_sock/lock_sock. I just
> don't know if we can put the rest of the if above or not. I don't
> know what changes we're supposed to collect by doing release_sock/
> lock_sock before the if().

Only the (!timeo) can be above. Other conditions must be checked after
the release/lock.


Thank you


Willy Tarreau Jan. 9, 2009, 10:50 p.m. UTC | #4
On Sat, Jan 10, 2009 at 01:42:58AM +0300, Evgeniy Polyakov wrote:
> On Fri, Jan 09, 2009 at 11:17:44PM +0100, Willy Tarreau (w@1wt.eu) wrote:
> > [...]
> 
> Not to interrupt the discussion, but for clarification: that
> release_sock/lock_sock is used to process the backlog accumulated while
> the socket was locked. And while dropping the additional pair before the
> final release is OK, moving this itself should be thought about twice.

Nice, thanks Evgeniy. So it makes sense to move only the !timeo test
above since it's not dependent on the socket, and leave the rest of
the test where it currently is. That's what Eric has proposed in his
latest patch.

Well, I'm now trying to educate myself on the send part. It's still
not very clear to me and I'd like to understand a little bit better
why we have this corruption problem and why there is a difference
between sending segments from memory and sending them from another
socket where they were already waiting.

I think I'll put printks everywhere and see what I can observe.
Knowing about the GSO/SG workaround already helps me enable/disable
the bug.

Regards,
Willy

Willy Tarreau Jan. 9, 2009, 10:53 p.m. UTC | #5
On Fri, Jan 09, 2009 at 11:45:02PM +0100, Eric Dumazet wrote:
> [...]
> Only the (!timeo) can be above. Other conditions must be checked after
> the release/lock.

Yes, that's what Evgeniy explained too. I smelled something like this
but did not know for sure.

Care to redo the whole patch, since you already have all the code parts
at hand as well as some fragments of commit messages ? You can even
add my Tested-by if you want. Finally it was nice that Dave asked
for this explanation, because it pointed our noses at the fishy parts ;-)

Thanks,
Willy

Evgeniy Polyakov Jan. 9, 2009, 11:01 p.m. UTC | #6
On Fri, Jan 09, 2009 at 11:50:10PM +0100, Willy Tarreau (w@1wt.eu) wrote:
> Well, I'm now trying to educate myself on the send part. It's still
> not very clear to me and I'd like to understand a little bit better
> why we have this corruption problem and why there is a difference
> between sending segments from memory and sending them from another
> socket where they were already waiting.

printks are the best choice, since you will get exactly what you are
looking for, instead of deciphering what the developer or the code told you.

> I think I'll put printks everywhere and see what I can observe.
> Knowing about the GSO/SG workaround already helps me enable/disable
> the bug.

I wish I were also able to disable the bugs... :)
Willy Tarreau Jan. 9, 2009, 11:06 p.m. UTC | #7
On Sat, Jan 10, 2009 at 02:01:24AM +0300, Evgeniy Polyakov wrote:
> On Fri, Jan 09, 2009 at 11:50:10PM +0100, Willy Tarreau (w@1wt.eu) wrote:
> > Well, I'm now trying to educate myself on the send part. It's still
> > not very clear to me and I'd like to understand a little bit better
> > why we have this corruption problem and why there is a difference
> > between sending segments from memory and sending them from another
> > socket where they were already waiting.
> 
> printks are the best choice, since you will get exactly what you are
> looking for, instead of deciphering what the developer or the code told you.

I know, but as I told in an earlier mail, it was not even clear to
me which functions were called. I have made guesses but that's quite
hard when you don't know the entry point. I think I'm not too far
from having discovered the call chain. Understanding it will be
another story, of course ;-)

> > I think I'll put printks everywhere and see what I can observe.
> > Knowing about the GSO/SG workaround already helps me enable/disable
> > the bug.
> 
> I wish I were also able to disable the bugs... :)

Me too :-)
Here we have an opportunity to turn it on/off, let's take it !

Willy

Eric Dumazet Jan. 10, 2009, 7:40 a.m. UTC | #8
Evgeniy Polyakov wrote:
>> [...]
> 
> Not to interrupt the discussion, but for clarification: that
> release_sock/lock_sock is used to process the backlog accumulated while
> the socket was locked. And while dropping the additional pair before the
> final release is OK, moving this itself should be thought about twice.
> 

Hum... I just caught the release_sock(sk)/lock_sock(sk) done in skb_splice_bits()

So :

1) the release_sock/lock_sock done in tcp_splice_read() is not necessary
to process the backlog. It's already done in skb_splice_bits()

2) If we loop in tcp_read_sock() calling skb_splice_bits() several times
then we should perform the following tests inside this loop ?

if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) ||
   signal_pending(current)) break;

And remove them from tcp_splice_read() ?
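
(Editorial aside: a sketch of the shape Eric is proposing -- the
tcp_read_sock() loop aborting early on its own, rather than only being
checked from tcp_splice_read() between calls. Illustrative only, not
a tested patch.)

/* Sketch: stop the skb walk as soon as the socket is dead or a
 * signal is pending, instead of splicing everything first. */
while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
	/* ... feed skb to recv_actor(), advance seq/offset ... */

	if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
	    (sk->sk_shutdown & RCV_SHUTDOWN) ||
	    signal_pending(current))
		break;
}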



Evgeniy Polyakov Jan. 11, 2009, 12:58 p.m. UTC | #9
Hi Eric.

On Sat, Jan 10, 2009 at 08:40:05AM +0100, Eric Dumazet (dada1@cosmosbay.com) wrote:
> > [...]
> 
> Hum... I just caught the release_sock(sk)/lock_sock(sk) done in skb_splice_bits()
> 
> So :
> 
> 1) the release_sock/lock_sock done in tcp_splice_read() is not necessary
> to process the backlog. It's already done in skb_splice_bits()

Yes, in the tcp_splice_read() they are added to remove a deadlock.

> 2) If we loop in tcp_read_sock() calling skb_splice_bits() several times
> then we should perform the following tests inside this loop ?
> 
> if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) ||
>    signal_pending(current)) break;
> 
> And remove them from tcp_splice_read() ?

It could be done, but for what reason? To detect a disconnected socket early?
Is it worth the changes?
Eric Dumazet Jan. 11, 2009, 1:14 p.m. UTC | #10
Evgeniy Polyakov wrote:
> Hi Eric.
> 
> On Sat, Jan 10, 2009 at 08:40:05AM +0100, Eric Dumazet (dada1@cosmosbay.com) wrote:
>>> [...]
>> Hum... I just caught the release_sock(sk)/lock_sock(sk) done in skb_splice_bits()
>>
>> So :
>>
>> 1) the release_sock/lock_sock done in tcp_splice_read() is not necessary
>> to process the backlog. It's already done in skb_splice_bits()
> 
> Yes, in the tcp_splice_read() they are added to remove a deadlock.

Could you elaborate ? A deadlock only if !SPLICE_F_NONBLOCK ?

> 
>> 2) If we loop in tcp_read_sock() calling skb_splice_bits() several times
>> then we should perform the following tests inside this loop ?
>>
>> if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) ||
>>    signal_pending(current)) break;
>>
>> And remove them from tcp_splice_read() ?
> 
> It could be done, but for what reason? To detect a disconnected socket early?
> Is it worth the changes?
> 

I was thinking about the case where your thread is doing a splice() from a tcp socket to
a pipe, while another thread is doing the splice from this pipe to something else.

Once patched, tcp_read_sock() could loop a long time...
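
(Editorial aside: the two-thread scenario above as a userspace
sketch; the descriptor names are assumptions and error handling is
omitted.)

/* Thread A splices a TCP socket into a pipe while thread B drains
 * the pipe to a file. If B keeps the pipe from filling up, a patched
 * tcp_read_sock() can keep A's splice() busy for a long time. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <unistd.h>

static int pfd[2];		/* pipe connecting the two threads */
static int sock_fd, file_fd;	/* assumed: connected socket, open file */

static void *sock_to_pipe(void *unused)
{
	while (splice(sock_fd, NULL, pfd[1], NULL, 65536, SPLICE_F_MOVE) > 0)
		;
	close(pfd[1]);		/* signal EOF to the draining thread */
	return NULL;
}

static void *pipe_to_file(void *unused)
{
	while (splice(pfd[0], NULL, file_fd, NULL, 65536, SPLICE_F_MOVE) > 0)
		;
	return NULL;
}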


Evgeniy Polyakov Jan. 11, 2009, 1:35 p.m. UTC | #11
On Sun, Jan 11, 2009 at 02:14:57PM +0100, Eric Dumazet (dada1@cosmosbay.com) wrote:
> >> 1) the release_sock/lock_sock done in tcp_splice_read() is not necessary
> >> to process the backlog. It's already done in skb_splice_bits()
> > 
> > Yes, in the tcp_splice_read() they are added to remove a deadlock.
> 
> Could you elaborate ? A deadlock only if !SPLICE_F_NONBLOCK ?

Sorry, I meant that we drop lock in skb_splice_bits() to prevent the deadlock,
and tcp_splice_read() needs it to process the backlog.

I think that even with a non-blocking splice the release_sock/lock_sock
pair is needed, since we are able to do a parallel job: receive new data
(scheduled by the early release_sock backlog processing) in bh and
process already received data via the splice codepath.
Maybe in non-blocking splice mode this is not an issue, but for
blocking mode this allows skb_splice_bits to grab more skbs at once.

> > [...]
> > 
> > It could be done, but for what reason? To detect a disconnected socket early?
> > Is it worth the changes?
> 
> I was thinking about the case where your thread is doing a splice() from a tcp socket to
> a pipe, while another thread is doing the splice from this pipe to something else.
> 
> Once patched, tcp_read_sock() could loop a long time...
 
Well, it may be a good idea... I can't say anything against it :)
Eric Dumazet Jan. 11, 2009, 4 p.m. UTC | #12
Evgeniy Polyakov wrote:
> On Sun, Jan 11, 2009 at 02:14:57PM +0100, Eric Dumazet (dada1@cosmosbay.com) wrote:
>>>> [...]
>>> Yes, in the tcp_splice_read() they are added to remove a deadlock.
>> Could you elaborate ? A deadlock only if !SPLICE_F_NONBLOCK ?
> 
> Sorry, I meant that we drop lock in skb_splice_bits() to prevent the deadlock,
> and tcp_splice_read() needs it to process the backlog.

While we drop the lock in skb_splice_bits() to prevent the deadlock, we
also process the backlog at this stage. No need to process the backlog
again in the higher-level function.

> 
> I think that even with a non-blocking splice the release_sock/lock_sock
> pair is needed, since we are able to do a parallel job: receive new data
> (scheduled by the early release_sock backlog processing) in bh and
> process already received data via the splice codepath.
> Maybe in non-blocking splice mode this is not an issue, but for
> blocking mode this allows skb_splice_bits to grab more skbs at once.

skb_splice_bits() operates on one skb, you lost me :)



Evgeniy Polyakov Jan. 11, 2009, 4:05 p.m. UTC | #13
On Sun, Jan 11, 2009 at 05:00:37PM +0100, Eric Dumazet (dada1@cosmosbay.com) wrote:
> > [...]
> 
> While we drop the lock in skb_splice_bits() to prevent the deadlock, we
> also process the backlog at this stage. No need to process the backlog
> again in the higher-level function.

Yes, but having it earlier allows us to receive new skbs while processing
those already received.

> > [...]
> 
> skb_splice_bits() operates on one skb, you lost me :)

Exactly, and to get that we release the socket earlier so that the data can be
acked, and while we copy it or do anything else, the next one can be
received.
David Miller Jan. 14, 2009, 12:07 a.m. UTC | #14
From: Evgeniy Polyakov <zbr@ioremap.net>
Date: Sun, 11 Jan 2009 19:05:48 +0300

> On Sun, Jan 11, 2009 at 05:00:37PM +0100, Eric Dumazet (dada1@cosmosbay.com) wrote:
> > While we drop the lock in skb_splice_bits() to prevent the deadlock, we
> > also process the backlog at this stage. No need to process the backlog
> > again in the higher-level function.
> 
> Yes, but having it earlier allows us to receive new skbs while processing
> those already received.
> 
> > [...]
> > skb_splice_bits() operates on one skb, you lost me :)
> 
> Exactly, and to get that we release the socket earlier so that the data can be
> acked, and while we copy it or do anything else, the next one can be
> received.

I think the socket release in skb_splice_bits() (although necessary)
just muddies the waters, and whether the extra one done in
tcp_splice_read() helps at all is open to debate.

That skb_clone() done by skb_splice_bits() pisses me off too,
we really ought to fix that.  And we also have that data corruption
bug to cure too.
Evgeniy Polyakov Jan. 14, 2009, 12:13 a.m. UTC | #15
On Tue, Jan 13, 2009 at 04:07:21PM -0800, David Miller (davem@davemloft.net) wrote:
> > Exactly, and to get that we release the socket earlier so that the data can be
> > acked, and while we copy it or do anything else, the next one can be
> > received.
> 
> I think the socket release in skb_splice_bits() (although necessary)
> just muddies the waters, and whether the extra one done in
> tcp_splice_read() helps at all is open to debate.

Well, yes, probably a simple performance test with and without will
clarify things.

> That skb_clone() done by skb_splice_bits() pisses me off too,
> we really ought to fix that.  And we also have that data corruption
> bug to cure too.

The clone is needed since tcp expects to own the skb and frees it
unconditionally via __kfree_skb().

What is the best solution for the data corruption bug? To copy the data
all the time, or to implement our own allocator to be used in alloc_skb and
friends to allocate the head? I think it can be done transparently for
the drivers. I can volunteer for this :)
The first one is more appropriate for the current bugfix-only stage,
but it will result in the whole release being too slow.
David Miller Jan. 14, 2009, 12:16 a.m. UTC | #16
From: Evgeniy Polyakov <zbr@ioremap.net>
Date: Wed, 14 Jan 2009 03:13:45 +0300

> What is the best solution for the data corruption bug? To copy the data
> all the time, or to implement our own allocator to be used in alloc_skb and
> friends to allocate the head? I think it can be done transparently for
> the drivers. I can volunteer for this :)
> The first one is more appropriate for the current bugfix-only stage,
> but it will result in the whole release being too slow.

I am looking into this right now.

I wish there were some way we could make this code grab and release a
reference to the SKB data area (I mean skb_shinfo(skb)->dataref) to
accomplish its goals.
Evgeniy Polyakov Jan. 14, 2009, 12:22 a.m. UTC | #17
On Tue, Jan 13, 2009 at 04:16:25PM -0800, David Miller (davem@davemloft.net) wrote:
> I wish there were some way we could make this code grab and release a
> reference to the SKB data area (I mean skb_shinfo(skb)->dataref) to
> accomplish its goals.

Ugh... A clone without cloning, but by increasing the dataref. Given
that splice only needs the skb to track the head of the original, this
may really work.
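
(Editorial aside: a purely hypothetical sketch of the "clone without
cloning" idea being floated here. These helpers do not exist in the
kernel; a real version would also have to honour the payload bits
packed into dataref and release the page frags.)

/* Pin an skb's shared data area by taking an extra dataref, so pages
 * handed to a pipe stay valid after TCP frees the original skb via
 * __kfree_skb(). Names are invented for illustration. */
static inline void skb_data_ref_get(struct sk_buff *skb)
{
	atomic_inc(&skb_shinfo(skb)->dataref);
}

static inline void skb_data_ref_put(struct sk_buff *skb)
{
	/* mirrors the dataref drop done by skb_release_data() */
	if (atomic_dec_and_test(&skb_shinfo(skb)->dataref))
		kfree(skb->head);
}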
Herbert Xu Jan. 14, 2009, 12:51 a.m. UTC | #18
David Miller <davem@davemloft.net> wrote:
>
> I wish there were some way we could make this code grab and release a
> reference to the SKB data area (I mean skb_shinfo(skb)->dataref) to
> accomplish its goals.

We can probably do that for spliced data that end up going to
the networking stack again.  However, as splice is generic the
data may be headed to a destination other than the network stack.

In that case to make dataref work we'd need some mechanism
that allows non-network entities to get/drop this ref count.

Cheers,
David Miller Jan. 14, 2009, 1:24 a.m. UTC | #19
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 14 Jan 2009 11:51:10 +1100

> We can probably do that for spliced data that end up going to
> the networking stack again.  However, as splice is generic the
> data may be headed to a destination other than the network stack.
> 
> In that case to make dataref work we'd need some mechanism
> that allows non-network entities to get/drop this ref count.

I see.

I'll think about this some more.

Patch

diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 23808df..96b49e1 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -100,13 +100,11 @@  static void iscsi_sw_tcp_data_ready(struct sock *sk, int flag)
 
 	/*
 	 * Use rd_desc to pass 'conn' to iscsi_tcp_recv.
-	 * We set count to 1 because we want the network layer to
-	 * hand us all the skbs that are available. iscsi_tcp_recv
-	 * handled pdus that cross buffers or pdus that still need data.
+	 * iscsi_tcp_recv handled pdus that cross buffers or pdus that
+	 * still need data.
 	 */
 	rd_desc.arg.data = conn;
-	rd_desc.count = 1;
-	tcp_read_sock(sk, &rd_desc, iscsi_sw_tcp_recv);
+	tcp_read_sock(sk, &rd_desc, iscsi_sw_tcp_recv, 65536);
 
 	read_unlock(&sk->sk_callback_lock);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 218235d..b1facd1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -490,7 +490,7 @@  extern void tcp_get_info(struct sock *, struct tcp_info *);
 typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
 				unsigned int, size_t);
 extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
-			 sk_read_actor_t recv_actor);
+			 sk_read_actor_t recv_actor, size_t tlen);
 
 extern void tcp_initialize_rcv_mss(struct sock *sk);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bd6ff90..fbbddf4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -523,7 +523,7 @@  static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
 {
 	struct tcp_splice_state *tss = rd_desc->arg.data;
 
-	return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags);
+	return skb_splice_bits(skb, offset, tss->pipe, len, tss->flags);
 }
 
 static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
@@ -533,7 +533,7 @@  static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
 		.arg.data = tss,
 	};
 
-	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
+	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv, tss->len);
 }
 
 /**
@@ -611,11 +611,13 @@  ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 		tss.len -= ret;
 		spliced += ret;
 
+		if (!timeo)
+			break;
 		release_sock(sk);
 		lock_sock(sk);
 
 		if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
-		    (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo ||
+		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
 		    signal_pending(current))
 			break;
 	}
@@ -1193,7 +1195,7 @@  static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
  *	  (although both would be easy to implement).
  */
 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
-		  sk_read_actor_t recv_actor)
+		  sk_read_actor_t recv_actor, size_t tlen)
 {
 	struct sk_buff *skb;
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -1209,6 +1211,8 @@  int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 			size_t len;
 
 			len = skb->len - offset;
+			if (len > tlen)
+				len = tlen;
 			/* Stop reading if we hit a patch of urgent data */
 			if (tp->urg_data) {
 				u32 urg_offset = tp->urg_seq - seq;
@@ -1226,6 +1230,7 @@  int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 				seq += used;
 				copied += used;
 				offset += used;
+				tlen -= used;
 			}
 			/*
 			 * If recv_actor drops the lock (e.g. TCP splice
@@ -1243,7 +1248,7 @@  int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 			break;
 		}
 		sk_eat_skb(sk, skb, 0);
-		if (!desc->count)
+		if (!tlen)
 			break;
 	}
 	tp->copied_seq = seq;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 5cbb404..75f8e83 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1109,8 +1109,7 @@  static void xs_tcp_data_ready(struct sock *sk, int bytes)
 	/* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
 	rd_desc.arg.data = xprt;
 	do {
-		rd_desc.count = 65536;
-		read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
+		read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv, 65536);
 	} while (read > 0);
 out:
 	read_unlock(&sk->sk_callback_lock);