diff mbox

[net,v2] datagram: When peeking datagrams with offset < 0 don't skip empty skbs

Message ID 20170818021157.20070-1-matthew@mjdsystems.ca
State Superseded, archived
Delegated to: David Miller
Headers show

Commit Message

Matthew Dawson Aug. 18, 2017, 2:11 a.m. UTC
Due to commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac ("udp: remove
headers from UDP packets before queueing"), when udp packets are being
peeked the requested extra offset is always 0 as there is no need to skip
the udp header.  However, when the offset is 0 and the next skb is
of length 0, it is only returned once.  The behaviour can be seen with
the following python script:

from socket import *;
f=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
g=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
f.bind(('::', 0));
addr=('::1', f.getsockname()[1]);
g.sendto(b'', addr)
g.sendto(b'b', addr)
print(f.recvfrom(10, MSG_PEEK));
print(f.recvfrom(10, MSG_PEEK));

Where the expected output should be the empty string twice.

Instead, make sk_peek_offset return negative values, and pass those values
to __skb_try_recv_datagram/__skb_try_recv_from_queue.  If the passed offset
to __skb_try_recv_from_queue is negative, the checked skb is never skipped.
__skb_try_recv_from_queue will then ensure the offset is reset back to 0
if a peek is requested without an offset, unless no packets are found.

Also simplify the if condition in __skb_try_recv_from_queue.  If _off is
greater then 0, and off is greater then or equal to skb->len, then
(_off || skb->len) must always be true assuming skb->len >= 0 is always
true.

Also remove a redundant check around a call to sk_peek_offset in af_unix.c,
as it double checked if MSG_PEEK was set in the flags.

V2:
 - Moved the negative fixup into __skb_try_recv_from_queue, and remove now
redundant checks
 - Fix peeking in udp{,v6}_recvmsg to report the right value when the
offset is 0

Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
---
 include/net/sock.h  |  4 +---
 net/core/datagram.c | 12 +++++++++---
 net/ipv4/udp.c      |  3 ++-
 net/ipv6/udp.c      |  3 ++-
 net/unix/af_unix.c  |  5 +----
 5 files changed, 15 insertions(+), 12 deletions(-)

Comments

Paolo Abeni Aug. 18, 2017, 2:05 p.m. UTC | #1
On Thu, 2017-08-17 at 22:11 -0400, Matthew Dawson wrote:
> Due to commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac ("udp: remove
> headers from UDP packets before queueing"), when udp packets are being
> peeked the requested extra offset is always 0 as there is no need to skip
> the udp header.  However, when the offset is 0 and the next skb is
> of length 0, it is only returned once.  The behaviour can be seen with
> the following python script:
> 
> from socket import *;
> f=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
> g=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
> f.bind(('::', 0));
> addr=('::1', f.getsockname()[1]);
> g.sendto(b'', addr)
> g.sendto(b'b', addr)
> print(f.recvfrom(10, MSG_PEEK));
> print(f.recvfrom(10, MSG_PEEK));
> 
> Where the expected output should be the empty string twice.
> 
> Instead, make sk_peek_offset return negative values, and pass those values
> to __skb_try_recv_datagram/__skb_try_recv_from_queue.  If the passed offset
> to __skb_try_recv_from_queue is negative, the checked skb is never skipped.
> __skb_try_recv_from_queue will then ensure the offset is reset back to 0
> if a peek is requested without an offset, unless no packets are found.
> 
> Also simplify the if condition in __skb_try_recv_from_queue.  If _off is
> greater then 0, and off is greater then or equal to skb->len, then
> (_off || skb->len) must always be true assuming skb->len >= 0 is always
> true.
> 
> Also remove a redundant check around a call to sk_peek_offset in af_unix.c,
> as it double checked if MSG_PEEK was set in the flags.
> 
> V2:
>  - Moved the negative fixup into __skb_try_recv_from_queue, and remove now
> redundant checks
>  - Fix peeking in udp{,v6}_recvmsg to report the right value when the
> offset is 0
> 
> Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
> ---
>  include/net/sock.h  |  4 +---
>  net/core/datagram.c | 12 +++++++++---
>  net/ipv4/udp.c      |  3 ++-
>  net/ipv6/udp.c      |  3 ++-
>  net/unix/af_unix.c  |  5 +----
>  5 files changed, 15 insertions(+), 12 deletions(-)
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 7c0632c7e870..aeeec62992ca 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -507,9 +507,7 @@ int sk_set_peek_off(struct sock *sk, int val);
>  static inline int sk_peek_offset(struct sock *sk, int flags)
>  {
>  	if (unlikely(flags & MSG_PEEK)) {
> -		s32 off = READ_ONCE(sk->sk_peek_off);
> -		if (off >= 0)
> -			return off;
> +		return READ_ONCE(sk->sk_peek_off);
>  	}
>  
>  	return 0;
> diff --git a/net/core/datagram.c b/net/core/datagram.c
> index ee5647bd91b3..4b558503bef5 100644
> --- a/net/core/datagram.c
> +++ b/net/core/datagram.c
> @@ -169,14 +169,20 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
>  					  int *peeked, int *off, int *err,
>  					  struct sk_buff **last)
>  {
> +	bool peek_at_off = false;
>  	struct sk_buff *skb;
> -	int _off = *off;
> +	int _off = 0;
> +
> +	if (flags & MSG_PEEK && *off >= 0) {
> +		peek_at_off = true;
> +		_off = *off;
> +	}

I think that unlikely() will fit the above condition

>  
>  	*last = queue->prev;
>  	skb_queue_walk(queue, skb) {
>  		if (flags & MSG_PEEK) {
> -			if (_off >= skb->len && (skb->len || _off ||
> -						 skb->peeked)) {
> +			if (peek_at_off && _off >= skb->len &&
> +			    (_off || skb->peeked)) {
>  				_off -= skb->len;
>  				continue;
>  			}
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index a7c804f73990..cd1d044a7fa5 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1574,7 +1574,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
>  		return ip_recv_error(sk, msg, len, addr_len);
>  
>  try_again:
> -	peeking = off = sk_peek_offset(sk, flags);
> +	peeking = flags & MSG_PEEK;
> +	off = sk_peek_offset(sk, flags);
>  	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
>  	if (!skb)
>  		return err;
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 578142b7ca3e..20039c8501eb 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -362,7 +362,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
>  		return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
>  
>  try_again:
> -	peeking = off = sk_peek_offset(sk, flags);
> +	peeking = flags & MSG_PEEK;
> +	off = sk_peek_offset(sk, flags);
>  	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
>  	if (!skb)
>  		return err;
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index 7b52a380d710..be8982b4f8c0 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -2304,10 +2304,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
>  	 */
>  	mutex_lock(&u->iolock);
>  
> -	if (flags & MSG_PEEK)
> -		skip = sk_peek_offset(sk, flags);
> -	else
> -		skip = 0;
> +	skip = max(sk_peek_offset(sk, flags), 0);
>  
>  	do {
>  		int chunk;

later we have:

	chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);

without any call to __skb_try_recv_from_queue(), so we will get
bad/unexpected values from the above assignment when 'skip' is
negative.

Overall I still think that adding/using an explicit MSG_PEEK_OFF bit
would produce a simpler code, but is just a personal preference.

Paolo
Matthew Dawson Aug. 18, 2017, 4:39 p.m. UTC | #2
On Friday, August 18, 2017 10:05:18 AM EDT Paolo Abeni wrote:
> On Thu, 2017-08-17 at 22:11 -0400, Matthew Dawson wrote:
> > Due to commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac ("udp: remove
> > headers from UDP packets before queueing"), when udp packets are being
> > peeked the requested extra offset is always 0 as there is no need to skip
> > the udp header.  However, when the offset is 0 and the next skb is
> > of length 0, it is only returned once.  The behaviour can be seen with
> > the following python script:
> > 
> > from socket import *;
> > f=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
> > g=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
> > f.bind(('::', 0));
> > addr=('::1', f.getsockname()[1]);
> > g.sendto(b'', addr)
> > g.sendto(b'b', addr)
> > print(f.recvfrom(10, MSG_PEEK));
> > print(f.recvfrom(10, MSG_PEEK));
> > 
> > Where the expected output should be the empty string twice.
> > 
> > Instead, make sk_peek_offset return negative values, and pass those values
> > to __skb_try_recv_datagram/__skb_try_recv_from_queue.  If the passed
> > offset
> > to __skb_try_recv_from_queue is negative, the checked skb is never
> > skipped.
> > __skb_try_recv_from_queue will then ensure the offset is reset back to 0
> > if a peek is requested without an offset, unless no packets are found.
> > 
> > Also simplify the if condition in __skb_try_recv_from_queue.  If _off is
> > greater then 0, and off is greater then or equal to skb->len, then
> > (_off || skb->len) must always be true assuming skb->len >= 0 is always
> > true.
> > 
> > Also remove a redundant check around a call to sk_peek_offset in
> > af_unix.c,
> > as it double checked if MSG_PEEK was set in the flags.
> > 
> > V2:
> >  - Moved the negative fixup into __skb_try_recv_from_queue, and remove now
> > 
> > redundant checks
> > 
> >  - Fix peeking in udp{,v6}_recvmsg to report the right value when the
> > 
> > offset is 0
> > 
> > Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>
> > ---
> > 
> >  include/net/sock.h  |  4 +---
> >  net/core/datagram.c | 12 +++++++++---
> >  net/ipv4/udp.c      |  3 ++-
> >  net/ipv6/udp.c      |  3 ++-
> >  net/unix/af_unix.c  |  5 +----
> >  5 files changed, 15 insertions(+), 12 deletions(-)
> > 
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index 7c0632c7e870..aeeec62992ca 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -507,9 +507,7 @@ int sk_set_peek_off(struct sock *sk, int val);
> > 
> >  static inline int sk_peek_offset(struct sock *sk, int flags)
> >  {
> >  
> >  	if (unlikely(flags & MSG_PEEK)) {
> > 
> > -		s32 off = READ_ONCE(sk->sk_peek_off);
> > -		if (off >= 0)
> > -			return off;
> > +		return READ_ONCE(sk->sk_peek_off);
> > 
> >  	}
> >  	
> >  	return 0;
> > 
> > diff --git a/net/core/datagram.c b/net/core/datagram.c
> > index ee5647bd91b3..4b558503bef5 100644
> > --- a/net/core/datagram.c
> > +++ b/net/core/datagram.c
> > @@ -169,14 +169,20 @@ struct sk_buff *__skb_try_recv_from_queue(struct
> > sock *sk,> 
> >  					  int *peeked, int *off, int *err,
> >  					  struct sk_buff **last)
> >  
> >  {
> > 
> > +	bool peek_at_off = false;
> > 
> >  	struct sk_buff *skb;
> > 
> > -	int _off = *off;
> > +	int _off = 0;
> > +
> > +	if (flags & MSG_PEEK && *off >= 0) {
> > +		peek_at_off = true;
> > +		_off = *off;
> > +	}
> 
> I think that unlikely() will fit the above condition
Sounds good.

> 
> >  	*last = queue->prev;
> >  	skb_queue_walk(queue, skb) {
> >  	
> >  		if (flags & MSG_PEEK) {
> > 
> > -			if (_off >= skb->len && (skb->len || _off ||
> > -						 skb->peeked)) {
> > +			if (peek_at_off && _off >= skb->len &&
> > +			    (_off || skb->peeked)) {
> > 
> >  				_off -= skb->len;
> >  				continue;
> >  			
> >  			}
> > 
> > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> > index a7c804f73990..cd1d044a7fa5 100644
> > --- a/net/ipv4/udp.c
> > +++ b/net/ipv4/udp.c
> > @@ -1574,7 +1574,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg,
> > size_t len, int noblock,> 
> >  		return ip_recv_error(sk, msg, len, addr_len);
> >  
> >  try_again:
> > -	peeking = off = sk_peek_offset(sk, flags);
> > +	peeking = flags & MSG_PEEK;
> > +	off = sk_peek_offset(sk, flags);
> > 
> >  	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
> >  	if (!skb)
> >  	
> >  		return err;
> > 
> > diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> > index 578142b7ca3e..20039c8501eb 100644
> > --- a/net/ipv6/udp.c
> > +++ b/net/ipv6/udp.c
> > @@ -362,7 +362,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg,
> > size_t len,> 
> >  		return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
> >  
> >  try_again:
> > -	peeking = off = sk_peek_offset(sk, flags);
> > +	peeking = flags & MSG_PEEK;
> > +	off = sk_peek_offset(sk, flags);
> > 
> >  	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
> >  	if (!skb)
> >  	
> >  		return err;
> > 
> > diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> > index 7b52a380d710..be8982b4f8c0 100644
> > --- a/net/unix/af_unix.c
> > +++ b/net/unix/af_unix.c
> > @@ -2304,10 +2304,7 @@ static int unix_stream_read_generic(struct
> > unix_stream_read_state *state,> 
> >  	 */
> >  	
> >  	mutex_lock(&u->iolock);
> > 
> > -	if (flags & MSG_PEEK)
> > -		skip = sk_peek_offset(sk, flags);
> > -	else
> > -		skip = 0;
> > +	skip = max(sk_peek_offset(sk, flags), 0);
> > 
> >  	do {
> >  	
> >  		int chunk;
> 
> later we have:
> 
> 	chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
> 
> without any call to __skb_try_recv_from_queue(), so we will get
> bad/unexpected values from the above assignment when 'skip' is
> negative.
The assignment to skip should ensure it is never less then zero, thanks to the 
max(sk...(), 0).  Thus that shouldn't be an issue?

> 
> Overall I still think that adding/using an explicit MSG_PEEK_OFF bit
> would produce a simpler code, but is just a personal preference.
I don't mind either way, that just seemed to be the preference I saw from the 
discussion around the patch.  I think either way will work, so whatever the 
list prefers I'm happy with.
Paolo Abeni Aug. 18, 2017, 5:42 p.m. UTC | #3
On Fri, 2017-08-18 at 12:39 -0400, Matthew Dawson wrote:
> On Friday, August 18, 2017 10:05:18 AM EDT Paolo Abeni wrote:
> > On Thu, 2017-08-17 at 22:11 -0400, Matthew Dawson wrote:
> > > diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> > > index 7b52a380d710..be8982b4f8c0 100644
> > > --- a/net/unix/af_unix.c
> > > +++ b/net/unix/af_unix.c
> > > @@ -2304,10 +2304,7 @@ static int unix_stream_read_generic(struct
> > > unix_stream_read_state *state,> 
> > >  	 */
> > >  	
> > >  	mutex_lock(&u->iolock);
> > > 
> > > -	if (flags & MSG_PEEK)
> > > -		skip = sk_peek_offset(sk, flags);
> > > -	else
> > > -		skip = 0;
> > > +	skip = max(sk_peek_offset(sk, flags), 0);
> > > 
> > >  	do {
> > >  	
> > >  		int chunk;
> > 
> > later we have:
> > 
> > 	chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
> > 
> > without any call to __skb_try_recv_from_queue(), so we will get
> > bad/unexpected values from the above assignment when 'skip' is
> > negative.
> 
> The assignment to skip should ensure it is never less then zero, thanks to the 
> max(sk...(), 0).  Thus that shouldn't be an issue?

Right, I missed the max() call. Thanks for pointing it out. 
I'm fine with the above.

> > 
> > Overall I still think that adding/using an explicit MSG_PEEK_OFF bit
> > would produce a simpler code, but is just a personal preference.
> 
> I don't mind either way, that just seemed to be the preference I saw from the 
> discussion around the patch.  I think either way will work, so whatever the 
> list prefers I'm happy with.

I'm ok either way. Probably it's worth continue this way.

Paolo
Willem de Bruijn Aug. 18, 2017, 5:52 p.m. UTC | #4
On Fri, Aug 18, 2017 at 1:42 PM, Paolo Abeni <pabeni@redhat.com> wrote:
> On Fri, 2017-08-18 at 12:39 -0400, Matthew Dawson wrote:
>> On Friday, August 18, 2017 10:05:18 AM EDT Paolo Abeni wrote:
>> > On Thu, 2017-08-17 at 22:11 -0400, Matthew Dawson wrote:
>> > > diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
>> > > index 7b52a380d710..be8982b4f8c0 100644
>> > > --- a/net/unix/af_unix.c
>> > > +++ b/net/unix/af_unix.c
>> > > @@ -2304,10 +2304,7 @@ static int unix_stream_read_generic(struct
>> > > unix_stream_read_state *state,>
>> > >    */
>> > >
>> > >   mutex_lock(&u->iolock);
>> > >
>> > > - if (flags & MSG_PEEK)
>> > > -         skip = sk_peek_offset(sk, flags);
>> > > - else
>> > > -         skip = 0;
>> > > + skip = max(sk_peek_offset(sk, flags), 0);
>> > >
>> > >   do {
>> > >
>> > >           int chunk;
>> >
>> > later we have:
>> >
>> >     chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
>> >
>> > without any call to __skb_try_recv_from_queue(), so we will get
>> > bad/unexpected values from the above assignment when 'skip' is
>> > negative.
>>
>> The assignment to skip should ensure it is never less then zero, thanks to the
>> max(sk...(), 0).  Thus that shouldn't be an issue?
>
> Right, I missed the max() call. Thanks for pointing it out.
> I'm fine with the above.
>
>> >
>> > Overall I still think that adding/using an explicit MSG_PEEK_OFF bit
>> > would produce a simpler code, but is just a personal preference.
>>
>> I don't mind either way, that just seemed to be the preference I saw from the
>> discussion around the patch.  I think either way will work, so whatever the
>> list prefers I'm happy with.
>
> I'm ok either way. Probably it's worth continue this way.

I don't think anyone cares too much, as long as this is fixed.

I have a slight subjective preference for directly passing the sk_peek_off
variable and relying on the negative value as signal for whether the
SO_PEEK_OFF mode is enabled or not, simply because that signal
already exists and we avoid an intermediate conversion step.

That said, I have a follow-on bug fix for __sk_queue_drop_skb where
that signal is not available, but the flags argument is. I believe that
we will need to take the same action in both cases, so that this is moot.
Just mentioning it in case that would sway opinion the other way.

As said, I'm fine with both. Will Ack this.
Willem de Bruijn Aug. 18, 2017, 5:52 p.m. UTC | #5
On Thu, Aug 17, 2017 at 10:11 PM, Matthew Dawson <matthew@mjdsystems.ca> wrote:
> Due to commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac ("udp: remove
> headers from UDP packets before queueing"), when udp packets are being
> peeked the requested extra offset is always 0 as there is no need to skip
> the udp header.  However, when the offset is 0 and the next skb is
> of length 0, it is only returned once.  The behaviour can be seen with
> the following python script:
>
> from socket import *;
> f=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
> g=socket(AF_INET6, SOCK_DGRAM | SOCK_NONBLOCK, 0);
> f.bind(('::', 0));
> addr=('::1', f.getsockname()[1]);
> g.sendto(b'', addr)
> g.sendto(b'b', addr)
> print(f.recvfrom(10, MSG_PEEK));
> print(f.recvfrom(10, MSG_PEEK));
>
> Where the expected output should be the empty string twice.
>
> Instead, make sk_peek_offset return negative values, and pass those values
> to __skb_try_recv_datagram/__skb_try_recv_from_queue.  If the passed offset
> to __skb_try_recv_from_queue is negative, the checked skb is never skipped.
> __skb_try_recv_from_queue will then ensure the offset is reset back to 0
> if a peek is requested without an offset, unless no packets are found.
>
> Also simplify the if condition in __skb_try_recv_from_queue.  If _off is
> greater then 0, and off is greater then or equal to skb->len, then
> (_off || skb->len) must always be true assuming skb->len >= 0 is always
> true.
>
> Also remove a redundant check around a call to sk_peek_offset in af_unix.c,
> as it double checked if MSG_PEEK was set in the flags.
>
> V2:
>  - Moved the negative fixup into __skb_try_recv_from_queue, and remove now
> redundant checks
>  - Fix peeking in udp{,v6}_recvmsg to report the right value when the
> offset is 0
>
> Signed-off-by: Matthew Dawson <matthew@mjdsystems.ca>

Acked-by: Willem de Bruijn <willemb@google.com>
Willem de Bruijn Aug. 18, 2017, 5:56 p.m. UTC | #6
>> > +   if (flags & MSG_PEEK && *off >= 0) {
>> > +           peek_at_off = true;
>> > +           _off = *off;
>> > +   }
>>
>> I think that unlikely() will fit the above condition
> Sounds good.

Doesn't the compiler implicitly mark branches as unlikely if they
do not have an else clause?
diff mbox

Patch

diff --git a/include/net/sock.h b/include/net/sock.h
index 7c0632c7e870..aeeec62992ca 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -507,9 +507,7 @@  int sk_set_peek_off(struct sock *sk, int val);
 static inline int sk_peek_offset(struct sock *sk, int flags)
 {
 	if (unlikely(flags & MSG_PEEK)) {
-		s32 off = READ_ONCE(sk->sk_peek_off);
-		if (off >= 0)
-			return off;
+		return READ_ONCE(sk->sk_peek_off);
 	}
 
 	return 0;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index ee5647bd91b3..4b558503bef5 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -169,14 +169,20 @@  struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
 					  int *peeked, int *off, int *err,
 					  struct sk_buff **last)
 {
+	bool peek_at_off = false;
 	struct sk_buff *skb;
-	int _off = *off;
+	int _off = 0;
+
+	if (flags & MSG_PEEK && *off >= 0) {
+		peek_at_off = true;
+		_off = *off;
+	}
 
 	*last = queue->prev;
 	skb_queue_walk(queue, skb) {
 		if (flags & MSG_PEEK) {
-			if (_off >= skb->len && (skb->len || _off ||
-						 skb->peeked)) {
+			if (peek_at_off && _off >= skb->len &&
+			    (_off || skb->peeked)) {
 				_off -= skb->len;
 				continue;
 			}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index a7c804f73990..cd1d044a7fa5 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1574,7 +1574,8 @@  int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
 		return ip_recv_error(sk, msg, len, addr_len);
 
 try_again:
-	peeking = off = sk_peek_offset(sk, flags);
+	peeking = flags & MSG_PEEK;
+	off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
 	if (!skb)
 		return err;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 578142b7ca3e..20039c8501eb 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -362,7 +362,8 @@  int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
 
 try_again:
-	peeking = off = sk_peek_offset(sk, flags);
+	peeking = flags & MSG_PEEK;
+	off = sk_peek_offset(sk, flags);
 	skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
 	if (!skb)
 		return err;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 7b52a380d710..be8982b4f8c0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2304,10 +2304,7 @@  static int unix_stream_read_generic(struct unix_stream_read_state *state,
 	 */
 	mutex_lock(&u->iolock);
 
-	if (flags & MSG_PEEK)
-		skip = sk_peek_offset(sk, flags);
-	else
-		skip = 0;
+	skip = max(sk_peek_offset(sk, flags), 0);
 
 	do {
 		int chunk;