diff mbox

[net-next,v3,4/8] net: Change return type of sk_busy_loop from bool to void

Message ID 20170324170812.15226.97497.stgit@localhost.localdomain
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Alexander H Duyck March 24, 2017, 5:08 p.m. UTC
From: Alexander Duyck <alexander.h.duyck@intel.com>

From what I can tell there are only a couple of spots where we are actually
checking the return value of sk_busy_loop. As there are only a few
consumers of that data, and the data being checked for can be replaced
with a check for !skb_queue_empty() we might as well just pull the code
out of sk_busy_loop and place it in the spots that actually need it.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Acked-by: Eric Dumazet <edumazet@google.com>
---
 include/net/busy_poll.h |    5 ++---
 net/core/datagram.c     |    8 ++++++--
 net/core/dev.c          |   25 +++++++++++--------------
 net/sctp/socket.c       |    9 ++++++---
 4 files changed, 25 insertions(+), 22 deletions(-)

Comments

Christoph Paasch March 20, 2019, 6:35 p.m. UTC | #1
Hello,

On Fri, Mar 24, 2017 at 3:23 PM Alexander Duyck
<alexander.duyck@gmail.com> wrote:
>
> From: Alexander Duyck <alexander.h.duyck@intel.com>
>
> >From what I can tell there is only a couple spots where we are actually
> checking the return value of sk_busy_loop. As there are only a few
> consumers of that data, and the data being checked for can be replaced
> with a check for !skb_queue_empty() we might as well just pull the code
> out of sk_busy_loop and place it in the spots that actually need it.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> Acked-by: Eric Dumazet <edumazet@google.com>
> ---
>  include/net/busy_poll.h |    5 ++---
>  net/core/datagram.c     |    8 ++++++--
>  net/core/dev.c          |   25 +++++++++++--------------
>  net/sctp/socket.c       |    9 ++++++---
>  4 files changed, 25 insertions(+), 22 deletions(-)
>
> diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
> index b82d6ba70a14..c55760f4820f 100644
> --- a/include/net/busy_poll.h
> +++ b/include/net/busy_poll.h
> @@ -74,7 +74,7 @@ static inline bool busy_loop_timeout(unsigned long end_time)
>         return time_after(now, end_time);
>  }
>
> -bool sk_busy_loop(struct sock *sk, int nonblock);
> +void sk_busy_loop(struct sock *sk, int nonblock);
>
>  #else /* CONFIG_NET_RX_BUSY_POLL */
>  static inline unsigned long net_busy_loop_on(void)
> @@ -97,9 +97,8 @@ static inline bool busy_loop_timeout(unsigned long end_time)
>         return true;
>  }
>
> -static inline bool sk_busy_loop(struct sock *sk, int nonblock)
> +static inline void sk_busy_loop(struct sock *sk, int nonblock)
>  {
> -       return false;
>  }
>
>  #endif /* CONFIG_NET_RX_BUSY_POLL */
> diff --git a/net/core/datagram.c b/net/core/datagram.c
> index ea633342ab0d..4608aa245410 100644
> --- a/net/core/datagram.c
> +++ b/net/core/datagram.c
> @@ -256,8 +256,12 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
>                 }
>
>                 spin_unlock_irqrestore(&queue->lock, cpu_flags);
> -       } while (sk_can_busy_loop(sk) &&
> -                sk_busy_loop(sk, flags & MSG_DONTWAIT));
> +
> +               if (!sk_can_busy_loop(sk))
> +                       break;
> +
> +               sk_busy_loop(sk, flags & MSG_DONTWAIT);
> +       } while (!skb_queue_empty(&sk->sk_receive_queue));

since this change I am hitting stalls where it's looping in this
while-loop with syzkaller.

It worked prior to this change because sk->sk_napi_id was not set, and thus
sk_busy_loop would make us get out of the loop.

Now, it keeps on looping because there is an skb in the queue with
skb->len == 0 and we are peeking with an offset, thus
__skb_try_recv_from_queue will return NULL and thus we have no way of
getting out of the loop.

I'm not sure what would be the best way to fix it. I don't see why we
end up with an skb in the list with skb->len == 0. So, shooting a
quick e-mail, maybe somebody has an idea :-)

I have the syzkaller-reproducer if needed.

Thanks,
Christoph



>
>         error = -EAGAIN;
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index ab337bf5bbf4..af70eb6ba682 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -5060,21 +5060,19 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
>                 do_softirq();
>  }
>
> -bool sk_busy_loop(struct sock *sk, int nonblock)
> +void sk_busy_loop(struct sock *sk, int nonblock)
>  {
>         unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
>         int (*napi_poll)(struct napi_struct *napi, int budget);
>         void *have_poll_lock = NULL;
>         struct napi_struct *napi;
>         unsigned int napi_id;
> -       int rc;
>
>  restart:
>         napi_id = READ_ONCE(sk->sk_napi_id);
>         if (napi_id < MIN_NAPI_ID)
> -               return 0;
> +               return;
>
> -       rc = false;
>         napi_poll = NULL;
>
>         rcu_read_lock();
> @@ -5085,7 +5083,8 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
>
>         preempt_disable();
>         for (;;) {
> -               rc = 0;
> +               int work = 0;
> +
>                 local_bh_disable();
>                 if (!napi_poll) {
>                         unsigned long val = READ_ONCE(napi->state);
> @@ -5103,12 +5102,12 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
>                         have_poll_lock = netpoll_poll_lock(napi);
>                         napi_poll = napi->poll;
>                 }
> -               rc = napi_poll(napi, BUSY_POLL_BUDGET);
> -               trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
> +               work = napi_poll(napi, BUSY_POLL_BUDGET);
> +               trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
>  count:
> -               if (rc > 0)
> +               if (work > 0)
>                         __NET_ADD_STATS(sock_net(sk),
> -                                       LINUX_MIB_BUSYPOLLRXPACKETS, rc);
> +                                       LINUX_MIB_BUSYPOLLRXPACKETS, work);
>                 local_bh_enable();
>
>                 if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
> @@ -5121,9 +5120,9 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
>                         preempt_enable();
>                         rcu_read_unlock();
>                         cond_resched();
> -                       rc = !skb_queue_empty(&sk->sk_receive_queue);
> -                       if (rc || busy_loop_timeout(end_time))
> -                               return rc;
> +                       if (!skb_queue_empty(&sk->sk_receive_queue) ||
> +                           busy_loop_timeout(end_time))
> +                               return;
>                         goto restart;
>                 }
>                 cpu_relax();
> @@ -5131,10 +5130,8 @@ bool sk_busy_loop(struct sock *sk, int nonblock)
>         if (napi_poll)
>                 busy_poll_stop(napi, have_poll_lock);
>         preempt_enable();
> -       rc = !skb_queue_empty(&sk->sk_receive_queue);
>  out:
>         rcu_read_unlock();
> -       return rc;
>  }
>  EXPORT_SYMBOL(sk_busy_loop);
>
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 72cc3ecf6516..ccc08fc39722 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -7518,9 +7518,12 @@ struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
>                 if (sk->sk_shutdown & RCV_SHUTDOWN)
>                         break;
>
> -               if (sk_can_busy_loop(sk) &&
> -                   sk_busy_loop(sk, noblock))
> -                       continue;
> +               if (sk_can_busy_loop(sk)) {
> +                       sk_busy_loop(sk, noblock);
> +
> +                       if (!skb_queue_empty(&sk->sk_receive_queue))
> +                               continue;
> +               }
>
>                 /* User doesn't want to wait.  */
>                 error = -EAGAIN;
>
David Miller March 20, 2019, 7:40 p.m. UTC | #2
From: Christoph Paasch <christoph.paasch@gmail.com>
Date: Wed, 20 Mar 2019 11:35:31 -0700

> On Fri, Mar 24, 2017 at 3:23 PM Alexander Duyck
> <alexander.duyck@gmail.com> wrote:
>> diff --git a/net/core/datagram.c b/net/core/datagram.c
>> index ea633342ab0d..4608aa245410 100644
>> --- a/net/core/datagram.c
>> +++ b/net/core/datagram.c
>> @@ -256,8 +256,12 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
>>                 }
>>
>>                 spin_unlock_irqrestore(&queue->lock, cpu_flags);
>> -       } while (sk_can_busy_loop(sk) &&
>> -                sk_busy_loop(sk, flags & MSG_DONTWAIT));
>> +
>> +               if (!sk_can_busy_loop(sk))
>> +                       break;
>> +
>> +               sk_busy_loop(sk, flags & MSG_DONTWAIT);
>> +       } while (!skb_queue_empty(&sk->sk_receive_queue));
> 
> since this change I am hitting stalls where it's looping in this
> while-loop with syzkaller.
> 
> It worked prior to this change because sk->sk_napi_id was not set thus
> sk_busy_loop would make us get out of the loop.
> 
> Now, it keeps on looping because there is an skb in the queue with
> skb->len == 0 and we are peeking with an offset, thus
> __skb_try_recv_from_queue will return NULL and thus we have no way of
> getting out of the loop.
> 
> I'm not sure what would be the best way to fix it. I don't see why we
> end up with an skb in the list with skb->len == 0. So, shooting a
> quick e-mail, maybe somebody has an idea :-)
> 
> I have the syzkaller-reproducer if needed.

Just for the record, __skb_try_recv_datagram() and its friend
__skb_try_recv_from_queue() are my least favorite functions in the
entire tree for the past year or so.

Their current design, and how they assume things about the
implementation of SKB queues, together result in all the weird
problems we keep fixing in them.

There has to be a much better way to do this.
Paolo Abeni March 21, 2019, 9:45 a.m. UTC | #3
Hi,

On Wed, 2019-03-20 at 11:35 -0700, Christoph Paasch wrote:
> Hello,
> 
> On Fri, Mar 24, 2017 at 3:23 PM Alexander Duyck
> <alexander.duyck@gmail.com> wrote:
> > From: Alexander Duyck <alexander.h.duyck@intel.com>
> > 
> > > From what I can tell there is only a couple spots where we are actually
> > checking the return value of sk_busy_loop. As there are only a few
> > consumers of that data, and the data being checked for can be replaced
> > with a check for !skb_queue_empty() we might as well just pull the code
> > out of sk_busy_loop and place it in the spots that actually need it.
> > 
> > Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> > Acked-by: Eric Dumazet <edumazet@google.com>
> > ---
> >  include/net/busy_poll.h |    5 ++---
> >  net/core/datagram.c     |    8 ++++++--
> >  net/core/dev.c          |   25 +++++++++++--------------
> >  net/sctp/socket.c       |    9 ++++++---
> >  4 files changed, 25 insertions(+), 22 deletions(-)
> > 
> > diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
> > index b82d6ba70a14..c55760f4820f 100644
> > --- a/include/net/busy_poll.h
> > +++ b/include/net/busy_poll.h
> > @@ -74,7 +74,7 @@ static inline bool busy_loop_timeout(unsigned long end_time)
> >         return time_after(now, end_time);
> >  }
> > 
> > -bool sk_busy_loop(struct sock *sk, int nonblock);
> > +void sk_busy_loop(struct sock *sk, int nonblock);
> > 
> >  #else /* CONFIG_NET_RX_BUSY_POLL */
> >  static inline unsigned long net_busy_loop_on(void)
> > @@ -97,9 +97,8 @@ static inline bool busy_loop_timeout(unsigned long end_time)
> >         return true;
> >  }
> > 
> > -static inline bool sk_busy_loop(struct sock *sk, int nonblock)
> > +static inline void sk_busy_loop(struct sock *sk, int nonblock)
> >  {
> > -       return false;
> >  }
> > 
> >  #endif /* CONFIG_NET_RX_BUSY_POLL */
> > diff --git a/net/core/datagram.c b/net/core/datagram.c
> > index ea633342ab0d..4608aa245410 100644
> > --- a/net/core/datagram.c
> > +++ b/net/core/datagram.c
> > @@ -256,8 +256,12 @@ struct sk_buff *__skb_try_recv_datagram(struct
> > sock *sk, unsigned int flags,
> >                 }
> > 
> >                 spin_unlock_irqrestore(&queue->lock, cpu_flags);
> > -       } while (sk_can_busy_loop(sk) &&
> > -                sk_busy_loop(sk, flags & MSG_DONTWAIT));
> > +
> > +               if (!sk_can_busy_loop(sk))
> > +                       break;
> > +
> > +               sk_busy_loop(sk, flags & MSG_DONTWAIT);
> > +       } while (!skb_queue_empty(&sk->sk_receive_queue));
> 
> since this change I am hitting stalls where it's looping in this
> while-loop with syzkaller.
> 
> It worked prior to this change because sk->sk_napi_id was not set thus
> sk_busy_loop would make us get out of the loop.
> 
> Now, it keeps on looping because there is an skb in the queue with
> skb->len == 0 and we are peeking with an offset, thus
> __skb_try_recv_from_queue will return NULL and thus we have no way of
> getting out of the loop.
> 
> I'm not sure what would be the best way to fix it. I don't see why we
> end up with an skb in the list with skb->len == 0. So, shooting a
> quick e-mail, maybe somebody has an idea :-)
> 
> I have the syzkaller-reproducer if needed.

IIRC we can have 0 len UDP packet sitting on sk_receive_queue since:

commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac
Author: samanthakumar <samanthakumar@google.com>
Date:   Tue Apr 5 12:41:15 2016 -0400

    udp: remove headers from UDP packets before queueing

Both __skb_try_recv_datagram() and napi_busy_loop() assume that we
received some packets if the queue is not empty. When peeking such
assumption is not true, we should check if the last packet is changed,
as __skb_recv_datagram() already does. So I *think* the root cause of
this issue is older than Alex's patch.

The following - completely untested - should avoid the unbounded loop,
but it's not a complete fix, I *think* we should also change
sk_busy_loop_end() in a similar way, but that is a little more complex
due to the additional indirections.

Could you please test it?

Any feedback welcome!


Could you please test it?

Paolo
---
diff --git a/net/core/datagram.c b/net/core/datagram.c
index b2651bb6d2a3..e657289db4ac 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -279,7 +279,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock
*sk, unsigned int flags,
                        break;
 
                sk_busy_loop(sk, flags & MSG_DONTWAIT);
-       } while (!skb_queue_empty(&sk->sk_receive_queue));
+       } while (sk->sk_receive_queue.prev != *last);
 
        error = -EAGAIN;
Willem de Bruijn March 21, 2019, 2:28 p.m. UTC | #4
On Thu, Mar 21, 2019 at 5:46 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> Hi,
>
> On Wed, 2019-03-20 at 11:35 -0700, Christoph Paasch wrote:
> > Hello,
> >
> > On Fri, Mar 24, 2017 at 3:23 PM Alexander Duyck
> > <alexander.duyck@gmail.com> wrote:
> > > From: Alexander Duyck <alexander.h.duyck@intel.com>
> > >
> > > > From what I can tell there is only a couple spots where we are actually
> > > checking the return value of sk_busy_loop. As there are only a few
> > > consumers of that data, and the data being checked for can be replaced
> > > with a check for !skb_queue_empty() we might as well just pull the code
> > > out of sk_busy_loop and place it in the spots that actually need it.
> > >
> > > Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> > > Acked-by: Eric Dumazet <edumazet@google.com>
> > > ---
> > >  include/net/busy_poll.h |    5 ++---
> > >  net/core/datagram.c     |    8 ++++++--
> > >  net/core/dev.c          |   25 +++++++++++--------------
> > >  net/sctp/socket.c       |    9 ++++++---
> > >  4 files changed, 25 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
> > > index b82d6ba70a14..c55760f4820f 100644
> > > --- a/include/net/busy_poll.h
> > > +++ b/include/net/busy_poll.h
> > > @@ -74,7 +74,7 @@ static inline bool busy_loop_timeout(unsigned long end_time)
> > >         return time_after(now, end_time);
> > >  }
> > >
> > > -bool sk_busy_loop(struct sock *sk, int nonblock);
> > > +void sk_busy_loop(struct sock *sk, int nonblock);
> > >
> > >  #else /* CONFIG_NET_RX_BUSY_POLL */
> > >  static inline unsigned long net_busy_loop_on(void)
> > > @@ -97,9 +97,8 @@ static inline bool busy_loop_timeout(unsigned long end_time)
> > >         return true;
> > >  }
> > >
> > > -static inline bool sk_busy_loop(struct sock *sk, int nonblock)
> > > +static inline void sk_busy_loop(struct sock *sk, int nonblock)
> > >  {
> > > -       return false;
> > >  }
> > >
> > >  #endif /* CONFIG_NET_RX_BUSY_POLL */
> > > diff --git a/net/core/datagram.c b/net/core/datagram.c
> > > index ea633342ab0d..4608aa245410 100644
> > > --- a/net/core/datagram.c
> > > +++ b/net/core/datagram.c
> > > @@ -256,8 +256,12 @@ struct sk_buff *__skb_try_recv_datagram(struct
> > > sock *sk, unsigned int flags,
> > >                 }
> > >
> > >                 spin_unlock_irqrestore(&queue->lock, cpu_flags);
> > > -       } while (sk_can_busy_loop(sk) &&
> > > -                sk_busy_loop(sk, flags & MSG_DONTWAIT));
> > > +
> > > +               if (!sk_can_busy_loop(sk))
> > > +                       break;
> > > +
> > > +               sk_busy_loop(sk, flags & MSG_DONTWAIT);
> > > +       } while (!skb_queue_empty(&sk->sk_receive_queue));
> >
> > since this change I am hitting stalls where it's looping in this
> > while-loop with syzkaller.
> >
> > It worked prior to this change because sk->sk_napi_id was not set thus
> > sk_busy_loop would make us get out of the loop.
> >
> > Now, it keeps on looping because there is an skb in the queue with
> > skb->len == 0 and we are peeking with an offset, thus
> > __skb_try_recv_from_queue will return NULL and thus we have no way of
> > getting out of the loop.
> >
> > I'm not sure what would be the best way to fix it. I don't see why we
> > end up with an skb in the list with skb->len == 0. So, shooting a
> > quick e-mail, maybe somebody has an idea :-)
> > I have the syzkaller-reproducer if needed.
>
> IIRC we can have 0 len UDP packet sitting on sk_receive_queue since:

Yes, as of header before enqueue pulling zero byte datagrams may be
queued. And these need to be delivered, among other reason for their
cmsg metadata.

> commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac
> Author: samanthakumar <samanthakumar@google.com>
> Date:   Tue Apr 5 12:41:15 2016 -0400
>
>     udp: remove headers from UDP packets before queueing
>
> Both __skb_try_recv_datagram() and napi_busy_loop() assume that we
> received some packets if the queue is not empty. When peeking such
> assumption is not true, we should check if the last packet is changed,
> as __skb_recv_datagram() already does.

Good catch. The condition in sk_busy_loop_end is not easy to address.
Since busy poll is an optimization and poll at offset rare, one way
out may be to amend the __sk_can_busy_loop test in __skb_recv_udp to
disallow busy polling together with peek at offset.

The difference in behavior betwee __skb_try_recv_datagram and
__skb_recv_datagram also reminds of Alexei's earlier report (without
busy polling, seemingly with a list corruption introduced elsewhere)
in

  [net-next,1/3] net/sock: factor out dequeue/peek with offset code
  https://patchwork.ozlabs.org/patch/762327/


> So I *think* the root cause of
> this issue is older than Alex's patch.
>
> The following - completely untested - should avoid the unbounded loop,
> but it's not a complete fix, I *think* we should also change
> sk_busy_loop_end() in a similar way, but that is a little more complex
> due to the additional indirections.
>
> Could you please test it?
>
> Any feedback welcome!
>
>
> Could you please test it?
>
> Paolo
> ---
> diff --git a/net/core/datagram.c b/net/core/datagram.c
> index b2651bb6d2a3..e657289db4ac 100644
> --- a/net/core/datagram.c
> +++ b/net/core/datagram.c
> @@ -279,7 +279,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock
> *sk, unsigned int flags,
>                         break;
>
>                 sk_busy_loop(sk, flags & MSG_DONTWAIT);
> -       } while (!skb_queue_empty(&sk->sk_receive_queue));
> +       } while (sk->sk_receive_queue.prev != *last);
>
>         error = -EAGAIN;
>
>
Alexander H Duyck March 21, 2019, 4:43 p.m. UTC | #5
On Thu, Mar 21, 2019 at 2:45 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> Hi,
>
> On Wed, 2019-03-20 at 11:35 -0700, Christoph Paasch wrote:
> > Hello,
> >
> > On Fri, Mar 24, 2017 at 3:23 PM Alexander Duyck
> > <alexander.duyck@gmail.com> wrote:
> > > From: Alexander Duyck <alexander.h.duyck@intel.com>
> > >
> > > > From what I can tell there is only a couple spots where we are actually
> > > checking the return value of sk_busy_loop. As there are only a few
> > > consumers of that data, and the data being checked for can be replaced
> > > with a check for !skb_queue_empty() we might as well just pull the code
> > > out of sk_busy_loop and place it in the spots that actually need it.
> > >
> > > Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> > > Acked-by: Eric Dumazet <edumazet@google.com>
> > > ---
> > >  include/net/busy_poll.h |    5 ++---
> > >  net/core/datagram.c     |    8 ++++++--
> > >  net/core/dev.c          |   25 +++++++++++--------------
> > >  net/sctp/socket.c       |    9 ++++++---
> > >  4 files changed, 25 insertions(+), 22 deletions(-)
> > >
> > > diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
> > > index b82d6ba70a14..c55760f4820f 100644
> > > --- a/include/net/busy_poll.h
> > > +++ b/include/net/busy_poll.h
> > > @@ -74,7 +74,7 @@ static inline bool busy_loop_timeout(unsigned long end_time)
> > >         return time_after(now, end_time);
> > >  }
> > >
> > > -bool sk_busy_loop(struct sock *sk, int nonblock);
> > > +void sk_busy_loop(struct sock *sk, int nonblock);
> > >
> > >  #else /* CONFIG_NET_RX_BUSY_POLL */
> > >  static inline unsigned long net_busy_loop_on(void)
> > > @@ -97,9 +97,8 @@ static inline bool busy_loop_timeout(unsigned long end_time)
> > >         return true;
> > >  }
> > >
> > > -static inline bool sk_busy_loop(struct sock *sk, int nonblock)
> > > +static inline void sk_busy_loop(struct sock *sk, int nonblock)
> > >  {
> > > -       return false;
> > >  }
> > >
> > >  #endif /* CONFIG_NET_RX_BUSY_POLL */
> > > diff --git a/net/core/datagram.c b/net/core/datagram.c
> > > index ea633342ab0d..4608aa245410 100644
> > > --- a/net/core/datagram.c
> > > +++ b/net/core/datagram.c
> > > @@ -256,8 +256,12 @@ struct sk_buff *__skb_try_recv_datagram(struct
> > > sock *sk, unsigned int flags,
> > >                 }
> > >
> > >                 spin_unlock_irqrestore(&queue->lock, cpu_flags);
> > > -       } while (sk_can_busy_loop(sk) &&
> > > -                sk_busy_loop(sk, flags & MSG_DONTWAIT));
> > > +
> > > +               if (!sk_can_busy_loop(sk))
> > > +                       break;
> > > +
> > > +               sk_busy_loop(sk, flags & MSG_DONTWAIT);
> > > +       } while (!skb_queue_empty(&sk->sk_receive_queue));
> >
> > since this change I am hitting stalls where it's looping in this
> > while-loop with syzkaller.
> >
> > It worked prior to this change because sk->sk_napi_id was not set thus
> > sk_busy_loop would make us get out of the loop.
> >
> > Now, it keeps on looping because there is an skb in the queue with
> > skb->len == 0 and we are peeking with an offset, thus
> > __skb_try_recv_from_queue will return NULL and thus we have no way of
> > getting out of the loop.
> >
> > I'm not sure what would be the best way to fix it. I don't see why we
> > end up with an skb in the list with skb->len == 0. So, shooting a
> > quick e-mail, maybe somebody has an idea :-)
> >
> > I have the syzkaller-reproducer if needed.
>
> IIRC we can have 0 len UDP packet sitting on sk_receive_queue since:
>
> commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac
> Author: samanthakumar <samanthakumar@google.com>
> Date:   Tue Apr 5 12:41:15 2016 -0400
>
>     udp: remove headers from UDP packets before queueing
>
> Both __skb_try_recv_datagram() and napi_busy_loop() assume that we
> received some packets if the queue is not empty. When peeking such
> assumption is not true, we should check if the last packet is changed,
> as __skb_recv_datagram() already does. So I *think* the root cause of
> this issue is older than Alex's patch.

I agree.

> The following - completely untested - should avoid the unbounded loop,
> but it's not a complete fix, I *think* we should also change
> sk_busy_loop_end() in a similar way, but that is a little more complex
> due to the additional indirections.

As far as sk_busy_loop_end we could look at just forking sk_busy_loop
and writing a separate implementation for datagram sockets that uses a
different loop_end function. It shouldn't take much to change since
all we would need to do is pass a structure containing the sk and last
pointers instead of just passing the sk directly as the loop_end
argument.

> Could you please test it?
>
> Any feedback welcome!

The change below looks good to me.

> Could you please test it?
>
> Paolo
> ---
> diff --git a/net/core/datagram.c b/net/core/datagram.c
> index b2651bb6d2a3..e657289db4ac 100644
> --- a/net/core/datagram.c
> +++ b/net/core/datagram.c
> @@ -279,7 +279,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock
> *sk, unsigned int flags,
>                         break;
>
>                 sk_busy_loop(sk, flags & MSG_DONTWAIT);
> -       } while (!skb_queue_empty(&sk->sk_receive_queue));
> +       } while (sk->sk_receive_queue.prev != *last);
>
>         error = -EAGAIN;
>
>
Christoph Paasch March 22, 2019, 3:05 a.m. UTC | #6
Hi,

On Thu, Mar 21, 2019 at 12:43 PM Alexander Duyck
<alexander.duyck@gmail.com> wrote:
>
> On Thu, Mar 21, 2019 at 2:45 AM Paolo Abeni <pabeni@redhat.com> wrote:
> >
> > Hi,
> >
> > On Wed, 2019-03-20 at 11:35 -0700, Christoph Paasch wrote:
> > > Hello,
> > >
> > > On Fri, Mar 24, 2017 at 3:23 PM Alexander Duyck
> > > <alexander.duyck@gmail.com> wrote:
> > > > From: Alexander Duyck <alexander.h.duyck@intel.com>
> > > >
> > > > > From what I can tell there is only a couple spots where we are actually
> > > > checking the return value of sk_busy_loop. As there are only a few
> > > > consumers of that data, and the data being checked for can be replaced
> > > > with a check for !skb_queue_empty() we might as well just pull the code
> > > > out of sk_busy_loop and place it in the spots that actually need it.
> > > >
> > > > Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
> > > > Acked-by: Eric Dumazet <edumazet@google.com>
> > > > ---
> > > >  include/net/busy_poll.h |    5 ++---
> > > >  net/core/datagram.c     |    8 ++++++--
> > > >  net/core/dev.c          |   25 +++++++++++--------------
> > > >  net/sctp/socket.c       |    9 ++++++---
> > > >  4 files changed, 25 insertions(+), 22 deletions(-)
> > > >
> > > > diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
> > > > index b82d6ba70a14..c55760f4820f 100644
> > > > --- a/include/net/busy_poll.h
> > > > +++ b/include/net/busy_poll.h
> > > > @@ -74,7 +74,7 @@ static inline bool busy_loop_timeout(unsigned long end_time)
> > > >         return time_after(now, end_time);
> > > >  }
> > > >
> > > > -bool sk_busy_loop(struct sock *sk, int nonblock);
> > > > +void sk_busy_loop(struct sock *sk, int nonblock);
> > > >
> > > >  #else /* CONFIG_NET_RX_BUSY_POLL */
> > > >  static inline unsigned long net_busy_loop_on(void)
> > > > @@ -97,9 +97,8 @@ static inline bool busy_loop_timeout(unsigned long end_time)
> > > >         return true;
> > > >  }
> > > >
> > > > -static inline bool sk_busy_loop(struct sock *sk, int nonblock)
> > > > +static inline void sk_busy_loop(struct sock *sk, int nonblock)
> > > >  {
> > > > -       return false;
> > > >  }
> > > >
> > > >  #endif /* CONFIG_NET_RX_BUSY_POLL */
> > > > diff --git a/net/core/datagram.c b/net/core/datagram.c
> > > > index ea633342ab0d..4608aa245410 100644
> > > > --- a/net/core/datagram.c
> > > > +++ b/net/core/datagram.c
> > > > @@ -256,8 +256,12 @@ struct sk_buff *__skb_try_recv_datagram(struct
> > > > sock *sk, unsigned int flags,
> > > >                 }
> > > >
> > > >                 spin_unlock_irqrestore(&queue->lock, cpu_flags);
> > > > -       } while (sk_can_busy_loop(sk) &&
> > > > -                sk_busy_loop(sk, flags & MSG_DONTWAIT));
> > > > +
> > > > +               if (!sk_can_busy_loop(sk))
> > > > +                       break;
> > > > +
> > > > +               sk_busy_loop(sk, flags & MSG_DONTWAIT);
> > > > +       } while (!skb_queue_empty(&sk->sk_receive_queue));
> > >
> > > since this change I am hitting stalls where it's looping in this
> > > while-loop with syzkaller.
> > >
> > > It worked prior to this change because sk->sk_napi_id was not set thus
> > > sk_busy_loop would make us get out of the loop.
> > >
> > > Now, it keeps on looping because there is an skb in the queue with
> > > skb->len == 0 and we are peeking with an offset, thus
> > > __skb_try_recv_from_queue will return NULL and thus we have no way of
> > > getting out of the loop.
> > >
> > > I'm not sure what would be the best way to fix it. I don't see why we
> > > end up with an skb in the list with skb->len == 0. So, shooting a
> > > quick e-mail, maybe somebody has an idea :-)
> > >
> > > I have the syzkaller-reproducer if needed.
> >
> > IIRC we can have 0 len UDP packet sitting on sk_receive_queue since:
> >
> > commit e6afc8ace6dd5cef5e812f26c72579da8806f5ac
> > Author: samanthakumar <samanthakumar@google.com>
> > Date:   Tue Apr 5 12:41:15 2016 -0400
> >
> >     udp: remove headers from UDP packets before queueing
> >
> > Both __skb_try_recv_datagram() and napi_busy_loop() assume that we
> > received some packets if the queue is not empty. When peeking such
> > assumption is not true, we should check if the last packet is changed,
> > as __skb_recv_datagram() already does. So I *think* the root cause of
> > this issue is older than Alex's patch.
>
> I agree.
>
> > The following - completely untested - should avoid the unbounded loop,
> > but it's not a complete fix, I *think* we should also change
> > sk_busy_loop_end() in a similar way, but that is a little more complex
> > due to the additional indirections.
>
> As far as sk_busy_loop_end we could look at just forking sk_busy_loop
> and writing a separate implementation for datagram sockets that uses a
> different loop_end function. It shouldn't take much to change since
> all we would need to do is pass a structure containing the sk and last
> pointers instead of just passing the sk directly as the loop_end
> argument.
>
> > Could you please test it?
> >
> > Any feedback welcome!
>
> The change below looks good to me.

I just tried it out. Worked for me!

You can add my Tested-by if you do a formal patch-submission:

Tested-by: Christoph Paasch <cpaasch@apple.com>


Christoph

>
> > Could you please test it?
> >
> > Paolo
> > ---
> > diff --git a/net/core/datagram.c b/net/core/datagram.c
> > index b2651bb6d2a3..e657289db4ac 100644
> > --- a/net/core/datagram.c
> > +++ b/net/core/datagram.c
> > @@ -279,7 +279,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock
> > *sk, unsigned int flags,
> >                         break;
> >
> >                 sk_busy_loop(sk, flags & MSG_DONTWAIT);
> > -       } while (!skb_queue_empty(&sk->sk_receive_queue));
> > +       } while (sk->sk_receive_queue.prev != *last);
> >
> >         error = -EAGAIN;
> >
> >
Paolo Abeni March 22, 2019, 10:33 a.m. UTC | #7
Hi,

On Thu, 2019-03-21 at 23:05 -0400, Christoph Paasch wrote:
> On Thu, Mar 21, 2019 at 12:43 PM Alexander Duyck
> <alexander.duyck@gmail.com> wrote:
> > On Thu, Mar 21, 2019 at 2:45 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > > The following - completely untested - should avoid the unbounded loop,
> > > but it's not a complete fix, I *think* we should also change
> > > sk_busy_loop_end() in a similar way, but that is a little more complex
> > > due to the additional indirections.
> > 
> > As far as sk_busy_loop_end we could look at just forking sk_busy_loop
> > and writing a separate implementation for datagram sockets that uses a
> > different loop_end function. It shouldn't take much to change since
> > all we would need to do is pass a structure containing the sk and last
> > pointers instead of just passing the sk directly as the loop_end
> > argument.
> > 
> > > Could you please test it?
> > > 
> > > Any feedback welcome!
> > 
> > The change below looks good to me.
> 
> I just tried it out. Worked for me!
> 
> You can add my Tested-by if you do a formal patch-submission:
> 
> Tested-by: Christoph Paasch <cpaasch@apple.com>

Thanks for testing!

I'm trying to reproduce the issue locally, but I'm unable. I think that
the current UDP implementation is not affected, as we always ensure
sk_receive_queue is empty before busy polling. Unix sockets should not
be affected, too, as busy polling should not have any effect there
(sk_napi_id should be never >= MIN_NAPI_ID). Can you reproduce the
issue on an unpatched, recent, upstream kernel?

Can you please provide the syzkaller repro?

Thanks,

Paolo
Eric Dumazet March 22, 2019, 12:59 p.m. UTC | #8
On Fri, Mar 22, 2019 at 3:33 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> Hi,
>
> On Thu, 2019-03-21 at 23:05 -0400, Christoph Paasch wrote:
> > On Thu, Mar 21, 2019 at 12:43 PM Alexander Duyck
> > <alexander.duyck@gmail.com> wrote:
> > > On Thu, Mar 21, 2019 at 2:45 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > > > The following - completely untested - should avoid the unbounded loop,
> > > > but it's not a complete fix, I *think* we should also change
> > > > sk_busy_loop_end() in a similar way, but that is a little more complex
> > > > due to the additional indirections.
> > >
> > > As far as sk_busy_loop_end we could look at just forking sk_busy_loop
> > > and writing a separate implementation for datagram sockets that uses a
> > > different loop_end function. It shouldn't take much to change since
> > > all we would need to do is pass a structure containing the sk and last
> > > pointers instead of just passing the sk directly as the loop_end
> > > argument.
> > >
> > > > Could you please test it?
> > > >
> > > > Any feedback welcome!
> > >
> > > The change below looks good to me.
> >
> > I just tried it out. Worked for me!
> >
> > You can add my Tested-by if you do a formal patch-submission:
> >
> > Tested-by: Christoph Paasch <cpaasch@apple.com>
>
> Thanks for testing!
>
> I'm trying to reproduce the issue locally, but I'm unable. I think that
> the current UDP implementation is not affected, as we always ensure
> sk_receive_queue is empty before busy polling.

But right after check is done we release the queue lock, so a packet might
come right after the test has been done.

> Unix sockets should not
> be affected, too, as busy polling should not have any effect there
> (sk_napi_id should be never >= MIN_NAPI_ID). Can you reproduce the
> issue on an unpatched, recent, upstream kernel?



>
> Can you please provide the syzkaller repro?
>
> Thanks,
>
> Paolo
>
>
>
>
>
Paolo Abeni March 22, 2019, 1:35 p.m. UTC | #9
On Fri, 2019-03-22 at 05:59 -0700, Eric Dumazet wrote:
> On Fri, Mar 22, 2019 at 3:33 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > Hi,
> > 
> > On Thu, 2019-03-21 at 23:05 -0400, Christoph Paasch wrote:
> > > On Thu, Mar 21, 2019 at 12:43 PM Alexander Duyck
> > > <alexander.duyck@gmail.com> wrote:
> > > > On Thu, Mar 21, 2019 at 2:45 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > > > > The following - completely untested - should avoid the unbounded loop,
> > > > > but it's not a complete fix, I *think* we should also change
> > > > > sk_busy_loop_end() in a similar way, but that is a little more complex
> > > > > due to the additional indirections.
> > > > 
> > > > As far as sk_busy_loop_end we could look at just forking sk_busy_loop
> > > > and writing a separate implementation for datagram sockets that uses a
> > > > different loop_end function. It shouldn't take much to change since
> > > > all we would need to do is pass a structure containing the sk and last
> > > > pointers instead of just passing the sk directly as the loop_end
> > > > argument.
> > > > 
> > > > > Could you please test it?
> > > > > 
> > > > > Any feedback welcome!
> > > > 
> > > > The change below looks good to me.
> > > 
> > > I just tried it out. Worked for me!
> > > 
> > > You can add my Tested-by if you do a formal patch-submission:
> > > 
> > > Tested-by: Christoph Paasch <cpaasch@apple.com>
> > 
> > Thanks for testing!
> > 
> > I'm trying to reproduce the issue locally, but I'm unable. I think that
> > the current UDP implementation is not affected, as we always ensure
> > sk_receive_queue is empty before busy polling.
> 
> But right after check is done we release the queue lock, so a packet might
> come right after the test has been done.

Yep, I was unclear and incorrect. My point is: with the current UDP
implementation, if we have a non empty sk_receive_queue after the busy
loop, it always means there are more packets to be processed and we
should loop again, as we currently do.

AFAICS, that is different from the reported issue, where the system
stalls looping around sk_receive_queue while no new packets are appended
there.

Cheers,

Paolo
Christoph Paasch March 22, 2019, 7:25 p.m. UTC | #10
Hi Paolo,

On Fri, Mar 22, 2019 at 6:33 AM Paolo Abeni <pabeni@redhat.com> wrote:
> On Thu, 2019-03-21 at 23:05 -0400, Christoph Paasch wrote:
> > On Thu, Mar 21, 2019 at 12:43 PM Alexander Duyck
> > <alexander.duyck@gmail.com> wrote:
> > > On Thu, Mar 21, 2019 at 2:45 AM Paolo Abeni <pabeni@redhat.com> wrote:
> > > > The following - completely untested - should avoid the unbounded loop,
> > > > but it's not a complete fix, I *think* we should also change
> > > > sk_busy_loop_end() in a similar way, but that is a little more complex
> > > > due to the additional indirections.
> > >
> > > As far as sk_busy_loop_end we could look at just forking sk_busy_loop
> > > and writing a separate implementation for datagram sockets that uses a
> > > different loop_end function. It shouldn't take much to change since
> > > all we would need to do is pass a structure containing the sk and last
> > > pointers instead of just passing the sk directly as the loop_end
> > > argument.
> > >
> > > > Could you please test it?
> > > >
> > > > Any feedback welcome!
> > >
> > > The change below looks good to me.
> >
> > I just tried it out. Worked for me!
> >
> > You can add my Tested-by if you do a formal patch-submission:
> >
> > Tested-by: Christoph Paasch <cpaasch@apple.com>
>
> Thanks for testing!
>
> I'm trying to reproduce the issue locally, but I'm unable. I think that
> the current UDP implementation is not affected, as we always ensure
> sk_receive_queue is empty before busy polling. Unix sockets should not
> be affected, too, as busy polling should not have any effect there
> (sk_napi_id should be never >= MIN_NAPI_ID). Can you reproduce the
> issue on an unpatched, recent, upstream kernel?

yes, I can repro it with the C-reproducer reliably on the latest
net-branch, v4.14.105 and then also bisected it down to the commit.

> Can you please provide the syzkaller repro?

Sure, here is the full report:

Syzkaller hit 'INFO: rcu detected stall in unix_seqpacket_recvmsg' bug.

INFO: rcu_sched self-detected stall on CPU
1-...: (19373 ticks this GP) idle=ac6/140000000000001/0 softirq=4477/4477 fqs=4
(t=21000 jiffies g=1853 c=1852 q=16)
rcu_sched kthread starved for 20984 jiffies! g1853 c1852 f0x0
RCU_GP_WAIT_FQS(3) ->state=0x0 ->cpu=0
rcu_sched       R  running task    21040     8      2 0x80000000
Call Trace:
 schedule+0xee/0x3a0 kernel/sched/core.c:3427
 schedule_timeout+0x158/0x260 kernel/time/timer.c:1744
 rcu_gp_kthread+0x12fc/0x3580 kernel/rcu/tree.c:2247
 kthread+0x355/0x430 kernel/kthread.c:232
 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:402
NMI backtrace for cpu 1
CPU: 1 PID: 1956 Comm: syz-executor808 Not tainted 4.14.105 #6
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.5.1 01/01/2011
Call Trace:
 <IRQ>
 __dump_stack lib/dump_stack.c:17 [inline]
 dump_stack+0x10a/0x1d1 lib/dump_stack.c:53
 nmi_cpu_backtrace+0xf2/0x110 lib/nmi_backtrace.c:101
 nmi_trigger_cpumask_backtrace+0x116/0x170 lib/nmi_backtrace.c:62
 trigger_single_cpu_backtrace include/linux/nmi.h:158 [inline]
 rcu_dump_cpu_stacks+0x180/0x1dd kernel/rcu/tree.c:1396
 print_cpu_stall kernel/rcu/tree.c:1542 [inline]
 check_cpu_stall.isra.69+0x9f1/0x1080 kernel/rcu/tree.c:1610
 __rcu_pending kernel/rcu/tree.c:3382 [inline]
 rcu_pending kernel/rcu/tree.c:3444 [inline]
 rcu_check_callbacks+0x380/0xc90 kernel/rcu/tree.c:2784
 update_process_times+0x28/0x60 kernel/time/timer.c:1588
 tick_sched_handle+0x7d/0x150 kernel/time/tick-sched.c:161
 tick_sched_timer+0x3d/0x110 kernel/time/tick-sched.c:1219
 __run_hrtimer kernel/time/hrtimer.c:1220 [inline]
 __hrtimer_run_queues+0x3d8/0xb80 kernel/time/hrtimer.c:1284
 hrtimer_interrupt+0x1b0/0x590 kernel/time/hrtimer.c:1318
 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1037 [inline]
 smp_apic_timer_interrupt+0x1d5/0x5b0 arch/x86/kernel/apic/apic.c:1062
 apic_timer_interrupt+0x87/0x90 arch/x86/entry/entry_64.S:787
 </IRQ>
RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:778 [inline]
RIP: 0010:__raw_spin_unlock_irqrestore
include/linux/spinlock_api_smp.h:160 [inline]
RIP: 0010:_raw_spin_unlock_irqrestore+0x32/0x60 kernel/locking/spinlock.c:192
RSP: 0018:ffff888065b273b0 EFLAGS: 00000297 ORIG_RAX: ffffffffffffff10
RAX: 0000000000000007 RBX: 0000000000000297 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000297
RBP: ffff88806a2447a0 R08: 1ffff1100cb64e57 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000000 R12: ffff888065b27450
R13: ffffed100cb64e92 R14: 0000000000000000 R15: ffff88806a244788
 spin_unlock_irqrestore include/linux/spinlock.h:372 [inline]
 __skb_try_recv_datagram+0x299/0x4b0 net/core/datagram.c:274
 unix_dgram_recvmsg+0x294/0x1840 net/unix/af_unix.c:2107
 unix_seqpacket_recvmsg+0x82/0xb0 net/unix/af_unix.c:2073
 sock_recvmsg_nosec net/socket.c:818 [inline]
 sock_recvmsg+0xc4/0x110 net/socket.c:825
 ___sys_recvmsg+0x2a7/0x620 net/socket.c:2220
 __sys_recvmsg+0xc6/0x200 net/socket.c:2265
 SYSC_recvmsg net/socket.c:2277 [inline]
 SyS_recvmsg+0x27/0x40 net/socket.c:2272
 do_syscall_64+0x23f/0x6d0 arch/x86/entry/common.c:289
 entry_SYSCALL_64_after_hwframe+0x42/0xb7
RIP: 0033:0x7f4a53385469
RSP: 002b:00007f4a53a76f28 EFLAGS: 00000246 ORIG_RAX: 000000000000002f
RAX: ffffffffffffffda RBX: 0000000000603168 RCX: 00007f4a53385469
RDX: 0000000040000002 RSI: 0000000020001680 RDI: 0000000000000003
RBP: 0000000000603160 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 000000000060316c
R13: 0800000000008923 R14: 00007f4a53a57000 R15: 0000000000000003


Syzkaller reproducer:
# {Threaded:true Collide:false Repeat:true RepeatTimes:0 Procs:8
Sandbox:none Fault:false FaultCall:-1 FaultNth:0 EnableTun:false
UseTmpDir:false EnableCgroups:false EnableNetdev:false ResetNet:false
HandleSegv:false Repro:false Trace:false}
socketpair$unix(0x1, 0x1000000000005, 0x0,
&(0x7f0000000340)={<r0=>0xffffffffffffffff, <r1=>0xffffffffffffffff})
setsockopt$sock_int(r0, 0x1, 0x2a, &(0x7f00000000c0)=0x9, 0x4)
recvmsg(r0, &(0x7f0000001680)={0x0, 0x0, 0x0}, 0x40000002)
setsockopt$sock_int(r0, 0x1, 0x2e, &(0x7f00000003c0)=0x9, 0x4)
sendmsg(r1, &(0x7f0000000680)={0x0, 0x0, 0x0}, 0x0)
connect$inet6(0xffffffffffffffff, 0x0, 0x0)
socketpair(0xa, 0x800, 0x4, 0x0)
fcntl$getflags(0xffffffffffffffff, 0x40b)
sendto$inet6(0xffffffffffffffff, 0x0, 0x0, 0x24000855, 0x0, 0x0)
getsockopt$inet_IP_XFRM_POLICY(0xffffffffffffff9c, 0x0, 0x11, 0x0, 0x0)
socket$nl_xfrm(0x10, 0x3, 0x6)
setsockopt$netlink_NETLINK_BROADCAST_ERROR(0xffffffffffffffff, 0x10e,
0x4, 0x0, 0x0)
socket$unix(0x1, 0x3, 0x0)
getsockopt$sock_int(0xffffffffffffffff, 0x1, 0x1e, 0x0, 0x0)
ioctl$sock_SIOCGSKNS(0xffffffffffffffff, 0x894c, 0x0)
ioctl$sock_inet_udp_SIOCOUTQ(0xffffffffffffffff, 0x5411, 0x0)
setsockopt$IP_VS_SO_SET_EDIT(0xffffffffffffffff, 0x0, 0x483, 0x0, 0x0)
setsockopt$inet6_IPV6_XFRM_POLICY(0xffffffffffffffff, 0x29, 0x23, 0x0, 0x0)
ioctl$sock_ifreq(0xffffffffffffffff, 0x800000000008923, 0x0)
bind(0xffffffffffffffff, 0x0, 0x0)
socket(0xa, 0x3, 0x3)


C reproducer:
// autogenerated by syzkaller (https://github.com/google/syzkaller)

#define _GNU_SOURCE

#include <dirent.h>
#include <endian.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#include <linux/futex.h>

unsigned long long procid;

/* Suspend the calling thread for roughly @ms milliseconds. */
static void sleep_ms(uint64_t ms)
{
	uint64_t usecs = ms * 1000;

	usleep(usecs);
}

/*
 * Current CLOCK_MONOTONIC time in milliseconds.
 * Aborts the process if the clock cannot be read.
 */
static uint64_t current_time_ms(void)
{
	struct timespec ts;

	if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
		exit(1);
	return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
}

/*
 * Start a worker thread running fn(arg) with a 128 KiB stack.
 * Retries up to 100 times when thread creation reports EAGAIN (sleeping
 * 50us between attempts); aborts the process on any other failure or
 * when all retries are exhausted.
 */
static void thread_start(void* (*fn)(void*), void* arg)
{
	pthread_attr_t attr;
	pthread_t th;
	int attempt = 0;

	pthread_attr_init(&attr);
	pthread_attr_setstacksize(&attr, 128 << 10);
	while (attempt++ < 100) {
		if (pthread_create(&th, &attr, fn, arg) == 0) {
			pthread_attr_destroy(&attr);
			return;
		}
		if (errno != EAGAIN)
			break;
		usleep(50);
	}
	exit(1);
}

/*
 * Minimal futex-based one-shot event.
 * state is 0 (unset) or 1 (set); waiters block in FUTEX_WAIT and are
 * released by FUTEX_WAKE.
 */
typedef struct {
	int state;
} event_t;

/* Initialize the event to the unset state. */
static void event_init(event_t* ev)
{
	ev->state = 0;
}

/* Return the event to the unset state so it can be signalled again. */
static void event_reset(event_t* ev)
{
	ev->state = 0;
}

/* Signal the event and wake any waiters; aborts if already signalled. */
static void event_set(event_t* ev)
{
	if (ev->state)
		exit(1);
	__atomic_store_n(&ev->state, 1, __ATOMIC_RELEASE);
	syscall(SYS_futex, &ev->state, FUTEX_WAKE | FUTEX_PRIVATE_FLAG);
}

/* Block until the event is signalled. */
static void event_wait(event_t* ev)
{
	while (__atomic_load_n(&ev->state, __ATOMIC_ACQUIRE) == 0)
		syscall(SYS_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, 0);
}

/* Nonzero iff the event is currently signalled (no blocking). */
static int event_isset(event_t* ev)
{
	return __atomic_load_n(&ev->state, __ATOMIC_ACQUIRE) != 0;
}

/*
 * Wait up to @timeout milliseconds for @ev to be signalled.
 * Returns 1 if the event fired, 0 on timeout. Uses current_time_ms()
 * (defined above) for monotonic timestamps; the bottom-of-loop check
 * guarantees (now - start) <= timeout whenever remain is computed.
 */
static int event_timedwait(event_t* ev, uint64_t timeout)
{
	uint64_t start = current_time_ms();
	uint64_t now = start;

	for (;;) {
		uint64_t remain = timeout - (now - start);
		struct timespec ts = {
			.tv_sec = remain / 1000,
			.tv_nsec = (remain % 1000) * 1000 * 1000,
		};

		syscall(SYS_futex, &ev->state, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 0, &ts);
		if (__atomic_load_n(&ev->state, __ATOMIC_RELAXED))
			return 1;
		now = current_time_ms();
		if (now - start > timeout)
			return 0;
	}
}

/*
 * printf-style write of a formatted string into an existing @file
 * (opened O_WRONLY, no create/truncate). Returns true on full success;
 * false (with errno preserved) if the open fails or the write is short.
 */
static bool write_file(const char* file, const char* what, ...)
{
	char buf[1024];
	va_list args;
	int fd, len;

	va_start(args, what);
	vsnprintf(buf, sizeof(buf), what, args);
	va_end(args);
	buf[sizeof(buf) - 1] = 0;
	len = strlen(buf);

	fd = open(file, O_WRONLY | O_CLOEXEC);
	if (fd == -1)
		return false;
	if (write(fd, buf, len) != len) {
		int saved_errno = errno;

		close(fd);
		errno = saved_errno;
		return false;
	}
	close(fd);
	return true;
}

static void setup_common()
{
if (mount(0, "/sys/fs/fuse/connections", "fusectl", 0, 0)) {
}
}

static void loop();

/*
 * Confine the sandboxed child: die when the parent dies, detach into its
 * own process group and session, cap resource limits, move into private
 * namespaces (failures silently ignored when unprivileged), and shrink
 * the SysV IPC sysctl limits. Order matches the original generated code.
 */
static void sandbox_common()
{
	static const struct {
		int resource;
		rlim_t limit;
	} limits[] = {
		{ RLIMIT_AS, 200 << 20 },
		{ RLIMIT_MEMLOCK, 32 << 20 },
		{ RLIMIT_FSIZE, 136 << 20 },
		{ RLIMIT_STACK, 1 << 20 },
		{ RLIMIT_CORE, 0 },
		{ RLIMIT_NOFILE, 256 },
	};
	/* 0x02000000 is CLONE_NEWCGROUP, spelled numerically for old headers. */
	static const int namespaces[] = {
		CLONE_NEWNS, CLONE_NEWIPC, 0x02000000,
		CLONE_NEWUTS, CLONE_SYSVSEM,
	};
	typedef struct {
		const char* name;
		const char* value;
	} sysctl_t;
	static const sysctl_t sysctls[] = {
		{"/proc/sys/kernel/shmmax", "16777216"},
		{"/proc/sys/kernel/shmall", "536870912"},
		{"/proc/sys/kernel/shmmni", "1024"},
		{"/proc/sys/kernel/msgmax", "8192"},
		{"/proc/sys/kernel/msgmni", "1024"},
		{"/proc/sys/kernel/msgmnb", "1024"},
		{"/proc/sys/kernel/sem", "1024 1048576 500 1024"},
	};
	unsigned i;

	prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
	setpgrp();
	setsid();

	for (i = 0; i < sizeof(limits) / sizeof(limits[0]); i++) {
		struct rlimit rlim = { limits[i].limit, limits[i].limit };

		setrlimit(limits[i].resource, &rlim);
	}
	for (i = 0; i < sizeof(namespaces) / sizeof(namespaces[0]); i++) {
		if (unshare(namespaces[i])) {
			/* ignored: requires privilege */
		}
	}
	for (i = 0; i < sizeof(sysctls) / sizeof(sysctls[0]); i++)
		write_file(sysctls[i].name, sysctls[i].value);
}

/*
 * Reap children until @pid exits and return its exit status.
 * Aborts the process if the earlier fork() failed (pid < 0).
 */
int wait_for_loop(int pid)
{
	int status = 0;

	if (pid < 0)
		exit(1);
	do {
	} while (waitpid(-1, &status, __WALL) != pid);
	return WEXITSTATUS(status);
}

static int do_sandbox_none(void)
{
if (unshare(CLONE_NEWPID)) {
}
int pid = fork();
if (pid != 0)
return wait_for_loop(pid);
setup_common();
sandbox_common();
if (unshare(CLONE_NEWNET)) {
}
loop();
exit(1);
}

static void kill_and_wait(int pid, int* status)
{
kill(-pid, SIGKILL);
kill(pid, SIGKILL);
int i;
for (i = 0; i < 100; i++) {
if (waitpid(-1, status, WNOHANG | __WALL) == pid)
return;
usleep(1000);
}
DIR* dir = opendir("/sys/fs/fuse/connections");
if (dir) {
for (;;) {
struct dirent* ent = readdir(dir);
if (!ent)
break;
if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0)
continue;
char abort[300];
snprintf(abort, sizeof(abort), "/sys/fs/fuse/connections/%s/abort",
ent->d_name);
int fd = open(abort, O_WRONLY);
if (fd == -1) {
continue;
}
if (write(fd, abort, 1) < 0) {
}
close(fd);
}
closedir(dir);
} else {
}
while (waitpid(-1, status, __WALL) != pid) {
}
}

#define SYZ_HAVE_SETUP_TEST 1
static void setup_test()
{
prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
setpgrp();
}

#define SYZ_HAVE_RESET_TEST 1
/* Close every file descriptor a test program may have opened
 * (fds 0-2 are left alone). */
static void reset_test()
{
	int fd = 3;

	while (fd < 30)
		close(fd++);
}

struct thread_t {
int created, call;
event_t ready, done;
};

static struct thread_t threads[16];
static void execute_call(int call);
static int running;

static void* thr(void* arg)
{
struct thread_t* th = (struct thread_t*)arg;
for (;;) {
event_wait(&th->ready);
event_reset(&th->ready);
execute_call(th->call);
__atomic_fetch_sub(&running, 1, __ATOMIC_RELAXED);
event_set(&th->done);
}
return 0;
}

static void execute_one(void)
{
int i, call, thread;
for (call = 0; call < 21; call++) {
for (thread = 0; thread < (int)(sizeof(threads) / sizeof(threads[0]));
thread++) {
struct thread_t* th = &threads[thread];
if (!th->created) {
th->created = 1;
event_init(&th->ready);
event_init(&th->done);
event_set(&th->done);
thread_start(thr, th);
}
if (!event_isset(&th->done))
continue;
event_reset(&th->done);
th->call = call;
__atomic_fetch_add(&running, 1, __ATOMIC_RELAXED);
event_set(&th->ready);
event_timedwait(&th->done, 45);
break;
}
}
for (i = 0; i < 100 && __atomic_load_n(&running, __ATOMIC_RELAXED); i++)
sleep_ms(1);
}

static void execute_one(void);

#define WAIT_FLAGS __WALL

static void loop(void)
{
int iter;
for (iter = 0;; iter++) {
int pid = fork();
if (pid < 0)
exit(1);
if (pid == 0) {
setup_test();
execute_one();
reset_test();
exit(0);
}
int status = 0;
uint64_t start = current_time_ms();
for (;;) {
if (waitpid(-1, &status, WNOHANG | WAIT_FLAGS) == pid)
break;
sleep_ms(1);
if (current_time_ms() - start < 5 * 1000)
continue;
kill_and_wait(pid, &status);
break;
}
}
}

uint64_t r[2] = {0xffffffffffffffff, 0xffffffffffffffff};

void execute_call(int call)
{
long res; switch (call) {
case 0:
res = syscall(__NR_socketpair, 1, 0x1000000000005, 0, 0x20000340);
if (res != -1) {
r[0] = *(uint32_t*)0x20000340;
r[1] = *(uint32_t*)0x20000344;
}
break;
case 1:
*(uint32_t*)0x200000c0 = 9;
syscall(__NR_setsockopt, r[0], 1, 0x2a, 0x200000c0, 4);
break;
case 2:
*(uint64_t*)0x20001680 = 0;
*(uint32_t*)0x20001688 = 0;
*(uint64_t*)0x20001690 = 0;
*(uint64_t*)0x20001698 = 0;
*(uint64_t*)0x200016a0 = 0;
*(uint64_t*)0x200016a8 = 0;
*(uint32_t*)0x200016b0 = 0;
syscall(__NR_recvmsg, r[0], 0x20001680, 0x40000002);
break;
case 3:
*(uint32_t*)0x200003c0 = 9;
syscall(__NR_setsockopt, r[0], 1, 0x2e, 0x200003c0, 4);
break;
case 4:
*(uint64_t*)0x20000680 = 0;
*(uint32_t*)0x20000688 = 0;
*(uint64_t*)0x20000690 = 0;
*(uint64_t*)0x20000698 = 0;
*(uint64_t*)0x200006a0 = 0;
*(uint64_t*)0x200006a8 = 0;
*(uint32_t*)0x200006b0 = 0;
syscall(__NR_sendmsg, r[1], 0x20000680, 0);
break;
case 5:
syscall(__NR_connect, -1, 0, 0);
break;
case 6:
syscall(__NR_socketpair, 0xa, 0x800, 4, 0);
break;
case 7:
syscall(__NR_fcntl, -1, 0x40b, 0);
break;
case 8:
syscall(__NR_sendto, -1, 0, 0, 0x24000855, 0, 0);
break;
case 9:
syscall(__NR_getsockopt, 0xffffff9c, 0, 0x11, 0, 0);
break;
case 10:
syscall(__NR_socket, 0x10, 3, 6);
break;
case 11:
syscall(__NR_setsockopt, -1, 0x10e, 4, 0, 0);
break;
case 12:
syscall(__NR_socket, 1, 3, 0);
break;
case 13:
syscall(__NR_getsockopt, -1, 1, 0x1e, 0, 0);
break;
case 14:
syscall(__NR_ioctl, -1, 0x894c, 0);
break;
case 15:
syscall(__NR_ioctl, -1, 0x5411, 0);
break;
case 16:
syscall(__NR_setsockopt, -1, 0, 0x483, 0, 0);
break;
case 17:
syscall(__NR_setsockopt, -1, 0x29, 0x23, 0, 0);
break;
case 18:
syscall(__NR_ioctl, -1, 0x800000000008923, 0);
break;
case 19:
syscall(__NR_bind, -1, 0, 0);
break;
case 20:
syscall(__NR_socket, 0xa, 3, 3);
break;
}

}
int main(void)
{
syscall(__NR_mmap, 0x20000000, 0x1000000, 3, 0x32, -1, 0);
for (procid = 0; procid < 8; procid++) {
if (fork() == 0) {
do_sandbox_none();
}
}
sleep(1000000);
return 0;
}
diff mbox

Patch

diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index b82d6ba70a14..c55760f4820f 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -74,7 +74,7 @@  static inline bool busy_loop_timeout(unsigned long end_time)
 	return time_after(now, end_time);
 }
 
-bool sk_busy_loop(struct sock *sk, int nonblock);
+void sk_busy_loop(struct sock *sk, int nonblock);
 
 #else /* CONFIG_NET_RX_BUSY_POLL */
 static inline unsigned long net_busy_loop_on(void)
@@ -97,9 +97,8 @@  static inline bool busy_loop_timeout(unsigned long end_time)
 	return true;
 }
 
-static inline bool sk_busy_loop(struct sock *sk, int nonblock)
+static inline void sk_busy_loop(struct sock *sk, int nonblock)
 {
-	return false;
 }
 
 #endif /* CONFIG_NET_RX_BUSY_POLL */
diff --git a/net/core/datagram.c b/net/core/datagram.c
index ea633342ab0d..4608aa245410 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -256,8 +256,12 @@  struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
 		}
 
 		spin_unlock_irqrestore(&queue->lock, cpu_flags);
-	} while (sk_can_busy_loop(sk) &&
-		 sk_busy_loop(sk, flags & MSG_DONTWAIT));
+
+		if (!sk_can_busy_loop(sk))
+			break;
+
+		sk_busy_loop(sk, flags & MSG_DONTWAIT);
+	} while (!skb_queue_empty(&sk->sk_receive_queue));
 
 	error = -EAGAIN;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index ab337bf5bbf4..af70eb6ba682 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5060,21 +5060,19 @@  static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
 		do_softirq();
 }
 
-bool sk_busy_loop(struct sock *sk, int nonblock)
+void sk_busy_loop(struct sock *sk, int nonblock)
 {
 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
 	int (*napi_poll)(struct napi_struct *napi, int budget);
 	void *have_poll_lock = NULL;
 	struct napi_struct *napi;
 	unsigned int napi_id;
-	int rc;
 
 restart:
 	napi_id = READ_ONCE(sk->sk_napi_id);
 	if (napi_id < MIN_NAPI_ID)
-		return 0;
+		return;
 
-	rc = false;
 	napi_poll = NULL;
 
 	rcu_read_lock();
@@ -5085,7 +5083,8 @@  bool sk_busy_loop(struct sock *sk, int nonblock)
 
 	preempt_disable();
 	for (;;) {
-		rc = 0;
+		int work = 0;
+
 		local_bh_disable();
 		if (!napi_poll) {
 			unsigned long val = READ_ONCE(napi->state);
@@ -5103,12 +5102,12 @@  bool sk_busy_loop(struct sock *sk, int nonblock)
 			have_poll_lock = netpoll_poll_lock(napi);
 			napi_poll = napi->poll;
 		}
-		rc = napi_poll(napi, BUSY_POLL_BUDGET);
-		trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+		work = napi_poll(napi, BUSY_POLL_BUDGET);
+		trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
 count:
-		if (rc > 0)
+		if (work > 0)
 			__NET_ADD_STATS(sock_net(sk),
-					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
+					LINUX_MIB_BUSYPOLLRXPACKETS, work);
 		local_bh_enable();
 
 		if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) ||
@@ -5121,9 +5120,9 @@  bool sk_busy_loop(struct sock *sk, int nonblock)
 			preempt_enable();
 			rcu_read_unlock();
 			cond_resched();
-			rc = !skb_queue_empty(&sk->sk_receive_queue);
-			if (rc || busy_loop_timeout(end_time))
-				return rc;
+			if (!skb_queue_empty(&sk->sk_receive_queue) ||
+			    busy_loop_timeout(end_time))
+				return;
 			goto restart;
 		}
 		cpu_relax();
@@ -5131,10 +5130,8 @@  bool sk_busy_loop(struct sock *sk, int nonblock)
 	if (napi_poll)
 		busy_poll_stop(napi, have_poll_lock);
 	preempt_enable();
-	rc = !skb_queue_empty(&sk->sk_receive_queue);
 out:
 	rcu_read_unlock();
-	return rc;
 }
 EXPORT_SYMBOL(sk_busy_loop);
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 72cc3ecf6516..ccc08fc39722 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -7518,9 +7518,12 @@  struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags,
 		if (sk->sk_shutdown & RCV_SHUTDOWN)
 			break;
 
-		if (sk_can_busy_loop(sk) &&
-		    sk_busy_loop(sk, noblock))
-			continue;
+		if (sk_can_busy_loop(sk)) {
+			sk_busy_loop(sk, noblock);
+
+			if (!skb_queue_empty(&sk->sk_receive_queue))
+				continue;
+		}
 
 		/* User doesn't want to wait.  */
 		error = -EAGAIN;