
[v2,net] net: solve a NAPI race

Message ID 1488205298.9415.180.camel@edumazet-glaptop3.roam.corp.google.com
State Superseded, archived
Delegated to: David Miller

Commit Message

Eric Dumazet Feb. 27, 2017, 2:21 p.m. UTC
From: Eric Dumazet <edumazet@google.com>

While playing with mlx4 hardware timestamping of RX packets, I found
that some packets were received by the TCP stack with a ~200 ms delay...

Since the timestamp was provided by the NIC, and my probe was added
in tcp_v4_rcv() while in the BH handler, I was confident it was not
a sender issue or a drop in the network.

This would happen with a very low probability, but it was hurting RPC
workloads.

A NAPI driver normally re-arms the IRQ after napi_complete_done(),
once NAPI_STATE_SCHED is cleared, so that the hard IRQ handler can grab
it.

The problem is that if another point in the stack grabs the NAPI_STATE_SCHED
bit while IRQs are not disabled, we might later have an IRQ firing and
finding this bit set, right before napi_complete_done() clears it.

This can happen with busy polling users, or if gro_flush_timeout is
used. But some other uses of napi_schedule() in drivers can cause this
as well.
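
To make the ordering concrete, here is a minimal sketch of the usual NAPI
interrupt path; the mydrv_* names are hypothetical and only illustrate the
common driver pattern, not any particular driver:

#include <linux/interrupt.h>
#include <linux/netdevice.h>

struct mydrv_queue {
	struct napi_struct napi;
	/* ring pointers, registers, ... */
};

void mydrv_mask_irq(struct mydrv_queue *q);	/* hypothetical helpers, */
void mydrv_unmask_irq(struct mydrv_queue *q);	/* provided by the driver */

static irqreturn_t mydrv_intr(int irq, void *data)
{
	struct mydrv_queue *q = data;

	mydrv_mask_irq(q);		/* IRQ stays off until the poll re-arms it */
	napi_schedule(&q->napi);	/* no-op if NAPI_STATE_SCHED is already set */
	return IRQ_HANDLED;
}

/* If a busy poller (or the gro_flush_timeout watchdog) already owns
 * NAPI_STATE_SCHED when this interrupt fires, napi_schedule() does nothing
 * and the IRQ stays masked; once napi_complete_done() later clears the bit,
 * nobody polls the ring and nothing re-arms the IRQ, so pending packets sit
 * there until an unrelated event (hence the ~200 ms delays above).
 */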

This patch adds a new NAPI_STATE_MISSED bit, which napi_schedule_prep()
can set if it could not grab NAPI_STATE_SCHED.

Then napi_complete_done() properly reschedules the NAPI to make sure
we do not miss anything.

Since we manipulate multiple bits at once, use cmpxchg() like in
sk_busy_loop() to provide proper transactions.
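
For readers less familiar with the cmpxchg() retry idiom, the same
transaction can be sketched with portable C11 atomics; the flag values
mirror the patch, but this is only an illustration of the pattern, not the
kernel code itself:

#include <stdatomic.h>
#include <stdbool.h>

#define STATE_SCHED	(1UL << 0)
#define STATE_MISSED	(1UL << 1)
#define STATE_DISABLE	(1UL << 2)

static _Atomic unsigned long napi_state;

static bool schedule_prep(void)
{
	unsigned long val = atomic_load(&napi_state);
	unsigned long new;

	do {
		if (val & STATE_DISABLE)
			return false;
		new = val | STATE_SCHED;
		if (val & STATE_SCHED)
			new |= STATE_MISSED;
		/* On failure, 'val' is refreshed with the current state and
		 * 'new' is recomputed, so both bits change as one transition.
		 */
	} while (!atomic_compare_exchange_weak(&napi_state, &val, new));

	return !(val & STATE_SCHED);
}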

In v2, I changed napi_watchdog() to use a relaxed variant of
napi_schedule_prep(): there is no need to set NAPI_STATE_MISSED from this point.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/linux/netdevice.h |   29 ++++++-------------
 net/core/dev.c            |   53 +++++++++++++++++++++++++++++++++---
 2 files changed, 58 insertions(+), 24 deletions(-)

Comments

Alexander H Duyck Feb. 28, 2017, 5:20 p.m. UTC | #1
On Mon, Feb 27, 2017 at 6:21 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> From: Eric Dumazet <edumazet@google.com>
>
> While playing with mlx4 hardware timestamping of RX packets, I found
> that some packets were received by the TCP stack with a ~200 ms delay...
>
> Since the timestamp was provided by the NIC, and my probe was added
> in tcp_v4_rcv() while in the BH handler, I was confident it was not
> a sender issue or a drop in the network.
>
> This would happen with a very low probability, but it was hurting RPC
> workloads.
>
> A NAPI driver normally re-arms the IRQ after napi_complete_done(),
> once NAPI_STATE_SCHED is cleared, so that the hard IRQ handler can grab
> it.
>
> The problem is that if another point in the stack grabs the NAPI_STATE_SCHED
> bit while IRQs are not disabled, we might later have an IRQ firing and
> finding this bit set, right before napi_complete_done() clears it.
>
> This can happen with busy polling users, or if gro_flush_timeout is
> used. But some other uses of napi_schedule() in drivers can cause this
> as well.
>
> This patch adds a new NAPI_STATE_MISSED bit, which napi_schedule_prep()
> can set if it could not grab NAPI_STATE_SCHED.
>
> Then napi_complete_done() properly reschedules the NAPI to make sure
> we do not miss anything.
>
> Since we manipulate multiple bits at once, use cmpxchg() like in
> sk_busy_loop() to provide proper transactions.
>
> In v2, I changed napi_watchdog() to use a relaxed variant of
> napi_schedule_prep(): there is no need to set NAPI_STATE_MISSED from this point.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  include/linux/netdevice.h |   29 ++++++-------------
>  net/core/dev.c            |   53 +++++++++++++++++++++++++++++++++---
>  2 files changed, 58 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index f40f0ab3847a8caaf46bd4d5f224c65014f501cc..97456b2539e46d6232dda804f6a434db6fd7134f 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -330,6 +330,7 @@ struct napi_struct {
>
>  enum {
>         NAPI_STATE_SCHED,       /* Poll is scheduled */
> +       NAPI_STATE_MISSED,      /* reschedule a napi */
>         NAPI_STATE_DISABLE,     /* Disable pending */
>         NAPI_STATE_NPSVC,       /* Netpoll - don't dequeue from poll_list */
>         NAPI_STATE_HASHED,      /* In NAPI hash (busy polling possible) */
> @@ -338,12 +339,13 @@ enum {
>  };
>
>  enum {
> -       NAPIF_STATE_SCHED        = (1UL << NAPI_STATE_SCHED),
> -       NAPIF_STATE_DISABLE      = (1UL << NAPI_STATE_DISABLE),
> -       NAPIF_STATE_NPSVC        = (1UL << NAPI_STATE_NPSVC),
> -       NAPIF_STATE_HASHED       = (1UL << NAPI_STATE_HASHED),
> -       NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
> -       NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
> +       NAPIF_STATE_SCHED        = BIT(NAPI_STATE_SCHED),
> +       NAPIF_STATE_MISSED       = BIT(NAPI_STATE_MISSED),
> +       NAPIF_STATE_DISABLE      = BIT(NAPI_STATE_DISABLE),
> +       NAPIF_STATE_NPSVC        = BIT(NAPI_STATE_NPSVC),
> +       NAPIF_STATE_HASHED       = BIT(NAPI_STATE_HASHED),
> +       NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
> +       NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
>  };
>
>  enum gro_result {
> @@ -414,20 +416,7 @@ static inline bool napi_disable_pending(struct napi_struct *n)
>         return test_bit(NAPI_STATE_DISABLE, &n->state);
>  }
>
> -/**
> - *     napi_schedule_prep - check if NAPI can be scheduled
> - *     @n: NAPI context
> - *
> - * Test if NAPI routine is already running, and if not mark
> - * it as running.  This is used as a condition variable to
> - * insure only one NAPI poll instance runs.  We also make
> - * sure there is no pending NAPI disable.
> - */
> -static inline bool napi_schedule_prep(struct napi_struct *n)
> -{
> -       return !napi_disable_pending(n) &&
> -               !test_and_set_bit(NAPI_STATE_SCHED, &n->state);
> -}
> +bool napi_schedule_prep(struct napi_struct *n);
>
>  /**
>   *     napi_schedule - schedule NAPI poll
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 304f2deae5f9897e60a79ed8b69d6ef208295ded..edeb916487015f279036ecf7ff5d9096dff365d3 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -4884,6 +4884,32 @@ void __napi_schedule(struct napi_struct *n)
>  EXPORT_SYMBOL(__napi_schedule);
>
>  /**
> + *     napi_schedule_prep - check if napi can be scheduled
> + *     @n: napi context
> + *
> + * Test if NAPI routine is already running, and if not mark
> + * it as running.  This is used as a condition variable to
> + * insure only one NAPI poll instance runs.  We also make
> + * sure there is no pending NAPI disable.
> + */
> +bool napi_schedule_prep(struct napi_struct *n)
> +{
> +       unsigned long val, new;
> +
> +       do {
> +               val = READ_ONCE(n->state);
> +               if (unlikely(val & NAPIF_STATE_DISABLE))
> +                       return false;
> +               new = val | NAPIF_STATE_SCHED;
> +               if (unlikely(val & NAPIF_STATE_SCHED))
> +                       new |= NAPIF_STATE_MISSED;

You might want to consider just using a combination AND, divide,
multiply, and OR to avoid having to have any conditional branches
being added due to this code path.  Basically the logic would look
like:
    new = val | NAPIF_STATE_SCHED;
    new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED;

In assembler that all ends up getting translated out to AND, SHL, OR.
You avoid the branching, or MOV/OR/TEST/CMOV type code you would end
up with otherwise.
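
For illustration, a stand-alone sketch of the branchless form described
above (user-space C; the flag values mirror the patch, and the function
names are made up for the test):

#include <assert.h>
#include <stdio.h>

#define NAPIF_STATE_SCHED	(1UL << 0)
#define NAPIF_STATE_MISSED	(1UL << 1)

static unsigned long prep_branchy(unsigned long val)
{
	unsigned long new = val | NAPIF_STATE_SCHED;

	if (val & NAPIF_STATE_SCHED)
		new |= NAPIF_STATE_MISSED;
	return new;
}

static unsigned long prep_branchless(unsigned long val)
{
	unsigned long new = val | NAPIF_STATE_SCHED;

	/* (val & SCHED) / SCHED is 0 or 1; multiplying by MISSED turns the
	 * SCHED bit into the MISSED bit with no conditional branch.
	 */
	new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED;
	return new;
}

int main(void)
{
	for (unsigned long val = 0; val < 8; val++)
		assert(prep_branchy(val) == prep_branchless(val));
	puts("branchy and branchless forms agree");
	return 0;
}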

> +       } while (cmpxchg(&n->state, val, new) != val);
> +
> +       return !(val & NAPIF_STATE_SCHED);
> +}
> +EXPORT_SYMBOL(napi_schedule_prep);
> +
> +/**
>   * __napi_schedule_irqoff - schedule for receive
>   * @n: entry to schedule
>   *
> @@ -4897,7 +4923,7 @@ EXPORT_SYMBOL(__napi_schedule_irqoff);
>
>  bool napi_complete_done(struct napi_struct *n, int work_done)
>  {
> -       unsigned long flags;
> +       unsigned long flags, val, new;
>
>         /*
>          * 1) Don't let napi dequeue from the cpu poll list
> @@ -4927,7 +4953,21 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
>                 list_del_init(&n->poll_list);
>                 local_irq_restore(flags);
>         }
> -       WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
> +
> +       do {
> +               val = READ_ONCE(n->state);
> +
> +               WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
> +
> +               new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
> +
> +               if (unlikely(val & NAPIF_STATE_MISSED))
> +                       new |= NAPIF_STATE_SCHED;

Same kind of thing here.  You could probably just get away with something like:
    new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
    new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * NAPIF_STATE_SCHED;

> +       } while (cmpxchg(&n->state, val, new) != val);
> +
> +       if (unlikely(val & NAPIF_STATE_MISSED))
> +               __napi_schedule(n);
> +
>         return true;
>  }

If you rescheduled napi should you really be returning true?  Seems
like you should be returning "!(val & NAPIF_STATE_MISSED)" to try to
avoid letting this occur again.
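
For context, this is how the return value would typically be consumed: a
hypothetical driver poll routine (the mydrv_* names are illustrative) that
only re-arms its IRQ when napi_complete_done() reports that polling really
finished:

#include <linux/kernel.h>
#include <linux/netdevice.h>

struct mydrv_queue {
	struct napi_struct napi;
};

int mydrv_clean_rx(struct mydrv_queue *q, int budget);	/* hypothetical RX cleanup */
void mydrv_unmask_irq(struct mydrv_queue *q);		/* hypothetical IRQ re-arm */

static int mydrv_poll(struct napi_struct *napi, int budget)
{
	struct mydrv_queue *q = container_of(napi, struct mydrv_queue, napi);
	int done = mydrv_clean_rx(q, budget);

	if (done < budget && napi_complete_done(napi, done))
		mydrv_unmask_irq(q);	/* only when NAPI really completed */

	/* If napi_complete_done() returned false because NAPI_STATE_MISSED was
	 * set and the NAPI was rescheduled, the IRQ stays masked and the next
	 * poll round will run shortly.
	 */
	return done;
}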

>  EXPORT_SYMBOL(napi_complete_done);
> @@ -5088,8 +5128,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
>         struct napi_struct *napi;
>
>         napi = container_of(timer, struct napi_struct, timer);
> -       if (napi->gro_list)
> -               napi_schedule_irqoff(napi);
> +
> +       /* Note : we use a relaxed variant of napi_schedule_prep() not setting
> +        * NAPI_STATE_MISSED, since we do not react to a device IRQ.
> +        */
> +       if (napi->gro_list && !napi_disable_pending(napi) &&
> +           !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
> +               __napi_schedule_irqoff(napi);
>
>         return HRTIMER_NORESTART;
>  }
>
>
Eric Dumazet Feb. 28, 2017, 5:47 p.m. UTC | #2
On Tue, 2017-02-28 at 09:20 -0800, Alexander Duyck wrote:
> On Mon, Feb 27, 2017 at 6:21 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:

> > +bool napi_schedule_prep(struct napi_struct *n)
> > +{
> > +       unsigned long val, new;
> > +
> > +       do {
> > +               val = READ_ONCE(n->state);
> > +               if (unlikely(val & NAPIF_STATE_DISABLE))
> > +                       return false;
> > +               new = val | NAPIF_STATE_SCHED;
> > +               if (unlikely(val & NAPIF_STATE_SCHED))
> > +                       new |= NAPIF_STATE_MISSED;
> 
> You might want to consider just using a combination AND, divide,
> multiply, and OR to avoid having to have any conditional branches
> being added due to this code path.  Basically the logic would look
> like:


>     new = val | NAPIF_STATE_SCHED;
>     new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED;
> 
> In assembler that all ends up getting translated out to AND, SHL, OR.
> You avoid the branching, or MOV/OR/TEST/CMOV type code you would end
> up with otherwise.

Sure, I can try to optimize this a bit ;)


> > +       } while (cmpxchg(&n->state, val, new) != val);
> > +
> > +       if (unlikely(val & NAPIF_STATE_MISSED))
> > +               __napi_schedule(n);
> > +
> >         return true;
> >  }
> 
> If you rescheduled napi should you really be returning true?  Seems
> like you should be returning "!(val & NAPIF_STATE_MISSED)" to try to
> avoid letting this occur again.

Good idea.

Hmm... you mean that many drivers test the napi_complete_done() return
value?

;)

Thanks !
David Laight March 1, 2017, 10:41 a.m. UTC | #3
From: Alexander Duyck
> Sent: 28 February 2017 17:20
...
> You might want to consider just using a combination AND, divide,
> multiply, and OR to avoid having to have any conditional branches
> being added due to this code path.  Basically the logic would look
> like:
>     new = val | NAPIF_STATE_SCHED;
>     new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED;
>
> In assembler that all ends up getting translated out to AND, SHL, OR.
> You avoid the branching, or MOV/OR/TEST/CMOV type code you would end
> up with otherwise.

It is a shame gcc doesn't contain that optimisation.
It also doesn't even make a good job of (a & b)/b * c since it
always does a shr and a sal (gcc 4.7.3 and 5.4).

Worthy of a #define or static inline.
Something like:
#define BIT_IF(v, a, b) ((b & (b-1)) ? (v & a)/a * b : a > b ? (v & a) / (a/b) : (v & a) * (b/a))

	David
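
As a rough, stand-alone illustration of how such a helper would behave
(user-space C, with the macro written out as in David's sketch but with the
parentheses balanced):

#include <assert.h>

#define NAPIF_STATE_SCHED	(1UL << 0)
#define NAPIF_STATE_MISSED	(1UL << 1)

/* Move bit 'a' of 'v' into bit position 'b' without branching. */
#define BIT_IF(v, a, b) ((b & (b - 1)) ? (v & a) / a * b : \
			 a > b ? (v & a) / (a / b) : (v & a) * (b / a))

int main(void)
{
	unsigned long val = NAPIF_STATE_SCHED;	/* SCHED set, MISSED clear */

	assert(BIT_IF(val, NAPIF_STATE_SCHED, NAPIF_STATE_MISSED) ==
	       NAPIF_STATE_MISSED);
	assert(BIT_IF(0UL, NAPIF_STATE_SCHED, NAPIF_STATE_MISSED) == 0);
	return 0;
}
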
Alexander H Duyck March 1, 2017, 4:14 p.m. UTC | #4
On Wed, Mar 1, 2017 at 2:41 AM, David Laight <David.Laight@aculab.com> wrote:
> From: Alexander Duyck
>> Sent: 28 February 2017 17:20
> ...
>> You might want to consider just using a combination AND, divide,
>> multiply, and OR to avoid having to have any conditional branches
>> being added due to this code path.  Basically the logic would look
>> like:
>>     new = val | NAPIF_STATE_SCHED;
>>     new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED;
>>
>> In assembler that all ends up getting translated out to AND, SHL, OR.
>> You avoid the branching, or MOV/OR/TEST/CMOV type code you would end
>> up with otherwise.
>
> It is a shame gcc doesn't contain that optimisation.
> It also doesn't even make a good job of (a & b)/b * c since it
> always does a shr and a sal (gcc 4.7.3 and 5.4).

What build flags are you using?  With -Os or -O2 I have seen it
convert the /b * c into a single shift.

> Worthy of a #define or static inline.
> Something like:
> #define BIT_IF(v, a, b) ((b & (b-1)) ? (v & a)/a * b : a > b ? (v & a) / (a/b) : (v & a) * (b/a))
>
>         David

Feel free to put together a patch.  I use this kind of thing in the
Intel drivers in multiple spots to shift stuff from TX_FLAGS into
descriptor flags.

- Alex
Eric Dumazet March 1, 2017, 5:32 p.m. UTC | #5
On Wed, 2017-03-01 at 08:14 -0800, Alexander Duyck wrote:

> What build flags are you using?  With -Os or -O2 I have seen it
> convert the /b * c into a single shift.
> 


Because b & c are unsigned in our case.

I presume David tried signed integers, this is why gcc does that.
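
For anyone wanting to reproduce the comparison, this is roughly the test
case under discussion; it only shows what one would feed to the compiler
(e.g. gcc -O2 -S), not a claim about the exact code a given gcc version
emits:

#define SRC_BIT_UL	(1UL << 4)	/* unsigned long constants, as NAPIF_STATE_* are */
#define DST_BIT_UL	(1UL << 1)
#define SRC_BIT_INT	(1 << 4)	/* plain int constants */
#define DST_BIT_INT	(1 << 1)

unsigned long xfer_ul_consts(unsigned long val)
{
	return (val & SRC_BIT_UL) / SRC_BIT_UL * DST_BIT_UL;
}

unsigned long xfer_int_consts(unsigned long val)
{
	return (val & SRC_BIT_INT) / SRC_BIT_INT * DST_BIT_INT;
}
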
David Laight March 2, 2017, 10:24 a.m. UTC | #6
From: Eric Dumazet
> Sent: 01 March 2017 17:33
> On Wed, 2017-03-01 at 08:14 -0800, Alexander Duyck wrote:
>
> > What build flags are you using?  With -Os or -O2 I have seen it
> > convert the /b * c into a single shift.
> >
>
> Because b & c are unsigned in our case.
>
> I presume David tried signed integers, this is why gcc does that.

I was using integer constants but an unsigned variable.
Changing the constants to unsigned makes no difference (they would get
promoted anyway).

After some experiments I can get gcc to generate a single shift if the
variable being tested is 32 bits, the constants are 64 bits, and a 64-bit
result is required.
In every other case it does either and-shr-shl or shr-and-shl (all the
shr are logical).

	David
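
The one shape David reports as folding to a single shift — a 32-bit
variable tested against 64-bit constants with a 64-bit result — would look
roughly like this (again only the test case to compile and inspect):

#include <stdint.h>

uint64_t xfer_32to64(uint32_t val)
{
	return (val & (1ULL << 4)) / (1ULL << 4) * (1ULL << 1);
}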

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f40f0ab3847a8caaf46bd4d5f224c65014f501cc..97456b2539e46d6232dda804f6a434db6fd7134f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -330,6 +330,7 @@  struct napi_struct {
 
 enum {
 	NAPI_STATE_SCHED,	/* Poll is scheduled */
+	NAPI_STATE_MISSED,	/* reschedule a napi */
 	NAPI_STATE_DISABLE,	/* Disable pending */
 	NAPI_STATE_NPSVC,	/* Netpoll - don't dequeue from poll_list */
 	NAPI_STATE_HASHED,	/* In NAPI hash (busy polling possible) */
@@ -338,12 +339,13 @@  enum {
 };
 
 enum {
-	NAPIF_STATE_SCHED	 = (1UL << NAPI_STATE_SCHED),
-	NAPIF_STATE_DISABLE	 = (1UL << NAPI_STATE_DISABLE),
-	NAPIF_STATE_NPSVC	 = (1UL << NAPI_STATE_NPSVC),
-	NAPIF_STATE_HASHED	 = (1UL << NAPI_STATE_HASHED),
-	NAPIF_STATE_NO_BUSY_POLL = (1UL << NAPI_STATE_NO_BUSY_POLL),
-	NAPIF_STATE_IN_BUSY_POLL = (1UL << NAPI_STATE_IN_BUSY_POLL),
+	NAPIF_STATE_SCHED	 = BIT(NAPI_STATE_SCHED),
+	NAPIF_STATE_MISSED	 = BIT(NAPI_STATE_MISSED),
+	NAPIF_STATE_DISABLE	 = BIT(NAPI_STATE_DISABLE),
+	NAPIF_STATE_NPSVC	 = BIT(NAPI_STATE_NPSVC),
+	NAPIF_STATE_HASHED	 = BIT(NAPI_STATE_HASHED),
+	NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
+	NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
 };
 
 enum gro_result {
@@ -414,20 +416,7 @@  static inline bool napi_disable_pending(struct napi_struct *n)
 	return test_bit(NAPI_STATE_DISABLE, &n->state);
 }
 
-/**
- *	napi_schedule_prep - check if NAPI can be scheduled
- *	@n: NAPI context
- *
- * Test if NAPI routine is already running, and if not mark
- * it as running.  This is used as a condition variable to
- * insure only one NAPI poll instance runs.  We also make
- * sure there is no pending NAPI disable.
- */
-static inline bool napi_schedule_prep(struct napi_struct *n)
-{
-	return !napi_disable_pending(n) &&
-		!test_and_set_bit(NAPI_STATE_SCHED, &n->state);
-}
+bool napi_schedule_prep(struct napi_struct *n);
 
 /**
  *	napi_schedule - schedule NAPI poll
diff --git a/net/core/dev.c b/net/core/dev.c
index 304f2deae5f9897e60a79ed8b69d6ef208295ded..edeb916487015f279036ecf7ff5d9096dff365d3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4884,6 +4884,32 @@  void __napi_schedule(struct napi_struct *n)
 EXPORT_SYMBOL(__napi_schedule);
 
 /**
+ *	napi_schedule_prep - check if napi can be scheduled
+ *	@n: napi context
+ *
+ * Test if NAPI routine is already running, and if not mark
 + * it as running.  This is used as a condition variable to
+ * insure only one NAPI poll instance runs.  We also make
+ * sure there is no pending NAPI disable.
+ */
+bool napi_schedule_prep(struct napi_struct *n)
+{
+	unsigned long val, new;
+
+	do {
+		val = READ_ONCE(n->state);
+		if (unlikely(val & NAPIF_STATE_DISABLE))
+			return false;
+		new = val | NAPIF_STATE_SCHED;
+		if (unlikely(val & NAPIF_STATE_SCHED))
+			new |= NAPIF_STATE_MISSED;
+	} while (cmpxchg(&n->state, val, new) != val);
+
+	return !(val & NAPIF_STATE_SCHED);
+}
+EXPORT_SYMBOL(napi_schedule_prep);
+
+/**
  * __napi_schedule_irqoff - schedule for receive
  * @n: entry to schedule
  *
@@ -4897,7 +4923,7 @@  EXPORT_SYMBOL(__napi_schedule_irqoff);
 
 bool napi_complete_done(struct napi_struct *n, int work_done)
 {
-	unsigned long flags;
+	unsigned long flags, val, new;
 
 	/*
 	 * 1) Don't let napi dequeue from the cpu poll list
@@ -4927,7 +4953,21 @@  bool napi_complete_done(struct napi_struct *n, int work_done)
 		list_del_init(&n->poll_list);
 		local_irq_restore(flags);
 	}
-	WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
+
+	do {
+		val = READ_ONCE(n->state);
+
+		WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
+
+		new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+
+		if (unlikely(val & NAPIF_STATE_MISSED))
+			new |= NAPIF_STATE_SCHED;
+	} while (cmpxchg(&n->state, val, new) != val);
+
+	if (unlikely(val & NAPIF_STATE_MISSED))
+		__napi_schedule(n);
+
 	return true;
 }
 EXPORT_SYMBOL(napi_complete_done);
@@ -5088,8 +5128,13 @@  static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 	struct napi_struct *napi;
 
 	napi = container_of(timer, struct napi_struct, timer);
-	if (napi->gro_list)
-		napi_schedule_irqoff(napi);
+
+	/* Note : we use a relaxed variant of napi_schedule_prep() not setting
+	 * NAPI_STATE_MISSED, since we do not react to a device IRQ.
+	 */
+	if (napi->gro_list && !napi_disable_pending(napi) &&
+	    !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+		__napi_schedule_irqoff(napi);
 
 	return HRTIMER_NORESTART;
 }