Patchwork net: fix problem in dequeuing from input_pkt_queue

login
register
mail settings
Submitter Tom Herbert
Date May 19, 2010, 9:47 p.m.
Message ID <alpine.DEB.1.00.1005191440440.23271@pokey.mtv.corp.google.com>
Download mbox | patch
Permalink /patch/53030/
State Superseded
Delegated to: David Miller
Headers show

Comments

Tom Herbert - May 19, 2010, 9:47 p.m.
Fix some issues introduced in batch skb dequeuing for input_pkt_queue.
The primary issue it that the queue head must be incremented only
after a packet has been processed, that is only after
__netif_receive_skb has been called.  This is needed for the mechanism
to prevent OOO packet in RFS.  Also when flushing the input_pkt_queue
and process_queue, the process queue should be done first to prevent
OOO packets.

Because the input_pkt_queue has been effectively split into two queues,
the calculation of the tail ptr is no longer correct.  The correct value
would be head+input_pkt_queue->len+process_queue->len.  To avoid
this calculation we added an explict input_queue_tail in softnet_data.
The tail value is simply incremented when queuing to input_pkt_queue.

In process_backlog the processing of the packet queue can be done
without irq's being disabled.

Made dropped in softnet_data to be "unsigned int" for consistency.

Signed-off-by: Tom Herbert <therbert@google.com>
---
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao - May 19, 2010, 11:45 p.m.
On Thu, May 20, 2010 at 5:47 AM, Tom Herbert <therbert@google.com> wrote:
> Fix some issues introduced in batch skb dequeuing for input_pkt_queue.
> The primary issue it that the queue head must be incremented only
> after a packet has been processed, that is only after
> __netif_receive_skb has been called.  This is needed for the mechanism
> to prevent OOO packet in RFS.  Also when flushing the input_pkt_queue
> and process_queue, the process queue should be done first to prevent
> OOO packets.
>
> Because the input_pkt_queue has been effectively split into two queues,
> the calculation of the tail ptr is no longer correct.  The correct value
> would be head+input_pkt_queue->len+process_queue->len.  To avoid
> this calculation we added an explict input_queue_tail in softnet_data.
> The tail value is simply incremented when queuing to input_pkt_queue.
>
> In process_backlog the processing of the packet queue can be done
> without irq's being disabled.
>
>  static int napi_gro_complete(struct sk_buff *skb)
> @@ -3320,26 +3319,24 @@ static int process_backlog(struct napi_struct *napi, int quota)
>        }
>  #endif
>        napi->weight = weight_p;
> -       local_irq_disable();
>        while (work < quota) {
>                struct sk_buff *skb;
>                unsigned int qlen;
>
>                while ((skb = __skb_dequeue(&sd->process_queue))) {
> -                       local_irq_enable();

we need to keep local irq disabled. If not, flush_backlog may be
called, and it will access sd->process_queue.
Tom Herbert - May 19, 2010, 11:58 p.m.
>>        napi->weight = weight_p;
>> -       local_irq_disable();
>>        while (work < quota) {
>>                struct sk_buff *skb;
>>                unsigned int qlen;
>>
>>                while ((skb = __skb_dequeue(&sd->process_queue))) {
>> -                       local_irq_enable();
>
> we need to keep local irq disabled. If not, flush_backlog may be
> called, and it will access sd->process_queue.
>

It should be okay?  process_backlog only runs in softirq so bottom
halves are already disabled, and I don't think flush_backlog runs out
of an interrupt.

>
> --
> Regards,
> Changli Gao(xiaosuo@gmail.com)
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Changli Gao - May 20, 2010, 12:09 a.m.
On Thu, May 20, 2010 at 7:58 AM, Tom Herbert <therbert@google.com> wrote:
>>>        napi->weight = weight_p;
>>> -       local_irq_disable();
>>>        while (work < quota) {
>>>                struct sk_buff *skb;
>>>                unsigned int qlen;
>>>
>>>                while ((skb = __skb_dequeue(&sd->process_queue))) {
>>> -                       local_irq_enable();
>>
>> we need to keep local irq disabled. If not, flush_backlog may be
>> called, and it will access sd->process_queue.
>>
>
> It should be okay?  process_backlog only runs in softirq so bottom
> halves are already disabled, and I don't think flush_backlog runs out
> of an interrupt.
>

Oh no. It is an IRQ handler.

  on_each_cpu(flush_backlog, dev, 1);
...
int on_each_cpu(void (*func) (void *info), void *info, int wait)
{
        int ret = 0;

        preempt_disable();
        ret = smp_call_function(func, info, wait);
        local_irq_disable();
        func(info);
        local_irq_enable();
        preempt_enable();
        return ret;
}
Tom Herbert - May 20, 2010, 2:48 a.m.
>> It should be okay?  process_backlog only runs in softirq so bottom
>> halves are already disabled, and I don't think flush_backlog runs out
>> of an interrupt.
>>
>
> Oh no. It is an IRQ handler.
>
Very well, I will fix that.

Now I'm wondering, though, what the purpose of flush_backlog is...
since __netif_receive_skb is called with interrupts enabled it's
obvious flush_backlog won't catch all the skb's that reference the
device go away.  Is there a reason these packets need to be flushed
and can't just be processed?

>  on_each_cpu(flush_backlog, dev, 1);
> ...
> int on_each_cpu(void (*func) (void *info), void *info, int wait)
> {
>        int ret = 0;
>
>        preempt_disable();
>        ret = smp_call_function(func, info, wait);
>        local_irq_disable();
>        func(info);
>        local_irq_enable();
>        preempt_enable();
>        return ret;
> }
>
> --
> Regards,
> Changli Gao(xiaosuo@gmail.com)
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet - May 20, 2010, 4:37 a.m.
Le mercredi 19 mai 2010 à 19:48 -0700, Tom Herbert a écrit :
> >> It should be okay?  process_backlog only runs in softirq so bottom
> >> halves are already disabled, and I don't think flush_backlog runs out
> >> of an interrupt.
> >>
> >
> > Oh no. It is an IRQ handler.
> >
> Very well, I will fix that.
> 
> Now I'm wondering, though, what the purpose of flush_backlog is...
> since __netif_receive_skb is called with interrupts enabled it's
> obvious flush_backlog won't catch all the skb's that reference the
> device go away.  Is there a reason these packets need to be flushed
> and can't just be processed?

flush_backlog is called when device is dismantled.

No new packets should be generated by the device at this moment.

Could you please split your patch in units, I spent 20 minutes to review
it and come to same conclusion than Changli (need to disable interrupts
as they are currently disabled) and also :

input_queue_head_incr(sd); are _not_ needed in flush_backlog()

We are in the very last moments of the life of the device, in a very
unlikely situation (packets in flight, not already consumed by the cpu),
we are _dropping_ packets, so OOO means nothing at this point. 

In dev_cpu_callback(), you reverse the order of input_pkt_queue /
process_queue.

Thats fine, but should be a single patch, because I am not sure the
input_queue_head_incr() are valid here, since we re-inject these packets
to netif_rx(). Could you clarify this point ?

Thanks !


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Herbert - May 20, 2010, 6:05 a.m.
On Wed, May 19, 2010 at 9:37 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le mercredi 19 mai 2010 à 19:48 -0700, Tom Herbert a écrit :
>> >> It should be okay?  process_backlog only runs in softirq so bottom
>> >> halves are already disabled, and I don't think flush_backlog runs out
>> >> of an interrupt.
>> >>
>> >
>> > Oh no. It is an IRQ handler.
>> >
>> Very well, I will fix that.
>>
>> Now I'm wondering, though, what the purpose of flush_backlog is...
>> since __netif_receive_skb is called with interrupts enabled it's
>> obvious flush_backlog won't catch all the skb's that reference the
>> device go away.  Is there a reason these packets need to be flushed
>> and can't just be processed?
>
> flush_backlog is called when device is dismantled.
>
> No new packets should be generated by the device at this moment.
>
But again since __netif_receive_skb is called with interrupts disabled
there is still a hole that the device could be completely dismantled
but at least one packet from the device still will be processed.  So
it seems like that's a bug, or maybe it's okay to process packets
after flush_backlog-- if the latter case were true why throw out
perfectly good packets?  The only rationale I can think of for
flush_backlog is to eliminate skb's with references to device that has
gone away, but the mechanism does not seem sufficient to cover all
possible skb's with a reference.

> Could you please split your patch in units, I spent 20 minutes to review
> it and come to same conclusion than Changli (need to disable interrupts
> as they are currently disabled) and also :
>
> input_queue_head_incr(sd); are _not_ needed in flush_backlog()
>
I don't see why they wouldn't be needed.  queue tail is incremented
when queuing to the input_pkt_queue, queue head is incremented when
dequeuing (after skb freed or processed).  queue_tail-queue_head ==
input_pkt_queue.len+process_queue.len.  These should be invariants.

> We are in the very last moments of the life of the device, in a very
> unlikely situation (packets in flight, not already consumed by the cpu),
> we are _dropping_ packets, so OOO means nothing at this point.
>
True.  But still seems nice to handle process_queue to be consistent.

> In dev_cpu_callback(), you reverse the order of input_pkt_queue /
> process_queue.
>
> Thats fine, but should be a single patch, because I am not sure the
> input_queue_head_incr() are valid here, since we re-inject these packets
> to netif_rx(). Could you clarify this point ?
>
queue_head advances on every dequeue.  See above...

Thanks for the great comments!

Tom

> Thanks !
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c3487a6..bc0bc85 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1403,17 +1403,25 @@  struct softnet_data {
 	struct softnet_data	*rps_ipi_next;
 	unsigned int		cpu;
 	unsigned int		input_queue_head;
+	unsigned int		input_queue_tail;
 #endif
-	unsigned		dropped;
+	unsigned int		dropped;
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
 };
 
-static inline void input_queue_head_add(struct softnet_data *sd,
-					unsigned int len)
+static inline void input_queue_head_incr(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
-	sd->input_queue_head += len;
+	sd->input_queue_head++;
+#endif
+}
+
+static inline void input_queue_tail_incr_save(struct softnet_data *sd,
+					      unsigned int *qtail)
+{
+#ifdef CONFIG_RPS
+	*qtail = ++sd->input_queue_tail;
 #endif
 }
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 6c82065..be7d475 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2426,10 +2426,7 @@  static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 		if (skb_queue_len(&sd->input_pkt_queue)) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
-#ifdef CONFIG_RPS
-			*qtail = sd->input_queue_head +
-					skb_queue_len(&sd->input_pkt_queue);
-#endif
+			input_queue_tail_incr_save(sd, qtail);
 			rps_unlock(sd);
 			local_irq_restore(flags);
 			return NET_RX_SUCCESS;
@@ -2959,22 +2956,24 @@  static void flush_backlog(void *arg)
 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
 	struct sk_buff *skb, *tmp;
 
-	rps_lock(sd);
-	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
+	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 		if (skb->dev == dev) {
-			__skb_unlink(skb, &sd->input_pkt_queue);
+			__skb_unlink(skb, &sd->process_queue);
 			kfree_skb(skb);
-			input_queue_head_add(sd, 1);
+			input_queue_head_incr(sd);
 		}
 	}
-	rps_unlock(sd);
 
-	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+	rps_lock(sd);
+	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 		if (skb->dev == dev) {
-			__skb_unlink(skb, &sd->process_queue);
+			__skb_unlink(skb, &sd->input_pkt_queue);
 			kfree_skb(skb);
+			input_queue_head_incr(sd);
 		}
 	}
+	rps_unlock(sd);
+
 }
 
 static int napi_gro_complete(struct sk_buff *skb)
@@ -3320,26 +3319,24 @@  static int process_backlog(struct napi_struct *napi, int quota)
 	}
 #endif
 	napi->weight = weight_p;
-	local_irq_disable();
 	while (work < quota) {
 		struct sk_buff *skb;
 		unsigned int qlen;
 
 		while ((skb = __skb_dequeue(&sd->process_queue))) {
-			local_irq_enable();
 			__netif_receive_skb(skb);
+			input_queue_head_incr(sd);
 			if (++work >= quota)
 				return work;
-			local_irq_disable();
 		}
 
+		local_irq_disable();
 		rps_lock(sd);
 		qlen = skb_queue_len(&sd->input_pkt_queue);
-		if (qlen) {
-			input_queue_head_add(sd, qlen);
+		if (qlen)
 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 						   &sd->process_queue);
-		}
+
 		if (qlen < quota - work) {
 			/*
 			 * Inline a custom version of __napi_complete().
@@ -3354,8 +3351,8 @@  static int process_backlog(struct napi_struct *napi, int quota)
 			quota = work + qlen;
 		}
 		rps_unlock(sd);
+		local_irq_enable();
 	}
-	local_irq_enable();
 
 	return work;
 }
@@ -5679,12 +5676,14 @@  static int dev_cpu_callback(struct notifier_block *nfb,
 	local_irq_enable();
 
 	/* Process offline CPU's input_pkt_queue */
-	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
+	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
 		netif_rx(skb);
-		input_queue_head_add(oldsd, 1);
+		input_queue_head_incr(oldsd);
 	}
-	while ((skb = __skb_dequeue(&oldsd->process_queue)))
+	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
 		netif_rx(skb);
+		input_queue_head_incr(oldsd);
+	}
 
 	return NOTIFY_OK;
 }